arch/x86_64/mm/numa.c
/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

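/*
 * memnodemap[] is a simple shift-based hash from physical address to
 * node id: node = memnodemap[addr >> memnode_shift] (see phys_to_nid()).
 * With memnode_shift == 27, for instance, each entry would cover a
 * 128MB chunk of the physical address space; that value is illustrative
 * only, the real shift is computed in compute_hash_shift() below.
 */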
int memnode_shift;
u8 memnodemap[NODEMAPSIZE];

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
        [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;

/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if a node overlaps or RAM is lost (shift too big)
 */
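/*
 * Illustrative example: with shift == 24 (16MB per entry), a node
 * spanning 0x0000000-0x2000000 fills memnodemap[0] and memnodemap[1];
 * a second node overlapping that range would hit an already-assigned
 * entry and make this function return -1.
 */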
static int __init populate_memnodemap(
        const struct node *nodes, int numnodes, int shift)
{
        int i;
        int res = -1;
        unsigned long addr, end;

        memset(memnodemap, 0xff, sizeof(memnodemap));
        for (i = 0; i < numnodes; i++) {
                addr = nodes[i].start;
                end = nodes[i].end;
                if (addr >= end)
                        continue;
                if ((end >> shift) >= NODEMAPSIZE)
                        return 0;
                do {
                        if (memnodemap[addr >> shift] != 0xff)
                                return -1;
                        memnodemap[addr >> shift] = i;
                        addr += (1 << shift);
                } while (addr < end);
                res = 1;
        }
        return res;
}

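/*
 * Pick the largest shift (coarsest granularity, so the fewest map
 * entries used) at which every 1<<shift chunk still belongs to exactly
 * one node: starting from 20 (1MB chunks), keep doubling the chunk
 * size until one more doubling would merge chunks from different
 * nodes (populate_memnodemap() returns -1).
 */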
int __init compute_hash_shift(struct node *nodes, int numnodes)
{
        int shift = 20;

        while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
                shift++;

        printk(KERN_DEBUG "Using %d for the hash shift.\n", shift);

        if (populate_memnodemap(nodes, numnodes, shift) != 1) {
                printk(KERN_INFO
                        "Your memory is not aligned; you need to rebuild your kernel "
                        "with a bigger NODEMAPSIZE, shift=%d\n", shift);
                return -1;
        }
        return shift;
}

#ifdef CONFIG_SPARSEMEM
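/*
 * Early pfn->node lookup for sparsemem setup, before the per-node
 * structures exist; phys_to_nid() indexes memnodemap[] as described
 * above.
 */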
int early_pfn_to_nid(unsigned long pfn)
{
        return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif

/* Initialize bootmem allocator for a node */
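/*
 * Note: both the pg_data_t and the bootmem bitmap are carved out of
 * the node's own memory range via find_e820_area(), keeping each
 * node's bookkeeping node-local.
 */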
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
        unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
        unsigned long nodedata_phys;
        const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

        start = round_up(start, ZONE_ALIGN);

        printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

        start_pfn = start >> PAGE_SHIFT;
        end_pfn = end >> PAGE_SHIFT;

        memory_present(nodeid, start_pfn, end_pfn);
        nodedata_phys = find_e820_area(start, end, pgdat_size);
        if (nodedata_phys == -1L)
                panic("Cannot find memory pgdat in node %d\n", nodeid);

        Dprintk("nodedata_phys %lx\n", nodedata_phys);

        node_data[nodeid] = phys_to_virt(nodedata_phys);
        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

        /* Find a place for the bootmem map */
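        /*
         * The map needs one bit per page frame in the node;
         * bootmem_bootmap_pages() rounds that bit count up to whole
         * pages.
         */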
        bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
        bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
        bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
        if (bootmap_start == -1L)
                panic("Not enough contiguous space for bootmap on node %d", nodeid);
        Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, end_pfn);

        e820_bootmem_free(NODE_DATA(nodeid), start, end);

        reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
        reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
        node_set_online(nodeid);
}

/* Initialize the final allocator for a node's zones */
void __init setup_node_zones(int nodeid)
{
        unsigned long start_pfn, end_pfn;
        unsigned long zones[MAX_NR_ZONES];
        unsigned long holes[MAX_NR_ZONES];

        start_pfn = node_start_pfn(nodeid);
        end_pfn = node_end_pfn(nodeid);

        Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
                nodeid, start_pfn, end_pfn);

        size_zones(zones, holes, start_pfn, end_pfn);
        free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
                            start_pfn, holes);
}

void __init numa_init_array(void)
{
        int rr, i;
        /*
         * There are unfortunately some poorly designed mainboards
         * around that only connect memory to a single CPU.  This
         * breaks the 1:1 cpu->node mapping.  To avoid this, fill in
         * the mapping for all possible CPUs, since the number of CPUs
         * is not known yet.  We round-robin over the existing online
         * nodes.
         */
        rr = first_node(node_online_map);
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_to_node[i] != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
                rr = next_node(rr, node_online_map);
                if (rr == MAX_NUMNODES)
                        rr = first_node(node_online_map);
        }
}

#ifdef CONFIG_NUMA_EMU
int numa_fake __initdata = 0;

/* NUMA emulation */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        struct node nodes[MAX_NUMNODES];
        unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

        /* Kludge needed for the hash function */
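        /*
         * memnodemap[] resolves nodes with a single shift, so emulated
         * node boundaries must fall on 1<<shift boundaries.  Round the
         * per-node size down to a power of two so that
         * compute_hash_shift() can succeed.
         */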
        if (hweight64(sz) > 1) {
                unsigned long x = 1;
                while ((x << 1) < sz)
                        x <<= 1;
                if (x < sz/2)
                        printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
                sz = x;
        }

        memset(&nodes, 0, sizeof(nodes));
        for (i = 0; i < numa_fake; i++) {
                nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
                if (i == numa_fake-1)
                        sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
                nodes[i].end = nodes[i].start + sz;
                if (i != numa_fake-1)
                        nodes[i].end--;
                printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
                       i, nodes[i].start, nodes[i].end,
                       (nodes[i].end - nodes[i].start) >> 20);
                node_set_online(i);
        }
        memnode_shift = compute_hash_shift(nodes, numa_fake);
        if (memnode_shift < 0) {
                memnode_shift = 0;
                printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
                return -1;
        }
        for_each_online_node(i)
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        numa_init_array();
        return 0;
}
#endif
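
/*
 * Discover the NUMA topology: try emulation first (numa=fake=N), then
 * the ACPI SRAT, then the AMD K8 northbridge registers; if all of
 * those fail (or numa=off was given), fall back to a single dummy
 * node covering all memory.
 */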
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

#ifdef CONFIG_NUMA_EMU
        if (numa_fake && !numa_emulation(start_pfn, end_pfn))
                return;
#endif

#ifdef CONFIG_ACPI_NUMA
        if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                          end_pfn << PAGE_SHIFT))
                return;
#endif

#ifdef CONFIG_K8_NUMA
        if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
                return;
#endif
        printk(KERN_INFO "%s\n",
               numa_off ? "NUMA turned off" : "No NUMA configuration found");

        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
               start_pfn << PAGE_SHIFT,
               end_pfn << PAGE_SHIFT);
        /* setup dummy node covering all memory */
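        /*
         * With memnode_shift == 63, addr >> 63 is 0 for any physical
         * address, so every address hashes to memnodemap[0] and hence
         * to node 0.
         */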
        memnode_shift = 63;
        memnodemap[0] = 0;
        nodes_clear(node_online_map);
        node_set_online(0);
        for (i = 0; i < NR_CPUS; i++)
                numa_set_node(i, 0);
        node_to_cpumask[0] = cpumask_of_cpu(0);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}

void __cpuinit numa_add_cpu(int cpu)
{
        set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

void __cpuinit numa_set_node(int cpu, int node)
{
        cpu_pda[cpu].nodenumber = node;
        cpu_to_node[cpu] = node;
}

unsigned long __init numa_free_all_bootmem(void)
{
        int i;
        unsigned long pages = 0;
        for_each_online_node(i) {
                pages += free_all_bootmem_node(NODE_DATA(i));
        }
        return pages;
}

void __init paging_init(void)
{
        int i;
        for_each_online_node(i) {
                setup_node_zones(i);
        }
}

/* Parse the "numa=" boot option: [numa=off] [numa=fake=<N>] [numa=noacpi] */
__init int numa_setup(char *opt)
{
        if (!strncmp(opt, "off", 3))
                numa_off = 1;
#ifdef CONFIG_NUMA_EMU
        if (!strncmp(opt, "fake=", 5)) {
                numa_fake = simple_strtoul(opt+5, NULL, 0);
                if (numa_fake >= MAX_NUMNODES)
                        numa_fake = MAX_NUMNODES;
        }
#endif
#ifdef CONFIG_ACPI_NUMA
        if (!strncmp(opt, "noacpi", 6))
                acpi_numa = -1;
#endif
        return 1;
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);