[PATCH] x86-64: Add __copy_from_user_nocache
[deliverable/linux.git] / arch / x86_64 / mm / numa.c
CommitLineData
1da177e4
LT
1/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
6c231b7b 25struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
1da177e4
LT
26bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
dcf36bfa 28struct memnode memnode;
1da177e4 29
3f098c26
AK
30unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE
0b07e984 32};
3f098c26
AK
33unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
35};
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
1da177e4
LT
37
38int numa_off __initdata;
39
529a3404
ED
40
41/*
42 * Given a shift value, try to populate memnodemap[]
43 * Returns :
44 * 1 if OK
45 * 0 if memnodmap[] too small (of shift too small)
46 * -1 if node overlap or lost ram (shift too big)
47 */
d18ff470 48static int __init
abe059e7 49populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
1da177e4
LT
50{
51 int i;
529a3404
ED
52 int res = -1;
53 unsigned long addr, end;
b684664f 54
8309cf66
ED
55 if (shift >= 64)
56 return -1;
529a3404 57 memset(memnodemap, 0xff, sizeof(memnodemap));
b684664f 58 for (i = 0; i < numnodes; i++) {
529a3404
ED
59 addr = nodes[i].start;
60 end = nodes[i].end;
61 if (addr >= end)
b684664f 62 continue;
529a3404
ED
63 if ((end >> shift) >= NODEMAPSIZE)
64 return 0;
65 do {
66 if (memnodemap[addr >> shift] != 0xff)
b684664f 67 return -1;
b684664f 68 memnodemap[addr >> shift] = i;
8309cf66 69 addr += (1UL << shift);
529a3404
ED
70 } while (addr < end);
71 res = 1;
1da177e4 72 }
529a3404
ED
73 return res;
74}
75
abe059e7 76int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
529a3404
ED
77{
78 int shift = 20;
79
80 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
81 shift++;
82
6b050f80 83 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
529a3404
ED
84 shift);
85
86 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
87 printk(KERN_INFO
88 "Your memory is not aligned you need to rebuild your kernel "
89 "with a bigger NODEMAPSIZE shift=%d\n",
90 shift);
91 return -1;
92 }
b684664f 93 return shift;
1da177e4
LT
94}
95
bbfceef4
MT
#ifdef CONFIG_SPARSEMEM
/* Look up the node id for a page frame number via its physical address. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
102
a8062231
AK
103static void * __init
104early_node_mem(int nodeid, unsigned long start, unsigned long end,
105 unsigned long size)
106{
107 unsigned long mem = find_e820_area(start, end, size);
108 void *ptr;
109 if (mem != -1L)
110 return __va(mem);
111 ptr = __alloc_bootmem_nopanic(size,
112 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
113 if (ptr == 0) {
114 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
115 size, nodeid);
116 return NULL;
117 }
118 return ptr;
119}
120
1da177e4
LT
121/* Initialize bootmem allocator for a node */
122void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
123{
124 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
125 unsigned long nodedata_phys;
a8062231 126 void *bootmap;
1da177e4
LT
127 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
128
129 start = round_up(start, ZONE_ALIGN);
130
6b050f80 131 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
1da177e4
LT
132
133 start_pfn = start >> PAGE_SHIFT;
134 end_pfn = end >> PAGE_SHIFT;
135
a8062231
AK
136 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
137 if (node_data[nodeid] == NULL)
138 return;
139 nodedata_phys = __pa(node_data[nodeid]);
1da177e4 140
1da177e4
LT
141 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
142 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
143 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
144 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
145
146 /* Find a place for the bootmem map */
147 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
148 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
a8062231
AK
149 bootmap = early_node_mem(nodeid, bootmap_start, end,
150 bootmap_pages<<PAGE_SHIFT);
151 if (bootmap == NULL) {
152 if (nodedata_phys < start || nodedata_phys >= end)
153 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
154 node_data[nodeid] = NULL;
155 return;
156 }
157 bootmap_start = __pa(bootmap);
1da177e4
LT
158 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
159
160 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
161 bootmap_start >> PAGE_SHIFT,
162 start_pfn, end_pfn);
163
5cb248ab 164 free_bootmem_with_active_regions(nodeid, end);
1da177e4
LT
165
166 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
167 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
68a3a7fe
AK
168#ifdef CONFIG_ACPI_NUMA
169 srat_reserve_add_area(nodeid);
170#endif
1da177e4
LT
171 node_set_online(nodeid);
172}
173
174/* Initialize final allocator for a zone */
175void __init setup_node_zones(int nodeid)
176{
267b4801 177 unsigned long start_pfn, end_pfn, memmapsize, limit;
1da177e4 178
a2f1b424
AK
179 start_pfn = node_start_pfn(nodeid);
180 end_pfn = node_end_pfn(nodeid);
1da177e4 181
5cb248ab 182 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
a2f1b424 183 nodeid, start_pfn, end_pfn);
1da177e4 184
267b4801
AK
185 /* Try to allocate mem_map at end to not fill up precious <4GB
186 memory. */
187 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
188 limit = end_pfn << PAGE_SHIFT;
3b5fd59f 189#ifdef CONFIG_FLAT_NODE_MEM_MAP
267b4801
AK
190 NODE_DATA(nodeid)->node_mem_map =
191 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
192 memmapsize, SMP_CACHE_BYTES,
193 round_down(limit - memmapsize, PAGE_SIZE),
194 limit);
3b5fd59f 195#endif
1da177e4
LT
196}
197
198void __init numa_init_array(void)
199{
200 int rr, i;
201 /* There are unfortunately some poorly designed mainboards around
202 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
203 mapping. To avoid this fill in the mapping for all possible
204 CPUs, as the number of CPUs is not known yet.
205 We round robin the existing nodes. */
85cc5135 206 rr = first_node(node_online_map);
1da177e4
LT
207 for (i = 0; i < NR_CPUS; i++) {
208 if (cpu_to_node[i] != NUMA_NO_NODE)
209 continue;
69d81fcd 210 numa_set_node(i, rr);
1da177e4
LT
211 rr = next_node(rr, node_online_map);
212 if (rr == MAX_NUMNODES)
213 rr = first_node(node_online_map);
1da177e4
LT
214 }
215
1da177e4
LT
216}
217
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via "numa=fake=N"; 0 disables emulation. */
int numa_fake __initdata = 0;

/*
 * Numa emulation.
 *
 * Splits [start_pfn, end_pfn) into numa_fake nodes, with the last node
 * absorbing whatever remains, brings them online and sets up their bootmem.
 * The caller must ensure numa_fake is non-zero.
 *
 * Returns 0 on success, -1 if no hash shift could be computed.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	struct bootnode nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
	int i;

	/* Kludge needed for the hash function: shrink sz to a power of two. */
	if (hweight64(sz) > 1) {
		unsigned long pow2;

		for (pow2 = 1; (pow2 << 1) < sz; pow2 <<= 1)
			;
		if (pow2 < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = pow2;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		struct bootnode *nd = &nodes[i];

		nd->start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node takes everything up to end_pfn. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nd->start;
		nd->end = nd->start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nd->start, nd->end,
		       (nd->end - nd->start) >> 20);
		node_set_online(i);
	}

	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}

	for_each_online_node(i) {
		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
						nodes[i].end >> PAGE_SHIFT);
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	numa_init_array();
	return 0;
}
#endif
265
266void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
267{
268 int i;
269
270#ifdef CONFIG_NUMA_EMU
271 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
272 return;
273#endif
274
275#ifdef CONFIG_ACPI_NUMA
276 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
277 end_pfn << PAGE_SHIFT))
278 return;
279#endif
280
281#ifdef CONFIG_K8_NUMA
282 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
283 return;
284#endif
285 printk(KERN_INFO "%s\n",
286 numa_off ? "NUMA turned off" : "No NUMA configuration found");
287
288 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
289 start_pfn << PAGE_SHIFT,
290 end_pfn << PAGE_SHIFT);
291 /* setup dummy node covering all memory */
292 memnode_shift = 63;
293 memnodemap[0] = 0;
294 nodes_clear(node_online_map);
295 node_set_online(0);
296 for (i = 0; i < NR_CPUS; i++)
69d81fcd 297 numa_set_node(i, 0);
1da177e4 298 node_to_cpumask[0] = cpumask_of_cpu(0);
5cb248ab 299 e820_register_active_regions(0, start_pfn, end_pfn);
1da177e4
LT
300 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
301}
302
e6982c67 303__cpuinit void numa_add_cpu(int cpu)
1da177e4 304{
e6a045a5 305 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
1da177e4
LT
306}
307
69d81fcd
AK
308void __cpuinit numa_set_node(int cpu, int node)
309{
df79efde 310 cpu_pda(cpu)->nodenumber = node;
69d81fcd
AK
311 cpu_to_node[cpu] = node;
312}
313
1da177e4
LT
314unsigned long __init numa_free_all_bootmem(void)
315{
316 int i;
317 unsigned long pages = 0;
318 for_each_online_node(i) {
319 pages += free_all_bootmem_node(NODE_DATA(i));
320 }
321 return pages;
322}
323
d3ee871e
BP
#ifdef CONFIG_SPARSEMEM
/*
 * Report each online node's pfn range via memory_present(), then run
 * sparse_init().  Compiles to nothing without CONFIG_SPARSEMEM.
 */
static void __init arch_sparse_init(void)
{
	int nid;

	for_each_online_node(nid) {
		unsigned long first_pfn = node_start_pfn(nid);
		unsigned long last_pfn = node_end_pfn(nid);

		memory_present(nid, first_pfn, last_pfn);
	}

	sparse_init();
}
#else
#define arch_sparse_init() do {} while (0)
#endif
337
1da177e4
LT
338void __init paging_init(void)
339{
340 int i;
6391af17
MG
341 unsigned long max_zone_pfns[MAX_NR_ZONES];
342 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
343 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
344 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
345 max_zone_pfns[ZONE_NORMAL] = end_pfn;
d3ee871e
BP
346
347 arch_sparse_init();
348
1da177e4
LT
349 for_each_online_node(i) {
350 setup_node_zones(i);
351 }
5cb248ab
MG
352
353 free_area_init_nodes(max_zone_pfns);
1da177e4
LT
354}
355
2c8c0e6b 356static __init int numa_setup(char *opt)
1da177e4 357{
2c8c0e6b
AK
358 if (!opt)
359 return -EINVAL;
1da177e4
LT
360 if (!strncmp(opt,"off",3))
361 numa_off = 1;
362#ifdef CONFIG_NUMA_EMU
363 if(!strncmp(opt, "fake=", 5)) {
364 numa_fake = simple_strtoul(opt+5,NULL,0); ;
365 if (numa_fake >= MAX_NUMNODES)
366 numa_fake = MAX_NUMNODES;
367 }
368#endif
369#ifdef CONFIG_ACPI_NUMA
370 if (!strncmp(opt,"noacpi",6))
371 acpi_numa = -1;
68a3a7fe
AK
372 if (!strncmp(opt,"hotadd=", 7))
373 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
1da177e4 374#endif
2c8c0e6b 375 return 0;
1da177e4
LT
376}
377
2c8c0e6b
AK
378early_param("numa", numa_setup);
379
05b3cbd8
RT
380/*
381 * Setup early cpu_to_node.
382 *
383 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
384 * and apicid_to_node[] tables have valid entries for a CPU.
385 * This means we skip cpu_to_node[] initialisation for NUMA
386 * emulation and faking node case (when running a kernel compiled
387 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
388 * is already initialized in a round robin manner at numa_init_array,
389 * prior to this call, and this initialization is good enough
390 * for the fake NUMA cases.
391 */
392void __init init_cpu_to_node(void)
393{
394 int i;
395 for (i = 0; i < NR_CPUS; i++) {
396 u8 apicid = x86_cpu_to_apicid[i];
397 if (apicid == BAD_APICID)
398 continue;
399 if (apicid_to_node[apicid] == NUMA_NO_NODE)
400 continue;
d1db4ec8 401 numa_set_node(i,apicid_to_node[apicid]);
05b3cbd8
RT
402 }
403}
404
1da177e4
LT
/* NUMA lookup tables exported for use by modules. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);
cf050132
AK
409
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by pre-caching more state.
 * Should do that.
 */

/*
 * Return 1 if @pfn is below num_physpages, maps to a node (pfn_to_nid() does
 * not yield 0xff) and lies inside that node's pfn range; 0 otherwise.
 */
int pfn_valid(unsigned long pfn)
{
	unsigned nid;

	if (pfn >= num_physpages)
		return 0;

	nid = pfn_to_nid(pfn);
	if (nid == 0xff)
		return 0;

	if (pfn < node_start_pfn(nid))
		return 0;
	return (pfn) < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif
This page took 0.198149 seconds and 5 git commands to generate.