x86, acpi: Parse all SRAT cpu entries even above the cpu number limitation
[deliverable/linux.git] / arch / x86 / mm / srat_64.c
CommitLineData
1da177e4
LT
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
68a3a7fe 18#include <linux/bootmem.h>
a9ce6bc1 19#include <linux/memblock.h>
68a3a7fe 20#include <linux/mm.h>
1da177e4
LT
21#include <asm/proto.h>
22#include <asm/numa.h>
8a6fdd3e 23#include <asm/e820.h>
7b6aa335 24#include <asm/apic.h>
4ec71fa2 25#include <asm/uv/uv.h>
1da177e4 26
c31fbb1a
AK
27int acpi_numa __initdata;
28
1da177e4
LT
29static struct acpi_table_slit *acpi_slit;
30
31static nodemask_t nodes_parsed __initdata;
dc098551 32static nodemask_t cpu_nodes_parsed __initdata;
abe059e7 33static struct bootnode nodes[MAX_NUMNODES] __initdata;
4942e998 34static struct bootnode nodes_add[MAX_NUMNODES];
1da177e4 35
6ec6e0d9
SS
36static int num_node_memblks __initdata;
37static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
38static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
39
1da177e4
LT
40static __init int setup_node(int pxm)
41{
762834e8 42 return acpi_map_pxm_to_node(pxm);
1da177e4
LT
43}
44
6ec6e0d9 45static __init int conflicting_memblks(unsigned long start, unsigned long end)
1da177e4
LT
46{
47 int i;
6ec6e0d9
SS
48 for (i = 0; i < num_node_memblks; i++) {
49 struct bootnode *nd = &node_memblk_range[i];
1da177e4
LT
50 if (nd->start == nd->end)
51 continue;
52 if (nd->end > start && nd->start < end)
6ec6e0d9 53 return memblk_nodeid[i];
1da177e4 54 if (nd->end == end && nd->start == start)
6ec6e0d9 55 return memblk_nodeid[i];
1da177e4
LT
56 }
57 return -1;
58}
59
60static __init void cutoff_node(int i, unsigned long start, unsigned long end)
61{
abe059e7 62 struct bootnode *nd = &nodes[i];
68a3a7fe 63
1da177e4
LT
64 if (nd->start < start) {
65 nd->start = start;
66 if (nd->end < nd->start)
67 nd->start = nd->end;
68 }
69 if (nd->end > end) {
1da177e4
LT
70 nd->end = end;
71 if (nd->start > nd->end)
72 nd->start = nd->end;
73 }
74}
75
76static __init void bad_srat(void)
77{
2bce2b54 78 int i;
1da177e4
LT
79 printk(KERN_ERR "SRAT: SRAT not used.\n");
80 acpi_numa = -1;
2bce2b54
AK
81 for (i = 0; i < MAX_LOCAL_APIC; i++)
82 apicid_to_node[i] = NUMA_NO_NODE;
429b2b31
AK
83 for (i = 0; i < MAX_NUMNODES; i++) {
84 nodes[i].start = nodes[i].end = 0;
85 nodes_add[i].start = nodes_add[i].end = 0;
86 }
5cb248ab 87 remove_all_active_ranges();
1da177e4
LT
88}
89
90static __init inline int srat_disabled(void)
91{
92 return numa_off || acpi_numa < 0;
93}
94
95/* Callback for SLIT parsing */
96void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
97{
f302a5bb
YL
98 unsigned length;
99 unsigned long phys;
100
101 length = slit->header.length;
a9ce6bc1 102 phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
f302a5bb
YL
103 PAGE_SIZE);
104
a9ce6bc1 105 if (phys == MEMBLOCK_ERROR)
f302a5bb
YL
106 panic(" Can not save slit!\n");
107
108 acpi_slit = __va(phys);
109 memcpy(acpi_slit, slit, length);
a9ce6bc1 110 memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT");
1da177e4
LT
111}
112
7237d3de
SS
113/* Callback for Proximity Domain -> x2APIC mapping */
114void __init
115acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
116{
117 int pxm, node;
118 int apic_id;
119
120 if (srat_disabled())
121 return;
122 if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
123 bad_srat();
124 return;
125 }
126 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
127 return;
128 pxm = pa->proximity_domain;
129 node = setup_node(pxm);
130 if (node < 0) {
131 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
132 bad_srat();
133 return;
134 }
135
136 apic_id = pa->apic_id;
d3bd0588
YL
137 if (apic_id >= MAX_LOCAL_APIC) {
138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
139 return;
140 }
7237d3de 141 apicid_to_node[apic_id] = node;
dc098551 142 node_set(node, cpu_nodes_parsed);
7237d3de 143 acpi_numa = 1;
163d3866 144 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
7237d3de
SS
145 pxm, apic_id, node);
146}
147
1da177e4
LT
148/* Callback for Proximity Domain -> LAPIC mapping */
149void __init
15a58ed1 150acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
1da177e4
LT
151{
152 int pxm, node;
ef97001f 153 int apic_id;
154
d22fe808
AK
155 if (srat_disabled())
156 return;
15a58ed1 157 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
fad7906d 158 bad_srat();
d22fe808
AK
159 return;
160 }
15a58ed1 161 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
1da177e4 162 return;
15a58ed1 163 pxm = pa->proximity_domain_lo;
1da177e4
LT
164 node = setup_node(pxm);
165 if (node < 0) {
166 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
167 bad_srat();
168 return;
169 }
beafe91f 170
2e42060c 171 if (get_uv_system_type() >= UV_X2APIC)
a65d1d64
JS
172 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
173 else
174 apic_id = pa->apic_id;
d3bd0588
YL
175
176 if (apic_id >= MAX_LOCAL_APIC) {
177 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
178 return;
179 }
180
ef97001f 181 apicid_to_node[apic_id] = node;
dc098551 182 node_set(node, cpu_nodes_parsed);
1da177e4 183 acpi_numa = 1;
163d3866 184 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
ef97001f 185 pxm, apic_id, node);
1da177e4
LT
186}
187
71efa8fd
KM
/*
 * Whether hot-pluggable memory areas from the SRAT should be recorded:
 * 1 with CONFIG_MEMORY_HOTPLUG_SPARSE, 0 otherwise.
 */
static inline int save_add_info(void)
{
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
	return 1;
#else
	return 0;
#endif
}
68a3a7fe 193/*
888a589f
YL
194 * Update nodes_add[]
195 * This code supports one contiguous hot add area per node
68a3a7fe 196 */
888a589f
YL
197static void __init
198update_nodes_add(int node, unsigned long start, unsigned long end)
68a3a7fe
AK
199{
200 unsigned long s_pfn = start >> PAGE_SHIFT;
201 unsigned long e_pfn = end >> PAGE_SHIFT;
888a589f 202 int changed = 0;
68a3a7fe
AK
203 struct bootnode *nd = &nodes_add[node];
204
205 /* I had some trouble with strange memory hotadd regions breaking
206 the boot. Be very strict here and reject anything unexpected.
207 If you want working memory hotadd write correct SRATs.
208
209 The node size check is a basic sanity check to guard against
210 mistakes */
211 if ((signed long)(end - start) < NODE_MIN_SIZE) {
212 printk(KERN_ERR "SRAT: Hotplug area too small\n");
888a589f 213 return;
68a3a7fe
AK
214 }
215
216 /* This check might be a bit too strict, but I'm keeping it for now. */
5cb248ab 217 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
9c7cd687
MG
218 printk(KERN_ERR
219 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
220 s_pfn, e_pfn);
888a589f 221 return;
68a3a7fe
AK
222 }
223
224 /* Looks good */
225
68a3a7fe 226 if (nd->start == nd->end) {
15a58ed1
AS
227 nd->start = start;
228 nd->end = end;
68a3a7fe 229 changed = 1;
15a58ed1
AS
230 } else {
231 if (nd->start == end) {
232 nd->start = start;
68a3a7fe
AK
233 changed = 1;
234 }
15a58ed1
AS
235 if (nd->end == start) {
236 nd->end = end;
68a3a7fe
AK
237 changed = 1;
238 }
239 if (!changed)
240 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
15a58ed1 241 }
68a3a7fe 242
3a5fc0e4
DR
243 if (changed) {
244 node_set(node, cpu_nodes_parsed);
888a589f
YL
245 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
246 nd->start, nd->end);
3a5fc0e4 247 }
68a3a7fe 248}
68a3a7fe 249
1da177e4
LT
250/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
251void __init
15a58ed1 252acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
1da177e4 253{
68a3a7fe 254 struct bootnode *nd, oldnode;
1da177e4
LT
255 unsigned long start, end;
256 int node, pxm;
257 int i;
258
d22fe808 259 if (srat_disabled())
1da177e4 260 return;
15a58ed1 261 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
d22fe808
AK
262 bad_srat();
263 return;
264 }
15a58ed1 265 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
d22fe808 266 return;
15a58ed1
AS
267
268 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
68a3a7fe 269 return;
15a58ed1
AS
270 start = ma->base_address;
271 end = start + ma->length;
1da177e4
LT
272 pxm = ma->proximity_domain;
273 node = setup_node(pxm);
274 if (node < 0) {
275 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
276 bad_srat();
277 return;
278 }
6ec6e0d9 279 i = conflicting_memblks(start, end);
05d1fa4b
AK
280 if (i == node) {
281 printk(KERN_WARNING
282 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
283 pxm, start, end, nodes[i].start, nodes[i].end);
284 } else if (i >= 0) {
1da177e4 285 printk(KERN_ERR
05d1fa4b
AK
286 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
287 pxm, start, end, node_to_pxm(i),
288 nodes[i].start, nodes[i].end);
1da177e4
LT
289 bad_srat();
290 return;
291 }
292 nd = &nodes[node];
68a3a7fe 293 oldnode = *nd;
1da177e4
LT
294 if (!node_test_and_set(node, nodes_parsed)) {
295 nd->start = start;
296 nd->end = end;
297 } else {
298 if (start < nd->start)
299 nd->start = start;
300 if (nd->end < end)
301 nd->end = end;
302 }
68a3a7fe 303
6ec6e0d9
SS
304 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
305 start, end);
68a3a7fe 306
888a589f
YL
307 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
308 update_nodes_add(node, start, end);
309 /* restore nodes[node] */
68a3a7fe
AK
310 *nd = oldnode;
311 if ((nd->start | nd->end) == 0)
312 node_clear(node, nodes_parsed);
313 }
6ec6e0d9
SS
314
315 node_memblk_range[num_node_memblks].start = start;
316 node_memblk_range[num_node_memblks].end = end;
317 memblk_nodeid[num_node_memblks] = node;
318 num_node_memblks++;
1da177e4
LT
319}
320
8a6fdd3e
AK
321/* Sanity check to catch more bad SRATs (they are amazingly common).
322 Make sure the PXMs cover all memory. */
3484d798 323static int __init nodes_cover_memory(const struct bootnode *nodes)
8a6fdd3e
AK
324{
325 int i;
326 unsigned long pxmram, e820ram;
327
328 pxmram = 0;
329 for_each_node_mask(i, nodes_parsed) {
330 unsigned long s = nodes[i].start >> PAGE_SHIFT;
331 unsigned long e = nodes[i].end >> PAGE_SHIFT;
332 pxmram += e - s;
32996250 333 pxmram -= __absent_pages_in_range(i, s, e);
68a3a7fe
AK
334 if ((long)pxmram < 0)
335 pxmram = 0;
8a6fdd3e
AK
336 }
337
a9ce6bc1 338 e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
0964b056
YL
339 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
340 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
8a6fdd3e
AK
341 printk(KERN_ERR
342 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
343 (pxmram << PAGE_SHIFT) >> 20,
344 (e820ram << PAGE_SHIFT) >> 20);
345 return 0;
346 }
347 return 1;
348}
349
1da177e4
LT
350void __init acpi_numa_arch_fixup(void) {}
351
8716273c
DR
352int __init acpi_get_nodes(struct bootnode *physnodes)
353{
354 int i;
355 int ret = 0;
356
357 for_each_node_mask(i, nodes_parsed) {
358 physnodes[ret].start = nodes[i].start;
359 physnodes[ret].end = nodes[i].end;
360 ret++;
361 }
362 return ret;
363}
364
1da177e4
LT
365/* Use the information discovered above to actually set up the nodes. */
366int __init acpi_scan_nodes(unsigned long start, unsigned long end)
367{
368 int i;
8a6fdd3e 369
ae2c6dcf
DR
370 if (acpi_numa <= 0)
371 return -1;
372
e58e0d03 373 /* First clean up the node list */
7c43769a 374 for (i = 0; i < MAX_NUMNODES; i++)
15a58ed1 375 cutoff_node(i, start, end);
e58e0d03 376
2e618786
JB
377 /*
378 * Join together blocks on the same node, holes between
379 * which don't overlap with memory on other nodes.
380 */
381 for (i = 0; i < num_node_memblks; ++i) {
382 int j, k;
383
384 for (j = i + 1; j < num_node_memblks; ++j) {
385 unsigned long start, end;
386
387 if (memblk_nodeid[i] != memblk_nodeid[j])
388 continue;
389 start = min(node_memblk_range[i].end,
390 node_memblk_range[j].end);
391 end = max(node_memblk_range[i].start,
392 node_memblk_range[j].start);
393 for (k = 0; k < num_node_memblks; ++k) {
394 if (memblk_nodeid[i] == memblk_nodeid[k])
395 continue;
396 if (start < node_memblk_range[k].end &&
397 end > node_memblk_range[k].start)
398 break;
399 }
400 if (k < num_node_memblks)
401 continue;
402 start = min(node_memblk_range[i].start,
403 node_memblk_range[j].start);
404 end = max(node_memblk_range[i].end,
405 node_memblk_range[j].end);
406 printk(KERN_INFO "SRAT: Node %d "
407 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
408 memblk_nodeid[i],
409 node_memblk_range[i].start,
410 node_memblk_range[i].end,
411 node_memblk_range[j].start,
412 node_memblk_range[j].end,
413 start, end);
414 node_memblk_range[i].start = start;
415 node_memblk_range[i].end = end;
416 k = --num_node_memblks - j;
417 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
418 k * sizeof(*memblk_nodeid));
419 memmove(node_memblk_range + j, node_memblk_range + j+1,
420 k * sizeof(*node_memblk_range));
421 --j;
422 }
423 }
424
6ec6e0d9
SS
425 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
426 memblk_nodeid);
1da177e4
LT
427 if (memnode_shift < 0) {
428 printk(KERN_ERR
429 "SRAT: No NUMA node hash function found. Contact maintainer\n");
430 bad_srat();
431 return -1;
432 }
e58e0d03 433
73cf624d 434 for (i = 0; i < num_node_memblks; i++)
8e4029ee 435 memblock_x86_register_active_regions(memblk_nodeid[i],
73cf624d
YL
436 node_memblk_range[i].start >> PAGE_SHIFT,
437 node_memblk_range[i].end >> PAGE_SHIFT);
438
32996250
YL
439 /* for out of order entries in SRAT */
440 sort_node_map();
8716273c
DR
441 if (!nodes_cover_memory(nodes)) {
442 bad_srat();
443 return -1;
444 }
445
dc098551
JS
446 /* Account for nodes with cpus and no memory */
447 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
e3f1caee 448
e58e0d03 449 /* Finally register nodes */
e3f1caee 450 for_each_node_mask(i, node_possible_map)
1da177e4 451 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
a8062231
AK
452 /* Try again in case setup_node_bootmem missed one due
453 to missing bootmem */
e3f1caee 454 for_each_node_mask(i, node_possible_map)
a8062231
AK
455 if (!node_online(i))
456 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
457
168ef543 458 for (i = 0; i < nr_cpu_ids; i++) {
0164fe16
MT
459 int node = early_cpu_to_node(i);
460
834beda1 461 if (node == NUMA_NO_NODE)
1da177e4 462 continue;
7c43769a 463 if (!node_online(node))
23ca4bba 464 numa_clear_node(i);
1da177e4
LT
465 }
466 numa_init_array();
467 return 0;
468}
469
3484d798 470#ifdef CONFIG_NUMA_EMU
ef97001f 471static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
472 [0 ... MAX_NUMNODES-1] = PXM_INVAL
473};
602a54a8 474static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
ef97001f 475 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
476};
3484d798
DR
477static int __init find_node_by_addr(unsigned long addr)
478{
479 int ret = NUMA_NO_NODE;
480 int i;
481
482 for_each_node_mask(i, nodes_parsed) {
483 /*
484 * Find the real node that this emulated node appears on. For
485 * the sake of simplicity, we only use a real node's starting
486 * address to determine which emulated node it appears on.
487 */
488 if (addr >= nodes[i].start && addr < nodes[i].end) {
489 ret = i;
490 break;
491 }
492 }
9a1b62fe 493 return ret;
3484d798
DR
494}
495
496/*
497 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
498 * mappings that respect the real ACPI topology but reflect our emulated
499 * environment. For each emulated node, we find which real node it appears on
500 * and create PXM to NID mappings for those fake nodes which mirror that
501 * locality. SLIT will now represent the correct distances between emulated
502 * nodes as a result of the real topology.
503 */
504void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
505{
08705b89 506 int i, j;
3484d798
DR
507
508 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
509 "topology.\n");
510 for (i = 0; i < num_nodes; i++) {
511 int nid, pxm;
512
513 nid = find_node_by_addr(fake_nodes[i].start);
514 if (nid == NUMA_NO_NODE)
515 continue;
516 pxm = node_to_pxm(nid);
517 if (pxm == PXM_INVAL)
518 continue;
519 fake_node_to_pxm_map[i] = pxm;
08705b89
DR
520 /*
521 * For each apicid_to_node mapping that exists for this real
522 * node, it must now point to the fake node ID.
523 */
524 for (j = 0; j < MAX_LOCAL_APIC; j++)
b0c4d952
DR
525 if (apicid_to_node[j] == nid &&
526 fake_apicid_to_node[j] == NUMA_NO_NODE)
08705b89 527 fake_apicid_to_node[j] = i;
3484d798
DR
528 }
529 for (i = 0; i < num_nodes; i++)
530 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
08705b89 531 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
3484d798
DR
532
533 nodes_clear(nodes_parsed);
534 for (i = 0; i < num_nodes; i++)
535 if (fake_nodes[i].start != fake_nodes[i].end)
536 node_set(i, nodes_parsed);
3484d798
DR
537}
538
/*
 * Without a SLIT, two emulated nodes count as local to each other when they
 * share a proximity domain.  Returns non-zero for "local".
 */
static int null_slit_node_compare(int a, int b)
{
	int pxm_a = node_to_pxm(a);
	int pxm_b = node_to_pxm(b);

	return pxm_a == pxm_b;
}
543#else
/* Without a SLIT, a node is local only to itself.  Returns 1 for "local". */
static int null_slit_node_compare(int a, int b)
{
	if (a != b)
		return 0;

	return 1;
}
548#endif /* CONFIG_NUMA_EMU */
549
1da177e4
LT
550int __node_distance(int a, int b)
551{
552 int index;
553
554 if (!acpi_slit)
3484d798
DR
555 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
556 REMOTE_DISTANCE;
15a58ed1 557 index = acpi_slit->locality_count * node_to_pxm(a);
1da177e4
LT
558 return acpi_slit->entry[index + node_to_pxm(b)];
559}
560
561EXPORT_SYMBOL(__node_distance);
4942e998 562
6a1673ae 563#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
4942e998
KM
564int memory_add_physaddr_to_nid(u64 start)
565{
566 int i, ret = 0;
567
568 for_each_node(i)
569 if (nodes_add[i].start <= start && nodes_add[i].end > start)
570 ret = i;
571
572 return ret;
573}
8c2676a5 574EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
6a1673ae 575#endif
This page took 0.606298 seconds and 5 git commands to generate.