[PATCH] mm: use symbolic names instead of indices for zone initialisation
arch/x86_64/mm/numa.c
/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

struct memnode memnode;

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;

/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init
populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
{
	int i;
	int res = -1;
	unsigned long addr, end;

	if (shift >= 64)
		return -1;
	memset(memnodemap, 0xff, sizeof(memnodemap));
	for (i = 0; i < numnodes; i++) {
		addr = nodes[i].start;
		end = nodes[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= NODEMAPSIZE)
			return 0;
		do {
			if (memnodemap[addr >> shift] != 0xff)
				return -1;
			memnodemap[addr >> shift] = i;
			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}

int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
{
	int shift = 20;

	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
		shift++;

	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
		shift);

	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
		printk(KERN_INFO
			"Your memory is not aligned; you need to rebuild your kernel "
			"with a bigger NODEMAPSIZE. shift=%d\n",
			shift);
		return -1;
	}
	return shift;
}
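
/*
 * Illustrative sketch (not part of this file): once memnode_shift and
 * memnodemap[] have been populated, the node owning a physical address
 * falls out of a single shift plus a table lookup, which is essentially
 * what phys_to_nid() does:
 *
 *	static inline int example_phys_to_nid(unsigned long addr)
 *	{
 *		return memnodemap[addr >> memnode_shift];
 *	}
 */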

#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif

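/*
 * Allocate early per-node memory: first try a free range inside the
 * node's own [start, end) window via the e820 map; if that fails, fall
 * back to generic bootmem above MAX_DMA_ADDRESS, which may place the
 * allocation off-node but keeps the boot going.
 */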
static void * __init
early_node_mem(int nodeid, unsigned long start, unsigned long end,
	       unsigned long size)
{
	unsigned long mem = find_e820_area(start, end, size);
	void *ptr;

	if (mem != -1L)
		return __va(mem);
	ptr = __alloc_bootmem_nopanic(size,
			SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
	if (ptr == NULL) {
		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
			size, nodeid);
		return NULL;
	}
	return ptr;
}

/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
	unsigned long nodedata_phys;
	void *bootmap;
	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

	start = round_up(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

	/* Find a place for the bootmem map */
	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
	bootmap = early_node_mem(nodeid, bootmap_start, end,
				 bootmap_pages<<PAGE_SHIFT);
	if (bootmap == NULL) {
		if (nodedata_phys < start || nodedata_phys >= end)
			free_bootmem((unsigned long)node_data[nodeid], pgdat_size);
		node_data[nodeid] = NULL;
		return;
	}
	bootmap_start = __pa(bootmap);
	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, end_pfn);

	free_bootmem_with_active_regions(nodeid, end);

	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
#ifdef CONFIG_ACPI_NUMA
	srat_reserve_add_area(nodeid);
#endif
	node_set_online(nodeid);
}
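
/*
 * Typical resulting layout at the start of a node (illustrative; either
 * range can be pushed elsewhere by the early_node_mem() fallback):
 *
 *	node start (rounded up to ZONE_ALIGN)
 *	+------------------+  <- nodedata_phys
 *	| pg_data_t        |  pgdat_size, page aligned
 *	+------------------+  <- bootmap_start
 *	| bootmem bitmap   |  bootmap_pages << PAGE_SHIFT
 *	+------------------+
 *	| node memory ...  |
 *
 * Both ranges are reserved above so the bootmem allocator never hands
 * them out again.
 */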

/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
	unsigned long start_pfn, end_pfn, memmapsize, limit;

	start_pfn = node_start_pfn(nodeid);
	end_pfn = node_end_pfn(nodeid);

	Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
		nodeid, start_pfn, end_pfn);

	/* Try to allocate mem_map at the end to not fill up precious
	   <4GB memory. */
	memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
	limit = end_pfn << PAGE_SHIFT;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	NODE_DATA(nodeid)->node_mem_map =
		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
				memmapsize, SMP_CACHE_BYTES,
				round_down(limit - memmapsize, PAGE_SIZE),
				limit);
#endif
}

void __init numa_init_array(void)
{
	int rr, i;
	/* There are unfortunately some poorly designed mainboards around
	   that only connect memory to a single CPU. This breaks the 1:1
	   cpu->node mapping. To avoid this, fill in the mapping for all
	   possible CPUs, as the number of CPUs is not known yet. We
	   round robin the existing nodes. */
	rr = first_node(node_online_map);
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}
}

#ifdef CONFIG_NUMA_EMU
int numa_fake __initdata = 0;

/* Numa emulation */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct bootnode nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: round the node size down
	   to a power of two. */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}
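
	/*
	 * Worked example (illustrative): splitting 4GB across
	 * numa_fake = 3 gives sz of about 1365MB, which is not a power
	 * of two, so the loop above rounds it down to 1024MB.  Nodes
	 * then start on 1UL << shift boundaries that
	 * populate_memnodemap() can describe; the final node absorbs
	 * the remainder (the i == numa_fake-1 case below).
	 */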

	memset(&nodes, 0, sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i) {
		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
						nodes[i].end >> PAGE_SHIFT);
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	numa_init_array();
	return 0;
}
#endif
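
/*
 * Top-level NUMA setup.  Detection methods are tried in order: NUMA
 * emulation (if requested on the command line), ACPI SRAT parsing,
 * then the K8 northbridge registers; if all of them fail, or numa=off
 * was given, a single dummy node covering all memory is used instead.
 */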
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

#ifdef CONFIG_NUMA_EMU
	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
		return;
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  end_pfn << PAGE_SHIFT))
		return;
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
		return;
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       end_pfn << PAGE_SHIFT);
	/* setup dummy node covering all memory */
	memnode_shift = 63;
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	for (i = 0; i < NR_CPUS; i++)
		numa_set_node(i, 0);
	node_to_cpumask[0] = cpumask_of_cpu(0);
	e820_register_active_regions(0, start_pfn, end_pfn);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}

__cpuinit void numa_add_cpu(int cpu)
{
	cpu_set(cpu, node_to_cpumask[cpu_to_node(cpu)]);
}

void __cpuinit numa_set_node(int cpu, int node)
{
	cpu_pda(cpu)->nodenumber = node;
	cpu_to_node[cpu] = node;
}

unsigned long __init numa_free_all_bootmem(void)
{
	int i;
	unsigned long pages = 0;

	for_each_online_node(i) {
		pages += free_all_bootmem_node(NODE_DATA(i));
	}
	return pages;
}

#ifdef CONFIG_SPARSEMEM
static void __init arch_sparse_init(void)
{
	int i;

	for_each_online_node(i)
		memory_present(i, node_start_pfn(i), node_end_pfn(i));

	sparse_init();
}
#else
#define arch_sparse_init() do {} while (0)
#endif

void __init paging_init(void)
{
	int i;
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	arch_sparse_init();

	for_each_online_node(i) {
		setup_node_zones(i);
	}

	free_area_init_nodes(max_zone_pfns);
}
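
/*
 * Note: max_zone_pfns[] in paging_init() is indexed with the symbolic
 * zone names (ZONE_DMA, ZONE_DMA32, ZONE_NORMAL) rather than the raw
 * indices 0, 1 and 2 they currently map to, so the initialisation
 * stays correct if the zone layout ever changes.
 */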

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
#ifdef CONFIG_NUMA_EMU
	if (!strncmp(opt, "fake=", 5)) {
		numa_fake = simple_strtoul(opt+5, NULL, 0);
		if (numa_fake >= MAX_NUMNODES)
			numa_fake = MAX_NUMNODES;
	}
#endif
#ifdef CONFIG_ACPI_NUMA
	if (!strncmp(opt, "noacpi", 6))
		acpi_numa = -1;
	if (!strncmp(opt, "hotadd=", 7))
		hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
	return 0;
}

early_param("numa", numa_setup);
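
/*
 * Example command line usage handled by numa_setup() above:
 *
 *	numa=off	disable NUMA detection entirely
 *	numa=fake=4	emulate four NUMA nodes (CONFIG_NUMA_EMU)
 *	numa=noacpi	ignore the ACPI SRAT table (CONFIG_ACPI_NUMA)
 *	numa=hotadd=10	cap ACPI hot-add memory at the given
 *			percentage (CONFIG_ACPI_NUMA)
 */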

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and
 * apicid_to_node[] have valid entries for a CPU.  This means we skip
 * cpu_to_node[] initialisation for NUMA emulation and the fake node
 * case (when running a kernel compiled for NUMA on a non-NUMA box),
 * which is OK because cpu_to_node[] was already initialised in a
 * round-robin manner by numa_init_array() before this call, and that
 * initialisation is good enough for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		u8 apicid = x86_cpu_to_apicid[i];
		if (apicid == BAD_APICID)
			continue;
		if (apicid_to_node[apicid] == NUMA_NO_NODE)
			continue;
		numa_set_node(i, apicid_to_node[apicid]);
	}
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);

#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by pre-caching more state.
 * Should do that.
 */

int pfn_valid(unsigned long pfn)
{
	unsigned nid;

	if (pfn >= num_physpages)
		return 0;
	nid = pfn_to_nid(pfn);
	if (nid == 0xff)
		return 0;
	return pfn >= node_start_pfn(nid) && pfn < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif