[PATCH] utilization of kprobe_mutex is incorrect on x86_64
[deliverable/linux.git] / arch / x86_64 / mm / numa.c
CommitLineData
1da177e4
LT
1/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
/* Per-node pg_data_t pointers, indexed by node id; filled in during boot
   by setup_node_bootmem(). */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
/* Static bootmem bookkeeping, one slot per possible node. */
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

/* Hash shift and lookup table used by phys_to_nid(): a physical address
   right-shifted by memnode_shift indexes memnodemap[] to get its node. */
int memnode_shift;
u8 memnodemap[NODEMAPSIZE];

/* CPU -> node mapping; NUMA_NO_NODE until assigned by detection code or
   numa_init_array(). */
unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
/* Local APIC id -> node mapping discovered from firmware tables. */
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
/* Reverse mapping: which CPUs belong to each node. */
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

/* Set by the "numa=off" command line option. */
int numa_off __initdata;
41int __init compute_hash_shift(struct node *nodes, int numnodes)
42{
43 int i;
b684664f
KM
44 int shift = 20;
45 unsigned long addr,maxend=0;
1da177e4 46
b684664f
KM
47 for (i = 0; i < numnodes; i++)
48 if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
49 maxend = nodes[i].end;
50
51 while ((1UL << shift) < (maxend / NODEMAPSIZE))
52 shift++;
53
54 printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
55 shift,maxend);
56 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
57 for (i = 0; i < numnodes; i++) {
58 if (nodes[i].start == nodes[i].end)
59 continue;
60 for (addr = nodes[i].start;
61 addr < nodes[i].end;
62 addr += (1UL << shift)) {
63 if (memnodemap[addr >> shift] != 0xff) {
64 printk(KERN_INFO
65 "Your memory is not aligned you need to rebuild your kernel "
66 "with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
67 shift,addr);
68 return -1;
1da177e4 69 }
b684664f 70 memnodemap[addr >> shift] = i;
1da177e4 71 }
1da177e4 72 }
b684664f 73 return shift;
1da177e4
LT
74}
75
bbfceef4
MT
#ifdef CONFIG_SPARSEMEM
/* Early boot pfn -> node id lookup, via the memnodemap hash. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long paddr = pfn << PAGE_SHIFT;

	return phys_to_nid(paddr);
}
#endif
82
1da177e4
LT
/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
	unsigned long nodedata_phys;
	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

	start = round_up(start, ZONE_ALIGN);

	printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	/* Record which pfns exist on this node (memory model bookkeeping). */
	memory_present(nodeid, start_pfn, end_pfn);

	/* Carve the pg_data_t out of the node's own memory range via e820,
	   so the per-node data lives on the node it describes. */
	nodedata_phys = find_e820_area(start, end, pgdat_size);
	if (nodedata_phys == -1L)
		panic("Cannot find memory pgdat in node %d\n", nodeid);

	Dprintk("nodedata_phys %lx\n", nodedata_phys);

	node_data[nodeid] = phys_to_virt(nodedata_phys);
	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

	/* Find a place for the bootmem map */
	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	/* Search just past the pg_data_t so both allocations stay local. */
	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
	bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
	if (bootmap_start == -1L)
		panic("Not enough continuous space for bootmap on node %d", nodeid);
	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					bootmap_start >> PAGE_SHIFT,
					start_pfn, end_pfn);

	/* Register the node's usable RAM (per the e820 map) with bootmem. */
	e820_bootmem_free(NODE_DATA(nodeid), start, end);

	/* Reserve the memory the allocator's own metadata occupies, so it
	   is never handed out. */
	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
	node_set_online(nodeid);
}
128
129/* Initialize final allocator for a zone */
130void __init setup_node_zones(int nodeid)
131{
132 unsigned long start_pfn, end_pfn;
133 unsigned long zones[MAX_NR_ZONES];
485761bd 134 unsigned long holes[MAX_NR_ZONES];
1da177e4
LT
135 unsigned long dma_end_pfn;
136
137 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
485761bd 138 memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
1da177e4
LT
139
140 start_pfn = node_start_pfn(nodeid);
141 end_pfn = node_end_pfn(nodeid);
142
143 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
144
145 /* All nodes > 0 have a zero length zone DMA */
146 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
147 if (start_pfn < dma_end_pfn) {
148 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
485761bd 149 holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
1da177e4 150 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
485761bd
AK
151 holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
152
1da177e4
LT
153 } else {
154 zones[ZONE_NORMAL] = end_pfn - start_pfn;
485761bd 155 holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
1da177e4
LT
156 }
157
158 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
485761bd 159 start_pfn, holes);
1da177e4
LT
160}
161
162void __init numa_init_array(void)
163{
164 int rr, i;
165 /* There are unfortunately some poorly designed mainboards around
166 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
167 mapping. To avoid this fill in the mapping for all possible
168 CPUs, as the number of CPUs is not known yet.
169 We round robin the existing nodes. */
170 rr = 0;
171 for (i = 0; i < NR_CPUS; i++) {
172 if (cpu_to_node[i] != NUMA_NO_NODE)
173 continue;
174 rr = next_node(rr, node_online_map);
175 if (rr == MAX_NUMNODES)
176 rr = first_node(node_online_map);
177 cpu_to_node[i] = rr;
178 rr++;
179 }
180
181 set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
182}
183
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via "numa=fake=N"; 0 = disabled. */
int numa_fake __initdata = 0;

/* Numa emulation */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	/* Nominal per-node size: an even split of all memory. */
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: round the node size down
	   to a power of two so compute_hash_shift() can cover the nodes
	   with an aligned granule. */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node absorbs whatever remainder is left after
		   the power-of-two rounding above. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		if (i != numa_fake-1)
			nodes[i].end--;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
230
/*
 * Detect and initialize the NUMA memory topology.  Tries, in order:
 * NUMA emulation, ACPI SRAT scanning, and the K8 northbridge registers;
 * if all fail (or NUMA is off) it falls back to one flat node covering
 * all memory.
 */
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

#ifdef CONFIG_NUMA_EMU
	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
		return;
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  end_pfn << PAGE_SHIFT))
		return;
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
		return;
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       end_pfn << PAGE_SHIFT);
	/* setup dummy node covering all memory */
	memnode_shift = 63;	/* every address hashes to map slot 0 */
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	for (i = 0; i < NR_CPUS; i++)
		cpu_to_node[i] = 0;
	node_to_cpumask[0] = cpumask_of_cpu(0);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
266
e6982c67 267__cpuinit void numa_add_cpu(int cpu)
1da177e4
LT
268{
269 /* BP is initialized elsewhere */
270 if (cpu)
271 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
272}
273
274unsigned long __init numa_free_all_bootmem(void)
275{
276 int i;
277 unsigned long pages = 0;
278 for_each_online_node(i) {
279 pages += free_all_bootmem_node(NODE_DATA(i));
280 }
281 return pages;
282}
283
284void __init paging_init(void)
285{
286 int i;
287 for_each_online_node(i) {
288 setup_node_zones(i);
289 }
290}
291
292/* [numa=off] */
293__init int numa_setup(char *opt)
294{
295 if (!strncmp(opt,"off",3))
296 numa_off = 1;
297#ifdef CONFIG_NUMA_EMU
298 if(!strncmp(opt, "fake=", 5)) {
299 numa_fake = simple_strtoul(opt+5,NULL,0); ;
300 if (numa_fake >= MAX_NUMNODES)
301 numa_fake = MAX_NUMNODES;
302 }
303#endif
304#ifdef CONFIG_ACPI_NUMA
305 if (!strncmp(opt,"noacpi",6))
306 acpi_numa = -1;
307#endif
308 return 1;
309}
310
/* Export the NUMA topology tables for use by modules. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);
This page took 0.117909 seconds and 5 git commands to generate.