Commit | Line | Data |
---|---|---|
5ead97c8 JF |
1 | /* |
2 | * Machine specific setup for xen | |
3 | * | |
4 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | |
5 | */ | |
6 | ||
7 | #include <linux/module.h> | |
8 | #include <linux/sched.h> | |
9 | #include <linux/mm.h> | |
10 | #include <linux/pm.h> | |
a9ce6bc1 | 11 | #include <linux/memblock.h> |
5ead97c8 JF |
12 | |
13 | #include <asm/elf.h> | |
6c3652ef | 14 | #include <asm/vdso.h> |
5ead97c8 JF |
15 | #include <asm/e820.h> |
16 | #include <asm/setup.h> | |
b792c755 | 17 | #include <asm/acpi.h> |
5ead97c8 JF |
18 | #include <asm/xen/hypervisor.h> |
19 | #include <asm/xen/hypercall.h> | |
20 | ||
45263cb0 | 21 | #include <xen/xen.h> |
8006ec3e | 22 | #include <xen/page.h> |
e2a81baf | 23 | #include <xen/interface/callback.h> |
35ae11fd | 24 | #include <xen/interface/memory.h> |
5ead97c8 JF |
25 | #include <xen/interface/physdev.h> |
26 | #include <xen/features.h> | |
27 | ||
28 | #include "xen-ops.h" | |
d2eea68e | 29 | #include "vdso.h" |
5ead97c8 JF |
30 | |
31 | /* These are code, but not functions. Defined in entry.S */ | |
32 | extern const char xen_hypervisor_callback[]; | |
33 | extern const char xen_failsafe_callback[]; | |
f63c2f24 T |
34 | extern void xen_sysenter_target(void); |
35 | extern void xen_syscall_target(void); | |
36 | extern void xen_syscall32_target(void); | |
5ead97c8 | 37 | |
42ee1471 JF |
38 | /* Amount of extra memory space we add to the e820 ranges */ |
39 | phys_addr_t xen_extra_mem_start, xen_extra_mem_size; | |
40 | ||
698bb8d1 JF |
41 | /* |
42 | * The maximum amount of extra memory compared to the base size. The | |
43 | * main scaling factor is the size of struct page. At extreme ratios | |
44 | * of base:extra, all the base memory can be filled with page | |
45 | * structures for the extra memory, leaving no space for anything | |
46 | * else. | |
47 | * | |
48 | * 10x seems like a reasonable balance between scaling flexibility and | |
49 | * leaving a practically usable system. | |
50 | */ | |
51 | #define EXTRA_MEM_RATIO (10) | |
52 | ||
ae15a3b4 | 53 | static void __init xen_add_extra_mem(unsigned long pages) |
42ee1471 | 54 | { |
6eaa412f KRW |
55 | unsigned long pfn; |
56 | ||
42ee1471 | 57 | u64 size = (u64)pages * PAGE_SIZE; |
3654581e | 58 | u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; |
42ee1471 JF |
59 | |
60 | if (!pages) | |
61 | return; | |
62 | ||
3654581e | 63 | e820_add_region(extra_start, size, E820_RAM); |
42ee1471 JF |
64 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
65 | ||
520045db | 66 | memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA"); |
42ee1471 JF |
67 | |
68 | xen_extra_mem_size += size; | |
2f7acb20 | 69 | |
3654581e | 70 | xen_max_p2m_pfn = PFN_DOWN(extra_start + size); |
6eaa412f KRW |
71 | |
72 | for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++) | |
73 | __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); | |
42ee1471 JF |
74 | } |
75 | ||
f89e048e JF |
76 | static unsigned long __init xen_release_chunk(phys_addr_t start_addr, |
77 | phys_addr_t end_addr) | |
093d7b46 MR |
78 | { |
79 | struct xen_memory_reservation reservation = { | |
80 | .address_bits = 0, | |
81 | .extent_order = 0, | |
82 | .domid = DOMID_SELF | |
83 | }; | |
093d7b46 | 84 | unsigned long start, end; |
f89e048e | 85 | unsigned long len = 0; |
093d7b46 MR |
86 | unsigned long pfn; |
87 | int ret; | |
88 | ||
89 | start = PFN_UP(start_addr); | |
f89e048e | 90 | end = PFN_DOWN(end_addr); |
093d7b46 MR |
91 | |
92 | if (end <= start) | |
93 | return 0; | |
94 | ||
f89e048e JF |
95 | printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", |
96 | start, end); | |
97 | for(pfn = start; pfn < end; pfn++) { | |
98 | unsigned long mfn = pfn_to_mfn(pfn); | |
99 | ||
100 | /* Make sure pfn exists to start with */ | |
101 | if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) | |
102 | continue; | |
103 | ||
104 | set_xen_guest_handle(reservation.extent_start, &mfn); | |
105 | reservation.nr_extents = 1; | |
106 | ||
107 | ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, | |
108 | &reservation); | |
109 | WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", | |
110 | start, end, ret); | |
111 | if (ret == 1) { | |
6eaa412f | 112 | __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); |
f89e048e JF |
113 | len++; |
114 | } | |
115 | } | |
8f3c5883 | 116 | printk(KERN_CONT "%lu pages freed\n", len); |
093d7b46 MR |
117 | |
118 | return len; | |
119 | } | |
120 | ||
f89e048e JF |
121 | static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, |
122 | const struct e820map *e820) | |
093d7b46 | 123 | { |
f89e048e | 124 | phys_addr_t max_addr = PFN_PHYS(max_pfn); |
9ec23a7f | 125 | phys_addr_t last_end = ISA_END_ADDRESS; |
093d7b46 MR |
126 | unsigned long released = 0; |
127 | int i; | |
128 | ||
9ec23a7f | 129 | /* Free any unused memory above the low 1Mbyte. */ |
f89e048e JF |
130 | for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { |
131 | phys_addr_t end = e820->map[i].addr; | |
132 | end = min(max_addr, end); | |
133 | ||
9ec23a7f IC |
134 | if (last_end < end) |
135 | released += xen_release_chunk(last_end, end); | |
136 | last_end = max(last_end, e820->map[i].addr + e820->map[i].size); | |
093d7b46 MR |
137 | } |
138 | ||
f89e048e JF |
139 | if (last_end < max_addr) |
140 | released += xen_release_chunk(last_end, max_addr); | |
093d7b46 | 141 | |
8f3c5883 | 142 | printk(KERN_INFO "released %lu pages of unused memory\n", released); |
093d7b46 MR |
143 | return released; |
144 | } | |
5ead97c8 | 145 | |
68df0da7 KRW |
146 | static unsigned long __init xen_set_identity(const struct e820entry *list, |
147 | ssize_t map_size) | |
148 | { | |
149 | phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS; | |
150 | phys_addr_t start_pci = last; | |
151 | const struct e820entry *entry; | |
152 | unsigned long identity = 0; | |
153 | int i; | |
154 | ||
155 | for (i = 0, entry = list; i < map_size; i++, entry++) { | |
156 | phys_addr_t start = entry->addr; | |
157 | phys_addr_t end = start + entry->size; | |
158 | ||
159 | if (start < last) | |
160 | start = last; | |
161 | ||
162 | if (end <= start) | |
163 | continue; | |
164 | ||
165 | /* Skip over the 1MB region. */ | |
166 | if (last > end) | |
167 | continue; | |
168 | ||
15bfc094 | 169 | if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) { |
68df0da7 KRW |
170 | if (start > start_pci) |
171 | identity += set_phys_range_identity( | |
172 | PFN_UP(start_pci), PFN_DOWN(start)); | |
173 | ||
174 | /* Without saving 'last' we would gooble RAM too | |
175 | * at the end of the loop. */ | |
176 | last = end; | |
177 | start_pci = end; | |
178 | continue; | |
179 | } | |
180 | start_pci = min(start, start_pci); | |
181 | last = end; | |
182 | } | |
183 | if (last > start_pci) | |
184 | identity += set_phys_range_identity( | |
185 | PFN_UP(start_pci), PFN_DOWN(last)); | |
186 | return identity; | |
187 | } | |
5ead97c8 JF |
188 | /** |
189 | * machine_specific_memory_setup - Hook for machine specific memory setup. | |
190 | **/ | |
5ead97c8 JF |
191 | char * __init xen_memory_setup(void) |
192 | { | |
35ae11fd | 193 | static struct e820entry map[E820MAX] __initdata; |
68df0da7 | 194 | static struct e820entry map_raw[E820MAX] __initdata; |
35ae11fd | 195 | |
5ead97c8 | 196 | unsigned long max_pfn = xen_start_info->nr_pages; |
35ae11fd IC |
197 | unsigned long long mem_end; |
198 | int rc; | |
199 | struct xen_memory_map memmap; | |
42ee1471 | 200 | unsigned long extra_pages = 0; |
698bb8d1 | 201 | unsigned long extra_limit; |
68df0da7 | 202 | unsigned long identity_pages = 0; |
35ae11fd | 203 | int i; |
9e9a5fcb | 204 | int op; |
5ead97c8 | 205 | |
8006ec3e | 206 | max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); |
35ae11fd IC |
207 | mem_end = PFN_PHYS(max_pfn); |
208 | ||
209 | memmap.nr_entries = E820MAX; | |
210 | set_xen_guest_handle(memmap.buffer, map); | |
211 | ||
9e9a5fcb IC |
212 | op = xen_initial_domain() ? |
213 | XENMEM_machine_memory_map : | |
214 | XENMEM_memory_map; | |
215 | rc = HYPERVISOR_memory_op(op, &memmap); | |
35ae11fd | 216 | if (rc == -ENOSYS) { |
9ec23a7f | 217 | BUG_ON(xen_initial_domain()); |
35ae11fd IC |
218 | memmap.nr_entries = 1; |
219 | map[0].addr = 0ULL; | |
220 | map[0].size = mem_end; | |
221 | /* 8MB slack (to balance backend allocations). */ | |
222 | map[0].size += 8ULL << 20; | |
223 | map[0].type = E820_RAM; | |
224 | rc = 0; | |
225 | } | |
226 | BUG_ON(rc); | |
8006ec3e | 227 | |
68df0da7 | 228 | memcpy(map_raw, map, sizeof(map)); |
5ead97c8 | 229 | e820.nr_map = 0; |
42ee1471 | 230 | xen_extra_mem_start = mem_end; |
35ae11fd | 231 | for (i = 0; i < memmap.nr_entries; i++) { |
7cb31b75 | 232 | unsigned long long end; |
be5bf9fa | 233 | |
7cb31b75 SS |
234 | /* Guard against non-page aligned E820 entries. */ |
235 | if (map[i].type == E820_RAM) | |
236 | map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE; | |
237 | ||
238 | end = map[i].addr + map[i].size; | |
c2d08791 JF |
239 | if (map[i].type == E820_RAM && end > mem_end) { |
240 | /* RAM off the end - may be partially included */ | |
241 | u64 delta = min(map[i].size, end - mem_end); | |
42ee1471 | 242 | |
c2d08791 JF |
243 | map[i].size -= delta; |
244 | end -= delta; | |
3654581e | 245 | |
c2d08791 | 246 | extra_pages += PFN_DOWN(delta); |
2f14ddc3 ZF |
247 | /* |
248 | * Set RAM below 4GB that is not for us to be unusable. | |
249 | * This prevents "System RAM" address space from being | |
250 | * used as potential resource for I/O address (happens | |
251 | * when 'allocate_resource' is called). | |
252 | */ | |
86b32122 KRW |
253 | if (delta && |
254 | (xen_initial_domain() && end < 0x100000000ULL)) | |
2f14ddc3 | 255 | e820_add_region(end, delta, E820_UNUSABLE); |
3654581e JF |
256 | } |
257 | ||
c2d08791 | 258 | if (map[i].size > 0 && end > xen_extra_mem_start) |
36bc251b | 259 | xen_extra_mem_start = end; |
b5b43ced | 260 | |
c2d08791 JF |
261 | /* Add region if any remains */ |
262 | if (map[i].size > 0) | |
35ae11fd IC |
263 | e820_add_region(map[i].addr, map[i].size, map[i].type); |
264 | } | |
acd049c6 KRW |
265 | /* Align the balloon area so that max_low_pfn does not get set |
266 | * to be at the _end_ of the PCI gap at the far end (fee01000). | |
267 | * Note that xen_extra_mem_start gets set in the loop above to be | |
268 | * past the last E820 region. */ | |
269 | if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32))) | |
270 | xen_extra_mem_start = (1ULL<<32); | |
b792c755 JF |
271 | |
272 | /* | |
9ec23a7f IC |
273 | * In domU, the ISA region is normal, usable memory, but we |
274 | * reserve ISA memory anyway because too many things poke | |
b792c755 | 275 | * about in there. |
4ec5387c | 276 | * |
9ec23a7f IC |
277 | * In Dom0, the host E820 information can leave gaps in the |
278 | * ISA range, which would cause us to release those pages. To | |
279 | * avoid this, we unconditionally reserve them here. | |
b792c755 JF |
280 | */ |
281 | e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, | |
282 | E820_RESERVED); | |
5ead97c8 | 283 | |
be5bf9fa JF |
284 | /* |
285 | * Reserve Xen bits: | |
286 | * - mfn_list | |
287 | * - xen_start_info | |
288 | * See comment above "struct start_info" in <xen/interface/xen.h> | |
289 | */ | |
a9ce6bc1 | 290 | memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), |
6b2e8523 JF |
291 | __pa(xen_start_info->pt_base), |
292 | "XEN START INFO"); | |
be5bf9fa JF |
293 | |
294 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | |
295 | ||
42ee1471 JF |
296 | extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); |
297 | ||
698bb8d1 JF |
298 | /* |
299 | * Clamp the amount of extra memory to a EXTRA_MEM_RATIO | |
300 | * factor the base size. On non-highmem systems, the base | |
301 | * size is the full initial memory allocation; on highmem it | |
302 | * is limited to the max size of lowmem, so that it doesn't | |
303 | * get completely filled. | |
304 | * | |
305 | * In principle there could be a problem in lowmem systems if | |
306 | * the initial memory is also very large with respect to | |
307 | * lowmem, but we won't try to deal with that here. | |
308 | */ | |
309 | extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), | |
310 | max_pfn + extra_pages); | |
311 | ||
312 | if (extra_limit >= max_pfn) | |
313 | extra_pages = extra_limit - max_pfn; | |
314 | else | |
315 | extra_pages = 0; | |
316 | ||
d2a81713 | 317 | xen_add_extra_mem(extra_pages); |
093d7b46 | 318 | |
68df0da7 KRW |
319 | /* |
320 | * Set P2M for all non-RAM pages and E820 gaps to be identity | |
321 | * type PFNs. We supply it with the non-sanitized version | |
322 | * of the E820. | |
323 | */ | |
324 | identity_pages = xen_set_identity(map_raw, memmap.nr_entries); | |
325 | printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages); | |
5ead97c8 JF |
326 | return "Xen"; |
327 | } | |
328 | ||
d2eea68e RM |
329 | /* |
330 | * Set the bit indicating "nosegneg" library variants should be used. | |
6a52e4b1 JF |
331 | * We only need to bother in pure 32-bit mode; compat 32-bit processes |
332 | * can have un-truncated segments, so wrapping around is allowed. | |
d2eea68e | 333 | */ |
08b6d290 | 334 | static void __init fiddle_vdso(void) |
d2eea68e | 335 | { |
6a52e4b1 JF |
336 | #ifdef CONFIG_X86_32 |
337 | u32 *mask; | |
338 | mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK); | |
339 | *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; | |
340 | mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK); | |
d2eea68e | 341 | *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; |
6fcac6d3 | 342 | #endif |
d2eea68e RM |
343 | } |
344 | ||
ae15a3b4 | 345 | static int __cpuinit register_callback(unsigned type, const void *func) |
e2a81baf | 346 | { |
88459d4c JF |
347 | struct callback_register callback = { |
348 | .type = type, | |
349 | .address = XEN_CALLBACK(__KERNEL_CS, func), | |
e2a81baf JF |
350 | .flags = CALLBACKF_mask_events, |
351 | }; | |
352 | ||
88459d4c JF |
353 | return HYPERVISOR_callback_op(CALLBACKOP_register, &callback); |
354 | } | |
355 | ||
356 | void __cpuinit xen_enable_sysenter(void) | |
357 | { | |
6fcac6d3 | 358 | int ret; |
62541c37 | 359 | unsigned sysenter_feature; |
6fcac6d3 JF |
360 | |
361 | #ifdef CONFIG_X86_32 | |
62541c37 | 362 | sysenter_feature = X86_FEATURE_SEP; |
6fcac6d3 | 363 | #else |
62541c37 | 364 | sysenter_feature = X86_FEATURE_SYSENTER32; |
6fcac6d3 | 365 | #endif |
88459d4c | 366 | |
62541c37 JF |
367 | if (!boot_cpu_has(sysenter_feature)) |
368 | return; | |
369 | ||
6fcac6d3 | 370 | ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target); |
62541c37 JF |
371 | if(ret != 0) |
372 | setup_clear_cpu_cap(sysenter_feature); | |
e2a81baf JF |
373 | } |
374 | ||
6fcac6d3 JF |
375 | void __cpuinit xen_enable_syscall(void) |
376 | { | |
377 | #ifdef CONFIG_X86_64 | |
6fcac6d3 | 378 | int ret; |
6fcac6d3 JF |
379 | |
380 | ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); | |
381 | if (ret != 0) { | |
d5303b81 | 382 | printk(KERN_ERR "Failed to set syscall callback: %d\n", ret); |
62541c37 JF |
383 | /* Pretty fatal; 64-bit userspace has no other |
384 | mechanism for syscalls. */ | |
385 | } | |
386 | ||
387 | if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { | |
6fcac6d3 JF |
388 | ret = register_callback(CALLBACKTYPE_syscall32, |
389 | xen_syscall32_target); | |
d5303b81 | 390 | if (ret != 0) |
62541c37 | 391 | setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); |
6fcac6d3 JF |
392 | } |
393 | #endif /* CONFIG_X86_64 */ | |
394 | } | |
395 | ||
5ead97c8 JF |
396 | void __init xen_arch_setup(void) |
397 | { | |
f09f6d19 DD |
398 | xen_panic_handler_init(); |
399 | ||
5ead97c8 JF |
400 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); |
401 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); | |
402 | ||
403 | if (!xen_feature(XENFEAT_auto_translated_physmap)) | |
f63c2f24 T |
404 | HYPERVISOR_vm_assist(VMASST_CMD_enable, |
405 | VMASST_TYPE_pae_extended_cr3); | |
5ead97c8 | 406 | |
88459d4c JF |
407 | if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || |
408 | register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) | |
409 | BUG(); | |
5ead97c8 | 410 | |
e2a81baf | 411 | xen_enable_sysenter(); |
6fcac6d3 | 412 | xen_enable_syscall(); |
e2a81baf | 413 | |
5ead97c8 JF |
414 | #ifdef CONFIG_ACPI |
415 | if (!(xen_start_info->flags & SIF_INITDOMAIN)) { | |
416 | printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); | |
417 | disable_acpi(); | |
418 | } | |
419 | #endif | |
420 | ||
421 | memcpy(boot_command_line, xen_start_info->cmd_line, | |
422 | MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? | |
423 | COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); | |
424 | ||
bc15fde7 JF |
425 | /* Set up idle, making sure it calls safe_halt() pvop */ |
426 | #ifdef CONFIG_X86_32 | |
427 | boot_cpu_data.hlt_works_ok = 1; | |
428 | #endif | |
429 | pm_idle = default_idle; | |
23febedd | 430 | boot_option_idle_override = IDLE_HALT; |
f87e4cac | 431 | |
d2eea68e | 432 | fiddle_vdso(); |
5ead97c8 | 433 | } |