Commit | Line | Data |
---|---|---|
5ead97c8 JF |
1 | /* |
2 | * Machine specific setup for xen | |
3 | * | |
4 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | |
5 | */ | |
6 | ||
7 | #include <linux/module.h> | |
8 | #include <linux/sched.h> | |
9 | #include <linux/mm.h> | |
10 | #include <linux/pm.h> | |
a9ce6bc1 | 11 | #include <linux/memblock.h> |
5ead97c8 JF |
12 | |
13 | #include <asm/elf.h> | |
6c3652ef | 14 | #include <asm/vdso.h> |
5ead97c8 JF |
15 | #include <asm/e820.h> |
16 | #include <asm/setup.h> | |
b792c755 | 17 | #include <asm/acpi.h> |
5ead97c8 JF |
18 | #include <asm/xen/hypervisor.h> |
19 | #include <asm/xen/hypercall.h> | |
20 | ||
45263cb0 | 21 | #include <xen/xen.h> |
8006ec3e | 22 | #include <xen/page.h> |
e2a81baf | 23 | #include <xen/interface/callback.h> |
35ae11fd | 24 | #include <xen/interface/memory.h> |
5ead97c8 | 25 | #include <xen/interface/physdev.h> |
093d7b46 | 26 | #include <xen/interface/memory.h> |
5ead97c8 JF |
27 | #include <xen/features.h> |
28 | ||
29 | #include "xen-ops.h" | |
d2eea68e | 30 | #include "vdso.h" |
5ead97c8 JF |
31 | |
/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];

/* Entry points for sysenter/syscall, also defined in entry.S. */
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
phys_addr_t xen_extra_mem_start, xen_extra_mem_size;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO (10)
53 | ||
42ee1471 JF |
54 | static __init void xen_add_extra_mem(unsigned long pages) |
55 | { | |
56 | u64 size = (u64)pages * PAGE_SIZE; | |
3654581e | 57 | u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; |
42ee1471 JF |
58 | |
59 | if (!pages) | |
60 | return; | |
61 | ||
3654581e | 62 | e820_add_region(extra_start, size, E820_RAM); |
42ee1471 JF |
63 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
64 | ||
520045db | 65 | memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA"); |
42ee1471 JF |
66 | |
67 | xen_extra_mem_size += size; | |
2f7acb20 | 68 | |
3654581e | 69 | xen_max_p2m_pfn = PFN_DOWN(extra_start + size); |
42ee1471 JF |
70 | } |
71 | ||
f89e048e JF |
72 | static unsigned long __init xen_release_chunk(phys_addr_t start_addr, |
73 | phys_addr_t end_addr) | |
093d7b46 MR |
74 | { |
75 | struct xen_memory_reservation reservation = { | |
76 | .address_bits = 0, | |
77 | .extent_order = 0, | |
78 | .domid = DOMID_SELF | |
79 | }; | |
093d7b46 | 80 | unsigned long start, end; |
f89e048e | 81 | unsigned long len = 0; |
093d7b46 MR |
82 | unsigned long pfn; |
83 | int ret; | |
84 | ||
85 | start = PFN_UP(start_addr); | |
f89e048e | 86 | end = PFN_DOWN(end_addr); |
093d7b46 MR |
87 | |
88 | if (end <= start) | |
89 | return 0; | |
90 | ||
f89e048e JF |
91 | printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", |
92 | start, end); | |
93 | for(pfn = start; pfn < end; pfn++) { | |
94 | unsigned long mfn = pfn_to_mfn(pfn); | |
95 | ||
96 | /* Make sure pfn exists to start with */ | |
97 | if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) | |
98 | continue; | |
99 | ||
100 | set_xen_guest_handle(reservation.extent_start, &mfn); | |
101 | reservation.nr_extents = 1; | |
102 | ||
103 | ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, | |
104 | &reservation); | |
105 | WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", | |
106 | start, end, ret); | |
107 | if (ret == 1) { | |
108 | set_phys_to_machine(pfn, INVALID_P2M_ENTRY); | |
109 | len++; | |
110 | } | |
111 | } | |
112 | printk(KERN_CONT "%ld pages freed\n", len); | |
093d7b46 MR |
113 | |
114 | return len; | |
115 | } | |
116 | ||
f89e048e JF |
117 | static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, |
118 | const struct e820map *e820) | |
093d7b46 | 119 | { |
f89e048e JF |
120 | phys_addr_t max_addr = PFN_PHYS(max_pfn); |
121 | phys_addr_t last_end = 0; | |
093d7b46 MR |
122 | unsigned long released = 0; |
123 | int i; | |
124 | ||
f89e048e JF |
125 | for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { |
126 | phys_addr_t end = e820->map[i].addr; | |
127 | end = min(max_addr, end); | |
128 | ||
129 | released += xen_release_chunk(last_end, end); | |
093d7b46 MR |
130 | last_end = e820->map[i].addr + e820->map[i].size; |
131 | } | |
132 | ||
f89e048e JF |
133 | if (last_end < max_addr) |
134 | released += xen_release_chunk(last_end, max_addr); | |
093d7b46 MR |
135 | |
136 | printk(KERN_INFO "released %ld pages of unused memory\n", released); | |
137 | return released; | |
138 | } | |
5ead97c8 JF |
139 | |
/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 *
 * Builds the kernel's e820 map from the hypervisor-provided memory
 * map (or a synthetic single-entry map if the hypercall is missing),
 * truncates RAM to the domain's page allocation, reserves the Xen
 * start-of-day data, releases memory not covered by the e820 back to
 * the hypervisor, and (for domU) publishes clamped "extra" memory for
 * later ballooning.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long extra_pages = 0;
	unsigned long extra_limit;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	/* dom0 sees the host's real map; domU gets a pseudo-physical one. */
	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		/* Old hypervisor without the memory-map op: synthesize a
		   single all-RAM entry covering our page allocation. */
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);

	/* Rebuild the e820 from scratch out of the map we were given. */
	e820.nr_map = 0;
	xen_extra_mem_start = mem_end;
	for (i = 0; i < memmap.nr_entries; i++) {
		unsigned long long end = map[i].addr + map[i].size;

		if (map[i].type == E820_RAM) {
			if (map[i].addr < mem_end && end > mem_end) {
				/* Truncate region to max_mem. */
				u64 delta = end - mem_end;

				/* Pages cut off here become candidate
				   "extra" (balloon) memory below. */
				map[i].size -= delta;
				extra_pages += PFN_DOWN(delta);

				end = mem_end;
			}
		}

		/* Track the highest address seen so extra memory is
		   placed above everything in the map. */
		if (end > xen_extra_mem_start)
			xen_extra_mem_start = end;

		/* If region is non-RAM or below mem_end, add what remains */
		if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
		    map[i].size > 0)
			e820_add_region(map[i].addr, map[i].size, map[i].type);
	}

	/*
	 * Even though this is normal, usable memory under Xen, reserve
	 * ISA memory anyway because too many things think they can poke
	 * about in there.
	 *
	 * In a dom0 kernel, this region is identity mapped with the
	 * hardware ISA area, so it really is out of bounds.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 */
	memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
				   __pa(xen_start_info->pt_base),
				   "XEN START INFO");

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	/* Memory the e820 does not cover can be given back to Xen. */
	extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);

	/*
	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
	 * factor the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  max_pfn + extra_pages);

	if (extra_limit >= max_pfn)
		extra_pages = extra_limit - max_pfn;
	else
		extra_pages = 0;

	/* dom0 does not get balloonable extra memory here. */
	if (!xen_initial_domain())
		xen_add_extra_mem(extra_pages);

	return "Xen";
}
252 | ||
253 | static void xen_idle(void) | |
254 | { | |
255 | local_irq_disable(); | |
256 | ||
257 | if (need_resched()) | |
258 | local_irq_enable(); | |
259 | else { | |
260 | current_thread_info()->status &= ~TS_POLLING; | |
261 | smp_mb__after_clear_bit(); | |
262 | safe_halt(); | |
263 | current_thread_info()->status |= TS_POLLING; | |
264 | } | |
265 | } | |
266 | ||
d2eea68e RM |
/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	u32 *mask;
	/* Flag both the int80 and sysenter vdso images. */
	mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
282 | ||
88459d4c | 283 | static __cpuinit int register_callback(unsigned type, const void *func) |
e2a81baf | 284 | { |
88459d4c JF |
285 | struct callback_register callback = { |
286 | .type = type, | |
287 | .address = XEN_CALLBACK(__KERNEL_CS, func), | |
e2a81baf JF |
288 | .flags = CALLBACKF_mask_events, |
289 | }; | |
290 | ||
88459d4c JF |
291 | return HYPERVISOR_callback_op(CALLBACKOP_register, &callback); |
292 | } | |
293 | ||
294 | void __cpuinit xen_enable_sysenter(void) | |
295 | { | |
6fcac6d3 | 296 | int ret; |
62541c37 | 297 | unsigned sysenter_feature; |
6fcac6d3 JF |
298 | |
299 | #ifdef CONFIG_X86_32 | |
62541c37 | 300 | sysenter_feature = X86_FEATURE_SEP; |
6fcac6d3 | 301 | #else |
62541c37 | 302 | sysenter_feature = X86_FEATURE_SYSENTER32; |
6fcac6d3 | 303 | #endif |
88459d4c | 304 | |
62541c37 JF |
305 | if (!boot_cpu_has(sysenter_feature)) |
306 | return; | |
307 | ||
6fcac6d3 | 308 | ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target); |
62541c37 JF |
309 | if(ret != 0) |
310 | setup_clear_cpu_cap(sysenter_feature); | |
e2a81baf JF |
311 | } |
312 | ||
6fcac6d3 JF |
313 | void __cpuinit xen_enable_syscall(void) |
314 | { | |
315 | #ifdef CONFIG_X86_64 | |
6fcac6d3 | 316 | int ret; |
6fcac6d3 JF |
317 | |
318 | ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); | |
319 | if (ret != 0) { | |
d5303b81 | 320 | printk(KERN_ERR "Failed to set syscall callback: %d\n", ret); |
62541c37 JF |
321 | /* Pretty fatal; 64-bit userspace has no other |
322 | mechanism for syscalls. */ | |
323 | } | |
324 | ||
325 | if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { | |
6fcac6d3 JF |
326 | ret = register_callback(CALLBACKTYPE_syscall32, |
327 | xen_syscall32_target); | |
d5303b81 | 328 | if (ret != 0) |
62541c37 | 329 | setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); |
6fcac6d3 JF |
330 | } |
331 | #endif /* CONFIG_X86_64 */ | |
332 | } | |
333 | ||
5ead97c8 JF |
/*
 * Machine-specific boot-time setup for a Xen guest: enable vm-assist
 * modes, register hypervisor callbacks and fast-syscall entry points,
 * raise IOPL, disable ACPI in unprivileged domains, copy the guest
 * command line, and install the Xen idle loop and vdso tweak.
 */
void __init xen_arch_setup(void)
{
	struct physdev_set_iopl set_iopl;
	int rc;

	xen_panic_handler_init();

	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	if (!xen_feature(XENFEAT_auto_translated_physmap))
		HYPERVISOR_vm_assist(VMASST_CMD_enable,
				     VMASST_TYPE_pae_extended_cr3);

	/* Event and failsafe callbacks are essential; die without them. */
	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();

	/* Request IOPL 1; failure is non-fatal, just logged. */
	set_iopl.iopl = 1;
	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
	if (rc != 0)
		printk(KERN_INFO "physdev_op failed %d\n", rc);

#ifdef CONFIG_ACPI
	/* Only the initial domain may drive ACPI. */
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	/* Copy at most COMMAND_LINE_SIZE bytes of the guest command line. */
	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	pm_idle = xen_idle;

	fiddle_vdso();
}