Commit | Line | Data |
---|---|---|
5234f5eb | 1 | /* |
835c34a1 | 2 | * handle transition of Linux booting another kernel |
5234f5eb EB |
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> |
4 | * | |
5 | * This source code is licensed under the GNU General Public License, | |
6 | * Version 2. See the file COPYING for more details. | |
7 | */ | |
8 | ||
12db5562 VG |
9 | #define pr_fmt(fmt) "kexec: " fmt |
10 | ||
5234f5eb EB |
11 | #include <linux/mm.h> |
12 | #include <linux/kexec.h> | |
5234f5eb | 13 | #include <linux/string.h> |
5a0e3ad6 | 14 | #include <linux/gfp.h> |
5234f5eb | 15 | #include <linux/reboot.h> |
fd59d231 | 16 | #include <linux/numa.h> |
f43fdad8 | 17 | #include <linux/ftrace.h> |
fef3a7a1 | 18 | #include <linux/io.h> |
fee7b0d8 | 19 | #include <linux/suspend.h> |
f43fdad8 | 20 | |
9ebdc79f | 21 | #include <asm/init.h> |
5234f5eb | 22 | #include <asm/pgtable.h> |
5234f5eb EB |
23 | #include <asm/tlbflush.h> |
24 | #include <asm/mmu_context.h> | |
8643e28d | 25 | #include <asm/io_apic.h> |
17f557e5 | 26 | #include <asm/debugreg.h> |
27f48d3e | 27 | #include <asm/kexec-bzimage64.h> |
8bf27556 | 28 | |
#ifdef CONFIG_KEXEC_FILE
/*
 * Image loaders tried, in array order, by arch_kexec_kernel_image_probe().
 * The table is fixed at build time and never written, so the array of
 * pointers itself is const.
 */
static struct kexec_file_ops * const kexec_file_loaders[] = {
	&kexec_bzImage64_ops,
};
#endif
cb105258 | 34 | |
f5deb796 HY |
35 | static void free_transition_pgtable(struct kimage *image) |
36 | { | |
37 | free_page((unsigned long)image->arch.pud); | |
38 | free_page((unsigned long)image->arch.pmd); | |
39 | free_page((unsigned long)image->arch.pte); | |
40 | } | |
41 | ||
42 | static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) | |
43 | { | |
44 | pud_t *pud; | |
45 | pmd_t *pmd; | |
46 | pte_t *pte; | |
47 | unsigned long vaddr, paddr; | |
48 | int result = -ENOMEM; | |
49 | ||
50 | vaddr = (unsigned long)relocate_kernel; | |
51 | paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); | |
52 | pgd += pgd_index(vaddr); | |
53 | if (!pgd_present(*pgd)) { | |
54 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL); | |
55 | if (!pud) | |
56 | goto err; | |
57 | image->arch.pud = pud; | |
58 | set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); | |
59 | } | |
60 | pud = pud_offset(pgd, vaddr); | |
61 | if (!pud_present(*pud)) { | |
62 | pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); | |
63 | if (!pmd) | |
64 | goto err; | |
65 | image->arch.pmd = pmd; | |
66 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); | |
67 | } | |
68 | pmd = pmd_offset(pud, vaddr); | |
69 | if (!pmd_present(*pmd)) { | |
70 | pte = (pte_t *)get_zeroed_page(GFP_KERNEL); | |
71 | if (!pte) | |
72 | goto err; | |
73 | image->arch.pte = pte; | |
74 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); | |
75 | } | |
76 | pte = pte_offset_kernel(pmd, vaddr); | |
77 | set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); | |
78 | return 0; | |
79 | err: | |
80 | free_transition_pgtable(image); | |
81 | return result; | |
82 | } | |
83 | ||
9ebdc79f YL |
84 | static void *alloc_pgt_page(void *data) |
85 | { | |
86 | struct kimage *image = (struct kimage *)data; | |
87 | struct page *page; | |
88 | void *p = NULL; | |
89 | ||
90 | page = kimage_alloc_control_pages(image, 0); | |
91 | if (page) { | |
92 | p = page_address(page); | |
93 | clear_page(p); | |
94 | } | |
95 | ||
96 | return p; | |
97 | } | |
98 | ||
5234f5eb EB |
99 | static int init_pgtable(struct kimage *image, unsigned long start_pgtable) |
100 | { | |
9ebdc79f YL |
101 | struct x86_mapping_info info = { |
102 | .alloc_pgt_page = alloc_pgt_page, | |
103 | .context = image, | |
104 | .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, | |
105 | }; | |
084d1283 | 106 | unsigned long mstart, mend; |
8bf27556 | 107 | pgd_t *level4p; |
f5deb796 | 108 | int result; |
084d1283 YL |
109 | int i; |
110 | ||
8bf27556 | 111 | level4p = (pgd_t *)__va(start_pgtable); |
9ebdc79f | 112 | clear_page(level4p); |
0e691cf8 YL |
113 | for (i = 0; i < nr_pfn_mapped; i++) { |
114 | mstart = pfn_mapped[i].start << PAGE_SHIFT; | |
115 | mend = pfn_mapped[i].end << PAGE_SHIFT; | |
116 | ||
117 | result = kernel_ident_mapping_init(&info, | |
118 | level4p, mstart, mend); | |
119 | if (result) | |
120 | return result; | |
121 | } | |
084d1283 | 122 | |
53594547 | 123 | /* |
084d1283 YL |
124 | * segments's mem ranges could be outside 0 ~ max_pfn, |
125 | * for example when jump back to original kernel from kexeced kernel. | |
126 | * or first kernel is booted with user mem map, and second kernel | |
127 | * could be loaded out of that range. | |
53594547 | 128 | */ |
084d1283 YL |
129 | for (i = 0; i < image->nr_segments; i++) { |
130 | mstart = image->segment[i].mem; | |
131 | mend = mstart + image->segment[i].memsz; | |
132 | ||
9ebdc79f YL |
133 | result = kernel_ident_mapping_init(&info, |
134 | level4p, mstart, mend); | |
084d1283 YL |
135 | |
136 | if (result) | |
137 | return result; | |
138 | } | |
139 | ||
f5deb796 | 140 | return init_transition_pgtable(image, level4p); |
5234f5eb EB |
141 | } |
142 | ||
143 | static void set_idt(void *newidt, u16 limit) | |
144 | { | |
36c4fd23 | 145 | struct desc_ptr curidt; |
5234f5eb EB |
146 | |
147 | /* x86-64 supports unaliged loads & stores */ | |
36c4fd23 EB |
148 | curidt.size = limit; |
149 | curidt.address = (unsigned long)newidt; | |
5234f5eb EB |
150 | |
151 | __asm__ __volatile__ ( | |
36c4fd23 EB |
152 | "lidtq %0\n" |
153 | : : "m" (curidt) | |
5234f5eb EB |
154 | ); |
155 | }; | |
156 | ||
157 | ||
158 | static void set_gdt(void *newgdt, u16 limit) | |
159 | { | |
36c4fd23 | 160 | struct desc_ptr curgdt; |
5234f5eb EB |
161 | |
162 | /* x86-64 supports unaligned loads & stores */ | |
36c4fd23 EB |
163 | curgdt.size = limit; |
164 | curgdt.address = (unsigned long)newgdt; | |
5234f5eb EB |
165 | |
166 | __asm__ __volatile__ ( | |
36c4fd23 EB |
167 | "lgdtq %0\n" |
168 | : : "m" (curgdt) | |
5234f5eb EB |
169 | ); |
170 | }; | |
171 | ||
/*
 * Reload the %ds, %es, %ss, %fs and %gs segment registers with the
 * __KERNEL_DS selector.
 */
static void load_segments(void)
{
	/*
	 * The "a" constraint places the selector in the accumulator register,
	 * which is then copied into each segment register in turn.  The
	 * "memory" clobber keeps the compiler from moving memory accesses
	 * across the reload.
	 */
	__asm__ __volatile__ (
		"\tmovl %0,%%ds\n"
		"\tmovl %0,%%es\n"
		"\tmovl %0,%%ss\n"
		"\tmovl %0,%%fs\n"
		"\tmovl %0,%%gs\n"
		: : "a" (__KERNEL_DS) : "memory"
		);
}
183 | ||
#ifdef CONFIG_KEXEC_FILE
/*
 * Update purgatory as needed after the various image segments have been
 * prepared.
 *
 * Only file-mode crash images need work: the backup region parameters
 * (destination, source and size) held in image->arch are passed to the
 * purgatory symbols "backup_dest", "backup_src" and "backup_sz".
 *
 * Returns 0 on success or when nothing needs doing, otherwise the error
 * from kexec_purgatory_get_set_symbol().
 */
static int arch_update_purgatory(struct kimage *image)
{
	int err;

	if (!image->file_mode || image->type != KEXEC_TYPE_CRASH)
		return 0;

	/* Setup copying of backup region */
	err = kexec_purgatory_get_set_symbol(image, "backup_dest",
				&image->arch.backup_load_addr,
				sizeof(image->arch.backup_load_addr), 0);
	if (err)
		return err;

	err = kexec_purgatory_get_set_symbol(image, "backup_src",
				&image->arch.backup_src_start,
				sizeof(image->arch.backup_src_start), 0);
	if (err)
		return err;

	return kexec_purgatory_get_set_symbol(image, "backup_sz",
				&image->arch.backup_src_sz,
				sizeof(image->arch.backup_src_sz), 0);
}
#else /* !CONFIG_KEXEC_FILE */
/* Nothing to update when CONFIG_KEXEC_FILE is not set. */
static inline int arch_update_purgatory(struct kimage *image)
{
	return 0;
}
#endif /* CONFIG_KEXEC_FILE */
dd5f7260 | 222 | |
5234f5eb EB |
223 | int machine_kexec_prepare(struct kimage *image) |
224 | { | |
4bfaaef0 | 225 | unsigned long start_pgtable; |
5234f5eb EB |
226 | int result; |
227 | ||
228 | /* Calculate the offsets */ | |
72414d3f | 229 | start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; |
5234f5eb EB |
230 | |
231 | /* Setup the identity mapped 64bit page table */ | |
232 | result = init_pgtable(image, start_pgtable); | |
72414d3f | 233 | if (result) |
5234f5eb | 234 | return result; |
5234f5eb | 235 | |
dd5f7260 VG |
236 | /* update purgatory as needed */ |
237 | result = arch_update_purgatory(image); | |
238 | if (result) | |
239 | return result; | |
240 | ||
5234f5eb EB |
241 | return 0; |
242 | } | |
243 | ||
/*
 * Arch cleanup hook for @image: releases the transition page-table pages
 * recorded in image->arch.
 */
void machine_kexec_cleanup(struct kimage *image)
{
	free_transition_pgtable(image);
}
248 | ||
/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 *
 * Copies the relocate_kernel code into the page after the first control
 * code page, fills in the page_list[] addresses it needs, invalidates the
 * GDT and IDT, and calls relocate_kernel().  The value relocate_kernel()
 * returns is stored in image->start; when image->preserve_context is set the
 * processor state saved on entry is restored afterwards.
 */
void machine_kexec(struct kimage *image)
{
	unsigned long page_list[PAGES_NR];
	void *control_page;
	int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
	if (image->preserve_context)
		save_processor_state();
#endif

	save_ftrace_enabled = __ftrace_enabled_save();

	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();
	hw_breakpoint_disable();

	if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
		/*
		 * We need to put APICs in legacy mode so that we can
		 * get timer interrupts in second kernel. kexec/kdump
		 * paths already have calls to disable_IO_APIC() in
		 * one form or other. kexec jump path also needs
		 * one.
		 */
		disable_IO_APIC();
#endif
	}

	/* The relocation code lives in the page after the page table page. */
	control_page = page_address(image->control_code_page) + PAGE_SIZE;
	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
	page_list[PA_TABLE_PAGE] =
	  (unsigned long)__pa(page_address(image->control_code_page));

	/* PA_SWAP_PAGE is only filled in for KEXEC_TYPE_DEFAULT images. */
	if (image->type == KEXEC_TYPE_DEFAULT)
		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
						<< PAGE_SHIFT);

	/*
	 * The segment registers are funny things, they have both a
	 * visible and an invisible part.  Whenever the visible part is
	 * set to a specific selector, the invisible part is loaded
	 * from a table in memory.  At no other time is the
	 * descriptor table in memory accessed.
	 *
	 * I take advantage of this here by force loading the
	 * segments, before I zap the gdt with an invalid value.
	 */
	load_segments();
	/*
	 * The gdt & idt are now invalid.
	 * If you want to load them you must set up your own idt & gdt.
	 */
	set_gdt(phys_to_virt(0), 0);
	set_idt(phys_to_virt(0), 0);

	/* now call it */
	image->start = relocate_kernel((unsigned long)image->head,
				       (unsigned long)page_list,
				       image->start,
				       image->preserve_context);

#ifdef CONFIG_KEXEC_JUMP
	if (image->preserve_context)
		restore_processor_state();
#endif

	__ftrace_enabled_restore(save_ftrace_enabled);
}
2c8c0e6b | 326 | |
fd59d231 KO |
327 | void arch_crash_save_vmcoreinfo(void) |
328 | { | |
629c8b4c | 329 | VMCOREINFO_SYMBOL(phys_base); |
69243f91 | 330 | VMCOREINFO_SYMBOL(init_level4_pgt); |
92df5c3e KO |
331 | |
332 | #ifdef CONFIG_NUMA | |
333 | VMCOREINFO_SYMBOL(node_data); | |
334 | VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); | |
335 | #endif | |
b6085a86 ES |
336 | vmcoreinfo_append_str("KERNELOFFSET=%lx\n", |
337 | (unsigned long)&_text - __START_KERNEL); | |
fd59d231 KO |
338 | } |
339 | ||
cb105258 VG |
340 | /* arch-dependent functionality related to kexec file-based syscall */ |
341 | ||
74ca317c | 342 | #ifdef CONFIG_KEXEC_FILE |
cb105258 VG |
343 | int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, |
344 | unsigned long buf_len) | |
345 | { | |
346 | int i, ret = -ENOEXEC; | |
347 | struct kexec_file_ops *fops; | |
348 | ||
349 | for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { | |
350 | fops = kexec_file_loaders[i]; | |
351 | if (!fops || !fops->probe) | |
352 | continue; | |
353 | ||
354 | ret = fops->probe(buf, buf_len); | |
355 | if (!ret) { | |
356 | image->fops = fops; | |
357 | return ret; | |
358 | } | |
359 | } | |
360 | ||
361 | return ret; | |
362 | } | |
363 | ||
364 | void *arch_kexec_kernel_image_load(struct kimage *image) | |
365 | { | |
dd5f7260 VG |
366 | vfree(image->arch.elf_headers); |
367 | image->arch.elf_headers = NULL; | |
368 | ||
cb105258 VG |
369 | if (!image->fops || !image->fops->load) |
370 | return ERR_PTR(-ENOEXEC); | |
371 | ||
372 | return image->fops->load(image, image->kernel_buf, | |
373 | image->kernel_buf_len, image->initrd_buf, | |
374 | image->initrd_buf_len, image->cmdline_buf, | |
375 | image->cmdline_buf_len); | |
376 | } | |
377 | ||
378 | int arch_kimage_file_post_load_cleanup(struct kimage *image) | |
379 | { | |
380 | if (!image->fops || !image->fops->cleanup) | |
381 | return 0; | |
382 | ||
27f48d3e | 383 | return image->fops->cleanup(image->image_loader_data); |
cb105258 | 384 | } |
12db5562 | 385 | |
8e7d8381 VG |
386 | int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, |
387 | unsigned long kernel_len) | |
388 | { | |
389 | if (!image->fops || !image->fops->verify_sig) { | |
390 | pr_debug("kernel loader does not support signature verification."); | |
391 | return -EKEYREJECTED; | |
392 | } | |
393 | ||
394 | return image->fops->verify_sig(kernel, kernel_len); | |
395 | } | |
396 | ||
12db5562 VG |
397 | /* |
398 | * Apply purgatory relocations. | |
399 | * | |
400 | * ehdr: Pointer to elf headers | |
401 | * sechdrs: Pointer to section headers. | |
402 | * relsec: section index of SHT_RELA section. | |
403 | * | |
404 | * TODO: Some of the code belongs to generic code. Move that in kexec.c. | |
405 | */ | |
406 | int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, | |
407 | Elf64_Shdr *sechdrs, unsigned int relsec) | |
408 | { | |
409 | unsigned int i; | |
410 | Elf64_Rela *rel; | |
411 | Elf64_Sym *sym; | |
412 | void *location; | |
413 | Elf64_Shdr *section, *symtabsec; | |
414 | unsigned long address, sec_base, value; | |
415 | const char *strtab, *name, *shstrtab; | |
416 | ||
417 | /* | |
418 | * ->sh_offset has been modified to keep the pointer to section | |
419 | * contents in memory | |
420 | */ | |
421 | rel = (void *)sechdrs[relsec].sh_offset; | |
422 | ||
423 | /* Section to which relocations apply */ | |
424 | section = &sechdrs[sechdrs[relsec].sh_info]; | |
425 | ||
426 | pr_debug("Applying relocate section %u to %u\n", relsec, | |
427 | sechdrs[relsec].sh_info); | |
428 | ||
429 | /* Associated symbol table */ | |
430 | symtabsec = &sechdrs[sechdrs[relsec].sh_link]; | |
431 | ||
432 | /* String table */ | |
433 | if (symtabsec->sh_link >= ehdr->e_shnum) { | |
434 | /* Invalid strtab section number */ | |
435 | pr_err("Invalid string table section index %d\n", | |
436 | symtabsec->sh_link); | |
437 | return -ENOEXEC; | |
438 | } | |
439 | ||
440 | strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset; | |
441 | ||
442 | /* section header string table */ | |
443 | shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset; | |
444 | ||
445 | for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { | |
446 | ||
447 | /* | |
448 | * rel[i].r_offset contains byte offset from beginning | |
449 | * of section to the storage unit affected. | |
450 | * | |
451 | * This is location to update (->sh_offset). This is temporary | |
452 | * buffer where section is currently loaded. This will finally | |
453 | * be loaded to a different address later, pointed to by | |
454 | * ->sh_addr. kexec takes care of moving it | |
455 | * (kexec_load_segment()). | |
456 | */ | |
457 | location = (void *)(section->sh_offset + rel[i].r_offset); | |
458 | ||
459 | /* Final address of the location */ | |
460 | address = section->sh_addr + rel[i].r_offset; | |
461 | ||
462 | /* | |
463 | * rel[i].r_info contains information about symbol table index | |
464 | * w.r.t which relocation must be made and type of relocation | |
465 | * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get | |
466 | * these respectively. | |
467 | */ | |
468 | sym = (Elf64_Sym *)symtabsec->sh_offset + | |
469 | ELF64_R_SYM(rel[i].r_info); | |
470 | ||
471 | if (sym->st_name) | |
472 | name = strtab + sym->st_name; | |
473 | else | |
474 | name = shstrtab + sechdrs[sym->st_shndx].sh_name; | |
475 | ||
476 | pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", | |
477 | name, sym->st_info, sym->st_shndx, sym->st_value, | |
478 | sym->st_size); | |
479 | ||
480 | if (sym->st_shndx == SHN_UNDEF) { | |
481 | pr_err("Undefined symbol: %s\n", name); | |
482 | return -ENOEXEC; | |
483 | } | |
484 | ||
485 | if (sym->st_shndx == SHN_COMMON) { | |
486 | pr_err("symbol '%s' in common section\n", name); | |
487 | return -ENOEXEC; | |
488 | } | |
489 | ||
490 | if (sym->st_shndx == SHN_ABS) | |
491 | sec_base = 0; | |
492 | else if (sym->st_shndx >= ehdr->e_shnum) { | |
493 | pr_err("Invalid section %d for symbol %s\n", | |
494 | sym->st_shndx, name); | |
495 | return -ENOEXEC; | |
496 | } else | |
497 | sec_base = sechdrs[sym->st_shndx].sh_addr; | |
498 | ||
499 | value = sym->st_value; | |
500 | value += sec_base; | |
501 | value += rel[i].r_addend; | |
502 | ||
503 | switch (ELF64_R_TYPE(rel[i].r_info)) { | |
504 | case R_X86_64_NONE: | |
505 | break; | |
506 | case R_X86_64_64: | |
507 | *(u64 *)location = value; | |
508 | break; | |
509 | case R_X86_64_32: | |
510 | *(u32 *)location = value; | |
511 | if (value != *(u32 *)location) | |
512 | goto overflow; | |
513 | break; | |
514 | case R_X86_64_32S: | |
515 | *(s32 *)location = value; | |
516 | if ((s64)value != *(s32 *)location) | |
517 | goto overflow; | |
518 | break; | |
519 | case R_X86_64_PC32: | |
520 | value -= (u64)address; | |
521 | *(u32 *)location = value; | |
522 | break; | |
523 | default: | |
524 | pr_err("Unknown rela relocation: %llu\n", | |
525 | ELF64_R_TYPE(rel[i].r_info)); | |
526 | return -ENOEXEC; | |
527 | } | |
528 | } | |
529 | return 0; | |
530 | ||
531 | overflow: | |
532 | pr_err("Overflow in relocation type %d value 0x%lx\n", | |
533 | (int)ELF64_R_TYPE(rel[i].r_info), value); | |
534 | return -ENOEXEC; | |
535 | } | |
74ca317c | 536 | #endif /* CONFIG_KEXEC_FILE */ |