Commit | Line | Data |
---|---|---|
3891a04a PA |
1 | /* ----------------------------------------------------------------------- * |
2 | * | |
3 | * Copyright 2014 Intel Corporation; author: H. Peter Anvin | |
4 | * | |
5 | * This program is free software; you can redistribute it and/or modify it | |
6 | * under the terms and conditions of the GNU General Public License, | |
7 | * version 2, as published by the Free Software Foundation. | |
8 | * | |
9 | * This program is distributed in the hope it will be useful, but WITHOUT | |
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
12 | * more details. | |
13 | * | |
14 | * ----------------------------------------------------------------------- */ | |
15 | ||
16 | /* | |
17 | * The IRET instruction, when returning to a 16-bit segment, only | |
18 | * restores the bottom 16 bits of the user space stack pointer. This | |
19 | * causes some 16-bit software to break, but it also leaks kernel state | |
20 | * to user space. | |
21 | * | |
22 | * We work around this by creating percpu "ministacks", each of which | |
23 | * is mapped 2^16 times 64K apart. When we detect that the return SS is | |
24 | * on the LDT, we copy the IRET frame to the ministack and use the | |
25 | * relevant alias to return to userspace. The ministacks are mapped | |
26 | * readonly, so if the IRET faults we promote #GP to #DF which is an IST | |
27 | * vector and thus has its own stack; we then do the fixup in the #DF | |
28 | * handler. | |
29 | * | |
30 | * This file sets up the ministacks and the related page tables. The | |
31 | * actual ministack invocation is in entry_64.S. | |
32 | */ | |
33 | ||
34 | #include <linux/init.h> | |
35 | #include <linux/init_task.h> | |
36 | #include <linux/kernel.h> | |
37 | #include <linux/percpu.h> | |
38 | #include <linux/gfp.h> | |
39 | #include <linux/random.h> | |
40 | #include <asm/pgtable.h> | |
41 | #include <asm/pgalloc.h> | |
42 | #include <asm/setup.h> | |
e1fe9ed8 | 43 | #include <asm/espfix.h> |
3891a04a PA |
44 | |
/*
 * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
 * it up to a cache line to avoid unnecessary sharing.
 */
#define ESPFIX_STACK_SIZE (8*8UL)
/* Number of espfix stacks that fit into a single page */
#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)

/* There is address space for how many espfix pages? */
#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))

/*
 * Largest CPU count this layout can serve (stacks per page times the
 * number of pages there is room for); the build fails if CONFIG_NR_CPUS
 * exceeds it.
 */
#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
# error "Need more than one PGD for the ESPFIX hack"
#endif
59 | ||
/*
 * GFP flags used for the page-table pages allocated in init_espfix_ap();
 * __GFP_ZERO is included so that freshly allocated tables start out empty.
 */
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)

/* This contains the *bottom* address of the espfix stack */
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
/*
 * Address of the same per-CPU stack slot, but computed from the address
 * the page allocator returned for the backing page rather than from the
 * espfix alias (see the end of init_espfix_ap()).
 */
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);

/* Initialization mutex - should this be a spinlock? */
static DEFINE_MUTEX(espfix_init_mutex);

/*
 * Table of stack pages (not a bitmap): entry cpu/ESPFIX_STACKS_PER_PAGE
 * holds the address of the page backing that group of CPUs, and stays
 * NULL until init_espfix_ap() has set the page up. Each page serves
 * ESPFIX_STACKS_PER_PAGE CPUs.
 */
#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
static void *espfix_pages[ESPFIX_MAX_PAGES];

/*
 * The PUD page for the espfix area; init_espfix_bsp() installs it into
 * the kernel page directory slot covering ESPFIX_BASE_ADDR.
 */
static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
 __aligned(PAGE_SIZE);

/*
 * Random values mixed into the page and slot selection done by
 * espfix_base_addr(); set once by init_espfix_random().
 */
static unsigned int page_random, slot_random;
77 | ||
78 | /* | |
79 | * This returns the bottom address of the espfix stack for a specific CPU. | |
80 | * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case | |
81 | * we have to account for some amount of padding at the end of each page. | |
82 | */ | |
83 | static inline unsigned long espfix_base_addr(unsigned int cpu) | |
84 | { | |
85 | unsigned long page, slot; | |
86 | unsigned long addr; | |
87 | ||
88 | page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; | |
89 | slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; | |
90 | addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); | |
91 | addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); | |
92 | addr += ESPFIX_BASE_ADDR; | |
93 | return addr; | |
94 | } | |
95 | ||
/* Distance, in PTE slots, between two consecutive 64K aliases */
#define PTE_STRIDE (65536/PAGE_SIZE)
/* Number of entries in one PTE page that alias the same stack page */
#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
/* Every entry of a PMD page is pointed at the same PTE page */
#define ESPFIX_PMD_CLONES PTRS_PER_PMD
/* PUD entries needed so that PTE * PMD * PUD clones add up to 65536 aliases */
#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))

/* Protection for espfix page-table entries: kernel table, not writable, NX */
#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
102 | ||
103 | static void init_espfix_random(void) | |
104 | { | |
105 | unsigned long rand; | |
106 | ||
107 | /* | |
108 | * This is run before the entropy pools are initialized, | |
109 | * but this is hopefully better than nothing. | |
110 | */ | |
111 | if (!arch_get_random_long(&rand)) { | |
112 | /* The constant is an arbitrary large prime */ | |
113 | rdtscll(rand); | |
114 | rand *= 0xc345c6b72fd16123UL; | |
115 | } | |
116 | ||
117 | slot_random = rand % ESPFIX_STACKS_PER_PAGE; | |
118 | page_random = (rand / ESPFIX_STACKS_PER_PAGE) | |
119 | & (ESPFIX_PAGE_SPACE - 1); | |
120 | } | |
121 | ||
122 | void __init init_espfix_bsp(void) | |
123 | { | |
124 | pgd_t *pgd_p; | |
3891a04a PA |
125 | |
126 | /* Install the espfix pud into the kernel page directory */ | |
127 | pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; | |
128 | pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); | |
129 | ||
130 | /* Randomize the locations */ | |
131 | init_espfix_random(); | |
132 | ||
133 | /* The rest is the same as for any other processor */ | |
134 | init_espfix_ap(); | |
135 | } | |
136 | ||
137 | void init_espfix_ap(void) | |
138 | { | |
139 | unsigned int cpu, page; | |
140 | unsigned long addr; | |
141 | pud_t pud, *pud_p; | |
142 | pmd_t pmd, *pmd_p; | |
143 | pte_t pte, *pte_p; | |
144 | int n; | |
145 | void *stack_page; | |
146 | pteval_t ptemask; | |
147 | ||
148 | /* We only have to do this once... */ | |
149 | if (likely(this_cpu_read(espfix_stack))) | |
150 | return; /* Already initialized */ | |
151 | ||
152 | cpu = smp_processor_id(); | |
153 | addr = espfix_base_addr(cpu); | |
154 | page = cpu/ESPFIX_STACKS_PER_PAGE; | |
155 | ||
156 | /* Did another CPU already set this up? */ | |
157 | stack_page = ACCESS_ONCE(espfix_pages[page]); | |
158 | if (likely(stack_page)) | |
159 | goto done; | |
160 | ||
161 | mutex_lock(&espfix_init_mutex); | |
162 | ||
163 | /* Did we race on the lock? */ | |
164 | stack_page = ACCESS_ONCE(espfix_pages[page]); | |
165 | if (stack_page) | |
166 | goto unlock_done; | |
167 | ||
168 | ptemask = __supported_pte_mask; | |
169 | ||
170 | pud_p = &espfix_pud_page[pud_index(addr)]; | |
171 | pud = *pud_p; | |
172 | if (!pud_present(pud)) { | |
173 | pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); | |
174 | pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); | |
8762e509 | 175 | paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); |
3891a04a PA |
176 | for (n = 0; n < ESPFIX_PUD_CLONES; n++) |
177 | set_pud(&pud_p[n], pud); | |
178 | } | |
179 | ||
180 | pmd_p = pmd_offset(&pud, addr); | |
181 | pmd = *pmd_p; | |
182 | if (!pmd_present(pmd)) { | |
183 | pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); | |
184 | pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); | |
8762e509 | 185 | paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); |
3891a04a PA |
186 | for (n = 0; n < ESPFIX_PMD_CLONES; n++) |
187 | set_pmd(&pmd_p[n], pmd); | |
188 | } | |
189 | ||
190 | pte_p = pte_offset_kernel(&pmd, addr); | |
191 | stack_page = (void *)__get_free_page(GFP_KERNEL); | |
192 | pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); | |
3891a04a PA |
193 | for (n = 0; n < ESPFIX_PTE_CLONES; n++) |
194 | set_pte(&pte_p[n*PTE_STRIDE], pte); | |
195 | ||
196 | /* Job is done for this CPU and any CPU which shares this page */ | |
197 | ACCESS_ONCE(espfix_pages[page]) = stack_page; | |
198 | ||
199 | unlock_done: | |
200 | mutex_unlock(&espfix_init_mutex); | |
201 | done: | |
202 | this_cpu_write(espfix_stack, addr); | |
203 | this_cpu_write(espfix_waddr, (unsigned long)stack_page | |
204 | + (addr & ~PAGE_MASK)); | |
205 | } |