x86_64, vsyscall: Turn vsyscalls all the way off when vsyscall==none
arch/x86/kernel/vsyscall_64.c
/*
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright 2003 Andi Kleen, SuSE Labs.
 *
 * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
 *
 * Thanks to hpa@transmeta.com for some useful hints.
 * Special thanks to Ingo Molnar for his early experience with
 * a different vsyscall implementation for Linux/IA32 and for the name.
 *
 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 * at virtual address -10Mbyte+1024bytes etc... There are at most 4
 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 * jumping out of line if necessary. We cannot add more with this
 * mechanism because older kernels won't return -ENOSYS.
 *
 * Note: the concept clashes with User Mode Linux. UML users should
 * use the vDSO.
 */
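
/*
 * Illustrative sketch (not from this file): a legacy static binary calls
 * the fixed address directly, e.g.
 *
 *	struct timeval tv;
 *	long (*vgtod)(struct timeval *, struct timezone *) =
 *		(void *)0xffffffffff600000;	(vsyscall slot 0)
 *	vgtod(&tv, NULL);
 *
 * Under vsyscall=emulate (the default) such a call traps into
 * emulate_vsyscall() below; under vsyscall=none it raises SIGSEGV.
 */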

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/topology.h>
#include <linux/timekeeper_internal.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/compat.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/traps.h>

#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"

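/*
 * vsyscall handling mode, selected with the vsyscall= boot parameter:
 *
 *   EMULATE (default): the page is mapped non-executable; calls trap and
 *                      are emulated by emulate_vsyscall() below.
 *   NATIVE:            the page contains real, user-executable stubs.
 *   NONE:              the page is not mapped at all; any use faults.
 */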
static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;

static int __init vsyscall_setup(char *str)
{
	if (str) {
		if (!strcmp("emulate", str))
			vsyscall_mode = EMULATE;
		else if (!strcmp("native", str))
			vsyscall_mode = NATIVE;
		else if (!strcmp("none", str))
			vsyscall_mode = NONE;
		else
			return -EINVAL;

		return 0;
	}

	return -EINVAL;
}
early_param("vsyscall", vsyscall_setup);
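
/*
 * Usage note (illustrative): append "vsyscall=native", "vsyscall=emulate"
 * or "vsyscall=none" to the kernel command line; anything else makes
 * vsyscall_setup() return -EINVAL and leaves the default (EMULATE) set.
 */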

static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
			      const char *message)
{
	if (!show_unhandled_signals)
		return;

	printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
			   level, current->comm, task_pid_nr(current),
			   message, regs->ip, regs->cs,
			   regs->sp, regs->ax, regs->si, regs->di);
}

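/*
 * Decode a fault address inside the vsyscall page into a vsyscall number.
 * The three entries live 1024 bytes apart:
 *
 *   VSYSCALL_ADDR + 0x000 -> 0 (gettimeofday)
 *   VSYSCALL_ADDR + 0x400 -> 1 (time)
 *   VSYSCALL_ADDR + 0x800 -> 2 (getcpu)
 *
 * Anything else is a misaligned call and is rejected with -EINVAL.
 */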
static int addr_to_vsyscall_nr(unsigned long addr)
{
	int nr;

	if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
		return -EINVAL;

	nr = (addr & 0xC00UL) >> 10;
	if (nr >= 3)
		return -EINVAL;

	return nr;
}

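/*
 * Verify that a user destination pointer is writable before dispatching
 * the emulated syscall.  On failure, hand the task the same SIGSEGV a
 * real fault through the vsyscall page would have produced.
 */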
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
	/*
	 * XXX: if access_ok, get_user, and put_user handled
	 * sig_on_uaccess_error, this could go away.
	 */

	if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
		siginfo_t info;
		struct thread_struct *thread = &current->thread;

		thread->error_code	= 6;	/* user fault, no page, write */
		thread->cr2		= ptr;
		thread->trap_nr		= X86_TRAP_PF;

		memset(&info, 0, sizeof(info));
		info.si_signo		= SIGSEGV;
		info.si_errno		= 0;
		info.si_code		= SEGV_MAPERR;
		info.si_addr		= (void __user *)ptr;

		force_sig_info(SIGSEGV, &info, current);
		return false;
	} else {
		return true;
	}
}

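/*
 * Entered from the page fault handler when userspace executes from the
 * vsyscall page.  Returns true if the fault was a (possibly failed)
 * vsyscall and has been fully handled, false to let the normal SIGSEGV
 * path run, as it must when vsyscall=none.
 */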
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
	struct task_struct *tsk;
	unsigned long caller;
	int vsyscall_nr, syscall_nr, tmp;
	int prev_sig_on_uaccess_error;
	long ret;

	/*
	 * No point in checking CS -- the only way to get here is a user mode
	 * trap to a high address, which means that we're in 64-bit user code.
	 */

	WARN_ON_ONCE(address != regs->ip);

	if (vsyscall_mode == NONE) {
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall attempted with vsyscall=none");
		return false;
	}

	vsyscall_nr = addr_to_vsyscall_nr(address);

	trace_emulate_vsyscall(vsyscall_nr);

	if (vsyscall_nr < 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
		goto sigsegv;
	}

	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
		warn_bad_vsyscall(KERN_WARNING, regs,
				  "vsyscall with bad stack (exploit attempt?)");
		goto sigsegv;
	}

	tsk = current;

	/*
	 * Check for access_ok violations and find the syscall nr.
	 *
	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
	 * 64-bit, so we don't need to special-case it here.  For all the
	 * vsyscalls, NULL means "don't write anything" not "write it at
	 * address 0".
	 */
	switch (vsyscall_nr) {
	case 0:
		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_gettimeofday;
		break;

	case 1:
		if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_time;
		break;

	case 2:
		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
		    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
			ret = -EFAULT;
			goto check_fault;
		}

		syscall_nr = __NR_getcpu;
		break;
	}

	/*
	 * Handle seccomp.  regs->ip must be the original value.
	 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
	 *
	 * We could optimize the seccomp disabled case, but performance
	 * here doesn't matter.
	 */
	regs->orig_ax = syscall_nr;
	regs->ax = -ENOSYS;
	tmp = secure_computing();
	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
		warn_bad_vsyscall(KERN_DEBUG, regs,
				  "seccomp tried to change syscall nr or ip");
		do_exit(SIGSYS);
	}
	if (tmp)
		goto do_ret; /* skip requested */

	/*
	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
	 * preserve that behavior to make writing exploits harder.
	 */
	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
	current_thread_info()->sig_on_uaccess_error = 1;

	ret = -EFAULT;
	switch (vsyscall_nr) {
	case 0:
		ret = sys_gettimeofday(
			(struct timeval __user *)regs->di,
			(struct timezone __user *)regs->si);
		break;

	case 1:
		ret = sys_time((time_t __user *)regs->di);
		break;

	case 2:
		ret = sys_getcpu((unsigned __user *)regs->di,
				 (unsigned __user *)regs->si,
				 NULL);
		break;
	}

	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;

check_fault:
	if (ret == -EFAULT) {
		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
		warn_bad_vsyscall(KERN_INFO, regs,
				  "vsyscall fault (exploit attempt?)");

		/*
		 * If we failed to generate a signal for any reason,
		 * generate one here.  (This should be impossible.)
		 */
		if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
				 !sigismember(&tsk->pending.signal, SIGSEGV)))
			goto sigsegv;

		return true;	/* Don't emulate the ret. */
	}

	regs->ax = ret;

do_ret:
	/* Emulate a ret instruction. */
	regs->ip = caller;
	regs->sp += 8;
	return true;

sigsegv:
	force_sig(SIGSEGV, current);
	return true;
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now.  32bit has a real VMA now and does
 * not need special handling anymore:
 */
static const char *gate_vma_name(struct vm_area_struct *vma)
{
	return "[vsyscall]";
}
static struct vm_operations_struct gate_vma_ops = {
	.name = gate_vma_name,
};
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_ADDR,
	.vm_end		= VSYSCALL_ADDR + PAGE_SIZE,
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC,
	.vm_ops		= &gate_vma_ops,
};

struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_IA32_EMULATION
	if (!mm || mm->context.ia32_compat)
		return NULL;
#endif
	if (vsyscall_mode == NONE)
		return NULL;
	return &gate_vma;
}

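/*
 * Helper used by generic mm code (e.g. ptrace access via the pseudo VMA
 * above) to recognize addresses inside the gate page for a given mm.
 */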
int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(mm);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context.  It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
}

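/*
 * Establish the fixmap for the vsyscall page at boot.  In native mode the
 * page is user-executable (PAGE_KERNEL_VSYSCALL); in emulate mode it is
 * mapped non-executable (PAGE_KERNEL_VVAR) so that calls fault into
 * emulate_vsyscall(); with vsyscall=none it is not mapped at all.
 */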
void __init map_vsyscall(void)
{
	extern char __vsyscall_page;
	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);

	if (vsyscall_mode != NONE)
		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
			     vsyscall_mode == NATIVE
			     ? PAGE_KERNEL_VSYSCALL
			     : PAGE_KERNEL_VVAR);

	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
		     (unsigned long)VSYSCALL_ADDR);
}
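
/*
 * Illustrative userspace check (hypothetical, not part of this file):
 * with vsyscall=none, get_gate_vma() returns NULL, so the "[vsyscall]"
 * entry disappears from /proc/<pid>/maps and any call to
 * 0xffffffffff600000 is a plain SIGSEGV.
 */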