arch/x86/kernel/process_64.c
/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

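/*
 * old_rsp caches the task's user stack pointer while it runs in the
 * kernel (see start_thread() and __switch_to() below); is_idle marks
 * whether this CPU is currently in the idle loop.
 */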
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

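/*
 * Mark this CPU as idle and tell the idle notifier chain; called from
 * the idle loop with interrupts disabled.
 */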
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

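/*
 * Without CPU hotplug (!CONFIG_SMP) a CPU can never go offline, so the
 * idle loop should never reach play_dead().
 */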
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule).
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without an interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk("\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
		regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
		regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
		regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
		regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
		regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
		regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
		fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
		es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
		cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

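/* Dump the registers of the given frame plus a stack trace for this CPU. */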
void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

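/*
 * Sanity check on thread teardown: a dead 64-bit task should not still
 * own a private LDT at this point.
 */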
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

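/*
 * Install a 32-bit TLS segment with the given base address into one of
 * the task's TLS GDT slots.
 */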
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

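/*
 * Set up the saved register and thread state of a newly forked child:
 * copy the parent's pt_regs (with ax forced to 0 so the child sees a
 * zero return from fork), duplicate the segment selectors, the I/O
 * bitmap and, for CLONE_SETTLS, the thread-local storage setup.
 */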
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	ds_copy_thread(p, me);

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

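/*
 * Reset the register state so the task begins executing userspace code
 * at new_ip with stack new_sp in 64-bit mode; called from the exec path
 * after a new program image has been set up.
 */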
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload. Also
	 * reload when it has changed. When the previous process used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes,
		 * clear the 64-bit base, since an overloaded base is
		 * always mapped to the NULL selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when the next process has a 64-bit base, use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		(unsigned long)task_stack_page(next_p) +
		THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * If the task has used the FPU in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap;
	 * the chances of needing the FPU soon are obviously high now.
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64-bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: This overwrites the user setup. Should have two bits.
	 * But 64-bit processes have always behaved this way, so it's
	 * not too bad. The main problem is just that 32-bit children
	 * are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

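/* If no new stack pointer is given, the child keeps using the parent's. */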
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

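/*
 * Walk the saved frame pointers of a sleeping task to find the first
 * return address outside the scheduler; gives up after 16 frames.
 */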
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

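/*
 * Get or set the FS/GS base of a task. Bases below 4GB are installed
 * via a GDT (TLS) entry because that is cheaper to switch; larger bases
 * go through the FS_BASE/KERNEL_GS_BASE MSRs.
 */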
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * handle small bases via the GDT because that's faster to
		 * switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/*
		 * Not strictly needed for fs, but do it for symmetry
		 * with gs
		 */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * handle small bases via the GDT because that's faster to
		 * switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/*
				 * set the selector to 0 to not confuse
				 * __switch_to
				 */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

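/*
 * Randomize the initial user stack offset by up to 8KB (unless address
 * space randomization is disabled) and keep it 16-byte aligned.
 */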
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

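/* Pick a randomized heap start within 32MB (0x02000000) above the current brk. */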
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}