Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
process_64.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 1995 Linus Torvalds
3  *
4  * Pentium III FXSR, SSE support
5  * Gareth Hughes <[email protected]>, May 2000
6  *
7  * X86-64 port
8  * Andi Kleen.
9  *
10  * CPU hotplug support - [email protected]
11  */
12 
13 /*
14  * This file handles the architecture-dependent parts of process handling..
15  */
16 
17 #include <linux/cpu.h>
18 #include <linux/errno.h>
19 #include <linux/sched.h>
20 #include <linux/fs.h>
21 #include <linux/kernel.h>
22 #include <linux/mm.h>
23 #include <linux/elfcore.h>
24 #include <linux/smp.h>
25 #include <linux/slab.h>
26 #include <linux/user.h>
27 #include <linux/interrupt.h>
28 #include <linux/delay.h>
29 #include <linux/module.h>
30 #include <linux/ptrace.h>
31 #include <linux/notifier.h>
32 #include <linux/kprobes.h>
33 #include <linux/kdebug.h>
34 #include <linux/prctl.h>
35 #include <linux/uaccess.h>
36 #include <linux/io.h>
37 #include <linux/ftrace.h>
38 
39 #include <asm/pgtable.h>
40 #include <asm/processor.h>
41 #include <asm/i387.h>
42 #include <asm/fpu-internal.h>
43 #include <asm/mmu_context.h>
44 #include <asm/prctl.h>
45 #include <asm/desc.h>
46 #include <asm/proto.h>
47 #include <asm/ia32.h>
48 #include <asm/idle.h>
49 #include <asm/syscalls.h>
50 #include <asm/debugreg.h>
51 #include <asm/switch_to.h>
52 
/* Assembly entry point a new task first runs through (entry_64.S). */
asmlinkage extern void ret_from_fork(void);

/* Per-CPU save slot for the user stack pointer across syscall entry. */
DEFINE_PER_CPU(unsigned long, old_rsp);
56 
57 /* Prints also some state that isn't saved in the pt_regs */
58 void __show_regs(struct pt_regs *regs, int all)
59 {
60  unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
61  unsigned long d0, d1, d2, d3, d6, d7;
62  unsigned int fsindex, gsindex;
63  unsigned int ds, cs, es;
64 
66  printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
67  printk_address(regs->ip, 1);
68  printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
69  regs->sp, regs->flags);
70  printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
71  regs->ax, regs->bx, regs->cx);
72  printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
73  regs->dx, regs->si, regs->di);
74  printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
75  regs->bp, regs->r8, regs->r9);
76  printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
77  regs->r10, regs->r11, regs->r12);
78  printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
79  regs->r13, regs->r14, regs->r15);
80 
81  asm("movl %%ds,%0" : "=r" (ds));
82  asm("movl %%cs,%0" : "=r" (cs));
83  asm("movl %%es,%0" : "=r" (es));
84  asm("movl %%fs,%0" : "=r" (fsindex));
85  asm("movl %%gs,%0" : "=r" (gsindex));
86 
87  rdmsrl(MSR_FS_BASE, fs);
88  rdmsrl(MSR_GS_BASE, gs);
89  rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
90 
91  if (!all)
92  return;
93 
94  cr0 = read_cr0();
95  cr2 = read_cr2();
96  cr3 = read_cr3();
97  cr4 = read_cr4();
98 
99  printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
100  fs, fsindex, gs, gsindex, shadowgs);
101  printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
102  es, cr0);
103  printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
104  cr4);
105 
106  get_debugreg(d0, 0);
107  get_debugreg(d1, 1);
108  get_debugreg(d2, 2);
109  printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
110  get_debugreg(d3, 3);
111  get_debugreg(d6, 6);
112  get_debugreg(d7, 7);
113  printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
114 }
115 
116 void release_thread(struct task_struct *dead_task)
117 {
118  if (dead_task->mm) {
119  if (dead_task->mm->context.size) {
120  pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n",
121  dead_task->comm,
122  dead_task->mm->context.ldt,
123  dead_task->mm->context.size);
124  BUG();
125  }
126  }
127 }
128 
129 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
130 {
131  struct user_desc ud = {
132  .base_addr = addr,
133  .limit = 0xfffff,
134  .seg_32bit = 1,
135  .limit_in_pages = 1,
136  .useable = 1,
137  };
138  struct desc_struct *desc = t->thread.tls_array;
139  desc += tls;
140  fill_ldt(desc, &ud);
141 }
142 
143 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
144 {
145  return get_desc_base(&t->thread.tls_array[tls]);
146 }
147 
148 int copy_thread(unsigned long clone_flags, unsigned long sp,
149  unsigned long arg,
150  struct task_struct *p, struct pt_regs *regs)
151 {
152  int err;
153  struct pt_regs *childregs;
154  struct task_struct *me = current;
155 
156  p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
157  childregs = task_pt_regs(p);
158  p->thread.sp = (unsigned long) childregs;
159  p->thread.usersp = me->thread.usersp;
160  set_tsk_thread_flag(p, TIF_FORK);
161  p->fpu_counter = 0;
162  p->thread.io_bitmap_ptr = NULL;
163 
164  savesegment(gs, p->thread.gsindex);
165  p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
166  savesegment(fs, p->thread.fsindex);
167  p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
168  savesegment(es, p->thread.es);
169  savesegment(ds, p->thread.ds);
170  memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
171 
172  if (unlikely(!regs)) {
173  /* kernel thread */
174  memset(childregs, 0, sizeof(struct pt_regs));
175  childregs->sp = (unsigned long)childregs;
176  childregs->ss = __KERNEL_DS;
177  childregs->bx = sp; /* function */
178  childregs->bp = arg;
179  childregs->orig_ax = -1;
180  childregs->cs = __KERNEL_CS | get_kernel_rpl();
181  childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
182  return 0;
183  }
184  *childregs = *regs;
185 
186  childregs->ax = 0;
187  childregs->sp = sp;
188 
189  err = -ENOMEM;
190  memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
191 
192  if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
193  p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
195  if (!p->thread.io_bitmap_ptr) {
196  p->thread.io_bitmap_max = 0;
197  return -ENOMEM;
198  }
199  set_tsk_thread_flag(p, TIF_IO_BITMAP);
200  }
201 
202  /*
203  * Set a new TLS for the child thread?
204  */
205  if (clone_flags & CLONE_SETTLS) {
206 #ifdef CONFIG_IA32_EMULATION
207  if (test_thread_flag(TIF_IA32))
208  err = do_set_thread_area(p, -1,
209  (struct user_desc __user *)childregs->si, 0);
210  else
211 #endif
212  err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
213  if (err)
214  goto out;
215  }
216  err = 0;
217 out:
218  if (err && p->thread.io_bitmap_ptr) {
219  kfree(p->thread.io_bitmap_ptr);
220  p->thread.io_bitmap_max = 0;
221  }
222 
223  return err;
224 }
225 
/*
 * Reset the user-mode register state for exec().
 *
 * Clears %fs and the shadow GS base (selector 0 via loadsegment/
 * load_gs_index), loads @_ds into %es/%ds, and rewrites the saved frame
 * so the task resumes at @new_ip on stack @new_sp with @_cs/@_ss and
 * interrupts enabled.  The per-CPU old_rsp slot is kept in sync with
 * the new user stack pointer.
 */
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	current->thread.usersp = new_sp;
	regs->ip = new_ip;
	regs->sp = new_sp;
	this_cpu_write(old_rsp, new_sp);
	regs->cs = _cs;
	regs->ss = _ss;
	/* Fresh EFLAGS: only IF set — no leftover arithmetic/trap flags. */
	regs->flags = X86_EFLAGS_IF;
}
243 
244 void
245 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
246 {
247  start_thread_common(regs, new_ip, new_sp,
248  __USER_CS, __USER_DS, 0);
249 }
250 
#ifdef CONFIG_IA32_EMULATION
/*
 * exec() entry for compat binaries.  x32 tasks run 64-bit code
 * (__USER_CS) with the 32-bit address-space limit; plain ia32 tasks use
 * the 32-bit code segment.  The selector arguments were truncated in
 * this copy and are restored per the upstream source.
 */
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER32_DS);
}
#endif
260 
261 /*
262  * switch_to(x,y) should switch tasks from x to y.
263  *
264  * This could still be optimized:
265  * - fold all the options into a flag word and test it with a single test.
266  * - could test fs/gs bitsliced
267  *
268  * Kprobes not supported here. Set the probe on schedule instead.
269  * Function graph tracer not supported too.
270  */
272 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
273 {
274  struct thread_struct *prev = &prev_p->thread;
275  struct thread_struct *next = &next_p->thread;
276  int cpu = smp_processor_id();
277  struct tss_struct *tss = &per_cpu(init_tss, cpu);
278  unsigned fsindex, gsindex;
280 
281  fpu = switch_fpu_prepare(prev_p, next_p, cpu);
282 
283  /*
284  * Reload esp0, LDT and the page table pointer:
285  */
286  load_sp0(tss, next);
287 
288  /*
289  * Switch DS and ES.
290  * This won't pick up thread selector changes, but I guess that is ok.
291  */
292  savesegment(es, prev->es);
293  if (unlikely(next->es | prev->es))
294  loadsegment(es, next->es);
295 
296  savesegment(ds, prev->ds);
297  if (unlikely(next->ds | prev->ds))
298  loadsegment(ds, next->ds);
299 
300 
301  /* We must save %fs and %gs before load_TLS() because
302  * %fs and %gs may be cleared by load_TLS().
303  *
304  * (e.g. xen_load_tls())
305  */
306  savesegment(fs, fsindex);
307  savesegment(gs, gsindex);
308 
309  load_TLS(next, cpu);
310 
311  /*
312  * Leave lazy mode, flushing any hypercalls made here.
313  * This must be done before restoring TLS segments so
314  * the GDT and LDT are properly updated, and must be
315  * done before math_state_restore, so the TS bit is up
316  * to date.
317  */
318  arch_end_context_switch(next_p);
319 
320  /*
321  * Switch FS and GS.
322  *
323  * Segment register != 0 always requires a reload. Also
324  * reload when it has changed. When prev process used 64bit
325  * base always reload to avoid an information leak.
326  */
327  if (unlikely(fsindex | next->fsindex | prev->fs)) {
328  loadsegment(fs, next->fsindex);
329  /*
330  * Check if the user used a selector != 0; if yes
331  * clear 64bit base, since overloaded base is always
332  * mapped to the Null selector
333  */
334  if (fsindex)
335  prev->fs = 0;
336  }
337  /* when next process has a 64bit base use it */
338  if (next->fs)
339  wrmsrl(MSR_FS_BASE, next->fs);
340  prev->fsindex = fsindex;
341 
342  if (unlikely(gsindex | next->gsindex | prev->gs)) {
343  load_gs_index(next->gsindex);
344  if (gsindex)
345  prev->gs = 0;
346  }
347  if (next->gs)
348  wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
349  prev->gsindex = gsindex;
350 
351  switch_fpu_finish(next_p, fpu);
352 
353  /*
354  * Switch the PDA and FPU contexts.
355  */
356  prev->usersp = this_cpu_read(old_rsp);
357  this_cpu_write(old_rsp, next->usersp);
358  this_cpu_write(current_task, next_p);
359 
360  this_cpu_write(kernel_stack,
361  (unsigned long)task_stack_page(next_p) +
363 
364  /*
365  * Now maybe reload the debug registers and handle I/O bitmaps
366  */
369  __switch_to_xtra(prev_p, next_p, tss);
370 
371  return prev_p;
372 }
373 
375 {
376  /* inherit personality from parent */
377 
378  /* Make sure to be in 64bit mode */
379  clear_thread_flag(TIF_IA32);
380  clear_thread_flag(TIF_ADDR32);
381  clear_thread_flag(TIF_X32);
382 
383  /* Ensure the corresponding mm is not marked. */
384  if (current->mm)
385  current->mm->context.ia32_compat = 0;
386 
387  /* TBD: overwrites user setup. Should have two bits.
388  But 64bit processes have always behaved this way,
389  so it's not too bad. The main problem is just that
390  32bit childs are affected again. */
391  current->personality &= ~READ_IMPLIES_EXEC;
392 }
393 
/*
 * Put the current task into a compat execution mode at exec time:
 * x32 (@x32 true: 64-bit instructions, 32-bit address space) or
 * classic ia32 (@x32 false).
 */
void set_personality_ia32(bool x32)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	if (x32) {
		clear_thread_flag(TIF_IA32);
		set_thread_flag(TIF_X32);
		current->personality &= ~READ_IMPLIES_EXEC;
		/* is_compat_task() uses the presence of the x32
		   syscall bit flag to determine compat status */
		current_thread_info()->status &= ~TS_COMPAT;
	} else {
		set_thread_flag(TIF_IA32);
		clear_thread_flag(TIF_X32);
		current->personality |= force_personality32;
		/* Prepare the first "return" to user space */
		current_thread_info()->status |= TS_COMPAT;
	}
}
421 
422 unsigned long get_wchan(struct task_struct *p)
423 {
424  unsigned long stack;
425  u64 fp, ip;
426  int count = 0;
427 
428  if (!p || p == current || p->state == TASK_RUNNING)
429  return 0;
430  stack = (unsigned long)task_stack_page(p);
431  if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
432  return 0;
433  fp = *(u64 *)(p->thread.sp);
434  do {
435  if (fp < (unsigned long)stack ||
436  fp >= (unsigned long)stack+THREAD_SIZE)
437  return 0;
438  ip = *(u64 *)(fp+8);
439  if (!in_sched_functions(ip))
440  return ip;
441  fp = *(u64 *)fp;
442  } while (count++ < 16);
443  return 0;
444 }
445 
/*
 * Implementation of arch_prctl(2): get or set the FS/GS segment bases
 * for @task (the caller itself, or a ptrace child via copy_thread).
 *
 * Small bases (<= 4GiB) are installed through a GDT TLS slot because
 * reloading a selector is cheaper at context switch than rewriting the
 * base MSR; large bases go into MSR_FS_BASE / MSR_KERNEL_GS_BASE with a
 * null selector.  Hardware is only touched when @task == current.
 *
 * Returns 0 on success, -EPERM for an address beyond the task's
 * address-space limit, -EINVAL for an unknown @code, or the result of
 * put_user()/wrmsrl_safe().
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;	/* only touch registers for self */
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = wrmsrl_safe(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		/* @addr is the user buffer for GET operations. */
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			/* A live non-zero %gs means the MSR holds the base. */
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
540 
541 long sys_arch_prctl(int code, unsigned long addr)
542 {
543  return do_arch_prctl(current, code, addr);
544 }
545 
546 unsigned long KSTK_ESP(struct task_struct *task)
547 {
548  return (test_tsk_thread_flag(task, TIF_IA32)) ?
549  (task_pt_regs(task)->sp) : ((task)->thread.usersp);
550 }