Linux Kernel  3.7.1
kprobes.c
1 /*
2  * Kernel Probes (KProbes)
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17  *
18  * Copyright (C) IBM Corporation, 2002, 2004
19  *
20  * 2002-Oct Created by Vamsi Krishna S <[email protected]> Kernel
21  * Probes initial implementation ( includes contributions from
22  * Rusty Russell).
23  * 2004-July Suparna Bhattacharya <[email protected]> added jumper probes
24  * interface to access function arguments.
25  * 2004-Oct Jim Keniston <[email protected]> and Prasanna S Panchamukhi
26  * <[email protected]> adapted for x86_64 from i386.
27  * 2005-Mar Roland McGrath <[email protected]>
28  * Fixed to handle %rip-relative addressing mode correctly.
29  * 2005-May Hien Nguyen <[email protected]>, Jim Keniston
30  * <[email protected]> and Prasanna S Panchamukhi
31  * <[email protected]> added function-return probes.
32  * 2005-May Rusty Lynch <[email protected]>
33  * Added function return probes functionality
34  * 2006-Feb Masami Hiramatsu <[email protected]> added
35  * kprobe-booster and kretprobe-booster for i386.
36  * 2007-Dec Masami Hiramatsu <[email protected]> added kprobe-booster
37  * and kretprobe-booster for x86-64
38  * 2007-Dec Masami Hiramatsu <[email protected]>, Arjan van de Ven
39  * <[email protected]> and Jim Keniston <[email protected]>
40  * unified x86 kprobes code.
41  */
42 #include <linux/kprobes.h>
43 #include <linux/ptrace.h>
44 #include <linux/string.h>
45 #include <linux/slab.h>
46 #include <linux/hardirq.h>
47 #include <linux/preempt.h>
48 #include <linux/module.h>
49 #include <linux/kdebug.h>
50 #include <linux/kallsyms.h>
51 #include <linux/ftrace.h>
52 
53 #include <asm/cacheflush.h>
54 #include <asm/desc.h>
55 #include <asm/pgtable.h>
56 #include <asm/uaccess.h>
57 #include <asm/alternative.h>
58 #include <asm/insn.h>
59 #include <asm/debugreg.h>
60 
61 #include "kprobes-common.h"
62 
63 void jprobe_return_end(void);
64 
65 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
66 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
67 
68 #define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
69 
70 #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
71  (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
72  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
73  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
74  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
75  << (row % 32))
76  /*
77  * Undefined/reserved opcodes, conditional jump, Opcode Extension
78  * Groups, and some special opcodes cannot be boosted.
79  * This is non-const and volatile to keep gcc from statically
80  * optimizing it out, as variable_test_bit makes gcc think only
81  * *(unsigned long*) is used.
82  */
83 static volatile u32 twobyte_is_boostable[256 / 32] = {
84  /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
85  /* ---------------------------------------------- */
86  W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
87  W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */
88  W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
89  W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
90  W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
91  W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
92  W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
93  W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
94  W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
95  W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
96  W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
97  W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
98  W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
99  W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
100  W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
101  W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */
102  /* ----------------------------------------------- */
103  /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
104 };
105 #undef W
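
For reference, the table above packs one flag bit per two-byte opcode: eight 32-bit words, with word index opcode / 32 and bit index opcode % 32. A minimal user-space sketch (not part of this file) of the same lookup, using the hypothetical helper name is_boostable_2byte(), might look like this:

#include <stdint.h>

/* Query one bit of an 8-word bitmap laid out like twobyte_is_boostable. */
static inline int is_boostable_2byte(const uint32_t table[8], uint8_t opcode)
{
	/* word index = opcode / 32, bit index = opcode % 32 */
	return (table[opcode >> 5] >> (opcode & 0x1f)) & 1;
}
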
106 
107 struct kretprobe_blackpoint kretprobe_blacklist[] = {
108  {"__switch_to", }, /* This function switches only current task, but
109  doesn't switch kernel stack.*/
110  {NULL, NULL} /* Terminator */
111 };
112 
113 const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
114 
115 static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
116 {
117  struct __arch_relative_insn {
118  u8 op;
119  s32 raddr;
120  } __attribute__((packed)) *insn;
121 
122  insn = (struct __arch_relative_insn *)from;
123  insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
124  insn->op = op;
125 }
126 
127 /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
128 void __kprobes synthesize_reljump(void *from, void *to)
129 {
130  __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
131 }
132 
133 /* Insert a call instruction at address 'from', which calls address 'to'.*/
134 void __kprobes synthesize_relcall(void *from, void *to)
135 {
136  __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
137 }
138 
139 /*
140  * Skip the prefixes of the instruction.
141  */
142 static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
143 {
144  insn_attr_t attr;
145 
146  attr = inat_get_opcode_attribute((insn_byte_t)*insn);
147  while (inat_is_legacy_prefix(attr)) {
148  insn++;
149  attr = inat_get_opcode_attribute((insn_byte_t)*insn);
150  }
151 #ifdef CONFIG_X86_64
152  if (inat_is_rex_prefix(attr))
153  insn++;
154 #endif
155  return insn;
156 }
157 
158 /*
159  * Returns non-zero if opcode is boostable.
160  * RIP-relative instructions are adjusted at copying time in 64-bit mode.
161  */
162 int __kprobes can_boost(kprobe_opcode_t *opcodes)
163 {
164  kprobe_opcode_t opcode;
165  kprobe_opcode_t *orig_opcodes = opcodes;
166 
167  if (search_exception_tables((unsigned long)opcodes))
168  return 0; /* Page fault may occur on this address. */
169 
170 retry:
171  if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
172  return 0;
173  opcode = *(opcodes++);
174 
175  /* 2nd-byte opcode */
176  if (opcode == 0x0f) {
177  if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
178  return 0;
179  return test_bit(*opcodes,
180  (unsigned long *)twobyte_is_boostable);
181  }
182 
183  switch (opcode & 0xf0) {
184 #ifdef CONFIG_X86_64
185  case 0x40:
186  goto retry; /* REX prefix is boostable */
187 #endif
188  case 0x60:
189  if (0x63 < opcode && opcode < 0x67)
190  goto retry; /* prefixes */
191  /* can't boost Address-size override and bound */
192  return (opcode != 0x62 && opcode != 0x67);
193  case 0x70:
194  return 0; /* can't boost conditional jump */
195  case 0xc0:
196  /* can't boost software-interruptions */
197  return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
198  case 0xd0:
199  /* can boost AA* and XLAT */
200  return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
201  case 0xe0:
202  /* can boost in/out and absolute jmps */
203  return ((opcode & 0x04) || opcode == 0xea);
204  case 0xf0:
205  if ((opcode & 0x0c) == 0 && opcode != 0xf1)
206  goto retry; /* lock/rep(ne) prefix */
207  /* clear and set flags are boostable */
208  return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
209  default:
210  /* segment override prefixes are boostable */
211  if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
212  goto retry; /* prefixes */
213  /* CS override prefix and call are not boostable */
214  return (opcode != 0x2e && opcode != 0x9a);
215  }
216 }
217 
218 static unsigned long
219 __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
220 {
221  struct kprobe *kp;
222 
223  kp = get_kprobe((void *)addr);
224  /* There is no probe, return original address */
225  if (!kp)
226  return addr;
227 
228  /*
229  * Basically, kp->ainsn.insn holds the original instruction.
230  * However, a RIP-relative instruction cannot be single-stepped at a
231  * different place, so __copy_instruction() tweaks the displacement of
232  * that instruction. In that case, we can't recover the instruction
233  * from kp->ainsn.insn.
234  *
235  * On the other hand, kp->opcode has a copy of the first byte of
236  * the probed instruction, which is overwritten by int3. And since
237  * the instruction at kp->addr is not modified by kprobes except
238  * for the first byte, we can recover the original instruction
239  * from it and kp->opcode.
240  */
241  memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
242  buf[0] = kp->opcode;
243  return (unsigned long)buf;
244 }
245 
246 /*
247  * Recover the probed instruction at addr for further analysis.
248  * The caller must hold kprobe_mutex or disable preemption to prevent
249  * the referenced kprobes from being released.
250  */
251 unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
252 {
253  unsigned long __addr;
254 
255  __addr = __recover_optprobed_insn(buf, addr);
256  if (__addr != addr)
257  return __addr;
258 
259  return __recover_probed_insn(buf, addr);
260 }
261 
262 /* Check if paddr is at an instruction boundary */
263 static int __kprobes can_probe(unsigned long paddr)
264 {
265  unsigned long addr, __addr, offset = 0;
266  struct insn insn;
267  kprobe_opcode_t buf[MAX_INSN_SIZE];
268 
269  if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
270  return 0;
271 
272  /* Decode instructions */
273  addr = paddr - offset;
274  while (addr < paddr) {
275  /*
276  * Check if the instruction has been modified by another
277  * kprobe, in which case we replace the breakpoint by the
278  * original instruction in our buffer.
279  * Also, jump optimization will change the breakpoint to
280  * relative-jump. Since the relative-jump itself is
281  * normally used, we just go through if there is no kprobe.
282  */
283  __addr = recover_probed_instruction(buf, addr);
284  kernel_insn_init(&insn, (void *)__addr);
285  insn_get_length(&insn);
286 
287  /*
288  * Another debugging subsystem might insert this breakpoint.
289  * In that case, we can't recover it.
290  */
291  if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
292  return 0;
293  addr += insn.length;
294  }
295 
296  return (addr == paddr);
297 }
298 
299 /*
300  * Returns non-zero if opcode modifies the interrupt flag.
301  */
302 static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
303 {
304  /* Skip prefixes */
305  insn = skip_prefixes(insn);
306 
307  switch (*insn) {
308  case 0xfa: /* cli */
309  case 0xfb: /* sti */
310  case 0xcf: /* iret/iretd */
311  case 0x9d: /* popf/popfd */
312  return 1;
313  }
314 
315  return 0;
316 }
317 
318 /*
319  * Copy an instruction and adjust the displacement if the instruction
320  * uses the %rip-relative addressing mode.
321  * Returns the length of the copied instruction, or zero if the
322  * instruction cannot be recovered.
323  * The displacement adjustment only applies to 64-bit x86.
324  */
325 int __kprobes __copy_instruction(u8 *dest, u8 *src)
326 {
327  struct insn insn;
328  kprobe_opcode_t buf[MAX_INSN_SIZE];
329 
330  kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));
331  insn_get_length(&insn);
332  /* Another subsystem puts a breakpoint, failed to recover */
333  if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
334  return 0;
335  memcpy(dest, insn.kaddr, insn.length);
336 
337 #ifdef CONFIG_X86_64
338  if (insn_rip_relative(&insn)) {
339  s64 newdisp;
340  u8 *disp;
341  kernel_insn_init(&insn, dest);
342  insn_get_displacement(&insn);
343  /*
344  * The copied instruction uses the %rip-relative addressing
345  * mode. Adjust the displacement for the difference between
346  * the original location of this instruction and the location
347  * of the copy that will actually be run. The tricky bit here
348  * is making sure that the sign extension happens correctly in
349  * this calculation, since we need a signed 32-bit result to
350  * be sign-extended to 64 bits when it's added to the %rip
351  * value and yield the same 64-bit result that the sign-
352  * extension of the original signed 32-bit displacement would
353  * have given.
354  */
355  newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
356  BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
357  disp = (u8 *) dest + insn_offset_displacement(&insn);
358  *(s32 *) disp = (s32) newdisp;
359  }
360 #endif
361  return insn.length;
362 }
363 
364 static void __kprobes arch_copy_kprobe(struct kprobe *p)
365 {
366  /* Copy the instruction, recovering it first if another optprobe has modified it. */
367  __copy_instruction(p->ainsn.insn, p->addr);
368 
369  /*
370  * __copy_instruction can modify the displacement of the instruction,
371  * but that does not affect the boostability check.
372  */
373  if (can_boost(p->ainsn.insn))
374  p->ainsn.boostable = 0;
375  else
376  p->ainsn.boostable = -1;
377 
378  /* Also, displacement change doesn't affect the first byte */
379  p->opcode = p->ainsn.insn[0];
380 }
381 
382 int __kprobes arch_prepare_kprobe(struct kprobe *p)
383 {
384  if (alternatives_text_reserved(p->addr, p->addr))
385  return -EINVAL;
386 
387  if (!can_probe((unsigned long)p->addr))
388  return -EILSEQ;
389  /* insn: must be on special executable page on x86. */
390  p->ainsn.insn = get_insn_slot();
391  if (!p->ainsn.insn)
392  return -ENOMEM;
393  arch_copy_kprobe(p);
394  return 0;
395 }
396 
397 void __kprobes arch_arm_kprobe(struct kprobe *p)
398 {
399  text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
400 }
401 
402 void __kprobes arch_disarm_kprobe(struct kprobe *p)
403 {
404  text_poke(p->addr, &p->opcode, 1);
405 }
406 
407 void __kprobes arch_remove_kprobe(struct kprobe *p)
408 {
409  if (p->ainsn.insn) {
410  free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
411  p->ainsn.insn = NULL;
412  }
413 }
414 
415 static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
416 {
417  kcb->prev_kprobe.kp = kprobe_running();
418  kcb->prev_kprobe.status = kcb->kprobe_status;
419  kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
420  kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
421 }
422 
423 static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
424 {
425  __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
426  kcb->kprobe_status = kcb->prev_kprobe.status;
427  kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
428  kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
429 }
430 
431 static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
432  struct kprobe_ctlblk *kcb)
433 {
434  __this_cpu_write(current_kprobe, p);
435  kcb->kprobe_saved_flags = kcb->kprobe_old_flags
436  = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
437  if (is_IF_modifier(p->ainsn.insn))
438  kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
439 }
440 
441 static void __kprobes clear_btf(void)
442 {
443  if (test_thread_flag(TIF_BLOCKSTEP)) {
444  unsigned long debugctl = get_debugctlmsr();
445 
446  debugctl &= ~DEBUGCTLMSR_BTF;
447  update_debugctlmsr(debugctl);
448  }
449 }
450 
451 static void __kprobes restore_btf(void)
452 {
453  if (test_thread_flag(TIF_BLOCKSTEP)) {
454  unsigned long debugctl = get_debugctlmsr();
455 
456  debugctl |= DEBUGCTLMSR_BTF;
457  update_debugctlmsr(debugctl);
458  }
459 }
460 
461 void __kprobes
462 arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
463 {
464  unsigned long *sara = stack_addr(regs);
465 
466  ri->ret_addr = (kprobe_opcode_t *) *sara;
467 
468  /* Replace the return addr with trampoline addr */
469  *sara = (unsigned long) &kretprobe_trampoline;
470 }
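
arch_prepare_kretprobe() above is what redirects a probed function's return through kretprobe_trampoline. From the caller's side this is driven by register_kretprobe(); a minimal sketch of such a module follows (not part of this file; the target symbol "do_fork", the handler name, and maxactive value are illustrative assumptions):

#include <linux/module.h>
#include <linux/kprobes.h>

/* Runs when the probed function returns, reached via kretprobe_trampoline. */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	pr_info("probed function returned %lu\n", regs_return_value(regs));
	return 0;
}

static struct kretprobe my_kretprobe = {
	.handler	= ret_handler,
	.kp.symbol_name	= "do_fork",	/* illustrative target */
	.maxactive	= 16,		/* concurrent instances to track */
};

static int __init kretprobe_sketch_init(void)
{
	return register_kretprobe(&my_kretprobe);
}

static void __exit kretprobe_sketch_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(kretprobe_sketch_init);
module_exit(kretprobe_sketch_exit);
MODULE_LICENSE("GPL");
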
471 
472 static void __kprobes
473 setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
474 {
475  if (setup_detour_execution(p, regs, reenter))
476  return;
477 
478 #if !defined(CONFIG_PREEMPT)
479  if (p->ainsn.boostable == 1 && !p->post_handler) {
480  /* Boost up -- we can execute copied instructions directly */
481  if (!reenter)
482  reset_current_kprobe();
483  /*
484  * Reentering boosted probe doesn't reset current_kprobe,
485  * nor set current_kprobe, because it doesn't use single
486  * stepping.
487  */
488  regs->ip = (unsigned long)p->ainsn.insn;
489  preempt_enable_no_resched();
490  return;
491  }
492 #endif
493  if (reenter) {
494  save_previous_kprobe(kcb);
495  set_current_kprobe(p, regs, kcb);
496  kcb->kprobe_status = KPROBE_REENTER;
497  } else
498  kcb->kprobe_status = KPROBE_HIT_SS;
499  /* Prepare real single stepping */
500  clear_btf();
501  regs->flags |= X86_EFLAGS_TF;
502  regs->flags &= ~X86_EFLAGS_IF;
503  /* single step inline if the instruction is an int3 */
504  if (p->opcode == BREAKPOINT_INSTRUCTION)
505  regs->ip = (unsigned long)p->addr;
506  else
507  regs->ip = (unsigned long)p->ainsn.insn;
508 }
509 
510 /*
511  * We have reentered the kprobe_handler(), since another probe was hit while
512  * within the handler. We save the original kprobes variables and just
513  * single-step the instruction of the new probe without calling any user handlers.
514  */
515 static int __kprobes
516 reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
517 {
518  switch (kcb->kprobe_status) {
519  case KPROBE_HIT_SSDONE:
520  case KPROBE_HIT_ACTIVE:
521  kprobes_inc_nmissed_count(p);
522  setup_singlestep(p, regs, kcb, 1);
523  break;
524  case KPROBE_HIT_SS:
525  /* A probe has been hit in the codepath leading up to, or just
526  * after, single-stepping of a probed instruction. This entire
527  * codepath should strictly reside in .kprobes.text section.
528  * Raise a BUG or we'll continue in an endless reentering loop
529  * and eventually a stack overflow.
530  */
531  printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
532  p->addr);
533  dump_kprobe(p);
534  BUG();
535  default:
536  /* impossible cases */
537  WARN_ON(1);
538  return 0;
539  }
540 
541  return 1;
542 }
543 
544 #ifdef KPROBES_CAN_USE_FTRACE
545 static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
546  struct kprobe_ctlblk *kcb)
547 {
548  /*
549  * Emulate singlestep (and also recover regs->ip)
550  * as if there were a 5-byte nop
551  */
552  regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
553  if (unlikely(p->post_handler)) {
554  kcb->kprobe_status = KPROBE_HIT_SSDONE;
555  p->post_handler(p, regs, 0);
556  }
557  __this_cpu_write(current_kprobe, NULL);
558 }
559 #endif
560 
561 /*
562  * Interrupts are disabled on entry as trap3 is an interrupt gate and they
563  * remain disabled throughout this function.
564  */
565 static int __kprobes kprobe_handler(struct pt_regs *regs)
566 {
567  kprobe_opcode_t *addr;
568  struct kprobe *p;
569  struct kprobe_ctlblk *kcb;
570 
571  addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
572  /*
573  * We don't want to be preempted for the entire
574  * duration of kprobe processing. We conditionally
575  * re-enable preemption at the end of this function,
576  * and also in reenter_kprobe() and setup_singlestep().
577  */
578  preempt_disable();
579 
580  kcb = get_kprobe_ctlblk();
581  p = get_kprobe(addr);
582 
583  if (p) {
584  if (kprobe_running()) {
585  if (reenter_kprobe(p, regs, kcb))
586  return 1;
587  } else {
588  set_current_kprobe(p, regs, kcb);
589  kcb->kprobe_status = KPROBE_HIT_ACTIVE;
590 
591  /*
592  * If we have no pre-handler or it returned 0, we
593  * continue with normal processing. If we have a
594  * pre-handler and it returned non-zero, it prepped
595  * for calling the break_handler below on re-entry
596  * for jprobe processing, so get out doing nothing
597  * more here.
598  */
599  if (!p->pre_handler || !p->pre_handler(p, regs))
600  setup_singlestep(p, regs, kcb, 0);
601  return 1;
602  }
603  } else if (*addr != BREAKPOINT_INSTRUCTION) {
604  /*
605  * The breakpoint instruction was removed right
606  * after we hit it. Another cpu has removed
607  * either a probepoint or a debugger breakpoint
608  * at this address. In either case, no further
609  * handling of this interrupt is appropriate.
610  * Back up over the (now missing) int3 and run
611  * the original instruction.
612  */
613  regs->ip = (unsigned long)addr;
614  preempt_enable_no_resched();
615  return 1;
616  } else if (kprobe_running()) {
617  p = __this_cpu_read(current_kprobe);
618  if (p->break_handler && p->break_handler(p, regs)) {
619 #ifdef KPROBES_CAN_USE_FTRACE
620  if (kprobe_ftrace(p)) {
621  skip_singlestep(p, regs, kcb);
622  return 1;
623  }
624 #endif
625  setup_singlestep(p, regs, kcb, 0);
626  return 1;
627  }
628  } /* else: not a kprobe fault; let the kernel handle it */
629 
630  preempt_enable_no_resched();
631  return 0;
632 }
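
kprobe_handler() above is reached from the int3 trap when a registered probe fires: the user pre-handler runs here, and the post-handler runs after single-stepping in post_kprobe_handler(). A minimal sketch of a module exercising that path follows (not part of this file; the target symbol "do_fork" and handler names are illustrative assumptions):

#include <linux/module.h>
#include <linux/kprobes.h>

/* Called from kprobe_handler() before the probed instruction runs. */
static int pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("pre: hit at %p, ip=%lx\n", p->addr, regs->ip);
	return 0;	/* 0: continue with single-stepping */
}

/* Called from post_kprobe_handler() after single-stepping completes. */
static void post_handler(struct kprobe *p, struct pt_regs *regs,
			 unsigned long flags)
{
	pr_info("post: single-step of %p done\n", p->addr);
}

static struct kprobe my_kprobe = {
	.symbol_name	= "do_fork",	/* illustrative target */
	.pre_handler	= pre_handler,
	.post_handler	= post_handler,
};

static int __init kprobe_sketch_init(void)
{
	return register_kprobe(&my_kprobe);
}

static void __exit kprobe_sketch_exit(void)
{
	unregister_kprobe(&my_kprobe);
}

module_init(kprobe_sketch_init);
module_exit(kprobe_sketch_exit);
MODULE_LICENSE("GPL");
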
633 
634 /*
635  * When a retprobed function returns, this code saves registers and
636  * calls trampoline_handler(), which in turn calls the kretprobe's handler.
637  */
638 static void __used __kprobes kretprobe_trampoline_holder(void)
639 {
640  asm volatile (
641  ".global kretprobe_trampoline\n"
642  "kretprobe_trampoline: \n"
643 #ifdef CONFIG_X86_64
644  /* We don't bother saving the ss register */
645  " pushq %rsp\n"
646  " pushfq\n"
648  " movq %rsp, %rdi\n"
649  " call trampoline_handler\n"
650  /* Replace saved sp with true return address. */
651  " movq %rax, 152(%rsp)\n"
653  " popfq\n"
654 #else
655  " pushf\n"
657  " movl %esp, %eax\n"
658  " call trampoline_handler\n"
659  /* Move flags to cs */
660  " movl 56(%esp), %edx\n"
661  " movl %edx, 52(%esp)\n"
662  /* Replace saved flags with true return address. */
663  " movl %eax, 56(%esp)\n"
665  " popf\n"
666 #endif
667  " ret\n");
668 }
669 
670 /*
671  * Called from kretprobe_trampoline
672  */
673 static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
674 {
675  struct kretprobe_instance *ri = NULL;
676  struct hlist_head *head, empty_rp;
677  struct hlist_node *node, *tmp;
678  unsigned long flags, orig_ret_address = 0;
679  unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
680  kprobe_opcode_t *correct_ret_addr = NULL;
681 
682  INIT_HLIST_HEAD(&empty_rp);
683  kretprobe_hash_lock(current, &head, &flags);
684  /* fixup registers */
685 #ifdef CONFIG_X86_64
686  regs->cs = __KERNEL_CS;
687 #else
688  regs->cs = __KERNEL_CS | get_kernel_rpl();
689  regs->gs = 0;
690 #endif
691  regs->ip = trampoline_address;
692  regs->orig_ax = ~0UL;
693 
694  /*
695  * It is possible to have multiple instances associated with a given
696  * task either because multiple functions in the call path have
697  * return probes installed on them, and/or more than one
698  * return probe was registered for a target function.
699  *
700  * We can handle this because:
701  * - instances are always pushed into the head of the list
702  * - when multiple return probes are registered for the same
703  * function, the (chronologically) first instance's ret_addr
704  * will be the real return address, and all the rest will
705  * point to kretprobe_trampoline.
706  */
707  hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
708  if (ri->task != current)
709  /* another task is sharing our hash bucket */
710  continue;
711 
712  orig_ret_address = (unsigned long)ri->ret_addr;
713 
714  if (orig_ret_address != trampoline_address)
715  /*
716  * This is the real return address. Any other
717  * instances associated with this task are for
718  * other calls deeper on the call stack
719  */
720  break;
721  }
722 
723  kretprobe_assert(ri, orig_ret_address, trampoline_address);
724 
725  correct_ret_addr = ri->ret_addr;
726  hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
727  if (ri->task != current)
728  /* another task is sharing our hash bucket */
729  continue;
730 
731  orig_ret_address = (unsigned long)ri->ret_addr;
732  if (ri->rp && ri->rp->handler) {
733  __this_cpu_write(current_kprobe, &ri->rp->kp);
734  get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
735  ri->ret_addr = correct_ret_addr;
736  ri->rp->handler(ri, regs);
737  __this_cpu_write(current_kprobe, NULL);
738  }
739 
740  recycle_rp_inst(ri, &empty_rp);
741 
742  if (orig_ret_address != trampoline_address)
743  /*
744  * This is the real return address. Any other
745  * instances associated with this task are for
746  * other calls deeper on the call stack
747  */
748  break;
749  }
750 
751  kretprobe_hash_unlock(current, &flags);
752 
753  hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
754  hlist_del(&ri->hlist);
755  kfree(ri);
756  }
757  return (void *)orig_ret_address;
758 }
759 
760 /*
761  * Called after single-stepping. p->addr is the address of the
762  * instruction whose first byte has been replaced by the "int 3"
763  * instruction. To avoid the SMP problems that can occur when we
764  * temporarily put back the original opcode to single-step, we
765  * single-stepped a copy of the instruction. The address of this
766  * copy is p->ainsn.insn.
767  *
768  * This function prepares to return from the post-single-step
769  * interrupt. We have to fix up the stack as follows:
770  *
771  * 0) Except in the case of absolute or indirect jump or call instructions,
772  * the new ip is relative to the copied instruction. We need to make
773  * it relative to the original instruction.
774  *
775  * 1) If the single-stepped instruction was pushfl, then the TF and IF
776  * flags are set in the just-pushed flags, and may need to be cleared.
777  *
778  * 2) If the single-stepped instruction was a call, the return address
779  * that is atop the stack is the address following the copied instruction.
780  * We need to make it the address following the original instruction.
781  *
782  * If this is the first time we've single-stepped the instruction at
783  * this probepoint, and the instruction is boostable, boost it: add a
784  * jump instruction after the copied instruction, that jumps to the next
785  * instruction after the probepoint.
786  */
787 static void __kprobes
788 resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
789 {
790  unsigned long *tos = stack_addr(regs);
791  unsigned long copy_ip = (unsigned long)p->ainsn.insn;
792  unsigned long orig_ip = (unsigned long)p->addr;
793  kprobe_opcode_t *insn = p->ainsn.insn;
794 
795  /* Skip prefixes */
796  insn = skip_prefixes(insn);
797 
798  regs->flags &= ~X86_EFLAGS_TF;
799  switch (*insn) {
800  case 0x9c: /* pushfl */
801  *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
802  *tos |= kcb->kprobe_old_flags;
803  break;
804  case 0xc2: /* iret/ret/lret */
805  case 0xc3:
806  case 0xca:
807  case 0xcb:
808  case 0xcf:
809  case 0xea: /* jmp absolute -- ip is correct */
810  /* ip is already adjusted, no more changes required */
811  p->ainsn.boostable = 1;
812  goto no_change;
813  case 0xe8: /* call relative - Fix return addr */
814  *tos = orig_ip + (*tos - copy_ip);
815  break;
816 #ifdef CONFIG_X86_32
817  case 0x9a: /* call absolute -- same as call absolute, indirect */
818  *tos = orig_ip + (*tos - copy_ip);
819  goto no_change;
820 #endif
821  case 0xff:
822  if ((insn[1] & 0x30) == 0x10) {
823  /*
824  * call absolute, indirect
825  * Fix return addr; ip is correct.
826  * But this is not boostable
827  */
828  *tos = orig_ip + (*tos - copy_ip);
829  goto no_change;
830  } else if (((insn[1] & 0x31) == 0x20) ||
831  ((insn[1] & 0x31) == 0x21)) {
832  /*
833  * jmp near and far, absolute indirect
834  * ip is correct. And this is boostable
835  */
836  p->ainsn.boostable = 1;
837  goto no_change;
838  }
839  default:
840  break;
841  }
842 
843  if (p->ainsn.boostable == 0) {
844  if ((regs->ip > copy_ip) &&
845  (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
846  /*
847  * The instruction can be executed directly if it
848  * jumps back to the correct address.
849  */
850  synthesize_reljump((void *)regs->ip,
851  (void *)orig_ip + (regs->ip - copy_ip));
852  p->ainsn.boostable = 1;
853  } else {
854  p->ainsn.boostable = -1;
855  }
856  }
857 
858  regs->ip += orig_ip - copy_ip;
859 
860 no_change:
861  restore_btf();
862 }
863 
864 /*
865  * Interrupts are disabled on entry as trap1 is an interrupt gate and they
866  * remain disabled throughout this function.
867  */
868 static int __kprobes post_kprobe_handler(struct pt_regs *regs)
869 {
870  struct kprobe *cur = kprobe_running();
871  struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
872 
873  if (!cur)
874  return 0;
875 
876  resume_execution(cur, regs, kcb);
877  regs->flags |= kcb->kprobe_saved_flags;
878 
879  if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
880  kcb->kprobe_status = KPROBE_HIT_SSDONE;
881  cur->post_handler(cur, regs, 0);
882  }
883 
884  /* Restore back the original saved kprobes variables and continue. */
885  if (kcb->kprobe_status == KPROBE_REENTER) {
886  restore_previous_kprobe(kcb);
887  goto out;
888  }
889  reset_current_kprobe();
890 out:
891  preempt_enable_no_resched();
892 
893  /*
894  * If somebody else is single-stepping across a probe point, flags
895  * will have TF set, in which case we continue the remaining processing
896  * of do_debug() as if this were not a probe hit.
897  */
898  if (regs->flags & X86_EFLAGS_TF)
899  return 0;
900 
901  return 1;
902 }
903 
904 int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
905 {
906  struct kprobe *cur = kprobe_running();
907  struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
908 
909  switch (kcb->kprobe_status) {
910  case KPROBE_HIT_SS:
911  case KPROBE_REENTER:
912  /*
913  * We are here because the instruction being single-
914  * stepped caused a page fault. We reset the current
915  * kprobe, point the ip back to the probe address,
916  * and allow the page fault handler to continue as for
917  * a normal page fault.
918  */
919  regs->ip = (unsigned long)cur->addr;
920  regs->flags |= kcb->kprobe_old_flags;
921  if (kcb->kprobe_status == KPROBE_REENTER)
922  restore_previous_kprobe(kcb);
923  else
924  reset_current_kprobe();
925  preempt_enable_no_resched();
926  break;
927  case KPROBE_HIT_ACTIVE:
928  case KPROBE_HIT_SSDONE:
929  /*
930  * We increment the nmissed count for accounting;
931  * the npre/npostfault counts can also be used to account
932  * for these specific fault cases.
933  */
934  kprobes_inc_nmissed_count(cur);
935 
936  /*
937  * We come here because an instruction in the pre/post
938  * handler caused the page fault. This can happen
939  * if the handler tries to access user space via
940  * copy_from_user(), get_user(), etc. Let the
941  * user-specified handler try to fix it first.
942  */
943  if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
944  return 1;
945 
946  /*
947  * In case the user-specified fault handler returned
948  * zero, try to fix up.
949  */
950  if (fixup_exception(regs))
951  return 1;
952 
953  /*
954  * The fixup routine could not handle it;
955  * let do_page_fault() fix it.
956  */
957  break;
958  default:
959  break;
960  }
961  return 0;
962 }
963 
964 /*
965  * Wrapper routine for handling exceptions.
966  */
967 int __kprobes
968 kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)
969 {
970  struct die_args *args = data;
971  int ret = NOTIFY_DONE;
972 
973  if (args->regs && user_mode_vm(args->regs))
974  return ret;
975 
976  switch (val) {
977  case DIE_INT3:
978  if (kprobe_handler(args->regs))
979  ret = NOTIFY_STOP;
980  break;
981  case DIE_DEBUG:
982  if (post_kprobe_handler(args->regs)) {
983  /*
984  * Reset the BS bit in dr6 (pointed by args->err) to
985  * denote completion of processing
986  */
987  (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
988  ret = NOTIFY_STOP;
989  }
990  break;
991  case DIE_GPF:
992  /*
993  * To be potentially processing a kprobe fault and to
994  * trust the result from kprobe_running(), we have
995  * be non-preemptible.
996  */
997  if (!preemptible() && kprobe_running() &&
998  kprobe_fault_handler(args->regs, args->trapnr))
999  ret = NOTIFY_STOP;
1000  break;
1001  default:
1002  break;
1003  }
1004  return ret;
1005 }
1006 
1007 int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
1008 {
1009  struct jprobe *jp = container_of(p, struct jprobe, kp);
1010  unsigned long addr;
1011  struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1012 
1013  kcb->jprobe_saved_regs = *regs;
1014  kcb->jprobe_saved_sp = stack_addr(regs);
1015  addr = (unsigned long)(kcb->jprobe_saved_sp);
1016 
1017  /*
1018  * As Linus pointed out, gcc assumes that the callee
1019  * owns the argument space and could overwrite it, e.g.
1020  * tailcall optimization. So, to be absolutely safe
1021  * we also save and restore enough stack bytes to cover
1022  * the argument area.
1023  */
1024  memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
1025  MIN_STACK_SIZE(addr));
1026  regs->flags &= ~X86_EFLAGS_IF;
1027  trace_hardirqs_off();
1028  regs->ip = (unsigned long)(jp->entry);
1029  return 1;
1030 }
1031 
1032 void __kprobes jprobe_return(void)
1033 {
1034  struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1035 
1036  asm volatile (
1037 #ifdef CONFIG_X86_64
1038  " xchg %%rbx,%%rsp \n"
1039 #else
1040  " xchgl %%ebx,%%esp \n"
1041 #endif
1042  " int3 \n"
1043  " .globl jprobe_return_end\n"
1044  " jprobe_return_end: \n"
1045  " nop \n"::"b"
1046  (kcb->jprobe_saved_sp):"memory");
1047 }
1048 
1049 int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1050 {
1051  struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1052  u8 *addr = (u8 *) (regs->ip - 1);
1053  struct jprobe *jp = container_of(p, struct jprobe, kp);
1054 
1055  if ((addr > (u8 *) jprobe_return) &&
1056  (addr < (u8 *) jprobe_return_end)) {
1057  if (stack_addr(regs) != kcb->jprobe_saved_sp) {
1058  struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
1060  "current sp %p does not match saved sp %p\n",
1061  stack_addr(regs), kcb->jprobe_saved_sp);
1062  printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
1063  show_regs(saved_regs);
1064  printk(KERN_ERR "Current registers\n");
1065  show_regs(regs);
1066  BUG();
1067  }
1068  *regs = kcb->jprobe_saved_regs;
1069  memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
1070  kcb->jprobes_stack,
1071  MIN_STACK_SIZE(kcb->jprobe_saved_sp));
1072  preempt_enable_no_resched();
1073  return 1;
1074  }
1075  return 0;
1076 }
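
The three functions above (setjmp_pre_handler, jprobe_return, longjmp_break_handler) implement the jprobe mechanism: the probed function's entry is diverted to a handler with the same signature, which must end with jprobe_return(). A minimal sketch of a module using it follows (not part of this file; the target symbol "do_fork" and its 3.7-era signature, plus the handler name, are illustrative assumptions):

#include <linux/module.h>
#include <linux/kprobes.h>

/* Entry handler with the same signature as the probed function. */
static long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
		     struct pt_regs *regs, unsigned long stack_size,
		     int __user *parent_tidptr, int __user *child_tidptr)
{
	pr_info("jprobe: clone_flags=0x%lx\n", clone_flags);
	jprobe_return();	/* mandatory: hands control back via int3 */
	return 0;		/* never reached */
}

static struct jprobe my_jprobe = {
	.entry		= (void *)jdo_fork,
	.kp.symbol_name	= "do_fork",	/* illustrative target */
};

static int __init jprobe_sketch_init(void)
{
	return register_jprobe(&my_jprobe);
}

static void __exit jprobe_sketch_exit(void)
{
	unregister_jprobe(&my_jprobe);
}

module_init(jprobe_sketch_init);
module_exit(jprobe_sketch_exit);
MODULE_LICENSE("GPL");
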
1077 
1078 #ifdef KPROBES_CAN_USE_FTRACE
1079 /* Ftrace callback handler for kprobes */
1080 void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
1081  struct ftrace_ops *ops, struct pt_regs *regs)
1082 {
1083  struct kprobe *p;
1084  struct kprobe_ctlblk *kcb;
1085  unsigned long flags;
1086 
1087  /* Disable irqs to emulate a breakpoint and avoid preemption */
1088  local_irq_save(flags);
1089 
1090  p = get_kprobe((kprobe_opcode_t *)ip);
1091  if (unlikely(!p) || kprobe_disabled(p))
1092  goto end;
1093 
1094  kcb = get_kprobe_ctlblk();
1095  if (kprobe_running()) {
1096  kprobes_inc_nmissed_count(p);
1097  } else {
1098  /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
1099  regs->ip = ip + sizeof(kprobe_opcode_t);
1100 
1101  __this_cpu_write(current_kprobe, p);
1102  kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1103  if (!p->pre_handler || !p->pre_handler(p, regs))
1104  skip_singlestep(p, regs, kcb);
1105  /*
1106  * If pre_handler returns !0, it sets regs->ip and
1107  * resets current kprobe.
1108  */
1109  }
1110 end:
1111  local_irq_restore(flags);
1112 }
1113 
1114 int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
1115 {
1116  p->ainsn.insn = NULL;
1117  p->ainsn.boostable = -1;
1118  return 0;
1119 }
1120 #endif
1121 
1122 int __init arch_init_kprobes(void)
1123 {
1124  return arch_init_optprobes();
1125 }
1126 
1127 int __kprobes arch_trampoline_kprobe(struct kprobe *p)
1128 {
1129  return 0;
1130 }