Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
uprobes.c
Go to the documentation of this file.
1 /*
2  * User-space Probes (UProbes) for x86
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17  *
18  * Copyright (C) IBM Corporation, 2008-2011
19  * Authors:
20  * Srikar Dronamraju
21  * Jim Keniston
22  */
23 #include <linux/kernel.h>
24 #include <linux/sched.h>
25 #include <linux/ptrace.h>
26 #include <linux/uprobes.h>
27 #include <linux/uaccess.h>
28 
29 #include <linux/kdebug.h>
30 #include <asm/processor.h>
31 #include <asm/insn.h>
32 
33 /* Post-execution fixups. */
34
35 /* No fixup needed */
36 #define UPROBE_FIX_NONE 0x0
37
38 /* Adjust IP back to vicinity of actual insn */
39 #define UPROBE_FIX_IP 0x1
40
41 /* Adjust the return address of a call insn */
42 #define UPROBE_FIX_CALL 0x2
43
44 /* Instruction will modify TF, don't change it */
45 #define UPROBE_FIX_SETF 0x4
46
/*
 * The insn used rip-relative addressing and was rewritten to go through a
 * scratch register instead: %rax (RIP_AX) or %rcx (RIP_CX). The scratch
 * register is saved and loaded before the out-of-line step and restored
 * afterwards.
 */
47 #define UPROBE_FIX_RIP_AX 0x8000
48 #define UPROBE_FIX_RIP_CX 0x4000
49
/*
 * Sentinel stored in thread.trap_nr while an insn executes out of line;
 * any other value afterwards means the insn itself trapped.
 */
50 #define UPROBE_TRAP_NR UINT_MAX
51 
/*
 * Adaptations for mhiramat x86 decoder v14: shorthand accessors for a
 * decoded instruction. Each macro takes a pointer to the instruction; the
 * argument is parenthesized so that any pointer expression may be passed.
 */
#define OPCODE1(insn)	((insn)->opcode.bytes[0])
#define OPCODE2(insn)	((insn)->opcode.bytes[1])
#define OPCODE3(insn)	((insn)->opcode.bytes[2])
#define MODRM_REG(insn)	X86_MODRM_REG((insn)->modrm.value)
57 
/*
 * W() packs sixteen 0/1 flags (b0 is the least significant bit) into one
 * 16-bit row and shifts that row by (row % 32): rows 0x00, 0x20, ... stay in
 * bits 0-15, rows 0x10, 0x30, ... move to bits 16-31. The flags must be the
 * literal tokens 0 or 1 because they are pasted onto a UL suffix.
 */
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
	 << ((row) % 32))
64 
65 /*
66  * Good-instruction tables for 32-bit apps. This is non-const and volatile
67  * to keep gcc from statically optimizing it out, as variable_test_bit makes
68  * some versions of gcc think only *(unsigned long*) is used.
 *
 * Layout: bit N of the table is set when the one-byte opcode N is accepted
 * by validate_insn_32bits(). Each W() row supplies 16 bits (the column is
 * the low nibble of the opcode); two consecutive rows are OR-ed into one
 * u32 element, the second row occupying the upper 16 bits.
69  */
70 static volatile u32 good_insns_32[256 / 32] = {
71  /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
72  /* ---------------------------------------------- */
73  W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
74  W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
75  W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
76  W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
77  W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
78  W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
79  W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
80  W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
81  W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
82  W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
83  W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
84  W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
85  W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
86  W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
87  W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
88  W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
89  /* ---------------------------------------------- */
90  /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
91 };
92 
93 /* Using this for both 64-bit and 32-bit apps */
/*
 * Indexed by the second opcode byte (OPCODE2) of a two-byte instruction:
 * bit N is set when that second byte is accepted. Same row/column layout as
 * the one-byte tables: two 16-bit W() rows per u32 element.
 */
94 static volatile u32 good_2byte_insns[256 / 32] = {
95  /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
96  /* ---------------------------------------------- */
97  W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
98  W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
99  W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
100  W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
101  W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
102  W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
103  W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
104  W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
105  W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
106  W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
107  W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
108  W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
109  W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
110  W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
111  W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
112  W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
113  /* ---------------------------------------------- */
114  /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
115 };
116 
117 #ifdef CONFIG_X86_64
118 /* Good-instruction tables for 64-bit apps */
/*
 * Bit N is set when the one-byte opcode N is accepted by
 * validate_insn_64bits(). Two 16-bit W() rows are OR-ed into each u32
 * element, the second row occupying the upper 16 bits.
 */
119 static volatile u32 good_insns_64[256 / 32] = {
120  /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
121  /* ---------------------------------------------- */
122  W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
123  W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
124  W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
125  W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
126  W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
127  W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
128  W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
129  W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
130  W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
131  W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
132  W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
133  W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
134  W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
135  W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
136  W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
137  W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
138  /* ---------------------------------------------- */
139  /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
140 };
141 #endif
142 #undef W
143 
144 /*
145  * opcodes we'll probably never support:
146  *
147  * 6c-6d, e4-e5, ec-ed - in
148  * 6e-6f, e6-e7, ee-ef - out
149  * cc, cd - int3, int
150  * cf - iret
151  * d6 - illegal instruction
152  * f1 - int1/icebp
153  * f4 - hlt
154  * fa, fb - cli, sti
155  * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
156  *
157  * invalid opcodes in 64-bit mode:
158  *
159  * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
160  * 63 - we support this opcode in x86_64 but not in i386.
161  *
162  * opcodes we may need to refine support for:
163  *
164  * 0f - 2-byte instructions: For many of these instructions, the validity
165  * depends on the prefix and/or the reg field. On such instructions, we
166  * just consider the opcode combination valid if it corresponds to any
167  * valid instruction.
168  *
169  * 8f - Group 1 - only reg = 0 is OK
170  * c6-c7 - Group 11 - only reg = 0 is OK
171  * d9-df - fpu insns with some illegal encodings
172  * f2, f3 - repnz, repz prefixes. These are also the first byte for
173  * certain floating-point instructions, such as addsd.
174  *
175  * fe - Group 4 - only reg = 0 or 1 is OK
176  * ff - Group 5 - only reg = 0-6 is OK
177  *
178  * others -- Do we need to support these?
179  *
180  * 0f - (floating-point?) prefetch instructions
181  * 07, 17, 1f - pop es, pop ss, pop ds
182  * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
183  * but 64 and 65 (fs: and gs:) seem to be used, so we support them
184  * 67 - addr16 prefix
185  * ce - into
186  * f0 - lock prefix
187  */
188 
189 /*
190  * TODO:
191  * - Where necessary, examine the modrm byte and allow only valid instructions
192  * in the different Groups and fpu instructions.
193  */
194 
195 static bool is_prefix_bad(struct insn *insn)
196 {
197  int i;
198 
199  for (i = 0; i < insn->prefixes.nbytes; i++) {
200  switch (insn->prefixes.bytes[i]) {
201  case 0x26: /* INAT_PFX_ES */
202  case 0x2E: /* INAT_PFX_CS */
203  case 0x36: /* INAT_PFX_DS */
204  case 0x3E: /* INAT_PFX_SS */
205  case 0xF0: /* INAT_PFX_LOCK */
206  return true;
207  }
208  }
209  return false;
210 }
211 
212 static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
213 {
214  insn_init(insn, auprobe->insn, false);
215 
216  /* Skip good instruction prefixes; reject "bad" ones. */
217  insn_get_opcode(insn);
218  if (is_prefix_bad(insn))
219  return -ENOTSUPP;
220 
221  if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
222  return 0;
223 
224  if (insn->opcode.nbytes == 2) {
225  if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
226  return 0;
227  }
228 
229  return -ENOTSUPP;
230 }
231 
232 /*
233  * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
234  * annotate arch_uprobe->fixups accordingly. To start with,
235  * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
236  */
237 static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
238 {
239  bool fix_ip = true, fix_call = false; /* defaults */
240  int reg;
241 
242  insn_get_opcode(insn); /* should be a nop */
243 
244  switch (OPCODE1(insn)) {
245  case 0x9d:
246  /* popf */
247  auprobe->fixups |= UPROBE_FIX_SETF;
248  break;
249  case 0xc3: /* ret/lret */
250  case 0xcb:
251  case 0xc2:
252  case 0xca:
253  /* ip is correct */
254  fix_ip = false;
255  break;
256  case 0xe8: /* call relative - Fix return addr */
257  fix_call = true;
258  break;
259  case 0x9a: /* call absolute - Fix return addr, not ip */
260  fix_call = true;
261  fix_ip = false;
262  break;
263  case 0xff:
264  insn_get_modrm(insn);
265  reg = MODRM_REG(insn);
266  if (reg == 2 || reg == 3) {
267  /* call or lcall, indirect */
268  /* Fix return addr; ip is correct. */
269  fix_call = true;
270  fix_ip = false;
271  } else if (reg == 4 || reg == 5) {
272  /* jmp or ljmp, indirect */
273  /* ip is correct. */
274  fix_ip = false;
275  }
276  break;
277  case 0xea: /* jmp absolute -- ip is correct */
278  fix_ip = false;
279  break;
280  default:
281  break;
282  }
283  if (fix_ip)
284  auprobe->fixups |= UPROBE_FIX_IP;
285  if (fix_call)
286  auprobe->fixups |= UPROBE_FIX_CALL;
287 }
288 
289 #ifdef CONFIG_X86_64
290 /*
291  * If arch_uprobe->insn doesn't use rip-relative addressing, return
292  * immediately. Otherwise, rewrite the instruction so that it accesses
293  * its memory operand indirectly through a scratch register. Set
294  * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address
295  * accordingly. (The contents of the scratch register will be saved
296  * before we single-step the modified instruction, and restored
297  * afterward.)
298  *
299  * We do this because a rip-relative instruction can access only a
300  * relatively small area (+/- 2 GB from the instruction), and the XOL
301  * area typically lies beyond that area. At least for instructions
302  * that store to memory, we can't execute the original instruction
303  * and "fix things up" later, because the misdirected store could be
304  * disastrous.
305  *
306  * Some useful facts about rip-relative instructions:
307  *
308  * - There's always a modrm byte.
309  * - There's never a SIB byte.
310  * - The displacement is always 4 bytes.
311  */
312 static void
313 handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
314 {
315  u8 *cursor;
316  u8 reg;
317 
318  if (mm->context.ia32_compat)
319  return;
320 
321  auprobe->rip_rela_target_address = 0x0;
322  if (!insn_rip_relative(insn))
323  return;
324 
325  /*
326  * insn_rip_relative() would have decoded rex_prefix, modrm.
327  * Clear REX.b bit (extension of MODRM.rm field):
328  * we want to encode rax/rcx, not r8/r9.
329  */
330  if (insn->rex_prefix.nbytes) {
331  cursor = auprobe->insn + insn_offset_rex_prefix(insn);
332  *cursor &= 0xfe; /* Clearing REX.B bit */
333  }
334 
335  /*
336  * Point cursor at the modrm byte. The next 4 bytes are the
337  * displacement. Beyond the displacement, for some instructions,
338  * is the immediate operand.
339  */
340  cursor = auprobe->insn + insn_offset_modrm(insn);
341  insn_get_length(insn);
342 
343  /*
344  * Convert from rip-relative addressing to indirect addressing
345  * via a scratch register. Change the r/m field from 0x5 (%rip)
346  * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
347  */
348  reg = MODRM_REG(insn);
349  if (reg == 0) {
350  /*
351  * The register operand (if any) is either the A register
352  * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
353  * REX prefix) %r8. In any case, we know the C register
354  * is NOT the register operand, so we use %rcx (register
355  * #1) for the scratch register.
356  */
357  auprobe->fixups = UPROBE_FIX_RIP_CX;
358  /* Change modrm from 00 000 101 to 00 000 001. */
359  *cursor = 0x1;
360  } else {
361  /* Use %rax (register #0) for the scratch register. */
362  auprobe->fixups = UPROBE_FIX_RIP_AX;
363  /* Change modrm from 00 xxx 101 to 00 xxx 000 */
364  *cursor = (reg << 3);
365  }
366 
367  /* Target address = address of next instruction + (signed) offset */
368  auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value;
369 
370  /* Displacement field is gone; slide immediate field (if any) over. */
371  if (insn->immediate.nbytes) {
372  cursor++;
373  memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
374  }
375  return;
376 }
377 
378 static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
379 {
380  insn_init(insn, auprobe->insn, true);
381 
382  /* Skip good instruction prefixes; reject "bad" ones. */
383  insn_get_opcode(insn);
384  if (is_prefix_bad(insn))
385  return -ENOTSUPP;
386 
387  if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
388  return 0;
389 
390  if (insn->opcode.nbytes == 2) {
391  if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
392  return 0;
393  }
394  return -ENOTSUPP;
395 }
396 
397 static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
398 {
399  if (mm->context.ia32_compat)
400  return validate_insn_32bits(auprobe, insn);
401  return validate_insn_64bits(auprobe, insn);
402 }
403 #else /* 32-bit: */
/* 32-bit build: there is no RIP-relative addressing, so nothing to rewrite. */
static void
handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
{
}
408 
/* 32-bit build: every insn is validated as 32-bit code; @mm is not consulted. */
static int
validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
{
	return validate_insn_32bits(auprobe, insn);
}
413 #endif /* CONFIG_X86_64 */
414 
422 int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
423 {
424  int ret;
425  struct insn insn;
426 
427  auprobe->fixups = 0;
428  ret = validate_insn_bits(auprobe, mm, &insn);
429  if (ret != 0)
430  return ret;
431 
432  handle_riprel_insn(auprobe, mm, &insn);
433  prepare_fixups(auprobe, &insn);
434 
435  return 0;
436 }
437 
438 #ifdef CONFIG_X86_64
439 /*
440  * If we're emulating a rip-relative instruction, save the contents
441  * of the scratch register and store the target address in that register.
442  */
443 static void
444 pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
445  struct arch_uprobe_task *autask)
446 {
447  if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
448  autask->saved_scratch_register = regs->ax;
449  regs->ax = current->utask->vaddr;
450  regs->ax += auprobe->rip_rela_target_address;
451  } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
452  autask->saved_scratch_register = regs->cx;
453  regs->cx = current->utask->vaddr;
454  regs->cx += auprobe->rip_rela_target_address;
455  }
456 }
457 #else
/* 32-bit build: no RIP-relative addressing, so no scratch register to set up. */
static void
pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
		 struct arch_uprobe_task *autask)
{
}
464 #endif
465 
466 /*
467  * arch_uprobe_pre_xol - prepare to execute out of line.
468  * @auprobe: the probepoint information.
469  * @regs: reflects the saved user state of current task.
470  */
471 int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
472 {
473  struct arch_uprobe_task *autask;
474 
475  autask = &current->utask->autask;
476  autask->saved_trap_nr = current->thread.trap_nr;
477  current->thread.trap_nr = UPROBE_TRAP_NR;
478  regs->ip = current->utask->xol_vaddr;
479  pre_xol_rip_insn(auprobe, regs, autask);
480 
481  return 0;
482 }
483 
484 /*
485  * This function is called by arch_uprobe_post_xol() to adjust the return
486  * address pushed by a call instruction executed out of line.
487  */
488 static int adjust_ret_addr(unsigned long sp, long correction)
489 {
490  int rasize, ncopied;
491  long ra = 0;
492 
493  if (is_ia32_task())
494  rasize = 4;
495  else
496  rasize = 8;
497 
498  ncopied = copy_from_user(&ra, (void __user *)sp, rasize);
499  if (unlikely(ncopied))
500  return -EFAULT;
501 
502  ra += correction;
503  ncopied = copy_to_user((void __user *)sp, &ra, rasize);
504  if (unlikely(ncopied))
505  return -EFAULT;
506 
507  return 0;
508 }
509 
510 #ifdef CONFIG_X86_64
511 static bool is_riprel_insn(struct arch_uprobe *auprobe)
512 {
513  return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0);
514 }
515 
516 static void
517 handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
518 {
519  if (is_riprel_insn(auprobe)) {
520  struct arch_uprobe_task *autask;
521 
522  autask = &current->utask->autask;
523  if (auprobe->fixups & UPROBE_FIX_RIP_AX)
524  regs->ax = autask->saved_scratch_register;
525  else
526  regs->cx = autask->saved_scratch_register;
527 
528  /*
529  * The original instruction includes a displacement, and so
530  * is 4 bytes longer than what we've just single-stepped.
531  * Fall through to handle stuff like "jmpq *...(%rip)" and
532  * "callq *...(%rip)".
533  */
534  if (correction)
535  *correction += 4;
536  }
537 }
538 #else
/* 32-bit build: no RIP-relative addressing, so nothing to restore. */
static void
handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
{
}
544 #endif
545 
546 /*
547  * If xol insn itself traps and generates a signal(Say,
548  * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped
549  * instruction jumps back to its own address. It is assumed that anything
550  * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
551  *
552  * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
553  * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
554  * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
555  */
557 {
558  if (t->thread.trap_nr != UPROBE_TRAP_NR)
559  return true;
560 
561  return false;
562 }
563 
564 /*
565  * Called after single-stepping. To avoid the SMP problems that can
566  * occur when we temporarily put back the original opcode to
567  * single-step, we single-stepped a copy of the instruction.
568  *
569  * This function prepares to resume execution after the single-step.
570  * We have to fix things up as follows:
571  *
572  * Typically, the new ip is relative to the copied instruction. We need
573  * to make it relative to the original instruction (FIX_IP). Exceptions
574  * are return instructions and absolute or indirect jump or call instructions.
575  *
576  * If the single-stepped instruction was a call, the return address that
577  * is atop the stack is the address following the copied instruction. We
578  * need to make it the address following the original instruction (FIX_CALL).
579  *
580  * If the original instruction was a rip-relative instruction such as
581  * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
582  * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
583  * We need to restore the contents of the scratch register and adjust
584  * the ip, keeping in mind that the instruction we executed is 4 bytes
585  * shorter than the original instruction (since we squeezed out the offset
586  * field). (FIX_RIP_AX or FIX_RIP_CX)
587  */
588 int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
589 {
590  struct uprobe_task *utask;
591  long correction;
592  int result = 0;
593 
594  WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
595 
596  utask = current->utask;
597  current->thread.trap_nr = utask->autask.saved_trap_nr;
598  correction = (long)(utask->vaddr - utask->xol_vaddr);
599  handle_riprel_post_xol(auprobe, regs, &correction);
600  if (auprobe->fixups & UPROBE_FIX_IP)
601  regs->ip += correction;
602 
603  if (auprobe->fixups & UPROBE_FIX_CALL)
604  result = adjust_ret_addr(regs->sp, correction);
605 
606  return result;
607 }
608 
609 /* callback routine for handling exceptions. */
610 int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
611 {
612  struct die_args *args = data;
613  struct pt_regs *regs = args->regs;
614  int ret = NOTIFY_DONE;
615 
616  /* We are only interested in userspace traps */
617  if (regs && !user_mode_vm(regs))
618  return NOTIFY_DONE;
619 
620  switch (val) {
621  case DIE_INT3:
622  if (uprobe_pre_sstep_notifier(regs))
623  ret = NOTIFY_STOP;
624 
625  break;
626 
627  case DIE_DEBUG:
628  if (uprobe_post_sstep_notifier(regs))
629  ret = NOTIFY_STOP;
630 
631  default:
632  break;
633  }
634 
635  return ret;
636 }
637 
638 /*
639  * This function gets called when XOL instruction either gets trapped or
640  * the thread has a fatal signal, so reset the instruction pointer to its
641  * probed address.
642  */
643 void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
644 {
645  struct uprobe_task *utask = current->utask;
646 
647  current->thread.trap_nr = utask->autask.saved_trap_nr;
648  handle_riprel_post_xol(auprobe, regs, NULL);
649  instruction_pointer_set(regs, utask->vaddr);
650 }
651 
652 /*
653  * Skip these instructions as per the currently known x86 ISA.
654  * rep=0x66*; nop=0x90
655  */
656 static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
657 {
658  int i;
659 
660  for (i = 0; i < MAX_UINSN_BYTES; i++) {
661  if (auprobe->insn[i] == 0x66)
662  continue;
663 
664  if (auprobe->insn[i] == 0x90)
665  return true;
666 
667  break;
668  }
669  return false;
670 }
671 
672 bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
673 {
674  bool ret = __skip_sstep(auprobe, regs);
675  if (ret && (regs->flags & X86_EFLAGS_TF))
676  send_sig(SIGTRAP, current, 0);
677  return ret;
678 }
679 
681 {
682  struct task_struct *task = current;
683  struct arch_uprobe_task *autask = &task->utask->autask;
684  struct pt_regs *regs = task_pt_regs(task);
685 
686  autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF);
687 
688  regs->flags |= X86_EFLAGS_TF;
689  if (test_tsk_thread_flag(task, TIF_BLOCKSTEP))
690  set_task_blockstep(task, false);
691 }
692 
694 {
695  struct task_struct *task = current;
696  struct arch_uprobe_task *autask = &task->utask->autask;
697  bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED);
698  struct pt_regs *regs = task_pt_regs(task);
699  /*
700  * The state of TIF_BLOCKSTEP was not saved so we can get an extra
701  * SIGTRAP if we do not clear TF. We need to examine the opcode to
702  * make it right.
703  */
704  if (unlikely(trapped)) {
705  if (!autask->saved_tf)
706  regs->flags &= ~X86_EFLAGS_TF;
707  } else {
708  if (autask->saved_tf)
709  send_sig(SIGTRAP, task, 0);
710  else if (!(auprobe->fixups & UPROBE_FIX_SETF))
711  regs->flags &= ~X86_EFLAGS_TF;
712  }
713 }