Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
trace_syscalls.c
Go to the documentation of this file.
1 #include <trace/syscall.h>
3 #include <linux/slab.h>
4 #include <linux/kernel.h>
5 #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
6 #include <linux/ftrace.h>
7 #include <linux/perf_event.h>
8 #include <asm/syscall.h>
9 
10 #include "trace_output.h"
11 #include "trace.h"
12 
/* Held while the refcounts and enable bitmaps below are updated. */
13 static DEFINE_MUTEX(syscall_trace_lock);
/* Number of syscall events currently enabled for the enter / exit probes. */
14 static int sys_refcount_enter;
15 static int sys_refcount_exit;
/* One bit per syscall number, set while that syscall's event is enabled. */
16 static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
17 static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
18 
/* Forward declarations: these callbacks are referenced by the event-class
 * initializers before their definitions appear further down the file. */
19 static int syscall_enter_register(struct ftrace_event_call *event,
20  enum trace_reg type, void *data);
21 static int syscall_exit_register(struct ftrace_event_call *event,
22  enum trace_reg type, void *data);
23 
24 static int syscall_enter_define_fields(struct ftrace_event_call *call);
25 static int syscall_exit_define_fields(struct ftrace_event_call *call);
26 
27 static struct list_head *
28 syscall_get_enter_fields(struct ftrace_event_call *call)
29 {
30  struct syscall_metadata *entry = call->data;
31 
32  return &entry->enter_fields;
33 }
34 
/*
 * Four designated-initializer tables follow: two that only set a .trace
 * callback (enter / exit print functions) and two event-class tables for
 * the "syscalls" system (enter / exit).
 *
 * NOTE(review): the opening declaration line of each table is absent
 * from this listing (embedded numbers 35, 39, 43 and 51 are skipped), so
 * the object names and types cannot be confirmed here; only the last
 * table's name, event_class_syscall_exit, is visible through its own
 * LIST_HEAD_INIT() reference.
 */
36  .trace = print_syscall_enter,
37 };
38 
40  .trace = print_syscall_exit,
41 };
42 
/* Enter class: fields are looked up through a callback (.get_fields). */
44  .system = "syscalls",
45  .reg = syscall_enter_register,
46  .define_fields = syscall_enter_define_fields,
47  .get_fields = syscall_get_enter_fields,
48  .raw_init = init_syscall_trace,
49 };
50 
/* Exit class: uses a list head embedded in the class itself (.fields). */
52  .system = "syscalls",
53  .reg = syscall_exit_register,
54  .define_fields = syscall_exit_define_fields,
55  .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
56  .raw_init = init_syscall_trace,
57 };
58 
61 
/*
 * Table indexed by syscall number. It is NULL until the init code in
 * this file allocates it with kcalloc(); entries for syscalls without
 * matching metadata stay NULL.
 */
62 static struct syscall_metadata **syscalls_metadata;
63 
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
/*
 * arch_syscall_match_sym_name - compare two syscall symbol names
 * @sym:  symbol name, e.g. as resolved from a syscall handler address
 * @name: name to compare against
 *
 * Only the part after the three-character "sys" prefix is compared.
 * Archs that use syscall wrappers may have syscall symbol aliases
 * prefixed with "SyS" instead of "sys", which would otherwise lead to
 * an unwanted mismatch.
 *
 * Returns true when both names are at least three characters long and
 * identical past the prefix, false otherwise.
 */
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	int i;

	/*
	 * Never step over a terminating NUL: a name shorter than the
	 * prefix (for instance an empty string left by a failed symbol
	 * lookup) cannot be a syscall symbol, and reading past its end
	 * would compare whatever bytes happen to follow it.
	 */
	for (i = 0; i < 3; i++) {
		if (!sym[i] || !name[i])
			return false;
	}

	return !strcmp(sym + 3, name + 3);
}
#endif
76 
77 static __init struct syscall_metadata *
78 find_syscall_meta(unsigned long syscall)
79 {
80  struct syscall_metadata **start;
81  struct syscall_metadata **stop;
82  char str[KSYM_SYMBOL_LEN];
83 
84 
85  start = __start_syscalls_metadata;
86  stop = __stop_syscalls_metadata;
87  kallsyms_lookup(syscall, NULL, NULL, NULL, str);
88 
89  if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
90  return NULL;
91 
92  for ( ; start < stop; start++) {
93  if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
94  return *start;
95  }
96  return NULL;
97 }
98 
99 static struct syscall_metadata *syscall_nr_to_meta(int nr)
100 {
101  if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
102  return NULL;
103 
104  return syscalls_metadata[nr];
105 }
106 
/*
 * Output callback for a syscall-entry record: formats iter->ent into
 * iter->seq as "name(" followed by each argument as "argname: value"
 * (hex, comma separated), then ")" and a newline.
 *
 * NOTE(review): this listing has lost several lines of the function
 * (embedded numbers 108, 131, 135, 138, 145, 150 and 155 are skipped),
 * including the line carrying the function name and first parameters
 * and the statements that belong to each "if (!ret)" check. What those
 * statements do cannot be determined from here; confirm against the
 * original file before editing.
 */
107 enum print_line_t
109  struct trace_event *event)
110 {
111  struct trace_seq *s = &iter->seq;
112  struct trace_entry *ent = iter->ent;
113  struct syscall_trace_enter *trace;
114  struct syscall_metadata *entry;
115  int i, ret, syscall;
116 
117  trace = (typeof(trace))ent;
118  syscall = trace->nr;
119  entry = syscall_nr_to_meta(syscall);
120 
/* No metadata for this syscall number: only the trailing newline is emitted. */
121  if (!entry)
122  goto end;
123 
/* The record's type must be the one registered for this syscall's enter event. */
124  if (entry->enter_event->event.type != ent->type) {
125  WARN_ON_ONCE(1);
126  goto end;
127  }
128 
129  ret = trace_seq_printf(s, "%s(", entry->name);
130  if (!ret)
132 
133  for (i = 0; i < entry->nb_args; i++) {
134  /* parameter types */
136  ret = trace_seq_printf(s, "%s ", entry->types[i]);
137  if (!ret)
139  }
140  /* parameter values */
141  ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
142  trace->args[i],
143  i == entry->nb_args - 1 ? "" : ", ");
144  if (!ret)
146  }
147 
148  ret = trace_seq_putc(s, ')');
149  if (!ret)
151 
152 end:
153  ret = trace_seq_putc(s, '\n');
154  if (!ret)
156 
157  return TRACE_TYPE_HANDLED;
158 }
159 
/*
 * Output callback for a syscall-exit record: formats iter->ent into
 * iter->seq as "name -> 0x<ret>" followed by a newline.
 *
 * NOTE(review): this listing has lost lines of the function (embedded
 * numbers 161 and 188 are skipped): the line carrying the function name
 * and first parameters, and the statement belonging to the final
 * "if (!ret)" check. Confirm against the original file before editing.
 */
160 enum print_line_t
162  struct trace_event *event)
163 {
164  struct trace_seq *s = &iter->seq;
165  struct trace_entry *ent = iter->ent;
166  struct syscall_trace_exit *trace;
167  int syscall;
168  struct syscall_metadata *entry;
169  int ret;
170 
171  trace = (typeof(trace))ent;
172  syscall = trace->nr;
173  entry = syscall_nr_to_meta(syscall);
174 
/* No metadata for this syscall number: emit an empty line and report it handled. */
175  if (!entry) {
176  trace_seq_printf(s, "\n");
177  return TRACE_TYPE_HANDLED;
178  }
179 
/* The record's type must be the one registered for this syscall's exit event. */
180  if (entry->exit_event->event.type != ent->type) {
181  WARN_ON_ONCE(1);
182  return TRACE_TYPE_UNHANDLED;
183  }
184 
185  ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
186  trace->ret);
187  if (!ret)
189 
190  return TRACE_TYPE_HANDLED;
191 }
192 
/*
 * Declared but not defined in this file; SYSCALL_FIELD() only refers to
 * it on the branch taken when the sizes disagree.
 * NOTE(review): presumably left undefined on purpose so that a size
 * mismatch shows up as a link error - confirm.
 */
193 extern char *__bad_type_size(void);
194 
/*
 * SYSCALL_FIELD(type, name) expands to the comma-separated argument run
 * "type string, name string, offset, size, signedness" that the callers
 * in this file pass to trace_define_field(). It requires a local variable
 * called "trace" to be in scope, since the offset and size are taken from
 * trace.name. The type string is replaced by a __bad_type_size() call
 * when sizeof(type) differs from sizeof(trace.name).
 */
195 #define SYSCALL_FIELD(type, name) \
196  sizeof(type) != sizeof(trace.name) ? \
197  __bad_type_size() : \
198  #type, #name, offsetof(typeof(trace), name), \
199  sizeof(trace.name), is_signed_type(type)
200 
201 static
202 int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
203 {
204  int i;
205  int pos = 0;
206 
207  /* When len=0, we just calculate the needed length */
208 #define LEN_OR_ZERO (len ? len - pos : 0)
209 
210  pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
211  for (i = 0; i < entry->nb_args; i++) {
212  pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
213  entry->args[i], sizeof(unsigned long),
214  i == entry->nb_args - 1 ? "" : ", ");
215  }
216  pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
217 
218  for (i = 0; i < entry->nb_args; i++) {
219  pos += snprintf(buf + pos, LEN_OR_ZERO,
220  ", ((unsigned long)(REC->%s))", entry->args[i]);
221  }
222 
223 #undef LEN_OR_ZERO
224 
225  /* return the length of print_fmt */
226  return pos;
227 }
228 
229 static int set_syscall_print_fmt(struct ftrace_event_call *call)
230 {
231  char *print_fmt;
232  int len;
233  struct syscall_metadata *entry = call->data;
234 
235  if (entry->enter_event != call) {
236  call->print_fmt = "\"0x%lx\", REC->ret";
237  return 0;
238  }
239 
240  /* First: called with 0 length to calculate the needed length */
241  len = __set_enter_print_fmt(entry, NULL, 0);
242 
243  print_fmt = kmalloc(len + 1, GFP_KERNEL);
244  if (!print_fmt)
245  return -ENOMEM;
246 
247  /* Second: actually write the @print_fmt */
248  __set_enter_print_fmt(entry, print_fmt, len + 1);
249  call->print_fmt = print_fmt;
250 
251  return 0;
252 }
253 
254 static void free_syscall_print_fmt(struct ftrace_event_call *call)
255 {
256  struct syscall_metadata *entry = call->data;
257 
258  if (entry->enter_event == call)
259  kfree(call->print_fmt);
260 }
261 
262 static int syscall_enter_define_fields(struct ftrace_event_call *call)
263 {
264  struct syscall_trace_enter trace;
265  struct syscall_metadata *meta = call->data;
266  int ret;
267  int i;
268  int offset = offsetof(typeof(trace), args);
269 
270  ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
271  if (ret)
272  return ret;
273 
274  for (i = 0; i < meta->nb_args; i++) {
275  ret = trace_define_field(call, meta->types[i],
276  meta->args[i], offset,
277  sizeof(unsigned long), 0,
278  FILTER_OTHER);
279  offset += sizeof(unsigned long);
280  }
281 
282  return ret;
283 }
284 
285 static int syscall_exit_define_fields(struct ftrace_event_call *call)
286 {
287  struct syscall_trace_exit trace;
288  int ret;
289 
290  ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
291  if (ret)
292  return ret;
293 
294  ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
295  FILTER_OTHER);
296 
297  return ret;
298 }
299 
300 void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
301 {
302  struct syscall_trace_enter *entry;
303  struct syscall_metadata *sys_data;
304  struct ring_buffer_event *event;
305  struct ring_buffer *buffer;
306  int size;
307  int syscall_nr;
308 
309  syscall_nr = syscall_get_nr(current, regs);
310  if (syscall_nr < 0)
311  return;
312  if (!test_bit(syscall_nr, enabled_enter_syscalls))
313  return;
314 
315  sys_data = syscall_nr_to_meta(syscall_nr);
316  if (!sys_data)
317  return;
318 
319  size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
320 
321  event = trace_current_buffer_lock_reserve(&buffer,
322  sys_data->enter_event->event.type, size, 0, 0);
323  if (!event)
324  return;
325 
326  entry = ring_buffer_event_data(event);
327  entry->nr = syscall_nr;
328  syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
329 
330  if (!filter_current_check_discard(buffer, sys_data->enter_event,
331  entry, event))
332  trace_current_buffer_unlock_commit(buffer, event, 0, 0);
333 }
334 
335 void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
336 {
337  struct syscall_trace_exit *entry;
338  struct syscall_metadata *sys_data;
339  struct ring_buffer_event *event;
340  struct ring_buffer *buffer;
341  int syscall_nr;
342 
343  syscall_nr = syscall_get_nr(current, regs);
344  if (syscall_nr < 0)
345  return;
346  if (!test_bit(syscall_nr, enabled_exit_syscalls))
347  return;
348 
349  sys_data = syscall_nr_to_meta(syscall_nr);
350  if (!sys_data)
351  return;
352 
353  event = trace_current_buffer_lock_reserve(&buffer,
354  sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
355  if (!event)
356  return;
357 
358  entry = ring_buffer_event_data(event);
359  entry->nr = syscall_nr;
360  entry->ret = syscall_get_return_value(current, regs);
361 
362  if (!filter_current_check_discard(buffer, sys_data->exit_event,
363  entry, event))
364  trace_current_buffer_unlock_commit(buffer, event, 0, 0);
365 }
366 
/*
 * Enables the ftrace enter event for one syscall: registers the
 * ftrace_syscall_enter probe when no enter event is enabled yet, then
 * sets the syscall's bit and bumps the refcount. Returns 0, -ENOSYS for
 * an out-of-range syscall number, or the registration error.
 *
 * NOTE(review): the line carrying this function's name and parameters
 * is missing from the listing (embedded number 367 is skipped). It is
 * presumably reg_event_syscall_enter(), which the register callback
 * further down calls - confirm against the original file.
 */
368 {
369  int ret = 0;
370  int num;
371 
372  num = ((struct syscall_metadata *)call->data)->syscall_nr;
373  if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
374  return -ENOSYS;
375  mutex_lock(&syscall_trace_lock);
/* The probe is shared by all syscalls: register it only for the first user. */
376  if (!sys_refcount_enter)
377  ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
378  if (!ret) {
379  set_bit(num, enabled_enter_syscalls);
380  sys_refcount_enter++;
381  }
382  mutex_unlock(&syscall_trace_lock);
383  return ret;
384 }
385 
/*
 * Disables the ftrace enter event for one syscall: drops the refcount,
 * clears the syscall's bit and unregisters the ftrace_syscall_enter
 * probe once the refcount reaches zero. Does nothing (beyond a one-time
 * warning) for an out-of-range syscall number.
 *
 * NOTE(review): the line carrying this function's name and parameters
 * is missing from the listing (embedded number 386 is skipped), so the
 * name cannot be confirmed from here.
 */
387 {
388  int num;
389 
390  num = ((struct syscall_metadata *)call->data)->syscall_nr;
391  if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
392  return;
393  mutex_lock(&syscall_trace_lock);
394  sys_refcount_enter--;
395  clear_bit(num, enabled_enter_syscalls);
/* Last user gone: remove the shared probe. */
396  if (!sys_refcount_enter)
397  unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
398  mutex_unlock(&syscall_trace_lock);
399 }
400 
/*
 * Enables the ftrace exit event for one syscall: registers the
 * ftrace_syscall_exit probe when no exit event is enabled yet, then
 * sets the syscall's bit and bumps the refcount. Returns 0, -ENOSYS for
 * an out-of-range syscall number, or the registration error.
 *
 * NOTE(review): the line carrying this function's name and parameters
 * is missing from the listing (embedded number 401 is skipped). It is
 * presumably reg_event_syscall_exit(), which the register callback
 * further down calls - confirm against the original file.
 */
402 {
403  int ret = 0;
404  int num;
405 
406  num = ((struct syscall_metadata *)call->data)->syscall_nr;
407  if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
408  return -ENOSYS;
409  mutex_lock(&syscall_trace_lock);
/* The probe is shared by all syscalls: register it only for the first user. */
410  if (!sys_refcount_exit)
411  ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
412  if (!ret) {
413  set_bit(num, enabled_exit_syscalls);
414  sys_refcount_exit++;
415  }
416  mutex_unlock(&syscall_trace_lock);
417  return ret;
418 }
419 
/*
 * Disables the ftrace exit event for one syscall: drops the refcount,
 * clears the syscall's bit and unregisters the ftrace_syscall_exit
 * probe once the refcount reaches zero. Does nothing (beyond a one-time
 * warning) for an out-of-range syscall number.
 *
 * NOTE(review): the line carrying this function's name and parameters
 * is missing from the listing (embedded number 420 is skipped), so the
 * name cannot be confirmed from here.
 */
421 {
422  int num;
423 
424  num = ((struct syscall_metadata *)call->data)->syscall_nr;
425  if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
426  return;
427  mutex_lock(&syscall_trace_lock);
428  sys_refcount_exit--;
429  clear_bit(num, enabled_exit_syscalls);
/* Last user gone: remove the shared probe. */
430  if (!sys_refcount_exit)
431  unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
432  mutex_unlock(&syscall_trace_lock);
433 }
434 
/*
 * Per-event initialisation: rejects events whose metadata carries an
 * out-of-range syscall number (-ENOSYS), installs the print format
 * (-ENOMEM on failure) and then calls trace_event_raw_init(), whose
 * result is returned. The print format is released again when
 * trace_event_raw_init() reports an error.
 *
 * NOTE(review): the line carrying this function's name and parameters
 * is missing from the listing (embedded number 435 is skipped). It is
 * presumably init_syscall_trace(), the .raw_init callback named in the
 * event-class tables - confirm against the original file.
 */
436 {
437  int id;
438  int num;
439 
440  num = ((struct syscall_metadata *)call->data)->syscall_nr;
441  if (num < 0 || num >= NR_syscalls) {
442  pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
443  ((struct syscall_metadata *)call->data)->name);
444  return -ENOSYS;
445  }
446 
447  if (set_syscall_print_fmt(call) < 0)
448  return -ENOMEM;
449 
450  id = trace_event_raw_init(call);
451 
/* Undo set_syscall_print_fmt() so the format string is not leaked. */
452  if (id < 0) {
453  free_syscall_print_fmt(call);
454  return id;
455  }
456 
457  return id;
458 }
459 
/*
 * Returns entry @nr of sys_call_table converted to unsigned long. No
 * bounds check is done here; the caller in this file only passes values
 * in [0, NR_syscalls). Marked __weak.
 * NOTE(review): __weak presumably lets an architecture supply its own
 * definition - confirm.
 */
460 unsigned long __init __weak arch_syscall_addr(int nr)
461 {
462  return (unsigned long)sys_call_table[nr];
463 }
464 
/*
 * Allocates the syscalls_metadata table (NR_syscalls zeroed entries)
 * and fills it: for every syscall number the handler address is looked
 * up, matched to its metadata, and the metadata is both stamped with
 * the number and stored in the table. Numbers without metadata keep a
 * NULL entry. Returns 0, or -ENOMEM when the allocation fails.
 *
 * NOTE(review): the line carrying this function's name and parameters
 * is missing from the listing (embedded number 465 is skipped), as is
 * the line after the closing brace (490). Name and how the function
 * gets invoked cannot be confirmed from here.
 */
466 {
467  struct syscall_metadata *meta;
468  unsigned long addr;
469  int i;
470 
471  syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
472  GFP_KERNEL);
473  if (!syscalls_metadata) {
474  WARN_ON(1);
475  return -ENOMEM;
476  }
477 
478  for (i = 0; i < NR_syscalls; i++) {
479  addr = arch_syscall_addr(i);
480  meta = find_syscall_meta(addr);
481  if (!meta)
482  continue;
483 
484  meta->syscall_nr = i;
485  syscalls_metadata[i] = meta;
486  }
487 
488  return 0;
489 }
491 
492 #ifdef CONFIG_PERF_EVENTS
493 
/* Perf counterparts of the ftrace state: one bit per syscall number, set
 * while perf has that syscall's enter / exit event enabled. */
494 static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
495 static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
/* Number of syscall events perf currently has enabled per direction;
 * updated under syscall_trace_lock. */
496 static int sys_perf_refcount_enter;
497 static int sys_perf_refcount_exit;
498 
499 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
500 {
501  struct syscall_metadata *sys_data;
502  struct syscall_trace_enter *rec;
503  struct hlist_head *head;
504  int syscall_nr;
505  int rctx;
506  int size;
507 
508  syscall_nr = syscall_get_nr(current, regs);
509  if (syscall_nr < 0)
510  return;
511  if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
512  return;
513 
514  sys_data = syscall_nr_to_meta(syscall_nr);
515  if (!sys_data)
516  return;
517 
518  /* get the size after alignment with the u32 buffer size field */
519  size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
520  size = ALIGN(size + sizeof(u32), sizeof(u64));
521  size -= sizeof(u32);
522 
523  if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
524  "perf buffer not large enough"))
525  return;
526 
527  rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
528  sys_data->enter_event->event.type, regs, &rctx);
529  if (!rec)
530  return;
531 
532  rec->nr = syscall_nr;
533  syscall_get_arguments(current, regs, 0, sys_data->nb_args,
534  (unsigned long *)&rec->args);
535 
536  head = this_cpu_ptr(sys_data->enter_event->perf_events);
537  perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
538 }
539 
540 int perf_sysenter_enable(struct ftrace_event_call *call)
541 {
542  int ret = 0;
543  int num;
544 
545  num = ((struct syscall_metadata *)call->data)->syscall_nr;
546 
547  mutex_lock(&syscall_trace_lock);
548  if (!sys_perf_refcount_enter)
549  ret = register_trace_sys_enter(perf_syscall_enter, NULL);
550  if (ret) {
551  pr_info("event trace: Could not activate"
552  "syscall entry trace point");
553  } else {
554  set_bit(num, enabled_perf_enter_syscalls);
555  sys_perf_refcount_enter++;
556  }
557  mutex_unlock(&syscall_trace_lock);
558  return ret;
559 }
560 
561 void perf_sysenter_disable(struct ftrace_event_call *call)
562 {
563  int num;
564 
565  num = ((struct syscall_metadata *)call->data)->syscall_nr;
566 
567  mutex_lock(&syscall_trace_lock);
568  sys_perf_refcount_enter--;
569  clear_bit(num, enabled_perf_enter_syscalls);
570  if (!sys_perf_refcount_enter)
571  unregister_trace_sys_enter(perf_syscall_enter, NULL);
572  mutex_unlock(&syscall_trace_lock);
573 }
574 
575 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
576 {
577  struct syscall_metadata *sys_data;
578  struct syscall_trace_exit *rec;
579  struct hlist_head *head;
580  int syscall_nr;
581  int rctx;
582  int size;
583 
584  syscall_nr = syscall_get_nr(current, regs);
585  if (syscall_nr < 0)
586  return;
587  if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
588  return;
589 
590  sys_data = syscall_nr_to_meta(syscall_nr);
591  if (!sys_data)
592  return;
593 
594  /* We can probably do that at build time */
595  size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
596  size -= sizeof(u32);
597 
598  /*
599  * Impossible, but be paranoid with the future
600  * How to put this check outside runtime?
601  */
602  if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
603  "exit event has grown above perf buffer size"))
604  return;
605 
606  rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
607  sys_data->exit_event->event.type, regs, &rctx);
608  if (!rec)
609  return;
610 
611  rec->nr = syscall_nr;
612  rec->ret = syscall_get_return_value(current, regs);
613 
614  head = this_cpu_ptr(sys_data->exit_event->perf_events);
615  perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
616 }
617 
618 int perf_sysexit_enable(struct ftrace_event_call *call)
619 {
620  int ret = 0;
621  int num;
622 
623  num = ((struct syscall_metadata *)call->data)->syscall_nr;
624 
625  mutex_lock(&syscall_trace_lock);
626  if (!sys_perf_refcount_exit)
627  ret = register_trace_sys_exit(perf_syscall_exit, NULL);
628  if (ret) {
629  pr_info("event trace: Could not activate"
630  "syscall exit trace point");
631  } else {
632  set_bit(num, enabled_perf_exit_syscalls);
633  sys_perf_refcount_exit++;
634  }
635  mutex_unlock(&syscall_trace_lock);
636  return ret;
637 }
638 
639 void perf_sysexit_disable(struct ftrace_event_call *call)
640 {
641  int num;
642 
643  num = ((struct syscall_metadata *)call->data)->syscall_nr;
644 
645  mutex_lock(&syscall_trace_lock);
646  sys_perf_refcount_exit--;
647  clear_bit(num, enabled_perf_exit_syscalls);
648  if (!sys_perf_refcount_exit)
649  unregister_trace_sys_exit(perf_syscall_exit, NULL);
650  mutex_unlock(&syscall_trace_lock);
651 }
652 
653 #endif /* CONFIG_PERF_EVENTS */
654 
/*
 * .reg callback of the syscall-enter event class: dispatches on @type
 * to the ftrace or perf enable/disable helpers for @event. @data is not
 * used. Returns the helper's result for the register requests and 0
 * for everything else, including request types not listed.
 *
 * NOTE(review): two lines are missing from this listing between the
 * TRACE_REG_REGISTER case and the first "return 0;" (embedded numbers
 * 661 and 662 are skipped). As shown, that "return 0;" is unreachable;
 * presumably the lost lines hold the unregister case - confirm against
 * the original file.
 */
655 static int syscall_enter_register(struct ftrace_event_call *event,
656  enum trace_reg type, void *data)
657 {
658  switch (type) {
659  case TRACE_REG_REGISTER:
660  return reg_event_syscall_enter(event);
663  return 0;
664 
665 #ifdef CONFIG_PERF_EVENTS
666  case TRACE_REG_PERF_REGISTER:
667  return perf_sysenter_enable(event);
668  case TRACE_REG_PERF_UNREGISTER:
669  perf_sysenter_disable(event);
670  return 0;
/* Nothing to do for these perf requests. */
671  case TRACE_REG_PERF_OPEN:
672  case TRACE_REG_PERF_CLOSE:
673  case TRACE_REG_PERF_ADD:
674  case TRACE_REG_PERF_DEL:
675  return 0;
676 #endif
677  }
678  return 0;
679 }
680 
/*
 * .reg callback of the syscall-exit event class: dispatches on @type to
 * the ftrace or perf enable/disable helpers for @event. @data is not
 * used. Returns the helper's result for the register requests and 0
 * for everything else, including request types not listed.
 *
 * NOTE(review): two lines are missing from this listing between the
 * TRACE_REG_REGISTER case and the first "return 0;" (embedded numbers
 * 687 and 688 are skipped). As shown, that "return 0;" is unreachable;
 * presumably the lost lines hold the unregister case - confirm against
 * the original file.
 */
681 static int syscall_exit_register(struct ftrace_event_call *event,
682  enum trace_reg type, void *data)
683 {
684  switch (type) {
685  case TRACE_REG_REGISTER:
686  return reg_event_syscall_exit(event);
689  return 0;
690 
691 #ifdef CONFIG_PERF_EVENTS
692  case TRACE_REG_PERF_REGISTER:
693  return perf_sysexit_enable(event);
694  case TRACE_REG_PERF_UNREGISTER:
695  perf_sysexit_disable(event);
696  return 0;
/* Nothing to do for these perf requests. */
697  case TRACE_REG_PERF_OPEN:
698  case TRACE_REG_PERF_CLOSE:
699  case TRACE_REG_PERF_ADD:
700  case TRACE_REG_PERF_DEL:
701  return 0;
702 #endif
703  }
704  return 0;
705 }