drivers/oprofile/buffer_sync.c (Linux 3.7.1)
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dcookies.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/oprofile.h>
#include <linux/sched.h>
#include <linux/gfp.h>

#include "oprofile_stats.h"
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"

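/* Tasks handed to us by the task-free notifier sit on dying_tasks; after they
 * have survived one full round of buffer syncs they move to dead_tasks, and
 * only after a second round are they actually freed
 * (see process_task_mortuary()).
 */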
static LIST_HEAD(dying_tasks);
static LIST_HEAD(dead_tasks);
static cpumask_var_t marked_cpus;
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);

/* Take ownership of the task struct and place it on the
 * list for processing. Only after two full buffer syncs
 * does the task eventually get freed, because by then
 * we are sure we will not reference it again.
 * Can be invoked from softirq via RCU callback due to
 * call_rcu() of the task struct, hence the _irqsave.
 */
static int
task_free_notify(struct notifier_block *self, unsigned long val, void *data)
{
        unsigned long flags;
        struct task_struct *task = data;
        spin_lock_irqsave(&task_mortuary, flags);
        list_add(&task->tasks, &dying_tasks);
        spin_unlock_irqrestore(&task_mortuary, flags);
        return NOTIFY_OK;
}


/* The task is on its way out. A sync of the buffer means we can catch
 * any remaining samples for this task.
 */
static int
task_exit_notify(struct notifier_block *self, unsigned long val, void *data)
{
        /* To avoid latency problems, we only process the current CPU,
         * hoping that most samples for the task are on this CPU
         */
        sync_buffer(raw_smp_processor_id());
        return 0;
}


/* The task is about to try a do_munmap(). We peek at what it's going to
 * do, and if it's an executable region, process the samples first, so
 * we don't lose any. This does not have to be exact, it's a QoI issue
 * only.
 */
static int
munmap_notify(struct notifier_block *self, unsigned long val, void *data)
{
        unsigned long addr = (unsigned long)data;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *mpnt;

        down_read(&mm->mmap_sem);

        mpnt = find_vma(mm, addr);
        if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
                up_read(&mm->mmap_sem);
                /* To avoid latency problems, we only process the current CPU,
                 * hoping that most samples for the task are on this CPU
                 */
                sync_buffer(raw_smp_processor_id());
                return 0;
        }

        up_read(&mm->mmap_sem);
        return 0;
}


/* We need to be told about new modules so we don't attribute to a previously
 * loaded module, or drop the samples on the floor.
 */
static int
module_load_notify(struct notifier_block *self, unsigned long val, void *data)
{
#ifdef CONFIG_MODULES
        if (val != MODULE_STATE_COMING)
                return 0;

        /* FIXME: should we process all CPU buffers ? */
        mutex_lock(&buffer_mutex);
        add_event_entry(ESCAPE_CODE);
        add_event_entry(MODULE_LOADED_CODE);
        mutex_unlock(&buffer_mutex);
#endif
        return 0;
}


static struct notifier_block task_free_nb = {
        .notifier_call = task_free_notify,
};

static struct notifier_block task_exit_nb = {
        .notifier_call = task_exit_notify,
};

static struct notifier_block munmap_nb = {
        .notifier_call = munmap_notify,
};

static struct notifier_block module_load_nb = {
        .notifier_call = module_load_notify,
};

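/* Run the mortuary twice so every task still queued on dying_tasks passes
 * through dead_tasks and gets freed. Callers have already unregistered the
 * task-handoff notifier, so no new tasks can arrive meanwhile.
 */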
static void free_all_tasks(void)
{
        /* make sure we don't leak task structs */
        process_task_mortuary();
        process_task_mortuary();
}

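/* Register the notifiers used while profiling is active (task free/exit,
 * munmap, module load) and start the per-CPU flush work. On failure, undo
 * whatever has been registered so far.
 */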
int sync_start(void)
{
        int err;

        if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL))
                return -ENOMEM;

        err = task_handoff_register(&task_free_nb);
        if (err)
                goto out1;
        err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
        if (err)
                goto out2;
        err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
        if (err)
                goto out3;
        err = register_module_notifier(&module_load_nb);
        if (err)
                goto out4;

        start_cpu_work();

out:
        return err;
out4:
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
out3:
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
out2:
        task_handoff_unregister(&task_free_nb);
        free_all_tasks();
out1:
        free_cpumask_var(marked_cpus);
        goto out;
}


void sync_stop(void)
{
        end_cpu_work();
        unregister_module_notifier(&module_load_nb);
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
        task_handoff_unregister(&task_free_nb);
        barrier();                      /* do all of the above first */

        flush_cpu_work();

        free_all_tasks();
        free_cpumask_var(marked_cpus);
}


/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer). */
static inline unsigned long fast_get_dcookie(struct path *path)
{
        unsigned long cookie;

        if (path->dentry->d_flags & DCACHE_COOKIE)
                return (unsigned long)path->dentry;
        get_dcookie(path, &cookie);
        return cookie;
}


/* Look up the dcookie for the task's mm->exe_file,
 * which corresponds loosely to "application name". This is
 * not strictly necessary but allows oprofile to associate
 * shared-library samples with particular applications
 */
static unsigned long get_exec_dcookie(struct mm_struct *mm)
{
        unsigned long cookie = NO_COOKIE;

        if (mm && mm->exe_file)
                cookie = fast_get_dcookie(&mm->exe_file->f_path);

        return cookie;
}


/* Convert the EIP value of a sample into a persistent dentry/offset
 * pair that can then be added to the global event buffer. We make
 * sure to do this lookup before a mm->mmap modification happens so
 * we don't lose track.
 */
static unsigned long
lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset)
{
        unsigned long cookie = NO_COOKIE;
        struct vm_area_struct *vma;

        for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {

                if (addr < vma->vm_start || addr >= vma->vm_end)
                        continue;

                if (vma->vm_file) {
                        cookie = fast_get_dcookie(&vma->vm_file->f_path);
                        *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
                                vma->vm_start;
                } else {
                        /* must be an anonymous map */
                        *offset = addr;
                }

                break;
        }

        if (!vma)
                cookie = INVALID_COOKIE;

        return cookie;
}

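/* Cookie most recently emitted to the event buffer; used to skip redundant
 * cookie-switch records for consecutive samples from the same file.
 */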
static unsigned long last_cookie = INVALID_COOKIE;

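/* The helpers below insert control records into the event buffer: an
 * ESCAPE_CODE entry, a code identifying the record type, then any payload.
 * add_cpu_switch() tags the samples that follow with their source CPU and
 * forces the next cookie switch to be re-emitted.
 */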
static void add_cpu_switch(int i)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CPU_SWITCH_CODE);
        add_event_entry(i);
        last_cookie = INVALID_COOKIE;
}

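/* Note a switch between kernel and user mode for the samples that follow. */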
static void add_kernel_ctx_switch(unsigned int in_kernel)
{
        add_event_entry(ESCAPE_CODE);
        if (in_kernel)
                add_event_entry(KERNEL_ENTER_SWITCH_CODE);
        else
                add_event_entry(KERNEL_EXIT_SWITCH_CODE);
}

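/* Note which task the following samples belong to: pid plus the dcookie of
 * its executable, followed by a separate tgid record for older daemons.
 */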
static void
add_user_ctx_switch(struct task_struct const *task, unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_SWITCH_CODE);
        add_event_entry(task->pid);
        add_event_entry(cookie);
        /* Another code for daemon back-compat */
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_TGID_CODE);
        add_event_entry(task->tgid);
}


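/* Note that subsequent sample offsets are relative to this file's dcookie. */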
static void add_cookie_switch(unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(COOKIE_SWITCH_CODE);
        add_event_entry(cookie);
}


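/* Mark the start of a backtrace (call-chain) record in the event stream. */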
static void add_trace_begin(void)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(TRACE_BEGIN_CODE);
}

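/* Forward an extended-data record (code, pc and a variable-length payload)
 * from the CPU buffer to the event buffer, translating pc into a
 * cookie/offset pair when an mm is available.
 */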
static void add_data(struct op_entry *entry, struct mm_struct *mm)
{
        unsigned long code, pc, val;
        unsigned long cookie;
        off_t offset;

        if (!op_cpu_buffer_get_data(entry, &code))
                return;
        if (!op_cpu_buffer_get_data(entry, &pc))
                return;
        if (!op_cpu_buffer_get_size(entry))
                return;

        if (mm) {
                cookie = lookup_dcookie(mm, pc, &offset);

                if (cookie == NO_COOKIE)
                        offset = pc;
                if (cookie == INVALID_COOKIE) {
                        atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                        offset = pc;
                }
                if (cookie != last_cookie) {
                        add_cookie_switch(cookie);
                        last_cookie = cookie;
                }
        } else
                offset = pc;

        add_event_entry(ESCAPE_CODE);
        add_event_entry(code);
        add_event_entry(offset);        /* Offset from Dcookie */

        while (op_cpu_buffer_get_data(entry, &val))
                add_event_entry(val);
}

static inline void add_sample_entry(unsigned long offset, unsigned long event)
{
        add_event_entry(offset);
        add_event_entry(event);
}


/*
 * Add a sample to the global event buffer. If possible the
 * sample is converted into a persistent dentry/offset pair
 * for later lookup from userspace. Return 0 on failure.
 */
static int
add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel)
{
        unsigned long cookie;
        off_t offset;

        if (in_kernel) {
                add_sample_entry(s->eip, s->event);
                return 1;
        }

        /* add userspace sample */

        if (!mm) {
                atomic_inc(&oprofile_stats.sample_lost_no_mm);
                return 0;
        }

        cookie = lookup_dcookie(mm, s->eip, &offset);

        if (cookie == INVALID_COOKIE) {
                atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                return 0;
        }

        if (cookie != last_cookie) {
                add_cookie_switch(cookie);
                last_cookie = cookie;
        }

        add_sample_entry(offset, s->event);

        return 1;
}


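/* Undo take_tasks_mm(): drop mmap_sem and the reference on the mm. */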
static void release_mm(struct mm_struct *mm)
{
        if (!mm)
                return;
        up_read(&mm->mmap_sem);
        mmput(mm);
}


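/* Pin the task's mm and take mmap_sem for reading so the VMA list stays
 * stable while we look up sample addresses; paired with release_mm().
 */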
static struct mm_struct *take_tasks_mm(struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);
        if (mm)
                down_read(&mm->mmap_sem);
        return mm;
}


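/* An eip of ESCAPE_CODE marks a control record rather than a real sample. */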
static inline int is_code(unsigned long val)
{
        return val == ESCAPE_CODE;
}


/* Move tasks along towards death. Any tasks on dead_tasks
 * will definitely have no remaining references in any
 * CPU buffers at this point, because we use two lists,
 * and to have reached the list, it must have gone through
 * one full sync already.
 */
static void process_task_mortuary(void)
{
        unsigned long flags;
        LIST_HEAD(local_dead_tasks);
        struct task_struct *task;
        struct task_struct *ttask;

        spin_lock_irqsave(&task_mortuary, flags);

        list_splice_init(&dead_tasks, &local_dead_tasks);
        list_splice_init(&dying_tasks, &dead_tasks);

        spin_unlock_irqrestore(&task_mortuary, flags);

        list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
                list_del(&task->tasks);
                free_task(task);
        }
}


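/* Record that this CPU's buffer has been synced. Once every online CPU has
 * been marked, the task mortuary can safely be processed and the mask is
 * cleared for the next round.
 */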
static void mark_done(int cpu)
{
        int i;

        cpumask_set_cpu(cpu, marked_cpus);

        for_each_online_cpu(i) {
                if (!cpumask_test_cpu(i, marked_cpus))
                        return;
        }

        /* All CPUs have been processed at least once,
         * we can process the mortuary once
         */
        process_task_mortuary();

        cpumask_clear(marked_cpus);
}


/* FIXME: this is not sufficient if we implement syscall barrier backtrace
 * traversal; the code switches to sb_sample_start at the first kernel
 * enter/exit switch, so we need a fifth state and some special handling
 * in sync_buffer()
 */
typedef enum {
        sb_bt_ignore = -2,
        sb_buffer_start,
        sb_bt_start,
        sb_sample_start,
} sync_buffer_state;

/* Sync one of the CPU's buffers into the global event buffer.
 * Here we need to go through each batch of samples punctuated
 * by context switch notes, taking the task's mmap_sem and doing
 * lookup in task->mm->mmap to convert EIP into dcookie/offset
 * value.
 */
void sync_buffer(int cpu)
{
        struct mm_struct *mm = NULL;
        struct mm_struct *oldmm;
        unsigned long val;
        struct task_struct *new;
        unsigned long cookie = 0;
        int in_kernel = 1;
        sync_buffer_state state = sb_buffer_start;
        unsigned int i;
        unsigned long available;
        unsigned long flags;
        struct op_entry entry;
        struct op_sample *sample;

        mutex_lock(&buffer_mutex);

        add_cpu_switch(cpu);

        op_cpu_buffer_reset(cpu);
        available = op_cpu_buffer_entries(cpu);

        for (i = 0; i < available; ++i) {
                sample = op_cpu_buffer_read_entry(&entry, cpu);
                if (!sample)
                        break;

                if (is_code(sample->eip)) {
                        flags = sample->event;
                        if (flags & TRACE_BEGIN) {
                                state = sb_bt_start;
                                add_trace_begin();
                        }
                        if (flags & KERNEL_CTX_SWITCH) {
                                /* kernel/userspace switch */
                                in_kernel = flags & IS_KERNEL;
                                if (state == sb_buffer_start)
                                        state = sb_sample_start;
                                add_kernel_ctx_switch(flags & IS_KERNEL);
                        }
                        if (flags & USER_CTX_SWITCH
                            && op_cpu_buffer_get_data(&entry, &val)) {
                                /* userspace context switch */
                                new = (struct task_struct *)val;
                                oldmm = mm;
                                release_mm(oldmm);
                                mm = take_tasks_mm(new);
                                if (mm != oldmm)
                                        cookie = get_exec_dcookie(mm);
                                add_user_ctx_switch(new, cookie);
                        }
                        if (op_cpu_buffer_get_size(&entry))
                                add_data(&entry, mm);
                        continue;
                }

                if (state < sb_bt_start)
                        /* ignore sample */
                        continue;

                if (add_sample(mm, sample, in_kernel))
                        continue;

                /* ignore backtraces if failed to add a sample */
                if (state == sb_bt_start) {
                        state = sb_bt_ignore;
                        atomic_inc(&oprofile_stats.bt_lost_no_mapping);
                }
        }
        release_mm(mm);

        mark_done(cpu);

        mutex_unlock(&buffer_mutex);
}

/* The function can be used to add a buffer worth of data directly to
 * the kernel buffer. The buffer is assumed to be a circular buffer.
 * Take the entries from index start and end at index stop, wrapping
 * at max.
 */
void oprofile_put_buff(unsigned long *buf, unsigned int start,
                       unsigned int stop, unsigned int max)
{
        int i;

        i = start;

        mutex_lock(&buffer_mutex);
        while (i != stop) {
                add_event_entry(buf[i++]);

                if (i >= max)
                        i = 0;
        }

        mutex_unlock(&buffer_mutex);
}