Linux Kernel  3.7.1
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
kmmio.c
Go to the documentation of this file.
1 /* Support for MMIO probes.
2  * Borrows much code from kprobes
3  * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
4  * 2007 Alexander Eichner
5  * 2008 Pekka Paalanen <pq@iki.fi>
6  */
7 
8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 
10 #include <linux/list.h>
11 #include <linux/rculist.h>
12 #include <linux/spinlock.h>
13 #include <linux/hash.h>
14 #include <linux/init.h>
15 #include <linux/module.h>
16 #include <linux/kernel.h>
17 #include <linux/uaccess.h>
18 #include <linux/ptrace.h>
19 #include <linux/preempt.h>
20 #include <linux/percpu.h>
21 #include <linux/kdebug.h>
22 #include <linux/mutex.h>
23 #include <linux/io.h>
24 #include <linux/slab.h>
25 #include <asm/cacheflush.h>
26 #include <asm/tlbflush.h>
27 #include <linux/errno.h>
28 #include <asm/debugreg.h>
29 #include <linux/mmiotrace.h>
30 
31 #define KMMIO_PAGE_HASH_BITS 4
32 #define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
33 
35  struct list_head list;
37  unsigned long page; /* location of the fault page */
38  pteval_t old_presence; /* page presence prior to arming */
39  bool armed;
40 
41  /*
42  * Number of times this page has been registered as a part
43  * of a probe. If zero, page is disarmed and this may be freed.
44  * Used only by writers (RCU) and post_kmmio_handler().
45  * Protected by kmmio_lock, when linked into kmmio_page_table.
46  */
47  int count;
48 
50 };
51 
53  struct rcu_head rcu;
55 };
56 
/*
 * Per-CPU state of the kmmio hit currently being single-stepped.
 * Filled in by kmmio_handler() and consumed by post_kmmio_handler().
 */
struct kmmio_context {
	/* fault page that was disarmed for the step; re-armed afterwards */
	struct kmmio_fault_page *fpage;
	/* probe covering the faulting address, or NULL if none matched */
	struct kmmio_probe *probe;
	/* TF and IF bits of regs->flags as they were before stepping */
	unsigned long saved_flags;
	/* faulting address of the hit in flight */
	unsigned long addr;
	/* non-zero while a hit is in flight on this CPU */
	int active;
};
64 
65 static DEFINE_SPINLOCK(kmmio_lock);
66 
67 /* Protected by kmmio_lock */
68 unsigned int kmmio_count;
69 
70 /* Read-protected by RCU, write-protected by kmmio_lock. */
71 static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
72 static LIST_HEAD(kmmio_probes);
73 
74 static struct list_head *kmmio_page_list(unsigned long page)
75 {
76  return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
77 }
78 
79 /* Accessed per-cpu */
80 static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
81 
82 /*
83  * this is basically a dynamic stabbing problem:
84  * Could use the existing prio tree code or
85  * Possible better implementations:
86  * The Interval Skip List: A Data Structure for Finding All Intervals That
87  * Overlap a Point (might be simple)
88  * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
89  */
90 /* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
91 static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
92 {
93  struct kmmio_probe *p;
94  list_for_each_entry_rcu(p, &kmmio_probes, list) {
95  if (addr >= p->addr && addr < (p->addr + p->len))
96  return p;
97  }
98  return NULL;
99 }
100 
101 /* You must be holding RCU read lock. */
102 static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
103 {
104  struct list_head *head;
105  struct kmmio_fault_page *f;
106 
107  page &= PAGE_MASK;
108  head = kmmio_page_list(page);
109  list_for_each_entry_rcu(f, head, list) {
110  if (f->page == page)
111  return f;
112  }
113  return NULL;
114 }
115 
116 static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
117 {
118  pmdval_t v = pmd_val(*pmd);
119  if (clear) {
120  *old = v & _PAGE_PRESENT;
121  v &= ~_PAGE_PRESENT;
122  } else /* presume this has been called with clear==true previously */
123  v |= *old;
124  set_pmd(pmd, __pmd(v));
125 }
126 
127 static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
128 {
129  pteval_t v = pte_val(*pte);
130  if (clear) {
131  *old = v & _PAGE_PRESENT;
132  v &= ~_PAGE_PRESENT;
133  } else /* presume this has been called with clear==true previously */
134  v |= *old;
135  set_pte_atomic(pte, __pte(v));
136 }
137 
138 static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
139 {
140  unsigned int level;
141  pte_t *pte = lookup_address(f->page, &level);
142 
143  if (!pte) {
144  pr_err("no pte for page 0x%08lx\n", f->page);
145  return -1;
146  }
147 
148  switch (level) {
149  case PG_LEVEL_2M:
150  clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
151  break;
152  case PG_LEVEL_4K:
153  clear_pte_presence(pte, clear, &f->old_presence);
154  break;
155  default:
156  pr_err("unexpected page level 0x%x.\n", level);
157  return -1;
158  }
159 
160  __flush_tlb_one(f->page);
161  return 0;
162 }
163 
164 /*
165  * Mark the given page as not present. Access to it will trigger a fault.
166  *
167  * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
168  * protection is ignored here. RCU read lock is assumed held, so the struct
169  * will not disappear unexpectedly. Furthermore, the caller must guarantee,
170  * that double arming the same virtual address (page) cannot occur.
171  *
172  * Double disarming on the other hand is allowed, and may occur when a fault
173  * and mmiotrace shutdown happen simultaneously.
174  */
175 static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
176 {
177  int ret;
178  WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
179  if (f->armed) {
180  pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
181  f->page, f->count, !!f->old_presence);
182  }
183  ret = clear_page_presence(f, true);
184  WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
185  f->page);
186  f->armed = true;
187  return ret;
188 }
189 
191 static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
192 {
193  int ret = clear_page_presence(f, false);
194  WARN_ONCE(ret < 0,
195  KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
196  f->armed = false;
197 }
198 
199 /*
200  * This is being called from do_page_fault().
201  *
202  * We may be in an interrupt or a critical section. Also prefecthing may
203  * trigger a page fault. We may be in the middle of process switch.
204  * We cannot take any locks, because we could be executing especially
205  * within a kmmio critical section.
206  *
207  * Local interrupts are disabled, so preemption cannot happen.
208  * Do not enable interrupts, do not sleep, and watch out for other CPUs.
209  */
210 /*
211  * Interrupts are disabled on entry as trap3 is an interrupt gate
212  * and they remain disabled throughout this function.
213  */
214 int kmmio_handler(struct pt_regs *regs, unsigned long addr)
215 {
216  struct kmmio_context *ctx;
217  struct kmmio_fault_page *faultpage;
218  int ret = 0; /* default to fault not handled */
219 
220  /*
221  * Preemption is now disabled to prevent process switch during
222  * single stepping. We can only handle one active kmmio trace
223  * per cpu, so ensure that we finish it before something else
224  * gets to run. We also hold the RCU read lock over single
225  * stepping to avoid looking up the probe and kmmio_fault_page
226  * again.
227  */
228  preempt_disable();
229  rcu_read_lock();
230 
231  faultpage = get_kmmio_fault_page(addr);
232  if (!faultpage) {
233  /*
234  * Either this page fault is not caused by kmmio, or
235  * another CPU just pulled the kmmio probe from under
236  * our feet. The latter case should not be possible.
237  */
238  goto no_kmmio;
239  }
240 
241  ctx = &get_cpu_var(kmmio_ctx);
242  if (ctx->active) {
243  if (addr == ctx->addr) {
244  /*
245  * A second fault on the same page means some other
246  * condition needs handling by do_page_fault(), the
247  * page really not being present is the most common.
248  */
249  pr_debug("secondary hit for 0x%08lx CPU %d.\n",
250  addr, smp_processor_id());
251 
252  if (!faultpage->old_presence)
253  pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
254  addr, smp_processor_id());
255  } else {
256  /*
257  * Prevent overwriting already in-flight context.
258  * This should not happen, let's hope disarming at
259  * least prevents a panic.
260  */
261  pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
262  smp_processor_id(), addr);
263  pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
264  disarm_kmmio_fault_page(faultpage);
265  }
266  goto no_kmmio_ctx;
267  }
268  ctx->active++;
269 
270  ctx->fpage = faultpage;
271  ctx->probe = get_kmmio_probe(addr);
272  ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
273  ctx->addr = addr;
274 
275  if (ctx->probe && ctx->probe->pre_handler)
276  ctx->probe->pre_handler(ctx->probe, regs, addr);
277 
278  /*
279  * Enable single-stepping and disable interrupts for the faulting
280  * context. Local interrupts must not get enabled during stepping.
281  */
282  regs->flags |= X86_EFLAGS_TF;
283  regs->flags &= ~X86_EFLAGS_IF;
284 
285  /* Now we set present bit in PTE and single step. */
286  disarm_kmmio_fault_page(ctx->fpage);
287 
288  /*
289  * If another cpu accesses the same page while we are stepping,
290  * the access will not be caught. It will simply succeed and the
291  * only downside is we lose the event. If this becomes a problem,
292  * the user should drop to single cpu before tracing.
293  */
294 
295  put_cpu_var(kmmio_ctx);
296  return 1; /* fault handled */
297 
298 no_kmmio_ctx:
299  put_cpu_var(kmmio_ctx);
300 no_kmmio:
301  rcu_read_unlock();
303  return ret;
304 }
305 
306 /*
307  * Interrupts are disabled on entry as trap1 is an interrupt gate
308  * and they remain disabled throughout this function.
309  * This must always get called as the pair to kmmio_handler().
310  */
311 static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
312 {
313  int ret = 0;
314  struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
315 
316  if (!ctx->active) {
317  /*
318  * debug traps without an active context are due to either
319  * something external causing them (f.e. using a debugger while
320  * mmio tracing enabled), or erroneous behaviour
321  */
322  pr_warning("unexpected debug trap on CPU %d.\n",
323  smp_processor_id());
324  goto out;
325  }
326 
327  if (ctx->probe && ctx->probe->post_handler)
328  ctx->probe->post_handler(ctx->probe, condition, regs);
329 
330  /* Prevent racing against release_kmmio_fault_page(). */
331  spin_lock(&kmmio_lock);
332  if (ctx->fpage->count)
333  arm_kmmio_fault_page(ctx->fpage);
334  spin_unlock(&kmmio_lock);
335 
336  regs->flags &= ~X86_EFLAGS_TF;
337  regs->flags |= ctx->saved_flags;
338 
339  /* These were acquired in kmmio_handler(). */
340  ctx->active--;
341  BUG_ON(ctx->active);
342  rcu_read_unlock();
344 
345  /*
346  * if somebody else is singlestepping across a probe point, flags
347  * will have TF set, in which case, continue the remaining processing
348  * of do_debug, as if this is not a probe hit.
349  */
350  if (!(regs->flags & X86_EFLAGS_TF))
351  ret = 1;
352 out:
353  put_cpu_var(kmmio_ctx);
354  return ret;
355 }
356 
357 /* You must be holding kmmio_lock. */
358 static int add_kmmio_fault_page(unsigned long page)
359 {
360  struct kmmio_fault_page *f;
361 
362  page &= PAGE_MASK;
363  f = get_kmmio_fault_page(page);
364  if (f) {
365  if (!f->count)
366  arm_kmmio_fault_page(f);
367  f->count++;
368  return 0;
369  }
370 
371  f = kzalloc(sizeof(*f), GFP_ATOMIC);
372  if (!f)
373  return -1;
374 
375  f->count = 1;
376  f->page = page;
377 
378  if (arm_kmmio_fault_page(f)) {
379  kfree(f);
380  return -1;
381  }
382 
383  list_add_rcu(&f->list, kmmio_page_list(f->page));
384 
385  return 0;
386 }
387 
388 /* You must be holding kmmio_lock. */
389 static void release_kmmio_fault_page(unsigned long page,
390  struct kmmio_fault_page **release_list)
391 {
392  struct kmmio_fault_page *f;
393 
394  page &= PAGE_MASK;
395  f = get_kmmio_fault_page(page);
396  if (!f)
397  return;
398 
399  f->count--;
400  BUG_ON(f->count < 0);
401  if (!f->count) {
402  disarm_kmmio_fault_page(f);
403  if (!f->scheduled_for_release) {
404  f->release_next = *release_list;
405  *release_list = f;
406  f->scheduled_for_release = true;
407  }
408  }
409 }
410 
411 /*
412  * With page-unaligned ioremaps, one or two armed pages may contain
413  * addresses from outside the intended mapping. Events for these addresses
414  * are currently silently dropped. The events may result only from programming
415  * mistakes by accessing addresses before the beginning or past the end of a
416  * mapping.
417  */
419 {
420  unsigned long flags;
421  int ret = 0;
422  unsigned long size = 0;
423  const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
424 
425  spin_lock_irqsave(&kmmio_lock, flags);
426  if (get_kmmio_probe(p->addr)) {
427  ret = -EEXIST;
428  goto out;
429  }
430  kmmio_count++;
431  list_add_rcu(&p->list, &kmmio_probes);
432  while (size < size_lim) {
433  if (add_kmmio_fault_page(p->addr + size))
434  pr_err("Unable to set page fault.\n");
435  size += PAGE_SIZE;
436  }
437 out:
438  spin_unlock_irqrestore(&kmmio_lock, flags);
439  /*
440  * XXX: What should I do here?
441  * Here was a call to global_flush_tlb(), but it does not exist
442  * anymore. It seems it's not needed after all.
443  */
444  return ret;
445 }
447 
448 static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
449 {
450  struct kmmio_delayed_release *dr = container_of(
451  head,
452  struct kmmio_delayed_release,
453  rcu);
454  struct kmmio_fault_page *f = dr->release_list;
455  while (f) {
456  struct kmmio_fault_page *next = f->release_next;
457  BUG_ON(f->count);
458  kfree(f);
459  f = next;
460  }
461  kfree(dr);
462 }
463 
464 static void remove_kmmio_fault_pages(struct rcu_head *head)
465 {
466  struct kmmio_delayed_release *dr =
467  container_of(head, struct kmmio_delayed_release, rcu);
468  struct kmmio_fault_page *f = dr->release_list;
469  struct kmmio_fault_page **prevp = &dr->release_list;
470  unsigned long flags;
471 
472  spin_lock_irqsave(&kmmio_lock, flags);
473  while (f) {
474  if (!f->count) {
475  list_del_rcu(&f->list);
476  prevp = &f->release_next;
477  } else {
478  *prevp = f->release_next;
479  f->release_next = NULL;
480  f->scheduled_for_release = false;
481  }
482  f = *prevp;
483  }
484  spin_unlock_irqrestore(&kmmio_lock, flags);
485 
486  /* This is the real RCU destroy call. */
487  call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
488 }
489 
490 /*
491  * Remove a kmmio probe. You have to synchronize_rcu() before you can be
492  * sure that the callbacks will not be called anymore. Only after that
493  * you may actually release your struct kmmio_probe.
494  *
495  * Unregistering a kmmio fault page has three steps:
496  * 1. release_kmmio_fault_page()
497  * Disarm the page, wait a grace period to let all faults finish.
498  * 2. remove_kmmio_fault_pages()
499  * Remove the pages from kmmio_page_table.
500  * 3. rcu_free_kmmio_fault_pages()
501  * Actually free the kmmio_fault_page structs as with RCU.
502  */
504 {
505  unsigned long flags;
506  unsigned long size = 0;
507  const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
508  struct kmmio_fault_page *release_list = NULL;
509  struct kmmio_delayed_release *drelease;
510 
511  spin_lock_irqsave(&kmmio_lock, flags);
512  while (size < size_lim) {
513  release_kmmio_fault_page(p->addr + size, &release_list);
514  size += PAGE_SIZE;
515  }
516  list_del_rcu(&p->list);
517  kmmio_count--;
518  spin_unlock_irqrestore(&kmmio_lock, flags);
519 
520  if (!release_list)
521  return;
522 
523  drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
524  if (!drelease) {
525  pr_crit("leaking kmmio_fault_page objects.\n");
526  return;
527  }
528  drelease->release_list = release_list;
529 
530  /*
531  * This is not really RCU here. We have just disarmed a set of
532  * pages so that they cannot trigger page faults anymore. However,
533  * we cannot remove the pages from kmmio_page_table,
534  * because a probe hit might be in flight on another CPU. The
535  * pages are collected into a list, and they will be removed from
536  * kmmio_page_table when it is certain that no probe hit related to
537  * these pages can be in flight. RCU grace period sounds like a
538  * good choice.
539  *
540  * If we removed the pages too early, kmmio page fault handler might
541  * not find the respective kmmio_fault_page and determine it's not
542  * a kmmio fault, when it actually is. This would lead to madness.
543  */
544  call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
545 }
547 
548 static int
549 kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
550 {
551  struct die_args *arg = args;
552  unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
553 
554  if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
555  if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
556  /*
557  * Reset the BS bit in dr6 (pointed by args->err) to
558  * denote completion of processing
559  */
560  *dr6_p &= ~DR_STEP;
561  return NOTIFY_STOP;
562  }
563 
564  return NOTIFY_DONE;
565 }
566 
567 static struct notifier_block nb_die = {
568  .notifier_call = kmmio_die_notifier
569 };
570 
571 int kmmio_init(void)
572 {
573  int i;
574 
575  for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
576  INIT_LIST_HEAD(&kmmio_page_table[i]);
577 
578  return register_die_notifier(&nb_die);
579 }
580 
581 void kmmio_cleanup(void)
582 {
583  int i;
584 
585  unregister_die_notifier(&nb_die);
586  for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
587  WARN_ONCE(!list_empty(&kmmio_page_table[i]),
588  KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
589  }
590 }