Linux Kernel 3.7.1
core-book3s.c
1 /*
2  * Performance event support - powerpc architecture code
3  *
4  * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 #include <linux/kernel.h>
12 #include <linux/sched.h>
13 #include <linux/perf_event.h>
14 #include <linux/percpu.h>
15 #include <linux/hardirq.h>
16 #include <asm/reg.h>
17 #include <asm/pmc.h>
18 #include <asm/machdep.h>
19 #include <asm/firmware.h>
20 #include <asm/ptrace.h>
21 
22 struct cpu_hw_events {
23  int n_events;
24  int n_percpu;
25  int disabled;
26  int n_added;
27  int n_limited;
28  u8 pmcs_enabled;
29  struct perf_event *event[MAX_HWEVENTS];
30  u64 events[MAX_HWEVENTS];
31  unsigned int flags[MAX_HWEVENTS];
32  unsigned long mmcr[3];
33  struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS];
34  u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS];
35  u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
36  unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
37  unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
38 
39  unsigned int group_flag;
40  int n_txn_start;
41 };
42 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
43 
44 struct power_pmu *ppmu;
45 
46 /*
47  * Normally, to ignore kernel events we set the FCS (freeze counters
48  * in supervisor mode) bit in MMCR0, but if the kernel runs with the
49  * hypervisor bit set in the MSR, or if we are running on a processor
50  * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
51  * then we need to use the FCHV bit to ignore kernel events.
52  */
53 static unsigned int freeze_events_kernel = MMCR0_FCS;
54 
55 /*
56  * 32-bit doesn't have MMCRA but does have an MMCR2,
57  * and a few other names are different.
58  */
59 #ifdef CONFIG_PPC32
60 
61 #define MMCR0_FCHV 0
62 #define MMCR0_PMCjCE MMCR0_PMCnCE
63 
64 #define SPRN_MMCRA SPRN_MMCR2
65 #define MMCRA_SAMPLE_ENABLE 0
66 
67 static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
68 {
69  return 0;
70 }
71 static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
72 static inline u32 perf_get_misc_flags(struct pt_regs *regs)
73 {
74  return 0;
75 }
76 static inline void perf_read_regs(struct pt_regs *regs)
77 {
78  regs->result = 0;
79 }
80 static inline int perf_intr_is_nmi(struct pt_regs *regs)
81 {
82  return 0;
83 }
84 
85 static inline int siar_valid(struct pt_regs *regs)
86 {
87  return 1;
88 }
89 
90 #endif /* CONFIG_PPC32 */
91 
92 /*
93  * Things that are specific to 64-bit implementations.
94  */
95 #ifdef CONFIG_PPC64
96 
97 static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
98 {
99  unsigned long mmcra = regs->dsisr;
100 
101  if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
102  unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
103  if (slot > 1)
104  return 4 * (slot - 1);
105  }
106  return 0;
107 }
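/*
 * For illustration: in marked-event sampling SIAR generally holds the
 * address of the first instruction of the sampled dispatch group, and
 * the MMCRA slot field (1-based) says which instruction in that group
 * was sampled.  Instructions are 4 bytes, so the adjustment added to
 * SIAR works out to 4 * (slot - 1):
 *
 *	slot 1 -> +0 bytes, slot 2 -> +4, slot 3 -> +8, ...
 *
 * Slot values of 0 and 1 both leave SIAR unadjusted, hence the
 * "slot > 1" test above.
 */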
108 
109 /*
110  * The user wants a data address recorded.
111  * If we're not doing instruction sampling, give them the SDAR
112  * (sampled data address). If we are doing instruction sampling, then
113  * only give them the SDAR if it corresponds to the instruction
114  * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC or
115  * the [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA.
116  */
117 static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
118 {
119  unsigned long mmcra = regs->dsisr;
120  unsigned long sdsync;
121 
122  if (ppmu->flags & PPMU_SIAR_VALID)
123  sdsync = POWER7P_MMCRA_SDAR_VALID;
124  else if (ppmu->flags & PPMU_ALT_SIPR)
125  sdsync = POWER6_MMCRA_SDSYNC;
126  else
127  sdsync = MMCRA_SDSYNC;
128 
129  if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
130  *addrp = mfspr(SPRN_SDAR);
131 }
132 
133 static bool mmcra_sihv(unsigned long mmcra)
134 {
135  unsigned long sihv = MMCRA_SIHV;
136 
137  if (ppmu->flags & PPMU_ALT_SIPR)
138  sihv = POWER6_MMCRA_SIHV;
139 
140  return !!(mmcra & sihv);
141 }
142 
143 static bool mmcra_sipr(unsigned long mmcra)
144 {
145  unsigned long sipr = MMCRA_SIPR;
146 
147  if (ppmu->flags & PPMU_ALT_SIPR)
148  sipr = POWER6_MMCRA_SIPR;
149 
150  return !!(mmcra & sipr);
151 }
152 
153 static inline u32 perf_flags_from_msr(struct pt_regs *regs)
154 {
155  if (regs->msr & MSR_PR)
156  return PERF_RECORD_MISC_USER;
157  if ((regs->msr & MSR_HV) && freeze_events_kernel != MMCR0_FCHV)
158  return PERF_RECORD_MISC_HYPERVISOR;
159  return PERF_RECORD_MISC_KERNEL;
160 }
161 
162 static inline u32 perf_get_misc_flags(struct pt_regs *regs)
163 {
164  unsigned long mmcra = regs->dsisr;
165  unsigned long use_siar = regs->result;
166 
167  if (!use_siar)
168  return perf_flags_from_msr(regs);
169 
170  /*
171  * If we don't have flags in MMCRA, rather than using
172  * the MSR, we intuit the flags from the address in
173  * SIAR which should give slightly more reliable
174  * results
175  */
176  if (ppmu->flags & PPMU_NO_SIPR) {
177  unsigned long siar = mfspr(SPRN_SIAR);
178  if (siar >= PAGE_OFFSET)
179  return PERF_RECORD_MISC_KERNEL;
180  return PERF_RECORD_MISC_USER;
181  }
182 
183  /* PR has priority over HV, so order below is important */
184  if (mmcra_sipr(mmcra))
185  return PERF_RECORD_MISC_USER;
186  if (mmcra_sihv(mmcra) && (freeze_events_kernel != MMCR0_FCHV))
187  return PERF_RECORD_MISC_HYPERVISOR;
188  return PERF_RECORD_MISC_KERNEL;
189 }
190 
191 /*
192  * Overload regs->dsisr to store MMCRA so we only need to read it once
193  * on each interrupt.
194  * Overload regs->result to specify whether we should use the MSR (result
195  * is zero) or the SIAR (result is non zero).
196  */
197 static inline void perf_read_regs(struct pt_regs *regs)
198 {
199  unsigned long mmcra = mfspr(SPRN_MMCRA);
200  int marked = mmcra & MMCRA_SAMPLE_ENABLE;
201  int use_siar;
202 
203  /*
204  * If this isn't a PMU exception (eg a software event) the SIAR is
205  * not valid. Use pt_regs.
206  *
207  * If it is a marked event use the SIAR.
208  *
209  * If the PMU doesn't update the SIAR for non marked events use
210  * pt_regs.
211  *
212  * If the PMU has HV/PR flags then check to see if they
213  * place the exception in userspace. If so, use pt_regs. In
214  * continuous sampling mode the SIAR and the PMU exception are
215  * not synchronised, so they may be many instructions apart.
216  * This can result in confusing backtraces. We still want
217  * hypervisor samples as well as samples in the kernel with
218  * interrupts off hence the userspace check.
219  */
220  if (TRAP(regs) != 0xf00)
221  use_siar = 0;
222  else if (marked)
223  use_siar = 1;
224  else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING))
225  use_siar = 0;
226  else if (!(ppmu->flags & PPMU_NO_SIPR) && mmcra_sipr(mmcra))
227  use_siar = 0;
228  else
229  use_siar = 1;
230 
231  regs->dsisr = mmcra;
232  regs->result = use_siar;
233 }
234 
235 /*
236  * If interrupts were soft-disabled when a PMU interrupt occurs, treat
237  * it as an NMI.
238  */
239 static inline int perf_intr_is_nmi(struct pt_regs *regs)
240 {
241  return !regs->softe;
242 }
243 
244 /*
245  * On processors like P7+ that have the SIAR-Valid bit, marked instructions
246  * must be sampled only if the SIAR-valid bit is set.
247  *
248  * For unmarked instructions and for processors that don't have the SIAR-Valid
249  * bit, assume that SIAR is valid.
250  */
251 static inline int siar_valid(struct pt_regs *regs)
252 {
253  unsigned long mmcra = regs->dsisr;
254  int marked = mmcra & MMCRA_SAMPLE_ENABLE;
255 
256  if ((ppmu->flags & PPMU_SIAR_VALID) && marked)
257  return mmcra & POWER7P_MMCRA_SIAR_VALID;
258 
259  return 1;
260 }
261 
262 #endif /* CONFIG_PPC64 */
263 
264 static void perf_event_interrupt(struct pt_regs *regs);
265 
266 void perf_event_print_debug(void)
267 {
268 }
269 
270 /*
271  * Read one performance monitor counter (PMC).
272  */
273 static unsigned long read_pmc(int idx)
274 {
275  unsigned long val;
276 
277  switch (idx) {
278  case 1:
279  val = mfspr(SPRN_PMC1);
280  break;
281  case 2:
282  val = mfspr(SPRN_PMC2);
283  break;
284  case 3:
285  val = mfspr(SPRN_PMC3);
286  break;
287  case 4:
288  val = mfspr(SPRN_PMC4);
289  break;
290  case 5:
291  val = mfspr(SPRN_PMC5);
292  break;
293  case 6:
294  val = mfspr(SPRN_PMC6);
295  break;
296 #ifdef CONFIG_PPC64
297  case 7:
298  val = mfspr(SPRN_PMC7);
299  break;
300  case 8:
301  val = mfspr(SPRN_PMC8);
302  break;
303 #endif /* CONFIG_PPC64 */
304  default:
305  printk(KERN_ERR "oops trying to read PMC%d\n", idx);
306  val = 0;
307  }
308  return val;
309 }
310 
311 /*
312  * Write one PMC.
313  */
314 static void write_pmc(int idx, unsigned long val)
315 {
316  switch (idx) {
317  case 1:
318  mtspr(SPRN_PMC1, val);
319  break;
320  case 2:
321  mtspr(SPRN_PMC2, val);
322  break;
323  case 3:
324  mtspr(SPRN_PMC3, val);
325  break;
326  case 4:
327  mtspr(SPRN_PMC4, val);
328  break;
329  case 5:
330  mtspr(SPRN_PMC5, val);
331  break;
332  case 6:
333  mtspr(SPRN_PMC6, val);
334  break;
335 #ifdef CONFIG_PPC64
336  case 7:
337  mtspr(SPRN_PMC7, val);
338  break;
339  case 8:
340  mtspr(SPRN_PMC8, val);
341  break;
342 #endif /* CONFIG_PPC64 */
343  default:
344  printk(KERN_ERR "oops trying to write PMC%d\n", idx);
345  }
346 }
347 
348 /*
349  * Check if a set of events can all go on the PMU at once.
350  * If they can't, this will look at alternative codes for the events
351  * and see if any combination of alternative codes is feasible.
352  * The feasible set is returned in event_id[].
353  */
354 static int power_check_constraints(struct cpu_hw_events *cpuhw,
355  u64 event_id[], unsigned int cflags[],
356  int n_ev)
357 {
358  unsigned long mask, value, nv;
359  unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
360  int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
361  int i, j;
362  unsigned long addf = ppmu->add_fields;
363  unsigned long tadd = ppmu->test_adder;
364 
365  if (n_ev > ppmu->n_counter)
366  return -1;
367 
368  /* First see if the events will go on as-is */
369  for (i = 0; i < n_ev; ++i) {
370  if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
371  && !ppmu->limited_pmc_event(event_id[i])) {
372  ppmu->get_alternatives(event_id[i], cflags[i],
373  cpuhw->alternatives[i]);
374  event_id[i] = cpuhw->alternatives[i][0];
375  }
376  if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
377  &cpuhw->avalues[i][0]))
378  return -1;
379  }
380  value = mask = 0;
381  for (i = 0; i < n_ev; ++i) {
382  nv = (value | cpuhw->avalues[i][0]) +
383  (value & cpuhw->avalues[i][0] & addf);
384  if ((((nv + tadd) ^ value) & mask) != 0 ||
385  (((nv + tadd) ^ cpuhw->avalues[i][0]) &
386  cpuhw->amasks[i][0]) != 0)
387  break;
388  value = nv;
389  mask |= cpuhw->amasks[i][0];
390  }
391  if (i == n_ev)
392  return 0; /* all OK */
393 
394  /* doesn't work, gather alternatives... */
395  if (!ppmu->get_alternatives)
396  return -1;
397  for (i = 0; i < n_ev; ++i) {
398  choice[i] = 0;
399  n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
400  cpuhw->alternatives[i]);
401  for (j = 1; j < n_alt[i]; ++j)
402  ppmu->get_constraint(cpuhw->alternatives[i][j],
403  &cpuhw->amasks[i][j],
404  &cpuhw->avalues[i][j]);
405  }
406 
407  /* enumerate all possibilities and see if any will work */
408  i = 0;
409  j = -1;
410  value = mask = nv = 0;
411  while (i < n_ev) {
412  if (j >= 0) {
413  /* we're backtracking, restore context */
414  value = svalues[i];
415  mask = smasks[i];
416  j = choice[i];
417  }
418  /*
419  * See if any alternative k for event_id i,
420  * where k > j, will satisfy the constraints.
421  */
422  while (++j < n_alt[i]) {
423  nv = (value | cpuhw->avalues[i][j]) +
424  (value & cpuhw->avalues[i][j] & addf);
425  if ((((nv + tadd) ^ value) & mask) == 0 &&
426  (((nv + tadd) ^ cpuhw->avalues[i][j])
427  & cpuhw->amasks[i][j]) == 0)
428  break;
429  }
430  if (j >= n_alt[i]) {
431  /*
432  * No feasible alternative, backtrack
433  * to event_id i-1 and continue enumerating its
434  * alternatives from where we got up to.
435  */
436  if (--i < 0)
437  return -1;
438  } else {
439  /*
440  * Found a feasible alternative for event_id i,
441  * remember where we got up to with this event_id,
442  * go on to the next event_id, and start with
443  * the first alternative for it.
444  */
445  choice[i] = j;
446  svalues[i] = value;
447  smasks[i] = mask;
448  value = nv;
449  mask |= cpuhw->amasks[i][j];
450  ++i;
451  j = -1;
452  }
453  }
454 
455  /* OK, we have a feasible combination, tell the caller the solution */
456  for (i = 0; i < n_ev; ++i)
457  event_id[i] = cpuhw->alternatives[i][choice[i]];
458  return 0;
459 }
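/*
 * For illustration: the loop above is a depth-first backtracking search
 * over per-event alternative codes.  With two events A and B whose
 * alternative lists are {A0, A1} and {B0, B1}, the combinations are
 * tried in the order (A0,B0), (A0,B1), (A1,B0), (A1,B1).  The
 * svalues[]/smasks[] arrays save the accumulated constraint state at
 * each level so that backtracking to event i can resume from
 * alternative choice[i] + 1 without recomputing the earlier events.
 */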
460 
461 /*
462  * Check if newly-added events have consistent settings for
463  * exclude_{user,kernel,hv} with each other and any previously
464  * added events.
465  */
466 static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
467  int n_prev, int n_new)
468 {
469  int eu = 0, ek = 0, eh = 0;
470  int i, n, first;
471  struct perf_event *event;
472 
473  n = n_prev + n_new;
474  if (n <= 1)
475  return 0;
476 
477  first = 1;
478  for (i = 0; i < n; ++i) {
479  if (cflags[i] & PPMU_LIMITED_PMC_OK) {
480  cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
481  continue;
482  }
483  event = ctrs[i];
484  if (first) {
485  eu = event->attr.exclude_user;
486  ek = event->attr.exclude_kernel;
487  eh = event->attr.exclude_hv;
488  first = 0;
489  } else if (event->attr.exclude_user != eu ||
490  event->attr.exclude_kernel != ek ||
491  event->attr.exclude_hv != eh) {
492  return -EAGAIN;
493  }
494  }
495 
496  if (eu || ek || eh)
497  for (i = 0; i < n; ++i)
498  if (cflags[i] & PPMU_LIMITED_PMC_OK)
499  cflags[i] |= PPMU_LIMITED_PMC_REQD;
500 
501  return 0;
502 }
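/*
 * For illustration: the exclude_user/kernel/hv settings must agree
 * across events because they are implemented with the global MMCR0
 * freeze bits (MMCR0_FCP, freeze_events_kernel, MMCR0_FCHV) set in
 * power_pmu_enable(), and those bits freeze every ordinary PMC at once
 * rather than individual counters.  Events marked PPMU_LIMITED_PMC_OK
 * are exempt from the comparison, but when any exclusion is in effect
 * they are forced onto a limited PMC (PPMU_LIMITED_PMC_REQD) so that
 * the freeze bits set on behalf of the other events don't stop them.
 */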
503 
504 static u64 check_and_compute_delta(u64 prev, u64 val)
505 {
506  u64 delta = (val - prev) & 0xfffffffful;
507 
508  /*
509  * POWER7 can roll back counter values, if the new value is smaller
510  * than the previous value it will cause the delta and the counter to
511  * have bogus values unless we rolled a counter over. If a counter is
512  * rolled back, it will be smaller, but within 256, which is the maximum
513  * number of events to roll back at once. If we detect a rollback
514  * return 0. This can lead to a small lack of precision in the
515  * counters.
516  */
517  if (prev > val && (prev - val) < 256)
518  delta = 0;
519 
520  return delta;
521 }
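/*
 * The PMCs are 32-bit, so the delta above is taken modulo 2^32, and a
 * POWER7-style rollback shows up as prev > val with a gap of less than
 * 256.  A rough userspace model of the same arithmetic (hypothetical
 * helper, only for experimenting with the edge cases):
 *
 *	u64 delta_model(u64 prev, u64 val)
 *	{
 *		u64 delta = (val - prev) & 0xfffffffful;
 *
 *		if (prev > val && (prev - val) < 256)
 *			delta = 0;
 *		return delta;
 *	}
 *
 *	delta_model(0xfffffff0, 0x10) == 0x20	(counter wrapped)
 *	delta_model(100, 90)          == 0	(speculative rollback)
 */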
522 
523 static void power_pmu_read(struct perf_event *event)
524 {
525  s64 val, delta, prev;
526 
527  if (event->hw.state & PERF_HES_STOPPED)
528  return;
529 
530  if (!event->hw.idx)
531  return;
532  /*
533  * Performance monitor interrupts come even when interrupts
534  * are soft-disabled, as long as interrupts are hard-enabled.
535  * Therefore we treat them like NMIs.
536  */
537  do {
538  prev = local64_read(&event->hw.prev_count);
539  barrier();
540  val = read_pmc(event->hw.idx);
541  delta = check_and_compute_delta(prev, val);
542  if (!delta)
543  return;
544  } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
545 
546  local64_add(delta, &event->count);
547  local64_sub(delta, &event->hw.period_left);
548 }
549 
550 /*
551  * On some machines, PMC5 and PMC6 can't be written, don't respect
552  * the freeze conditions, and don't generate interrupts. This tells
553  * us if `event' is using such a PMC.
554  */
555 static int is_limited_pmc(int pmcnum)
556 {
557  return (ppmu->flags & PPMU_LIMITED_PMC5_6)
558  && (pmcnum == 5 || pmcnum == 6);
559 }
560 
561 static void freeze_limited_counters(struct cpu_hw_events *cpuhw,
562  unsigned long pmc5, unsigned long pmc6)
563 {
564  struct perf_event *event;
565  u64 val, prev, delta;
566  int i;
567 
568  for (i = 0; i < cpuhw->n_limited; ++i) {
569  event = cpuhw->limited_counter[i];
570  if (!event->hw.idx)
571  continue;
572  val = (event->hw.idx == 5) ? pmc5 : pmc6;
573  prev = local64_read(&event->hw.prev_count);
574  event->hw.idx = 0;
575  delta = check_and_compute_delta(prev, val);
576  if (delta)
577  local64_add(delta, &event->count);
578  }
579 }
580 
581 static void thaw_limited_counters(struct cpu_hw_events *cpuhw,
582  unsigned long pmc5, unsigned long pmc6)
583 {
584  struct perf_event *event;
585  u64 val, prev;
586  int i;
587 
588  for (i = 0; i < cpuhw->n_limited; ++i) {
589  event = cpuhw->limited_counter[i];
590  event->hw.idx = cpuhw->limited_hwidx[i];
591  val = (event->hw.idx == 5) ? pmc5 : pmc6;
592  prev = local64_read(&event->hw.prev_count);
593  if (check_and_compute_delta(prev, val))
594  local64_set(&event->hw.prev_count, val);
595  perf_event_update_userpage(event);
596  }
597 }
598 
599 /*
600  * Since limited events don't respect the freeze conditions, we
601  * have to read them immediately after freezing or unfreezing the
602  * other events. We try to keep the values from the limited
603  * events as consistent as possible by keeping the delay (in
604  * cycles and instructions) between freezing/unfreezing and reading
605  * the limited events as small and consistent as possible.
606  * Therefore, if any limited events are in use, we read them
607  * both, and always in the same order, to minimize variability,
608  * and do it inside the same asm that writes MMCR0.
609  */
610 static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
611 {
612  unsigned long pmc5, pmc6;
613 
614  if (!cpuhw->n_limited) {
615  mtspr(SPRN_MMCR0, mmcr0);
616  return;
617  }
618 
619  /*
620  * Write MMCR0, then read PMC5 and PMC6 immediately.
621  * To ensure we don't get a performance monitor interrupt
622  * between writing MMCR0 and freezing/thawing the limited
623  * events, we first write MMCR0 with the event overflow
624  * interrupt enable bits turned off.
625  */
626  asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
627  : "=&r" (pmc5), "=&r" (pmc6)
628  : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
629  "i" (SPRN_MMCR0),
630  "i" (SPRN_PMC5), "i" (SPRN_PMC6));
631 
632  if (mmcr0 & MMCR0_FC)
633  freeze_limited_counters(cpuhw, pmc5, pmc6);
634  else
635  thaw_limited_counters(cpuhw, pmc5, pmc6);
636 
637  /*
638  * Write the full MMCR0 including the event overflow interrupt
639  * enable bits, if necessary.
640  */
641  if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
642  mtspr(SPRN_MMCR0, mmcr0);
643 }
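/*
 * For illustration: the asm above expands to "mtspr MMCR0; mfspr PMC5;
 * mfspr PMC6", with the SPR numbers passed as immediates via the "i"
 * constraints, so the two limited counters are read in the instructions
 * immediately following the MMCR0 write.  Keeping all three in a single
 * asm statement prevents the compiler from scheduling anything in
 * between, which is what keeps the freeze-to-read window small and
 * consistent.
 */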
644 
645 /*
646  * Disable all events to prevent PMU interrupts and to allow
647  * events to be added or removed.
648  */
649 static void power_pmu_disable(struct pmu *pmu)
650 {
651  struct cpu_hw_events *cpuhw;
652  unsigned long flags;
653 
654  if (!ppmu)
655  return;
656  local_irq_save(flags);
657  cpuhw = &__get_cpu_var(cpu_hw_events);
658 
659  if (!cpuhw->disabled) {
660  cpuhw->disabled = 1;
661  cpuhw->n_added = 0;
662 
663  /*
664  * Check if we ever enabled the PMU on this cpu.
665  */
666  if (!cpuhw->pmcs_enabled) {
667  ppc_enable_pmcs();
668  cpuhw->pmcs_enabled = 1;
669  }
670 
671  /*
672  * Disable instruction sampling if it was enabled
673  */
674  if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
675  mtspr(SPRN_MMCRA,
676  cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
677  mb();
678  }
679 
680  /*
681  * Set the 'freeze counters' bit.
682  * The barrier is to make sure the mtspr has been
683  * executed and the PMU has frozen the events
684  * before we return.
685  */
686  write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
687  mb();
688  }
689  local_irq_restore(flags);
690 }
691 
692 /*
693  * Re-enable all events.
694  * If we were previously disabled and events were added, then
695  * put the new config on the PMU.
696  */
697 static void power_pmu_enable(struct pmu *pmu)
698 {
699  struct perf_event *event;
700  struct cpu_hw_events *cpuhw;
701  unsigned long flags;
702  long i;
703  unsigned long val;
704  s64 left;
705  unsigned int hwc_index[MAX_HWEVENTS];
706  int n_lim;
707  int idx;
708 
709  if (!ppmu)
710  return;
711  local_irq_save(flags);
712  cpuhw = &__get_cpu_var(cpu_hw_events);
713  if (!cpuhw->disabled) {
714  local_irq_restore(flags);
715  return;
716  }
717  cpuhw->disabled = 0;
718 
719  /*
720  * If we didn't change anything, or only removed events,
721  * no need to recalculate MMCR* settings and reset the PMCs.
722  * Just reenable the PMU with the current MMCR* settings
723  * (possibly updated for removal of events).
724  */
725  if (!cpuhw->n_added) {
726  mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
727  mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
728  if (cpuhw->n_events == 0)
729  ppc_set_pmu_inuse(0);
730  goto out_enable;
731  }
732 
733  /*
734  * Compute MMCR* values for the new set of events
735  */
736  if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
737  cpuhw->mmcr)) {
738  /* shouldn't ever get here */
739  printk(KERN_ERR "oops compute_mmcr failed\n");
740  goto out;
741  }
742 
743  /*
744  * Add in MMCR0 freeze bits corresponding to the
745  * attr.exclude_* bits for the first event.
746  * We have already checked that all events have the
747  * same values for these bits as the first event.
748  */
749  event = cpuhw->event[0];
750  if (event->attr.exclude_user)
751  cpuhw->mmcr[0] |= MMCR0_FCP;
752  if (event->attr.exclude_kernel)
753  cpuhw->mmcr[0] |= freeze_events_kernel;
754  if (event->attr.exclude_hv)
755  cpuhw->mmcr[0] |= MMCR0_FCHV;
756 
757  /*
758  * Write the new configuration to MMCR* with the freeze
759  * bit set and set the hardware events to their initial values.
760  * Then unfreeze the events.
761  */
762  ppc_set_pmu_inuse(1);
763  mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
764  mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
765  mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
766  | MMCR0_FC);
767 
768  /*
769  * Read off any pre-existing events that need to move
770  * to another PMC.
771  */
772  for (i = 0; i < cpuhw->n_events; ++i) {
773  event = cpuhw->event[i];
774  if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
775  power_pmu_read(event);
776  write_pmc(event->hw.idx, 0);
777  event->hw.idx = 0;
778  }
779  }
780 
781  /*
782  * Initialize the PMCs for all the new and moved events.
783  */
784  cpuhw->n_limited = n_lim = 0;
785  for (i = 0; i < cpuhw->n_events; ++i) {
786  event = cpuhw->event[i];
787  if (event->hw.idx)
788  continue;
789  idx = hwc_index[i] + 1;
790  if (is_limited_pmc(idx)) {
791  cpuhw->limited_counter[n_lim] = event;
792  cpuhw->limited_hwidx[n_lim] = idx;
793  ++n_lim;
794  continue;
795  }
796  val = 0;
797  if (event->hw.sample_period) {
798  left = local64_read(&event->hw.period_left);
799  if (left < 0x80000000L)
800  val = 0x80000000L - left;
801  }
802  local64_set(&event->hw.prev_count, val);
803  event->hw.idx = idx;
804  if (event->hw.state & PERF_HES_STOPPED)
805  val = 0;
806  write_pmc(idx, val);
807  perf_event_update_userpage(event);
808  }
809  cpuhw->n_limited = n_lim;
810  cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
811 
812  out_enable:
813  mb();
814  write_mmcr0(cpuhw, cpuhw->mmcr[0]);
815 
816  /*
817  * Enable instruction sampling if necessary
818  */
819  if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
820  mb();
821  mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
822  }
823 
824  out:
825  local_irq_restore(flags);
826 }
827 
828 static int collect_events(struct perf_event *group, int max_count,
829  struct perf_event *ctrs[], u64 *events,
830  unsigned int *flags)
831 {
832  int n = 0;
833  struct perf_event *event;
834 
835  if (!is_software_event(group)) {
836  if (n >= max_count)
837  return -1;
838  ctrs[n] = group;
839  flags[n] = group->hw.event_base;
840  events[n++] = group->hw.config;
841  }
842  list_for_each_entry(event, &group->sibling_list, group_entry) {
843  if (!is_software_event(event) &&
844  event->state != PERF_EVENT_STATE_OFF) {
845  if (n >= max_count)
846  return -1;
847  ctrs[n] = event;
848  flags[n] = event->hw.event_base;
849  events[n++] = event->hw.config;
850  }
851  }
852  return n;
853 }
854 
855 /*
856  * Add an event to the PMU.
857  * If all events are not already frozen, then we disable and
858  * re-enable the PMU in order to get hw_perf_enable to do the
859  * actual work of reconfiguring the PMU.
860  */
861 static int power_pmu_add(struct perf_event *event, int ef_flags)
862 {
863  struct cpu_hw_events *cpuhw;
864  unsigned long flags;
865  int n0;
866  int ret = -EAGAIN;
867 
868  local_irq_save(flags);
869  perf_pmu_disable(event->pmu);
870 
871  /*
872  * Add the event to the list (if there is room)
873  * and check whether the total set is still feasible.
874  */
875  cpuhw = &__get_cpu_var(cpu_hw_events);
876  n0 = cpuhw->n_events;
877  if (n0 >= ppmu->n_counter)
878  goto out;
879  cpuhw->event[n0] = event;
880  cpuhw->events[n0] = event->hw.config;
881  cpuhw->flags[n0] = event->hw.event_base;
882 
883  if (!(ef_flags & PERF_EF_START))
884  event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
885 
886  /*
887  * If group events scheduling transaction was started,
888  * skip the schedulability test here, it will be performed
889  * at commit time(->commit_txn) as a whole
890  */
891  if (cpuhw->group_flag & PERF_EVENT_TXN)
892  goto nocheck;
893 
894  if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
895  goto out;
896  if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
897  goto out;
898  event->hw.config = cpuhw->events[n0];
899 
900 nocheck:
901  ++cpuhw->n_events;
902  ++cpuhw->n_added;
903 
904  ret = 0;
905  out:
906  perf_pmu_enable(event->pmu);
907  local_irq_restore(flags);
908  return ret;
909 }
910 
911 /*
912  * Remove an event from the PMU.
913  */
914 static void power_pmu_del(struct perf_event *event, int ef_flags)
915 {
916  struct cpu_hw_events *cpuhw;
917  long i;
918  unsigned long flags;
919 
920  local_irq_save(flags);
921  perf_pmu_disable(event->pmu);
922 
923  power_pmu_read(event);
924 
925  cpuhw = &__get_cpu_var(cpu_hw_events);
926  for (i = 0; i < cpuhw->n_events; ++i) {
927  if (event == cpuhw->event[i]) {
928  while (++i < cpuhw->n_events) {
929  cpuhw->event[i-1] = cpuhw->event[i];
930  cpuhw->events[i-1] = cpuhw->events[i];
931  cpuhw->flags[i-1] = cpuhw->flags[i];
932  }
933  --cpuhw->n_events;
934  ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr);
935  if (event->hw.idx) {
936  write_pmc(event->hw.idx, 0);
937  event->hw.idx = 0;
938  }
939  perf_event_update_userpage(event);
940  break;
941  }
942  }
943  for (i = 0; i < cpuhw->n_limited; ++i)
944  if (event == cpuhw->limited_counter[i])
945  break;
946  if (i < cpuhw->n_limited) {
947  while (++i < cpuhw->n_limited) {
948  cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
949  cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
950  }
951  --cpuhw->n_limited;
952  }
953  if (cpuhw->n_events == 0) {
954  /* disable exceptions if no events are running */
955  cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
956  }
957 
958  perf_pmu_enable(event->pmu);
959  local_irq_restore(flags);
960 }
961 
962 /*
963  * POWER-PMU does not support disabling individual counters, hence
964  * program their cycle counter to their max value and ignore the interrupts.
965  */
966 
967 static void power_pmu_start(struct perf_event *event, int ef_flags)
968 {
969  unsigned long flags;
970  s64 left;
971  unsigned long val;
972 
973  if (!event->hw.idx || !event->hw.sample_period)
974  return;
975 
976  if (!(event->hw.state & PERF_HES_STOPPED))
977  return;
978 
979  if (ef_flags & PERF_EF_RELOAD)
980  WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
981 
982  local_irq_save(flags);
983  perf_pmu_disable(event->pmu);
984 
985  event->hw.state = 0;
986  left = local64_read(&event->hw.period_left);
987 
988  val = 0;
989  if (left < 0x80000000L)
990  val = 0x80000000L - left;
991 
992  write_pmc(event->hw.idx, val);
993 
994  perf_event_update_userpage(event);
995  perf_pmu_enable(event->pmu);
996  local_irq_restore(flags);
997 }
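/*
 * For illustration: a PMC raises its overflow condition when bit 31
 * becomes set, i.e. when the 32-bit count crosses 0x80000000.  Writing
 * 0x80000000 - left therefore arms the counter to overflow after
 * "left" more events; a remaining period of 1000, for example, is
 * programmed as 0x80000000 - 1000 = 0x7ffffc18.
 */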
998 
999 static void power_pmu_stop(struct perf_event *event, int ef_flags)
1000 {
1001  unsigned long flags;
1002 
1003  if (!event->hw.idx || !event->hw.sample_period)
1004  return;
1005 
1006  if (event->hw.state & PERF_HES_STOPPED)
1007  return;
1008 
1009  local_irq_save(flags);
1010  perf_pmu_disable(event->pmu);
1011 
1012  power_pmu_read(event);
1013  event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
1014  write_pmc(event->hw.idx, 0);
1015 
1016  perf_event_update_userpage(event);
1017  perf_pmu_enable(event->pmu);
1018  local_irq_restore(flags);
1019 }
1020 
1021 /*
1022  * Start group events scheduling transaction
1023  * Set the flag to make pmu::enable() not perform the
1024  * schedulability test, it will be performed at commit time
1025  */
1026 void power_pmu_start_txn(struct pmu *pmu)
1027 {
1028  struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
1029 
1030  perf_pmu_disable(pmu);
1031  cpuhw->group_flag |= PERF_EVENT_TXN;
1032  cpuhw->n_txn_start = cpuhw->n_events;
1033 }
1034 
1035 /*
1036  * Stop group events scheduling transaction
1037  * Clear the flag and pmu::enable() will perform the
1038  * schedulability test.
1039  */
1040 void power_pmu_cancel_txn(struct pmu *pmu)
1041 {
1042  struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
1043 
1044  cpuhw->group_flag &= ~PERF_EVENT_TXN;
1045  perf_pmu_enable(pmu);
1046 }
1047 
1048 /*
1049  * Commit group events scheduling transaction
1050  * Perform the group schedulability test as a whole
1051  * Return 0 if success
1052  */
1053 int power_pmu_commit_txn(struct pmu *pmu)
1054 {
1055  struct cpu_hw_events *cpuhw;
1056  long i, n;
1057 
1058  if (!ppmu)
1059  return -EAGAIN;
1060  cpuhw = &__get_cpu_var(cpu_hw_events);
1061  n = cpuhw->n_events;
1062  if (check_excludes(cpuhw->event, cpuhw->flags, 0, n))
1063  return -EAGAIN;
1064  i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n);
1065  if (i < 0)
1066  return -EAGAIN;
1067 
1068  for (i = cpuhw->n_txn_start; i < n; ++i)
1069  cpuhw->event[i]->hw.config = cpuhw->events[i];
1070 
1071  cpuhw->group_flag &= ~PERF_EVENT_TXN;
1072  perf_pmu_enable(pmu);
1073  return 0;
1074 }
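/*
 * For illustration: the transaction callbacks let the core add a whole
 * event group with one schedulability check.  power_pmu_start_txn()
 * records n_txn_start and sets PERF_EVENT_TXN, power_pmu_add() then
 * skips its per-event check_excludes()/power_check_constraints() calls,
 * and power_pmu_commit_txn() runs them once over the complete set,
 * copying any rewritten event codes back into each event's hw.config.
 */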
1075 
1076 /*
1077  * Return 1 if we might be able to put the event on a limited PMC,
1078  * or 0 if not.
1079  * An event can only go on a limited PMC if it counts something
1080  * that a limited PMC can count, doesn't require interrupts, and
1081  * doesn't exclude any processor mode.
1082  */
1083 static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
1084  unsigned int flags)
1085 {
1086  int n;
1087  u64 alt[MAX_EVENT_ALTERNATIVES];
1088 
1089  if (event->attr.exclude_user
1090  || event->attr.exclude_kernel
1091  || event->attr.exclude_hv
1092  || event->attr.sample_period)
1093  return 0;
1094 
1095  if (ppmu->limited_pmc_event(ev))
1096  return 1;
1097 
1098  /*
1099  * The requested event_id isn't on a limited PMC already;
1100  * see if any alternative code goes on a limited PMC.
1101  */
1102  if (!ppmu->get_alternatives)
1103  return 0;
1104 
1105  flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
1106  n = ppmu->get_alternatives(ev, flags, alt);
1107 
1108  return n > 0;
1109 }
1110 
1111 /*
1112  * Find an alternative event_id that goes on a normal PMC, if possible,
1113  * and return the event_id code, or 0 if there is no such alternative.
1114  * (Note: event_id code 0 is "don't count" on all machines.)
1115  */
1116 static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
1117 {
1118  u64 alt[MAX_EVENT_ALTERNATIVES];
1119  int n;
1120 
1121  flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
1122  n = ppmu->get_alternatives(ev, flags, alt);
1123  if (!n)
1124  return 0;
1125  return alt[0];
1126 }
1127 
1128 /* Number of perf_events counting hardware events */
1129 static atomic_t num_events;
1130 /* Used to avoid races in calling reserve/release_pmc_hardware */
1131 static DEFINE_MUTEX(pmc_reserve_mutex);
1132 
1133 /*
1134  * Release the PMU if this is the last perf_event.
1135  */
1136 static void hw_perf_event_destroy(struct perf_event *event)
1137 {
1138  if (!atomic_add_unless(&num_events, -1, 1)) {
1139  mutex_lock(&pmc_reserve_mutex);
1140  if (atomic_dec_return(&num_events) == 0)
1141  release_pmc_hardware();
1142  mutex_unlock(&pmc_reserve_mutex);
1143  }
1144 }
1145 
1146 /*
1147  * Translate a generic cache event_id config to a raw event_id code.
1148  */
1149 static int hw_perf_cache_event(u64 config, u64 *eventp)
1150 {
1151  unsigned long type, op, result;
1152  int ev;
1153 
1154  if (!ppmu->cache_events)
1155  return -EINVAL;
1156 
1157  /* unpack config */
1158  type = config & 0xff;
1159  op = (config >> 8) & 0xff;
1160  result = (config >> 16) & 0xff;
1161 
1162  if (type >= PERF_COUNT_HW_CACHE_MAX ||
1163  op >= PERF_COUNT_HW_CACHE_OP_MAX ||
1164  result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
1165  return -EINVAL;
1166 
1167  ev = (*ppmu->cache_events)[type][op][result];
1168  if (ev == 0)
1169  return -EOPNOTSUPP;
1170  if (ev == -1)
1171  return -EINVAL;
1172  *eventp = ev;
1173  return 0;
1174 }
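/*
 * For illustration: the config unpacked above uses the generic perf
 * cache-event encoding,
 *
 *	config = type | (op << 8) | (result << 16)
 *
 * so an L1 data-cache read miss, for example, is requested as
 *
 *	PERF_COUNT_HW_CACHE_L1D |
 *	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
 *	(PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
 *
 * and translated here into the PMU-specific raw code from
 * ppmu->cache_events[type][op][result].
 */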
1175 
1176 static int power_pmu_event_init(struct perf_event *event)
1177 {
1178  u64 ev;
1179  unsigned long flags;
1180  struct perf_event *ctrs[MAX_HWEVENTS];
1181  u64 events[MAX_HWEVENTS];
1182  unsigned int cflags[MAX_HWEVENTS];
1183  int n;
1184  int err;
1185  struct cpu_hw_events *cpuhw;
1186 
1187  if (!ppmu)
1188  return -ENOENT;
1189 
1190  /* does not support taken branch sampling */
1191  if (has_branch_stack(event))
1192  return -EOPNOTSUPP;
1193 
1194  switch (event->attr.type) {
1195  case PERF_TYPE_HARDWARE:
1196  ev = event->attr.config;
1197  if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
1198  return -EOPNOTSUPP;
1199  ev = ppmu->generic_events[ev];
1200  break;
1201  case PERF_TYPE_HW_CACHE:
1202  err = hw_perf_cache_event(event->attr.config, &ev);
1203  if (err)
1204  return err;
1205  break;
1206  case PERF_TYPE_RAW:
1207  ev = event->attr.config;
1208  break;
1209  default:
1210  return -ENOENT;
1211  }
1212 
1213  event->hw.config_base = ev;
1214  event->hw.idx = 0;
1215 
1216  /*
1217  * If we are not running on a hypervisor, force the
1218  * exclude_hv bit to 0 so that we don't care what
1219  * the user set it to.
1220  */
1221  if (!firmware_has_feature(FW_FEATURE_LPAR))
1222  event->attr.exclude_hv = 0;
1223 
1224  /*
1225  * If this is a per-task event, then we can use
1226  * PM_RUN_* events interchangeably with their non RUN_*
1227  * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
1228  * XXX we should check if the task is an idle task.
1229  */
1230  flags = 0;
1231  if (event->attach_state & PERF_ATTACH_TASK)
1232  flags |= PPMU_ONLY_COUNT_RUN;
1233 
1234  /*
1235  * If this machine has limited events, check whether this
1236  * event_id could go on a limited event.
1237  */
1238  if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
1239  if (can_go_on_limited_pmc(event, ev, flags)) {
1240  flags |= PPMU_LIMITED_PMC_OK;
1241  } else if (ppmu->limited_pmc_event(ev)) {
1242  /*
1243  * The requested event_id is on a limited PMC,
1244  * but we can't use a limited PMC; see if any
1245  * alternative goes on a normal PMC.
1246  */
1247  ev = normal_pmc_alternative(ev, flags);
1248  if (!ev)
1249  return -EINVAL;
1250  }
1251  }
1252 
1253  /*
1254  * If this is in a group, check if it can go on with all the
1255  * other hardware events in the group. We assume the event
1256  * hasn't been linked into its leader's sibling list at this point.
1257  */
1258  n = 0;
1259  if (event->group_leader != event) {
1260  n = collect_events(event->group_leader, ppmu->n_counter - 1,
1261  ctrs, events, cflags);
1262  if (n < 0)
1263  return -EINVAL;
1264  }
1265  events[n] = ev;
1266  ctrs[n] = event;
1267  cflags[n] = flags;
1268  if (check_excludes(ctrs, cflags, n, 1))
1269  return -EINVAL;
1270 
1271  cpuhw = &get_cpu_var(cpu_hw_events);
1272  err = power_check_constraints(cpuhw, events, cflags, n + 1);
1273  put_cpu_var(cpu_hw_events);
1274  if (err)
1275  return -EINVAL;
1276 
1277  event->hw.config = events[n];
1278  event->hw.event_base = cflags[n];
1279  event->hw.last_period = event->hw.sample_period;
1280  local64_set(&event->hw.period_left, event->hw.last_period);
1281 
1282  /*
1283  * See if we need to reserve the PMU.
1284  * If no events are currently in use, then we have to take a
1285  * mutex to ensure that we don't race with another task doing
1286  * reserve_pmc_hardware or release_pmc_hardware.
1287  */
1288  err = 0;
1289  if (!atomic_inc_not_zero(&num_events)) {
1290  mutex_lock(&pmc_reserve_mutex);
1291  if (atomic_read(&num_events) == 0 &&
1292  reserve_pmc_hardware(perf_event_interrupt))
1293  err = -EBUSY;
1294  else
1295  atomic_inc(&num_events);
1296  mutex_unlock(&pmc_reserve_mutex);
1297  }
1298  event->destroy = hw_perf_event_destroy;
1299 
1300  return err;
1301 }
1302 
1303 static int power_pmu_event_idx(struct perf_event *event)
1304 {
1305  return event->hw.idx;
1306 }
1307 
1308 struct pmu power_pmu = {
1309  .pmu_enable = power_pmu_enable,
1310  .pmu_disable = power_pmu_disable,
1311  .event_init = power_pmu_event_init,
1312  .add = power_pmu_add,
1313  .del = power_pmu_del,
1314  .start = power_pmu_start,
1315  .stop = power_pmu_stop,
1316  .read = power_pmu_read,
1317  .start_txn = power_pmu_start_txn,
1318  .cancel_txn = power_pmu_cancel_txn,
1319  .commit_txn = power_pmu_commit_txn,
1320  .event_idx = power_pmu_event_idx,
1321 };
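/*
 * For illustration: this struct pmu is what the generic perf core
 * dispatches into when a hardware event is opened on powerpc.  A
 * minimal userspace sketch (assuming the perf_event_open(2) syscall and
 * <linux/perf_event.h> are available) that exercises the
 * event_init/add/start/read callbacks above to count CPU cycles:
 *
 *	#include <linux/perf_event.h>
 *	#include <sys/syscall.h>
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *	#include <string.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		struct perf_event_attr attr;
 *		uint64_t count;
 *		int fd;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.type = PERF_TYPE_HARDWARE;
 *		attr.size = sizeof(attr);
 *		attr.config = PERF_COUNT_HW_CPU_CYCLES;
 *		attr.disabled = 1;
 *		attr.exclude_kernel = 1;	// handled via freeze_events_kernel above
 *
 *		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *		if (fd < 0)
 *			return 1;
 *		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *		// ... workload to be measured ...
 *		ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *		read(fd, &count, sizeof(count));
 *		printf("cycles: %llu\n", (unsigned long long)count);
 *		close(fd);
 *		return 0;
 *	}
 */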
1322 
1323 
1324 /*
1325  * A counter has overflowed; update its count and record
1326  * things if requested. Note that interrupts are hard-disabled
1327  * here so there is no possibility of being interrupted.
1328  */
1329 static void record_and_restart(struct perf_event *event, unsigned long val,
1330  struct pt_regs *regs)
1331 {
1332  u64 period = event->hw.sample_period;
1333  s64 prev, delta, left;
1334  int record = 0;
1335 
1336  if (event->hw.state & PERF_HES_STOPPED) {
1337  write_pmc(event->hw.idx, 0);
1338  return;
1339  }
1340 
1341  /* we don't have to worry about interrupts here */
1342  prev = local64_read(&event->hw.prev_count);
1343  delta = check_and_compute_delta(prev, val);
1344  local64_add(delta, &event->count);
1345 
1346  /*
1347  * See if the total period for this event has expired,
1348  * and update for the next period.
1349  */
1350  val = 0;
1351  left = local64_read(&event->hw.period_left) - delta;
1352  if (period) {
1353  if (left <= 0) {
1354  left += period;
1355  if (left <= 0)
1356  left = period;
1357  record = siar_valid(regs);
1358  event->hw.last_period = event->hw.sample_period;
1359  }
1360  if (left < 0x80000000LL)
1361  val = 0x80000000LL - left;
1362  }
1363 
1364  write_pmc(event->hw.idx, val);
1365  local64_set(&event->hw.prev_count, val);
1366  local64_set(&event->hw.period_left, left);
1367  perf_event_update_userpage(event);
1368 
1369  /*
1370  * Finally record data if requested.
1371  */
1372  if (record) {
1373  struct perf_sample_data data;
1374 
1375  perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
1376 
1377  if (event->attr.sample_type & PERF_SAMPLE_ADDR)
1378  perf_get_data_addr(regs, &data.addr);
1379 
1380  if (perf_event_overflow(event, &data, regs))
1381  power_pmu_stop(event, 0);
1382  }
1383 }
1384 
1385 /*
1386  * Called from generic code to get the misc flags (i.e. processor mode)
1387  * for an event_id.
1388  */
1389 unsigned long perf_misc_flags(struct pt_regs *regs)
1390 {
1391  u32 flags = perf_get_misc_flags(regs);
1392 
1393  if (flags)
1394  return flags;
1395  return user_mode(regs) ? PERF_RECORD_MISC_USER :
1396  PERF_RECORD_MISC_KERNEL;
1397 }
1398 
1399 /*
1400  * Called from generic code to get the instruction pointer
1401  * for an event_id.
1402  */
1403 unsigned long perf_instruction_pointer(struct pt_regs *regs)
1404 {
1405  unsigned long use_siar = regs->result;
1406 
1407  if (use_siar && siar_valid(regs))
1408  return mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
1409  else if (use_siar)
1410  return 0; // no valid instruction pointer
1411  else
1412  return regs->nip;
1413 }
1414 
1415 static bool pmc_overflow(unsigned long val)
1416 {
1417  if ((int)val < 0)
1418  return true;
1419 
1420  /*
1421  * Events on POWER7 can roll back if a speculative event doesn't
1422  * eventually complete. Unfortunately in some rare cases they will
1423  * raise a performance monitor exception. We need to catch this to
1424  * ensure we reset the PMC. In all cases the PMC will be 256 or less
1425  * cycles from overflow.
1426  *
1427  * We only do this if the first pass fails to find any overflowing
1428  * PMCs because a user might set a period of less than 256 and we
1429  * don't want to mistakenly reset them.
1430  */
1431  if (pvr_version_is(PVR_POWER7) && ((0x80000000 - val) <= 256))
1432  return true;
1433 
1434  return false;
1435 }
1436 
1437 /*
1438  * Performance monitor interrupt stuff
1439  */
1440 static void perf_event_interrupt(struct pt_regs *regs)
1441 {
1442  int i;
1443  struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
1444  struct perf_event *event;
1445  unsigned long val;
1446  int found = 0;
1447  int nmi;
1448 
1449  if (cpuhw->n_limited)
1450  freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
1451  mfspr(SPRN_PMC6));
1452 
1453  perf_read_regs(regs);
1454 
1455  nmi = perf_intr_is_nmi(regs);
1456  if (nmi)
1457  nmi_enter();
1458  else
1459  irq_enter();
1460 
1461  for (i = 0; i < cpuhw->n_events; ++i) {
1462  event = cpuhw->event[i];
1463  if (!event->hw.idx || is_limited_pmc(event->hw.idx))
1464  continue;
1465  val = read_pmc(event->hw.idx);
1466  if ((int)val < 0) {
1467  /* event has overflowed */
1468  found = 1;
1469  record_and_restart(event, val, regs);
1470  }
1471  }
1472 
1473  /*
1474  * In case we didn't find and reset the event that caused
1475  * the interrupt, scan all events and reset any that are
1476  * negative, to avoid getting continual interrupts.
1477  * Any that we processed in the previous loop will not be negative.
1478  */
1479  if (!found) {
1480  for (i = 0; i < ppmu->n_counter; ++i) {
1481  if (is_limited_pmc(i + 1))
1482  continue;
1483  val = read_pmc(i + 1);
1484  if (pmc_overflow(val))
1485  write_pmc(i + 1, 0);
1486  }
1487  }
1488 
1489  /*
1490  * Reset MMCR0 to its normal value. This will set PMXE and
1491  * clear FC (freeze counters) and PMAO (perf mon alert occurred)
1492  * and thus allow interrupts to occur again.
1493  * XXX might want to use MSR.PM to keep the events frozen until
1494  * we get back out of this interrupt.
1495  */
1496  write_mmcr0(cpuhw, cpuhw->mmcr[0]);
1497 
1498  if (nmi)
1499  nmi_exit();
1500  else
1501  irq_exit();
1502 }
1503 
1504 static void power_pmu_setup(int cpu)
1505 {
1506  struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
1507 
1508  if (!ppmu)
1509  return;
1510  memset(cpuhw, 0, sizeof(*cpuhw));
1511  cpuhw->mmcr[0] = MMCR0_FC;
1512 }
1513 
1514 static int __cpuinit
1515 power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1516 {
1517  unsigned int cpu = (long)hcpu;
1518 
1519  switch (action & ~CPU_TASKS_FROZEN) {
1520  case CPU_UP_PREPARE:
1521  power_pmu_setup(cpu);
1522  break;
1523 
1524  default:
1525  break;
1526  }
1527 
1528  return NOTIFY_OK;
1529 }
1530 
1531 int __cpuinit register_power_pmu(struct power_pmu *pmu)
1532 {
1533  if (ppmu)
1534  return -EBUSY; /* something's already registered */
1535 
1536  ppmu = pmu;
1537  pr_info("%s performance monitor hardware support registered\n",
1538  pmu->name);
1539 
1540 #ifdef MSR_HV
1541  /*
1542  * Use FCHV to ignore kernel events if MSR.HV is set.
1543  */
1544  if (mfmsr() & MSR_HV)
1545  freeze_events_kernel = MMCR0_FCHV;
1546 #endif /* MSR_HV */
1547 
1548  perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW);
1549  perf_cpu_notifier(power_pmu_notifier);
1550 
1551  return 0;
1552 }