Linux Kernel 3.7.1
perf_event.h
/*
 * Performance events:
 *
 *    Copyright (C) 2008-2009, Thomas Gleixner <[email protected]>
 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
 *
 * Data type definitions, declarations, prototypes.
 *
 *    Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _UAPI_LINUX_PERF_EVENT_H
#define _UAPI_LINUX_PERF_EVENT_H

#include <linux/types.h>
#include <linux/ioctl.h>
#include <asm/byteorder.h>

/*
 * User-space ABI bits:
 */

/*
 * attr.type
 */
enum perf_type_id {
	PERF_TYPE_HARDWARE			= 0,
	PERF_TYPE_SOFTWARE			= 1,
	PERF_TYPE_TRACEPOINT			= 2,
	PERF_TYPE_HW_CACHE			= 3,
	PERF_TYPE_RAW				= 4,
	PERF_TYPE_BREAKPOINT			= 5,

	PERF_TYPE_MAX,				/* non-ABI */
};

/*
 * Generalized performance event event_id types, used by the
 * attr.event_id parameter of the sys_perf_event_open()
 * syscall:
 */
enum perf_hw_id {
	/*
	 * Common hardware events, generalized by the kernel:
	 */
	PERF_COUNT_HW_CPU_CYCLES		= 0,
	PERF_COUNT_HW_INSTRUCTIONS		= 1,
	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
	PERF_COUNT_HW_CACHE_MISSES		= 3,
	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
	PERF_COUNT_HW_BRANCH_MISSES		= 5,
	PERF_COUNT_HW_BUS_CYCLES		= 6,
	PERF_COUNT_HW_STALLED_CYCLES_FRONTEND	= 7,
	PERF_COUNT_HW_STALLED_CYCLES_BACKEND	= 8,
	PERF_COUNT_HW_REF_CPU_CYCLES		= 9,

	PERF_COUNT_HW_MAX,			/* non-ABI */
};

/*
 * Generalized hardware cache events:
 *
 *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x
 *       { read, write, prefetch } x
 *       { accesses, misses }
 */
enum perf_hw_cache_id {
	PERF_COUNT_HW_CACHE_L1D			= 0,
	PERF_COUNT_HW_CACHE_L1I			= 1,
	PERF_COUNT_HW_CACHE_LL			= 2,
	PERF_COUNT_HW_CACHE_DTLB		= 3,
	PERF_COUNT_HW_CACHE_ITLB		= 4,
	PERF_COUNT_HW_CACHE_BPU			= 5,
	PERF_COUNT_HW_CACHE_NODE		= 6,

	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
};

enum perf_hw_cache_op_id {
	PERF_COUNT_HW_CACHE_OP_READ		= 0,
	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,

	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
};

enum perf_hw_cache_op_result_id {
	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,

	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
};
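
/*
 * Illustrative sketch, not part of the ABI above: a hw-cache event is
 * selected by packing the three enums into attr.config, with the cache
 * id in bits 0-7, the op id in bits 8-15 and the result id in bits
 * 16-23, as documented in perf_event_open(2):
 *
 *	struct perf_event_attr attr = { 0 };
 *
 *	attr.type   = PERF_TYPE_HW_CACHE;
 *	attr.size   = sizeof(attr);
 *	attr.config = PERF_COUNT_HW_CACHE_L1D |
 *		      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
 *		      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
 */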

/*
 * Special "software" events provided by the kernel, even if the hardware
 * does not support performance events. These events measure various
 * physical and sw events of the kernel (and allow the profiling of them as
 * well):
 */
enum perf_sw_ids {
	PERF_COUNT_SW_CPU_CLOCK			= 0,
	PERF_COUNT_SW_TASK_CLOCK		= 1,
	PERF_COUNT_SW_PAGE_FAULTS		= 2,
	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
	PERF_COUNT_SW_EMULATION_FAULTS		= 8,

	PERF_COUNT_SW_MAX,			/* non-ABI */
};

/*
 * Bits that can be set in attr.sample_type to request information
 * in the overflow packets.
 */
enum perf_event_sample_format {
	PERF_SAMPLE_IP				= 1U << 0,
	PERF_SAMPLE_TID				= 1U << 1,
	PERF_SAMPLE_TIME			= 1U << 2,
	PERF_SAMPLE_ADDR			= 1U << 3,
	PERF_SAMPLE_READ			= 1U << 4,
	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
	PERF_SAMPLE_ID				= 1U << 6,
	PERF_SAMPLE_CPU				= 1U << 7,
	PERF_SAMPLE_PERIOD			= 1U << 8,
	PERF_SAMPLE_STREAM_ID			= 1U << 9,
	PERF_SAMPLE_RAW				= 1U << 10,
	PERF_SAMPLE_BRANCH_STACK		= 1U << 11,
	PERF_SAMPLE_REGS_USER			= 1U << 12,
	PERF_SAMPLE_STACK_USER			= 1U << 13,

	PERF_SAMPLE_MAX = 1U << 14,		/* non-ABI */
};

/*
 * values to program into branch_sample_type when PERF_SAMPLE_BRANCH_STACK
 * is set
 *
 * If the user does not pass priv level information via branch_sample_type,
 * the kernel uses the event's priv level. Branch and event priv levels do
 * not have to match. Branch priv level is checked for permissions.
 *
 * The branch types can be combined, however BRANCH_ANY covers all types
 * of branches and therefore it supersedes all the other types.
 */
enum perf_branch_sample_type {
	PERF_SAMPLE_BRANCH_USER		= 1U << 0, /* user branches */
	PERF_SAMPLE_BRANCH_KERNEL	= 1U << 1, /* kernel branches */
	PERF_SAMPLE_BRANCH_HV		= 1U << 2, /* hypervisor branches */

	PERF_SAMPLE_BRANCH_ANY		= 1U << 3, /* any branch types */
	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << 4, /* any call branch */
	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << 5, /* any return branch */
	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << 6, /* indirect calls */

	PERF_SAMPLE_BRANCH_MAX		= 1U << 7, /* non-ABI */
};

#define PERF_SAMPLE_BRANCH_PLM_ALL \
	(PERF_SAMPLE_BRANCH_USER|\
	 PERF_SAMPLE_BRANCH_KERNEL|\
	 PERF_SAMPLE_BRANCH_HV)
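
/*
 * Illustrative sketch, not part of this header: to capture branch
 * records, request PERF_SAMPLE_BRANCH_STACK in attr.sample_type and
 * program the wanted branch types here; leaving out the priv-level
 * bits would make the kernel reuse the event's priv level instead:
 *
 *	attr.sample_type	|= PERF_SAMPLE_BRANCH_STACK;
 *	attr.branch_sample_type	 = PERF_SAMPLE_BRANCH_ANY_CALL |
 *				   PERF_SAMPLE_BRANCH_PLM_ALL;
 */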

/*
 * Values to determine ABI of the registers dump.
 */
enum perf_sample_regs_abi {
	PERF_SAMPLE_REGS_ABI_NONE	= 0,
	PERF_SAMPLE_REGS_ABI_32		= 1,
	PERF_SAMPLE_REGS_ABI_64		= 2,
};

/*
 * The format of the data returned by read() on a perf event fd,
 * as specified by attr.read_format:
 *
 * struct read_format {
 *	{ u64		value;
 *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64		id;           } && PERF_FORMAT_ID
 *	} && !PERF_FORMAT_GROUP
 *
 *	{ u64		nr;
 *	  { u64		time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64		time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64		value;
 *	    { u64	id;           } && PERF_FORMAT_ID
 *	  }		cntr[nr];
 *	} && PERF_FORMAT_GROUP
 * };
 */
enum perf_event_read_format {
	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
	PERF_FORMAT_ID				= 1U << 2,
	PERF_FORMAT_GROUP			= 1U << 3,

	PERF_FORMAT_MAX = 1U << 4,		/* non-ABI */
};
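
/*
 * Illustrative sketch, not part of this header: with read_format =
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING and
 * no PERF_FORMAT_GROUP, the layout above collapses to three u64s, and
 * the count can be rescaled when the event was multiplexed:
 *
 *	__u64 buf[3];	# value, time_enabled, time_running
 *	double scaled = 0.0;
 *
 *	if (read(fd, buf, sizeof(buf)) == sizeof(buf) && buf[2])
 *		scaled = (double)buf[0] * buf[1] / buf[2];
 */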

#define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
#define PERF_ATTR_SIZE_VER1	72	/* add: config2 */
#define PERF_ATTR_SIZE_VER2	80	/* add: branch_sample_type */
#define PERF_ATTR_SIZE_VER3	96	/* add: sample_regs_user */
					/* add: sample_stack_user */

/*
 * Hardware event_id to monitor via a performance monitoring event:
 */
struct perf_event_attr {

	/*
	 * Major type: hardware/software/tracepoint/etc.
	 */
	__u32			type;

	/*
	 * Size of the attr structure, for fwd/bwd compat.
	 */
	__u32			size;

	/*
	 * Type specific configuration information.
	 */
	__u64			config;

	union {
		__u64		sample_period;
		__u64		sample_freq;
	};

	__u64			sample_type;
	__u64			read_format;

	__u64			disabled       :  1, /* off by default        */
				inherit	       :  1, /* children inherit it   */
				pinned	       :  1, /* must always be on PMU */
				exclusive      :  1, /* only group on PMU     */
				exclude_user   :  1, /* don't count user      */
				exclude_kernel :  1, /* ditto kernel          */
				exclude_hv     :  1, /* ditto hypervisor      */
				exclude_idle   :  1, /* don't count when idle */
				mmap           :  1, /* include mmap data     */
				comm	       :  1, /* include comm data     */
				freq           :  1, /* use freq, not period  */
				inherit_stat   :  1, /* per task counts       */
				enable_on_exec :  1, /* next exec enables     */
				task           :  1, /* trace fork/exit       */
				watermark      :  1, /* wakeup_watermark      */
				/*
				 * precise_ip:
				 *
				 *  0 - SAMPLE_IP can have arbitrary skid
				 *  1 - SAMPLE_IP must have constant skid
				 *  2 - SAMPLE_IP requested to have 0 skid
				 *  3 - SAMPLE_IP must have 0 skid
				 *
				 *  See also PERF_RECORD_MISC_EXACT_IP
				 */
				precise_ip     :  2, /* skid constraint       */
				mmap_data      :  1, /* non-exec mmap data    */
				sample_id_all  :  1, /* sample_type all events */

				exclude_host   :  1, /* don't count in host   */
				exclude_guest  :  1, /* don't count in guest  */

				exclude_callchain_kernel : 1, /* exclude kernel callchains */
				exclude_callchain_user   : 1, /* exclude user callchains */

				__reserved_1   : 41;

	union {
		__u32		wakeup_events;	  /* wakeup every n events */
		__u32		wakeup_watermark; /* bytes before wakeup   */
	};

	__u32			bp_type;
	union {
		__u64		bp_addr;
		__u64		config1; /* extension of config */
	};
	union {
		__u64		bp_len;
		__u64		config2; /* extension of config1 */
	};
	__u64	branch_sample_type; /* enum perf_branch_sample_type */

	/*
	 * Defines set of user regs to dump on samples.
	 * See asm/perf_regs.h for details.
	 */
	__u64	sample_regs_user;

	/*
	 * Defines size of the user stack to dump on samples.
	 */
	__u32	sample_stack_user;

	/* Align to u64. */
	__u32	__reserved_2;
};
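
/*
 * Illustrative sketch, not part of this header: the attr structure is
 * handed to the perf_event_open syscall, which has no glibc wrapper.
 * A minimal counting setup, assuming <unistd.h> and <sys/syscall.h>:
 *
 *	struct perf_event_attr attr = { 0 };
 *	int fd;
 *
 *	attr.type	    = PERF_TYPE_HARDWARE;
 *	attr.size	    = sizeof(attr);
 *	attr.config	    = PERF_COUNT_HW_INSTRUCTIONS;
 *	attr.disabled	    = 1;
 *	attr.exclude_kernel = 1;
 *
 *	# pid = 0, cpu = -1, group_fd = -1: this task, any cpu, no group
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 */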

#define perf_flags(attr)	(*(&(attr)->read_format + 1))

/*
 * Ioctls that can be done on a perf event fd:
 */
#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)

enum perf_event_ioc_flags {
	PERF_IOC_FLAG_GROUP		= 1U << 0,
};
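
/*
 * Illustrative sketch, not part of this header: a disabled counter is
 * typically reset and enabled around the measured region, then stopped
 * again before reading it:
 *
 *	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	run_workload();			# hypothetical workload
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 */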

/*
 * Structure of the page that can be mapped via mmap
 */
struct perf_event_mmap_page {
	__u32	version;		/* version number of this structure */
	__u32	compat_version;		/* lowest version this is compat with */

	/*
	 * Bits needed to read the hw events in user-space.
	 *
	 *   u32 seq, time_mult, time_shift, idx, width;
	 *   u64 count, enabled, running;
	 *   u64 cyc, time_offset;
	 *   s64 pmc = 0;
	 *
	 *   do {
	 *     seq = pc->lock;
	 *     barrier()
	 *
	 *     enabled = pc->time_enabled;
	 *     running = pc->time_running;
	 *
	 *     if (pc->cap_usr_time && enabled != running) {
	 *       cyc = rdtsc();
	 *       time_offset = pc->time_offset;
	 *       time_mult   = pc->time_mult;
	 *       time_shift  = pc->time_shift;
	 *     }
	 *
	 *     idx = pc->index;
	 *     count = pc->offset;
	 *     if (pc->cap_usr_rdpmc && idx) {
	 *       width = pc->pmc_width;
	 *       pmc = rdpmc(idx - 1);
	 *     }
	 *
	 *     barrier();
	 *   } while (pc->lock != seq);
	 *
	 * NOTE: for obvious reason this only works on self-monitoring
	 *       processes.
	 */
	__u32	lock;			/* seqlock for synchronization */
	__u32	index;			/* hardware event identifier */
	__s64	offset;			/* add to hardware event value */
	__u64	time_enabled;		/* time event active */
	__u64	time_running;		/* time event on cpu */
	union {
		__u64	capabilities;
		__u64	cap_usr_time		: 1,
			cap_usr_rdpmc		: 1,
			cap_____res		: 62;
	};

	/*
	 * If cap_usr_rdpmc this field provides the bit-width of the value
	 * read using the rdpmc() or equivalent instruction. This can be used
	 * to sign extend the result like:
	 *
	 *   pmc <<= 64 - width;
	 *   pmc >>= 64 - width; // signed shift right
	 *   count += pmc;
	 */
	__u16	pmc_width;

	/*
	 * If cap_usr_time the below fields can be used to compute the time
	 * delta since time_enabled (in ns) using rdtsc or similar.
	 *
	 *   u64 quot, rem;
	 *   u64 delta;
	 *
	 *   quot = (cyc >> time_shift);
	 *   rem = cyc & ((1 << time_shift) - 1);
	 *   delta = time_offset + quot * time_mult +
	 *           ((rem * time_mult) >> time_shift);
	 *
	 * Where time_offset,time_mult,time_shift and cyc are read in the
	 * seqcount loop described above. This delta can then be added to
	 * enabled and possible running (if idx), improving the scaling:
	 *
	 *   enabled += delta;
	 *   if (idx)
	 *     running += delta;
	 *
	 *   quot = count / running;
	 *   rem  = count % running;
	 *   count = quot * enabled + (rem * enabled) / running;
	 */
	__u16	time_shift;
	__u32	time_mult;
	__u64	time_offset;

	/*
	 * Hole for extension of the self monitor capabilities
	 */

	__u64	__reserved[120];	/* align to 1k */

	/*
	 * Control data for the mmap() data buffer.
	 *
	 * User-space reading the @data_head value should issue an rmb(), on
	 * SMP capable platforms, after reading this value -- see
	 * perf_event_wakeup().
	 *
	 * When the mapping is PROT_WRITE the @data_tail value should be
	 * written by userspace to reflect the last read data. In this case
	 * the kernel will not over-write unread data.
	 */
	__u64	data_head;		/* head in the data section */
	__u64	data_tail;		/* user-space written tail */
};
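
/*
 * Illustrative sketch, not part of this header: draining the data area,
 * assuming the fd was mmap()ed writable with 1 + 2^n pages, where "base"
 * is the mapping, "page_size" the system page size, "mask" the data-area
 * size minus 1, and rmb()/consume() a hypothetical read barrier and
 * record handler. Records that wrap around the end of the data area are
 * ignored here for brevity:
 *
 *	struct perf_event_mmap_page *pc = base;
 *	char *data = (char *)base + page_size;
 *	__u64 tail = pc->data_tail;
 *	__u64 head = pc->data_head;
 *
 *	rmb();		# pairs with the kernel's barrier on data_head
 *
 *	while (tail < head) {
 *		struct perf_event_header *ev =
 *			(struct perf_event_header *)(data + (tail & mask));
 *		consume(ev);
 *		tail += ev->size;
 *	}
 *
 *	pc->data_tail = tail;	# tell the kernel the data was read
 */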
437 
438 #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0)
439 #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0)
440 #define PERF_RECORD_MISC_KERNEL (1 << 0)
441 #define PERF_RECORD_MISC_USER (2 << 0)
442 #define PERF_RECORD_MISC_HYPERVISOR (3 << 0)
443 #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0)
444 #define PERF_RECORD_MISC_GUEST_USER (5 << 0)
445 
446 /*
447  * Indicates that the content of PERF_SAMPLE_IP points to
448  * the actual instruction that triggered the event. See also
449  * perf_event_attr::precise_ip.
450  */
451 #define PERF_RECORD_MISC_EXACT_IP (1 << 14)
452 /*
453  * Reserve the last bit to indicate some extended misc field
454  */
455 #define PERF_RECORD_MISC_EXT_RESERVED (1 << 15)

struct perf_event_header {
	__u32	type;
	__u16	misc;
	__u16	size;
};

enum perf_event_type {

	/*
	 * If perf_event_attr.sample_id_all is set then all event types will
	 * have the sample_type selected fields related to where/when
	 * (identity) an event took place (TID, TIME, ID, CPU, STREAM_ID)
	 * described in PERF_RECORD_SAMPLE below; it will be stashed just after
	 * the perf_event_header and the fields already present for the existing
	 * fields, i.e. at the end of the payload. That way a newer perf.data
	 * file will be supported by older perf tools, with these new optional
	 * fields being ignored.
	 *
	 * The MMAP events record the PROT_EXEC mappings so that we can
	 * correlate userspace IPs to code. They have the following structure:
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	u32				pid, tid;
	 *	u64				addr;
	 *	u64				len;
	 *	u64				pgoff;
	 *	char				filename[];
	 * };
	 */
	PERF_RECORD_MMAP			= 1,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				id;
	 *	u64				lost;
	 * };
	 */
	PERF_RECORD_LOST			= 2,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	u32				pid, tid;
	 *	char				comm[];
	 * };
	 */
	PERF_RECORD_COMM			= 3,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, ppid;
	 *	u32				tid, ptid;
	 *	u64				time;
	 * };
	 */
	PERF_RECORD_EXIT			= 4,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				time;
	 *	u64				id;
	 *	u64				stream_id;
	 * };
	 */
	PERF_RECORD_THROTTLE			= 5,
	PERF_RECORD_UNTHROTTLE			= 6,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, ppid;
	 *	u32				tid, ptid;
	 *	u64				time;
	 * };
	 */
	PERF_RECORD_FORK			= 7,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, tid;
	 *
	 *	struct read_format		values;
	 * };
	 */
	PERF_RECORD_READ			= 8,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
	 *	{ u64			time;     } && PERF_SAMPLE_TIME
	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
	 *	{ u64			id;	  } && PERF_SAMPLE_ID
	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
	 *	{ u64			period;   } && PERF_SAMPLE_PERIOD
	 *
	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
	 *
	 *	{ u64			nr,
	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
	 *
	 *	#
	 *	# The RAW record below is opaque data wrt the ABI
	 *	#
	 *	# That is, the ABI doesn't make any promises wrt to
	 *	# the stability of its content, it may vary depending
	 *	# on event, hardware, kernel version and phase of
	 *	# the moon.
	 *	#
	 *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
	 *	#
	 *
	 *	{ u32			size;
	 *	  char			data[size];}&& PERF_SAMPLE_RAW
	 *
	 *	{ u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
	 *
	 *	{ u64			abi; # enum perf_sample_regs_abi
	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
	 *
	 *	{ u64			size;
	 *	  char			data[size];
	 *	  u64			dyn_size; } && PERF_SAMPLE_STACK_USER
	 * };
	 */
	PERF_RECORD_SAMPLE			= 9,

	PERF_RECORD_MAX,			/* non-ABI */
};

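/*
 * Illustrative sketch, not part of this header: records in the mmap
 * data area are told apart by header.type. With sample_type ==
 * PERF_SAMPLE_IP | PERF_SAMPLE_TID the PERF_RECORD_SAMPLE payload
 * above reduces to one u64 and two u32s:
 *
 *	struct sample {		# layout valid for that sample_type only
 *		struct perf_event_header header;
 *		__u64 ip;
 *		__u32 pid, tid;
 *	};
 *
 *	if (ev->type == PERF_RECORD_SAMPLE) {
 *		struct sample *s = (struct sample *)ev;
 *		record_hit(s->pid, s->ip);	# hypothetical handler
 *	}
 */
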
#define PERF_MAX_STACK_DEPTH		127

enum perf_callchain_context {
	PERF_CONTEXT_HV			= (__u64)-32,
	PERF_CONTEXT_KERNEL		= (__u64)-128,
	PERF_CONTEXT_USER		= (__u64)-512,

	PERF_CONTEXT_GUEST		= (__u64)-2048,
	PERF_CONTEXT_GUEST_KERNEL	= (__u64)-2176,
	PERF_CONTEXT_GUEST_USER		= (__u64)-2560,

	PERF_CONTEXT_MAX		= (__u64)-4095,
};
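
/*
 * Illustrative sketch, not part of this header: the PERF_CONTEXT_*
 * values are sentinels stored in-line in a PERF_SAMPLE_CALLCHAIN
 * ips[] array; an entry at or above PERF_CONTEXT_MAX marks the domain
 * of the addresses that follow it:
 *
 *	__u64 context = PERF_CONTEXT_MAX;
 *
 *	for (i = 0; i < nr; i++) {
 *		if (ips[i] >= PERF_CONTEXT_MAX)
 *			context = ips[i];	# e.g. PERF_CONTEXT_KERNEL
 *		else
 *			resolve(context, ips[i]);	# hypothetical
 *	}
 */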

#define PERF_FLAG_FD_NO_GROUP		(1U << 0)
#define PERF_FLAG_FD_OUTPUT		(1U << 1)
#define PERF_FLAG_PID_CGROUP		(1U << 2) /* pid=cgroup id, per-cpu mode only */

#endif /* _UAPI_LINUX_PERF_EVENT_H */