#include <sys/prctl.h>
#include <sys/resource.h>

#define PR_SET_NAME		15	/* Set process name */

#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
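/*
 * Nanosecond helpers for the replay engine: get_nsecs() reads the
 * monotonic clock, sleep_nsecs() blocks for the given interval.
 */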
static u64 get_nsecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}
static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	/* Split into whole seconds and the nanosecond remainder. */
	ts.tv_nsec = nsecs % 1000000000ULL;
	ts.tv_sec  = nsecs / 1000000000ULL;

	nanosleep(&ts, NULL);
}
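/*
 * Estimate the fixed cost of one timestamped measurement (and of a minimal
 * sleep) so that replayed run/sleep intervals can be corrected for it; each
 * loop below keeps the minimum delta seen over ten samples.
 */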
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		burn_nsecs(sched, 0);
		T1 = get_nsecs();
		delta = T1 - T0;
		min_delta = min(min_delta, delta);
	}
	sched->run_measurement_overhead = min_delta;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}
static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1 - T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
	sched->sleep_measurement_overhead = min_delta;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}
	event = get_new_event(task, timestamp);

	event = get_new_event(task, timestamp);
	event->wakee = wakee;

	wakee_event = last_event(wakee);

	event->wait_sem = wakee_event->wait_sem;

	struct sched_atom *event = get_new_event(task, timestamp);
static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)

	task = zalloc(sizeof(*task));

	add_sched_event_sleep(sched, task, 0, 0);

	printf("registered task #%ld, PID %ld (%s)\n",
	       sched->nr_tasks, pid, comm);
static void print_task_traces(struct perf_sched *sched)
{
	struct task_desc *task;
	unsigned long i;

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
		       task->nr, task->comm, task->pid, task->nr_events);
	}
}
static void add_cross_task_wakeups(struct perf_sched *sched)
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

	/* Wake each task's successor so the replay can never deadlock. */
	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
		j = i + 1;
		if (j == sched->nr_tasks)
			j = 0;
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
	}
}
static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)

	switch (atom->type) {
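/*
 * CPU-usage accounting: the parent sums its own rusage, while each worker
 * thread reads a per-task perf counter set up by self_open_counters().
 */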
static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	/* Convert user+system time from secs/usecs into nsecs. */
	sum  = ru.ru_utime.tv_sec * 1e9 + ru.ru_utime.tv_usec * 1e3;
	sum += ru.ru_stime.tv_sec * 1e9 + ru.ru_stime.tv_usec * 1e3;

	return sum;
}
static int self_open_counters(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type   = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0)
		pr_err("Error: sys_perf_event_open() syscall returned "
		       "with %d (%s)\n", fd, strerror(errno));
	return fd;
}
static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
}
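/*
 * Body of every replay thread: open a self-monitoring counter, rendezvous
 * with the parent on the work mutexes, then replay this task's atoms while
 * measuring the CPU time they consume.
 */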
static void *thread_func(void *ctx)
{
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;

	fd = self_open_counters();

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		perf_sched__process_event(sched, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
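/*
 * Spawn one pthread per registered task; the threads block until
 * run_one_test() lets them start a replay pass.
 */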
static void create_tasks(struct perf_sched *sched)
{
	err = pthread_attr_init(&attr);

	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));

	for (i = 0; i < sched->nr_tasks; i++) {

		err = pthread_create(&task->thread, &attr, thread_func, parms);
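/*
 * Release all threads for one replay pass, sample parent CPU usage around
 * the run, and wait for every task to finish before the next iteration.
 */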
static void wait_for_tasks(struct perf_sched *sched)
{
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;

	for (i = 0; i < sched->nr_tasks; i++) {

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	for (i = 0; i < sched->nr_tasks; i++) {

	cpu_usage_1 = get_cpu_usage_nsec_parent();

	for (i = 0; i < sched->nr_tasks; i++) {
static void run_one_test(struct perf_sched *sched)
{
	wait_for_tasks(sched);

	/* Track how far this run deviates from the running average. */
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;

	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);

	printf("cpu: %0.2f / %0.2f",
	       (double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);

	printf(" [%0.2f / %0.2f]",
	       (double)sched->parent_cpu_usage / 1e6,
	       (double)sched->runavg_parent_cpu_usage / 1e6);
static void test_calibrations(struct perf_sched *sched)
{
	u64 T0, T1;

	T0 = get_nsecs();
	burn_nsecs(sched, 1e6);
	T1 = get_nsecs();

	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

	T0 = get_nsecs();
	sleep_nsecs(1e6);
	T1 = get_nsecs();

	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}
	const char *comm = perf_evsel__strval(evsel, sample, "comm");

	printf("sched_wakeup event %p\n", evsel);

	printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);

	waker = register_pid(sched, sample->tid, "<unknown>");
	wakee = register_pid(sched, pid, comm);

	add_sched_event_wakeup(sched, waker, sample->time, wakee);
static int replay_switch_event(struct perf_sched *sched,

	const char *prev_comm = perf_evsel__strval(evsel, sample, "prev_comm"),
		   *next_comm = perf_evsel__strval(evsel, sample, "next_comm");

	u64 timestamp0, timestamp = sample->time;

	printf("sched_switch event %p\n", evsel);

	delta = timestamp - timestamp0;

	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
		 prev_comm, prev_pid, next_comm, next_pid, delta);

	prev = register_pid(sched, prev_pid, prev_comm);
	next = register_pid(sched, next_pid, next_comm);

	add_sched_event_run(sched, prev, timestamp, delta);
	add_sched_event_sleep(sched, prev, timestamp, prev_state);
	const char *parent_comm = perf_evsel__strval(evsel, sample, "parent_comm"),
		   *child_comm  = perf_evsel__strval(evsel, sample, "child_comm");

	printf("sched_fork event %p\n", evsel);
	printf("... parent: %s/%d\n", parent_comm, parent_pid);
	printf("... child: %s/%d\n", child_comm, child_pid);

	register_pid(sched, parent_pid, parent_comm);
	register_pid(sched, child_pid, child_comm);
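/*
 * Latency-mode bookkeeping: per-thread work_atoms live in an rb-tree
 * ordered by the active sort keys; thread_lat_cmp() applies each key from
 * the sort list in turn.
 */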
	ret = sort->cmp(l, r);

	cmp = thread_lat_cmp(sort_list, &key, atoms);

	cmp = thread_lat_cmp(sort_list, data, this);

		new = &((*new)->rb_left);

		new = &((*new)->rb_right);

	rb_link_node(&data->node, parent, new);
static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)

	pr_err("No memory at %s\n", __func__);

	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
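/*
 * Latency-mode tracepoint handlers. sched_out_state() maps the kernel's
 * prev_state bits onto the TASK_STATE_TO_CHAR_STR ("RSDTtZX") letters.
 */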
static int latency_fork_event(struct perf_sched *sched __maybe_unused,

static char sched_out_state(u64 prev_state)

	pr_err("No memory at %s\n", __func__);

	if (run_state == 'R') {

			      u64 timestamp __maybe_unused)
static int latency_switch_event(struct perf_sched *sched,

	struct thread *sched_out, *sched_in;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;

	delta = timestamp - timestamp0;

	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);

		if (thread_atoms_insert(sched, sched_out))

		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);

			pr_err("out-event: Internal tree error");

	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))

	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);

		if (thread_atoms_insert(sched, sched_in))

		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);

			pr_err("in-event: Internal tree error");

		if (add_sched_out_event(in_events, 'R', timestamp))

	add_sched_in_event(in_events, timestamp);
static int latency_runtime_event(struct perf_sched *sched,
				 struct machine *machine)

	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);

	int cpu = sample->cpu;

		if (thread_atoms_insert(sched, thread))

		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);

			pr_err("in-event: Internal tree error");

		if (add_sched_out_event(atoms, 'R', timestamp))

	add_runtime_event(atoms, runtime, timestamp);
static int latency_wakeup_event(struct perf_sched *sched,
				struct machine *machine)

	struct thread *wakee;

	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);

		if (thread_atoms_insert(sched, wakee))

		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);

			pr_err("wakeup-event: Internal tree error");

		if (add_sched_out_event(atoms, 'S', timestamp))
static int latency_migrate_task_event(struct perf_sched *sched,
				      struct machine *machine)

	struct thread *migrant;

	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);

		if (thread_atoms_insert(sched, migrant))

		register_pid(sched, migrant->pid, migrant->comm);
		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);

			pr_err("migration-event: Internal tree error");

		if (add_sched_out_event(atoms, 'R', timestamp))
	for (i = 0; i < 24 - ret; i++)

	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
	       (double)work_list->total_runtime / 1e6,
	       work_list->nb_atoms, (double)avg / 1e6,
	       (double)work_list->max_lat / 1e6,
	       (double)work_list->max_lat_at / 1e9);
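/*
 * Translate one --sort token into a sort_dimension and append it to the
 * given sort list.
 */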
static int sort_dimension__add(const char *tok, struct list_head *list)
{
	static struct sort_dimension *available_sorts[] = {
		&pid_sort_dimension,
		&avg_sort_dimension,
		&max_sort_dimension,
		&switch_sort_dimension,
		&runtime_sort_dimension,
	};

	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
		if (!strcmp(available_sorts[i]->name, tok)) {
static void perf_sched__sort_lat(struct perf_sched *sched)

	node = rb_first(&sched->atom_root);

	__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
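/*
 * Thin dispatchers: each tracepoint's process_* callback forwards to the
 * handler of the currently selected subcommand via sched->tp_handler.
 */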
static int process_sched_wakeup_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)

	return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
			    struct perf_sample *sample, struct machine *machine)

	u64 timestamp0, timestamp = sample->time;

	int cpu, this_cpu = sample->cpu;

	if (this_cpu > sched->max_cpu)

	delta = timestamp - timestamp0;

	if (!sched_in->shortname[0]) {

	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
		if (cpu != this_cpu)

	printf(" %12.6f secs ", (double)timestamp / 1e9);
	if (new_shortname) {
		printf("%s => %s:%d\n",
		       sched_in->shortname, sched_in->comm, sched_in->pid);
static int process_sched_switch_event(struct perf_tool *tool,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)

	int this_cpu = sample->cpu, err = 0;

	/* Switching away from a pid that is not current counts as a bug. */
	if (sched->curr_pid[this_cpu] != prev_pid)

	err = sched->tp_handler->switch_event(sched, evsel, sample, machine);

	sched->curr_pid[this_cpu] = next_pid;

static int process_sched_runtime_event(struct perf_tool *tool,
				       struct perf_evsel *evsel,
				       struct perf_sample *sample,
				       struct machine *machine)

	return sched->tp_handler->runtime_event(sched, evsel, sample, machine);

static int process_sched_fork_event(struct perf_tool *tool,
				    struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct machine *machine __maybe_unused)

	return sched->tp_handler->fork_event(sched, evsel, sample);

static int process_sched_exit_event(struct perf_tool *tool __maybe_unused,
				    struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct machine *machine __maybe_unused)

	pr_debug("sched_exit event %p\n", evsel);

static int process_sched_migrate_task_event(struct perf_tool *tool,
					    struct perf_evsel *evsel,
					    struct perf_sample *sample,
					    struct machine *machine)

	return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
						 struct machine *machine)

	if (thread == NULL) {
		pr_debug("problem processing %s event, skipping it.\n",
			 perf_evsel__name(evsel));

	evsel->hists.stats.total_period += sample->period;

	err = f(tool, evsel, sample, machine);
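/* Tracepoint-name to handler mapping used when reading the trace back. */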
1453 {
"sched:sched_switch", process_sched_switch_event, },
1454 {
"sched:sched_stat_runtime", process_sched_runtime_event, },
1455 {
"sched:sched_wakeup", process_sched_wakeup_event, },
1456 {
"sched:sched_wakeup_new", process_sched_wakeup_event, },
1457 {
"sched:sched_process_fork", process_sched_fork_event, },
1458 {
"sched:sched_process_exit", process_sched_exit_event, },
1459 {
"sched:sched_migrate_task", process_sched_migrate_task_event, },
	if (session == NULL) {
		pr_debug("No memory for session\n");

		pr_err("Failed to process events, error %d", err);

	*psession = session;
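/*
 * Summarize trace-quality problems (unordered timestamps, lost events,
 * state machine and context switch inconsistencies) as percentages.
 */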
static void print_bad_events(struct perf_sched *sched)
{
		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",

		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",

		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",

			printf(" (due to lost events?)");

		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",

			printf(" (due to lost events?)");
static int perf_sched__lat(struct perf_sched *sched)
{
	if (perf_sched__read_events(sched, false, &session))

	perf_sched__sort_lat(sched);

	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
	printf(" ---------------------------------------------------------------------------------------------------------------\n");

		output_lat_thread(sched, work_list);

	printf(" -----------------------------------------------------------------------------------------\n");

	printf(" ---------------------------------------------------\n");

	print_bad_events(sched);
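/* Map mode: print a per-CPU column view of which task runs where. */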
static int perf_sched__map(struct perf_sched *sched)
{
	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);

	if (perf_sched__read_events(sched, true, NULL))

	print_bad_events(sched);
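/*
 * Replay mode: calibrate the measurement primitives, read the trace,
 * rebuild the tasks, then re-execute the recorded scheduling pattern
 * replay_repeat times.
 */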
static int perf_sched__replay(struct perf_sched *sched)
{
	calibrate_run_measurement_overhead(sched);
	calibrate_sleep_measurement_overhead(sched);

	test_calibrations(sched);

	if (perf_sched__read_events(sched, true, NULL))

	printf("run atoms optimized: %ld\n",
	       sched->nr_run_events_optimized);

	print_task_traces(sched);
	add_cross_task_wakeups(sched);

	create_tasks(sched);
	printf("------------------------------------------------------------\n");

	run_one_test(sched);
			 const char *const usage_msg[])

	for (tok = strtok_r(str, ", ", &tmp);
	     tok; tok = strtok_r(NULL, ", ", &tmp)) {
		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
			error("Unknown --sort key: `%s'", tok);

	sort_dimension__add("pid", &sched->cmp_pid);
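/*
 * "perf sched record" forwards to the generic record command, prepending
 * the fixed argument list below (the sched:* tracepoints) to the user's
 * argv.
 */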
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char *const record_args[] = {

		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_exit",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_migrate_task",
	};

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	/* argv[0] is the "record" keyword itself, so copy from argv[1] on. */
	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];
			.sample		 = perf_sched__process_tracepoint_sample,

			.ordered_samples = true,

		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
		.curr_pid	      = { [0 ... MAX_CPUS - 1] = -1 },

		.replay_repeat	      = 10,

		.next_shortname1      = 'A',
		.next_shortname2      = '0',
	const struct option latency_options[] = {
		     "sort by key(s): runtime, switch, avg, max"),
		    "be more verbose (show symbol address, etc)"),
		    "CPU to profile on"),
		    "dump raw trace in ASCII"),

	const struct option replay_options[] = {
		     "repeat the workload replay N times (-1: infinite)"),
		    "be more verbose (show symbol address, etc)"),
		    "dump raw trace in ASCII"),

	const struct option sched_options[] = {
		    "be more verbose (show symbol address, etc)"),
		    "dump raw trace in ASCII"),

	const char *const latency_usage[] = {
		"perf sched latency [<options>]",

	const char *const replay_usage[] = {
		"perf sched replay [<options>]",

	const char *const sched_usage[] = {
		"perf sched [<options>] {record|latency|map|replay|script}",
		.switch_event	    = latency_switch_event,
		.runtime_event	    = latency_runtime_event,
		.fork_event	    = latency_fork_event,
		.migrate_task_event = latency_migrate_task_event,

		.switch_event	    = replay_switch_event,
		.fork_event	    = replay_fork_event,
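/* Subcommand dispatch: record, script, latency, map and replay. */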
	argc = parse_options(argc, argv, sched_options, sched_usage,

	if (!strcmp(argv[0], "script"))

	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {

		argc = parse_options(argc, argv, latency_options, latency_usage, 0);

		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__lat(&sched);
	} else if (!strcmp(argv[0], "map")) {
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__map(&sched);
	} else if (!strncmp(argv[0], "rep", 3)) {

		argc = parse_options(argc, argv, replay_options, replay_usage, 0);

		return perf_sched__replay(&sched);