5.1. SystemTap Internals

The focus of this chapter to describe the internals of the mechanisms used in the App Trace Roll.

Read and write system calls are traced using SystemTap. Below is the Tap script used to record open, socket, read, write, lseek and close events:

    1  /*
    2   * $Id: internals.sgml,v 1.2 2005/11/30 21:48:40 bruno Exp $
    3   *
    4   * @Copyright@
    5   * 
    6   *                              Rocks
    7   *                       www.rocksclusters.org
    8   *                         version 4.1 (fuji)
    9   * 
   10   * Copyright (c) 2005 The Regents of the University of California. All
   11   * rights reserved.
   12   * 
   13   * Redistribution and use in source and binary forms, with or without
   14   * modification, are permitted provided that the following conditions are
   15   * met:
   16   * 
   17   * 1. Redistributions of source code must retain the above copyright
   18   * notice, this list of conditions and the following disclaimer.
   19   * 
   20   * 2. Redistributions in binary form must reproduce the above copyright
   21   * notice, this list of conditions and the following disclaimer in the
   22   * documentation and/or other materials provided with the distribution.
   23   * 
   24   * 3. All advertising materials mentioning features or use of this
   25   * software must display the following acknowledgement: 
   26   * 
   27   *      "This product includes software developed by the Rocks 
   28   *      Cluster Group at the San Diego Supercomputer Center and
   29   *      its contributors."
   30   * 
   31   * 4. Neither the name or logo of this software nor the names of its
   32   * authors may be used to endorse or promote products derived from this
   33   * software without specific prior written permission.  The name of the
   34   * software includes the following terms, and any derivatives thereof:
   35   * "Rocks", "Rocks Clusters", and "Avalanche Installer".
   36   * 
   37   * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS''
   38   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
   39   * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   40   * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS
   41   * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   42   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   43   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
   44   * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
   45   * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
   46   * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
   47   * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   48   * 
   49   * @Copyright@
   50   *
   51   * $Log: internals.sgml,v $
   51   * Revision 1.2  2005/11/30 21:48:40  bruno
   51   * touch ups
   51   *
   52   * Revision 1.5  2005/11/29 18:50:54  bruno
   53   * needed new method to get file descriptor for i386
   54   *
   55   * Revision 1.3  2005/11/23 03:47:53  bruno
   56   * major overhaul of the systemtap file
   57   *
   58   * Revision 1.1  2005/11/17 16:55:56  bruno
   59   * first rev
   60   *
   61   *
   62   */
   63
   64  /*
   65   * this code is used to put a file into the proc file system that is used
   66   * by this systemtap program to determine which programs to trace.
   67   *
   68   * the file is: /proc/sys/debug/traced-apps
   69   *
   70   * for example, to trace 'iozone', after starting this systemtap file,
   71   * on the command line execute:
   72   *
   73   *      echo "iozone" > /proc/sys/debug/traced-apps
   74   */
   75
   76  %{
   77  #include <linux/sysctl.h>
   78
   79  /*
   80   * a 'handle' used to deallocate the proc fs table
   81   */
   82  struct ctl_table_header *app_trace_proc_fs;
   83
   84  #define MAX_TRACED_APPS_SIZE    1024
   85
   86
   87  char traced_apps[MAX_TRACED_APPS_SIZE] = "";
   88  char working_traced_apps[MAX_TRACED_APPS_SIZE];
   89
   90  static ctl_table app_trace_table[] = {
   91          {
   92                  .ctl_name       = 1,
   93                  .procname       = "traced-apps",
   94                  .data           = &traced_apps,
   95                  .maxlen         = MAX_TRACED_APPS_SIZE,
   96                  .mode           = 0644,
   97                  .proc_handler   = &proc_dostring,
   98                  .strategy       = &sysctl_string,
   99          }, 
  100
  101          { .ctl_name = 0 }
  102  };
  103
  104  static ctl_table app_trace_root_table[] = {
  105          {
  106                  .ctl_name = CTL_DEBUG,
  107                  .procname = "debug",
  108                  .mode = 0555,
  109                  .child = app_trace_table,
  110          },
  111
  112          { .ctl_name = 0 }
  113  };
  114  %}
  115
  116
  117  function get_returnvalue:long () %{
  118          if (CONTEXT->regs) {
  119  #if defined (__x86_64__)
  120                  THIS->__retvalue = CONTEXT->regs->rax;
  121  #elif defined (__i386__)
  122                  THIS->__retvalue = CONTEXT->regs->eax;
  123  #endif
  124          } else {
  125                  THIS->__retvalue = 0;
  126          }
  127  %}
  128
  129  /*
  130   * get_fd() is called from llseek which is only called from i386 programs.
  131   * but, there is an __x86_64__ clause to make sure the program compiles
  132   * correctly.
  133   */
  134  function get_fd:long (fd) %{
  135          if (CONTEXT->regs) {
  136  #if defined (__x86_64__)
  137                  THIS->__retvalue = THIS->fd;
  138  #elif defined (__i386__)
  139                  THIS->__retvalue = CONTEXT->regs->ebx;
  140  #endif
  141          } else {
  142                  THIS->__retvalue = 0;
  143          }
  144  %}
  145
  146  function is_traced_call:long (name:string) %{
  147          char    *a;
  148          char    *b;
  149
  150          strcpy(working_traced_apps, traced_apps);
  151          b = working_traced_apps;
  152
  153          while ((a = strsep(&b, ","))) {
  154                  if (!strcmp(a, THIS->name)) {
  155                          THIS->__retvalue = 1;
  156                          return;
  157                  }
  158          }
  159
  160          THIS->__retvalue = 0;
  161  %}
  162
  163  function output_header(op, name, pid) {
  164          print(string(gettimeofday_us()) . " op: " . op);
  165          print(" execname: " . name);
  166          print(" pid: " . pid);
  167  }
  168
  169  probe kernel.function("sys_open").return
  170  {
  171          if (is_traced_call(execname())) {
  172                  output_header("open", execname(), string(pid()));
  173
  174                  fd = get_returnvalue();
  175                  print(" fd: " . string(fd) . "\n");
  176          }
  177  }
  178
  179  probe kernel.function("sys_socket").return
  180  {
  181          if (is_traced_call(execname())) {
  182                  output_header("socket", execname(), string(pid()));
  183
  184                  fd = get_returnvalue();
  185                  print(" fd: " . string(fd) . "\n");
  186          }
  187  }
  188
  189  probe kernel.function("sys_read")
  190  {
  191          if (is_traced_call(execname())) {
  192                  output_header("read", execname(), string(pid()));
  193
  194                  fd = get_fd($fd);
  195                  print(" fd: " . string(fd));
  196                  print(" count: " . string($count) .  "\n");
  197          }
  198  }
  199
  200  probe kernel.function("sys_write")
  201  {
  202          if (is_traced_call(execname())) {
  203                  output_header("write", execname(), string(pid()));
  204
  205                  fd = get_fd($fd);
  206                  print(" fd: " . string(fd));
  207                  print(" count: " . string($count) . "\n");
  208          }
  209  }
  210
  211  probe kernel.function("sys_lseek")
  212  {
  213          if (is_traced_call(execname())) {
  214                  output_header("seek", execname(), string(pid()));
  215
  216                  fd = get_fd($fd);
  217                  print(" fd: " . string(fd));
  218          }
  219  }
  220
  221  /*
  222   * llseek is only called on i386
  223   */
  224  probe kernel.function("sys_llseek")
  225  {
  226          if (is_traced_call(execname())) {
  227                  output_header("seek", execname(), string(pid()));
  228
  229                  fd = get_fd($fd);
  230                  print(" fd: " . string(fd));
  231          }
  232  }
  233
  234  probe kernel.function("vfs_llseek").return
  235  {
  236          if (is_traced_call(execname())) {
  237                  offset = get_returnvalue();
  238                  print(" offset: " . string(offset) . "\n");
  239          }
  240  }
  241
  242  probe kernel.function("sys_close")
  243  {
  244          if (is_traced_call(execname())) {
  245                  output_header("close", execname(), string(pid()));
  246
  247                  fd = get_fd($fd);
  248                  print(" fd: " . string(fd) . "\n");
  249          }
  250  }
  251
  252
  253  function init_proc_fs() %{
  254          app_trace_proc_fs = register_sysctl_table(app_trace_root_table, 1);
  255  %}
  256
  257  function deinit_proc_fs() %{
  258          unregister_sysctl_table(app_trace_proc_fs);
  259  %}
  260
  261  probe begin
  262  {
  263          log("trace-io: starting probe");
  264          init_proc_fs();
  265  }
  266
  267  probe end
  268  {
  269          log("trace-io: ending probe");
  270          deinit_proc_fs();
  271  }
  272

Notice in probe kernel.function("sys_read") (line 189) and probe kernel.function("sys_write") (line 200) only the following information is recorded:

Note

Note that no user data is ever examined.

Here is a snippet from a trace when running the I/O benchmark iozone:

1133384755854320 op: close execname: iozone pid: 3693 fd: 3
1133384755854365 op: write execname: iozone pid: 3693 fd: 1 count: 8
1133384755854513 op: write execname: iozone pid: 3693 fd: 1 count: 8
1133384755854629 op: open execname: iozone pid: 3693 fd: 3
1133384755854672 op: read execname: iozone pid: 3693 fd: 3 count: 4096
1133384755854710 op: seek execname: iozone pid: 3693 fd: 3 offset: 0

The Tap script above is loaded with the init script /etc/rc.d/init.d/rocks-app-trace. Below is the line that starts the tracing:

        ulimit -f $FILESIZE && /usr/bin/stap \
                -g /opt/app-trace/systemtap/trace-io.stp > \
                        $TRACEDIR/`hostname`-app-trace.out 2>&1 &