The focus of this chapter to describe the internals of the mechanisms used in the App Trace Roll.
Read and write system calls are traced using SystemTap. Below is the Tap script used to record open, socket, read, write, lseek and close events:
1 /* 2 * $Id: internals.sgml,v 1.2 2005/11/30 21:48:40 bruno Exp $ 3 * 4 * @Copyright@ 5 * 6 * Rocks 7 * www.rocksclusters.org 8 * version 4.1 (fuji) 9 * 10 * Copyright (c) 2005 The Regents of the University of California. All 11 * rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions are 15 * met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * 3. All advertising materials mentioning features or use of this 25 * software must display the following acknowledgement: 26 * 27 * "This product includes software developed by the Rocks 28 * Cluster Group at the San Diego Supercomputer Center and 29 * its contributors." 30 * 31 * 4. Neither the name or logo of this software nor the names of its 32 * authors may be used to endorse or promote products derived from this 33 * software without specific prior written permission. The name of the 34 * software includes the following terms, and any derivatives thereof: 35 * "Rocks", "Rocks Clusters", and "Avalanche Installer". 36 * 37 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' 38 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 39 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 40 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS 41 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 44 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 45 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 46 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 47 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 48 * 49 * @Copyright@ 50 * 51 * $Log: internals.sgml,v $ 51 * Revision 1.2 2005/11/30 21:48:40 bruno 51 * touch ups 51 * 52 * Revision 1.5 2005/11/29 18:50:54 bruno 53 * needed new method to get file descriptor for i386 54 * 55 * Revision 1.3 2005/11/23 03:47:53 bruno 56 * major overhaul of the systemtap file 57 * 58 * Revision 1.1 2005/11/17 16:55:56 bruno 59 * first rev 60 * 61 * 62 */ 63 64 /* 65 * this code is used to put a file into the proc file system that is used 66 * by this systemtap program to determine which programs to trace. 67 * 68 * the file is: /proc/sys/debug/traced-apps 69 * 70 * for example, to trace 'iozone', after starting this systemtap file, 71 * on the command line execute: 72 * 73 * echo "iozone" > /proc/sys/debug/traced-apps 74 */ 75 76 %{ 77 #include <linux/sysctl.h> 78 79 /* 80 * a 'handle' used to deallocate the proc fs table 81 */ 82 struct ctl_table_header *app_trace_proc_fs; 83 84 #define MAX_TRACED_APPS_SIZE 1024 85 86 87 char traced_apps[MAX_TRACED_APPS_SIZE] = ""; 88 char working_traced_apps[MAX_TRACED_APPS_SIZE]; 89 90 static ctl_table app_trace_table[] = { 91 { 92 .ctl_name = 1, 93 .procname = "traced-apps", 94 .data = &traced_apps, 95 .maxlen = MAX_TRACED_APPS_SIZE, 96 .mode = 0644, 97 .proc_handler = &proc_dostring, 98 .strategy = &sysctl_string, 99 }, 100 101 { .ctl_name = 0 } 102 }; 103 104 static ctl_table app_trace_root_table[] = { 105 { 106 .ctl_name = CTL_DEBUG, 107 .procname = "debug", 108 .mode = 0555, 109 .child = app_trace_table, 110 }, 111 112 { .ctl_name = 0 } 113 }; 114 %} 115 116 117 function get_returnvalue:long () %{ 118 if (CONTEXT->regs) { 119 #if defined (__x86_64__) 120 THIS->__retvalue = CONTEXT->regs->rax; 121 #elif defined (__i386__) 122 THIS->__retvalue = CONTEXT->regs->eax; 123 #endif 124 } else { 125 THIS->__retvalue = 0; 126 } 127 %} 128 129 /* 130 * get_fd() is called from llseek which is only called from i386 programs. 131 * but, there is an __x86_64__ clause to make sure the program compiles 132 * correctly. 133 */ 134 function get_fd:long (fd) %{ 135 if (CONTEXT->regs) { 136 #if defined (__x86_64__) 137 THIS->__retvalue = THIS->fd; 138 #elif defined (__i386__) 139 THIS->__retvalue = CONTEXT->regs->ebx; 140 #endif 141 } else { 142 THIS->__retvalue = 0; 143 } 144 %} 145 146 function is_traced_call:long (name:string) %{ 147 char *a; 148 char *b; 149 150 strcpy(working_traced_apps, traced_apps); 151 b = working_traced_apps; 152 153 while ((a = strsep(&b, ","))) { 154 if (!strcmp(a, THIS->name)) { 155 THIS->__retvalue = 1; 156 return; 157 } 158 } 159 160 THIS->__retvalue = 0; 161 %} 162 163 function output_header(op, name, pid) { 164 print(string(gettimeofday_us()) . " op: " . op); 165 print(" execname: " . name); 166 print(" pid: " . pid); 167 } 168 169 probe kernel.function("sys_open").return 170 { 171 if (is_traced_call(execname())) { 172 output_header("open", execname(), string(pid())); 173 174 fd = get_returnvalue(); 175 print(" fd: " . string(fd) . "\n"); 176 } 177 } 178 179 probe kernel.function("sys_socket").return 180 { 181 if (is_traced_call(execname())) { 182 output_header("socket", execname(), string(pid())); 183 184 fd = get_returnvalue(); 185 print(" fd: " . string(fd) . "\n"); 186 } 187 } 188 189 probe kernel.function("sys_read") 190 { 191 if (is_traced_call(execname())) { 192 output_header("read", execname(), string(pid())); 193 194 fd = get_fd($fd); 195 print(" fd: " . string(fd)); 196 print(" count: " . string($count) . "\n"); 197 } 198 } 199 200 probe kernel.function("sys_write") 201 { 202 if (is_traced_call(execname())) { 203 output_header("write", execname(), string(pid())); 204 205 fd = get_fd($fd); 206 print(" fd: " . string(fd)); 207 print(" count: " . string($count) . "\n"); 208 } 209 } 210 211 probe kernel.function("sys_lseek") 212 { 213 if (is_traced_call(execname())) { 214 output_header("seek", execname(), string(pid())); 215 216 fd = get_fd($fd); 217 print(" fd: " . string(fd)); 218 } 219 } 220 221 /* 222 * llseek is only called on i386 223 */ 224 probe kernel.function("sys_llseek") 225 { 226 if (is_traced_call(execname())) { 227 output_header("seek", execname(), string(pid())); 228 229 fd = get_fd($fd); 230 print(" fd: " . string(fd)); 231 } 232 } 233 234 probe kernel.function("vfs_llseek").return 235 { 236 if (is_traced_call(execname())) { 237 offset = get_returnvalue(); 238 print(" offset: " . string(offset) . "\n"); 239 } 240 } 241 242 probe kernel.function("sys_close") 243 { 244 if (is_traced_call(execname())) { 245 output_header("close", execname(), string(pid())); 246 247 fd = get_fd($fd); 248 print(" fd: " . string(fd) . "\n"); 249 } 250 } 251 252 253 function init_proc_fs() %{ 254 app_trace_proc_fs = register_sysctl_table(app_trace_root_table, 1); 255 %} 256 257 function deinit_proc_fs() %{ 258 unregister_sysctl_table(app_trace_proc_fs); 259 %} 260 261 probe begin 262 { 263 log("trace-io: starting probe"); 264 init_proc_fs(); 265 } 266 267 probe end 268 { 269 log("trace-io: ending probe"); 270 deinit_proc_fs(); 271 } 272 |
Notice in probe kernel.function("sys_read") (line 189) and probe kernel.function("sys_write") (line 200) only the following information is recorded:
Time the event occurred
Name of the program that is issuing the system call
Process Id for the program
File descriptor the system call will be performed upon
I/O size (or the absolute offset in the case of seek)
Note that no user data is ever examined. |
Here is a snippet from a trace when running the I/O benchmark iozone:
1133384755854320 op: close execname: iozone pid: 3693 fd: 3 1133384755854365 op: write execname: iozone pid: 3693 fd: 1 count: 8 1133384755854513 op: write execname: iozone pid: 3693 fd: 1 count: 8 1133384755854629 op: open execname: iozone pid: 3693 fd: 3 1133384755854672 op: read execname: iozone pid: 3693 fd: 3 count: 4096 1133384755854710 op: seek execname: iozone pid: 3693 fd: 3 offset: 0 |
The Tap script above is loaded with the init script /etc/rc.d/init.d/rocks-app-trace. Below is the line that starts the tracing:
ulimit -f $FILESIZE && /usr/bin/stap \ -g /opt/app-trace/systemtap/trace-io.stp > \ $TRACEDIR/`hostname`-app-trace.out 2>&1 & |