The focus of this chapter to describe the internals of the mechanisms used in the App Trace Roll.
Read and write system calls are traced using SystemTap. Below is the Tap script used to record open, socket, read, write, lseek and close events:
1 /*
2 * $Id: internals.sgml,v 1.2 2005/11/30 21:48:40 bruno Exp $
3 *
4 * @Copyright@
5 *
6 * Rocks
7 * www.rocksclusters.org
8 * version 4.1 (fuji)
9 *
10 * Copyright (c) 2005 The Regents of the University of California. All
11 * rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions are
15 * met:
16 *
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 *
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 *
24 * 3. All advertising materials mentioning features or use of this
25 * software must display the following acknowledgement:
26 *
27 * "This product includes software developed by the Rocks
28 * Cluster Group at the San Diego Supercomputer Center and
29 * its contributors."
30 *
31 * 4. Neither the name or logo of this software nor the names of its
32 * authors may be used to endorse or promote products derived from this
33 * software without specific prior written permission. The name of the
34 * software includes the following terms, and any derivatives thereof:
35 * "Rocks", "Rocks Clusters", and "Avalanche Installer".
36 *
37 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS''
38 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
39 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
40 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS
41 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
44 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
45 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
47 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 *
49 * @Copyright@
50 *
51 * $Log: internals.sgml,v $
51 * Revision 1.2 2005/11/30 21:48:40 bruno
51 * touch ups
51 *
52 * Revision 1.5 2005/11/29 18:50:54 bruno
53 * needed new method to get file descriptor for i386
54 *
55 * Revision 1.3 2005/11/23 03:47:53 bruno
56 * major overhaul of the systemtap file
57 *
58 * Revision 1.1 2005/11/17 16:55:56 bruno
59 * first rev
60 *
61 *
62 */
63
64 /*
65 * this code is used to put a file into the proc file system that is used
66 * by this systemtap program to determine which programs to trace.
67 *
68 * the file is: /proc/sys/debug/traced-apps
69 *
70 * for example, to trace 'iozone', after starting this systemtap file,
71 * on the command line execute:
72 *
73 * echo "iozone" > /proc/sys/debug/traced-apps
74 */
75
76 %{
77 #include <linux/sysctl.h>
78
79 /*
80 * a 'handle' used to deallocate the proc fs table
81 */
82 struct ctl_table_header *app_trace_proc_fs;
83
84 #define MAX_TRACED_APPS_SIZE 1024
85
86
87 char traced_apps[MAX_TRACED_APPS_SIZE] = "";
88 char working_traced_apps[MAX_TRACED_APPS_SIZE];
89
90 static ctl_table app_trace_table[] = {
91 {
92 .ctl_name = 1,
93 .procname = "traced-apps",
94 .data = &traced_apps,
95 .maxlen = MAX_TRACED_APPS_SIZE,
96 .mode = 0644,
97 .proc_handler = &proc_dostring,
98 .strategy = &sysctl_string,
99 },
100
101 { .ctl_name = 0 }
102 };
103
104 static ctl_table app_trace_root_table[] = {
105 {
106 .ctl_name = CTL_DEBUG,
107 .procname = "debug",
108 .mode = 0555,
109 .child = app_trace_table,
110 },
111
112 { .ctl_name = 0 }
113 };
114 %}
115
116
117 function get_returnvalue:long () %{
118 if (CONTEXT->regs) {
119 #if defined (__x86_64__)
120 THIS->__retvalue = CONTEXT->regs->rax;
121 #elif defined (__i386__)
122 THIS->__retvalue = CONTEXT->regs->eax;
123 #endif
124 } else {
125 THIS->__retvalue = 0;
126 }
127 %}
128
129 /*
130 * get_fd() is called from llseek which is only called from i386 programs.
131 * but, there is an __x86_64__ clause to make sure the program compiles
132 * correctly.
133 */
134 function get_fd:long (fd) %{
135 if (CONTEXT->regs) {
136 #if defined (__x86_64__)
137 THIS->__retvalue = THIS->fd;
138 #elif defined (__i386__)
139 THIS->__retvalue = CONTEXT->regs->ebx;
140 #endif
141 } else {
142 THIS->__retvalue = 0;
143 }
144 %}
145
146 function is_traced_call:long (name:string) %{
147 char *a;
148 char *b;
149
150 strcpy(working_traced_apps, traced_apps);
151 b = working_traced_apps;
152
153 while ((a = strsep(&b, ","))) {
154 if (!strcmp(a, THIS->name)) {
155 THIS->__retvalue = 1;
156 return;
157 }
158 }
159
160 THIS->__retvalue = 0;
161 %}
162
163 function output_header(op, name, pid) {
164 print(string(gettimeofday_us()) . " op: " . op);
165 print(" execname: " . name);
166 print(" pid: " . pid);
167 }
168
169 probe kernel.function("sys_open").return
170 {
171 if (is_traced_call(execname())) {
172 output_header("open", execname(), string(pid()));
173
174 fd = get_returnvalue();
175 print(" fd: " . string(fd) . "\n");
176 }
177 }
178
179 probe kernel.function("sys_socket").return
180 {
181 if (is_traced_call(execname())) {
182 output_header("socket", execname(), string(pid()));
183
184 fd = get_returnvalue();
185 print(" fd: " . string(fd) . "\n");
186 }
187 }
188
189 probe kernel.function("sys_read")
190 {
191 if (is_traced_call(execname())) {
192 output_header("read", execname(), string(pid()));
193
194 fd = get_fd($fd);
195 print(" fd: " . string(fd));
196 print(" count: " . string($count) . "\n");
197 }
198 }
199
200 probe kernel.function("sys_write")
201 {
202 if (is_traced_call(execname())) {
203 output_header("write", execname(), string(pid()));
204
205 fd = get_fd($fd);
206 print(" fd: " . string(fd));
207 print(" count: " . string($count) . "\n");
208 }
209 }
210
211 probe kernel.function("sys_lseek")
212 {
213 if (is_traced_call(execname())) {
214 output_header("seek", execname(), string(pid()));
215
216 fd = get_fd($fd);
217 print(" fd: " . string(fd));
218 }
219 }
220
221 /*
222 * llseek is only called on i386
223 */
224 probe kernel.function("sys_llseek")
225 {
226 if (is_traced_call(execname())) {
227 output_header("seek", execname(), string(pid()));
228
229 fd = get_fd($fd);
230 print(" fd: " . string(fd));
231 }
232 }
233
234 probe kernel.function("vfs_llseek").return
235 {
236 if (is_traced_call(execname())) {
237 offset = get_returnvalue();
238 print(" offset: " . string(offset) . "\n");
239 }
240 }
241
242 probe kernel.function("sys_close")
243 {
244 if (is_traced_call(execname())) {
245 output_header("close", execname(), string(pid()));
246
247 fd = get_fd($fd);
248 print(" fd: " . string(fd) . "\n");
249 }
250 }
251
252
253 function init_proc_fs() %{
254 app_trace_proc_fs = register_sysctl_table(app_trace_root_table, 1);
255 %}
256
257 function deinit_proc_fs() %{
258 unregister_sysctl_table(app_trace_proc_fs);
259 %}
260
261 probe begin
262 {
263 log("trace-io: starting probe");
264 init_proc_fs();
265 }
266
267 probe end
268 {
269 log("trace-io: ending probe");
270 deinit_proc_fs();
271 }
272 |
Notice in probe kernel.function("sys_read") (line 189) and probe kernel.function("sys_write") (line 200) only the following information is recorded:
Time the event occurred
Name of the program that is issuing the system call
Process Id for the program
File descriptor the system call will be performed upon
I/O size (or the absolute offset in the case of seek)
![]() | Note that no user data is ever examined. |
Here is a snippet from a trace when running the I/O benchmark iozone:
1133384755854320 op: close execname: iozone pid: 3693 fd: 3 1133384755854365 op: write execname: iozone pid: 3693 fd: 1 count: 8 1133384755854513 op: write execname: iozone pid: 3693 fd: 1 count: 8 1133384755854629 op: open execname: iozone pid: 3693 fd: 3 1133384755854672 op: read execname: iozone pid: 3693 fd: 3 count: 4096 1133384755854710 op: seek execname: iozone pid: 3693 fd: 3 offset: 0 |
The Tap script above is loaded with the init script /etc/rc.d/init.d/rocks-app-trace. Below is the line that starts the tracing:
ulimit -f $FILESIZE && /usr/bin/stap \
-g /opt/app-trace/systemtap/trace-io.stp > \
$TRACEDIR/`hostname`-app-trace.out 2>&1 & |