用systemtap来做系统性能分析
- 做性能分析的前提首先是需要了解一个事件所调用的内核函数、系统调用等,并根据这些函数调用所消耗的时间去初步判断系统或业务的瓶颈,比如说简单的一个创建/删除文件,我们来分析下它所调用的内核函数
#!/bin/bash
echo $$
sleep 60
echo "aaaaaa"> ./aaaa.txt
sleep 60
rm -f ./aaaa.txt
sync
- systemtap脚本如下:
probe $1.call {
if(pid() == $2)
printf ("%s -> %s\n", thread_indent(1), ppfunc())
}
probe $1.return {
if(pid() == $2)
printf ("%s <- %s\n", thread_indent(-1), ppfunc())
}
- 读写一般都是执行
fs/*.c
这些文件里的函数,12203
是shell脚本的pid,执行命令:stap -v trace-thread-calls.stp 'kernel.function("*@fs/*.c")' 12203
部分输出如下:
从上面的输出结果可以看出,读写是在执行
sys_read
和sys_write
两个内核函数(如果要关注vfs
层面,可以关注下vfs_read
和vfs_write
)- 比如说分析一个进程的读写效率:
global reads, reads_time, writes, writes_time
probe kernel.function("sys_read").return {
if(execname() == @1) {
if($return > 0) {
spend_time = gettimeofday_us() - @entry(gettimeofday_us())
reads[execname(), pid()] <<< $count
reads_time[execname(), pid()] <<< spend_time
}
}
}
probe kernel.function("sys_write").return {
if(execname() == @1) {
if($return > 0) {
spend_time = gettimeofday_us() - @entry(gettimeofday_us())
writes[execname(), pid()] <<< $count
writes_time[execname(), pid()] <<< spend_time
}
}
}
probe timer.s(10) {
foreach([name, pid] in reads) {
reads_count_sum = @sum(reads[name, pid])
reads_time_sum = @sum(reads_time[name, pid])
printf("%s[%ld]: read %ld(bytes)/%ld(us).\n", name, pid, reads_count_sum, reads_time_sum)
}
foreach([name, pid] in writes) {
writes_count_sum = @sum(writes[name, pid])
writes_time_sum = @sum(writes_time[name, pid])
printf("%s[%ld]: write %ld(bytes)/%ld(us).\n", name, pid, writes_count_sum, writes_time_sum)
}
delete reads
delete reads_time
delete writes
delete writes_time
}
- 输出结果:(erlang)
beam.smp[24735]: read 3664(bytes)/621(us).
beam.smp[24735]: write 58(bytes)/960(us).
- 再来深入看下
beam.smp
读mnesia文件,用了多长时间(这里用了=~
语法,相当于shell
里面的=~
用来匹配正则的用法,但需注意正则不是用原生字符,所以\
也需要转义)
%{
#include <linux/file.h>
#include <linux/dcache.h>
#include <linux/fdtable.h>
%}
function task_file_handle_d_path:string (task:long, file:long) %{ /* pure */
struct task_struct *p = (struct task_struct *)((long)STAP_ARG_task);
struct file *f = (struct file *)((long)STAP_ARG_file);
struct files_struct *files;
char *page = NULL;
char *path = NULL;
rcu_read_lock();
if ((files = kread(&p->files)) &&
(page = (char *)__get_free_page(GFP_ATOMIC))) {
path = d_path(&f->f_path, page, PAGE_SIZE);
if (path && !IS_ERR(path)) {
snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%s", path);
}
}
CATCH_DEREF_FAULT();
if (page) free_page((unsigned long)page);
rcu_read_unlock();
%}
probe kernel.function("vfs_read").return {
if(execname() == @1) {
task = pid2task(pid())
spend_time = gettimeofday_us() - @entry(gettimeofday_us())
dpath = task_file_handle_d_path(task, $file)
if(dpath =~ "(\\.DCD|\\.DCL|\\.DAT)$") {
printf("%s %s[%ld] %ld(us) %ld(byte) %s\n", probefunc(), execname(), pid(), spend_time, $count, dpath)
}
}
}
- 输出结果:
sys_read beam.smp[25073] 27453(us) 8(byte) /database/mnesia/xxxx/db1_p.DCL
sys_read beam.smp[25073] 18(us) 8(byte) /database/mnesia/xxxx/db2_p.DCL
sys_read beam.smp[1744] 19(us) 8(byte) /database/mnesia/zzzz/db1_p.DCL
sys_read beam.smp[1744] 30(us) 8(byte) /database/mnesia/zzzz/db2_p.DCL
- 跟这里比较下,数量级还是较吻合的:
- 可以看到突然间有耗时上升的情况,如第一行的输出,我们再来从
vfs_read
所调用的函数(如本节开头的调用图)分析下
global times
probe kernel.function("rw_verify_area").return,
kernel.function("do_sync_read").return,
kernel.function("touch_atime").return,
kernel.function("inotify_inode_queue_event").return,
kernel.function("__fsnotify_parent").return,
kernel.function("inotify_dentry_parent_queue_event").return,
kernel.function("fsnotify").return {
if(execname() == @1) {
times[pid(), ppfunc()] += gettimeofday_us() - @entry(gettimeofday_us())
}
}
probe timer.s(10) {
println("========")
foreach([pid, pp] in times- limit 10) {
printf("pid:%5d %50s %10ld(us)\n", pid, pp, times[pid, pp])
}
delete times
}
- 输出结果:
========
pid:10676 do_sync_read 2859(us)
pid:10676 touch_atime 2113(us)
pid:10338 do_sync_read 1348(us)
pid:10338 rw_verify_area 1167(us)
pid:10915 do_sync_read 938(us)
pid:11735 do_sync_read 925(us)
pid:11735 rw_verify_area 888(us)
pid: 1744 rw_verify_area 863(us)
pid:10915 rw_verify_area 836(us)
pid:18692 rw_verify_area 829(us)
========
pid:11891 do_sync_read 1570(us)
pid:11891 rw_verify_area 1209(us)
pid:11735 rw_verify_area 1148(us)
pid:11735 do_sync_read 1136(us)
pid:18692 do_sync_read 1091(us)
pid:10338 rw_verify_area 1091(us)
pid:10338 do_sync_read 1047(us)
pid:18692 rw_verify_area 1018(us)
pid:10915 do_sync_read 955(us)
pid: 1744 rw_verify_area 934(us)
- 这样我们就可以大致知道哪个内核函数占用了较多的时间
用systemtap分析系统事件
salt
使用subprocess.Popen()
来执行shell
脚本,会发现shell
脚本启动的游戏继承了salt
的文件描述符(fd
)- 了解到
Popen
是使用fork
系统调用来生成子进程,使用systemtap
查看下fork
调用了哪些内核函数:
$ stap -v trace-thread-calls.stp 'kernel.function("*@kernel/fork.c")' 26615
- 结果如下:
- 查看
copy_process
函数(在kernel/fork.c
),调用了copy_files
函数来拷贝父进程的fd
,该函数如下:
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
struct files_struct *oldf, *newf;
int error = 0;
/*
* A background process may not have any files ...
*/
oldf = current->files;
if (!oldf)
goto out;
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
}
newf = dup_fd(oldf, &error);
if (!newf)
goto out;
tsk->files = newf;
error = 0;
out:
return error;
}
- 注意到
if (clone_flags & CLONE_FILES)
这行,如果clone_flags
设置了CLONE_FILES
标志位,则子进程将会共享父进程的fd
(即不执行下面的dup_fd
,不修改fd
) - 再看下
CLONE_FILES
是哪一位:(include/linux/sched.h
)
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
0x400
转成二进制:10000000000
,是第11位- 如何查看
clone_flags
# stap -L 'kernel.function("copy_process")'
kernel.function("copy_process@kernel/fork.c:1031") $clone_flags:long unsigned int $stack_start:long unsigned int $regs:struct pt_regs* $stack_size:long unsigned int $child_tidptr:int* $pid:struct pid* $trace:int $p:struct task_struct*
- 可得知
copy_process
有该target变量,编写systemtap
脚本:
probe kernel.function("copy_process") {
printf("%s[%ld] %x\n", execname(), pid(), $clone_flags)
}
- 执行结果:
salt-ctl[19560] 3d0f00
0x3D0F00
转成二进制:1111010000111100000000
,第11位是设置的,所以msalt调用的shell
脚本是共享了父进程的fd
Popen
不共享父进程的fd
的解决办法如下:- 加上这个
close_fds
参数后,查看下是否如上面所说会关闭所有父进程的fd
:
probe kernel.function("sys_close") {
printf("%s[%ld] %d\n", execname(), pid(), $fd)
}
- 可以查看到以下关闭的结果:
python[17898] 3
python[17898] 4
python[17898] 5
python[17898] 6
python[17898] 7
python[17898] 8
python[17898] 9
# 中间省略
python[17898] 1013
python[17898] 1014
python[17898] 1015
python[17898] 1016
python[17898] 1017
python[17898] 1018
python[17898] 1019
python[17898] 1020
python[17898] 1021
python[17898] 1022
python[17898] 1023