Using SystemTap for System Performance and Event Analysis

Using SystemTap for system performance analysis

  • The prerequisite for performance analysis is knowing which kernel functions, system calls, and so on an event goes through, and then using the time spent in those calls to form an initial view of where the system or application bottleneck is. As a simple example, let's trace the kernel functions involved in creating and deleting a file, driven by the shell script below:
#!/bin/bash
# Print this script's pid so it can be passed to the stap command below.
echo $$
# Leave time to attach the probes before the file operations run.
sleep 60
echo "aaaaaa" > ./aaaa.txt
sleep 60
rm -f ./aaaa.txt
sync
  • The SystemTap script (trace-thread-calls.stp) is as follows:
# $1 is the probe-point pattern and $2 the target pid, both taken from the
# command line; thread_indent() keeps the printed call tree indented.
probe $1.call {
        if(pid() == $2)
                printf("%s -> %s\n", thread_indent(1), ppfunc())
}
probe $1.return {
        if(pid() == $2)
                printf("%s <- %s\n", thread_indent(-1), ppfunc())
}
  • Reads and writes generally go through the functions in fs/*.c, and 12203 is the pid printed by the shell script, so run: stap -v trace-thread-calls.stp 'kernel.function("*@fs/*.c")' 12203
  • Part of the output is shown below:

  • From the output above we can see that the read and write go through the two kernel functions sys_read and sys_write (if you care about the VFS layer instead, watch vfs_read and vfs_write).

  • For example, to analyze a process's read/write efficiency:
global reads, reads_time, writes, writes_time
# @1 is the target process name, taken from the first command-line argument.
# @entry() evaluates its expression at function entry, so the difference below
# is the time spent inside the call, in microseconds.
probe kernel.function("sys_read").return {
        if(execname() == @1) {
                if($return > 0) {
                        spend_time = gettimeofday_us() - @entry(gettimeofday_us())
                        # $count is the byte count requested by the caller
                        reads[execname(), pid()] <<< $count
                        reads_time[execname(), pid()] <<< spend_time
                }
        }
}
probe kernel.function("sys_write").return {  
        if(execname() == @1) {
                if($return > 0) {
                        spend_time = gettimeofday_us() - @entry(gettimeofday_us())
                        writes[execname(), pid()] <<< $count
                        writes_time[execname(), pid()] <<< spend_time
                }
        }
}
probe timer.s(10) {  
        foreach([name, pid] in reads) {
                reads_count_sum = @sum(reads[name, pid])
                reads_time_sum = @sum(reads_time[name, pid])
                printf("%s[%ld]: read %ld(bytes)/%ld(us).\n", name, pid, reads_count_sum, reads_time_sum)
        }
        foreach([name, pid] in writes) {
                writes_count_sum =  @sum(writes[name, pid])
                writes_time_sum = @sum(writes_time[name, pid])
                printf("%s[%ld]: write %ld(bytes)/%ld(us).\n", name, pid, writes_count_sum, writes_time_sum)
        }
        delete reads
        delete reads_time
        delete writes
        delete writes_time
}
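  • The script takes the process name as its first argument (@1 is SystemTap's first command-line argument, interpreted as a string); assuming it is saved as, say, rw-stat.stp, it can be run with stap rw-stat.stp beam.smp.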
  • Output (the target process here is Erlang's beam.smp):
beam.smp[24735]: read 3664(bytes)/621(us).  
beam.smp[24735]: write 58(bytes)/960(us).  
  • Let's dig a bit deeper: how long does beam.smp spend reading the mnesia files? (This script uses the =~ operator, which matches against a regular expression much like =~ in the shell; note that the pattern is an ordinary string literal, not a raw string, so the backslashes have to be escaped as well.)
%{
#include <linux/file.h>
#include <linux/dcache.h>
#include <linux/fdtable.h>
%}
/* Resolve the full path of the given struct file via d_path(); embedded C. */
function task_file_handle_d_path:string (task:long, file:long) %{ /* pure */
        struct task_struct *p = (struct task_struct *)((long)STAP_ARG_task);
        struct file *f = (struct file *)((long)STAP_ARG_file);
        struct files_struct *files;
        char *page = NULL;
        char *path = NULL;
        rcu_read_lock();
        if ((files = kread(&p->files)) &&
        (page = (char *)__get_free_page(GFP_ATOMIC))) {
                path = d_path(&f->f_path, page, PAGE_SIZE);
                if (path && !IS_ERR(path)) {
                        snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%s", path);
                }
        }
        CATCH_DEREF_FAULT();
        if (page) free_page((unsigned long)page);
        rcu_read_unlock();
%}
probe kernel.function("vfs_read").return {  
        if(execname() == @1) {
                task = pid2task(pid())
                spend_time = gettimeofday_us() - @entry(gettimeofday_us())
                dpath = task_file_handle_d_path(task, $file)
                # match mnesia .DCD/.DCL/.DAT files; the dots are escaped, and the
                # backslash itself has to be doubled inside the string literal
                if(dpath =~ "(\\.DCD|\\.DCL|\\.DAT)$") {
                        printf("%s %s[%ld] %ld(us) %ld(byte) %s\n", probefunc(), execname(), pid(), spend_time, $count, dpath)
                }
        }
}
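  • Because this script embeds C (the %{ ... %} block and the embedded-C function), it has to be run in guru mode, e.g. stap -g mnesia-read.stp beam.smp, where the file name is again just an example and @1 receives the process name.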
  • Output:
sys_read beam.smp[25073] 27453(us) 8(byte) /database/mnesia/xxxx/db1_p.DCL  
sys_read beam.smp[25073] 18(us) 8(byte) /database/mnesia/xxxx/db2_p.DCL  
sys_read beam.smp[1744] 19(us) 8(byte) /database/mnesia/zzzz/db1_p.DCL  
sys_read beam.smp[1744] 30(us) 8(byte) /database/mnesia/zzzz/db2_p.DCL  
  • Comparing the two sets of numbers, the orders of magnitude line up fairly well.
  • We can also see the occasional latency spike, such as the first line of the output above, so let's break the time down across the functions that vfs_read calls (as shown in the call trace at the beginning of this section):
global times
# Accumulate, per pid and function, the time spent in each of the
# functions on vfs_read's call path.
probe kernel.function("rw_verify_area").return,
      kernel.function("do_sync_read").return,
      kernel.function("touch_atime").return,
      kernel.function("inotify_inode_queue_event").return,
      kernel.function("__fsnotify_parent").return,
      kernel.function("inotify_dentry_parent_queue_event").return,
      kernel.function("fsnotify").return {
        if(execname() == @1) {
                times[pid(), ppfunc()] += gettimeofday_us() - @entry(gettimeofday_us())
        }
}
probe timer.s(10) {  
        println("========")
        foreach([pid, pp] in times- limit 10) {
                printf("pid:%5d %50s %10ld(us)\n", pid, pp, times[pid, pp])
        }
        delete times
}
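  • As before, the process name is passed as the first argument. In the foreach, the trailing - in times- sorts the array by value in descending order, and limit 10 keeps only the ten largest entries per 10-second interval.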
  • Output:
========
pid:10676                                       do_sync_read       2859(us)  
pid:10676                                        touch_atime       2113(us)  
pid:10338                                       do_sync_read       1348(us)  
pid:10338                                     rw_verify_area       1167(us)  
pid:10915                                       do_sync_read        938(us)  
pid:11735                                       do_sync_read        925(us)  
pid:11735                                     rw_verify_area        888(us)  
pid: 1744                                     rw_verify_area        863(us)  
pid:10915                                     rw_verify_area        836(us)  
pid:18692                                     rw_verify_area        829(us)  
========
pid:11891                                       do_sync_read       1570(us)  
pid:11891                                     rw_verify_area       1209(us)  
pid:11735                                     rw_verify_area       1148(us)  
pid:11735                                       do_sync_read       1136(us)  
pid:18692                                       do_sync_read       1091(us)  
pid:10338                                     rw_verify_area       1091(us)  
pid:10338                                       do_sync_read       1047(us)  
pid:18692                                     rw_verify_area       1018(us)  
pid:10915                                       do_sync_read        955(us)  
pid: 1744                                     rw_verify_area        934(us)  
  • This gives us a rough idea of which kernel functions are taking the most time.

Using SystemTap to analyze system events

  • salt runs shell scripts through subprocess.Popen(), and we noticed that the game processes started by those shell scripts had inherited salt's file descriptors (fds), as the sketch below reproduces.
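  • A minimal repro sketch (not salt's actual code), assuming Python 2 where close_fds defaults to False; the file name used here is arbitrary:
import subprocess

# Open a file in the parent; with close_fds left at its default of False the
# descriptor stays open across fork/exec and is visible in the child.
f = open('/tmp/parent_owned.log', 'w')
print('parent holds fd %d' % f.fileno())
# The child lists its own descriptors; the fd of the file above shows up there.
subprocess.Popen('ls -l /proc/self/fd', shell=True).wait()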
  • Popen spawns the child process with the fork system call, so let's use SystemTap to see which kernel functions fork goes through:
$ stap -v trace-thread-calls.stp 'kernel.function("*@kernel/fork.c")' 26615
  • The result is as follows:
  • Looking at the copy_process function (in kernel/fork.c), it calls copy_files to copy the parent's fds. That function looks like this:
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)  
{
        struct files_struct *oldf, *newf;
        int error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        newf = dup_fd(oldf, &error);
        if (!newf)
                goto out;

        tsk->files = newf;
        error = 0;
out:  
        return error;
}
  • Note the if (clone_flags & CLONE_FILES) line: when clone_flags has the CLONE_FILES bit set, the child shares the parent's fd table (the dup_fd call below is skipped, so the fds are not copied).
  • Now check which bit CLONE_FILES is (include/linux/sched.h):
#define CLONE_FILES     0x00000400      /* set if open files shared between processes */
  • 0x400 in binary is 10000000000, i.e. the 11th bit.
  • How can we see the value of clone_flags? List the probe's target variables:
# stap -L 'kernel.function("copy_process")'
kernel.function("copy_process@kernel/fork.c:1031") $clone_flags:long unsigned int $stack_start:long unsigned int $regs:struct pt_regs* $stack_size:long unsigned int $child_tidptr:int* $pid:struct pid* $trace:int $p:struct task_struct*  
  • So copy_process exposes clone_flags as a target variable; write a SystemTap script to print it:
probe kernel.function("copy_process") {  
        printf("%s[%ld] %x\n", execname(), pid(), $clone_flags)
}
  • Result:
salt-ctl[19560] 3d0f00  
  • 0x3D0F00 in binary is 1111010000111100000000; bit 11 is set, so the shell script invoked by salt does share the parent process's fds. A quick check of the arithmetic follows.
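  • A quick sanity check of the two values in a Python shell:
>>> bin(0x400)                 # CLONE_FILES
'0b10000000000'
>>> bin(0x3d0f00)              # the clone_flags value printed above
'0b1111010000111100000000'
>>> bool(0x3d0f00 & 0x400)     # CLONE_FILES is set
True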
  • To keep Popen from sharing the parent's fds, the fix is as follows:
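  • A minimal sketch of the change (the command string below is a placeholder, not salt's actual invocation): pass close_fds=True so that every descriptor above 2 is closed in the child before exec.
import subprocess

# close_fds=True makes Popen close all fds above 2 in the child before exec,
# so the spawned shell script no longer inherits salt's descriptors.
cmd = '/path/to/deploy.sh'   # placeholder command
p = subprocess.Popen(cmd, shell=True, close_fds=True)
p.wait()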
  • With the close_fds parameter added, let's verify that, as described above, all of the parent's fds really are closed:
probe kernel.function("sys_close") {  
        printf("%s[%ld] %d\n", execname(), pid(), $fd)
}
  • We can see the following fds being closed:
python[17898] 3  
python[17898] 4  
python[17898] 5  
python[17898] 6  
python[17898] 7  
python[17898] 8  
python[17898] 9  
# (middle omitted)
python[17898] 1013  
python[17898] 1014  
python[17898] 1015  
python[17898] 1016  
python[17898] 1017  
python[17898] 1018  
python[17898] 1019  
python[17898] 1020  
python[17898] 1021  
python[17898] 1022  
python[17898] 1023