systemtap probe

常用的探测器(probe)

同步事件

  • syscall.system_call

    1. 进入系统调用system_call时触发
    2. 如果需要在退出系统调用时触发,在后面增加.return;如syscall.close和syscall.close.return
    3. 在syscall.system_call.return中可以使用@entry来获取进入syscall.system_call那一时间点的对应数据
    4. 使用命令stap -l 'syscall.*'可以查看可探测的syscall(加引号可避免shell对*进行通配展开)
    5. 举例:监控系统内进程等待子进程运行状态改变的时间(syscall.wait4)
# Fires when wait4() returns. @entry() evaluates its expression at the matching
# entry probe, so the difference is the time spent inside the system call.
probe syscall.wait4.return
{
        elapsed_us = gettimeofday_us() - @entry(gettimeofday_us())
        printf("%s[%ld] %s waits %d us\n", execname(), pid(), pp(), elapsed_us)
}
# 输出结果(节选部分):
python[12263] kernel.function("sys_wait4@kernel/exit.c:1745").return waits 15 us  
zabbix_agentd[1429] kernel.function("sys_wait4@kernel/exit.c:1745").return waits 53 us  
sh[12270] kernel.function("sys_wait4@kernel/exit.c:1745").return waits 3734 us  
sh[12270] kernel.function("sys_wait4@kernel/exit.c:1745").return waits 97 us  
sh[12270] kernel.function("sys_wait4@kernel/exit.c:1745").return waits 3 us  
python[12269] kernel.function("sys_wait4@kernel/exit.c:1745").return waits 22 us  
zabbix_agentd[1428] kernel.function("sys_wait4@kernel/exit.c:1745").return waits 25 us  

  • vfs.file_operation

    1. 进入虚拟文件系统(vfs)的file_operation操作时触发,如vfs.read、vfs.write
    2. 与syscall一样,在后面增加.return,则在退出事件时触发
    3. 使用命令stap -l 'vfs.*'可以查看可探测的vfs事件
    4. 与此相同的还有netdev.*、ioblock.*、ioscheduler.*等,都是systemtap内置的tapset
    5. 举例:查看一个游戏进程的网络流量
# Usage: stap <script> <pid>   ($1 is the numeric PID to watch)
# Running byte totals for the current reporting interval.
global ifxmit, ifrecv

# Count transmitted bytes (truesize comes from the netdev tapset) while the
# watched PID is the current process.
probe netdev.transmit
{
        if (pid() != $1) next
        ifxmit += truesize
}

# Count received bytes the same way.
# NOTE(review): the receive path may run in interrupt context, where pid() is
# not necessarily the process that will consume the packet -- confirm.
probe netdev.receive
{
        if (pid() != $1) next
        ifrecv += truesize
}

# Every 5000 ms print both totals, then reset them for the next interval.
probe timer.ms(5000)
{
        printf("transmit = %d, receive = %d\n", ifxmit, ifrecv)
        delete ifxmit
        delete ifrecv
}
# 输出结果
transmit = 1280, receive = 768  
transmit = 0, receive = 0  
transmit = 0, receive = 0  
transmit = 5120, receive = 4608  
transmit = 5120, receive = 6144  
transmit = 0, receive = 2048  
transmit = 0, receive = 0  
transmit = 1280, receive = 768  

可能会有疑问,这个truesize是从何而来呢?来看下/usr/share/systemtap/tapset/linux/networking.stp,这个文件就是内置的netdev.transmit和netdev.receive这两个tapset的源文件,truesize就是从这里得到的:

// Main device receive routine; called when a packet arrives on a network device.
// Defines netdev.receive as an alias for kernel.function("netif_receive_skb").
// The assignments in the body become variables visible to any handler attached
// to netdev.receive.
probe netdev.receive  
        =  kernel.function("netif_receive_skb")
{
        // Device name, read from the kernel string at skb->dev->name
        dev_name = kernel_string($skb->dev->name)
        // skb->len
        length = $skb->len
        // skb->protocol
        protocol = $skb->protocol
        // skb->truesize
        truesize = $skb->truesize
}

  • kernel.function("function")

    1. 进入内核函数时触发,比如,kernel.function("sys_open")就是当有进程调用内核函数sys_open时就会触发
    2. 在后面增加.return表示内核函数返回时触发,如kernel.function("sys_open").return
    3. 使用*作为通配符,可以跟踪进入和退出一个内核源码文件中的所有函数,如kernel.function("*@net/socket.c")和kernel.function("*@net/socket.c").return,就是跟踪net/socket.c中所有的函数(内核源码在/usr/src/debug/kernel-$(uname -r | sed -r 's/\.[^\.]*$//g')/linux-$(uname -r))
    4. /boot/System.map-$(uname -r)中第二项为T(表示全局函数/函数指针)的是可用来探测的内核函数,如ffffffff8117e340 T sys_open;或者用更有效的办法:stap -l 'kernel.function("*")'
    5. 举例:
# Usage: stap functioncallcount.stp "<kernel function pattern>"
# @1 is the first command-line argument, taken as a string.

# Per-function statistics aggregate, keyed by function name.
global called

# On entry to every matching kernel function, add one sample for its name.
probe kernel.function(@1).call
{
        called[ppfunc()] <<< 1
}

# When the session ends, print "<function> <count>" for each function seen,
# iterating in decreasing order of the aggregate.
probe end
{
        foreach (name in called-) {
                printf("%s %d\n", name, @count(called[name]))
        }
        exit()
}
# 输出结果(部分)
% stap functioncallcount.stp "*@mm/*.c"
lookup_page_cgroup 14061  
__inc_zone_state 9666  
next_zones_zonelist 8637  
__phys_addr 8140  
vm_normal_page 7528  
find_vma 7218  
handle_mm_fault 6159  
__do_page_fault 6152  
page_remove_rmap 5636  
__zone_watermark_ok 5620  
zone_watermark_ok 5620  
__mod_zone_page_state 5174  
get_page 4017  
page_waitqueue 3792  
unlock_page 3783  
kmem_cache_free 3555  
kmem_cache_alloc 3539  
__inc_zone_page_state 3444  
find_get_page 3180  

  • kernel.trace("tracepoint")

    1. static tracepoint可以理解为一个linux内核中位置固定的占位符函数,可以把它理解为传统C程序中的#if DEBUG部分(相当于perf中的tracepoint)
    2. 可用的tracepoint可以简单地使用stap -l 'kernel.trace("*")'或find /sys/kernel/debug/tracing/events -type d来查看,如kernel.trace("kfree_skb")
    3. 举例:查看系统内进程的状态转换和等待时间或运行时间
# Translate a task_struct->state value into the label printed by the probes:
#   -1 -> "unrunnable", 0 -> "runnable", > 0 -> "stop", anything else -> "".
# Computing the label as a return value (instead of writing it to a shared
# global) keeps the high-frequency scheduler handlers free of global-variable
# locking, so hits on different CPUs do not contend or get skipped.
function task_state_label:string (state:long)
{
        if (state == -1) return "unrunnable"
        if (state == 0) return "runnable"
        if (state > 0) return "stop"
        return ""
}

# Wait / sleep / iowait accounting events: print the current process, the probe
# point, the state label of $tsk and the tracepoint's $delay argument.
# NOTE(review): execname()/pid() describe the current context, which may differ
# from the task in $tsk -- confirm this is the intended identity to print.
probe kernel.trace("sched_stat_wait"), kernel.trace("sched_stat_sleep"), kernel.trace("sched_stat_iowait")
{
        printf("%s[%ld] %s %s %ld\n", execname(), pid(), pp(), task_state_label($tsk->state), $delay)
}

# Runtime accounting event: same output line, with $runtime as the last field.
probe kernel.trace("sched_stat_runtime")
{
        printf("%s[%ld] %s %s %ld\n", execname(), pid(), pp(), task_state_label($tsk->state), $runtime)
}

# Stop the session after 10 ms; these tracepoints fire very frequently.
probe timer.ms(10)
{
        exit()
}
# 输出结果(部分)
beam.smp[25073] kernel.trace("sched_stat_runtime") runnable 4144  
beam.smp[25073] kernel.trace("sched_stat_wait") runnable 0  
beam.smp[25073] kernel.trace("sched_stat_runtime") runnable 4108  
beam.smp[25073] kernel.trace("sched_stat_wait") runnable 0  
beam.smp[25073] kernel.trace("sched_stat_runtime") runnable 4182  
beam.smp[25073] kernel.trace("sched_stat_wait") runnable 0  
beam.smp[25073] kernel.trace("sched_stat_runtime") runnable 4224  
beam.smp[25073] kernel.trace("sched_stat_wait") runnable 0  
beam.smp[25073] kernel.trace("sched_stat_runtime") runnable 4143  
beam.smp[25073] kernel.trace("sched_stat_wait") runnable 0  

  • module("module").function("function")

    1. 可以允许探测内核模块内部的函数
    2. 如probe module("ext3").function("*")和probe module("ext3").function("*").return
    3. 内核模块目录是/lib/modules/$(uname -r)

异步事件

  • begin:SystemTap进程启动时触发,一般用来输出提示或保存启动时的变量等
  • end:SystemTap进程结束时触发,一般用来输出提示或输出统计数据等
  • timer:定时器,用来执行周期性的任务

    1. timer.s(seconds)
    2. timer.ms(milliseconds)
    3. timer.us(microseconds)
    4. timer.hz(hertz)
    5. timer.jiffies(jiffies)