Linux内核kprobe机制实现浅析-dsheng-ChinaUnix博客

Dsheng

首页　| 　博文目录　| 　关于我

dsheng

博客访问： 333127
博文数量： 81
博客积分： 1810
博客等级：上尉
技术积分： 725
用户组：普通用户
注册时间： 2008-05-25 17:38

文章分类

全部博文（81）

linux sched（1）
Python（3）
阅读计划（0）
machine lea（5）
推荐系统（1）
软件工程（1）
poker（0）
amazon（1）
vpn（0）
C（1）
嵌入式系统（5）
GPS（1）
版本管理（2）
时间系统（2）
linux网络（1）
linux调试与诊断（6）
MIPS（4）
RT Linux（9）
开源工具（5）
arm（5）
RFC（2）
linux系统管理（7）
未分配的博文（19）

文章存档

2016年（4）

2015年（11）

2014年（16）

2013年（37）

2012年（11）

2011年（2）

我的朋友

相关博文

Linux内核kprobe机制实现浅析

分类： LINUX

2013-08-12 16:25:46

原文地址：Linux内核kprobe机制实现浅析作者：luoyan_xy

Kprobe机制是内核提供的一种调试机制，它提供了一种方法，能够在不修改现有代码的基础上，灵活地跟踪内核函数的执行。它的基本工作原理是：用户指定一个探测点，并把一个用户定义的处理函数关联到该探测点，当内核执行到该探测点时，相应的关联函数被执行，然后继续执行正常的代码路径。

Kprobe提供了三种形式的探测点，一种是最基本的kprobe，能够在指定代码执行前、执行后进行探测，但此时不能访问被探测函数内的相关变量信息；一种是jprobe，用于探测某一函数的入口，并且能够访问对应的函数参数；一种是kretprobe，用于完成指定函数返回值的探测功能。其中最基本的就是kprobe机制，jprobe以及kretprobe的实现都依赖于kprobe，但其代码的实现都很巧妙，强烈建议每一个内核爱好者阅读。

好了，闲话少叙，开始上代码：

首先是struct kprobe结构，每一个探测点的基本结构。

点击(此处)折叠或打开

/*
 * struct kprobe - descriptor of a single probe point.
 *
 * Describes where the probe is placed (addr, or symbol_name plus offset),
 * which handlers are attached to it, and holds the saved copy of the
 * instruction that was replaced by the breakpoint.
 */
struct kprobe {
/* Node in the global kprobe hash table, which is keyed by the probed address. */
struct hlist_node hlist;
/* list of kprobes for multi-handler support */
/* When several probes are registered on the same probe point, they are all chained on this list. */
struct list_head list;
/*count the number of times this probe was temporarily disarmed */
unsigned long nmissed;
/* location of the probe point */
/* Address of the probed target. */
kprobe_opcode_t *addr;
/* Allow user to indicate symbol name of the probe point */
/* symbol_name lets the user specify a function name instead of a fixed address. */
const char *symbol_name;
/* Offset into the symbol */
/* If the probe point is an instruction inside a function, the addr + offset form has to be used. */
unsigned int offset;
/* Called before addr is executed. */
/* Probe handler invoked before the probed location is executed. */
kprobe_pre_handler_t pre_handler;
/* Called after addr is executed, unless... */
/* Probe handler invoked after the probed location has been executed. */
kprobe_post_handler_t post_handler;
/*
* ... called if executing addr causes a fault (eg. page fault).
* Return 1 if it handled fault, otherwise kernel will see it.
*/
kprobe_fault_handler_t fault_handler;
/*
* ... called if breakpoint trap occurs in probe handler.
* Return 1 if it handled break, otherwise kernel will see it.
*/
kprobe_break_handler_t break_handler;
/* opcode and ainsn are used to keep the instruction that was replaced. */
/* Saved opcode (which has been replaced with breakpoint) */
kprobe_opcode_t opcode;
/* copy of the original instruction */
struct arch_specific_insn ainsn;
/*
* Indicates various status flags.
* Protected by kprobe_mutex after this kprobe is registered.
*/
u32 flags;
};

对于kprobe功能的实现主要利用了内核中的两个功能特性：异常（尤其是int 3），单步执行（EFLAGS中的TF标志）。

大概的流程：

1）在注册探测点的时候，对被探测函数的指令码进行替换，替换为int 3的指令码；

2）在int 3异常的处理过程中，通过通知链的方式调用kprobe的异常处理函数；

3）在kprobe的异常处理函数中，判断是否存在pre_handler钩子，存在则执行；

4）执行完后，准备进入单步调试：设置EFLAGS中的TF标志位，并且把异常返回的地址修改为保存原指令副本的地址（即kprobe->ainsn.insn）；

5）代码返回，执行原有指令，执行结束后触发单步异常；

6）在单步异常的处理中，清除单步标志，执行post_handler流程，并最终返回；

下面又进入代码时间，首先看一下kprobe模块的初始化代码，初始化代码主要做了两件事：标记出哪些代码是不能被探测的，这些代码属于kprobe实现的关键代码；注册通知链到die_notifier，用于接收异常通知。

点击(此处)折叠或打开

初始化代码位于kernel/kprobes.c中
/*
 * init_kprobes - kprobes initialisation (excerpt; the "...." lines mark
 * code that the article left out, including the declarations of kb, addr,
 * symbol_name, size, offset, modname and namebuf).
 *
 * The part shown does two things: it fills in the address range of every
 * function named in kprobe_blacklist, and, if no error has been recorded
 * in err, it registers kprobe_exceptions_nb via register_die_notifier().
 */
static int __init init_kprobes(void)
{
int i, err = 0;
....
/* kprobe_blacklist holds the critical code paths of the kprobe implementation; these functions must not be probed by kprobes. */
/*
* Lookup and populate the kprobe_blacklist.
*
* Unlike the kretprobe blacklist, we'll need to determine
* the range of addresses that belong to the said functions,
* since a kprobe need not necessarily be at the beginning
* of a function.
*/
/* The blacklist is terminated by an entry whose name is NULL. */
for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
kprobe_lookup_name(kb->name, addr);
/* NOTE(review): addr is presumably left NULL when the name cannot be resolved - confirm against kprobe_lookup_name(). Such entries are skipped. */
if (!addr)
continue;
kb->start_addr = (unsigned long)addr;
symbol_name = kallsyms_lookup(kb->start_addr,
&size, &offset, &modname, namebuf);
/* Record the size reported by kallsyms_lookup(), or 0 when the lookup returns NULL. */
if (!symbol_name)
kb->range = 0;
else
kb->range = size;
}
....
if (!err)
/* Register on the die_notifier chain in order to receive int 3 exception notifications. */
err = register_die_notifier(&kprobe_exceptions_nb);
....
}
其中的通知链：
/*
 * Notifier block for kprobes; its callback is kprobe_exceptions_notify.
 * This is the object that init_kprobes() passes to register_die_notifier().
 */
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
/* Highest priority, so that this callback is guaranteed to run first. */
.priority = 0x7fffffff /* we need to be notified first */
};

kprobe的注册流程register_kprobe。

点击(此处)折叠或打开

int __kprobes register_kprobe(struct kprobe *p)
{
int ret = 0;
struct kprobe *old_p;
struct module *probed_mod;
kprobe_opcode_t *addr;
/*获取被探测点的地址，指定了symbol_name，则从kallsyms中获取；指定了offset，则返回addr + offset*/
addr = kprobe_addr(p);
if (!addr)
return -EINVAL;
p->addr = addr;
/*判断同一个kprobe是否被重复注册*/
ret = check_kprobe_rereg(p);
if (ret)
return ret;
jump_label_lock();
preempt_disable();
/*判断被注册的函数是否位于内核的代码段内，或位于不能探测的kprobe实现路径中*/
if (!kernel_text_address((unsigned long) p->addr) ||
in_kprobes_functions((unsigned long) p->addr) ||
ftrace_text_reserved(p->addr, p->addr) ||
jump_label_text_reserved(p->addr, p->addr))
goto fail_with_jump_label;
/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
p->flags &= KPROBE_FLAG_DISABLED;
/*
* Check if are we probing a module.
*/
/*判断被探测的地址是否属于某一个模块，并且位于模块的text section内*/
probed_mod = __module_text_address((unsigned long) p->addr);
if (probed_mod) {
/*如果被探测的为模块地址，首先要增加模块的引用计数*/
/*
* We must hold a refcount of the probed module while updating
* its code to prohibit unexpected unloading.
*/
if (unlikely(!try_module_get(probed_mod)))
goto fail_with_jump_label;
/*
* If the module freed .init.text, we couldn't insert
* kprobes in there.
*/
/*如果被探测的地址位于模块的init地址段内，但该段代码区间已被释放，则直接退出*/
if (within_module_init((unsigned long)p->addr, probed_mod) &&
probed_mod->state != MODULE_STATE_COMING) {
module_put(probed_mod);
goto fail_with_jump_label;
}
}
preempt_enable();
jump_label_unlock();
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
mutex_lock(&kprobe_mutex);
jump_label_lock(); /* needed to call jump_label_text_reserved() */
get_online_cpus(); /* For avoiding text_mutex deadlock. */
mutex_lock(&text_mutex);
/*判断在同一个探测点是否已经注册了其他的探测函数*/
old_p = get_kprobe(p->addr);
if (old_p) {
/* Since this may unoptimize old_p, locking text_mutex. */
/*如果已经存在注册过的kprobe，则将探测点的函数修改为aggr_pre_handler，并将所有的handler挂载到其链表上，由其负责所有handler函数的执行*/
ret = register_aggr_kprobe(old_p, p);
goto out;
}
/* 分配特定的内存地址用于保存原有的指令
* 按照内核注释，被分配的地址必须must be on special executable page on x86.
* 该地址被保存在kprobe->ainsn.insn
*/
ret = arch_prepare_kprobe(p);
if (ret)
goto out;
/*将kprobe加入到相应的hash表内*/
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
if (!kprobes_all_disarmed && !kprobe_disabled(p))
/*将探测点的指令码修改为int 3指令*/
__arm_kprobe(p);
/* Try to optimize kprobe */
try_to_optimize_kprobe(p);
out:
mutex_unlock(&text_mutex);
put_online_cpus();
jump_label_unlock();
mutex_unlock(&kprobe_mutex);
if (probed_mod)
module_put(probed_mod);
return ret;
fail_with_jump_label:
preempt_enable();
jump_label_unlock();
return -EINVAL;