Linux内核kprobe机制实现浅析-风行蟹-ChinaUnix博客

风行蟹的ChinaUnix博客

首页　| 　博文目录　| 　关于我

风行蟹

博客访问： 137452
博文数量： 17
博客积分： 0
博客等级：民兵
技术积分： 40
用户组：普通用户
注册时间： 2018-08-08 00:05

文章分类

全部博文（17）

未分配的博文（17）

文章存档

2020年（4）

2019年（10）

2018年（3）

我的朋友

相关博文

Linux内核kprobe机制实现浅析

分类： LINUX

2019-11-26 11:11:26

原文地址：Linux内核kprobe机制实现浅析作者：luoyan_xy

Kprobe机制是内核提供的一种调试机制，它提供了一种方法，能够在不修改现有代码的基础上，灵活地跟踪内核函数的执行。它的基本工作原理是：用户指定一个探测点，并把一个用户定义的处理函数关联到该探测点，当内核执行到该探测点时，相应的关联函数被执行，然后继续执行正常的代码路径。

Kprobe提供了三种形式的探测点，一种是最基本的kprobe，能够在指定代码执行前、执行后进行探测，但此时不能访问被探测函数内的相关变量信息；一种是jprobe，用于探测某一函数的入口，并且能够访问对应的函数参数；一种是kretprobe，用于完成指定函数返回值的探测功能。其中最基本的就是kprobe机制，jprobe以及kretprobe的实现都依赖于kprobe，但其代码的实现都很巧妙，强烈建议每一个内核爱好者阅读。

好了，闲话少叙，开始上代码：

首先是struct kprobe结构，每一个探测点的基本结构。

点击(此处)折叠或打开

/*
 * Basic descriptor of a single probe point: where the probe is placed,
 * which handlers are attached to it, and the saved original instruction.
 */
struct kprobe {
/* Node linking this kprobe into the global kprobe hash table, keyed by the probed address */
struct hlist_node hlist;
/* list of kprobes for multi-handler support */
/* When several handlers are registered on the same probe point, they are all chained on this list */
struct list_head list;
/*count the number of times this probe was temporarily disarmed */
unsigned long nmissed;
/* location of the probe point */
/* Target address being probed */
kprobe_opcode_t *addr;
/* Allow user to indicate symbol name of the probe point */
/* symbol_name lets the user name a function instead of giving a fixed address */
const char *symbol_name;
/* Offset into the symbol */
/* Used when the probe point is an instruction inside a function: the location is given as address + offset */
unsigned int offset;
/* Called before addr is executed. */
/* Handler invoked before the probed instruction executes */
kprobe_pre_handler_t pre_handler;
/* Called after addr is executed, unless... */
/* Handler invoked after the probed instruction executes */
kprobe_post_handler_t post_handler;
/*
* ... called if executing addr causes a fault (eg. page fault).
* Return 1 if it handled fault, otherwise kernel will see it.
*/
kprobe_fault_handler_t fault_handler;
/*
* ... called if breakpoint trap occurs in probe handler.
* Return 1 if it handled break, otherwise kernel will see it.
*/
kprobe_break_handler_t break_handler;
/* opcode and ainsn together hold the instruction that was replaced */
/* Saved opcode (which has been replaced with breakpoint) */
kprobe_opcode_t opcode;
/* copy of the original instruction */
struct arch_specific_insn ainsn;
/*
* Indicates various status flags.
* Protected by kprobe_mutex after this kprobe is registered.
*/
u32 flags;
};

对于kprobe功能的实现主要利用了内核中的两个功能特性：异常（尤其是int 3），单步执行（EFLAGS中的TF标志）。

大概的流程：

1）在注册探测点的时候，对被探测函数的指令码进行替换，替换为int 3的指令码；

2）在int 3的异常处理过程中，通过通知链的方式调用kprobe的异常处理函数；

3）在kprobe的异常处理函数中，判断是否存在pre_handler钩子，存在则执行；

4）执行完后，准备进入单步调试：设置EFLAGS中的TF标志位，并且把异常返回的地址修改为保存原指令副本的地址；

5）代码返回，执行原有指令，执行结束后触发单步异常；

6）在单步异常的处理中，清除单步标志，执行post_handler流程，并最终返回；

下面又进入代码时间，首先看一下kprobe模块的初始化代码，初始化代码主要做了两件事：标记出哪些代码是不能被探测的，这些代码属于kprobe实现的关键代码；注册通知链到die_notifier，用于接收异常通知。

点击(此处)折叠或打开

初始化代码位于kernel/kprobes.c中
/*
 * Excerpt of the kprobes initialization routine; lines consisting of "...."
 * mark code the article left out (including some local declarations).
 * The visible part does two things: it resolves the address range of every
 * function named in kprobe_blacklist, and it registers kprobe_exceptions_nb
 * on the die notifier chain.
 */
static int __init init_kprobes(void)
{
int i, err = 0;
....
/* kprobe_blacklist holds the critical code paths of the kprobe implementation itself; these functions must not be probed */
/*
* Lookup and populate the kprobe_blacklist.
*
* Unlike the kretprobe blacklist, we'll need to determine
* the range of addresses that belong to the said functions,
* since a kprobe need not necessarily be at the beginning
* of a function.
*/
for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
kprobe_lookup_name(kb->name, addr);
/* Name could not be resolved to an address: skip this entry */
if (!addr)
continue;
kb->start_addr = (unsigned long)addr;
symbol_name = kallsyms_lookup(kb->start_addr,
&size, &offset, &modname, namebuf);
/* Record the size reported by kallsyms as the range, or 0 when the lookup fails */
if (!symbol_name)
kb->range = 0;
else
kb->range = size;
}
....
if (!err)
/* Register on the die_notifier chain to receive int 3 exception notifications */
err = register_die_notifier(&kprobe_exceptions_nb);
....
}
其中的通知链：
/* Notifier block that routes exception notifications to kprobe_exceptions_notify. */
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
/* Highest priority, so that this notifier is run first */
.priority = 0x7fffffff /* we need to be notified first */
};

kprobe的注册流程register_kprobe。

点击(此处)折叠或打开

int __kprobes register_kprobe(struct kprobe *p)
{
int ret = 0;
struct kprobe *old_p;
struct module *probed_mod;
kprobe_opcode_t *addr;
/*获取被探测点的地址，指定了symbol_name，则从kallsyms中获取；指定了offset，则返回addr + offset*/
addr = kprobe_addr(p);
if (!addr)
return -EINVAL;
p->addr = addr;
/*判断同一个kprobe是否被重复注册*/
ret = check_kprobe_rereg(p);
if (ret)
return ret;
jump_label_lock();
preempt_disable();
/*判断被注册的函数是否位于内核的代码段内，或位于不能探测的kprobe实现路径中*/
if (!kernel_text_address((unsigned long) p->addr) ||
in_kprobes_functions((unsigned long) p->addr) ||
ftrace_text_reserved(p->addr, p->addr) ||
jump_label_text_reserved(p->addr, p->addr))
goto fail_with_jump_label;
/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
p->flags &= KPROBE_FLAG_DISABLED;
/*
* Check if are we probing a module.
*/
/*判断被探测的地址是否属于某一个模块，并且位于模块的text section内*/
probed_mod = __module_text_address((unsigned long) p->addr);
if (probed_mod) {
/*如果被探测的为模块地址，首先要增加模块的引用计数*/
/*
* We must hold a refcount of the probed module while updating
* its code to prohibit unexpected unloading.
*/
if (unlikely(!try_module_get(probed_mod)))
goto fail_with_jump_label;
/*
* If the module freed .init.text, we couldn't insert
* kprobes in there.
*/
/*如果被探测的地址位于模块的init地址段内，但该段代码区间已被释放，则直接退出*/
if (within_module_init((unsigned long)p->addr, probed_mod) &&
probed_mod->state != MODULE_STATE_COMING) {
module_put(probed_mod);
goto fail_with_jump_label;
}
}
preempt_enable();
jump_label_unlock();
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
mutex_lock(&kprobe_mutex);
jump_label_lock(); /* needed to call jump_label_text_reserved() */
get_online_cpus(); /* For avoiding text_mutex deadlock. */
mutex_lock(&text_mutex);
/*判断在同一个探测点是否已经注册了其他的探测函数*/
old_p = get_kprobe(p->addr);
if (old_p) {
/* Since this may unoptimize old_p, locking text_mutex. */
/*如果已经存在注册过的kprobe，则将探测点的函数修改为aggr_pre_handler，并将所有的handler挂载到其链表上，由其负责所有handler函数的执行*/
ret = register_aggr_kprobe(old_p, p);
goto out;
}
/* 分配特定的内存地址用于保存原有的指令
* 按照内核注释，被分配的地址必须must be on special executable page on x86.
* 该地址被保存在kprobe->ainsn.insn
*/
ret = arch_prepare_kprobe(p);
if (ret)
goto out;
/*将kprobe加入到相应的hash表内*/
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
if (!kprobes_all_disarmed && !kprobe_disabled(p))
/*将探测点的指令码修改为int 3指令*/
__arm_kprobe(p);
/* Try to optimize kprobe */
try_to_optimize_kprobe(p);
out:
mutex_unlock(&text_mutex);
put_online_cpus();
jump_label_unlock();
mutex_unlock(&kprobe_mutex);
if (probed_mod)
module_put(probed_mod);
return ret;
fail_with_jump_label:
preempt_enable();
jump_label_unlock();
return -EINVAL;