Kernel notifier chains are not a complicated mechanism, but they are widely used and important inside the kernel. Many people have written about them already; this article is intended purely for learning purposes.
References: Understanding Linux Network Internals (《深入理解Linux网络内幕》) and various articles found online.
Notifier chains are used only between kernel subsystems; notifications between the kernel and user space are handled by other mechanisms, such as ioctl.
Kernel source referenced: include/linux/notifier.h
kernel/notifier.c
The basic data structure of a notifier chain is defined as follows:
struct notifier_block {
        int (*notifier_call)(struct notifier_block *, unsigned long, void *);
        struct notifier_block __rcu *next;
        int priority;
};
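Each subscriber on a chain supplies one of these blocks. As a minimal sketch (the names my_event_handler and my_notifier are made up for illustration), a subsystem would typically define something like:

#include <linux/notifier.h>

static int my_event_handler(struct notifier_block *nb,
                            unsigned long event, void *data)
{
        /* event is the value passed to *_call_chain(); data is the optional payload */
        return NOTIFY_OK;               /* or NOTIFY_DONE / NOTIFY_STOP / NOTIFY_BAD */
}

static struct notifier_block my_notifier = {
        .notifier_call = my_event_handler,
        .priority      = 0,             /* higher-priority blocks are called earlier */
};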
The core registration function that all variants share:
/*
 * Notifier chain core routines. The exported routines below
 * are layered on top of these, with appropriate locking added.
 */
static int notifier_chain_register(struct notifier_block **nl,
                struct notifier_block *n)
{
        while ((*nl) != NULL) {
                if (n->priority > (*nl)->priority)
                        break;
                nl = &((*nl)->next);
        }
        n->next = *nl;
        rcu_assign_pointer(*nl, n);
        return 0;
}
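Note that the loop keeps the chain sorted by descending priority, so higher-priority notifiers are called first. As the comment above says, the exported registration routines are thin wrappers around this function that add the appropriate locking; for the raw variant there is no locking at all. In kernel/notifier.c of roughly this kernel era the wrapper looks like this:

int raw_notifier_chain_register(struct raw_notifier_head *nh,
                struct notifier_block *n)
{
        return notifier_chain_register(&nh->head, n);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);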
Generic notification delivery is implemented by notifier_call_chain():
/**
 * notifier_call_chain - Informs the registered notifiers about an event.
 * @nl: Pointer to head of the blocking notifier chain
 * @val: Value passed unmodified to notifier function
 * @v: Pointer passed unmodified to notifier function
 * @nr_to_call: Number of notifier functions to be called. Don't care
 *              value of this parameter is -1.
 * @nr_calls: Records the number of notifications sent. Don't care
 *            value of this field is NULL.
 * @returns: notifier_call_chain returns the value returned by the
 *           last notifier function called.
 */
static int __kprobes notifier_call_chain(struct notifier_block **nl,
                                        unsigned long val, void *v,
                                        int nr_to_call, int *nr_calls)
{
        int ret = NOTIFY_DONE;
        struct notifier_block *nb, *next_nb;

        nb = rcu_dereference_raw(*nl);

        while (nb && nr_to_call) {
                next_nb = rcu_dereference_raw(nb->next);

#ifdef CONFIG_DEBUG_NOTIFIERS
                if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
                        WARN(1, "Invalid notifier called!");
                        nb = next_nb;
                        continue;
                }
#endif
                ret = nb->notifier_call(nb, val, v);

                if (nr_calls)
                        (*nr_calls)++;

                if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
                        break;
                nb = next_nb;
                nr_to_call--;
        }
        return ret;
}
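Note the NOTIFY_STOP_MASK check: a callback can stop traversal of the rest of the chain by returning NOTIFY_STOP or NOTIFY_BAD. A purely hypothetical callback that vetoes a particular event (MY_CRITICAL_EVENT is an invented event code) might look like:

#include <linux/notifier.h>

#define MY_CRITICAL_EVENT 0x0001        /* hypothetical event code for illustration */

static int my_veto_handler(struct notifier_block *nb,
                           unsigned long event, void *data)
{
        if (event == MY_CRITICAL_EVENT)
                return NOTIFY_BAD;      /* contains NOTIFY_STOP_MASK: stop the chain */

        return NOTIFY_DONE;             /* not interested; let the rest of the chain run */
}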
There are four variants of notifier chains (a declaration sketch for each type follows the list below). Here is the relevant comment from notifier.h:
/*
 * Notifier chains are of four types:
 *
 *      Atomic notifier chains: Chain callbacks run in interrupt/atomic
 *              context. Callouts are not allowed to block.
 *      Blocking notifier chains: Chain callbacks run in process context.
 *              Callouts are allowed to block.
 *      Raw notifier chains: There are no restrictions on callbacks,
 *              registration, or unregistration. All locking and protection
 *              must be provided by the caller.
 *      SRCU notifier chains: A variant of blocking notifier chains, with
 *              the same restrictions.
 *
 * atomic_notifier_chain_register() may be called from an atomic context,
 * but blocking_notifier_chain_register() and srcu_notifier_chain_register()
 * must be called from a process context. Ditto for the corresponding
 * _unregister() routines.
 *
 * atomic_notifier_chain_unregister(), blocking_notifier_chain_unregister(),
 * and srcu_notifier_chain_unregister() _must not_ be called from within
 * the call chain.
 *
 * SRCU notifier chains are an alternative form of blocking notifier chains.
 * They use SRCU (Sleepable Read-Copy Update) instead of rw-semaphores for
 * protection of the chain links. This means there is _very_ low overhead
 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
 * SRCU notifier chains should be used when the chain will be called very
 * often but notifier_blocks will seldom be removed. Also, SRCU notifier
 * chains are slightly more difficult to use because they require special
 * runtime initialization.
 */
1. Atomic notifier chains: the chain's callbacks (the functions run when an event occurs) execute in interrupt/atomic context and are not allowed to block.
struct atomic_notifier_head {
        spinlock_t lock;
        struct notifier_block __rcu *head;
};
2. Blocking notifier chains: the callbacks run in process context and are allowed to block.
struct blocking_notifier_head {
        struct rw_semaphore rwsem;
        struct notifier_block __rcu *head;
};
3. Raw notifier chains: there are no restrictions on the callbacks; all locking and protection must be provided by the caller.
struct raw_notifier_head {
        struct notifier_block __rcu *head;
};
4. SRCU notifier chains: a variant of blocking notifier chains.
struct srcu_notifier_head {
        struct mutex mutex;
        struct srcu_struct srcu;
        struct notifier_block __rcu *head;
};
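notifier.h also provides helpers to declare and initialize a chain head of each type. As a sketch under the assumption of the same kernel era (all my_* names are invented), a static declaration could look like:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/notifier.h>

static ATOMIC_NOTIFIER_HEAD(my_atomic_chain);       /* spinlock-protected */
static BLOCKING_NOTIFIER_HEAD(my_blocking_chain);    /* rw-semaphore-protected */
static RAW_NOTIFIER_HEAD(my_raw_chain);              /* caller supplies all locking */

/* SRCU chains require special runtime initialization, e.g. in module init: */
static struct srcu_notifier_head my_srcu_chain;

static int __init my_chains_init(void)
{
        srcu_init_notifier_head(&my_srcu_chain);
        return 0;
}
module_init(my_chains_init);
MODULE_LICENSE("GPL");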
We will not analyze each variant one by one here; we will only look at raw notifier chains. Before doing so, it is worth mentioning some chains that the kernel already defines:
Die notifications, registered via register_die_notifier(): sent when a kernel function triggers a trap or fault, caused by an oops, a page fault, or a breakpoint hit. For example, if you were writing a device driver for a medical-grade card, you might register a die notifier so you could shut down the medical electronics when a kernel crash occurs.
Netdevice notifications, registered via register_netdevice_notifier(): generated when a network interface goes up or down.
CPU frequency notifications, registered via cpufreq_register_notifier(): dispatched when the processor frequency changes.
Internet address notifications, registered via register_inetaddr_notifier(): sent when a change in a network interface's IP address is detected.
The kernel defines others as well, which are not listed here.
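To make this concrete, here is a hedged sketch of a module subscribing to the netdevice chain (my_netdev_event and my_netdev_nb are invented names; in kernels of this era the void * argument is the struct net_device itself, while newer kernels wrap it in a netdev_notifier_info):

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>

static int my_netdev_event(struct notifier_block *nb,
                           unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;   /* in this kernel era ptr is the net_device */

        switch (event) {
        case NETDEV_UP:
                printk(KERN_INFO "%s is up\n", dev->name);
                break;
        case NETDEV_DOWN:
                printk(KERN_INFO "%s is down\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block my_netdev_nb = {
        .notifier_call = my_netdev_event,
};

static int __init my_module_init(void)
{
        return register_netdevice_notifier(&my_netdev_nb);
}

static void __exit my_module_exit(void)
{
        unregister_netdevice_notifier(&my_netdev_nb);
}

module_init(my_module_init);
module_exit(my_module_exit);
MODULE_LICENSE("GPL");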
Let's take netif_carrier_on() as an example to analyze how the mechanism works. A driver calls this function when it detects that the link carrier is present, so that the kernel subsystems concerned can react. It is used mainly in network device drivers (drivers/net/*):
/**
 * netif_carrier_on - set carrier
 * @dev: network device
 *
 * Device has detected that carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
                if (dev->reg_state == NETREG_UNINITIALIZED)
                        return;
                linkwatch_fire_event(dev);
                if (netif_running(dev))
                        __netdev_watchdog_up(dev);
        }
}
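A driver typically calls this (together with netif_carrier_off()) from its link-state interrupt handler or polling routine, along the lines of this sketch (my_nic_link_changed is an invented helper name):

#include <linux/netdevice.h>

static void my_nic_link_changed(struct net_device *dev, bool link_up)
{
        if (link_up)
                netif_carrier_on(dev);   /* clears __LINK_STATE_NOCARRIER and fires a linkwatch event */
        else
                netif_carrier_off(dev);  /* sets the bit and also fires a linkwatch event */
}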
Here we focus on the key call, linkwatch_fire_event(dev), i.e. how the handling routines get notified:
void linkwatch_fire_event(struct net_device *dev)
{
        bool urgent = linkwatch_urgent_event(dev);

        if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
                linkwatch_add_event(dev);
        } else if (!urgent)
                return;

        linkwatch_schedule_work(urgent);
}
This function performs two key operations, which are clear from the code: linkwatch_add_event(dev) and linkwatch_schedule_work(urgent). Before linkwatch_add_event(dev) is called, linkwatch_urgent_event(dev) decides whether the event is urgent; that result is used later when the work is scheduled.
static void linkwatch_add_event(struct net_device *dev)
{
        unsigned long flags;

        spin_lock_irqsave(&lweventlist_lock, flags);
        if (list_empty(&dev->link_watch_list)) {
                list_add_tail(&dev->link_watch_list, &lweventlist);
                dev_hold(dev);
        }
        spin_unlock_irqrestore(&lweventlist_lock, flags);
}
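For reference, the list and the lock used here, together with the flag bit and next-event timestamp used below, are file-local variables in net/core/link_watch.c; in kernels of roughly this era they are declared approximately as:

enum lw_bits {
        LW_URGENT = 0,
};

static unsigned long linkwatch_flags;
static unsigned long linkwatch_nextevent;

static LIST_HEAD(lweventlist);
static DEFINE_SPINLOCK(lweventlist_lock);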
The device on which a carrier/link change was detected is added to lweventlist, the list consumed by the work queue. The work is then scheduled:
static void linkwatch_schedule_work(int urgent)
{
        unsigned long delay = linkwatch_nextevent - jiffies;

        if (test_bit(LW_URGENT, &linkwatch_flags))
                return;

        /* Minimise down-time: drop delay for up event. */
        if (urgent) {
                if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
                        return;
                delay = 0;
        }

        /* If we wrap around we'll delay it by at most HZ. */
        if (delay > HZ)
                delay = 0;

        /*
         * This is true if we've scheduled it immeditately or if we don't
         * need an immediate execution and it's already pending.
         */
        if (schedule_delayed_work(&linkwatch_work, delay) == !delay)
                return;

        /* Don't bother if there is nothing urgent. */
        if (!test_bit(LW_URGENT, &linkwatch_flags))
                return;

        /* It's already running which is good enough. */
        if (!__cancel_delayed_work(&linkwatch_work))
                return;

        /* Otherwise we reschedule it again for immediate execution. */
        schedule_delayed_work(&linkwatch_work, 0);
}
This schedules the delayed work item linkwatch_work. Here is how it is declared and initialized:
static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
In other words, the function executed when the work item runs is linkwatch_event():
static void linkwatch_event(struct work_struct *dummy)
{
        rtnl_lock();
        __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
        rtnl_unlock();
}
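The mechanism used here is just the standard delayed-work pattern: declare a work item bound to a handler, then schedule it with a delay expressed in jiffies. A generic, self-contained sketch (my_work and my_work_fn are invented names):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void my_work_fn(struct work_struct *work)
{
        /* runs later in process context, so it may sleep (e.g. take rtnl_lock()) */
}

static DECLARE_DELAYED_WORK(my_work, my_work_fn);

static void my_trigger_event(void)
{
        /* run my_work_fn() on the shared workqueue roughly 100 ms from now */
        schedule_delayed_work(&my_work, msecs_to_jiffies(100));
}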
linkwatch_event() in turn calls __linkwatch_run_queue():
static void __linkwatch_run_queue(int urgent_only)
{
        struct net_device *dev;
        LIST_HEAD(wrk);

        /*
         * Limit the number of linkwatch events to one
         * per second so that a runaway driver does not
         * cause a storm of messages on the netlink
         * socket. This limit does not apply to up events
         * while the device qdisc is down.
         */
        if (!urgent_only)
                linkwatch_nextevent = jiffies + HZ;
        /* Limit wrap-around effect on delay. */
        else if (time_after(linkwatch_nextevent, jiffies + HZ))
                linkwatch_nextevent = jiffies;

        clear_bit(LW_URGENT, &linkwatch_flags);

        spin_lock_irq(&lweventlist_lock);
        list_splice_init(&lweventlist, &wrk);

        while (!list_empty(&wrk)) {
                dev = list_first_entry(&wrk, struct net_device, link_watch_list);
                list_del_init(&dev->link_watch_list);

                if (urgent_only && !linkwatch_urgent_event(dev)) {
                        list_add_tail(&dev->link_watch_list, &lweventlist);
                        continue;
                }
                spin_unlock_irq(&lweventlist_lock);
                linkwatch_do_dev(dev);
                spin_lock_irq(&lweventlist_lock);
        }

        if (!list_empty(&lweventlist))
                linkwatch_schedule_work(0);
        spin_unlock_irq(&lweventlist_lock);
}
This function ties the lweventlist list to the linkwatch_work work item. The notifier chain handlers are finally invoked through linkwatch_do_dev(dev), which calls netdev_state_change(dev):
/**
 * netdev_state_change - device changes state
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed state. This function calls
 * the notifier chains for netdev_chain and sends a NEWLINK message
 * to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                call_netdevice_notifiers(NETDEV_CHANGE, dev);
                rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
        }
}
Here we only care about call_netdevice_notifiers(NETDEV_CHANGE, dev):
/**
 * call_netdevice_notifiers - call all network notifier blocks
 * @val: value passed unmodified to notifier function
 * @dev: net_device pointer passed unmodified to notifier function
 *
 * Call all network notifier blocks. Parameters and return value
 * are as for raw_notifier_call_chain().
 */
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        ASSERT_RTNL();
        return raw_notifier_call_chain(&netdev_chain, val, dev);
}
It walks the netdev_chain notifier chain and calls every handler that has registered an interest in events such as NETDEV_CHANGE.
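The chain being walked here is itself a raw notifier chain; in net/core/dev.c of the same era it is declared as a file-local variable roughly like this:

static RAW_NOTIFIER_HEAD(netdev_chain);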
Of course, this example monitors the event and notifies the kernel subsystems through a work queue, which makes it look somewhat involved. There are also more direct call patterns, for example dev_open() in net/core/dev.c:
/**
 * dev_open - prepare an interface for use.
 * @dev: device to open
 *
 * Takes a device from down to up state. The device's private open
 * function is invoked and then the multicast lists are loaded. Finally
 * the device is moved into the up state and a %NETDEV_UP message is
 * sent to the netdev notifier chain.
 *
 * Calling this function on an active interface is a nop. On a failure
 * a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
        int ret;

        if (dev->flags & IFF_UP)
                return 0;

        ret = __dev_open(dev);
        if (ret < 0)
                return ret;

        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
        call_netdevice_notifiers(NETDEV_UP, dev);

        return ret;
}
That is, the notification is sent directly when the device is brought up. The kernel contains many more variants of notifier chain registration and invocation; we will not analyze them here but leave them for the reader to discover and study further.