小公司研发总监,既当司令也当兵!
分类: LINUX
2015-05-20 11:42:04
本文是结合网络上关于Linux网桥的说明和Linux平台的代码阅读记录整理而成的一篇总结性文档。由于时间仓促,分析可能存在不足之处,望大家见谅和指正。
对于接触过Linux 网络的童鞋,对网桥功能应该不陌生。概括来说,网桥实现最重要的两点:
1. MAC学习:学习MAC地址,起初,网桥是没有任何地址与端口的对应关系的,它发送数据,还是得像HUB一样,但是每发送一个数据,它都会关心数据包的来源MAC是从自己的哪个端口来的,通过学习,建立地址-端口的对照表(CAM表)。
2. 报文转发:每发送一个数据包,网桥都会提取其目的MAC地址,从自己的地址-端口对照表(CAM表)中查找由哪个端口把数据包发送出去。
本文的目的是让读者对Linux网桥有个全面的认识。着重讲述Linux网桥的定义、网桥管理、数据流程和端口-MAC映射管理,以及网桥的Netfilter。关于网桥的STP,由于ap121上,网桥并没启用STP,所以这部分不做详细介绍,只在文档后面做一个简单介绍。
在Linux里面使用网桥非常简单,仅需要做两件事情就可以配置了。其一是在编译内核里把CONFIG_BRIDGE或CONFIG_BRIDGE_MODULE编译选项打开;其二是安装brctl工具。第一步是使内核协议栈支持网桥,第二步是安装用户空间工具,通过一系列的ioctl调用来配置网桥。在我们开发过程中,常见的几条命令:
brctl addbr br0 (建立一个网桥br0, 同时在Linux内核里面创建虚拟网卡br0)
brctl addif br0 eth0
brctl addif br0 ath0
brctl addif br0 ath1 (分别为网桥br0添加接口eth0, ath0和ath1)
本章我们的目的就是弄清楚以上几条命令在内核中是如何实现、生效的。
按照惯例,先熟悉一下网桥相关的重要数据结构体定义,方便后续讲解。和网桥息息相关的几个结构体包括:网桥自身定义(net_bridge)、网桥端口(net_bridge_port)、网桥端口-MAC映射表项(net_bridge_fdb_entry)等。另外,网桥本身也是一个虚拟的网卡设备(net_device)。Net_device是一个庞大的结构体,我们在这里就不展现了。关于net_device详细介绍请参考《Linux设备驱动程序》网络驱动程序章节, net_device的详细介绍。下面我们介绍网桥、端口、端口-MAC映射表项的数据结构。
网桥定义:
// Per-bridge state. new_bridge_dev() stores this structure in the private
// area of the bridge's own net_device.
struct net_bridge
{
// Spinlock (br_add_if()/br_del_if() hold it while recomputing the bridge id and features).
spinlock_t lock;
// List of all ports of this bridge; every element is a struct net_bridge_port.
struct list_head port_list;
// Virtual net_device that represents the bridge itself. Its MAC address is assigned dynamically, usually taken from one of the member ports.
struct net_device *dev;
// Protects the hash table below.
spinlock_t hash_lock;
// Forwarding database (the learned MAC addresses), hashed for fast lookup; every element is a struct net_bridge_fdb_entry.
struct hlist_head hash[BR_HASH_SIZE];
// Not used.
struct list_head age_list;
unsigned long feature_mask;
#ifdef CONFIG_BRIDGE_NETFILTER
struct rtable fake_rtable;
#endif
unsigned long flags;
#define BR_SET_MAC_ADDR 0x00000001
// STP-related state.
bridge_id designated_root;
bridge_id bridge_id;
u32 root_path_cost;
unsigned long max_age;
unsigned long hello_time;
unsigned long forward_delay;
unsigned long bridge_max_age;
unsigned long ageing_time;
unsigned long bridge_hello_time;
unsigned long bridge_forward_delay;
u8 group_addr[ETH_ALEN];
u16 root_port;
// STP implementation currently in use.
enum {
BR_NO_STP, /* no spanning tree */
BR_KERNEL_STP, /* old STP in kernel */
BR_USER_STP, /* new RSTP in userspace */
} stp_enabled;
unsigned char topology_change;
unsigned char topology_change_detected;
// Timers used by STP.
struct timer_list hello_timer;
struct timer_list tcn_timer;
struct timer_list topology_change_timer;
struct timer_list gc_timer;
struct kobject *ifobj;
};
网桥端口数据结构体:
// One bridge port: binds an enslaved net_device to the bridge it belongs to.
struct net_bridge_port
{
// Bridge this port belongs to.
struct net_bridge *br;
// Network device attached to this port.
struct net_device *dev;
// Link in the owning bridge's port list.
struct list_head list;
// STP-related parameters.
u8 priority;
u8 state;
u16 port_no;
unsigned char topology_change_ack;
unsigned char config_pending;
port_id port_id;
port_id designated_port;
bridge_id designated_root;
bridge_id designated_bridge;
u32 path_cost;
u32 designated_cost;
// Per-port timers, i.e. the timers STP uses for its timeouts.
struct timer_list forward_delay_timer;
struct timer_list hold_timer;
struct timer_list message_age_timer;
struct kobject kobj;
struct rcu_head rcu;
};
网桥端口-MAC映射表项:
// One forwarding-database (port <-> MAC) entry.
struct net_bridge_fdb_entry
{
// Link in the hash chain of the forwarding database.
struct hlist_node hlist;
// Bridge port the address was seen on (this field and the MAC address below are the two key fields).
struct net_bridge_port *dst;
// Only used with the RCU strategy.
struct rcu_head rcu;
// Reference count.
atomic_t use_count;
unsigned long ageing_timer;
// MAC address.
mac_addr addr;
// Non-zero when the address belongs to the local machine.
unsigned char is_local;
// Non-zero when the entry is static.
unsigned char is_static;
};
关于net_bridge、 net_bridge_port、net_bridge_fdb_entry它们之间的关系可以使用如下图的示意图表示:
重要数据结构关系示意图
网桥在内核中,被实现为一个内核模块,源代码在~/1xU/ap121/linux/kernels/mips-linux-2.6.31/net/bridge/br.c中。初始化方法br_init:
/*
 * Module initialisation for the bridge.
 *
 * Registers the STP protocol, creates the forwarding-database cache,
 * registers the bridge netfilter hooks, the netdevice notifier and the
 * netlink interface, and finally installs the ioctl stub and the hooks
 * through which the core network stack reaches the bridge.
 *
 * Returns 0 on success or a negative errno. On failure every step that
 * already succeeded is undone in reverse order.
 */
static int __init br_init(void)
{
	int err;

	/* Register the STP protocol handler. */
	err = stp_proto_register(&br_stp_proto);
	if (err < 0) {
		printk(KERN_ERR "bridge: can't register sap for STP\n");
		return err;
	}

	/* Initialise the forwarding database (CAM table) cache. */
	err = br_fdb_init();
	if (err)
		goto err_out;

	/* Register the bridge netfilter hook functions. */
	err = br_netfilter_init();
	if (err)
		goto err_out1;

	/* Subscribe to the netdevice notifier chain. */
	err = register_netdevice_notifier(&br_device_notifier);
	if (err)
		goto err_out2;

	err = br_netlink_init();
	if (err)
		goto err_out3;

	/* Install the ioctl entry point offered to user space (brctl). */
	brioctl_set(br_ioctl_deviceless_stub);
	/* Install the receive-path hook called by handle_bridge(). */
	br_handle_frame_hook = br_handle_frame;
	/* Install the forwarding-database lookup/release hooks. */
	br_fdb_get_hook = br_fdb_get;
	br_fdb_put_hook = br_fdb_put;

	return 0;

	/* Error unwinding: undo the completed steps in reverse order. */
err_out3:
	unregister_netdevice_notifier(&br_device_notifier);
err_out2:
	br_netfilter_fini();
err_out1:
	br_fdb_fini();
err_out:
	stp_proto_unregister(&br_stp_proto);
	return err;
}
网桥内核模块初始化后,并没有真正的一个网桥设备被实例化,它只是搭建好了运行环境。要网桥真正的运作,还需要从创建一个网桥设备开始。
接上文,在网桥初始化的时候,设置了网桥的ioctl接口:br_ioctl_deviceless_stub。下面看看br_ioctl_deviceless_stub的实现:
/*
 * ioctl entry point for bridge requests that are not addressed to an
 * existing bridge device.
 *
 * SIOCGIFBR/SIOCSIFBR are passed to old_deviceless(). SIOCBRADDBR and
 * SIOCBRDELBR require CAP_NET_ADMIN, copy the bridge name from user space
 * and create or delete that bridge. Any other command yields -EOPNOTSUPP.
 */
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
	char name[IFNAMSIZ];

	if (cmd == SIOCGIFBR || cmd == SIOCSIFBR)
		return old_deviceless(net, uarg);

	if (cmd != SIOCBRADDBR && cmd != SIOCBRDELBR)
		return -EOPNOTSUPP;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (copy_from_user(name, uarg, IFNAMSIZ))
		return -EFAULT;

	/* Make sure the user-supplied name is NUL-terminated. */
	name[IFNAMSIZ - 1] = 0;

	if (cmd == SIOCBRADDBR)
		return br_add_bridge(net, name);	/* add a bridge */

	return br_del_bridge(net, name);		/* delete a bridge */
}
当我们执行brctl addbr br0时,我们传入的cmd为SIOCBRADDBR,会转入br_add_bridge中进行:
// Create and register a new bridge device called `name` in namespace `net`.
// Returns 0 on success, -ENOMEM when the device cannot be allocated, or the
// error reported by name allocation, registration or sysfs setup.
int br_add_bridge(struct net *net, const char *name)
{
struct net_device *dev;
int ret;
// Allocate the bridge's net_device.
dev = new_bridge_dev(net, name);
if (!dev)
return -ENOMEM;
rtnl_lock();
// A '%' in the name is a template: let the kernel pick the final device name.
if (strchr(dev->name, '%')) {
ret = dev_alloc_name(dev, dev->name);
if (ret < 0)
goto out_free;
}
// Register the network device.
ret = register_netdevice(dev);
if (ret)
goto out_free;
// Create the sysfs entries used to inspect and manage the bridge.
ret = br_sysfs_addbr(dev);
if (ret)
unregister_netdevice(dev);
out:
rtnl_unlock();
return ret;
// The device was never registered, so it has to be freed explicitly.
out_free:
free_netdev(dev);
goto out;
}
网桥是一个虚拟的设备,它的注册跟实际的物理网络设备注册是一样的(可以参看《Linux设备驱动程序》网络驱动程序中,net_device创建和注册过程):
// Allocate a net_device whose private area is a struct net_bridge and fill in
// the bridge defaults. Returns the new device, or NULL if allocation fails.
static struct net_device *new_bridge_dev(struct net *net, const char *name)
{
struct net_bridge *br;
struct net_device *dev;
// Allocate the net_device; br_dev_setup is passed as its setup routine.
dev = alloc_netdev(sizeof(struct net_bridge), name,
br_dev_setup);
if (!dev)
return NULL;
// Attach the device to the requested network namespace.
dev_net_set(dev, net);
// The device's private area is the bridge itself; initialise it below.
br = netdev_priv(dev);
br->dev = dev;
spin_lock_init(&br->lock);
INIT_LIST_HEAD(&br->port_list);
spin_lock_init(&br->hash_lock);
br->bridge_id.prio[0] = 0x80;
br->bridge_id.prio[1] = 0x00;
memcpy(br->group_addr, br_group_address, ETH_ALEN);
br->feature_mask = dev->features;
br->stp_enabled = BR_NO_STP; // STP is off by default
// designated_root is copied from bridge_id, so the priority bytes must already be set.
br->designated_root = br->bridge_id;
br->root_path_cost = 0;
br->root_port = 0;
br->bridge_max_age = br->max_age = 20 * HZ;
br->bridge_hello_time = br->hello_time = 2 * HZ;
br->bridge_forward_delay = br->forward_delay = 15 * HZ;
br->topology_change = 0;
br->topology_change_detected = 0;
br->ageing_time = 300 * HZ;
br_netfilter_rtable_init(br);
INIT_LIST_HEAD(&br->age_list);
br_stp_timer_init(br);
return dev;
}
更详细的,看看网桥虚拟设备初始化的细节:
// Setup routine for the bridge's virtual net_device (passed to alloc_netdev()
// by new_bridge_dev()).
void br_dev_setup(struct net_device *dev)
{
// Give the bridge a random MAC address to start with.
random_ether_addr(dev->dev_addr);
// A bridge device is also an Ethernet device, so run the generic Ethernet setup.
ether_setup(dev);
// Bridge-specific settings applied after the generic Ethernet setup.
dev->netdev_ops = &br_netdev_ops;
dev->destructor = free_netdev;
SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
dev->tx_queue_len = 0;
dev->priv_flags = IFF_EBRIDGE;
dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
NETIF_F_NETNS_LOCAL | NETIF_F_GSO;
}
// ethtool operations of the bridge device (installed by br_dev_setup()).
// Getters use the generic ethtool_op_* helpers; the tx-csum, sg and tso
// setters are bridge-specific.
static const struct ethtool_ops br_ethtool_ops = {
.get_drvinfo = br_getinfo,
.get_link = ethtool_op_get_link,
.get_tx_csum = ethtool_op_get_tx_csum,
.set_tx_csum = br_set_tx_csum,
.get_sg = ethtool_op_get_sg,
.set_sg = br_set_sg,
.get_tso = ethtool_op_get_tso,
.set_tso = br_set_tso,
.get_ufo = ethtool_op_get_ufo,
.get_flags = ethtool_op_get_flags,
};
// net_device operations of the bridge device (installed by br_dev_setup()).
static const struct net_device_ops br_netdev_ops = {
.ndo_open = br_dev_open, // open the device
.ndo_stop = br_dev_stop, // stop the device
.ndo_start_xmit = br_dev_xmit, // transmit a frame
.ndo_set_mac_address = br_set_mac_address, // set the MAC address
.ndo_set_multicast_list = br_dev_set_multicast_list, // set the multicast list
.ndo_change_mtu = br_change_mtu, // change the MTU
.ndo_do_ioctl = br_dev_ioctl, // per-device ioctl
};
以上是创建网桥及网桥初始化的全部过程,关于网桥删除主要是上述网桥注册过程的逆过程:解除端口,清除定时器,删除sysfs设备,注销虚拟设备:
// Tear down a bridge: remove every port, stop the gc timer, remove the sysfs
// entries and unregister the bridge's net_device.
static void del_br(struct net_bridge *br)
{
struct net_bridge_port *p, *n;
// The _safe iterator is required because del_nbp() is called on the current entry.
list_for_each_entry_safe(p, n, &br->port_list, list) {
del_nbp(p);
}
del_timer_sync(&br->gc_timer);
br_sysfs_delbr(br->dev);
unregister_netdevice(br->dev);
}
仅仅创建网桥,还是不够的。实际应用中的网桥需要添加实际的端口(即物理接口),比如:
brctl addif br0 eth0
应用程序在使用ioctl来为网桥增加物理接口,对应内核函数br_dev_ioctl(初始化网桥时指定的),代码和分析如下:
/*
 * Per-device ioctl handler of a bridge (installed as .ndo_do_ioctl).
 *
 * SIOCDEVPRIVATE is handed to old_dev_ioctl(); SIOCBRADDIF/SIOCBRDELIF add
 * or remove the interface whose index is in rq->ifr_ifindex. Everything
 * else is rejected with -EOPNOTSUPP.
 */
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
	struct net_bridge *br = netdev_priv(dev);

	if (cmd == SIOCDEVPRIVATE)
		return old_dev_ioctl(dev, rq, cmd);

	/* Add or delete a port, depending on the command. */
	if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF)
		return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);

	pr_debug("Bridge does not support ioctl 0x%x\n", cmd);
	return -EOPNOTSUPP;
}
/*
 * Add (isadd != 0) or remove (isadd == 0) the device with index `ifindex`
 * as a port of bridge `br`.
 *
 * Returns -EPERM without CAP_NET_ADMIN, -EINVAL when no such device exists,
 * otherwise the result of br_add_if()/br_del_if().
 */
static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
{
	struct net_device *port_dev;
	int rc;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	port_dev = dev_get_by_index(dev_net(br->dev), ifindex);
	if (!port_dev)
		return -EINVAL;

	rc = isadd ? br_add_if(br, port_dev)	/* add a port */
		   : br_del_if(br, port_dev);	/* delete a port */

	/* Drop the reference taken by dev_get_by_index(). */
	dev_put(port_dev);
	return rc;
}
事实上,增加一个端口就是实例化并填充一个net_bridge_port,并将其加入到网桥的端口记录表中:
/*
 * Enslave `dev` to bridge `br`: create a net_bridge_port for it and link
 * the port into the bridge.
 *
 * Returns 0 on success, -EINVAL for loopback or non-Ethernet devices,
 * -ELOOP when `dev` is itself a bridge, -EBUSY when `dev` already belongs
 * to a bridge, or the error of the step that failed.
 */
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
	struct net_bridge_port *p;
	int err = 0;

	/* Loopback devices and non-Ethernet devices cannot be ports. */
	if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER)
		return -EINVAL;

	/* A bridge device cannot be a port of a bridge. */
	if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
		return -ELOOP;

	/* The device already belongs to some bridge. */
	if (dev->br_port != NULL)
		return -EBUSY;

	/* Allocate the port. */
	p = new_nbp(br, dev);
	if (IS_ERR(p))
		return PTR_ERR(p);

	/* A port must see all frames, so switch it to promiscuous mode. */
	err = dev_set_promiscuity(dev, 1);
	if (err)
		goto put_back;

	err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
				   SYSFS_BRIDGE_PORT_ATTR);
	if (err)
		goto err0;

	/* Insert the port's own MAC address into the forwarding database. */
	err = br_fdb_insert(br, p, dev->dev_addr);
	if (err)
		goto err1;

	/* Expose the port in sysfs. */
	err = br_sysfs_addif(p);
	if (err)
		goto err2;

	rcu_assign_pointer(dev->br_port, p);
	dev_disable_lro(dev);

	/* Link the port into the bridge's port list. */
	list_add_rcu(&p->list, &br->port_list);

	spin_lock_bh(&br->lock);
	br_stp_recalculate_bridge_id(br);
	br_features_recompute(br);

	if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) &&
	    (br->dev->flags & IFF_UP))
		br_stp_enable_port(p);
	spin_unlock_bh(&br->lock);

	br_ifinfo_notify(RTM_NEWLINK, p);

	dev_set_mtu(br->dev, br_min_mtu(br));

	kobject_uevent(&p->kobj, KOBJ_ADD);

	return 0;

err2:
	br_fdb_delete_by_port(br, p, 1);
err1:
	/*
	 * Dropping the last kobject reference runs the ktype's release
	 * callback, which frees the port. Clear the pointer so the kfree()
	 * below does not free it a second time.
	 */
	kobject_put(&p->kobj);
	p = NULL;
err0:
	dev_set_promiscuity(dev, -1);
put_back:
	dev_put(dev);
	kfree(p);	/* no-op when p was already released above */
	return err;
}
int br_del_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p = dev->br_port;
if (!p || p->br != br)
return -EINVAL;
del_nbp(p);
spin_lock_bh(&br->lock);
br_stp_recalculate_bridge_id(br);
br_features_recompute(br);
spin_unlock_bh(&br->lock);
return 0;
}
在我们日常开发中,最常见的一种拓扑如下图所示:
典型拓扑
其中,DUT有三个端口ath0(本地无线端口)、eth0(有线lan口)、ath1(连接远程无线端口);该三个端口通过br0网桥桥接在一起。本章着重讲述,在该拓扑下,pc1、pc2、pc3以及ROOT-AP之间,是如何通过DUT(br0)进行数据交互的。
现假设pc3向pc1发送一个数据包,数据首先会由eth0网卡接收,此后网卡向CPU发送接收中断。当CPU执行完当前指令后(如果开中断的话),马上跳到网卡的驱动程序去。eth0的网卡驱动首先生成一个skb结构,然后对以太网层进行分析,最后驱动将该skb结构放到当前CPU的输入队列中,唤醒软中断。如果没有其它中断的到来,那么软中断将调用netif_receive_skb函数。关于网卡驱动和中断响应不是本文讨论的重点,所以我们还是从netif_receive_skb说起。
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
struct net_device *null_or_orig;
int ret = NET_RX_DROP;
__be16 type;
if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
return NET_RX_SUCCESS;
/* if we've gotten here through NAPI, check netpoll */
// NOTE(review): CONFIG_MAPPING is not a mainline option (presumably a vendor patch); it only guards the netpoll check, skb->dev is still dereferenced below.
#ifdef CONFIG_MAPPING
if (skb->dev)
#endif
if (netpoll_receive_skb(skb))
return NET_RX_DROP;
if (!skb->tstamp.tv64)
net_timestamp(skb);
if (!skb->iif)
skb->iif = skb->dev->ifindex;
null_or_orig = NULL;
orig_dev = skb->dev;
if (orig_dev->master) {
if (skb_bond_should_drop(skb))
null_or_orig = orig_dev; /* deliver only exact match */
else
skb->dev = orig_dev->master;
}
__get_cpu_var(netdev_rx_stat).total++;
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->mac_len = skb->network_header - skb->mac_header;
pt_prev = NULL;
rcu_read_lock();
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
goto ncls;
}
#endif
// Hand the packet to every matching ptype_all handler (e.g. a packet socket opened by a capture tool).
// Delivery is deferred by one entry: pt_prev is delivered only once the next match is found.
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
ncls:
#endif
// Offer the packet to the bridge; handle_bridge() returns NULL when the bridge consumed it.
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
// Pass the packet on to the layer-3 protocol handlers registered for its type.
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
(ptype->dev == null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
if (pt_prev) {
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
rcu_read_unlock();
return ret;
}
总结而言,netif_receive_skb函数主要做三件事情:
1. 如果有socket需要(如抓包应用)skb,则将skb复制给他们;
2. 处理桥接,即如果开启了网桥,进行网桥处理;
3. 将skb交给网络层。
/*
 * Bridge entry point on the receive path.
 *
 * Returns skb untouched when the packet is a loopback packet or the
 * receiving device is not a bridge port. Otherwise delivers any pending
 * *pt_prev handler and returns whatever br_handle_frame_hook() returns.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	/* Loopback packets are never bridged. */
	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	/* Nor are packets from a device that belongs to no bridge. */
	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	if (*pt_prev != NULL) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	/* The hook is set to br_handle_frame when the bridge module initialises. */
	return br_handle_frame_hook(port, skb);
}
/*
* Called via br_handle_frame_hook.
* Return NULL if skb is handled
* note: already called with rcu_read_lock (preempt_disabled)
*/
struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
{
// Destination MAC address of the frame.
const unsigned char *dest = eth_hdr(skb)->h_dest;
int (*rhook)(struct sk_buff *skb);
// Frames with an invalid source MAC address are dropped.
if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
goto drop;
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
return NULL;
#ifdef CONFIG_ATHRS_HW_NAT
skb->ath_hw_nat_fw_flags = 1;
#endif
// A destination of 01:80:c2:00:00:0X is a link-local multicast address used by STP, so STP handling may be needed.
if (unlikely(is_link_local(dest))) {
/* Pause frames shouldn't be passed up by driver anyway */
if (skb->protocol == htons(ETH_P_PAUSE))
goto drop;
/* If STP is turned off, then forward */
if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0)
goto forward;
// Older bridge implementations had a branch here that processed STP frames. In the newer kernels (2.6; "new" is relative) STP is an upper-layer protocol, so the frame is passed up after bridge handling and processed there.
if (NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
NULL, br_handle_local_finish)) // updates the CAM table
return NULL; /* frame consumed by filter */
else
return skb;// br_handle_local_finish always returns 0, so returning skb lets the upper layers continue processing
}
forward:
switch (p->state) {
case BR_STATE_FORWARDING:
// Ask the broute table (through br_should_route_hook) whether the frame should be routed at layer 3 instead of bridged.
rhook = rcu_dereference(br_should_route_hook);
if (rhook != NULL) {
if (rhook(skb))
return skb;
dest = eth_hdr(skb)->h_dest;
}
/* NOTE: fall through */
case BR_STATE_LEARNING:
if (!compare_ether_addr(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
// Netfilter hook point.
NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
br_handle_frame_finish);
break;
default:
drop:
kfree_skb(skb);
}
return NULL;
}
// Second half of bridge input processing, run after the NF_BR_PRE_ROUTING hooks:
// learn the source address, then deliver the frame locally and/or forward it.
// Always returns 0.
int br_handle_frame_finish(struct sk_buff *skb)
{
const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
struct net_bridge *br;
struct net_bridge_fdb_entry *dst;
struct sk_buff *skb2;
// Drop the frame if the device is no longer a port or the port is in the DISABLED state.
if (!p || p->state == BR_STATE_DISABLED)
goto drop;
/* insert into forwarding database after filtering to avoid spoofing */
// Bridge the receiving port belongs to (there may be several bridges).
br = p->br;
// Update the port-MAC mapping table with the source address.
br_fdb_update(br, p, eth_hdr(skb)->h_source);
if (p->state == BR_STATE_LEARNING)
goto drop;
/* The packet skb2 goes to the local host (NULL to skip). */
skb2 = NULL;
if (br->dev->flags & IFF_PROMISC)
skb2 = skb;
dst = NULL;
if (is_multicast_ether_addr(dest)) {
br->dev->stats.multicast++;
skb2 = skb;
} else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
skb2 = skb;
/* Do not forward the packet since it's local. */
skb = NULL;
}
// When the same buffer must be both delivered locally and forwarded, the local copy is a clone.
if (skb2 == skb)
skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
br_pass_frame_up(br, skb2); // skb2 is non-NULL: pass the frame up the stack
if (skb) {
if (dst)
br_forward(dst->dst, skb);
else
br_flood_forward(br, skb); // multicast, or no entry in the port-MAC table: flood (send on every port)
}
out:
return 0;
drop:
kfree_skb(skb);
goto out;
}
/*
 * Forward skb out of port `to` if should_deliver() allows it; otherwise
 * free the buffer.
 */
void br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
	if (!should_deliver(to, skb)) {
		kfree_skb(skb);
		return;
	}

	__br_forward(to, skb);
}
static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
struct net_device *indev;
indev = skb->dev;
skb->dev = to->dev; //替换报文中的dev为转发端口对应的dev
skb->ip_summed = CHECKSUM_NONE;
// Netfilter
hook处理
NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
br_forward_finish);
}
// Flood a frame to the bridge's ports, using __br_forward for each transmission.
void br_flood_forward(struct net_bridge *br, struct sk_buff *skb)
{
br_flood(br, skb, __br_forward);
}
// Send skb out of every port accepted by should_deliver(), using __packet_hook
// for each transmission. Every port but the last gets a clone; the last one
// gets the original buffer. If no port qualifies, the buffer is freed.
static void br_flood(struct net_bridge *br, struct sk_buff *skb,
void (*__packet_hook)(const struct net_bridge_port *p,
struct sk_buff *skb))
{
struct net_bridge_port *p;
struct net_bridge_port *prev;
prev = NULL;
/* backup multicast address. by HouXB, 07Dec10 */
#ifdef CONFIG_TP_MULTICAST
#define IS_MULTICAST_ADDR(ptr) ((ptr[0] == 0x01) && (ptr[1] == 0x00) && (ptr[2] == 0x5e) ? 1 : 0)
mac_addr multi_mac_addr;
unsigned char *pmac = multi_mac_addr.addr;
memset(pmac, 0, 6/*ETH_ALEN*/);
if(IS_MULTICAST_ADDR(skb_mac_header(skb)))
{
//backup multicast address
memcpy(pmac, skb_mac_header(skb), 6/*ETH_ALEN*/);
}
#endif
// Walk all ports and send one copy out of each; should_deliver() excludes the port the frame came in on.
// Transmission lags one port behind (prev) so that the final port can take the original skb.
list_for_each_entry_rcu(p, &br->port_list, list) {
if (should_deliver(p, skb)) {
if (prev != NULL) {
struct sk_buff *skb2;
if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
br->dev->stats.tx_dropped++;
kfree_skb(skb);
return;
}
#ifdef CONFIG_TP_MULTICAST
if(IS_MULTICAST_ADDR(pmac))
{
//restore multicast address
memcpy(skb_mac_header(skb), pmac, 6/*ETH_ALEN*/);
}
#endif
__packet_hook(prev, skb2);
}
prev = p;
}
}
if (prev != NULL) {
#ifdef CONFIG_TP_MULTICAST
if(IS_MULTICAST_ADDR(pmac))
{
//restore multicast address
memcpy(skb_mac_header(skb), pmac, 6/*ETH_ALEN*/);
}
#endif
__packet_hook(prev, skb);
return;
}
kfree_skb(skb);
}
// Continuation of the NF_BR_FORWARD hook: run the NF_BR_POST_ROUTING hooks,
// which continue with br_dev_queue_push_xmit().
int br_forward_finish(struct sk_buff *skb)
{
// Netfilter hook: NF_BR_POST_ROUTING
return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
}
/*
 * Final step of bridge forwarding: queue skb on its egress device.
 *
 * Frames longer than the device MTU (unless GSO) and frames for which
 * nf_bridge_maybe_copy_header() fails are freed. Always returns 0.
 */
int br_dev_queue_push_xmit(struct sk_buff *skb)
{
	/* drop mtu oversized packets except gso */
	if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) {
		kfree_skb(skb);
		return 0;
	}

	/* ip_refrag calls ip_fragment, doesn't copy the MAC header. */
	if (nf_bridge_maybe_copy_header(skb)) {
		kfree_skb(skb);
		return 0;
	}

	skb_push(skb, ETH_HLEN);
	dev_queue_xmit(skb);	/* hand the frame to the driver */
	return 0;
}
/*
 * Deliver a frame to the local host through the bridge device: account it
 * in the bridge's rx statistics, make the bridge device the receiving
 * device and re-inject the frame via the NF_BR_LOCAL_IN hooks into
 * netif_receive_skb().
 */
static void br_pass_frame_up(struct net_bridge *br, struct sk_buff *skb)
{
	struct net_device *bridge_dev = br->dev;
	struct net_device *ingress_dev = skb->dev;

	/* Statistics. */
	bridge_dev->stats.rx_packets++;
	bridge_dev->stats.rx_bytes += skb->len;

	/* Important: from here on the frame appears to come from the bridge device. */
	skb->dev = bridge_dev;

	/* Netfilter hook: NF_BR_LOCAL_IN */
	NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, ingress_dev, NULL,
		netif_receive_skb);
}
这段代码非常简单,对net_bridge的数据统计进行更新以后,强制将skb的dev修改为网桥的dev,最后通过NF_HOOK在NF_BR_LOCAL_IN挂接点上调用回了netif_receive_skb方法。
在netif_receive_skb函数中,调用了handle_bridge函数,重新触发了网桥处理流程,现在发往网桥虚拟设备的数据包又回到了netif_receive_skb,那么网桥的处理过程会不会又被调用呢?答案是否定的。回顾网桥入口函数handle_bridge方法,判断是否执行网桥处理流程的判断:
// 如果数据包是环回包,或者数据包的产生设备不属于任何网桥,则不进行网桥处理
if (skb->pkt_type == PACKET_LOOPBACK ||
(port = rcu_dereference(skb->dev->br_port)) == NULL)
return skb;
见上文程序段,br_pass_frame_up函数将skb->dev赋成了br->dev,实际上skb->dev变成了网桥建立的虚拟设备;这个设备是网桥本身而不是桥组的某一端口(它不属于任何网桥设备,因为前面提到过网桥不能添加一个网桥设备做端口),故而在进行网桥处理判断时,不能进入网桥处理流程 ,从而进入上层协议栈处理。
进入桥的数据报文分为几个类型,桥对应的处理方法也不同:
1、 报文是本机发送给自己的,桥不处理,交给上层协议栈;
2、 接收报文的物理接口不是网桥接口,桥不处理,交给上层协议栈;
3、 进入网桥后,如果接收端口的状态为Disabled,则将包丢弃不处理;
4、 报文源地址无效(广播,多播,以及00:00:00:00:00:00),丢包;
5、 如果是STP的BPDU包,交给上层协议栈;
6、 如果是发给本机的报文,桥直接返回,交给上层协议栈,不转发;
7、 需要转发的报文分三种情况:
1) 广播或多播,则除接收端口外的所有端口都需要转发一份;
2) 单播并且在端口-MAC映射表中能找到端口映射的,只需要往映射端口转发一份即可;
3) 单播但找不到端口映射的,则除了接收端口外其余端口都需要转发。
最后,再回顾一下网桥数据处理主要函数关系图:
图 网桥处理流程示意图
众所周知,网桥需要维护一个MAC地址-端口映射表,端口是指网桥自身提供的端口,而MAC地址是指与端口相连的另一端主机的MAC地址。当网桥收到一个报文时,先获取它的源MAC,更新数据库,然后读取该报文的目标MAC地址,查找该数据库,如果找到,根据找到条目的端口进行转发;否则会把数据包向除入口端口以外的所有端口转发。
通常网桥端口-MAC映射表又被称为网桥转发数据库或CAM,为了简化叙述,统一使用数据库代替。
数据库使用kmem_cache_create函数进行创建,使用kmem_cache_destroy进行销毁。回顾网桥初始化时,会调用br_fdb_init进行数据库初始化:
/*
 * Create the slab cache for forwarding-database entries and seed the hash
 * salt. Returns 0 on success, -ENOMEM when the cache cannot be created.
 */
int __init br_fdb_init(void)
{
	br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
					 sizeof(struct net_bridge_fdb_entry),
					 0,
					 SLAB_HWCACHE_ALIGN, NULL);
	if (br_fdb_cache) {
		get_random_bytes(&fdb_salt, sizeof(fdb_salt));
		return 0;
	}

	return -ENOMEM;
}
销毁:
// Destroy the slab cache created by br_fdb_init().
void br_fdb_fini(void)
{
kmem_cache_destroy(br_fdb_cache);
}
当网桥收到一个数据包时,它会获取该数据的源MAC地址,然后对数据库进行更新。如果该MAC地址不在数据库中,则创建一个新表项。如果存在,更新它的过期时间。数据库使用hash表的结构方式,便于高效查询。数据库更新函数:
// Learn (or refresh) the mapping "addr was seen on port source" in the
// forwarding database of bridge br.
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr)
{
// Hash the MAC address to find the chain it belongs to.
struct hlist_head *head = &br->hash[br_mac_hash(addr)];
struct net_bridge_fdb_entry *fdb;
/* some users want to always flood. */
if (hold_time(br) == 0)
return;
/* ignore packets unless we are using this port */
if (!(source->state == BR_STATE_LEARNING ||
source->state == BR_STATE_FORWARDING))
return;
// This first lookup is done without taking hash_lock.
fdb = fdb_find(head, addr);
if (likely(fdb)) { // the MAC address is already in the database: refresh the entry
/* attempt to update an entry for a local interface */
if (unlikely(fdb->is_local)) {
if (net_ratelimit())
printk(KERN_WARNING "%s: received packet with "
"own address as source address\n",
source->dev->name);
} else {
/* fastpath: update of existing entry */
fdb->dst = source;
fdb->ageing_timer = jiffies;
}
} else { // the MAC address is not in the database yet: create a new entry
spin_lock(&br->hash_lock);
// Look again under the lock, in case the entry was inserted in the meantime.
if (!fdb_find(head, addr))
fdb_create(head, source, addr, 0); // create the entry
/* else we lose race and someone else inserts
* it first, don't bother updating
*/
spin_unlock(&br->hash_lock);
}
}
见上文程序段,在更新表项的函数里,已经为MAC地址算出其所属的Hash链表,因此,创建函数只需要在该链上添加一个数据项即可:
/*
 * Allocate a forwarding-database entry for `addr` on port `source` and add
 * it to the hash chain `head`.
 *
 * @is_local: non-zero for an address owned by the local machine; the entry
 *            is then also marked static.
 *
 * Returns the new entry, or NULL when the (atomic) allocation fails.
 */
static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
					       struct net_bridge_port *source,
					       const unsigned char *addr,
					       int is_local)
{
	struct net_bridge_fdb_entry *fdb;

	fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
	if (fdb) {
		memcpy(fdb->addr.addr, addr, ETH_ALEN);
		fdb->dst = source;
		fdb->is_local = is_local;
		fdb->is_static = is_local;
		fdb->ageing_timer = jiffies;
		/*
		 * Publish last: fdb_find() and __br_fdb_get() walk the chain
		 * without holding hash_lock, so the entry must be fully
		 * initialised before it becomes reachable.
		 */
		hlist_add_head_rcu(&fdb->hlist, head);
	}
	return fdb;
}
网桥的数据项查找与一般的查找类似,但略有不同。前面提到,如果要更新一MAC地址,不管该地址是否已经过期了,只需遍历该MAC地址对应的Hash链表,然后更新年龄,此时它肯定不过期了。但网桥要转发数据时,除了要找到该目标MAC的出口端口外,还要判断该记录是否过期了。因此,数据项的查找有两种,一种用于更新,另一用于转发:
/*
 * Search hash chain `head` for an entry whose MAC address equals `addr`.
 * Expiry is not checked. Returns the entry, or NULL if there is none.
 */
static inline struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
						    const unsigned char *addr)
{
	struct net_bridge_fdb_entry *entry;
	struct hlist_node *pos;

	hlist_for_each_entry_rcu(entry, pos, head, hlist) {
		/* compare_ether_addr() returns 0 when the addresses match. */
		if (compare_ether_addr(entry->addr.addr, addr) == 0)
			return entry;
	}

	return NULL;
}
/* No locking or refcounting, assumes caller has no preempt (rcu_read_lock) */
/*
 * Look up `addr` in the forwarding database of `br` for forwarding
 * purposes: an entry that has expired is treated as missing.
 *
 * No locking or refcounting, assumes caller has no preempt (rcu_read_lock).
 */
struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
					  const unsigned char *addr)
{
	struct net_bridge_fdb_entry *entry;
	struct hlist_node *pos;

	hlist_for_each_entry_rcu(entry, pos, &br->hash[br_mac_hash(addr)], hlist) {
		if (compare_ether_addr(entry->addr.addr, addr) != 0)
			continue;

		/* First address match decides: expired entries count as "not found". */
		if (unlikely(has_expired(br, entry)))
			return NULL;
		return entry;
	}

	return NULL;
}
之前我们专门讲过Linux的Netfilter框架(虽然当时主要是针对IP层),所以这里就不再详细讲解网桥Netfilter的过程。在网桥处理逻辑中,我们已经看到了各个hook点的调用关系,和IP的Netfilter是一致的。关于这部分内容,这里就不重复讲述;这里要讲的,是与IP的Netfilter不同的一些东西。
直接上代码:
/*
 * Register the bridge netfilter hooks and, with CONFIG_SYSCTL, the
 * bridge-nf sysctl table.
 *
 * Returns 0 on success, the error from nf_register_hooks(), or -ENOMEM
 * when the sysctl table cannot be registered (the hooks are removed again
 * in that case).
 */
int __init br_netfilter_init(void)
{
	int err;

	/* Register the hook operations. */
	err = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
	if (err < 0)
		return err;

#ifdef CONFIG_SYSCTL
	brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table);
	if (!brnf_sysctl_header) {
		printk(KERN_WARNING
		       "br_netfilter: can't register to sysctl.\n");
		nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
		return -ENOMEM;
	}
#endif

	printk(KERN_NOTICE "Bridge firewalling registered\n");
	return 0;
}
// Hook operations registered by br_netfilter_init(). The first six attach to
// the PF_BRIDGE hook points; the last two attach ip_sabotage_in to the
// IPv4/IPv6 PRE_ROUTING hook points.
static struct nf_hook_ops br_nf_ops[] __read_mostly = {
{ .hook = br_nf_pre_routing,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_BRNF, }, // priority 0
{ .hook = br_nf_local_in,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_LOCAL_IN,
.priority = NF_BR_PRI_BRNF, },
{ .hook = br_nf_forward_ip,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_BRNF - 1, }, // priority -1, so the IP hook runs before the ARP hook
{ .hook = br_nf_forward_arp,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_BRNF, },
{ .hook = br_nf_local_out,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_FIRST, },
{ .hook = br_nf_post_routing,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_POST_ROUTING,
.priority = NF_BR_PRI_LAST, },
{ .hook = ip_sabotage_in,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_FIRST, },
{ .hook = ip_sabotage_in,
.owner = THIS_MODULE,
.pf = PF_INET6,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP6_PRI_FIRST, },
};
回想IP的Netfilter,每个nf_hook_ops都属于某个特定的表。但Bridge下的Netfilter,在Netfilter初始化的时候,注册了一系列nf_hook_ops,它们不属于任何表,优先级大多为NF_BR_PRI_BRNF(即0,个别为NF_BR_PRI_BRNF - 1、NF_BR_PRI_FIRST或NF_BR_PRI_LAST),默认都会被执行。这些hook函数不执行具体的匹配规则,但是会做一些特殊的处理,如调用IP层的hook。这部分功能将在后文讲述Bridge与IP联动的时候讲述。
在网桥的Netfilter下,内建了三张表:broute、nat和filter。其中broute主要用于判断某数据包是否应该进入网络层进行处理(跳过网桥处理)。与传统Netfilter下的表注册不一样,broute注册没有注册nf_hook_ops,所以不能通过NF_HOOK()调用;相反其调用方式是直接通过在适当的位置调用其表执行函数。
broute表
// Initial (empty) BROUTING chain; its policy is EBT_ACCEPT.
static struct ebt_entries initial_chain = {
.name = "BROUTING",
.policy = EBT_ACCEPT,
};
// Initial contents of the "broute" table: just the chain above.
static struct ebt_replace_kernel initial_table =
{
.name = "broute",
.valid_hooks = 1 << NF_BR_BROUTING,
.entries_size = sizeof(struct ebt_entries),
.hook_entry = {
[NF_BR_BROUTING] = &initial_chain,
},
.entries = (char *)&initial_chain,
};
// The "broute" ebtables table.
static struct ebt_table broute_table =
{
.name = "broute",
.table = &initial_table,
.valid_hooks = 1 << NF_BR_BROUTING, // not one of the regular hook points: a pseudo hook point defined specifically for the broute table
.check = check,
.me = THIS_MODULE,
};
/*
 * Module init of the broute table: register the per-namespace operations
 * and publish ebt_broute as br_should_route_hook, which br_handle_frame()
 * consults for ports in the forwarding state.
 *
 * Returns 0 on success or the negative error from register_pernet_subsys().
 */
static int __init ebtable_broute_init(void)
{
	int err = register_pernet_subsys(&broute_net_ops);

	if (err >= 0) {
		/* see br_input.c */
		rcu_assign_pointer(br_should_route_hook, ebt_broute);
		err = 0;
	}

	return err;
}
/*
 * Run the frame through the broute table of its namespace.
 * Returns 1 when the verdict is NF_DROP (route the frame), 0 otherwise
 * (bridge it).
 */
static int ebt_broute(struct sk_buff *skb)
{
	int verdict = ebt_do_table(NF_BR_BROUTING, skb, skb->dev, NULL,
				   dev_net(skb->dev)->xt.broute_table);

	/* NF_DROP: route it; anything else: bridge it */
	return verdict == NF_DROP ? 1 : 0;
}
Nat表
// Hook operations of the ebtables "nat" table (registered by ebtable_nat_init()).
static struct nf_hook_ops ebt_ops_nat[] __read_mostly = {
{
.hook = ebt_nat_out,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_NAT_DST_OTHER, // 100
},
{
.hook = ebt_nat_out,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_POST_ROUTING,
.priority = NF_BR_PRI_NAT_SRC, // 300
},
{
.hook = ebt_nat_in,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_NAT_DST_BRIDGED, // -300
},
};
// The ebtables "nat" table.
// NOTE(review): initial_table here is presumably the nat module's own initial table, not the broute one quoted earlier in this article — confirm against the source file.
static struct ebt_table frame_nat =
{
.name = "nat",
.table = &initial_table,
.valid_hooks = NAT_VALID_HOOKS,
.check = check,
.me = THIS_MODULE,
};
/*
 * Module init of the nat table: register the per-namespace operations,
 * then the netfilter hooks. If hook registration fails the per-namespace
 * registration is rolled back. Returns the result of the last step run.
 */
static int __init ebtable_nat_init(void)
{
	int err;

	err = register_pernet_subsys(&frame_nat_net_ops);
	if (err >= 0) {
		err = nf_register_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat));
		if (err < 0)
			unregister_pernet_subsys(&frame_nat_net_ops);
	}

	return err;
}
Nat表的注册和之前讲过的Iptables相关表注册是一致的,都是初始化表结构,初始化nf_hook_ops,然后分别注册。值得注意的是,在注册nf_hook_ops的时候,各个nf_hook_ops的优先级是不一样的。优先级定义:
// Priorities of the bridge netfilter hooks. nf_iterate() skips hooks whose
// priority is below the caller's threshold.
enum nf_br_hook_priorities {
NF_BR_PRI_FIRST = INT_MIN,
NF_BR_PRI_NAT_DST_BRIDGED = -300,
NF_BR_PRI_FILTER_BRIDGED = -200,
NF_BR_PRI_BRNF = 0,
NF_BR_PRI_NAT_DST_OTHER = 100,
NF_BR_PRI_FILTER_OTHER = 200,
NF_BR_PRI_NAT_SRC = 300,
NF_BR_PRI_LAST = INT_MAX,
};
filter表
和nat表类似,不赘述。
Bridge和IP在透明防火墙中是需要联动的,因为IP层可以做更多的事情。虽然,这些事情也是可以在Bridge中实现的,但是模块化及KISS原则将Bridge从这些复杂的事情中分割出来,仅做它自己该处理的事情;如果需要IP层帮助,则直接调用IP层的hook即可。
下面,我们通过“read the fucking source code”,了解Bridge与IP层到底是如何联动的。
/*
 * Run the hooks registered at (pf, hook) with the lowest possible
 * threshold, INT_MIN, so that no registered hook is skipped.
 */
#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \
	NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, INT_MIN)

/*
 * Run the hooks registered at (pf, hook) whose priority is not below
 * `thresh`. When nf_hook_thresh() returns 1 (NF_ACCEPT == 1) the
 * continuation okfn(skb) is called and its result becomes the value of the
 * expression; otherwise the value is the return value of nf_hook_thresh().
 *
 * Comments must stay outside the macro body: a `//` comment in front of a
 * continuation backslash swallows the following line, and anything after
 * the backslash ends the macro early.
 */
#define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh)	       \
({int __ret;								       \
if ((__ret=nf_hook_thresh(pf, hook, (skb), indev, outdev, okfn, thresh, 1)) == 1)\
	__ret = (okfn)(skb);						       \
__ret;})
// Walk the hook list `head`, continuing after position *i, and call every
// hook whose priority is not below hook_thresh. Returns the first verdict
// that is neither NF_ACCEPT nor NF_REPEAT, or NF_ACCEPT when the end of the
// list is reached.
unsigned int nf_iterate(struct list_head *head,
struct sk_buff *skb,
unsigned int hook,
const struct net_device *indev,
const struct net_device *outdev,
struct list_head **i,
int (*okfn)(struct sk_buff *),
int hook_thresh)
{
unsigned int verdict;
/*
* The caller must not block between calls to this
* function because of risk of continuing from deleted element.
*/
list_for_each_continue_rcu(*i, head) { // iterate over all nf_hook_ops
struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
// Skip this ops if the requested threshold is higher than the priority it was registered with.
if (hook_thresh > elem->priority)
continue;
/* Optimization: we don't need to hold module
reference here, since function can't sleep. --RR */
verdict = elem->hook(hook, skb, indev, outdev, okfn); // run the hook function
if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
if (unlikely((verdict & NF_VERDICT_MASK)
> NF_MAX_VERDICT)) {
NFDEBUG("Evil return from %p(%u).\n",
elem->hook, hook);
continue;
}
#endif
if (verdict != NF_REPEAT)
return verdict;
// NF_REPEAT: step back one element so the loop runs the same hook again.
*i = (*i)->prev;
}
}
return NF_ACCEPT;
}
通过上述分析,如果通过NF_HOOK()进入,则在该hook点注册的所有nf_hook_ops都会被执行,毕竟NF_HOOK指定的thresh值是最小整数。相反,如果要控制thresh的值,来过滤一部分nf_hook_ops,则需要显式调用NF_HOOK_THRESH(),并指定thresh的值。
进一步分析NF_BR_PRE_ROUTING这个hook点的数据流情况。回顾5.1节和5.2节,网桥在NF_BR_PRE_ROUTING点上,注册了两个nf_hook_ops:
一个是默认的hook处理:
{ .hook = br_nf_pre_routing,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_BRNF, }, // 优先级为0
另一个是nat表注册的:
{
.hook = ebt_nat_in,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_NAT_DST_BRIDGED, //优先级 -300
},
其中,ebt_nat_in是传统Netfilter表处理hook回调函数,通过调用ebt_do_table遍历表规则,对数据处理:
// Hook function of the nat table for incoming frames: runs the frame through
// the frame_nat table of the input device's network namespace and returns
// the verdict of ebt_do_table().
static unsigned int
ebt_nat_in(unsigned int hook, struct sk_buff *skb, const struct net_device *in
, const struct net_device *out, int (*okfn)(struct sk_buff *))
{
return ebt_do_table(hook, skb, in, out, dev_net(in)->xt.frame_nat);
}
但是,br_nf_pre_routing所做的事情却有所不同:
// NF_BR_PRE_ROUTING hook of the bridge. IPv6 frames are handed to
// br_nf_pre_routing_ipv6(); valid IPv4 frames are passed through the
// PF_INET/NF_INET_PRE_ROUTING hooks (continuing in br_nf_pre_routing_finish)
// and NF_STOLEN is returned; other protocols are accepted unchanged.
// Malformed IPv4 headers and allocation failures yield NF_DROP.
static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct iphdr *iph;
__u32 len = nf_bridge_encap_header_len(skb);
if (unlikely(!pskb_may_pull(skb, len)))
goto out;
if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) ||
IS_PPPOE_IPV6(skb)) { // IPv6 traffic is handed to the IPv6 hook
#ifdef CONFIG_SYSCTL
if (!brnf_call_ip6tables)
return NF_ACCEPT;
#endif
nf_bridge_pull_encap_header_rcsum(skb);
return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn);
}
#ifdef CONFIG_SYSCTL
if (!brnf_call_iptables)
return NF_ACCEPT;
#endif
if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb) &&
!IS_PPPOE_IP(skb)) // not IP traffic: accept and leave it to later rules or the upper layers
return NF_ACCEPT;
nf_bridge_pull_encap_header_rcsum(skb);
// Sanity-check the IPv4 header before the IP hooks see the packet.
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
iph = ip_hdr(skb);
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
if (!pskb_may_pull(skb, 4 * iph->ihl))
goto inhdr_error;
iph = ip_hdr(skb);
if (ip_fast_csum((__u8 *) iph, iph->ihl) != 0)
goto inhdr_error;
len = ntohs(iph->tot_len);
if (skb->len < len || len < 4 * iph->ihl)
goto inhdr_error;
pskb_trim_rcsum(skb, len);
nf_bridge_put(skb->nf_bridge);
if (!nf_bridge_alloc(skb))
return NF_DROP;
if (!setup_pre_routing(skb))
return NF_DROP;
store_orig_dstaddr(skb);
// Call the IP layer's PRE_ROUTING hooks; the skb now belongs to that path, hence NF_STOLEN.
NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
br_nf_pre_routing_finish);
return NF_STOLEN;
inhdr_error:
// IP_INC_STATS_BH(IpInHdrErrors);
out:
return NF_DROP;
}
/*
 * br_nf_pre_routing_finish - continuation run after the IPv4 PRE_ROUTING
 * hooks have processed a bridged packet.
 *
 * Restores the packet type saved in the nf_bridge mask, then decides where
 * the packet goes:
 *  - if DNAT changed the destination, an input route lookup is attempted;
 *    on success the packet is either sent on through
 *    br_nf_pre_routing_finish_bridge (route points back at the bridge
 *    device) or re-addressed to the bridge device's own MAC and marked
 *    PACKET_HOST; on failure it may still be bridged if an output route
 *    lookup points at the bridge device, otherwise it is freed;
 *  - without DNAT, the bridge parent's rtable is attached as dst.
 * Finally the packet re-enters the bridge PRE_ROUTING hook chain with a
 * threshold of 1, continuing with br_handle_frame_finish.
 *
 * Always returns 0; the skb is either passed on or freed.
 */
static int br_nf_pre_routing_finish(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct iphdr *iph = ip_hdr(skb);
struct nf_bridge_info *nf_bridge = skb->nf_bridge;
struct rtable *rt;
int err;
/* undo the temporary packet-type change recorded in the mask */
if (nf_bridge->mask & BRNF_PKT_TYPE) {
skb->pkt_type = PACKET_OTHERHOST;
nf_bridge->mask ^= BRNF_PKT_TYPE;
}
nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
if (dnat_took_place(skb)) { // DNAT was applied: let the IP layer route the packet
if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
struct flowi fl = {
.nl_u = {
.ip4_u = {
.daddr = iph->daddr,
.saddr = 0,
.tos = RT_TOS(iph->tos) },
},
.proto = 0,
};
/* NOTE(review): no matching in_dev_put() is visible in this function;
 * confirm whether in_dev_get() takes a reference that must be released. */
struct in_device *in_dev = in_dev_get(dev);
/* If err equals -EHOSTUNREACH the error is due to a
* martian destination or due to the fact that
* forwarding is disabled. For most martian packets,
* ip_route_output_key() will fail. It won't fail for 2 types of
* martian destinations: loopback destinations and destination
* 0.0.0.0. In both cases the packet will be dropped because the
* destination is the loopback device and not the bridge. */
if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
goto free_skb;
if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
/* - Bridged-and-DNAT'ed traffic doesn't
* require ip_forwarding. */
if (((struct dst_entry *)rt)->dev == dev) {
skb_dst_set(skb, (struct dst_entry *)rt);
goto bridged_dnat;
}
/* we are sure that forwarding is disabled, so printing
* this message is no problem. Note that the packet could
* still have a martian destination address, in which case
* the packet could be dropped even if forwarding were enabled */
__br_dnat_complain();
dst_release((struct dst_entry *)rt);
}
free_skb:
kfree_skb(skb);
return 0;
} else {
if (skb_dst(skb)->dev == dev) {
bridged_dnat:
/* Tell br_nf_local_out this is a
* bridged frame */
nf_bridge->mask |= BRNF_BRIDGED_DNAT;
skb->dev = nf_bridge->physindev;
nf_bridge_push_encap_header(skb);
NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING,
skb, skb->dev, NULL,
br_nf_pre_routing_finish_bridge,
1);
return 0;
}
/* routed to a different device: address the frame to the bridge device itself */
memcpy(eth_hdr(skb)->h_dest, dev->dev_addr, ETH_ALEN);
skb->pkt_type = PACKET_HOST;
}
} else {
/* no DNAT: attach the bridge parent's rtable as the packet's dst */
rt = bridge_parent_rtable(nf_bridge->physindev);
if (!rt) {
kfree_skb(skb);
return 0;
}
dst_hold(&rt->u.dst);
skb_dst_set(skb, &rt->u.dst);
}
/* restore the physical input device and encapsulation header, then resume
 * the bridge PRE_ROUTING hooks from threshold 1 */
skb->dev = nf_bridge->physindev;
nf_bridge_push_encap_header(skb);
NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
br_handle_frame_finish, 1);
return 0;
}
由5.3.1节的分析知,NF_HOOK_THRESH()中thresh设置为1,将从优先级为1的nf_hook_ops执行;整个过程相当于,在Bridge prerouting hook点上,先执行完优先级小于0的hook操作,然后转入IP层执行所有IP prerouting hook点上的hook操作,最后(依据IP层执行)再转回Bridge prerouting hook点,从优先级为1处继续执行:
图 Bridge 与IP联动示意图
事实上,Bridge与IP的联动过程比较复杂,本文只是示例了其核心的机制,很多细节的控制并没有一一说明。关于Bridge与IP的联动过程中,更多实现细节留给读者自行分析吧。
关于其它网桥hook点上,Bridge与IP的联动本文将略去不再讲述,因为其原理和方法大致是一致的,只是不同的处理细节不一致。关于Bridge与IP的联动全貌,可以参考帖子:
针对我司Linux平台的实现,Bridge与IP的联动关系如下图:
图 Linux平台Bridge 与IP联动
生成树协议STP(Spanning Tree Protocol)的主要功能有两个:一是在利用生成树算法、在以太网络中,创建一个以某台交换机的某个端口为根的生成树,避免环路。二是在以太网络拓扑发生变化时,通过生成树协议达到收敛保护的目的。
STP :生成树算法。
BPDU:STP的数据单元,在网桥局域网内传递信息。
TCN:拓扑改变通知BPDU。
根网桥:具有最小网桥ID的网桥被选作根网桥,网桥ID应为唯一的。
根端口:在指定网桥上面,到根网桥路径花费最小的端口为根端口,如果指定网桥上面有几个端口,到根网桥路径花费一样小,那么选择端口id 最小的端口为根端口。
指定网桥:局域网通过所连的网桥,接收和发送数据帧,如果局域网有且只有一个网桥相连,那么这个网桥必定是指定网桥,如果有多个网桥跟这个局域网相连,那么到根网桥路径花费最少的那个网桥为指定网桥,如果,有几个网桥到到根网桥路径花费一样,那么比较网桥id,id最小的被选作为指定网桥。
指定端口:指定网桥上面和局域网相连的端口叫做指定端口,如果指定网桥上面有几个端口,同时和局域网相连,那么选择端口id 最小的端口为所在局域网的指定端口。
根路径花费:当端口为根端口时候,通过这个端口的路径花费。 对于这个网桥来说,路径费用是到根网桥的费用之和。
指定花费:当端口为所在局域网的指定端口时候,即为根路径费用,当不为指定端口时候,是所在局域网指定端口到根网桥的费用。
BR_STATE_DISABLED(0):禁用状态,不参与生成树,不转发任何数据帧。
BR_STATE_LISTENING(1): 监听状态,能够决定根,可以选择根端口、指定端口和非指定端口。在监听状态的过程中,端口不能学习任何接收帧的单播地址。
BR_STATE_LEARNING (2): 学习状态,端口能学习流入帧的MAC地址,不能转发帧。
BR_STATE_FORWARDING(3): 转发状态,接口能够转发帧。端口学习到接收帧的源 MAC地址,并可根据目标MAC地址进行恰当地转发。
BR_STATE_BLOCKING(4):阻塞状态,不参与帧转发,只监听流入的BPDU,不能学习接收帧的任何MAC地址。
运行生成树算法(STA)的网桥定期发送BPDU;选取唯一一个根网桥;在每个非根网桥选取唯一一个根端口;在每网段选取唯一一个标志端口。
(1) 选取唯一一个根网桥:BPDU中包含Bridge ID;Bridge ID(8B)=优先级(2B)+交换机MAC地址(6B);一些交换机的优先级默认为32768,可以修改;优先级值最小的成为根网桥;优先级值相同,MAC地址最小的成为根网桥;即Bridge ID值最小的成为根网桥;根网桥缺省每2秒发送一次BPDU。
(2) 在每个非根网桥选取唯一一个根端口:根网桥上没有根端口;端口代价最小的成为根端口;端口代价相同,Port ID最小的端口成为根端口;Port ID通常为端口的MAC地址;MAC地址最小的端口成为根端口。
(3) 在每网段选取唯一一个标志端口:端口代价最小的成为标志端口;根网桥端口到各网段的代价最小;通常只有根网桥端口成为标志端口;被选定为根端口和标志端口的端口进入转发状态;落选端口进入阻塞状态,只侦听BPDU。
(4) 阻塞端口在指定的时间间隔(缺省20秒)收不到BPDU时,会重新运行生成树算法进行选举;缺点:在运行生成树算法的过程中,网络处于阻断状态,所有端口都不进行转发。计算过程缺省为50秒。
当网桥加电时,网桥将认为它就是根网桥,并且将过渡到监听状态。一般情况下,当网桥认识到网络拓扑发生变更时,将出现两种过渡状态:在拓扑变更的过程中,端口需要根据转发延迟计时器的数值而临时性地进入监听和学习状态。
当端口处于监听状态时,它将利用发送和接收BPDU来确定活跃(active)的拓扑;当网络拓扑处于过渡期的时候,将不传递任何用户数据;在监听状态的过程中,网桥将处理它所接收的BPDU;对于作为指定端口或根端口的端口,它们将在15秒(转发延迟的默认值)之后过渡到学习状态;对于不是指定端口或根端口的端口,它们将过渡返回到阻塞状态。
当端口处于学习状态时,将利用从端口所学到的MAC地址来组建自己的MAC地址表;不能转发用户数据帧;在这个时刻,网桥不能传递任何用户数据。
当端口处于数据转发的时,学习状态能够降低所需扩散的数据帧的数量;如果某个端口在学习状态结束的时候仍然是指定端口或根端口,那么该端口就将过渡到转发状态;对于不是指定端口 或根端口的端口,它们将过渡返回到阻塞状态;在转发状态中,端口能够发送和接收用户数据;端口从阻塞状态过渡到转发状态的正常时间是30~50秒。
注:如果端口所连接的对象是主机,那么因为在这些链路上的转发不会造成STP环路,所以这些端口也就不需要参与STP监听和学习的过程。
在早期的版本中,网桥的STP数据包是在网桥处理过程中,按照特定的组播地址进行识别,然后在网桥处理过程中完成相应的数据处理。后来的实现中,将其实现为一种单独的协议,并最终由IEEE802.2协议进行封包传递。
在网桥初始化的时候,为STP注册了协议br_stp_proto,并指定其接收数据的函数为br_stp_rcv:
/* STP protocol descriptor used by the bridge; its receive callback is br_stp_rcv. */
static const struct stp_proto br_stp_proto = {
.rcv = br_stp_rcv, // receive handler
};
br_init()方法中,注册stp协议:
/* Register the bridge's STP protocol descriptor; err receives the result code. */
err = stp_proto_register(&br_stp_proto);
/*
 * stp_proto_register - register an STP/GARP protocol handler.
 * @proto: protocol descriptor; its group_address selects the slot.
 *
 * The first registration opens the LLC SAP LLC_SAP_BSPAN (0x42, Bridge
 * Spanning Tree Proto) with stp_pdu_rcv as the LLC-level receive function.
 * A descriptor with an all-zero group address becomes the default stp_proto;
 * otherwise it is stored in garp_protos[], indexed by byte 5 of the group
 * address relative to GARP_ADDR_MIN.
 *
 * Returns 0 on success or -ENOMEM if the SAP could not be opened.
 */
int stp_proto_register(const struct stp_proto *proto)
{
	int err = 0;

	mutex_lock(&stp_proto_mutex);
	if (sap_registered++ == 0) {
		/* first user: hook the STP receive function into LLC */
		sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv);
		if (!sap) {
			/*
			 * Roll the user count back. Otherwise the next caller
			 * would see a non-zero count, skip llc_sap_open() and
			 * register a handler while sap is still NULL.
			 */
			sap_registered--;
			err = -ENOMEM;
			goto out;
		}
	}
	if (is_zero_ether_addr(proto->group_address))
		rcu_assign_pointer(stp_proto, proto);
	else
		/* slot is chosen by byte 5 of the multicast group address */
		rcu_assign_pointer(garp_protos[proto->group_address[5] -
					       GARP_ADDR_MIN], proto);
out:
	mutex_unlock(&stp_proto_mutex);
	return err;
}
/*
 * llc_sap_open - create an llc_sap and add it to the LLC SAP list.
 * @lsap: SAP number identifying the upper-layer protocol.
 * @func: receive entry point of the upper-layer protocol.
 *
 * Runs under llc_sap_list_lock (write side, bottom halves disabled).
 * Returns the new SAP, or NULL if a SAP with this number already exists
 * or the allocation fails.
 */
struct llc_sap *llc_sap_open(unsigned char lsap,
			     int (*func)(struct sk_buff *skb,
					 struct net_device *dev,
					 struct packet_type *pt,
					 struct net_device *orig_dev))
{
	struct llc_sap *sap = NULL;

	write_lock_bh(&llc_sap_list_lock);
	if (__llc_sap_find(lsap)) /* SAP already exists */
		goto out;
	sap = llc_sap_alloc();
	if (!sap)
		goto out;
	sap->laddr.lsap = lsap;	/* upper-layer protocol identifier */
	sap->rcv_func = func;	/* upper-layer protocol entry point */
	llc_add_sap(sap);
out:
	write_unlock_bh(&llc_sap_list_lock);
	return sap;
}
既然STP协议是附在LLC之上的,那么还得从LLC的接收说起。LLC在初始化的时候,注册了其数据接收函数为llc_rcv():
/*
 * llc_rcv - LLC receive entry point (excerpt; the lines marked "……" were
 * elided in this listing, including the handle_station, drop and drop_put
 * labels).
 *
 * Looks up the SAP matching the PDU's DSAP. If that SAP has a receive
 * function registered, it is called with a clone of the skb. The original
 * skb is then dispatched to the handler for its LLC PDU type.
 */
int llc_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct llc_sap *sap;
struct llc_pdu_sn *pdu;
int dest;
int (*rcv)(struct sk_buff *, struct net_device *,
struct packet_type *, struct net_device *);
……
// fetch the LLC PDU header
pdu = llc_pdu_sn_hdr(skb);
if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */
goto handle_station;
sap = llc_sap_find(pdu->dsap); // look up the owning upper-layer protocol
if (unlikely(!sap)) {/* unknown SAP */
dprintk("%s: llc_sap_find(%02X) failed!\n", __func__,
pdu->dsap);
goto drop;
}
/*
* First the upper layer protocols that don't need the full
* LLC functionality
*/
rcv = rcu_dereference(sap->rcv_func);
if (rcv) {
/* the upper layer gets its own clone; skipped silently if cloning fails */
struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC);
if (cskb)
rcv(cskb, dev, pt, orig_dev); // invoke the upper-layer receive function
}
dest = llc_pdu_type(skb);
if (unlikely(!dest || !llc_type_handlers[dest - 1]))
goto drop_put;
llc_type_handlers[dest - 1](sap, skb);
……
}
在STP协议注册过程中,指定了LLC到STP的入口函数是stp_pdu_rcv:
/*
 * stp_pdu_rcv - LLC-to-STP entry point (excerpt; the error path marked "……"
 * was elided in this listing).
 *
 * Accepts only PDUs whose SSAP and DSAP are LLC_SAP_BSPAN and whose control
 * field is LLC_PDU_TYPE_U. The handler is taken from garp_protos[] when
 * byte 5 of the destination MAC lies in [GARP_ADDR_MIN, GARP_ADDR_MAX],
 * otherwise from the default stp_proto. The handler's rcv callback is then
 * invoked. Returns 0 after dispatching.
 */
static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
const struct ethhdr *eh = eth_hdr(skb);
const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
const struct stp_proto *proto;
if (pdu->ssap != LLC_SAP_BSPAN ||
pdu->dsap != LLC_SAP_BSPAN ||
pdu->ctrl_1 != LLC_PDU_TYPE_U)
goto err;
if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) {
// look up the proto registered for this group address slot
proto = rcu_dereference(garp_protos[eh->h_dest[5] -
GARP_ADDR_MIN]);
/* a non-zero compare result means the full addresses differ */
if (proto &&
compare_ether_addr(eh->h_dest, proto->group_address))
goto err;
} else
proto = rcu_dereference(stp_proto);
if (!proto)
goto err;
proto->rcv(proto, skb, dev); // hand the frame to the protocol's receive handler
return 0;
……
}
真正STP协议的执行函数br_stp_rcv:
/*
 * br_stp_rcv - receive handler for bridge STP BPDUs.
 * @proto: protocol descriptor the frame was dispatched through (unused here).
 * @skb:   received frame; always freed before returning.
 * @dev:   device the frame arrived on; must be a bridge port.
 *
 * Checks that the first three payload bytes are zero, then, with the
 * bridge lock held, verifies that kernel STP is enabled, the bridge device
 * is up, the port is not disabled and the destination MAC matches the
 * bridge group address. A configuration BPDU is decoded field by field into
 * a br_config_bpdu and passed to br_received_config_bpdu(); a TCN BPDU is
 * passed to br_received_tcn_bpdu().
 */
void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
struct net_device *dev)
{
const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p = rcu_dereference(dev->br_port);
struct net_bridge *br;
const unsigned char *buf;
if (!p)
goto err;
if (!pskb_may_pull(skb, 4))
goto err;
/* compare of protocol id and version */
buf = skb->data;
if (buf[0] != 0 || buf[1] != 0 || buf[2] != 0)
goto err;
br = p->br;
spin_lock(&br->lock);
if (br->stp_enabled != BR_KERNEL_STP) // kernel STP is not enabled
goto out;
if (!(br->dev->flags & IFF_UP))
goto out;
if (p->state == BR_STATE_DISABLED)
goto out;
if (compare_ether_addr(dest, br->group_addr) != 0)
goto out;
/* skip the three header bytes checked above; buf[0] is now the BPDU type */
buf = skb_pull(skb, 3);
if (buf[0] == BPDU_TYPE_CONFIG) {
struct br_config_bpdu bpdu;
if (!pskb_may_pull(skb, 32))
goto out;
/* reload the data pointer after pskb_may_pull() */
buf = skb->data;
/* byte 1 carries the flags: bit 0 = topology change, bit 7 = ack */
bpdu.topology_change = (buf[1] & 0x01) ? 1 : 0;
bpdu.topology_change_ack = (buf[1] & 0x80) ? 1 : 0;
/* bytes 2..9: root bridge id (2 priority bytes + 6 address bytes) */
bpdu.root.prio[0] = buf[2];
bpdu.root.prio[1] = buf[3];
bpdu.root.addr[0] = buf[4];
bpdu.root.addr[1] = buf[5];
bpdu.root.addr[2] = buf[6];
bpdu.root.addr[3] = buf[7];
bpdu.root.addr[4] = buf[8];
bpdu.root.addr[5] = buf[9];
/* bytes 10..13: root path cost, most significant byte first */
bpdu.root_path_cost =
(buf[10] << 24) |
(buf[11] << 16) |
(buf[12] << 8) |
buf[13];
/* bytes 14..21: sending bridge id */
bpdu.bridge_id.prio[0] = buf[14];
bpdu.bridge_id.prio[1] = buf[15];
bpdu.bridge_id.addr[0] = buf[16];
bpdu.bridge_id.addr[1] = buf[17];
bpdu.bridge_id.addr[2] = buf[18];
bpdu.bridge_id.addr[3] = buf[19];
bpdu.bridge_id.addr[4] = buf[20];
bpdu.bridge_id.addr[5] = buf[21];
bpdu.port_id = (buf[22] << 8) | buf[23];
/* bytes 24..31: four 2-byte timer fields converted by br_get_ticks() */
bpdu.message_age = br_get_ticks(buf+24);
bpdu.max_age = br_get_ticks(buf+26);
bpdu.hello_time = br_get_ticks(buf+28);
bpdu.forward_delay = br_get_ticks(buf+30);
br_received_config_bpdu(p, &bpdu); // process the configuration BPDU
}
else if (buf[0] == BPDU_TYPE_TCN) {
br_received_tcn_bpdu(p); // process the topology change notification
}
out:
spin_unlock(&br->lock);
err:
kfree_skb(skb);
}
/*
 * br_received_config_bpdu - act on a configuration BPDU received on port @p.
 * @p:    port the BPDU arrived on.
 * @bpdu: decoded configuration BPDU.
 *
 * If br_supersedes_port_info() reports that the BPDU supersedes the port's
 * stored information, the information is recorded, the bridge configuration
 * is recomputed and the port states are reselected. If the bridge was root
 * before and no longer is, the hello timer is stopped and, when a topology
 * change was already detected, a TCN is transmitted and the TCN timer is
 * armed. If the BPDU came in on the root port, the timeout values are
 * recorded, configuration BPDUs are generated and a topology change ack in
 * the BPDU is processed.
 *
 * If the BPDU does not supersede the stored information and @p is a
 * designated port, br_reply() is called for the port.
 */
void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
{
	struct net_bridge *bridge = p->br;
	/* root status before this BPDU is applied */
	int root_before = br_is_root_bridge(bridge);
	int root_after;

	if (!br_supersedes_port_info(p, bpdu)) {
		if (br_is_designated_port(p))
			br_reply(p);
		return;
	}

	/* take over the new information, then redo the election and port states */
	br_record_config_information(p, bpdu);
	br_configuration_update(bridge);
	br_port_state_selection(bridge);

	/* this BPDU cost us the root role */
	root_after = br_is_root_bridge(bridge);
	if (!root_after && root_before) {
		del_timer(&bridge->hello_timer);
		if (bridge->topology_change_detected) {
			del_timer(&bridge->topology_change_timer);
			br_transmit_tcn(bridge);
			mod_timer(&bridge->tcn_timer,
				  jiffies + bridge->bridge_hello_time);
		}
	}

	if (p->port_no != bridge->root_port)
		return;

	/* received on the root port: refresh timers and propagate */
	br_record_config_timeout_values(bridge, bpdu);
	br_config_bpdu_generation(bridge);
	if (bpdu->topology_change_ack)
		br_topology_change_acknowledged(bridge);
}
/*
 * br_received_tcn_bpdu - act on a topology change notification BPDU.
 * @p: port the TCN arrived on.
 *
 * Only designated ports react: the event is logged, topology change
 * detection is started on the bridge and the change is acknowledged on @p.
 * On any other port the TCN is ignored.
 */
void br_received_tcn_bpdu(struct net_bridge_port *p)
{
	if (!br_is_designated_port(p))
		return;

	pr_info("%s: received tcn bpdu on port %i(%s)\n",
		p->br->dev->name, p->port_no, p->dev->name);

	br_topology_change_detection(p->br);	/* start topology change handling */
	br_topology_change_acknowledge(p);	/* acknowledge the change on this port */
}