To be a better coder
分类: LINUX
2017-12-01 10:51:21
datapath为ovs内核模块,负责执行数据交换,也就是把从接收端口收到的数据包在流表中进行匹配,并执行匹配到的动作。
一个datapath可以对应多个vport,一个vport类似物理交换机的端口概念。一个datapath关联一个flow table,一个flow table包含多个条目,每个条目包括两个内容:一个match/key和一个action。match/key可以从包中提取,每个match/key对应一个action处理行为,最常见的action是在不同flow table之间进行转发。下图所示的是1个ovs下的几个flow table,以及包在不同flow table之间进行转发的情况。
vport结构体:
/**
 * struct vport - one port within a datapath
 * @rcu: RCU callback head for deferred destruction.
 * @dp: Datapath to which this port belongs.
 * @upcall_portids: RCU protected 'struct vport_portids'.
 * @port_no: Index into @dp's @ports array.
 * @hash_node: Element in @dev_table hash table in vport.c.
 * @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
 * @ops: Class structure (the per-type operations table, see struct vport_ops).
 * @percpu_stats: Points to per-CPU statistics used and maintained by vport.
 * @err_stats: Error statistics used and maintained by vport. Unlike
 * @percpu_stats this is embedded by value, not a pointer.
 */
struct vport {
	struct rcu_head rcu;
	struct datapath *dp;
	struct vport_portids __rcu *upcall_portids;
	u16 port_no;

	struct hlist_node hash_node;
	struct hlist_node dp_hash_node;
	const struct vport_ops *ops;

	struct pcpu_sw_netstats __percpu *percpu_stats;

	struct vport_err_stats err_stats;
};
vport_parms结构体,是创建vport所需要的参数结构。
/**
 * struct vport_parms - parameters for creating a new vport
 *
 * @name: New vport's name.
 * @type: New vport's type.
 * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if
 * none was supplied.
 * @dp: New vport's datapath.
 * @port_no: New vport's port number.
 * @upcall_portids: Netlink attribute; presumably carries the upcall port IDs
 * that end up in &vport->upcall_portids (TODO confirm against the caller).
 */
struct vport_parms {
	const char *name;
	enum ovs_vport_type type;
	struct nlattr *options;

	/* For ovs_vport_alloc(). */
	struct datapath *dp;
	u16 port_no;
	struct nlattr *upcall_portids;
};
vport_ops 定义了对vport的操作
/**
 * struct vport_ops - definition of a type of virtual port
 *
 * Each vport type supplies one instance of this table; a vport reaches its
 * table through &vport->ops.
 *
 * @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
 * @create: Create a new vport configured as specified. On success returns
 * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
 * @destroy: Destroys a vport. Must call vport_free() on the vport but not
 * before an RCU grace period has elapsed.
 * @set_options: Modify the configuration of an existing vport. May be %NULL
 * if modification is not supported.
 * @get_options: Appends vport-specific attributes for the configuration of an
 * existing vport to a &struct sk_buff. May be %NULL for a vport that does not
 * have any configuration.
 * @get_name: Get the device's name.
 * @send: Send a packet on the device. Returns the length of the packet sent,
 * zero for dropped packets or negative for error.
 * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for
 * a packet.
 */
struct vport_ops {
	enum ovs_vport_type type;

	/* Called with ovs_mutex. */
	struct vport *(*create)(const struct vport_parms *);
	void (*destroy)(struct vport *);

	int (*set_options)(struct vport *, struct nlattr *);
	int (*get_options)(const struct vport *, struct sk_buff *);

	/* Called with rcu_read_lock or ovs_mutex. */
	const char *(*get_name)(const struct vport *);

	int (*send)(struct vport *, struct sk_buff *);
	int (*get_egress_tun_info)(struct vport *, struct sk_buff *, struct ovs_tunnel_info *);
};
vport_ops_list[]是vport_ops组成的数组。vport_ops实例化的全部类型如下
/* List of statically compiled vport implementations. Don't forget to also
 * add yours to the list at the bottom of vport.h.
 *
 * The two GRE entries are only present when CONFIG_NET_IPGRE_DEMUX is
 * enabled; every other entry is unconditional.
 */
static const struct vport_ops *vport_ops_list[] = {
	&ovs_netdev_vport_ops,
	&ovs_internal_vport_ops,
	&ovs_geneve_vport_ops,
#if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX)
	&ovs_gre_vport_ops,
	&ovs_gre64_vport_ops,
#endif
	&ovs_vxlan_vport_ops,
	&ovs_lisp_vport_ops,
};
比如当我们在为网桥增设端口的时候,就会进入ovs_netdev_vport_ops中的create方法,进而注册网络设备。
流表模块定义了维护交换机本地流表相关的数据结构和操作,包括流表结构的创建、更新、删除,对每条流的管理等。位于datapath/flow.h和datapath/flow_table.h中。
flow table:
/*
 * One instance of the flow hash table; struct flow_table refers to the
 * current instance through an RCU-protected pointer.
 *
 * NOTE(review): the per-field notes below are inferred from names and types
 * only -- confirm against flow_table.c before relying on them.
 */
struct table_instance {
	struct flex_array *buckets;	/* bucket storage */
	unsigned int n_buckets;		/* presumably the number of entries in 'buckets' */
	struct rcu_head rcu;		/* RCU callback head, presumably for deferred free */
	int node_ver;			/* presumably selects which sw_flow hash_node[] slot is used -- TODO confirm */
	u32 hash_seed;			/* presumably the seed mixed into bucket hashing -- TODO confirm */
	bool keep_flows;		/* presumably: do not free the flows when this instance is destroyed -- TODO confirm */
};
/*
 * Flow table of a datapath.
 *
 * NOTE(review): the per-field notes below are inferred from names and types
 * only -- confirm against flow_table.c before relying on them.
 */
struct flow_table {
	struct table_instance __rcu *ti;		/* RCU-protected pointer to the table instance */
	struct mask_cache_entry __percpu *mask_cache;	/* per-CPU mask cache entries */
	struct mask_array __rcu *mask_array;		/* RCU-protected array of masks */
	unsigned long last_rehash;			/* presumably a jiffies timestamp of the last rehash -- TODO confirm */
	unsigned int count;				/* presumably the number of flows in the table -- TODO confirm */
};
/*
 * A single flow entry: a key (masked and unmasked form), the mask it was
 * installed with, the actions to execute and per-NUMA-node statistics.
 *
 * 'stats' is a flexible array member, so the allocation size of a sw_flow
 * must include room for the stats pointers (see the flow cache set up in
 * ovs_flow_init()).
 */
struct sw_flow {
	struct rcu_head rcu;
	struct hlist_node hash_node[2];	/* two hash-list linkages; presumably one
					 * per table_instance node_ver -- TODO
					 * confirm. */
	u32 hash;
	int stats_last_writer;		/* NUMA-node id of the last writer on
					 * 'stats[0]'.
					 */
	struct sw_flow_key key;
	struct sw_flow_key unmasked_key;
	struct sw_flow_mask *mask;
	struct sw_flow_actions __rcu *sf_acts;
	struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one
					   * is allocated at flow creation time,
					   * the rest are allocated on demand
					   * while holding the 'stats[0].lock'.
					   */
};
datapath运行在内核态,ovsd运行在用户态,两者通过netlink通信。
因为大量的专用family会占用family id,而family id的数量本身有限(kernel只允许32个);同时为了方便用户扩展使用,内核定义了一个通用的netlink family,这就是generic netlink family。
具体接口可以参考:和库,这两个介绍得很详细。
我们来看一下初始化代码到底做了些什么事情:
/*
 * Module init entry point for the datapath.
 *
 * Runs the initialisation steps below in order. If any step fails, the steps
 * that already succeeded are undone in reverse order through the chain of
 * error labels and the failing step's error code is returned.
 *
 * Returns 0 on success, otherwise the error code of the step that failed.
 */
static int __init dp_init(void)
{
	int err;

	/* Compile-time check: struct ovs_skb_cb must fit in sk_buff's cb[] area. */
	BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));

	pr_info("Open vSwitch switching datapath %s, built "__DATE__" "__TIME__"\n",
		VERSION);

	/* Per-CPU deferred-action fifos. */
	err = action_fifos_init();
	if (err)
		goto error;

	/* rtnl link ops for the internal device type. */
	err = ovs_internal_dev_rtnl_link_register();
	if (err)
		goto error_action_fifos_exit;

	/* Slab caches for flows and flow statistics. */
	err = ovs_flow_init();
	if (err)
		goto error_unreg_rtnl_link;

	/* vport hash table. */
	err = ovs_vport_init();
	if (err)
		goto error_flow_exit;

	/* Per-network-namespace operations. */
	err = register_pernet_device(&ovs_net_ops);
	if (err)
		goto error_vport_exit;

	/* Netdevice event notifier. */
	err = register_netdevice_notifier(&ovs_dp_device_notifier);
	if (err)
		goto error_netns_exit;

	/* Generic netlink families; registered last, after everything above. */
	err = dp_register_genl();
	if (err < 0)
		goto error_unreg_notifier;

	return 0;

	/* Unwind: each label undoes one step and falls through to the next. */
error_unreg_notifier:
	unregister_netdevice_notifier(&ovs_dp_device_notifier);
error_netns_exit:
	unregister_pernet_device(&ovs_net_ops);
error_vport_exit:
	ovs_vport_exit();
error_flow_exit:
	ovs_flow_exit();
error_unreg_rtnl_link:
	ovs_internal_dev_rtnl_link_unregister();
error_action_fifos_exit:
	action_fifos_exit();
error:
	return err;
}
其主要分为以下几部分:
/*
 * Allocate the per-CPU deferred-action fifos.
 *
 * Stores the per-CPU allocation in 'action_fifos'.
 * Returns 0 on success, -ENOMEM if the allocation failed.
 */
int action_fifos_init(void)
{
	action_fifos = alloc_percpu(struct action_fifo);

	return action_fifos ? 0 : -ENOMEM;
}
#define DEFERRED_ACTION_FIFO_SIZE 10 struct action_fifo { int head; int tail; /* Deferred action fifo queue storage. */ struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; };
/*
 * Register the internal device's rtnl link operations
 * ('internal_dev_link_ops').
 *
 * Returns whatever rtnl_link_register() returns; dp_init() treats any
 * non-zero value as failure.
 */
int ovs_internal_dev_rtnl_link_register(void)
{
	return rtnl_link_register(&internal_dev_link_ops);
}
/* Initializes the flow module. * Returns zero if successful or a negative error code. */ int ovs_flow_init(void) { BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) + (num_possible_nodes() * sizeof(struct flow_stats *)), 0, 0, NULL); if (flow_cache == NULL) return -ENOMEM; flow_stats_cache = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats), 0, SLAB_HWCACHE_ALIGN, NULL); if (flow_stats_cache == NULL) { kmem_cache_destroy(flow_cache); flow_cache = NULL; return -ENOMEM; } return 0; }
/**
 * ovs_vport_init - initialize vport subsystem
 *
 * Called at module load time to initialize the vport subsystem.
 *
 * Allocates the zero-initialised 'dev_table' hash table with
 * VPORT_HASH_BUCKETS buckets. kcalloc() is used instead of an open-coded
 * 'n * size' passed to kzalloc() so that the size multiplication is
 * overflow-checked by the allocator.
 *
 * Returns 0 on success, -ENOMEM if the table could not be allocated.
 */
int ovs_vport_init(void)
{
	dev_table = kcalloc(VPORT_HASH_BUCKETS, sizeof(struct hlist_head),
			    GFP_KERNEL);
	if (!dev_table)
		return -ENOMEM;

	return 0;
}
/*
 * Register every generic netlink family listed in dp_genl_families[].
 *
 * On the first registration failure, dp_unregister_genl() is called with the
 * index of the failing family (i.e. the count of families registered so far)
 * and the error is returned.
 *
 * Returns 0 when all families were registered.
 */
static int dp_register_genl(void)
{
	int idx;

	for (idx = 0; idx < ARRAY_SIZE(dp_genl_families); idx++) {
		int ret = genl_register_family(dp_genl_families[idx]);

		if (ret) {
			dp_unregister_genl(idx);
			return ret;
		}
	}

	return 0;
}
dp_genl_families[]数组静态定义如下:
/*
 * The generic netlink families of the datapath module: datapath, vport,
 * flow and packet. dp_register_genl() registers them in this order.
 */
static struct genl_family *dp_genl_families[] = {
	&dp_datapath_genl_family,
	&dp_vport_genl_family,
	&dp_flow_genl_family,
	&dp_packet_genl_family,
};
调用dp_register_genl()完成对四种类型的family以及相应操作的注册,包括datapath、vport、flow和packet。前三种family都对应四种操作,包括NEW、DEL、GET、SET,而packet的操作仅为EXECUTE。
继续上面所说的,当接收包时将会发生如下代码流:
这些family和操作的定义均在datapath.c中。以flow family为例,代码为:
nla_policy:
/*
 * Netlink attribute policy for the flow family, indexed by OVS_FLOW_ATTR_*.
 * KEY, MASK and ACTIONS are nested attributes; CLEAR and PROBE are flags.
 * Referenced by every entry of dp_flow_genl_ops[].
 */
static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
	[OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
	[OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED },
	[OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
	[OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
	[OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG },
};
对generic netlink中flow的操作:
/*
 * Generic netlink operations of the flow family.
 *
 * NEW, DEL and SET carry GENL_ADMIN_PERM; GET has no flags set and is the
 * only command that also provides a dump handler. All commands use
 * flow_policy.
 */
static struct genl_ops dp_flow_genl_ops[] = {
	{ .cmd = OVS_FLOW_CMD_NEW,
	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
	  .policy = flow_policy,
	  .doit = ovs_flow_cmd_new
	},
	{ .cmd = OVS_FLOW_CMD_DEL,
	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
	  .policy = flow_policy,
	  .doit = ovs_flow_cmd_del
	},
	{ .cmd = OVS_FLOW_CMD_GET,
	  .flags = 0, /* OK for unprivileged users. */
	  .policy = flow_policy,
	  .doit = ovs_flow_cmd_get,
	  .dumpit = ovs_flow_cmd_dump
	},
	{ .cmd = OVS_FLOW_CMD_SET,
	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
	  .policy = flow_policy,
	  .doit = ovs_flow_cmd_set,
	},
};
generic netlink中的flow的genl_family初始化:
/*
 * Generic netlink family description for flows. Ties together the family
 * name/version, the per-message user header (struct ovs_header), the
 * operations table dp_flow_genl_ops[] and a single multicast group.
 */
static struct genl_family dp_flow_genl_family = {
	.id = GENL_ID_GENERATE,	/* presumably: let the genl core pick the id -- confirm */
	.hdrsize = sizeof(struct ovs_header),
	.name = OVS_FLOW_FAMILY,
	.version = OVS_FLOW_VERSION,
	.maxattr = OVS_FLOW_ATTR_MAX,
	.netnsok = true,
	.parallel_ops = true,
	.ops = dp_flow_genl_ops,
	.n_ops = ARRAY_SIZE(dp_flow_genl_ops),
	.mcgrps = &ovs_dp_flow_multicast_group,
	.n_mcgrps = 1,
};
ovsd对于netlink的实现,主要在lib/netlink-socket.c文件中。而对这些netlink操作的调用(即对各个行为的处理),主要在lib/dpif-netlink.c文件中;各种可能的消息类型已由datapath模块事先在内核中注册。
datapath中对netlink family类型进行了注册,ovsd在使用这些netlink family之前需要获取它们的信息,这一过程主要在lib/dpif-netlink.c文件的dpif_netlink_init()函数中完成。代码为:
static int dpif_netlink_init(void) { static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; static int error; if (ovsthread_once_start(&once)) { error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY, &ovs_datapath_family); if (error) { VLOG_ERR("Generic Netlink family '%s' does not exist. " "The Open vSwitch kernel module is probably not loaded.", OVS_DATAPATH_FAMILY); } if (!error) { error = nl_lookup_genl_family(OVS_VPORT_FAMILY, &ovs_vport_family); } if (!error) { error = nl_lookup_genl_family(OVS_FLOW_FAMILY, &ovs_flow_family); } if (!error) { error = nl_lookup_genl_family(OVS_PACKET_FAMILY, &ovs_packet_family); } if (!error) { error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP, &ovs_vport_mcgroup); } ovsthread_once_done(&once); } return error; }
其中nl_lookup_genl_family代码位于lib/netlink-socket.c中,其作用为查找给定name的netlink family类型是否完成注册:成功时返回0,并把对应的family编号存入*number,该值可以直接使用;失败时返回正的errno值。
/* Resolves the Generic Netlink family 'name' to its family number, caching
 * the outcome in '*number'.
 *
 * '*number' == 0 means "not looked up yet": the lookup is performed and
 * '*number' is set to the family number on success or to the negated errno
 * value on failure. Any other value of '*number' is treated as the cached
 * result of an earlier call and no lookup is done.
 *
 * Returns 0 if '*number' holds a usable family number, otherwise a positive
 * errno value. */
int nl_lookup_genl_family(const char *name, int *number)
{
    if (*number == 0) {
        struct nlattr *attrs[ARRAY_SIZE(family_policy)];
        struct ofpbuf *reply;
        int lookup_err = do_lookup_genl_family(name, attrs, &reply);

        if (lookup_err) {
            *number = -lookup_err;
        } else {
            *number = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]);
            define_genl_family(*number, name);
        }
        ofpbuf_delete(reply);

        /* Either branch must have left a non-zero cached value. */
        ovs_assert(*number != 0);
    }

    if (*number > 0) {
        return 0;
    }
    return -*number;
}
完成这些查找后,ovsd即可利用dpif中的api,通过向datapath发送这些netlink消息来实现对datapath的操作。
相关的中间层API主要定义在dpif_class(位于lib/dpif-provider.h)这一抽象类型中。下面是关于dpif_class结构体的注释:
/* Datapath interface class structure, to be defined by each implementation of * a datapath interface. * * These functions return 0 if successful or a positive errno value on failure, * except where otherwise noted. * * These functions are expected to execute synchronously, that is, to block as * necessary to obtain a result. Thus, they may not return EAGAIN or * EWOULDBLOCK or EINPROGRESS. We may relax this requirement in the future if * and when we encounter performance problems. */
一共有两种dpif_class实例化类型,分别为dpif_netlink_class和dpif_netdev_class。dpif_netlink_class表示的是通过netlink和本地内核态的datapath通信,而dpif_netdev_class表示的是在用户态(ovs-vswitchd进程内部)实现的datapath,不经过netlink。
下图是ovsd使用netlink进行消息发送的过程: