sock_sendmsg --->__sock_sendmsg ---->security_socket_sendmsg/__sock_sendmsg_nosec
显然默认为:sock->ops->sendmsg ;security的前提是初始化了security_ops (security.c)
那么这个sock->ops->sendmsg是什么呢
我们看看sock和ops分别是什么
-
/**
-
* struct socket - general BSD socket
-
* @state: socket state (%SS_CONNECTED, etc)
-
* @type: socket type (%SOCK_STREAM, etc)
-
* @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
-
* @ops: protocol specific socket operations
-
* @file: File back pointer for gc
-
* @sk: internal networking protocol agnostic socket representation
-
* @wq: wait queue for several uses
-
*/
-
struct socket {
-
socket_state state;
-
-
kmemcheck_bitfield_begin(type);
-
short type;
-
kmemcheck_bitfield_end(type);
-
-
unsigned long flags;
-
-
struct socket_wq __rcu *wq;
-
-
struct file *file;
-
struct sock *sk;
-
const struct proto_ops *ops;
-
};
那么我们需要找到内核注册的ops即谁和socket的type关联了.
-
Af_inet.c (net\ipv4):const struct proto_ops inet_stream_ops = {
-
Af_inet.c (net\ipv4):const struct proto_ops inet_dgram_ops =
-
Af_inet.c (net\ipv4):static const struct proto_ops inet_sockraw_ops = {
-
Af_packet.c (net\packet):static const struct proto_ops packet_ops = {
针对tcp的肯定是stream的即struct proto_ops inet_stream_ops
.sendmsg = inet_sendmsg,
-
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
-
size_t size)
-
{
-
struct sock *sk = sock->sk;
-
-
sock_rps_record_flow(sk);
-
-
/* We may need to bind the socket. */
-
if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
-
inet_autobind(sk))
-
return -EAGAIN;
-
-
return sk->sk_prot->sendmsg(iocb, sk, msg, size);
-
}
看看sk->sk_prot->sendmsg,这个是找到具体的sendmsg.
-
struct sock {
-
/*
-
* Now struct inet_timewait_sock also uses sock_common, so please just
-
* don't add nothing before this first member (__sk_common) --acme
-
*/
-
struct sock_common __sk_common;
-
#define sk_prot __sk_common.skc_prot
在创建socket的时候调用到内核af_inet.c
inet_create 里 sk->sk_prot->init
-
static const struct net_proto_family inet_family_ops = {
-
.family = PF_INET,
-
.create = inet_create,
-
.owner = THIS_MODULE,
-
};
很显然在inet_create的时候关联具体的协议操作(tcp_prot/udp_prot/raw_prot/ping_prot),它查询全局链表
inetsw(启动时由inetsw_array中的元素插入而成). 注册接口是inet_register_protosw
我们看看af_inet.c:
-
static int __init inet_init(void)
-
{
-
struct sk_buff *dummy_skb;
-
struct inet_protosw *q;
-
struct list_head *r;
-
int rc = -EINVAL;
-
-
BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
-
-
sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
-
if (!sysctl_local_reserved_ports)
-
goto out;
-
-
rc = proto_register(&tcp_prot, 1);
-
if (rc)
-
goto out_free_reserved_ports;
-
-
rc = proto_register(&udp_prot, 1);
-
if (rc)
-
goto out_unregister_tcp_proto;
-
-
rc = proto_register(&raw_prot, 1);
-
if (rc)
-
goto out_unregister_udp_proto;
-
-
rc = proto_register(&ping_prot, 1);
-
if (rc)
-
goto out_unregister_raw_proto;
-
-
/*
-
* Tell SOCKET that we are alive...
-
*/
-
-
(void)sock_register(&inet_family_ops);
-
-
#ifdef CONFIG_SYSCTL
-
ip_static_sysctl_init();
-
#endif
-
-
tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
-
-
/*
-
* Add all the base protocols.
-
*/
-
-
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
-
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
-
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
-
pr_crit("%s: Cannot add UDP protocol\n", __func__);
-
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
-
pr_crit("%s: Cannot add TCP protocol\n", __func__);
-
#ifdef CONFIG_IP_MULTICAST
-
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
-
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
-
#endif
-
-
/* Register the socket-side information for inet_create. */
-
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
-
INIT_LIST_HEAD(r);
-
-
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
-
inet_register_protosw(q);
-
-
/*
-
* Set the ARP module up
-
*/
-
-
arp_init();
-
-
/*
-
* Set the IP module up
-
*/
-
-
ip_init();
-
-
tcp_v4_init();
-
-
/* Setup TCP slab cache for open requests. */
-
tcp_init();
-
-
/* Setup UDP memory threshold */
-
udp_init();
-
-
/* Add UDP-Lite (RFC 3828) */
-
udplite4_register();
-
-
ping_init();
-
-
/*
-
* Set the ICMP layer up
-
*/
-
-
if (icmp_init() < 0)
-
panic("Failed to create the ICMP control socket.\n");
-
-
/*
-
* Initialise the multicast router
-
*/
-
#if defined(CONFIG_IP_MROUTE)
-
if (ip_mr_init())
-
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
-
#endif
-
/*
-
* Initialise per-cpu ipv4 mibs
-
*/
-
-
if (init_ipv4_mibs())
-
pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
-
-
ipv4_proc_init();
-
-
ipfrag_init();
-
-
dev_add_pack(&ip_packet_type);
-
-
rc = 0;
-
out:
-
return rc;
-
out_unregister_raw_proto:
-
proto_unregister(&raw_prot);
-
out_unregister_udp_proto:
-
proto_unregister(&udp_prot);
-
out_unregister_tcp_proto:
-
proto_unregister(&tcp_prot);
-
out_free_reserved_ports:
-
kfree(sysctl_local_reserved_ports);
-
goto out;
-
}
sock_register注册socket协议族句柄(这里是inet_family_ops)到
net_families[family]; 而inet_stream_ops、inet_dgram_ops、inet_sockraw_ops是随inetsw_array通过inet_register_protosw注册的,packet_ops则定义在af_packet.c中,属于packet协议族.
static int __init inet_init(void)
分两部分:(sock ---> inet --->. ....)
1. rc = proto_register(&tcp_prot, 1); // tcp/udp/raw/ping
2. inet_add_protocol(&tcp_protocol, IPPROTO_TCP) // tcp/udp/icmp/igmp
显然2中注册的是tcp的接收处理,就像dev_add_pack会注册ip接收函数一样.
而1则注册了tcp协议发送方面的东西.
-
/* Networking protocol blocks we attach to sockets.
-
* socket layer -> transport layer interface
-
* transport -> network interface is defined by struct inet_proto
-
*/
-
struct proto {
-
-
Inet层:
-
/* Upon startup we insert all the elements in inetsw_array[] into
-
* the linked list inetsw.
-
*/
-
static struct inet_protosw inetsw_array[] =
-
{
-
{
-
.type = SOCK_STREAM,
-
.protocol = IPPROTO_TCP,
-
.prot = &tcp_prot,
-
.ops = &inet_stream_ops,
-
.no_check = 0,
-
.flags = INET_PROTOSW_PERMANENT |
-
INET_PROTOSW_ICSK,
-
},
-
-
{
-
.type = SOCK_DGRAM,
-
.protocol = IPPROTO_UDP,
-
.prot = &udp_prot,
-
.ops = &inet_dgram_ops,
-
.no_check = UDP_CSUM_DEFAULT,
-
.flags = INET_PROTOSW_PERMANENT,
-
},
-
-
{
-
.type = SOCK_DGRAM,
-
.protocol = IPPROTO_ICMP,
-
.prot = &ping_prot,
-
.ops = &inet_dgram_ops,
-
.no_check = UDP_CSUM_DEFAULT,
-
.flags = INET_PROTOSW_REUSE,
-
},
-
-
{
-
.type = SOCK_RAW,
-
.protocol = IPPROTO_IP, /* wild card */
-
.prot = &raw_prot,
-
.ops = &inet_sockraw_ops,
-
.no_check = UDP_CSUM_DEFAULT,
-
.flags = INET_PROTOSW_REUSE,
-
}
-
}
-
/* This is used to register socket interfaces for IP protocols. */
-
struct inet_protosw {
-
struct list_head list;
-
-
/* These two fields form the lookup key. */
-
unsigned short type; /* This is the 2nd argument to socket(2). */
-
unsigned short protocol; /* This is the L4 protocol number. */
-
-
struct proto *prot;
-
const struct proto_ops *ops;
-
-
char no_check; /* checksum on rcv/xmit/none? */
-
unsigned char flags; /* See INET_PROTOSW_* below. */
-
}
最后会关联到tcp_sendmsg. 我们都熟悉tcp协议,首先要建立连接(三次握手),并在握手时协商mss.
我们再看这个函数的时候,很明显它会先确定当前使用的mss值(mss已在三次握手时协商),然后再发送消息。mss又受mtu限制 (之前文章里也讲过mss与mtu)
下面就涉及两个问题:
1. Tcp分片 ,它根据mss
窗口大小
用于流控制(确保连接的任何一方都不会过快地发送过量的分组而淹没另一方),窗口大小指定了从被确认的字节算起可以发送多少个字节
选项
选项部分是为了适合复杂网络环境和更好地服务于应用层设计的。TCP选项最长是40字节。详情见2.2。
数据
无任何数据的TCP段也是合法的,通常用于确认和控制信息
选项字段[2]
TCP选项部分很少出现在已经建立连接的会话中,主要出现在TCP连接建立阶段,即三次握手。TCP选项部分实际运用有以下几种
(1)最大报文段长度(MSS, Maximum Segment Size)
用于发送方与接收方协商最大报文段长度(仅仅是净荷数据,不包括TCP首部字段)。TCP在三次握手中,每一方都会通告期望收到的MSS(MSS只出现在SYN数据包中),
如果一方没有收到另一方的MSS值,则使用默认的536字节净荷数据,即主机能够接受20+536字节的TCP报文段。
(2)窗口扩大选项(Window scaling)
TCP报文的窗口大小字段占16位,即最大值是65535,但随着时延和带宽比较大的通信产生(如卫星通信),需要更大的窗口满足性能和吞吐率,这就是窗口扩大选项存在的意义。
Window scaling占3个字节,最后一个字节是移位值(Shift count),即把首部中16位的窗口值向左移位,如移位值为14,则新的窗口最大值增大到65535*(2^14)。
窗口扩大选项是在TCP建立之初进行协商,如果已实现了窗口扩大,当不再需要扩大窗口时,发送移位值=0就可以恢复到原窗口大小,即65535
(3)选择确认选项(SACK, Selective Acknowledgements)
考虑这样的情况:主机A发送报文段1、2、3、4、5,主机B收到1、3、5且报文无差错,SACK用来确保只重传缺少的报文段,而不是重传所有报文段。
SACK选项需要2个功能字节,一个用来指明使用SACK选项(SACK Permission),另一指明这个选项占多少字节
那怎么形容丢失的报文段2,说明2的左右边界分别是1、3。TCP的数据报文是有字块边界的,而这种边界是由序列号表示的
最多能指明多少个字节块的边界信息呢?答案是4个。这是因为选项字段最大是40字节,去除2个功能字节,序列号是32位即4字节,并且需要左右边界,所以(40-2)/8 = 4。
(4)时间戳选项(timestamps)
时间戳选项用来计算往返时间RTT,发送方在发送报文段时把当前时钟的时间值放入时间戳字段,接收方将该时间戳字段的值复制到确认报文中,当发送方收到确认报文,
对比确认报文的时间戳(等于发送方发送报文段的时间戳)和现在的时钟,即可算出RTT
时间戳选项还可用于防止回绕序号PAWS。序列号只有32位,每2^32个序列号就会回绕(想想环形队列),采用时间戳选项很容易区分相同序列号的报文段。
(5)NOP(NO-Operation)
TCP的头部必须是4字节的倍数,而大多数选项不是4字节倍数,不足的用NOP填充。除此之外,NOP也用于分割不同的选项数据,如窗口扩大选项
和SACK之间使用NOP隔离(下面的实例将看到这一点)
先确定mss大小:
Tcp_ipv4.c (net\ipv4): sk->sk_gso_type = SKB_GSO_TCPV4; tcp connect函数调用
Tcp_ipv4.c (net\ipv4): newsk->sk_gso_type = SKB_GSO_TCPV4; // 收到确认ack时
tcp_v4_syn_recv_sock(三次握手已经完成,异步接收)
-
enum {
-
SKB_GSO_TCPV4 = 1 << 0,
-
SKB_GSO_UDP = 1 << 1,
-
-
/* This indicates the skb is from an untrusted source. */
-
SKB_GSO_DODGY = 1 << 2,
-
-
/* This indicates the tcp segment has CWR set. */
-
SKB_GSO_TCP_ECN = 1 << 3,
-
-
SKB_GSO_TCPV6 = 1 << 4,
-
-
SKB_GSO_FCOE = 1 << 5,
-
};
同样在tcp connect函数里
sk_setup_caps(sk, &rt->dst);
---> tcp_v4_connect
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
在判断:
static inline bool sk_can_gso(const struct sock *sk)
{
return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
}
时
sk->sk_route_caps来自:
sk->sk_route_caps = dst->dev->features;
而dev->features则是在具体网卡驱动初始化的时候初始化的。
主要是判断了网卡是否支持gso。
2. Ip分片 它根据网卡的mtu