分类: LINUX
2013-07-07 16:52:51
原文地址:增加内核TCP SYN并发支持 作者:瀚海书香
内核版本基于SLES 11 SP2 3.0.13内核(该内核syncookie机制存在bug,导致0.0.0.0数据包)
内核输出“TCP: Possible SYN flooding on port”信息,在内核syncookie机制存在bug的情况下如何调整相应的参数增大并发syn值?
1. 内核输出该信息的位置
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct tcp_extend_values tmp_ext;
struct tcp_options_received tmp_opt;
u8 *hash_location;
struct request_sock *req;
struct inet_request_sock *ireq;
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = NULL;
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*/
if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
if (net_ratelimit())
syn_flood_warning(skb);
isn这里可以不关心(对于syn连接,这个值一定是0)
看一下标红的函数
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
}
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}
这个代码非常的巧妙,通过右移操作来判断并发syn队列是否已满。要想增大并发syn请求,只需要增大max_qlen_log的值就可以了。
2. 如何增大max_qlen_log值
TCP传输控制块结构体
/** inet_connection_sock - INET connection oriented sock
*
* @icsk_accept_queue: FIFO of established children
* @icsk_bind_hash: Bind node
* @icsk_timeout: Timeout
* @icsk_retransmit_timer: Resend (no ack)
* @icsk_rto: Retransmit timeout
* @icsk_pmtu_cookie Last pmtu seen by socket
* @icsk_ca_ops Pluggable congestion control hook
* @icsk_af_ops Operations which are AF_INET{4,6} specific
* @icsk_ca_state: Congestion control state
* @icsk_retransmits: Number of unrecovered [RTO] timeouts
* @icsk_pending: Scheduled timer event
* @icsk_backoff: Backoff
* @icsk_syn_retries: Number of allowed SYN (or equivalent) retries
* @icsk_probes_out: unanswered 0 window probes
* @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options)
* @icsk_ack: Delayed ACK control data
* @icsk_mtup; MTU probing control data
*/
struct inet_connection_sock {
/* inet_sock has to be the first member! */
struct inet_sock icsk_inet;
struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash;
unsigned long icsk_timeout;
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
__u32 icsk_rto;
__u32 icsk_pmtu_cookie;
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state;
__u8 icsk_retransmits;
__u8 icsk_pending;
__u8 icsk_backoff;
__u8 icsk_syn_retries;
__u8 icsk_probes_out;
__u16 icsk_ext_hdr_len;
struct {
__u8 pending; /* ACK is pending */
__u8 quick; /* Scheduled number of quick acks */
__u8 pingpong; /* The session is interactive */
__u8 blocked; /* Delayed ACK was blocked by socket lock */
__u32 ato; /* Predicted tick of soft clock */
unsigned long timeout; /* Currently scheduled timeout */
__u32 lrcvtime; /* timestamp of last received data packet */
__u16 last_seg_size; /* Size of last incoming segment */
__u16 rcv_mss; /* MSS used for delayed ACK decisions */
} icsk_ack;
struct {
int enabled;
/* Range of MTUs to search */
int search_high;
int search_low;
/* Information on the current probe. */
int probe_size;
} icsk_mtup;
u32 icsk_ca_priv[16];
u32 icsk_user_timeout;
#define ICSK_CA_PRIV_SIZE (16 * sizeof(u32))
};
其中红色标注的变量用来保存正在建立连接和已建立连接但未被accept的传输控制块。
/** struct request_sock_queue - queue of request_socks
*
* @rskq_accept_head - FIFO head of established children
* @rskq_accept_tail - FIFO tail of established children
* @rskq_defer_accept - User waits for some data after accept()
* @syn_wait_lock - serializer
*
* %syn_wait_lock is necessary only to avoid proc interface having to grab the main
* lock sock while browsing the listening hash (otherwise it's deadlock prone).
*
* This lock is acquired in read mode only from listening_get_next() seq_file
* op and it's acquired in write mode _only_ from code that is actively
* changing rskq_accept_head. All readers that are holding the master sock lock
* don't need to grab this lock in read mode too as rskq_accept_head. writes
* are always protected from the main sock lock.
*/
struct request_sock_queue {
struct request_sock *rskq_accept_head;
struct request_sock *rskq_accept_tail;
rwlock_t syn_wait_lock;
u8 rskq_defer_accept;
/* 3 bytes hole, try to pack */
struct listen_sock *listen_opt;
};
其中rskq_accept_head和rskq_accept_tail指向的链表保存已完成三次握手的传输控制块;listen_opt中保存处于SYN_RECV状态的传输控制块。
这里我们重点看一下listen_opt
struct listen_sock {
u8 max_qlen_log;
/* 3 bytes hole, try to use */
int qlen;
int qlen_young;
int clock_hand;
u32 hash_rnd;
u32 nr_table_entries;
struct request_sock *syn_table[0];
};
这里syn_table的大小将会影响同时存在SYN_RECV状态的半连接的数量。
通过listen系统调用跟踪一下这个数值的设置:
/*
* Perform a listen. Basically, we allow the protocol to do anything
* necessary for a listen, and if that works, we mark the socket as
* ready for listening.
*/
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
struct socket *sock;
int err, fput_needed;
int somaxconn;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) {
somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
/*这里限制backlog的值不会大于net.core.somaxconn的值*/
if ((unsigned)backlog > somaxconn)
backlog = somaxconn;
err = security_socket_listen(sock, backlog);
if (!err)
err = sock->ops->listen(sock, backlog);
fput_light(sock->file, fput_needed);
}
return err;
}
/*
* Move a socket into listening state.
*/
int inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
unsigned char old_state;
int err;
lock_sock(sk);
err = -EINVAL;
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
goto out;
old_state = sk->sk_state;
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
}
sk->sk_max_ack_backlog = backlog;
err = 0;
out:
release_sock(sk);
return err;
}
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
if (rc != 0)
return rc;
sk->sk_max_ack_backlog = 0;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
/* There is race window here: we announce ourselves listening,
* but this transition is still not validated by get_port().
* It is OK, because this socket enters to hash table only
* after validation is complete.
*/
sk->sk_state = TCP_LISTEN;
if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk);
sk->sk_prot->hash(sk);
return 0;
}
sk->sk_state = TCP_CLOSE;
__reqsk_queue_destroy(&icsk->icsk_accept_queue);
return -EADDRINUSE;
}
int reqsk_queue_alloc(struct request_sock_queue *queue,
unsigned int nr_table_entries)
{
size_t lopt_size = sizeof(struct listen_sock);
struct listen_sock *lopt;
/*这里可以看出listen_sock->max_qlen_log 为nr_table_entries和sysctl_max_syn_backlog的最小值加1
并向上去整到2的次方后的log。
比如: nr_table_entries = 128 sysctl_max_syn_backlog=20480,
min(nr_table_entries, sysctl_max_syn_backlog)= 128
roundup_pow_of_two(128+1)=256
max_qlen_log=8
*/
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
nr_table_entries = max_t(u32, nr_table_entries, 8);
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
lopt_size += nr_table_entries * sizeof(struct request_sock *);
if (lopt_size > PAGE_SIZE)
lopt = vzalloc(lopt_size);
else
lopt = kzalloc(lopt_size, GFP_KERNEL);
if (lopt == NULL)
return -ENOMEM;
for (lopt->max_qlen_log = 3;
(1 << lopt->max_qlen_log) < nr_table_entries;
lopt->max_qlen_log++);
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
rwlock_init(&queue->syn_wait_lock);
queue->rskq_accept_head = NULL;
lopt->nr_table_entries = nr_table_entries;
write_lock_bh(&queue->syn_wait_lock);
queue->listen_opt = lopt;
write_unlock_bh(&queue->syn_wait_lock);
return 0;
}
经过上面的分析,要想增大并发syn值需要同时调整如下三个参数:
(1)net.core.somaxconn
(2)net.ipv4.tcp_max_syn_backlog
(3)listen系统调用的backlog参数
看来listen系统调用的backlog参数不仅影响已完成三次握手等待accept的最大连接数,还影响SYN_RECV状态的链接数。