Chinaunix首页 | 论坛 | 博客
  • 博客访问: 3934620
  • 博文数量: 93
  • 博客积分: 3189
  • 博客等级: 中校
  • 技术积分: 4229
  • 用 户 组: 普通用户
  • 注册时间: 2009-02-02 13:29
个人简介

出没于杭州和青岛的程序猿一枚,对内核略懂一二

文章分类

全部博文(93)

文章存档

2016年(2)

2015年(3)

2014年(11)

2013年(29)

2012年(16)

2011年(5)

2010年(5)

2009年(22)

分类: LINUX

2013-06-26 11:57:00

声明:版权所有,转载请保留出处http://forever.blog.chinaunix.net

本文基于SLES 11 SP2的3.0.13内核(该内核的syncookie机制存在bug,会导致出现0.0.0.0的数据包)

 

当内核输出“TCP: Possible SYN flooding on port”信息,而syncookie机制又因为存在bug无法依赖时,应该如何调整相应的参数来增大可同时处理的SYN请求数(即并发syn值)?

1.     内核输出该信息的位置

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)

{

        struct tcp_extend_values tmp_ext;

        struct tcp_options_received tmp_opt;

        u8 *hash_location;

        struct request_sock *req;

        struct inet_request_sock *ireq;

        struct tcp_sock *tp = tcp_sk(sk);

        struct dst_entry *dst = NULL;

        __be32 saddr = ip_hdr(skb)->saddr;

        __be32 daddr = ip_hdr(skb)->daddr;

        __u32 isn = TCP_SKB_CB(skb)->when;

#ifdef CONFIG_SYN_COOKIES

        int want_cookie = 0;

#else

#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */

#endif

 

        /* Never answer to SYNs send to broadcast or multicast */

        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))

                goto drop;

 

        /* TW buckets are converted to open requests without

         * limitations, they conserve resources and peer is

         * evidently real one.

         */

        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {

                if (net_ratelimit())

                        syn_flood_warning(skb);

 

这里的isn可以不用关心(对于普通的SYN请求,这个值是0;按照上面代码中的注释,只有由TW bucket转换而来的请求才不受此限制)

看一下上面代码中的inet_csk_reqsk_queue_is_full函数(原文中标红):

/*
 * Tell whether the request (SYN) queue of listening socket @sk is full.
 * Thin wrapper: hands sk's icsk_accept_queue to reqsk_queue_is_full() and
 * returns its result unchanged (non-zero means full).
 */
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
        const struct request_sock_queue *accept_queue =
                &inet_csk(sk)->icsk_accept_queue;

        return reqsk_queue_is_full(accept_queue);
}

static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)

{                                             

        return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;

}

这段代码非常巧妙,通过右移操作来判断并发syn队列是否已满:qlen >> max_qlen_log的结果非0,当且仅当qlen >= 2^max_qlen_log。因此要想增大并发syn请求数,只需要增大max_qlen_log的值就可以了。

 

2.     如何增大max_qlen_log值

TCP传输控制块结构体

/** inet_connection_sock - INET connection oriented sock
 *
 * @icsk_inet:             Embedded inet_sock; has to be the first member
 * @icsk_accept_queue:     request_sock_queue: FIFO of established children,
 *                         plus the listen_opt table sized at listen() time
 * @icsk_bind_hash:        Bind node
 * @icsk_timeout:          Timeout
 * @icsk_retransmit_timer: Resend (no ack)
 * @icsk_delack_timer:     Presumably the delayed-ACK timer - TODO confirm
 * @icsk_rto:              Retransmit timeout
 * @icsk_pmtu_cookie:      Last pmtu seen by socket
 * @icsk_ca_ops:           Pluggable congestion control hook
 * @icsk_af_ops:           Operations which are AF_INET{4,6} specific
 * @icsk_sync_mss:         Callback taking (sk, pmtu); presumably re-derives
 *                         the MSS from a path MTU - TODO confirm
 * @icsk_ca_state:         Congestion control state
 * @icsk_retransmits:      Number of unrecovered [RTO] timeouts
 * @icsk_pending:          Scheduled timer event
 * @icsk_backoff:          Backoff
 * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
 * @icsk_probes_out:       unanswered 0 window probes
 * @icsk_ext_hdr_len:      Network protocol overhead (IP/IPv6 options)
 * @icsk_ack:              Delayed ACK control data
 * @icsk_mtup:             MTU probing control data
 * @icsk_ca_priv:          16 u32 words (ICSK_CA_PRIV_SIZE bytes); presumably
 *                         private state of the congestion control module -
 *                         TODO confirm
 * @icsk_user_timeout:     NOTE(review): not used in the code shown here;
 *                         meaning to be confirmed
 */
struct inet_connection_sock {
        /* inet_sock has to be the first member! */
        struct inet_sock          icsk_inet;
        struct request_sock_queue icsk_accept_queue;
        struct inet_bind_bucket   *icsk_bind_hash;
        unsigned long             icsk_timeout;
        struct timer_list         icsk_retransmit_timer;
        struct timer_list         icsk_delack_timer;
        __u32                     icsk_rto;
        __u32                     icsk_pmtu_cookie;
        const struct tcp_congestion_ops *icsk_ca_ops;
        const struct inet_connection_sock_af_ops *icsk_af_ops;
        unsigned int              (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
        __u8                      icsk_ca_state;
        __u8                      icsk_retransmits;
        __u8                      icsk_pending;
        __u8                      icsk_backoff;
        __u8                      icsk_syn_retries;
        __u8                      icsk_probes_out;
        __u16                     icsk_ext_hdr_len;
        struct {
                __u8              pending;       /* ACK is pending                         */
                __u8              quick;         /* Scheduled number of quick acks         */
                __u8              pingpong;      /* The session is interactive             */
                __u8              blocked;       /* Delayed ACK was blocked by socket lock */
                __u32             ato;           /* Predicted tick of soft clock           */
                unsigned long     timeout;       /* Currently scheduled timeout            */
                __u32             lrcvtime;      /* timestamp of last received data packet */
                __u16             last_seg_size; /* Size of last incoming segment          */
                __u16             rcv_mss;       /* MSS used for delayed ACK decisions     */
        } icsk_ack;
        struct {
                int               enabled;

                /* Range of MTUs to search */
                int               search_high;
                int               search_low;

                /* Information on the current probe. */
                int               probe_size;
        } icsk_mtup;
        u32                       icsk_ca_priv[16];
        u32                       icsk_user_timeout;
/* Size in bytes of the icsk_ca_priv[] array above (16 u32 words) */
#define ICSK_CA_PRIV_SIZE       (16 * sizeof(u32))
};

 

其中icsk_accept_queue(原文中以红色标注)用来保存正在建立连接和已建立连接但尚未被accept的传输控制块。

/** struct request_sock_queue - queue of request_socks
 *
 * @rskq_accept_head - FIFO head of established children
 * @rskq_accept_tail - FIFO tail of established children
 * @rskq_defer_accept - User waits for some data after accept()
 * @syn_wait_lock - serializer
 * @listen_opt - struct listen_sock allocated by reqsk_queue_alloc(); carries
 *               the SYN table together with the qlen / max_qlen_log values
 *               that reqsk_queue_is_full() tests
 *
 * %syn_wait_lock is necessary only to avoid proc interface having to grab the main
 * lock sock while browsing the listening hash (otherwise it's deadlock prone).
 *
 * This lock is acquired in read mode only from listening_get_next() seq_file
 * op and it's acquired in write mode _only_ from code that is actively
 * changing rskq_accept_head. All readers that are holding the master sock lock
 * don't need to grab this lock in read mode too, as rskq_accept_head writes
 * are always protected by the main sock lock.
 */
struct request_sock_queue {
        struct request_sock     *rskq_accept_head;
        struct request_sock     *rskq_accept_tail;
        rwlock_t                syn_wait_lock;
        u8                      rskq_defer_accept;
        /* 3 bytes hole, try to pack */
        struct listen_sock      *listen_opt;
};

其中rskq_accept_head和rskq_accept_tail指向的链表保存已完成三次握手的传输控制块;listen_opt中保存处于SYN_RECV状态的传输控制块。

这里我们重点看一下listen_opt

struct listen_sock {

        u8                      max_qlen_log;

        /* 3 bytes hole, try to use */

        int                     qlen;

        int                     qlen_young;

        int                     clock_hand;

        u32                     hash_rnd;

        u32                     nr_table_entries;

        struct request_sock     *syn_table[0];

};

这里syn_table的大小将会影响同时存在SYN_RECV状态的半连接的数量。

通过listen系统调用跟踪一下这个数值的设置:

/*
 *      listen(2) entry point.  The protocol's own listen handler does the
 *      real work; if it succeeds, the socket is ready for listening.
 *
 *      Returns the result of the security hook or of the protocol handler,
 *      or err as filled in by sockfd_lookup_light() when fd has no socket.
 */

SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
        int err, fput_needed;
        int somaxconn;
        struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);

        if (!sock)
                return err;

        /*
         * Cap backlog at net.core.somaxconn.  The unsigned comparison also
         * catches negative values.
         */
        somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
        if ((unsigned)backlog > somaxconn)
                backlog = somaxconn;

        err = security_socket_listen(sock, backlog);
        if (err == 0)
                err = sock->ops->listen(sock, backlog);

        fput_light(sock->file, fput_needed);
        return err;
}

/*

 *      Move a socket into listening state.

 */

int inet_listen(struct socket *sock, int backlog)

{

        struct sock *sk = sock->sk;

        unsigned char old_state;

        int err;

 

        lock_sock(sk);

 

        err = -EINVAL;

        if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)

                goto out;

 

        old_state = sk->sk_state;

        if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))

                goto out;

 

        /* Really, if the socket is already in listen state

         * we can only allow the backlog to be adjusted.

         */

        if (old_state != TCP_LISTEN) {

                err = inet_csk_listen_start(sk, backlog);

                if (err)

                        goto out;

        }

        sk->sk_max_ack_backlog = backlog;

        err = 0;

 

out:

        release_sock(sk);

        return err;

}

int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)

{

        struct inet_sock *inet = inet_sk(sk);

        struct inet_connection_sock *icsk = inet_csk(sk);

        int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

 

        if (rc != 0)

                return rc;

       

        sk->sk_max_ack_backlog = 0;

        sk->sk_ack_backlog = 0;

        inet_csk_delack_init(sk);

       

        /* There is race window here: we announce ourselves listening,

         * but this transition is still not validated by get_port().

         * It is OK, because this socket enters to hash table only

         * after validation is complete.

         */    

        sk->sk_state = TCP_LISTEN;

        if (!sk->sk_prot->get_port(sk, inet->inet_num)) {

                inet->inet_sport = htons(inet->inet_num);

 

                sk_dst_reset(sk);

                sk->sk_prot->hash(sk);

 

                return 0;

        }

 

        sk->sk_state = TCP_CLOSE;

        __reqsk_queue_destroy(&icsk->icsk_accept_queue);

        return -EADDRINUSE;

}

int reqsk_queue_alloc(struct request_sock_queue *queue,

                      unsigned int nr_table_entries)

{

        size_t lopt_size = sizeof(struct listen_sock);

        struct listen_sock *lopt;

 

/*这里可以看出listen_sock->max_qlen_log 为nr_table_entries和sysctl_max_syn_backlog的最小值加1

   并向上去整到2的次方后的log。

   比如: nr_table_entries = 128 sysctl_max_syn_backlog=20480,

               min(nr_table_entries, sysctl_max_syn_backlog)= 128

               roundup_pow_of_two(128+1)=256

               max_qlen_log=8

*/

        nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);

        nr_table_entries = max_t(u32, nr_table_entries, 8);

        nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);

        lopt_size += nr_table_entries * sizeof(struct request_sock *);

        if (lopt_size > PAGE_SIZE)

                lopt = vzalloc(lopt_size);

        else

                lopt = kzalloc(lopt_size, GFP_KERNEL);

        if (lopt == NULL)

                return -ENOMEM;

 

        for (lopt->max_qlen_log = 3;

             (1 << lopt->max_qlen_log) < nr_table_entries;

             lopt->max_qlen_log++);

 

        get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));

        rwlock_init(&queue->syn_wait_lock);

        queue->rskq_accept_head = NULL;

        lopt->nr_table_entries = nr_table_entries;

 

        write_lock_bh(&queue->syn_wait_lock);

        queue->listen_opt = lopt;

        write_unlock_bh(&queue->syn_wait_lock);

 

        return 0;

}

 

经过上面的分析,max_qlen_log由listen的backlog参数(先被net.core.somaxconn截断)与net.ipv4.tcp_max_syn_backlog两者中的较小值决定,因此要想增大并发syn值,需要同时调整如下三个参数:

 

(1)net.core.somaxconn

(2)net.ipv4.tcp_max_syn_backlog

(3)listen系统调用的backlog参数

 

 

看来listen系统调用的backlog参数不仅影响已完成三次握手、等待accept的最大连接数,还影响处于SYN_RECV状态的连接数。

阅读(7733) | 评论(0) | 转发(1) |
给主人留下些什么吧!~~