runsisi's

technical notes

Linux socket 实现

2020-01-03 runsisi#tcp/ip

tcp/ip init

// net/ipv4/af_inet.c

inet_init
  proto_register(&tcp_prot, 1)
  proto_register(&udp_prot, 1)
  proto_register(&raw_prot, 1)
  proto_register(&ping_prot, 1)

  sock_register(&inet_family_ops)

  // register in inet_protos[MAX_INET_PROTOS]
  // used for L4 packet receiving, i.e., looked up in ip_rcv_finish and ip_local_deliver_finish
  inet_add_protocol(&icmp_protocol, IPPROTO_ICMP)
  inet_add_protocol(&udp_protocol, IPPROTO_UDP)
  inet_add_protocol(&tcp_protocol, IPPROTO_TCP)

  for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
    // register in inetsw[SOCK_MAX]
    // used for socket creation, i.e., inet_create
    inet_register_protosw(q)

  arp_init()
  ip_init()
  tcp_v4_init()
    // init tcp_hashinfo.listening_hash hash table
    inet_hashinfo_init(&tcp_hashinfo)
  tcp_init()
    tcp_hashinfo.ehash = alloc_large_system_hash("TCP established")
    tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind")
  udp_init()
    udp_table_init(&udp_table, "UDP")
  udplite4_register()
  ping_init()
  icmp_init()

  dev_add_pack(&ip_packet_type)


// net/ipv4/ip_input.c

ip_rcv
  NF_HOOK(NFPROTO_IPV4, ip_rcv_finish)

ip_rcv_finish
  ipprot = inet_protos[protocol]
  ipprot->early_demux(skb) // tcp_v4_early_demux / udp_v4_early_demux

  ip_route_input_noref
    ip_route_input_slow(skb, daddr, saddr, tos, dev)
      fib_lookup(net, &fl4, &res)
      if (res.type == RTN_LOCAL) {
        rth = rt_dst_alloc(flags | RTCF_LOCAL, res.type)
          rt = dst_alloc
          rt->dst.output = ip_output
          rt->dst.input = ip_local_deliver
        skb_dst_set(skb, &rth->dst)
      }
  ip_rcv_options

  dst_input(skb)
    skb_dst(skb)->input(skb) // ip_local_deliver

ip_local_deliver
  NF_HOOK(NFPROTO_IPV4, ip_local_deliver_finish)

ip_local_deliver_finish
  ipprot = inet_protos[protocol]
  ipprot->handler(skb) // tcp_v4_rcv / udp_rcv / icmp_rcv


// net/ipv4/udp.c

udp_rcv
  __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP)
    sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable)
    udp_queue_rcv_skb(sk, skb)
      __udp_queue_rcv_skb(sk, skb)
        __udp_enqueue_schedule_skb(sk, skb)
          struct sk_buff_head *list = &sk->sk_receive_queue;
          __skb_queue_tail(list, skb)
          sk->sk_data_ready(sk, 0)

socket syscall

// net/socket.c

socket()

// net/ipv4/af_inet.c

/*
 * Table of the built-in IPv4 protocol switch entries.
 *
 * Each entry ties a socket type and IP protocol number to a struct proto
 * (.prot) and a socket-level ops table (.ops). inet_init() walks this array
 * and registers every entry into inetsw[] through inet_register_protosw();
 * inet_create() later searches inetsw[sock->type] and copies the matching
 * entry's .ops into sock->ops and its .prot into the new sk.
 *
 * NOTE(review): the exact meaning of the INET_PROTOSW_* flags is not shown
 * in these notes -- confirm against the kernel headers.
 */
static struct inet_protosw inetsw_array[] =
{
	/* TCP: SOCK_STREAM sockets, served by tcp_prot / inet_stream_ops. */
	{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},

	/* UDP: SOCK_DGRAM sockets, served by udp_prot / inet_dgram_ops. */
	{
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.flags =      INET_PROTOSW_PERMANENT,
       },

       /* ICMP over SOCK_DGRAM: served by ping_prot, shares inet_dgram_ops with UDP. */
       {
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_ICMP,
		.prot =       &ping_prot,
		.ops =        &inet_dgram_ops,
		.flags =      INET_PROTOSW_REUSE,
       },

       /* Raw sockets: SOCK_RAW with a wildcard protocol, served by raw_prot / inet_sockraw_ops. */
       {
	       .type =       SOCK_RAW,
	       .protocol =   IPPROTO_IP,	/* wild card */
	       .prot =       &raw_prot,
	       .ops =        &inet_sockraw_ops,
	       .flags =      INET_PROTOSW_REUSE,
       }
};

inet_create(struct socket *sock, int protocol)
  list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
  }
  sock->ops = answer->ops; // called by socket syscalls
  answer_prot = answer->prot;
  sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot)
    sk = sk_prot_alloc
    sk->sk_prot = sk->sk_prot_creator = prot; // called by sock->ops
  sock_init_data(sock, sk)
    sk->sk_rcvbuf		=	sysctl_rmem_default;
    sk->sk_sndbuf		=	sysctl_wmem_default;
    sk->sk_state		=	TCP_CLOSE;
    sk_set_socket(sk, sock);
      sk->sk_socket = sock;
    sk->sk_type	=	sock->type;
    sk->sk_wq	=	sock->wq;
    sock->sk	=	sk;
    sk->sk_state_change	=	sock_def_wakeup;
    sk->sk_data_ready	=	sock_def_readable;
    sk->sk_write_space	=	sock_def_write_space;
    sk->sk_error_report	=	sock_def_error_report;
    sk->sk_destruct		=	sock_def_destruct;
  // for TCP, sk->sk_write_space will be overridden to sk_stream_write_space,
  // sk->sk_sndbuf and sk->sk_rcvbuf will be overridden to sysctl_tcp_wmem[1] and sysctl_tcp_rmem[1]
  sk->sk_prot->init(sk) // tcp_v4_init_sock / udp_init_sock

bind()

// net/ipv4/af_inet.c

inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
  // check bind source address
  chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr)
  err = -EADDRNOTAVAIL;
  if (!net->ipv4_sysctl_ip_nonlocal_bind &&
  	    !(inet->freebind || inet->transparent) &&
  	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
  	    chk_addr_ret != RTN_LOCAL &&
  	    chk_addr_ret != RTN_MULTICAST &&
  	    chk_addr_ret != RTN_BROADCAST)
    goto out;
  err = -EINVAL;
  // check active socket or double bind
  if (sk->sk_state != TCP_CLOSE || inet->inet_num)
    goto out_release_sock;
  inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
  // check bind source port
  snum = ntohs(addr->sin_port)
  // sk->sk_prot->get_port, i.e., inet_csk_get_port / udp_v4_get_port, will set inet_sk(sk)->inet_num
  // to the user provided snum or a found snum
  if ((snum || !inet->bind_address_no_port) && sk->sk_prot->get_port(sk, snum)) {
    err = -EADDRINUSE;
    goto out_release_sock;
  }
  if (inet->inet_rcv_saddr) // user bind to non-zero address
    sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
  if (snum) // user bind to non-zero port
    sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
  inet->inet_sport = htons(inet->inet_num);
  inet->inet_daddr = 0;
  inet->inet_dport = 0;

// net/ipv4/inet_connection_sock.c

inet_csk_get_port

// net/ipv4/udp.c

udp_v4_get_port
  udp_lib_get_port

listen()

TCP only.

// net/ipv4/af_inet.c

inet_listen(struct socket *sock, int backlog)
  if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
    goto out;
  inet_csk_listen_start(sk, backlog)
    reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries)
    sk->sk_state = TCP_LISTEN;
    // check port bind
    if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
      inet->inet_sport = htons(inet->inet_num);

      sk_dst_reset(sk);
      sk->sk_prot->hash(sk); // inet_hash
      return 0;
    }
  sk->sk_max_ack_backlog = backlog
// net/ipv4/inet_hashtables.c

inet_hash(struct sock *sk)
  if (sk->sk_state != TCP_CLOSE) {
    __inet_hash(sk);
  }

__inet_hash(struct sock *sk)
  if (sk->sk_state != TCP_LISTEN) {
    __inet_hash_nolisten(sk, NULL);
    return;
  }
  ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
  __sk_nulls_add_node_rcu(sk, &ilb->head);

__inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
  struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
  struct hlist_nulls_head *list;
  struct inet_ehash_bucket *head;

  sk->sk_hash = inet_sk_ehashfn(sk);
  head = inet_ehash_bucket(hashinfo, sk->sk_hash);
  list = &head->chain;
  __sk_nulls_add_node_rcu(sk, list);
  if (tw) {
    twrefcnt = inet_twsk_unhash(tw);
  }

accept()

TCP only.

// net/ipv4/af_inet.c

inet_accept(struct socket *sock, struct socket *newsock, int flags)
  struct sock *sk1 = sock->sk;
  // inet_csk_accept
  struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);

  sock_graft(sk2, newsock)
    sk->sk_wq = newsock->wq;
    newsock->sk = sk;
    sk_set_socket(sk, newsock);
      sk->sk_socket = newsock

  newsock->state = SS_CONNECTED
// net/ipv4/inet_connection_sock.c

inet_csk_accept(struct sock *sk, int flags, int *err)
  struct request_sock_queue *queue = &icsk->icsk_accept_queue;
  struct sock *newsk;
  struct request_sock *req;

  if (reqsk_queue_empty(queue)) {
    long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

    /* If this is a non blocking socket don't sleep */
    error = -EAGAIN;
    if (!timeo)
        goto out_err;

    error = inet_csk_wait_for_connect(sk, timeo);
    if (error)
        goto out_err;
  }
  req = reqsk_queue_remove(queue);
  newsk = req->sk;
  return newsk;
// net/ipv4/tcp_ipv4.c

tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
  if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
    tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
    return 0;
  }

  if (sk->sk_state == TCP_LISTEN) {
    struct sock *nsk = tcp_v4_hnd_req(sk, skb);
    if (!nsk)
        goto discard;

    if (nsk != sk) {
        if (tcp_child_process(sk, nsk, skb)) {
            rsk = nsk;
            goto reset;
        }
        return 0;
    }
  }

  tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)

tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
  struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
  						       iph->saddr, iph->daddr)
  return tcp_check_req(sk, skb, req, prev, false)
// net/ipv4/tcp_minisocks.c

tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   struct request_sock **prev,
			   bool fastopen)
  // tcp_v4_syn_recv_sock
  child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL)
  inet_csk_reqsk_queue_add(sk, req, child)
  return child;

connect()

TCP connect

// net/ipv4/tcp_ipv4.c

tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
  orig_sport = inet->inet_sport;
  orig_dport = usin->sin_port;
  fl4 = &inet->cork.fl.u.ip4;
  // setup output route and validate/select source address
  rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
  			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
  			      IPPROTO_TCP,
  			      orig_sport, orig_dport, sk);
  daddr = fl4->daddr
  if (!inet->inet_saddr)
    inet->inet_saddr = fl4->saddr;
  inet->inet_rcv_saddr = inet->inet_saddr;
  inet->inet_dport = usin->sin_port;
  inet->inet_daddr = daddr;

  tcp_set_state(sk, TCP_SYN_SENT);

  // select source port to bind
  inet_hash_connect(&tcp_death_row, sk)

  // setup route cache
  rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
  			       inet->inet_sport, inet->inet_dport, sk)

  // setup wnd, various seq. no. and send the initial SYN packet
  tcp_connect(sk)
// net/ipv4/inet_hashtables.c

inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk)
  __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
  			__inet_check_established, __inet_hash_nolisten)

UDP connect

// net/ipv4/af_inet.c

inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags)
  // user did not bind explicitly, find a source port to bind
  if (!inet_sk(sk)->inet_num && inet_autobind(sk))
    return -EAGAIN;
  // ip4_datagram_connect
  sk->sk_prot->connect(sk, uaddr, addr_len)

inet_autobind(struct sock *sk)
  inet = inet_sk(sk);
  if (!inet->inet_num) {
    // inet_csk_get_port / udp_v4_get_port, select a port to bind
    if (sk->sk_prot->get_port(sk, 0)) {
        release_sock(sk);
        return -EAGAIN;
    }
    inet->inet_sport = htons(inet->inet_num);
  }

ip4_datagram_connect
  __ip4_datagram_connect

// net/ipv4/datagram.c

__ip4_datagram_connect
  saddr = inet->inet_saddr
  // call __ip_route_output_key_hash to setup output route and validate/select source address
  rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
  			      RT_CONN_FLAGS(sk), oif,
  			      sk->sk_protocol,
  			      inet->inet_sport, usin->sin_port, sk)
  if (!inet->inet_saddr)
    inet->inet_saddr = fl4->saddr;	/* Update source address */
  if (!inet->inet_rcv_saddr) {
    inet->inet_rcv_saddr = fl4->saddr;
    if (sk->sk_prot->rehash)
        sk->sk_prot->rehash(sk);
  }
  inet->inet_daddr = fl4->daddr;
  inet->inet_dport = usin->sin_port;
  sk->sk_state = TCP_ESTABLISHED;
  sk_dst_set(sk, &rt->dst);

sendmsg()

// net/ipv4/af_inet.c

inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size)
  // this check is UDP only, since a connected or accepted TCP socket always has a source port, see tcp_v4_connect
  if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
  	    inet_autobind(sk))
    return -EAGAIN;
  // tcp_sendmsg / udp_sendmsg
  sk->sk_prot->sendmsg(iocb, sk, msg, size)

recvmsg()

inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags)
  // tcp_recvmsg / udp_recvmsg
  sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
  				   flags & ~MSG_DONTWAIT, &addr_len)

参考资料

The C10K problem

http://www.kegel.com/c10k.html

socket - Linux socket interface

http://man7.org/linux/man-pages/man7/socket.7.html

ip - Linux IPv4 protocol implementation

http://man7.org/linux/man-pages/man7/ip.7.html

tcp - TCP protocol

http://man7.org/linux/man-pages/man7/tcp.7.html

udp - User Datagram Protocol for IPv4

http://man7.org/linux/man-pages/man7/udp.7.html

linux tcp三次握手-SYN发送

http://www.cnhalo.net/2016/06/13/linux-tcp-v4-connect/

linux TCP Prequeue队列和backlog队列

http://www.cnhalo.net/2016/07/13/linux-tcp-prequeue-backlog/

linux UDP实现

http://www.cnhalo.net/2016/06/13/linux-udp/

linux tcp GSO和TSO实现

http://www.cnhalo.net/2016/09/13/linux-tcp-gso-tso/

TCP Small Queue(TSQ)

https://blog.csdn.net/u011130578/article/details/44645643

TCP TSQ控制

https://blog.csdn.net/sinat_20184565/article/details/89341370

linux Tcp Small Queue(TSQ)实现

http://www.cnhalo.net/2016/09/13/linux-tcp-small-queue/

TCP states TCP_CA_Recovery vs TCP_CA_Loss

https://stackoverflow.com/questions/48269542/tcp-states-tcp-ca-recovery-vs-tcp-ca-loss

All you need to know about SYN floods

https://blog.dubbelboer.com/2012/04/09/syn-cookies.html

Enabling IP forwarding at kernel compile time

https://unix.stackexchange.com/questions/123981/enabling-ip-forwarding-at-kernel-compile-time

Optimizing TCP: Nagle’s Algorithm and Beyond

https://assets.extrahop.com/whitepapers/TCP-Optimization-Guide-by-ExtraHop.pdf

The Caveats of TCP_NODELAY

https://eklitzke.org/the-caveats-of-tcp-nodelay

TCP Windows and Window Scaling

https://packetlife.net/blog/2010/aug/4/tcp-windows-and-window-scaling/

what is the linux kernel parameter tcp_low_latency?

http://www.linuxvox.com/post/what-is-the-linux-kernel-parameter-tcp_low_latency/

Low Latency Performance Tuning for Red Hat Enterprise Linux 7

http://people.redhat.com/jmario/docs/201501-perf-brief-low-latency-tuning-rhel7-v2.0.pdf