tcp/ip init
// net/ipv4/af_inet.c
inet_init
proto_register(&tcp_prot, 1)
proto_register(&udp_prot, 1)
proto_register(&raw_prot, 1)
proto_register(&ping_prot, 1)
sock_register(&inet_family_ops)
// register in inet_protos[MAX_INET_PROTOS]
// used for L4 packet receiving, i.e., ip_rcv_finish and ip_local_deliver_finish
inet_add_protocol(&icmp_protocol, IPPROTO_ICMP)
inet_add_protocol(&udp_protocol, IPPROTO_UDP)
inet_add_protocol(&tcp_protocol, IPPROTO_TCP)
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
// register in inetsw[SOCK_MAX]
// used for socket creation, i.e., inet_create
inet_register_protosw(q)
arp_init()
ip_init()
tcp_v4_init()
// init tcp_hashinfo.listening_hash hash table
inet_hashinfo_init(&tcp_hashinfo)
tcp_init()
tcp_hashinfo.ehash = alloc_large_system_hash("TCP established")
tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind")
udp_init()
udp_table_init(&udp_table, "UDP")
udplite4_register()
ping_init()
icmp_init()
dev_add_pack(&ip_packet_type)
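For reference, a sketch of the two structures being registered above, roughly as they appear in net/ipv4/af_inet.c for the kernel era this trace follows (the exact field set varies by version): dev_add_pack() hooks IPv4 into the L2 receive path, and inet_add_protocol() fills inet_protos[] for the L4 dispatch.

/* L2 -> IPv4: frames with EtherType ETH_P_IP are handed to ip_rcv() */
static struct packet_type ip_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_IP),
    .func = ip_rcv,
};

/* IPv4 -> L4: stored in inet_protos[IPPROTO_TCP]; ip_rcv_finish() may call
 * ->early_demux and ip_local_deliver_finish() calls ->handler */
static const struct net_protocol tcp_protocol = {
    .early_demux = tcp_v4_early_demux,
    .handler     = tcp_v4_rcv,
    .err_handler = tcp_v4_err,
    .no_policy   = 1,
    .netns_ok    = 1,
};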
// net/ipv4/ip_input.c
ip_rcv
NF_HOOK(NFPROTO_IPV4, ip_rcv_finish)
ip_rcv_finish
ipprot = inet_protos[protocol]
ipprot->early_demux(skb) // tcp_v4_early_demux / udp_v4_early_demux
ip_route_input_noref
ip_route_input_slow(skb, daddr, saddr, tos, dev)
fib_lookup(net, &fl4, &res)
if (res.type == RTN_LOCAL) {
rth = rt_dst_alloc(flags | RTCF_LOCAL, res.type)
rt = dst_alloc
rt->dst.output = ip_output
rt->dst.input = ip_local_deliver
skb_dst_set(skb, &rth->dst)
}
ip_rcv_options
dst_input(skb)
skb_dst(skb)->input(skb) // ip_local_deliver
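dst_input() itself is a one-line inline from include/net/dst.h, so the indirect call lands on whatever ip_route_input_slow() installed: ip_local_deliver here, or ip_forward for traffic routed through this host.

static inline int dst_input(struct sk_buff *skb)
{
    return skb_dst(skb)->input(skb);
}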
ip_local_deliver
NF_HOOK(NFPROTO_IPV4, ip_local_deliver_finish)
ip_local_deliver_finish
ipprot = inet_protos[protocol]
ipprot->handler(skb) // tcp_v4_rcv / udp_rcv / icmp_rcv
// net/ipv4/udp.c
udp_rcv
__udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP)
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable)
udp_queue_rcv_skb(sk, skb)
__udp_queue_rcv_skb(sk, skb)
__udp_enqueue_schedule_skb(sk, skb)
struct sk_buff_head *list = &sk->sk_receive_queue;
__skb_queue_tail(list, skb)
sk->sk_data_ready(sk, 0)
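Unless the protocol overrides it, sk->sk_data_ready points at sock_def_readable(), installed by sock_init_data() (see the socket section below). Roughly, for the older two-argument form this trace uses (net/core/sock.c; newer kernels drop the len argument), it wakes up readers blocked in recvmsg()/poll() and notifies SIGIO waiters:

static void sock_def_readable(struct sock *sk, int len)
{
    struct socket_wq *wq;

    rcu_read_lock();
    wq = rcu_dereference(sk->sk_wq);
    if (wq_has_sleeper(wq))
        wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
                                        POLLRDNORM | POLLRDBAND);
    sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);   /* async (SIGIO) notification */
    rcu_read_unlock();
}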
socket syscall
// net/socket.c
socket()
// net/ipv4/af_inet.c
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_REUSE,
},
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
}
};
inet_create(struct socket *sock, int protocol)
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
}
sock->ops = answer->ops; // called by socket syscalls
answer_prot = answer->prot;
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot)
sk = sk_prot_alloc
sk->sk_prot = sk->sk_prot_creator = prot; // called by sock->ops
sock_init_data(sock, sk)
sk->sk_rcvbuf = sysctl_rmem_default;
sk->sk_sndbuf = sysctl_wmem_default;
sk->sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
sk->sk_socket = sock;
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
// for TCP, sk->sk_write_space will be overridden to sk_stream_write_space,
// sk->sk_sndbuf and sk->sk_rcvbuf will be overridden to sysctl_tcp_wmem[1] and sysctl_tcp_rmem[1]
sk->sk_prot->init(sk) // tcp_v4_init_sock / udp_init_sock
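Seen from user space, the (type, protocol) pair passed to socket() is exactly what inet_create() matches against inetsw[]; a minimal sketch using the standard sockets API (error handling omitted):

#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
    /* SOCK_STREAM + IPPROTO_TCP (or 0) -> tcp_prot / inet_stream_ops */
    int tcp_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    /* SOCK_DGRAM  + IPPROTO_UDP (or 0) -> udp_prot / inet_dgram_ops */
    int udp_fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
    /* SOCK_DGRAM  + IPPROTO_ICMP -> ping_prot, allowed only inside
       net.ipv4.ping_group_range */
    int ping_fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);
    /* SOCK_RAW + any protocol -> raw_prot, requires CAP_NET_RAW */
    int raw_fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

    close(tcp_fd); close(udp_fd); close(ping_fd); close(raw_fd);
    return 0;
}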
bind()
// net/ipv4/af_inet.c
inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
// check bind source address
chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr)
err = -EADDRNOTAVAIL;
if (!net->ipv4.sysctl_ip_nonlocal_bind &&
!(inet->freebind || inet->transparent) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
chk_addr_ret != RTN_BROADCAST)
goto out;
err = -EINVAL;
// check for an already-active socket or a double bind
if (sk->sk_state != TCP_CLOSE || inet->inet_num)
goto out_release_sock;
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
// check bind source port
snum = ntohs(addr->sin_port)
// sk->sk_prot->get_port, i.e., inet_csk_get_port / udp_v4_get_port, will set inet_sk(sk)->inet_num
// to the user provided snum or a found snum
if ((snum || !inet->bind_address_no_port) && sk->sk_prot->get_port(sk, snum)) {
err = -EADDRINUSE;
goto out_release_sock;
}
if (inet->inet_rcv_saddr) // user bind to non-zero address
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum) // user bind to non-zero port
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->inet_sport = htons(inet->inet_num);
inet->inet_daddr = 0;
inet->inet_dport = 0;
// net/ipv4/inet_connection_sock.c
inet_csk_get_port
// net/ipv4/udp.c
udp_v4_get_port
udp_lib_get_port
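A minimal user-space sketch of the two sk_userlocks cases above: binding a specific address sets SOCK_BINDADDR_LOCK, and a non-zero port makes get_port() claim that port and sets SOCK_BINDPORT_LOCK (error handling omitted):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = inet_addr("127.0.0.1"); /* non-zero -> SOCK_BINDADDR_LOCK */
    addr.sin_port = htons(8080);                   /* non-zero -> inet_csk_get_port(sk, 8080)
                                                      and SOCK_BINDPORT_LOCK */
    return bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ? 1 : 0;
}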
listen()
TCP only.
// net/ipv4/af_inet.c
inet_listen(struct socket *sock, int backlog)
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
inet_csk_listen_start(sk, backlog)
reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries)
sk->sk_state = TCP_LISTEN;
// check port bind
if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk);
sk->sk_prot->hash(sk); // inet_hash
return 0;
}
sk->sk_max_ack_backlog = backlog
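Note that before inet_listen() runs, the listen() syscall itself (net/socket.c) clamps the user-supplied backlog, so sk_max_ack_backlog never exceeds net.core.somaxconn; roughly:

/* SYSCALL_DEFINE2(listen, int, fd, int, backlog) -- sketch */
somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
if ((unsigned int)backlog > somaxconn)
    backlog = somaxconn;
err = sock->ops->listen(sock, backlog);   /* -> inet_listen() above */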
// net/ipv4/inet_hashtables.c
inet_hash(struct sock *sk)
if (sk->sk_state != TCP_CLOSE) {
__inet_hash(sk);
}
__inet_hash(struct sock *sk)
if (sk->sk_state != TCP_LISTEN) {
__inet_hash_nolisten(sk, NULL);
return;
}
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
__sk_nulls_add_node_rcu(sk, &ilb->head);
__inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
struct inet_ehash_bucket *head;
sk->sk_hash = inet_sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
list = &head->chain;
__sk_nulls_add_node_rcu(sk, list);
if (tw) {
twrefcnt = inet_twsk_unhash(tw);
}
accept()
TCP only.
// net/ipv4/af_inet.c
inet_accept(struct socket *sock, struct socket *newsock, int flags)
struct sock *sk1 = sock->sk;
// inet_csk_accept
struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
sock_graft(sk2, newsock)
sk->sk_wq = newsock->wq;
newsock->sk = sk;
sk_set_socket(sk, newsock);
sk->sk_socket = newsock
newsock->state = SS_CONNECTED
// net/ipv4/inet_connection_sock.c
inet_csk_accept(struct sock *sk, int flags, int *err)
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct sock *newsk;
struct request_sock *req;
if (reqsk_queue_empty(queue)) {
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
/* If this is a non blocking socket don't sleep */
error = -EAGAIN;
if (!timeo)
goto out_err;
error = inet_csk_wait_for_connect(sk, timeo);
if (error)
goto out_err;
}
req = reqsk_queue_remove(queue);
newsk = req->sk;
return newsk;
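The -EAGAIN branch above is what a non-blocking listener sees from user space; a minimal sketch (listen_fd is assumed to be a listening socket opened with SOCK_NONBLOCK):

#include <sys/socket.h>
#include <netinet/in.h>
#include <errno.h>

static int try_accept(int listen_fd)
{
    struct sockaddr_in peer;
    socklen_t len = sizeof(peer);
    int cfd = accept(listen_fd, (struct sockaddr *)&peer, &len);

    if (cfd < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
        /* reqsk_queue_empty() && timeo == 0 -> inet_csk_accept() returned
           -EAGAIN; poll/epoll for POLLIN and retry */
        return -1;
    }
    return cfd;   /* child socket taken off icsk_accept_queue */
}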
// net/ipv4/tcp_ipv4.c
tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
return 0;
}
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
}
tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)
tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
iph->saddr, iph->daddr)
return tcp_check_req(sk, skb, req, prev, false)
// net/ipv4/tcp_minisocks.c
tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev,
bool fastopen)
// tcp_v4_syn_recv_sock
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL)
inet_csk_reqsk_queue_add(sk, req, child)
return child;
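inet_csk_reqsk_queue_add() is what links the two halves of accept(): it parks the new child on the listener's accept queue, which is exactly where reqsk_queue_remove()/req->sk in inet_csk_accept() above picks it up. Roughly, for the older layout this trace follows (include/net/request_sock.h):

static inline void reqsk_queue_add(struct request_sock_queue *queue,
                                   struct request_sock *req,
                                   struct sock *parent,
                                   struct sock *child)
{
    req->sk = child;            /* later read back as newsk = req->sk */
    sk_acceptq_added(parent);   /* ++sk_ack_backlog on the listener */

    if (queue->rskq_accept_head == NULL)
        queue->rskq_accept_head = req;
    else
        queue->rskq_accept_tail->dl_next = req;

    queue->rskq_accept_tail = req;
    req->dl_next = NULL;
}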
connect()
TCP connect
// net/ipv4/tcp_ipv4.c
tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
orig_sport = inet->inet_sport;
orig_dport = usin->sin_port;
fl4 = &inet->cork.fl.u.ip4;
// setup output route and validate/select source address
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
orig_sport, orig_dport, sk);
daddr = fl4->daddr
if (!inet->inet_saddr)
inet->inet_saddr = fl4->saddr;
inet->inet_rcv_saddr = inet->inet_saddr;
inet->inet_dport = usin->sin_port;
inet->inet_daddr = daddr;
tcp_set_state(sk, TCP_SYN_SENT);
// select source port to bind
inet_hash_connect(&tcp_death_row, sk)
// setup route cache
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
inet->inet_sport, inet->inet_dport, sk)
// setup wnd, various seq. no. and send the initial SYN packet
tcp_connect(sk)
// net/ipv4/inet_hashtables.c
inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk)
__inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
__inet_check_established, __inet_hash_nolisten)
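From user space the whole active open is a single call; inet_stream_connect() (the ops->connect for SOCK_STREAM, not expanded here) invokes tcp_v4_connect() and then waits for the handshake, or returns -EINPROGRESS on a non-blocking socket. A minimal sketch (error handling omitted, documentation address used):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in dst;

    memset(&dst, 0, sizeof(dst));
    dst.sin_family = AF_INET;
    dst.sin_addr.s_addr = inet_addr("192.0.2.1");  /* example destination */
    dst.sin_port = htons(80);

    /* no bind() beforehand: tcp_v4_connect() takes the source address from
       the route and a source port from inet_hash_connect(), then sends SYN */
    return connect(fd, (struct sockaddr *)&dst, sizeof(dst)) ? 1 : 0;
}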
UDP connect
// net/ipv4/af_inet.c
inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags)
// user did not bind explicitly, find a source port to bind
if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
// ip4_datagram_connect
sk->sk_prot->connect(sk, uaddr, addr_len)
inet_autobind(struct sock *sk)
inet = inet_sk(sk);
if (!inet->inet_num) {
// inet_csk_get_port / udp_v4_get_port, select a port to bind
if (sk->sk_prot->get_port(sk, 0)) {
release_sock(sk);
return -EAGAIN;
}
inet->inet_sport = htons(inet->inet_num);
}
ip4_datagram_connect
__ip4_datagram_connect
// net/ipv4/datagram.c
__ip4_datagram_connect
saddr = inet->inet_saddr
// call __ip_route_output_key_hash to setup output route and validate/select source address
rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
RT_CONN_FLAGS(sk), oif,
sk->sk_protocol,
inet->inet_sport, usin->sin_port, sk)
if (!inet->inet_saddr)
inet->inet_saddr = fl4->saddr; /* Update source address */
if (!inet->inet_rcv_saddr) {
inet->inet_rcv_saddr = fl4->saddr;
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
sk_dst_set(sk, &rt->dst);
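So a UDP connect() sends nothing on the wire: it only records the peer, caches the route, and marks the socket TCP_ESTABLISHED, after which send()/recv() can omit the destination and ICMP errors get reported on this socket. A minimal user-space sketch (error handling omitted, documentation address used):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct sockaddr_in dst;

    memset(&dst, 0, sizeof(dst));
    dst.sin_family = AF_INET;
    dst.sin_addr.s_addr = inet_addr("192.0.2.1");  /* example destination */
    dst.sin_port = htons(53);

    connect(fd, (struct sockaddr *)&dst, sizeof(dst)); /* ip4_datagram_connect() */
    send(fd, "x", 1, 0);    /* no destination argument needed any more */
    close(fd);
    return 0;
}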
sendmsg()
// net/ipv4/af_inet.c
inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size)
// effectively a UDP-only autobind: tcp_prot sets .no_autobind = true, and a connected or accepted TCP socket already has a source port (see tcp_v4_connect)
if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
inet_autobind(sk))
return -EAGAIN;
// tcp_sendmsg / udp_sendmsg
sk->sk_prot->sendmsg(iocb, sk, msg, size)
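A user-space counterpart that ends up in udp_sendmsg/tcp_sendmsg through the path above; a minimal sketch of a scatter/gather send, where fd is assumed to be an already connected socket:

#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static ssize_t send_two_parts(int fd)
{
    char hdr[] = "hdr:", body[] = "body";
    struct iovec iov[2] = {
        { .iov_base = hdr,  .iov_len = sizeof(hdr) - 1 },
        { .iov_base = body, .iov_len = sizeof(body) - 1 },
    };
    struct msghdr msg;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = iov;
    msg.msg_iovlen = 2;

    return sendmsg(fd, &msg, 0);   /* -> inet_sendmsg -> sk_prot->sendmsg */
}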
recvmsg()
inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags)
// tcp_recvmsg / udp_recvmsg
sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len)
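And the matching receive side; MSG_DONTWAIT is what becomes the noblock argument of tcp_recvmsg/udp_recvmsg above. A minimal sketch:

#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static ssize_t recv_nonblocking(int fd, char *buf, size_t len)
{
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct msghdr msg;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;

    /* with MSG_DONTWAIT an empty receive queue gives -1/EAGAIN
       instead of blocking */
    return recvmsg(fd, &msg, MSG_DONTWAIT);
}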