runsisi's

technical notes

TCP socket 统计

2019-12-16 runsisi#tcp/ip

nstat

$ nstat -z
...
TcpExtTCPDirectCopyFromBacklog  0                  0.0
TcpExtTCPDirectCopyFromPrequeue 0                  0.0
TcpExtTCPPrequeueDropped        0                  0.0
TcpExtTCPHPHits                 10643              0.0
TcpExtTCPHPHitsToUser           0                  0.0
TcpExtTCPPureAcks               3307               0.0
TcpExtTCPHPAcks                 3905               0.0
...

不过需要注意的是,nstat 查询的统计是系统全局数据,并不与单个 socket 关联。这些计数器由 IP_INC_STATS/NET_INC_STATS/TCP_INC_STATS 等宏进行增减,其底层实现为 SNMP_INC_STATS 等宏。

ss

Ubuntu 18.04

$ ss -ntpioem
State                     Recv-Q                 Send-Q                                           Local Address:Port                                         Peer Address:Port
...
ESTAB                     0                      0                                                    127.0.0.1:36482                                           127.0.0.1:63342                 users:(("java",pid=80876,fd=427)) uid:1000 ino:8564787 sk:551 <->
         skmem:(r0,rb1061808,t0,tb2626560,f4096,w0,o0,bl0,d0) ts sack cubic wscale:7,7 rto:204 rtt:0.027/0.015 ato:40 mss:22400 pmtu:65535 rcvmss:2984 advmss:65483 cwnd:10 bytes_acked:428 bytes_received:2984 segs_out:4 segs_in:3 data_segs_out:1 data_segs_in:1 send 66370.4Mbps lastsnd:1452 lastrcv:996 lastack:996 delivery_rate 17920.0Mbps app_limited rcv_rtt:456 rcv_space:43690 rcv_ssthresh:174656 minrtt:0.01
...

CentOS 7.4

$ ss -ntpioem
State       Recv-Q Send-Q                                                  Local Address:Port                                                                 Peer Address:Port
...
ESTAB       0      0                                                           127.0.0.1:6791                                                                    127.0.0.1:58308               users:(("ceph-mon",pid=2352003,fd=37)) uid:1011 ino:72531843 sk:ffff881681a926c0 <->
         ts sack cubic wscale:7,7 rto:206 rtt:5.051/9.999 ato:40 mss:24256 rcvmss:536 advmss:65483 cwnd:10 ssthresh:22 bytes_acked:1743 bytes_received:1265 segs_out:11 segs_in:12 send 384.2Mbps lastsnd:2906 lastrcv:2906 lastack:2866 pacing_rate 768.2Mbps reordering:15 rcv_space:43690
...

其中关键字段与内核代码的对应关系如下:

skmem         // sock_diag_put_meminfo
wscale        tcpi_snd_wscale, tcpi_rcv_wscale    tp->rx_opt.snd_wscale, tp->rx_opt.rcv_wscale
rto           tcpi_rto/1000                       icsk->icsk_rto
backoff       tcpi_backoff                        icsk->icsk_backoff
rtt           tcpi_rtt/1000, tcpi_rttvar/1000     tp->srtt_us >> 3, tp->mdev_us >> 2
ato           tcpi_ato/1000                       icsk->icsk_ack.ato
cwnd          tcpi_snd_cwnd                       tp->snd_cwnd
ssthresh      tcpi_snd_ssthresh                   tp->snd_ssthresh
segs_out      tcpi_segs_out                       tp->segs_out
segs_in       tcpi_segs_in                        tp->segs_in
unacked       tcpi_unacked                        tp->packets_out
// tcpi_retrans differs greatly from tcpi_retransmits (i.e., icsk->icsk_retransmits), which can be obtained from /proc/net/tcp(6)
retrans       tcpi_retrans/tcpi_total_retrans     tp->retrans_out/tp->total_retrans
lost          tcpi_lost                           tp->lost_out
// ss: Add tcp_info fields data_segs_in/out
// https://github.com/shemminger/iproute2/commit/414aeec90f82d73614e4b931f04c28caabb21824
data_segs_out tcpi_data_segs_out                  tp->data_segs_out
data_segs_in  tcpi_data_segs_in                   tp->data_segs_in

显然,不同的内核版本会返回不同的信息,同时需要注意的是,每个 socket 显示的字段并不会完全一致,当计数器为 0 时,ss 不会显示该字段,同样的如果当前 socket 没有定时器生效,也不会显示定时器信息。

在定位单个 socket 链路的故障时,ss 的信息非常有用,同时它可以指定过滤条件查询满足指定条件的 socket,因此使用起来比较方便。ss 主要的信息来源为 tcp_diag.ko, udp_diag.ko, inet_diag.ko 等三个内核模块。当然,有时候由于这几个内核模块提供的信息也不是很全,可以自己编写内核模块进行打印,或者综合 /proc/net/tcp(6) 等各方面暴露的信息。

/proc/net/tcp(6)

在 Ubuntu 18.04 上注意加上 sudo 查看 /proc/net/tcp(6) 信息,否则 sk 地址字段会显示为 0:

$ sudo cat /proc/net/tcp
  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
   0: 00000000:0185 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 39082 1 ffff963682f13000 100 0 0 10 0
   1: 00000000:0386 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 40040 1 ffff963680f62800 100 0 0 10 0
   2: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000   130        0 33267 1 ffff963682f10000 100 0 0 10 0
...
$ sudo cat /proc/net/tcp6
  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
   0: 00000000000000000000000000000000:0185 00000000000000000000000000000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 39083 1 ffff963681abc400 100 0 0 10 0
   1: 00000000000000000000000000000000:0386 00000000000000000000000000000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 41191 1 ffff963671b55d80 100 0 0 10 0
   2: 00000000000000000000000000000000:006F 00000000000000000000000000000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 21200 1 ffff963681abd500 100 0 0 10 0
   3: 00000000000000000000000000000000:06B4 00000000000000000000000000000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 47138 1 ffff96367f8dcc80 100 0 0 10 0
   4: 00000000000000000000000000000000:0016 00000000000000000000000000000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 82652 1 ffff963671b55500 100 0 0 10 0
...

/proc/net/sockstat(6)

$ cat /proc/net/sockstat
sockets: used 4022
TCP: inuse 76 orphan 2 tw 19678 alloc 3141 mem 1677
UDP: inuse 27 mem 62
UDPLITE: inuse 0
RAW: inuse 1
FRAG: inuse 0 memory 0
$ cat /proc/net/sockstat6
TCP6: inuse 2807
UDP6: inuse 30
UDPLITE6: inuse 0
RAW6: inuse 1
FRAG6: inuse 0 memory 0

/proc/sys/net/netfilter/

$ cat /proc/sys/net/netfilter/nf_conntrack_count
155
$ cat /proc/sys/net/netfilter/nf_conntrack_max
262144

crash

使用 crash 调试工具可以直接分析内核数据结构,特别是在通过 ss 工具可打印 sock 实例内核地址的情况下。

Ubuntu 18.04

较新的 Linux 内核(参考 net: add real socket cookies)不再返回 sk 地址,而是返回一个 64 位整数(socket cookie),因此我们不能直接通过 ss 工具查看 sk 结构的内核地址(如下面的 sk:4f2)。

$ ss -ntpioe
State                     Recv-Q                 Send-Q                                           Local Address:Port                                         Peer Address:Port
ESTAB                     0                      0                                                192.168.137.3:33592                                          10.1.2.3:80                    users:(("chrome",pid=3183,fd=29)) timer:(keepalive,41sec,0) uid:1000 ino:2797310 sk:4f2 <->
         sack cubic wscale:9,7 rto:232 rtt:29.389/9.703 ato:40 mss:1460 pmtu:1500 rcvmss:1460 advmss:1460 cwnd:10 ssthresh:7 bytes_acked:891 bytes_received:4941 segs_out:18 segs_in:18 data_segs_out:4 data_segs_in:8 send 4.0Mbps lastsnd:349640 lastrcv:183912 lastack:3032 pacing_rate 4.8Mbps delivery_rate 794.7Kbps app_limited busy:104ms rcv_space:29200 rcv_ssthresh:43800 minrtt:26.646
...

因此,需要结合 sudo cat /proc/net/tcp(6) 命令查看对应的 sk 地址。

$ git clone https://git.launchpad.net/~ubuntu-kernel/ubuntu/+source/linux/+git/bionic --branch Ubuntu-4.15.0-72.81 --depth 1
Cloning into 'bionic'...
remote: Counting objects: 67170, done.
remote: Compressing objects: 100% (61668/61668), done.
remote: Total 67170 (delta 8845), reused 32978 (delta 4422)
Receiving objects: 100% (67170/67170), 178.85 MiB | 145.00 KiB/s, done.
Resolving deltas: 100% (8845/8845), done.
Note: checking out '48d6312566e04b7a713cc7c15ae7dcd37efcfa95'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

Checking out files: 100% (63347/63347), done.
# curl -o- http://ddebs.ubuntu.com/dbgsym-release-key.asc | apt-key add -
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2471  100  2471    0     0  38609      0 --:--:-- --:--:-- --:--:-- 38609
OK
# vi /etc/apt/sources.list
...
deb http://ddebs.ubuntu.com/ bionic main restricted universe multiverse
deb http://ddebs.ubuntu.com/ bionic-updates main restricted universe multiverse
deb http://ddebs.ubuntu.com/ bionic-proposed main restricted universe multiverse
# uname -r
4.15.0-72-generic
# apt install linux-image-4.15.0-72-generic-dbgsym
$ sudo crash /usr/lib/debug/boot/vmlinux-4.15.0-72-generic /proc/kcore

crash 7.2.1
Copyright (C) 2002-2017  Red Hat, Inc.
...
crash> struct tcp_sock.rcv_nxt,snd_una ffff96367f8e2000
  rcv_nxt = 2131774235
  snd_una = 2426409825

CentOS 7.x

$ sudo rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-Debug-7
$ rpm -q gpg-pubkey --qf '%{name}-%{version}-%{release}\t%{summary}\n'
...
gpg-pubkey-b6792c39-53c4fbdd    gpg(CentOS-7 Debug (CentOS-7 Debuginfo RPMS) <security@centos.org>)
$ uname -r
3.10.0-693.17.1.el7.x86_64
$ wget http://debuginfo.centos.org/7/x86_64/kernel-debuginfo-3.10.0-693.17.1.el7.x86_64.rpm
$ wget http://debuginfo.centos.org/7/x86_64/kernel-debuginfo-common-x86_64-3.10.0-693.17.1.el7.x86_64.rpm
$ sudo rpm -Uvh kernel-debuginfo-3.10.0-693.17.1.el7.x86_64.rpm kernel-debuginfo-common-x86_64-3.10.0-693.17.1.el7.x86_64.rpm
Preparing...                          ################################# [100%]
Updating / installing...
   1:kernel-debuginfo-common-x86_64-3.################################# [ 50%]
   2:kernel-debuginfo-3.10.0-693.17.1.################################# [100%]
$ ss -ntpioe
...
ESTAB       0      0                                                                 ::1:9168                                                                          ::1:50044               uid:992 ino:36785264 sk:ffff882ffabaf700 <->
         ts sack cubic wscale:7,7 rto:201 rtt:0.053/0.003 ato:40 mss:65464 rcvmss:536 advmss:65464 cwnd:10 bytes_acked:807332766 bytes_received:6030045 segs_out:49631 segs_in:49631 send 98813.6Mbps lastsnd:12532 lastrcv:12774 lastack:12531 rcv_rtt:15121.4 rcv_space:43983
$ sudo crash

crash 7.2.0-6.el7
Copyright (C) 2002-2017  Red Hat, Inc.
...
crash> struct tcp_sock.rcv_nxt,snd_una ffff882ffabaf700
  rcv_nxt = 488064403
  snd_una = 346012635
$ wget http://vault.centos.org/7.4.1708/updates/Source/SPackages/kernel-3.10.0-693.17.1.el7.src.rpm

代码实现

ss && tcp_diag.ko && inet_diag.ko

// iproute2/misc/ss.c

/*
 * Emit the " timer:(name,remaining,retrans)" field for socket @s.
 *
 * Nothing is printed when s->timer is 0. Timer codes above 4 are clamped
 * to 5 ("unknown"); note that the clamped value is stored back into
 * s->timer, so the caller's struct is modified.
 */
static void tcp_timer_print(struct tcpstat *s)
{
	/* Indexed by timer code; slot 5 is the catch-all for unknown codes. */
	static const char * const timer_names[] = {
		"off",
		"on",
		"keepalive",
		"timewait",
		"persist",
		"unknown"
	};

	/* No timer armed: the field is omitted entirely. */
	if (!s->timer)
		return;

	if (s->timer > 4)
		s->timer = 5;

	out(" timer:(%s,%s,%d)",
	    timer_names[s->timer],
	    print_ms_timer(s->timeout),
	    s->retrans);
}
// linux/net/ipv4/inet_diag.c

/*
 * Excerpt of inet_sk_diag_fill(): only the part that fills the timer fields
 * r->idiag_timer, r->idiag_retrans and r->idiag_expires is shown; the
 * parameter list and the surrounding code are elided ("...").
 *
 * idiag_timer codes assigned below, with the labels that the tmr_name table
 * in ss's tcp_timer_print() gives them:
 *   1 ("on")        - icsk_pending is RETRANS, EARLY_RETRANS or LOSS_PROBE
 *   4 ("persist")   - icsk_pending is PROBE0
 *   2 ("keepalive") - sk->sk_timer is pending
 *   0 ("off")       - none of the above
 */
int inet_sk_diag_fill(...)
{
...
	/* Retransmit-class timers: report the retransmit count. */
	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		r->idiag_timer = 1;
		r->idiag_retrans = icsk->icsk_retransmits;
		/* Time left until icsk_timeout, passed through jiffies_to_msecs(). */
		r->idiag_expires =
			jiffies_to_msecs(icsk->icsk_timeout - jiffies);
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		/* Probe0 timer: the "retrans" slot carries icsk_probes_out instead. */
		r->idiag_timer = 4;
		r->idiag_retrans = icsk->icsk_probes_out;
		r->idiag_expires =
			jiffies_to_msecs(icsk->icsk_timeout - jiffies);
	} else if (timer_pending(&sk->sk_timer)) {
		/* sk_timer pending: expiry is taken from sk_timer, not icsk_timeout. */
		r->idiag_timer = 2;
		r->idiag_retrans = icsk->icsk_probes_out;
		r->idiag_expires =
			jiffies_to_msecs(sk->sk_timer.expires - jiffies);
	} else {
		/* No timer: idiag_retrans is not assigned in this branch. */
		r->idiag_timer = 0;
		r->idiag_expires = 0;
	}
...
}
// linux/net/core/sock_diag.c

/*
 * Collect the memory counters of @sk into a u32 array indexed by the
 * SK_MEMINFO_* constants and hand the whole array to nla_put() on @skb
 * under attribute type @attrtype. Returns whatever nla_put() returns.
 *
 * The quoted letters are the keys ss prints inside "skmem:(...)".
 */
int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
{
  u32 meminfo[SK_MEMINFO_VARS];

  /* "r" / "rb" */
  meminfo[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
  meminfo[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;

  /* "t" / "tb" */
  meminfo[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
  meminfo[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;

  /* "f" / "w" / "o" */
  meminfo[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
  meminfo[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
  meminfo[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);

  /* "bl" / "d" */
  meminfo[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
  meminfo[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);

  return nla_put(skb, attrtype, sizeof(meminfo), meminfo);
}
// linux/net/ipv4/tcp.c

/*
 * Excerpt of tcp_get_info(): fills *info from the state of socket @sk.
 * Only the first few assignments are shown; the remainder of the body is
 * elided ("...").
 */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	/* now, start and rate are not used in the lines shown here. */
	u32 now = tcp_time_stamp;
	unsigned int start;
	u32 rate;

	/* Start from an all-zero struct so fields not set below read as 0. */
	memset(info, 0, sizeof(*info));

	info->tcpi_state = sk->sk_state;
	info->tcpi_ca_state = icsk->icsk_ca_state;
	/* tcpi_retransmits comes from icsk_retransmits, not from tp->retrans_out. */
	info->tcpi_retransmits = icsk->icsk_retransmits;
	info->tcpi_probes = icsk->icsk_probes_out;
	info->tcpi_backoff = icsk->icsk_backoff;
...
}

/proc/net/tcp(6)

// linux/net/ipv4/tcp_ipv4.c

/*
 * seq_file show routine for the IPv4 TCP table (the /proc/net/tcp listing).
 *
 * For the start token it prints the column header, left-justified and padded
 * to TMPSZ - 1 characters. For every other record it dispatches on the
 * iterator state to the matching row formatter, then pads the row out to
 * TMPSZ - 1 characters using the length the formatter stored in len.
 * Always returns 0.
 */
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *iter;
	struct sock *sock = v;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		return 0;
	}

	iter = seq->private;

	switch (iter->state) {
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(iter->syn_wait_sk, v, seq, iter->num, iter->uid, &len);
		break;
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		/* TIME_WAIT entries have their own formatter. */
		if (sock->sk_state != TCP_TIME_WAIT)
			get_tcp4_sock(v, seq, iter->num, &len);
		else
			get_timewait4_sock(v, seq, iter->num, &len);
		break;
	}

	/* Trailing blanks plus newline so the row reaches the fixed width. */
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
	return 0;
}

// linux/net/ipv6/tcp_ipv6.c

/*
 * seq_file show routine for the IPv6 TCP table (the /proc/net/tcp6 listing).
 *
 * Prints the column header for the start token; otherwise dispatches on the
 * iterator state to the matching row formatter. Unlike the IPv4 variant
 * there is no fixed-width padding step here. Always returns 0.
 */
static int tcp6_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *iter;
	struct sock *sock = v;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "  sl  "
			 "local_address                         "
			 "remote_address                        "
			 "st tx_queue rx_queue tr tm->when retrnsmt"
			 "   uid  timeout inode\n");
		return 0;
	}

	iter = seq->private;

	switch (iter->state) {
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq6(seq, iter->syn_wait_sk, v, iter->num, iter->uid);
		break;
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		/* TIME_WAIT entries have their own formatter. */
		if (sock->sk_state != TCP_TIME_WAIT)
			get_tcp6_sock(seq, v, iter->num);
		else
			get_timewait6_sock(seq, v, iter->num);
		break;
	}

	return 0;
}

参考资料

/proc/net/tcp(6)

https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt

Inspecting Internal TCP State on Linux

https://blog.janestreet.com/inspecting-internal-tcp-state-on-linux/

ss command: Display Linux TCP / UDP Network/Socket Information

https://www.cyberciti.biz/tips/linux-investigate-sockets-network-connections.html

Detecting network errors and their impact on services

https://www.dynatrace.com/news/blog/detecting-network-errors-impact-on-services/

crash - Analyze Linux crash dump data or a live system

http://man7.org/linux/man-pages/man8/crash.8.html

problem with crash utility in ubuntu 16.10

https://github.com/crash-utility/crash/issues/9

Crash-utility: The problems when running SuSE 12 on VirtualBox

https://www.redhat.com/archives/crash-utility/2015-November/msg00035.html

seq_file_howto

https://lwn.net/Articles/23707/

Passive monitoring of sockets on Linux

http://kristrev.github.io/2013/07/26/passive-monitoring-of-sockets-on-linux

Ubuntu SourceCode

https://wiki.ubuntu.com/Kernel/SourceCode