runsisi's

technical notes

Linux CPU iowait

2020-01-15 runsisi#perf

通过 top 可以查看当前系统的负载情况:

$ top
top - 14:41:06 up 20:13,  2 users,  load average: 0.17, 0.33, 0.34
Tasks: 321 total,   2 running, 254 sleeping,   0 stopped,   0 zombie
%Cpu(s):  3.4 us,  0.7 sy,  0.0 ni, 95.7 id,  0.0 wa,  0.0 hi,  0.1 si,  0.0 st
KiB Mem :  8061020 total,  1123632 free,  5821184 used,  1116204 buff/cache
KiB Swap:  7999484 total,  5669116 free,  2330368 used.  1473740 avail Mem

其中 CPU 的使用率是通过读取 /proc/stat 文件得到的:

$ cat /proc/stat
cpu  992874 2752 214740 27719676 6784 0 25370 0 0 0
cpu0 263752 1456 46932 6919207 4147 0 9295 0 0 0
cpu1 255925 440 46530 6933504 922 0 6889 0 0 0
cpu2 234852 418 72098 6922256 906 0 3690 0 0 0
cpu3 238344 437 49178 6944707 807 0 5494 0 0 0
...

这里我们关注的是 wa 字段的含义,top 的 man 手册页的描述如下:

wa, IO-wait : time waiting for I/O completion

很多人以为这个时间属于 CPU 非 idle 状态的一部分,对 Linux 进程调度不了解的人尤其容易误解上面 man 手册页的解释。显然,OS 不可能浪费宝贵的 CPU 资源去等待 IO 完成:当某个进程需要等待 IO 完成时,该进程会被置成休眠状态(TASK_UNINTERRUPTIBLE),CPU 资源则被切换出去用于执行其它进程。

那么,CPU 的 iowait 时间该怎么理解呢,它到底属不属于 idle 状态的一部分呢?通过阅读 Linux 内核的代码,我们可以非常清晰地得到结论:

iowait 是 idle 的一种特殊形态,计算 CPU 使用率时需要排除 iowait 的统计。

源码分析

当 IO 发给块层驱动处理时,进程等待 IO 完成,并进入休眠状态:

// block/blk-exec.c

blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head)
  DECLARE_COMPLETION_ONSTACK(wait);
  // Insert a fully prepared request at the back of the I/O scheduler queue
  // for execution.  Don't wait for completion.
  rq->end_io_data = &wait;
  blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq)
  // loop until completed
  while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));

blk_end_sync_rq(struct request *rq, int error)
  struct completion *waiting = rq->end_io_data;
  rq->end_io_data = NULL;
  // complete completion defined in blk_execute_rq
  complete(waiting);
    x->done++;
    // wake up the task waiting on the wait queue embedded in the completion
    __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL, NULL);

注意,在等待 IO 完成的这段时间内,进程运行所在 CPU 的 run queue 的 nr_iowait 计数器会加一(atomic_inc(&rq->nr_iowait)),等待结束后再减一(atomic_dec(&rq->nr_iowait)):

// kernel/sched/core.c

wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
  wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE)
    __wait_for_common(x, io_schedule_timeout, timeout, state)
      do_wait_for_common(x, action, timeout, state)
        DECLARE_WAITQUEUE(wait, current);
        // the process waiting on the wait queue embedded in the completion will be
        // woken by blk_end_sync_rq once the io has completed
        __add_wait_queue_tail_exclusive(&x->wait, &wait);
        do {
          // io_schedule_timeout: put the current process to sleep and wait
          timeout = action(timeout); // return value indicates remaining time in jiffies, 0 if timer expired
        } while (!x->done && timeout);
        __remove_wait_queue(&x->wait, &wait);

io_schedule_timeout(long timeout)
  rq = raw_rq();
  atomic_inc(&rq->nr_iowait);
  // sleep until timeout
  schedule_timeout(timeout);
  current->in_iowait = old_iowait;
  atomic_dec(&rq->nr_iowait);

内核 CPU 使用时间的统计受内核选项 NO_HZ 的影响(CONFIG_NO_HZ_COMMON),有两种不同的实现,但原理是一致的:

// kernel/sched/cputime.c

account_idle_time
  u64 *cpustat = kcpustat_this_cpu->cpustat;
  struct rq *rq = this_rq();

  if (atomic_read(&rq->nr_iowait) > 0)
    cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
  else
    cpustat[CPUTIME_IDLE] += (__force u64) cputime;

// fs/proc/stat.c

show_stat(struct seq_file *p, void *v)

get_iowait_time(int cpu)
  u64 iowait, iowait_time = -1ULL;
  if (cpu_online(cpu))
    iowait_time = get_cpu_iowait_time_us(cpu, NULL);

  // CONFIG_NO_HZ_COMMON not configured
  if (iowait_time == -1ULL)
    /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
    iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
  else
    iowait = usecs_to_cputime64(iowait_time);

// kernel/time/tick-sched.c

get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
  // CONFIG_NO_HZ_COMMON configured
  if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
    // keep it up to date
    ktime_t delta = ktime_sub(now, ts->idle_entrytime);
    iowait = ktime_add(ts->iowait_sleeptime, delta);
  } else {
    iowait = ts->iowait_sleeptime;
  }

update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
  ktime_t delta;
  if (ts->idle_active) {
    delta = ktime_sub(now, ts->idle_entrytime);
    if (nr_iowait_cpu(cpu) > 0)
      ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
    else
      ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
    ts->idle_entrytime = now;
  }

从上面的代码片段可知,当 rq->nr_iowait 非零时(即当前 CPU 的 run queue 上有进程正在等待 IO 完成),CPU 的 idle 时间会被统计为 iowait,因此我们说 iowait 是 idle 的一种特殊形态。

参考资料

Understanding Linux Process States

https://access.redhat.com/sites/default/files/attachments/processstates_20120831.pdf

Linux Kernel: Process Scheduling

https://medium.com/hungys-blog/linux-kernel-process-scheduling-8ce05939fabd

理解 %IOWAIT

http://linuxperf.com/?p=33

Relevance of High %iowait in Server Performance

http://www.ee.pw.edu.pl/~pileckip/aix/iowait.htm

The precise meaning of I/O wait time in Linux

http://veithen.io/2013/11/18/iowait-linux.html

Linux’s %iowait statistic

https://utcc.utoronto.ca/~cks/space/blog/linux/LinuxIowait

NO_HZ: Reducing Scheduling-Clock Ticks

https://www.kernel.org/doc/Documentation/timers/NO_HZ.txt