Qemu 使用 macvtap 桥接网络

创建 macvtap 接口,注意设置为 bridge 模式(默认为 vepa 模式),vepa 模式下创建在同一宿主机之上的多个虚机之间网络不通(除非外部交换机支持 hairpin 功能):

$ sudo ip link add link eno1 name macvtap1 type macvtap mode bridge
$ sudo ip link add link eno1 name macvtap2 type macvtap mode bridge
$ sudo ip link set dev macvtap1 up
$ sudo ip link set dev macvtap2 up

ip-link(8) — Linux manual page
https://man7.org/linux/man-pages/man8/ip-link.8.html

但是需要注意的是,即使是 bridge 模式也对虚机和该虚机所在宿主机之间的网络连通没有帮助(虽然虚机与外部的其它宿主机之间的网络是连通的),这是 macvtap 与 linux bridge 在用户体验上最大的差异。

This situation is actually not an error — it is the defined behavior of macvtap. Due to the way in which the host’s physical Ethernet is attached to the macvtap bridge, traffic into that bridge from the guests that is forwarded to the physical interface cannot be bounced back up to the host’s IP stack. Additionally, traffic from the host’s IP stack that is sent to the physical interface cannot be bounced back up to the macvtap bridge for forwarding to the guests.

Guest Can Reach Outside Network, but Cannot Reach Host when Using macvtap Interface
https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/virtualization_host_configuration_and_guest_installation_guide/app_macvtap

Guest and host cannot see each other using linux-kvm and macvtap
https://superuser.com/questions/349253/guest-and-host-cannot-see-each-other-using-linux-kvm-and-macvtap

从发送侧代码来看,macvlan/macvtap 的 vepa 和 private 没有区别:

// linux 5.19

// 收到 net_device 注册的消息,创建 tap 设备文件

macvtap_device_event
  snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
  case NETDEV_REGISTER:
    tap_get_minor(macvtap_major, &vlantap->tap);
    devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
    device_create(&macvtap_class, &dev->dev, devt, dev, "%s", tap_name);

// 打开 tap 设备文件

tap_open
  tap = dev_get_by_tap_file(imajor(inode), iminor(inode));
  q = (struct tap_queue *)sk_alloc()
  tap_set_queue(tap, file, q);
    rcu_assign_pointer(q->tap, tap);
    file->private_data = q;

// 通过 tap 设备文件发送报文

tap_write_iter
  tap_get_user
    tap = rcu_dereference(q->tap);
    skb->dev = tap->dev; // macvlan net_device
    dev_queue_xmit
      __dev_queue_xmit
        dev_hard_start_xmit
          xmit_one
            netdev_start_xmit
              __netdev_start_xmit
                ops->ndo_start_xmit(skb, dev);
                  macvlan_start_xmit
                    macvlan_queue_xmit
                      if (vlan->mode == MACVLAN_MODE_BRIDGE) {
                        dev_forward_skb(vlan->lowerdev, skb);
                        return NET_XMIT_SUCCESS;
                      }
                      skb->dev = vlan->lowerdev;
                      dev_queue_xmit_accel(skb, NULL)
                        __dev_queue_xmit

主要的差异体现在接收侧广播/多播报文的处理上(从 macvlan/macvtap 接口发出来的广播/多播报文通过交换机的 hairpin 功能转发回来之后,在 private 模式下仅原始发包的 macvlan/macvtap 接口能接收到该广播/多播报文):

// linux 5.19

// 处理 netlink 消息,创建 macvtap 接口,并为底层物理口注册收包函数

macvtap_newlink
  macvlan_common_newlink
    lowerdev = __dev_get_by_index(nla_get_u32(tb[IFLA_LINK]));
    macvlan_port_create(lowerdev); // only once for all macvlan ifaces
      netdev_rx_handler_register(dev /* i.e., lowerdev */, macvlan_handle_frame);
    register_netdevice(dev);

macvlan_handle_frame
  if (is_multicast_ether_addr(eth->h_dest)) {
    if (macvlan_forward_source(skb, port, eth->h_source)) {
      kfree_skb(skb);
      return RX_HANDLER_CONSUMED;
    }
    src = macvlan_hash_lookup(port, eth->h_source);
    if (src && src->mode != MACVLAN_MODE_VEPA &&
        src->mode != MACVLAN_MODE_BRIDGE) {
      /* forward to original port. */
      vlan = src;
      ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?:
            __netif_rx(skb);
      handle_res = RX_HANDLER_CONSUMED;
      goto out;
    }

    hash = mc_hash(NULL, eth->h_dest);
    if (test_bit(hash, port->mc_filter))
      macvlan_broadcast_enqueue(port, src, skb);

    return RX_HANDLER_PASS;
  }

// 入队的广播/多播报文在此按来源分别处理:外部报文、经 hairpin 交换机转发回来的 vepa 模式接口发出的报文、bridge 模式接口发出的报文
// (private 模式接口发出的广播/多播报文经 hairpin 交换机转发回来后,已在 macvlan_handle_frame 中直接交给原始发包接口,不会走到这里)

macvlan_process_broadcast
  if (!src)
    /* frame comes from an external address */
    macvlan_broadcast(skb, port, NULL,
          MACVLAN_MODE_PRIVATE |
          MACVLAN_MODE_VEPA    |
          MACVLAN_MODE_PASSTHRU|
          MACVLAN_MODE_BRIDGE);
  else if (src->mode == MACVLAN_MODE_VEPA)
    /* flood to everyone except source */
    macvlan_broadcast(skb, port, src->dev,
          MACVLAN_MODE_VEPA |
          MACVLAN_MODE_BRIDGE);
  else
    /*
     * flood only to VEPA ports, bridge ports
     * already saw the frame on the way out.
     */
    macvlan_broadcast(skb, port, src->dev,
          MACVLAN_MODE_VEPA);

macvlan: implement bridge, VEPA and private mode
https://github.com/torvalds/linux/commit/618e1b7482f7a8a4c6c6e8ccbe140e4c331df4e9

macvlan: receive multicast with local address
https://github.com/torvalds/linux/commit/729e72a10930ef765c11a5a35031ba47f18221c4

在相同的物理接口上创建 macvlan 或 macvtap 接口并配置 IP 就可以实现虚机和该虚机所在宿主机之间的通信:

$ sudo ip link add link eno1 name macvlan1 type macvlan mode bridge
$ sudo ip link set dev macvlan1 up
$ sudo ip a add dev macvlan1 192.168.0.5/24

One possible method of eliminating this problem would be to create a separate macvtap interface for host use, and give it the IP configuration previously on the physical ethernet - in this way, the host would be an equal peer attached to the macvtap bridge, and thus guest and host could communicate directly.

Some notes on macvlan/macvtap
https://backreference.org/2014/03/20/some-notes-on-macvlanmacvtap/index.html

NETWORKING WITH KVM, MACVLAN AND MACVTAP
https://www.furorteutonicus.eu/2013/08/04/enabling-host-guest-networking-with-kvm-macvlan-and-macvtap/

使用 macvtap 的话必须要直接使用 root 用户(例如先通过 sudo -i 切换到 root shell),而不仅仅是在命令前加 sudo,因为 <> 重定向是由当前 shell 在启动 sudo 之前执行的,而以读写方式打开 tap 设备需要 root 权限:

$ ll /dev/tap*
crw------- 1 root root 503, 1  8月 20 09:31 /dev/tap8
crw------- 1 root root 503, 2  8月 20 09:31 /dev/tap9

How does cat <> file work?
https://unix.stackexchange.com/questions/164391/how-does-cat-file-work

Opening File Descriptors for Reading and Writing
https://www.gnu.org/savannah-checkouts/gnu/bash/manual/bash.html#Opening-File-Descriptors-for-Reading-and-Writing

qemu 命令行的 -netdev 后端指定了 fd= 参数的话,不能再指定 script= 等参数:

qemu-system-x86_64: -netdev tap,id=if2,script=no,downscript=no,fd=3: ifname=, script=, downscript=, vnet_hdr=, helper=, queues=, fds=, and vhostfds= are invalid with fd=

fd= 必须是 -netdev 的最后一个参数,否则后续的参数会被 <> 重定向操作符当作文件名的一部分。

qemu 命令行如下:

# ./qemu-system-x86_64 -M q35 -cpu host \
-enable-kvm -smp 2 -m 8192 -nodefaults \
-drive if=pflash,format=raw,file=/usr/share/OVMF/OVMF_CODE_4M.fd \
-serial mon:stdio \
-chardev socket,id=qmp,path=mon.qmp,server=on,wait=off \
-netdev user,hostfwd=tcp::2222-:22,id=nic1 \
-device virtio-net-pci,bus=pcie.0,addr=0x1,netdev=nic1 \
-device virtio-scsi-pci,id=scsi0,bus=pcie.0,addr=0x2 \
-drive if=none,file=/home/runsisi/working/rocky.qcow2,id=hd1,format=qcow2 \
-device scsi-hd,bus=scsi0.0,drive=hd1,bootindex=1 \
-device virtio-gpu-pci,bus=pcie.0,addr=0x3 \
-display gtk \
-netdev tap,id=nic2,fd=3 3<>/dev/tap$(cat /sys/class/net/macvtap1/ifindex) \
-device virtio-net-pci,bus=pcie.0,addr=0x4,netdev=nic2,mac=$(cat /sys/class/net/macvtap1/address)
# ./qemu-system-x86_64 -M q35 -cpu host \
-enable-kvm -smp 2 -m 8192 -nodefaults \
-drive if=pflash,format=raw,file=/usr/share/OVMF/OVMF_CODE_4M.fd \
-serial mon:stdio \
-chardev socket,id=qmp,path=mon.qmp,server=on,wait=off \
-netdev user,hostfwd=tcp::3333-:22,id=nic1 \
-device virtio-net-pci,bus=pcie.0,addr=0x1,netdev=nic1 \
-device virtio-scsi-pci,id=scsi0,bus=pcie.0,addr=0x2 \
-drive if=none,file=/home/runsisi/working/kernel/rocky2.qcow2,id=hd1,format=qcow2 \
-device scsi-hd,bus=scsi0.0,drive=hd1,bootindex=1 \
-device virtio-gpu-pci,bus=pcie.0,addr=0x3 \
-display gtk \
-netdev tap,id=nic2,fd=3 3<>/dev/tap$(cat /sys/class/net/macvtap2/ifindex) \
-device virtio-net-pci,bus=pcie.0,addr=0x4,netdev=nic2,mac=$(cat /sys/class/net/macvtap2/address)

Get which /dev/tapX got associated to a macvtap without guessing
https://unix.stackexchange.com/questions/561201/get-which-dev-tapx-got-associated-to-a-macvtap-without-guessing

注意使用 macvtap 时需要为虚机中的网卡设备显式指定 mac 地址,且与 macvtap 设备的地址保持一致(也可以进入虚机内部手动修改为一致),否则虚机网络无法使用(即使是同一宿主机内部的虚机之间)。


最后修改于 2022-08-27