Linux Kernel vhost-net and virtio-net: A Code Walkthrough

Scenario

A QEMU/KVM virtual machine runs on the host. The VM's NIC is a virtio-net device, and on the host the virtio-net backend is vhost-net.

 

Code analysis: how a packet reaches the virtual machine

First, look at how the vhost-net module registers itself. It uses the misc (miscellaneous character) device registration API provided by the Linux kernel, which anyone who has done Linux kernel development will recognize:

static  struct  miscdevice vhost_net_misc = {
     .minor = VHOST_NET_MINOR,
     .name =  "vhost-net" ,
     .fops = &vhost_net_fops,
};
 
static  int  vhost_net_init( void )
{
     if  (experimental_zcopytx)
         vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
     return  misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);
 
static  void  vhost_net_exit( void )
{
     misc_deregister(&vhost_net_misc);
}
module_exit(vhost_net_exit);
 
MODULE_VERSION( "0.0.1" );
MODULE_LICENSE( "GPL v2" );
MODULE_AUTHOR( "Michael S. Tsirkin" );
MODULE_DESCRIPTION( "Host kernel accelerator for virtio net" );
MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
MODULE_ALIAS( "devname:vhost-net" );

Here vhost_net_fops describes the user-space interface supported by the character device, which is /dev/vhost-net:

static  const  struct  file_operations vhost_net_fops = {
     .owner          = THIS_MODULE,
     .release        = vhost_net_release,
     .unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
     .compat_ioctl   = vhost_net_compat_ioctl,
#endif
     .open           = vhost_net_open,
     .llseek     = noop_llseek,
};
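To make the user-space side concrete, here is a minimal sketch, not QEMU's actual code, of how a process drives this interface; the helper name vhost_attach_tap and the tap_fd argument are made up for illustration. A real backend such as QEMU must also set up the guest memory table and the vrings (VHOST_SET_MEM_TABLE, VHOST_SET_VRING_*) before attaching the backend; those steps are omitted here.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

int vhost_attach_tap(int tap_fd)
{
    /* open() lands in vhost_net_open() shown below */
    int vhost_fd = open("/dev/vhost-net", O_RDWR);
    if (vhost_fd < 0) {
        perror("open /dev/vhost-net");
        return -1;
    }

    /* .unlocked_ioctl == vhost_net_ioctl(): bind the device to this process */
    if (ioctl(vhost_fd, VHOST_SET_OWNER) < 0) {
        perror("VHOST_SET_OWNER");
        close(vhost_fd);
        return -1;
    }

    /* ... guest memory table and vring setup would have to go here ... */

    /* attach the tap socket as the backend of virtqueue 0 (the RX queue) */
    struct vhost_vring_file backend = { .index = 0, .fd = tap_fd };
    if (ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend) < 0) {
        perror("VHOST_NET_SET_BACKEND");
        close(vhost_fd);
        return -1;
    }
    return vhost_fd;
}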

When user space opens the device with the open() system call, vhost_net_open runs; it mainly performs the initialization for this device instance:

static  int  vhost_net_open( struct  inode *inode,  struct  file *f)
{
     struct  vhost_net *n = kmalloc( sizeof  *n, GFP_KERNEL);
     struct  vhost_dev *dev;
     struct  vhost_virtqueue **vqs;
     int  r, i;
 
     if  (!n)
         return  -ENOMEM;
     vqs = kmalloc(VHOST_NET_VQ_MAX *  sizeof (*vqs), GFP_KERNEL);
     if  (!vqs) {
         kfree(n);
         return  -ENOMEM;
     }
 
     dev = &n->dev;
     vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
     vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
     n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
     n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
     for  (i = 0; i < VHOST_NET_VQ_MAX; i++) {
         n->vqs[i].ubufs = NULL;
         n->vqs[i].ubuf_info = NULL;
         n->vqs[i].upend_idx = 0;
         n->vqs[i].done_idx = 0;
         n->vqs[i].vhost_hlen = 0;
         n->vqs[i].sock_hlen = 0;
     }
     r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
     if  (r < 0) {
         kfree(n);
         kfree(vqs);
         return  r;
     }
 
     vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
     vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
 
     f->private_data = n;
 
     return  0;
}

From the code above we can see how the vhost-net module's core data structures relate to one another.
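To make those relationships concrete, here are abridged definitions of the key structures, trimmed down from drivers/vhost/net.c and drivers/vhost/vhost.h of roughly this kernel generation (exact members vary between versions):

struct vhost_net {
    struct vhost_dev dev;                              /* the generic vhost device */
    struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];  /* [0] = RX, [1] = TX */
    struct vhost_poll poll[VHOST_NET_VQ_MAX];          /* polls the backend socket */
    /* ... */
};

struct vhost_net_virtqueue {
    struct vhost_virtqueue vq;        /* the queue shared with the guest */
    size_t vhost_hlen;
    size_t sock_hlen;
    /* ... zero-copy bookkeeping: ubufs, ubuf_info, upend_idx, done_idx ... */
};

struct vhost_virtqueue {
    struct vhost_dev *dev;
    struct vring_desc __user *desc;   /* the guest-visible vring */
    struct vring_avail __user *avail;
    struct vring_used __user *used;
    struct file *kick;                /* eventfd: guest -> host notification */
    struct eventfd_ctx *call_ctx;     /* eventfd: host -> guest interrupt */
    vhost_work_fn_t handle_kick;      /* handle_tx_kick / handle_rx_kick */
    void *private_data;               /* the backend tun/macvtap socket */
    /* ... */
};

struct vhost_dev {
    struct vhost_virtqueue **vqs;
    int nvqs;
    struct task_struct *worker;       /* the "vhost-<pid>" kernel thread */
    /* ... */
};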

 

To receive the tap device's packets, the vhost-net module attaches the device's tun socket as the virtqueue backend (done in vhost_net_set_backend, reached via the VHOST_NET_SET_BACKEND ioctl):

static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
     struct socket *sock;
     struct vhost_virtqueue *vq;
     int r;

     /* ... mutex locking, index validation and vq lookup elided ... */

     sock = get_socket(fd);
     if (IS_ERR(sock)) {
         r = PTR_ERR(sock);
         goto err_vq;
     }

     /* ... if the socket changed: stop polling the old one ... */
     vq->private_data = sock;
     /* ... re-init the vq, start polling the new socket, release the old one ... */
}
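get_socket() is how the fd passed in by user space becomes a struct socket. Abridged from drivers/vhost/net.c and drivers/net/tun.c of the same era (error paths and the raw-socket branch are trimmed):

static struct socket *get_tap_socket(int fd)
{
    struct file *file = fget(fd);
    struct socket *sock;

    if (!file)
        return ERR_PTR(-EBADF);
    sock = tun_get_socket(file);      /* exported by drivers/net/tun.c */
    if (!IS_ERR(sock))
        return sock;
    sock = macvtap_get_socket(file);  /* same idea for macvtap */
    if (IS_ERR(sock))
        fput(file);
    return sock;
}

/* drivers/net/tun.c */
struct socket *tun_get_socket(struct file *file)
{
    struct tun_file *tfile;

    if (file->f_op != &tun_fops)
        return ERR_PTR(-EINVAL);
    tfile = file->private_data;
    if (!tfile)
        return ERR_PTR(-EBADFD);
    return &tfile->socket;            /* tfile->socket.ops == &tun_socket_ops */
}

The socket returned for a tap fd is embedded in the tun_file, and its ops were set to tun_socket_ops when the tap device was opened.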

The tun socket's receive and send functions are:

static  const  struct  proto_ops tun_socket_ops = {
     .sendmsg = tun_sendmsg,
     .recvmsg = tun_recvmsg,
     .release = tun_release,
};
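How does vhost-net find out that the tap device has queued a packet? When the backend is attached, vhost_poll_start() registers the poll entry created in vhost_net_open() on the tun socket's wait queue. When the tap driver enqueues an skb it wakes that wait queue, and the callback below (abridged from drivers/vhost/vhost.c of this era) queues the associated work item, handle_rx_net(), which simply calls handle_rx(), onto the per-device vhost worker thread:

static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
                             void *key)
{
    struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

    if (!((unsigned long)key & poll->mask))
        return 0;                 /* not the event (POLLIN/POLLOUT) we asked for */

    vhost_poll_queue(poll);
    return 0;
}

void vhost_poll_queue(struct vhost_poll *poll)
{
    /* queue poll->work (e.g. handle_rx_net) on dev->worker, the
     * "vhost-<pid>" kernel thread created when the owner was set */
    vhost_work_queue(poll->dev, &poll->work);
}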

  

When the tap device has a packet ready, vhost-net ends up calling handle_rx:

static  void  handle_rx( struct  vhost_net *net)
{
     struct  vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
     struct  vhost_virtqueue *vq = &nvq->vq;
     unsigned uninitialized_var(in), log;
     struct  vhost_log *vq_log;
     struct  msghdr msg = {
         .msg_name = NULL,
         .msg_namelen = 0,
         .msg_control = NULL,  /* FIXME: get and handle RX aux data. */
         .msg_controllen = 0,
         .msg_iov = vq->iov,
         .msg_flags = MSG_DONTWAIT,
     };
     struct  virtio_net_hdr_mrg_rxbuf hdr = {
         .hdr.flags = 0,
         .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
     };
     size_t  total_len = 0;
     int  err, mergeable;
     s16 headcount;
     size_t  vhost_hlen, sock_hlen;
     size_t  vhost_len, sock_len;
     struct  socket *sock;
 
     mutex_lock(&vq->mutex);
     sock = vq->private_data;
     if  (!sock)
         goto  out;
     vhost_disable_notify(&net->dev, vq);
 
     vhost_hlen = nvq->vhost_hlen;
     sock_hlen = nvq->sock_hlen;
 
     vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
         vq->log : NULL;
     mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
 
     while  ((sock_len = peek_head_len(sock->sk))) {
         sock_len += sock_hlen;
         vhost_len = sock_len + vhost_hlen;
         headcount = get_rx_bufs(vq, vq->heads, vhost_len,
                     &in, vq_log, &log,
                     likely(mergeable) ? UIO_MAXIOV : 1);
         /* On error, stop handling until the next kick. */
         if  (unlikely(headcount < 0))
             break ;
         /* On overrun, truncate and discard */
         if  (unlikely(headcount > UIO_MAXIOV)) {
             msg.msg_iovlen = 1;
             err = sock->ops->recvmsg(NULL, sock, &msg,
                          1, MSG_DONTWAIT | MSG_TRUNC);
             pr_debug( "Discarded rx packet: len %zd\n" , sock_len);
             continue ;
         }
         /* OK, now we need to know about added descriptors. */
         if  (!headcount) {
             if  (unlikely(vhost_enable_notify(&net->dev, vq))) {
                 /* They have slipped one in as we were
                  * doing that: check again. */
                 vhost_disable_notify(&net->dev, vq);
                 continue ;
             }
             /* Nothing new?  Wait for eventfd to tell us
              * they refilled. */
             break ;
         }
         /* We don't need to be notified again. */
         if  (unlikely((vhost_hlen)))
             /* Skip header. TODO: support TSO. */
             move_iovec_hdr(vq->iov, nvq->hdr, vhost_hlen, in);
         else
             /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
              * needed because recvmsg can modify msg_iov. */
             copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in);
         msg.msg_iovlen = in;
         err = sock->ops->recvmsg(NULL, sock, &msg,
                      sock_len, MSG_DONTWAIT | MSG_TRUNC);
         /* Userspace might have consumed the packet meanwhile:
          * it's not supposed to do this usually, but might be hard
          * to prevent. Discard data we got (if any) and keep going. */
         if  (unlikely(err != sock_len)) {
             pr_debug( "Discarded rx packet: "
                  " len %d, expected %zd\n" , err, sock_len);
             vhost_discard_vq_desc(vq, headcount);
             continue ;
         }
         if  (unlikely(vhost_hlen) &&
             memcpy_toiovecend(nvq->hdr, (unsigned  char  *)&hdr, 0,
                       vhost_hlen)) {
             vq_err(vq,  "Unable to write vnet_hdr at addr %p\n" ,
                    vq->iov->iov_base);
             break ;
         }
         /* TODO: Should check and handle checksum. */
 
         hdr.num_buffers = cpu_to_vhost16(vq, headcount);
         if  (likely(mergeable) &&
             memcpy_toiovecend(nvq->hdr, ( void  *)&hdr.num_buffers,
                       offsetof(typeof(hdr), num_buffers),
                       sizeof  hdr.num_buffers)) {
             vq_err(vq,  "Failed num_buffers write" );
             vhost_discard_vq_desc(vq, headcount);
             break ;
         }
         vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
                         headcount);
         if  (unlikely(vq_log))
              vhost_log_write(vq, vq_log, log, vhost_len);
         total_len += vhost_len;
         if  (unlikely(total_len >= VHOST_NET_WEIGHT)) {
             vhost_poll_queue(&vq->poll);
             break ;
         }
     }
out:
     mutex_unlock(&vq->mutex);
}

From the code above we can see that sock->ops->recvmsg resolves to tun_recvmsg in the tun socket ops; it copies the skb received on the tap device into the buffers described by the virtqueue. The guest's virtio-net receive path is then woken up by an interrupt injected via QEMU/KVM. Note that vhost-net and virtio-net operate on the same, shared RX/TX virtqueues.
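The "interrupt" mentioned above travels through an eventfd. vhost_add_used_and_signal_n() ends in vhost_signal(); the call eventfd it signals was registered by QEMU with KVM as an irqfd, so the signal is injected into the guest as a virtio interrupt. On the guest side, the RX virtqueue's callback schedules NAPI, which is what gets virtnet_poll() below running. Both snippets are abridged from drivers/vhost/vhost.c and drivers/net/virtio_net.c of this era:

/* host side */
void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
    /* Signal the Guest, telling it we used something up. */
    if (vq->call_ctx && vhost_notify(dev, vq))
        eventfd_signal(vq->call_ctx, 1);   /* -> KVM irqfd -> guest interrupt */
}

/* guest side: RX virtqueue callback */
static void skb_recv_done(struct virtqueue *rvq)
{
    struct virtnet_info *vi = rvq->vdev->priv;
    struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

    /* Schedule NAPI, suppress further interrupts if successful. */
    if (napi_schedule_prep(&rq->napi)) {
        virtqueue_disable_cb(rvq);
        __napi_schedule(&rq->napi);
    }
}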

static  int  virtnet_poll( struct  napi_struct *napi,  int  budget)
{
     struct  receive_queue *rq =
         container_of(napi,  struct  receive_queue, napi);
     struct  virtnet_info *vi = rq->vq->vdev->priv;
     void  *buf;
     unsigned  int  r, len, received = 0;
 
again:
     while  (received < budget &&
            (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
         receive_buf(vi, rq, buf, len);
         --rq->num;
         received++;
     }
 
     if  (rq->num < rq->max / 2) {
         if  (!try_fill_recv(vi, rq, GFP_ATOMIC))
             schedule_delayed_work(&vi->refill, 0);
     }
 
     /* Out of packets? */
     if  (received < budget) {
         r = virtqueue_enable_cb_prepare(rq->vq);
         napi_complete(napi);
         if  (unlikely(virtqueue_poll(rq->vq, r)) &&
             napi_schedule_prep(napi)) {
             virtqueue_disable_cb(rq->vq);
             __napi_schedule(napi);
             goto  again;
         }
     }
 
     return  received;
}

  

receive_buf in turn calls netif_receive_skb, the standard entry point into the Linux network stack. At this point the packet has travelled from the tap device, through vhost-net, and finally into the virtual machine.
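For reference, the tail of receive_buf() (drivers/net/virtio_net.c, heavily abridged; the mergeable-buffer assembly and statistics updates are omitted) shows that hand-off into the regular stack:

    skb->protocol = eth_type_trans(skb, dev);

    pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
             ntohs(skb->protocol), skb->len, skb->pkt_type);

    netif_receive_skb(skb);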

 

Packet transmission from the virtual machine

When the VM sends a packet, it first traverses the guest's Linux network stack. The stack ultimately calls the NIC driver's xmit function; for a virtio-net NIC that is start_xmit:

static  netdev_tx_t start_xmit( struct  sk_buff *skb,  struct  net_device *dev)
{
     struct  virtnet_info *vi = netdev_priv(dev);
     int  qnum = skb_get_queue_mapping(skb);
     struct  send_queue *sq = &vi->sq[qnum];
     int  err;
     struct  netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
     bool  kick = !skb->xmit_more;
 
     /* Free up any pending old buffers before queueing new ones. */
     free_old_xmit_skbs(sq);
 
     /* Try to transmit */
     err = xmit_skb(sq, skb);
 
     /* This should not happen! */
     if  (unlikely(err)) {
         dev->stats.tx_fifo_errors++;
         if  (net_ratelimit())
             dev_warn(&dev->dev,
                  "Unexpected TXQ (%d) queue failure: %d\n" , qnum, err);
         dev->stats.tx_dropped++;
         kfree_skb(skb);
         return  NETDEV_TX_OK;
     }
 
     /* Don't wait up for transmitted skbs to be freed. */
     skb_orphan(skb);
     nf_reset(skb);
 
     /* Apparently nice girls don't return TX_BUSY; stop the queue
      * before it gets out of hand.  Naturally, this wastes entries. */
     if  (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
         netif_stop_subqueue(dev, qnum);
         if  (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
             /* More just got used, free them then recheck. */
             free_old_xmit_skbs(sq);
             if  (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
                 netif_start_subqueue(dev, qnum);
                 virtqueue_disable_cb(sq->vq);
             }
         }
     }
 
     if  (kick || netif_xmit_stopped(txq))
         virtqueue_kick(sq->vq);
 
     return  NETDEV_TX_OK;
}

As the code shows, start_xmit adds the skb to the virtqueue and then virtqueue_kick notifies the host: the guest's notify write is matched by KVM against an eventfd that QEMU registered as the virtqueue's kick fd, so the write becomes an eventfd signal that wakes vhost-net.
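On the host, that kick eventfd wakes the vhost worker through the same vhost_poll mechanism described for the RX side. The TX queue's handle_kick callback, assigned in vhost_net_open(), is handle_tx_kick() in drivers/vhost/net.c; it only resolves the enclosing structures and runs handle_tx():

static void handle_tx_kick(struct vhost_work *work)
{
    struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
                                              poll.work);
    struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

    handle_tx(net);
}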

static  void  handle_tx( struct  vhost_net *net)
{
     struct  vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
     struct  vhost_virtqueue *vq = &nvq->vq;
     unsigned out, in, s;
     int  head;
     struct  msghdr msg = {
         .msg_name = NULL,
         .msg_namelen = 0,
         .msg_control = NULL,
         .msg_controllen = 0,
         .msg_iov = vq->iov,
         .msg_flags = MSG_DONTWAIT,
     };
     size_t  len, total_len = 0;
     int  err;
     size_t  hdr_size;
     struct  socket *sock;
     struct  vhost_net_ubuf_ref *uninitialized_var(ubufs);
     bool  zcopy, zcopy_used;
 
     mutex_lock(&vq->mutex);
     sock = vq->private_data;
     if  (!sock)
         goto  out;
 
     vhost_disable_notify(&net->dev, vq);
 
     hdr_size = nvq->vhost_hlen;
     zcopy = nvq->ubufs;
 
     for  (;;) {
         /* Release DMAs done buffers first */
         if  (zcopy)
             vhost_zerocopy_signal_used(net, vq);
 
         /* If more outstanding DMAs, queue the work.
          * Handle upend_idx wrap around
          */
         if  (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND)
                   % UIO_MAXIOV == nvq->done_idx))
             break ;
 
         head = vhost_get_vq_desc(vq, vq->iov,
                      ARRAY_SIZE(vq->iov),
                      &out, &in,
                      NULL, NULL);
         /* On error, stop handling until the next kick. */
         if  (unlikely(head < 0))
             break ;
         /* Nothing new?  Wait for eventfd to tell us they refilled. */
         if  (head == vq->num) {
             if  (unlikely(vhost_enable_notify(&net->dev, vq))) {
                 vhost_disable_notify(&net->dev, vq);
                 continue ;
             }
             break ;
         }
         if  (in) {
             vq_err(vq,  "Unexpected descriptor format for TX: "
                    "out %d, int %d\n" , out, in);
             break ;
         }
         /* Skip header. TODO: support TSO. */
         s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out);
         msg.msg_iovlen = out;
         len = iov_length(vq->iov, out);
         /* Sanity check */
         if  (!len) {
             vq_err(vq,  "Unexpected header len for TX: "
                    "%zd expected %zd\n" ,
                    iov_length(nvq->hdr, s), hdr_size);
             break ;
         }
         zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN ||
                        nvq->upend_idx != nvq->done_idx);
 
         /* use msg_control to pass vhost zerocopy ubuf info to skb */
         if  (zcopy_used) {
             vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
             if  (!vhost_net_tx_select_zcopy(net) ||
                 len < VHOST_GOODCOPY_LEN) {
                 /* copy don't need to wait for DMA done */
                 vq->heads[nvq->upend_idx].len =
                             VHOST_DMA_DONE_LEN;
                 msg.msg_control = NULL;
                 msg.msg_controllen = 0;
                 ubufs = NULL;
              } else {
                 struct  ubuf_info *ubuf;
                 ubuf = nvq->ubuf_info + nvq->upend_idx;
 
                 vq->heads[nvq->upend_idx].len =
                     VHOST_DMA_IN_PROGRESS;
                 ubuf->callback = vhost_zerocopy_callback;
                 ubuf->ctx = nvq->ubufs;
                 ubuf->desc = nvq->upend_idx;
                 msg.msg_control = ubuf;
                 msg.msg_controllen =  sizeof (ubuf);
                 ubufs = nvq->ubufs;
                 atomic_inc(&ubufs->refcount);
             }
             nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
          } else
             msg.msg_control = NULL;
         /* TODO: Check specific error and bomb out unless ENOBUFS? */
         err = sock->ops->sendmsg(NULL, sock, &msg, len);
         if  (unlikely(err < 0)) {
             if  (zcopy_used) {
                 if  (ubufs)
                     vhost_net_ubuf_put(ubufs);
                 nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
                     % UIO_MAXIOV;
             }
             vhost_discard_vq_desc(vq, 1);
             break ;
         }
         if  (err != len)
             pr_debug( "Truncated TX packet: "
                  " len %d != %zd\n" , err, len);
         if  (!zcopy_used)
             vhost_add_used_and_signal(&net->dev, vq, head, 0);
         else
             vhost_zerocopy_signal_used(net, vq);
         total_len += len;
         vhost_net_tx_packet(net);
         if  (unlikely(total_len >= VHOST_NET_WEIGHT)) {
             vhost_poll_queue(&vq->poll);
             break ;
         }
     }
out:
     mutex_unlock(&vq->mutex);
}

handle_tx calls sock->ops->sendmsg, i.e. tun_sendmsg, which ultimately ends in netif_rx (netif_rx_ni in this kernel generation), the network stack's receive entry point. In other words, the host-side tap device has now "received" the packet, and the Linux bridge can forward it onwards.
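Looking at the tun side confirms this. The snippets below are abridged from drivers/net/tun.c of this era; the iovec copy and vnet header handling inside tun_get_user() are omitted, and exact details vary between kernel versions:

static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
                       struct msghdr *m, size_t total_len)
{
    struct tun_file *tfile = container_of(sock, struct tun_file, socket);
    struct tun_struct *tun = __tun_get(tfile);
    int ret;

    if (!tun)
        return -EBADFD;
    ret = tun_get_user(tun, tfile, m->msg_control, m->msg_iov, total_len,
                       m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
    tun_put(tun);
    return ret;
}

/* tail of tun_get_user(), abridged */
    /* ... vnet header parsing and payload copy from the iovec elided ... */
    netif_rx_ni(skb);              /* hand the skb to the host network stack */

    tun->dev->stats.rx_packets++;
    tun->dev->stats.rx_bytes += len;
    return total_len;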
