The host runs a QEMU/KVM virtual machine whose NIC is of type virtio-net; on the host, the virtio-net backend is vhost-net.

First, look at how the vhost-net module registers itself. It uses the misc character-device registration mechanism provided by the Linux kernel, an API that anyone who has done Linux kernel development will recognize:
static struct miscdevice vhost_net_misc = {
    .minor = VHOST_NET_MINOR,
    .name = "vhost-net",
    .fops = &vhost_net_fops,
};

static int vhost_net_init(void)
{
    if (experimental_zcopytx)
        vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
    return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);

static void vhost_net_exit(void)
{
    misc_deregister(&vhost_net_misc);
}
module_exit(vhost_net_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
MODULE_ALIAS("devname:vhost-net");
Here vhost_net_fops defines the userspace interface supported by the character device, which appears as /dev/vhost-net:
static const struct file_operations vhost_net_fops = {
    .owner          = THIS_MODULE,
    .release        = vhost_net_release,
    .unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = vhost_net_compat_ioctl,
#endif
    .open           = vhost_net_open,
    .llseek         = noop_llseek,
};
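For orientation, this is roughly how a userspace consumer such as QEMU starts talking to the device. The snippet below is only a minimal sketch: it opens the device and claims ownership with VHOST_SET_OWNER; a real user goes on to configure memory regions, vrings and the backend with further VHOST_* ioctls.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

int main(void)
{
    /* Open the misc device registered by vhost_net_init(). */
    int vhost_fd = open("/dev/vhost-net", O_RDWR);

    if (vhost_fd < 0) {
        perror("open /dev/vhost-net");
        return 1;
    }

    /* Tie this vhost device to the calling process; this is the first
     * ioctl issued on the fd, before rings and backend are configured. */
    if (ioctl(vhost_fd, VHOST_SET_OWNER, NULL) < 0) {
        perror("VHOST_SET_OWNER");
        return 1;
    }

    /* Further setup (VHOST_SET_MEM_TABLE, VHOST_SET_VRING_*,
     * VHOST_NET_SET_BACKEND, ...) would follow here. */
    return 0;
}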
When userspace issues the open system call on this device, vhost_net_open() runs; it mainly initializes the per-open vhost-net instance:
static int vhost_net_open(struct inode *inode, struct file *f)
{
    struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
    struct vhost_dev *dev;
    struct vhost_virtqueue **vqs;
    int r, i;

    if (!n)
        return -ENOMEM;
    vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL);
    if (!vqs) {
        kfree(n);
        return -ENOMEM;
    }

    dev = &n->dev;
    vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
    vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
    n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
    n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
    for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
        n->vqs[i].ubufs = NULL;
        n->vqs[i].ubuf_info = NULL;
        n->vqs[i].upend_idx = 0;
        n->vqs[i].done_idx = 0;
        n->vqs[i].vhost_hlen = 0;
        n->vqs[i].sock_hlen = 0;
    }
    r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
    if (r < 0) {
        kfree(n);
        kfree(vqs);
        return r;
    }

    vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
    vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);

    f->private_data = n;

    return 0;
}
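The handle_tx_kick and handle_rx_kick handlers wired up in vhost_net_open() above are not listed in this post; they are thin dispatchers that the vhost worker thread runs when the guest kicks the corresponding queue, roughly as follows (abridged from drivers/vhost/net.c of the same era):

static void handle_tx_kick(struct vhost_work *work)
{
    struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
                                              poll.work);
    struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

    handle_tx(net);
}

static void handle_rx_kick(struct vhost_work *work)
{
    struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
                                              poll.work);
    struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

    handle_rx(net);
}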
From the code above, you can see how the vhost-net module's core data structures fit together.
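A compact way to see those relationships is how the structures nest in the source. The sketch below abridges the field lists; the full definitions are in drivers/vhost/net.c and drivers/vhost/vhost.h of the same kernel era:

/* One vhost_net instance is allocated per open of /dev/vhost-net. */
struct vhost_net {
    struct vhost_dev dev;                              /* generic vhost device state */
    struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];  /* RX and TX queues */
    struct vhost_poll poll[VHOST_NET_VQ_MAX];          /* poll the backend socket */
    /* ... TX statistics fields omitted ... */
};

/* Net-specific wrapper around the generic vhost virtqueue. */
struct vhost_net_virtqueue {
    struct vhost_virtqueue vq;
    struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)]; /* header scratch iovecs */
    size_t vhost_hlen;               /* virtio-net header length seen by the guest */
    size_t sock_hlen;                /* header length expected by the backend socket */
    int upend_idx;                   /* zero-copy TX bookkeeping */
    int done_idx;
    struct ubuf_info *ubuf_info;
    struct vhost_net_ubuf_ref *ubufs;
};

/* The generic vhost_virtqueue holds, among other things, the kick/call
 * eventfds, the handle_kick callback set in vhost_net_open(), and
 * private_data, which vhost_net_set_backend() points at the tap socket. */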
To pick up packets from the tap device, vhost-net is bound to that device's tun socket. The binding happens in vhost_net_set_backend(), shown here heavily abridged:
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
    /* ... (declarations and validation elided) ... */
    sock = get_socket(fd);
    if (IS_ERR(sock)) {
        r = PTR_ERR(sock);
        goto err_vq;
    }
    /* ... */
    vq->private_data = sock;
    /* ... */
}
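The fd that vhost_net_set_backend() receives is a tap fd that userspace hands over with the VHOST_NET_SET_BACKEND ioctl. A minimal sketch of that handoff follows; error paths are trimmed and the interface name and helper function are illustrative only:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <linux/vhost.h>

/* Attach an existing tap interface to one vhost-net virtqueue.
 * 'index' is the virtqueue index (0 = RX, 1 = TX in vhost-net). */
static int attach_tap_to_vhost(int vhost_fd, const char *ifname,
                               unsigned int index)
{
    struct ifreq ifr;
    struct vhost_vring_file backend;
    int tap_fd = open("/dev/net/tun", O_RDWR);

    if (tap_fd < 0)
        return -1;

    memset(&ifr, 0, sizeof(ifr));
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
    strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
    if (ioctl(tap_fd, TUNSETIFF, &ifr) < 0)
        return -1;

    /* This ioctl lands in vhost_net_set_backend(); get_socket(fd) resolves
     * the tap fd to its tun socket and stores it in vq->private_data. */
    backend.index = index;
    backend.fd = tap_fd;
    return ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend);
}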
The tun socket's send and receive functions are:
static const struct proto_ops tun_socket_ops = {
    .sendmsg = tun_sendmsg,
    .recvmsg = tun_recvmsg,
    .release = tun_release,
};
When the tap device receives a packet, vhost-net calls handle_rx():
static void handle_rx(struct vhost_net *net)
{
    struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
    struct vhost_virtqueue *vq = &nvq->vq;
    unsigned uninitialized_var(in), log;
    struct vhost_log *vq_log;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL, /* FIXME: get and handle RX aux data. */
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };
    struct virtio_net_hdr_mrg_rxbuf hdr = {
        .hdr.flags = 0,
        .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
    };
    size_t total_len = 0;
    int err, mergeable;
    s16 headcount;
    size_t vhost_hlen, sock_hlen;
    size_t vhost_len, sock_len;
    struct socket *sock;

    mutex_lock(&vq->mutex);
    sock = vq->private_data;
    if (!sock)
        goto out;
    vhost_disable_notify(&net->dev, vq);

    vhost_hlen = nvq->vhost_hlen;
    sock_hlen = nvq->sock_hlen;

    vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
        vq->log : NULL;
    mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);

    while ((sock_len = peek_head_len(sock->sk))) {
        sock_len += sock_hlen;
        vhost_len = sock_len + vhost_hlen;
        headcount = get_rx_bufs(vq, vq->heads, vhost_len,
                                &in, vq_log, &log,
                                likely(mergeable) ? UIO_MAXIOV : 1);
        /* On error, stop handling until the next kick. */
        if (unlikely(headcount < 0))
            break;
        /* On overrun, truncate and discard */
        if (unlikely(headcount > UIO_MAXIOV)) {
            msg.msg_iovlen = 1;
            err = sock->ops->recvmsg(NULL, sock, &msg,
                                     1, MSG_DONTWAIT | MSG_TRUNC);
            pr_debug("Discarded rx packet: len %zd\n", sock_len);
            continue;
        }
        /* OK, now we need to know about added descriptors. */
        if (!headcount) {
            if (unlikely(vhost_enable_notify(&net->dev, vq))) {
                /* They have slipped one in as we were
                 * doing that: check again. */
                vhost_disable_notify(&net->dev, vq);
                continue;
            }
            /* Nothing new?  Wait for eventfd to tell us
             * they refilled. */
            break;
        }
        /* We don't need to be notified again. */
        if (unlikely((vhost_hlen)))
            /* Skip header. TODO: support TSO. */
            move_iovec_hdr(vq->iov, nvq->hdr, vhost_hlen, in);
        else
            /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
             * needed because recvmsg can modify msg_iov. */
            copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in);
        msg.msg_iovlen = in;
        err = sock->ops->recvmsg(NULL, sock, &msg,
                                 sock_len, MSG_DONTWAIT | MSG_TRUNC);
        /* Userspace might have consumed the packet meanwhile:
         * it's not supposed to do this usually, but might be hard
         * to prevent. Discard data we got (if any) and keep going. */
        if (unlikely(err != sock_len)) {
            pr_debug("Discarded rx packet: "
                     " len %d, expected %zd\n", err, sock_len);
            vhost_discard_vq_desc(vq, headcount);
            continue;
        }
        if (unlikely(vhost_hlen) &&
            memcpy_toiovecend(nvq->hdr, (unsigned char *)&hdr, 0,
                              vhost_hlen)) {
            vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
                   vq->iov->iov_base);
            break;
        }
        /* TODO: Should check and handle checksum. */
        hdr.num_buffers = cpu_to_vhost16(vq, headcount);
        if (likely(mergeable) &&
            memcpy_toiovecend(nvq->hdr, (void *)&hdr.num_buffers,
                              offsetof(typeof(hdr), num_buffers),
                              sizeof hdr.num_buffers)) {
            vq_err(vq, "Failed num_buffers write");
            vhost_discard_vq_desc(vq, headcount);
            break;
        }
        vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
                                    headcount);
        if (unlikely(vq_log))
            vhost_log_write(vq, vq_log, log, vhost_len);
        total_len += vhost_len;
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll);
            break;
        }
    }
out:
    mutex_unlock(&vq->mutex);
}
From the code above, sock->ops->recvmsg resolves to tun_recvmsg() from tun_socket_ops: the skb that the tap device received is copied into the guest buffers described by the virtqueue, and the guest's virtio-net driver is then woken up, through QEMU/KVM, in the form of an interrupt. Note that vhost-net and the guest's virtio-net driver operate on the very same shared RX/TX virtqueues. On the guest side, the interrupt schedules NAPI, and virtnet_poll() (listed after the signalling sketch below) drains the receive virtqueue.
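The "interrupt" is not injected by vhost-net directly. vhost_add_used_and_signal_n() ends up in vhost_signal(), which fires the "call" eventfd that userspace registered with VHOST_SET_VRING_CALL, and KVM's irqfd machinery turns that into a virtio interrupt inside the guest. Roughly, from drivers/vhost/vhost.c of the same era:

/* This actually signals the guest, using eventfd. */
void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
    /* Signal the guest if needed. */
    if (vq->call_ctx && vhost_notify(dev, vq))
        eventfd_signal(vq->call_ctx, 1);
}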
static int virtnet_poll(struct napi_struct *napi, int budget)
{
    struct receive_queue *rq =
        container_of(napi, struct receive_queue, napi);
    struct virtnet_info *vi = rq->vq->vdev->priv;
    void *buf;
    unsigned int r, len, received = 0;

again:
    while (received < budget &&
           (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
        receive_buf(vi, rq, buf, len);
        --rq->num;
        received++;
    }

    if (rq->num < rq->max / 2) {
        if (!try_fill_recv(vi, rq, GFP_ATOMIC))
            schedule_delayed_work(&vi->refill, 0);
    }

    /* Out of packets? */
    if (received < budget) {
        r = virtqueue_enable_cb_prepare(rq->vq);
        napi_complete(napi);
        if (unlikely(virtqueue_poll(rq->vq, r)) &&
            napi_schedule_prep(napi)) {
            virtqueue_disable_cb(rq->vq);
            __napi_schedule(napi);
            goto again;
        }
    }

    return received;
}
receive_buf() ends by handing the skb to the standard kernel receive path via netif_receive_skb(); at this point the packet has travelled from the tap device through vhost-net and finally into the virtual machine.
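The tail of receive_buf() looks roughly like this (abridged; the checksum, GSO and statistics handling of drivers/net/virtio_net.c is omitted):

    /* Abridged tail of receive_buf(): the buffer pulled off the virtqueue
     * has been turned into an skb, which now enters the regular kernel
     * network stack of the guest. */
    skb->protocol = eth_type_trans(skb, dev);
    netif_receive_skb(skb);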
In the transmit direction, a packet sent by the guest first goes through the guest's Linux network stack; the stack ultimately calls the NIC's xmit function, which for a virtio-net NIC is start_xmit():
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
    struct virtnet_info *vi = netdev_priv(dev);
    int qnum = skb_get_queue_mapping(skb);
    struct send_queue *sq = &vi->sq[qnum];
    int err;
    struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
    bool kick = !skb->xmit_more;

    /* Free up any pending old buffers before queueing new ones. */
    free_old_xmit_skbs(sq);

    /* Try to transmit */
    err = xmit_skb(sq, skb);

    /* This should not happen! */
    if (unlikely(err)) {
        dev->stats.tx_fifo_errors++;
        if (net_ratelimit())
            dev_warn(&dev->dev,
                     "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
        dev->stats.tx_dropped++;
        kfree_skb(skb);
        return NETDEV_TX_OK;
    }

    /* Don't wait up for transmitted skbs to be freed. */
    skb_orphan(skb);
    nf_reset(skb);

    /* Apparently nice girls don't return TX_BUSY; stop the queue
     * before it gets out of hand.  Naturally, this wastes entries. */
    if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
        netif_stop_subqueue(dev, qnum);
        if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
            /* More just got used, free them then recheck. */
            free_old_xmit_skbs(sq);
            if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
                netif_start_subqueue(dev, qnum);
                virtqueue_disable_cb(sq->vq);
            }
        }
    }

    if (kick || netif_xmit_stopped(txq))
        virtqueue_kick(sq->vq);

    return NETDEV_TX_OK;
}
As the code shows, start_xmit() places the skb's buffers onto the TX virtqueue and then calls virtqueue_kick() to notify the host side. KVM forwards that notification to vhost-net, whose worker thread ends up calling handle_tx(), listed after the short notify sketch below.
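For the legacy virtio-pci transport, the kick is simply a write of the queue index to the device's notify register; with ioeventfd, KVM completes that write by signalling the eventfd that the vhost worker polls, so the packet data itself never passes through QEMU userspace. A rough sketch of the guest-side notify, from the virtio-pci driver of roughly the same kernel era (the exact file layout varies by version):

/* The guest driver tells the host which virtqueue has new buffers by
 * writing the queue index to the VIRTIO_PCI_QUEUE_NOTIFY register. */
static bool vp_notify(struct virtqueue *vq)
{
    struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);

    /* This I/O write is what traps into KVM; ioeventfd turns it into
     * an eventfd signal for the vhost-net worker. */
    iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
    return true;
}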
static void handle_tx(struct vhost_net *net)
{
    struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
    struct vhost_virtqueue *vq = &nvq->vq;
    unsigned out, in, s;
    int head;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL,
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };
    size_t len, total_len = 0;
    int err;
    size_t hdr_size;
    struct socket *sock;
    struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
    bool zcopy, zcopy_used;

    mutex_lock(&vq->mutex);
    sock = vq->private_data;
    if (!sock)
        goto out;

    vhost_disable_notify(&net->dev, vq);

    hdr_size = nvq->vhost_hlen;
    zcopy = nvq->ubufs;

    for (;;) {
        /* Release DMAs done buffers first */
        if (zcopy)
            vhost_zerocopy_signal_used(net, vq);

        /* If more outstanding DMAs, queue the work.
         * Handle upend_idx wrap around
         */
        if (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND)
                      % UIO_MAXIOV == nvq->done_idx))
            break;

        head = vhost_get_vq_desc(vq, vq->iov,
                                 ARRAY_SIZE(vq->iov),
                                 &out, &in,
                                 NULL, NULL);
        /* On error, stop handling until the next kick. */
        if (unlikely(head < 0))
            break;
        /* Nothing new?  Wait for eventfd to tell us they refilled. */
        if (head == vq->num) {
            if (unlikely(vhost_enable_notify(&net->dev, vq))) {
                vhost_disable_notify(&net->dev, vq);
                continue;
            }
            break;
        }
        if (in) {
            vq_err(vq, "Unexpected descriptor format for TX: "
                   "out %d, int %d\n", out, in);
            break;
        }
        /* Skip header. TODO: support TSO. */
        s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out);
        msg.msg_iovlen = out;
        len = iov_length(vq->iov, out);
        /* Sanity check */
        if (!len) {
            vq_err(vq, "Unexpected header len for TX: "
                   "%zd expected %zd\n",
                   iov_length(nvq->hdr, s), hdr_size);
            break;
        }

        zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN ||
                               nvq->upend_idx != nvq->done_idx);

        /* use msg_control to pass vhost zerocopy ubuf info to skb */
        if (zcopy_used) {
            vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
            if (!vhost_net_tx_select_zcopy(net) ||
                len < VHOST_GOODCOPY_LEN) {
                /* copy don't need to wait for DMA done */
                vq->heads[nvq->upend_idx].len =
                            VHOST_DMA_DONE_LEN;
                msg.msg_control = NULL;
                msg.msg_controllen = 0;
                ubufs = NULL;
            } else {
                struct ubuf_info *ubuf;
                ubuf = nvq->ubuf_info + nvq->upend_idx;

                vq->heads[nvq->upend_idx].len =
                    VHOST_DMA_IN_PROGRESS;
                ubuf->callback = vhost_zerocopy_callback;
                ubuf->ctx = nvq->ubufs;
                ubuf->desc = nvq->upend_idx;
                msg.msg_control = ubuf;
                msg.msg_controllen = sizeof(ubuf);
                ubufs = nvq->ubufs;
                atomic_inc(&ubufs->refcount);
            }
            nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
        } else
            msg.msg_control = NULL;
        /* TODO: Check specific error and bomb out unless ENOBUFS? */
        err = sock->ops->sendmsg(NULL, sock, &msg, len);
        if (unlikely(err < 0)) {
            if (zcopy_used) {
                if (ubufs)
                    vhost_net_ubuf_put(ubufs);
                nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
                    % UIO_MAXIOV;
            }
            vhost_discard_vq_desc(vq, 1);
            break;
        }
        if (err != len)
            pr_debug("Truncated TX packet: "
                     " len %d != %zd\n", err, len);
        if (!zcopy_used)
            vhost_add_used_and_signal(&net->dev, vq, head, 0);
        else
            vhost_zerocopy_signal_used(net, vq);
        total_len += len;
        vhost_net_tx_packet(net);
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll);
            break;
        }
    }
out:
    mutex_unlock(&vq->mutex);
}
Inside handle_tx(), sock->ops->sendmsg is tun_sendmsg(), whose call chain ultimately reaches netif_rx() (through its process-context wrapper netif_rx_ni()), the kernel's generic NIC receive entry point. At that moment the tap interface on the host has effectively received the packet, and the Linux bridge can then forward it out of the host.
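For completeness, a rough sketch of the end of that path: tun_sendmsg() calls tun_get_user(), which builds an skb from the iovecs that handle_tx() passed in and injects it into the host stack (abridged from drivers/net/tun.c of roughly the same era):

    /* Abridged tail of tun_get_user(): the skb assembled from the guest's
     * TX buffers enters the host network stack as if it had arrived on the
     * tap interface.  netif_rx_ni() is the process-context wrapper around
     * netif_rx(). */
    netif_rx_ni(skb);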