Linux的虛擬網卡TUN和TAP

TUN/TAP 提供了給用戶空間程序的包的接收和傳輸,它能夠當作是簡單的點對點設備或是
以太網設備。它不是從物理設備接收包,而是從用戶空間程序接收包。它發送包不是經過物
理設備來發送包,而是將這些包寫入用戶空間程序來發送。
要使用這個驅動,應用程序需要先打開 /dev/net/tun 設備(字符設備),然後發出一個控制
命令(ioctl)來註冊一個網絡設備。該網絡設備將被命名爲 tunXX 或 tapXX,具體取決於你所設定的標誌
位。當應用程序關閉文件描述符時,該網絡設備以及所有相關的路由都會消失。
依賴於所選擇的設備類型,用戶空間的應用程序須要讀寫IP 包(用tun 設備)或以太網包(用
tap 設備).至於具體用那種設備,依賴於傳遞給ioctl 函數的標誌參數.
Tun/tap 設備的源碼包地址是http://vtun.sourceforge.net/tun

該源碼包中包含兩個簡單的例子,用於演示如何使用 tun 設備和 tap 設備。這兩個程序的作用
相當於兩個網絡設備接口之間的網橋。
br_select.c ‐ bridge based on select system call.
br_sigio.c ‐ bridge based on async io and SIGIO signal.
當然,最好的例子是 VTun:http://vtun.sourceforge.net :))

/* Register tun_init/tun_cleanup as the module's load and unload hooks. */
module_init(tun_init);
module_exit(tun_cleanup);
/* Network device part of the driver */
/*
 * List head for the driver's devices (presumably linked through
 * tun_struct.list - the code that adds entries is not part of this excerpt).
 */
static LIST_HEAD(tun_dev_list);
/* Forward declaration only; the initializer is not part of this excerpt. */
static const struct ethtool_ops tun_ethtool_ops;

主要的數據結構
struct miscdevice
/*
 * Descriptor of a "misc" character device, as consumed by misc_register().
 */
struct miscdevice {
/* Minor number; MISC_DYNAMIC_MINOR asks misc_register() to pick a free one. */
int minor;
/* Device name, handed to device_create() as the "%s" argument. */
const char *name;
/* File operations of the character device (tun_fops for this driver). */
const struct file_operations *fops;
/* Link into the global misc_list of registered devices. */
struct list_head list;
/* Parent device passed through to device_create(). */
struct device *parent;
/* Result of device_create(), stored by misc_register(). */
struct device *this_device;
};
struct tun_struct
struct tun_struct {
struct list_head list;
unsigned long flags;// //區分tun 和tap 設備

int attached;
uid_t owner;
wait_queue_head_t read_wait;// //等待隊列

struct sk_buff_head readq; // //網絡緩衝區隊列

struct net_device *dev; // //linux 抽象網絡設備結構(結構是linux 內核提供的

統一網絡設備結構,定義了系通通一的訪問接口。)
struct net_device_stats stats; // //網卡狀態信息結構

struct fasync_struct *fasync;// //文件異步通知結構

unsigned long if_flags;
u8 dev_addr[ETH_ALEN];
u32 chr_filter[2];
u32 net_filter[2];
#ifdef TUN_DEBUG
int debug;
#endif
};
Struct ifreq
/*
* Interface request structure used for socket
* ioctl's. All interface ioctl's must have parameter
* definitions which begin with ifr_name. The
* remainder may be interface specific.
*
* NOTE(review): the accessor macros used by tun_chr_ioctl() (ifr_name,
* ifr_flags, ifr_hwaddr) are not part of this excerpt; presumably they map
* onto the union members below - confirm against the full header.
*/
struct ifreq
{
#define IFHWADDRLEN 6
/* Interface name the request refers to. */
union
{
char ifrn_name[IFNAMSIZ]; /* if name, e.g. "en0" */
} ifr_ifrn;
/* Request payload; the members overlay each other. */
union {
struct sockaddr ifru_addr;
struct sockaddr ifru_dstaddr;
struct sockaddr ifru_broadaddr;
struct sockaddr ifru_netmask;
struct sockaddr ifru_hwaddr;
short ifru_flags;
int ifru_ivalue;
int ifru_mtu;
struct ifmap ifru_map;
char ifru_slave[IFNAMSIZ]; /* Just fits the size */
char ifru_newname[IFNAMSIZ];
void __user * ifru_data;
struct if_settings ifru_settings;
} ifr_ifru;
};
模塊的初始化(tun_init)
/*
 * tun_init - module entry point.
 *
 * Prints the driver banner and registers the misc character device
 * described by tun_miscdev.
 *
 * Returns 0 on success, otherwise the error code from misc_register().
 */
static int __init tun_init(void)
{
	int ret;

	/* Log lines must end in a real newline escape ("\n"). */
	printk(KERN_INFO "tun: %s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
	printk(KERN_INFO "tun: %s\n", DRV_COPYRIGHT);

	ret = misc_register(&tun_miscdev);
	if (ret)
		printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);

	return ret;
}
/*
 * Misc-device descriptor registered by tun_init(): fixed minor TUN_MINOR,
 * name "tun", file operations tun_fops.
 */
static struct miscdevice tun_miscdev = {
.minor = TUN_MINOR,
.name = "tun",
.fops = &tun_fops,
};
/*
 * File operations of the tun character device.
 *
 * The plain read/write entries use the generic do_sync_read/do_sync_write
 * helpers (presumably these forward to the aio_* handlers below - confirm
 * against fs/read_write.c); seeking is disabled through no_llseek.
 */
static const struct file_operations tun_fops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.read = do_sync_read,
.aio_read = tun_chr_aio_read,
.write = do_sync_write,
.aio_write = tun_chr_aio_write,
.poll = tun_chr_poll,
.ioctl = tun_chr_ioctl,
.open = tun_chr_open,
.release = tun_chr_close,
.fasync = tun_chr_fasync
};
misc_register
//在內核中利用 misc_register() 函數將該驅動註冊爲非標準(misc)字符設備驅動,從而提供字符設備所具有的各類程序接口。
int misc_register(struct miscdevice * misc)
{
struct miscdevice *c;
dev_t dev;
int err = 0;
INIT_LIST_HEAD(&misc‐>list);
mutex_lock(&misc_mtx);
list_for_each_entry(c, &misc_list, list) {
if (c‐>minor == misc‐>minor) {
mutex_unlock(&misc_mtx);
return ‐EBUSY;
}
}
if (misc‐>minor == MISC_DYNAMIC_MINOR) {
int i = DYNAMIC_MINORS;
while (‐‐i >= 0)
if ( (misc_minors[i>>3] & (1 << (i&7))) == 0)
break;
if (i<0) {
mutex_unlock(&misc_mtx);
return ‐EBUSY;
}
misc‐>minor = i;
}
if (misc‐>minor < DYNAMIC_MINORS)
misc_minors[misc‐>minor >> 3] |= 1 << (misc‐>minor & 7);
dev = MKDEV(MISC_MAJOR, misc‐>minor);
misc‐>this_device = device_create(misc_class, misc‐>parent, dev,
"%s", misc‐>name);
if (IS_ERR(misc‐>this_device)) {
err = PTR_ERR(misc‐>this_device);
goto out;
}
/*
* Add it to the front, so that later devices can "override"
* earlier defaults
*/
list_add(&misc‐>list, &misc_list);
out:
mutex_unlock(&misc_mtx);
return err;
}
tun 設備的操作(系統調用)
tun_chr_open(打開設備時調用)
當打開一個 tun/tap 設備時,open 系統調用會調用 tun_chr_open() 函數,由它完成打開時的初始化。
原文提到此處還會完成網絡緩衝區鏈表的初始化和等待隊列的初始化;但在下面列出的這個版本中,
該函數只是把 file->private_data 置爲 NULL,上述初始化並不在這段代碼裏。
static int tun_chr_open(struct inode *inode, struct file * file)
{
DBG1(KERN_INFO "tunX: tun_chr_open/n");
file‐>private_data = NULL;//初始化設備文件的內容

return 0;
}
tun_chr_ioctl(設備的控制調用接口)
控制調用接口:
Cmd=
.. TUNSETIFF
.. _IOC_TYPE(cmd) == 0x89
.. TUNSETNOCSUM
.. TUNSETPERSIST
.. TUNSETOWNER
.. TUNSETLINK
.. TUNSETDEBUG
.. SIOCGIFFLAGS
.. SIOCSIFFLAGS
.. SIOCGIFHWADDR
.. SIOCSIFHWADDR
.. SIOCADDMULTI
.. SIOCDELMULTI
Tun/tap 驅動中,網卡的註冊被嵌入在字符設備驅動的 ioctl 例程中:應用程序對字符設備的文件描述符
發出自定義的 ioctl 命令 TUNSETIFF,即可完成網卡的註冊。
static int tun_chr_ioctl(struct inode *inode, struct file *file,unsigned int cmd, unsigned long arg)
{
struct tun_struct *tun = file‐>private_data;
void __user* argp = (void __user*)arg;
struct ifreq ifr;
if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
if (copy_from_user(&ifr, argp, sizeof ifr))//拷貝用戶區的網絡設備配置。在用戶區已

經分配了ifreq 結構的值和配置值,
return ‐EFAULT;
if (cmd == TUNSETIFF && !tun) {//字符設備文件的數據不是空的則

int err;
ifr.ifr_name[IFNAMSIZ‐ ] = '/0';
rtnl_lock();//在中定義

err = tun_set_iff(file, &ifr);
rtnl_unlock();
if (err)
return err;
if (copy_to_user(argp, &ifr, sizeof(ifr)))//把配置數據拷貝到用戶區

return ‐EFAULT;
return 0;
}
if (!tun)//tun 設備錯誤

return ‐EBADFD;
DBG(KERN_INFO "%s: tun_chr_ioctl cmd %d/n", tun‐>dev‐>name, cmd);
switch (cmd) {
case TUNSETNOCSUM:
/* Disable/Enable checksum */
if (arg)
tun‐>flags |= TUN_NOCHECKSUM;
else
tun‐>flags &= ~TUN_NOCHECKSUM;
DBG(KERN_INFO "%s: checksum %s/n",
tun‐>dev‐>name, arg ? "disabled" : "enabled");
break;
case TUNSETPERSIST:
/* Disable/Enable persist mode */
if (arg)
tun‐>flags |= TUN_PERSIST;
else
tun‐>flags &= ~TUN_PERSIST;
DBG(KERN_INFO "%s: persist %s/n",
tun‐>dev‐>name, arg ? "disabled" : "enabled");
break;
case TUNSETOWNER:
/* Set owner of the device */
tun‐>owner = (uid_t) arg;
DBG(KERN_INFO "%s: owner set to %d/n", tun‐>dev‐>name, tun‐>owner);
break;
case TUNSETLINK:
/* Only allow setting the type when the interface is down */
if (tun‐>dev‐>flags & IFF_UP) {
DBG(KERN_INFO "%s: Linktype set failed because interface is up/n",
tun‐>dev‐>name);
return ‐EBUSY;
} else {
tun‐>dev‐>type = (int) arg;
DBG(KERN_INFO "%s: linktype set to %d/n", tun‐>dev‐>name, tun‐>dev‐>type);
}
break;
#ifdef TUN_DEBUG
case TUNSETDEBUG:
tun‐>debug = arg;
break;
#endif
case SIOCGIFFLAGS:
ifr.ifr_flags = tun‐>if_flags;
if (copy_to_user( argp, &ifr, sizeof ifr))
return ‐EFAULT;
return 0;
case SIOCSIFFLAGS:
/** Set the character device's interface flags. Currently only
* IFF_PROMISC and IFF_ALLMULTI are used. */
tun‐>if_flags = ifr.ifr_flags;
DBG(KERN_INFO "%s: interface flags 0x%lx/n",
tun‐>dev‐>name, tun‐>if_flags);
return 0;
case SIOCGIFHWADDR:
/* Note: the actual net device's address may be different */
memcpy(ifr.ifr_hwaddr.sa_data, tun‐>dev_addr,
min(sizeof ifr.ifr_hwaddr.sa_data, sizeof tun‐>dev_addr));
if (copy_to_user( argp, &ifr, sizeof ifr))
return ‐EFAULT;
return 0;
case SIOCSIFHWADDR:
{
/* try to set the actual net device's hw address */
int ret = dev_set_mac_address(tun‐>dev, &ifr.ifr_hwaddr);
if (ret == 0) {
/** Set the character device's hardware address. This is used when
* filtering packets being sent from the network device to the character
* device. */
memcpy(tun‐>dev_addr, ifr.ifr_hwaddr.sa_data,
min(sizeof ifr.ifr_hwaddr.sa_data, sizeof tun‐>dev_addr));
DBG(KERN_DEBUG "%s: set hardware address: %x:%x:%x:%x:%x:%x/n",
tun‐>dev‐>name,
tun‐>dev_addr[0], tun‐>dev_addr[1], tun‐>dev_addr[2],
tun‐>dev_addr[3], tun‐>dev_addr[4], tun‐>dev_addr[5]);
}
return ret;
}
case SIOCADDMULTI:
/** Add the specified group to the character device's multicast filter
* list. */
add_multi(tun‐>chr_filter, ifr.ifr_hwaddr.sa_data);
DBG(KERN_DEBUG "%s: add multi: %x:%x:%x:%x:%x:%x/n",
tun‐>dev‐>name,
(u8)ifr.ifr_hwaddr.sa_data[0], (u8)ifr.ifr_hwaddr.sa_data[1],
(u8)ifr.ifr_hwaddr.sa_data[2], (u8)ifr.ifr_hwaddr.sa_data[3],
(u8)ifr.ifr_hwaddr.sa_data[4], (u8)ifr.ifr_hwaddr.sa_data[5]);
return 0;
case SIOCDELMULTI:
/** Remove the specified group from the character device's multicast
* filter list. */
del_multi(tun‐>chr_filter, ifr.ifr_hwaddr.sa_data);
DBG(KERN_DEBUG "%s: del multi: %x:%x:%x:%x:%x:%x/n",
tun‐>dev‐>name,
(u8)ifr.ifr_hwaddr.sa_data[0], (u8)ifr.ifr_hwaddr.sa_data[1],
(u8)ifr.ifr_hwaddr.sa_data[2], (u8)ifr.ifr_hwaddr.sa_data[3],
(u8)ifr.ifr_hwaddr.sa_data[4], (u8)ifr.ifr_hwaddr.sa_data[5]);
return 0;
default:
return ‐EINVAL;
};
return 0;
}
tun_chr_aio_read(異步讀)(從tun 設備中讀取數據)
/*
 * tun_chr_aio_read - read one packet from the tun device into user space.
 * @iocb:  I/O control block; iocb->ki_filp is the open device file
 * @iv:    user I/O vector that receives the packet
 * @count: number of segments in @iv
 * @pos:   file position (unused)
 *
 * Dequeues buffers from tun->readq until one passes the address filter, then
 * copies it to user space with tun_put_user().  When the queue is empty the
 * caller sleeps on tun->read_wait unless the file is O_NONBLOCK.
 *
 * Returns the number of bytes delivered (0 if the vector has no room),
 * -EBADFD if no interface is bound to the file, -EINVAL for a bad vector,
 * -EAGAIN in non-blocking mode with nothing queued, -ERESTARTSYS when
 * interrupted by a signal, or the error from tun_put_user().
 */
static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
				unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct tun_struct *tun = file->private_data;
	DECLARE_WAITQUEUE(wait, current);
	struct sk_buff *skb;
	ssize_t len, ret = 0;

	if (!tun)
		return -EBADFD;

	DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name);

	len = iov_total(iv, count);
	if (len < 0)
		return -EINVAL;

	add_wait_queue(&tun->read_wait, &wait);
	while (len) {
		const u8 ones[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
		/*
		 * Zero-filled so that a packet shorter than ETH_ALEN is
		 * filtered on defined bytes instead of stack garbage.
		 */
		u8 addr[ETH_ALEN] = { 0 };
		int bit_nr;

		/* Mark ourselves sleeping before checking the queue. */
		current->state = TASK_INTERRUPTIBLE;

		/* Read frames from the queue */
		if (!(skb = skb_dequeue(&tun->readq))) {
			if (file->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}

			/* Nothing to read, let's sleep */
			schedule();
			continue;
		}
		/* A slot was freed in readq: let the net device transmit again. */
		netif_wake_queue(tun->dev);

		/** Decide whether to accept this packet. This code is designed to
		 * behave identically to an Ethernet interface. Accept the packet if
		 * - we are promiscuous.
		 * - the packet is addressed to us.
		 * - the packet is broadcast.
		 * - the packet is multicast and
		 *   - we are multicast promiscous.
		 *   - we belong to the multicast group.
		 */
		skb_copy_from_linear_data(skb, addr, min_t(size_t, sizeof addr,
							   skb->len));
		/* Top 6 CRC bits select one of the 64 bits in chr_filter. */
		bit_nr = ether_crc(sizeof addr, addr) >> 26;
		if ((tun->if_flags & IFF_PROMISC) ||
		    memcmp(addr, tun->dev_addr, sizeof addr) == 0 ||
		    memcmp(addr, ones, sizeof addr) == 0 ||
		    (((addr[0] == 1 && addr[1] == 0 && addr[2] == 0x5e) ||
		      (addr[0] == 0x33 && addr[1] == 0x33)) &&
		     ((tun->if_flags & IFF_ALLMULTI) ||
		      (tun->chr_filter[bit_nr >> 5] & (1 << (bit_nr & 31)))))) {
			DBG(KERN_DEBUG "%s: tun_chr_readv: accepted: %x:%x:%x:%x:%x:%x\n",
			    tun->dev->name, addr[0], addr[1], addr[2],
			    addr[3], addr[4], addr[5]);
			ret = tun_put_user(tun, skb, (struct iovec *) iv, len);
			kfree_skb(skb);
			break;
		} else {
			DBG(KERN_DEBUG "%s: tun_chr_readv: rejected: %x:%x:%x:%x:%x:%x\n",
			    tun->dev->name, addr[0], addr[1], addr[2],
			    addr[3], addr[4], addr[5]);
			/* Filtered out: drop it and try the next queued packet. */
			kfree_skb(skb);
			continue;
		}
	}

	current->state = TASK_RUNNING;
	remove_wait_queue(&tun->read_wait, &wait);

	return ret;
}
skb_dequeue(src/net/core/skbuff.c)
/**
* skb_dequeue ‐ remove from the head of the queue
* @list : list to dequeue from
*
* Remove the head of the list. The list lock is taken so the function
* may be used safely with other locking list functions. The head item is
* returned or %NULL if the list is empty.
*/
struct sk_buff *skb_dequeue(struct sk_buff_head *list)
{
unsigned long flags;
struct sk_buff *result;
spin_lock_irqsave(&list‐>lock, flags);
result = __skb_dequeue(list);
spin_unlock_irqrestore(&list‐>lock, flags);
return result;
}
__skb_dequeue
/**
* __skb_dequeue ‐ remove from the head of the queue
* @list : list to dequeue from
*
* Remove the head of the list. This function does not take any locks
* so must be used with appropriate locks held only. The head item is
* returned or %NULL if the list is empty.
*/
extern struct sk_buff *skb_dequeue(struct sk_buff_head *list);
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
struct sk_buff *next, *prev, *result;
prev = (struct sk_buff *) list;
next = prev‐>next;
result = NULL;
if (next != prev) {
result = next;
next = next‐>next;
list‐>qlen‐‐;
next‐>prev = prev;
prev‐>next = next;
result‐>next = result‐>prev = NULL;
}
return result;
}
tun_put_user
/* Put packet to the user space buffer */
static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
struct sk_buff *skb,
struct iovec *iv, int len)
{
struct tun_pi pi = { 0, skb‐>protocol };
ssize_t total = 0;
if (!(tun‐>flags & TUN_NO_PI)) {
if ((len ‐= sizeof(pi)) < 0)
return ‐EINVAL;
if (len < skb‐>len) {
/* Packet will be striped */
pi.flags |= TUN_PKT_STRIP;
}
if (memcpy_toiovec(iv, (void *) &pi, sizeof(pi)))
return ‐EFAULT;
total += sizeof(pi);
}
len = min_t(int, skb‐>len, len);
skb_copy_datagram_iovec(skb, 0, iv, len);
total += len;
tun‐>stats.tx_packets++;
tun‐>stats.tx_bytes += len;
return total;
}
tun_chr_aio_write(把數據寫入到tun 設備中)
/*
 * tun_chr_aio_write - write one packet from user space into the tun device.
 * @iocb:  I/O control block; iocb->ki_filp is the open device file
 * @iv:    user I/O vector holding the packet
 * @count: number of segments in @iv
 * @pos:   file position (unused)
 *
 * Returns the result of tun_get_user(), or -EBADFD if no interface is bound
 * to the file.
 */
static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
				 unsigned long count, loff_t pos)
{
	struct tun_struct *tun = iocb->ki_filp->private_data;

	if (!tun)
		return -EBADFD;

	DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);

	return tun_get_user(tun, (struct iovec *) iv, iov_total(iv, count));
}
tun_get_user
/*
 * tun_get_user - get a packet from the user space buffer and inject it into
 * the network stack as if it had been received on the interface.
 * @tun:   device the packet is written to
 * @iv:    source I/O vector
 * @count: total number of bytes in @iv
 *
 * Unless TUN_NO_PI is set, the data starts with a struct tun_pi header that
 * supplies the protocol for tun (IP-level) devices.  For tap devices the
 * payload is an Ethernet frame and the protocol comes from eth_type_trans().
 *
 * Returns @count on success, -EINVAL if the data is shorter than the header,
 * -EFAULT if user memory cannot be copied, or -ENOMEM if no buffer could be
 * allocated.
 */
static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
{
	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
	struct sk_buff *skb;
	size_t len = count, align = 0;

	if (!(tun->flags & TUN_NO_PI)) {
		/* len is unsigned: a wrap past zero shows up as len > count. */
		if ((len -= sizeof(pi)) > count)
			return -EINVAL;

		if (memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
			return -EFAULT;
	}

	/* Reserve NET_IP_ALIGN bytes of headroom in front of tap (Ethernet) frames. */
	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV)
		align = NET_IP_ALIGN;

	if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
		tun->stats.rx_dropped++;
		return -ENOMEM;
	}

	if (align)
		skb_reserve(skb, align);
	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
		tun->stats.rx_dropped++;
		kfree_skb(skb);
		return -EFAULT;
	}

	switch (tun->flags & TUN_TYPE_MASK) {
	case TUN_TUN_DEV:
		/* No link-layer header: protocol comes from the tun_pi header. */
		skb_reset_mac_header(skb);
		skb->protocol = pi.proto;
		skb->dev = tun->dev;
		break;
	case TUN_TAP_DEV:
		skb->protocol = eth_type_trans(skb, tun->dev);
		break;
	}

	if (tun->flags & TUN_NOCHECKSUM)
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	/* Hand the packet to the stack's receive path (process context). */
	netif_rx_ni(skb);
	tun->dev->last_rx = jiffies;

	tun->stats.rx_packets++;
	tun->stats.rx_bytes += len;

	return count;
}

相關文章
相關標籤/搜索