內核版本:2.6.37
參考[作者:kendo的文章(基於內核版本2.6.12)]
第一部份 Socket套接字的建立
socket 並非 TCP/IP協議的一部份。
從廣義上來說,socket 是Unix/Linux 抽象的進程間通信的一種方法。網絡 socket 通信僅僅是其若干協議中的一類。而tcp/ip 又是網絡這類中的一種。
從tcp/ip 的角度看 socket ,它更多地體現了用戶 API 與協議棧之間的一個中間接口層。用戶通過調用socket API 將報文遞交給協議棧,或者從協議棧中接收報文。
1、系統總入口
Linux 內核爲所有與socket 有關的操作的API,提供了一個統一的系統調用入口,其代碼在net/socket.c 中:
/*
 *	System call vectors.
 *
 *	Argument checking cleaned up. Saved 20% in size.
 *	This function doesn't need to set the kernel lock because
 *	it is set by the callees.
 *
 *	socketcall() multiplexes the socket system calls: `call` selects the
 *	operation (one of the SYS_* opcodes) and `args` points to a user-space
 *	array of unsigned long holding that operation's arguments.
 *
 *	Returns the result of the selected sys_*() handler, -EINVAL when
 *	`call` is outside 1..SYS_RECVMMSG or the table entry is larger than
 *	the local buffer, and -EFAULT when the arguments cannot be copied in.
 */
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
    unsigned long a[6];     /* kernel copy of the user arguments */
    unsigned long a0, a1;
    int err;
    unsigned int len;

    if (call < 1 || call > SYS_RECVMMSG)
        return -EINVAL;

    /* nargs[call] is the number of BYTES of arguments for this opcode. */
    len = nargs[call];
    if (len > sizeof(a))
        return -EINVAL;

    /* copy_from_user should be SMP safe. */
    if (copy_from_user(a, args, len))
        return -EFAULT;

    /* The audit hook takes an argument COUNT, hence the division. */
    audit_socketcall(nargs[call] / sizeof(unsigned long), a);

    a0 = a[0];
    a1 = a[1];

    /* Dispatch: cast each raw word to the type the handler expects. */
    switch (call) {
    case SYS_SOCKET:
        err = sys_socket(a0, a1, a[2]);
        break;
    case SYS_BIND:
        err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_CONNECT:
        err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_LISTEN:
        err = sys_listen(a0, a1);
        break;
    case SYS_ACCEPT:
        /* Plain accept is accept4 with flags == 0. */
        err = sys_accept4(a0, (struct sockaddr __user *)a1,
                          (int __user *)a[2], 0);
        break;
    case SYS_GETSOCKNAME:
        err = sys_getsockname(a0, (struct sockaddr __user *)a1,
                              (int __user *)a[2]);
        break;
    case SYS_GETPEERNAME:
        err = sys_getpeername(a0, (struct sockaddr __user *)a1,
                              (int __user *)a[2]);
        break;
    case SYS_SOCKETPAIR:
        err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
        break;
    case SYS_SEND:
        err = sys_send(a0, (void __user *)a1, a[2], a[3]);
        break;
    case SYS_SENDTO:
        err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
                         (struct sockaddr __user *)a[4], a[5]);
        break;
    case SYS_RECV:
        err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
        break;
    case SYS_RECVFROM:
        err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                           (struct sockaddr __user *)a[4],
                           (int __user *)a[5]);
        break;
    case SYS_SHUTDOWN:
        err = sys_shutdown(a0, a1);
        break;
    case SYS_SETSOCKOPT:
        err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
        break;
    case SYS_GETSOCKOPT:
        err = sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                             (int __user *)a[4]);
        break;
    case SYS_SENDMSG:
        err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
        break;
    case SYS_RECVMSG:
        err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
        break;
    case SYS_RECVMMSG:
        err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
                           (struct timespec __user *)a[4]);
        break;
    case SYS_ACCEPT4:
        err = sys_accept4(a0, (struct sockaddr __user *)a1,
                          (int __user *)a[2], a[3]);
        break;
    default:
        err = -EINVAL;
        break;
    }
    return err;
}
首先調用copy_from_user 將用戶態參數拷貝至數組a 。但是問題在於,每個被調用的 API 的參數不盡相同,那麼每次拷貝的字節大小如何判定?
來看其第三個參數 len,它取自 nargs[call],其中 call 是操作碼,後面有個大大的 switch...case 就是判斷它。對應的操作碼定義在include/linux/net.h :
/*
 * Opcodes for the socketcall() multiplexer: the value passed as its `call`
 * argument.  Valid opcodes run from SYS_SOCKET (1) to SYS_RECVMMSG (19).
 */
#define SYS_SOCKET      1       /* sys_socket(2) */
#define SYS_BIND        2       /* sys_bind(2) */
#define SYS_CONNECT     3       /* sys_connect(2) */
#define SYS_LISTEN      4       /* sys_listen(2) */
#define SYS_ACCEPT      5       /* sys_accept(2) */
#define SYS_GETSOCKNAME 6       /* sys_getsockname(2) */
#define SYS_GETPEERNAME 7       /* sys_getpeername(2) */
#define SYS_SOCKETPAIR  8       /* sys_socketpair(2) */
#define SYS_SEND        9       /* sys_send(2) */
#define SYS_RECV        10      /* sys_recv(2) */
#define SYS_SENDTO      11      /* sys_sendto(2) */
#define SYS_RECVFROM    12      /* sys_recvfrom(2) */
#define SYS_SHUTDOWN    13      /* sys_shutdown(2) */
#define SYS_SETSOCKOPT  14      /* sys_setsockopt(2) */
#define SYS_GETSOCKOPT  15      /* sys_getsockopt(2) */
#define SYS_SENDMSG     16      /* sys_sendmsg(2) */
#define SYS_RECVMSG     17      /* sys_recvmsg(2) */
#define SYS_ACCEPT4     18      /* sys_accept4(2) */
#define SYS_RECVMMSG    19      /* sys_recvmmsg(2) */
而數組nargs則根據操作碼的不同,給出對應的參數所佔空間的大小:
/*
 * Argument list sizes for sys_socketcall
 *
 * nargs[op] is the size in BYTES of the argument array for opcode `op`
 * (AL(n) = n arguments of sizeof(unsigned long) each).  Entry 0 is a
 * placeholder: there is no opcode 0.
 */
#define AL(x) ((x) * sizeof(unsigned long))
static const unsigned char nargs[20] = {
    /*  0.. 5: (none), SOCKET, BIND, CONNECT, LISTEN, ACCEPT */
    AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
    /*  6..11: GETSOCKNAME, GETPEERNAME, SOCKETPAIR, SEND, RECV, SENDTO */
    AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
    /* 12..17: RECVFROM, SHUTDOWN, SETSOCKOPT, GETSOCKOPT, SENDMSG, RECVMSG */
    AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
    /* 18..19: ACCEPT4, RECVMMSG */
    AL(4), AL(5)
};
#undef AL
當拷貝完成參數後,就進入一個switch...case... 判斷操作碼,跳轉至對應的系統接口。
2、 sys_socket 函數
當用戶空間要建立一個socket 接口時,會調用 API 函數:
int socket(int domain, int type, int protocol);
函數,其三個參數分別表示協議族、協議類型(面向鏈接或無鏈接)以及協議。
協議族:
/*
 * Supported address families.
 *
 * These are the values accepted as the `domain`/`family` argument of
 * socket().  Values 27 and 28 are not assigned in this list, and AF_MAX is
 * one past the highest family number defined here.
 */
#define AF_UNSPEC       0
#define AF_UNIX         1       /* Unix domain sockets */
#define AF_LOCAL        1       /* POSIX name for AF_UNIX */
#define AF_INET         2       /* Internet IP Protocol */
#define AF_AX25         3       /* Amateur Radio AX.25 */
#define AF_IPX          4       /* Novell IPX */
#define AF_APPLETALK    5       /* AppleTalk DDP */
#define AF_NETROM       6       /* Amateur Radio NET/ROM */
#define AF_BRIDGE       7       /* Multiprotocol bridge */
#define AF_ATMPVC       8       /* ATM PVCs */
#define AF_X25          9       /* Reserved for X.25 project */
#define AF_INET6        10      /* IP version 6 */
#define AF_ROSE         11      /* Amateur Radio X.25 PLP */
#define AF_DECnet       12      /* Reserved for DECnet project */
#define AF_NETBEUI      13      /* Reserved for 802.2LLC project*/
#define AF_SECURITY     14      /* Security callback pseudo AF */
#define AF_KEY          15      /* PF_KEY key management API */
#define AF_NETLINK      16
#define AF_ROUTE        AF_NETLINK /* Alias to emulate 4.4BSD */
#define AF_PACKET       17      /* Packet family */
#define AF_ASH          18      /* Ash */
#define AF_ECONET       19      /* Acorn Econet */
#define AF_ATMSVC       20      /* ATM SVCs */
#define AF_RDS          21      /* RDS sockets */
#define AF_SNA          22      /* Linux SNA Project (nutters!) */
#define AF_IRDA         23      /* IRDA sockets */
#define AF_PPPOX        24      /* PPPoX sockets */
#define AF_WANPIPE      25      /* Wanpipe API Sockets */
#define AF_LLC          26      /* Linux LLC */
#define AF_CAN          29      /* Controller Area Network */
#define AF_TIPC         30      /* TIPC sockets */
#define AF_BLUETOOTH    31      /* Bluetooth sockets */
#define AF_IUCV         32      /* IUCV sockets */
#define AF_RXRPC        33      /* RxRPC sockets */
#define AF_ISDN         34      /* mISDN sockets */
#define AF_PHONET       35      /* Phonet sockets */
#define AF_IEEE802154   36      /* IEEE802154 sockets */
#define AF_CAIF         37      /* CAIF sockets */
#define AF_MAX          38      /* For now.. */

/*
 * Protocol families, same as address families.
 * Every PF_x is a plain alias of the matching AF_x.
 */
#define PF_UNSPEC       AF_UNSPEC
#define PF_UNIX         AF_UNIX
#define PF_LOCAL        AF_LOCAL
#define PF_INET         AF_INET
#define PF_AX25         AF_AX25
#define PF_IPX          AF_IPX
#define PF_APPLETALK    AF_APPLETALK
#define PF_NETROM       AF_NETROM
#define PF_BRIDGE       AF_BRIDGE
#define PF_ATMPVC       AF_ATMPVC
#define PF_X25          AF_X25
#define PF_INET6        AF_INET6
#define PF_ROSE         AF_ROSE
#define PF_DECnet       AF_DECnet
#define PF_NETBEUI      AF_NETBEUI
#define PF_SECURITY     AF_SECURITY
#define PF_KEY          AF_KEY
#define PF_NETLINK      AF_NETLINK
#define PF_ROUTE        AF_ROUTE
#define PF_PACKET       AF_PACKET
#define PF_ASH          AF_ASH
#define PF_ECONET       AF_ECONET
#define PF_ATMSVC       AF_ATMSVC
#define PF_RDS          AF_RDS
#define PF_SNA          AF_SNA
#define PF_IRDA         AF_IRDA
#define PF_PPPOX        AF_PPPOX
#define PF_WANPIPE      AF_WANPIPE
#define PF_LLC          AF_LLC
#define PF_CAN          AF_CAN
#define PF_TIPC         AF_TIPC
#define PF_BLUETOOTH    AF_BLUETOOTH
#define PF_IUCV         AF_IUCV
#define PF_RXRPC        AF_RXRPC
#define PF_ISDN         AF_ISDN
#define PF_PHONET       AF_PHONET
#define PF_IEEE802154   AF_IEEE802154
#define PF_CAIF         AF_CAIF
#define PF_MAX          AF_MAX
協議類型:
/*
 * Socket types: the values accepted as the `type` argument of socket().
 * Values 7..9 are not assigned in this enum.  The per-value descriptions
 * follow the kernel-doc in include/linux/net.h.
 */
enum sock_type {
    SOCK_STREAM    = 1,     /* stream (connection) socket */
    SOCK_DGRAM     = 2,     /* datagram (connectionless) socket */
    SOCK_RAW       = 3,     /* raw socket */
    SOCK_RDM       = 4,     /* reliably-delivered message */
    SOCK_SEQPACKET = 5,     /* sequential packet socket */
    SOCK_DCCP      = 6,     /* Datagram Congestion Control Protocol socket */
    SOCK_PACKET    = 10,    /* Linux-specific: packets at the device level */
};
socket 的建立通過操作碼 SYS_SOCKET 分發,由 sys_socket() 實現:
/*
 * socket() system call: create a socket and bind it to a file descriptor.
 *
 * @family:   address/protocol family, passed through to sock_create()
 * @type:     socket type in the SOCK_TYPE_MASK bits, optionally OR-ed with
 *            SOCK_CLOEXEC and/or SOCK_NONBLOCK
 * @protocol: protocol number, passed through to sock_create()
 *
 * Returns the value of sock_map_fd() on success, -EINVAL if `type` carries
 * flag bits other than SOCK_CLOEXEC/SOCK_NONBLOCK, or the negative error
 * from sock_create()/sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    int retval;
    struct socket *sock;
    int flags;

    /* Check the SOCK_* constants for consistency.  */
    BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    /* Split `type` into the flag bits and the real socket type. */
    flags = type & ~SOCK_TYPE_MASK;
    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;
    type &= SOCK_TYPE_MASK;

    /* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

    retval = sock_create(family, type, protocol, &sock);
    if (retval < 0)
        goto out;

    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
    if (retval < 0)
        goto out_release;

out:
    /* It may be already another descriptor 8) Not kernel problem. */
    return retval;

out_release:
    /* Mapping to a descriptor failed: drop the socket just created. */
    sock_release(sock);
    return retval;
}
這段代碼做了兩件事:
1> 分配 sock 與sk,協議簇的協議封裝;
2> sock 面向上層系統調用,主要是與文件系統交互。
通過進程的current指針的files,結合建立socket時返回的文件描述符,可以找到內核中對應的struct file,再根據file的f_dentry可以找到對應的目錄項,而目錄項struct dentry中,有d_inode指針,指向與sock封裝在一起的inode。
sock又與sk指針互指,一一對應。
3、 協議簇的協議封裝
/*
 * __sock_create - allocate a socket and let the protocol family set it up
 * @net:      network namespace, handed to the family's ->create()
 * @family:   protocol family; must satisfy 0 <= family < NPROTO
 * @type:     socket type; must satisfy 0 <= type < SOCK_MAX
 * @protocol: protocol number, handed to the family's ->create()
 * @res:      on success, *res receives the new socket
 * @kern:     passed unchanged to the security hooks and to ->create()
 *
 * Returns 0 on success or a negative errno: -EAFNOSUPPORT (bad or
 * unregistered family, or a module reference could not be taken), -EINVAL
 * (bad type), -ENFILE (sock_alloc() failed), or whatever the security
 * hooks or ->create() return.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
                  struct socket **res, int kern)
{
    int err;
    struct socket *sock;
    const struct net_proto_family *pf;

    /*
     *      Check protocol is in range
     */
    if (family < 0 || family >= NPROTO)
        return -EAFNOSUPPORT;
    if (type < 0 || type >= SOCK_MAX)
        return -EINVAL;

    /* Compatibility.

       This uglymoron is moved from INET layer to here to avoid
       deadlock in module load.
     */
    if (family == PF_INET && type == SOCK_PACKET) {
        static int warned;  /* print the notice only once */
        if (!warned) {
            warned = 1;
            printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
                   current->comm);
        }
        family = PF_PACKET;
    }

    err = security_socket_create(family, type, protocol, kern);
    if (err)
        return err;

    /*
     *	Allocate the socket and allow the family to set things up. if
     *	the protocol is 0, the family is instructed to select an appropriate
     *	default.
     */
    sock = sock_alloc();
    if (!sock) {
        if (net_ratelimit())
            printk(KERN_WARNING "socket: no more sockets\n");
        return -ENFILE;     /* Not exactly a match, but its the
                               closest posix thing */
    }

    sock->type = type;

#ifdef CONFIG_MODULES
    /* Attempt to load a protocol module if the find failed.
     *
     * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
     * requested real, full-featured networking support upon configuration.
     * Otherwise module support will break!
     */
    if (net_families[family] == NULL)
        request_module("net-pf-%d", family);
#endif

    /* Look up the registered family under RCU protection. */
    rcu_read_lock();
    pf = rcu_dereference(net_families[family]);
    err = -EAFNOSUPPORT;
    if (!pf)
        goto out_release;

    /*
     * We will call the ->create function, that possibly is in a loadable
     * module, so we have to bump that loadable module refcnt first.
     */
    if (!try_module_get(pf->owner))
        goto out_release;

    /* Now protected by module ref count */
    rcu_read_unlock();

    err = pf->create(net, sock, protocol, kern);
    if (err < 0)
        goto out_module_put;

    /*
     * Now to bump the refcnt of the [loadable] module that owns this
     * socket at sock_release time we decrement its refcnt.
     */
    if (!try_module_get(sock->ops->owner))
        goto out_module_busy;

    /*
     * Now that we're done with the ->create function, the [loadable]
     * module can have its refcnt decremented
     */
    module_put(pf->owner);
    err = security_socket_post_create(sock, family, type, protocol, kern);
    if (err)
        goto out_sock_release;
    *res = sock;

    return 0;

    /* Error unwinding: each label falls through to the ones below it. */
out_module_busy:
    err = -EAFNOSUPPORT;
out_module_put:
    sock->ops = NULL;
    module_put(pf->owner);
out_sock_release:
    sock_release(sock);
    return err;

out_release:
    /* Failure while still inside the RCU read-side section. */
    rcu_read_unlock();
    goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);
上面這個函數主要做了三件事:
1> sock_alloc()
在分析這個函數前,首先要了解:爲了對 socket 抽象出文件的概念,內核中爲socket定義了一個專門的文件系統類型sockfs。
/* Mount of the sockfs pseudo filesystem; assigned once in sock_init(). */
static struct vfsmount *sock_mnt __read_mostly;

/* Filesystem type descriptor for "sockfs", registered in sock_init(). */
static struct file_system_type sock_fs_type = {
    .name    = "sockfs",
    .mount   = sockfs_mount,        /* mount callback */
    .kill_sb = kill_anon_super,     /* superblock teardown callback */
};
在模塊初始化的時候,安裝該文件系統:
/*
 * sock_init - initialise the socket layer; run as a core_initcall.
 *
 * Sets up the sock and skbuff caches and the socket inode cache, then
 * registers and mounts sockfs, storing the mount in sock_mnt.
 * Always returns 0; the results of register_filesystem() and kern_mount()
 * are not checked here.
 */
static int __init sock_init(void)
{
    /*
     *      Initialize sock SLAB cache.
     */
    sk_init();

    /*
     *      Initialize skbuff SLAB cache
     */
    skb_init();

    /*
     *      Initialize the protocols module.
     */
    init_inodecache();
    register_filesystem(&sock_fs_type);
    sock_mnt = kern_mount(&sock_fs_type);

    /* The real protocol initialization is performed in later initcalls.
     */

#ifdef CONFIG_NETFILTER
    netfilter_init();
#endif

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
    skb_timestamping_init();
#endif

    return 0;
}

core_initcall(sock_init);	/* early initcall */
文件系統安裝中的一個重要步驟kern_mount->kern_mount_data->vfs_kern_mount:
vfs_kern_mount函數中,先根據註冊的文件系統類型,如果文件系統本身有mount成員函數則調用之,沒有則調用它的get_sb成員函數指針,獲取相應的超級塊sb 。最後,設置文件系統的超級塊成員指針,使之指向對應的值。
其中sockfs文件系統的mount函數調用mount_pseudo()實現超級塊的初始化,以及根節點inode和目錄項dentry的建立,sockfs_ops也在這裏關聯上文件系統。
而sock_alloc()中經由new_inode()函數分配inode 時,調用的是: sock_mnt->mnt_sb->s_op->alloc_inode(sock_mnt->mnt_sb);
/* Superblock operations for sockfs: inode allocation, teardown and statfs. */
static const struct super_operations sockfs_ops = {
    .alloc_inode   = sock_alloc_inode,      /* allocates socket + inode together */
    .destroy_inode = sock_destroy_inode,
    .statfs        = simple_statfs,
};
這個alloc_inode函數指針也就是sockfs_ops的sock_alloc_inode()函數。
/*
 * sock_alloc_inode - allocate a combined socket/inode object for sockfs.
 * @sb: superblock the inode is requested for (not used in this body)
 *
 * Takes a struct socket_alloc from sock_inode_cachep, kmalloc()s a separate
 * struct socket_wq for it, and puts the embedded socket into its initial
 * state.  Returns a pointer to the embedded vfs_inode, or NULL if either
 * allocation fails.
 */
static struct inode *sock_alloc_inode(struct super_block *sb)
{
    struct socket_alloc *ei;

    ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
    if (!ei)
        return NULL;
    ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL);
    if (!ei->socket.wq) {
        /* Undo the first allocation so nothing leaks. */
        kmem_cache_free(sock_inode_cachep, ei);
        return NULL;
    }
    init_waitqueue_head(&ei->socket.wq->wait);
    ei->socket.wq->fasync_list = NULL;

    /* Fresh socket: unconnected, no ops, no sock and no file attached yet. */
    ei->socket.state = SS_UNCONNECTED;
    ei->socket.flags = 0;
    ei->socket.ops = NULL;
    ei->socket.sk = NULL;
    ei->socket.file = NULL;

    return &ei->vfs_inode;
}
函數先在高速緩存 sock_inode_cachep 中分配了一個用於封裝socket和inode的ei,這樣,inode和socket就同時都被分配了。接下來爲socket分配等待隊列wq,並初始化socket的各個成員。
/*
 * Container holding a socket and its VFS inode in a single allocation, so
 * that one can be found from the other.
 */
struct socket_alloc {
    struct socket socket;       /* the socket half */
    struct inode vfs_inode;     /* the inode half, as seen by the VFS */
};
顯而易見,該結構實現了inode和socket的封裝。已經通過new_inode從sockfs文件系統分配一個inode,可以通過宏SOCKET_I來獲取與之對應的socket:
sock = SOCKET_I(inode);
分配inode、socket 以及二者如何關聯,都已一一分析了。
2> pf = rcu_dereference(net_families[family]);
net_families[family]的定義:
/*
 * Registered protocol families, indexed by family number.  A NULL slot
 * means that no handler is registered for that family.
 */
static const struct net_proto_family *net_families[NPROTO] __read_mostly;
net_proto_family的定義:
/* Descriptor that a protocol family hands to sock_register(). */
struct net_proto_family {
    int family;                     /* family number; the index into net_families[] */
    int (*create)(struct net *net, struct socket *sock,
                  int protocol, int kern);  /* per-family socket constructor */
    struct module *owner;           /* module reference-counted around ->create() */
};
net_families數組填充函數sock_register():
/**
 *	sock_register - add a socket protocol handler
 *	@ops: description of protocol
 *
 *	This function is called by a protocol handler that wants to
 *	advertise its address family, and have it linked into the
 *	socket interface. The value ops->family corresponds to the
 *	socket system call protocol family.
 *
 *	Returns 0 on success, -ENOBUFS if ops->family >= NPROTO, or
 *	-EEXIST if a handler is already registered for that family.
 *
 *	NOTE(review): only the upper bound of ops->family is checked; a
 *	negative family would index net_families[] out of range.
 *	NOTE(review): the "Registered" message below is printed even when
 *	the function returns -EEXIST.
 */
int sock_register(const struct net_proto_family *ops)
{
    int err;

    if (ops->family >= NPROTO) {
        printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
               NPROTO);
        return -ENOBUFS;
    }

    /* Check-and-set of the table slot happens under net_family_lock. */
    spin_lock(&net_family_lock);
    if (net_families[ops->family])
        err = -EEXIST;
    else {
        net_families[ops->family] = ops;
        err = 0;
    }
    spin_unlock(&net_family_lock);

    printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
    return err;
}
EXPORT_SYMBOL(sock_register);
從這裏我們看出每個協議族都是通過sock_register函數註冊到net_families數組中,通過代碼搜索可以發現各個協議族都會調用這個函數去註冊。
Af_ax25.c (net\ax25): sock_register(&ax25_family_ops); Af_bluetooth.c (net\bluetooth): err = sock_register(&bt_sock_family_ops); Af_can.c (net\can): sock_register(&can_family_ops); Af_decnet.c (net\decnet): sock_register(&dn_family_ops); Af_econet.c (net\econet): sock_register(&econet_family_ops); Af_ieee802154.c (net\ieee802154): rc = sock_register(&ieee802154_family_ops); Af_inet.c (net\ipv4): (void)sock_register(&inet_family_ops); Af_inet6.c (net\ipv6): err = sock_register(&inet6_family_ops); Af_ipx.c (net\ipx): sock_register(&ipx_family_ops); Af_irda.c (net\irda): rc = sock_register(&irda_family_ops); Af_iucv.c (net\iucv): err = sock_register(&iucv_sock_family_ops); Af_key.c (net\key): err = sock_register(&pfkey_family_ops); Af_llc.c (net\llc): rc = sock_register(&llc_ui_family_ops); Af_netlink.c (net\netlink): sock_register(&netlink_family_ops); Af_netrom.c (net\netrom): if (sock_register(&nr_family_ops)) { Af_packet.c (net\packet): sock_register(&packet_family_ops); Af_phonet.c (net\phonet): err = sock_register(&phonet_proto_family); Af_rds.c (net\rds): ret = sock_register(&rds_family_ops); Af_rose.c (net\rose): sock_register(&rose_family_ops); Af_rxrpc.c (net\rxrpc): ret = sock_register(&rxrpc_family_ops); Af_unix.c (net\unix): sock_register(&unix_family_ops); Af_x25.c (net\x25): rc = sock_register(&x25_family_ops); Caif_socket.c (net\caif): int err = sock_register(&caif_family_ops); Ddp.c (net\appletalk): (void)sock_register(&atalk_family_ops); Net.h (include\linux):extern int sock_register(const struct net_proto_family *fam); Pppox.c (drivers\net): return sock_register(&pppox_proto_family); Pvc.c (net\atm): return sock_register(&pvc_family_ops); Socket.c (drivers\isdn\misdn): err = sock_register(&mISDN_sock_family_ops); Socket.c (net): * sock_register - add a socket protocol handler Socket.c (net):int sock_register(const struct net_proto_family *ops) Socket.c (net):EXPORT_SYMBOL(sock_register); Socket.c (net\tipc): res = sock_register(&tipc_family_ops); 
Svc.c (net\atm): return sock_register(&svc_family_ops);
本文主要分析的是ipv4協議族,所以我們參考的文件是af_inet.c(net/ipv4)。
3> err = pf->create(net, sock, protocol, kern);
在af_inet.c裏面inet_init函數裏面調用sock_register註冊到協議族數組net_families裏:
(void)sock_register(&inet_family_ops);
接着看inet_family_ops定義:
/* PF_INET family descriptor: sockets of this family are built by inet_create(). */
static const struct net_proto_family inet_family_ops = {
    .family = PF_INET,
    .create = inet_create,
    .owner  = THIS_MODULE,
};
這裏的inet_create就是程序調用的函數:
/*
 *	Create an inet socket.
 *
 *	@net:      network namespace the socket is created in
 *	@sock:     socket to set up; sock->type has already been set by the caller
 *	@protocol: requested protocol; IPPROTO_IP acts as a wildcard
 *	@kern:     non-zero skips the CAP_NET_RAW check for SOCK_RAW
 *
 *	Finds the inet_protosw entry matching (sock->type, protocol) in inetsw[],
 *	attaches its ops to @sock, allocates the struct sock and initialises it.
 *	Returns 0 on success or a negative errno.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
    struct sock *sk;
    struct inet_protosw *answer;
    struct inet_sock *inet;
    struct proto *answer_prot;
    unsigned char answer_flags;
    char answer_no_check;
    int try_loading_module = 0;     /* number of module-load attempts so far */
    int err;

    if (unlikely(!inet_ehash_secret))
        if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
            build_ehash_secret();

    sock->state = SS_UNCONNECTED;

    /* Look for the requested type/protocol pair. */
lookup_protocol:
    /* Stays -ESOCKTNOSUPPORT if the list for this type is empty. */
    err = -ESOCKTNOSUPPORT;
    rcu_read_lock();
    list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

        /* err is 0 only when the loop is left through a break (match). */
        err = 0;
        /* Check the non-wild match. */
        if (protocol == answer->protocol) {
            if (protocol != IPPROTO_IP)
                break;
        } else {
            /* Check for the two wild cases. */
            if (IPPROTO_IP == protocol) {
                /* Caller gave the wildcard: adopt this entry's protocol. */
                protocol = answer->protocol;
                break;
            }
            if (IPPROTO_IP == answer->protocol)
                break;
        }
        err = -EPROTONOSUPPORT;
    }

    if (unlikely(err)) {
        /* No match: try loading a module, at most twice, then search again. */
        if (try_loading_module < 2) {
            rcu_read_unlock();
            /*
             * Be more specific, e.g. net-pf-2-proto-132-type-1
             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
             */
            if (++try_loading_module == 1)
                request_module("net-pf-%d-proto-%d-type-%d",
                               PF_INET, protocol, sock->type);
            /*
             * Fall back to generic, e.g. net-pf-2-proto-132
             * (net-pf-PF_INET-proto-IPPROTO_SCTP)
             */
            else
                request_module("net-pf-%d-proto-%d",
                               PF_INET, protocol);
            goto lookup_protocol;
        } else
            goto out_rcu_unlock;
    }

    err = -EPERM;
    if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
        goto out_rcu_unlock;

    err = -EAFNOSUPPORT;
    if (!inet_netns_ok(net, protocol))
        goto out_rcu_unlock;

    /* Copy what is needed from the entry before leaving the RCU section. */
    sock->ops = answer->ops;
    answer_prot = answer->prot;
    answer_no_check = answer->no_check;
    answer_flags = answer->flags;
    rcu_read_unlock();

    WARN_ON(answer_prot->slab == NULL);

    err = -ENOBUFS;
    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
    if (sk == NULL)
        goto out;

    err = 0;
    sk->sk_no_check = answer_no_check;
    if (INET_PROTOSW_REUSE & answer_flags)
        sk->sk_reuse = 1;

    /* inet_sk() views the sock just allocated as a struct inet_sock. */
    inet = inet_sk(sk);
    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

    inet->nodefrag = 0;

    if (SOCK_RAW == sock->type) {
        /* For raw sockets the protocol number is stored in inet_num. */
        inet->inet_num = protocol;
        if (IPPROTO_RAW == protocol)
            inet->hdrincl = 1;
    }

    if (ipv4_config.no_pmtu_disc)
        inet->pmtudisc = IP_PMTUDISC_DONT;
    else
        inet->pmtudisc = IP_PMTUDISC_WANT;

    inet->inet_id = 0;

    sock_init_data(sock, sk);

    sk->sk_destruct    = inet_sock_destruct;
    sk->sk_protocol    = protocol;
    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

    inet->uc_ttl    = -1;
    inet->mc_loop   = 1;
    inet->mc_ttl    = 1;
    inet->mc_all    = 1;
    inet->mc_index  = 0;
    inet->mc_list   = NULL;

    sk_refcnt_debug_inc(sk);

    if (inet->inet_num) {
        /* It assumes that any protocol which allows
         * the user to assign a number at socket
         * creation time automatically
         * shares.
         */
        inet->inet_sport = htons(inet->inet_num);
        /* Add to protocol hash chains. */
        sk->sk_prot->hash(sk);
    }

    /* Protocol-specific initialisation, if the protocol provides one. */
    if (sk->sk_prot->init) {
        err = sk->sk_prot->init(sk);
        if (err)
            sk_common_release(sk);
    }
out:
    return err;
out_rcu_unlock:
    rcu_read_unlock();
    goto out;
}
在分析inet_create()函數前,就要分析inetsw[SOCK_MAX]這個數組。
/*
 * One list head per socket type.  inet_register_protosw() adds entries to
 * inetsw[p->type] and inet_create() searches inetsw[sock->type].
 */
static struct list_head inetsw[SOCK_MAX];
這個數組是在inet_init()->inet_register_protosw()裏面填充的。
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q);
inetsw_array定義:
/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 *
 * Each entry ties a (socket type, protocol) pair to the protocol's
 * struct proto (.prot) and to the socket-level operations (.ops).
 */
static struct inet_protosw inetsw_array[] =
{
    {
        .type =       SOCK_STREAM,
        .protocol =   IPPROTO_TCP,
        .prot =       &tcp_prot,
        .ops =        &inet_stream_ops,
        .no_check =   0,
        .flags =      INET_PROTOSW_PERMANENT |
                      INET_PROTOSW_ICSK,
    },

    {
        .type =       SOCK_DGRAM,
        .protocol =   IPPROTO_UDP,
        .prot =       &udp_prot,
        .ops =        &inet_dgram_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_PERMANENT,
    },

    {
        .type =       SOCK_RAW,
        .protocol =   IPPROTO_IP,	/* wild card */
        .prot =       &raw_prot,
        .ops =        &inet_sockraw_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_REUSE,
    }
};

#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
inet_register_protosw函數分析:
/*
 * inet_register_protosw - add a protocol switch entry to inetsw[p->type].
 * @p: entry to register
 *
 * The entry is rejected, with an error message only (the function returns
 * void), when p->type >= SOCK_MAX or when a PERMANENT entry with the same
 * protocol number is already on the list.  Otherwise it is inserted right
 * after the last permanent entry, or at the head if there is none.
 */
void inet_register_protosw(struct inet_protosw *p)
{
    struct list_head *lh;
    struct inet_protosw *answer;
    int protocol = p->protocol;
    struct list_head *last_perm;

    spin_lock_bh(&inetsw_lock);

    if (p->type >= SOCK_MAX)
        goto out_illegal;

    /* If we are trying to override a permanent protocol, bail. */
    answer = NULL;
    last_perm = &inetsw[p->type];
    list_for_each(lh, &inetsw[p->type]) {
        answer = list_entry(lh, struct inet_protosw, list);

        /* Check only the non-wild match. */
        if (INET_PROTOSW_PERMANENT & answer->flags) {
            if (protocol == answer->protocol)
                break;
            last_perm = lh;
        }

        /* Not a conflict: answer is non-NULL after the loop only on break. */
        answer = NULL;
    }
    if (answer)
        goto out_permanent;

    /* Add the new entry after the last permanent entry if any, so that
     * the new entry does not override a permanent entry when matched with
     * a wild-card protocol. But it is allowed to override any existing
     * non-permanent entry.  This means that when we remove this entry, the
     * system automatically returns to the old behavior.
     */
    list_add_rcu(&p->list, last_perm);
out:
    spin_unlock_bh(&inetsw_lock);

    return;

out_permanent:
    printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
           protocol);
    goto out;

out_illegal:
    printk(KERN_ERR
           "Ignoring attempt to register invalid socket type %d.\n",
           p->type);
    goto out;
}
EXPORT_SYMBOL(inet_register_protosw);
這個函數完成的工作,就是把inetsw_array 數組中相同協議類型(type成員)下邊的協議,加入到inetsw 中對應協議類型的鏈表中去。
由於inetsw_array 中協議類型與協議事實上是一對一的關係,所以這個函數的實際執行過程要簡單得多:
由於註冊時對應鏈表中不存在其它成員,list_for_each 循環體不會執行,answer 保持爲NULL,所以不存在覆蓋和追加的情況,直接調用list_add_rcu(&p->list, last_perm);
把協議節點(struct inet_protosw 類型的數組的某個元素)添加到鏈表中(鏈表首部本身是一個數組inetsw[],數組索引是協議對應的協議類型的值),成爲該鏈表的第一個成員。
繼續分析inet_create()函數:
首先,根據sock->type 選定inetsw 中的鏈表,再按參數protocol 把之前在鏈表中註冊的協議節點找出。
然後,將建立的socket 的ops 函數指針集,指向協議類型對應的操作集,例如建立的是SOCK_STREAM,那麼就指向了inet_stream_ops; answer_prot 指針指向了當前要建立的socket 的協議類型下邊的協議,如上例,它就是IPPROTO_TCP 的tcp_prot結構。
接着,一個重要的工作,就是爲socket分配一個sock,並初始化它。
最後,初始化一個 inet 。
雖然create 的代碼就到這兒了,不過要說清楚sk(sock)的分配,還得費上大力氣。
每一個Socket 套接字,都有一個對應的 struct socket 結構來描述(內核中通常使用名稱爲sock),但是同時又有一個struct sock 結構(內核中通常使用名稱爲sk),二者之間是一一對應的關係。
在後面的sock_init_data 函數中,可以看到:
sk->sk_socket = sock;
sock->sk = sk;
socket 結構和 sock 結構其實是同一個事物的兩個方面。不妨說,socket 結構是面向進程和系統調用界面的側面,而 sock 結構則是面向底層驅動程序的側面。
設計者把socket套接字中,與文件系統關係比較密切的那一部份放在socket結構中,而把與通訊關係比較密切的那一部份,則單獨成爲一個數據結構,那就是sock 結構。
由於這兩部份邏輯上本來就是一體的,所以要通過指針互相指向對方,形成一對一的關係。
調用sk_alloc()分配一個sk:
在以前proto_register()函數建立的高速緩存中申請分配一個slab緩存項,並清零。而後設置協議族、並把sk中的sk_prot與對應的協議關聯起來。
分配完成sk後,另外一個重要的功能就是初始化它,
sk的成員相當複雜,其主要的初始化工作是在函數sock_init_data()中完成的:
sock 結構中,有三個重要的雙向隊列,分別是 sk_receive_queue、sk_write_queue 和sk_error_queue。從它們的名字就可以看出其作用了。
隊列並不是採用通用的list_head來維護,而是使用sk_buff隊列(struct sk_buff_head):
/* Head of a doubly linked queue of sk_buffs. */
struct sk_buff_head {
    /* These two members must be first. */
    struct sk_buff *next;
    struct sk_buff *prev;

    __u32 qlen;         /* presumably the number of queued buffers - confirm */
    spinlock_t lock;    /* lock associated with this queue */
};
這樣,隊列中指向的每一個sk_buff,就是一個數據包,三個隊列分別對應接收、發送和投遞錯誤。
inet 初始化:
inet 是一個struct inet_sock 結構類型,來看它的定義:
struct inet_sock { /* sk and pinet6 has to be the first two members of inet_sock */ struct sock sk; …… }
只留意它的第一個成員就足夠了。
我們說sock 是面向用戶態調用,而sk是面向內核驅動調用的,那sk是如何與協議棧交互的呢?
對於每個類型的協議,爲了與sk聯繫起來,都定義了一個struct XXX_sock 結構,XXX是協議名,例如:
struct tcp_sock { /* inet_sock has to be the first member of tcp_sock */ struct inet_sock inet; int tcp_header_len; /* Bytes of tcp header to send */ …… }
很明顯,它們的結構定義是「af_inet 通用屬性 + 本身的私有屬性」,因爲它們的第一個成員總是inet 。
現在回頭來看一下起初在af_inet.c中,通過proto_register()註冊協議時的obj_size成員,對於tcp而言:
struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, ... .obj_size = sizeof(struct tcp_sock), ... };
其它協議相似。
以obj_size 來確定每個 slab 緩存項分配的大小,所以,我們就可以說,每次申請分配的,實際上是一個struct XXX_sock 結構大小的結構。因爲 struct sock 都是定義於上層結構的第一個成員,可以使用強制類型轉換來使用這塊分配的內存空間。例如:
struct inet_sock { /* sk and pinet6 has to be the first two members of inet_sock */ struct sock sk; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct ipv6_pinfo *pinet6; #endif /* Socket demultiplex comparisons on incoming packets. */ __be32 inet_daddr; __be32 inet_rcv_saddr; __be16 inet_dport; __u16 inet_num; __be32 inet_saddr; __s16 uc_ttl; __u16 cmsg_flags; __be16 inet_sport; __u16 inet_id; ... }; inet = inet_sk(sk); static inline struct inet_sock *inet_sk(const struct sock *sk) { return (struct inet_sock *)sk; //inet_sock->sk }
struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ ... }; struct tcp_sock *tp = tcp_sk(sk); static inline struct tcp_sock *tcp_sk(const struct sock *sk) { return (struct tcp_sock *)sk; //tcp_sock->inet_conn->icsk_inet->sk }
inet_create()運行完,一個 socket 套接字基本上就建立完畢了,剩下的就是與文件系統掛鉤。
4、與文件系統交互
回到sys_socket()函數中來,它在調用完sock_create()後,緊接着調用sock_map_fd()函數:
/*
 * sock_map_fd - bind a socket to a new file descriptor.
 * @sock:  socket to expose as a file
 * @flags: flags forwarded to sock_alloc_file()
 *
 * Returns the value of sock_alloc_file(): a descriptor >= 0 on success
 * (in which case the new file is installed in the descriptor table), or a
 * negative value on failure.
 */
int sock_map_fd(struct socket *sock, int flags)
{
    struct file *newfile;
    int fd = sock_alloc_file(sock, &newfile, flags);

    /* Publish the file only once the descriptor is valid. */
    if (likely(fd >= 0))
        fd_install(fd, newfile);

    return fd;
}
EXPORT_SYMBOL(sock_map_fd);
這個函數的核心思想,在一開始,就已經分析過了。
從進程的角度來說,一個 socket 套接字就是一個特殊的,已打開的文件。
前面分配好一個socket後,這裏要做的就是將它與文件系統拉上親戚關係。
首先獲取一個空閒的文件描述符號和file結構。而後在文件系統中分配一個目錄項(d_alloc),使其指向已經分配的inode節點(d_add),而後把其目錄項掛在sockfs文件系統的根目錄之下。
並且把目錄項的指針d_op設置成指向 sockfs_dentry_operations,這個數據結構通過函數指針提供與文件路徑有關的操作:
/* Dentry operations for sockfs entries; only the d_dname hook is supplied. */
static const struct dentry_operations sockfs_dentry_operations = {
    .d_dname = sockfs_dname,
};
最後一步,就是將file結構中的f_op和與sock封裝在一起的inode結構中的i_fop都指向socket_file_ops,它是一個函數指針集,指向了socket面向文件系統的用戶態調用的一些接口函數:
/*
 *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *	in the operation structures but are done directly via the socketcall() multiplexor.
 *
 *	This table holds the generic file operations that a socket's file uses.
 */
static const struct file_operations socket_file_ops = {
    .owner =	THIS_MODULE,
    .llseek =	no_llseek,
    .aio_read =	sock_aio_read,
    .aio_write =	sock_aio_write,
    .poll =		sock_poll,
    .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = compat_sock_ioctl,
#endif
    .mmap =		sock_mmap,
    .open =		sock_no_open,	/* special open code to disallow open via /proc */
    .release =	sock_close,
    .fasync =	sock_fasync,
    .sendpage =	sock_sendpage,
    .splice_write = generic_splice_sendpage,
    .splice_read =	sock_splice_read,
};
到這裏,整個socket 套接字的建立工作,就宣告完成了。