下面通過學習linux 1.2.13源碼進一步理解socket通訊機制。對該版本源碼的學習主要參考《Linux內核網絡棧源代碼情景分析》(曹桂平 編著)。
要理解socket的本質,就要理解當調用socket函數時,該函數到底創建了什麼?返回了什麼?
/*
 * User-level entry point for creating a socket.  The kernel-side
 * counterpart examined in this article is sock_socket() in socket.c.
 */
int socket(int family, int type, int protocol);
socket 函數爲用戶層函數,該函數對應的內核函數爲sock_socket(socket.c文件),源碼如下:
static int sock_socket(int family, int type, int protocol) { int i, fd; struct socket *sock; struct proto_ops *ops; /* Locate the correct protocol family. */ for (i = 0; i < NPROTO; ++i) { if (pops[i] == NULL) continue; if (pops[i]->family == family) break; } if (i == NPROTO) { return -EINVAL; } ops = pops[i]; /* * Check that this is a type that we know how to manipulate and * the protocol makes sense here. The family can still reject the * protocol later. */ if ((type != SOCK_STREAM && type != SOCK_DGRAM && type != SOCK_SEQPACKET && type != SOCK_RAW && type != SOCK_PACKET) || protocol < 0) return(-EINVAL); /* * Allocate the socket and allow the family to set things up. if * the protocol is 0, the family is instructed to select an appropriate * default. */ if (!(sock = sock_alloc())) { printk("NET: sock_socket: no more sockets\n"); return(-ENOSR); /* Was: EAGAIN, but we are out of system resources! */ } sock->type = type; sock->ops = ops; if ((i = sock->ops->create(sock, protocol)) < 0) { sock_release(sock); return(i); } if ((fd = get_fd(SOCK_INODE(sock))) < 0) { sock_release(sock); return(-EINVAL); } return(fd); }
sock_socket 函數完成以下工作:
(1)分配socket、sock結構,這兩個結構在網絡棧的不同層次表示一個套接字連接。
(2)分配inode、file結構用於普通文件操作。
(3)分配一個文件描述符並返回給應用程序,作爲以後的操作句柄。
sock_alloc 函數用於分配一個inode節點,並返回該節點的socket指針。
struct socket *sock_alloc(void) { struct inode * inode; struct socket * sock; inode = get_empty_inode(); if (!inode) return NULL; inode->i_mode = S_IFSOCK; inode->i_sock = 1; inode->i_uid = current->uid; inode->i_gid = current->gid; sock = &inode->u.socket_i; sock->state = SS_UNCONNECTED; sock->flags = 0; sock->ops = NULL; sock->data = NULL; sock->conn = NULL; sock->iconn = NULL; sock->next = NULL; sock->wait = &inode->i_wait; sock->inode = inode; /* "backlink": we could use pointer arithmetic instead */ sock->fasync_list = NULL; sockets_in_use++; return sock; }
inode的定義如下:
/* include/fs.h */
/*
 * struct inode - file-system inode.
 *
 * For sockets, sock_alloc() fills in i_mode, i_sock, i_uid and i_gid and
 * uses the socket_i member of the trailing union as the socket itself.
 */
struct inode {
	dev_t i_dev;
	unsigned long i_ino;
	umode_t i_mode;			/* sock_alloc() stores S_IFSOCK here */
	nlink_t i_nlink;
	uid_t i_uid;			/* sock_alloc() copies current->uid */
	gid_t i_gid;			/* sock_alloc() copies current->gid */
	dev_t i_rdev;
	off_t i_size;
	time_t i_atime;
	time_t i_mtime;
	time_t i_ctime;
	unsigned long i_blksize;
	unsigned long i_blocks;
	unsigned long i_version;
	struct semaphore i_sem;
	struct inode_operations * i_op;
	struct super_block * i_sb;
	struct wait_queue * i_wait;	/* a socket's wait pointer is aimed at this field */
	struct file_lock * i_flock;
	struct vm_area_struct * i_mmap;
	struct inode * i_next, * i_prev;
	struct inode * i_hash_next, * i_hash_prev;
	struct inode * i_bound_to, * i_bound_by;
	struct inode * i_mount;
	unsigned short i_count;		/* incremented by get_fd() when a file is attached */
	unsigned short i_wcount;
	unsigned short i_flags;
	unsigned char i_lock;
	unsigned char i_dirt;
	unsigned char i_pipe;
	unsigned char i_sock;		/* set to 1 by sock_alloc() */
	unsigned char i_seek;
	unsigned char i_update;
	/*
	 * Per-type payload.  NOTE(review): presumably one member per
	 * file-system / object type -- only the socket_i usage is visible
	 * in this article.
	 */
	union {
		struct pipe_inode_info pipe_i;
		struct minix_inode_info minix_i;
		struct ext_inode_info ext_i;
		struct ext2_inode_info ext2_i;
		struct hpfs_inode_info hpfs_i;
		struct msdos_inode_info msdos_i;
		struct umsdos_inode_info umsdos_i;
		struct iso_inode_info isofs_i;
		struct nfs_inode_info nfs_i;
		struct xiafs_inode_info xiafs_i;
		struct sysv_inode_info sysv_i;
		struct socket socket_i;	/* the socket returned by sock_alloc() */
		void * generic_ip;
	} u;
};
inode 結構是文件系統的一個結構體,該結構體中的聯合體成員u的內容取決於該inode所代表的具體文件類型;當inode用於socket通訊時,使用的就是u中的socket_i成員。sock_alloc 的作用就是創建inode結構體,然後返回其中socket_i的地址。至於具體如何分配inode涉及到文件系統方面的知識,這裏暫不討論。
當協議族爲AF_INET時,ops->create 將調用inet_create(struct socket *sock, int protocol)函數。該函數將創建一個sock結構體,並使socket的data指針指向該sock結構體。
static int inet_create(struct socket *sock, int protocol) { struct sock *sk; struct proto *prot; int err; sk = (struct sock *) kmalloc(sizeof(*sk), GFP_KERNEL); if (sk == NULL) return(-ENOBUFS); sk->num = 0; sk->reuse = 0; switch(sock->type) { case SOCK_STREAM: case SOCK_SEQPACKET: if (protocol && protocol != IPPROTO_TCP) { kfree_s((void *)sk, sizeof(*sk)); return(-EPROTONOSUPPORT); } protocol = IPPROTO_TCP; sk->no_check = TCP_NO_CHECK; prot = &tcp_prot; break; case SOCK_DGRAM: if (protocol && protocol != IPPROTO_UDP) { kfree_s((void *)sk, sizeof(*sk)); return(-EPROTONOSUPPORT); } protocol = IPPROTO_UDP; sk->no_check = UDP_NO_CHECK; prot=&udp_prot; break; case SOCK_RAW: if (!suser()) { kfree_s((void *)sk, sizeof(*sk)); return(-EPERM); } if (!protocol) { kfree_s((void *)sk, sizeof(*sk)); return(-EPROTONOSUPPORT); } prot = &raw_prot; sk->reuse = 1; sk->no_check = 0; /* * Doesn't matter no checksum is * performed anyway. */ sk->num = protocol; break; case SOCK_PACKET: if (!suser()) { kfree_s((void *)sk, sizeof(*sk)); return(-EPERM); } if (!protocol) { kfree_s((void *)sk, sizeof(*sk)); return(-EPROTONOSUPPORT); } prot = &packet_prot; sk->reuse = 1; sk->no_check = 0; /* Doesn't matter no checksum is * performed anyway. */ sk->num = protocol; break; default: kfree_s((void *)sk, sizeof(*sk)); return(-ESOCKTNOSUPPORT); } sk->socket = sock; #ifdef CONFIG_TCP_NAGLE_OFF sk->nonagle = 1; #else sk->nonagle = 0; #endif sk->type = sock->type; sk->stamp.tv_sec=0; sk->protocol = protocol; ...... sk->timer.function = &net_timer; skb_queue_head_init(&sk->back_log); sk->blog = 0; sock->data =(void *) sk; //socket 指向 sock sk->dummy_th.doff = sizeof(sk->dummy_th)/4; ...... if (sk->prot->init) { err = sk->prot->init(sk); if (err != 0) { destroy_sock(sk); return(err); } } return(0); }
最後調用get_fd 返回一個文件描述符給上層應用。
/* socket.c */ static int get_fd(struct inode *inode) { int fd; struct file *file; /* * Find a file descriptor suitable for return to the user. */ file = get_empty_filp(); // 獲取一個閒置的file結構 if (!file) return(-1); for (fd = 0; fd < NR_OPEN; ++fd) if (!current->files->fd[fd]) break; if (fd == NR_OPEN) { file->f_count = 0; return(-1); } FD_CLR(fd, ¤t->files->close_on_exec); current->files->fd[fd] = file; file->f_op = &socket_file_ops; // socket 文件操做 file->f_mode = 3; file->f_flags = O_RDWR; file->f_count = 1; file->f_inode = inode; if (inode) inode->i_count++; file->f_pos = 0; return(fd); }
get_fd 用於爲網絡套接字分配一個文件描述符,分配描述符的同時需要一個file結構,每個file結構都需要一個inode結構對應。內核維護一個file結構數組,get_empty_filp 函數即通過檢查該數組,獲取一個閒置的成員。f_op 字段的賦值實現了網絡操作的普通文件接口。如果調用write、read函數進行操作,就會調用相應的sock_write 和 sock_read 函數。
如何根據文件描述符fd找到相應的sock?