Linux TCP/IP 協議棧之 Socket 的實現分析(一)

內核版本:2.6.37
參考[作者:kendo 的文章(基於內核版本 2.6.12)]

第一部分 Socket 套接字的創建

socket 並非 TCP/IP 協議的一部分。
從廣義上來說,socket 是 Unix/Linux 抽象的進程間通信的一種方法。網絡 socket 通信僅僅是其若干協議中的一類,而 TCP/IP 又是網絡這類中的一種。
從 TCP/IP 的角度看 socket,它更多地體現了用戶 API 與協議棧之間的一個中間接口層。用戶通過調用 socket API 將報文遞交給協議棧,或者從協議棧中接收報文。

1、系統總入口
Linux 內核爲所有與 socket 有關的操作的 API,提供了一個統一的系統調用入口,其代碼在 net/socket.c 中:

/*
 *    System call vectors.
 *
 *    Argument checking cleaned up. Saved 20% in size.
 *  This function doesn't need to set the kernel lock because
 *  it is set by the callees.
 *
 *    socketcall() multiplexes the socket API: @call selects one of the
 *    SYS_* operations and @args points to a user-space array that holds
 *    the arguments of that operation, one unsigned long per argument.
 *
 *    Returns -EINVAL when @call is outside 1..SYS_RECVMMSG or when the
 *    nargs[] entry exceeds the local buffer, -EFAULT when the arguments
 *    cannot be copied from user space, and otherwise the value returned
 *    by the selected sys_*() handler.
 */

SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
    unsigned long a[6];    /* kernel copy of the arguments (six words at most) */
    unsigned long a0, a1;
    int err;
    unsigned int len;

    /* Reject opcodes outside the range covered by nargs[]. */
    if (call < 1 || call > SYS_RECVMMSG)
        return -EINVAL;

    /* nargs[] stores the argument list size in bytes, not in words. */
    len = nargs[call];
    if (len > sizeof(a))
        return -EINVAL;

    /* copy_from_user should be SMP safe. */
    if (copy_from_user(a, args, len))
        return -EFAULT;

    /* The audit hook takes an argument count, so convert bytes back to words. */
    audit_socketcall(nargs[call] / sizeof(unsigned long), a);

    /* Every operation uses at least the first two words; cache them. */
    a0 = a[0];
    a1 = a[1];

    /* Dispatch to the real handler, casting words to the expected types. */
    switch (call) {
    case SYS_SOCKET:
        err = sys_socket(a0, a1, a[2]);
        break;
    case SYS_BIND:
        err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_CONNECT:
        err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_LISTEN:
        err = sys_listen(a0, a1);
        break;
    case SYS_ACCEPT:
        /* Plain accept is served by accept4 with a flags value of 0. */
        err = sys_accept4(a0, (struct sockaddr __user *)a1,
                  (int __user *)a[2], 0);
        break;
    case SYS_GETSOCKNAME:
        err =
            sys_getsockname(a0, (struct sockaddr __user *)a1,
                    (int __user *)a[2]);
        break;
    case SYS_GETPEERNAME:
        err =
            sys_getpeername(a0, (struct sockaddr __user *)a1,
                    (int __user *)a[2]);
        break;
    case SYS_SOCKETPAIR:
        err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
        break;
    case SYS_SEND:
        err = sys_send(a0, (void __user *)a1, a[2], a[3]);
        break;
    case SYS_SENDTO:
        err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
                 (struct sockaddr __user *)a[4], a[5]);
        break;
    case SYS_RECV:
        err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
        break;
    case SYS_RECVFROM:
        err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                   (struct sockaddr __user *)a[4],
                   (int __user *)a[5]);
        break;
    case SYS_SHUTDOWN:
        err = sys_shutdown(a0, a1);
        break;
    case SYS_SETSOCKOPT:
        err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
        break;
    case SYS_GETSOCKOPT:
        err =
            sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                   (int __user *)a[4]);
        break;
    case SYS_SENDMSG:
        err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
        break;
    case SYS_RECVMSG:
        err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
        break;
    case SYS_RECVMMSG:
        err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
                   (struct timespec __user *)a[4]);
        break;
    case SYS_ACCEPT4:
        err = sys_accept4(a0, (struct sockaddr __user *)a1,
                  (int __user *)a[2], a[3]);
        break;
    default:
        /* In-range opcode without a case above. */
        err = -EINVAL;
        break;
    }
    return err;
}

首先調用 copy_from_user 將用戶態參數拷貝至數組 a。但是問題在於,每個被調用的 API 的參數不盡相同,那麼每次拷貝的字節大小如何判定?
來看其第三個參數 len,它取自 nargs[call],其中 call 是操作碼,後面那個大大的 switch...case 就是在判斷它。對應的操作碼定義在 include/linux/net.h:

#define SYS_SOCKET    1        /* sys_socket(2)        */
#define SYS_BIND    2        /* sys_bind(2)            */
#define SYS_CONNECT    3        /* sys_connect(2)        */
#define SYS_LISTEN    4        /* sys_listen(2)        */
#define SYS_ACCEPT    5        /* sys_accept(2)        */
#define SYS_GETSOCKNAME    6        /* sys_getsockname(2)        */
#define SYS_GETPEERNAME    7        /* sys_getpeername(2)        */
#define SYS_SOCKETPAIR    8        /* sys_socketpair(2)        */
#define SYS_SEND    9        /* sys_send(2)            */
#define SYS_RECV    10        /* sys_recv(2)            */
#define SYS_SENDTO    11        /* sys_sendto(2)        */
#define SYS_RECVFROM    12        /* sys_recvfrom(2)        */
#define SYS_SHUTDOWN    13        /* sys_shutdown(2)        */
#define SYS_SETSOCKOPT    14        /* sys_setsockopt(2)        */
#define SYS_GETSOCKOPT    15        /* sys_getsockopt(2)        */
#define SYS_SENDMSG    16        /* sys_sendmsg(2)        */
#define SYS_RECVMSG    17        /* sys_recvmsg(2)        */
#define SYS_ACCEPT4    18        /* sys_accept4(2)        */
#define SYS_RECVMMSG    19        /* sys_recvmmsg(2)        */

而數組 nargs 則根據操作碼的不同,給出對應參數所佔的空間大小(以字節爲單位):

/* Argument list sizes for sys_socketcall */
/* AL(x) converts an argument count into a size in bytes. */
#define AL(x) ((x) * sizeof(unsigned long))
/*
 * Indexed by the SYS_* opcode. Entry 0 is a placeholder: socketcall()
 * rejects call < 1 before the table is consulted.
 */
static const unsigned char nargs[20] = {
    /* index 0 (unused), then opcodes 1..5 */
    AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
    /* opcodes 6..11 */
    AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
    /* opcodes 12..17 */
    AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
    /* opcodes 18 (SYS_ACCEPT4) and 19 (SYS_RECVMMSG) */
    AL(4), AL(5)
};

#undef AL

當參數拷貝完成後,就進入一個 switch...case...,根據操作碼跳轉至對應的系統接口。

2、sys_socket 函數

當用戶空間要創建一個 socket 接口時,會調用 API 函數:

int socket(int domain, int type, int protocol);

其三個參數分別表示協議族、協議類型(面向連接或無連接)以及協議。
協議族:

/* Supported address families. */
#define AF_UNSPEC    0
#define AF_UNIX        1    /* Unix domain sockets         */
#define AF_LOCAL    1    /* POSIX name for AF_UNIX    */
#define AF_INET        2    /* Internet IP Protocol     */
#define AF_AX25        3    /* Amateur Radio AX.25         */
#define AF_IPX        4    /* Novell IPX             */
#define AF_APPLETALK    5    /* AppleTalk DDP         */
#define AF_NETROM    6    /* Amateur Radio NET/ROM     */
#define AF_BRIDGE    7    /* Multiprotocol bridge     */
#define AF_ATMPVC    8    /* ATM PVCs            */
#define AF_X25        9    /* Reserved for X.25 project     */
#define AF_INET6    10    /* IP version 6            */
#define AF_ROSE        11    /* Amateur Radio X.25 PLP    */
#define AF_DECnet    12    /* Reserved for DECnet project    */
#define AF_NETBEUI    13    /* Reserved for 802.2LLC project*/
#define AF_SECURITY    14    /* Security callback pseudo AF */
#define AF_KEY        15      /* PF_KEY key management API */
#define AF_NETLINK    16
#define AF_ROUTE    AF_NETLINK /* Alias to emulate 4.4BSD */
#define AF_PACKET    17    /* Packet family        */
#define AF_ASH        18    /* Ash                */
#define AF_ECONET    19    /* Acorn Econet            */
#define AF_ATMSVC    20    /* ATM SVCs            */
#define AF_RDS        21    /* RDS sockets             */
#define AF_SNA        22    /* Linux SNA Project (nutters!) */
#define AF_IRDA        23    /* IRDA sockets            */
#define AF_PPPOX    24    /* PPPoX sockets        */
#define AF_WANPIPE    25    /* Wanpipe API Sockets */
#define AF_LLC        26    /* Linux LLC            */
#define AF_CAN        29    /* Controller Area Network      */
#define AF_TIPC        30    /* TIPC sockets            */
#define AF_BLUETOOTH    31    /* Bluetooth sockets         */
#define AF_IUCV        32    /* IUCV sockets            */
#define AF_RXRPC    33    /* RxRPC sockets         */
#define AF_ISDN        34    /* mISDN sockets         */
#define AF_PHONET    35    /* Phonet sockets        */
#define AF_IEEE802154    36    /* IEEE802154 sockets        */
#define AF_CAIF        37    /* CAIF sockets            */
#define AF_MAX        38    /* For now.. */

/* Protocol families, same as address families. */
#define PF_UNSPEC    AF_UNSPEC
#define PF_UNIX        AF_UNIX
#define PF_LOCAL    AF_LOCAL
#define PF_INET        AF_INET
#define PF_AX25        AF_AX25
#define PF_IPX        AF_IPX
#define PF_APPLETALK    AF_APPLETALK
#define PF_NETROM    AF_NETROM
#define PF_BRIDGE    AF_BRIDGE
#define PF_ATMPVC    AF_ATMPVC
#define PF_X25        AF_X25
#define PF_INET6            AF_INET6
#define PF_ROSE        AF_ROSE
#define PF_DECnet    AF_DECnet
#define PF_NETBEUI    AF_NETBEUI
#define PF_SECURITY    AF_SECURITY
#define PF_KEY        AF_KEY
#define PF_NETLINK    AF_NETLINK
#define PF_ROUTE    AF_ROUTE
#define PF_PACKET    AF_PACKET
#define PF_ASH        AF_ASH
#define PF_ECONET    AF_ECONET
#define PF_ATMSVC    AF_ATMSVC
#define PF_RDS        AF_RDS
#define PF_SNA        AF_SNA
#define PF_IRDA        AF_IRDA
#define PF_PPPOX            AF_PPPOX
#define PF_WANPIPE    AF_WANPIPE
#define PF_LLC        AF_LLC
#define PF_CAN        AF_CAN
#define PF_TIPC        AF_TIPC
#define PF_BLUETOOTH    AF_BLUETOOTH
#define PF_IUCV        AF_IUCV
#define PF_RXRPC    AF_RXRPC
#define PF_ISDN        AF_ISDN
#define PF_PHONET    AF_PHONET
#define PF_IEEE802154    AF_IEEE802154
#define PF_CAIF        AF_CAIF
#define PF_MAX        AF_MAX

協議類型:

enum sock_type {
    SOCK_STREAM    = 1,
    SOCK_DGRAM    = 2,
    SOCK_RAW    = 3,
    SOCK_RDM    = 4,
    SOCK_SEQPACKET    = 5,
    SOCK_DCCP    = 6,
    SOCK_PACKET    = 10,
};

創建 socket 對應的操作碼是 SYS_SOCKET,由 sys_socket() 實現:

/*
 * socket(2): create a socket and bind it to a new file descriptor.
 *
 * @type carries the socket type in the SOCK_TYPE_MASK bits and may also
 * carry SOCK_CLOEXEC / SOCK_NONBLOCK; any other flag bit yields -EINVAL.
 *
 * Returns the new descriptor on success, or the negative error reported
 * by sock_create() or sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    struct socket *sock;
    int fd_flags;
    int rc;

    /* Compile-time sanity checks on the SOCK_* constants. */
    BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    /* Split @type into the flag bits and the bare socket type. */
    fd_flags = type & ~SOCK_TYPE_MASK;
    if (fd_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;
    type &= SOCK_TYPE_MASK;

    /* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
    if (SOCK_NONBLOCK != O_NONBLOCK && (fd_flags & SOCK_NONBLOCK))
        fd_flags = (fd_flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

    rc = sock_create(family, type, protocol, &sock);
    if (rc < 0)
        return rc;

    /* On failure to obtain a descriptor the fresh socket is released. */
    rc = sock_map_fd(sock, fd_flags & (O_CLOEXEC | O_NONBLOCK));
    if (rc < 0)
        sock_release(sock);

    return rc;
}

這段代碼做了兩件事:

1>  調用 sock_create():分配 sock 與 sk,並完成協議簇的協議封裝;

2>  調用 sock_map_fd():sock 面向上層系統調用,主要是與文件系統交互。

  通過進程的 current 指針的 files,結合創建 socket 時返回的文件描述符,可以找到內核中對應的 struct file,再根據 file 的 f_dentry 可以找到對應的目錄項,而目錄項 struct dentry 中有 d_inode 指針,指向與 sock 封裝在一起的 inode。

  sock 又與 sk 指針互指,一一對應。

3、 協議簇的協議封裝

/*
 *    __sock_create - allocate a socket and let its protocol family set it up
 *    @net: passed unchanged to the family's ->create() hook
 *    @family: index into net_families[]; must lie in [0, NPROTO)
 *    @type: socket type; must lie in [0, SOCK_MAX)
 *    @protocol: forwarded to the security hooks and to ->create()
 *    @res: receives the new socket; written only on success
 *    @kern: forwarded to the security hooks and to ->create()
 *
 *    Returns 0 on success. Returns -EAFNOSUPPORT when @family is out of
 *    range, has no registered handler, or a module reference cannot be
 *    taken; -EINVAL when @type is out of range; -ENFILE when sock_alloc()
 *    fails; otherwise the error reported by a security hook or ->create().
 */
int __sock_create(struct net *net, int family, int type, int protocol,
             struct socket **res, int kern)
{
    int err;
    struct socket *sock;
    const struct net_proto_family *pf;

    /*
     *      Check protocol is in range
     */
    if (family < 0 || family >= NPROTO)
        return -EAFNOSUPPORT;
    if (type < 0 || type >= SOCK_MAX)
        return -EINVAL;

    /* Compatibility.

       This uglymoron is moved from INET layer to here to avoid
       deadlock in module load.
     */
    if (family == PF_INET && type == SOCK_PACKET) {
        /* Warn only once; the request is then redirected to PF_PACKET. */
        static int warned;
        if (!warned) {
            warned = 1;
            printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
                   current->comm);
        }
        family = PF_PACKET;
    }

    err = security_socket_create(family, type, protocol, kern);
    if (err)
        return err;

    /*
     *    Allocate the socket and allow the family to set things up. if
     *    the protocol is 0, the family is instructed to select an appropriate
     *    default.
     */
    sock = sock_alloc();
    if (!sock) {
        if (net_ratelimit())
            printk(KERN_WARNING "socket: no more sockets\n");
        return -ENFILE;    /* Not exactly a match, but its the
                   closest posix thing */
    }

    sock->type = type;

#ifdef CONFIG_MODULES
    /* Attempt to load a protocol module if the find failed.
     *
     * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
     * requested real, full-featured networking support upon configuration.
     * Otherwise module support will break!
     */
    if (net_families[family] == NULL)
        request_module("net-pf-%d", family);
#endif

    /* Look the family handler up under RCU; err is preset for both failure exits. */
    rcu_read_lock();
    pf = rcu_dereference(net_families[family]);
    err = -EAFNOSUPPORT;
    if (!pf)
        goto out_release;

    /*
     * We will call the ->create function, that possibly is in a loadable
     * module, so we have to bump that loadable module refcnt first.
     */
    if (!try_module_get(pf->owner))
        goto out_release;

    /* Now protected by module ref count */
    rcu_read_unlock();

    err = pf->create(net, sock, protocol, kern);
    if (err < 0)
        goto out_module_put;

    /*
     * Now to bump the refcnt of the [loadable] module that owns this
     * socket at sock_release time we decrement its refcnt.
     */
    if (!try_module_get(sock->ops->owner))
        goto out_module_busy;

    /*
     * Now that we're done with the ->create function, the [loadable]
     * module can have its refcnt decremented
     */
    module_put(pf->owner);
    err = security_socket_post_create(sock, family, type, protocol, kern);
    if (err)
        goto out_sock_release;
    *res = sock;

    return 0;

    /* Error unwinding: each label falls through into the next one. */
out_module_busy:
    err = -EAFNOSUPPORT;
out_module_put:
    /* Clear ops before the release, then drop the family module reference. */
    sock->ops = NULL;
    module_put(pf->owner);
out_sock_release:
    sock_release(sock);
    return err;

out_release:
    /* Failure while still inside the RCU read-side section. */
    rcu_read_unlock();
    goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);

上面這個函數主要做了三件事:

1> sock_alloc()

在分析這個函數前,首先要了解:爲了對 socket 抽象出文件的概念,內核中爲 socket 定義了一個專門的文件系統類型 sockfs。

/* Mount of sockfs; assigned in sock_init() from kern_mount(&sock_fs_type). */
static struct vfsmount *sock_mnt __read_mostly;

/* Filesystem type descriptor registered under the name "sockfs". */
static struct file_system_type sock_fs_type = {
    .name =        "sockfs",
    .mount =    sockfs_mount,
    .kill_sb =    kill_anon_super,
};

在模塊初始化的時候,安裝該文件系統:

/*
 * sock_init - early initialisation of the socket layer
 *
 * Sets up the sock and skbuff caches, the socket inode cache, then
 * registers and mounts sockfs (the mount is kept in sock_mnt).
 * Optional subsystems are initialised when configured in.
 * Always returns 0.
 */
static int __init sock_init(void)
{
    /*
     *      Initialize sock SLAB cache.
     */

    sk_init();

    /*
     *      Initialize skbuff SLAB cache
     */
    skb_init();

    /*
     *      Initialize the protocols module.
     */

    init_inodecache();
    /* NOTE(review): the results of these two calls are not checked here. */
    register_filesystem(&sock_fs_type);
    sock_mnt = kern_mount(&sock_fs_type);

    /* The real protocol initialization is performed in later initcalls.
     */

#ifdef CONFIG_NETFILTER
    netfilter_init();
#endif

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
    skb_timestamping_init();
#endif

    return 0;
}

core_initcall(sock_init);    /* early initcall */

文件系統安裝中的一個重要步驟 kern_mount->kern_mount_data->vfs_kern_mount:

vfs_kern_mount 函數中,先根據註冊的文件系統類型,如果文件系統本身有 mount 成員函數則調用之,沒有則調用它的 get_sb 成員函數指針,獲取相應的超級塊 sb。最後,設置文件系統的超級塊成員指針,使之指向對應的值。

其中 sockfs 文件系統的 mount 函數調用 mount_pseudo() 實現超級塊的初始化,並創建根節點 inode 和根目錄項 dentry,sockfs_ops 也是在這裏與文件系統關聯上的。

而 sock_alloc() 通過 new_inode() 分配 inode 時,調用的就是:sock_mnt->mnt_sb->s_op->alloc_inode(sock_mnt->mnt_sb);

/*
 * Superblock operations of sockfs: inodes are allocated and destroyed
 * through the socket-specific helpers, statfs uses simple_statfs.
 */
static const struct super_operations sockfs_ops = {
    .alloc_inode    = sock_alloc_inode,
    .destroy_inode    = sock_destroy_inode,
    .statfs        = simple_statfs,
};

這個 alloc_inode 函數指針指向的也就是 sockfs_ops 中的 sock_alloc_inode() 函數。

static struct inode *sock_alloc_inode(struct super_block *sb)
{
    struct socket_alloc *ei;

    ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
    if (!ei)
        return NULL;
    ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL);
    if (!ei->socket.wq) {
        kmem_cache_free(sock_inode_cachep, ei);
        return NULL;
    }
    init_waitqueue_head(&ei->socket.wq->wait);
    ei->socket.wq->fasync_list = NULL;

    ei->socket.state = SS_UNCONNECTED;
    ei->socket.flags = 0;
    ei->socket.ops = NULL;
    ei->socket.sk = NULL;
    ei->socket.file = NULL;

    return &ei->vfs_inode;
}

函數先從高速緩存 sock_inode_cachep 中分配了一個用於封裝 socket 和 inode 的 ei,然後爲其中的 socket 申請了等待隊列結構 wq。這樣,inode 和 socket 就同時都被分配了。接下來初始化 socket 的各個成員。

/*
 * Container that holds a socket and its VFS inode in a single allocation;
 * sock_alloc_inode() allocates one of these and returns &vfs_inode.
 */
struct socket_alloc {
    struct socket socket;
    struct inode vfs_inode;
};

顯而易見,該結構實現了 inode 和 socket 的封裝。通過 new_inode 從 sockfs 文件系統分配到一個 inode 之後,可以通過 SOCKET_I() 來獲取與之對應的 socket:

sock = SOCKET_I(inode);

分配inode、socket 以及二者如何關聯,都已一一分析了。

2> pf = rcu_dereference(net_families[family]);

net_families[family]的定義:

static const struct net_proto_family *net_families[NPROTO] __read_mostly;

net_proto_family的定義:

/* Per-family handler stored in net_families[] by sock_register(). */
struct net_proto_family {
    /* Family number; used as the index into net_families[]. */
    int        family;
    /* Invoked by __sock_create() to set up a newly allocated socket. */
    int        (*create)(struct net *net, struct socket *sock,
                  int protocol, int kern);
    /* Module pinned with try_module_get() around the ->create() call. */
    struct module    *owner;
};

net_families數組填充函數sock_register():

/**
 *    sock_register - add a socket protocol handler
 *    @ops: description of protocol
 *
 *    Called by a protocol handler to publish its address family in
 *    net_families[], indexed by ops->family, so that the socket system
 *    call can reach it.
 *
 *    Returns 0 on success, -ENOBUFS when ops->family is NPROTO or larger,
 *    and -EEXIST when the slot is already occupied. The informational
 *    message is printed whenever the family number is in range, whatever
 *    the outcome of the registration.
 */
int sock_register(const struct net_proto_family *ops)
{
    const int family = ops->family;
    int err = -EEXIST;

    if (family >= NPROTO) {
        printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", family,
               NPROTO);
        return -ENOBUFS;
    }

    /* Claim the slot only if no handler is installed yet. */
    spin_lock(&net_family_lock);
    if (!net_families[family]) {
        net_families[family] = ops;
        err = 0;
    }
    spin_unlock(&net_family_lock);

    printk(KERN_INFO "NET: Registered protocol family %d\n", family);
    return err;
}
EXPORT_SYMBOL(sock_register);

從這裏我們看出,每個協議族都是通過 sock_register 函數註冊到 net_families 數組中的。通過代碼搜索可以發現,每個協議族都會調用這個函數去註冊。

Af_ax25.c (net\ax25):    sock_register(&ax25_family_ops);
Af_bluetooth.c (net\bluetooth):    err = sock_register(&bt_sock_family_ops);
Af_can.c (net\can):    sock_register(&can_family_ops);
Af_decnet.c (net\decnet):    sock_register(&dn_family_ops);
Af_econet.c (net\econet):    sock_register(&econet_family_ops);
Af_ieee802154.c (net\ieee802154):    rc = sock_register(&ieee802154_family_ops);
Af_inet.c (net\ipv4):    (void)sock_register(&inet_family_ops);
Af_inet6.c (net\ipv6):    err = sock_register(&inet6_family_ops);
Af_ipx.c (net\ipx):    sock_register(&ipx_family_ops);
Af_irda.c (net\irda):        rc = sock_register(&irda_family_ops);
Af_iucv.c (net\iucv):    err = sock_register(&iucv_sock_family_ops);
Af_key.c (net\key):    err = sock_register(&pfkey_family_ops);
Af_llc.c (net\llc):    rc = sock_register(&llc_ui_family_ops);
Af_netlink.c (net\netlink):    sock_register(&netlink_family_ops);
Af_netrom.c (net\netrom):    if (sock_register(&nr_family_ops)) {
Af_packet.c (net\packet):    sock_register(&packet_family_ops);
Af_phonet.c (net\phonet):    err = sock_register(&phonet_proto_family);
Af_rds.c (net\rds):    ret = sock_register(&rds_family_ops);
Af_rose.c (net\rose):    sock_register(&rose_family_ops);
Af_rxrpc.c (net\rxrpc):    ret = sock_register(&rxrpc_family_ops);
Af_unix.c (net\unix):    sock_register(&unix_family_ops);
Af_x25.c (net\x25):    rc = sock_register(&x25_family_ops);
Caif_socket.c (net\caif):    int err = sock_register(&caif_family_ops);
Ddp.c (net\appletalk):    (void)sock_register(&atalk_family_ops);
Net.h (include\linux):extern int         sock_register(const struct net_proto_family *fam);
Pppox.c (drivers\net):    return sock_register(&pppox_proto_family);
Pvc.c (net\atm):    return sock_register(&pvc_family_ops);
Socket.c (drivers\isdn\misdn):    err = sock_register(&mISDN_sock_family_ops);
Socket.c (net): *    sock_register - add a socket protocol handler
Socket.c (net):int sock_register(const struct net_proto_family *ops)
Socket.c (net):EXPORT_SYMBOL(sock_register);
Socket.c (net\tipc):    res = sock_register(&tipc_family_ops);
Svc.c (net\atm):    return sock_register(&svc_family_ops);

本文主要分析的是 ipv4 協議族,所以我們參考的文件是 af_inet.c(net/ipv4)。

3> err = pf->create(net, sock, protocol, kern);

在 af_inet.c 的 inet_init 函數裏面,調用 sock_register 把 inet_family_ops 註冊到協議族數組 net_families 裏:

(void)sock_register(&inet_family_ops);

接着看inet_family_ops定義:

/* PF_INET family handler: sockets of this family are set up by inet_create(). */
static const struct net_proto_family inet_family_ops = {
    .family = PF_INET,
    .create = inet_create,
    .owner    = THIS_MODULE,
};

這裏的 inet_create 就是 __sock_create() 中通過 pf->create 調用的函數:

/*
 *    Create an inet socket.
 *
 *    @net: passed to inet_netns_ok() and sk_alloc()
 *    @sock: socket being set up; sock->type selects the inetsw[] list
 *    @protocol: requested protocol; IPPROTO_IP acts as a wildcard
 *    @kern: non-zero skips the CAP_NET_RAW check for SOCK_RAW
 *
 *    Finds the inet_protosw entry matching (sock->type, protocol), trying
 *    up to two module loads when nothing matches, then allocates the
 *    struct sock, initialises it and ties it to @sock.
 *
 *    Returns 0 on success. Otherwise -ESOCKTNOSUPPORT (list for this type
 *    is empty), -EPROTONOSUPPORT (no entry matched), -EPERM, -EAFNOSUPPORT,
 *    -ENOBUFS, or the error returned by the protocol's ->init() hook.
 */

static int inet_create(struct net *net, struct socket *sock, int protocol,
               int kern)
{
    struct sock *sk;
    struct inet_protosw *answer;
    struct inet_sock *inet;
    struct proto *answer_prot;
    unsigned char answer_flags;
    char answer_no_check;
    int try_loading_module = 0;
    int err;

    /* Build the ehash secret on first use, except for RAW and DGRAM sockets. */
    if (unlikely(!inet_ehash_secret))
        if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
            build_ehash_secret();

    sock->state = SS_UNCONNECTED;

    /* Look for the requested type/protocol pair. */
lookup_protocol:
    /* Stays at this value only when the list has no entries at all. */
    err = -ESOCKTNOSUPPORT;
    rcu_read_lock();
    list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

        /* Assume a match; reset at the bottom if this entry does not fit. */
        err = 0;
        /* Check the non-wild match. */
        if (protocol == answer->protocol) {
            /* Equal values match unless both sides are the wildcard. */
            if (protocol != IPPROTO_IP)
                break;
        } else {
            /* Check for the two wild cases. */
            if (IPPROTO_IP == protocol) {
                /* Caller passed the wildcard: adopt this entry's protocol. */
                protocol = answer->protocol;
                break;
            }
            if (IPPROTO_IP == answer->protocol)
                break;
        }
        err = -EPROTONOSUPPORT;
    }

    if (unlikely(err)) {
        if (try_loading_module < 2) {
            /* Drop the RCU read lock before requesting a module. */
            rcu_read_unlock();
            /*
             * Be more specific, e.g. net-pf-2-proto-132-type-1
             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
             */
            if (++try_loading_module == 1)
                request_module("net-pf-%d-proto-%d-type-%d",
                           PF_INET, protocol, sock->type);
            /*
             * Fall back to generic, e.g. net-pf-2-proto-132
             * (net-pf-PF_INET-proto-IPPROTO_SCTP)
             */
            else
                request_module("net-pf-%d-proto-%d",
                           PF_INET, protocol);
            goto lookup_protocol;
        } else
            goto out_rcu_unlock;
    }

    /* Raw sockets created from user space require CAP_NET_RAW. */
    err = -EPERM;
    if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
        goto out_rcu_unlock;

    err = -EAFNOSUPPORT;
    if (!inet_netns_ok(net, protocol))
        goto out_rcu_unlock;

    /* Copy what is needed from the entry before leaving the RCU section. */
    sock->ops = answer->ops;
    answer_prot = answer->prot;
    answer_no_check = answer->no_check;
    answer_flags = answer->flags;
    rcu_read_unlock();

    WARN_ON(answer_prot->slab == NULL);

    err = -ENOBUFS;
    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
    if (sk == NULL)
        goto out;

    err = 0;
    sk->sk_no_check = answer_no_check;
    if (INET_PROTOSW_REUSE & answer_flags)
        sk->sk_reuse = 1;

    inet = inet_sk(sk);
    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

    inet->nodefrag = 0;

    /* Raw sockets use the protocol number as their local "port". */
    if (SOCK_RAW == sock->type) {
        inet->inet_num = protocol;
        if (IPPROTO_RAW == protocol)
            inet->hdrincl = 1;
    }

    if (ipv4_config.no_pmtu_disc)
        inet->pmtudisc = IP_PMTUDISC_DONT;
    else
        inet->pmtudisc = IP_PMTUDISC_WANT;

    inet->inet_id = 0;

    /* Generic sock initialisation; also links sock and sk to each other. */
    sock_init_data(sock, sk);

    sk->sk_destruct       = inet_sock_destruct;
    sk->sk_protocol       = protocol;
    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

    inet->uc_ttl    = -1;
    inet->mc_loop    = 1;
    inet->mc_ttl    = 1;
    inet->mc_all    = 1;
    inet->mc_index    = 0;
    inet->mc_list    = NULL;

    sk_refcnt_debug_inc(sk);

    /* Within this function inet_num is only assigned for SOCK_RAW sockets. */
    if (inet->inet_num) {
        /* It assumes that any protocol which allows
         * the user to assign a number at socket
         * creation time automatically
         * shares.
         */
        inet->inet_sport = htons(inet->inet_num);
        /* Add to protocol hash chains. */
        sk->sk_prot->hash(sk);
    }

    /* Protocol-specific initialisation; the sk is released if it fails. */
    if (sk->sk_prot->init) {
        err = sk->sk_prot->init(sk);
        if (err)
            sk_common_release(sk);
    }
out:
    return err;
out_rcu_unlock:
    /* Exit path for failures detected while the RCU read lock is held. */
    rcu_read_unlock();
    goto out;
}

在分析inet_create()函數前,就要分析inetsw[SOCK_MAX]這個數組。

static struct list_head inetsw[SOCK_MAX];

這個數組是在inet_init()->inet_register_protosw()裏面填充的。

    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
        inet_register_protosw(q);

inetsw_array定義:

/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 *
 * Each element binds a socket type and protocol number to the struct proto
 * and proto_ops that inet_create() installs for a matching request.
 */
static struct inet_protosw inetsw_array[] =
{
    /* Stream sockets: TCP, permanent, flagged INET_PROTOSW_ICSK. */
    {
        .type =       SOCK_STREAM,
        .protocol =   IPPROTO_TCP,
        .prot =       &tcp_prot,
        .ops =        &inet_stream_ops,
        .no_check =   0,
        .flags =      INET_PROTOSW_PERMANENT |
                  INET_PROTOSW_ICSK,
    },

    /* Datagram sockets: UDP, permanent. */
    {
        .type =       SOCK_DGRAM,
        .protocol =   IPPROTO_UDP,
        .prot =       &udp_prot,
        .ops =        &inet_dgram_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_PERMANENT,
       },


       /* Raw sockets: wildcard protocol, not permanent, flagged REUSE. */
       {
           .type =       SOCK_RAW,
           .protocol =   IPPROTO_IP,    /* wild card */
           .prot =       &raw_prot,
           .ops =        &inet_sockraw_ops,
           .no_check =   UDP_CSUM_DEFAULT,
           .flags =      INET_PROTOSW_REUSE,
       }
};

/* Number of entries in inetsw_array[]. */
#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

inet_register_protosw函數分析:

/*
 * inet_register_protosw - add a protocol switch entry to inetsw[]
 * @p: entry to insert; p->type selects the list
 *
 * Runs under inetsw_lock. An entry whose type is SOCK_MAX or larger is
 * rejected, as is one whose protocol equals that of an existing entry
 * flagged INET_PROTOSW_PERMANENT; both cases only log an error. Otherwise
 * @p is linked in right after the last permanent entry of its list, or at
 * the list head when there is none.
 */
void inet_register_protosw(struct inet_protosw *p)
{
    struct list_head *lh;
    struct inet_protosw *answer;
    int protocol = p->protocol;
    struct list_head *last_perm;

    spin_lock_bh(&inetsw_lock);

    if (p->type >= SOCK_MAX)
        goto out_illegal;

    /* If we are trying to override a permanent protocol, bail. */
    answer = NULL;
    last_perm = &inetsw[p->type];
    list_for_each(lh, &inetsw[p->type]) {
        answer = list_entry(lh, struct inet_protosw, list);

        /* Check only the non-wild match. */
        if (INET_PROTOSW_PERMANENT & answer->flags) {
            if (protocol == answer->protocol)
                break;
            /* Remember the most recent permanent entry as insertion point. */
            last_perm = lh;
        }

        /* No conflict on this entry; answer stays set only after a break. */
        answer = NULL;
    }
    if (answer)
        goto out_permanent;

    /* Add the new entry after the last permanent entry if any, so that
     * the new entry does not override a permanent entry when matched with
     * a wild-card protocol. But it is allowed to override any existing
     * non-permanent entry.  This means that when we remove this entry, the
     * system automatically returns to the old behavior.
     */
    list_add_rcu(&p->list, last_perm);
out:
    spin_unlock_bh(&inetsw_lock);

    return;

out_permanent:
    printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
           protocol);
    goto out;

out_illegal:
    printk(KERN_ERR
           "Ignoring attempt to register invalid socket type %d.\n",
           p->type);
    goto out;
}
EXPORT_SYMBOL(inet_register_protosw);

這個函數完成的工作,就是把 inetsw_array 數組中相同協議類型(type 成員)下邊的協議,加入到 inetsw 中對應協議類型的鏈表中去。
因爲事實上是一對一的關係,所以這個函數的流程要簡單得多:
  因爲鏈表中不存在其它成員,list_for_each 的循環體一次也不會執行,answer 保持爲 NULL,所以不存在覆蓋和追加的情況,直接調用 list_add_rcu(&p->list, last_perm);
  把協議節點(struct inet_protosw 類型數組的某個元素)添加到鏈表中(鏈表首部本身是數組 inetsw 的一個元素,數組索引就是該協議所屬協議類型 type 的值)。

繼續分析 inet_create() 函數:

  首先,根據 sock 的成員 type 和參數 protocol,把之前在鏈表中註冊的協議節點找出。

  然後,將創建的 socket 的 ops 函數指針集指向該協議類型對應的操作集,例如創建的是 SOCK_STREAM,那麼就指向了 inet_stream_ops;answer_prot 指針指向了當前要創建的 socket 的協議類型下邊的協議,如上例,它就是 IPPROTO_TCP 的 tcp_prot 結構。

  接下來一個重要的工作,就是爲 socket 分配一個 sock,並初始化它。

  最後,初始化一個 inet。

雖然 create 的代碼就到這兒了,不過要說清楚 sk(sock)的分配,還得費上大力氣。
每一個 socket 套接字,都有一個對應的 struct socket 結構來描述(內核中一般使用名稱 sock),同時又有一個 struct sock 結構(內核中一般使用名稱 sk),兩者之間是一一對應的關係。

在後面的sock_init_data 函數中,能夠看到:

sk->sk_socket = sock; 
sock->sk = sk;

socket 結構和 sock 結構實際上是同一個事物的兩個方面。不妨說,socket 結構是面向進程和系統調用界面的側面,而 sock 結構則是面向底層驅動程序的側面。

設計者把 socket 套接字中與文件系統關係比較密切的那一部分放在 socket 結構中,而把與通信關係比較密切的那一部分單獨成爲一個數據結構,那就是 sock 結構。

由於這兩部分邏輯上本來就是一體的,所以要通過指針互相指向對方,形成一對一的關係。

調用 sk_alloc() 分配一個 sk:

  在之前 proto_register() 函數創建的高速緩存中申請分配一個 slab 緩存項,並清零。然後設置協議族,並把 sk 中的 sk_prot 與對應的協議關聯起來。

分配完成 sk 後,另一個重要的工作就是初始化它。

  sk 的成員相當複雜,其主要的初始化工作是在函數 sock_init_data() 中完成的:
  sock 結構中,有三個重要的雙向隊列,分別是 sk_receive_queue、sk_write_queue、sk_error_queue。從它們的名字就可以看出其作用了。
隊列並非採用通用的 list_head 來維護,而是使用 sk_buff_head 隊列:

struct sk_buff_head { 
            /* These two members must be first. */ 
        struct sk_buff        *next; 
        struct sk_buff        *prev; 
 
            __u32                        qlen; 
        spinlock_t        lock; 
};

這樣,隊列中指向的每一個 sk_buff,就是一個數據包,三個隊列分別用於接收、發送和投遞錯誤。
inet 初始化:
inet 是一個 struct inet_sock 結構類型,來看它的定義:

struct inet_sock { 
    /* sk and pinet6 has to be the first two members of inet_sock */ 
    struct sock sk; 
    …… 
}

只留意它的第一個成員就足夠了。
我們說 sock 是面向用戶態調用的,而 sk 是面向內核驅動調用的,那 sk 是如何與協議棧交互的呢?
對於每一個類型的協議,爲了與 sk 聯繫起來,都定義了一個 struct XXX_sock 結構,XXX 是協議名,例如:

struct tcp_sock { 
    /* inet_sock has to be the first member of tcp_sock */ 
    struct inet_sock inet; 
    int tcp_header_len; /* Bytes of tcp header to send */ 
    …… 
} 

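很明顯,它們的結構定義是「af_inet 一般屬性 + 自己的私有屬性」,因爲它們的第一個成員總是 inet(上面引用的是舊版本的定義;在 2.6.37 中 tcp_sock 的第一個成員是 struct inet_connection_sock,其內部的第一個成員 icsk_inet 才是 inet_sock,見下文)。

現在回頭來看一下起初在 af_inet.c 中,封裝協議註冊 proto_register() 的時候用到的 obj_size 成員,對於 tcp 而言: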

struct proto tcp_prot = {
    .name            = "TCP",
    .owner            = THIS_MODULE,
    .close            = tcp_close,
    .connect        = tcp_v4_connect,
    .disconnect        = tcp_disconnect,
    .accept            = inet_csk_accept,
    .ioctl            = tcp_ioctl,
    .init            = tcp_v4_init_sock,
    .destroy        = tcp_v4_destroy_sock,
    .shutdown        = tcp_shutdown,
    .setsockopt        = tcp_setsockopt,
    .getsockopt        = tcp_getsockopt,
    .recvmsg        = tcp_recvmsg,
    .sendmsg        = tcp_sendmsg,
        ...
    .obj_size        = sizeof(struct tcp_sock),
        ...     
};

其它協議類似。

以 obj_size 來確定每個 slab 緩存項分配的大小,所以,我們就可以說,每次申請分配的,實際上是一個 struct XXX_sock 結構大小的結構。因爲下層結構都是定義於上層結構的第一個成員,所以可以使用強制類型轉換來使用這塊分配的內存空間。例如:

struct inet_sock {
    /* sk and pinet6 has to be the first two members of inet_sock */
    struct sock        sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
    struct ipv6_pinfo    *pinet6;
#endif
    /* Socket demultiplex comparisons on incoming packets. */
    __be32            inet_daddr;
    __be32            inet_rcv_saddr;
    __be16            inet_dport;
    __u16            inet_num;
    __be32            inet_saddr;
    __s16            uc_ttl;
    __u16            cmsg_flags;
    __be16            inet_sport;
    __u16            inet_id;
    ...  
};

inet = inet_sk(sk); 
/*
 * View a struct sock as the struct inet_sock that contains it. The cast
 * relies on struct sock being the first member of struct inet_sock
 * (inet_sock->sk), so both share the same address.
 */
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
    struct inet_sock *inet = (struct inet_sock *)sk;

    return inet;
}
struct tcp_sock {
    /* inet_connection_sock has to be the first member of tcp_sock */
    struct inet_connection_sock    inet_conn;
    u16    tcp_header_len;    /* Bytes of tcp header to send        */
    ...
};

struct tcp_sock *tp = tcp_sk(sk); 
/*
 * View a struct sock as the struct tcp_sock that contains it. The cast
 * relies on the first-member chain tcp_sock->inet_conn->icsk_inet->sk,
 * which places the struct sock at the start of the tcp_sock.
 */
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
    struct tcp_sock *tp = (struct tcp_sock *)sk;

    return tp;
}

inet_create() 運行完,一個 socket 套接字基本上就創建完畢了,剩下的就是與文件系統掛鉤。

4、與文件系統交互

回到 sys_socket() 函數中來,它在調用完 sock_create() 後,緊接着調用 sock_map_fd() 函數:

/*
 * sock_map_fd - attach a socket to a new file descriptor
 * @sock: socket to expose
 * @flags: forwarded to sock_alloc_file()
 *
 * Obtains a descriptor and a struct file from sock_alloc_file() and, when
 * that succeeds, installs the file under the descriptor.
 *
 * Returns the descriptor, or the negative value sock_alloc_file() returned.
 */
int sock_map_fd(struct socket *sock, int flags)
{
    struct file *filp;
    int fd;

    fd = sock_alloc_file(sock, &filp, flags);
    if (unlikely(fd < 0))
        return fd;

    fd_install(fd, filp);
    return fd;
}
EXPORT_SYMBOL(sock_map_fd);

這個函數的核心思想,在一開始,就已經分析過了。
從進程的角度來說,一個 socket 套接字就是一個特殊的、已打開的文件。
前面分配好一個 socket 後,這裏要做的就是將它與文件系統拉上親戚關係。
首先獲取一個空閒的文件描述符和 file 結構。然後在文件系統中分配一個目錄項(d_alloc),使其指向已經分配的 inode 節點(d_add),然後把其目錄項掛在 sockfs 文件系統的根目錄之下。
並且把目錄項的指針 d_op 設置成指向 sockfs_dentry_operations,這個數據結構通過函數指針提供了與文件路徑有關的操作:

/* Dentry operations for sockfs; only the d_dname hook is provided. */
static const struct dentry_operations sockfs_dentry_operations = {
    .d_dname  = sockfs_dname,
};

最後一步,就是將 file 結構中的 f_op 和 inode 結構中的 i_fop 都指向 socket_file_ops,它是一個函數指針集,指向了 socket 面向文件系統的用戶態調用的一些接口函數:

/*
 *    Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *    in the operation structures but are done directly via the socketcall() multiplexor.
 *
 *    This table holds the generic file operations of a socket file.
 */

static const struct file_operations socket_file_ops = {
    .owner =    THIS_MODULE,
    .llseek =    no_llseek,
    .aio_read =    sock_aio_read,
    .aio_write =    sock_aio_write,
    .poll =        sock_poll,
    .unlocked_ioctl = sock_ioctl,
    /* The compat ioctl entry exists only when CONFIG_COMPAT is set. */
#ifdef CONFIG_COMPAT
    .compat_ioctl = compat_sock_ioctl,
#endif
    .mmap =        sock_mmap,
    .open =        sock_no_open,    /* special open code to disallow open via /proc */
    .release =    sock_close,
    .fasync =    sock_fasync,
    .sendpage =    sock_sendpage,
    .splice_write = generic_splice_sendpage,
    .splice_read =    sock_splice_read,
};

到這裏,整個 socket 套接字的創建工作,就宣告完成了。

相關文章
相關標籤/搜索