Linux epoll源碼註釋

時間 2019-11-26

標籤 linux epoll 源碼註釋欄目 Linux 简体版

原文原文鏈接

Linux系統運行源碼剖析-epoll代碼註釋

理解了中斷、等待隊列、調度，你就能懂Linux的80%。

--老子linux

轉發的話，請註明出處哦：http://www.cnblogs.com/stonehat/
Linux系統內核提供了三個系統調用：

include/linux/syscalls.h

// epoll_create: create an epoll descriptor
asmlinkage long sys_epoll_create(int size);
// epoll_ctl: operate on an epoll descriptor (add, delete or modify an entry)
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
                struct epoll_event __user *event);
// epoll_wait: wait for events on an epoll descriptor
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
                int maxevents, int timeout);

其函數實現在 fs/eventpoll.c

eventpoll 本身也是一個支持poll操作的文件，所以可以把eventpoll組成一個樹形關係。

下面分別按照sys_epoll_create，sys_epoll_ctl，sys_epoll_wait的順序分析三個系統調用。

重要的結構體

// The eventpoll structure: per-epoll-instance state, stored in the epoll
// file's private_data pointer.
struct eventpoll {
    /* Protects access to this structure */
    rwlock_t lock;

    /*
     * Kernel read/write semaphore used for synchronization
     * (sys_epoll_ctl takes it for writing around add/delete/modify).
     */
    struct rw_semaphore sem;

    /**
     * Wait queue used by epoll_wait(): the calling thread is parked on
     * this queue while it waits for events.
     */
    wait_queue_head_t wq;

    /* Wait queue used by file->poll(); for epoll files that function is ep_eventpoll_poll */
    wait_queue_head_t poll_wait;

    /* List of ready items */
    struct list_head rdllist;

    /* Red-black tree root; presumably holds the registered epitems (see ep_rbtree_insert) */
    struct rb_root rbr;
};

// In-kernel representation of an open file
struct file {
    struct list_head    f_list;
    struct dentry       *f_dentry;
    struct vfsmount         *f_vfsmnt;
    // Pointer to the file operations table
    struct file_operations  *f_op;
    atomic_t        f_count;
    unsigned int        f_flags;
    mode_t          f_mode;
    int         f_error;
    loff_t          f_pos;
    struct fown_struct  f_owner;
    unsigned int        f_uid, f_gid;
    struct file_ra_state    f_ra;

    unsigned long       f_version;
    void            *f_security;

    /* Private, owner-defined data attached to the file (epoll stores its struct eventpoll here) */
    void            *private_data;

#ifdef CONFIG_EPOLL
    /* Used by fs/eventpoll.c to link all the hooks to this file */
    struct list_head    f_ep_links;
    /* Lock taken around updates to f_ep_links */
    spinlock_t      f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */
    struct address_space    *f_mapping;
};

/* Table of operations a file implementation may provide. */
struct file_operations {
    struct module *owner;
    loff_t (*llseek) (struct file *, loff_t, int);
    ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
    ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
    ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
    ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t);
    int (*readdir) (struct file *, void *, filldir_t);
    // Non-blocking check of the file's state (readable, writable, ...). If the
    // condition is not met, pt gets added to a wait queue. (That is the usual
    // logic; the actual behaviour is up to the device driver.)
    unsigned int (*poll) (struct file *f, struct poll_table_struct *pt);
    int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
    int (*mmap) (struct file *, struct vm_area_struct *);
    int (*open) (struct inode *, struct file *);
    int (*flush) (struct file *);
    int (*release) (struct inode *, struct file *);
    int (*fsync) (struct file *, struct dentry *, int datasync);
    int (*aio_fsync) (struct kiocb *, int datasync);
    int (*fasync) (int, struct file *, int);
    int (*lock) (struct file *, int, struct file_lock *);
    ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
    ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
    ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
    ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
    unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
    int (*check_flags)(int);
    int (*dir_notify)(struct file *filp, unsigned long arg);
    int (*flock) (struct file *, int, struct file_lock *);
};

概念與關係

文件描述符fd：進程打開的文件的數字表示形式，是文件指針的索引。
struct file：在內核中表示進程打開的文件。task.files[fd]=file
struct inode：靜態的文件表示。

一. sys_epoll_create

代碼如下：爲了方便理解原理，無關緊要的代碼邏輯和異常處理已刪掉。

/*
 * sys_epoll_create - create an epoll instance and return its descriptor.
 *
 * This listing is abridged: the "....." markers stand for code (argument
 * checks and error handling) that was removed for readability, which is why
 * the values assigned to "error" are not examined here.
 */
asmlinkage long sys_epoll_create(int size)
{
    int error, fd;
    struct inode *inode;
    struct file *file;

    .....

    /*
     * Create a new file and inode and obtain the fd that maps to the file.
     * The file is also added to the current process's open-file list.
     */
    error = ep_getfd(&fd, &inode, &file);
    /* Create the struct eventpoll and hang it on the file's private_data pointer */
    error = ep_file_init(file);
    .....
    return fd;
}

ep_getfd簡單流程

/*
 * ep_getfd - build the file, inode and descriptor that back a new epoll
 * instance.
 *
 * @efd:    out, receives the newly allocated file descriptor
 * @einode: out, receives the inode returned by ep_eventpoll_inode()
 * @efile:  out, receives the new struct file
 *
 * Returns 0 on success. On failure returns a negative error code after
 * releasing whatever had been acquired so far; the out parameters are not
 * written in that case.
 */
static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
{
    struct qstr this;
    char name[32];
    struct dentry *dentry;
    struct inode *inode;
    struct file *file;
    int error, fd;

    /* Get an ready to use file */
    error = -ENFILE;
    file = get_empty_filp();
    if (!file)
        goto eexit_1;

    /* Allocates an inode from the eventpoll file system */
    inode = ep_eventpoll_inode();
    error = PTR_ERR(inode);
    if (IS_ERR(inode))
        goto eexit_2;

    /* Allocates a free descriptor to plug the file onto */
    error = get_unused_fd();
    if (error < 0)
        goto eexit_3;
    fd = error;

    /*
     * Link the inode to a directory entry by creating a unique name
     * using the inode number.
     */
    error = -ENOMEM;
    sprintf(name, "[%lu]", inode->i_ino);
    this.name = name;
    this.len = strlen(name);
    this.hash = inode->i_ino;
    dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
    if (!dentry)
        goto eexit_4;
    dentry->d_op = &eventpollfs_dentry_operations;
    d_add(dentry, inode);
    file->f_vfsmnt = mntget(eventpoll_mnt);
    file->f_dentry = dentry;
    file->f_mapping = inode->i_mapping;

    /*
     * Installing eventpoll_fops is what marks this file as an epoll file.
     * private_data starts out NULL; the caller attaches the struct
     * eventpoll afterwards.
     */
    file->f_pos = 0;
    file->f_flags = O_RDONLY;
    file->f_op = &eventpoll_fops;
    file->f_mode = FMODE_READ;
    file->f_version = 0;
    file->private_data = NULL;

    /* Install the new setup file into the allocated fd. */
    fd_install(fd, file);

    *efd = fd;
    *einode = inode;
    *efile = file;
    return 0;

    /* Error unwinding: each label releases one resource, in reverse order of acquisition. */
eexit_4:
    put_unused_fd(fd);
eexit_3:
    iput(inode);
eexit_2:
    put_filp(file);
eexit_1:
    return error;
}

查找一個沒有用的文件描述符，記爲fd。
建立一個空文件file結構體，記爲epfile。
在epoll的文件系統中建立一個inode。
epfile和inode做關聯。
epfile的f_op成員（文件操作指針）和epoll的自定義函數表eventpoll_fops做關聯。比較重要的一點是eventpoll_fops有一個自定義的poll函數，這個函數很重要，是實現epoll級聯模型的關鍵。後面可以通過比較f_op是否等於eventpoll_fops來判斷file是不是epoll file。

/*
 * File operations installed on every epoll file (ep_getfd sets file->f_op to
 * this table). Only release and poll are provided.
 */
static struct file_operations eventpoll_fops = {
    .release    = ep_eventpoll_close,
    .poll       = ep_eventpoll_poll
   };

將epfile放到進程的打開文件列表中管理，用fd做索引。
初始化eventpoll結構，初始化等待隊列和就緒隊列等。
將epfile的private_data指向eventpoll結構，方便後面取eventpoll的數據。
將fd返回給調用線程。

二. sys_epoll_ctl

sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
    int error;
    struct file *file, *tfile;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
             current, epfd, op, fd, event));

    error = -EFAULT;
    // 1. 從用戶空間拷貝event數據。
    if (EP_OP_HASH_EVENT(op) &&
        copy_from_user(&epds, event, sizeof(struct epoll_event)))
        goto eexit_1;

    /* 2. 根據epollfile的文件描述符得到對應的file結構體，內核中fd和file是有一個映射關係的*/
    error = -EBADF;
    file = fget(epfd);
    if (!file)
        goto eexit_1;

    /* 3. 得到要操做的描述符的file指針，例如socket描述符 */
    tfile = fget(fd);
    if (!tfile)
        goto eexit_2;

    /* 4. 校驗tfile是否支持poll操做，必須支持poll才能使用epoll */
    error = -EPERM;
    if (!tfile->f_op || !tfile->f_op->poll)
        goto eexit_3;

    /*
     * 5. 校驗是不是epoll的file指針
     */
    error = -EINVAL;
    if (file == tfile || !IS_FILE_EPOLL(file))
        goto eexit_3;

    /*
     * 6. 取eventpoll，從建立時，咱們知道epoll把本身的eventpoll結構體放在file->private_data了裏面。
     */
    ep = file->private_data;
    
    down_write(&ep->sem);

    /* Try to lookup the file inside our hash table */
    epi = ep_find(ep, tfile, fd);
    // 7. 具體的邏輯操做
    error = -EINVAL;
    switch (op) {
    // 添加
    case EPOLL_CTL_ADD:
        if (!epi) {
            epds.events |= POLLERR | POLLHUP;

            error = ep_insert(ep, &epds, tfile, fd);
        } else
            error = -EEXIST;
        break;
    // 刪除
    case EPOLL_CTL_DEL:
        if (epi)
            error = ep_remove(ep, epi);
        else
            error = -ENOENT;
        break;
    // 修改
    case EPOLL_CTL_MOD:
        if (epi) {
            epds.events |= POLLERR | POLLHUP;
            error = ep_modify(ep, epi, &epds);
        } else
            error = -ENOENT;
        break;
    }

    /*
     * The function ep_find() increments the usage count of the structure
     * so, if this is not NULL, we need to release it.
     */
    if (epi)
        ep_release_epitem(epi);

    up_write(&ep->sem);

eexit_3:
    fput(tfile);
eexit_2:
    fput(file);
eexit_1:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
             current, epfd, op, fd, event, error));

    return error;
}

上面的邏輯很簡單

驗證輸入有效性

邏輯上，只需要了解添加即可。epoll的添加是理解整個流程的關鍵。

epoll添加

/*
 * ep_insert - register a new target file on an epoll instance.
 *
 * @ep:    the epoll instance
 * @event: requested events; copied into the new item
 * @tfile: the target file to watch
 * @fd:    descriptor number of the target file
 *
 * Returns 0 on success, or -ENOMEM if the item cannot be allocated or if the
 * poll wait-queue hook-up reported a failure (epi->nwait < 0).
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
             struct file *tfile, int fd)
{
    int error, revents, pwake = 0;
    unsigned long flags;

    struct epitem *epi;
    struct ep_pqueue epq;

    error = -ENOMEM;
    if (!(epi = EPI_MEM_ALLOC()))
        goto eexit_1;

    /* Item initialization follow here ... */
    EP_RB_INITNODE(&epi->rbn);
    INIT_LIST_HEAD(&epi->rdllink);
    INIT_LIST_HEAD(&epi->fllink);
    INIT_LIST_HEAD(&epi->txlink);
    INIT_LIST_HEAD(&epi->pwqlist);
    epi->ep = ep;
    EP_SET_FFD(&epi->ffd, tfile, fd);
    epi->event = *event;
    atomic_set(&epi->usecnt, 1);
    epi->nwait = 0;

    /*
     * Initialize the poll table: when poll is invoked, ep_ptable_queue_proc
     * will be called to hook this item onto the target's wait queue.
     */
    epq.epi = epi;
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

    /*
     * Poll the target file, passing in epq.pt; the poll implementation
     * ends up calling ep_ptable_queue_proc. The returned mask is the
     * target's current readiness.
     */
    revents = tfile->f_op->poll(tfile, &epq.pt);

    /*
     * We have to check if something went wrong during the poll wait queue
     * install process. Namely an allocation for a wait queue failed due
     * high memory pressure.
     */
    if (epi->nwait < 0)
        goto eexit_2;

    /* Link this item into the target file's list of epoll hooks. */
    spin_lock(&tfile->f_ep_lock);
    list_add_tail(&epi->fllink, &tfile->f_ep_links);
    spin_unlock(&tfile->f_ep_lock);

    /* We have to drop the new item inside our item list to keep track of it */
    write_lock_irqsave(&ep->lock, flags);

    /* Add the current item to the rb-tree */
    ep_rbtree_insert(ep, epi);

    /*
     * If the target is already ready for a requested event, queue the item
     * on the ready list and wake the epoll_wait and poll wait queues.
     */
    if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);

        /* Notify waiting tasks that events are available */
        if (waitqueue_active(&ep->wq))
            wake_up(&ep->wq);
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }

    write_unlock_irqrestore(&ep->lock, flags);

    /* We have to call this outside the lock */
    /* NOTE(review): psw is not declared in this listing; presumably a file-scope object. */
    if (pwake)
        ep_poll_safewake(&psw, &ep->poll_wait);

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
             current, ep, tfile, fd));

    return 0;

    /* Failure after polling: undo the wait-queue hooks, then free the item. */
eexit_2:
    ep_unregister_pollwait(ep, epi);

    /*
     * We need to do this because an event could have been arrived on some
     * allocated wait queue.
     */
    write_lock_irqsave(&ep->lock, flags);
    if (EP_IS_LINKED(&epi->rdllink))
        EP_LIST_DEL(&epi->rdllink);
    write_unlock_irqrestore(&ep->lock, flags);

    EPI_MEM_FREE(epi);
eexit_1:
    return error;
}

整理一下，向epoll添加一個描述符的主要步驟如下：

構建epitem，epitem之後會加入到eventpoll.rbr中。
調用init_poll_funcptr，將ep_ptable_queue_proc函數指針賦值給poll_table的qproc，poll_table記爲epq.pt。在file的poll函數中，可以傳入poll_table作爲參數，poll函數會主動調用poll_table的qproc函數。

poll_table的結構體以下：

/**
    * Callback type used to hook a waiter onto a file's wait queue.
    *@param f the file being polled
    *@param whead wait queue of f
    *@param pt the poll table through which the callback was invoked
   */
   typedef void (*poll_queue_proc)(struct file *f, wait_queue_head_t *whead, struct poll_table_struct *pt);

   /* Poll table: carries the queueing callback handed to a poll implementation. */
   typedef struct poll_table_struct {
    poll_queue_proc qproc;
   } poll_table;

poll函數原型

// When the caller passes in a pt structure, the driver's poll function should
// call poll_table_struct.qproc to add the waiter to its wait queue.
   unsigned int (*poll) (struct file *f, struct poll_table_struct *pt);

調用待監控文件的poll函數。按第2步所說，規範實現的poll函數最終會調用到ep_ptable_queue_proc函數。ep_ptable_queue_proc主要是初始化一個等待隊列項（以ep_poll_callback爲喚醒回調函數），然後將等待隊列項加入驅動的等待隊列中。ep_ptable_queue_proc註釋如下：

/* One entry on a wait queue. */
struct __wait_queue {

    unsigned int flags;
   #define WQ_FLAG_EXCLUSIVE    0x01
       // Task pointer; needs to be set when func is the default wake function.
    struct task_struct * task;
       // Function executed when the wait queue is woken
    wait_queue_func_t func;

    struct list_head task_list;
   };
   typedef struct __wait_queue wait_queue_t;

   /*
    * ep_ptable_queue_proc - poll-table callback installed by ep_insert().
    *
    * Invoked (through poll_table.qproc) by the target file's poll function.
    * Allocates an eppoll_entry, hooks it onto the wait queue @whead with
    * ep_poll_callback as its wake function, and links it to the epitem.
    * On allocation failure, or if an earlier call already failed, sets
    * epi->nwait to -1 so ep_insert() can detect the error.
    */
   static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                 poll_table *pt)
   {
       // Recover the epitem paired with this poll table. ep_insert() places both
       // in one struct ep_pqueue (fields epi and pt); the macro itself is not
       // shown here, presumably a container_of-style lookup.
    struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt);
    struct eppoll_entry *pwq;

    if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) {
           // Initialize a wait queue entry whose wake-up function is
           // ep_poll_callback. This is the key step: ep_poll_callback is what
           // runs when the target's wait queue is woken.
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
           // Add the newly created entry to the target's wait queue.
        add_wait_queue(whead, &pwq->wait);
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        /* We have to signal that an error occurred */
        epi->nwait = -1;
    }
   }

   /*
    * Prepare a wait queue entry that runs a custom wake function: no flags
    * set, no task attached, func installed as the callback.
    */
   static inline void init_waitqueue_func_entry(wait_queue_t *q,
                    wait_queue_func_t func)
   {
    q->func = func;
    q->task = NULL;
    q->flags = 0;
   }

至此，添加一個文件描述符到epoll監控內的流程完成了。總的來說，就是在對應file的等待隊列中掛上一個等待隊列項，等待回調ep_poll_callback。至於對應的file用什麼機制來確保文件異步就緒，epoll不管，不過一般是通過中斷來實現的。

epoll模型的poll函數實現：

/*
 * structures and helpers for f_op->poll implementations
 */
/* Callback a poll implementation invokes (via poll_wait) to queue a waiter on a wait queue. */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/* Poll table: holds the queueing callback. */
typedef struct poll_table_struct {
   poll_queue_proc qproc;
} poll_table;
/*
 * poll_wait - forward to the poll table's qproc callback.
 *
 * Does nothing when either the poll table or the wait queue address is NULL.
 * For epoll, qproc is ep_ptable_queue_proc.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
   if (!p || !wait_address)
       return;

   p->qproc(filp, wait_address, p);
}
/*
 * ep_eventpoll_poll - poll operation of an epoll file.
 *
 * Registers the caller on the epoll instance's poll_wait queue, then reports
 * POLLIN | POLLRDNORM if the ready list is non-empty and 0 otherwise. Driver
 * poll functions follow much the same pattern, so this is a useful reference.
 */
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
   struct eventpoll *ep = file->private_data;
   unsigned long flags;
   unsigned int mask;

   /* 1. Hook the caller onto our poll wait queue */
   poll_wait(file, &ep->poll_wait, wait);

   /* 2. Sample the ready list under the read lock */
   read_lock_irqsave(&ep->lock, flags);
   mask = list_empty(&ep->rdllist) ? 0 : (POLLIN | POLLRDNORM);
   read_unlock_irqrestore(&ep->lock, flags);

   return mask;
}

三. sys_epoll_wait

瞭解了ep_insert的話，這個其實就很容易理解了：

/*
 * File operations of an epoll file. IS_FILE_EPOLL() is described in this
 * article as a comparison of f_op against this table.
 */
static struct file_operations eventpoll_fops = {
    .release    = ep_eventpoll_close,
    .poll       = ep_eventpoll_poll
};

/*
 * sys_epoll_wait - wait for events on an epoll instance.
 *
 * @epfd:      descriptor of the epoll file
 * @events:    user-space buffer that receives the events
 * @maxevents: passed through to ep_poll()
 * @timeout:   passed through to ep_poll()
 *
 * Returns whatever ep_poll() returns, or -EBADF if epfd has no file, or
 * -EINVAL if epfd is not an epoll file.
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
                   int maxevents, int timeout)
{
    int error;
    struct file *file;
    struct eventpoll *ep;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
             current, epfd, events, maxevents, timeout));

    /**
     * Input validation code is omitted from this listing.
     */
    error = -EBADF;
    // 1. Resolve epfd to its struct file
    file = fget(epfd);
    if (!file)
        goto eexit_1;

    // 2. Make sure it is an epoll file, i.e. that f_op equals eventpoll_fops
    error = -EINVAL;
    if (!IS_FILE_EPOLL(file))
        goto eexit_2;

    /*
     * 3. Fetch the eventpoll structure
     */
    ep = file->private_data;

    /* 4. ep_poll does the real work. Despite its name, it is not a poll implementation. */
    error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
    fput(file);
eexit_1:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
             current, epfd, events, maxevents, timeout, error));

    return error;
}

epoll_wait最終調用ep_poll來實現核心功能。

/*
 * ep_poll - core of epoll_wait: sleep until the ready list is non-empty, the
 * timeout runs out or a signal is pending, then transfer events to user space.
 *
 * @ep:        the epoll instance
 * @events:    user-space buffer handed to ep_events_transfer()
 * @maxevents: handed to ep_events_transfer()
 * @timeout:   timeout; -1 means wait without a time limit
 *
 * Returns -EINTR if a signal was pending while waiting, 0 if nothing was
 * ready, otherwise the result of ep_events_transfer().
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
           int maxevents, long timeout)
{
    int res, eavail;
    unsigned long flags;
    long jtimeout;
    wait_queue_t wait;

    /*
     * 1. The kernel keeps time in ticks, so convert the timeout to a tick
     *    count (scaled by HZ / 1000, rounded up). -1 and very large values
     *    map to MAX_SCHEDULE_TIMEOUT.
     *    NOTE(review): only -1 is special-cased; other negative values go
     *    through the arithmetic below. Confirm callers never pass them.
     */
    jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
        MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;

retry:
    write_lock_irqsave(&ep->lock, flags);

    res = 0;
    // 1. If the ready list is empty, wait
    if (list_empty(&ep->rdllist)) {
        /*
         * 2. Put the thread calling epoll_wait on the wq wait queue;
         *    ep_poll_callback() will wake it. "current" denotes the
         *    currently running task.
         */
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&ep->wq, &wait);
        // Loop until something is ready, the timeout expires or a signal arrives.
        for (;;) {
            /*
             * 3. Mark ourselves interruptible so that signals can be handled.
             *    The state is set before re-checking the ready list.
             */
            set_current_state(TASK_INTERRUPTIBLE);
            if (!list_empty(&ep->rdllist) || !jtimeout)
                break;
             // 4. Bail out if a signal is pending
            if (signal_pending(current)) {
                res = -EINTR;
                break;
            }

            write_unlock_irqrestore(&ep->lock, flags);
             // Works like a sleep and returns the remaining time. The CPU is
             // switched to another task here, so the next line does not run
             // until this task is scheduled again.
            jtimeout = schedule_timeout(jtimeout);
            write_lock_irqsave(&ep->lock, flags);
        }
        // Take the calling thread off the wait queue.
        remove_wait_queue(&ep->wq, &wait);

        set_current_state(TASK_RUNNING);
    }

    /* Record whether anything is ready while still holding the lock */
    eavail = !list_empty(&ep->rdllist);

    write_unlock_irqrestore(&ep->lock, flags);

    /*
     * Transfer the events back to user space. If the transfer returns 0
     * and there is still time left, go back and wait again.
     */
    if (!res && eavail &&
        !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
        goto retry;

    return res;
}

ep_poll的步驟如下：

轉換超時時間爲cpu滴答計數。
查詢就緒隊列是否就緒，如果有就緒的，就直接返回給上層。
如果沒有就緒的，就等待。

a. 把調用線程添加到eventpoll.wq隊列中。

b. 設置自身爲可打斷狀態

c. 檢查現在是否有就緒，有的話就直接返給上層。

d. 處理信號。

e. 發起調度，將自身切換爲阻塞狀態，等待被喚醒。喚醒的方式有：ep_poll_callback喚醒eventpoll.wq隊列，或者其他中斷喚醒。ep_poll_callback是sys_epoll_ctl添加epoll監聽的時候設置的等待隊列回調。其實現爲：

/*
 * ep_poll_callback - wake function of the wait queue entries that
 * ep_ptable_queue_proc() hooks onto a target file's wait queue.
 *
 * Puts the item on the epoll instance's ready list and wakes the tasks
 * sleeping in epoll_wait. This listing is abridged: the "...." marker stands
 * for removed code, presumably including the gotos that target the
 * is_linked and is_disabled labels. Always returns 1 in the code shown.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    int pwake = 0;
    unsigned long flags;
     // 1. Recover the epitem from the wait queue entry. ep_ptable_queue_proc()
     //    embeds the entry in a struct eppoll_entry whose base field points
     //    at the epitem; the macro itself is not shown here.
    struct epitem *epi = EP_ITEM_FROM_WAIT(wait);
    // 2. Get the eventpoll this item belongs to
    struct eventpoll *ep = epi->ep;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
             current, epi->file, epi, ep));

    write_lock_irqsave(&ep->lock, flags);
    ....
    // 3. Add the ready item to the ready list
    list_add_tail(&epi->rdllink, &ep->rdllist);

is_linked:
    /*
     * 4. Wake the wq wait queue (i.e. the threads blocked in epoll_wait)
     */
    if (waitqueue_active(&ep->wq))
        wake_up(&ep->wq);
    if (waitqueue_active(&ep->poll_wait))
        pwake++;

is_disabled:
    write_unlock_irqrestore(&ep->lock, flags);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&psw, &ep->poll_wait);

    return 1;
}

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。