epoll函數源碼剖析

時間 2019-11-06

標籤 epoll 函數源碼剖析简体版

原文原文鏈接

I/O複用函數

epoll是Linux中最常見的I/O複用函數，它的高效簡潔是select和poll這兩個函數不能比擬的。它在解決了以前fd數量受限的問題的同時，還針對poll的效率問題做了改進：它利用內核去保存傳入的fd，而不是在每次等待（wait）時才傳入並保存fd；另外它也不是將current輪流掛入每個fd的等待隊列中，而是在設備的等待隊列被喚醒時調用一個回調函數。

我們來看看源碼：

asmlinkage long sys_epoll_create(int size)
{
    int error, fd;
    struct inode *inode;
    struct file *file;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
             current, size));

    /* Sanity check on the size parameter */
    error = -EINVAL;
    if (size <= 0)
        goto eexit_1;

    /*
     * Creates all the items needed to setup an eventpoll file. That is,
     * a file structure, and inode and a free file descriptor.
     */
    error = ep_getfd(&fd, &inode, &file);
    if (error)
        goto eexit_1;

    /* Setup the file internal data structure ( "struct eventpoll" ) */
    error = ep_file_init(file);
    if (error)
        goto eexit_2;


    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
             current, size, fd));

    return fd;

eexit_2:
    sys_close(fd);
eexit_1:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
             current, size, error));
    return error;
}

每次調用create時都會建立新的inode、file和fd。Linux遵循一切皆文件的原則，一切都是文件操作，所以返回的也是一個fd。這樣做還有一個好處：單憑指針並不好判斷其指向資源的有效性，而fd則可以通過current->files->fd_array[]找到對應的文件並加以檢查。

我們再來看看sys_epoll_ctl()：

/*
 * sys_epoll_ctl - add, remove or modify a target file in an epoll set.
 * @epfd:  descriptor expected to refer to an eventpoll file
 * @op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD
 * @fd:    descriptor of the target file
 * @event: user-space event description; copied in only when
 *         EP_OP_HASH_EVENT(op) is true
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify() for the
 * selected operation, or a negative errno value: -EFAULT (user copy
 * failed), -EBADF (either descriptor has no file), -EPERM (target file has
 * no poll method), -EINVAL (epfd and fd name the same file, epfd is not an
 * eventpoll file, or op matches none of the three cases), -EEXIST (ADD of
 * an item that is already present), -ENOENT (DEL/MOD of an item that is
 * not present).
 */
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 {
    int error;
    struct file *file, *tfile;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
             current, epfd, op, fd, event));

    error = -EFAULT;
    if (EP_OP_HASH_EVENT(op) &&
        copy_from_user(&epds, event, sizeof(struct epoll_event)))// copy the event description in from user space
        goto eexit_1;

    // Get file (for epfd) and tfile (for fd), then validate both of them
    error = -EBADF;
    file = fget(epfd);
    if (!file)
        goto eexit_1;
    tfile = fget(fd);
    if (!tfile)    
        goto eexit_2;
    /* The target file must provide a poll method */
    error = -EPERM;
    if (!tfile->f_op || !tfile->f_op->poll)
        goto eexit_3;
    /* The epoll file cannot be its own target, and epfd must be an eventpoll file */
    error = -EINVAL;
    if (file == tfile || !IS_FILE_EPOLL(file))
        goto eexit_3;

    ep = file->private_data;// private data of the epoll file (presumably the struct eventpoll set up via ep_file_init() in sys_epoll_create)
       down_write(&ep->sem);
     /* Try to lookup the file inside our hash table */
    epi = ep_find(ep, tfile, fd);
       /* Stays -EINVAL if op matches none of the cases below */
       error = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (!epi) {
            /* POLLERR and POLLHUP are always added to the requested events */
            epds.events |= POLLERR | POLLHUP;

            error = ep_insert(ep, &epds, tfile, fd);
        } else
            error = -EEXIST;
        break;
    case EPOLL_CTL_DEL:
        if (epi)
            error = ep_remove(ep, epi);
        else
            error = -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (epi) {
            /* POLLERR and POLLHUP are always added to the requested events */
            epds.events |= POLLERR | POLLHUP;
            error = ep_modify(ep, epi, &epds);
        } else
            error = -ENOENT;
        break;
    }

    /* ep_find() calls ep_use_epitem() on a match; this presumably drops that usage again */
    if (epi)
        ep_release_epitem(epi);
    up_write(&ep->sem);
    /* Unwind in reverse order of acquisition: tfile first, then file */
 eexit_3:
    fput(tfile);
 eexit_2:
    fput(file);
 eexit_1:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
             current, epfd, op, fd, event, error));

    return error;
 }

如果操作是ADD，那麼先在eventpoll中進行ep_find()，如果沒有找到epitem就進行添加，找到的話就返回EEXIST；
如果是DEL，那麼也是先進行查找，找到的話執行ep_remove()，否則返回ENOENT；
如果操作是MOD，那麼查找之後，若不存在則返回ENOENT，存在的話就對事件進行修改。

我們可以再去看看ep_find：

static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
   int kcmp;
   unsigned long flags;
   struct rb_node *rbp;
   struct epitem *epi, *epir = NULL;
   struct epoll_filefd ffd;

   EP_SET_FFD(&ffd, file, fd);
   read_lock_irqsave(&ep->lock, flags);
   for (rbp = ep->rbr.rb_node; rbp; ) {
       epi = rb_entry(rbp, struct epitem, rbn);
       kcmp = EP_CMP_FFD(&ffd, &epi->ffd);
       if (kcmp > 0)
           rbp = rbp->rb_right;
       else if (kcmp < 0)
           rbp = rbp->rb_left;
       else {
           ep_use_epitem(epi);
           epir = epi;
           break;
       }
   }
   read_unlock_irqrestore(&ep->lock, flags);

   DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
            current, file, epir));

   return epir;
}

可以看出ep_find的主要作用，就是在eventpoll中儲存文件描述符信息的紅黑樹裏，查找指定的fd對應的epitem。

基礎的看完了，來看看核心的sys_epoll_wait：

/*
 * Largest maxevents for which maxevents * sizeof(struct epoll_event) still
 * fits in an int. Without this bound the byte count handed to verify_area()
 * can wrap around on 32-bit systems, so a too-small user range would be
 * validated. ((int)(~0U >> 1) is the largest positive int.)
 */
#ifndef EP_MAX_EVENTS
#define EP_MAX_EVENTS (((int)(~0U >> 1)) / sizeof(struct epoll_event))
#endif

/*
 * sys_epoll_wait - wait for events on an eventpoll file.
 * @epfd:      descriptor expected to refer to an eventpoll file
 * @events:    user-space buffer that receives the events
 * @maxevents: capacity of @events, in entries; must be in
 *             1..EP_MAX_EVENTS
 * @timeout:   timeout passed through to ep_poll()
 *
 * Returns the result of ep_poll(), or a negative errno value: -EINVAL
 * (@maxevents out of range, or @epfd is not an eventpoll file), the error
 * from verify_area() when @events is not writable, or -EBADF when @epfd
 * has no file.
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
                   int maxevents, int timeout)
{
    int error;
    struct file *file;
    struct eventpoll *ep;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
             current, epfd, events, maxevents, timeout));

    /*
     * The number of events must be positive, and small enough that the
     * size computation below cannot overflow.
     */
    if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
        return -EINVAL;

    /* Verify that the user buffer is writable for maxevents entries */
    if ((error = verify_area(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))))
        goto eexit_1;

    /* Get the "struct file *" for the eventpoll descriptor */
    error = -EBADF;
    file = fget(epfd);
    if (!file)
        goto eexit_1;

    /* Make sure the descriptor really refers to an eventpoll file */
    error = -EINVAL;
    if (!IS_FILE_EPOLL(file))
        goto eexit_2;

    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
    ep = file->private_data;

    /* Enter the wait loop and fetch the events */
    error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
    fput(file);
eexit_1:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
             current, epfd, events, maxevents, timeout, error));

    return error;
}

進入ep_poll：

/*
 * ep_poll - wait until the ready list is non-empty, then transfer events.
 * @ep:        eventpoll instance to wait on
 * @events:    user-space buffer handed on to ep_events_transfer()
 * @maxevents: capacity of @events, handed on to ep_events_transfer()
 * @timeout:   timeout in milliseconds; -1 means wait without limit
 *
 * Returns -EINTR when a signal is pending while waiting, 0 when the wait
 * ends with nothing on the ready list, otherwise the result of
 * ep_events_transfer().
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
           int maxevents, long timeout)
{
    int res, eavail;
    unsigned long flags;
    long jtimeout;
    wait_queue_t wait;

    /*
     * Calculate the timeout by checking for the "infinite" value ( -1 )
     * and the overflow condition. The passed timeout is in milliseconds,
     * that why (t * HZ) / 1000.
     */
    jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
        MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;

    /* The wait/transfer cycle starts (and restarts) here */
retry:
    write_lock_irqsave(&ep->lock, flags);// take the write side of ep->lock

    res = 0;
    if (list_empty(&ep->rdllist)) {
        /*
         * Nothing is ready yet: queue the current task on ep->wq and
         * sleep until something wakes it up (presumably the poll
         * callback installed for the monitored files).
         */
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&ep->wq, &wait);

        for (;;) {
            /*
             * The loop ends only when the ready list is non-empty, the
             * timeout has run out (jtimeout == 0), or a signal is
             * pending. The task state is set before the checks and
             * before ep->lock is dropped for schedule_timeout().
             */
            set_current_state(TASK_INTERRUPTIBLE);
            if (!list_empty(&ep->rdllist) || !jtimeout)
                break;
            if (signal_pending(current)) {
                res = -EINTR;
                break;
            }

            /* Drop the lock while sleeping; keep the remaining timeout */
            write_unlock_irqrestore(&ep->lock, flags);
            jtimeout = schedule_timeout(jtimeout);
            write_lock_irqsave(&ep->lock, flags);
        }
        remove_wait_queue(&ep->wq, &wait);

        set_current_state(TASK_RUNNING);
    }

    /* Is there anything on the ready list? */
    eavail = !list_empty(&ep->rdllist);

    write_unlock_irqrestore(&ep->lock, flags);

    /*
     * If no error occurred and items are available, transfer them to the
     * caller. When the transfer yields 0 and there is still time left,
     * go back and wait again.
     */
    if (!res && eavail &&
        !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
        goto retry;

    return res;
}

而負責向就緒鏈表rdllist中添加就緒項、使其不再爲空的，是ep_insert中通過init_poll_funcptr(&epq.pt, ep_ptable_queue_proc)註冊的回調機制。ep_insert主要就是申請了一個eppoll_entry並且設置回調函數。相對於poll每次調用都要把current掛入各個fd的等待隊列，epoll只在epoll_ctl時掛載一次回調，之後通過回調函數收集就緒的fd。

而在嘗試再次循環時有一個函數ep_events_transfer，這就是向用戶返回就緒fd的函數：

/*
 * ep_events_transfer - deliver ready events to the caller's buffer.
 * @ep:        eventpoll instance whose ready items are transferred
 * @events:    user-space buffer passed on to ep_send_events()
 * @maxevents: limit passed on to ep_collect_ready_items()
 *
 * Runs under the read side of ep->sem. Returns the value of
 * ep_send_events(), or 0 when ep_collect_ready_items() collected nothing.
 */
static int ep_events_transfer(struct eventpoll *ep, struct epoll_event __user *events,
                              int maxevents)
{
    struct list_head txlist;
    int eventcnt;

    INIT_LIST_HEAD(&txlist);

    down_read(&ep->sem);

    /*
     * Step 1: gather items into the private txlist (per the original
     * author's note, these are the completed items taken from the ready
     * list).
     */
    if (ep_collect_ready_items(ep, &txlist, maxevents) <= 0) {
        eventcnt = 0;
    } else {
        /* Step 2: send the events of the txlist items to the user buffer. */
        eventcnt = ep_send_events(ep, &txlist, events);

        /*
         * Step 3: hand txlist back. Per the original author's note this
         * is where edge-triggered mode shows: items that are not marked
         * EPOLLET and still have events of interest are put back on the
         * ready list, the others are not (NOTE(review): confirm against
         * ep_reinject_items()).
         */
        ep_reinject_items(ep, &txlist);
    }

    up_read(&ep->sem);

    return eventcnt;
}

最後來總結一下epoll流程：

int epoll_create(int size); //這個size在較新的內核版本中已被忽略（但仍須大於0），返回的是代表epoll實例的文件描述符fd

int epoll_ctl(int epfd, int op, int fd, struct epoll_event *ev);

- 第一個參數是epollfd，也就是epoll_create返回的那個；
- 第二個則是控制字段，由三個宏表示：
  - EPOLL_CTL_ADD：將描述符fd添加到epoll實例的興趣列表中去。對於fd上我們感興趣的事件，都指定在ev所指向的結構體中。如果我們試圖向興趣列表中添加一個已存在的文件描述符，epoll_ctl()將出現EEXIST錯誤。
  - EPOLL_CTL_MOD：修改描述符上設定的事件，需要用到由ev所指向的結構體中的信息。如果我們試圖修改不在興趣列表中的文件描述符，epoll_ctl()將出現ENOENT錯誤。
  - EPOLL_CTL_DEL：將文件描述符fd從epfd的興趣列表中移除，該操作忽略參數ev。如果我們試圖移除一個不在epfd的興趣列表中的文件描述符，epoll_ctl()將出現ENOENT錯誤。關閉一個文件描述符會自動將其從所有的epoll實例的興趣列表中移除。
- 第三個是要操作的目標文件描述符fd
- 第四個是事件結構體指針
  -   
      /* Per-descriptor user data; declared first because struct epoll_event embeds it. */  
      typedef union epoll_data  
      {  
            void        *ptr;   /* Pointer to user-defined data */  
            int     fd; /* File descriptor */  
            uint32_t    u32;    /* 32-bit integer */  
            uint64_t    u64;    /* 64-bit integer */  
      } epoll_data_t;  
      /* Event description exchanged with epoll_ctl() and epoll_wait(). */  
      struct epoll_event  
      {  
            uint32_t    events; /* Bit mask of the events of interest on the monitored descriptor fd */  
            epoll_data_t    data;   /* User data */  
      };  
  - 而這個event.events可設置爲
    + EPOLLIN         可讀次優先級事件
    + EPOLLPRI        可讀高優先級事件
    + EPOLLRDHUP     套接字對端關閉
    + EPOLLOUT 可寫事件
    + EPOLLET ET模式
    + EPOLLONESHOT 完成事件後停用，一次性
    + EPOLLERR 錯誤事件
    + EPOLLHUP 出現掛斷

int epoll_wait(int epfd, struct epoll_event *evlist, int maxevents, int timeout);
- 參數timeout用來確定epoll_wait()的阻塞行爲，有如下幾種。
  - 如果timeout等於-1，調用將一直阻塞，直到興趣列表中的文件描述符上有事件產生或者直到捕獲到一個信號爲止。
  - 如果timeout等於0，執行一次非阻塞式的檢查，看興趣列表中的描述符上產生了哪些事件。
  - 如果timeout大於0，調用將阻塞至多timeout毫秒，直到文件描述符上有事件發生，或者直到捕獲到一個信號爲止
- 返回值：
  - 成功返回就緒態文件描述符數目
  - 超時返回0
  - 若出錯返回-1，並設置errno