EPOLL 內核實現

epoll是由一組系統調用組成。
     int epoll_create(int size);
     int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
     int epoll_wait(int epfd, struct epoll_event *events,int maxevents, int timeout);
     select/poll的缺點在於:
     1.每次調用時要重複地從用戶態讀入參數。
     2.每次調用時要重複地掃描文件描述符。
     3.每次在調用開始時,要把當前進程放入各個文件描述符的等待隊列。在調用結束後,又把進程從各個等待隊列中刪除。
     在實際應用中,select/poll監視的文件描述符可能會非常多,如果每次只是返回一小部分,那麼,這種情況下select/poll顯得不夠高效。epoll的設計思路,是把select/poll單個的操作拆分爲1個epoll_create+多個epoll_ctl+一個epoll_wait。

epoll機制實現了自己特有的文件系統eventpoll filesystem。

  /*
   * File callbacks that implement the eventpoll file behaviour.
   * Only the release and poll operations are set in this table.
   */
  static const struct file_operations eventpoll_fops = {
      .release    = ep_eventpoll_release,
      .poll       = ep_eventpoll_poll
  };

epoll_create創建一個屬於該文件系統的文件,然後返回其文件描述符。

 

struct eventpoll 保存了epoll文件節點的擴展信息,該結構保存於file結構體的private_data域中,每個epoll_create創建的epoll描述符都分配一個該結構體。該結構的各個成員的定義如下,註釋也很詳細。

  1. /* 
  2.  * This structure is stored inside the "private_data" member of the file 
  3.  * structure and rapresent the main data sructure for the eventpoll 
  4.  * interface. 
  5.  */  
  6. struct eventpoll {  
  7.     /* Protect the this structure access,可用於中斷上下文 */  
  8.     spinlock_t lock;  
  9.     /* 
  10.      * This mutex is used to ensure that files are not removed 
  11.      * while epoll is using them. This is held during the event 
  12.      * collection loop, the file cleanup path, the epoll file exit 
  13.      * code and the ctl operations.用戶進程上下文中 
  14.      */  
  15.     struct mutex mtx;  
  16.     /* Wait queue used by sys_epoll_wait() */  
  17.     wait_queue_head_t wq;  
  18.     /* Wait queue used by file->poll() */  
  19.     wait_queue_head_t poll_wait;  
  20.     /* List of ready file descriptors */  
  21.     struct list_head rdllist;  
  22.     /* RB tree root used to store monitored fd structs */  
  23.     struct rb_root rbr;  
  24.     /* 
  25.      * This is a single linked list that chains all the "struct epitem" that 
  26.      * happened while transfering ready events to userspace w/out 
  27.      * holding ->lock. 
  28.      */  
  29.     struct epitem *ovflist;  
  30.     /* The user that created the eventpoll descriptor */  
  31.     struct user_struct *user;  
  32. };  
/* * This structure is stored inside the "private_data" member of the file * structure and rapresent the main data sructure for the eventpoll * interface. */ struct eventpoll { /* Protect the this structure access,可用於中斷上下文 */ spinlock_t lock; /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations.用戶進程上下文中 */ struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /* List of ready file descriptors */ struct list_head rdllist; /* RB tree root used to store monitored fd structs */ struct rb_root rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transfering ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist; /* The user that created the eventpoll descriptor */ struct user_struct *user; };

 

而通過epoll_ctl接口加入該epoll描述符監聽的套接字則屬於socket filesystem,這點一定要注意。每個添加的待監聽套接字(這裏的監聽和listen調用不同)都對應於一個epitem結構體,該結構體以紅黑樹的結構組織,eventpoll結構中保存了樹的根節點(rbr成員)。

同時,有監聽事件到來的套接字的epitem結構以雙向鏈表組織起來,鏈表頭也保存在eventpoll中(rdllist成員)。

[c-sharp] view plain copy print ?
  1. /* 
  2.  * Each file descriptor added to the eventpoll interface will 
  3.  * have an entry of this type linked to the "rbr" RB tree. 
  4.  */  
  5. struct epitem {  
  6.     /* RB tree node used to link this structure to the eventpoll RB tree */  
  7.     struct rb_node rbn;  
  8.     /* List header used to link this structure to the eventpoll ready list */  
  9.     struct list_head rdllink;  
  10.     /* 
  11.      * Works together "struct eventpoll"->ovflist in keeping the 
  12.      * single linked chain of items. 
  13.      */  
  14.     struct epitem *next;  
  15.     /* The file descriptor information this item refers to */  
  16.     struct epoll_filefd ffd;  
  17.     /* Number of active wait queue attached to poll operations */  
  18.     int nwait;  
  19.     /* List containing poll wait queues */  
  20.     struct list_head pwqlist;  
  21.     /* The "container" of this item */  
  22.     struct eventpoll *ep;  
  23.     /* List header used to link this item to the "struct file" items list */  
  24.     struct list_head fllink;  
  25.     /* The structure that describe the interested events and the source fd */  
  26.     struct epoll_event event;  
  27. };  
/* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. */ struct epitem { /* RB tree node used to link this structure to the eventpoll RB tree */ struct rb_node rbn; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ struct epitem *next; /* The file descriptor information this item refers to */ struct epoll_filefd ffd; /* Number of active wait queue attached to poll operations */ int nwait; /* List containing poll wait queues */ struct list_head pwqlist; /* The "container" of this item */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ struct list_head fllink; /* The structure that describe the interested events and the source fd */ struct epoll_event event; };

 

epoll_create的調用很簡單,就是創建一個eventpoll的文件,並返回文件描述符。

epoll_ctl用來添加,刪除以及修改監聽項。

[c-sharp] view plain copy print ?
  1. /* 
  2.  * The following function implements the controller interface for 
  3.  * the eventpoll file that enables the insertion/removal/change of 
  4.  * file descriptors inside the interest set. 
  5.  */  
  6. SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,  
  7.         struct epoll_event __user *, event)  
  8. {  
  9.     int error;  
  10.     struct file *file, *tfile;  
  11.     struct eventpoll *ep;  
  12.     struct epitem *epi;  
  13.     struct epoll_event epds;  
  14.     DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)/n",  
  15.              current, epfd, op, fd, event));  
  16.     error = -EFAULT;  
  17.     if (ep_op_has_event(op) &&  
  18.         copy_from_user(&epds, eventsizeof(struct epoll_event)))  
  19.         goto error_return;  
  20.     /* Get the "struct file *" for the eventpoll file */  
  21.     error = -EBADF;  
  22.     file = fget(epfd);  
  23.     if (!file)  
  24.         goto error_return;  
  25.     /* Get the "struct file *" for the target file */  
  26.     tfile = fget(fd);  
  27.     if (!tfile)  
  28.         goto error_fput;  
  29.     /* The target file descriptor must support poll */  
  30.     error = -EPERM;  
  31.     if (!tfile->f_op || !tfile->f_op->poll)  
  32.         goto error_tgt_fput;  
  33.     /* 
  34.      * We have to check that the file structure underneath the file descriptor 
  35.      * the user passed to us _is_ an eventpoll file. And also we do not permit 
  36.      * adding an epoll file descriptor inside itself. 
  37.      */  
  38.     error = -EINVAL;  
  39.     if (file == tfile || !is_file_epoll(file))  
  40.         goto error_tgt_fput;  
  41.     /* 
  42.      * At this point it is safe to assume that the "private_data" contains 
  43.      * our own data structure. 
  44.      */  
  45.     ep = file->private_data;  
  46.     mutex_lock(&ep->mtx);  
  47.     /* 
  48.      * Try to lookup the file inside our RB tree, Since we grabbed "mtx" 
  49.      * above, we can be sure to be able to use the item looked up by 
  50.      * ep_find() till we release the mutex. 
  51.      */  
  52.     epi = ep_find(ep, tfile, fd);  
  53.     error = -EINVAL;  
  54.     switch (op) {  
  55.     case EPOLL_CTL_ADD:  
  56.         if (!epi) {  
  57.             epds.events |= POLLERR | POLLHUP;  
  58.             error = ep_insert(ep, &epds, tfile, fd);  
  59.         } else  
  60.             error = -EEXIST;  
  61.         break;  
  62.     case EPOLL_CTL_DEL:  
  63.         if (epi)  
  64.             error = ep_remove(ep, epi);  
  65.         else  
  66.             error = -ENOENT;  
  67.         break;  
  68.     case EPOLL_CTL_MOD:  
  69.         if (epi) {  
  70.             epds.events |= POLLERR | POLLHUP;  
  71.             error = ep_modify(ep, epi, &epds);  
  72.         } else  
  73.             error = -ENOENT;  
  74.         break;  
  75.     }  
  76.     mutex_unlock(&ep->mtx);  
  77. error_tgt_fput:  
  78.     fput(tfile);  
  79. error_fput:  
  80.     fput(file);  
  81. error_return:  
  82.     DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d/n",  
  83.              current, epfd, op, fd, event, error));  
  84.     return error;  
  85. }  
/* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { int error; struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; struct epoll_event epds; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)/n", current, epfd, op, fd, event)); error = -EFAULT; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) goto error_return; /* Get the "struct file *" for the eventpoll file */ error = -EBADF; file = fget(epfd); if (!file) goto error_return; /* Get the "struct file *" for the target file */ tfile = fget(fd); if (!tfile) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; if (!tfile->f_op || !tfile->f_op->poll) goto error_tgt_fput; /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; if (file == tfile || !is_file_epoll(file)) goto error_tgt_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = file->private_data; mutex_lock(&ep->mtx); /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. 
*/ epi = ep_find(ep, tfile, fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; break; case EPOLL_CTL_DEL: if (epi) error = ep_remove(ep, epi); else error = -ENOENT; break; case EPOLL_CTL_MOD: if (epi) { epds.events |= POLLERR | POLLHUP; error = ep_modify(ep, epi, &epds); } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); error_tgt_fput: fput(tfile); error_fput: fput(file); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d/n", current, epfd, op, fd, event, error)); return error; }

同樣,代碼很清楚。先來看看添加流程。

[c-sharp] view plain copy print ?
  1. /* 
  2.  * Must be called with "mtx" held. 
  3.  */  
  4. static int ep_insert(struct eventpoll *ep, struct epoll_event *event,  
  5.              struct file *tfile, int fd)  
  6. {  
  7.     int error, revents, pwake = 0;  
  8.     unsigned long flags;  
  9.     struct epitem *epi;  
  10.     struct ep_pqueue epq;  
  11.         /* 不容許超過最大監聽個數*/  
  12.     if (unlikely(atomic_read(&ep->user->epoll_watches) >=  
  13.              max_user_watches))  
  14.         return -ENOSPC;  
  15.     if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))  
  16.         return -ENOMEM;  
  17.     /* Item initialization follow here ... */  
  18.     INIT_LIST_HEAD(&epi->rdllink);  
  19.     INIT_LIST_HEAD(&epi->fllink);  
  20.     INIT_LIST_HEAD(&epi->pwqlist);  
  21.     epi->ep = ep;  
  22.     ep_set_ffd(&epi->ffd, tfile, fd);  
  23.     epi->event = *event;  
  24.     epi->nwait = 0;  
  25.     epi->next = EP_UNACTIVE_PTR;  
  26.     /* Initialize the poll table using the queue callback */  
  27.     epq.epi = epi;  
  28.     init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);  
  29.     /* 
  30.      * Attach the item to the poll hooks and get current event bits. 
  31.      * We can safely use the file* here because its usage count has 
  32.      * been increased by the caller of this function. Note that after 
  33.      * this operation completes, the poll callback can start hitting 
  34.      * the new item. 
  35.      */  
  36.     revents = tfile->f_op->poll(tfile, &epq.pt);  
  37.     /* 
  38.      * We have to check if something went wrong during the poll wait queue 
  39.      * install process. Namely an allocation for a wait queue failed due 
  40.      * high memory pressure. 
  41.      */  
  42.     error = -ENOMEM;  
  43.     if (epi->nwait < 0)  
  44.         goto error_unregister;  
  45.     /* Add the current item to the list of active epoll hook for this file */  
  46.     spin_lock(&tfile->f_ep_lock);  
  47.     list_add_tail(&epi->fllink, &tfile->f_ep_links);  
  48.     spin_unlock(&tfile->f_ep_lock);  
  49.     /* 
  50.      * Add the current item to the RB tree. All RB tree operations are 
  51.      * protected by "mtx", and ep_insert() is called with "mtx" held. 
  52.      */  
  53.     ep_rbtree_insert(ep, epi);  
  54.     /* We have to drop the new item inside our item list to keep track of it */  
  55.     spin_lock_irqsave(&ep->lock, flags);  
  56.     /* If the file is already "ready" we drop it inside the ready list */  
  57.     if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {  
  58.         list_add_tail(&epi->rdllink, &ep->rdllist);  
  59.         /* Notify waiting tasks that events are available */  
  60.         if (waitqueue_active(&ep->wq))  
  61.             wake_up_locked(&ep->wq);  
  62.         if (waitqueue_active(&ep->poll_wait))  
  63.             pwake++;  
  64.     }  
  65.     spin_unlock_irqrestore(&ep->lock, flags);  
  66.     atomic_inc(&ep->user->epoll_watches);  
  67.     /* We have to call this outside the lock */  
  68.     if (pwake)  
  69.         ep_poll_safewake(&psw, &ep->poll_wait);  
  70.     DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)/n",  
  71.              current, ep, tfile, fd));  
  72.     return 0;  
  73. error_unregister:  
  74.     ep_unregister_pollwait(ep, epi);  
  75.     /* 
  76.      * We need to do this because an event could have been arrived on some 
  77.      * allocated wait queue. Note that we don't care about the ep->ovflist 
  78.      * list, since that is used/cleaned only inside a section bound by "mtx". 
  79.      * And ep_insert() is called with "mtx" held. 
  80.      */  
  81.     spin_lock_irqsave(&ep->lock, flags);  
  82.     if (ep_is_linked(&epi->rdllink))  
  83.         list_del_init(&epi->rdllink);  
  84.     spin_unlock_irqrestore(&ep->lock, flags);  
  85.     kmem_cache_free(epi_cache, epi);  
  86.     return error;  
  87. }  
/* * Must be called with "mtx" held. */ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *tfile, int fd) { int error, revents, pwake = 0; unsigned long flags; struct epitem *epi; struct ep_pqueue epq; /* 不容許超過最大監聽個數*/ if (unlikely(atomic_read(&ep->user->epoll_watches) >= max_user_watches)) return -ENOSPC; if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) return -ENOMEM; /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->pwqlist); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->nwait = 0; epi->next = EP_UNACTIVE_PTR; /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ revents = tfile->f_op->poll(tfile, &epq.pt); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ error = -ENOMEM; if (epi->nwait < 0) goto error_unregister; /* Add the current item to the list of active epoll hook for this file */ spin_lock(&tfile->f_ep_lock); list_add_tail(&epi->fllink, &tfile->f_ep_links); spin_unlock(&tfile->f_ep_lock); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. 
*/ ep_rbtree_insert(ep, epi); /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up_locked(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irqrestore(&ep->lock, flags); atomic_inc(&ep->user->epoll_watches); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&psw, &ep->poll_wait); DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)/n", current, ep, tfile, fd)); return 0; error_unregister: ep_unregister_pollwait(ep, epi); /* * We need to do this because an event could have been arrived on some * allocated wait queue. Note that we don't care about the ep->ovflist * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. */ spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); kmem_cache_free(epi_cache, epi); return error; }

init_poll_funcptr函數註冊poll table回調函數。然後程序的下一步是調用tfile的poll函數,並且poll函數的第2個參數爲poll table,這是epoll機制中唯一對監聽套接字調用poll時第2個參數不爲NULL的時機。ep_ptable_queue_proc函數的作用是註冊等待函數並添加到指定的等待隊列,所以在第一次調用後,該信息已經存在了,無需在poll函數中再次調用了。

[c-sharp] view plain copy print ?
  1. /* 
  2.  * This is the callback that is used to add our wait queue to the 
  3.  * target file wakeup lists. 
  4.  */  
  5. static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,  
  6.                  poll_table *pt)  
  7. {  
  8.     struct epitem *epi = ep_item_from_epqueue(pt);  
  9.     struct eppoll_entry *pwq;  
  10.     if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {  
  11.                 /* 爲監聽套接字註冊一個等待回調函數,在喚醒時調用*/  
  12.         init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);  
  13.         pwq->whead = whead;  
  14.         pwq->base = epi;  
  15.         add_wait_queue(whead, &pwq->wait);  
  16.         list_add_tail(&pwq->llink, &epi->pwqlist);  
  17.         epi->nwait++;  
  18.     } else {  
  19.         /* We have to signal that an error occurred */  
  20.         epi->nwait = -1;  
  21.     }  
  22. }  
/* * This is the callback that is used to add our wait queue to the * target file wakeup lists. */ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { struct epitem *epi = ep_item_from_epqueue(pt); struct eppoll_entry *pwq; if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) { /* 爲監聽套接字註冊一個等待回調函數,在喚醒時調用*/ init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; add_wait_queue(whead, &pwq->wait); list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; } else { /* We have to signal that an error occurred */ epi->nwait = -1; } }

 

那麼該poll函數究竟是怎樣的呢?這就要看我們在傳入到epoll_ctl前創建的套接字的類型(socket調用)。對於創建的tcp套接字來說,可以按照創建流程找到其對應的函數是tcp_poll。

tcp_poll的主要功能爲:

  1. 如果poll table回調函數存在(ep_ptable_queue_proc),則調用它來等待。注意這只限第一次調用,在後面的poll中都無需此步。
  2. 判斷事件的到達。(根據tcp的相關成員)

tcp_poll註冊到的等待隊列是sock成員的sk_sleep,等待隊列在對應的IO事件中被喚醒。當等待隊列被喚醒時會調用相應的等待回調函數,前面看到我們註冊的是函數ep_poll_callback。該函數可能在中斷上下文中調用。

[c-sharp] view plain copy print ?
  1. /* 
  2.  * This is the callback that is passed to the wait queue wakeup 
  3.  * machanism. It is called by the stored file descriptors when they 
  4.  * have events to report. 
  5.  */  
  6. static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)  
  7. {  
  8.     int pwake = 0;  
  9.     unsigned long flags;  
  10.     struct epitem *epi = ep_item_from_wait(wait);  
  11.     struct eventpoll *ep = epi->ep;  
  12.     DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p/n",  
  13.              current, epi->ffd.file, epi, ep));  
  14.         /* 對eventpoll的spinlock加鎖,由於是在中斷上下文中*/  
  15.     spin_lock_irqsave(&ep->lock, flags);  
  16.     /* 沒有事件到來 
  17.      * If the event mask does not contain any poll(2) event, we consider the 
  18.      * descriptor to be disabled. This condition is likely the effect of the 
  19.      * EPOLLONESHOT bit that disables the descriptor when an event is received, 
  20.      * until the next EPOLL_CTL_MOD will be issued. 
  21.      */  
  22.     if (!(epi->event.events & ~EP_PRIVATE_BITS))  
  23.         goto out_unlock;  
  24.     /* 
  25.      * If we are trasfering events to userspace, we can hold no locks 
  26.      * (because we're accessing user memory, and because of linux f_op->poll() 
  27.      * semantics). All the events that happens during that period of time are 
  28.      * chained in ep->ovflist and requeued later on. 
  29.      */  
  30.     if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {  
  31.         if (epi->next == EP_UNACTIVE_PTR) {  
  32.             epi->next = ep->ovflist;  
  33.             ep->ovflist = epi;  
  34.         }  
  35.         goto out_unlock;  
  36.     }  
  37.     /* If this file is already in the ready list we exit soon */  
  38.     if (ep_is_linked(&epi->rdllink))  
  39.         goto is_linked;  
  40.         /* 加入ready queue*/  
  41.     list_add_tail(&epi->rdllink, &ep->rdllist);  
  42. is_linked:  
  43.     /* 
  44.      * Wake up ( if active ) both the eventpoll wait list and the ->poll() 
  45.      * wait list. 
  46.      */  
  47.     if (waitqueue_active(&ep->wq))  
  48.         wake_up_locked(&ep->wq);  
  49.     if (waitqueue_active(&ep->poll_wait))  
  50.         pwake++;  
  51. out_unlock:  
  52.     spin_unlock_irqrestore(&ep->lock, flags);  
  53.     /* We have to call this outside the lock */  
  54.     if (pwake)  
  55.         ep_poll_safewake(&psw, &ep->poll_wait);  
  56.     return 1;  
  57. }  
/* * This is the callback that is passed to the wait queue wakeup * machanism. It is called by the stored file descriptors when they * have events to report. */ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p/n", current, epi->ffd.file, epi, ep)); /* 對eventpoll的spinlock加鎖,由於是在中斷上下文中*/ spin_lock_irqsave(&ep->lock, flags); /* 沒有事件到來 * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; /* * If we are trasfering events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happens during that period of time are * chained in ep->ovflist and requeued later on. */ if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; } goto out_unlock; } /* If this file is already in the ready list we exit soon */ if (ep_is_linked(&epi->rdllink)) goto is_linked; /* 加入ready queue*/ list_add_tail(&epi->rdllink, &ep->rdllist); is_linked: /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ if (waitqueue_active(&ep->wq)) wake_up_locked(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&psw, &ep->poll_wait); return 1; }

 

注意這裏有2種隊列,一種是在epoll_wait調用中使用的eventpoll的等待隊列,用於判斷是否有監聽套接字可用;一種是對應於每個套接字的等待隊列sk_sleep,用於判斷每個監聽套接字上的事件,該隊列喚醒後調用ep_poll_callback,在該函數中又調用wakeup函數來喚醒前一種隊列,來通知epoll_wait調用進程。

  1. static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,  
  2.            int maxevents, long timeout)  
  3. {  
  4.     int res, eavail;  
  5.     unsigned long flags;  
  6.     long jtimeout;  
  7.     wait_queue_t wait;  
  8.     /* 
  9.      * Calculate the timeout by checking for the "infinite" value ( -1 ) 
  10.      * and the overflow condition. The passed timeout is in milliseconds, 
  11.      * that why (t * HZ) / 1000. 
  12.      */  
  13.     jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?  
  14.         MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;  
  15. retry:  
  16.     spin_lock_irqsave(&ep->lock, flags);  
  17.     res = 0;  
  18.     if (list_empty(&ep->rdllist)) {  
  19.         /* 
  20.          * We don't have any available event to return to the caller. 
  21.          * We need to sleep here, and we will be wake up by 
  22.          * ep_poll_callback() when events will become available. 
  23.          */  
  24.         init_waitqueue_entry(&wait, current);  
  25.         wait.flags |= WQ_FLAG_EXCLUSIVE;  
  26.         __add_wait_queue(&ep->wq, &wait);  
  27.         for (;;) {  
  28.             /* 
  29.              * We don't want to sleep if the ep_poll_callback() sends us 
  30.              * a wakeup in between. That's why we set the task state 
  31.              * to TASK_INTERRUPTIBLE before doing the checks. 
  32.              */  
  33.             set_current_state(TASK_INTERRUPTIBLE);  
  34.             if (!list_empty(&ep->rdllist) || !jtimeout)  
  35.                 break;  
  36.             if (signal_pending(current)) {  
  37.                 res = -EINTR;  
  38.                 break;  
  39.             }  
  40.             spin_unlock_irqrestore(&ep->lock, flags);  
  41.             jtimeout = schedule_timeout(jtimeout);  
  42.             spin_lock_irqsave(&ep->lock, flags);  
  43.         }  
  44.         __remove_wait_queue(&ep->wq, &wait);  
  45.         set_current_state(TASK_RUNNING);  
  46.     }  
  47.     /* Is it worth to try to dig for events ? */  
  48.     eavail = !list_empty(&ep->rdllist);  
  49.     spin_unlock_irqrestore(&ep->lock, flags);  
  50.     /* 
  51.      * Try to transfer events to user space. In case we get 0 events and 
  52.      * there's still timeout left over, we go trying again in search of 
  53.      * more luck. 
  54.      */  
  55.     if (!res && eavail &&  
  56.         !(res = ep_send_events(ep, events, maxevents)) && jtimeout)  
  57.         goto retry;  
  58.     return res;  
  59. }  
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout) { int res, eavail; unsigned long flags; long jtimeout; wait_queue_t wait; /* * Calculate the timeout by checking for the "infinite" value ( -1 ) * and the overflow condition. The passed timeout is in milliseconds, * that why (t * HZ) / 1000. */ jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ? MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000; retry: spin_lock_irqsave(&ep->lock, flags); res = 0; if (list_empty(&ep->rdllist)) { /* * We don't have any available event to return to the caller. * We need to sleep here, and we will be wake up by * ep_poll_callback() when events will become available. */ init_waitqueue_entry(&wait, current); wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(&ep->wq, &wait); for (;;) { /* * We don't want to sleep if the ep_poll_callback() sends us * a wakeup in between. That's why we set the task state * to TASK_INTERRUPTIBLE before doing the checks. */ set_current_state(TASK_INTERRUPTIBLE); if (!list_empty(&ep->rdllist) || !jtimeout) break; if (signal_pending(current)) { res = -EINTR; break; } spin_unlock_irqrestore(&ep->lock, flags); jtimeout = schedule_timeout(jtimeout); spin_lock_irqsave(&ep->lock, flags); } __remove_wait_queue(&ep->wq, &wait); set_current_state(TASK_RUNNING); } /* Is it worth to try to dig for events ? */ eavail = !list_empty(&ep->rdllist); spin_unlock_irqrestore(&ep->lock, flags); /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of * more luck. */ if (!res && eavail && !(res = ep_send_events(ep, events, maxevents)) && jtimeout) goto retry; return res; }

該函數是在epoll_wait中調用的等待函數,其等待被ep_poll_callback喚醒,而後調用ep_send_events來把到達事件copy到用戶空間,而後

epoll_wait才返回。

 

最後咱們來看看ep_poll_callback函數和ep_send_events函數的同步,由於他們都要操做ready queue。

eventpoll中巧妙地設置了2種類型的鎖,一個是mtx,是個mutex類型,是對該描述符操做的基本同步鎖,能夠睡眠,因此不能在中斷上下文中使用;因而又存在了另一個

鎖,lock,它是一個spinlock類型,不容許睡眠,因此用在ep_poll_callback中,注意mtx不能用於此。

注意因爲ep_poll_callback函數中會涉及到對eventpoll的ovflist和rdllist成員的訪問,因此在任意其它地方要訪問時都要先加mtx鎖,再加lock鎖。

 

因爲中斷的到來是異步的,爲了方便,先看ep_send_events函數。

  1. static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,  
  2.               int maxevents)  
  3. {  
  4.     int eventcnt, error = -EFAULT, pwake = 0;  
  5.     unsigned int revents;  
  6.     unsigned long flags;  
  7.     struct epitem *epi, *nepi;  
  8.     struct list_head txlist;  
  9.     INIT_LIST_HEAD(&txlist);  
  10.     /* 
  11.      * We need to lock this because we could be hit by 
  12.      * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). 
  13.      */  
  14.     mutex_lock(&ep->mtx);  
  15.     /* 
  16.      * Steal the ready list, and re-init the original one to the 
  17.      * empty list. Also, set ep->ovflist to NULL so that events 
  18.      * happening while looping w/out locks, are not lost. We cannot 
  19.      * have the poll callback to queue directly on ep->rdllist, 
  20.      * because we are doing it in the loop below, in a lockless way. 
  21.      */  
  22.     spin_lock_irqsave(&ep->lock, flags);  
  23.     list_splice(&ep->rdllist, &txlist);  
  24.     INIT_LIST_HEAD(&ep->rdllist);  
  25.     ep->ovflist = NULL;  
  26.     spin_unlock_irqrestore(&ep->lock, flags);  
  27.     /* 
  28.      * We can loop without lock because this is a task private list. 
  29.      * We just splice'd out the ep->rdllist in ep_collect_ready_items(). 
  30.      * Items cannot vanish during the loop because we are holding "mtx". 
  31.      */  
  32.     for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) {  
  33.         epi = list_first_entry(&txlist, struct epitem, rdllink);  
  34.         list_del_init(&epi->rdllink);  
  35.         /* 
  36.          * Get the ready file event set. We can safely use the file 
  37.          * because we are holding the "mtx" and this will guarantee 
  38.          * that both the file and the item will not vanish. 
  39.          */  
  40.         revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);  
  41.         revents &= epi->event.events;  
  42.         /* 
  43.          * Is the event mask intersect the caller-requested one, 
  44.          * deliver the event to userspace. Again, we are holding 
  45.          * "mtx", so no operations coming from userspace can change 
  46.          * the item. 
  47.          */  
  48.         if (revents) {  
  49.             if (__put_user(revents,  
  50.                        &events[eventcnt].events) ||  
  51.                 __put_user(epi->event.data,  
  52.                        &events[eventcnt].data))  
  53.                 goto errxit;  
  54.             if (epi->event.events & EPOLLONESHOT)  
  55.                 epi->event.events &= EP_PRIVATE_BITS;  
  56.             eventcnt++;  
  57.         }  
  58.         /* 
  59.          * At this point, noone can insert into ep->rdllist besides 
  60.          * us. The epoll_ctl() callers are locked out by us holding 
  61.          * "mtx" and the poll callback will queue them in ep->ovflist. 
  62.          */  
  63.         if (!(epi->event.events & EPOLLET) &&  
  64.             (revents & epi->event.events))  
  65.             list_add_tail(&epi->rdllink, &ep->rdllist);  
  66.     }  
  67.     error = 0;  
  68. errxit:  
  69.     spin_lock_irqsave(&ep->lock, flags);  
  70.     /* 
  71.      * During the time we spent in the loop above, some other events 
  72.      * might have been queued by the poll callback. We re-insert them 
  73.      * inside the main ready-list here. 
  74.      */  
  75.     for (nepi = ep->ovflist; (epi = nepi) != NULL;  
  76.          nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {  
  77.         /* 
  78.          * If the above loop quit with errors, the epoll item might still 
  79.          * be linked to "txlist", and the list_splice() done below will 
  80.          * take care of those cases. 
  81.          */  
  82.         if (!ep_is_linked(&epi->rdllink))  
  83.             list_add_tail(&epi->rdllink, &ep->rdllist);  
  84.     }  
  85.     /* 
  86.      * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after 
  87.      * releasing the lock, events will be queued in the normal way inside 
  88.      * ep->rdllist. 
  89.      */  
  90.     ep->ovflist = EP_UNACTIVE_PTR;  
  91.     /* 
  92.      * In case of error in the event-send loop, or in case the number of 
  93.      * ready events exceeds the userspace limit, we need to splice the 
  94.      * "txlist" back inside ep->rdllist. 
  95.      */  
  96.     list_splice(&txlist, &ep->rdllist);  
  97.     if (!list_empty(&ep->rdllist)) {  
  98.         /* 
  99.          * Wake up (if active) both the eventpoll wait list and the ->poll() 
  100.          * wait list (delayed after we release the lock). 
  101.          */  
  102.         if (waitqueue_active(&ep->wq))  
  103.             wake_up_locked(&ep->wq);  
  104.         if (waitqueue_active(&ep->poll_wait))  
  105.             pwake++;  
  106.     }  
  107.     spin_unlock_irqrestore(&ep->lock, flags);  
  108.     mutex_unlock(&ep->mtx);  
  109.     /* We have to call this outside the lock */  
  110.     if (pwake)  
  111.         ep_poll_safewake(&psw, &ep->poll_wait);  
  112.     return eventcnt == 0 ? error: eventcnt;  
  113. }  
static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { int eventcnt, error = -EFAULT, pwake = 0; unsigned int revents; unsigned long flags; struct epitem *epi, *nepi; struct list_head txlist; INIT_LIST_HEAD(&txlist); /* * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). */ mutex_lock(&ep->mtx); /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we are doing it in the loop below, in a lockless way. */ spin_lock_irqsave(&ep->lock, flags); list_splice(&ep->rdllist, &txlist); INIT_LIST_HEAD(&ep->rdllist); ep->ovflist = NULL; spin_unlock_irqrestore(&ep->lock, flags); /* * We can loop without lock because this is a task private list. * We just splice'd out the ep->rdllist in ep_collect_ready_items(). * Items cannot vanish during the loop because we are holding "mtx". */ for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { epi = list_first_entry(&txlist, struct epitem, rdllink); list_del_init(&epi->rdllink); /* * Get the ready file event set. We can safely use the file * because we are holding the "mtx" and this will guarantee * that both the file and the item will not vanish. */ revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); revents &= epi->event.events; /* * Is the event mask intersect the caller-requested one, * deliver the event to userspace. Again, we are holding * "mtx", so no operations coming from userspace can change * the item. */ if (revents) { if (__put_user(revents, &events[eventcnt].events) || __put_user(epi->event.data, &events[eventcnt].data)) goto errxit; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; eventcnt++; } /* * At this point, noone can insert into ep->rdllist besides * us. 
The epoll_ctl() callers are locked out by us holding * "mtx" and the poll callback will queue them in ep->ovflist. */ if (!(epi->event.events & EPOLLET) && (revents & epi->event.events)) list_add_tail(&epi->rdllink, &ep->rdllist); } error = 0; errxit: spin_lock_irqsave(&ep->lock, flags); /* * During the time we spent in the loop above, some other events * might have been queued by the poll callback. We re-insert them * inside the main ready-list here. */ for (nepi = ep->ovflist; (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * If the above loop quit with errors, the epoll item might still * be linked to "txlist", and the list_splice() done below will * take care of those cases. */ if (!ep_is_linked(&epi->rdllink)) list_add_tail(&epi->rdllink, &ep->rdllist); } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ ep->ovflist = EP_UNACTIVE_PTR; /* * In case of error in the event-send loop, or in case the number of * ready events exceeds the userspace limit, we need to splice the * "txlist" back inside ep->rdllist. */ list_splice(&txlist, &ep->rdllist); if (!list_empty(&ep->rdllist)) { /* * Wake up (if active) both the eventpoll wait list and the ->poll() * wait list (delayed after we release the lock). */ if (waitqueue_active(&ep->wq)) wake_up_locked(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irqrestore(&ep->lock, flags); mutex_unlock(&ep->mtx); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&psw, &ep->poll_wait); return eventcnt == 0 ? error: eventcnt; }

該函數的註釋也很清晰,不過咱們從整體上分析下。

 

首先函數加mtx鎖,這是必須的。

而後的工做是要讀取ready queue,可是中斷會寫這個成員,因此要加spinlock;可是接下來的工做會sleep,因此在整個loop都加spinlock顯然

會阻塞ep_poll_callback函數,從而阻塞中斷,這是個很不好的行爲,也不可取。因而epoll中在eventpoll中設置了另外一個成員ovflist。在讀取ready

queue前,咱們設置該成員爲NULL,而後就能夠釋放spinlock了。爲何這樣可行呢,由於對應的,在ep_poll_callback中,獲取spinlock後,對於

到達的事件並不老是放入ready queue,而是先判斷ovflist是否爲EP_UNACTIVE_PTR。

  1. if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {  
  2. /* 進入此處說明有用戶進程正在調用ep_send_events,因此把事件加入ovflist中,而不是ready queue中*/  
  3.         if (epi->next == EP_UNACTIVE_PTR) {/* 若是此處條件不成立,說明該epi已經在ovflist中,因此直接返回*/  
  4.             epi->next = ep->ovflist;  
  5.             ep->ovflist = epi;  
  6.         }  
  7.         goto out_unlock;  
  8.     }  
if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { /* 進入此處說明有用戶進程正在調用ep_send_events,因此把事件加入ovflist中,而不是ready queue中*/ if (epi->next == EP_UNACTIVE_PTR) {/* 若是此處條件不成立,說明該epi已經在ovflist中,因此直接返回*/ epi->next = ep->ovflist; ep->ovflist = epi; } goto out_unlock; }

 

因此在此期間,到達的事件放入了ovflist中。當loop結束後,函數接着遍歷該list,添加到ready queue中,最後設置ovflist爲EP_UNACTIVE_PTR,

這樣下次中斷中的事件能夠放入ready queue了。最後判斷是否有其餘epoll_wait調用被阻塞,若是有則喚醒。

 

 

 

從源代碼中,能夠看出epoll的幾大優勢:

  1. 用戶傳入的信息保存在內核中了,無需每次傳入
  2. 事件監聽機制不再是掃描整個監聽隊列,而是每一個監聽套接字在有事件到達時經過等待回調函數異步通知epoll,而後再返回給用戶。

同時epoll中的同步機制也是一個內核編程的設計經典,值得深刻理解。


epoll描述

相關文章
相關標籤/搜索