select、poll、epoll實現原理分析

時間 2019-11-09

標籤 select poll epoll 實現原理分析简体版

原文鏈接

本文轉載自：https://blog.csdn.net/lishenglong666/article/details/45536611

poll/select/epoll的實現都是基於文件提供的poll方法(f_op->poll)，
該方法利用poll_table提供的_qproc方法向文件內部事件掩碼_key對應的一個或多個等待隊列(wait_queue_head_t)上添加包含喚醒函數(wait_queue_t.func)的節點(wait_queue_t)，並檢查文件當前就緒的狀態返回給poll的調用者(依賴於文件的實現)。
當文件的狀態發生改變時(例如網絡數據包到達)，文件就會遍歷事件對應的等待隊列並調用回調函數(wait_queue_t.func)喚醒等待線程。

一般的file.f_op->poll實現及相關結構體如下：

     C代碼   
     
   
 /* Abridged struct file: only the members this article discusses are listed. */
 struct file {
     const struct file_operations    *f_op;
     spinlock_t          f_lock;
     // Implementation-private data of the file
     void               *private_data;
 #ifdef CONFIG_EPOLL
     /* Used by fs/eventpoll.c to link all the hooks to this file */
     struct list_head    f_ep_links;
     struct list_head    f_tfile_llink;
 #endif /* #ifdef CONFIG_EPOLL */
     // Remaining members omitted....
 };
   
 // File operations (abridged)
 struct file_operations {
     // Hook the file exposes to poll/select/epoll:
     // takes the file and a poll_table_struct and returns an unsigned int event mask
     unsigned int (*poll) (struct file *, struct poll_table_struct *);
     // Other methods (read/write, ...) omitted
 };
   
 // Sketch of a typical f_op->poll implementation (pseudo-code: some_code() is a placeholder)
 // Returns the event mask accumulated in `mask`.
 unsigned int file_f_op_poll (struct file *filp, struct poll_table_struct *wait)
 {
     unsigned int mask = 0;
     wait_queue_head_t * wait_queue;

     // 1. Use the event mask wait->_key and the file's private data (filp->private_data)
     //    to pick the wait queue head(s) that match the requested events.
     //    NOTE(review): wait_queue is never assigned in the visible code; presumably the placeholder does it.
     some_code();

     // 2. Call poll_wait to add a node to the selected wait queue head
     poll_wait(filp, wait_queue, wait);

     // 3. Collect the current readiness state into mask
     some_code();

     return mask;
 }
   
 // Interface that select/poll/epoll hand to a file so it can register a readiness callback node.
 // The callback type is declared first because the struct below has a member of that type.
 struct poll_table_struct;
 typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

 typedef struct poll_table_struct {
     // Function that adds a callback node (wait_queue_t) to a wait_queue_head
     poll_queue_proc _qproc;
     // Event mask of interest; the file implementation uses it to choose which wait queue to pass to _qproc
     unsigned long   _key;
 } poll_table;
   
   
 // Generic poll_wait helper, typically called from a file's f_op->poll.
 // Forwards (filp, wait_address, p) to the registration callback stored in the poll_table;
 // does nothing when the table, its callback or the wait queue head is missing.
 static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
 {
     if (!p || !p->_qproc || !wait_address) {
         return;
     }
     // _qproc adds a node (and sets its func) on wait_address; per the article it is
     // __pollwait for select/poll and ep_ptable_queue_proc for epoll.
     p->_qproc(filp, wait_address, p);
 }
   
   
 // Head of a wait queue
 typedef struct __wait_queue_head wait_queue_head_t;
 struct __wait_queue_head {
     spinlock_t lock;            // taken by __wake_up around the queue walk
     struct list_head task_list; // list of wait_queue_t nodes (linked through their task_list member)
 };
   
 // Node of a wait queue.
 // The callback type is declared before the struct because the struct has a member of that type.
 typedef struct __wait_queue wait_queue_t;
 typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
 struct __wait_queue {
     unsigned int flags;
 #define WQ_FLAG_EXCLUSIVE   0x01
     void *private;              // callback-private data (select/poll store their poll_wqueues here)
     wait_queue_func_t func;     // wake-up callback invoked by __wake_up_common
     struct list_head task_list; // link into __wait_queue_head.task_list
 };
   
   
 // Called by a file when its state changes; notifies pollers by invoking wait_queue_t.func
 // on the queued nodes. `key` carries the file's current event mask.
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
                int nr_exclusive, void *key)
 {
     const int wake_flags = 0;
     unsigned long irq_state;

     // The queue walk runs with the queue lock held and interrupt state saved
     spin_lock_irqsave(&q->lock, irq_state);
     __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
     spin_unlock_irqrestore(&q->lock, irq_state);
 }
 // Walk the wait queue and invoke each node's func; typically func wakes the thread that called poll.
 // Stops after nr_exclusive exclusive waiters have reported a successful wake-up.
 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                              int nr_exclusive, int wake_flags, void *key)
 {
     wait_queue_t *pos, *tmp;

     list_for_each_entry_safe(pos, tmp, &q->task_list, task_list) {
         // Flags are read before func runs (presumably because func may unlink the node;
         // the _safe iterator suggests so)
         unsigned entry_flags = pos->flags;

         if (!pos->func(pos, mode, wake_flags, key)) {
             continue;
         }
         if (!(entry_flags & WQ_FLAG_EXCLUSIVE)) {
             continue;
         }
         if (--nr_exclusive == 0) {
             break;
         }
     }
 }

poll 和 select

poll和select的實現基本上是一致的，只是傳遞的參數有所不同，它們的基本流程如下：

1. 複製用戶數據到內核空間

2. 估計超時時間

3. 遍歷每個文件並調用f_op->poll 取得文件當前就緒狀態，如果前面遍歷的文件都沒有就緒，向文件插入wait_queue節點

4. 遍歷完成後檢查狀態：

a). 如果已經有就緒的文件轉到5；

b). 如果有信號產生，重啓poll或select（轉到 1或3）；

c). 否則掛起進程等待超時或喚醒，超時或被喚醒後再次遍歷所有文件取得每個文件的就緒狀態

5. 將全部文件的就緒狀態複製到用戶空間

6. 清理申請的資源

關鍵結構體

下面是poll/select共用的結構體及其相關功能:

poll_wqueues 是 select/poll 對poll_table接口的具體化實現,其中的table, inline_index和inline_entries都是爲了管理內存。
poll_table_entry 與一個文件相關聯，用於管理插入到文件的wait_queue節點。

     C代碼   
     
   
 // Concrete poll_table implementation used by select/poll
 struct poll_wqueues {
     poll_table pt;
     struct poll_table_page *table;     // Entries come from poll_table_page once inline_entries is exhausted
     struct task_struct *polling_task;  // Task that called poll or select
     int triggered;                     // Set once a wake-up has fired
     int error;
     int inline_index;                  // Index of the next inline_entries slot to hand out
     struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];// Preallocated entries, used first
 };
 // Overflow pool of poll_table_entry slots, used to manage memory requested by select/poll
 struct poll_table_page {
     struct poll_table_page  * next;       // Next page in the list
     struct poll_table_entry * entry;      // Next unused slot; starts at entries and advances on each allocation
     struct poll_table_entry entries[];    // Slots stored after the header (C99 flexible array member)
 };
 // Associated with one file that is currently being polled/selected
 struct poll_table_entry {
     struct file *filp;               // File being polled/selected
     unsigned long key;               // Event mask of interest, compared against the wake-up key
     wait_queue_t wait;               // Node inserted into the file's wait_queue_head_t
     wait_queue_head_t *wait_address; // Address of the wait_queue_head_t on the file
 };

公共函數

下面是poll/select公用的一些函數，這些函數實現了poll和select的核心功能。

poll_initwait 用於初始化poll_wqueues，

__pollwait 實現了向文件中添加回調節點的邏輯，

pollwake 當文件狀態發生改變時，由文件調用，用來喚醒線程，

poll_get_entry，free_poll_entry，poll_freewait用來申請釋放poll_table_entry 佔用的內存，並負責釋放文件上的wait_queue節點。

     C代碼   
     
   
 // Initialise a poll_wqueues for the current task.
 // Installs __pollwait as the poll_table registration callback and resets all bookkeeping.
 void poll_initwait(struct poll_wqueues *pwq)
 {
     /*
      * Set up the embedded poll_table (akin to calling a base-class constructor).
      * The helper, as quoted in the original article:
      *
      * static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
      * {
      *     pt->_qproc = qproc;
      *     pt->_key   = ~0UL;
      * }
      */
     init_poll_funcptr(&pwq->pt, __pollwait);

     pwq->table = NULL;
     pwq->inline_index = 0;
     pwq->error = 0;
     pwq->triggered = 0;
     pwq->polling_task = current;
 }
   
   
 // Wait-queue registration callback used by poll/select.
 // Allocates a poll_table_entry, takes a reference on the file and queues the entry's
 // wait node (with pollwake as callback) on wait_address.
 // If no entry can be allocated the function returns without registering anything.
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                        poll_table *p)
 {
     struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
     struct poll_table_entry *entry = poll_get_entry(pwq);
     if (!entry) {
         return;
     }
     get_file(filp); // matching fput() is in free_poll_entry()
     entry->filp = filp;
     entry->wait_address = wait_address; // wait queue head
     // The poll_table member is named _key (see poll_table_struct / init_poll_funcptr)
     entry->key = p->_key;
     // Use pollwake as the wake-up callback
     init_waitqueue_func_entry(&entry->wait, pollwake);
     entry->wait.private = pwq;
     // Insert into the wait queue
     add_wait_queue(wait_address, &entry->wait);
 }
   
 // Callback (func) stored in the wait_queue_t nodes queued by __pollwait.
 // Invoked when the file becomes ready; `key` is the state mask supplied by the file.
 // Returns 0 without waking anyone when key is non-NULL and shares no bits with the
 // events the entry is interested in; otherwise returns the result of __pollwake.
 static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
     // Recover the poll_table_entry that embeds this wait node
     struct poll_table_entry *owner = container_of(wait, struct poll_table_entry, wait);
     unsigned long events = (unsigned long)key;

     // Filter out events nobody asked for
     if (key != NULL && (events & owner->key) == 0) {
         return 0;
     }
     // Wake the poller
     return __pollwake(wait, mode, sync, key);
 }
 // Marks the poll_wqueues as triggered and wakes the task recorded in it.
 // Returns whatever default_wake_function returns.
 static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
     struct poll_wqueues *pwq = wait->private;
     // Build a temporary wait node (dummy_wait) bound to the polling task pwq->polling_task
     DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
     // Write barrier issued before the triggered flag is stored
     // NOTE(review): the matching reader-side barrier is not visible in this file
     smp_wmb();
     pwq->triggered = 1;// mark as triggered
     // Wake the polling task
     return default_wake_function(&dummy_wait, mode, sync, key);
 }
   
 // Default wake-up function; the callback installed by poll/select ends up here.
 // Per the article it wakes the waiting thread by moving it to a run queue (rq);
 // the heavy lifting is done by try_to_wake_up, which is not analysed here.
 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
                           void *key)
 {
     // The node's private pointer identifies what to wake
     void *target = curr->private;

     return try_to_wake_up(target, mode, wake_flags);
 }

poll，select對poll_table_entry的申請和釋放採用的是類似內存池的管理方式：先使用預分配的空間，預分配的空間不足時，分配一個內存頁，使用內存頁上的空間。

     C代碼   
     
   
 // 分配或使用已先前申請的 poll_table_entry,  
 static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) {  
     struct poll_table_page *table = p->table;  
   
     if (p->inline_index < N_INLINE_POLL_ENTRIES) {  
         return p->inline_entries + p->inline_index++;  
     }  
   
     if (!table || POLL_TABLE_FULL(table)) {  
         struct poll_table_page *new_table;  
         new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);  
         if (!new_table) {  
             p->error = -ENOMEM;  
             return NULL;  
         }  
         new_table->entry = new_table->entries;  
         new_table->next = table;  
         p->table = new_table;  
         table = new_table;  
     }  
     return table->entry++;  
 }  
   
 // 清理poll_wqueues 佔用的資源  
 void poll_freewait(struct poll_wqueues *pwq)  
 {  
     struct poll_table_page * p = pwq->table;  
     // 遍歷全部已分配的inline poll_table_entry  
     int i;  
     for (i = 0; i < pwq->inline_index; i++) {  
         free_poll_entry(pwq->inline_entries + i);  
     }  
     // 遍歷在poll_table_page上分配的inline poll_table_entry  
     // 並釋放poll_table_page  
     while (p) {  
         struct poll_table_entry * entry;  
         struct poll_table_page *old;  
         entry = p->entry;  
         do {  
             entry--;  
             free_poll_entry(entry);  
         } while (entry > p->entries);  
         old = p;  
         p = p->next;  
         free_page((unsigned long) old);  
     }  
 }  
 static void free_poll_entry(struct poll_table_entry *entry)  
 {  
     // 從等待隊列中刪除, 釋放文件引用計數  
     remove_wait_queue(entry->wait_address, &entry->wait);  
     fput(entry->filp);  
 }  

poll/select核心結構關係

下圖是 poll/select 實現公共部分的關係圖，包含了與文件之間的關係，以及函數之間的依賴。

poll的實現

     C代碼   
     
   
 // Structure used by poll
 struct pollfd {
     int fd;        // File descriptor
     short events;  // Event mask of interest
     short revents; // Returned event mask
 };
 // long sys_poll(struct pollfd *ufds, unsigned int nfds, long timeout_msecs)
 // poll system call entry: converts the millisecond timeout, runs do_sys_poll and,
 // when that returns -EINTR, arms the restart block and returns -ERESTART_RESTARTBLOCK.
 SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                 long, timeout_msecs)
 {
     struct timespec end_time, *to = NULL;
     struct restart_block *restart_block;
     int ret;

     if (timeout_msecs >= 0) {
         to = &end_time;
         // Turn the relative timeout in msec into an end time
         poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                                 NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
     }

     ret = do_sys_poll(ufds, nfds, to);
     if (ret != -EINTR) {
         return ret;
     }

     // Interrupted by a signal: record what is needed to call do_restart_poll later,
     // so that (per the article) the caller does not observe the interruption.
     restart_block = &current_thread_info()->restart_block;
     restart_block->fn = do_restart_poll; // function used for the restart
     restart_block->poll.ufds = ufds;
     restart_block->poll.nfds = nfds;
     if (timeout_msecs >= 0) {
         restart_block->poll.tv_sec = end_time.tv_sec;
         restart_block->poll.tv_nsec = end_time.tv_nsec;
         restart_block->poll.has_timeout = 1;
     } else {
         restart_block->poll.has_timeout = 0;
     }
     // Per the article, ERESTART_RESTARTBLOCK is not handed to the user process;
     // the system intercepts it and then calls do_restart_poll.
     return -ERESTART_RESTARTBLOCK;
 }
 // Copy the user's pollfd array into a chain of poll_list chunks, run do_poll on it and
 // copy each revents field back to user space.
 // Returns the value of do_poll on success, -EINVAL when nfds exceeds RLIMIT_NOFILE,
 // -EFAULT when a user copy fails and -ENOMEM when a chunk cannot be allocated.
 int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                 struct timespec *end_time)
 {
     struct poll_wqueues table;
     int err = -EFAULT, fdcount, len, size;
     /* Start with on-stack space to save memory and speed up access */
     long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
     struct poll_list *const head = (struct poll_list *)stack_pps;
     struct poll_list *walk = head;
     unsigned long todo = nfds;
     if (nfds > rlimit(RLIMIT_NOFILE)) {
         // More descriptors than the current process is allowed to have
         return -EINVAL;
     }
     // Copy the user-space data into the kernel, chunk by chunk
     len = min_t(unsigned int, nfds, N_STACK_PPS);
     for (;;) {
         walk->next = NULL;
         walk->len = len;
         if (!len) {
             break;
         }
         // Fill the entries of the current chunk
         if (copy_from_user(walk->entries, ufds + nfds-todo,
                            sizeof(struct pollfd) * walk->len)) {
             goto out_fds;
         }
         todo -= walk->len;
         if (!todo) {
             break;
         }
         // Stack chunk exhausted: the remainder goes into heap-allocated chunks
         len = min(todo, POLLFD_PER_PAGE);
         size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
         walk = walk->next = kmalloc(size, GFP_KERNEL);
         if (!walk) {
             err = -ENOMEM;
             goto out_fds;
         }
     }
     // Initialise the poll_wqueues; sets the _qproc callback to __pollwait
     poll_initwait(&table);
     fdcount = do_poll(nfds, head, &table, end_time);
     // Remove the nodes from the files' wait queues and release the entries
     poll_freewait(&table);
     // Copy the results back to user space
     for (walk = head; walk; walk = walk->next) {
         struct pollfd *fds = walk->entries;
         int j;
         // Each chunk has its own length; `len` only holds the size of the last chunk built
         for (j = 0; j < walk->len; j++, ufds++)
             if (__put_user(fds[j].revents, &ufds->revents)) {
                 goto out_fds;
             }
     }
     err = fdcount;
 out_fds:
     // Free the heap chunks (the head chunk lives on the stack)
     walk = head->next;
     while (walk) {
         struct poll_list *pos = walk;
         walk = walk->next;
         kfree(pos);
     }
     return err;
 }
 // The function that does the actual polling work (called from do_sys_poll).
 // Returns the number of ready descriptors, 0 on timeout, wait->error, or -EINTR
 // when a signal is pending and nothing is ready.
 static int do_poll(unsigned int nfds,  struct poll_list *list,
                    struct poll_wqueues *wait, struct timespec *end_time)
 {
     poll_table* pt = &wait->pt;
     ktime_t expire, *to = NULL;
     int timed_out = 0, count = 0;
     unsigned long slack = 0;
     if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
         // Timeout already expired: scan all descriptors once, then return
         pt = NULL;
         timed_out = 1;
     }
     if (end_time && !timed_out) {
         // Estimate the wait-time slack (nanoseconds, per the article)
         slack = select_estimate_accuracy(end_time);
     }
     // Walk the files and add the wake-up callback (pollwake) to each file's wait queue
     for (;;) {
         struct poll_list *walk;
         for (walk = list; walk != NULL; walk = walk->next) {
             struct pollfd * pfd, * pfd_end;
             pfd = walk->entries;
             pfd_end = pfd + walk->len;
             for (; pfd != pfd_end; pfd++) {
                 // do_pollfd adds a node and callback to the file's wait queue
                 // (only when pt is not NULL),
                 // checks the file's current state and sets the returned mask
                 if (do_pollfd(pfd, pt)) {
                     // This file is ready.
                     // No need to register wake-up callbacks on the remaining files.
                     count++;
                     pt = NULL;
                 }
             }
         }
         // Later passes must not add nodes to the wait queues again:
         // the first pass already registered everything that was needed
         pt = NULL;

         // The pass found no ready file
         if (!count) {
             count = wait->error;
             // A signal is pending
             if (signal_pending(current)) {
                 count = -EINTR;
             }
         }

         // Something is ready (or an error/signal was recorded), or the timeout expired
         if (count || timed_out) {
             break;
         }
         // Convert the end time to kernel time (ktime_t), once
         if (end_time && !to) {
             expire = timespec_to_ktime(*end_time);
             to = &expire;
         }
         // Wait for an event; on wake-up or timeout loop once more
         // to collect the event masks and count them.
         // Note that the nodes are still on the files' wait queues during that pass
         if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) {
             timed_out = 1;
         }
     }
     return count;
 }
   
   
 // Poll one pollfd: look up its file, optionally register a wait-queue callback through
 // pwait, and store the resulting event mask in pollfd->revents.
 // Returns that mask: 0 for a negative fd, POLLNVAL when the fd has no file.
 static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 {
     unsigned int mask;
     int fd;
     mask = 0;
     fd = pollfd->fd;
     if (fd >= 0) {
         int fput_needed;
         struct file * file;
         // Look up the file structure behind fd
         file = fget_light(fd, &fput_needed);
         mask = POLLNVAL;
         if (file != NULL) {
             // Without f_op or f_op->poll the file is treated as always ready
             mask = DEFAULT_POLLMASK;
             if (file->f_op && file->f_op->poll) {
                 if (pwait) {
                     // Record the events of interest; the poll_table member is named _key
                     pwait->_key = pollfd->events | POLLERR | POLLHUP;
                 }
                 // Register the callback (pollwake fires once the file is ready)
                 // and fetch the current readiness state
                 mask = file->f_op->poll(file, pwait);
             }
             mask &= pollfd->events | POLLERR | POLLHUP; // drop event bits that were not requested
             fput_light(file, fput_needed);// release the file
         }
     }
     pollfd->revents = mask; // publish the event state
     return mask;
 }
   
   
 static long do_restart_poll(struct restart_block *restart_block)  
 {  
     struct pollfd __user *ufds = restart_block->poll.ufds;  
     int nfds = restart_block->poll.nfds;  
     struct timespec *to = NULL, end_time;  
     int ret;  
     if (restart_block->poll.has_timeout) {  
         // 獲取先前的超時時間  
         end_time.tv_sec = restart_block->poll.tv_sec;  
         end_time.tv_nsec = restart_block->poll.tv_nsec;  
         to = &end_time;  
     }  
     ret = do_sys_poll(ufds, nfds, to); // 從新調用 do_sys_poll  
     if (ret == -EINTR) {  
         // 又被信號中斷了, 再次重啓  
         restart_block->fn = do_restart_poll;  
         ret = -ERESTART_RESTARTBLOCK;  
     }  
     return ret;  
 }  

select 實現

     C代碼   
     
   
 // Bitmap pointers used by select: three input sets (in/out/ex) and three result sets (res_*)
 typedef struct {
     unsigned long *in, *out, *ex;
     unsigned long *res_in, *res_out, *res_ex;
 } fd_set_bits;
 //  long sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
 // select system call entry: reads the optional timeout, runs core_sys_select and
 // passes its result through poll_select_copy_remaining.
 SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
                 fd_set __user *, exp, struct timeval __user *, tvp)
 {
     struct timeval tv;
     struct timespec end_time;
     struct timespec *to = NULL;
     int ret;

     if (tvp != NULL) {
         if (copy_from_user(&tv, tvp, sizeof(tv)) != 0) {
             return -EFAULT;
         }
         // Compute the end time from the relative timeout
         to = &end_time;
         if (poll_select_set_timeout(to,
                                     tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                                     (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC) != 0) {
             return -EINVAL;
         }
     }

     ret = core_sys_select(n, inp, outp, exp, to);
     // Copy the remaining time back to user space
     ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
     return ret;
 }
   
 // Copies the three user fd sets into kernel bitmaps, runs do_select and copies the
 // three result bitmaps back.
 // Returns do_select's result, or a negative errno (-EINVAL for n < 0, -ENOMEM,
 // -EFAULT, -ERESTARTNOHAND when nothing is ready and a signal is pending).
 int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                     fd_set __user *exp, struct timespec *end_time)
 {
     fd_set_bits fds;
     void *bits;
     int ret, max_fds;
     unsigned int size;
     struct fdtable *fdt;
     // Small requests use on-stack space to save memory and speed up access
     long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

     ret = -EINVAL;
     if (n < 0) {
         goto out_nofds;
     }

     rcu_read_lock();
     // Fetch the fdtable of the current process
     fdt = files_fdtable(current->files);
     max_fds = fdt->max_fds;
     rcu_read_unlock();
     // Clamp n to the size of the fd table
     if (n > max_fds) {
         n = max_fds;
     }

     size = FDS_BYTES(n);
     bits = stack_fds;
     // Six bitmaps are needed: in/out/ex plus the three result sets
     if (size > sizeof(stack_fds) / 6) {
         // Not enough stack space: allocate and use heap memory for all six bitmaps
         ret = -ENOMEM;
         bits = kmalloc(6 * size, GFP_KERNEL);
         if (!bits) {
             goto out_nofds;
         }
     }
     fds.in     = bits;
     fds.out    = bits +   size;
     fds.ex     = bits + 2*size;
     fds.res_in  = bits + 3*size;
     fds.res_out = bits + 4*size;
     fds.res_ex  = bits + 5*size;

     // Copy the user-space sets into the kernel
     if ((ret = get_fd_set(n, inp, fds.in)) ||
             (ret = get_fd_set(n, outp, fds.out)) ||
             (ret = get_fd_set(n, exp, fds.ex))) {
         goto out;
     }
     // Clear the result sets
     zero_fd_set(n, fds.res_in);
     zero_fd_set(n, fds.res_out);
     zero_fd_set(n, fds.res_ex);

     ret = do_select(n, &fds, end_time);

     if (ret < 0) {
         goto out;
     }
     if (!ret) {
         // Per the article, this return value is intercepted by the system, which
         // re-invokes sys_select() with the same arguments. NOTE(review): confirm
         // the exact restart semantics of ERESTARTNOHAND.
         ret = -ERESTARTNOHAND;
         if (signal_pending(current)) {
             goto out;
         }
         ret = 0;
     }

     // Copy the results to user space
     if (set_fd_set(n, inp, fds.res_in) ||
             set_fd_set(n, outp, fds.res_out) ||
             set_fd_set(n, exp, fds.res_ex)) {
         ret = -EFAULT;
     }

 out:
     // Only heap-allocated bitmaps need freeing
     if (bits != stack_fds) {
         kfree(bits);
     }
 out_nofds:
     return ret;
 }
   
 // Core select loop: scans descriptors 0..n-1 one bitmap word at a time, registers
 // wait-queue callbacks on the first pass, and sleeps until something is ready,
 // the timeout expires or a signal is pending.
 // Returns the number of ready (descriptor, set) pairs, 0, or a negative error.
 int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 {
     ktime_t expire, *to = NULL;
     struct poll_wqueues table;
     poll_table *wait;
     int retval, i, timed_out = 0;
     unsigned long slack = 0;

     rcu_read_lock();
     // Validate the descriptors in fds and obtain the current highest fd
     retval = max_select_fd(n, fds);
     rcu_read_unlock();

     if (retval < 0) {
         return retval;
     }
     n = retval;

     // Initialise the poll_wqueues; sets the _qproc callback to __pollwait
     poll_initwait(&table);
     wait = &table.pt;
     // Zero timeout: scan once without registering callbacks
     if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
         wait = NULL;
         timed_out = 1;
     }

     if (end_time && !timed_out) {
         // Estimate the slack for the wait
         slack = select_estimate_accuracy(end_time);
     }

     retval = 0;
     for (;;) {
         unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

         inp = fds->in;
         outp = fds->out;
         exp = fds->ex;
         rinp = fds->res_in;
         routp = fds->res_out;
         rexp = fds->res_ex;
         // Walk all descriptors; i is the file descriptor number
         for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
             unsigned long in, out, ex, all_bits, bit = 1, mask, j;
             unsigned long res_in = 0, res_out = 0, res_ex = 0;
             const struct file_operations *f_op = NULL;
             struct file *file = NULL;
             // Load the descriptors of the current bitmap word (slot)
             in = *inp++;
             out = *outp++;
             ex = *exp++;
             all_bits = in | out | ex;
             if (all_bits == 0) { // Nothing to watch in this slot, move to the next one
                 i += __NFDBITS;
                 continue;
             }

             for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
                 int fput_needed;
                 if (i >= n) {
                     break;
                 }
                 // Descriptor i is not being watched
                 if (!(bit & all_bits)) {
                     continue;
                 }
                 // Look up the file structure
                 file = fget_light(i, &fput_needed);
                 if (file) {
                     f_op = file->f_op;
                     // Without f_op (or f_op->poll) the file is treated as always ready
                     mask = DEFAULT_POLLMASK;
                     if (f_op && f_op->poll) {
                         // Set the mask of events to wait for
                         wait_key_set(wait, in, out, bit);
                         /*
                         Helper as quoted in the original article.
                         NOTE(review): as quoted it dereferences wait unconditionally, yet
                         wait can be NULL at this call site; check against the real source.

                         static inline void wait_key_set(poll_table *wait, unsigned long in,
                         unsigned long out, unsigned long bit)
                         {
                         wait->_key = POLLEX_SET;// (POLLPRI)
                         if (in & bit)
                         wait->_key |= POLLIN_SET;//(POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
                         if (out & bit)
                         wait->_key |= POLLOUT_SET;//POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
                         }
                         */
                         // Fetch the current readiness state and queue on the file's wait queue(s)
                         mask = (*f_op->poll)(file, wait);
                         // Same mechanism as poll
                     }
                     fput_light(file, fput_needed);
                     // File released above
                     // Check whether file i already has a ready event
                     if ((mask & POLLIN_SET) && (in & bit)) {
                         res_in |= bit;
                         retval++;
                         // Once something is ready, stop adding callbacks
                         // to the wait queues of the remaining files
                         wait = NULL;
                     }
                     if ((mask & POLLOUT_SET) && (out & bit)) {
                         res_out |= bit;
                         retval++;
                         wait = NULL;
                     }
                     if ((mask & POLLEX_SET) && (ex & bit)) {
                         res_ex |= bit;
                         retval++;
                         wait = NULL;
                     }
                 }
             }
             // Publish the result bits gathered for this slot
             if (res_in) {
                 *rinp = res_in;
             }
             if (res_out) {
                 *routp = res_out;
             }
             if (res_ex) {
                 *rexp = res_ex;
             }
             cond_resched();
         }
         wait = NULL; // Every callback that had to be registered has been registered
         if (retval || timed_out || signal_pending(current)) {
             break;   // A signal is pending, a watched event is ready, or the timeout expired
         }
         if (table.error) {
             retval = table.error; // An error was recorded in the poll_wqueues
             break;
         }
         // Convert the end time to kernel time (ktime_t), once
         if (end_time && !to) {
             expire = timespec_to_ktime(*end_time);
             to = &expire;
         }
         // Sleep until the timeout or a callback wake-up; the descriptors are scanned again afterwards
         if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                                    to, slack)) {
             timed_out = 1;
         }
     }

     poll_freewait(&table);

     return retval;
 }

epoll實現

epoll 的實現比poll/select 複雜一些，這是由於：
1. epoll_wait, epoll_ctl 的調用完全獨立開來，內核需要鎖機制對這些操作進行保護，並且需要持久地維護添加到epoll的文件
2. epoll本身也是文件，也可以被poll/select/epoll監視，這可能導致epoll之間循環喚醒的問題
3. 單個文件的狀態改變可能喚醒過多監聽在其上的epoll，產生喚醒風暴

epoll各個功能的實現要非常小心地面對這些問題，使得複雜度大大增加。

epoll的核心數據結構

     C代碼   
     
   
 // epoll的核心實現對應於一個epoll描述符  
 struct eventpoll {  
     spinlock_t lock;  
     struct mutex mtx;  
     wait_queue_head_t wq; // sys_epoll_wait() 等待在這裏  
     // f_op->poll()  使用的, 被其餘事件通知機制利用的wait_address  
     wait_queue_head_t poll_wait;  
     /* 已就緒的須要檢查的epitem 列表 */  
     struct list_head rdllist;  
     /* 保存全部加入到當前epoll的文件對應的epitem*/  
     struct rb_root rbr;  
     // 當正在向用戶空間複製數據時, 產生的可用文件  
     struct epitem *ovflist;  
     /* The user that created the eventpoll descriptor */  
     struct user_struct *user;  
     struct file *file;  
     /*優化循環檢查，避免循環檢查中重複的遍歷 */  
     int visited;  
     struct list_head visited_list_link;  
 }  
   
 // Represents one file that has been added to an epoll
 struct epitem {
     // Red-black tree node linking into the eventpoll
     struct rb_node rbn;
     // Node linking into eventpoll.rdllist
     struct list_head rdllink;
     // Pointer chaining this item into ovflist
     struct epitem *next;
     /* File descriptor information (fd + file); the red-black tree key */
     struct epoll_filefd ffd;
     /* Number of active wait queue attached to poll operations */
     int nwait;
     // List of wait-queue hooks (eppoll_entry) for this file.
     // Several kinds of events may be watched on one file, and they may
     // live on different wait queues (depending on how the file type
     // implements them), hence a list.
     struct list_head pwqlist;
     // The eventpoll that owns this epitem
     struct eventpoll *ep;
     /* List header used to link this item to the "struct file" items list */
     struct list_head fllink;
     /* User data passed in through epoll_ctl */
     struct epoll_event event;
 };
   
 // (file, fd) pair identifying a watched file; embedded in epitem as ffd
 struct epoll_filefd {
     struct file *file;
     int fd;
 };
   
 // Tied to one wait_queue_head of one file: a single file may have several events
 // being waited on, and those events may use different wait queues
 struct eppoll_entry {
     // List struct epitem.pwqlist
     struct list_head llink;
     // Owning epitem
     struct epitem *base;
     // Node added to the wait queue
     wait_queue_t wait;
     // Head of the file's wait queue
     wait_queue_head_t *whead;
 };
   
 // epoll_event as used by user space
 struct epoll_event {
     __u32 events; // event mask
     __u64 data;   // user data
 } EPOLL_PACKED;

文件系統初始化和epoll_create

     C代碼   
     
   
 // epoll file-system support
 // Initialisation routine; per the article it is called during system start-up

 static int __init eventpoll_init(void)
 {
     struct sysinfo si;

     si_meminfo(&si);
     // Cap on how many descriptors can be added to epoll:
     // 1/25 of (totalram - totalhigh), converted with PAGE_SHIFT and divided by EP_ITEM_COST

     max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
                        EP_ITEM_COST;
     BUG_ON(max_user_watches < 0);

     // Initialise the nested-call (recursion check) lists
    ep_nested_calls_init(&poll_loop_ncalls);
     ep_nested_calls_init(&poll_safewake_ncalls);
     ep_nested_calls_init(&poll_readywalk_ncalls);
     // Slab caches used by epoll to allocate epitem and eppoll_entry objects respectively
     epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
                                   0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
     pwq_cache = kmem_cache_create("eventpoll_pwq",
                                   sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);

     return 0;
 }
   
   
 // Legacy entry point: size is only validated (must be positive) and is otherwise
 // unused; the work is delegated to sys_epoll_create1 with no flags.
 SYSCALL_DEFINE1(epoll_create, int, size)
 {
     if (size > 0) {
         return sys_epoll_create1(0);
     }

     return -EINVAL;
 }
   
 // Create an epoll instance and return a new file descriptor for it.
 // Returns -EINVAL for flags other than EPOLL_CLOEXEC, or the error reported by
 // ep_alloc, get_unused_fd_flags or anon_inode_getfile.
 SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
     int error, fd;
     struct eventpoll *ep = NULL;
     struct file *file;

     /* Check the EPOLL_* constant for consistency.  */
     BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

     if (flags & ~EPOLL_CLOEXEC) {
         return -EINVAL;
     }
     /*
      * Create the internal data structure ("struct eventpoll").
      */
     error = ep_alloc(&ep);
     if (error < 0) {
         return error;
     }
     /*
      * Creates all the items needed to setup an eventpoll file. That is,
      * a file structure and a free file descriptor.
      */
     fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
     if (fd < 0) {
         error = fd;
         goto out_free_ep;
     }
     // Attach the epoll file operations; epoll is itself a file and therefore provides poll as well
     file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                               O_RDWR | (flags & O_CLOEXEC));
     if (IS_ERR(file)) {
         error = PTR_ERR(file);
         goto out_free_fd;
     }
     // Finish initialising ep before fd_install() makes the descriptor reachable:
     // once installed, ep must not be touched in a way that assumes exclusive access.
     ep->file = file;
     fd_install(fd, file);
     return fd;

 out_free_fd:
     put_unused_fd(fd);
 out_free_ep:
     ep_free(ep);
     return error;
 }

epoll中的遞歸死循環和深度檢查

遞歸深度檢測(ep_call_nested)

epoll本身也是文件，也可以被poll/select/epoll監視，如果epoll之間互相監視就有可能導致死循環。epoll的實現中，所有可能產生遞歸調用的函數都由函數ep_call_nested進行包裹，遞歸調用過程中出現死循環或遞歸過深就會打破死循環和遞歸調用直接返回。該函數的實現依賴於一個外部的全局鏈表nested_call_node(不同的函數調用使用不同的節點)，每次調用可能發生遞歸的函數(nproc)就向鏈表中添加一個包含當前函數調用上下文ctx(進程，CPU，或epoll文件)和處理的對象標識cookie的節點，通過檢測是否有相同的節點就可以知道是否發生了死循環，檢查鏈表中同一上下文包含的節點個數就可以知道遞歸的深度。以下就是這一過程的源碼。

     C代碼   
     
   
 // One in-flight call tracked by ep_call_nested
 struct nested_call_node {
     struct list_head llink; // link into nested_calls.tasks_call_list
     void *cookie;   // identifies the task/object being processed
     void *ctx;      // identifies the execution context
 };
 // List of in-flight nested_call_node entries plus the lock protecting it
 struct nested_calls {
     struct list_head tasks_call_list;
     spinlock_t lock;
 };
   
 // 全局的不一樣調用使用的鏈表  
 // 死循環檢查和喚醒風暴檢查鏈表  
 static nested_call_node poll_loop_ncalls;  
 // 喚醒時使用的檢查鏈表  
 static nested_call_node poll_safewake_ncalls;  
 // 掃描readylist 時使用的鏈表  
 static nested_call_node poll_readywalk_ncalls;  
   
   
 // 限制epoll 中直接或間接遞歸調用的深度並防止死循環  
 // ctx: 任務運行上下文(進程, CPU 等)  
 // cookie: 每一個任務的標識  
 // priv: 任務運行須要的私有數據  
 // 若是用面嚮對象語言實現應該就會是一個wapper類  
 static int ep_call_nested(struct nested_calls *ncalls, int max_nests,  
                           int (*nproc)(void *, void *, int), void *priv,  
                           void *cookie, void *ctx)  
 {  
     int error, call_nests = 0;  
     unsigned long flags;  
     struct list_head *lsthead = &ncalls->tasks_call_list;  
     struct nested_call_node *tncur;  
     struct nested_call_node tnode;  
     spin_lock_irqsave(&ncalls->lock, flags);  
     // 檢查原有的嵌套調用鏈表ncalls, 查看是否有深度超過限制的狀況  
     list_for_each_entry(tncur, lsthead, llink) {  
         // 同一上下文中(ctx)有相同的任務(cookie)說明產生了死循環  
         // 同一上下文的遞歸深度call_nests 超過限制  
         if (tncur->ctx == ctx &&  
                 (tncur->cookie == cookie || ++call_nests > max_nests)) {  
             error = -1;  
         }  
         goto out_unlock;  
     }  
     /* 將當前的任務請求添加到調用列表*/  
     tnode.ctx = ctx;  
     tnode.cookie = cookie;  
     list_add(&tnode.llink, lsthead);  
     spin_unlock_irqrestore(&ncalls->lock, flags);  
     /* nproc 可能會致使遞歸調用(直接或間接)ep_call_nested 
          * 若是發生遞歸調用, 那麼在此函數返回以前, 
          * ncalls 又會被加入額外的節點, 
          * 這樣經過前面的檢測就能夠知道遞歸調用的深度 
       */  
     error = (*nproc)(priv, cookie, call_nests);  
     /* 從鏈表中刪除當前任務*/  
     spin_lock_irqsave(&ncalls->lock, flags);  
     list_del(&tnode.llink);  
 out_unlock:  
     spin_unlock_irqrestore(&ncalls->lock, flags);  
     return error;  
 }  

循環檢測(ep_loop_check)

循環檢查(ep_loop_check)：該函數利用ep_call_nested遞歸調用ep_loop_check_proc，來檢測epoll之間相互監視造成的死循環。因爲ep_call_nested中已經對死循環和過深的遞歸做了檢查，實際的ep_loop_check_proc的實現只是遞歸調用自己。其中的visited_list和visited標記完全是爲了優化處理速度，如果沒有visited_list和visited標記，函數也是能夠工作的。該函數中的上下文就是當前的進程，cookie就是正在遍歷的epoll結構。

     C代碼   
     
   
 static LIST_HEAD(visited_list);  
 // 檢查 file (epoll)和ep 之間是否有循環  
 static int ep_loop_check(struct eventpoll *ep, struct file *file)  
 {  
     int ret;  
     struct eventpoll *ep_cur, *ep_next;  
   
     ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,  
                          ep_loop_check_proc, file, ep, current);  
     /* 清除鏈表和標誌 */  
     list_for_each_entry_safe(ep_cur, ep_next, &visited_list,  
                              visited_list_link) {  
         ep_cur->visited = 0;  
         list_del(&ep_cur->visited_list_link);  
     }  
     return ret;  
 }  
   
 static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)  
 {  
     int error = 0;  
     struct file *file = priv;  
     struct eventpoll *ep = file->private_data;  
     struct eventpoll *ep_tovisit;  
     struct rb_node *rbp;  
     struct epitem *epi;  
   
     mutex_lock_nested(&ep->mtx, call_nests + 1);  
     // 標記當前爲已遍歷  
     ep->visited = 1;  
     list_add(&ep->visited_list_link, &visited_list);  
     // 遍歷全部ep 監視的文件  
     for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {  
         epi = rb_entry(rbp, struct epitem, rbn);  
         if (unlikely(is_file_epoll(epi->ffd.file))) {  
             ep_tovisit = epi->ffd.file->private_data;  
             // 跳過先前已遍歷的, 避免循環檢查  
             if (ep_tovisit->visited) {  
                 continue;  
             }  
             // 全部ep監視的未遍歷的epoll  
             error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,  
                                    ep_loop_check_proc, epi->ffd.file,  
                                    ep_tovisit, current);  
             if (error != 0) {  
                 break;  
             }  
         } else {  
             // 文件不在tfile_check_list 中, 添加  
             // 最外層的epoll 須要檢查子epoll監視的文件  
             if (list_empty(&epi->ffd.file->f_tfile_llink))  
                 list_add(&epi->ffd.file->f_tfile_llink,  
                          &tfile_check_list);  
         }  
     }  
     mutex_unlock(&ep->mtx);  
   
     return error;  
 }  

喚醒風暴檢測（reverse_path_check）

當文件狀態發生改變時，會喚醒監聽在其上的epoll文件，而這個epoll文件還可能喚醒其他的epoll文件，這種連續的喚醒就形成了一個喚醒路徑，所有的喚醒路徑就形成了一個有向圖。如果文件對應的epoll喚醒有向圖的節點過多，那麼文件狀態的改變就會喚醒所有的這些epoll(可能會喚醒很多進程，這樣的開銷是很大的)，而實際上一個文件經過少數epoll處理以後就可能從就緒轉到未就緒，剩餘的epoll雖然認爲文件已就緒，而實際上經過某些處理後文件已不可用。epoll的實現中考慮到了此問題，在每次添加新文件到epoll中時，就會首先檢查是否會出現這樣的喚醒風暴。

該函數的實現邏輯是這樣的：遞歸調用reverse_path_check_proc遍歷監聽在當前文件上的epoll文件，在reverse_path_check_proc中統計並檢查不同路徑深度上epoll的個數，從而避免產生喚醒風暴。

     C代碼   
     
   
 #define PATH_ARR_SIZE 5
 // Wakeup-storm limits used by the EPOLL_CTL_ADD reverse path check.
 // path_limits[d] caps how many paths may be counted at nesting depth d.
 // Depth 0 is exempt in path_count_inc(), so path_limits[0] is never compared.
 static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
 // Running per-depth tallies; reset by path_count_init().
 static int path_count[PATH_ARR_SIZE];

 // Account for one more path at nesting depth `nests`.
 // Returns 0 while the tally stays within path_limits[nests] (and always for
 // depth 0, which is unlimited and not counted), -1 once the limit is exceeded.
 // The caller must keep nests inside [0, PATH_ARR_SIZE).
 static int path_count_inc(int nests)
 {
     if (nests != 0 && ++path_count[nests] > path_limits[nests]) {
         return -1;
     }
     return 0;
 }

 // Reset every per-depth tally to zero.
 static void path_count_init(void)
 {
     int idx = PATH_ARR_SIZE;

     while (idx-- > 0) {
         path_count[idx] = 0;
     }
 }
   
 // Wakeup-storm check: for every file queued on the global tfile_check_list,
 // reset the per-depth counters and walk the epoll instances watching it
 // (reverse_path_check_proc) under ep_call_nested(), which also bounds the
 // recursion depth. Stops at the first file that fails.
 // Returns 0 when all files pass, otherwise the first non-zero result.
 static int reverse_path_check(void)
 {
     struct file *pos;
     int rc = 0;

     list_for_each_entry(pos, &tfile_check_list, f_tfile_llink) {
         // Counters are kept per file, so start from zero each time.
         path_count_init();
         rc = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
                             reverse_path_check_proc, pos, pos, current);
         if (rc) {
             break;
         }
     }

     return rc;
 }
 // Recursive step of the wakeup-storm check, invoked through ep_call_nested().
 //
 // priv:       the file whose watchers (entries on f_ep_links) are walked
 // cookie:     not used here (ep_call_nested uses it for cycle detection)
 // call_nests: current nesting depth, i.e. the length of the path so far
 //
 // Returns 0 when every path is within limits, -1 when a per-depth limit is
 // exceeded, or the non-zero result of a failed nested ep_call_nested().
 static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
 {
     struct file *file = priv;
     struct file *watcher;
     struct epitem *item;
     int rc = 0;

     list_for_each_entry(item, &file->f_ep_links, fllink) {
         // The epoll file that watches `file` through this item.
         watcher = item->ep->file;

         if (!is_file_epoll(watcher)) {
             // Not expected: the owner of an epitem should be an epoll file.
             printk(KERN_ERR "reverse_path_check_proc: "
                    "file is not an ep!\n");
             continue;
         }

         if (list_empty(&watcher->f_ep_links)) {
             // Nobody watches this epoll instance: the path ends here,
             // so count it at the current depth.
             rc = path_count_inc(call_nests) ? -1 : 0;
         } else {
             // This epoll instance is itself watched: keep following.
             rc = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
                                 reverse_path_check_proc,
                                 watcher, watcher, current);
         }
         if (rc != 0) {
             break;
         }
     }

     return rc;
 }

epoll 的喚醒過程

     C代碼   
     
   
 // Wake the waiters on `wq` through ep_call_nested(), so that chained wakeups
 // between epoll instances are depth-limited and cycles are broken.
 // The current CPU number serves as the context and `wq` as the cookie.
 static void ep_poll_safewake(wait_queue_head_t *wq)
 {
     int cpu = get_cpu();
     void *ctx = (void *) (long) cpu;

     ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
                    ep_poll_wakeup_proc, NULL, wq, ctx);

     put_cpu();
 }
   
 static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)  
 {  
     ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,  
                       1 + call_nests);  
     return 0;  
 }  
   
 // Wake the waiters on `wqueue`, reporting `events`.
 // `subclass` is accepted but not used by this version of the function.
 static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
                                      unsigned long events, int subclass)
 {
     // Wakes everything currently waiting on this epoll fd
     // (select/poll/epoll callers). If a waiter is itself an epoll
     // instance, it may in turn wake further epoll instances, producing
     // a chain reaction.
     // NOTE (original author): this is quite likely to be called from
     // interrupt context.
     wake_up_poll(wqueue, events);
 }

epoll_ctl

     C代碼   
     
   
 // long epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
 //
 // Add, remove or modify the registration of `fd` on the epoll instance
 // `epfd`.
 //
 // Returns the result of ep_insert/ep_remove/ep_modify, or a negative errno
 // set below:
 //   -EFAULT  the event structure could not be copied from user space
 //   -EBADF   epfd or fd does not refer to an open file
 //   -EPERM   the target file provides no poll operation
 //   -EINVAL  epfd is not an epoll file, fd == epfd, or op is unknown
 //   -ELOOP   adding the target epoll file failed the loop check
 //   -EEXIST  ADD of an already registered fd
 //   -ENOENT  DEL/MOD of an fd that is not registered

 SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                 struct epoll_event __user *, event)
 {
     int error;
     int did_lock_epmutex = 0;
     struct file *file, *tfile;
     struct eventpoll *ep;
     struct epitem *epi;
     struct epoll_event epds;

     error = -EFAULT;
     if (ep_op_has_event(op) &&
             // Copy the user-space event description into the kernel.
             copy_from_user(&epds, event, sizeof(struct epoll_event))) {
         goto error_return;
     }

     // Look up the file behind epfd.
     error = -EBADF;
     file = fget(epfd);
     if (!file) {
         goto error_return;
     }

     // Look up the target file.
     tfile = fget(fd);
     if (!tfile) {
         goto error_fput;
     }

     // The target file must provide a poll operation.
     error = -EPERM;
     if (!tfile->f_op || !tfile->f_op->poll) {
         goto error_tgt_fput;
     }

     // Reject adding an epoll fd to itself, and an epfd that is not epoll.
     error = -EINVAL;
     if (file == tfile || !is_file_epoll(file)) {
         goto error_tgt_fput;
     }

     // The eventpoll structure behind the epoll file.
     ep = file->private_data;

     // EPOLL_CTL_MOD does not need the global epmutex.
     if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
         mutex_lock(&epmutex);
         did_lock_epmutex = 1;
     }
     if (op == EPOLL_CTL_ADD) {
         if (is_file_epoll(tfile)) {
             error = -ELOOP;
             // The target is an epoll file too: check for inclusion loops.
             if (ep_loop_check(ep, tfile) != 0) {
                 // ep_loop_check_proc() may already have queued files on
                 // tfile_check_list; empty it before bailing out so no
                 // stale entries survive this failed call.
                 clear_tfile_check_list();
                 goto error_tgt_fput;
             }
         } else
         {
             // Queue the target on the global tfile_check_list.
             list_add(&tfile->f_tfile_llink, &tfile_check_list);
         }
     }

     mutex_lock_nested(&ep->mtx, 0);

     // Look up the epitem in the rbtree, keyed by (tfile, fd).
     epi = ep_find(ep, tfile, fd);

     error = -EINVAL;
     switch (op) {
     case EPOLL_CTL_ADD:
         if (!epi) {
             // Not registered yet: POLLERR and POLLHUP are always added
             // to the requested events.
             epds.events |= POLLERR | POLLHUP;
             error = ep_insert(ep, &epds, tfile, fd);
         } else {
             error = -EEXIST;
         }
         // Empty the file check list built for this ADD.
         clear_tfile_check_list();
         break;
     case EPOLL_CTL_DEL:
         if (epi) {
             error = ep_remove(ep, epi);
         } else {
             error = -ENOENT;
         }
         break;
     case EPOLL_CTL_MOD:
         if (epi) {
             epds.events |= POLLERR | POLLHUP;
             error = ep_modify(ep, epi, &epds);
         } else {
             error = -ENOENT;
         }
         break;
     }
     mutex_unlock(&ep->mtx);

 error_tgt_fput:
     if (did_lock_epmutex) {
         mutex_unlock(&epmutex);
     }

     fput(tfile);
 error_fput:
     fput(file);
 error_return:

     return error;
 }

EPOLL_CTL_ADD 實現

     C代碼   
     
   
 // EPOLL_CTL_ADD: register `tfile`/`fd` on `ep` with the interest set in
 // `event`.
 //
 // Returns 0 on success, or:
 //   -ENOSPC  the user's watch count has reached max_user_watches
 //   -ENOMEM  the epitem could not be allocated, or the target's poll
 //            call left epi->nwait negative
 //   -EINVAL  reverse_path_check() rejected the new registration
 // The unwind comment below states that the caller holds ep->mtx.
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
                      struct file *tfile, int fd)
 {
     int error, revents, pwake = 0;
     unsigned long flags;
     long user_watches;
     struct epitem *epi;
     struct ep_pqueue epq;
     /* 
     struct ep_pqueue { 
         poll_table pt; 
         struct epitem *epi; 
     }; 
     */

     // Enforce the per-user limit on watched files.
     user_watches = atomic_long_read(&ep->user->epoll_watches);
     if (unlikely(user_watches >= max_user_watches)) {
         return -ENOSPC;
     }

     // Allocate and initialise the epitem.
     if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) {
         return -ENOMEM;
     }

     INIT_LIST_HEAD(&epi->rdllink);
     INIT_LIST_HEAD(&epi->fllink);
     INIT_LIST_HEAD(&epi->pwqlist);
     epi->ep = ep;
     // Set up the rbtree key (file, fd).
     ep_set_ffd(&epi->ffd, tfile, fd);
     // Plain copy of the caller's event structure.
     epi->event = *event;
     epi->nwait = 0;
     epi->next = EP_UNACTIVE_PTR;

     // Set up the temporary poll-queue wrapper.
     epq.epi = epi;
     init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
     // Event mask of interest.
     epq.pt._key = event->events;
     // The target's poll method is expected to call ep_ptable_queue_proc,
     // which hooks a callback onto the file's wait queue head(s); the call
     // also returns the file's current readiness.
     revents = tfile->f_op->poll(tfile, &epq.pt);

     // A negative nwait signals that hooking the wait queues failed.
     error = -ENOMEM;
     if (epi->nwait < 0) { // the f_op->poll step failed
         goto error_unregister;
     }
     // Link this epitem into the file's f_ep_links list.
     spin_lock(&tfile->f_lock);
     list_add_tail(&epi->fllink, &tfile->f_ep_links);
     spin_unlock(&tfile->f_lock);

     // Insert the epitem into the rbtree.
     ep_rbtree_insert(ep, epi);

     /* now check if we've created too many backpaths */
     error = -EINVAL;
     if (reverse_path_check()) {
         goto error_remove_epi;
     }

     spin_lock_irqsave(&ep->lock, flags);

     /* The file is already ready: put it on the ready list rdllist. */
     if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
         list_add_tail(&epi->rdllink, &ep->rdllist);


         if (waitqueue_active(&ep->wq))
             // Wake tasks sleeping on ep->wq (epoll_wait callers).
         {
             wake_up_locked(&ep->wq);
         }
         // Waiters polling the epoll file itself are only noted here;
         // they are woken after ep->lock has been released.
         if (waitqueue_active(&ep->poll_wait)) {
             pwake++;
         }
     }

     spin_unlock_irqrestore(&ep->lock, flags);

     atomic_long_inc(&ep->user->epoll_watches);

     if (pwake)
         // Safely notify the tasks polling the epoll file itself.
     {
         ep_poll_safewake(&ep->poll_wait);
     }

     return 0;

 error_remove_epi:
     spin_lock(&tfile->f_lock);
     // Unlink the epitem from the file.
     if (ep_is_linked(&epi->fllink)) {
         list_del_init(&epi->fllink);
     }
     spin_unlock(&tfile->f_lock);

     // Remove the epitem from the rbtree.
     rb_erase(&epi->rbn, &ep->rbr);

 error_unregister:
     // Unhook from the file's wait queues and release the poll entries
     // associated with this epitem.
     ep_unregister_pollwait(ep, epi);

     /* 
      * We need to do this because an event could have been arrived on some 
      * allocated wait queue. Note that we don't care about the ep->ovflist 
      * list, since that is used/cleaned only inside a section bound by "mtx". 
      * And ep_insert() is called with "mtx" held. 
      */
     // Take the item off the ready list in case a callback already queued it.
     spin_lock_irqsave(&ep->lock, flags);
     if (ep_is_linked(&epi->rdllink)) {
         list_del_init(&epi->rdllink);
     }
     spin_unlock_irqrestore(&ep->lock, flags);

     // Free the epitem.
     kmem_cache_free(epi_cache, epi);

     return error;
 }

EPOLL_CTL_DEL

EPOLL_CTL_DEL 的實現調用的是 ep_remove 函數，該函數只是清除ADD時添加的各種結構；EPOLL_CTL_MOD 的實現調用的是ep_modify，在ep_modify中用新的事件掩碼調用f_op->poll，檢測事件是否已可用，如果可用就直接喚醒epoll。這兩個的實現與EPOLL_CTL_ADD 類似，代碼上比較清晰，這裏就不具體分析了。

     C代碼   
     
   
 static int ep_remove(struct eventpoll *ep, struct epitem *epi)  
 {  
     unsigned long flags;  
     struct file *file = epi->ffd.file;  
   
     /* 
      * Removes poll wait queue hooks. We _have_ to do this without holding 
      * the "ep->lock" otherwise a deadlock might occur. This because of the 
      * sequence of the lock acquisition. Here we do "ep->lock" then the wait 
      * queue head lock when unregistering the wait queue. The wakeup callback 
      * will run by holding the wait queue head lock and will call our callback 
      * that will try to get "ep->lock". 
      */  
     ep_unregister_pollwait(ep, epi);  
   
     /* Remove the current item from the list of epoll hooks */  
     spin_lock(&file->f_lock);  
     if (ep_is_linked(&epi->fllink))  
         list_del_init(&epi->fllink);  
     spin_unlock(&file->f_lock);  
   
     rb_erase(&epi->rbn, &ep->rbr);  
   
     spin_lock_irqsave(&ep->lock, flags);  
     if (ep_is_linked(&epi->rdllink))  
         list_del_init(&epi->rdllink);  
     spin_unlock_irqrestore(&ep->lock, flags);  
   
     /* At this point it is safe to free the eventpoll item */  
     kmem_cache_free(epi_cache, epi);  
   
     atomic_long_dec(&ep->user->epoll_watches);  
   
     return 0;  
 }  

     C代碼   
     
   
 /* 
  * Modify the interest event mask by dropping an event if the new mask 
  * has a match in the current file status. Must be called with "mtx" held. 
  */  
 static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)  
 {  
     int pwake = 0;  
     unsigned int revents;  
     poll_table pt;  
   
     init_poll_funcptr(&pt, NULL);  
   
     /* 
      * Set the new event interest mask before calling f_op->poll(); 
      * otherwise we might miss an event that happens between the 
      * f_op->poll() call and the new event set registering. 
      */  
     epi->event.events = event->events;  
     pt._key = event->events;  
     epi->event.data = event->data; /* protected by mtx */  
   
     /* 
      * Get current event bits. We can safely use the file* here because 
      * its usage count has been increased by the caller of this function. 
      */  
     revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);  
   
     /* 
      * If the item is "hot" and it is not registered inside the ready 
      * list, push it inside. 
      */  
     if (revents & event->events) {  
         spin_lock_irq(&ep->lock);  
         if (!ep_is_linked(&epi->rdllink)) {  
             list_add_tail(&epi->rdllink, &ep->rdllist);  
   
             /* Notify waiting tasks that events are available */  
             if (waitqueue_active(&ep->wq))  
                 wake_up_locked(&ep->wq);  
             if (waitqueue_active(&ep->poll_wait))  
                 pwake++;  
         }  
         spin_unlock_irq(&ep->lock);  
     }  
   
     /* We have to call this outside the lock */  
     if (pwake)  
         ep_poll_safewake(&ep->poll_wait);  
   
     return 0;  
 }  

epoll_wait

     C代碼   
     
   
 /*
  * epoll_wait implementation.
  *
  * Validates the arguments, resolves epfd to its eventpoll structure and
  * hands over to ep_poll(). Returns ep_poll()'s result, or:
  *   -EINVAL  maxevents is out of range, or epfd is not an epoll file
  *   -EFAULT  the user buffer is not writable for maxevents entries
  *   -EBADF   epfd does not refer to an open file
  */

 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
                 int, maxevents, int, timeout)
 {
     struct file *epfile;
     int rc;

     /* Sanity-check the requested number of events. */
     if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) {
         return -EINVAL;
     }

     /* The whole output array must be writable by the caller. */
     if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
         return -EFAULT;
     }

     /* Get the "struct file *" for the eventpoll file. */
     epfile = fget(epfd);
     if (!epfile) {
         return -EBADF;
     }

     if (is_file_epoll(epfile)) {
         /* private_data holds the eventpoll; wait for events on it. */
         struct eventpoll *ep = epfile->private_data;

         rc = ep_poll(ep, events, maxevents, timeout);
     } else {
         rc = -EINVAL;
     }

     fput(epfile);
     return rc;
 }
   
 // Core of epoll_wait: wait until events are available on `ep`, the timeout
 // expires or a signal is pending, then try to deliver events to user space.
 //
 // events/maxevents: user buffer and its capacity, passed to ep_send_events()
 // timeout:          > 0 arms a timer, == 0 checks once without sleeping,
 //                   < 0 leaves `to` NULL (presumably: no timeout)
 //
 // Returns -EINTR when a signal was pending, otherwise the value returned by
 // ep_send_events(), or 0 when nothing was delivered.
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                    int maxevents, long timeout)
 {
     int res = 0, eavail, timed_out = 0;
     unsigned long flags;
     long slack = 0;
     wait_queue_t wait;
     ktime_t expires, *to = NULL;

     if (timeout > 0) {
         // Convert the timeout into kernel time (used with
         // HRTIMER_MODE_ABS below).
         struct timespec end_time = ep_set_mstimeout(timeout);

         slack = select_estimate_accuracy(&end_time);
         to = &expires;
         *to = timespec_to_ktime(end_time);
     } else if (timeout == 0) {
         // Already "timed out": skip sleeping and go straight to the
         // ready-list check. check_events expects ep->lock to be held.
         timed_out = 1;
         spin_lock_irqsave(&ep->lock, flags);
         goto check_events;
     }

 fetch_events:
     spin_lock_irqsave(&ep->lock, flags);

     // Nothing available: both the ready list and ovflist are empty.
     if (!ep_events_available(ep)) {

         // Queue the current task as an exclusive waiter on ep->wq.
         init_waitqueue_entry(&wait, current);
         __add_wait_queue_exclusive(&ep->wq, &wait);

         for (;;) {
             /* 
              * We don't want to sleep if the ep_poll_callback() sends us 
              * a wakeup in between. That's why we set the task state 
              * to TASK_INTERRUPTIBLE before doing the checks. 
              */
             set_current_state(TASK_INTERRUPTIBLE);
             if (ep_events_available(ep) || timed_out) {
                 break;
             }
             if (signal_pending(current)) {
                 res = -EINTR;
                 break;
             }

             spin_unlock_irqrestore(&ep->lock, flags);
             // Sleep until woken or until the timer fires; a zero return
             // is treated as "timed out".
             if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
                 timed_out = 1;
             }

             spin_lock_irqsave(&ep->lock, flags);
         }
       
         __remove_wait_queue(&ep->wq, &wait);

         set_current_state(TASK_RUNNING);
     }
 check_events:
     // Check once more whether events are available (ep->lock is held).
     eavail = ep_events_available(ep);

     spin_unlock_irqrestore(&ep->lock, flags);

     /* 
      * Try to transfer events to user space. In case we get 0 events and 
      * there's still timeout left over, we go trying again in search of 
      * more luck. 
      */
     if (!res && eavail   
             && !(res = ep_send_events(ep, events, maxevents)) // copy events to user space
             && !timed_out) // nothing was delivered and no timeout yet: wait again
             {
         goto fetch_events;
     }

     return res;
 }
   
   
 // Non-zero when `ep` has something to report: its ready list is non-empty
 // or ovflist is not at the EP_UNACTIVE_PTR sentinel.
 static inline int ep_events_available(struct eventpoll *ep)
 {
     return !(list_empty(&ep->rdllist) && ep->ovflist == EP_UNACTIVE_PTR);
 }
   
 // Argument bundle handed from ep_send_events() to ep_send_events_proc().
 struct ep_send_events_data {
     int maxevents;                      // capacity of the user buffer, in events
     struct epoll_event __user *events;  // user-space buffer to fill
 };
   
 // Deliver ready events of `ep` to the user buffer `events` (at most
 // `maxevents` entries) by running ep_send_events_proc over the ready list
 // at depth 0. Returns whatever ep_scan_ready_list() returns.
 static int ep_send_events(struct eventpoll *ep,
                           struct epoll_event __user *events, int maxevents)
 {
     struct ep_send_events_data request = {
         .maxevents = maxevents,
         .events = events,
     };

     return ep_scan_ready_list(ep, ep_send_events_proc, &request, 0);
 }
   
 static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,  
                                void *priv)  
 {  
     struct ep_send_events_data *esed = priv;  
     int eventcnt;  
     unsigned int revents;  
     struct epitem *epi;  
     struct epoll_event __user *uevent;  
   
     // 遍歷已就緒鏈表  
     for (eventcnt = 0, uevent = esed->events;  
             !list_empty(head) && eventcnt < esed->maxevents;) {  
         epi = list_first_entry(head, struct epitem, rdllink);  
   
         list_del_init(&epi->rdllink);  
         // 獲取ready 事件掩碼  
         revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &  
                   epi->event.events;  
   
         /* 
          * If the event mask intersect the caller-requested one, 
          * deliver the event to userspace. Again, ep_scan_ready_list() 
          * is holding "mtx", so no operations coming from userspace 
          * can change the item. 
          */  
         if (revents) {  
             // 事件就緒, 複製到用戶空間  
             if (__put_user(revents, &uevent->events) ||  
                     __put_user(epi->event.data, &uevent->data)) {  
                 list_add(&epi->rdllink, head);  
                 return eventcnt ? eventcnt : -EFAULT;  
             }  
             eventcnt++;  
             uevent++;  
             if (epi->event.events & EPOLLONESHOT) {  
                 epi->event.events &= EP_PRIVATE_BITS;  
             } else if (!(epi->event.events & EPOLLET)) {  
                 // 不是邊緣模式, 再次添加到ready list,  
                 // 下次epoll_wait 時直接進入此函數檢查ready list是否仍然繼續  
                 list_add_tail(&epi->rdllink, &ep->rdllist);  
             }  
             // 若是是邊緣模式, 只有當文件狀態發生改變時,  
             // 才文件會再次觸發wait_address 上wait_queue的回調函數,  
         }  
     }  
   
     return eventcnt;  
 }  

eventpoll_poll

由於epoll自身也是文件，其描述符也可以被poll/select/epoll監視，因此需要實現poll方法。

     C代碼   
     
   
 // File operations of the epoll file itself. Providing .poll is what allows
 // an epoll fd to be watched by select/poll/epoll in turn.
 static const struct file_operations eventpoll_fops = {
     .release = ep_eventpoll_release,
     .poll    = ep_eventpoll_poll,
     .llseek  = noop_llseek,
 };
   
 // poll method of the epoll file: registers the caller on ep->poll_wait and
 // reports whether any watched file is really ready.
 // Returns the mask computed by the ready-list walk, or 0 when the walk was
 // refused by ep_call_nested() (result -1: cycle or nesting too deep).
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
     struct eventpoll *ep = file->private_data;
     int mask;

     // Hook the caller onto this epoll instance's poll wait queue.
     poll_wait(file, &ep->poll_wait, wait);

     // Walk the ready list and re-poll every file on it. The list may hold
     // other epoll files, whose poll method can recurse back into this
     // function, so the walk goes through ep_call_nested() (using the
     // poll_readywalk_ncalls list) to guard against cycles and deep nesting.
     mask = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
                           ep_poll_readyevents_proc, ep, ep, current);
     if (mask == -1) {
         return 0;
     }
     return mask;
 }
   
 // ep_call_nested() callback used by ep_eventpoll_poll(): `priv` is the
 // eventpoll to scan; `cookie` is unused. Scans its ready list with
 // ep_read_events_proc at depth call_nests + 1 and returns the scan result.
 static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
 {
     struct eventpoll *ep = priv;
     int depth = call_nests + 1;

     return ep_scan_ready_list(ep, ep_read_events_proc, NULL, depth);
 }
   
 // Run `sproc` over a private snapshot of ep's ready list.
 //
 // ep:    the eventpoll whose ready list is scanned
 // sproc: scan callback; receives (ep, detached list, priv)
 // priv:  opaque argument for sproc
 // depth: lock subclass for ep->mtx (nesting depth of the caller)
 //
 // The ready list is detached under ep->lock, sproc runs on it with only
 // ep->mtx held, then items parked on ovflist and whatever sproc left on the
 // snapshot are merged back. Returns the value returned by sproc.
 static int ep_scan_ready_list(struct eventpoll *ep,
                               int (*sproc)(struct eventpoll *,
                                       struct list_head *, void *),
                               void *priv,
                               int depth)
 {
     int error, pwake = 0;
     unsigned long flags;
     struct epitem *epi, *nepi;
     LIST_HEAD(txlist);

     /* 
      * We need to lock this because we could be hit by 
      * eventpoll_release_file() and epoll_ctl(). 
      */
     mutex_lock_nested(&ep->mtx, depth);

     spin_lock_irqsave(&ep->lock, flags);
     // Move the whole rdllist onto the private list txlist.
     list_splice_init(&ep->rdllist, &txlist);
     // Switch ovflist away from EP_UNACTIVE_PTR. Per the article's
     // description of the wakeup callback, while ovflist != EP_UNACTIVE_PTR
     // newly signalled items are chained onto ovflist instead of rdllist,
     // so events arriving during the scan are not lost.
     // The previous value is not checked: the original author notes that
     // ep->mtx guarantees it is EP_UNACTIVE_PTR at this point.
     ep->ovflist = NULL;
     spin_unlock_irqrestore(&ep->lock, flags);

     // Let the scan callback process txlist.
     error = (*sproc)(ep, &txlist, priv);

     spin_lock_irqsave(&ep->lock, flags);

     // Events may have arrived while sproc ran: walk the ovflist chain and
     // put those items on the ready list, resetting each item's next link.
     for (nepi = ep->ovflist; (epi = nepi) != NULL;
             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
         // #define EP_UNACTIVE_PTR (void *) -1
         // Only add items that are not already linked through rdllink.
         if (!ep_is_linked(&epi->rdllink)) {
             list_add_tail(&epi->rdllink, &ep->rdllist);
         }
     }
     // Restore ovflist to its inactive state.
     ep->ovflist = EP_UNACTIVE_PTR;

     // Splice whatever sproc left on txlist back onto rdllist.
     list_splice(&txlist, &ep->rdllist);

     if (!list_empty(&ep->rdllist)) {
         // Wake tasks sleeping on ep->wq (epoll_wait callers).
         if (waitqueue_active(&ep->wq)) {
             wake_up_locked(&ep->wq);
         }
         // Someone is polling this epoll file itself; notify them after
         // the locks have been dropped.
         if (waitqueue_active(&ep->poll_wait)) {
             pwake++;
         }
     }
     spin_unlock_irqrestore(&ep->lock, flags);

     mutex_unlock(&ep->mtx);

     if (pwake) {
         // Safely wake the external pollers of this epoll file.
         ep_poll_safewake(&ep->poll_wait);
     }

     return error;
 }
   
 static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,  
                                void *priv)  
 {  
     struct epitem *epi, *tmp;  
     poll_table pt;  
     init_poll_funcptr(&pt, NULL);  
     list_for_each_entry_safe(epi, tmp, head, rdllink) {  
         pt._key = epi->event.events;  
         if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &  
                 epi->event.events) {  
             return POLLIN | POLLRDNORM;  
         } else {  
              // 這個事件雖然在就緒列表中,  
              // 可是實際上並無就緒, 將他移除  
          // 這有多是水平觸發模式中沒有將文件從就緒列表中移除  
          // 也多是事件插入到就緒列表後有其餘的線程對文件進行了操做  
             list_del_init(&epi->rdllink);  
         }  
     }  
     return 0;  
 }  

epoll全景

以下是epoll使用的全部數據結構之間的關係圖，採用的是一種類UML圖，希望對理解epoll的內部實現有所幫助。

poll/select/epoll 對比

通過以上的分析可以看出，poll和select的實現基本一致，只是用戶到內核傳遞的數據格式有所不同。

select和poll即使只有一個描述符就緒，也要遍歷整個集合。如果集合中活躍的描述符很少，遍歷過程的開銷就會變得很大，而如果集合中大部分的描述符都是活躍的，遍歷過程的開銷又可以忽略。

epoll的實現中每次只遍歷活躍的描述符(如果是水平觸發，也會遍歷先前活躍的描述符)，在活躍描述符較少的情況下就會很有優勢。在代碼的分析過程中可以看到，epoll的實現過於複雜，並且其實現過程中需要同步處理(鎖)，如果大部分描述符都是活躍的，epoll的效率可能不如select或poll。(參見epoll 和poll的性能測試 http://jacquesmattheij.com/Poll+vs+Epoll+once+again)

select能夠處理的最大fd無法超出FD_SETSIZE。

select會覆寫傳入的fd_set，而poll對每個fd返回一個掩碼，不更改原來的掩碼，從而可以對同一個集合多次調用poll，而無需調整。

select對每個文件描述符最多使用3個bit，而poll採用的pollfd需要使用64個bit，epoll採用的epoll_event則需要96個bit。

如果事件需要循環處理，select、poll每一次的處理都要將全部的數據複製到內核，而epoll的實現中，內核將持久維護加入的描述符，減少了內核和用戶空間之間複製數據的開銷。