本文轉載自:https://blog.csdn.net/lishenglong666/article/details/45536611
poll/select/epoll的實現都是基於文件提供的poll方法(f_op->poll),
該方法利用poll_table提供的_qproc方法向文件內部事件掩碼_key對應的一個或多個等待隊列(wait_queue_head_t)上添加包含喚醒函數(wait_queue_t.func)的節點(wait_queue_t),並檢查文件當前就緒的狀態返回給poll的調用者(依賴於文件的實現)。
當文件的狀態發生改變時(例如網絡數據包到達),文件就會遍歷事件對應的等待隊列並調用回調函數(wait_queue_t.func)喚醒等待線程。
一般的file.f_op->poll實現及相關結構體如下:
- struct file {
- const struct file_operations *f_op;
- spinlock_t f_lock;
- /* Abbreviated excerpt: only the members this article refers to are listed. */
- void *private_data;
- #ifdef CONFIG_EPOLL
- /* f_ep_links collects epitem.fllink nodes; f_tfile_llink links the file onto tfile_check_list. */
- struct list_head f_ep_links;
- struct list_head f_tfile_llink;
- #endif /* #ifdef CONFIG_EPOLL */
-
- };
-
-
- struct file_operations {
- /* Abbreviated excerpt: only the poll hook is shown. */
- /* poll receives the file and a poll_table; callers in this article use its result as a readiness mask. */
- unsigned int (*poll) (struct file *, struct poll_table_struct *);
-
- };
-
-
- unsigned int file_f_op_poll (struct file *filp, struct poll_table_struct *wait)
- {
- unsigned int mask = 0;
- wait_queue_head_t * wait_queue;
- /* Schematic f_op->poll implementation; some_code() stands in for file-specific logic. */
- /* NOTE(review): wait_queue is never assigned here - presumably some_code() picks the file's queue; confirm. */
- some_code();
-
- /* Hand the wait queue to the caller's poll_table through poll_wait(). */
- poll_wait(filp, wait_queue, wait);
-
- /* NOTE(review): presumably computes the current readiness bits into mask - not shown. */
- some_code();
-
- return mask;
- }
-
-
- /* Forward declaration so the callback typedef can name the struct before it is defined. */
- struct poll_table_struct;
- /* Registration callback supplied by the poller; poll_wait() calls it with the file, the wait queue head and the table itself. */
- typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
-
- typedef struct poll_table_struct {
- /* Callback invoked by poll_wait(); poll_wait() does nothing when it is NULL. */
- poll_queue_proc _qproc;
- /* Event mask the poller is interested in (set by the callers before f_op->poll is invoked). */
- unsigned long _key;
- } poll_table;
-
-
-
- static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
- {
- if (p && p->_qproc && wait_address) {
- /* Reached only with a non-NULL table, callback and queue head; otherwise poll_wait() does nothing. */
- /* The callback decides how to queue the waiter; this article passes __pollwait (select/poll) */
- /* and ep_ptable_queue_proc (epoll) to init_poll_funcptr(). */
- p->_qproc(filp, wait_address, p);
- }
- }
-
-
-
- typedef struct __wait_queue_head wait_queue_head_t;
- struct __wait_queue_head {
- spinlock_t lock; /* taken by __wake_up() around the list walk */
- struct list_head task_list; /* wait_queue_t entries, linked through their own task_list member */
- };
-
-
- typedef struct __wait_queue wait_queue_t;
- /* Wake callback stored in every queue entry; __wake_up_common() calls it for each entry on the queue. */
- typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
- /* One waiter linked on a wait_queue_head_t. */
- struct __wait_queue {
- unsigned int flags;
- /* Entries with this flag count against nr_exclusive in __wake_up_common(). */
- #define WQ_FLAG_EXCLUSIVE 0x01
- /* Owner-defined pointer: a task for default_wake_function(), a poll_wqueues for pollwake(). */
- void *private;
- wait_queue_func_t func;
- struct list_head task_list;
- };
-
-
-
-
- void __wake_up(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
- {
- unsigned long flags;
- /* Run the wake-up walk under q->lock (irqsave variant), passing wake_flags = 0. */
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&q->lock, flags);
- }
- static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key)
- {
- wait_queue_t *curr, *next;
- /* Call each entry's func callback in list order; flags is read before the call. */
- list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
- unsigned flags = curr->flags;
- /* Stop once nr_exclusive entries flagged WQ_FLAG_EXCLUSIVE have returned non-zero from func. */
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) {
- break;
- }
- }
- }
poll 和 select
poll和select的實現基本上是一致的,只是傳遞的參數有所不同,它們的基本流程如下:
1. 複製用戶數據到內核空間
2. 估計超時時間
3. 遍歷每一個文件並調用f_op->poll 取得文件當前就緒狀態, 如果前面遍歷的文件都沒有就緒,向文件插入wait_queue節點
4. 遍歷完成後檢查狀態:
a). 如果已經有就緒的文件轉到5;
b). 如果有信號產生,重啓poll或select(轉到 1或3);
c). 不然掛起進程等待超時或喚醒,超時或被喚醒後再次遍歷全部文件取得每一個文件的就緒狀態
5. 將全部文件的就緒狀態複製到用戶空間
6. 清理申請的資源
關鍵結構體
下面是poll/select共用的結構體及其相關功能:
poll_wqueues 是 select/poll 對poll_table接口的具體化實現,其中的table, inline_index和inline_entries都是爲了管理內存。
poll_table_entry 與一個文件相關聯,用於管理插入到文件的wait_queue節點。
-
- struct poll_wqueues {
- poll_table pt; /* passed to f_op->poll; __pollwait recovers the poll_wqueues from it via container_of() */
- struct poll_table_page *table; /* newest overflow page, NULL while only inline_entries are used */
- struct task_struct *polling_task; /* set to current by poll_initwait(); the task __pollwake() wakes */
- int triggered; /* set to 1 by __pollwake() */
- int error; /* set to -ENOMEM by poll_get_entry() when no page can be allocated */
- int inline_index; /* number of inline_entries already handed out */
- struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
- };
-
- struct poll_table_page {
- struct poll_table_page * next; /* page allocated before this one, or NULL */
- struct poll_table_entry * entry; /* next unused slot in entries[] */
- struct poll_table_entry entries[0]; /* slots occupy the rest of the page from __get_free_page() */
- };
-
- struct poll_table_entry {
- struct file *filp; /* reference taken with get_file() in __pollwait, dropped with fput() in free_poll_entry */
- unsigned long key; /* event mask copied from the poll_table; pollwake() filters wake-ups against it */
- wait_queue_t wait; /* node added to *wait_address */
- wait_queue_head_t *wait_address; /* queue the node was added to; needed to remove it again */
- };
公共函數
下面是poll/select公用的一些函數,這些函數實現了poll和select的核心功能。
poll_initwait 用於初始化poll_wqueues,
__pollwait 實現了向文件中添加回調節點的邏輯,
pollwake 當文件狀態發生改變時,由文件調用,用來喚醒線程,
poll_get_entry,free_poll_entry,poll_freewait用來申請釋放poll_table_entry 佔用的內存,並負責釋放文件上的wait_queue節點。
-
-
- void poll_initwait(struct poll_wqueues *pwq)
- {
- /* Prepare pwq for one select/poll call: register __pollwait as the table callback, then reset the bookkeeping. */
- init_poll_funcptr(&pwq->pt, __pollwait);
-
-
-
-
-
-
- /* The calling task is the one that pollwake() will wake. */
- pwq->polling_task = current;
- pwq->triggered = 0;
- pwq->error = 0;
- pwq->table = NULL;
- pwq->inline_index = 0;
- }
-
-
-
-
- /* poll_table callback for select/poll: queues one wait entry for the polling task on wait_address. */
- static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
- poll_table *p)
- {
- struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
- struct poll_table_entry *entry = poll_get_entry(pwq);
- /* No entry available: poll_get_entry() has already stored -ENOMEM in pwq->error. */
- if (!entry) {
- return;
- }
- /* Hold a file reference while the entry is queued; free_poll_entry() drops it. */
- get_file(filp);
- entry->filp = filp;
- entry->wait_address = wait_address;
- /* The mask member of poll_table is named _key, as in its definition and in the epoll callers. */
- entry->key = p->_key;
- /* pollwake() runs when the queue is woken; private lets it reach the poll_wqueues. */
- init_waitqueue_func_entry(&entry->wait, pollwake);
- entry->wait.private = pwq;
- add_wait_queue(wait_address, &entry->wait);
- }
-
-
-
- static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
- {
- struct poll_table_entry *entry;
- /* Wake callback installed by __pollwait: recover the entry that embeds this wait node. */
- entry = container_of(wait, struct poll_table_entry, wait);
- /* A non-NULL key is treated as an event mask; wake-ups sharing no bit with entry->key are ignored. */
- if (key && !((unsigned long)key & entry->key)) {
- return 0;
- }
-
- return __pollwake(wait, mode, sync, key);
- }
- static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
- {
- struct poll_wqueues *pwq = wait->private;
- /* Temporary wait entry for pwq->polling_task, so default_wake_function() wakes that task. */
- DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
- smp_wmb();
- pwq->triggered = 1; /* NOTE(review): presumably consumed by poll_schedule_timeout() (not shown) - confirm */
-
- return default_wake_function(&dummy_wait, mode, sync, key);
- }
-
-
-
- int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
- void *key)
- {
- /* Wakes the task stored in curr->private; key is not used here. */
- return try_to_wake_up(curr->private, mode, wake_flags);
- }
poll,select對poll_table_entry的申請和釋放採用的是相似內存池的管理方式,先使用預分配的空間,預分配的空間不足時,分配一個內存頁,使用內存頁上的空間。
-
- static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) {
- struct poll_table_page *table = p->table;
- /* First choice: the next slot of the array embedded in poll_wqueues. */
- if (p->inline_index < N_INLINE_POLL_ENTRIES) {
- return p->inline_entries + p->inline_index++;
- }
- /* No page yet, or the newest page is full: allocate one and push it on the front of the page list. */
- if (!table || POLL_TABLE_FULL(table)) {
- struct poll_table_page *new_table;
- new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
- if (!new_table) {
- p->error = -ENOMEM;
- return NULL;
- }
- new_table->entry = new_table->entries;
- new_table->next = table;
- p->table = new_table;
- table = new_table;
- }
- return table->entry++;
- }
-
-
- void poll_freewait(struct poll_wqueues *pwq)
- {
- struct poll_table_page * p = pwq->table;
- /* Undo every __pollwait registration: first the inline entries, then every overflow page. */
- int i;
- for (i = 0; i < pwq->inline_index; i++) {
- free_poll_entry(pwq->inline_entries + i);
- }
-
- /* Each page is walked backwards from its next-free pointer down to entries[0], then freed. */
- while (p) {
- struct poll_table_entry * entry;
- struct poll_table_page *old;
- entry = p->entry;
- do {
- entry--;
- free_poll_entry(entry);
- } while (entry > p->entries);
- old = p;
- p = p->next;
- free_page((unsigned long) old);
- }
- }
- static void free_poll_entry(struct poll_table_entry *entry)
- {
- /* Reverse of __pollwait: unlink the wait node, then drop the file reference. */
- remove_wait_queue(entry->wait_address, &entry->wait);
- fput(entry->filp);
- }
poll/select核心結構關係
下圖是 poll/select 實現公共部分的關係圖,包含了與文件直接的關係,以及函數之間的依賴。
![點擊查看原始大小圖片](http://static.javashuo.com/static/loading.gif)
poll的實現
-
- struct pollfd {
- int fd; /* descriptor to poll; do_pollfd() skips negative values */
- short events; /* requested events, read by do_pollfd() */
- short revents; /* result mask written by do_pollfd() */
- };
-
- /* poll(2) entry point: converts the millisecond timeout, runs do_sys_poll() and sets up a restart block when interrupted. */
- SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
- long, timeout_msecs)
- {
- struct timespec end_time, *to = NULL;
- int ret;
- /* A negative timeout leaves to == NULL, so no deadline is passed down. */
- if (timeout_msecs >= 0) {
- to = &end_time;
- poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
- NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
- }
- ret = do_sys_poll(ufds, nfds, to);
- /* Interrupted: save the arguments and deadline so do_restart_poll() can resume the call. */
- if (ret == -EINTR) {
- struct restart_block *restart_block;
- /* Address-of operator restored; the scraped text had "&curren" decoded as an HTML entity. */
- restart_block = &current_thread_info()->restart_block;
- restart_block->fn = do_restart_poll;
- restart_block->poll.ufds = ufds;
- restart_block->poll.nfds = nfds;
- if (timeout_msecs >= 0) {
- restart_block->poll.tv_sec = end_time.tv_sec;
- restart_block->poll.tv_nsec = end_time.tv_nsec;
- restart_block->poll.has_timeout = 1;
- } else {
- restart_block->poll.has_timeout = 0;
- }
- ret = -ERESTART_RESTARTBLOCK;
- }
- return ret;
- }
- /* Copies the pollfd array into kernel memory in chunks, runs do_poll() and writes every revents back to user space. */
- /* Returns do_poll()'s result, -EINVAL when nfds exceeds RLIMIT_NOFILE, -EFAULT on a failed user copy, -ENOMEM on allocation failure. */
- int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
- struct timespec *end_time)
- {
- struct poll_wqueues table;
- int err = -EFAULT, fdcount, len, size;
- /* The first chunk lives in this stack buffer; further chunks are kmalloc'ed and chained through ->next. */
- long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
- struct poll_list *const head = (struct poll_list *)stack_pps;
- struct poll_list *walk = head;
- unsigned long todo = nfds;
- if (nfds > rlimit(RLIMIT_NOFILE)) {
- return -EINVAL;
- }
- len = min_t(unsigned int, nfds, N_STACK_PPS);
- for (;;) {
- walk->next = NULL;
- walk->len = len;
- if (!len) {
- break;
- }
- /* nfds - todo entries have been copied so far. */
- if (copy_from_user(walk->entries, ufds + nfds-todo,
- sizeof(struct pollfd) * walk->len)) {
- goto out_fds;
- }
- todo -= walk->len;
- if (!todo) {
- break;
- }
- len = min(todo, POLLFD_PER_PAGE);
- size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
- walk = walk->next = kmalloc(size, GFP_KERNEL);
- if (!walk) {
- err = -ENOMEM;
- goto out_fds;
- }
- }
- poll_initwait(&table);
- fdcount = do_poll(nfds, head, &table, end_time);
- poll_freewait(&table);
- /* Only revents is written back; ufds keeps advancing across chunk boundaries. */
- for (walk = head; walk; walk = walk->next) {
- struct pollfd *fds = walk->entries;
- int j;
- /* Bound by this chunk's own length: the local len only holds the size of the last chunk. */
- for (j = 0; j < walk->len; j++, ufds++) {
- if (__put_user(fds[j].revents, &ufds->revents)) {
- goto out_fds;
- }
- }
- }
- err = fdcount;
- out_fds:
- /* Free the heap chunks only; head is the on-stack buffer. */
- walk = head->next;
- while (walk) {
- struct poll_list *pos = walk;
- walk = walk->next;
- kfree(pos);
- }
- return err;
- }
-
- static int do_poll(unsigned int nfds, struct poll_list *list,
- struct poll_wqueues *wait, struct timespec *end_time)
- {
- poll_table* pt = &wait->pt;
- ktime_t expire, *to = NULL;
- int timed_out = 0, count = 0;
- unsigned long slack = 0;
- if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
- /* Zero timeout: scan once without registering on any wait queue and never sleep. */
- pt = NULL;
- timed_out = 1;
- }
- if (end_time && !timed_out) {
- /* Non-zero deadline: obtain the slack later passed to poll_schedule_timeout(). */
- slack = select_estimate_accuracy(end_time);
- }
- /* Each iteration scans every pollfd, then either leaves the loop or sleeps until woken or timed out. */
- for (;;) {
- struct poll_list *walk;
- for (walk = list; walk != NULL; walk = walk->next) {
- struct pollfd * pfd, * pfd_end;
- pfd = walk->entries;
- pfd_end = pfd + walk->len;
- for (; pfd != pfd_end; pfd++) {
- /* do_pollfd() stores the ready mask in pfd->revents and returns it. */
- /* While pt is non-NULL the file's poll method may also queue a wait entry through it. */
-
- if (do_pollfd(pfd, pt)) {
- /* Something is ready, so this call will not sleep: */
- /* count it and stop registering on the remaining files. */
- count++;
- pt = NULL;
- }
- }
- }
- /* After the first full pass no further registration is done. */
-
- pt = NULL;
-
- /* Nothing ready: report a registration error (wait->error) or a pending signal instead. */
- if (!count) {
- count = wait->error;
-
- if (signal_pending(current)) {
- count = -EINTR;
- }
- }
-
- /* Leave once there is a result (ready count or error) or the timeout has expired. */
- if (count || timed_out) {
- break;
- }
- /* Convert the deadline to ktime_t once, before the first sleep. */
- if (end_time && !to) {
- expire = timespec_to_ktime(*end_time);
- to = &expire;
- }
-
- /* A zero return from poll_schedule_timeout() is taken as timeout; one more scan follows before leaving. */
-
- if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) {
- timed_out = 1;
- }
- }
- return count;
- }
-
-
- /* Polls one pollfd: stores the ready mask in pollfd->revents and returns it. */
- /* A negative fd yields 0, an fd with no open file yields POLLNVAL. */
- static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
- {
- unsigned int mask;
- int fd;
- mask = 0;
- fd = pollfd->fd;
- if (fd >= 0) {
- int fput_needed;
- struct file * file;
- file = fget_light(fd, &fput_needed);
- mask = POLLNVAL;
- if (file != NULL) {
- /* Files without a poll method report DEFAULT_POLLMASK. */
- mask = DEFAULT_POLLMASK;
- if (file->f_op && file->f_op->poll) {
- if (pwait) {
- /* The mask member of poll_table is named _key; POLLERR and POLLHUP are always included. */
- pwait->_key = pollfd->events | POLLERR | POLLHUP;
- }
- mask = file->f_op->poll(file, pwait);
- }
- /* Report only requested events plus POLLERR/POLLHUP. */
- mask &= pollfd->events | POLLERR | POLLHUP;
- fput_light(file, fput_needed);
- }
- }
- pollfd->revents = mask;
- return mask;
- }
-
-
- static long do_restart_poll(struct restart_block *restart_block)
- {
- struct pollfd __user *ufds = restart_block->poll.ufds;
- int nfds = restart_block->poll.nfds;
- struct timespec *to = NULL, end_time;
- int ret;
- if (restart_block->poll.has_timeout) {
- /* Reuse the deadline saved by the poll syscall when it was interrupted. */
- end_time.tv_sec = restart_block->poll.tv_sec;
- end_time.tv_nsec = restart_block->poll.tv_nsec;
- to = &end_time;
- }
- ret = do_sys_poll(ufds, nfds, to);
- if (ret == -EINTR) {
- /* Interrupted again: request another restart through this same function. */
- restart_block->fn = do_restart_poll;
- ret = -ERESTART_RESTARTBLOCK;
- }
- return ret;
- }
select 實現
- typedef struct {
- unsigned long *in, *out, *ex; /* input bitmaps filled by get_fd_set() in core_sys_select() */
- unsigned long *res_in, *res_out, *res_ex; /* result bitmaps written by do_select() */
- } fd_set_bits;
-
- SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
- fd_set __user *, exp, struct timeval __user *, tvp)
- {
- struct timespec end_time, *to = NULL;
- struct timeval tv;
- int ret;
- if (tvp) {
- if (copy_from_user(&tv, tvp, sizeof(tv))) {
- return -EFAULT;
- }
- /* Turn the user timeval into the timespec deadline; a non-zero result from the helper is reported as -EINVAL. */
- to = &end_time;
- if (poll_select_set_timeout(to,
- tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
- (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) {
- return -EINVAL;
- }
- }
- ret = core_sys_select(n, inp, outp, exp, to);
- /* NOTE(review): presumably writes the remaining time back to tvp - helper not shown; confirm. */
- ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
- return ret;
- }
-
- int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
- fd_set __user *exp, struct timespec *end_time)
- {
- fd_set_bits fds;
- void *bits;
- int ret, max_fds;
- unsigned int size;
- struct fdtable *fdt;
- /* Stack buffer that holds the six bitmaps when they are small enough. */
- long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
-
- ret = -EINVAL;
- if (n < 0) {
- goto out_nofds;
- }
-
- rcu_read_lock();
- /* Clamp n to max_fds of the current process's fd table. */
- fdt = files_fdtable(current->files);
- max_fds = fdt->max_fds;
- rcu_read_unlock();
- if (n > max_fds) {
- n = max_fds;
- }
- /* Six bitmaps of size bytes each are needed: in/out/ex plus their three result maps. */
- size = FDS_BYTES(n);
- bits = stack_fds;
- if (size > sizeof(stack_fds) / 6) {
- /* Too large for the stack buffer: allocate all six bitmaps in one block. */
- ret = -ENOMEM;
- bits = kmalloc(6 * size, GFP_KERNEL);
- if (!bits) {
- goto out_nofds;
- }
- }
- fds.in = bits;
- fds.out = bits + size;
- fds.ex = bits + 2*size;
- fds.res_in = bits + 3*size;
- fds.res_out = bits + 4*size;
- fds.res_ex = bits + 5*size;
-
- /* Fetch the three input sets from user space; the first non-zero result aborts the call. */
- if ((ret = get_fd_set(n, inp, fds.in)) ||
- (ret = get_fd_set(n, outp, fds.out)) ||
- (ret = get_fd_set(n, exp, fds.ex))) {
- goto out;
- }
- /* Result sets start empty; do_select() stores the ready bits. */
- zero_fd_set(n, fds.res_in);
- zero_fd_set(n, fds.res_out);
- zero_fd_set(n, fds.res_ex);
-
- ret = do_select(n, &fds, end_time);
-
- if (ret < 0) {
- goto out;
- }
- if (!ret) {
- /* Nothing ready: return -ERESTARTNOHAND when a signal is pending, otherwise 0. */
- ret = -ERESTARTNOHAND;
- if (signal_pending(current)) {
- goto out;
- }
- ret = 0;
- }
-
- /* Copy the three result sets back to user space. */
- if (set_fd_set(n, inp, fds.res_in) ||
- set_fd_set(n, outp, fds.res_out) ||
- set_fd_set(n, exp, fds.res_ex)) {
- ret = -EFAULT;
- }
-
- out:
- if (bits != stack_fds) {
- kfree(bits);
- }
- out_nofds:
- return ret;
- }
-
- int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
- {
- ktime_t expire, *to = NULL;
- struct poll_wqueues table;
- poll_table *wait;
- int retval, i, timed_out = 0;
- unsigned long slack = 0;
-
- rcu_read_lock();
- /* The result of max_select_fd() replaces n below; a negative result is returned as the error. */
- retval = max_select_fd(n, fds);
- rcu_read_unlock();
-
- if (retval < 0) {
- return retval;
- }
- n = retval;
-
- /* Set up the wait table; a zero timeout disables registration (wait = NULL) and sleeping. */
- poll_initwait(&table);
- wait = &table.pt;
- if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
- wait = NULL;
- timed_out = 1;
- }
-
- if (end_time && !timed_out) {
-
- slack = select_estimate_accuracy(end_time);
- }
-
- retval = 0;
- for (;;) {
- unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
-
- inp = fds->in;
- outp = fds->out;
- exp = fds->ex;
- rinp = fds->res_in;
- routp = fds->res_out;
- rexp = fds->res_ex;
- /* Outer loop: one iteration per unsigned long word of the three input bitmaps. */
- for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
- unsigned long in, out, ex, all_bits, bit = 1, mask, j;
- unsigned long res_in = 0, res_out = 0, res_ex = 0;
- const struct file_operations *f_op = NULL;
- struct file *file = NULL;
-
- in = *inp++;
- out = *outp++;
- ex = *exp++;
- all_bits = in | out | ex;
- if (all_bits == 0) {
- i += __NFDBITS;
- continue;
- }
- /* Inner loop: one iteration per bit of the current word; i is the descriptor number. */
- for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
- int fput_needed;
- if (i >= n) {
- break;
- }
-
- if (!(bit & all_bits)) {
- continue;
- }
-
- file = fget_light(i, &fput_needed);
- if (file) {
- f_op = file->f_op;
- /* Files without a poll method report DEFAULT_POLLMASK. */
- mask = DEFAULT_POLLMASK;
- if (f_op && f_op->poll) {
- /* NOTE(review): wait_key_set() presumably stores the mask for this bit in the poll_table - helper not shown. */
- wait_key_set(wait, in, out, bit);
-
-
-
-
-
-
-
-
-
-
-
- /* While wait is non-NULL the poll method may queue a wait entry through it. */
- mask = (*f_op->poll)(file, wait);
-
- }
- fput_light(file, fput_needed);
-
- /* Record the bit in each result set whose events are ready and were requested. */
- if ((mask & POLLIN_SET) && (in & bit)) {
- res_in |= bit;
- retval++;
-
- /* Something is ready, so stop registering on further files. */
- wait = NULL;
- }
- if ((mask & POLLOUT_SET) && (out & bit)) {
- res_out |= bit;
- retval++;
- wait = NULL;
- }
- if ((mask & POLLEX_SET) && (ex & bit)) {
- res_ex |= bit;
- retval++;
- wait = NULL;
- }
- }
- }
- if (res_in) {
- *rinp = res_in;
- }
- if (res_out) {
- *routp = res_out;
- }
- if (res_ex) {
- *rexp = res_ex;
- }
- cond_resched();
- }
- wait = NULL;
- if (retval || timed_out || signal_pending(current)) {
- break;
- }
- if (table.error) {
- retval = table.error;
- break;
- }
- /* Convert the deadline to ktime_t once, before the first sleep. */
- if (end_time && !to) {
- expire = timespec_to_ktime(*end_time);
- to = &expire;
- }
- /* A zero return is taken as timeout; one more scan follows before leaving the loop. */
- if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
- to, slack)) {
- timed_out = 1;
- }
- }
- /* Remove every wait entry queued during the scans. */
- poll_freewait(&table);
-
- return retval;
- }
epoll實現
epoll 的實現比poll/select 複雜一些,這是由於:
1. epoll_wait, epoll_ctl 的調用完全獨立開來,內核需要鎖機制對這些操作進行保護,並且需要持久地維護添加到epoll的文件
2. epoll本身也是文件,也可以被poll/select/epoll監視,這可能導致epoll之間循環喚醒的問題
3. 單個文件的狀態改變可能喚醒過多監聽在其上的epoll,產生喚醒風暴
epoll各個功能的實現要非常小心地面對這些問題,使得複雜度大大增加。
epoll的核心數據結構
-
- /* Per-instance state of one epoll file (stored in file->private_data). */
- struct eventpoll {
- /* Spinlock taken (irqsave) around rdllist updates and wake-ups of wq. */
- spinlock_t lock;
- /* Mutex taken by epoll_ctl() and ep_loop_check_proc() around changes to / walks of the instance. */
- struct mutex mtx;
- /* Tasks sleeping in ep_poll() (epoll_wait). */
- wait_queue_head_t wq;
- /* Queue handed to poll_wait() by ep_eventpoll_poll(), i.e. pollers of the epoll file itself. */
- wait_queue_head_t poll_wait;
- /* Ready list: epitems linked through epitem.rdllink. */
- struct list_head rdllist;
- /* Red-black tree of all epitems, linked through epitem.rbn. */
- struct rb_root rbr;
- /* Compared against EP_UNACTIVE_PTR by ep_events_available(); NOTE(review): presumably an overflow list used while the ready list is being scanned - confirm. */
- struct epitem *ovflist;
- /* Owner whose epoll_watches counter is checked against max_user_watches. */
- struct user_struct *user;
- /* The epoll file itself, set in epoll_create1(). */
- struct file *file;
- /* Marker and list link used by ep_loop_check() / ep_loop_check_proc(). */
- int visited;
- struct list_head visited_list_link;
- };
-
-
- struct epitem {
- /* Node in eventpoll.rbr (inserted by ep_rbtree_insert(), removed with rb_erase()). */
- struct rb_node rbn;
- /* Link on eventpoll.rdllist while the item is on the ready list. */
- struct list_head rdllink;
- /* Initialised to EP_UNACTIVE_PTR; NOTE(review): presumably chains items on eventpoll.ovflist - confirm. */
- struct epitem *next;
- /* The monitored file and descriptor, set with ep_set_ffd(). */
- struct epoll_filefd ffd;
- /* Starts at 0; ep_insert() treats a negative value after the first poll as failure (-ENOMEM). */
- int nwait;
-
-
-
-
- /* NOTE(review): presumably the list of eppoll_entry nodes (via llink) queued for this item - confirm. */
- struct list_head pwqlist;
- /* The eventpoll this item belongs to. */
- struct eventpoll *ep;
- /* Link on the monitored file's f_ep_links list. */
- struct list_head fllink;
- /* Copy of the user's epoll_event (requested events and user data). */
- struct epoll_event event;
- };
-
- struct epoll_filefd {
- struct file *file; /* monitored file */
- int fd; /* descriptor number passed to epoll_ctl() */
- };
-
-
- struct eppoll_entry {
- /* NOTE(review): presumably links the entry on epitem.pwqlist - the code using it is not shown. */
- struct list_head llink;
- /* NOTE(review): presumably the epitem this entry belongs to - confirm. */
- struct epitem *base;
- /* Wait node; NOTE(review): presumably added to *whead by ep_ptable_queue_proc (not shown). */
- wait_queue_t wait;
- /* NOTE(review): presumably the wait queue head the node sits on - confirm. */
- wait_queue_head_t *whead;
- };
-
-
- struct epoll_event {
- __u32 events; /* event mask; ep_send_events_proc() writes the ready bits here for user space */
- __u64 data; /* opaque user value, copied back unchanged by ep_send_events_proc() */
- } EPOLL_PACKED;
文件系統初始化和epoll_create
-
-
-
- static int __init eventpoll_init(void)
- {
- struct sysinfo si;
- /* One-time epoll setup: watch limit, nested-call lists and slab caches. */
- si_meminfo(&si);
-
- /* Limit: 1/25 of (totalram - totalhigh) pages, converted to bytes, divided by EP_ITEM_COST. */
- max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
- EP_ITEM_COST;
- BUG_ON(max_user_watches < 0);
-
- /* Initialise the three lists used with ep_call_nested(). */
- ep_nested_calls_init(&poll_loop_ncalls);
- ep_nested_calls_init(&poll_safewake_ncalls);
- ep_nested_calls_init(&poll_readywalk_ncalls);
- /* Slab caches for struct epitem and struct eppoll_entry. */
- epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
- pwq_cache = kmem_cache_create("eventpoll_pwq",
- sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
-
- return 0;
- }
-
-
- SYSCALL_DEFINE1(epoll_create, int, size)
- {
- if (size <= 0) {
- return -EINVAL;
- }
- /* size is only validated; the instance is created by epoll_create1 with no flags. */
- return sys_epoll_create1(0);
- }
-
- SYSCALL_DEFINE1(epoll_create1, int, flags)
- {
- int error, fd;
- struct eventpoll *ep = NULL;
- struct file *file;
-
- /* The flag is passed straight to the fd/file helpers below, so both constants must be equal. */
- BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
- /* EPOLL_CLOEXEC is the only accepted flag. */
- if (flags & ~EPOLL_CLOEXEC) {
- return -EINVAL;
- }
-
-
- /* Allocate the eventpoll structure. */
- error = ep_alloc(&ep);
- if (error < 0) {
- return error;
- }
-
-
-
- /* Reserve a descriptor, then create an anonymous-inode file with ep as its private data. */
- fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
- if (fd < 0) {
- error = fd;
- goto out_free_ep;
- }
-
- file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
- O_RDWR | (flags & O_CLOEXEC));
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto out_free_fd;
- }
- fd_install(fd, file);
- ep->file = file;
- return fd;
- /* Error unwinding, in reverse order of acquisition. */
- out_free_fd:
- put_unused_fd(fd);
- out_free_ep:
- ep_free(ep);
- return error;
- }
epoll中的遞歸死循環和深度檢查
遞歸深度檢測(ep_call_nested)
epoll本身也是文件,也可以被poll/select/epoll監視,如果epoll之間互相監視就有可能導致死循環。epoll的實現中,所有可能產生遞歸調用的函數都由函數ep_call_nested進行包裹,遞歸調用過程中出現死循環或遞歸過深就會打破死循環和遞歸調用直接返回。該函數的實現依賴於一個外部的全局鏈表nested_call_node(不同的函數調用使用不同的節點),每次調用可能發生遞歸的函數(nproc)就向鏈表中添加一個包含當前函數調用上下文ctx(進程,CPU,或epoll文件)和處理的對象標識cookie的節點,通過檢測是否有相同的節點就可以知道是否發生了死循環,檢查鏈表中同一上下文包含的節點個數就可以知道遞歸的深度。以下就是這一過程的源碼。
- struct nested_call_node {
- struct list_head llink; /* link on nested_calls.tasks_call_list */
- void *cookie; /* identifies the object being processed; an equal cookie in the same ctx means a loop */
- void *ctx; /* calling context; nodes with the same ctx are counted as nesting depth */
- };
- struct nested_calls {
- struct list_head tasks_call_list; /* nested_call_node entries of the calls currently in progress */
- spinlock_t lock; /* taken by ep_call_nested() around every access to the list */
- };
-
-
-
- /* The three lists are passed to ep_call_nested() and ep_nested_calls_init(), which take struct nested_calls pointers. */
- /* Used by ep_loop_check() and reverse_path_check(). */
- static struct nested_calls poll_loop_ncalls;
- /* Used by ep_poll_safewake(). */
- static struct nested_calls poll_safewake_ncalls;
- /* Used by ep_eventpoll_poll(). */
- static struct nested_calls poll_readywalk_ncalls;
-
-
-
-
-
-
-
- /* Runs nproc(priv, cookie, call_nests) while tracking the call on ncalls, to detect loops and excessive nesting. */
- /* ncalls: list of calls in progress; max_nests: nesting limit within one ctx; cookie: identity of the object processed; ctx: calling context. */
- /* Returns the value of nproc, or -1 without calling it when the same (ctx, cookie) is already active or more than max_nests calls are active in ctx. */
- static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
- int (*nproc)(void *, void *, int), void *priv,
- void *cookie, void *ctx)
- {
- int error, call_nests = 0;
- unsigned long flags;
- struct list_head *lsthead = &ncalls->tasks_call_list;
- struct nested_call_node *tncur;
- struct nested_call_node tnode;
- spin_lock_irqsave(&ncalls->lock, flags);
- list_for_each_entry(tncur, lsthead, llink) {
- /* Same context and either the same object (a loop) or too many active calls (too deep). */
- if (tncur->ctx == ctx &&
- (tncur->cookie == cookie || ++call_nests > max_nests)) {
- error = -1;
- /* Bail out only on a hit; the remaining nodes must still be examined otherwise. */
- goto out_unlock;
- }
- }
- /* Record this call; tnode lives on the stack for the duration of nproc. */
- tnode.ctx = ctx;
- tnode.cookie = cookie;
- list_add(&tnode.llink, lsthead);
- spin_unlock_irqrestore(&ncalls->lock, flags);
- /* The lock is not held while nproc runs, so nproc may call ep_call_nested() again. */
- error = (*nproc)(priv, cookie, call_nests);
- spin_lock_irqsave(&ncalls->lock, flags);
- list_del(&tnode.llink);
- out_unlock:
- spin_unlock_irqrestore(&ncalls->lock, flags);
- return error;
- }
循環檢測(ep_loop_check)
循環檢查(ep_loop_check),該函數遞歸調用ep_loop_check_proc,利用ep_call_nested來檢測epoll之間相互監視的死循環。因爲ep_call_nested中已經對死循環和過深的遞歸做了檢查,實際的ep_loop_check_proc的實現只是遞歸調用自己。其中的visited_list和visited標記完全是爲了優化處理速度,如果沒有visited_list和visited標記函數也是能夠工作的。該函數中的上下文就是當前的進程,cookie就是正在遍歷的epoll結構。
- static LIST_HEAD(visited_list);
- /* visited_list: eventpoll structures marked visited during one ep_loop_check() run. */
- static int ep_loop_check(struct eventpoll *ep, struct file *file)
- {
- int ret;
- struct eventpoll *ep_cur, *ep_next;
- /* Walk from file through nested epoll files; non-zero means ep_call_nested() found a loop or too deep a nesting. */
- ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
- ep_loop_check_proc, file, ep, current);
- /* Clear the visited marks and empty visited_list for the next check. */
- list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
- visited_list_link) {
- ep_cur->visited = 0;
- list_del(&ep_cur->visited_list_link);
- }
- return ret;
- }
-
- static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
- {
- int error = 0;
- struct file *file = priv;
- struct eventpoll *ep = file->private_data;
- struct eventpoll *ep_tovisit;
- struct rb_node *rbp;
- struct epitem *epi;
- /* priv is an epoll file; visit every file registered in it. The lock subclass follows the nesting depth. */
- mutex_lock_nested(&ep->mtx, call_nests + 1);
- /* Mark this instance so it is not walked a second time during the same check. */
- ep->visited = 1;
- list_add(&ep->visited_list_link, &visited_list);
-
- for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
- epi = rb_entry(rbp, struct epitem, rbn);
- if (unlikely(is_file_epoll(epi->ffd.file))) {
- ep_tovisit = epi->ffd.file->private_data;
- /* Nested epoll file already walked in this check: skip it. */
- if (ep_tovisit->visited) {
- continue;
- }
- /* Recurse through ep_call_nested(), which reports loops and excessive depth as non-zero. */
- error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
- ep_loop_check_proc, epi->ffd.file,
- ep_tovisit, current);
- if (error != 0) {
- break;
- }
- } else {
- /* Non-epoll file: put it on tfile_check_list (once), */
- /* the list that reverse_path_check() iterates later. */
- if (list_empty(&epi->ffd.file->f_tfile_llink))
- list_add(&epi->ffd.file->f_tfile_llink,
- &tfile_check_list);
- }
- }
- mutex_unlock(&ep->mtx);
-
- return error;
- }
喚醒風暴檢測(reverse_path_check)
當文件狀態發生改變時,會喚醒監聽在其上的epoll文件,而這個epoll文件還可能喚醒其餘的epoll文件,這種連續的喚醒就造成了一個喚醒路徑,全部的喚醒路徑就造成了一個有向圖。若是文件對應的epoll喚醒有向圖的節點過多,那麼文件狀態的改變就會喚醒全部的這些epoll(可能會喚醒不少進程,這樣的開銷是很大的),而實際上一個文件通過少數epoll處理之後就可能從就緒轉到未就緒,剩餘的epoll雖然認爲文件已就緒而實際上通過某些處理後已不可用。epoll的實現中考慮到了此問題,在每次添加新文件到epoll中時,就會首先檢查是否會出現這樣的喚醒風暴。
該函數的實現邏輯是這樣的,遞歸調用reverse_path_check_proc遍歷監聽在當前文件上的epoll文件,在reverse_path_check_proc中統計並檢查不同路徑深度上epoll的個數,從而避免產生喚醒風暴。
- #define PATH_ARR_SIZE 5
- /* path_limits[d]: largest count path_count_inc() accepts at nesting depth d. */
- /* Index 0 is never checked, because path_count_inc() returns early for depth 0. */
-
-
-
-
-
- static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
- /* Running per-depth counters, zeroed by path_count_init(). */
- static int path_count[PATH_ARR_SIZE];
-
- /* Accounts one path at depth nests; returns -1 once the counter for that depth exceeds its limit, 0 otherwise. */
- static int path_count_inc(int nests)
- {
- /* Depth 0 is neither counted nor limited. */
- if (nests != 0) {
- path_count[nests] += 1;
- if (path_count[nests] > path_limits[nests]) {
- return -1;
- }
- }
- return 0;
- }
-
- /* Resets every per-depth path counter to zero. */
- static void path_count_init(void)
- {
- int depth = PATH_ARR_SIZE;
- while (depth-- > 0) {
- path_count[depth] = 0;
- }
- }
-
-
- static int reverse_path_check(void)
- {
- int error = 0;
- struct file *current_file;
-
-
- /* Check every file queued on tfile_check_list; stop at the first failure. */
- list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
- /* Counters are per start file, so reset them first. */
- path_count_init();
- /* The file serves as both priv and cookie; the context is the current task. */
- error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
- reverse_path_check_proc, current_file,
- current_file, current);
- if (error) {
- break;
- }
- }
- return error;
- }
- static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
- {
- int error = 0;
- struct file *file = priv;
- struct file *child_file;
- struct epitem *epi;
- /* Visit every epitem registered on file, i.e. every epoll instance watching it. */
- list_for_each_entry(epi, &file->f_ep_links, fllink) {
- /* child_file is the epoll file that owns this epitem. */
- child_file = epi->ep->file;
- if (is_file_epoll(child_file)) {
- if (list_empty(&child_file->f_ep_links)) {
- /* Nobody watches this epoll file: the path ends here, */
- /* so count it at the current depth and fail when the limit is exceeded. */
- if (path_count_inc(call_nests)) {
- error = -1;
- break;
- }
- } else {
- /* The epoll file is itself being watched: */
- /* follow the path one level further through ep_call_nested(). */
- error = ep_call_nested(&poll_loop_ncalls,
- EP_MAX_NESTS,
- reverse_path_check_proc,
- child_file, child_file,
- current);
- }
- if (error != 0) {
- break;
- }
- } else {
- /* Only logged; the walk continues with the next epitem. */
- printk(KERN_ERR "reverse_path_check_proc: "
- "file is not an ep!\n");
- }
- }
- return error;
- }
epoll 的喚醒過程
- static void ep_poll_safewake(wait_queue_head_t *wq)
- {
- int this_cpu = get_cpu();
- /* Wake wq through ep_call_nested(): the CPU number is the context and wq is the cookie. */
- ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
- ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
-
- put_cpu();
- }
-
- static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
- { /* ep_call_nested() callback: cookie is the wait queue head to wake; priv is unused. */
- ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
- 1 + call_nests);
- return 0;
- }
-
- static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
- unsigned long events, int subclass)
- {
- /* In this variant subclass is not used; the queue is simply woken with events as the key. */
-
-
- wake_up_poll(wqueue, events);
- }
epoll_ctl
-
-
- /* epoll_ctl(2): add, remove or modify the registration of fd in the epoll instance epfd. */
- /* Returns 0 on success or a negative errno: -EFAULT, -EBADF, -EPERM, -EINVAL, -ELOOP, -EEXIST, -ENOENT, or the error of ep_insert()/ep_remove()/ep_modify(). */
- SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
- struct epoll_event __user *, event)
- {
- int error;
- int did_lock_epmutex = 0;
- struct file *file, *tfile;
- struct eventpoll *ep;
- struct epitem *epi;
- struct epoll_event epds;
- /* Copy the event from user space only for operations that carry one. */
- error = -EFAULT;
- if (ep_op_has_event(op) &&
- copy_from_user(&epds, event, sizeof(struct epoll_event))) {
- goto error_return;
- }
- /* Take a reference on the epoll file ... */
- error = -EBADF;
- file = fget(epfd);
- if (!file) {
- goto error_return;
- }
- /* ... and on the target file. */
- tfile = fget(fd);
- if (!tfile) {
- goto error_fput;
- }
- /* The target must provide f_op->poll. */
- error = -EPERM;
- if (!tfile->f_op || !tfile->f_op->poll) {
- goto error_tgt_fput;
- }
- /* epfd must be an epoll file and cannot be added to itself. */
- error = -EINVAL;
- if (file == tfile || !is_file_epoll(file)) {
- goto error_tgt_fput;
- }
- ep = file->private_data;
- /* ADD and DEL additionally hold the global epmutex. */
- if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
- mutex_lock(&epmutex);
- did_lock_epmutex = 1;
- }
- if (op == EPOLL_CTL_ADD) {
- if (is_file_epoll(tfile)) {
- error = -ELOOP;
- if (ep_loop_check(ep, tfile) != 0) {
- /* ep_loop_check_proc() may already have queued files on tfile_check_list; */
- /* empty it so that no stale entries survive this failed call. */
- clear_tfile_check_list();
- goto error_tgt_fput;
- }
- } else {
- /* Plain file: queue it for the reverse_path_check() performed in ep_insert(). */
- list_add(&tfile->f_tfile_llink, &tfile_check_list);
- }
- }
- mutex_lock_nested(&ep->mtx, 0);
- /* Look up an existing registration of (tfile, fd). */
- epi = ep_find(ep, tfile, fd);
- error = -EINVAL;
- switch (op) {
- case EPOLL_CTL_ADD:
- if (!epi) {
- /* POLLERR and POLLHUP are always monitored. */
- epds.events |= POLLERR | POLLHUP;
- error = ep_insert(ep, &epds, tfile, fd);
- } else {
- error = -EEXIST;
- }
- /* The check list is only needed during the insert. */
- clear_tfile_check_list();
- break;
- case EPOLL_CTL_DEL:
- if (epi) {
- error = ep_remove(ep, epi);
- } else {
- error = -ENOENT;
- }
- break;
- case EPOLL_CTL_MOD:
- if (epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_modify(ep, epi, &epds);
- } else {
- error = -ENOENT;
- }
- break;
- }
- mutex_unlock(&ep->mtx);
- error_tgt_fput:
- if (did_lock_epmutex) {
- mutex_unlock(&epmutex);
- }
- fput(tfile);
- error_fput:
- fput(file);
- error_return:
- return error;
- }
EPOLL_CTL_ADD 實現
-
- static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
- struct file *tfile, int fd)
- {
- int error, revents, pwake = 0;
- unsigned long flags;
- long user_watches;
- struct epitem *epi;
- struct ep_pqueue epq;
- /* Registers tfile/fd in ep. Returns 0, or -ENOSPC / -ENOMEM / -EINVAL on failure. */
-
-
-
-
-
-
- /* Refuse once the user's watch count has reached max_user_watches. */
- user_watches = atomic_long_read(&ep->user->epoll_watches);
- if (unlikely(user_watches >= max_user_watches)) {
- return -ENOSPC;
- }
-
- /* Allocate the epitem from its slab cache. */
- if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) {
- return -ENOMEM;
- }
- /* Initialise list links, owner, target and the copy of the user's event. */
- INIT_LIST_HEAD(&epi->rdllink);
- INIT_LIST_HEAD(&epi->fllink);
- INIT_LIST_HEAD(&epi->pwqlist);
- epi->ep = ep;
-
- ep_set_ffd(&epi->ffd, tfile, fd);
-
- epi->event = *event;
- epi->nwait = 0;
- epi->next = EP_UNACTIVE_PTR;
-
- /* Poll table whose callback is ep_ptable_queue_proc; epq carries the epitem to it. */
- epq.epi = epi;
- init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
- /* Requested events become the table's key. */
- epq.pt._key = event->events;
-
- /* Poll the target once: returns the current ready mask and lets the file call the callback. */
- revents = tfile->f_op->poll(tfile, &epq.pt);
-
- /* NOTE(review): nwait is presumably set negative by ep_ptable_queue_proc (not shown) on allocation failure. */
- error = -ENOMEM;
- if (epi->nwait < 0) {
- goto error_unregister;
- }
- /* Link the item on the target file's f_ep_links list, under the file's f_lock. */
- spin_lock(&tfile->f_lock);
- list_add_tail(&epi->fllink, &tfile->f_ep_links);
- spin_unlock(&tfile->f_lock);
-
- /* Insert the item into the instance's red-black tree. */
- ep_rbtree_insert(ep, epi);
-
- /* With the new link in place, reject the insert if the wake-up path limits are exceeded. */
- error = -EINVAL;
- if (reverse_path_check()) {
- goto error_remove_epi;
- }
-
- spin_lock_irqsave(&ep->lock, flags);
-
- /* Target already ready and item not yet on the ready list: queue it and wake waiters. */
- if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
-
- /* Wake tasks sleeping in ep_poll(). */
- if (waitqueue_active(&ep->wq))
-
- {
- wake_up_locked(&ep->wq);
- }
- /* Pollers of the epoll file itself are woken after the lock is released. */
- if (waitqueue_active(&ep->poll_wait)) {
- pwake++;
- }
- }
-
- spin_unlock_irqrestore(&ep->lock, flags);
-
- atomic_long_inc(&ep->user->epoll_watches);
-
- if (pwake)
-
- {
- ep_poll_safewake(&ep->poll_wait);
- }
-
- return 0;
- /* Error path: undo the f_ep_links link and the tree insert. */
- error_remove_epi:
- spin_lock(&tfile->f_lock);
-
- if (ep_is_linked(&epi->fllink)) {
- list_del_init(&epi->fllink);
- }
- spin_unlock(&tfile->f_lock);
-
-
- rb_erase(&epi->rbn, &ep->rbr);
-
- error_unregister:
- /* NOTE(review): presumably removes the wait entries queued by the poll call above - helper not shown. */
- ep_unregister_pollwait(ep, epi);
-
-
-
-
-
-
-
- /* The item may already have been put on the ready list; unlink it before freeing. */
- spin_lock_irqsave(&ep->lock, flags);
- if (ep_is_linked(&epi->rdllink)) {
- list_del_init(&epi->rdllink);
- }
- spin_unlock_irqrestore(&ep->lock, flags);
-
-
- kmem_cache_free(epi_cache, epi);
-
- return error;
- }
EPOLL_CTL_DEL
EPOLL_CTL_DEL 的實現調用的是 ep_remove 函數,函數只是清除ADD時, 添加的各類結構,EPOLL_CTL_MOD 的實現調用的是ep_modify,在ep_modify中用新的事件掩碼調用f_ops->poll,檢測事件是否已可用,若是可用就直接喚醒epoll,這兩個的實現與EPOLL_CTL_ADD 相似,代碼上比較清晰,這裏就不具體分析了。
- static int ep_remove(struct eventpoll *ep, struct epitem *epi)
- {
- unsigned long flags;
- struct file *file = epi->ffd.file;
- /* Unlinks epi from everything it was attached to, frees it and always returns 0. */
-
-
-
-
-
-
-
- /* NOTE(review): presumably removes the item's wait entries from the target file - helper not shown. */
- ep_unregister_pollwait(ep, epi);
-
- /* Unlink from the target file's f_ep_links list, under the file's f_lock. */
- spin_lock(&file->f_lock);
- if (ep_is_linked(&epi->fllink))
- list_del_init(&epi->fllink);
- spin_unlock(&file->f_lock);
- /* Remove from the instance's red-black tree. */
- rb_erase(&epi->rbn, &ep->rbr);
- /* Remove from the ready list if queued there. */
- spin_lock_irqsave(&ep->lock, flags);
- if (ep_is_linked(&epi->rdllink))
- list_del_init(&epi->rdllink);
- spin_unlock_irqrestore(&ep->lock, flags);
-
-
- kmem_cache_free(epi_cache, epi);
- /* One watch fewer for the owning user. */
- atomic_long_dec(&ep->user->epoll_watches);
-
- return 0;
- }
-
-
-
-
- static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
- {
- int pwake = 0;
- unsigned int revents;
- poll_table pt;
- /* NULL callback: the poll call below only reads the state, poll_wait() will not queue anything. */
- init_poll_funcptr(&pt, NULL);
-
-
-
-
-
- /* Store the new event mask and user data in the item; the mask is also the table's key. */
- epi->event.events = event->events;
- pt._key = event->events;
- epi->event.data = event->data;
-
-
-
-
- /* Read the target's current ready mask. */
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
-
-
-
-
- /* Already ready for the new mask: queue the item (if not queued) and wake waiters. */
- if (revents & event->events) {
- spin_lock_irq(&ep->lock);
- if (!ep_is_linked(&epi->rdllink)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
-
-
- if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
- if (waitqueue_active(&ep->poll_wait))
- pwake++;
- }
- spin_unlock_irq(&ep->lock);
- }
-
- /* Pollers of the epoll file itself are woken outside ep->lock. */
- if (pwake)
- ep_poll_safewake(&ep->poll_wait);
-
- return 0;
- }
epoll_wait
-
-
-
-
- SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
- int, maxevents, int, timeout)
- {
- int error;
- struct file *file;
- struct eventpoll *ep;
-
- /* maxevents must lie in 1..EP_MAX_EVENTS. */
- if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) {
- return -EINVAL;
- }
- /* The whole user buffer must be writable. */
- if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
- error = -EFAULT;
- goto error_return;
- }
-
- /* Take a reference on the epoll file. */
- error = -EBADF;
- file = fget(epfd);
- if (!file) {
- goto error_return;
- }
- /* epfd must refer to an epoll file. */
- error = -EINVAL;
- if (!is_file_epoll(file)) {
- goto error_fput;
- }
-
- ep = file->private_data;
-
- /* ep_poll() waits for events and copies them out; its result is returned as is. */
- error = ep_poll(ep, events, maxevents, timeout);
-
- error_fput:
- fput(file);
- error_return:
-
- return error;
- }
-
- static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
- int maxevents, long timeout)
- {
- int res = 0, eavail, timed_out = 0;
- unsigned long flags;
- long slack = 0;
- wait_queue_t wait;
- ktime_t expires, *to = NULL;
- /* timeout > 0: absolute deadline; timeout == 0: no sleeping; timeout < 0: to stays NULL. */
- if (timeout > 0) {
-
- struct timespec end_time = ep_set_mstimeout(timeout);
-
- slack = select_estimate_accuracy(&end_time);
- to = &expires;
- *to = timespec_to_ktime(end_time);
- } else if (timeout == 0) {
- /* Skip the wait loop; the lock is taken here because check_events expects it held. */
- timed_out = 1;
- spin_lock_irqsave(&ep->lock, flags);
- goto check_events;
- }
-
- fetch_events:
- spin_lock_irqsave(&ep->lock, flags);
-
- /* Sleep only when nothing is available yet. */
- if (!ep_events_available(ep)) {
-
- /* Queue the current task on ep->wq as an exclusive waiter. */
- init_waitqueue_entry(&wait, current);
- __add_wait_queue_exclusive(&ep->wq, &wait);
-
- for (;;) {
- /* The task state is set before the conditions are tested, all under ep->lock. */
-
-
-
-
- set_current_state(TASK_INTERRUPTIBLE);
- if (ep_events_available(ep) || timed_out) {
- break;
- }
- if (signal_pending(current)) {
- res = -EINTR;
- break;
- }
- /* The lock is dropped while sleeping. */
- spin_unlock_irqrestore(&ep->lock, flags);
- /* A zero return is taken as timeout expiry. */
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
- timed_out = 1;
- }
-
- spin_lock_irqsave(&ep->lock, flags);
- }
-
- __remove_wait_queue(&ep->wq, &wait);
-
- set_current_state(TASK_RUNNING);
- }
- check_events:
- /* Sample availability under the lock, then release it. */
- eavail = ep_events_available(ep);
-
- spin_unlock_irqrestore(&ep->lock, flags);
-
- /* No error and events seem available: try to deliver them. */
- /* If ep_send_events() delivers nothing and the timeout has not expired, wait again. */
-
-
-
- if (!res && eavail
- && !(res = ep_send_events(ep, events, maxevents))
- && !timed_out)
- {
- goto fetch_events;
- }
-
- return res;
- }
-
-
- /* Returns non-zero when the ready list is non-empty or ovflist differs from EP_UNACTIVE_PTR. */
- static inline int ep_events_available(struct eventpoll *ep)
- {
- if (!list_empty(&ep->rdllist)) {
- return 1;
- }
- return ep->ovflist != EP_UNACTIVE_PTR;
- }
-
- struct ep_send_events_data {
- int maxevents; /* upper bound on events ep_send_events_proc() copies out */
- struct epoll_event __user *events; /* user buffer the events are written to */
- };
-
- /* Packs the user buffer and its capacity, then has ep_scan_ready_list() run ep_send_events_proc() with depth argument 0. */
- static int ep_send_events(struct eventpoll *ep,
- struct epoll_event __user *events, int maxevents)
- {
- struct ep_send_events_data args = {
- .maxevents = maxevents,
- .events = events,
- };
- return ep_scan_ready_list(ep, ep_send_events_proc, &args, 0);
- }
- /*
-  * ep_send_events_proc - ep_scan_ready_list() callback that copies events out.
-  *
-  * @ep:   the eventpoll being scanned
-  * @head: list of candidate items (linked through epitem.rdllink)
-  * @priv: struct ep_send_events_data with the user buffer and its capacity
-  *
-  * Returns the number of events copied to user space, or -EFAULT when the
-  * very first copy faults.
-  */
- static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
- void *priv)
- {
- struct ep_send_events_data *esed = priv;
- int eventcnt;
- unsigned int revents;
- struct epitem *epi;
- struct epoll_event __user *uevent;
- 
- /* Consume items from @head until it is empty or maxevents were copied. */
- for (eventcnt = 0, uevent = esed->events;
- !list_empty(head) && eventcnt < esed->maxevents;) {
- epi = list_first_entry(head, struct epitem, rdllink);
- 
- list_del_init(&epi->rdllink);
- /* Re-poll the file to get its current state, keeping only the bits the
-  * user registered for. The poll_table is NULL, so poll_wait() (which
-  * requires a non-NULL table) will not register any wait-queue entry. */
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
- epi->event.events;
- 
- 
- 
- 
- 
- 
- /* When revents is 0 the item has already been unlinked above and is
-  * simply not reported. */
- if (revents) {
- /* Copy the mask and the user data out. On a fault, put the item back at
-  * the front of @head and return the count copied so far, or -EFAULT if
-  * nothing was copied yet. */
- if (__put_user(revents, &uevent->events) ||
- __put_user(epi->event.data, &uevent->data)) {
- list_add(&epi->rdllink, head);
- return eventcnt ? eventcnt : -EFAULT;
- }
- eventcnt++;
- uevent++;
- if (epi->event.events & EPOLLONESHOT) {
- epi->event.events &= EP_PRIVATE_BITS;
- } else if (!(epi->event.events & EPOLLET)) {
- 
- /* Neither one-shot nor edge-triggered (i.e. level-triggered): re-queue
-  * the item at the tail of ep->rdllist so it is examined again later. */
- list_add_tail(&epi->rdllink, &ep->rdllist);
- }
- 
- 
- }
- }
- 
- return eventcnt;
- }
eventpoll_poll
由於epoll自身也是文件系統,其描述符也可以被poll/select/epoll監視,因此需要實現poll方法。
- static const struct file_operations eventpoll_fops = { /* file operations table named for eventpoll; installed on the epoll file presumably at creation (not shown here) */
- .release = ep_eventpoll_release,
- .poll = ep_eventpoll_poll, /* poll hook, so an epoll descriptor can itself be monitored */
- .llseek = noop_llseek,
- };
- /*
-  * ep_eventpoll_poll - f_op->poll implementation for the epoll file itself.
-  *
-  * Registers the caller on ep->poll_wait and returns the flags computed by
-  * ep_poll_readyevents_proc() (run through ep_call_nested()), or 0 when
-  * ep_call_nested() returns -1.
-  */
- static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
- {
- int pollflags;
- struct eventpoll *ep = file->private_data;
- /* Hook the caller onto ep->poll_wait, the queue used for pollers of the
-  * epoll file. */
- poll_wait(file, &ep->poll_wait, wait);
- 
- 
- 
- /* Scan the ready list through ep_call_nested() with an EP_MAX_NESTS limit
-  * - presumably to bound recursion when epoll files monitor other epoll
-  * files; confirm against ep_call_nested(). */
- pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
- ep_poll_readyevents_proc, ep, ep, current);
- /* A -1 result from ep_call_nested() is reported as "no events". */
- return pollflags != -1 ? pollflags : 0;
- }
- /*
-  * Callback given to ep_call_nested() by ep_eventpoll_poll(): @priv is the
-  * eventpoll to scan. Runs ep_read_events_proc() over its ready list and
-  * passes call_nests + 1 as the depth argument of ep_scan_ready_list().
-  * @cookie is not used here.
-  */
- static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
- {
- return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
- }
- /*
-  * ep_scan_ready_list - run a callback over a private snapshot of the ready list.
-  *
-  * @ep:    the eventpoll to scan
-  * @sproc: callback invoked as sproc(ep, &txlist, priv) on the snapshot
-  * @priv:  opaque argument forwarded to @sproc
-  * @depth: passed to mutex_lock_nested() as the nesting level for ep->mtx
-  *
-  * Returns the value returned by @sproc.
-  */
- static int ep_scan_ready_list(struct eventpoll *ep,
- int (*sproc)(struct eventpoll *,
- struct list_head *, void *),
- void *priv,
- int depth)
- {
- int error, pwake = 0;
- unsigned long flags;
- struct epitem *epi, *nepi;
- LIST_HEAD(txlist);
- 
- 
- 
- 
- /* ep->mtx is held for the whole scan, including the callback. */
- mutex_lock_nested(&ep->mtx, depth);
- 
- spin_lock_irqsave(&ep->lock, flags);
- /* Move every ready item onto the private txlist, leaving rdllist empty. */
- list_splice_init(&ep->rdllist, &txlist);
- 
- 
- 
- 
- 
- /* Switch ovflist from the EP_UNACTIVE_PTR sentinel to an empty (NULL)
-  * chain. Presumably the event callback (not shown in this excerpt) then
-  * chains newly ready items on ovflist while the callback below runs -
-  * confirm against ep_poll_callback(). */
- ep->ovflist = NULL;
- spin_unlock_irqrestore(&ep->lock, flags);
- 
- /* Invoke the callback on the snapshot with the spinlock released. */
- error = (*sproc)(ep, &txlist, priv);
- 
- spin_lock_irqsave(&ep->lock, flags);
- 
- /* Drain the ovflist chain, resetting each item's next pointer to
-  * EP_UNACTIVE_PTR as it is visited. */
- for (nepi = ep->ovflist; (epi = nepi) != NULL;
- nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
- 
- /* Only queue items whose rdllink is not already linked into a list. */
- if (!ep_is_linked(&epi->rdllink)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
- }
- }
- /* Restore the sentinel: ovflist is no longer in use. */
- ep->ovflist = EP_UNACTIVE_PTR;
- 
- /* Give back whatever the callback left on txlist. */
- list_splice(&txlist, &ep->rdllist);
- 
- if (!list_empty(&ep->rdllist)) {
- /* Something is ready: wake sleepers on ep->wq right away (ep->lock is
-  * held, hence the _locked variant). */
- if (waitqueue_active(&ep->wq)) {
- wake_up_locked(&ep->wq);
- }
- /* Waiters on ep->poll_wait are only counted here; their wakeup is issued
-  * after both locks are dropped. */
- if (waitqueue_active(&ep->poll_wait)) {
- pwake++;
- }
- }
- spin_unlock_irqrestore(&ep->lock, flags);
- 
- mutex_unlock(&ep->mtx);
- 
- if (pwake) {
- /* Deferred wakeup of ep->poll_wait, now outside ep->lock and ep->mtx. */
- ep_poll_safewake(&ep->poll_wait);
- }
- 
- return error;
- }
- /*
-  * ep_read_events_proc - ep_scan_ready_list() callback used when the epoll
-  * file itself is polled.
-  *
-  * Walks @head and returns POLLIN | POLLRDNORM as soon as one item still
-  * reports an event it was registered for; items that report none are
-  * removed from the list. Returns 0 when no item qualifies. @ep and @priv
-  * are not used here.
-  */
- static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
- void *priv)
- {
- struct epitem *epi, *tmp;
- poll_table pt;
- init_poll_funcptr(&pt, NULL); /* NULL queue proc: presumably leaves pt._qproc unset so poll_wait() registers nothing - confirm against init_poll_funcptr() */
- list_for_each_entry_safe(epi, tmp, head, rdllink) {
- pt._key = epi->event.events;
- if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
- epi->event.events) {
- return POLLIN | POLLRDNORM;
- } else {
- 
- 
- 
- /* The item no longer reports any requested event: drop it from the
-  * list being scanned. */
- list_del_init(&epi->rdllink);
- }
- }
- return 0;
- }
epoll全景
以下是epoll使用的所有數據結構之間的關係圖,採用的是一種類UML圖,希望對理解epoll的內部實現有所幫助。
![點擊查看原始大小圖片](http://static.javashuo.com/static/loading.gif)
poll/select/epoll 對比
通過以上的分析可以看出,poll和select的實現基本一致,只是用戶到內核傳遞的數據格式有所不同。
select和poll即使只有一個描述符就緒,也要遍歷整個集合。如果集合中活躍的描述符很少,遍歷過程的開銷就會變得很大,而如果集合中大部分的描述符都是活躍的,遍歷過程的開銷又可以忽略。
epoll的實現中每次只遍歷活躍的描述符(如果是水平觸發,也會遍歷先前活躍的描述符),在活躍描述符較少的情況下就會很有優勢。在代碼的分析過程中可以看到,epoll的實現較爲複雜,並且其實現過程中需要同步處理(鎖),如果大部分描述符都是活躍的,epoll的效率可能不如select或poll。(參見epoll和poll的性能測試 http://jacquesmattheij.com/Poll+vs+Epoll+once+again)
select可以處理的最大fd無法超出FD_SETSIZE。
select會覆寫傳入的fd_set指針所指向的集合,而poll對每個fd返回一個掩碼,不更改原來的掩碼,從而可以對同一個集合多次調用poll,而無需調整。
select對每個文件描述符最多使用3個bit,而poll採用的pollfd需要使用64個bit,epoll採用的epoll_event則需要96個bit。
如果事件需要循環處理,select、poll每一次的處理都要將所有的數據複製到內核,而epoll的實現中,內核將持久維護加入的描述符,減少了內核和用戶空間之間複製數據的開銷。