epoll 是 Linux 中最常見的 I/O 複用機制,它的高效簡潔是其他兩個(select、poll)不能比擬的。它解決了以前的大量 fd 問題,同時針對 poll 的效率問題作了改進:它利用內核去保存傳入的 fd,而不是在每次等待時才傳入並保存 fd;另外它也不是將 current 輪流掛入每個 fd 的等待隊列中,而是在設備的等待隊列被喚醒時調用一個回調函數。
我們來看看源碼:
/*
 * sys_epoll_create - system-call entry that creates a new eventpoll file.
 *
 * @size: caller-supplied size; only checked for being strictly positive here.
 *
 * On success returns the new file descriptor obtained from ep_getfd().
 * On failure returns -EINVAL for a non-positive @size, or the error code
 * reported by ep_getfd() / ep_file_init().
 */
asmlinkage long sys_epoll_create(int size)
{
	int rc, fd;
	struct inode *inode;
	struct file *file;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
		     current, size));

	/* Reject a non-positive size before allocating anything. */
	if (size <= 0) {
		rc = -EINVAL;
		goto out_fail;
	}

	/*
	 * Obtain everything an eventpoll file needs: a file structure,
	 * an inode and a free file descriptor.
	 */
	rc = ep_getfd(&fd, &inode, &file);
	if (rc)
		goto out_fail;

	/* Attach the internal "struct eventpoll" to the new file. */
	rc = ep_file_init(file);
	if (rc) {
		/* The descriptor already exists, so close it again. */
		sys_close(fd);
		goto out_fail;
	}

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
		     current, size, fd));

	return fd;

out_fail:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
		     current, size, rc));
	return rc;
}
我們再來看看 sys_epoll_ctl():
/*
 * sys_epoll_ctl - system-call entry that adds, removes or modifies a
 * target descriptor inside an eventpoll instance.
 *
 * @epfd:  descriptor of the eventpoll file
 * @op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD
 * @fd:    target descriptor being operated on
 * @event: user-space event description, copied in only when
 *         EP_OP_HASH_EVENT(op) is true
 *
 * Returns the result of ep_insert() / ep_remove() / ep_modify(), or:
 *   -EFAULT  copying @event from user space failed
 *   -EBADF   @epfd or @fd has no file behind it
 *   -EPERM   the target file has no poll operation
 *   -EINVAL  @epfd and @fd are the same file, @epfd is not an eventpoll
 *            file, or @op is not one of the three known operations
 *   -EEXIST  ADD of an item that is already present
 *   -ENOENT  DEL / MOD of an item that is not present
 */
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
	int rc;
	struct file *epfile, *target;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event uev;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
		     current, epfd, op, fd, event));

	/* Copy the event description in from user space when the op uses it. */
	rc = -EFAULT;
	if (EP_OP_HASH_EVENT(op) &&
	    copy_from_user(&uev, event, sizeof(struct epoll_event)))
		goto out;

	/* Resolve both descriptors to their file structures. */
	rc = -EBADF;
	epfile = fget(epfd);
	if (!epfile)
		goto out;

	target = fget(fd);
	if (!target)
		goto out_put_epfile;

	/* The target must provide a poll operation. */
	rc = -EPERM;
	if (!(target->f_op && target->f_op->poll))
		goto out_put_target;

	/* An eventpoll file may not watch itself, and epfd must be one. */
	rc = -EINVAL;
	if (epfile == target || !IS_FILE_EPOLL(epfile))
		goto out_put_target;

	/* Private data that was attached when the eventpoll file was created. */
	ep = epfile->private_data;

	down_write(&ep->sem);

	/* Look the (file, fd) pair up among the items already registered. */
	epi = ep_find(ep, target, fd);

	/* Stays -EINVAL when op matches none of the branches below. */
	rc = -EINVAL;
	if (op == EPOLL_CTL_ADD) {
		if (epi) {
			rc = -EEXIST;
		} else {
			/* POLLERR and POLLHUP are always added to the mask. */
			uev.events |= POLLERR | POLLHUP;
			rc = ep_insert(ep, &uev, target, fd);
		}
	} else if (op == EPOLL_CTL_DEL) {
		if (epi)
			rc = ep_remove(ep, epi);
		else
			rc = -ENOENT;
	} else if (op == EPOLL_CTL_MOD) {
		if (epi) {
			uev.events |= POLLERR | POLLHUP;
			rc = ep_modify(ep, epi, &uev);
		} else {
			rc = -ENOENT;
		}
	}

	/* Release the item returned by ep_find(), if there was one. */
	if (epi)
		ep_release_epitem(epi);

	up_write(&ep->sem);

out_put_target:
	fput(target);
out_put_epfile:
	fput(epfile);
out:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
		     current, epfd, op, fd, event, rc));

	return rc;
}
我們可以再去看看 ep_find:
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; unsigned long flags; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; EP_SET_FFD(&ffd, file, fd); read_lock_irqsave(&ep->lock, flags); for (rbp = ep->rbr.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = EP_CMP_FFD(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) rbp = rbp->rb_left; else { ep_use_epitem(epi); epir = epi; break; } } read_unlock_irqrestore(&ep->lock, flags); DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n", current, file, epir)); return epir; }
基礎的看完了,來看看核心的 sys_epoll_wait:
/*
 * sys_epoll_wait - system-call entry that waits for events on an
 * eventpoll instance and copies them to user space.
 *
 * @epfd:      descriptor of the eventpoll file
 * @events:    user-space array that receives the ready events
 * @maxevents: capacity of @events in entries; must be positive and small
 *             enough that the byte size of the array fits in size_t
 * @timeout:   handed unchanged to ep_poll()
 *
 * Returns the value of ep_poll() once all checks pass, otherwise:
 *   -EINVAL  @maxevents is out of range, or @epfd is not an eventpoll file
 *   -EBADF   @epfd has no file behind it
 *   the error from verify_area() when @events is not writable
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
	int error;
	struct file *file;
	struct eventpoll *ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
		     current, epfd, events, maxevents, timeout));

	/*
	 * The event count must be positive. It is also bounded from above
	 * so that "maxevents * sizeof(struct epoll_event)" below cannot wrap
	 * around size_t; without the bound, verify_area() could end up
	 * checking a much smaller region than the one later written to.
	 */
	if (maxevents <= 0 ||
	    (size_t)maxevents > ~(size_t)0 / sizeof(struct epoll_event))
		return -EINVAL;

	/* Verify that the user-supplied area is writable. */
	if ((error = verify_area(VERIFY_WRITE, events,
				 maxevents * sizeof(struct epoll_event))))
		goto eexit_1;

	/* Get the "struct file *" behind epfd. */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto eexit_1;

	/* Make sure the descriptor really refers to an eventpoll file. */
	error = -EINVAL;
	if (!IS_FILE_EPOLL(file))
		goto eexit_2;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/* Enter the wait loop. */
	error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
	fput(file);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
		     current, epfd, events, maxevents, timeout, error));

	return error;
}
進入 ep_poll:
/*
 * ep_poll - wait until the ready list of @ep is non-empty (or the timeout
 * expires, or a signal is pending) and then hand ready events to user space.
 *
 * @ep:        eventpoll instance to wait on
 * @events:    user-space array passed through to ep_events_transfer()
 * @maxevents: capacity of @events, passed through to ep_events_transfer()
 * @timeout:   timeout in milliseconds; -1 means wait without limit
 *
 * Returns the value of ep_events_transfer() when events were available,
 * -EINTR when a signal was pending while waiting, or 0 when the wait
 * ended with nothing on the ready list.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Calculate the timeout by checking for the "infinite" value ( -1 )
	 * and the overflow condition. The passed timeout is in milliseconds,
	 * hence the conversion (t * HZ) / 1000; the "+ 999" rounds a partial
	 * tick up instead of truncating it.
	 */
	jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
		MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;

	/* The wait loop restarts from here. */
retry:
	/* Take ep->lock as a writer (irqsave variant). */
	write_lock_irqsave(&ep->lock, flags);

	res = 0;
	if (list_empty(&ep->rdllist)) {
		/*
		 * Nothing is ready yet: queue the current task on ep->wq and
		 * sleep until something wakes it up (presumably the readiness
		 * callback, which is not part of this function).
		 */
		init_waitqueue_entry(&wait, current);
		add_wait_queue(&ep->wq, &wait);

		for (;;) {
			/*
			 * The state is set before the conditions are tested.
			 * The loop ends when the ready list is non-empty, the
			 * timeout has run out, or a signal is pending.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* Drop the lock while sleeping, retake it afterwards. */
			write_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);
			write_lock_irqsave(&ep->lock, flags);
		}
		remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/* Is there anything on the ready list? */
	eavail = !list_empty(&ep->rdllist);

	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. If the transfer reports 0
	 * and there is still time left, go back and wait again.
	 */
	if (!res && eavail &&
	    !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}
而在嘗試再次循環的時候有一個函數 ep_events_transfer,這就是向用戶返回就緒 fd 的函數:
/*
 * ep_events_transfer - move ready events from @ep to the user buffer.
 *
 * @ep:        eventpoll instance
 * @events:    user-space array that receives the events
 * @maxevents: passed to ep_collect_ready_items() as the collection limit
 *
 * Runs with ep->sem held for reading. Returns the value of
 * ep_send_events(), or 0 when ep_collect_ready_items() collected nothing.
 */
static int ep_events_transfer(struct eventpoll *ep, struct epoll_event __user *events, int maxevents)
{
	int sent = 0;
	struct list_head pending;

	INIT_LIST_HEAD(&pending);

	down_read(&ep->sem);

	/* Collect items from the ready list into the local transfer list. */
	if (ep_collect_ready_items(ep, &pending, maxevents) <= 0)
		goto unlock;

	/* Deliver the collected events to the user-space array. */
	sent = ep_send_events(ep, &pending, events);

	/*
	 * Hand the transfer list back for re-injection. NOTE(review): per
	 * the original author's note, this is where edge-triggered and
	 * level-triggered items differ -- items not flagged EPOLLET that
	 * still have events of interest go back onto the ready list.
	 * ep_reinject_items() is not shown here; confirm against it.
	 */
	ep_reinject_items(ep, &pending);

unlock:
	up_read(&ep->sem);

	return sent;
}
最後來總結一下 epoll 流程:
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *ev);
- 第一個參數是 epoll fd,也就是 epoll_create 返回的那個;
- 第二個則是控制字段,由三個宏實現:
  - EPOLL_CTL_ADD:將描述符 fd 添加到 epoll 實例的興趣列表中去。對於 fd 上我們感興趣的事件,都指定在 ev 所指向的結構體中。如果我們試圖向興趣列表中添加一個已存在的文件描述符,epoll_ctl() 將出現 EEXIST 錯誤。
  - EPOLL_CTL_MOD:修改描述符上設定的事件,需要用到由 ev 所指向的結構體中的信息。如果我們試圖修改不在興趣列表中的文件描述符,epoll_ctl() 將出現 ENOENT 錯誤。
  - EPOLL_CTL_DEL:將文件描述符 fd 從 epfd 的興趣列表中移除,該操作忽略參數 ev。如果我們試圖移除一個不在 epfd 的興趣列表中的文件描述符,epoll_ctl() 將出現 ENOENT 錯誤。關閉一個文件描述符會自動將其從所有的 epoll 實例的興趣列表中移除。
- 第三個則是事件 fd
- 第四個是事件結構體指針

      struct epoll_event {
          uint32_t events;   /* 一個位掩碼,它指定了我們在待檢查的描述符 fd 上所感興趣的事件集合 */
          epoll_data_t data; /* User data */
      };

      typedef union epoll_data {
          void *ptr;    /* Pointer to user-defined data */
          int fd;       /* File descriptor */
          uint32_t u32; /* 32-bit integer */
          uint64_t u64; /* 64-bit integer */
      } epoll_data_t;

- 而這個 event.events 可設置爲
  + EPOLLIN 可讀取非高優先級的數據
  + EPOLLPRI 可讀取高優先級的數據
  + EPOLLRDHUP 套接字對端關閉
  + EPOLLOUT 可寫事件
  + EPOLLET ET(邊緣觸發)模式
  + EPOLLONESHOT 完成事件通知後停用,一次性
  + EPOLLERR 錯誤事件
  + EPOLLHUP 出現掛斷
int epoll_wait(int epfd, struct epoll_event *evlist, int maxevents, int timeout);
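參數 timeout 用來確定 epoll_wait() 的阻塞行爲,有以下幾種:
- timeout 等於 -1:一直阻塞,直到興趣列表中的文件描述符上有事件產生,或者捕獲到一個信號爲止;
- timeout 等於 0:執行一次非阻塞式的檢查,立即返回;
- timeout 大於 0:最多阻塞 timeout 毫秒,直到有事件產生或者捕獲到一個信號爲止。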
返回值:調用成功時返回就緒的文件描述符個數(即 evlist 中被填充的元素個數);超時而沒有任何描述符就緒時返回 0;出錯時返回 -1,並設置 errno。
參考博客:
https://blog.csdn.net/huangjh...
https://blog.csdn.net/shansha...