理解了中斷、等待隊列、調度,你就能懂Linux的80%。
--老子linux
轉發的話,請註明出處哦:http://www.cnblogs.com/stonehat/
Linux系統內核提供了三個系統調用:
include/linux/syscalls.h
// epoll_create,建立epoll描述符 asmlinkage long sys_epoll_create(int size); // epoll_ctl, 操做epoll描述符,增刪改 asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event); // epoll_wait, 你懂的 asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, int timeout);
其函數實現在fs/eventpoll.c
eventpoll 本身也是一個支持poll操作的文件,所以可以把eventpoll組成一個樹形關係。
下面分別按照sys_epoll_create,sys_epoll_ctl,sys_epoll_wait的順序分析三個系統調用。
// eventpoll結構體 struct eventpoll { /* Protect the this structure access */ rwlock_t lock; /* * 同步用的內核信號量 */ struct rw_semaphore sem; /** * 等待隊列,epoll_wait()使用,將調用線程掛在這個隊列上。 */ wait_queue_head_t wq; /* 等待隊列,file->poll()會使用,在epoll中函數爲ep_eventpoll_poll */ wait_queue_head_t poll_wait; /* 就緒列表*/ struct list_head rdllist; /* 紅黑樹,維護了 */ struct rb_root rbr; };
// 內核中文件 struct file { struct list_head f_list; struct dentry *f_dentry; struct vfsmount *f_vfsmnt; //文件操做指針 struct file_operations *f_op; atomic_t f_count; unsigned int f_flags; mode_t f_mode; int f_error; loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; struct file_ra_state f_ra; unsigned long f_version; void *f_security; /* file中的私有自定義數據 */ void *private_data; #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; }; struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t); int (*readdir) (struct file *, void *, filldir_t); // 不阻塞,檢測file狀態(可讀、可寫等),若是條件不知足,pt將會被加到等待隊列中。(通常是這種邏輯,最終如何實現仍是要看設備驅動) unsigned int (*poll) (struct file *f, struct poll_table_struct *pt); int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, struct dentry *, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, 
unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*dir_notify)(struct file *filp, unsigned long arg); int (*flock) (struct file *, int, struct file_lock *); };
代碼如下:爲了方便理解原理,無關緊要的代碼邏輯和異常處理刪掉了。
/*
 * sys_epoll_create - abridged view of the epoll_create system call.
 *
 * The "....." markers stand for code the article left out; in this abridged
 * form the results stored in `error` are not checked.
 *
 * Returns the file descriptor obtained from ep_getfd().
 */
asmlinkage long sys_epoll_create(int size)
{
	int error, fd;
	struct inode *inode;
	struct file *file;

	.....

	/*
	 * Create a new file and inode and obtain the fd for that file.
	 * The file is also added to the current process's open-file table.
	 */
	error = ep_getfd(&fd, &inode, &file);

	/* Create the struct eventpoll and hang it on the file's private_data pointer. */
	error = ep_file_init(file);

	.....

	return fd;
}
ep_getfd簡單流程
/*
 * ep_getfd - set up the file, inode and descriptor behind a new epoll fd.
 *
 * @efd:    receives the newly allocated file descriptor
 * @einode: receives the inode obtained from ep_eventpoll_inode()
 * @efile:  receives the new struct file
 *
 * Returns 0 on success. On failure returns a negative error value
 * (-ENFILE, the PTR_ERR() of the inode, the get_unused_fd() result, or
 * -ENOMEM) after releasing whatever had been acquired so far.
 */
static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
{
	struct qstr this;
	char name[32];
	struct dentry *dentry;
	struct inode *inode;
	struct file *file;
	int error, fd;

	/* Get an ready to use file */
	error = -ENFILE;
	file = get_empty_filp();
	if (!file)
		goto eexit_1;

	/* Allocates an inode from the eventpoll file system */
	inode = ep_eventpoll_inode();
	error = PTR_ERR(inode);
	if (IS_ERR(inode))
		goto eexit_2;

	/* Allocates a free descriptor to plug the file onto */
	error = get_unused_fd();
	if (error < 0)
		goto eexit_3;
	fd = error;

	/*
	 * Link the inode to a directory entry by creating a unique name
	 * using the inode number.
	 */
	error = -ENOMEM;
	sprintf(name, "[%lu]", inode->i_ino);
	this.name = name;
	this.len = strlen(name);
	this.hash = inode->i_ino;
	dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
	if (!dentry)
		goto eexit_4;
	dentry->d_op = &eventpollfs_dentry_operations;
	d_add(dentry, inode);
	file->f_vfsmnt = mntget(eventpoll_mnt);
	file->f_dentry = dentry;
	file->f_mapping = inode->i_mapping;

	file->f_pos = 0;
	file->f_flags = O_RDONLY;
	/* The epoll operations table is what marks this file as an epoll file. */
	file->f_op = &eventpoll_fops;
	file->f_mode = FMODE_READ;
	file->f_version = 0;
	/* No eventpoll attached yet; this function leaves private_data NULL. */
	file->private_data = NULL;

	/* Install the new setup file into the allocated fd. */
	fd_install(fd, file);

	*efd = fd;
	*einode = inode;
	*efile = file;
	return 0;

	/* Error unwind: each label releases one resource, in reverse order. */
eexit_4:
	put_unused_fd(fd);
eexit_3:
	iput(inode);
eexit_2:
	put_filp(file);
eexit_1:
	return error;
}
查找一個沒有用的文件描述符。記爲fd。
建立一個空文件file結構體。記爲epfile
在epoll的文件系統中建立一個inode
epfile和inode作關聯。
epfile的f_op成員(文件操作指針)和epoll的自定義函數組eventpoll_fops作關聯。比較重要的一點是eventpoll_fops有一個自定義的poll函數,這個函數很重要,是實現epoll級聯模型的關鍵。後面可以通過比較f_op是否等於eventpoll_fops來判斷file是不是epoll file。
/*
 * File operations installed on every epoll file (see ep_getfd, which stores
 * &eventpoll_fops in file->f_op). Only release and poll are provided.
 */
static struct file_operations eventpoll_fops = {
	.release = ep_eventpoll_close,
	.poll = ep_eventpoll_poll
};
將epfile放到進程的打開文件列表中管理,用fd作索引。
初始化eventpoll結構,初始化等待隊列和就緒隊列等。
將epfile的private_data指向eventpoll結構。方便後面取eventpoll的數據。
返回給調用線程fd。
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) { int error; struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; struct epoll_event epds; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n", current, epfd, op, fd, event)); error = -EFAULT; // 1. 從用戶空間拷貝event數據。 if (EP_OP_HASH_EVENT(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) goto eexit_1; /* 2. 根據epollfile的文件描述符得到對應的file結構體,內核中fd和file是有一個映射關係的*/ error = -EBADF; file = fget(epfd); if (!file) goto eexit_1; /* 3. 得到要操做的描述符的file指針,例如socket描述符 */ tfile = fget(fd); if (!tfile) goto eexit_2; /* 4. 校驗tfile是否支持poll操做,必須支持poll才能使用epoll */ error = -EPERM; if (!tfile->f_op || !tfile->f_op->poll) goto eexit_3; /* * 5. 校驗是不是epoll的file指針 */ error = -EINVAL; if (file == tfile || !IS_FILE_EPOLL(file)) goto eexit_3; /* * 6. 取eventpoll,從建立時,咱們知道epoll把本身的eventpoll結構體放在file->private_data了裏面。 */ ep = file->private_data; down_write(&ep->sem); /* Try to lookup the file inside our hash table */ epi = ep_find(ep, tfile, fd); // 7. 具體的邏輯操做 error = -EINVAL; switch (op) { // 添加 case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; break; // 刪除 case EPOLL_CTL_DEL: if (epi) error = ep_remove(ep, epi); else error = -ENOENT; break; // 修改 case EPOLL_CTL_MOD: if (epi) { epds.events |= POLLERR | POLLHUP; error = ep_modify(ep, epi, &epds); } else error = -ENOENT; break; } /* * The function ep_find() increments the usage count of the structure * so, if this is not NULL, we need to release it. */ if (epi) ep_release_epitem(epi); up_write(&ep->sem); eexit_3: fput(tfile); eexit_2: fput(file); eexit_1: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n", current, epfd, op, fd, event, error)); return error; }
上面的邏輯很簡單
邏輯上,只需要了解添加即可。epoll的添加是理解整個流程的關鍵。
/*
 * ep_insert - register a target file with an epoll instance.
 *
 * @ep:    the epoll instance
 * @event: requested event mask and user data, copied into the new item
 * @tfile: the file to watch
 * @fd:    the descriptor number of @tfile
 *
 * Returns 0 on success, or -ENOMEM when either the item itself or a wait
 * queue entry (signalled through epi->nwait < 0) could not be allocated.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	struct epitem *epi;
	struct ep_pqueue epq;

	error = -ENOMEM;
	if (!(epi = EPI_MEM_ALLOC()))
		goto eexit_1;

	/* Item initialization follow here ... */
	EP_RB_INITNODE(&epi->rbn);
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->txlink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	EP_SET_FFD(&epi->ffd, tfile, fd);
	epi->event = *event;
	atomic_set(&epi->usecnt, 1);
	epi->nwait = 0;

	/*
	 * Initialize the poll table: when poll is called on the target it
	 * will invoke ep_ptable_queue_proc, which adds this item to the
	 * target's wait queue.
	 */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Poll the target file with epq.pt; the poll implementation ends up
	 * calling ep_ptable_queue_proc.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	if (epi->nwait < 0)
		goto eexit_2;

	/* Link this item into the target file's list of epoll hooks. */
	spin_lock(&tfile->f_ep_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_ep_lock);

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irqsave(&ep->lock, flags);

	/* Add the current item to the rb-tree */
	ep_rbtree_insert(ep, epi);

	/*
	 * If the file is already ready, queue the item on the ready list and
	 * wake both the epoll_wait wait queue and the poll wait queue.
	 */
	if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
		     current, ep, tfile, fd));

	return 0;

eexit_2:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue.
	 */
	write_lock_irqsave(&ep->lock, flags);
	if (EP_IS_LINKED(&epi->rdllink))
		EP_LIST_DEL(&epi->rdllink);
	write_unlock_irqrestore(&ep->lock, flags);

	EPI_MEM_FREE(epi);
eexit_1:
	return error;
}
整理一下,向epoll添加一個描述符主要步驟以下:
構建epitem,epitem以後會加入到eventpoll.rbr中。
調用init_poll_funcptr,將ep_ptable_queue_proc函數指針賦值給poll_table的qproc,poll_table記爲epq.pt。在file的poll函數中,可以傳入poll_table作爲參數,poll函數會主動調用poll_table的qproc函數。
poll_table的結構體以下:
/**
 * poll_queue_proc - callback type stored in a poll table.
 *
 * @param f      the file being polled
 * @param whead  wait queue head belonging to f
 * @param pt     the poll table the callback was taken from
 */
typedef void (*poll_queue_proc)(struct file *f, wait_queue_head_t *whead, struct poll_table_struct *pt);

/* Poll table: carries the queueing callback handed to a poll implementation. */
typedef struct poll_table_struct {
	poll_queue_proc qproc;
} poll_table;
poll函數原型
/*
 * Prototype of the poll operation.
 *
 * When the caller passes in a pt structure, the driver's poll function
 * calls poll_table_struct.qproc to do the work of adding an entry to the
 * wait queue.
 *
 * The declaration is on its own line so that the comment no longer hides it.
 */
unsigned int (*poll) (struct file *f, struct poll_table_struct *pt);
struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 // 線程指針,若是func爲默認的執行函數,這個須要賦值。 struct task_struct * task; // 等待隊列喚醒執行的函數 wait_queue_func_t func; struct list_head task_list; }; typedef struct __wait_queue wait_queue_t; static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { // 這是一個特殊的宏操做,由於pt和epitem是包含在ep_queue結構體裏面的,因此能夠根據偏移取同級別的epitem。 struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt); struct eppoll_entry *pwq; if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) { // 初始化一個等待隊列項,而且設置當等待隊列喚醒時的執行函數爲ep_poll_callback // 這個很關鍵。等下咱們分析這個ep_poll_call init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; // 把剛建立的等待隊列項加入到等待隊列中。 add_wait_queue(whead, &pwq->wait); list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; } else { /* We have to signal that an error occurred */ epi->nwait = -1; } } static inline void init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) { q->flags = 0; q->task = NULL; q->func = func; }
至此,添加一個文件描述符到epoll監控內的流程完成了。總的來講,就是在對應的file中設置等待隊列,等待回調ep_poll_callback。至於對應的file用什麼機制來確保文件異步就緒,epoll不管,不過一般是通過中斷來實現的。
epoll模型的poll函數實現:
* * structures and helpers for f_op->poll implementations */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); typedef struct poll_table_struct { poll_queue_proc qproc; } poll_table; //poll_wait函數實現,其實內部調用了poll_table.qproc成員,poll_table.qproc在epoll中對應了上面的ep_ptable_queue_proc函數 static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { if (p && wait_address) p->qproc(filp, wait_address, p); } // epollevent的poll函數實現,驅動的邏輯都差很少,有參考意義 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { unsigned int pollflags = 0; unsigned long flags; struct eventpoll *ep = file->private_data; /* 1. 加入等待隊列中*/ poll_wait(file, &ep->poll_wait, wait); /* Check our condition */ read_lock_irqsave(&ep->lock, flags); if (!list_empty(&ep->rdllist)) pollflags = POLLIN | POLLRDNORM; read_unlock_irqrestore(&ep->lock, flags); return pollflags; }
瞭解了ep_insert的話,這個其實就很容易理解了:
static struct file_operations eventpoll_fops = { .release = ep_eventpoll_close, .poll = ep_eventpoll_poll }; /* * sys_epoll_wait實現 */ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, int timeout) { int error; struct file *file; struct eventpoll *ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n", current, epfd, events, maxevents, timeout)); /** * 驗證輸入的代碼忽略 */ error = -EBADF; // 1. 根據epfd得到對應的file file = fget(epfd); if (!file) goto eexit_1; // 2. 驗證是不是epoll的file,就是驗證f_op是否等於eventpoll_fops error = -EINVAL; if (!IS_FILE_EPOLL(file)) goto eexit_2; /* * 3. 取eventpoll結構體 */ ep = file->private_data; /* 4. 調用ep_poll實現具體邏輯。不要被ep_poll名字忽悠了,這個不是poll實現 */ error = ep_poll(ep, events, maxevents, timeout); eexit_2: fput(file); eexit_1: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n", current, epfd, events, maxevents, timeout, error)); return error; }
epoll_wait最終調用ep_poll來實現核心功能。
/*
 * ep_poll - core of epoll_wait: wait for ready items and hand them back.
 *
 * @ep:        the epoll instance
 * @events:    user-space buffer for the ready events
 * @maxevents: passed through to ep_events_transfer()
 * @timeout:   wait limit; -1 means wait without limit
 *
 * Returns the result of ep_events_transfer(), 0 when nothing became ready,
 * or -EINTR when a signal is pending.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * 1. The kernel keeps time in ticks, so convert the timeout into a
	 *    tick count (the /1000 scaling suggests timeout is given in
	 *    milliseconds). -1 or an overly large value becomes
	 *    MAX_SCHEDULE_TIMEOUT.
	 */
	jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
		MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;

retry:
	write_lock_irqsave(&ep->lock, flags);

	res = 0;
	/* If the ready list is empty, wait. */
	if (list_empty(&ep->rdllist)) {
		/*
		 * 2. Put the thread calling epoll_wait on the wq wait queue;
		 *    ep_poll_callback() will wake it. `current` denotes the
		 *    running thread.
		 */
		init_waitqueue_entry(&wait, current);
		add_wait_queue(&ep->wq, &wait);

		/* Loop until something is ready, time runs out, or a signal arrives. */
		for (;;) {
			/* 3. Sleep interruptibly so that signals can be handled. */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			/* 4. Bail out on a pending signal. */
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			write_unlock_irqrestore(&ep->lock, flags);
			/*
			 * Works like a sleep and returns the remaining time.
			 * The CPU switches to another task here, so the next
			 * line does not run until this thread is scheduled
			 * again.
			 */
			jtimeout = schedule_timeout(jtimeout);
			write_lock_irqsave(&ep->lock, flags);
		}
		/* Take the calling thread off the wait queue again. */
		remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	eavail = !list_empty(&ep->rdllist);

	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Copy the events back to user space. If items were available but
	 * none were transferred and time remains, go back and wait again.
	 */
	if (!res && eavail &&
	    !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}
ep_poll的步驟以下:
轉換超時時間爲cpu滴答計數。
查詢就緒隊列是否就緒,如果有就緒的,就直接返回給上層。
如果沒有就緒的,就等待。
a. 把調用線程添加到eventpoll.wq隊列中。
b. 設置自身爲可打斷狀態
c. 檢查現在是否有就緒,有的話就直接返回給上層。
d. 處理信號。
e. 發起調度,將自身切換爲阻塞狀態,等待被喚醒。喚醒的方式有:ep_poll_callback喚醒eventpoll.wq隊列,或者其他中斷喚醒。ep_poll_callback是sys_epoll_ctl添加epoll監聽的時候設置的等待隊列回調。其實現爲:
/*
 * ep_poll_callback - wake-up function installed on a watched file's wait
 * queue by ep_ptable_queue_proc().
 *
 * Abridged: the "...." marker stands for code the article left out, which
 * is why no goto to the is_linked / is_disabled labels is visible here.
 *
 * Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	/*
	 * 1. Recover the epitem from the wait queue entry. The macro is not
	 *    shown here; presumably it goes through the enclosing
	 *    eppoll_entry, whose base field ep_ptable_queue_proc() points at
	 *    the epitem.
	 */
	struct epitem *epi = EP_ITEM_FROM_WAIT(wait);
	/* 2. Get the eventpoll the item belongs to. */
	struct eventpoll *ep = epi->ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
		     current, epi->file, epi, ep));

	write_lock_irqsave(&ep->lock, flags);

	....

	/* 3. Put the ready item on the ready list. */
	list_add_tail(&epi->rdllink, &ep->rdllist);

is_linked:
	/*
	 * 4. Wake the wq wait queue, i.e. the threads sleeping in
	 *    epoll_wait, and note whether poll waiters need waking too.
	 */
	if (waitqueue_active(&ep->wq))
		wake_up(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

is_disabled:
	write_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	return 1;
}