BIND9採用的是事件驅動的機制來工作,而事件的源頭則是IO;IO在Linux下使用的是EPOLL的邊緣觸發模式。
本篇說的是epoll。BIND9如果創建了watcher線程(由宏USE_WATCHER_THREAD控制),這裏就討論有線程的情況;實際上即使不創建
線程,幹的也都是一樣的活。在lib/isc/socket.c中的setup_watcher函數:(所有的代碼都是截取的epoll下的片段,因為還有kqueue、devpoll、select等的實現代碼,太多了)
#elif defined(USE_EPOLL) manager->nevents = ISC_SOCKET_MAXEVENTS; manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) * manager->nevents); if (manager->events == NULL) return (ISC_R_NOMEMORY); manager->epoll_fd = epoll_create(manager->nevents); if (manager->epoll_fd == -1) { result = isc__errno2result(errno); isc__strerror(errno, strbuf, sizeof(strbuf)); UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create %s: %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed"), strbuf); isc_mem_put(mctx, manager->events, sizeof(struct epoll_event) * manager->nevents); return (result); } #ifdef USE_WATCHER_THREAD result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); if (result != ISC_R_SUCCESS) { close(manager->epoll_fd); isc_mem_put(mctx, manager->events, sizeof(struct epoll_event) * manager->nevents); return (result); } #endif /* USE_WATCHER_THREAD */
先是創建了要監視的最大socket fd數目(manager->nevents)對應的epoll_event結構體數組,然後調用epoll_create函數創建一個epoll fd,參數則是指定監視的socket fd
最大數目。我的內核版本是3.13,man一下epoll_create發現它是這樣說的:epoll_create() creates an epoll(7) instance. Since Linux 2.6.8, the size argument is ignored, but must be greater than zero。這個函數在2.6.8內核之後就忽略參數size了,但是傳遞的參數值一定要大於0。後來找了一下資料,網上高手的博客說得就很清楚了:http://www.cnblogs.com/apprentice89/p/3234677.html。繼續往下說,後面的watch_fd是在創建線程的情況下才有,就是將pipe_fds[0]這個管道描述符加入監視——它是一個可讀的流,而上述的socket fd都可以歸為流。watch_fd的實現代碼:
#elif defined(USE_EPOLL) struct epoll_event event; if (msg == SELECT_POKE_READ) event.events = EPOLLIN; else event.events = EPOLLOUT; memset(&event.data, 0, sizeof(event.data)); event.data.fd = fd; if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 && errno != EEXIST) { result = isc__errno2result(errno); } return (result);
這是將pipe_fds[0]加入epoll_fd的監聽隊列,EPOLL_CTL_ADD是操作類型,即註冊該fd到epoll_fd上。這個管道的目的是接收管理該線程的消息,比如線程退出。
那麼進入線程函數看:
static isc_threadresult_t watcher(void *uap) { isc__socketmgr_t *manager = uap; isc_boolean_t done; int ctlfd; int cc; #ifdef USE_KQUEUE const char *fnname = "kevent()"; #elif defined (USE_EPOLL) const char *fnname = "epoll_wait()"; #elif defined(USE_DEVPOLL) const char *fnname = "ioctl(DP_POLL)"; struct dvpoll dvp; #elif defined (USE_SELECT) const char *fnname = "select()"; int maxfd; #endif char strbuf[ISC_STRERRORSIZE]; #ifdef ISC_SOCKET_USE_POLLWATCH pollstate_t pollstate = poll_idle; #endif /* * Get the control fd here. This will never change. */ ctlfd = manager->pipe_fds[0]; done = ISC_FALSE; while (!done) { do { #ifdef USE_KQUEUE cc = kevent(manager->kqueue_fd, NULL, 0, manager->events, manager->nevents, NULL); #elif defined(USE_EPOLL) cc = epoll_wait(manager->epoll_fd, manager->events, manager->nevents, -1); #elif defined(USE_DEVPOLL) dvp.dp_fds = manager->events; dvp.dp_nfds = manager->nevents; #ifndef ISC_SOCKET_USE_POLLWATCH dvp.dp_timeout = -1; #else if (pollstate == poll_idle) dvp.dp_timeout = -1; else dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT; #endif /* ISC_SOCKET_USE_POLLWATCH */ cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp); #elif defined(USE_SELECT) LOCK(&manager->lock); memcpy(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize); memcpy(manager->write_fds_copy, manager->write_fds, manager->fd_bufsize); maxfd = manager->maxfd + 1; UNLOCK(&manager->lock); cc = select(maxfd, manager->read_fds_copy, manager->write_fds_copy, NULL, NULL); #endif /* USE_KQUEUE */ if (cc < 0 && !SOFT_ERROR(errno)) { isc__strerror(errno, strbuf, sizeof(strbuf)); FATAL_ERROR(__FILE__, __LINE__, "%s %s: %s", fnname, isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed"), strbuf); } #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) if (cc == 0) { if (pollstate == poll_active) pollstate = poll_checking; else if (pollstate == poll_checking) pollstate = poll_idle; } else if (cc > 0) { if (pollstate == poll_checking) { /* * XXX: We'd 
like to use a more * verbose log level as it's actually an * unexpected event, but the kernel bug * reportedly happens pretty frequently * (and it can also be a false positive) * so it would be just too noisy. */ manager_log(manager, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(1), "unexpected POLL timeout"); } pollstate = poll_active; } #endif } while (cc < 0); #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL) done = process_fds(manager, manager->events, cc); #elif defined(USE_SELECT) process_fds(manager, maxfd, manager->read_fds_copy, manager->write_fds_copy); /* * Process reads on internal, control fd. */ if (FD_ISSET(ctlfd, manager->read_fds_copy)) done = process_ctlfd(manager); #endif } manager_log(manager, TRACE, "%s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_EXITING, "watcher exiting")); return ((isc_threadresult_t)0); }
無限循環:當監聽的epoll_fd隊列上有IO事件發生時,epoll_wait將對應的socket fd和事件放入events數組中,並且將這些註冊在epoll_fd上的socket fd對應的事件清空。
process_fds遍歷該數組,找到對應的socket fd,並判斷該fd是不是線程控制管道;如果是,則會在執行完其他socket fd上的對應事件後再處理管道中的控制消息。
/*
 * Walk the epoll_event array returned by epoll_wait() and hand each
 * ready descriptor to process_fd().  The control pipe, if it fired, is
 * handled last; its handler's return value tells the watcher whether to
 * exit.
 */
static isc_boolean_t
process_fds(isc__socketmgr_t *manager, struct epoll_event *events,
	    int nevents)
{
	int idx;
	isc_boolean_t finished = ISC_FALSE;
#ifdef USE_WATCHER_THREAD
	isc_boolean_t ctl_ready = ISC_FALSE;
#endif

	/*
	 * A completely full event buffer may mean we are not keeping up
	 * with the load; leave a trace in the log.
	 */
	if (nevents == manager->nevents) {
		manager_log(manager, ISC_LOGCATEGORY_GENERAL,
			    ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			    "maximum number of FD events (%d) received",
			    nevents);
	}

	for (idx = 0; idx < nevents; idx++) {
		REQUIRE(events[idx].data.fd < (int)manager->maxsocks);
#ifdef USE_WATCHER_THREAD
		/* Defer the control pipe until all sockets are serviced. */
		if (events[idx].data.fd == manager->pipe_fds[0]) {
			ctl_ready = ISC_TRUE;
			continue;
		}
#endif
		if ((events[idx].events & (EPOLLERR | EPOLLHUP)) != 0) {
			/*
			 * epoll does not set the IN/OUT bits on an
			 * erroneous condition, so try both directions.
			 * Slightly wasteful, but acceptable for such rare
			 * events; the sockets are non-blocking, so neither
			 * attempt can hang.
			 */
			events[idx].events |= (EPOLLIN | EPOLLOUT);
		}
		process_fd(manager, events[idx].data.fd,
			   (events[idx].events & EPOLLIN) != 0,
			   (events[idx].events & EPOLLOUT) != 0);
	}

#ifdef USE_WATCHER_THREAD
	if (ctl_ready)
		finished = process_ctlfd(manager);
#endif

	return (finished);
}
待續