poll(2) 源碼分析

poll(2)

poll(2) 系統調用的功能和 select(2) 相似:等待一組文件描述符中的某一個就緒,以便進行 I/O 操作。

select(2) 的侷限性:

  • 關注的文件描述符集合大小最大隻有 1024
  • 文件描述符集合爲順序的,不能任意指定 fd,浪費佔用的fd

poll(2) 對 select(2) 的改進:關注的文件描述符集合爲動態大小,文件描述符可以任意指定。

/* One entry of the array handed to poll(2): a descriptor and its event masks. */
struct pollfd {
       int   fd;         /* file descriptor to watch */
       short events;     /* requested events: input bit mask set by the caller */
       short revents;    /* returned events: output bit mask filled in by poll */
};

- fd 爲關注的文件描述符
- events 爲關注的事件(輸入),使用位掩碼來表示事件
- revents 爲就緒的事件(輸出),同樣使用位掩碼表示

#include <poll.h>

int poll(struct pollfd *fds, nfds_t nfds, int timeout);

- fds 爲文件描述符集合(struct pollfd 數組)的地址
- nfds 爲文件描述符集合的長度
- timeout 爲超時的時間,單位爲毫秒

返回值爲 revents 不爲 0 的個數;超時且沒有描述符就緒時返回 0,出錯返回 -1

一個簡單的<span id = "poll_usage">例子</span>:等待標準輸入就緒,超時時間爲 3s。

#include <poll.h>
#include <unistd.h>
#include <stdio.h>

/*
 * Wait up to three seconds for standard input to become readable and
 * report the outcome.
 *
 * Returns 0 when poll() completed (data ready or timed out) and 1 when
 * poll() itself failed.
 */
int main(void)
{
        const int timeout = 3000;       /* milliseconds */

        /* Watch standard input for readability. */
        struct pollfd fds = {
                .fd = STDIN_FILENO,
                .events = POLLIN,
        };

        int ret = poll(&fds, 1, timeout);
        if (ret == -1) {
                /* perror() appends the errno text so the cause is visible. */
                perror("error poll");
                return 1;
        }

        if (ret)
                printf("data is available now.\n");
        else
                printf("no data within %d ms.\n", timeout);

        return 0;
}

<span id = "poll_src">實現</span>

代碼位於 fs/select.c 中,參考中的連結有一些關於文件回調和 poll 結構的說明。

poll()

/*
 * poll(2) entry point.
 *
 * A non-negative timeout_msecs is converted into an absolute end time that
 * is passed to do_sys_poll(); a negative value passes NULL instead.
 * When do_sys_poll() returns -EINTR, the arguments (and the end time, if
 * any) are stored in the task's restart_block and -ERESTART_RESTARTBLOCK is
 * returned in its place. Every other result is returned unchanged.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
        const int has_timeout = timeout_msecs >= 0;
        struct timespec64 end_time;
        struct timespec64 *deadline = NULL;
        struct restart_block *rb;
        int ret;

        if (has_timeout) {
                deadline = &end_time;
                poll_select_set_timeout(deadline,
                        timeout_msecs / MSEC_PER_SEC,
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
        }

        ret = do_sys_poll(ufds, nfds, deadline);
        if (ret != -EINTR)
                return ret;

        /* Interrupted: remember everything do_restart_poll needs. */
        rb = &current->restart_block;
        rb->fn = do_restart_poll;
        rb->poll.ufds = ufds;
        rb->poll.nfds = nfds;
        rb->poll.has_timeout = has_timeout;
        if (has_timeout) {
                rb->poll.tv_sec = end_time.tv_sec;
                rb->poll.tv_nsec = end_time.tv_nsec;
        }

        return -ERESTART_RESTARTBLOCK;
}

poll() 代碼很簡單:

  1. 處理超時時間
  2. 實現 poll(2)
  3. 處理後事:若被信號中斷(do_sys_poll() 返回 -EINTR),就把參數和到期時間保存到 restart_block 中,並返回 -ERESTART_RESTARTBLOCK,以便之後重新調用。

do_sys_poll()

/*
 * Copy the caller's pollfd array into kernel memory, run do_poll() on it and
 * copy the resulting revents fields back to user space.
 *
 * The entries are kept in a chain of struct poll_list chunks: the first
 * chunk lives in the on-stack buffer stack_pps, any further chunks are
 * obtained with kmalloc() and freed again before returning.
 *
 * Returns the value of do_poll() on success, -EINVAL when nfds exceeds
 * RLIMIT_NOFILE, -ENOMEM when a chunk cannot be allocated, and -EFAULT when
 * copying from or to user space fails.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                struct timespec64 *end_time)
{
        struct poll_wqueues table;
         int err = -EFAULT, fdcount, len, size;
        /* Allocate small arguments on the stack to save memory and be
           faster - use long to make sure the buffer is aligned properly
           on 64 bit archs to avoid unaligned access */
        long stack_pps[POLL_STACK_ALLOC/sizeof(long)];  // 256 bytes
        struct poll_list *const head = (struct poll_list *)stack_pps;
         struct poll_list *walk = head;
         unsigned long todo = nfds;

        if (nfds > rlimit(RLIMIT_NOFILE))  // limit on the number of open files
                return -EINVAL;

        // N_STACK_PPS = (256 - 16) / 8 = 30: the stack buffer holds 30 pollfd structs
        // Copy the leading part of the user-space struct pollfd array into the on-stack chunk
        len = min_t(unsigned int, nfds, N_STACK_PPS);
        for (;;) {
                walk->next = NULL;
                walk->len = len;
                if (!len)
                        break;

                // ufds + nfds-todo is the first user entry that has not been copied yet
                if (copy_from_user(walk->entries, ufds + nfds-todo,
                                        sizeof(struct pollfd) * walk->len))
                        goto out_fds;

                todo -= walk->len;
                if (!todo)
                        break;

                // POLLFD_PER_PAGE = (4096 - 16) / 8 = 510
                // Allocate the next chunk; one page-sized chunk holds up to 510 pollfd structs
                len = min(todo, POLLFD_PER_PAGE);
                size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
                walk = walk->next = kmalloc(size, GFP_KERNEL);
                if (!walk) {
                        err = -ENOMEM;
                        goto out_fds;
                }
        }
        // Every pollfd now lives in kernel memory, in the chain that starts at head

        poll_initwait(&table);  // initialize table; see the select analysis listed in the references
        fdcount = do_poll(head, &table, end_time);
        poll_freewait(&table);  // release table

        // Copy revents back to user space
        for (walk = head; walk; walk = walk->next) {
                struct pollfd *fds = walk->entries;
                int j;

                for (j = 0; j < walk->len; j++, ufds++)
                        if (__put_user(fds[j].revents, &ufds->revents))
                                goto out_fds;
          }

        err = fdcount;
out_fds:
        // head itself is on the stack, so only the chunks after it are freed
        walk = head->next;
        while (walk) {
                struct poll_list *pos = walk;
                walk = walk->next;
                kfree(pos);
        }

        return err;
}

do_sys_poll() 函數也是分爲三步實現:

  1. 將用戶空間的數據複製到內核空間
  2. 調用核心實現 do_poll()
  3. 將就緒的事件數據從內核空間複製到用戶空間

do_poll()

/*
 * Core wait loop of poll: call do_pollfd() on every entry of the list and
 * count the entries that report a non-zero mask.
 *
 * The loop ends when at least one entry is ready, when wait->error is
 * non-zero, when a signal is pending (-EINTR) or after the pass that follows
 * a timeout. Otherwise the task sleeps in poll_schedule_timeout() and the
 * list is scanned again.
 *
 * Returns the number of ready entries, 0 on timeout, or a negative error
 * taken from wait->error / -EINTR.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                   struct timespec64 *end_time)
{
        poll_table* pt = &wait->pt;
        ktime_t expire, *to = NULL;
        int timed_out = 0, count = 0;
        u64 slack = 0;
        __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
        unsigned long busy_start = 0;

        /* Optimise the no-wait case */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                pt->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);  // original note: estimates the wait time; the function returns nanoseconds

        for (;;) {
                struct poll_list *walk;
                bool can_busy_loop = false;

                for (walk = list; walk != NULL; walk = walk->next) {
                        struct pollfd * pfd, * pfd_end;

                        pfd = walk->entries;
                        pfd_end = pfd + walk->len;
                        for (; pfd != pfd_end; pfd++) {  // visit every struct pollfd; do_pollfd handles one fd
                                /*
                                 * Fish for events. If we found one, record it
                                 * and kill poll_table->_qproc, so we don't
                                 * needlessly register any other waiters after
                                 * this. They'll get immediately deregistered
                                 * when we break out and return.
                                 */
                                if (do_pollfd(pfd, pt, &can_busy_loop,
                                              busy_flag)) {
                                        count++;
                                        pt->_qproc = NULL;
                                        /* found something, stop busy polling */
                                        busy_flag = 0;
                                        can_busy_loop = false;
                                }
                        }
                }
                /*
                 * All waiters have already been registered, so don't provide
                 * a poll_table->_qproc to them on the next loop iteration.
                 */
                pt->_qproc = NULL;
                if (!count) {
                        /* Nothing ready: report a queueing error or a pending signal. */
                        count = wait->error;
                        if (signal_pending(current))
                                count = -EINTR;
                }
                if (count || timed_out)
                        break;

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))  // sleep; a zero return is treated as the timeout expiring
                        timed_out = 1;
        }
        return count;
}

這個函數寫得很清楚,也有不少註釋:

  1. can_busy_loop 是和 CONFIG_NET_RX_BUSY_POLL 配置相關的,不算通用處理狀況,先忽略不考慮
  2. count 爲函數的返回值,在 do_pollfd 有返回匹配的掩碼時遞增,爲就緒的文件描述符數量,無就緒文件的時候爲等待隊列中的錯誤碼
  3. pt->_qproc 爲文件 poll 操作調用的函數;置爲 NULL 的原因在註釋中已經說明:等待者已經註冊到隊列中,不必再次註冊。這個函數相關的內容可以在另一篇 select(2) 的分析中找到具體說明
/*
 * Fish for events. If we found one, record it and kill poll_table->_qproc, so we don't
 * needlessly register any other waiters after this. They'll get immediately deregistered
 * when we break out and return.
 */

/*
 * All waiters have already been registered, so don't provide a poll_table->_qproc to them on the next loop iteration.
 */

do_pollfd()

/*
 * Poll a single pollfd entry.
 *
 * Only events selected by pollfd->events (plus EPOLLERR and EPOLLHUP, which
 * are always included) are of interest. The resulting mask is stored in
 * pollfd->revents and also returned. The file's poll handler uses pwait for
 * waiting if pwait->_qproc is non-NULL.
 *
 * A negative fd yields 0, a descriptor with no open file yields EPOLLNVAL,
 * and a file without a poll handler is treated as DEFAULT_POLLMASK.
 * *can_busy_poll is set to true when the handler's result contains busy_flag.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                     bool *can_busy_poll,
                                     __poll_t busy_flag)
{
        __poll_t mask = 0;
        __poll_t filter;
        struct fd f;
        int fd = pollfd->fd;

        if (fd < 0)
                goto out;

        f = fdget(fd);
        if (!f.file) {
                mask = EPOLLNVAL;  /* 0x20 */
                goto out;
        }

        /* userland u16 ->events contains POLL... bitmap */
        filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;

        if (f.file->f_op->poll) {
                /* the key is consulted later, when the waiter is woken up */
                pwait->_key = filter | busy_flag;
                mask = f.file->f_op->poll(f.file, pwait);
                if (mask & busy_flag)
                        *can_busy_poll = true;
        } else {
                /* (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM) */
                mask = DEFAULT_POLLMASK;
        }

        /* Keep only the events the caller asked for. */
        mask &= filter;
        fdput(f);

out:
        /* Publish the result in the user-visible representation. */
        pollfd->revents = mangle_poll(mask);
        return mask;
}

討論:在不考慮錯誤的情況下,poll(2) 返回的是 revents 非 0 的個數;do_pollfd() 每返回一個非 0 的 mask,poll(2) 返回的 count 就加 1。mask = 0 有兩種可能:

  1. 和 filter 作與運算,可是這樣作有一個前提就是能夠取到 fd
  2. fd < 0,這種屬於無心義的fd了,屬於用戶的問題

在已瞭解的 fd 中,eventfd 和普通文件的 poll 函數返回情況如下:

  • EPOLLIN 或者 EPOLLOUT 或兩個都存在
  • (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)

當關注的事件不在以上事件中,是可能返回 0,而count不增長的

struct pollfd fds[n];
rn = poll(fds, n, 0);
// Counter-example: rn is the number of ready entries, not an index bound,
// so a ready entry at index >= rn is never examined by this loop.
for (int i = 0; i < rn; ++i)
        if (fds[i].revents ...)

像上面這種操作是有風險的:rn 是就緒 fd 的個數而不是下標上界,就緒的 fd 若位於下標 rn 及之後就會被漏掉,應該遍歷全部 n 個元素並逐一檢查 revents。

mangle_poll() 設置就緒掩碼

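展開一下就緒掩碼的設置函數。注意下面貼出的是反方向的 demangle_poll()(把用戶空間的 POLL* 轉成內核的 EPOLL*),mangle_poll() 與它對稱,只是 from 和 to 互換。__MAP 宏有點繞:它先用 v & from 取出一個標誌位,再乘以 to/from(或除以 from/to),把這個位從 from 的位置搬到 to 的位置。這樣做的原因是:部分體系結構上用戶空間的 POLL* 數值和內核內部的 EPOLL* 數值並不一致,需要逐位轉換;在 4.17 內核的 x86 上 POLLIN 和 EPOLLIN 這類宏的數值是一樣的,此時轉換等於原樣返回。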

/*
 * Move the flag bit `from` of `v` to the bit position of `to`: the bit is
 * isolated with `v & from` and then scaled by to/from (moving it up) or
 * divided by from/to (moving it down). Assumes `from` and `to` are single-bit
 * constants.
 *
 * Every argument is parenthesized so that expression arguments such as
 * `a | b` keep their meaning after expansion. Arguments are still evaluated
 * more than once, so they must not have side effects.
 */
#define __MAP(v, from, to) \
        ((from) < (to) ? ((v) & (from)) * ((to) / (from)) \
                       : ((v) & (from)) / ((from) / (to)))

/*
 * Convert a 16-bit POLL* bit mask into the corresponding EPOLL* bits,
 * returned as a __poll_t. Each flag is translated on its own with __MAP and
 * the results are OR-ed together.
 */
static inline __poll_t demangle_poll(u16 val)
{
/* M(X): the EPOLL<X> bit if POLL<X> is set in val, otherwise 0. */
#define M(X) (__force __poll_t)__MAP(val, POLL##X, (__force __u16)EPOLL##X)
    return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
           M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
           M(HUP) | M(RDHUP) | M(MSG);
#undef M
}

參考

select 源碼分析,上一篇寫的關於 select 的分析,有一些關於 poll 結構和文件回調的分析。

原文出處:https://www.cnblogs.com/shuqin/p/11662645.html

相關文章
相關標籤/搜索