也許你們都會使用libaio接口,但它和內核是如何交互的呢?內核的機制又是怎樣的呢?下面就一塊兒跟蹤下主要的流程。
依賴的頭文件:
#include <errno.h> #include <sys/syscall.h> #include <unistd.h>
主要的函數:
/*
 * Thin user-space wrappers around the native AIO system calls.
 * Each wrapper hands its arguments, unchanged and in order, to syscall()
 * together with the matching __NR_io_* number, and returns syscall()'s
 * result narrowed to int.
 */

/* Issue __NR_io_setup with the requested event count and context slot. */
int io_setup(int maxevents, io_context_t *ctxp)
{
	long rc = syscall(__NR_io_setup, maxevents, ctxp);

	return (int)rc;
}

/* Issue __NR_io_destroy for the given context. */
int io_destroy(io_context_t ctx)
{
	long rc = syscall(__NR_io_destroy, ctx);

	return (int)rc;
}

/* Issue __NR_io_submit for nr control blocks taken from the ios array. */
int io_submit(io_context_t ctx, long nr, struct iocb *ios[])
{
	long rc = syscall(__NR_io_submit, ctx, nr, ios);

	return (int)rc;
}

/* Issue __NR_io_cancel for one control block; evt is passed through. */
int io_cancel(io_context_t ctx, struct iocb *iocb, struct io_event *evt)
{
	long rc = syscall(__NR_io_cancel, ctx, iocb, evt);

	return (int)rc;
}
(待跟蹤問題:io_getevents() 又是怎樣實現的呢?)
那麼上面的系統調用號又具體對應到什麼代碼呢?讓咱們先看看系統調用相關的背景知識:
http://lkml.kernel.org/r/<20110529191055.GC9835%40elte.hu>
The x86 architecture has quite a few different ways to jump into
kernel code. Most of these entry points are registered in
arch/x86/kernel/traps.c and implemented in arch/x86/entry/entry_64.S
for 64-bit, arch/x86/entry/entry_32.S for 32-bit and finally
arch/x86/entry/entry_64_compat.S which implements the 32-bit compatibility
syscall entry points and thus provides for 32-bit processes the
ability to execute syscalls when running on 64-bit kernels.
The IDT vector assignments are listed in arch/x86/include/asm/irq_vectors.h.
Some of these entries are:
system_call: syscall instruction from 64-bit code.
entry_INT80_compat: int 0x80 from 32-bit or 64-bit code; compat syscall
either way.
entry_INT80_compat, ia32_sysenter: syscall and sysenter from 32-bit
code
interrupt: An array of entries. Every IDT vector that doesn't
explicitly point somewhere else gets set to the corresponding
value in interrupts. These point to a whole array of
magically-generated functions that make their way to do_IRQ with
the interrupt number as a parameter.
APIC interrupts: Various special-purpose interrupts for things
like TLB shootdown.
Architecturally-defined exceptions like divide_error.
接着看看異步IO系統調用涉及到的一些系統調用編號,參考arch/ia64/include/uapi/asm/unistd.h文件:
/*
 * System call numbers for the native AIO and epoll calls, as listed in
 * arch/ia64/include/uapi/asm/unistd.h.
 * One directive per line; the first entry previously lacked its leading
 * '#', so it was not a preprocessor directive at all.
 */
#define __NR_io_setup			1238
#define __NR_io_destroy			1239
#define __NR_io_getevents		1240
#define __NR_io_submit			1241
#define __NR_io_cancel			1242
#define __NR_epoll_create		1243
#define __NR_epoll_ctl			1244
#define __NR_epoll_wait			1245
那麼這個系統調用編號又是怎樣和功能代碼相關聯的呢?
/*
 * entry_SYSCALL_64: entry path excerpt, from the entry label up to and
 * including the call into do_syscall_64.  The code that follows the call
 * (the return path) is not part of this excerpt.
 */
ENTRY(entry_SYSCALL_64)
	UNWIND_HINT_EMPTY
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */

	swapgs
	/*
	 * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
	 * is not required to switch CR3.
	 */
	/* Stash the incoming %rsp in rsp_scratch, then load the per-CPU stack top. */
	movq	%rsp, PER_CPU_VAR(rsp_scratch)
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS				/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	pushq	%r11					/* pt_regs->flags */
	pushq	$__USER_CS				/* pt_regs->cs */
	pushq	%rcx					/* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
	pushq	%rax					/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	TRACE_IRQS_OFF

	/* IRQs are off. */
	/*
	 * Load %rdi with %rax and %rsi with %rsp (which now points at the
	 * pt_regs frame built above) before calling do_syscall_64.
	 */
	movq	%rax, %rdi
	movq	%rsp, %rsi
	call	do_syscall_64		/* returns with IRQs disabled */

	TRACE_IRQS_IRETQ		/* we're about to change IF */
#ifdef CONFIG_X86_64
/*
 * do_syscall_64 - look up and run the handler for a 64-bit system call.
 * @nr:   system call number passed in by the entry code
 * @regs: register frame; the handler's return value is written to regs->ax
 *
 * Numbers that are out of range after masking skip the table lookup and
 * leave regs->ax untouched by this function.
 */
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	struct thread_info *thread;

	enter_from_user_mode();
	local_irq_enable();

	/* With entry work flagged, syscall_trace_enter() supplies the number. */
	thread = current_thread_info();
	if (READ_ONCE(thread->flags) & _TIF_WORK_SYSCALL_ENTRY)
		nr = syscall_trace_enter(regs);

	/*
	 * NB: native and x32 system calls share a single table.  The only
	 * functional difference is the x32 bit in regs->orig_ax, which alters
	 * how some system calls behave.
	 */
	nr &= __SYSCALL_MASK;
	if (likely(nr < NR_syscalls)) {
		/* Pass the index through array_index_nospec() before use. */
		nr = array_index_nospec(nr, NR_syscalls);
		regs->ax = sys_call_table[nr](regs);
	}

	syscall_return_slowpath(regs);
}
#endif
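sys_call_table 在哪?通用的系統調用清單定義在 include/uapi/asm-generic/unistd.h(注意:x86_64 實際使用的 sys_call_table 是由 arch/x86/entry/syscalls/syscall_64.tbl 生成、在 arch/x86/entry/syscall_64.c 中定義的,系統調用編號與下面的通用清單不一樣):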
#define __NR_io_setup 0 __SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup) #define __NR_io_destroy 1 __SYSCALL(__NR_io_destroy, sys_io_destroy) #define __NR_io_submit 2 __SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit) #define __NR_io_cancel 3 __SYSCALL(__NR_io_cancel, sys_io_cancel) #define __NR_io_getevents 4 __SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents)
/* sys_io_submit:
 *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
 *	the number of iocbs queued.  May return -EINVAL if the aio_context
 *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
 *	*iocbpp[0] is not properly initialized, if the operation specified
 *	is invalid for the file descriptor in the iocb.  May fail with
 *	-EFAULT if any of the data structures point to invalid data.  May
 *	fail with -EBADF if the file descriptor specified in the first
 *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
 *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
 *	fail with -ENOSYS if not implemented.
 *
 *	NOTE: this is an excerpt; it stops after the submission loop and
 *	the rest of the function body is not shown.
 */
SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
		struct iocb __user * __user *, iocbpp)
{
	struct kioctx *ctx;
	long ret = 0;
	int i = 0;
	struct blk_plug plug;

	/* A negative count is rejected before the context is looked up. */
	if (unlikely(nr < 0))
		return -EINVAL;

	ctx = lookup_ioctx(ctx_id);
	if (unlikely(!ctx)) {
		pr_debug("EINVAL: invalid context id\n");
		return -EINVAL;
	}

	/* Never attempt more submissions than ctx->nr_events. */
	if (nr > ctx->nr_events)
		nr = ctx->nr_events;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++) {
		struct iocb __user *user_iocb;

		/* Fetch the i-th user pointer; a failed fetch ends the loop. */
		if (unlikely(get_user(user_iocb, iocbpp + i))) {
			ret = -EFAULT;
			break;
		}

		/* Stop at the first iocb for which io_submit_one() returns non-zero. */
		ret = io_submit_one(ctx, user_iocb, false);
		if (ret)
			break;
	}
	/* (excerpt ends here) */
Documentation/process/adding-syscalls.rst