(一)首先,系統調用有兩種方式:
0x80、0x81、0x82三個中斷號;
專門指令(至少分Intel架構和ARM架構),比如SYSENTER/SYSCALL
(二)話分兩頭,先說中斷向量方式
這是中斷向量定義的部分代碼:
/*
 * Excerpt of the interrupt vector table, vectors 0x7d through 0x86.
 * Vectors 0x80, 0x81 and 0x82 name idt64_unix_scall, idt64_mach_scall and
 * idt64_mdep_scall as their entry points; the surrounding vectors use the
 * plain INTERRUPT() form.
 */
INTERRUPT(0x7d)
INTERRUPT(0x7e)
USER_TRAP(0x7f, idt64_dtrace_ret)	/* Required by dtrace "fasttrap" */
USER_TRAP_SPC(0x80,idt64_unix_scall)
USER_TRAP_SPC(0x81,idt64_mach_scall)
USER_TRAP_SPC(0x82,idt64_mdep_scall)
INTERRUPT(0x83)
INTERRUPT(0x84)
INTERRUPT(0x85)
INTERRUPT(0x86)
(BSD風格的系統調用,中斷號就是0x80)
觸發中斷以及後面的邏輯,都在彙編文件idt64.s中實現,下面簡單看看:
/*
 * System call handlers.
 * These are entered via a syscall interrupt. The system call number in %rax
 * is saved to the error code slot in the stack frame. We then branch to the
 * common state saving code.
 */

/* Refuse to build if the UNIX_INT vector constant is not defined. */
#ifndef UNIX_INT
#error NO UNIX INT!!!
#endif

/*
 * idt64_unix_scall
 * In:    %rax = system call number
 * Stack: pushes, in order, %rax, the address of HNDL_UNIX_SCALL and the
 *        UNIX_INT constant.
 */
Entry(idt64_unix_scall)
	swapgs				/* switch to kernel gs (cpu_data) */
	pushq	%rax			/* save system call number */
	PUSH_FUNCTION(HNDL_UNIX_SCALL)	/* push address of the handler */
	pushq	$(UNIX_INT)		/* push the UNIX_INT constant */
	/*
	 * NOTE(review): the excerpt ends here; the branch to the common
	 * state saving code mentioned in the header is not shown.
	 */
接下來執行PUSH_FUNCTION(HNDL_UNIX_SCALL),先展開PUSH_FUNCTION看看:
/*
 * PUSH_FUNCTION(func)
 *
 * Push the RIP-relative address of `func` onto the stack while leaving
 * %rax unchanged:
 *   1. reserve an 8-byte slot on the stack;
 *   2. save %rax on top of it;
 *   3. compute the address of `func` into %rax;
 *   4. store it into the reserved slot (at 8(%rsp), because the saved
 *      %rax currently occupies 0(%rsp));
 *   5. restore %rax, leaving only the function address pushed.
 *
 * The `#else` branch is an alternative single-instruction form that is
 * currently compiled out by `#if 1`.
 */
#if 1
#define PUSH_FUNCTION(func)			 \
	sub	$8, %rsp			;\
	push	%rax				;\
	leaq	func(%rip), %rax		;\
	movq	%rax, 8(%rsp)			;\
	pop	%rax
#else
#define PUSH_FUNCTION(func) pushq func
#endif
系統調用號保存在寄存器RAX中,接下來看看HNDL_UNIX_SCALL:
/*
 * hndl_unix_scall
 *
 * Handler for the unix system call path: loads the current thread and its
 * task, bumps the thread's unix syscall counter, runs the vtimer check,
 * enables interrupts and calls unix_syscall with %r15 as its one argument.
 *
 * Register use in this block:
 *   %rcx = current thread (from %gs:CPU_ACTIVE_THREAD)
 *   %rbx = current task   (TH_TASK of the thread)
 *   %r15 = argument handed to unix_syscall
 */
Entry(hndl_unix_scall)

	TIME_TRAP_UENTRY

	movq	%gs:CPU_ACTIVE_THREAD,%rcx	/* get current thread     */
	movq	TH_TASK(%rcx),%rbx		/* point to current task  */
	incl	TH_SYSCALLS_UNIX(%rcx)		/* increment call count   */

	/* Check for active vtimers in the current task */
	TASK_VTIMER_CHECK(%rbx,%rcx)

	sti					/* enable interrupts before calling into C */

	CCALL1(unix_syscall, %r15)
	/*
	 * always returns through thread_exception_return
	 */
主要有一行:調用unix_syscall,看看unix_syscall函數的定義:
* * sysent * proc * uthread **= DEBUG (regs->eax == === uthread->uu_flag & (__improbable(is_vfork != == ( proc * (__improbable(p ==->eax =->efl |== regs->eax &>= NUM_SYSENT ? : code], (uint32_t)regs-> = (vm_offset_t) (regs->uesp + (->efl &= ~= (code >= NUM_SYSENT) ? &sysent[] : & (__improbable(callp === fuword( += (= (code >= NUM_SYSENT) ? &sysent[] : &
通過寄存器中的數據得到code,再通過code取得數組sysent中的系統調用函數,交給callp;後面的代碼冗長,這裏就不全部貼出來了。
(關於sysent數組,改天詳述)
(三)再說系統調用專用指令方式(以Intel架構爲例)
SYSENTER用於32位,SYSCALL用於64位,只說SYSCALL吧,先看彙編:
/*
 * hi64_syscall / idt64_syscall
 *
 * Entry stub that builds the saved interrupt stack frame (ISF) by hand:
 * it switches to the kernel gs, parks the user stack pointer in a per-cpu
 * temporary, switches %rsp to the ISF in the PCB, fills in the frame
 * fields, and jumps to the common 64-bit dispatch code.
 *
 * On entry (as used below):
 *   %rax = syscall code      -> ISF64_ERR
 *   %rcx = user rip          -> ISF64_RIP
 *   %r11 = user rflags       -> ISF64_RFLAGS
 *   %rsp = user stack        -> ISF64_RSP (via CPU_UBER_TMP)
 */
Entry(hi64_syscall)
Entry(idt64_syscall)
L_syscall_continue:
	swapgs				/* Kapow! get per-cpu data area */
	mov	%rsp, %gs:CPU_UBER_TMP	/* save user stack */
	mov	%gs:CPU_UBER_ISF, %rsp	/* switch stack to pcb */

	/*
	 * Save values in the ISF frame in the PCB
	 * to cons up the saved machine state.
	 */
	movl	$(USER_DS), ISF64_SS(%rsp)
	movl	$(SYSCALL_CS), ISF64_CS(%rsp)	/* cs - a pseudo-segment */
	mov	%r11, ISF64_RFLAGS(%rsp)	/* rflags */
	mov	%rcx, ISF64_RIP(%rsp)		/* rip */
	mov	%gs:CPU_UBER_TMP, %rcx		/* %rcx is free now: reuse it for the user stack */
	mov	%rcx, ISF64_RSP(%rsp)		/* user stack */
	mov	%rax, ISF64_ERR(%rsp)		/* err/rax - syscall code */
	movq	$(T_SYSCALL), ISF64_TRAPNO(%rsp)	/* trapno */

	/* %r11 is borrowed as scratch to record the handler address ... */
	leaq	HNDL_SYSCALL(%rip), %r11;
	movq	%r11, ISF64_TRAPFN(%rsp)
	/* ... and is then reloaded from the saved rflags slot. */
	mov	ISF64_RFLAGS(%rsp), %r11	/* Avoid leak, restore R11 */

	jmp	L_dispatch_U64		/* this can only be 64-bit */
主要看看HNDL_SYSCALL:
/*
 * 64bit Tasks
 * System call entries via syscall only:
 *
 *	r15	x86_saved_state64_t
 *	rsp	kernel stack
 *
 *	both rsp and r15 are 16-byte aligned
 *	interrupts disabled
 *	direction flag cleared
 *
 * Dispatches on the syscall class bits of the saved %rax to one of the
 * mach / unix / mdep / diag handlers; an unknown class raises
 * EXC_SYSCALL through i386_exception.
 */

Entry(hndl_syscall)
	TIME_TRAP_UENTRY

	movq	%gs:CPU_ACTIVE_THREAD,%rcx	/* get current thread     */
	movq	TH_TASK(%rcx),%rbx		/* point to current task  */

	/* Check for active vtimers in the current task */
	TASK_VTIMER_CHECK(%rbx,%rcx)

	/*
	 * We can be here either for a mach, unix machdep or diag syscall,
	 * as indicated by the syscall class:
	 */
	movl	R64_RAX(%r15), %eax		/* syscall number/class */
	movl	%eax, %edx			/* keep %eax intact; %edx is masked below */
	andl	$(SYSCALL_CLASS_MASK), %edx	/* syscall class */

	cmpl	$(SYSCALL_CLASS_MACH<<SYSCALL_CLASS_SHIFT), %edx
	je	EXT(hndl_mach_scall64)

	cmpl	$(SYSCALL_CLASS_UNIX<<SYSCALL_CLASS_SHIFT), %edx
	je	EXT(hndl_unix_scall64)

	cmpl	$(SYSCALL_CLASS_MDEP<<SYSCALL_CLASS_SHIFT), %edx
	je	EXT(hndl_mdep_scall64)

	cmpl	$(SYSCALL_CLASS_DIAG<<SYSCALL_CLASS_SHIFT), %edx
	je	EXT(hndl_diag_scall64)

	/* Syscall class unknown */
	sti
	CCALL3(i386_exception, $(EXC_SYSCALL), %rax, $1)
	/* no return */
可以看到,這裏根據寄存器和全局參數區分4種系統調用,BSD風格的系統調用只是其中1種,還有3種:mach syscall、machdep syscall、diag syscall;
如果是BSD風格系統調用,那麼就繼續執行hndl_unix_scall64:
/*
 * hndl_unix_scall64
 *
 * Unix-class branch of the 64-bit syscall dispatch: bumps the unix syscall
 * counter of the thread addressed by %rcx, enables interrupts and calls
 * unix_syscall64 with %r15 as its one argument.
 */
Entry(hndl_unix_scall64)
	incl	TH_SYSCALLS_UNIX(%rcx)		/* increment call count   */
	sti					/* enable interrupts before calling into C */

	CCALL1(unix_syscall64, %r15)
	/*
	 * always returns through thread_exception_return
	 */
只有一個函數調用,unix_syscall64,接下來看看這個函數的definition:
* sysent * * proc * uthread **= DEBUG (regs->rax == == (__probable(!(uthread->uu_flag &= ( proc *= (__improbable(p ==->rax =->isf.rflags |== = regs->rax &>= NUM_SYSENT ? : code], regs->= (code >= NUM_SYSENT) ? &sysent[] : &= ( *)(&regs-> (__improbable(callp === regs->= (code >= NUM_SYSENT) ? &sysent[] : &= ( *)(&regs->=
可以看到這裏首先從x86_saved_state_t中取得系統調用號code,然後從數組sysent中得到系統調用函數,給callp;再後面是一些參數處理,和callp的執行。
接下去就到了具體的系統調用函數。
(大概介紹如上,有人拍磚嗎?一塊兒瞭解啊~)