Linux內核架構讀書筆記- 2.4.1 進程複製

時間 2019-11-29

標籤 linux 內核架構讀書筆記 2.4.1 進程複製欄目 Linux 简体版

原文原文鏈接

Linux 下進程複製三個函數linux

fork 創建父進程的一個副本，採用了cow 技術
vfork 相似於vfork，可是不建立父進程數據的副本，父子之間共享數據
clone產生線程，能夠對父子之間進程的共享複製進行精確複製

　 1 Cow （ copy on write ）session

　　2 執行系統調用數據結構

　　　　　　　　　　　　　　　系統相關　　　　系統無關多線程

　　　　　　fork 　　　　　　sys_forkapp

　　　　　　vfork -> sys_fork　　　-> do_forkdom

　　　　　　clone　　　　　 sys_vforkide

　do_fork 原型：函數

 1 /*
 2  *  Ok, this is the main fork-routine.
 3  *
 4  * It copies the process, and if successful kick-starts
 5  * it and waits for it to finish using the VM if required.
 6  */
 7 long do_fork(unsigned long clone_flags,
 8           unsigned long stack_start,
 9           struct pt_regs *regs,
10           unsigned long stack_size,
11           int __user *parent_tidptr,
12           int __user *child_tidptr)

　　clone_flags: 標誌集合，指定控制複製過程當中的一些屬性最低字節指定子進程終止時候發送給父進程信號，其他高位字節保存一些常數ui

　　stack_start：用戶棧的起始地址this

　　regs :指向寄存器集合的指針，struct pt_regs 是特定於體系結構

　　stack_size：用戶態棧的大小，該參數不必，設置爲0

　　parent_tid 、child_tid ：指向父子進程的TID（pid tid 區別參考 http://stackoverflow.com/questions/4517301/difference-between-pid-and-tid ）

　　sys_fork 體系相關，

　　/arch/x86/kernel/process_32.c

1 asmlinkage int sys_fork(struct pt_regs regs)
2 {
3     return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
4 }

　　sys_vfork 與 sys_fork 略微不一樣，前者使用額外的標誌

　　/arch/x86/kernel/process_32.c

 1 /*
 2  * This is trivial, and on the face of it looks like it
 3  * could equally well be done in user mode.
 4  *
 5  * Not so, for quite unobvious reasons - register pressure.
 6  * In user mode vfork() cannot have a stack frame, and if
 7  * done by calling the "clone()" system call directly, you
 8  * do not have enough call-clobbered registers to hold all
 9  * the information you need.
10  */
11 asmlinkage int sys_vfork(struct pt_regs regs)
12 {
13     return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
14 }

sys_clone相似

　/arch/x86/kernel/process_32.c

 1 asmlinkage int sys_clone(struct pt_regs regs)
 2 {
 3     unsigned long clone_flags;
 4     unsigned long newsp;
 5     int __user *parent_tidptr, *child_tidptr;
 6 
 7     clone_flags = regs.ebx;
 8     newsp = regs.ecx;
 9     parent_tidptr = (int __user *)regs.edx;
10     child_tidptr = (int __user *)regs.edi;
11     if (!newsp)
12         newsp = regs.esp;
13     return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
14 }

　3 do_fork

　　copy_process 後文描述

　　肯定pid，若是設置了CLONE_NEWPID 調用 task_pid_nr_ns，不然調用調用task_pid_vnr 獲取局部id，代碼以下

　　kernel/fork.c

1 /*
2          * this is enough to call pid_nr_ns here, but this if
3          * improves optimisation of regular fork()
4          */
5         nr = (clone_flags & CLONE_NEWPID) ?
6             task_pid_nr_ns(p, current->nsproxy->pid_ns) :
7                 task_pid_vnr(p);

pid

　　若是使用ptrace 監控新的進程，建立新的進程後會向其發送 SIGSTOP信號，便於調試器檢查數據

kernel/fork.c

1 if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
2             /*
3              * We'll start up with an immediate SIGSTOP.
4              */
5             sigaddset(&p->pending.signal, SIGSTOP);
6             set_tsk_thread_flag(p, TIF_SIGPENDING);
7         }

　　子進程使用wake_up_new_task喚醒，及將task_struct 加入到調度器隊列

　　若是子進程在父進程以前開始運行，能夠大大減小複製內存頁的工做量

　　kernel/fork.c

1 if (!(clone_flags & CLONE_STOPPED))
2             wake_up_new_task(p, clone_flags);
3         else
4             p->state = TASK_STOPPED;

kernel/sched.c

/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
    unsigned long flags;
    struct rq *rq;

    rq = task_rq_lock(p, &flags);
    BUG_ON(p->state != TASK_RUNNING);
    update_rq_clock(rq);

    p->prio = effective_prio(p);

    if (!p->sched_class->task_new || !current->se.on_rq) {
        activate_task(rq, p, 0);
    } else {
        /*
         * Let the scheduling class do new task startup
         * management (if any):
         */
        p->sched_class->task_new(rq, p);
        inc_nr_running(p, rq);
    }
    check_preempt_curr(rq, p);
    task_rq_unlock(rq, &flags);
}

wake_up_new_task

若是使用Vfork，須要啓動子進程的的完成機制，子進程的task_struct 的 vfork_done 用於次，藉助於wait_for_completion，父進程一直睡眠直到子進程退出，在子進程退出是，內核調用complete（vfork_done），喚醒因該變量睡眠的進程。經過採用這種方法內核能夠確保vfork生成的子進程的父進程一直處於不活動狀態，直至子進程退出或執行一個新的程序，父進程的臨時睡眠，也確保了兩個進程不會彼此干擾操做對方的進程

kernel/fork.c

1 if (clone_flags & CLONE_VFORK) {
2             freezer_do_not_count();
3             wait_for_completion(&vfork);
4             freezer_count();
5             if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
6                 current->ptrace_message = nr;
7                 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
8             }
9         }

4 進程複製

　　主要由copy_process 完成，下面是簡化版本

　　　　copy_process 定義

　　　　kernel/fork.c

 1 /*
 2  * This creates a new process as a copy of the old one,
 3  * but does not actually start it yet.
 4  *
 5  * It copies the registers, and all the appropriate
 6  * parts of the process environment (as per the clone
 7  * flags). The actual kick-off is left to the caller.
 8  */
 9 static struct task_struct *copy_process(unsigned long clone_flags,
10                     unsigned long stack_start,
11                     struct pt_regs *regs,
12                     unsigned long stack_size,
13                     int __user *child_tidptr,
14                     struct pid *pid)

　　　複製受到許多標註的控制，能夠參考 clone(2)

　　　有些標誌的組合是沒有意義的，　

　　 kernel/fork.c

if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

View Code

　　注意

　　linux/err.h

1 static inline void *ERR_PTR(long error)
2 {
3     return (void *) error;
4 }

　　dup_task_struct 創建父進程的副本

　　父子進程的task_struct 實例只有一個不一樣，新進程分配了一個新的核心態棧，即task_struct->stack,棧和thread_info一同保存在一個聯合中，thread_info保存了線程須要的特定於處理器的底層信息。

　　kernel/sched.c

1 union thread_union {
2     struct thread_info thread_info;
3     unsigned long stack[THREAD_SIZE/sizeof(long)];
4 };

　　在大多數體系結構上。使用一兩個內存頁來保存thread_union實例。

　 thread_info 保存了特定於體系的，可是大部分相似

　　<asm-alpha/thread_info.h>

 1 struct thread_info {
 2     struct pcb_struct    pcb;        /* palcode state */
 3 
 4     struct task_struct    *task;        /* main task structure */
 5     unsigned int        flags;        /* low level flags */
 6     unsigned int        ieee_state;    /* see fpu.h */
 7 
 8     struct exec_domain    *exec_domain;    /* execution domain */
 9     mm_segment_t        addr_limit;    /* thread address space */
10     unsigned        cpu;        /* current CPU */
11     int            preempt_count; /* 0 => preemptable, <0 => BUG */
12 
13     int bpt_nsaved;
14     unsigned long bpt_addr[2];        /* breakpoint handling  */
15     unsigned int bpt_insn[2];
16 
17     struct restart_block    restart_block;
18 };

　　task 指向進程的task_struct

　　exec_domain 實現執行區間後者用於在一類計算機上實現多種ABI（應用程序二進制接口），eg：64位運行32位程序

　　flags:特定於進程的標誌，咱們只關心兩個

　　若是進程由待決信號則置位TIF_SIGPENDING

　　TIF_NEED_RESCHED:表示該進程應該或者想要調度器選擇另外一個進程替換本進程執行。

　　其餘都是與硬件相關的，幾乎不使用

　　CPU 在其上執行的CPU數目

　　preempt_count實現內核搶佔所須要的一個計數器

　　addr_limit :指定進程可使用的虛擬地址上線，該限制只限制普通進程，內核進程能夠訪問整個虛擬地址空間

　　restart_block 用於實現信號機制

　　task_strucrt 、thread_info 和內核棧的關係

　　內核組件的棧使用可能了過多的棧空間時，內核棧會溢出，內核提供了kstack_end 函數，用於判斷給定的地址是否位於棧的有效部分

　　dup_task_struct 複製父進程task_struct 和thread_info ，此時父子進程的task_struct除了stack之外都是同樣的，子進程的task_struct 會在copy_process過程當中修改

　　current && current_thread_info 通常體系都定義成宏。current 用於獲取當前task_struct,current_thread_info 用於獲取當前 thread_info

　　繼續copy_process

　　dup_task_struct成功以後，檢查特定用戶是否超過建立最大進程數目

　　kernel/fork.c

1 if (atomic_read(&p->user->processes) >=
2             p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
3         if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
4             p->user != current->nsproxy->user_ns->root_user)
5             goto bad_fork_free;
6     }

　　接下來調用sched_fork ，子進程的狀態進行初始化，爲掛到運行隊列上做準備（後面分析調度，待續。。。）

　　接下來調用copy_xyz 函數

　　　　kernel/fork.c

 1 if ((retval = security_task_alloc(p)))
 2         goto bad_fork_cleanup_policy;
 3     if ((retval = audit_alloc(p)))
 4         goto bad_fork_cleanup_security;
 5     /* copy all the process information */
 6     if ((retval = copy_semundo(clone_flags, p)))
 7         goto bad_fork_cleanup_audit;
 8     if ((retval = copy_files(clone_flags, p)))
 9         goto bad_fork_cleanup_semundo;
10     if ((retval = copy_fs(clone_flags, p)))
11         goto bad_fork_cleanup_files;
12     if ((retval = copy_sighand(clone_flags, p)))
13         goto bad_fork_cleanup_fs;
14     if ((retval = copy_signal(clone_flags, p)))
15         goto bad_fork_cleanup_sighand;
16     if ((retval = copy_mm(clone_flags, p)))
17         goto bad_fork_cleanup_signal;
18     if ((retval = copy_keys(clone_flags, p)))
19         goto bad_fork_cleanup_mm;
20     if ((retval = copy_namespaces(clone_flags, p)))
21         goto bad_fork_cleanup_keys;
22     retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);

　　若是COPY_SYSVSEM 置位。則copy_semundo 使用父進程System V 信號
　　若是CLONE_FILES置位，則copy_files使用父進程的文件描述符，不然建立新的files結構，包含的信息與父進程相同
　　若是CLONE_FS置位，則copy_fs 使用父進程的文件系統上下文
　　若是CLONE_SIGHAND或CLONE_THREAD置位，則使用父進程的信號處理程序
　　若是COPY_MM置位copy_mm讓父子進程共享同一地址空間，此時兩個進程共享mm_struct
　　若是COPY_MM沒有置位，並不必定要複製進程的地址空間，採用cow
　　copy_namespace 建立子進程的命名空間
　　copy_thread特定於體系結構

繼續copy_process ,內核須要填充task_struct父子進程不一樣的各個成員，包含以下

　　task_struct 的鏈表元素

　　間隔定時器成員，cpu_timers

　　待決信號列表（TODO。。。）

　　設置id

　　對於線程，線程組ID與分支進程相同：

　　kernel/fork.c

1 p->pid = pid_nr(pid);
2     p->tgid = p->pid;
3     if (clone_flags & CLONE_THREAD)
4         p->tgid = current->tgid;

　　對於普通進程。父進程是分支進程，對線程，因爲分支進程被視爲分支進程內部的第二（第三四等）個執行序列，其父進程應該是分支進程的父進程：

　　kernel/fork.c

1 /* CLONE_PARENT re-uses the old parent */
2     if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
3         p->real_parent = current->real_parent;
4     else
5         p->real_parent = current;
6     p->parent = p->real_parent;

　　非線程的普通進程能夠經過設置CLONE_PARENT 觸發一樣的行爲，對於線程來講，普通進程的線程組組長是線程自己，對於線程，其組長是當前進程的組長

　　kernel/fork.c

1 p->group_leader = p;
2 if (clone_flags & CLONE_THREAD) {
3         p->group_leader = current->group_leader;
4 }

　　新進程必須經過children鏈表與父進程鏈接起來，經過宏add_parent處理的　，此外新進程必須加入到ID數據結構體系中

　　kernel/fork.c

 1         add_parent(p);
 2         if (unlikely(p->ptrace & PT_PTRACED))
 3             __ptrace_link(p, current->parent);
 4 
 5         if (thread_group_leader(p)) {
 6             if (clone_flags & CLONE_NEWPID)
 7                 p->nsproxy->pid_ns->child_reaper = p;
 8 
 9             p->signal->tty = current->signal->tty;
10             set_task_pgrp(p, task_pgrp_nr(current));
11             set_task_session(p, task_session_nr(current));
12             attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
13             attach_pid(p, PIDTYPE_SID, task_session(current));
14             list_add_tail_rcu(&p->tasks, &init_task.tasks);
15             __get_cpu_var(process_counts)++;
16         }
17         attach_pid(p, PIDTYPE_PID, pid);

5建立線程時的特別問題

　　講一下用戶線程庫用於實現多線程功能的標誌

　　CLONE_PARENT_SETTID將生成線程的pid複製到clone調用指定的用戶空間的某個地址

1 if (clone_flags & CLONE_PARENT_SETTID)
2     put_user(nr, parent_tidptr);

　　CLONE_CHILD_SETTID將另外一個傳遞到CLONE的用戶空間指針保存在新進程的task_struct

1 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;

　　在進程第一次執行時，內核會調用schedule_tail函數將當前PID複製到該地址

　　kernel/sched.c

/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
    __releases(rq->lock)
{
    struct rq *rq = this_rq();

    finish_task_switch(rq, prev);
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
    /* In this case, finish_task_switch does not reenable preemption */
    preempt_enable();
#endif
    if (current->set_child_tid)
        put_user(task_pid_vnr(current), current->set_child_tid);
}

　　CLONE_ CHILD_CLEARTID首先會在copy_process將用戶空間指針child_tidptr保存在task_struct 中，注意此次是另外一個不一樣成員

　　kernel/fork.c

1 /*
2      * Clear TID on mm_release()?
3      */
4     p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
    struct completion *vfork_done = tsk->vfork_done;

    /* Get rid of any cached register state */
    deactivate_mm(tsk, mm);

    /* notify parent sleeping on vfork() */
    if (vfork_done) {
        tsk->vfork_done = NULL;
        complete(vfork_done);
    }

    /*
     * If we're exiting normally, clear a user-space tid field if
     * requested.  We leave this alone when dying by signal, to leave
     * the value intact in a core dump, and to save the unnecessary
     * trouble otherwise.  Userland only wants this done for a sys_exit.
     */
    if (tsk->clear_child_tid
        && !(tsk->flags & PF_SIGNALED)
        && atomic_read(&mm->mm_users) > 1) {
        u32 __user * tidptr = tsk->clear_child_tid;
        tsk->clear_child_tid = NULL;

        /*
         * We don't check the error code - if userspace has
         * not set up a proper pointer then tough luck.
         */
        put_user(0, tidptr);
        sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
    }
}