內核定時任務timer_list

時間 2019-11-24

標籤內核定時任務 timer list 简体版

原文原文鏈接

使用定時器任務，可讓內核在未來的一個指定時刻執行一段指定的代碼。內核定時器相關的接口在linux/timer.h文件中。

本文將會先介紹定時任務的使用，而後在此基礎上了解其內部的實現邏輯。

1、定時任務結構體表示：

/*
 * Describes one timer task: where it is queued, when it expires, and which
 * handler (with which argument) is invoked for it.
 */
struct timer_list {
    struct list_head entry;  // links this timer task into a kernel timer list
    unsigned long expires;  // expiry time of the timer task

    void (*function)(unsigned long); // handler function of the timer task
    unsigned long data; // argument passed to the handler function

    struct tvec_base *base; // kernel timer base this timer task is associated with
#ifdef CONFIG_TIMER_STATS
    void *start_site;
    char start_comm[16];
    int start_pid;
#endif
#ifdef CONFIG_LOCKDEP
    struct lockdep_map lockdep_map;
#endif
};

2、定時任務相關的接口：

1. 初始化定時任務函數

/*
 * Static initializer for a struct timer_list: fills in the handler, expiry
 * and handler argument, marks the list entry with TIMER_ENTRY_STATIC and
 * points .base at boot_tvec_bases. The lockdep map is named after the
 * file and line of the expansion site.
 */
#define TIMER_INITIALIZER(_function, _expires, _data) {     \
        .entry = { .prev = TIMER_ENTRY_STATIC },    \
        .function = (_function),            \
        .expires = (_expires),              \
        .data = (_data),                \
        .base = &boot_tvec_bases,           \
        __TIMER_LOCKDEP_MAP_INITIALIZER(        \
            __FILE__ ":" __stringify(__LINE__)) \
    }
    
/* Define a struct timer_list variable called _name, initialized with TIMER_INITIALIZER. */
#define DEFINE_TIMER(_name, _function, _expires, _data)     \
    struct timer_list _name =               \
        TIMER_INITIALIZER(_function, _expires, _data)
     
/*
 * Run-time initialization of a timer: forwards to setup_timer_key() with the
 * handler fn and its argument data. Each expansion site gets its own static
 * lock_class_key, and the stringified timer expression is passed as the name.
 */
#define setup_timer(timer, fn, data)                    \
    do {                                \
        static struct lock_class_key __key;         \
        setup_timer_key((timer), #timer, &__key, (fn), (data));\
    } while (0)

主要是完成定時任務的成員初始化，這裏要注意一下 .base = &boot_tvec_bases；boot_tvec_bases是內核在初始化的時候建立好的。

其實過時時間expires在初始化的時候設置，通常是沒有什麼意義的，一般都是在註冊定時器任務的時候才設置過時時間。

2. 註冊定時任務：

/* Register @timer so that it fires at the expiry already stored in timer->expires. */
void add_timer(struct timer_list *timer);

當一個定時任務註冊到內核的定時器列表後，就會處於激活狀態。這裏要注意的是：註冊的定時任務只會被執行一次，由於在執行的時候會將其從定時器鏈表中移除；若是須要實現每隔一段時間就執行，則須要在其定時任務函數中再次註冊，才能再次被執行。

3. 註銷定時任務：

/*
 * Unregister a timer task. Only the prototypes are shown here.
 * NOTE(review): per the accompanying text, del_timer_sync() additionally
 * waits for a handler that is currently executing to finish - confirm
 * against the kernel sources.
 */
int del_timer(struct timer_list * timer);
int del_timer_sync(struct timer_list *timer);

有可能在註銷定時任務的時候，此時的定時任務正在被執行中，那麼調用del_timer_sync()就會等待任務被執行完畢後再註銷。

4. 修改定時任務的過時時間

當調用add_timer()函數將定時任務註冊後，定時任務就處於激活的狀態，此時若是須要修改過時時間，則必須經過以下接口來完成：

/* Change the expiry of @timer to @expires; returns an int status (see the definition for its meaning). */
int mod_timer(struct timer_list *timer, unsigned long expires);

5. 判斷定時任務的狀態：

/*
 * Tell whether @timer is currently queued.
 * Returns 1 when the timer's list entry has a non-NULL next link, else 0.
 */
static inline int timer_pending(const struct timer_list * timer)
{
    if (timer->entry.next == NULL)
        return 0;

    return 1;
}

看完上面的接口介紹以後，再看一個簡單的例子：

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/delay.h>

/*
 * Logging helpers for the demo module. Every message is prefixed with the
 * calling function's name (ERR/DBG also add the line number) and is
 * terminated with a newline so that each printk() emits one complete log
 * line instead of being merged with whatever is printed next.
 */
#define ENTER() printk(KERN_DEBUG "%s() Enter\n", __func__)
#define EXIT() printk(KERN_DEBUG "%s() Exit\n", __func__)
#define ERR(fmt, args...) printk(KERN_ERR "%s()-%d: " fmt "\n", __func__, __LINE__, ##args)
#define DBG(fmt, args...) printk(KERN_DEBUG "%s()-%d: " fmt "\n", __func__, __LINE__, ##args)


/* Demo state: a kernel timer plus a countdown of how many more times it should fire. */
struct test_timer {
    struct timer_list t;    /* the timer registered with the kernel */
    unsigned long nums;     /* remaining count; decremented by the handler on each run */
};

/*
 * Timer handler of the demo: logs the remaining count, decrements it and
 * re-arms the timer one second (HZ jiffies) after its previous expiry for as
 * long as the count has not reached zero.
 *
 * @data: address of the owning struct test_timer, cast to unsigned long
 *        (this is the value handed to setup_timer()).
 */
static void my_timer_func(unsigned long data)
{
    struct test_timer *timer = (struct test_timer *)data;

    /*
     * Log first and decrement in a separate statement: a side effect buried
     * in a logging-macro argument silently disappears if the macro is ever
     * compiled out.
     */
    DBG("nums: %lu", timer->nums);

    /* nums is unsigned; never let it wrap around when it is already 0. */
    if (timer->nums > 0)
        timer->nums--;

    if (timer->nums > 0) {
        /* A timer fires only once per registration, so register it again. */
        mod_timer(&timer->t, timer->t.expires + HZ);
    }
}

/* The single demo timer instance, shared by the init/exit functions and the handler. */
static struct test_timer my_timer;

/*
 * Module entry point: initializes the demo timer with my_timer_func as its
 * handler, sets the countdown to 30, sleeps for up to two seconds and then
 * arms the timer to fire two seconds (2 * HZ jiffies) from now.
 * Always returns 0.
 */
static int __init timer_demo_init(void)
{
    /* The handler receives the address of my_timer as its argument. */
    setup_timer(&my_timer.t, my_timer_func, (unsigned long)&my_timer);
    my_timer.nums = 30;

    msleep_interruptible(2000);
    DBG("before mod_timer");
    /* mod_timer() is used here to register the (not yet pending) timer. */
    mod_timer(&my_timer.t, jiffies + 2 * HZ);

    DBG("success");
    return 0;
}

/*
 * Module exit point: waits (polling once per second) until the handler has
 * counted nums down to zero, then makes sure the timer is neither pending
 * nor still executing before the module goes away.
 */
static void __exit timer_demo_exit(void)
{
    ENTER();
    while (my_timer.nums > 0) {
        DBG("waiting my_timer exit");
        msleep_interruptible(1000);
    }

    /*
     * nums reaches 0 while the handler is still running, so seeing 0 here
     * does not prove that the handler has returned. del_timer_sync()
     * removes the timer if it is still queued and waits for a running
     * handler to finish, so the handler code and my_timer are no longer in
     * use once this function returns.
     */
    del_timer_sync(&my_timer.t);

    EXIT();
}

/* Module metadata and registration of the init/exit entry points defined above. */
MODULE_LICENSE("GPL");
module_init(timer_demo_init);
module_exit(timer_demo_exit);

3、定時任務的註冊：

接下來，分析一下內核是如何管理咱們註冊的定時任務的，首先從add_timer()開始：

void add_timer(struct timer_list *timer)                                                                                                  
{
    BUG_ON(timer_pending(timer));
    mod_timer(timer, timer->expires);
}

這裏能夠看出，咱們調用add_timer()和調用mod_timer()進行註冊，是同樣的。

int mod_timer(struct timer_list *timer, unsigned long expires)                                                                            
{
    /*
     * This is a common optimization triggered by the
     * networking code - if the timer is re-modified
     * to be the same thing then just return:
     */
    if (timer_pending(timer) && timer->expires == expires)
        return 1;

    return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
}

先判斷定時任務是否已經處於激活狀態、而且過時時間沒有變化；若是這樣，則直接返回1，避免重複註冊，不然調用__mod_timer()：

/*
 * Core of timer (re)registration: detach @timer if it is pending, pick the
 * per-CPU base it should live on, store the new expiry and queue it with
 * internal_add_timer().
 *
 * @timer:        the timer to (re)arm; its ->function must be set.
 * @expires:      new expiry value.
 * @pending_only: when true, a timer that is not pending is left untouched.
 * @pinned:       when non-zero, the CONFIG_NO_HZ/CONFIG_SMP CPU-migration
 *                branch below is skipped.
 *
 * Returns 1 if the timer was pending on entry, 0 otherwise.
 */
static inline int __mod_timer(struct timer_list *timer, unsigned long expires,
                        bool pending_only, int pinned)
{
    struct tvec_base *base, *new_base;
    unsigned long flags;
    int ret = 0 , cpu;

    timer_stats_timer_set_start_info(timer);
    BUG_ON(!timer->function);

    base = lock_timer_base(timer, &flags);

    /* If the timer is already pending, first remove it from its list: detach_timer() */
    if (timer_pending(timer)) {
        detach_timer(timer, 0);
        /*
         * The detached timer's expiry was the recorded next_timer of this
         * base (and the timer is not deferrable): reset next_timer to
         * timer_jiffies.
         */
        if (timer->expires == base->next_timer &&
            !tbase_get_deferrable(timer->base))
            base->next_timer = base->timer_jiffies;
        ret = 1;
    } else {
        /* Not pending: nothing to modify when the caller asked for pending_only. */
        if (pending_only)
            goto out_unlock;
    }

    debug_activate(timer, expires);

    /* This value is overwritten by the per_cpu() lookup further down. */
    new_base = __get_cpu_var(tvec_bases);

    cpu = smp_processor_id();

#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
    /*
     * Unpinned timer, migration enabled and this CPU idle: use the CPU
     * reported by get_nohz_load_balancer() instead, if there is one.
     */
    if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
        int preferred_cpu = get_nohz_load_balancer();

        if (preferred_cpu >= 0)
            cpu = preferred_cpu;
    }
#endif
    new_base = per_cpu(tvec_bases, cpu);
    if (base != new_base) {
        /*
         * We are trying to schedule the timer on the local CPU.
         * However we can't change timer's base while it is running,
         * otherwise del_timer_sync() can't detect that the timer's
         * handler yet has not finished. This also guarantees that
         * the timer is serialized wrt itself.
         */
        if (likely(base->running_timer != timer)) {
            /* See the comment in lock_timer_base() */
            timer_set_base(timer, NULL);
            spin_unlock(&base->lock);
            base = new_base; 
            spin_lock(&base->lock);
            timer_set_base(timer, base);
        }
    }

    timer->expires = expires;
    /* Record the new expiry as next_timer when it is earlier (non-deferrable timers only). */
    if (time_before(timer->expires, base->next_timer) &&
        !tbase_get_deferrable(timer->base))
        base->next_timer = timer->expires;
    internal_add_timer(base, timer);

out_unlock:
    spin_unlock_irqrestore(&base->lock, flags);

    return ret;
}

最終調用internal_add_timer()完成註冊：

static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
    unsigned long expires = timer->expires;
    unsigned long idx = expires - base->timer_jiffies;
    struct list_head *vec;

    /* 根據過時時間選擇合適的的定時器鏈表 */
    if (idx < TVR_SIZE) {
        int i = expires & TVR_MASK;
        vec = base->tv1.vec + i;
    } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
        int i = (expires >> TVR_BITS) & TVN_MASK;
        vec = base->tv2.vec + i;
    } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
        int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
        vec = base->tv3.vec + i;
    } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
        int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
        vec = base->tv4.vec + i;
    } else if ((signed long) idx < 0) {
        /*
         * Can happen if you add a timer with expires == jiffies,
         * or you set a timer to go off in the past
         */
        vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
    } else {
        int i;
        /* If the timeout is larger than 0xffffffff on 64-bit
         * architectures then we use the maximum timeout:
         */
        if (idx > 0xffffffffUL) {
            idx = 0xffffffffUL;
            expires = idx + base->timer_jiffies;
        }
        i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
        vec = base->tv5.vec + i;
    }
    /*
     * Timers are FIFO:
     */
    list_add_tail(&timer->entry, vec); /*添加到定時器鏈表尾部*/
}

這裏須要補充說明一下struct tvec_base結構體，看完以後就大體清楚是怎麼管理的了：

/*
 * per-CPU timer vector definitions:
 *
 * TVR_* describe the root vector (tv1), TVN_* the higher-level vectors
 * (tv2..tv5). *_BITS is the number of index bits (smaller when
 * CONFIG_BASE_SMALL is set), *_SIZE the resulting number of buckets and
 * *_MASK the mask that extracts a bucket index.
 */
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)                                                                                                          
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)

/* A higher-level timer vector: TVN_SIZE list heads, one per bucket. */
struct tvec {
    struct list_head vec[TVN_SIZE];
};

/* The root timer vector: TVR_SIZE list heads, one per bucket. */
struct tvec_root {
    struct list_head vec[TVR_SIZE];
};

/*
 * Timer state of one CPU: a lock, bookkeeping fields and five timer
 * vectors (tv1 is the root vector, tv2..tv5 the higher levels).
 */
struct tvec_base {                                                                                                                        
    spinlock_t lock;
    struct timer_list *running_timer; // the timer task that is currently being run
    unsigned long timer_jiffies; // compared against jiffies and advanced by one per pass in __run_timers()
    unsigned long next_timer; // updated in __mod_timer(); presumably the earliest pending expiry - TODO confirm
    struct tvec_root tv1;
    struct tvec tv2;
    struct tvec tv3;
    struct tvec tv4;
    struct tvec tv5;
} ____cacheline_aligned;

每個CPU都會包含一個struct tvec_base類型的對象，用於存儲註冊到每一個CPU上的定時任務。看完這個結構體，能夠發現包含有5個鏈表數組，分別用於存儲不一樣過時時間的定時任務，分佈以下：

設 idx = 過時時間 - base->timer_jiffies（即距離到期還剩多少個jiffies），並以默認配置 TVR_BITS = 8、TVN_BITS = 6 爲例：

idx 在 0 ~ (1 << 8) --> tv1, 具體在tv1.vec數組的哪一個鏈表，則是經過掩碼來肯定，即: 過時時間 & ((1 << 8) - 1)

idx 在 (1 << 8) ~ (1 << (8+6)) --> tv2, 具體在tv2.vec數組的哪一個鏈表，則是經過移位和掩碼來肯定，即: (過時時間 >> 8) & ((1 << 6) - 1)

idx 在 (1 << (8+6)) ~ (1 << (8+2*6)) --> tv3，具體在tv3.vec數組的哪一個鏈表，也是經過移位和掩碼肯定，即: (過時時間 >> (8+6)) & ((1 << 6) - 1)

idx 在 (1 << (8+2*6)) ~ (1 << (8+3*6)) --> tv4, 具體在tv4.vec數組的哪一個鏈表，也是經過移位和掩碼肯定，即: (過時時間 >> (8+2*6)) & ((1 << 6) - 1)

若是 idx 超過 (1 << (8+3*6)) --> tv5, 具體在tv5.vec數組的哪一個鏈表，也是經過移位和掩碼肯定，即: (過時時間 >> (8+3*6)) & ((1 << 6) - 1)

另外，若是 idx 爲負數（過時時間已經是過去的時刻），則直接放入 tv1 中 base->timer_jiffies & ((1 << 8) - 1) 對應的鏈表。

之因此要分紅5個數組，就是爲了提升效率，由於當有中斷發生，就會觸發內核去檢查是否存在過時的定時任務須要執行，若是把全部的鏈表都去遍歷，那麼顯然效率會很低下，因此內核每次只會去檢查tv1.vec數組上的鏈表是否存在須要執行的定時任務。具體是怎麼執行的，下面會有分析。這裏暫時能夠理解爲註冊一個定時任務，就是將此定時任務保存到本地CPU上的定時器的某個鏈表中。

4、定時任務的執行：

定時器的執行，是在軟中斷中執行的，是在一個原子上下文環境中，即不容許定時任務發生睡眠等待。

在內核初始化的時候，會調用init_timers()註冊軟中斷：

/*
 * Boot-time setup of the timer subsystem: invokes the timer CPU notifier
 * by hand with CPU_UP_PREPARE for the current CPU, initializes timer
 * statistics, registers the notifier and installs run_timer_softirq() as
 * the handler for TIMER_SOFTIRQ.
 */
void __init init_timers(void)
{
    int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
                (void *)(long)smp_processor_id());

    init_timer_stats();

    /* A NOTIFY_BAD result from the notifier call above is treated as fatal. */
    BUG_ON(err == NOTIFY_BAD);
    register_cpu_notifier(&timers_nb);                                                                                                    
    open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}

調用open_softirq()函數註冊定時器的軟中斷，處理函數爲run_timer_softirq。軟中斷是由軟件模擬的中斷，大部分狀況下軟中斷會在irq_exit階段被執行，在irq_exit階段沒被處理完的軟中斷，會在守護進程ksoftirqd中執行。這裏暫時不深究軟中斷的實現原理，暫時認爲中斷發生以後，會觸發定時器軟中斷的處理函數run_timer_softirq的執行。

/*
 * This function runs timers and the timer-tq in bottom half context.
 */
static void run_timer_softirq(struct softirq_action *h)                                                                                   
{
    struct tvec_base *base = __get_cpu_var(tvec_bases);

    perf_event_do_pending();

    hrtimer_run_pending();

    // 判斷是否有超時，jiffies >= base->timer_jiffies則表示有超時，有定時任務須要執行。
    if (time_after_eq(jiffies, base->timer_jiffies))
        __run_timers(base);
}

/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 *
 * This function cascades all vectors and executes all expired timer
 * vectors.
 *
 * base->lock is held while the lists are manipulated and is dropped around
 * each handler call. A handler that returns with a different preempt count
 * than it was entered with triggers BUG().
 */
static inline void __run_timers(struct tvec_base *base)
{
    struct timer_list *timer;

    spin_lock_irq(&base->lock);
    /* Process one tick per iteration until timer_jiffies has passed jiffies. */
    while (time_after_eq(jiffies, base->timer_jiffies)) {
        struct list_head work_list;
        struct list_head *head = &work_list;
        int index = base->timer_jiffies & TVR_MASK;

        /*
         * Cascade timers:
         */
        /*
         * Only when the tv1 index is 0: re-queue the timers of the current
         * tv2 bucket through cascade(), and continue with tv3, tv4 and tv5
         * only while the bucket index just cascaded was 0 as well.
         */
        if (!index &&
            (!cascade(base, &base->tv2, INDEX(0))) &&
                (!cascade(base, &base->tv3, INDEX(1))) &&
                    !cascade(base, &base->tv4, INDEX(2)))
            cascade(base, &base->tv5, INDEX(3));
        ++base->timer_jiffies;
        
        /* Take the tv1 bucket of this tick as the list of timers to run. */
        list_replace_init(base->tv1.vec + index, &work_list);
        while (!list_empty(head)) {
            void (*fn)(unsigned long);
            unsigned long data;

            timer = list_first_entry(head, struct timer_list,entry);
            fn = timer->function; // handler function of the timer task
            data = timer->data;

            timer_stats_account_timer(timer);

            /* Record the timer as running and detach it before calling the handler. */
            set_running_timer(base, timer);
            detach_timer(timer, 1);

            /* The handler runs without base->lock held. */
            spin_unlock_irq(&base->lock);
            {
                int preempt_count = preempt_count();
                
#ifdef CONFIG_LOCKDEP
                /*
                 * It is permissible to free the timer from
                 * inside the function that is called from
                 * it, this we need to take into account for
                 * lockdep too. To avoid bogus "held lock
                 * freed" warnings as well as problems when
                 * looking into timer->lockdep_map, make a
                 * copy and use that here.
                 */
                struct lockdep_map lockdep_map = timer->lockdep_map;
#endif
                /*
                 * Couple the lock chain with the lock chain at
                 * del_timer_sync() by acquiring the lock_map
                 * around the fn() call here and in
                 * del_timer_sync().
                 */
                lock_map_acquire(&lockdep_map);

                trace_timer_expire_entry(timer);
                fn(data); // run the handler of the timer task
                trace_timer_expire_exit(timer);

                lock_map_release(&lockdep_map);

                /* The handler must leave the preempt count as it found it. */
                if (preempt_count != preempt_count()) {
                    printk(KERN_ERR "huh, entered %p "
                           "with preempt_count %08x, exited"
                           " with %08x?\n",
                           fn, preempt_count,
                           preempt_count());
                    BUG();
                }
            }
            spin_lock_irq(&base->lock);
        }
    }
    set_running_timer(base, NULL);
    spin_unlock_irq(&base->lock);
}

這段代碼的邏輯比較複雜，我也還不能徹底理解，不過從上面來看，就是把已經超時的鏈表取出到work_list，而後依次執行work_list上的定時任務。

在代碼的前面部分，有一段是從新調整定時任務鏈表的操做：

/* Excerpt from __run_timers(): index is the tv1 bucket that belongs to the current timer_jiffies. */
int index = base->timer_jiffies & TVR_MASK;

        /*
         * Cascade timers:
         */
        /* Runs only when index is 0; each further level is cascaded only if the previous bucket index was 0. */
        if (!index &&
            (!cascade(base, &base->tv2, INDEX(0))) &&
                (!cascade(base, &base->tv3, INDEX(1))) &&
                    !cascade(base, &base->tv4, INDEX(2)))
            cascade(base, &base->tv5, INDEX(3));
        
        ++base->timer_jiffies;

這裏要先看一下INDEX宏和cascade()函數：

static int cascade(struct tvec_base *base, struct tvec *tv, int index)
{
    /* cascade all the timers from tv up one level */
    struct timer_list *timer, *tmp;                                                                                                       
    struct list_head tv_list;
    
    list_replace_init(tv->vec + index, &tv_list);
    
    /*
     * We are removing _all_ timers from the list, so we
     * don't have to detach them individually.
     */
    list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
        BUG_ON(tbase_get_base(timer->base) != base);
        internal_add_timer(base, timer);
    }
    
    return index;
}

/*
 * Bucket index of the current base->timer_jiffies within the (N+1)-th
 * higher-level vector (N = 0 is used with tv2, N = 3 with tv5). Expects a
 * variable named base to be in scope at the point of use.
 */
#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)

能夠看出INDEX宏是根據定時器當前的base->timer_jiffies來獲得其在對應數組（tv2～tv5）中的索引，而cascade()函數就是將此索引對應的鏈表取出，而後將此鏈表上的每個定時任務經過internal_add_timer()從新加入到定時器中。