Linux soft lockup分析

關鍵詞:watchdog、soft lockup、percpu thread、lockdep等。

 

近日遇到一個soft lockup問題,打印類似「[ 56.032356] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [cat:153]」。

這是lockup檢測機制在起作用,lockup檢測機制包括soft lockup detector和hard lockup detector。

藉機分析下soft lockup機制以及什麼情況下導致soft watchdog異常、對watchdog的配置、如何定位異常點。

這裏跳過hard lockup detector的分析。

1. soft lockup機制分析

lockup_detector_init()函數首先獲取sample_period以及watchdog_cpumask,然後根據情況創建線程,啓動喂狗程序;創建hrtimer啓動看門狗。

這裏有兩個重點:一個是創建內核線程的API,另一個是struct smp_hotplug_thread結構體。

/*
 * Boot-time entry point of the lockup detector: computes the sampling
 * period, selects the CPUs to monitor and, if the watchdog is enabled,
 * starts it on those CPUs.
 */
void __init lockup_detector_init(void)
{
    /*
     * Sets sample_period to watchdog_thresh * 2 / 5 seconds, i.e. the
     * watchdog is fed every 4 seconds in the configuration discussed here.
     */
    set_sample_period();

    /* ... (code elided in this excerpt) ... */

    cpumask_copy(&watchdog_cpumask, cpu_possible_mask);

    if (watchdog_enabled)
        watchdog_enable_all_cpus();
}

static int watchdog_enable_all_cpus(void)
{
    int err = 0;

    if (!watchdog_running) {----------------------------------若是當前watchdog_running沒有再運行,那麼爲每一個CPU建立一個watchdog/x線程,這些線程每隔sample_period時間喂一次狗。watchdog_threads時watchdog/x線程的主要輸入參數,watchdog_cpumask規定了爲哪些CPU建立線程。
        err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
                                 &watchdog_cpumask);
        if (err)
            pr_err("Failed to create watchdog threads, disabled\n");
        else
            watchdog_running = 1;
    } else {
        err = update_watchdog_all_cpus();

        if (err) {
            watchdog_disable_all_cpus();
            pr_err("Failed to update lockup detectors, disabled\n");
        }
    }

    if (err)
        watchdog_enabled = 0;

    return err;
}

/*
 * Stop the watchdog on all CPUs: if it is running, clear watchdog_running
 * and unregister the per-CPU watchdog threads.
 */
static void watchdog_disable_all_cpus(void)
{
    if (watchdog_running) {
        watchdog_running = 0;
        smpboot_unregister_percpu_thread(&watchdog_threads);
    }
}

/*
 * Refresh the running watchdog threads by parking and then unparking them.
 *
 * Returns 0 on success, or the error from watchdog_park_threads(); in that
 * case the threads are not unparked by this function.
 */
static int update_watchdog_all_cpus(void)
{
    int ret;

    ret = watchdog_park_threads();
    if (ret)
        return ret;

    watchdog_unpark_threads();

    return 0;
}

/*
 * Park the watchdog thread of every watchdog CPU.
 *
 * watchdog_park_in_progress is set for the duration of the loop; while it
 * is set, watchdog_timer_fn() returns HRTIMER_NORESTART instead of rearming.
 *
 * Returns 0 on success, or the first kthread_park() error (the loop stops
 * at the first failure).
 */
static int watchdog_park_threads(void)
{
    int cpu, ret = 0;

    atomic_set(&watchdog_park_in_progress, 1);

    for_each_watchdog_cpu(cpu) {
        /*
         * Sets KTHREAD_SHOULD_PARK in struct kthread->flags; the
         * watchdog/x thread reacts by calling its ->park() callback.
         */
        ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
        if (ret)
            break;
    }

    atomic_set(&watchdog_park_in_progress, 0);

    return ret;
}

/* Unpark the watchdog thread of every watchdog CPU. */
static void watchdog_unpark_threads(void)
{
    int cpu;

    /*
     * Clears KTHREAD_SHOULD_PARK in struct kthread->flags; the watchdog/x
     * thread reacts by calling its ->unpark() callback.
     */
    for_each_watchdog_cpu(cpu)
        kthread_unpark(per_cpu(softlockup_watchdog, cpu));
}

 

1.1 watchdog_threads結構體介紹

在介紹如何創建watchdog/x線程之前,有必要先介紹一下struct smp_hotplug_thread結構體。

/*
 * Descriptor of a set of per-CPU threads managed by the smpboot code.
 * The callbacks are invoked with the CPU number the thread belongs to.
 */
struct smp_hotplug_thread {
    /* Pointer to the per-CPU struct task_struct pointers of the threads. */
    struct task_struct __percpu    **store;
    /* Links the descriptor into the hotplug_threads list. */
    struct list_head        list;
    /* Returns non-zero when the thread has work and should run. */
    int                (*thread_should_run)(unsigned int cpu);
    /* Main work function of the thread. */
    void                (*thread_fn)(unsigned int cpu);
    /* Optional; called once after the thread was created and parked. */
    void                (*create)(unsigned int cpu);
    /* Preparation done before the thread runs for the first time. */
    void                (*setup)(unsigned int cpu);
    /* Cleanup done when the thread is stopped. */
    void                (*cleanup)(unsigned int cpu, bool online);
    /* Called when the thread has to pause, e.g. its CPU goes offline. */
    void                (*park)(unsigned int cpu);
    /* Called when the thread resumes, e.g. its CPU comes online. */
    void                (*unpark)(unsigned int cpu);
    /* CPUs on which the thread is unparked, i.e. allowed to run. */
    cpumask_var_t            cpumask;
    bool                selfparking;
    /* Thread name format, e.g. "watchdog/%u". */
    const char            *thread_comm;
};

watchdog_threads是soft lockup監控線程的實體,基於此創建watchdog/x線程。

/*
 * Descriptor of the per-CPU soft lockup watchdog threads ("watchdog/<cpu>").
 * setup and unpark both use watchdog_enable(); cleanup ends up in
 * watchdog_disable(), which is also the park callback.
 */
static struct smp_hotplug_thread watchdog_threads = {
    .store            = &softlockup_watchdog,
    .thread_should_run    = watchdog_should_run,
    .thread_fn        = watchdog,
    .thread_comm        = "watchdog/%u",
    .setup            = watchdog_enable,
    .cleanup        = watchdog_cleanup,
    .park            = watchdog_disable,
    .unpark            = watchdog_enable,
};

static void watchdog_enable(unsigned int cpu)
{
    struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);

    /* kick off the timer for the hardlockup detector */
    hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    hrtimer->function = watchdog_timer_fn;------------------------------------------建立一個hrtimer,超時函數爲watchdog_timer_fn,這裏面會檢查watchdog_touch_ts變量是否超過20秒沒有被更新。若是是,則有soft lockup。 /* Enable the perf event */
    watchdog_nmi_enable(cpu);

    /* done here because hrtimer_start can only pin to smp_processor_id() */
    hrtimer_start(hrtimer, ns_to_ktime(sample_period),
              HRTIMER_MODE_REL_PINNED);---------------------------------------------啓動一個超時爲sample_period(4秒)的hrtimer,HRTIMER_MODE_REL_PINNED表示此hrtimer和當前CPU綁定。 /* initialize timestamp */ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);---------------------------------設置當前線程爲實時FIFO,而且優先級爲實時99.這個優先級表示高於全部的非實時線程,可是實時優先級最低的。
    __touch_watchdog();-------------------------------------------------------------更新watchdog_touch_ts變量,至關於喂狗操做。
}

/*
 * Apply scheduling policy @policy with priority @prio to the calling task
 * (current). The result of sched_setscheduler() is not checked.
 */
static void watchdog_set_prio(unsigned int policy, unsigned int prio)
{
    struct sched_param param = { .sched_priority = prio };

    sched_setscheduler(current, policy, &param);
}

/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
    /*
     * Feeding the watchdog simply means storing the current timestamp in
     * this CPU's watchdog_touch_ts.
     */
    __this_cpu_write(watchdog_touch_ts, get_timestamp());
}


static void watchdog_disable(unsigned int cpu)-------------------------------------至關於watchdog_enable()反操做,將線程恢復爲普通線程;取消hrtimer。
{
    struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);

    watchdog_set_prio(SCHED_NORMAL, 0);
    hrtimer_cancel(hrtimer);
    /* disable the perf event */
    watchdog_nmi_disable(cpu);
}

/*
 * cleanup callback of the watchdog thread: simply disables the watchdog on
 * @cpu. The @online argument is not used.
 */
static void watchdog_cleanup(unsigned int cpu, bool online)
{
    watchdog_disable(cpu);
}

/*
 * Tell the smpboot loop whether the watchdog thread has work to do.
 *
 * hrtimer_interrupts counts the hrtimer expirations; watchdog() copies it
 * into soft_lockup_hrtimer_cnt. Equal values mean no timer fired since the
 * thread last ran, so it need not run; different values mean it must run.
 */
static int watchdog_should_run(unsigned int cpu)
{
    return __this_cpu_read(hrtimer_interrupts) !=
        __this_cpu_read(soft_lockup_hrtimer_cnt);
}
static void watchdog(unsigned int cpu)
{
    __this_cpu_write(soft_lockup_hrtimer_cnt,
             __this_cpu_read(hrtimer_interrupts));-----------------------------------更新soft_lockup_hrtimer_cnt,在watch_should_run()中就返回false,表示線程不須要運行,即不須要喂狗。
    __touch_watchdog();--------------------------------------------------------------雖然就是一句話,可是卻很重要的喂狗操做。     if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
        watchdog_nmi_disable(cpu);
}

 

1.2 建立喂狗線程watchdog/x

在分析了watchdog_threads之後,再來看看如何創建watchdog/x線程。

/*
 * Register @plug_thread and create its per-CPU threads.
 *
 * A thread is created for every online CPU, but only the threads of CPUs
 * contained in @cpumask are unparked.
 *
 * Returns 0 on success, -ENOMEM if the cpumask cannot be allocated, or the
 * error from __smpboot_create_thread(); on that error all threads created
 * so far are destroyed and the descriptor is not added to hotplug_threads.
 */
int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
                       const struct cpumask *cpumask)
{
    unsigned int cpu;
    int ret = 0;

    if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
        return -ENOMEM;
    cpumask_copy(plug_thread->cpumask, cpumask);

    get_online_cpus();
    mutex_lock(&smpboot_threads_lock);
    /* Create one per-CPU thread (watchdog/x here) for every online CPU. */
    for_each_online_cpu(cpu) {
        ret = __smpboot_create_thread(plug_thread, cpu);
        if (ret) {
            /* Creation failed: release everything set up so far. */
            smpboot_destroy_threads(plug_thread);
            free_cpumask_var(plug_thread->cpumask);
            goto out;
        }
        /*
         * Only a CPU that IS in @cpumask gets its thread unparked; the
         * threads of all other CPUs stay parked.
         */
        if (cpumask_test_cpu(cpu, cpumask))
            smpboot_unpark_thread(plug_thread, cpu);
    }
    list_add(&plug_thread->list, &hotplug_threads);
out:
    mutex_unlock(&smpboot_threads_lock);
    put_online_cpus();
    return ret;
}

static int
__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
    struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
    struct smpboot_thread_data *td;

    if (tsk)
        return 0;

    td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
    if (!td)
        return -ENOMEM;
    td->cpu = cpu;
    td->ht = ht;

    tsk =kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
                    ht->thread_comm);-----------------------------------------在指定CPU上建立watchdog/x線程,處理函數爲smpboot_thread_fn()。 if (IS_ERR(tsk)) {
        kfree(td);
        return PTR_ERR(tsk);
    }
    /*
     * Park the thread so that it could start right on the CPU
     * when it is available.
     */
    kthread_park(tsk);--------------------------------------------------------在CPU上當即啓動watchdog/x線程。
    get_task_struct(tsk);-----------------------------------------------------增長對線程的引用計數。 *per_cpu_ptr(ht->store, cpu) = tsk;---------------------------------------store存放線程結構體指針的指針。 if (ht->create) {
        if (!wait_task_inactive(tsk, TASK_PARKED))
            WARN_ON(1);
        else
            ht->create(cpu);
    }
    return 0;
}

static int smpboot_thread_fn(void *data)
{
    struct smpboot_thread_data *td = data;
    struct smp_hotplug_thread *ht = td->ht;

    while (1) {
        set_current_state(TASK_INTERRUPTIBLE);
        preempt_disable();
        if (kthread_should_stop()) {----------------------------------------若是能夠終止線程,調用cleanup,退出線程。
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            /* cleanup must mirror setup */
            if (ht->cleanup && td->status != HP_THREAD_NONE)
                ht->cleanup(td->cpu, cpu_online(td->cpu));
            kfree(td);
            return 0;
        }

        if (kthread_should_park()) {----------------------------------------若是KTHREAD_SHOULD_PARK置位,調用park()暫停進程執行。
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            if (ht->park && td->status == HP_THREAD_ACTIVE) {
                BUG_ON(td->cpu != smp_processor_id());
                ht->park(td->cpu);
                td->status = HP_THREAD_PARKED;
            }
            kthread_parkme();
            /* We might have been woken for stop */
            continue;
        }

        BUG_ON(td->cpu != smp_processor_id());

        /* Check for state change setup */
        switch (td->status) {
        case HP_THREAD_NONE:-----------------------------------------------至關於第一次運行,調用setup()進行初始化操做。
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            if (ht->setup)
                ht->setup(td->cpu);
            td->status = HP_THREAD_ACTIVE;
            continue;

        case HP_THREAD_PARKED:---------------------------------------------從parked狀態恢復。
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            if (ht->unpark)
                ht->unpark(td->cpu);
            td->status = HP_THREAD_ACTIVE;
            continue;
        }

        if (!ht->thread_should_run(td->cpu)) {-----------------------------若是不須要進程運行,schedule()主動放棄CPU給其餘線程使用。
            preempt_enable_no_resched();
            schedule();
        } else {
            __set_current_state(TASK_RUNNING);
            preempt_enable();
            ht->thread_fn(td->cpu);----------------------------------------調用struct smpboot_thread_fn->thread_fn及watchdog(),進行喂狗操做。
        }
    }
}

void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)----將建立的內核線程移除操做。
{
    get_online_cpus();
    mutex_lock(&smpboot_threads_lock);
    list_del(&plug_thread->list);
    smpboot_destroy_threads(plug_thread);
    mutex_unlock(&smpboot_threads_lock);
    put_online_cpus();
    free_cpumask_var(plug_thread->cpumask);
}

/*
 * Stop and release every per-CPU thread recorded in ht->store. For each
 * possible CPU with a thread: stop it, drop the task reference and clear
 * the per-CPU slot.
 */
static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
{
    unsigned int cpu;

    /* We need to destroy also the parked threads of offline cpus */
    for_each_possible_cpu(cpu) {
        struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);

        if (tsk) {
            kthread_stop(tsk);
            put_task_struct(tsk);
            *per_cpu_ptr(ht->store, cpu) = NULL;
        }
    }
}

 

1.3 hrtimer看門狗

 在分析了喂狗線程watchdog/x以後,再來分析看門狗是如何實現的?

看門狗是通過啓動一個週期爲4秒的hrtimer來實現的,這個hrtimer和CPU綁定,使用的變量都是percpu的,確保各個CPU之間不相互干擾。

每次hrtimer超時,都會喚醒watchdog/x線程,並進行一次喂狗操做。

因爲hrtimer超時函數在硬中斷上下文中調用,中斷產生後會比線程優先得到執行。

所以即使watchdog/x線程沒有得到執行,hrtimer超時函數仍然可以通過is_softlockup()來判斷看門狗是否超過20秒沒有被喂狗。

static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
    unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
    struct pt_regs *regs = get_irq_regs();
    int duration;
    int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

    if (atomic_read(&watchdog_park_in_progress) != 0)
        return HRTIMER_NORESTART;

    /* kick the hardlockup detector */ watchdog_interrupt_count();------------------------------------------------------------------沒產生一次中斷,hrtimer_interrupts計數加1.hrtimer_interrupts記錄了產生hrtimer的次數。 /* kick the softlockup detector */
    wake_up_process(__this_cpu_read(softlockup_watchdog));---------------------------------------喚醒watchdog/x線程,進行喂狗操做。 /* .. and repeat */
    hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));------------------------------------從新設置超時點,造成周期性時鐘。 ...
    duration = is_softlockup(touch_ts);----------------------------------------------------------返回非0表示,看門狗超時。 if (unlikely(duration)) {--------------------------------------------------------------------看門狗超時狀況的處理。         if (kvm_check_and_clear_guest_paused())
            return HRTIMER_RESTART;

        /* only warn once */
        if (__this_cpu_read(soft_watchdog_warn) == true) {
            if (__this_cpu_read(softlockup_task_ptr_saved) !=
                current) {
                __this_cpu_write(soft_watchdog_warn, false);
                __touch_watchdog();
            }
            return HRTIMER_RESTART;
        }

        if (softlockup_all_cpu_backtrace) {
            if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
                /* Someone else will report us. Let's give up */
                __this_cpu_write(soft_watchdog_warn, true);
                return HRTIMER_RESTART;
            }
        }

        pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
            smp_processor_id(), duration,
            current->comm, task_pid_nr(current));-------------------------------------------------打印哪一個CPU被卡死duration秒,以及死在哪一個進程。
        __this_cpu_write(softlockup_task_ptr_saved, current);
        print_modules();
        print_irqtrace_events(current);-----------------------------------------------------------顯示開關中斷、軟中斷信息,禁止中斷和軟中斷也是形成soft lockup的一個緣由。 if (regs)---------------------------------------------------------------------------------有寄存器顯示寄存器信息,同時顯示棧信息。
            show_regs(regs);
        else
            dump_stack();

        if (softlockup_all_cpu_backtrace) {
            trigger_allbutself_cpu_backtrace();

            clear_bit(0, &soft_lockup_nmi_warn);
            /* Barrier to sync with other cpus */
            smp_mb__after_atomic();
        }

        add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
        if (softlockup_panic)---------------------------------------------------------------------若是定義softlockup_panic則進入panic()。
            panic("softlockup: hung tasks");
        __this_cpu_write(soft_watchdog_warn, true);
    } else
        __this_cpu_write(soft_watchdog_warn, false);

    return HRTIMER_RESTART;
}

  /* Count one more hrtimer expiry on this CPU (hrtimer_interrupts). */
  static void watchdog_interrupt_count(void)
  {
      __this_cpu_inc(hrtimer_interrupts);
  }

/*
 * Check whether the watchdog has been starved since @touch_ts.
 *
 * Returns the elapsed time (now - touch_ts) when the soft watchdog is
 * enabled, watchdog_thresh is non-zero and the current timestamp is later
 * than @touch_ts + get_softlockup_thresh(); returns 0 otherwise.
 * NOTE(review): the value is presumably in seconds, since the caller
 * prints it as "stuck for %us" - confirm against get_timestamp().
 */
static int is_softlockup(unsigned long touch_ts)
{
    unsigned long now = get_timestamp();

    if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
        /* Warn about unreasonable delays. */
        if (time_after(now, touch_ts + get_softlockup_thresh()))
            return now - touch_ts;
    }
    return 0;
}

 

2. 對watchdog的設置

對watchdog行爲的設置有兩個途徑:通過命令行傳入參數和通過proc設置。

2.1 經過命令行設置

通過命令行傳入參數,可以對soft lockup進行開關設置、設置超時之後是否panic等等行爲。

/*
 * Handler of the "softlockup_panic=" boot parameter: parses @str as an
 * unsigned number (base 0, i.e. detected from its prefix) and stores it in
 * softlockup_panic. Always returns 1.
 */
static int __init softlockup_panic_setup(char *str)
{
    softlockup_panic = simple_strtoul(str, NULL, 0);

    return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);

/*
 * Handler of the "nowatchdog" boot parameter: clears watchdog_enabled
 * entirely. @str is not used. Always returns 1.
 */
static int __init nowatchdog_setup(char *str)
{
    watchdog_enabled = 0;
    return 1;
}
__setup("nowatchdog", nowatchdog_setup);

/*
 * Handler of the "nosoftlockup" boot parameter: clears only the
 * SOFT_WATCHDOG_ENABLED bit of watchdog_enabled. @str is not used.
 * Always returns 1.
 */
static int __init nosoftlockup_setup(char *str)
{
    watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
    return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);

#ifdef CONFIG_SMP
/*
 * Handler of the "softlockup_all_cpu_backtrace=" boot parameter: any
 * non-zero number enables sysctl_softlockup_all_cpu_backtrace (the value
 * is normalised to 0 or 1). Always returns 1.
 */
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
    sysctl_softlockup_all_cpu_backtrace =
        !!simple_strtol(str, NULL, 0);
    return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
/*
 * Handler of the "hardlockup_all_cpu_backtrace=" boot parameter: any
 * non-zero number enables sysctl_hardlockup_all_cpu_backtrace (the value
 * is normalised to 0 or 1). Always returns 1.
 */
static int __init hardlockup_all_cpu_backtrace_setup(char *str)
{
    sysctl_hardlockup_all_cpu_backtrace =
        !!simple_strtol(str, NULL, 0);
    return 1;
}
__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
#endif

 

2.2 通過proc節點調節watchdog

watchdog相關的配置還可以通過proc文件系統(/proc/sys/kernel/下的節點)進行設置。

/proc/sys/kernel/nmi_watchdog-------------------------hard lockup開關,proc_nmi_watchdog()。
/proc/sys/kernel/soft_watchdog------------------------soft lockup開關,proc_soft_watchdog()。
/proc/sys/kernel/watchdog-----------------------------watchdog總開關,proc_watchdog()。
/proc/sys/kernel/watchdog_cpumask---------------------watchdog cpumask,proc_watchdog_cpumask()。
/proc/sys/kernel/watchdog_thresh----------------------watchdog超時閾值設置,proc_watchdog_thresh()。

 

3. 定位soft lockup異常

引起soft lockup的原因通常是死循環或者死鎖,死循環可以通過棧回溯找到問題點;死鎖問題需要打開內核的lockdep功能。

打開內核的lockdep功能可以參考《Linux死鎖檢測-Lockdep》。

下面看一個while(1)引發的soft lockup異常分析:

[ 5656.032325] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [cat:157]-----------------------CPU、進程等信息粗略定位。
[ 5656.039314] Modules linked in:
[ 5656.042386] 
[ 5656.042386] CURRENT PROCESS:
[ 5656.042386] 
[ 5656.048229] COMM=cat PID=157
[ 5656.051117] TEXT=00008000-000c5a68 DATA=000c6f1c-000c7175 BSS=000c7175-000c8000
[ 5656.058432] USER-STACK=7fc1ee50  KERNEL-STACK=bd0b7080
[ 5656.058432] 
[ 5656.065069] PC: 0x8032a1b2 (clk_summary_show+0x62/0xb4)--------------------------------------------PC指向出問題的點,更加精確的定位。
[ 5656.070302] LR: 0x8032a186 (clk_summary_show+0x36/0xb4)
[ 5656.075531] SP: 0xbd8b1b74...
[ 5656.217622] 
Call Trace:-----------------------------------------------------------------------------------------通過Call Trace,可以瞭解是如何一步步走到PC指向的問題點的,來龍去脈一目瞭然。
[<80155c5e>] seq_read+0xc2/0x46c
[<802826ac>] full_proxy_read+0x58/0x98
[<8013239c>] do_readv_writev+0x31c/0x384
[<80132458>] vfs_readv+0x54/0x8c
[<80160b52>] default_file_splice_read+0x166/0x2b0
[<801606ee>] do_splice_to+0x76/0xb0
[<801607de>] splice_direct_to_actor+0xb6/0x21c
[<801609c2>] do_splice_direct+0x7e/0xa8
[<80132a5a>] do_sendfile+0x21a/0x45c
[<80133776>] SyS_sendfile64+0xf6/0xfc
[<80046186>] csky_systemcall+0x96/0xe0
相關文章
相關標籤/搜索