softlockup檢測(watchdog)原理(用於檢測系統調度是否正常)

softlockup(watchdog)用於檢測系統調度是否正常,即是否出現軟鎖的情況。當發生softlockup時,內核不能調度,但還能響應中斷,對用戶的表現可能爲:能ping通,但無法登錄系統,無法進行正常操作。
其基本原理爲:爲每一個CPU啓動一個內核線程(watchdog/x),此線程爲優先級最高的實時線程,在該線程得到調度時,會更新相應的計數(時間戳),同時會啓動定時器;當定時器到期時檢查相應的時間戳,如果超過指定時間仍沒有更新,則說明這段時間內都沒有發生調度(因爲此線程優先級最高),此時打印相應告警,或根據配置進入panic流程。
基本代碼分析(2.6.32)
rest_init->kernel_init->lockup_detector_init->cpu_callback->watchdog_prepare_cpu(初始化watchdog定時器):

點擊(此處)摺疊或打開

  1. static int watchdog_prepare_cpu(int cpu)

  2. {

  3.     struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);


  4.     WARN_ON(per_cpu(softlockup_watchdog, cpu));

  5.     hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);//初始化高精度定時器

  6.     hrtimer->function = watchdog_timer_fn;//設置定時器處理函數


  7.     return 0;

  8. }

看門狗定時器處理函數:

點擊(此處)摺疊或打開

  1. static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)

  2. {

  3. //獲取計數watchdog_touch_ts,該計數在watchdog內核線程被調度時更新

  4.     unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);

  5.     struct pt_regs *regs = get_irq_regs();

  6.     int duration;


  7.     /* kick the hardlockup detector */

  8. //增加中斷計數,證明沒有發生硬鎖(關中斷死鎖)

  9.     watchdog_interrupt_count();


  10.     /* kick the softlockup detector */

  11. //喚醒watchdog內核線程

  12.     wake_up_process(__get_cpu_var(softlockup_watchdog));


  13.     /* .. and repeat */

  14. //重啓定時器

  15.     hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));

  16.     if (touch_ts == 0) {

  17.         if (unlikely(__get_cpu_var(softlockup_touch_sync))) {

  18.             /*

  19.              * If the time stamp was touched atomically

  20.              * make sure the scheduler tick is up to date.

  21.              */

  22.             __get_cpu_var(softlockup_touch_sync) = false;

  23.             sched_clock_tick();

  24.         }

  25.         __touch_watchdog();

  26.         return HRTIMER_RESTART;

  27.     }


  28.     /* check for a softlockup

  29.      * This is done by making sure a high priority task is

  30.      * being scheduled. The task touches the watchdog to

  31.      * indicate it is getting cpu time. If it hasn't then

  32.      * this is a good indication some task is hogging the cpu

  33.      */

  34. //判斷是否發生了軟鎖,原理是判斷touch_ts(時間戳)是否超過一定時間沒有更新

  35.     duration = is_softlockup(touch_ts);

  36.     if (unlikely(duration)) {

  37.         /* only warn once */

  38.         if (__get_cpu_var(soft_watchdog_warn) == true)

  39.             return HRTIMER_RESTART;

  40. //發生了軟鎖後,進行一系列的信息記錄和告警。

  41.         printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",

  42.             smp_processor_id(), duration,

  43.             current->comm, task_pid_nr(current));

  44.         print_modules();

  45.         print_irqtrace_events(current);

  46.         if (regs)

  47.             show_regs(regs);

  48.         else

  49.             dump_stack();

  50. //如果配置了softlockup_panic(proc中配置),則panic

  51.         if (softlockup_panic)

  52.             panic("softlockup: hung tasks");

  53.         __get_cpu_var(soft_watchdog_warn) = true;

  54.     } else

  55.         __get_cpu_var(soft_watchdog_warn) = false;


  56.     return HRTIMER_RESTART;

  57. }


啓動看門狗,即創建watchdog內核線程。

點擊(此處)摺疊或打開

  1. static int watchdog_enable(int cpu)

  2. {

  3.     struct task_struct *p = per_cpu(softlockup_watchdog, cpu);

  4.     int err = 0;


  5.     /* enable the perf event */

  6.     err = watchdog_nmi_enable(cpu);


  7.     /* Regardless of err above, fall through and start softlockup */


  8.     /* create the watchdog thread */

  9.     if (!p) {

  10. //創建watchdog內核線程

  11.         p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);

  12.         if (IS_ERR(p)) {

  13.             printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);

  14.             if (!err)

  15.                 /* if hardlockup hasn't already set this */

  16.                 err = PTR_ERR(p);

  17.             goto out;

  18.         }

  19.         kthread_bind(p, cpu);

  20.         per_cpu(watchdog_touch_ts, cpu) = 0;

  21.         per_cpu(softlockup_watchdog, cpu) = p;

  22.         wake_up_process(p);

  23.     }


  24. out:

  25.     return err;

  26. }


watchdog內核線程執行主函數,主要是要更新計數(時間戳)

點擊(此處)摺疊或打開

  1. static int watchdog(void *unused)

  2. {

  3. //設置爲最高優先級

  4.     struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };

  5.     struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);

  6. //設置爲實時線程

  7.     sched_setscheduler(current, SCHED_FIFO, &param);


  8.     /* initialize timestamp */

  9. //初始化計數(時間戳)

  10.     __touch_watchdog();


  11.     /* kick off the timer for the hardlockup detector */

  12.     /* done here because hrtimer_start can only pin to smp_processor_id() */

  13. //啓動定時器,用於檢測是否發生軟鎖

  14.     hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),

  15.          HRTIMER_MODE_REL_PINNED);

  16. //睡眠

  17.     set_current_state(TASK_INTERRUPTIBLE);

  18.     /*

  19.      * Run briefly once per second to reset the softlockup timestamp.

  20.      * If this gets delayed for more than 60 seconds then the

  21.      * debug-printout triggers in watchdog_timer_fn().

  22.      */

  23.     while (!kthread_should_stop()) {

  24. //更新計數

  25.         __touch_watchdog();

  26.         schedule();


  27.         if (kthread_should_stop())

  28.             break;


  29.         set_current_state(TASK_INTERRUPTIBLE);

  30.     }

  31.     __set_current_state(TASK_RUNNING);


  32.     return 0;

  33. }


判斷是否發生軟鎖:is_softlockup

點擊(此處)摺疊或打開

  1. static int is_softlockup(unsigned long touch_ts)

  2. {

  3.     unsigned long now = get_timestamp(smp_processor_id());


  4.     /* Warn about unreasonable delays: */

  5. //檢測計數多久沒有更新了,如果超過了閾值softlockup_thresh(此處爲60s),則表示發生了軟鎖

  6.     if (time_after(now, touch_ts + softlockup_thresh))

  7.         return now - touch_ts;


  8.     return 0;

  9. }

相關文章
相關標籤/搜索