softlockup(watchdog)用於檢測系統調度是否正常,即是否發生了軟鎖。當發生 softlockup 時,內核無法進行調度,但仍能響應中斷;對用戶的表現可能爲:能 ping 通,但無法登錄系統,也無法進行正常操作。
其基本原理爲:爲每一個 CPU 啓動一個內核線程(watchdog/x),此線程爲優先級最高的實時線程。該線程得到調度時,會更新相應的計數(時間戳),同時會啓動定時器;當定時器到期時檢查相應的時間戳,如果超過指定時間都沒有更新,則說明這段時間內都沒有發生調度(因爲此線程優先級最高),此時打印相應告警,或根據配置進入 panic 流程。
基本代碼分析(2.6.32)
rest_init->kernel_init->lockup_detector_init->cpu_callback->watchdog_prepare_cpu(初始化watchdog定時器):
點擊(此處)摺疊或打開
static int watchdog_prepare_cpu(int cpu)
函數
{
this
struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
atom
WARN_ON(per_cpu(softlockup_watchdog, cpu));
spa
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);//初始化高精度定時器
線程
hrtimer->function = watchdog_timer_fn;//設置定時器處理函數
debug
return 0;
rest
}get
看門狗定時器處理函數:
點擊(此處)摺疊或打開
/*
 * watchdog_timer_fn - expiry callback of the per-CPU watchdog hrtimer.
 * @hrtimer: the watchdog timer that just expired.
 *
 * On every expiry this kicks the hardlockup detector, wakes this CPU's
 * watchdog thread, re-arms the timer, and then checks whether the
 * timestamp maintained by that thread (watchdog_touch_ts) has gone stale.
 * A stale timestamp is reported once per lockup episode and, when
 * softlockup_panic is set, escalated to panic().
 *
 * Always returns HRTIMER_RESTART.
 */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	/* Timestamp last written for this CPU; refreshed by the watchdog thread. */
	unsigned long last_touch = __get_cpu_var(watchdog_touch_ts);
	struct pt_regs *irq_regs = get_irq_regs();
	int stuck_for;

	/* Kick the hardlockup detector: count this timer interrupt. */
	watchdog_interrupt_count();

	/* Kick the softlockup detector: wake the watchdog thread. */
	wake_up_process(__get_cpu_var(softlockup_watchdog));

	/* Re-arm the timer for the next sample period. */
	hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));

	/* A zero timestamp means "reset requested": re-stamp and skip the check. */
	if (last_touch == 0) {
		if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
			/*
			 * If the time stamp was touched atomically
			 * make sure the scheduler tick is up to date.
			 */
			__get_cpu_var(softlockup_touch_sync) = false;
			sched_clock_tick();
		}
		__touch_watchdog();
		return HRTIMER_RESTART;
	}

	/*
	 * Check for a softlockup by making sure the high priority watchdog
	 * task is being scheduled.  That task touches the timestamp to show
	 * it is getting cpu time; if it has not, some task is hogging the cpu.
	 */
	stuck_for = is_softlockup(last_touch);
	if (!unlikely(stuck_for)) {
		/* Healthy again: allow a future lockup to be reported. */
		__get_cpu_var(soft_watchdog_warn) = false;
		return HRTIMER_RESTART;
	}

	/* Only warn once per lockup episode. */
	if (__get_cpu_var(soft_watchdog_warn) == true)
		return HRTIMER_RESTART;

	/* Record diagnostics for the stuck CPU. */
	printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
	       smp_processor_id(), stuck_for,
	       current->comm, task_pid_nr(current));
	print_modules();
	print_irqtrace_events(current);
	if (irq_regs) {
		show_regs(irq_regs);
	} else {
		dump_stack();
	}

	/* Escalate to panic when softlockup_panic is set. */
	if (softlockup_panic) {
		panic("softlockup: hung tasks");
	}

	__get_cpu_var(soft_watchdog_warn) = true;
	return HRTIMER_RESTART;
}
啓動看門狗,即建立watchdog內核線程。
點擊(此處)摺疊或打開
/*
 * watchdog_enable - enable lockup detection on @cpu.
 * @cpu: CPU to enable the watchdog on.
 *
 * Enables the NMI (perf event) side first and then, regardless of that
 * result, creates, binds and wakes the per-CPU "watchdog/N" kernel thread
 * unless one is already recorded for this CPU.
 *
 * Returns the result of watchdog_nmi_enable(); if that was 0 and thread
 * creation failed, returns the thread creation error instead.
 */
static int watchdog_enable(int cpu)
{
	struct task_struct *thread = per_cpu(softlockup_watchdog, cpu);
	int err;

	/* enable the perf event */
	err = watchdog_nmi_enable(cpu);

	/* Regardless of err above, fall through and start softlockup. */
	if (thread)
		return err;	/* thread already exists, nothing to create */

	/* create the watchdog thread */
	thread = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
	if (IS_ERR(thread)) {
		printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
		/* Keep an earlier hardlockup error if one was already set. */
		if (!err)
			err = PTR_ERR(thread);
		return err;
	}

	kthread_bind(thread, cpu);
	/* Zero timestamp: the timer callback re-stamps instead of checking. */
	per_cpu(watchdog_touch_ts, cpu) = 0;
	per_cpu(softlockup_watchdog, cpu) = thread;
	wake_up_process(thread);

	return err;
}
watchdog內核線程執行主函數,主要是要更新計數(時間戳)
點擊(此處)摺疊或打開
/*
 * watchdog - main function of the per-CPU watchdog kernel thread.
 * @unused: thread argument; not used by this function.
 *
 * Switches itself to SCHED_FIFO at priority MAX_RT_PRIO-1, starts the
 * per-CPU watchdog hrtimer, and then refreshes the softlockup timestamp
 * every time it gets to run.  If the thread is starved of the CPU the
 * timestamp goes stale and watchdog_timer_fn() reports it.
 *
 * Returns 0 once kthread_should_stop() is true.
 */
static int watchdog(void *unused)
{
	/* Highest real-time priority. */
	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);

	/* Become a real-time (SCHED_FIFO) thread. */
	sched_setscheduler(current, SCHED_FIFO, &param);

	/* initialize timestamp */
	__touch_watchdog();

	/* kick off the timer for the hardlockup detector */
	/* done here because hrtimer_start can only pin to smp_processor_id() */
	hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
		      HRTIMER_MODE_REL_PINNED);

	/* Mark ourselves sleeping before entering the loop. */
	set_current_state(TASK_INTERRUPTIBLE);

	/*
	 * Run briefly each time we are woken (watchdog_timer_fn() wakes this
	 * thread) to reset the softlockup timestamp.  If this gets delayed
	 * for longer than the softlockup threshold, the debug-printout
	 * triggers in watchdog_timer_fn().
	 */
	while (!kthread_should_stop()) {
		/* Refresh the timestamp. */
		__touch_watchdog();
		schedule();

		if (kthread_should_stop())
			break;

		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}
判斷是否發生軟鎖:is_softlockup
點擊(此處)摺疊或打開
/*
 * is_softlockup - test whether the watchdog timestamp has gone stale.
 * @touch_ts: timestamp last recorded for this CPU.
 *
 * Returns 0 while the current timestamp is not after
 * @touch_ts + softlockup_thresh; otherwise returns how far the current
 * timestamp is past @touch_ts (presumably in seconds, as the caller
 * prints it with an "s" suffix).
 */
static int is_softlockup(unsigned long touch_ts)
{
	unsigned long current_ts = get_timestamp(smp_processor_id());
	unsigned long deadline = touch_ts + softlockup_thresh;

	/* Still within the allowed window: no lockup. */
	if (!time_after(current_ts, deadline))
		return 0;

	/* Warn about unreasonable delays: report how long it has been. */
	return current_ts - touch_ts;
}