LINUX高精度定时器实现分析 | 码农故事

老匹夫 2016-01-04

展开全文

hrtimer，是High-resolution kernel timers的缩写，从字面意思就知道，这是一个高精度内核timer。

HRTIMER用法示例

先调用hrtimer_init初始化，然后设置function回调，例如，此处设置回调为coalesced_timer_fn。

static enum hrtimer_restart coalesced_timer_fn(struct hrtimer *timer)

{

struct kvm_rtc *rtc = container_of(timer, struct kvm_rtc, coalesced_timer);

CHECK_PAUSE_RET(rtc->pause, HRTIMER_NORESTART);

queue_work(rtc->wq, &rtc->ws_coalesced);

return HRTIMER_NORESTART;//这里返回值很重要，如果想让此timer成为周期timer，应返回HRTIMER_RESTART(返回之前必须先设置好新的超时时间)

}

hrtimer_init(&rtc->coalesced_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);

rtc->coalesced_timer.function = coalesced_timer_fn;

设置超时时间，设置一个超时的ABS时间，超时时间设置了，并不代表timer已经运行，还必须将其加入active队列，start系列函数就做此事。

hrtimer_forward(&rtc->coalesced_timer, ns_to_ktime(now_time), ns_to_ktime(next_time-now_time));

hrtimer_add_expires_ns(&rtc->coalesced_timer, 1000000000);//1秒

加入active队列。

hrtimer_start_expires(&rtc->coalesced_timer, HRTIMER_MODE_ABS);

hrtimer_restart(&rtc->coalesced_timer);

从active队列删除(如正在回调中，会等待回调运行完成)。

hrtimer_cancel(&rtc->coalesced_timer);

HRTIMER数据结构

Figure 1数据结构示意图

enum hrtimer_base_type {

HRTIMER_BASE_MONOTONIC,

HRTIMER_BASE_REALTIME,

HRTIMER_BASE_BOOTTIME,

HRTIMER_MAX_CLOCK_BASES, //max 边界，下面的clock_base就是用来做数组大小

};

* struct hrtimer_cpu_base - the per cpu clock bases

struct hrtimer_cpu_base {

raw_spinlock_t lock; //lock protecting the base and associated clock bases and timers

unsigned int active_bases; //Bitfield to mark bases with active timers

unsigned int clock_was_set; //Indicates that clock was set from irq context.

#ifdef CONFIG_HIGH_RES_TIMERS

ktime_t expires_next; //absolute time of the next event which was scheduled, via clock_set_next_event()

int hres_active; //State of high resolution mode

int hang_detected; //The last hrtimer interrupt detected a hang

unsigned long nr_events; //Total number of hrtimer interrupt events

unsigned long nr_retries; //Total number of hrtimer interrupt retries

unsigned long nr_hangs; //Total number of hrtimer interrupt hangs

ktime_t max_hang_time; //Maximum time spent in hrtimer_interrupt

#endif

struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; //array of clock bases for this cpu

};

DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =

{

.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),

.clock_base =

{

.index = HRTIMER_BASE_MONOTONIC,

.clockid = CLOCK_MONOTONIC,

.get_time = &ktime_get, //monotonic time

.resolution = KTIME_LOW_RES, //刚初始化的时候，都是低精度的

{

.index = HRTIMER_BASE_REALTIME,

.clockid = CLOCK_REALTIME,

.get_time = &ktime_get_real, //get the real (wall-) time, TOD

.resolution = KTIME_LOW_RES,

{

.index = HRTIMER_BASE_BOOTTIME,

.clockid = CLOCK_BOOTTIME,

.get_time = &ktime_get_boottime,//monotonic time since boot

.resolution = KTIME_LOW_RES,

}

};

/**

* struct hrtimer_clock_base - the timer base for a specific clock

struct hrtimer_clock_base {

struct hrtimer_cpu_base * cpu_base; //per cpu clock base

int index; //clock type index for per_cpu support when moving a timer to a base on another cpu.

clockid_t clockid; //clock id for per_cpu support

struct timerqueue_head active; //red black tree root node for the active timers, active queue里面存放的，就是hrtimer

ktime_t resolution; //the resolution of the clock, in nanoseconds

ktime_t (*get_time)(void); //function to retrieve the current time of the clock

ktime_t softirq_time; //the time when running the hrtimer queue in the softirq

ktime_t offset; //offset of this clock to the monotonic base

};

struct timerqueue_node {

struct rb_node node;

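ktime_t expires; //这是hard expires：低精度模式下hrtimer_run_queues用它判断是否到期，高精度模式下hrtimer_interrupt用它计算下一次硬件触发的时间(判断到期用的是_softexpires)；分成soft/hard两个超时，据说是为了节能优化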

};

struct hrtimer {

struct timerqueue_node node;

ktime_t _softexpires; //这是超时，叫soft expires，对应timerqueue_node->expires叫做hard expires

enum hrtimer_restart (*function)(struct hrtimer *); //这就是回调函数了

struct hrtimer_clock_base *base;

unsigned long state; //状态字段，指示hrtimer处于什么状态，见#状态转换

#ifdef CONFIG_TIMER_STATS

int start_pid;

void *start_site;

char start_comm[16];

#endif

};

HRTIMER初始化流程

asmlinkage void __init start_kernel(void)

{

...

init_IRQ();

init_timers(); //初始化低精度timer

hrtimers_init(); //初始化高精度timer

softirq_init();

timekeeping_init();

time_init();

...

}

//hrtimer的CPU事件通知

static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,

unsigned long action, void *hcpu)

{

int scpu = (long)hcpu;

switch (action) {

case CPU_UP_PREPARE:

case CPU_UP_PREPARE_FROZEN:

init_hrtimers_cpu(scpu);//对每个CPU，初始化其struct hrtimer_cpu_base * 中的hrtimer_clock_base

break;

#ifdef CONFIG_HOTPLUG_CPU //热插拔支持

case CPU_DYING:

case CPU_DYING_FROZEN:

clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);

break;

case CPU_DEAD:

case CPU_DEAD_FROZEN:

{

clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);

migrate_hrtimers(scpu);//CPU DEAD的时候，将DEAD的CPU上的timer迁移到本CPU

break;

}

#endif

default:

break;

}

return NOTIFY_OK;

}

static struct notifier_block __cpuinitdata hrtimers_nb = {

.notifier_call = hrtimer_cpu_notify,

};

void __init hrtimers_init(void)

{

hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,

(void *)(long)smp_processor_id());//这里将BSP的PREPARE初始化显式调用一下，因为BSP已经启动了，其他CPU则通过下面注册的回调来初始化

register_cpu_notifier(&hrtimers_nb);//CPU事件通知，会调用hrtimer_cpu_notify

#ifdef CONFIG_HIGH_RES_TIMERS

open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);//注册高精度模式下的定时器软中断

#endif

}

* Functions related to boot-time initialization:

//对每一个CPU，初始化和CPU关联的hrtimer_cpu_base结构

static void __cpuinit init_hrtimers_cpu(int cpu)

{

struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);

int i;

//其实，也初始化了hrtimer_cpu_base中的hrtimer_clock_base数组

//每个clock base，需要将其存放hrtimer的queue初始化好，将回溯指针设置好

for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {

cpu_base->clock_base[i].cpu_base = cpu_base;

timerqueue_init_head(&cpu_base->clock_base[i].active);

}

hrtimer_init_hres(cpu_base);//例如初始化高精度为未激活状态

}

HRTIMER CPU热插拔支持

热插拔支持，其核心功能，就是当一个CPU死掉的时候，将其上面的hrtimer迁移到本CPU上来，需要预编译宏CONFIG_HOTPLUG_CPU。

#ifdef CONFIG_HOTPLUG_CPU

static void migrate_hrtimers(int scpu)

{

struct hrtimer_cpu_base *old_base, *new_base;

int i;

BUG_ON(cpu_online(scpu));

tick_cancel_sched_timer(scpu);

local_irq_disable();//迁移timer的时候关中断

old_base = &per_cpu(hrtimer_bases, scpu);

new_base = &__get_cpu_var(hrtimer_bases);

* The caller is globally serialized and nobody else

* takes two locks at once, deadlock is not possible.

raw_spin_lock(&new_base->lock);

raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

//新的老的hrtimer_bases都锁起来，然后迁移timer节点

//一个hrtimer_bases下又有多个clock base

for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {

migrate_hrtimer_list(&old_base->clock_base[i],

&new_base->clock_base[i]);

}

raw_spin_unlock(&old_base->lock);

raw_spin_unlock(&new_base->lock);

/* Check, if we got expired work to do */

__hrtimer_peek_ahead_timers();//这里是为了确保实时性，迁移完成后，就检查一下处于本CPU上的hrtimer_bases中是否有timer超时

local_irq_enable();//迁移成功后开中断

}

static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,

struct hrtimer_clock_base *new_base)

{

struct hrtimer *timer;

struct timerqueue_node *node;

//做了件很简单的事情，就是将老的clock_base的active队列(红黑树)里的timer逐个摘下，重新加入到新的clock_base

while ((node = timerqueue_getnext(&old_base->active))) {

timer = container_of(node, struct hrtimer, node);

BUG_ON(hrtimer_callback_running(timer));

debug_deactivate(timer);

* Mark it as STATE_MIGRATE not INACTIVE otherwise the

* timer could be seen as !active and just vanish away

* under us on another CPU

__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);

timer->base = new_base;

* Enqueue the timers on the new cpu. This does not

* reprogram the event device in case the timer

* expires before the earliest on this CPU, but we run

* hrtimer_interrupt after we migrated everything to

* sort out already expired timers and reprogram the

* event device.

enqueue_hrtimer(timer, new_base);

/* Clear the migration state bit */

timer->state &= ~HRTIMER_STATE_MIGRATE;

}

#endif /* CONFIG_HOTPLUG_CPU */

时钟设备

如下斜体部分文字摘自陈功的《Linux 时钟管理》

tick device
Tick device 用来处理周期性的 tick event。Tick device 其实是时钟事件设备的一个 wrapper，因此 tick device 也有 one-shot 和周期性这两种中断触发模式。
每注册一个时钟事件设备，这个设备会自动被注册为一个 tick device。全局的 tick device 用来更新诸如 jiffies 这样的全局信息，per-CPU 的 tick device 则用来更新每个 CPU 相关的特定信息。
broadcast
Broadcast 的出现是为了应对这样一种情况：假定 CPU 使用 Local APIC Timer 作为 per-CPU 的 tick device，但是某些特定的 CPU（如 Intel 的 Westmere 之前的 CPU）在进入 C3+ 的状态时 Local APIC Timer 也会同时停止工作，进入睡眠状态。在这种情形下 broadcast 可以替代 Local APIC Timer 继续完成统计进程的执行时间等有关操作。本质上 broadcast 是发送一个 IPI（Inter-processor interrupt）中断给其他所有的 CPU，当目标 CPU 收到这个 IPI 中断后就会调用原先 Local APIC Timer 正常工作时的中断处理函数，从而实现了同样的功能。目前主要在 x86 以及 MIPS 下会用到 broadcast 功能。
Timekeeping & GTOD (Generic Time-of-Day)
Timekeeping（可以理解为时间测量或者计时）是内核时间管理的一个核心组成部分。没有 Timekeeping，就无法更新系统时间，维持系统“心跳”。GTOD 是一个通用的框架，用来实现诸如获取系统时间 gettimeofday 或者设置系统时间 settimeofday 等工作。

* The hpet clock event device

static struct clock_event_device hpet_clockevent = {

.name = "hpet",

.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,

.set_mode = hpet_legacy_set_mode,

.set_next_event = hpet_legacy_next_event,

.irq = 0,

.rating = 50,

};

asmlinkage void __init start_kernel(void)

{

...

init_IRQ();

init_timers();

hrtimers_init();

softirq_init();

timekeeping_init();

time_init();

...

if (late_time_init)

late_time_init();

...

}

void __init time_init(void)

{

late_time_init = x86_late_time_init;

}

static __init void x86_late_time_init(void)

{

x86_init.timers.timer_init();

tsc_init();

}

struct x86_init_ops x86_init __initdata = {

...

.timers = {

.setup_percpu_clockev = setup_boot_APIC_clock,

.tsc_pre_init = x86_init_noop,

.timer_init = hpet_time_init,

.wallclock_init = x86_init_noop,

...

};

/* Default timer init function */

void __init hpet_time_init(void)

{

if (!hpet_enable())//默认使用HPET，如果HPET不支持，再使用PIT代替时钟源

setup_pit_timer();

setup_default_timer_irq();//为IRQ0设置处理HANDLE

}

hpet_enable ->

static void hpet_legacy_clockevent_register(void)

{

/* Start HPET legacy interrupts */

hpet_enable_legacy_int();

* Start hpet with the boot cpu mask and make it

* global after the IO_APIC has been initialized.

hpet_clockevent.cpumask = cpumask_of(smp_processor_id());

clockevents_config_and_register(&hpet_clockevent, hpet_freq,

HPET_MIN_PROG_DELTA, 0x7FFFFFFF);//非常关键，注册clock event

global_clock_event = &hpet_clockevent;//让IRQ 0的中断HANDLE使用hpet的HANDLE

printk(KERN_DEBUG "hpet clockevent registered\n");

}

clockevents_config_and_register ->

void clockevents_register_device(struct clock_event_device *dev)

{

unsigned long flags;

BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);

if (!dev->cpumask) {

WARN_ON(num_possible_cpus() > 1);

dev->cpumask = cpumask_of(smp_processor_id());

}

raw_spin_lock_irqsave(&clockevents_lock, flags);

//将clock event 加入clockevent_devices链表，可以用在suspend, resume或其他消息通知的时候回调

list_add(&dev->list, &clockevent_devices);

clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);//通知clock event添加事件

clockevents_notify_released();

raw_spin_unlock_irqrestore(&clockevents_lock, flags);

}

* Notify about a clock event change. Called with clockevents_lock

* held.

static void clockevents_do_notify(unsigned long reason, void *dev)

{

raw_notifier_call_chain(&clockevents_chain, reason, dev);

}

CLOCK_EVT_NOTIFY_ADD通知会在tick_notify里收到，然后回调
tick_check_new_device -> tick_setup_device(td, newdev, cpu, cpumask_of(cpu));

* Setup the tick device

static void tick_setup_device(struct tick_device *td,

struct clock_event_device *newdev, int cpu,

const struct cpumask *cpumask)

{

ktime_t next_event;

void (*handler)(struct clock_event_device *) = NULL;

* First device setup ?

if (!td->evtdev) {

* If no cpu took the do_timer update, assign it to

* this cpu:

if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {

tick_do_timer_cpu = cpu;

tick_next_period = ktime_get();

tick_period = ktime_set(0, NSEC_PER_SEC / HZ);

}

* Startup in periodic mode first.

td->mode = TICKDEV_MODE_PERIODIC;//初始的时候，都是PERIODIC模式，切换到高精度模式之后，才是ONESHOT模式

} else {

handler = td->evtdev->event_handler;

next_event = td->evtdev->next_event;

td->evtdev->event_handler = clockevents_handle_noop;

}

td->evtdev = newdev;

* When the device is not per cpu, pin the interrupt to the

* current cpu:

if (!cpumask_equal(newdev->cpumask, cpumask))

irq_set_affinity(newdev->irq, cpumask);

* When global broadcasting is active, check if the current

* device is registered as a placeholder for broadcast mode.

* This allows us to handle this x86 misfeature in a generic

* way.

if (tick_device_uses_broadcast(newdev, cpu))

return;

//初始化

if (td->mode == TICKDEV_MODE_PERIODIC)

tick_setup_periodic(newdev, 0);//这里面设置handler为tick_handle_periodic 或tick_handle_periodic_broadcast

else

tick_setup_oneshot(newdev, handler, next_event); //在高精度模式下，handler其实是hrtimer_interrupt

}

低精度模式

所以，周期时钟(低精度)时，回调函数为tick_handle_periodic或tick_handle_periodic_broadcast

* Event handler for periodic ticks

void tick_handle_periodic(struct clock_event_device *dev)

{

int cpu = smp_processor_id();

ktime_t next;

tick_periodic(cpu);

if (dev->mode != CLOCK_EVT_MODE_ONESHOT)

return;

* Setup the next period for devices, which do not have

* periodic mode:

next = ktime_add(dev->next_event, tick_period);

for (;;) {

if (!clockevents_program_event(dev, next, false))

return;

* Have to be careful here. If we're in oneshot mode,

* before we call tick_periodic() in a loop, we need

* to be sure we're using a real hardware clocksource.

* Otherwise we could get trapped in an infinite

* loop, as the tick_periodic() increments jiffies,

* when then will increment time, posibly causing

* the loop to trigger again and again.

if (timekeeping_valid_for_hres())

tick_periodic(cpu);

next = ktime_add(next, tick_period);

}

* Periodic tick

static void tick_periodic(int cpu)

{

if (tick_do_timer_cpu == cpu) {

write_seqlock(&jiffies_lock);

/* Keep track of the next tick event */

tick_next_period = ktime_add(tick_next_period, tick_period);

do_timer(1);

write_sequnlock(&jiffies_lock);

}

update_process_times(user_mode(get_irq_regs()));//低精度下，运行此函数，更新进程时间，调用run_local_timers

profile_tick(CPU_PROFILING);

}

void update_process_times(int user_tick)

{

struct task_struct *p = current;

int cpu = smp_processor_id();

/* Note: this timer irq context must be accounted for as well. */

account_process_tick(p, user_tick);

run_local_timers(); //运行local timers

rcu_check_callbacks(cpu, user_tick);

#ifdef CONFIG_IRQ_WORK

if (in_irq())

irq_work_run();

#endif

scheduler_tick();

run_posix_cpu_timers(p);

}

* Called by the local, per-CPU timer interrupt on SMP.

void run_local_timers(void)

{

hrtimer_run_queues();//hardirq context下运行所有到期的timer

raise_softirq(TIMER_SOFTIRQ); //TIMER软中断，不是HRTIMER软中断；在softirq context下运行所有到期的(低精度)timer，其回调是在start_kernel->init_timers->open_softirq(TIMER_SOFTIRQ, run_timer_softirq)这里注册的

}

这里的TIMER SOFTIRQ，是在start_kernel里面调用init_timers初始化的，会调用到run_timer_softirq

void __init init_timers(void)

{

int err;

/* ensure there are enough low bits for flags in timer->base pointer */

BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);

//显式调用一下当前CPU的PREPARE

err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,

(void *)(long)smp_processor_id());

init_timer_stats();

BUG_ON(err != NOTIFY_OK);

register_cpu_notifier(&timers_nb);//这里才是注册回调

open_softirq(TIMER_SOFTIRQ, run_timer_softirq);//注册TIMER_SOFTIRQ软中断

}

static void run_timer_softirq(struct softirq_action *h)

{

struct tvec_base *base = __this_cpu_read(tvec_bases);

hrtimer_run_pending();//检查一下，是否需切换到高精度

if (time_after_eq(jiffies, base->timer_jiffies))

__run_timers(base);//低精度模式下，在软中断上下文调用低精度的timer回调

}

上面的run_local_timers里，调用hrtimer_run_queues，目的是在低精度模式下，实现hrtimer功能

* Called from hardirq context every jiffy

void hrtimer_run_queues(void)

{

struct timerqueue_node *node;

struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);

struct hrtimer_clock_base *base;

int index, gettime = 1;

//如果开启了高精度模式，这里就不会进来

//换言之，这里实现了低精度支持hrtimer的功能

//切记，这个调用是在时钟设备的硬中断中调用的

if (hrtimer_hres_active())

return;

for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {

base = &cpu_base->clock_base[index];//将当前CPU上的所有类型的CLOCK上的所有timer都检查一遍

if (!timerqueue_getnext(&base->active))

continue;

if (gettime) {

hrtimer_get_softirq_time(cpu_base);//这里会将所有的base的softirq_time更新为最新

gettime = 0;

}

raw_spin_lock(&cpu_base->lock);

while ((node = timerqueue_getnext(&base->active))) {

struct hrtimer *timer;

//这里比较的时间是node的时间，即hard expires

timer = container_of(node, struct hrtimer, node);

if (base->softirq_time.tv64 <=

hrtimer_get_expires_tv64(timer))//timer->node.expires.tv64;这里比较时间，用的是timer->node.expires，不是timer->_softexpires

break;

__run_hrtimer(timer, &base->softirq_time);//调用run timer

}

raw_spin_unlock(&cpu_base->lock);

}

低精度切换到高精度

上面低精度模式下，运行TIMER SOFTIRQ的时候(run_timer_softirq)，会调用hrtimer_run_pending检查是否可以切换到高精度模式

void hrtimer_run_pending(void)

{

if (hrtimer_hres_active())//已经是高精度了

return;

* This _is_ ugly: We have to check in the softirq context,

* whether we can switch to highres and / or nohz mode. The

* clocksource switch happens in the timer interrupt with

* xtime_lock held. Notification from there only sets the

* check bit in the tick_oneshot code, otherwise we might

* deadlock vs. xtime_lock.

if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))

hrtimer_switch_to_hres();//看下，是否高精度模式开启，是的话切换

}

int tick_check_oneshot_change(int allow_nohz)

{

struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

//check_clocks的第0位用于保存clock是否发生了变化

if (!test_and_clear_bit(0, &ts->check_clocks))

return 0;

if (ts->nohz_mode != NOHZ_MODE_INACTIVE)//已经开启了NOHZ模式

return 0;

//timekeeping不支持高精度或clock event不支持oneshot，无法切换到高精度

if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())

return 0;

//当允许高精度(allow_nohz为0)的时候，return 1，调用者随后会切换到高精度模式，设置hrtimer_interrupt

//否则切换到nohz，设置tick_nohz_handler

if (!allow_nohz)

return 1;

#if 0

low resolution mode High resolution mode

------------------+-----------------------+-----------------------

periodic tick | tick_handle_periodic | hrtimer_interrupt

dynamic tick | tick_nohz_handler | hrtimer_interrupt

------------------+-----------------------+-----------------------

#endif

tick_nohz_switch_to_nohz();

return 0;

}

* Switch to high resolution mode

static int hrtimer_switch_to_hres(void)

{

int i, cpu = smp_processor_id();

struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);

unsigned long flags;

if (base->hres_active)//已经是高精度模式了

return 1;

local_irq_save(flags);

if (tick_init_highres()) { // => tick_switch_to_oneshot(hrtimer_interrupt)

//初始化失败了

local_irq_restore(flags);

printk(KERN_WARNING "Could not switch to high resolution "

"mode on CPU %d\n", cpu);

return 0;

}

base->hres_active = 1;//这个标志高精度模式active

for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)

base->clock_base[i].resolution = KTIME_HIGH_RES;//修改clock base的标志

tick_setup_sched_timer();//低精度下要干的事，高精度下用一个hrtimer来做，为啥?我想是因为高精度的频率比低精度高，而这些任务用不着高频率处理

/* "Retrigger" the interrupt to get things going */

retrigger_next_event(NULL);

local_irq_restore(flags);

return 1;

}

/**

* tick_switch_to_oneshot - switch to oneshot mode

int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))

{//高精度支持，需要oneshot模式，方便及时切换频率或停止启动

struct tick_device *td = &__get_cpu_var(tick_cpu_device);

struct clock_event_device *dev = td->evtdev;

if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||

!tick_device_is_functional(dev)) {

//糟糕，没有tick设备、设备不支持ONESHOT或者设备不可用

printk(KERN_INFO "Clockevents: "

"could not switch to one-shot mode:");

if (!dev) {

printk(" no tick device\n");

} else {

if (!tick_device_is_functional(dev))

printk(" %s is not functional.\n", dev->name);

else

printk(" %s does not support one-shot mode.\n",

dev->name);

}

return -EINVAL;

}

td->mode = TICKDEV_MODE_ONESHOT;//修改为ONESHOT模式

dev->event_handler = handler;//现在，HANDLER也修改了，低精度的回调是在tick_setup_periodic里设置的，高精度是hrtimer_interrupt

clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);

tick_broadcast_switch_to_oneshot();//让broadcast设备也切换到oneshot模式

return 0;

}

/**

* tick_setup_sched_timer - setup the tick emulation timer

void tick_setup_sched_timer(void)

{

struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

ktime_t now = ktime_get();

* Emulate tick processing via per-CPU hrtimers:

hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);

ts->sched_timer.function = tick_sched_timer;//主要更新下JIFFIES，进程运行时间等在低精度下也要做的工作

/* Get the next period (per cpu) */

hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

/* Offset the tick to avert jiffies_lock contention. */

if (sched_skew_tick) {//这个不错，防止jiffies lock竞争：按CPU ID给每个CPU的tick超时时间加上不同的偏移，把各CPU的tick在半个tick周期内错开

u64 offset = ktime_to_ns(tick_period) >> 1;

do_div(offset, num_possible_cpus());

offset *= smp_processor_id();

hrtimer_add_expires_ns(&ts->sched_timer, offset);

}

for (;;) {//就是确保sched timer运行

hrtimer_forward(&ts->sched_timer, now, tick_period);

hrtimer_start_expires(&ts->sched_timer,

HRTIMER_MODE_ABS_PINNED);

/* Check, if the timer was already in the past */

if (hrtimer_active(&ts->sched_timer))

break;

now = ktime_get();

}

#ifdef CONFIG_NO_HZ

if (tick_nohz_enabled)

ts->nohz_mode = NOHZ_MODE_HIGHRES;

#endif

}

高精度模式

切换到高精度模式之后，时钟事件设备的中断回调就是hrtimer_interrupt：

* High resolution timer interrupt

* Called with interrupts disabled

void hrtimer_interrupt(struct clock_event_device *dev)

{

struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);

ktime_t expires_next, now, entry_time, delta;

int i, retries = 0;

BUG_ON(!cpu_base->hres_active);

cpu_base->nr_events++;//统计总的interrupt次数

dev->next_event.tv64 = KTIME_MAX;

raw_spin_lock(&cpu_base->lock);

entry_time = now = hrtimer_update_base(cpu_base);//更新clock_base的时间

retry:

expires_next.tv64 = KTIME_MAX;

* We set expires_next to KTIME_MAX here with cpu_base->lock

* held to prevent that a timer is enqueued in our queue via

* the migration code. This does not affect enqueueing of

* timers which run their callback and need to be requeued on

* this CPU.

cpu_base->expires_next.tv64 = KTIME_MAX;

for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {

struct hrtimer_clock_base *base;

struct timerqueue_node *node;

ktime_t basenow;

if (!(cpu_base->active_bases & (1 << i)))//clock不是激活状态，比如，clock base里面没有timer，何必调用一次?

continue;

base = cpu_base->clock_base + i;//每一个CLOCK BASE

basenow = ktime_add(now, base->offset);//每一个CLOCK BASE的当前时间

//取每一个CLOCK BASE的active红黑树中最顶端hrtimer，最可能超时

while ((node = timerqueue_getnext(&base->active))) {

struct hrtimer *timer;

timer = container_of(node, struct hrtimer, node);

* The immediate goal for using the softexpires is

* minimizing wakeups, not running timers at the

* earliest interrupt after their soft expiration.

* This allows us to avoid using a Priority Search

* Tree, which can answer a stabbing querry for

* overlapping intervals and instead use the simple

* BST we already have.

* We don't add extra wakeups by delaying timers that

* are right-of a not yet expired timer, because that

* timer will have to trigger a wakeup anyway.

//这里比较的是soft expires，如果soft expires晚于当前CLOCK BASE的时间，表示还没到期，对当前CLOCK BASE的检查可以到此为止(break)

if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {

ktime_t expires;

expires = ktime_sub(hrtimer_get_expires(timer),

base->offset);//用未超时的timer的hard expires - base->offset，其实就是base 下次触发的时间

if (expires.tv64 < 0)

expires.tv64 = KTIME_MAX;//溢出了?这不科学，设置为最大值

if (expires.tv64 < expires_next.tv64)

expires_next = expires;//expires其实就是next expires

break;

}

__run_hrtimer(timer, &basenow);//调用run timer

}

* Store the new expiry value so the migration code can verify

* against it.

cpu_base->expires_next = expires_next;

raw_spin_unlock(&cpu_base->lock);

/* Reprogramming necessary ? */

if (expires_next.tv64 == KTIME_MAX ||//不需要设置下一次超时，或者设置硬件的下一次超时成功

!tick_program_event(expires_next, 0)) {//设置对应硬件的下一次超时，返回0表示成功

cpu_base->hang_detected = 0;

return;

}

* The next timer was already expired due to:

* - tracing

* - long lasting callbacks

* - being scheduled away when running in a VM

* We need to prevent that we loop forever in the hrtimer

* interrupt routine. We give it 3 attempts to avoid

* overreacting on some spurious event.

* Acquire base lock for updating the offsets and retrieving

* the current time.

raw_spin_lock(&cpu_base->lock);

//当前时间已经超过next time，尝试修复，最多执行3次

now = hrtimer_update_base(cpu_base);

cpu_base->nr_retries++;

if (++retries < 3)

goto retry;

//还是不行?标志hang了

* Give the system a chance to do something else than looping

* here. We stored the entry time, so we know exactly how long

* we spent here. We schedule the next event this amount of

* time away.

cpu_base->nr_hangs++;

cpu_base->hang_detected = 1;

raw_spin_unlock(&cpu_base->lock);

delta = ktime_sub(now, entry_time);//从刚进来到现在，耗时多长?delta

if (delta.tv64 > cpu_base->max_hang_time.tv64)

cpu_base->max_hang_time = delta;//保存最大的hang time就可以了

* Limit it to a sensible value as we enforce a longer

* delay. Give the CPU at least 100ms to catch up.

if (delta.tv64 > 100 * NSEC_PER_MSEC)

expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);

else

expires_next = ktime_add(now, delta);

tick_program_event(expires_next, 1);//强制设置一个长一些的超时，最大100ms

printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",

ktime_to_ns(delta));

}

tick_program_event ->

/**

* clockevents_program_event - Reprogram the clock event device.

* @dev: device to program

* @expires: absolute expiry time (monotonic clock)

* @force: program minimum delay if expires can not be set

* Returns 0 on success, -ETIME when the event is in the past.

int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,

bool force)

{

unsigned long long clc;

int64_t delta;

int rc;

if (unlikely(expires.tv64 < 0)) {

WARN_ON_ONCE(1);

return -ETIME;

}

dev->next_event = expires;

if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)

return 0;

/* Shortcut for clockevent devices that can deal with ktime. */

if (dev->features & CLOCK_EVT_FEAT_KTIME)

return dev->set_next_ktime(expires, dev);

delta = ktime_to_ns(ktime_sub(expires, ktime_get()));

if (delta <= 0)//现在的时间，已经超过了想要预设的超时，怎么办?根据是否需要force决定是否设置为min delta

return force ? clockevents_program_min_delta(dev) : -ETIME;

delta = min(delta, (int64_t) dev->max_delta_ns);

delta = max(delta, (int64_t) dev->min_delta_ns);

clc = ((unsigned long long) delta * dev->mult) >> dev->shift;

rc = dev->set_next_event((unsigned long) clc, dev); //比如hpet，其回调为hpet_next_event

//返回非0表示错误，如果需要force，那么强行设置为min delta

return (rc && force) ? clockevents_program_min_delta(dev) : rc;

}

static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)

{

struct hrtimer_clock_base *base = timer->base;

struct hrtimer_cpu_base *cpu_base = base->cpu_base;

enum hrtimer_restart (*fn)(struct hrtimer *);

int restart;

WARN_ON(!irqs_disabled());

debug_deactivate(timer);

__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);//先将timer从base中删除，并设置timer的状态为CALLBACK

timer_stats_account_hrtimer(timer);

这里的function回调指针，就是我们调用hrtimer_init后设置的

hrtimer_init(&rtc->coalesced_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);

rtc->coalesced_timer.function = coalesced_timer_fn;

可以看出，我们设置的hrtimer回调是在hardirq context中执行

fn = timer->function;

* Because we run timers from hardirq context, there is no chance

* they get migrated to another cpu, therefore its safe to unlock

* the timer base.

raw_spin_unlock(&cpu_base->lock);//上面这段注释点明了，timer的回调函数是在hardirq context中运行的

trace_hrtimer_expire_entry(timer, now);

restart = fn(timer);//调用我们的回调函数

trace_hrtimer_expire_exit(timer);

raw_spin_lock(&cpu_base->lock);

* Note: We clear the CALLBACK bit after enqueue_hrtimer and

* we do not reprogramm the event hardware. Happens either in

* hrtimer_start_range_ns() or in hrtimer_interrupt()

if (restart != HRTIMER_NORESTART) {

BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);

enqueue_hrtimer(timer, base);

}

WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));

timer->state &= ~HRTIMER_STATE_CALLBACK;

}

HRTIMER函数详解

初始化

//hrtimer_init非常简单：将hrtimer清零，根据clock_id选定本CPU上对应的clock base，并初始化timerqueue节点；此时并没有把hrtimer加入RB TREE，要等到调用start系列函数时才会入队

static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,

enum hrtimer_mode mode)

{

struct hrtimer_cpu_base *cpu_base;

int base;

memset(timer, 0, sizeof(struct hrtimer));

cpu_base = &__raw_get_cpu_var(hrtimer_bases);

if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)

clock_id = CLOCK_MONOTONIC;

base = hrtimer_clockid_to_base(clock_id);

timer->base = &cpu_base->clock_base[base];

timerqueue_init(&timer->node);

#ifdef CONFIG_TIMER_STATS

timer->start_site = NULL;

timer->start_pid = -1;

memset(timer->start_comm, 0, TASK_COMM_LEN);

#endif

}

设置超时

/**

* hrtimer_forward - forward the timer expiry

* @timer: hrtimer to forward

* @now: forward past this time

* @interval: the interval to forward

* Forward the timer expiry so it will expire in the future.

* Returns the number of overruns.

u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)

{

u64 orun = 1;

ktime_t delta;

delta = ktime_sub(now, hrtimer_get_expires(timer));

if (delta.tv64 < 0)//如果timer原本的超时时间还在想要定位的now之后，就不修改，因为这个函数的目的是将超时时间设置在now后

return 0;

if (interval.tv64 < timer->base->resolution.tv64)//如果interval过小，小于clock base所能达到的精度，当然使用clock base的最小精度了

interval.tv64 = timer->base->resolution.tv64;

//欲修改的时间基准与原有超时时间差，大于interval

//这个话使用的是unlikely，表示，这种情况还是不多的

//例如，你在一个周期回调里面，再次add，其interval肯定要大

if (unlikely(delta.tv64 >= interval.tv64)) {

s64 incr = ktime_to_ns(interval);

orun = ktime_divns(delta, incr);//相差有多少个interval

hrtimer_add_expires_ns(timer, incr * orun);

if (hrtimer_get_expires_tv64(timer) > now.tv64)//这个函数，不是简单的add interval，而是触发时间能够>now就可以了，想想周期时钟的用法，确实应该是这样

return orun;

* This (and the ktime_add() below) is the

* correction for exact:

orun++;

}

hrtimer_add_expires(timer, interval);//在上次超时的基础上加上interval

return orun;

}

static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)

{//《Linux 时钟管理》说，将原来必须在hard expire 超时才能执行的一个点变成一个范围后，可以尽量把hrtimer

//中断放在一起处理，这样CPU 被重复唤醒的几率会变小，从而达到节能的效果，同时这个hrtimer 也可以保证其执行精度。

timer->node.expires = ktime_add_safe(timer->node.expires, time);

timer->_softexpires = ktime_add_safe(timer->_softexpires, time);

}

static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)

{

timer->node.expires = ktime_add_ns(timer->node.expires, ns);

timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);

}

启动

int

hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)

{//这里的mode和tim有关，mode为HRTIMER_MODE_REL的时候，表示tim为相对于now的时间

//顾名思义，如果mode为HRTIMER_MODE_ABS，则tim为绝对时间

return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);

}

int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,

unsigned long delta_ns, const enum hrtimer_mode mode,

int wakeup)

{

struct hrtimer_clock_base *base, *new_base;

unsigned long flags;

int ret, leftmost;

base = lock_hrtimer_base(timer, &flags);

/* Remove an active timer from the queue: */

ret = remove_hrtimer(timer, base);

/* Switch the timer base, if necessary: */

//启动的时候，会检查是否需要switch clock base到当前CPU的clock base

new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);

if (mode & HRTIMER_MODE_REL) {

tim = ktime_add_safe(tim, new_base->get_time());

* CONFIG_TIME_LOW_RES is a temporary way for architectures

* to signal that they simply return xtime in

* do_gettimeoffset(). In this case we want to round up by

* resolution when starting a relative timer, to avoid short

* timeouts. This will go away with the GTOD framework.

#ifdef CONFIG_TIME_LOW_RES

tim = ktime_add_safe(tim, base->resolution);

#endif

}

hrtimer_set_expires_range_ns(timer, tim, delta_ns);

timer_stats_hrtimer_set_start_info(timer);

//加入红黑树

leftmost = enqueue_hrtimer(timer, new_base);

* Only allow reprogramming if the new base is on this CPU.

* (it might still be on another CPU if the timer was pending)

* XXX send_remote_softirq() ?

//加入之后，如果发现自己这个hrtimer最早超时，leftmost为非0

//如果是本CPU上的CLOCK BASE，那么，重新设置超时，因为之前设置的超时比较靠后了

if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)

&& hrtimer_enqueue_reprogram(timer, new_base)) {

//如果hrtimer_enqueue_reprogram返回非0(重新设置超时没有成功)，激活HRTIMER_SOFTIRQ，使得能够在软中断中检查timer是否超时

if (wakeup) {

* We need to drop cpu_base->lock to avoid a

* lock ordering issue vs. rq->lock.

raw_spin_unlock(&new_base->cpu_base->lock);

raise_softirq_irqoff(HRTIMER_SOFTIRQ);

local_irq_restore(flags);

return ret;

} else {

__raise_softirq_irqoff(HRTIMER_SOFTIRQ);

}

unlock_hrtimer_base(timer, &flags);

return ret;

}

删除

static inline int hrtimer_callback_running(struct hrtimer *timer)

{

return timer->state & HRTIMER_STATE_CALLBACK;

}

/**

* hrtimer_try_to_cancel - try to deactivate a timer

* @timer: hrtimer to stop

* Returns:

* 0 when the timer was not active

* 1 when the timer was active

* -1 when the timer is currently excuting the callback function and

* cannot be stopped

int hrtimer_try_to_cancel(struct hrtimer *timer)

{

struct hrtimer_clock_base *base;

unsigned long flags;

int ret = -1;

base = lock_hrtimer_base(timer, &flags);

if (!hrtimer_callback_running(timer))//如果处于回调函数执行状态，不CANCEL

ret = remove_hrtimer(timer, base);

unlock_hrtimer_base(timer, &flags);

return ret;

}

/**

* hrtimer_cancel - cancel a timer and wait for the handler to finish.

* @timer: the timer to be cancelled

* Returns:

* 0 when the timer was not active

* 1 when the timer was active

int hrtimer_cancel(struct hrtimer *timer)

{

for (;;) {

int ret = hrtimer_try_to_cancel(timer);//不停尝试cancel，直到成功

if (ret >= 0)

return ret;

cpu_relax();

}

状态转换

state为hrtimer的四个状态：

#define HRTIMER_STATE_INACTIVE 0x00

#define HRTIMER_STATE_ENQUEUED 0x01

#define HRTIMER_STATE_CALLBACK 0x02

#define HRTIMER_STATE_MIGRATE 0x04

可以想象，HRTIMER_STATE_INACTIVE这个是初始值，调用hrtimer_init后，这个变量设置为HRTIMER_STATE_INACTIVE

调用enqueue_hrtimer将hrtimer*加入到RB TREE后，状态会 OR 上 HRTIMER_STATE_ENQUEUED
什么时候清除？当调用__remove_hrtimer将其从RB TREE里删除后，会“设置”为新状态，此新状态中一定不包含HRTIMER_STATE_ENQUEUED

在remove_hrtimer调用的时候，只保留了CALLBACK状态：
state = timer->state & HRTIMER_STATE_CALLBACK;
__remove_hrtimer(timer, base, state, reprogram);
在__run_hrtimer的时候，会先从RB TREE里将hrtimer*删除，设置状态为CALLBACK，然后调用回调函数
//先将timer从base中删除，并设置timer的状态为CALLBACK
__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
restart = fn(timer);
if (restart != HRTIMER_NORESTART) {
enqueue_hrtimer(timer, base);
}
timer->state &= ~HRTIMER_STATE_CALLBACK;

调用完fn后，又将CALLBACK状态去除