From 4b41308d2d0398409620613c7eaaaf52c738b042 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Aug 2011 10:37:59 -0700 Subject: [PATCH 01/23] alarmtimers: Change alarmtimer functions to return alarmtimer_restart values In order to properly fix the denial of service issue with high freq periodic alarm timers, we need to push the re-arming logic into the alarm timer handler, much as the hrtimer code does. This patch introduces alarmtimer_restart enum and changes the alarmtimer handler declarations to use it as a return value. Further, to ease following changes, it extends the alarmtimer handler functions to also take the time at expiration. No logic is yet modified. CC: Thomas Gleixner Signed-off-by: John Stultz --- include/linux/alarmtimer.h | 9 +++++++-- kernel/time/alarmtimer.c | 13 +++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index c5d6095b46f8..0289eb29e794 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -13,6 +13,11 @@ enum alarmtimer_type { ALARM_NUMTYPE, }; +enum alarmtimer_restart { + ALARMTIMER_NORESTART, + ALARMTIMER_RESTART, +}; + /** * struct alarm - Alarm timer structure * @node: timerqueue node for adding to the event list this value @@ -26,14 +31,14 @@ enum alarmtimer_type { struct alarm { struct timerqueue_node node; ktime_t period; - void (*function)(struct alarm *); + enum alarmtimer_restart (*function)(struct alarm *, ktime_t now); enum alarmtimer_type type; bool enabled; void *data; }; void alarm_init(struct alarm *alarm, enum alarmtimer_type type, - void (*function)(struct alarm *)); + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period); void alarm_cancel(struct alarm *alarm); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ea5e1a928d5b..9e786053ffa2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -196,7 +196,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); if (alarm->function) - alarm->function(alarm); + alarm->function(alarm, now); spin_lock_irqsave(&base->lock, flags); } @@ -299,7 +299,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) * @function: callback that is run when the alarm fires */ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, - void (*function)(struct alarm *)) + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); alarm->period = ktime_set(0, 0); @@ -365,12 +365,15 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) * * Posix timer callback for expired alarm timers. */ -static void alarm_handle_timer(struct alarm *alarm) +static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, + ktime_t now) { struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarmtimer); if (posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; + + return ALARMTIMER_NORESTART; } /** @@ -509,13 +512,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, * * Wakes up the task that set the alarmtimer */ -static void alarmtimer_nsleep_wakeup(struct alarm *alarm) +static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, + ktime_t now) { struct task_struct *task = (struct task_struct *)alarm->data; alarm->data = NULL; if (task) wake_up_process(task); + return ALARMTIMER_NORESTART; } /** From 54da23b720d5d612f8f1669f9ed3744008fb7382 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Aug 2011 11:08:07 -0700 Subject: [PATCH 02/23] alarmtimers: Push rearming peroidic timers down into alamrtimer handler This patch pushes the periodic alarmtimer re-arming down into the alarmtimer handler, mimicking how hrtimers handle this. CC: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9e786053ffa2..55e634ea6054 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -174,6 +174,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) unsigned long flags; ktime_t now; int ret = HRTIMER_NORESTART; + int restart = ALARMTIMER_NORESTART; spin_lock_irqsave(&base->lock, flags); now = base->gettime(); @@ -188,16 +189,16 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) timerqueue_del(&base->timerqueue, &alarm->node); alarm->enabled = 0; - /* Re-add periodic timers */ - if (alarm->period.tv64) { - alarm->node.expires = ktime_add(expired, alarm->period); + + spin_unlock_irqrestore(&base->lock, flags); + if (alarm->function) + restart = alarm->function(alarm, now); + spin_lock_irqsave(&base->lock, flags); + + if (restart != ALARMTIMER_NORESTART) { timerqueue_add(&base->timerqueue, &alarm->node); alarm->enabled = 1; } - spin_unlock_irqrestore(&base->lock, flags); - if (alarm->function) - alarm->function(alarm, now); - spin_lock_irqsave(&base->lock, flags); } if (next) { @@ -373,6 +374,11 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, if (posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; + /* Re-add periodic timers */ + if (alarm->period.tv64) { + alarm->node.expires = ktime_add(now, alarm->period); + return ALARMTIMER_RESTART; + } return ALARMTIMER_NORESTART; } From dce75a8c71819ed4c7efdcd53c9b6f6356dc8cb5 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Aug 2011 11:31:03 -0700 Subject: [PATCH 03/23] alarmtimers: Add alarm_forward functionality In order to avoid wasting time expiring and re-adding very high freq periodic alarmtimers, introduce alarm_forward() which is similar to hrtimer_forward and moves the timer to the next future expiration time and returns the number of overruns. CC: Thomas Gleixner Signed-off-by: John Stultz --- include/linux/alarmtimer.h | 2 ++ kernel/time/alarmtimer.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 0289eb29e794..17535963b274 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -42,4 +42,6 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period); void alarm_cancel(struct alarm *alarm); +u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval); + #endif diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 55e634ea6054..f03b04291b6f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -347,6 +347,41 @@ void alarm_cancel(struct alarm *alarm) } + +u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) +{ + u64 overrun = 1; + ktime_t delta; + + delta = ktime_sub(now, alarm->node.expires); + + if (delta.tv64 < 0) + return 0; + + if (unlikely(delta.tv64 >= interval.tv64)) { + s64 incr = ktime_to_ns(interval); + + overrun = ktime_divns(delta, incr); + + alarm->node.expires = ktime_add_ns(alarm->node.expires, + incr*overrun); + + if (alarm->node.expires.tv64 > now.tv64) + return overrun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + overrun++; + } + + alarm->node.expires = ktime_add(alarm->node.expires, interval); + return overrun; +} + + + + /** * clock2alarm - helper that converts from clockid to alarmtypes * @clockid: clockid. @@ -376,7 +411,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, /* Re-add periodic timers */ if (alarm->period.tv64) { - alarm->node.expires = ktime_add(now, alarm->period); + ptr->it_overrun += alarm_forward(alarm, now, alarm->period); return ALARMTIMER_RESTART; } return ALARMTIMER_NORESTART; From d77e23accec56bf2ba12187fe77a2f500a511282 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Aug 2011 11:40:23 -0700 Subject: [PATCH 04/23] alarmtimers: Remove interval cap limit hack Now that the alarmtimers code has been refactored, the interval cap limit can be removed. CC: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f03b04291b6f..a522c007e6fd 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -525,15 +525,6 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, if (!rtcdev) return -ENOTSUPP; - /* - * XXX HACK! Currently we can DOS a system if the interval - * period on alarmtimers is too small. Cap the interval here - * to 100us and solve this properly in a future patch! -jstultz - */ - if ((new_setting->it_interval.tv_sec == 0) && - (new_setting->it_interval.tv_nsec < 100000)) - new_setting->it_interval.tv_nsec = 100000; - if (old_setting) alarm_timer_get(timr, old_setting); From 9e26476243e438f4534a562660c1296a15a9e202 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Aug 2011 12:09:24 -0700 Subject: [PATCH 05/23] alarmtimers: Remove period from alarm structure Now that periodic alarmtimers are managed by the handler function, remove the period value from the alarm structure and let the handlers manage the interval on their own. CC: Thomas Gleixner Signed-off-by: John Stultz --- include/linux/alarmtimer.h | 3 +-- include/linux/posix-timers.h | 5 ++++- kernel/time/alarmtimer.c | 30 ++++++++++++++---------------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 17535963b274..c854a8efa863 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -30,7 +30,6 @@ enum alarmtimer_restart { */ struct alarm { struct timerqueue_node node; - ktime_t period; enum alarmtimer_restart (*function)(struct alarm *, ktime_t now); enum alarmtimer_type type; bool enabled; @@ -39,7 +38,7 @@ struct alarm { void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); -void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period); +void alarm_start(struct alarm *alarm, ktime_t start); void alarm_cancel(struct alarm *alarm); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval); diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 959c14132f46..042058fdb0af 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -81,7 +81,10 @@ struct k_itimer { unsigned long incr; unsigned long expires; } mmtimer; - struct alarm alarmtimer; + struct { + struct alarm alarmtimer; + ktime_t interval; + } alarm; struct rcu_head rcu; } it; }; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index a522c007e6fd..90935591dd44 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -303,7 +303,6 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); - alarm->period = ktime_set(0, 0); alarm->function = function; alarm->type = type; alarm->enabled = 0; @@ -313,9 +312,8 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, * alarm_start - Sets an alarm to fire * @alarm: ptr to alarm to set * @start: time to run the alarm - * @period: period at which the alarm will recur */ -void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) +void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; @@ -324,7 +322,6 @@ void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) if (alarm->enabled) alarmtimer_remove(base, alarm); alarm->node.expires = start; - alarm->period = period; alarmtimer_enqueue(base, alarm); alarm->enabled = 1; spin_unlock_irqrestore(&base->lock, flags); @@ -405,13 +402,14 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) { struct k_itimer *ptr = container_of(alarm, struct k_itimer, - it.alarmtimer); + it.alarm.alarmtimer); if (posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; /* Re-add periodic timers */ - if (alarm->period.tv64) { - ptr->it_overrun += alarm_forward(alarm, now, alarm->period); + if (ptr->it.alarm.interval.tv64) { + ptr->it_overrun += alarm_forward(alarm, now, + ptr->it.alarm.interval); return ALARMTIMER_RESTART; } return ALARMTIMER_NORESTART; @@ -471,7 +469,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) type = clock2alarm(new_timer->it_clock); base = &alarm_bases[type]; - alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); + alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); return 0; } @@ -488,9 +486,9 @@ static void alarm_timer_get(struct k_itimer *timr, memset(cur_setting, 0, sizeof(struct itimerspec)); cur_setting->it_interval = - ktime_to_timespec(timr->it.alarmtimer.period); + ktime_to_timespec(timr->it.alarm.interval); cur_setting->it_value = - ktime_to_timespec(timr->it.alarmtimer.node.expires); + ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); return; } @@ -505,7 +503,7 @@ static int alarm_timer_del(struct k_itimer *timr) if (!rtcdev) return -ENOTSUPP; - alarm_cancel(&timr->it.alarmtimer); + alarm_cancel(&timr->it.alarm.alarmtimer); return 0; } @@ -529,12 +527,12 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, alarm_timer_get(timr, old_setting); /* If the timer was already set, cancel it */ - alarm_cancel(&timr->it.alarmtimer); + alarm_cancel(&timr->it.alarm.alarmtimer); /* start the timer */ - alarm_start(&timr->it.alarmtimer, - timespec_to_ktime(new_setting->it_value), - timespec_to_ktime(new_setting->it_interval)); + timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); + alarm_start(&timr->it.alarm.alarmtimer, + timespec_to_ktime(new_setting->it_value)); return 0; } @@ -567,7 +565,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); - alarm_start(alarm, absexp, ktime_set(0, 0)); + alarm_start(alarm, absexp); if (likely(alarm->data)) schedule(); From a28cde81ab13cc251748a4c4ef06883dd09a10ea Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Aug 2011 12:30:21 -0700 Subject: [PATCH 06/23] alarmtimers: Add more refined alarm state tracking In order to allow for functionality like try_to_cancel, add more refined state tracking (similar to hrtimers). CC: Thomas Gleixner Signed-off-by: John Stultz --- include/linux/alarmtimer.h | 34 +++++++++++++++++++++++++++++++++- kernel/time/alarmtimer.c | 21 ++++++++++++++------- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index c854a8efa863..304124a6c982 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -18,6 +18,11 @@ enum alarmtimer_restart { ALARMTIMER_RESTART, }; + +#define ALARMTIMER_STATE_INACTIVE 0x00 +#define ALARMTIMER_STATE_ENQUEUED 0x01 +#define ALARMTIMER_STATE_CALLBACK 0x02 + /** * struct alarm - Alarm timer structure * @node: timerqueue node for adding to the event list this value @@ -32,7 +37,7 @@ struct alarm { struct timerqueue_node node; enum alarmtimer_restart (*function)(struct alarm *, ktime_t now); enum alarmtimer_type type; - bool enabled; + int state; void *data; }; @@ -43,4 +48,31 @@ void alarm_cancel(struct alarm *alarm); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval); +/* + * A alarmtimer is active, when it is enqueued into timerqueue or the + * callback function is running. + */ +static inline int alarmtimer_active(const struct alarm *timer) +{ + return timer->state != ALARMTIMER_STATE_INACTIVE; +} + +/* + * Helper function to check, whether the timer is on one of the queues + */ +static inline int alarmtimer_is_queued(struct alarm *timer) +{ + return timer->state & ALARMTIMER_STATE_ENQUEUED; +} + +/* + * Helper function to check, whether the timer is running the callback + * function + */ +static inline int alarmtimer_callback_running(struct alarm *timer) +{ + return timer->state & ALARMTIMER_STATE_CALLBACK; +} + + #endif diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 90935591dd44..5b14cc29b6a6 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -126,6 +126,8 @@ static struct rtc_device *alarmtimer_get_rtcdev(void) static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) { timerqueue_add(&base->timerqueue, &alarm->node); + alarm->state |= ALARMTIMER_STATE_ENQUEUED; + if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { hrtimer_try_to_cancel(&base->timer); hrtimer_start(&base->timer, alarm->node.expires, @@ -147,7 +149,12 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) { struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); + if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) + return; + timerqueue_del(&base->timerqueue, &alarm->node); + alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; + if (next == &alarm->node) { hrtimer_try_to_cancel(&base->timer); next = timerqueue_getnext(&base->timerqueue); @@ -188,16 +195,18 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) alarm = container_of(next, struct alarm, node); timerqueue_del(&base->timerqueue, &alarm->node); - alarm->enabled = 0; + alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; + alarm->state |= ALARMTIMER_STATE_CALLBACK; spin_unlock_irqrestore(&base->lock, flags); if (alarm->function) restart = alarm->function(alarm, now); spin_lock_irqsave(&base->lock, flags); + alarm->state &= ~ALARMTIMER_STATE_CALLBACK; if (restart != ALARMTIMER_NORESTART) { timerqueue_add(&base->timerqueue, &alarm->node); - alarm->enabled = 1; + alarm->state |= ALARMTIMER_STATE_ENQUEUED; } } @@ -305,7 +314,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, timerqueue_init(&alarm->node); alarm->function = function; alarm->type = type; - alarm->enabled = 0; + alarm->state = ALARMTIMER_STATE_INACTIVE; } /** @@ -319,11 +328,10 @@ void alarm_start(struct alarm *alarm, ktime_t start) unsigned long flags; spin_lock_irqsave(&base->lock, flags); - if (alarm->enabled) + if (alarmtimer_active(alarm)) alarmtimer_remove(base, alarm); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); - alarm->enabled = 1; spin_unlock_irqrestore(&base->lock, flags); } @@ -337,9 +345,8 @@ void alarm_cancel(struct alarm *alarm) unsigned long flags; spin_lock_irqsave(&base->lock, flags); - if (alarm->enabled) + if (alarmtimer_is_queued(alarm)) alarmtimer_remove(base, alarm); - alarm->enabled = 0; spin_unlock_irqrestore(&base->lock, flags); } From 9082c465a5403f4a98734193e078552991a2e283 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Aug 2011 12:41:36 -0700 Subject: [PATCH 07/23] alarmtimers: Add try_to_cancel functionality There's a number of edge cases when cancelling a alarm, so to be sure we accurately do so, introduce try_to_cancel, which returns proper failure errors if it cannot. Also modify cancel to spin until the alarm is properly disabled. CC: Thomas Gleixner Signed-off-by: John Stultz --- include/linux/alarmtimer.h | 3 ++- kernel/time/alarmtimer.c | 43 ++++++++++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 304124a6c982..975009e1cbe6 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -44,7 +44,8 @@ struct alarm { void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); void alarm_start(struct alarm *alarm, ktime_t start); -void alarm_cancel(struct alarm *alarm); +int alarm_try_to_cancel(struct alarm *alarm); +int alarm_cancel(struct alarm *alarm); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5b14cc29b6a6..bdb7342b6896 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -336,21 +336,49 @@ void alarm_start(struct alarm *alarm, ktime_t start) } /** - * alarm_cancel - Tries to cancel an alarm timer + * alarm_try_to_cancel - Tries to cancel an alarm timer * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not running, + * and -1 if the callback was running */ -void alarm_cancel(struct alarm *alarm) +int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - + int ret = -1; spin_lock_irqsave(&base->lock, flags); - if (alarmtimer_is_queued(alarm)) + + if (alarmtimer_callback_running(alarm)) + goto out; + + if (alarmtimer_is_queued(alarm)) { alarmtimer_remove(base, alarm); + ret = 1; + } else + ret = 0; +out: spin_unlock_irqrestore(&base->lock, flags); + return ret; } +/** + * alarm_cancel - Spins trying to cancel an alarm timer until it is done + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not active. + */ +int alarm_cancel(struct alarm *alarm) +{ + for (;;) { + int ret = alarm_try_to_cancel(alarm); + if (ret >= 0) + return ret; + cpu_relax(); + } +} + u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) { @@ -510,7 +538,9 @@ static int alarm_timer_del(struct k_itimer *timr) if (!rtcdev) return -ENOTSUPP; - alarm_cancel(&timr->it.alarm.alarmtimer); + if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) + return TIMER_RETRY; + return 0; } @@ -534,7 +564,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, alarm_timer_get(timr, old_setting); /* If the timer was already set, cancel it */ - alarm_cancel(&timr->it.alarm.alarmtimer); + if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) + return TIMER_RETRY; /* start the timer */ timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); From 8bc0dafb5cf38a19484dfb16e2c6d29e85820046 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 Jul 2011 18:35:13 -0700 Subject: [PATCH 08/23] alarmtimers: Rework RTC device selection using class interface This allows cleaner detection of the RTC device being registered, rather then probing any time someone calls alarmtimer_get_rtcdev. CC: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 78 ++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index bdb7342b6896..154d5563ab1b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -52,27 +52,6 @@ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; static DEFINE_SPINLOCK(rtcdev_lock); -/** - * has_wakealarm - check rtc device has wakealarm ability - * @dev: current device - * @name_ptr: name to be returned - * - * This helper function checks to see if the rtc device can wake - * from suspend. - */ -static int has_wakealarm(struct device *dev, void *name_ptr) -{ - struct rtc_device *candidate = to_rtc_device(dev); - - if (!candidate->ops->set_alarm) - return 0; - if (!device_may_wakeup(candidate->dev.parent)) - return 0; - - *(const char **)name_ptr = dev_name(dev); - return 1; -} - /** * alarmtimer_get_rtcdev - Return selected rtcdevice * @@ -82,37 +61,58 @@ static int has_wakealarm(struct device *dev, void *name_ptr) */ static struct rtc_device *alarmtimer_get_rtcdev(void) { - struct device *dev; - char *str; unsigned long flags; struct rtc_device *ret; spin_lock_irqsave(&rtcdev_lock, flags); - if (!rtcdev) { - /* Find an rtc device and init the rtc_timer */ - dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); - /* If we have a device then str is valid. See has_wakealarm() */ - if (dev) { - rtcdev = rtc_class_open(str); - /* - * Drop the reference we got in class_find_device, - * rtc_open takes its own. - */ - put_device(dev); - rtc_timer_init(&rtctimer, NULL, NULL); - } - } ret = rtcdev; spin_unlock_irqrestore(&rtcdev_lock, flags); return ret; } + + +static int alarmtimer_rtc_add_device(struct device *dev, + struct class_interface *class_intf) +{ + unsigned long flags; + struct rtc_device *rtc = to_rtc_device(dev); + + if (rtcdev) + return -EBUSY; + + if (!rtc->ops->set_alarm) + return -1; + if (!device_may_wakeup(rtc->dev.parent)) + return -1; + + spin_lock_irqsave(&rtcdev_lock, flags); + if (!rtcdev) { + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + } + spin_unlock_irqrestore(&rtcdev_lock, flags); + return 0; +} + +static struct class_interface alarmtimer_rtc_interface = { + .add_dev = &alarmtimer_rtc_add_device, +}; + +static void alarmtimer_rtc_interface_setup(void) +{ + alarmtimer_rtc_interface.class = rtc_class; + class_interface_register(&alarmtimer_rtc_interface); +} #else #define alarmtimer_get_rtcdev() (0) #define rtcdev (0) +#define alarmtimer_rtc_interface_setup() #endif + /** * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue * @base: pointer to the base where the timer is being run @@ -244,7 +244,7 @@ static int alarmtimer_suspend(struct device *dev) freezer_delta = ktime_set(0, 0); spin_unlock_irqrestore(&freezer_delta_lock, flags); - rtc = rtcdev; + rtc = alarmtimer_get_rtcdev(); /* If we have no rtcdev, just return */ if (!rtc) return 0; @@ -792,6 +792,8 @@ static int __init alarmtimer_init(void) HRTIMER_MODE_ABS); alarm_bases[i].timer.function = alarmtimer_fired; } + + alarmtimer_rtc_interface_setup(); error = platform_driver_register(&alarmtimer_driver); platform_device_register_simple("alarmtimer", -1, NULL, 0); From ef0e0f5ed9bde6d1e3376169785a463ad2160e6d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Aug 2011 09:36:46 +0200 Subject: [PATCH 09/23] cputime: Clean up cputime_to_usecs and usecs_to_cputime macros Get rid of semicolon so that those expressions can be used also somewhere else than just in an assignment. Signed-off-by: Michal Hocko Acked-by: Arnd Bergmann Cc: Dave Jones Cc: Alexey Dobriyan Link: http://lkml.kernel.org/r/7565417ce30d7e6b1ddc169843af0777dbf66e75.1314172057.git.mhocko@suse.cz Signed-off-by: Thomas Gleixner --- include/asm-generic/cputime.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 61e03dd7939e..62ce6823c0f2 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -38,8 +38,8 @@ typedef u64 cputime64_t; /* * Convert cputime to microseconds and back. */ -#define cputime_to_usecs(__ct) jiffies_to_usecs(__ct); -#define usecs_to_cputime(__msecs) usecs_to_jiffies(__msecs); +#define cputime_to_usecs(__ct) jiffies_to_usecs(__ct) +#define usecs_to_cputime(__msecs) usecs_to_jiffies(__msecs) /* * Convert cputime to seconds and back. From 6beea0cda8ce71c01354e688e5735c47e331e84f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Aug 2011 09:37:48 +0200 Subject: [PATCH 10/23] nohz: Fix update_ts_time_stat idle accounting update_ts_time_stat currently updates idle time even if we are in iowait loop at the moment. The only real users of the idle counter (via get_cpu_idle_time_us) are CPU governors and they expect to get cumulative time for both idle and iowait times. The value (idle_sleeptime) is also printed to userspace by print_cpu but it prints both idle and iowait times so the idle part is misleading. Let's clean this up and fix update_ts_time_stat to account both counters properly and update consumers of idle to consider iowait time as well. If we do this we might use get_cpu_{idle,iowait}_time_us from other contexts as well and we will get expected values. Signed-off-by: Michal Hocko Cc: Dave Jones Cc: Arnd Bergmann Cc: Alexey Dobriyan Link: http://lkml.kernel.org/r/e9c909c221a8da402c4da07e4cd968c3218f8eb1.1314172057.git.mhocko@suse.cz Signed-off-by: Thomas Gleixner --- drivers/cpufreq/cpufreq_conservative.c | 4 +++- drivers/cpufreq/cpufreq_ondemand.c | 4 +++- kernel/time/tick-sched.c | 8 ++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 33b56e5c5c14..c97b468ee9f7 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -120,10 +120,12 @@ static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) { - u64 idle_time = get_cpu_idle_time_us(cpu, wall); + u64 idle_time = get_cpu_idle_time_us(cpu, NULL); if (idle_time == -1ULL) return get_cpu_idle_time_jiffy(cpu, wall); + else + idle_time += get_cpu_iowait_time_us(cpu, wall); return idle_time; } diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 891360edecdd..07756bddedef 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -144,10 +144,12 @@ static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) { - u64 idle_time = get_cpu_idle_time_us(cpu, wall); + u64 idle_time = get_cpu_idle_time_us(cpu, NULL); if (idle_time == -1ULL) return get_cpu_idle_time_jiffy(cpu, wall); + else + idle_time += get_cpu_iowait_time_us(cpu, wall); return idle_time; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d5097c44b407..7ab44bca6546 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -159,9 +159,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + else + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; } @@ -200,8 +201,7 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) * @last_update_time: variable to store update time in * * Return the cummulative idle time (since boot) for a given - * CPU, in microseconds. The idle time returned includes - * the iowait time (unlike what "top" and co report). + * CPU, in microseconds. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. @@ -221,7 +221,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); -/* +/** * get_cpu_iowait_time_us - get the total iowait time of a cpu * @cpu: CPU number to query * @last_update_time: variable to store update time in From 09a1d34f8535ecf9a347ea76f7597730c2bc0c8d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Aug 2011 09:39:30 +0200 Subject: [PATCH 11/23] nohz: Make idle/iowait counter update conditional get_cpu_{idle,iowait}_time_us update idle/iowait counters unconditionally if the given CPU is in the idle loop. This doesn't work well outside of CPU governors which are singletons so nobody (except for IRQ) can race with them. We will need to use both functions from /proc/stat handler to properly handle nohz idle/iowait times. Make the update depend on a non NULL last_update_time argument. Signed-off-by: Michal Hocko Cc: Dave Jones Cc: Arnd Bergmann Cc: Alexey Dobriyan Link: http://lkml.kernel.org/r/11f23179472635ce52e78921d47a20216b872f23.1314172057.git.mhocko@suse.cz Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 41 ++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7ab44bca6546..664c4a365439 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -198,7 +198,8 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) /** * get_cpu_idle_time_us - get the total idle time of a cpu * @cpu: CPU number to query - * @last_update_time: variable to store update time in + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. * * Return the cummulative idle time (since boot) for a given * CPU, in microseconds. @@ -211,20 +212,35 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, idle; if (!tick_nohz_enabled) return -1; - update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); + now = ktime_get(); + if (last_update_time) { + update_ts_time_stats(cpu, ts, now, last_update_time); + idle = ts->idle_sleeptime; + } else { + if (ts->idle_active && !nr_iowait_cpu(cpu)) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + idle = ktime_add(ts->idle_sleeptime, delta); + } else { + idle = ts->idle_sleeptime; + } + } + + return ktime_to_us(idle); - return ktime_to_us(ts->idle_sleeptime); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a cpu * @cpu: CPU number to query - * @last_update_time: variable to store update time in + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. * * Return the cummulative iowait time (since boot) for a given * CPU, in microseconds. @@ -237,13 +253,26 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, iowait; if (!tick_nohz_enabled) return -1; - update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); + now = ktime_get(); + if (last_update_time) { + update_ts_time_stats(cpu, ts, now, last_update_time); + iowait = ts->iowait_sleeptime; + } else { + if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); - return ktime_to_us(ts->iowait_sleeptime); + iowait = ktime_add(ts->iowait_sleeptime, delta); + } else { + iowait = ts->iowait_sleeptime; + } + } + + return ktime_to_us(iowait); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); From a25cac5198d4ff2842ccca63b423962848ad24b2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Aug 2011 09:40:25 +0200 Subject: [PATCH 12/23] proc: Consider NO_HZ when printing idle and iowait times show_stat handler of the /proc/stat file relies on kstat_cpu(cpu) statistics when priting information about idle and iowait times. This is OK if we are not using tickless kernel (CONFIG_NO_HZ) because counters are updated periodically. With NO_HZ things got more tricky because we are not doing idle/iowait accounting while we are tickless so the value might get outdated. Users of /proc/stat will notice that by unchanged idle/iowait values which is then interpreted as 0% idle/iowait time. From the user space POV this is an unexpected behavior and a change of the interface. Let's fix this by using get_cpu_{idle,iowait}_time_us which accounts the total idle/iowait time since boot and it doesn't rely on sampling or any other periodic activity. Fall back to the previous behavior if NO_HZ is disabled or not configured. Signed-off-by: Michal Hocko Cc: Dave Jones Cc: Arnd Bergmann Cc: Alexey Dobriyan Link: http://lkml.kernel.org/r/39181366adac1b39cb6aa3cd53ff0f7c78d32676.1314172057.git.mhocko@suse.cz Signed-off-by: Thomas Gleixner --- fs/proc/stat.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9758b654a1bc..42b274da92c3 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 @@ -21,6 +22,35 @@ #define arch_idle_time(cpu) 0 #endif +static cputime64_t get_idle_time(int cpu) +{ + u64 idle_time = get_cpu_idle_time_us(cpu, NULL); + cputime64_t idle; + + if (idle_time == -1ULL) { + /* !NO_HZ so we can rely on cpustat.idle */ + idle = kstat_cpu(cpu).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(cpu)); + } else + idle = usecs_to_cputime(idle_time); + + return idle; +} + +static cputime64_t get_iowait_time(int cpu) +{ + u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); + cputime64_t iowait; + + if (iowait_time == -1ULL) + /* !NO_HZ so we can rely on cpustat.iowait */ + iowait = kstat_cpu(cpu).cpustat.iowait; + else + iowait = usecs_to_cputime(iowait_time); + + return iowait; +} + static int show_stat(struct seq_file *p, void *v) { int i, j; @@ -42,9 +72,8 @@ static int show_stat(struct seq_file *p, void *v) user = cputime64_add(user, kstat_cpu(i).cpustat.user); nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); + idle = cputime64_add(idle, get_idle_time(i)); + iowait = cputime64_add(iowait, get_iowait_time(i)); irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); @@ -76,14 +105,12 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(guest), (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { - /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = kstat_cpu(i).cpustat.user; nice = kstat_cpu(i).cpustat.nice; system = kstat_cpu(i).cpustat.system; - idle = kstat_cpu(i).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = kstat_cpu(i).cpustat.iowait; + idle = get_idle_time(i); + iowait = get_iowait_time(i); irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; From 29c158e81c733ac7d6a75c5ee929f34fb9f92983 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 Aug 2011 13:20:46 +0200 Subject: [PATCH 13/23] nohz: Remove "Switched to NOHz mode" debugging messages When performing cpu hotplug tests the kernel printk log buffer gets flooded with pointless "Switched to NOHz mode..." messages. Especially when afterwards analyzing a dump this might have removed more interesting stuff out of the buffer. Assuming that switching to NOHz mode simply works just remove the printk. Signed-off-by: Heiko Carstens Link: http://lkml.kernel.org/r/20110823112046.GB2540@osiris.boeblingen.de.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 664c4a365439..7e2e0817cbfb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -669,8 +669,6 @@ static void tick_nohz_switch_to_nohz(void) next = ktime_add(next, tick_period); } local_irq_enable(); - - printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); } /* @@ -822,10 +820,8 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ - if (tick_nohz_enabled) { + if (tick_nohz_enabled) ts->nohz_mode = NOHZ_MODE_HIGHRES; - printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); - } #endif } #endif /* HIGH_RES_TIMERS */ From d1748302f70be7469809809283fe164156a34231 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 23 Aug 2011 15:29:42 +0200 Subject: [PATCH 14/23] clockevents: Make minimum delay adjustments configurable The automatic increase of the min_delta_ns of a clockevents device should be done in the clockevents code as the minimum delay is an attribute of the clockevents device. In addition not all architectures want the automatic adjustment, on a massively virtualized system it can happen that the programming of a clock event fails several times in a row because the virtual cpu has been rescheduled quickly enough. In that case the minimum delay will erroneously be increased with no way back. The new config symbol GENERIC_CLOCKEVENTS_MIN_ADJUST is used to enable the automatic adjustment. The config option is selected only for x86. Signed-off-by: Martin Schwidefsky Cc: john stultz Link: http://lkml.kernel.org/r/20110823133142.494157493@de.ibm.com Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 + include/linux/clockchips.h | 2 +- kernel/time/Kconfig | 2 + kernel/time/clockevents.c | 125 +++++++++++++++++++++++++++++++---- kernel/time/tick-broadcast.c | 4 +- kernel/time/tick-common.c | 4 +- kernel/time/tick-internal.h | 2 - kernel/time/tick-oneshot.c | 77 ++------------------- 8 files changed, 123 insertions(+), 94 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a47bb22657f..a1609cd7e4c6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -68,6 +68,7 @@ config X86 select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW + select GENERIC_CLOCKEVENTS_MIN_ADJUST select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if (X86_64 && NET) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index d6733e27af34..39bb050bdbb2 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -140,7 +140,7 @@ extern void clockevents_set_mode(struct clock_event_device *dev, enum clock_event_mode mode); extern int clockevents_register_notifier(struct notifier_block *nb); extern int clockevents_program_event(struct clock_event_device *dev, - ktime_t expires, ktime_t now); + ktime_t expires, bool force); extern void clockevents_handle_noop(struct clock_event_device *dev); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f06a8a365648..b26c2228fe92 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -27,3 +27,5 @@ config GENERIC_CLOCKEVENTS_BUILD default y depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR +config GENERIC_CLOCKEVENTS_MIN_ADJUST + bool diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index e4c699dfa4e8..713ef94eceef 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -94,42 +94,139 @@ void clockevents_shutdown(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +/** + * clockevents_increase_min_delta - raise minimum delta of a clock event device + * @dev: device to increase the minimum delta + * + * Returns 0 on success, -ETIME when the minimum delta reached the limit. + */ +static int clockevents_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { + printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); + dev->next_event.tv64 = KTIME_MAX; + return -ETIME; + } + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta; + int i; + + for (i = 0;;) { + delta = dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + + if (++i > 2) { + /* + * We tried 3 times to program the device with the + * given min_delta_ns. Try to increase the minimum + * delta, if that fails as well get out of here. + */ + if (clockevents_increase_min_delta(dev)) + return -ETIME; + i = 0; + } + } +} + +#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta; + + delta = dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + return dev->set_next_event((unsigned long) clc, dev); +} + +#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + /** * clockevents_program_event - Reprogram the clock event device. + * @dev: device to program * @expires: absolute expiry time (monotonic clock) + * @force: program minimum delay if expires can not be set * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, - ktime_t now) + bool force) { unsigned long long clc; int64_t delta; + int rc; if (unlikely(expires.tv64 < 0)) { WARN_ON_ONCE(1); return -ETIME; } - delta = ktime_to_ns(ktime_sub(expires, now)); - - if (delta <= 0) - return -ETIME; - dev->next_event = expires; if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) return 0; - if (delta > dev->max_delta_ns) - delta = dev->max_delta_ns; - if (delta < dev->min_delta_ns) - delta = dev->min_delta_ns; + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); + if (delta <= 0) + return force ? clockevents_program_min_delta(dev) : -ETIME; - clc = delta * dev->mult; - clc >>= dev->shift; + delta = min(delta, (int64_t) dev->max_delta_ns); + delta = max(delta, (int64_t) dev->min_delta_ns); - return dev->set_next_event((unsigned long) clc, dev); + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + rc = dev->set_next_event((unsigned long) clc, dev); + + return (rc && force) ? clockevents_program_min_delta(dev) : rc; } /** @@ -258,7 +355,7 @@ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) if (dev->mode != CLOCK_EVT_MODE_ONESHOT) return 0; - return clockevents_program_event(dev, dev->next_event, ktime_get()); + return clockevents_program_event(dev, dev->next_event, false); } /* diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c7218d132738..f954282d9a82 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -194,7 +194,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) for (next = dev->next_event; ;) { next = ktime_add(next, tick_period); - if (!clockevents_program_event(dev, next, ktime_get())) + if (!clockevents_program_event(dev, next, false)) return; tick_do_periodic_broadcast(); } @@ -373,7 +373,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force) { struct clock_event_device *bc = tick_broadcast_device.evtdev; - return tick_dev_program_event(bc, expires, force); + return clockevents_program_event(bc, expires, force); } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 119528de8235..da6c9ecad4e4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -94,7 +94,7 @@ void tick_handle_periodic(struct clock_event_device *dev) */ next = ktime_add(dev->next_event, tick_period); for (;;) { - if (!clockevents_program_event(dev, next, ktime_get())) + if (!clockevents_program_event(dev, next, false)) return; /* * Have to be careful here. If we're in oneshot mode, @@ -137,7 +137,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); for (;;) { - if (!clockevents_program_event(dev, next, ktime_get())) + if (!clockevents_program_event(dev, next, false)) return; next = ktime_add(next, tick_period); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 1009b06d6f89..4e265b901fed 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -26,8 +26,6 @@ extern void clockevents_shutdown(struct clock_event_device *dev); extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t nextevt); -extern int tick_dev_program_event(struct clock_event_device *dev, - ktime_t expires, int force); extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 2d04411a5f05..824109060a33 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -21,74 +21,6 @@ #include "tick-internal.h" -/* Limit min_delta to a jiffie */ -#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) - -static int tick_increase_min_delta(struct clock_event_device *dev) -{ - /* Nothing to do if we already reached the limit */ - if (dev->min_delta_ns >= MIN_DELTA_LIMIT) - return -ETIME; - - if (dev->min_delta_ns < 5000) - dev->min_delta_ns = 5000; - else - dev->min_delta_ns += dev->min_delta_ns >> 1; - - if (dev->min_delta_ns > MIN_DELTA_LIMIT) - dev->min_delta_ns = MIN_DELTA_LIMIT; - - printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns); - return 0; -} - -/** - * tick_program_event internal worker function - */ -int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, - int force) -{ - ktime_t now = ktime_get(); - int i; - - for (i = 0;;) { - int ret = clockevents_program_event(dev, expires, now); - - if (!ret || !force) - return ret; - - dev->retries++; - /* - * We tried 3 times to program the device with the given - * min_delta_ns. If that's not working then we increase it - * and emit a warning. - */ - if (++i > 2) { - /* Increase the min. delta and try again */ - if (tick_increase_min_delta(dev)) { - /* - * Get out of the loop if min_delta_ns - * hit the limit already. That's - * better than staying here forever. - * - * We clear next_event so we have a - * chance that the box survives. - */ - printk(KERN_WARNING - "CE: Reprogramming failure. Giving up\n"); - dev->next_event.tv64 = KTIME_MAX; - return -ETIME; - } - i = 0; - } - - now = ktime_get(); - expires = ktime_add_ns(now, dev->min_delta_ns); - } -} - /** * tick_program_event */ @@ -96,7 +28,7 @@ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - return tick_dev_program_event(dev, expires, force); + return clockevents_program_event(dev, expires, force); } /** @@ -104,11 +36,10 @@ int tick_program_event(ktime_t expires, int force) */ void tick_resume_oneshot(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - struct clock_event_device *dev = td->evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - tick_program_event(ktime_get(), 1); + clockevents_program_event(dev, ktime_get(), true); } /** @@ -120,7 +51,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, { newdev->event_handler = handler; clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - tick_dev_program_event(newdev, next_event, 1); + clockevents_program_event(newdev, next_event, true); } /** From 65516f8a7c2028381f0dae4c16ddb621c96158cc Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 23 Aug 2011 15:29:43 +0200 Subject: [PATCH 15/23] clockevents: Add direct ktime programming function There is at least one architecture (s390) with a sane clockevent device that can be programmed with the equivalent of a ktime. No need to create a delta against the current time, the ktime can be used directly. A new clock device function 'set_next_ktime' is introduced that is called with the unmodified ktime for the timer if the clock event device has the CLOCK_EVT_FEAT_KTIME bit set. Signed-off-by: Martin Schwidefsky Cc: john stultz Link: http://lkml.kernel.org/r/20110823133142.815350967@de.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 10 +++++++--- kernel/time/clockevents.c | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 39bb050bdbb2..81e803e90aa4 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -45,20 +45,22 @@ enum clock_event_nofitiers { */ #define CLOCK_EVT_FEAT_PERIODIC 0x000001 #define CLOCK_EVT_FEAT_ONESHOT 0x000002 +#define CLOCK_EVT_FEAT_KTIME 0x000004 /* * x86(64) specific misfeatures: * * - Clockevent source stops in C3 State and needs broadcast support. * - Local APIC timer is used as a dummy device. */ -#define CLOCK_EVT_FEAT_C3STOP 0x000004 -#define CLOCK_EVT_FEAT_DUMMY 0x000008 +#define CLOCK_EVT_FEAT_C3STOP 0x000008 +#define CLOCK_EVT_FEAT_DUMMY 0x000010 /** * struct clock_event_device - clock event device descriptor * @event_handler: Assigned by the framework to be called by the low * level handler of the event source - * @set_next_event: set next event function + * @set_next_event: set next event function using a clocksource delta + * @set_next_ktime: set next event function using a direct ktime value * @next_event: local storage for the next event in oneshot mode * @max_delta_ns: maximum delta value in ns * @min_delta_ns: minimum delta value in ns @@ -81,6 +83,8 @@ struct clock_event_device { void (*event_handler)(struct clock_event_device *); int (*set_next_event)(unsigned long evt, struct clock_event_device *); + int (*set_next_ktime)(ktime_t expires, + struct clock_event_device *); ktime_t next_event; u64 max_delta_ns; u64 min_delta_ns; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 713ef94eceef..1ecd6ba36d6c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -216,6 +216,10 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) return 0; + /* Shortcut for clockevent devices that can deal with ktime. */ + if (dev->features & CLOCK_EVT_FEAT_KTIME) + return dev->set_next_ktime(expires, dev); + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); if (delta <= 0) return force ? clockevents_program_min_delta(dev) : -ETIME; From 4f37a68cdaf6dea833cfdded2a3e0c47c0f006da Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 23 Aug 2011 15:29:44 +0200 Subject: [PATCH 16/23] s390: Use direct ktime path for s390 clockevent device The clock comparator on s390 uses the same format as the TOD clock. If the value in the clock comparator is smaller than the current TOD value an interrupt is pending. Use the CLOCK_EVT_FEAT_KTIME feature to get the unmodified ktime of the next clockevent expiration and use it to program the clock comparator without querying the TOD clock. Signed-off-by: Martin Schwidefsky Cc: john stultz Link: http://lkml.kernel.org/r/20110823133143.153017933@de.ibm.com Signed-off-by: Thomas Gleixner --- arch/s390/kernel/time.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index dff933065ab6..c53716487e77 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -109,10 +109,14 @@ static void fixup_clock_comparator(unsigned long long delta) set_clock_comparator(S390_lowcore.clock_comparator); } -static int s390_next_event(unsigned long delta, +static int s390_next_ktime(ktime_t expires, struct clock_event_device *evt) { - S390_lowcore.clock_comparator = get_clock() + delta; + s64 nsecs; + + nsecs = ktime_to_ns(ktime_sub(expires, ktime_get_monotonic_offset())); + do_div(nsecs, 125); + S390_lowcore.clock_comparator = TOD_UNIX_EPOCH + (nsecs << 9); set_clock_comparator(S390_lowcore.clock_comparator); return 0; } @@ -137,14 +141,15 @@ void init_cpu_timer(void) cpu = smp_processor_id(); cd = &per_cpu(comparators, cpu); cd->name = "comparator"; - cd->features = CLOCK_EVT_FEAT_ONESHOT; + cd->features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_KTIME; cd->mult = 16777; cd->shift = 12; cd->min_delta_ns = 1; cd->max_delta_ns = LONG_MAX; cd->rating = 400; cd->cpumask = cpumask_of(cpu); - cd->set_next_event = s390_next_event; + cd->set_next_ktime = s390_next_ktime; cd->set_mode = s390_set_mode; clockevents_register_device(cd); From e8abccb719377af63cb0f1fed289db405e3def16 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Sep 2011 12:42:04 +0200 Subject: [PATCH 17/23] posix-cpu-timers: Cure SMP accounting oddities David reported: Attached below is a watered-down version of rt/tst-cpuclock2.c from GLIBC. Just build it with "gcc -o test test.c -lpthread -lrt" or similar. Run it several times, and you will see cases where the main thread will measure a process clock difference before and after the nanosleep which is smaller than the cpu-burner thread's individual thread clock difference. This doesn't make any sense since the cpu-burner thread is part of the top-level process's thread group. I've reproduced this on both x86-64 and sparc64 (using both 32-bit and 64-bit binaries). For example: [davem@boricha build-x86_64-linux]$ ./test process: before(0.001221967) after(0.498624371) diff(497402404) thread: before(0.000081692) after(0.498316431) diff(498234739) self: before(0.001223521) after(0.001240219) diff(16698) [davem@boricha build-x86_64-linux]$ The diff of 'process' should always be >= the diff of 'thread'. I make sure to wrap the 'thread' clock measurements the most tightly around the nanosleep() call, and that the 'process' clock measurements are the outer-most ones. --- #include #include #include #include #include #include #include #include static pthread_barrier_t barrier; static void *chew_cpu(void *arg) { pthread_barrier_wait(&barrier); while (1) __asm__ __volatile__("" : : : "memory"); return NULL; } int main(void) { clockid_t process_clock, my_thread_clock, th_clock; struct timespec process_before, process_after; struct timespec me_before, me_after; struct timespec th_before, th_after; struct timespec sleeptime; unsigned long diff; pthread_t th; int err; err = clock_getcpuclockid(0, &process_clock); if (err) return 1; err = pthread_getcpuclockid(pthread_self(), &my_thread_clock); if (err) return 1; pthread_barrier_init(&barrier, NULL, 2); err = pthread_create(&th, NULL, chew_cpu, NULL); if (err) return 1; err = pthread_getcpuclockid(th, &th_clock); if (err) return 1; pthread_barrier_wait(&barrier); err = clock_gettime(process_clock, &process_before); if (err) return 1; err = clock_gettime(my_thread_clock, &me_before); if (err) return 1; err = clock_gettime(th_clock, &th_before); if (err) return 1; sleeptime.tv_sec = 0; sleeptime.tv_nsec = 500000000; nanosleep(&sleeptime, NULL); err = clock_gettime(th_clock, &th_after); if (err) return 1; err = clock_gettime(my_thread_clock, &me_after); if (err) return 1; err = clock_gettime(process_clock, &process_after); if (err) return 1; diff = process_after.tv_nsec - process_before.tv_nsec; printf("process: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", process_before.tv_sec, process_before.tv_nsec, process_after.tv_sec, process_after.tv_nsec, diff); diff = th_after.tv_nsec - th_before.tv_nsec; printf("thread: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", th_before.tv_sec, th_before.tv_nsec, th_after.tv_sec, th_after.tv_nsec, diff); diff = me_after.tv_nsec - me_before.tv_nsec; printf("self: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", me_before.tv_sec, me_before.tv_nsec, me_after.tv_sec, me_after.tv_nsec, diff); return 0; } This is due to us using p->se.sum_exec_runtime in thread_group_cputime() where we iterate the thread group and sum all data. This does not take time since the last schedule operation (tick or otherwise) into account. We can cure this by using task_sched_runtime() at the cost of having to take locks. This also means we can (and must) do away with thread_group_sched_runtime() since the modified thread_group_cputime() is now more accurate and would deadlock when called from thread_group_sched_runtime(). Reported-by: David Miller Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1314874459.7945.22.camel@twins Cc: stable@kernel.org Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 - kernel/posix-cpu-timers.c | 5 +++-- kernel/sched.c | 24 ------------------------ 3 files changed, 3 insertions(+), 27 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 20b03bf94748..2909fe7a622d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1955,7 +1955,6 @@ static inline void disable_sched_clock_irqtime(void) {} extern unsigned long long task_sched_runtime(struct task_struct *task); -extern unsigned long long thread_group_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 58f405b581e7..c8008dd58ef2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); - times->sum_exec_runtime += t->se.sum_exec_runtime; + times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: rcu_read_unlock(); @@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock, cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = thread_group_sched_runtime(p); + thread_group_cputime(p, &cputime); + cpu->sched = cputime.sum_exec_runtime; break; } return 0; diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..e1290ecee3c2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3724,30 +3724,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -/* - * Return sum_exec_runtime for the thread group. - * In case the task is currently running, return the sum plus current's - * pending runtime that have not been accounted yet. - * - * Note that the thread group might have other running tasks as well, - * so the return value not includes other pending runtime that other - * running tasks might have. - */ -unsigned long long thread_group_sched_runtime(struct task_struct *p) -{ - struct task_cputime totals; - unsigned long flags; - struct rq *rq; - u64 ns; - - rq = task_rq_lock(p, &flags); - thread_group_cputime(p, &totals); - ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to From 9fb60336253edf73dedc527b2aa2bf32eae0d6da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 Sep 2011 13:32:23 +0200 Subject: [PATCH 18/23] clocksource: Make watchdog reset lockless KGDB needs to trylock watchdog_lock when trying to reset the clocksource watchdog after the system has been stopped to avoid a potential deadlock. When the trylock fails TSC usually becomes unstable. We can be more clever by using an atomic counter and checking it in the clocksource_watchdog callback. We restart the watchdog whenever the counter is > 0 and only decrement the counter when we ran through a full update cycle. Signed-off-by: Thomas Gleixner Cc: John Stultz Acked-by: Jason Wessel Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1109121326280.2723@ionos Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0980f0d9a0a..cf52fda2e096 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -186,6 +186,7 @@ static struct timer_list watchdog_timer; static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; +static atomic_t watchdog_reset_pending; static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); @@ -247,12 +248,14 @@ static void clocksource_watchdog(unsigned long data) struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; - int next_cpu; + int next_cpu, reset_pending; spin_lock(&watchdog_lock); if (!watchdog_running) goto out; + reset_pending = atomic_read(&watchdog_reset_pending); + list_for_each_entry(cs, &watchdog_list, wd_list) { /* Clocksource already marked unstable? */ @@ -268,7 +271,8 @@ static void clocksource_watchdog(unsigned long data) local_irq_enable(); /* Clocksource initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || + atomic_read(&watchdog_reset_pending)) { cs->flags |= CLOCK_SOURCE_WATCHDOG; cs->wd_last = wdnow; cs->cs_last = csnow; @@ -283,8 +287,11 @@ static void clocksource_watchdog(unsigned long data) cs->cs_last = csnow; cs->wd_last = wdnow; + if (atomic_read(&watchdog_reset_pending)) + continue; + /* Check the deviation from the watchdog clocksource. */ - if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { clocksource_unstable(cs, cs_nsec - wd_nsec); continue; } @@ -302,6 +309,13 @@ static void clocksource_watchdog(unsigned long data) } } + /* + * We only clear the watchdog_reset_pending, when we did a + * full cycle through all clocksources. + */ + if (reset_pending) + atomic_dec(&watchdog_reset_pending); + /* * Cycle through CPUs to check if the CPUs stay synchronized * to each other. @@ -344,23 +358,7 @@ static inline void clocksource_reset_watchdog(void) static void clocksource_resume_watchdog(void) { - unsigned long flags; - - /* - * We use trylock here to avoid a potential dead lock when - * kgdb calls this code after the kernel has been stopped with - * watchdog_lock held. When watchdog_lock is held we just - * return and accept, that the watchdog might trigger and mark - * the monitored clock source (usually TSC) unstable. - * - * This does not affect the other caller clocksource_resume() - * because at this point the kernel is UP, interrupts are - * disabled and nothing can hold watchdog_lock. - */ - if (!spin_trylock_irqsave(&watchdog_lock, flags)) - return; - clocksource_reset_watchdog(); - spin_unlock_irqrestore(&watchdog_lock, flags); + atomic_inc(&watchdog_reset_pending); } static void clocksource_enqueue_watchdog(struct clocksource *cs) From 4523f6ada86853750565c68e17126af2e3df9b8a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 14 Sep 2011 10:54:29 +0200 Subject: [PATCH 19/23] alarmtimers: Fix error handling commit 8bc0daf (alarmtimers: Rework RTC device selection using class interface) did not implement required error checks. Add them. Signed-off-by: Thomas Gleixner --- kernel/time/alarmtimer.c | 43 ++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 154d5563ab1b..c436e790b21b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -100,19 +100,25 @@ static struct class_interface alarmtimer_rtc_interface = { .add_dev = &alarmtimer_rtc_add_device, }; -static void alarmtimer_rtc_interface_setup(void) +static int alarmtimer_rtc_interface_setup(void) { alarmtimer_rtc_interface.class = rtc_class; - class_interface_register(&alarmtimer_rtc_interface); + return class_interface_register(&alarmtimer_rtc_interface); +} +static void alarmtimer_rtc_interface_remove(void) +{ + class_interface_unregister(&alarmtimer_rtc_interface); } #else -#define alarmtimer_get_rtcdev() (0) -#define rtcdev (0) -#define alarmtimer_rtc_interface_setup() +static inline struct rtc_device *alarmtimer_get_rtcdev(void) +{ + return NULL; +} +#define rtcdev (NULL) +static inline int alarmtimer_rtc_interface_setup(void) { return 0; } +static inline void alarmtimer_rtc_interface_remove(void) { } #endif - - /** * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue * @base: pointer to the base where the timer is being run @@ -764,6 +770,7 @@ static struct platform_driver alarmtimer_driver = { */ static int __init alarmtimer_init(void) { + struct platform_device *pdev; int error = 0; int i; struct k_clock alarm_clock = { @@ -793,11 +800,25 @@ static int __init alarmtimer_init(void) alarm_bases[i].timer.function = alarmtimer_fired; } - alarmtimer_rtc_interface_setup(); - error = platform_driver_register(&alarmtimer_driver); - platform_device_register_simple("alarmtimer", -1, NULL, 0); + error = alarmtimer_rtc_interface_setup(); + if (error) + return error; + error = platform_driver_register(&alarmtimer_driver); + if (error) + goto out_if; + + pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); + if (IS_ERR(pdev)) { + error = PTR_ERR(pdev); + goto out_drv; + } + return 0; + +out_drv: + platform_driver_unregister(&alarmtimer_driver); +out_if: + alarmtimer_rtc_interface_remove(); return error; } device_initcall(alarmtimer_init); - From cbbc719fccdb8cbd87350a05c0d33167c9b79365 Mon Sep 17 00:00:00 2001 From: hank Date: Tue, 20 Sep 2011 13:53:39 -0700 Subject: [PATCH 20/23] time: Change jiffies_to_clock_t() argument type to unsigned long The parameter's origin type is long. On an i386 architecture, it can easily be larger than 0x80000000, causing this function to convert it to a sign-extended u64 type. Change the type to unsigned long so we get the correct result. Signed-off-by: hank Cc: John Stultz Cc: [ build fix ] Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/jiffies.h | 2 +- kernel/time.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index f97672a36fa8..265e2c3cbd1c 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -303,7 +303,7 @@ extern void jiffies_to_timespec(const unsigned long jiffies, extern unsigned long timeval_to_jiffies(const struct timeval *value); extern void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value); -extern clock_t jiffies_to_clock_t(long x); +extern clock_t jiffies_to_clock_t(unsigned long x); extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); diff --git a/kernel/time.c b/kernel/time.c index 8e8dc6d705c9..d77606214529 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval); /* * Convert jiffies/jiffies_64 to clock_t and back. */ -clock_t jiffies_to_clock_t(long x) +clock_t jiffies_to_clock_t(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ From dcb69290af30f7ef54e03bf82e1be0950f167789 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 16 Aug 2011 15:51:03 -0700 Subject: [PATCH 21/23] time: Cleanup old CONFIG_GENERIC_TIME references that snuck in Awhile back I removed all the CONFIG_GENERIC_TIME referecnes as the last of the non-GENERIC_TIME arches were converted. However, due to the functionality being important and around for awhile, there apparently were some out of tree hardware enablement patches that used it and have since been merged. This patch removes the remaining instances of GENERIC_TIME. Singed-off-by: John Stultz --- arch/arm/Kconfig | 4 ---- arch/mn10300/Kconfig | 3 --- arch/tile/Kconfig | 3 --- arch/tile/configs/tilegx_defconfig | 1 - arch/tile/configs/tilepro_defconfig | 1 - arch/um/defconfig | 1 - arch/xtensa/configs/iss_defconfig | 1 - arch/xtensa/configs/s6105_defconfig | 1 - 8 files changed, 15 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2c71a8f3535a..37cc7227732e 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -347,7 +347,6 @@ config ARCH_GEMINI config ARCH_PRIMA2 bool "CSR SiRFSoC PRIMA2 ARM Cortex A9 Platform" select CPU_V7 - select GENERIC_TIME select NO_IOPORT select GENERIC_CLOCKEVENTS select CLKDEV_LOOKUP @@ -520,7 +519,6 @@ config ARCH_LPC32XX select ARM_AMBA select USB_ARCH_HAS_OHCI select CLKDEV_LOOKUP - select GENERIC_TIME select GENERIC_CLOCKEVENTS help Support for the NXP LPC32XX family of processors @@ -599,7 +597,6 @@ config ARCH_TEGRA bool "NVIDIA Tegra" select CLKDEV_LOOKUP select CLKSRC_MMIO - select GENERIC_TIME select GENERIC_CLOCKEVENTS select GENERIC_GPIO select HAVE_CLK @@ -911,7 +908,6 @@ config ARCH_VT8500 config ARCH_ZYNQ bool "Xilinx Zynq ARM Cortex A9 Platform" select CPU_V7 - select GENERIC_TIME select GENERIC_CLOCKEVENTS select CLKDEV_LOOKUP select ARM_GIC diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 1f870340ebdd..5f7f2f8deb89 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -47,9 +47,6 @@ config GENERIC_CMOS_UPDATE config GENERIC_HWEIGHT def_bool y -config GENERIC_TIME - def_bool y - config GENERIC_CLOCKEVENTS def_bool y diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index b30f71ac0d06..70a0de46cd1b 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -46,9 +46,6 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK config SYS_SUPPORTS_HUGETLBFS def_bool y -config GENERIC_TIME - def_bool y - config GENERIC_CLOCKEVENTS def_bool y diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig index 2ad73fb707b9..dafdbbae1124 100644 --- a/arch/tile/configs/tilegx_defconfig +++ b/arch/tile/configs/tilegx_defconfig @@ -11,7 +11,6 @@ CONFIG_HAVE_ARCH_ALLOC_REMAP=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_SYS_SUPPORTS_HUGETLBFS=y -CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_DEFAULT_MIGRATION_COST=10000000 diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig index f58dc362b944..6f05f969b564 100644 --- a/arch/tile/configs/tilepro_defconfig +++ b/arch/tile/configs/tilepro_defconfig @@ -11,7 +11,6 @@ CONFIG_HAVE_ARCH_ALLOC_REMAP=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_SYS_SUPPORTS_HUGETLBFS=y -CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_DEFAULT_MIGRATION_COST=10000000 diff --git a/arch/um/defconfig b/arch/um/defconfig index 9f7634f08cf3..761f5e1a657e 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -13,7 +13,6 @@ CONFIG_LOCKDEP_SUPPORT=y # CONFIG_STACKTRACE_SUPPORT is not set CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_GENERIC_BUG=y -CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_IRQ_RELEASE_METHOD=y CONFIG_HZ=100 diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index 0234cd198c54..f932b30b47fb 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -15,7 +15,6 @@ CONFIG_GENERIC_GPIO=y # CONFIG_ARCH_HAS_ILOG2_U64 is not set CONFIG_NO_IOPORT=y CONFIG_HZ=100 -CONFIG_GENERIC_TIME=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" CONFIG_CONSTRUCTORS=y diff --git a/arch/xtensa/configs/s6105_defconfig b/arch/xtensa/configs/s6105_defconfig index 4891abbf16bc..550e8ed5b5c6 100644 --- a/arch/xtensa/configs/s6105_defconfig +++ b/arch/xtensa/configs/s6105_defconfig @@ -15,7 +15,6 @@ CONFIG_GENERIC_GPIO=y # CONFIG_ARCH_HAS_ILOG2_U64 is not set CONFIG_NO_IOPORT=y CONFIG_HZ=100 -CONFIG_GENERIC_TIME=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # From a1330228f9eec7e355d41f45c17e1297d681f40d Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Mon, 25 Jul 2011 16:34:37 +0100 Subject: [PATCH 22/23] dw_apb_timer: constify clocksource name The clocksource name should be const for correctness. Cc: John Stultz Signed-off-by: Jamie Iles Signed-off-by: John Stultz --- drivers/clocksource/dw_apb_timer.c | 2 +- include/linux/dw_apb_timer.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/dw_apb_timer.c b/drivers/clocksource/dw_apb_timer.c index 580f870541a3..8c2a35f26d9b 100644 --- a/drivers/clocksource/dw_apb_timer.c +++ b/drivers/clocksource/dw_apb_timer.c @@ -348,7 +348,7 @@ static void apbt_restart_clocksource(struct clocksource *cs) * dw_apb_clocksource_register() as the next step. */ struct dw_apb_clocksource * -dw_apb_clocksource_init(unsigned rating, char *name, void __iomem *base, +dw_apb_clocksource_init(unsigned rating, const char *name, void __iomem *base, unsigned long freq) { struct dw_apb_clocksource *dw_cs = kzalloc(sizeof(*dw_cs), GFP_KERNEL); diff --git a/include/linux/dw_apb_timer.h b/include/linux/dw_apb_timer.h index 49638ea3b776..07261d52a6df 100644 --- a/include/linux/dw_apb_timer.h +++ b/include/linux/dw_apb_timer.h @@ -46,7 +46,7 @@ struct dw_apb_clock_event_device * dw_apb_clockevent_init(int cpu, const char *name, unsigned rating, void __iomem *base, int irq, unsigned long freq); struct dw_apb_clocksource * -dw_apb_clocksource_init(unsigned rating, char *name, void __iomem *base, +dw_apb_clocksource_init(unsigned rating, const char *name, void __iomem *base, unsigned long freq); void dw_apb_clocksource_register(struct dw_apb_clocksource *dw_cs); void dw_apb_clocksource_start(struct dw_apb_clocksource *dw_cs); From e35f95b36e43f67a6f806172555a152c11ea0a78 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 22 Sep 2011 09:19:17 +0200 Subject: [PATCH 23/23] time, s390: Get rid of compile warning "s390: Use direct ktime path for s390 clockevent device" in linux-next introduces this compile warning: arch/s390/kernel/time.c: In function 's390_next_ktime': arch/s390/kernel/time.c:118:2: warning: comparison of distinct pointer types lacks a cast [enabled by default] Just use a u64 instead of an s64 variable. This is not a problem since it will always contain a positive value. Signed-off-by: Heiko Carstens Cc: Martin Schwidefsky Link: http://lkml.kernel.org/r/1316675957-5538-1-git-send-email-heiko.carstens@de.ibm.com Signed-off-by: Thomas Gleixner --- arch/s390/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index c53716487e77..8d65bd0383fc 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -112,7 +112,7 @@ static void fixup_clock_comparator(unsigned long long delta) static int s390_next_ktime(ktime_t expires, struct clock_event_device *evt) { - s64 nsecs; + u64 nsecs; nsecs = ktime_to_ns(ktime_sub(expires, ktime_get_monotonic_offset())); do_div(nsecs, 125);