From 1aa5dfb751d275ae7117d3b73ac423b4a46f2a73 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 20 Aug 2008 16:37:28 -0700 Subject: clocksource: keep track of original clocksource frequency The clocksource frequency is represented by clocksource->mult/2^(clocksource->shift). Currently, when NTP makes adjustments to the clock frequency, they are made directly to the mult value. This has the drawback that once changed, we cannot know what the orignal mult value was, or how much adjustment has been applied. This property causes problems in calculating proper ntp intervals when switching back and forth between clocksources. This patch separates the current mult value into a mult and mult_orig pair. The mult_orig value stays constant, while the ntp clocksource adjustments are done only to the mult value. This allows for correct ntp interval calculation and additionally lays the groundwork for a new notion of time, what I'm calling the monotonic-raw time, which is introduced in a following patch. Signed-off-by: John Stultz Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 3 +++ kernel/time/jiffies.c | 1 + 2 files changed, 4 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 093d4acf993..9ed2eec9752 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; + /* save mult_orig on registration */ + c->mult_orig = c->mult; + spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 4c256fdb887..1ca99557e92 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; -- cgit v1.2.3-70-g09d2 From 9a055117d3d9cb562f83f8d4cd88772761f4cab0 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 20 Aug 2008 16:37:28 -0700 Subject: clocksource: introduce clocksource_forward_now() To keep the raw monotonic patch simple first introduce clocksource_forward_now(), which takes care of the offset since the last update_wall_time() call and adds it to the clock, so there is no need anymore to deal with it explicitly at various places, which need to make significant changes to the clock. This is also gets rid of the timekeeping_suspend_nsecs, instead of waiting until resume, the value is accumulated during suspend. In the end there is only a single user of __get_nsec_offset() left, so I integrated it back to getnstimeofday(). Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 71 ++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 38 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e91c29f961c..83d3555a699 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,27 +58,23 @@ struct clocksource *clock; #ifdef CONFIG_GENERIC_TIME /** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * clocksource_forward_now - update clock to the current time * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. */ -static inline s64 __get_nsec_offset(void) +static void clocksource_forward_now(void) { cycle_t cycle_now, cycle_delta; - s64 ns_offset; + s64 nsec; - /* read clocksource: */ cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + clock->cycle_last = cycle_now; - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); - - return ns_offset; + nsec = cyc2ns(clock, cycle_delta); + timespec_add_ns(&xtime, nsec); } /** @@ -89,6 +85,7 @@ static inline s64 __get_nsec_offset(void) */ void getnstimeofday(struct timespec *ts) { + cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@ -96,7 +93,15 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; - nsecs = __get_nsec_offset(); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = cyc2ns(clock, cycle_delta); } while (read_seqretry(&xtime_lock, seq)); @@ -129,22 +134,22 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(struct timespec *tv) { + struct timespec ts_delta; unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; write_seqlock_irqsave(&xtime_lock, flags); - nsec -= __get_nsec_offset(); + clocksource_forward_now(); + + ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + xtime = *tv; - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); update_xtime_cache(0); clock->error = 0; @@ -170,22 +175,17 @@ EXPORT_SYMBOL(do_settimeofday); static void change_clocksource(void) { struct clocksource *new; - cycle_t now; - u64 nsec; new = clocksource_get_next(); if (clock == new) return; - new->cycle_last = 0; - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); + clocksource_forward_now(); clock = new; - clock->cycle_last = now; - + clock->cycle_last = 0; + clock->cycle_last = clocksource_read(new); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -200,8 +200,8 @@ static void change_clocksource(void) */ } #else +static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -static inline s64 __get_nsec_offset(void) { return 0; } #endif /** @@ -265,8 +265,6 @@ void __init timekeeping_init(void) static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; -/* xtime offset when we went into suspend */ -static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -292,8 +290,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } - /* Make sure that we have the correct xtime reference */ - timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ clock->cycle_last = 0; @@ -319,8 +315,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - /* Get the current xtime offset */ - timekeeping_suspend_nsecs = __get_nsec_offset(); + clocksource_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -461,10 +456,10 @@ void update_wall_time(void) */ while (offset >= clock->cycle_interval) { /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; offset -= clock->cycle_interval; + clock->cycle_last += clock->cycle_interval; + clock->xtime_nsec += clock->xtime_interval; if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; -- cgit v1.2.3-70-g09d2 From 2d42244ae71d6c7b0884b5664cf2eda30fb2ae68 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 20 Aug 2008 16:37:30 -0700 Subject: clocksource: introduce CLOCK_MONOTONIC_RAW In talking with Josip Loncaric, and his work on clock synchronization (see btime.sf.net), he mentioned that for really close synchronization, it is useful to have access to "hardware time", that is a notion of time that is not in any way adjusted by the clock slewing done to keep close time sync. Part of the issue is if we are using the kernel's ntp adjusted representation of time in order to measure how we should correct time, we can run into what Paul McKenney aptly described as "Painting a road using the lines we're painting as the guide". I had been thinking of a similar problem, and was trying to come up with a way to give users access to a purely hardware based time representation that avoided users having to know the underlying frequency and mask values needed to deal with the wide variety of possible underlying hardware counters. My solution is to introduce CLOCK_MONOTONIC_RAW. This exposes a nanosecond based time value, that increments starting at bootup and has no frequency adjustments made to it what so ever. The time is accessed from userspace via the posix_clock_gettime() syscall, passing CLOCK_MONOTONIC_RAW as the clock_id. Signed-off-by: John Stultz Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/clocksource.h | 3 +++ include/linux/time.h | 2 ++ kernel/posix-timers.c | 15 +++++++++++++++ kernel/time/timekeeping.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 64 insertions(+) (limited to 'kernel/time') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index f0a7fb98441..f88d32f8ff7 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -79,6 +79,7 @@ struct clocksource { /* timekeeping specific data, ignore */ cycle_t cycle_interval; u64 xtime_interval; + u32 raw_interval; /* * Second part is written at each timer interrupt * Keep it in a different cache line to dirty no @@ -87,6 +88,7 @@ struct clocksource { cycle_t cycle_last ____cacheline_aligned_in_smp; u64 xtime_nsec; s64 error; + struct timespec raw_time; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG /* Watchdog related data, used by the framework */ @@ -215,6 +217,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c, /* Go back from cycles -> shifted ns, this time use ntp adjused mult */ c->xtime_interval = (u64)c->cycle_interval * c->mult; + c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift; } diff --git a/include/linux/time.h b/include/linux/time.h index e15206a7e82..205f974b9eb 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -117,6 +117,7 @@ extern int do_setitimer(int which, struct itimerval *value, extern unsigned int alarm_setitimer(unsigned int seconds); extern int do_getitimer(int which, struct itimerval *value); extern void getnstimeofday(struct timespec *tv); +extern void getrawmonotonic(struct timespec *ts); extern void getboottime(struct timespec *ts); extern void monotonic_to_bootbased(struct timespec *ts); @@ -214,6 +215,7 @@ struct itimerval { #define CLOCK_MONOTONIC 1 #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 +#define CLOCK_MONOTONIC_RAW 4 /* * The IDs of various hardware clocks: diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e36d5798cbf..d3c66b53dff 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -222,6 +222,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) return 0; } +/* + * Get monotonic time for posix timers + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +{ + getrawmonotonic(tp); + return 0; +} + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -235,9 +244,15 @@ static __init int init_posix_timers(void) .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, }; + struct k_clock clock_monotonic_raw = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + .clock_set = do_posix_clock_nosettime, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); + register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 83d3555a699..5099c95b8aa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -75,6 +75,9 @@ static void clocksource_forward_now(void) nsec = cyc2ns(clock, cycle_delta); timespec_add_ns(&xtime, nsec); + + nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + clock->raw_time.tv_nsec += nsec; } /** @@ -183,6 +186,8 @@ static void change_clocksource(void) clocksource_forward_now(); + new->raw_time = clock->raw_time; + clock = new; clock->cycle_last = 0; clock->cycle_last = clocksource_read(new); @@ -204,6 +209,39 @@ static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } #endif +/** + * getrawmonotonic - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the raw monotonic time (completely un-modified by ntp) + */ +void getrawmonotonic(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + cycle_t cycle_now, cycle_delta; + + do { + seq = read_seqbegin(&xtime_lock); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + + *ts = clock->raw_time; + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(getrawmonotonic); + + /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ @@ -466,6 +504,12 @@ void update_wall_time(void) second_overflow(); } + clock->raw_time.tv_nsec += clock->raw_interval; + if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { + clock->raw_time.tv_nsec -= NSEC_PER_SEC; + clock->raw_time.tv_sec++; + } + /* accumulate error between NTP and clock interval */ clock->error += tick_length; clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); -- cgit v1.2.3-70-g09d2 From 916c7a855174e3b53d182b97a26b2e27a29726a1 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 20 Aug 2008 16:46:08 -0700 Subject: ntp: fix ADJ_OFFSET_SS_READ bug and do_adjtimex() cleanup Thanks to the review by Michael Kerrisk a bug in the recent ADJ_OFFSET_SS_READ option was discovered, where the ntp time_offset was inadvertently set by it. This fixes this by making the adjtime code more separate from the ntp_adjtime code (both of which really want to be separate syscalls). Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Acked-by: John Stultz Signed-off-by: Ingo Molnar --- include/linux/timex.h | 9 +++++- kernel/time/ntp.c | 76 +++++++++++++++++++++++++++------------------------ 2 files changed, 48 insertions(+), 37 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/timex.h b/include/linux/timex.h index fc6035d29d5..c00bcdd3ae4 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -141,8 +141,15 @@ struct timex { #define ADJ_MICRO 0x1000 /* select microsecond resolution */ #define ADJ_NANO 0x2000 /* select nanosecond resolution */ #define ADJ_TICK 0x4000 /* tick value */ + +#ifdef __KERNEL__ +#define ADJ_ADJTIME 0x8000 /* switch between adjtime/adjtimex modes */ +#define ADJ_OFFSET_SINGLESHOT 0x0001 /* old-fashioned adjtime */ +#define ADJ_OFFSET_READONLY 0x2000 /* read-only adjtime */ +#else #define ADJ_OFFSET_SINGLESHOT 0x8001 /* old-fashioned adjtime */ -#define ADJ_OFFSET_SS_READ 0xa001 /* read-only adjtime */ +#define ADJ_OFFSET_SS_READ 0xa001 /* read-only adjtime */ +#endif /* xntp 3.4 compatibility names */ #define MOD_OFFSET ADJ_OFFSET diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5125ddd8196..c6921aa1a42 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -277,38 +277,50 @@ static inline void notify_cmos_timer(void) { } int do_adjtimex(struct timex *txc) { struct timespec ts; - long save_adjust, sec; int result; - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { + /* Validate the data before disabling interrupts */ + if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ - if (txc->modes & ~ADJ_OFFSET_SS_READ) + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* if the quartz is off by more than 10% something is VERY wrong! */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + + if (txc->modes & ADJ_STATUS && time_state != TIME_OK) + hrtimer_cancel(&leap_timer); } - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - if (time_state != TIME_OK && txc->modes & ADJ_STATUS) - hrtimer_cancel(&leap_timer); getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_adjust; - /* If there are input parameters, then process them */ + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + + if (!(txc->modes & ADJ_OFFSET_READONLY)) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + ntp_update_frequency(); + } + txc->offset = save_adjust; + goto adj_done; + } if (txc->modes) { + long sec; + if (txc->modes & ADJ_STATUS) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { @@ -375,13 +387,8 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_TAI && txc->constant > 0) time_tai = txc->constant; - if (txc->modes & ADJ_OFFSET) { - if (txc->modes == ADJ_OFFSET_SINGLESHOT) - /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - else - ntp_update_offset(txc->offset); - } + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) tick_usec = txc->tick; @@ -389,19 +396,16 @@ int do_adjtimex(struct timex *txc) ntp_update_frequency(); } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + +adj_done: result = time_state; /* mostly `TIME_OK' */ if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; - if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || - (txc->modes == ADJ_OFFSET_SS_READ)) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; - } txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); -- cgit v1.2.3-70-g09d2 From eb3f938fd6292dc79f43a5fe14784b044776e9f0 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 22 Sep 2008 14:42:40 -0700 Subject: ntp: let update_persistent_clock() sleep This is a change that makes the 11-minute RTC update be run in the process context. This is so that update_persistent_clock() can sleep, which may be required for certain types of RTC hardware -- most notably I2C devices. Signed-off-by: Maciej W. Rozycki Cc: Roman Zippel Cc: Rik van Riel Cc: David Brownell Acked-by: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c6921aa1a42..450a45cb01c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,13 +10,13 @@ #include #include -#include #include #include #include #include #include #include +#include #include /* @@ -218,11 +218,11 @@ void second_overflow(void) /* Disable the cmos update - used by virtualization and embedded */ int no_sync_cmos_clock __read_mostly; -static void sync_cmos_clock(unsigned long dummy); +static void sync_cmos_clock(struct work_struct *work); -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); -static void sync_cmos_clock(unsigned long dummy) +static void sync_cmos_clock(struct work_struct *work) { struct timespec now, next; int fail = 1; @@ -258,13 +258,13 @@ static void sync_cmos_clock(unsigned long dummy) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); + schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } static void notify_cmos_timer(void) { if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + schedule_delayed_work(&sync_cmos_work, 0); } #else -- cgit v1.2.3-70-g09d2 From 5cd1c9c5cf30d4b33df3d3f74d8142f278d536b7 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 22 Sep 2008 14:42:43 -0700 Subject: timekeeping: fix rounding problem during clock update Due to a rounding problem during a clock update it's possible for readers to observe the clock jumping back by 1nsec. The following simplified example demonstrates the problem: cycle xtime 0 0 1000 999999.6 2000 1999999.2 3000 2999998.8 ... 1500 = 1499999.4 = 0.0 + 1499999.4 = 999999.6 + 499999.8 When reading the clock only the full nanosecond part is used, while timekeeping internally keeps nanosecond fractions. If the clock is now updated at cycle 1500 here, a nanosecond is missing due to the truncation. The simple fix is to round up the xtime value during the update, this also changes the distance to the reference time, but the adjustment will automatically take care that it stays under control. Signed-off-by: Roman Zippel Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e91c29f961c..5ecbfc39a26 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -454,7 +454,7 @@ void update_wall_time(void) #else offset = clock->cycle_interval; #endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. @@ -479,9 +479,12 @@ void update_wall_time(void) /* correct the clock when NTP error is too big */ clocksource_adjust(offset); - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; + /* store full nanoseconds into xtime after rounding it up and + * add the remainder to the error difference. + */ + xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); update_xtime_cache(cyc2ns(clock, offset)); -- cgit v1.2.3-70-g09d2 From d40e944c25fb4642adb2a4c580a48218a9f3f824 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 22 Sep 2008 14:42:44 -0700 Subject: ntp: improve adjtimex frequency rounding Change PPM_SCALE_INV_SHIFT so that it doesn't throw away any input bits (19 is the amount of the factor 2 in PPM_SCALE), the output frequency can then be calculated back to its input value, as the inverse divide produce a slightly larger value, which is then correctly rounded by the final shift. Reported-by: Martin Ziegler Signed-off-by: Roman Zippel Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 2 +- kernel/time/ntp.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/timex.h b/include/linux/timex.h index c00bcdd3ae4..9007313b5b7 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -82,7 +82,7 @@ */ #define SHIFT_USEC 16 /* frequency offset scale (shift) */ #define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) -#define PPM_SCALE_INV_SHIFT 20 +#define PPM_SCALE_INV_SHIFT 19 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ PPM_SCALE + 1) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 450a45cb01c..ddb0465a6ba 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -406,9 +406,8 @@ adj_done: if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; - txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, - NTP_SCALE_SHIFT); + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; -- cgit v1.2.3-70-g09d2 From 719254faa17ffedc87ba0fadb9b34e535c9758d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Oct 2008 09:59:47 +0200 Subject: NOHZ: unify the nohz function calls in irq_enter() We have two separate nohz function calls in irq_enter() for no good reason. Just call a single NOHZ function from irq_enter() and call the bits in the tick code. Signed-off-by: Thomas Gleixner --- include/linux/tick.h | 7 +++---- kernel/softirq.c | 10 +++------- kernel/time/tick-sched.c | 13 ++++++++++++- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/tick.h b/include/linux/tick.h index 98921a3e1aa..b6ec8189ac0 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -96,9 +96,11 @@ extern cpumask_t *tick_get_broadcast_oneshot_mask(void); extern void tick_clock_notify(void); extern int tick_check_oneshot_change(int allow_nohz); extern struct tick_sched *tick_get_tick_sched(int cpu); +extern void tick_check_idle(int cpu); # else static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +static inline void tick_check_idle(int cpu) { } # endif #else /* CONFIG_GENERIC_CLOCKEVENTS */ @@ -106,26 +108,23 @@ static inline void tick_init(void) { } static inline void tick_cancel_sched_timer(int cpu) { } static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +static inline void tick_check_idle(int cpu) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ extern void tick_nohz_stop_sched_tick(int inidle); extern void tick_nohz_restart_sched_tick(void); -extern void tick_nohz_update_jiffies(void); extern ktime_t tick_nohz_get_sleep_length(void); -extern void tick_nohz_stop_idle(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); # else static inline void tick_nohz_stop_sched_tick(int inidle) { } static inline void tick_nohz_restart_sched_tick(void) { } -static inline void tick_nohz_update_jiffies(void) { } static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; return len; } -static inline void tick_nohz_stop_idle(int cpu) { } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 37d67aa2d56..d410014279e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -265,16 +265,12 @@ asmlinkage void do_softirq(void) */ void irq_enter(void) { -#ifdef CONFIG_NO_HZ int cpu = smp_processor_id(); + if (idle_cpu(cpu) && !in_interrupt()) - tick_nohz_stop_idle(cpu); -#endif + tick_check_idle(cpu); + __irq_enter(); -#ifdef CONFIG_NO_HZ - if (idle_cpu(cpu)) - tick_nohz_update_jiffies(); -#endif } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b711ffcb106..fdcf3f93bb8 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -155,7 +155,7 @@ void tick_nohz_update_jiffies(void) touch_softlockup_watchdog(); } -void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -558,6 +558,17 @@ static inline void tick_nohz_switch_to_nohz(void) { } #endif /* NO_HZ */ +/* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_check_idle(int cpu) +{ +#ifdef CONFIG_NO_HZ + tick_nohz_stop_idle(cpu); + tick_nohz_update_jiffies(); +#endif +} + /* * High resolution timer specific code */ -- cgit v1.2.3-70-g09d2 From c34bec5a44e9486597d78e7a686b2f9088a0564c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Oct 2008 10:04:34 +0200 Subject: NOHZ: split tick_nohz_restart_sched_tick() Split out the clock event device reprogramming. Preparatory patch. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 49 ++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fdcf3f93bb8..7aedf434353 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -377,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } +} + /** * tick_nohz_restart_sched_tick - restart the idle tick from the idle task * @@ -430,28 +456,7 @@ void tick_nohz_restart_sched_tick(void) */ ts->tick_stopped = 0; ts->idle_exittime = now; - hrtimer_cancel(&ts->sched_timer); - ts->sched_timer.expires = ts->idle_tick; - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, - ts->sched_timer.expires, - HRTIMER_MODE_ABS); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event(ts->sched_timer.expires, 0)) - break; - } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); - now = ktime_get(); - } + tick_nohz_restart(ts, now); local_irq_enable(); } -- cgit v1.2.3-70-g09d2 From fb02fbc14d17837b4b7b02dbb36142c16a7bf208 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Oct 2008 10:01:23 +0200 Subject: NOHZ: restart tick device from irq_enter() We did not restart the tick device from irq_enter() to avoid double reprogramming and extra events in the return immediate to idle case. But long lasting softirqs can lead to a situation where jiffies become stale: idle() tick stopped (reprogrammed to next pending timer) halt() interrupt jiffies updated from irq_enter() interrupt handler softirq function 1 runs 20ms softirq function 2 arms a 10ms timer with a stale jiffies value jiffies updated from irq_exit() timer wheel has now an already expired timer (the one added in function 2) timer fires and timer softirq runs This was discovered when debugging a timer problem which happend only when the ath5k driver is active. The debugging proved that there is a softirq function running for more than 20ms, which is a bug by itself. To solve this we restart the tick timer right from irq_enter(), but do not go through the other functions which are necessary to return from idle when need_resched() is set. Reported-by: Elias Oltmanns Signed-off-by: Thomas Gleixner Tested-by: Elias Oltmanns --- kernel/time/tick-broadcast.c | 13 +++++++++++++ kernel/time/tick-internal.h | 2 ++ kernel/time/tick-sched.c | 31 +++++++++++++++++++++++-------- 3 files changed, 38 insertions(+), 8 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index cb01cd8f919..f98a1b7b16e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -383,6 +383,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) return 0; } +/* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast(int cpu) +{ + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + } +} + /* * Handle oneshot mode broadcasting */ diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 469248782c2..b1c05bf75ee 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,6 +36,7 @@ extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast(int cpu); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -45,6 +46,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast(int cpu) { } # endif /* !BROADCAST */ #else /* !ONESHOT */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7aedf434353..0581c11fe6c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -508,10 +508,6 @@ static void tick_nohz_handler(struct clock_event_device *dev) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return; - while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); @@ -557,6 +553,27 @@ static void tick_nohz_switch_to_nohz(void) smp_processor_id()); } +/* + * When NOHZ is enabled and the tick is stopped, we need to kick the + * tick timer from irq_enter() so that the jiffies update is kept + * alive during long running softirqs. That's ugly as hell, but + * correctness is key even if we need to fix the offending softirq in + * the first place. + * + * Note, this is different to tick_nohz_restart. We just kick the + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +static void tick_nohz_kick_tick(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!ts->tick_stopped) + return; + + tick_nohz_restart(ts, ktime_get()); +} + #else static inline void tick_nohz_switch_to_nohz(void) { } @@ -568,9 +585,11 @@ static inline void tick_nohz_switch_to_nohz(void) { } */ void tick_check_idle(int cpu) { + tick_check_oneshot_broadcast(cpu); #ifdef CONFIG_NO_HZ tick_nohz_stop_idle(cpu); tick_nohz_update_jiffies(); + tick_nohz_kick_tick(cpu); #endif } @@ -627,10 +646,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) profile_tick(CPU_PROFILING); } - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return HRTIMER_NORESTART; - hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; -- cgit v1.2.3-70-g09d2 From e67ef25a35b949561a9bd77693523ec94ab4a278 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Sep 2008 23:50:23 +0200 Subject: timer_list: print real timer address The current timer_list output prints the address of the on stack copy of the active hrtimer instead of the hrtimer itself. Print the address of the real timer instead. Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a40e20fd000..ec9ea6cadd8 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym) } static void -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) { #ifdef CONFIG_TIMER_STATS char tmp[TASK_COMM_LEN + 1]; #endif SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, timer); + print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); SEQ_printf(m, ", S:%02lx", timer->state); @@ -99,7 +100,7 @@ next_one: tmp = *timer; spin_unlock_irqrestore(&base->cpu_base->lock, flags); - print_timer(m, &tmp, i, now); + print_timer(m, timer, &tmp, i, now); next++; goto next_one; } -- cgit v1.2.3-70-g09d2 From c5b77a3d3a716a5c61a1999d7f2a78e9c39fd1b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 17:31:41 +0200 Subject: timer_list: print cpu number of clockevents device The per cpu clock events device output of timer_list lacks an association of the device to the cpu which is annoying when looking at the output of /proc/timer_list from a 128 way system. Add the CPU number info and mark the broadcast device in the device list printout. Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ec9ea6cadd8..5479c6e7a02 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -184,12 +184,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #ifdef CONFIG_GENERIC_CLOCKEVENTS static void -print_tickdevice(struct seq_file *m, struct tick_device *td) +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); SEQ_printf(m, "Clock Event Device: "); if (!dev) { @@ -223,7 +227,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) int cpu; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - print_tickdevice(m, tick_get_broadcast_device()); + print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", tick_get_broadcast_mask()->bits[0]); #ifdef CONFIG_TICK_ONESHOT @@ -233,7 +237,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) SEQ_printf(m, "\n"); #endif for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu)); + print_tickdevice(m, tick_get_device(cpu), cpu); SEQ_printf(m, "\n"); } #else -- cgit v1.2.3-70-g09d2 From 870e2a284567714335d125c390366dce882d726f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 17:41:55 +0200 Subject: timer_list: add base address to clock base The base address of a (per cpu) clock base is a useful debug info. Add it and bump the version number of timer_lists. Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 5479c6e7a02..f6426911e35 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -110,6 +110,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", @@ -249,7 +250,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "Timer List Version: v0.4\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); -- cgit v1.2.3-70-g09d2