diff options
Diffstat (limited to 'arch')
47 files changed, 2909 insertions, 3103 deletions
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index ec01f08f564..e101846ab7d 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -159,8 +159,7 @@ void __init init_IRQ(void) int irq; for (irq = 0; irq < NR_IRQS; irq++) - irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_DELAYED_DISABLE | - IRQ_NOPROBE; + irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE; #ifdef CONFIG_SMP bad_irq_desc.affinity = CPU_MASK_ALL; diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c index 40039b2a90b..2703a730baf 100644 --- a/arch/arm/mach-imx/time.c +++ b/arch/arm/mach-imx/time.c @@ -87,7 +87,7 @@ static struct clocksource clocksource_imx = { .read = imx_get_cycles, .mask = 0xFFFFFFFF, .shift = 20, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init imx_clocksource_init(void) diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 2ec9a9e9a04..45068c3d8dc 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -395,7 +395,7 @@ static struct clocksource clocksource_ixp4xx = { .read = ixp4xx_get_cycles, .mask = CLOCKSOURCE_MASK(32), .shift = 20, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; unsigned long ixp4xx_timer_freq = FREQ; diff --git a/arch/arm/mach-netx/time.c b/arch/arm/mach-netx/time.c index 5773b55ef4a..7e132fcccd4 100644 --- a/arch/arm/mach-netx/time.c +++ b/arch/arm/mach-netx/time.c @@ -62,7 +62,7 @@ static struct clocksource clocksource_netx = { .read = netx_get_cycles, .mask = CLOCKSOURCE_MASK(32), .shift = 20, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; /* diff --git a/arch/arm/mach-pxa/time.c b/arch/arm/mach-pxa/time.c index ee2beb40041..fc3b82a740a 100644 --- a/arch/arm/mach-pxa/time.c +++ b/arch/arm/mach-pxa/time.c @@ -112,7 +112,7 @@ static struct clocksource clocksource_pxa = { .read = pxa_get_cycles, .mask = CLOCKSOURCE_MASK(32), .shift = 20, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static void __init pxa_timer_init(void) diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c index a2f74affaa9..c10833f2ee0 100644 --- a/arch/avr32/kernel/time.c +++ b/arch/avr32/kernel/time.c @@ -37,7 +37,7 @@ static struct clocksource clocksource_avr32 = { .read = read_cycle_count, .mask = CLOCKSOURCE_MASK(32), .shift = 16, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; /* diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 595fb771366..1df4a1f1428 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -18,6 +18,18 @@ config GENERIC_TIME bool default y +config CLOCKSOURCE_WATCHDOG + bool + default y + +config GENERIC_CLOCKEVENTS + bool + default y + +config GENERIC_CLOCKEVENTS_BROADCAST + bool + default y + config LOCKDEP_SUPPORT bool default y @@ -74,6 +86,8 @@ source "init/Kconfig" menu "Processor type and features" +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- @@ -205,7 +219,7 @@ config PARAVIRT config VMI bool "VMI Paravirt-ops support" - depends on PARAVIRT + depends on PARAVIRT && !NO_HZ default y help VMI provides a paravirtualized interface to multiple hypervisors diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index cbe4e601885..4ae3dcf1d2f 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o -obj-$(CONFIG_X86_SMP) += smp.o smpboot.o +obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o @@ -32,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o vsyscall.o obj-$(CONFIG_ACPI_SRAT) += srat.o -obj-$(CONFIG_HPET_TIMER) += time_hpet.o obj-$(CONFIG_EFI) += efi.o efi_stub.o obj-$(CONFIG_DOUBLEFAULT) += doublefault.o obj-$(CONFIG_VM86) += vm86.o diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index e94aff6888c..fb3e72328a5 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -25,6 +25,7 @@ #include <linux/init.h> #include <linux/acpi.h> +#include <linux/acpi_pmtmr.h> #include <linux/efi.h> #include <linux/cpumask.h> #include <linux/module.h> @@ -615,6 +616,7 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table) } #ifdef CONFIG_HPET_TIMER +#include <asm/hpet.h> static int __init acpi_parse_hpet(struct acpi_table_header *table) { @@ -645,24 +647,11 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) hpet_res->end = (1 * 1024) - 1; } -#ifdef CONFIG_X86_64 - vxtime.hpet_address = hpet_tbl->address.address; - + hpet_address = hpet_tbl->address.address; printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, vxtime.hpet_address); - - res_start = vxtime.hpet_address; -#else /* X86 */ - { - extern unsigned long hpet_address; - - hpet_address = hpet_tbl->address.address; - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); + hpet_tbl->id, hpet_address); - res_start = hpet_address; - } -#endif /* X86 */ + res_start = hpet_address; if (hpet_res) { hpet_res->start = res_start; @@ -676,10 +665,6 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) #define acpi_parse_hpet NULL #endif -#ifdef CONFIG_X86_PM_TIMER -extern u32 pmtmr_ioport; -#endif - static int __init acpi_parse_fadt(struct acpi_table_header *table) { diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index f4159e0a7ae..9655c233e6f 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -25,6 +25,8 @@ #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/cpu.h> +#include <linux/clockchips.h> +#include <linux/acpi_pmtmr.h> #include <linux/module.h> #include <asm/atomic.h> @@ -45,128 +47,549 @@ #include "io_ports.h" /* - * cpu_mask that denotes the CPUs that needs timer interrupt coming in as - * IPIs in place of local APIC timers + * Sanity check */ -static cpumask_t timer_bcast_ipi; +#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F +# error SPURIOUS_APIC_VECTOR definition error +#endif /* * Knob to control our willingness to enable the local APIC. + * + * -1=force-disable, +1=force-enable */ -static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ - -static inline void lapic_disable(void) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); -} +static int enable_local_apic __initdata = 0; -static inline void lapic_enable(void) -{ - enable_local_apic = 1; -} +/* Local APIC timer verification ok */ +static int local_apic_timer_verify_ok; /* - * Debug level + * Debug level, exported for io_apic.c */ int apic_verbosity; +static unsigned int calibration_result; +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt); +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt); +static void lapic_timer_broadcast(cpumask_t mask); static void apic_pm_activate(void); +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a seperate chip + */ +static inline int lapic_is_integrated(void) +{ + return APIC_INTEGRATED(lapic_get_version()); +} + +/* + * Check, whether this is a modern or a first generation APIC + */ static int modern_apic(void) { - unsigned int lvr, version; /* AMD systems use old APIC versions, so check the CPU */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) + boot_cpu_data.x86 >= 0xf) return 1; - lvr = apic_read(APIC_LVR); - version = GET_APIC_VERSION(lvr); - return version >= 0x14; + return lapic_get_version() >= 0x14; +} + +/** + * enable_NMI_through_LVT0 - enable NMI through local vector table 0 + */ +void enable_NMI_through_LVT0 (void * dummy) +{ + unsigned int v = APIC_DM_NMI; + + /* Level triggered for 82489DX */ + if (!lapic_is_integrated()) + v |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT0, v); +} + +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + unsigned int v = apic_read(APIC_LVR); + + /* 82489DXs do not report # of LVT entries. */ + return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; } /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. + * Local APIC timer */ -void ack_bad_irq(unsigned int irq) + +/* Clock divisor is set to 16 */ +#define APIC_DIVISOR 16 + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { - printk("unexpected IRQ trap at vector %02x\n", irq); + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write_around(APIC_LVTT, lvtt_value); + /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But only ack when the APIC is enabled -AK + * Divide PICLK by 16 */ - if (cpu_has_apic) - ack_APIC_irq(); + tmp_value = apic_read(APIC_TDCR); + apic_write_around(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); } -void __init apic_intr_init(void) +/* + * Program the next event, relative to now + */ +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write_around(APIC_TMICT, delta); + return 0; +} + +/* + * Setup the lapic timer in periodic or oneshot mode + */ +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + unsigned int v; + + /* Lapic used for broadcast ? */ + if (!local_apic_timer_verify_ok) + return; + + local_irq_save(flags); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + __setup_APIC_LVTT(calibration_result, + mode != CLOCK_EVT_MODE_PERIODIC, 1); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, v); + break; + } + + local_irq_restore(flags); +} + +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(cpumask_t mask) { #ifdef CONFIG_SMP - smp_intr_init(); + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); #endif - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); +} - /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +/* + * Setup the local APIC timer for this CPU. Copy the initilized values + * of the boot CPU and register the clock event in the framework. + */ +static void __devinit setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); - /* thermal monitor LVT interrupt */ -#ifdef CONFIG_X86_MCE_P4THERMAL - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); } -/* Using APIC to generate smp_local_timer_interrupt? */ -int using_apic_timer __read_mostly = 0; +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ -static int enabled_via_apicbase; +#define LAPIC_CAL_LOOPS (HZ/10) -void enable_NMI_through_LVT0 (void * dummy) +static __initdata volatile int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; + +/* + * Temporary interrupt handler. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) { - unsigned int v, ver; + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - v = APIC_DM_NMI; /* unmask and set to NMI */ - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - v |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT0, v); + if (cpu_has_tsc) + rdtscll(tsc); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } } -int get_physical_broadcast(void) +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) { - if (modern_apic()) - return 0xff; - else - return 0xf; + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + const long pm_100ms = PMTMR_TICKS_PER_SEC/10; + const long pm_thresh = pm_100ms/100; + void (*real_handler)(struct clock_event_device *dev); + unsigned long deltaj; + long delta, deltapm; + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + local_irq_disable(); + + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; + + /* + * Setup the APIC counter to 1e9. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(1000000000, 0, 0); + + /* Let the interrupts run */ + local_irq_enable(); + + while(lapic_cal_loops <= LAPIC_CAL_LOOPS); + + local_irq_disable(); + + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + unsigned long mult; + u64 res; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64) deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64) delta ) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long) res, delta); + delta = (long) res; + } + } + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + calibration_result); + + if (cpu_has_tsc) { + delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + calibration_result / (1000000 / HZ), + calibration_result % (1000000 / HZ)); + + + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + local_apic_timer_verify_ok = 1; + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while(lapic_cal_loops <= LAPIC_CAL_LOOPS); + + local_irq_disable(); + + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + + local_irq_enable(); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + local_apic_timer_verify_ok = 0; + + if (deltapm) { + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + /* Check, if the jiffies result is consistent */ + if (deltaj < LAPIC_CAL_LOOPS-2 || + deltaj > LAPIC_CAL_LOOPS+2) { + /* + * Not sure, what we can do about this one. + * When high resultion timers are active + * and the lapic timer does not stop in C3 + * we are fine. Otherwise more trouble might + * be waiting. -- tglx + */ + printk(KERN_WARNING "Global event device %s " + "has wrong frequency " + "(%lu ticks instead of %d)\n", + global_clock_event->name, deltaj, + LAPIC_CAL_LOOPS); + } + local_apic_timer_verify_ok = 1; + } + } else { + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && + deltaj <= LAPIC_CAL_LOOPS+2) { + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + local_apic_timer_verify_ok = 1; + } + } + + if (!local_apic_timer_verify_ok) { + printk(KERN_WARNING + "APIC timer disabled due to verification failure.\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() == 1) + return; + } else + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* Setup the lapic or request the broadcast */ + setup_APIC_timer(); +} + +void __devinit setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(); } -int get_maxlvt(void) +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) { - unsigned int v, ver, maxlvt; + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - v = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(v); - /* 82489DXs do not report # of LVT entries. */ - maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2; - return maxlvt; + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + per_cpu(irq_stat, cpu).apic_timer_irqs++; + + evt->event_handler(evt); +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ + +void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + exit_idle(); + irq_enter(); + local_apic_timer_interrupt(); + irq_exit(); + + set_irq_regs(old_regs); } +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ void clear_local_APIC(void) { - int maxlvt; + int maxlvt = lapic_get_maxlvt(); unsigned long v; - maxlvt = get_maxlvt(); - /* * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. @@ -190,7 +613,7 @@ void clear_local_APIC(void) apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); } -/* lets not touch this if we didn't frob it */ + /* lets not touch this if we didn't frob it */ #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); @@ -212,85 +635,18 @@ void clear_local_APIC(void) if (maxlvt >= 5) apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); #endif - v = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (APIC_INTEGRATED(v)) { /* !82489DX */ - if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } } -void __init connect_bsp_APIC(void) -{ - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. - * connect BSP's local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } - enable_apic_mode(); -} - -void disconnect_bsp_APIC(int virt_wire_setup) -{ - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect - * only on certain older boards). Note that APIC - * interrupts, including IPIs, won't work beyond - * this point! The only exception are INIT IPIs. - */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write_around(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write_around(APIC_LVT0, value); - } - else { - /* Disable LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - } - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); - } -} - +/** + * disable_local_APIC - clear and disable the local APIC + */ void disable_local_APIC(void) { unsigned long value; @@ -305,8 +661,13 @@ void disable_local_APIC(void) value &= ~APIC_SPIV_APIC_ENABLED; apic_write_around(APIC_SPIV, value); + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ if (enabled_via_apicbase) { unsigned int l, h; + rdmsr(MSR_IA32_APICBASE, l, h); l &= ~MSR_IA32_APICBASE_ENABLE; wrmsr(MSR_IA32_APICBASE, l, h); @@ -314,6 +675,28 @@ void disable_local_APIC(void) } /* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ +void lapic_shutdown(void) +{ + unsigned long flags; + + if (!cpu_has_apic) + return; + + local_irq_save(flags); + clear_local_APIC(); + + if (enabled_via_apicbase) + disable_local_APIC(); + + local_irq_restore(flags); +} + +/* * This is to verify that we're looking at a real local APIC. * Check these against your board if the CPUs aren't getting * started for no apparent reason. @@ -345,7 +728,7 @@ int __init verify_local_APIC(void) reg1 = GET_APIC_VERSION(reg0); if (reg1 == 0x00 || reg1 == 0xff) return 0; - reg1 = get_maxlvt(); + reg1 = lapic_get_maxlvt(); if (reg1 < 0x02 || reg1 == 0xff) return 0; @@ -368,10 +751,15 @@ int __init verify_local_APIC(void) return 1; } +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ void __init sync_Arb_IDs(void) { - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 - And not needed on AMD */ + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ if (modern_apic()) return; /* @@ -384,14 +772,12 @@ void __init sync_Arb_IDs(void) | APIC_DM_INIT); } -extern void __error_in_apic_c (void); - /* * An initial setup of the virtual wire mode. */ void __init init_bsp_APIC(void) { - unsigned long value, ver; + unsigned long value; /* * Don't do the setup now if we have a SMP BIOS as the @@ -400,9 +786,6 @@ void __init init_bsp_APIC(void) if (smp_found_config || !cpu_has_apic) return; - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); - /* * Do not trust the local APIC being empty at bootup. */ @@ -414,9 +797,10 @@ void __init init_bsp_APIC(void) value = apic_read(APIC_SPIV); value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; - + /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) value &= ~APIC_SPIV_FOCUS_DISABLED; else value |= APIC_SPIV_FOCUS_DISABLED; @@ -428,14 +812,17 @@ void __init init_bsp_APIC(void) */ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ + if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); } +/** + * setup_local_APIC - setup the local APIC + */ void __devinit setup_local_APIC(void) { - unsigned long oldvalue, value, ver, maxlvt; + unsigned long oldvalue, value, maxlvt, integrated; int i, j; /* Pound the ESR really hard over the head with a big hammer - mbligh */ @@ -446,11 +833,7 @@ void __devinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); - - if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) - __error_in_apic_c(); + integrated = lapic_is_integrated(); /* * Double-check whether this APIC is really registered. @@ -521,13 +904,10 @@ void __devinit setup_local_APIC(void) * like LRU than MRU (the short-term load is more even across CPUs). * See also the comment in end_level_ioapic_irq(). --macro */ -#if 1 + /* Enable focus processor (bit==0) */ value &= ~APIC_SPIV_FOCUS_DISABLED; -#else - /* Disable focus processor (bit==1) */ - value |= APIC_SPIV_FOCUS_DISABLED; -#endif + /* * Set spurious IRQ vector */ @@ -563,17 +943,18 @@ void __devinit setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ + if (!integrated) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); - if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ - maxlvt = get_maxlvt(); + if (integrated && !esr_disable) { /* !82489DX */ + maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); oldvalue = apic_read(APIC_ESR); - value = ERROR_APIC_VECTOR; // enables sending errors + /* enables sending errors */ + value = ERROR_APIC_VECTOR; apic_write_around(APIC_LVTERR, value); /* * spec says clear errors after enabling vector. @@ -586,207 +967,30 @@ void __devinit setup_local_APIC(void) "vector: 0x%08lx after: 0x%08lx\n", oldvalue, value); } else { - if (esr_disable) - /* - * Something untraceble is creating bad interrupts on + if (esr_disable) + /* + * Something untraceble is creating bad interrupts on * secondary quads ... for the moment, just leave the * ESR disabled - we can't do anything useful with the * errors anyway - mbligh */ - printk("Leaving ESR disabled.\n"); - else - printk("No ESR for 82489DX.\n"); + printk(KERN_INFO "Leaving ESR disabled.\n"); + else + printk(KERN_INFO "No ESR for 82489DX.\n"); } + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, value); + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } /* - * If Linux enabled the LAPIC against the BIOS default - * disable it down before re-entering the BIOS on shutdown. - * Otherwise the BIOS may get confused and not power-off. - * Additionally clear all LVT entries before disable_local_APIC - * for the case where Linux didn't enable the LAPIC. - */ -void lapic_shutdown(void) -{ - unsigned long flags; - - if (!cpu_has_apic) - return; - - local_irq_save(flags); - clear_local_APIC(); - - if (enabled_via_apicbase) - disable_local_APIC(); - - local_irq_restore(flags); -} - -#ifdef CONFIG_PM - -static struct { - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = get_maxlvt(); - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - if (maxlvt >= 4) - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -#endif - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = get_maxlvt(); - - local_irq_save(flags); - - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -#endif - if (maxlvt >= 4) - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - local_irq_restore(flags); - return 0; -} - -/* - * This device has no shutdown method - fully functioning local APICs - * are needed on every CPU up until machine_halt/restart/poweroff. + * Detect and initialize APIC */ - -static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __devinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. - */ - -static int __init apic_set_verbosity(char *str) -{ - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - return 1; -} - -__setup("apic=", apic_set_verbosity); - static int __init detect_init_APIC (void) { u32 h, l, features; @@ -798,7 +1002,7 @@ static int __init detect_init_APIC (void) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || - (boot_cpu_data.x86 == 15)) + (boot_cpu_data.x86 == 15)) break; goto no_apic; case X86_VENDOR_INTEL: @@ -812,23 +1016,23 @@ static int __init detect_init_APIC (void) if (!cpu_has_apic) { /* - * Over-ride BIOS and try to enable the local - * APIC only if "lapic" specified. + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. */ if (enable_local_apic <= 0) { - printk("Local APIC disabled by BIOS -- " + printk(KERN_INFO "Local APIC disabled by BIOS -- " "you can enable it with \"lapic\"\n"); return -1; } /* - * Some BIOSes disable the local APIC in the - * APIC_BASE MSR. This can only be done in - * software for Intel P6 or later and AMD K7 - * (Model > 1) or later. + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. */ rdmsr(MSR_IA32_APICBASE, l, h); if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk("Local APIC disabled by BIOS -- reenabling.\n"); + printk(KERN_INFO + "Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; wrmsr(MSR_IA32_APICBASE, l, h); @@ -841,7 +1045,7 @@ static int __init detect_init_APIC (void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - printk("Could not enable APIC!\n"); + printk(KERN_WARNING "Could not enable APIC!\n"); return -1; } set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); @@ -855,17 +1059,20 @@ static int __init detect_init_APIC (void) if (nmi_watchdog != NMI_NONE) nmi_watchdog = NMI_LOCAL_APIC; - printk("Found and enabled local APIC!\n"); + printk(KERN_INFO "Found and enabled local APIC!\n"); apic_pm_activate(); return 0; no_apic: - printk("No local APIC present or hardware disabled\n"); + printk(KERN_INFO "No local APIC present or hardware disabled\n"); return -1; } +/** + * init_apic_mappings - initialize APIC mappings + */ void __init init_apic_mappings(void) { unsigned long apic_phys; @@ -925,385 +1132,92 @@ fake_ioapic_page: } /* - * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts - * per second. We assume that the caller has already set up the local - * APIC. - * - * The APIC timer is not exactly sync with the external timer chip, it - * closely follows bus clocks. - */ - -/* - * The timer chip is already set up at HZ interrupts per second here, - * but we do not accept timer interrupts yet. We only allow the BP - * to calibrate. - */ -static unsigned int __devinit get_8254_timer_count(void) -{ - unsigned long flags; - - unsigned int count; - - spin_lock_irqsave(&i8253_lock, flags); - - outb_p(0x00, PIT_MODE); - count = inb_p(PIT_CH0); - count |= inb_p(PIT_CH0) << 8; - - spin_unlock_irqrestore(&i8253_lock, flags); - - return count; -} - -/* next tick in 8254 can be caught by catching timer wraparound */ -static void __devinit wait_8254_wraparound(void) -{ - unsigned int curr_count, prev_count; - - curr_count = get_8254_timer_count(); - do { - prev_count = curr_count; - curr_count = get_8254_timer_count(); - - /* workaround for broken Mercury/Neptune */ - if (prev_count >= curr_count + 0x100) - curr_count = get_8254_timer_count(); - - } while (prev_count >= curr_count); -} - -/* - * Default initialization for 8254 timers. If we use other timers like HPET, - * we override this later - */ -void (*wait_timer_tick)(void) __devinitdata = wait_8254_wraparound; - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. */ - -#define APIC_DIVISOR 16 - -static void __setup_APIC_LVTT(unsigned int clocks) +int __init APIC_init_uniprocessor (void) { - unsigned int lvtt_value, tmp_value, ver; - int cpu = smp_processor_id(); - - ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; - if (!APIC_INTEGRATED(ver)) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - - if (cpu_isset(cpu, timer_bcast_ipi)) - lvtt_value |= APIC_LVT_MASKED; + if (enable_local_apic < 0) + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - apic_write_around(APIC_LVTT, lvtt_value); + if (!smp_found_config && !cpu_has_apic) + return -1; /* - * Divide PICLK by 16 + * Complain if the BIOS pretends there is one. */ - tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); -} + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return -1; + } -static void __devinit setup_APIC_timer(unsigned int clocks) -{ - unsigned long flags; + verify_local_APIC(); - local_irq_save(flags); + connect_bsp_APIC(); /* - * Wait for IRQ0's slice: + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. */ - wait_timer_tick(); +#ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); +#endif + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); - __setup_APIC_LVTT(clocks); + setup_local_APIC(); - local_irq_restore(flags); +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config) + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + setup_boot_clock(); + + return 0; } /* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. + * APIC command line parameters */ - -static int __init calibrate_APIC_clock(void) -{ - unsigned long long t1 = 0, t2 = 0; - long tt1, tt2; - long result; - int i; - const int LOOPS = HZ/10; - - apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n"); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - */ - __setup_APIC_LVTT(1000000000); - - /* - * The timer chip counts down to zero. Let's wait - * for a wraparound to start exact measurement: - * (the current tick might have been already half done) - */ - - wait_timer_tick(); - - /* - * We wrapped around just now. Let's start: - */ - if (cpu_has_tsc) - rdtscll(t1); - tt1 = apic_read(APIC_TMCCT); - - /* - * Let's wait LOOPS wraprounds: - */ - for (i = 0; i < LOOPS; i++) - wait_timer_tick(); - - tt2 = apic_read(APIC_TMCCT); - if (cpu_has_tsc) - rdtscll(t2); - - /* - * The APIC bus clock counter is 32 bits only, it - * might have overflown, but note that we use signed - * longs, thus no extra care needed. - * - * underflown to be exact, as the timer counts down ;) - */ - - result = (tt1-tt2)*APIC_DIVISOR/LOOPS; - - if (cpu_has_tsc) - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - ((long)(t2-t1)/LOOPS)/(1000000/HZ), - ((long)(t2-t1)/LOOPS)%(1000000/HZ)); - - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%ld.%04ld MHz.\n", - result/(1000000/HZ), - result%(1000000/HZ)); - - return result; -} - -static unsigned int calibration_result; - -void __init setup_boot_APIC_clock(void) -{ - unsigned long flags; - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); - using_apic_timer = 1; - - local_irq_save(flags); - - calibration_result = calibrate_APIC_clock(); - /* - * Now set up the timer for real. - */ - setup_APIC_timer(calibration_result); - - local_irq_restore(flags); -} - -void __devinit setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(calibration_result); -} - -void disable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; - - v = apic_read(APIC_LVTT); - /* - * When an illegal vector value (0-15) is written to an LVT - * entry and delivery mode is Fixed, the APIC may signal an - * illegal vector error, with out regard to whether the mask - * bit is set or whether an interrupt is actually seen on input. - * - * Boot sequence might call this function when the LVTT has - * '0' vector value. So make sure vector field is set to - * valid value. - */ - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, v); - } -} - -void enable_APIC_timer(void) +static int __init parse_lapic(char *arg) { - int cpu = smp_processor_id(); - - if (using_apic_timer && - !cpu_isset(cpu, timer_bcast_ipi)) { - unsigned long v; - - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); - } + enable_local_apic = 1; + return 0; } +early_param("lapic", parse_lapic); -void switch_APIC_timer_to_ipi(void *cpumask) +static int __init parse_nolapic(char *arg) { - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - !cpu_isset(cpu, timer_bcast_ipi)) { - disable_APIC_timer(); - cpu_set(cpu, timer_bcast_ipi); - } + enable_local_apic = -1; + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; } -EXPORT_SYMBOL(switch_APIC_timer_to_ipi); +early_param("nolapic", parse_nolapic); -void switch_ipi_to_APIC_timer(void *cpumask) +static int __init apic_set_verbosity(char *str) { - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - cpu_isset(cpu, timer_bcast_ipi)) { - cpu_clear(cpu, timer_bcast_ipi); - enable_APIC_timer(); - } + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + return 1; } -EXPORT_SYMBOL(switch_ipi_to_APIC_timer); -#undef APIC_DIVISOR - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -inline void smp_local_timer_interrupt(void) -{ - profile_tick(CPU_PROFILING); -#ifdef CONFIG_SMP - update_process_times(user_mode_vm(get_irq_regs())); -#endif +__setup("apic=", apic_set_verbosity); - /* - * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ -} /* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] + * Local APIC interrupts */ -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat, cpu).apic_timer_irqs++; - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - exit_idle(); - irq_enter(); - smp_local_timer_interrupt(); - irq_exit(); - set_irq_regs(old_regs); -} - -#ifndef CONFIG_SMP -static void up_apic_timer_interrupt_call(void) -{ - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat, cpu).apic_timer_irqs++; - - smp_local_timer_interrupt(); -} -#endif - -void smp_send_timer_broadcast_ipi(void) -{ - cpumask_t mask; - - cpus_and(mask, cpu_online_map, timer_bcast_ipi); - if (!cpus_empty(mask)) { -#ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -#else - /* - * We can directly call the apic timer interrupt handler - * in UP case. Minus all irq related functions - */ - up_apic_timer_interrupt_call(); -#endif - } -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -fastcall void smp_spurious_interrupt(struct pt_regs *regs) +void smp_spurious_interrupt(struct pt_regs *regs) { unsigned long v; @@ -1319,16 +1233,15 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs) ack_APIC_irq(); /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", - smp_processor_id()); + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); irq_exit(); } /* * This interrupt should never happen with our APIC/SMP architecture */ - -fastcall void smp_error_interrupt(struct pt_regs *regs) +void smp_error_interrupt(struct pt_regs *regs) { unsigned long v, v1; @@ -1352,69 +1265,261 @@ fastcall void smp_error_interrupt(struct pt_regs *regs) 7: Illegal register address */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", - smp_processor_id(), v , v1); + smp_processor_id(), v , v1); irq_exit(); } /* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. + * Initialize APIC interrupts */ -int __init APIC_init_uniprocessor (void) +void __init apic_intr_init(void) { - if (enable_local_apic < 0) - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); +#ifdef CONFIG_SMP + smp_intr_init(); +#endif + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - if (!smp_found_config && !cpu_has_apic) - return -1; + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* - * Complain if the BIOS pretends there is one. - */ - if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return -1; + /* thermal monitor LVT interrupt */ +#ifdef CONFIG_X86_MCE_P4THERMAL + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +} + +/** + * connect_bsp_APIC - attach the APIC to the interrupt system + */ +void __init connect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); } + enable_apic_mode(); +} - verify_local_APIC(); +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ +void disconnect_bsp_APIC(int virt_wire_setup) +{ + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + } else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; - connect_bsp_APIC(); + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -#ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); -#endif - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } - setup_local_APIC(); + /* + * For LVT1 make it edge triggered, active high, nmi and + * enabled + */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } +} -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); +/* + * Power management + */ +#ifdef CONFIG_PM + +static struct { + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif - setup_boot_clock(); + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); return 0; } -static int __init parse_lapic(char *arg) +static int lapic_resume(struct sys_device *dev) { - lapic_enable(); + unsigned int l, h; + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + local_irq_save(flags); + + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + local_irq_restore(flags); return 0; } -early_param("lapic", parse_lapic); -static int __init parse_nolapic(char *arg) +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + +static struct sysdev_class lapic_sysclass = { + set_kset_name("lapic"), + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __devinit apic_pm_activate(void) { - lapic_disable(); - return 0; + apic_pm_state.active = 1; } -early_param("nolapic", parse_nolapic); +static int __init init_lapic_sysfs(void) +{ + int error; + + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index f9ba0af7ee1..064bbf2861f 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -236,7 +236,6 @@ #include "io_ports.h" -extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) @@ -1176,28 +1175,6 @@ out: spin_unlock(&user_list_lock); } -static void set_time(void) -{ - struct timespec ts; - if (got_clock_diff) { /* Must know time zone in order to set clock */ - ts.tv_sec = get_cmos_time() + clock_cmos_diff; - ts.tv_nsec = 0; - do_settimeofday(&ts); - } -} - -static void get_time_diff(void) -{ -#ifndef CONFIG_APM_RTC_IS_GMT - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - got_clock_diff = 1; -#endif -} - static void reinit_timer(void) { #ifdef INIT_TIMER_AFTER_SUSPEND @@ -1237,19 +1214,6 @@ static int suspend(int vetoable) local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - - /* protect against access to timer chip registers */ - spin_lock(&i8253_lock); - - get_time_diff(); - /* - * Irq spinlock must be dropped around set_system_power_state. - * We'll undo any timer changes due to interrupts below. - */ - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); local_irq_enable(); save_processor_state(); @@ -1258,7 +1222,6 @@ static int suspend(int vetoable) restore_processor_state(); local_irq_disable(); - set_time(); reinit_timer(); if (err == APM_NO_ERROR) @@ -1288,11 +1251,6 @@ static void standby(void) local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - /* If needed, notify drivers here */ - get_time_diff(); - write_sequnlock(&xtime_lock); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1386,7 +1344,6 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - set_time(); device_resume(); pm_send_all(PM_RESUME, (void *)0); queue_event(event, NULL); @@ -1402,7 +1359,6 @@ static void check_events(void) break; case APM_UPDATE_TIME: - set_time(); break; case APM_CRITICAL_SUSPEND: diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig index 5299c5bf445..6c52182ca32 100644 --- a/arch/i386/kernel/cpu/cpufreq/Kconfig +++ b/arch/i386/kernel/cpu/cpufreq/Kconfig @@ -217,6 +217,15 @@ config X86_LONGHAUL If in doubt, say N. +config X86_E_POWERSAVER + tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" + select CPU_FREQ_TABLE + depends on EXPERIMENTAL + help + This adds the CPUFreq driver for VIA C7 processors. + + If in doubt, say N. + comment "shared options" config X86_ACPI_CPUFREQ_PROC_INTF diff --git a/arch/i386/kernel/cpu/cpufreq/Makefile b/arch/i386/kernel/cpu/cpufreq/Makefile index 8de3abe322a..560f7760dae 100644 --- a/arch/i386/kernel/cpu/cpufreq/Makefile +++ b/arch/i386/kernel/cpu/cpufreq/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_LONGHAUL) += longhaul.o +obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o obj-$(CONFIG_X86_LONGRUN) += longrun.o diff --git a/arch/i386/kernel/cpu/cpufreq/e_powersaver.c b/arch/i386/kernel/cpu/cpufreq/e_powersaver.c new file mode 100644 index 00000000000..f43d98e11cc --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/e_powersaver.c @@ -0,0 +1,334 @@ +/* + * Based on documentation provided by Dave Jones. Thanks! + * + * Licensed under the terms of the GNU GPL License version 2. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/ioport.h> +#include <linux/slab.h> + +#include <asm/msr.h> +#include <asm/tsc.h> +#include <asm/timex.h> +#include <asm/io.h> +#include <asm/delay.h> + +#define EPS_BRAND_C7M 0 +#define EPS_BRAND_C7 1 +#define EPS_BRAND_EDEN 2 +#define EPS_BRAND_C3 3 + +struct eps_cpu_data { + u32 fsb; + struct cpufreq_frequency_table freq_table[]; +}; + +static struct eps_cpu_data *eps_cpu[NR_CPUS]; + + +static unsigned int eps_get(unsigned int cpu) +{ + struct eps_cpu_data *centaur; + u32 lo, hi; + + if (cpu) + return 0; + centaur = eps_cpu[cpu]; + if (centaur == NULL) + return 0; + + /* Return current frequency */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + return centaur->fsb * ((lo >> 8) & 0xff); +} + +static int eps_set_state(struct eps_cpu_data *centaur, + unsigned int cpu, + u32 dest_state) +{ + struct cpufreq_freqs freqs; + u32 lo, hi; + int err = 0; + int i; + + freqs.old = eps_get(cpu); + freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff); + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* Wait while CPU is busy */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i = 0; + while (lo & ((1 << 16) | (1 << 17))) { + udelay(16); + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i++; + if (unlikely(i > 64)) { + err = -ENODEV; + goto postchange; + } + } + /* Set new multiplier and voltage */ + wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0); + /* Wait until transition end */ + i = 0; + do { + udelay(16); + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i++; + if (unlikely(i > 64)) { + err = -ENODEV; + goto postchange; + } + } while (lo & ((1 << 16) | (1 << 17))); + + /* Return current frequency */ +postchange: + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + freqs.new = centaur->fsb * ((lo >> 8) & 0xff); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + return err; +} + +static int eps_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct eps_cpu_data *centaur; + unsigned int newstate = 0; + unsigned int cpu = policy->cpu; + unsigned int dest_state; + int ret; + + if (unlikely(eps_cpu[cpu] == NULL)) + return -ENODEV; + centaur = eps_cpu[cpu]; + + if (unlikely(cpufreq_frequency_table_target(policy, + &eps_cpu[cpu]->freq_table[0], + target_freq, + relation, + &newstate))) { + return -EINVAL; + } + + /* Make frequency transition */ + dest_state = centaur->freq_table[newstate].index & 0xffff; + ret = eps_set_state(centaur, cpu, dest_state); + if (ret) + printk(KERN_ERR "eps: Timeout!\n"); + return ret; +} + +static int eps_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, + &eps_cpu[policy->cpu]->freq_table[0]); +} + +static int eps_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int i; + u32 lo, hi; + u64 val; + u8 current_multiplier, current_voltage; + u8 max_multiplier, max_voltage; + u8 min_multiplier, min_voltage; + u8 brand; + u32 fsb; + struct eps_cpu_data *centaur; + struct cpufreq_frequency_table *f_table; + int k, step, voltage; + int ret; + int states; + + if (policy->cpu != 0) + return -ENODEV; + + /* Check brand */ + printk("eps: Detected VIA "); + rdmsr(0x1153, lo, hi); + brand = (((lo >> 2) ^ lo) >> 18) & 3; + switch(brand) { + case EPS_BRAND_C7M: + printk("C7-M\n"); + break; + case EPS_BRAND_C7: + printk("C7\n"); + break; + case EPS_BRAND_EDEN: + printk("Eden\n"); + break; + case EPS_BRAND_C3: + printk("C3\n"); + return -ENODEV; + break; + } + /* Enable Enhanced PowerSaver */ + rdmsrl(MSR_IA32_MISC_ENABLE, val); + if (!(val & 1 << 16)) { + val |= 1 << 16; + wrmsrl(MSR_IA32_MISC_ENABLE, val); + /* Can be locked at 0 */ + rdmsrl(MSR_IA32_MISC_ENABLE, val); + if (!(val & 1 << 16)) { + printk("eps: Can't enable Enhanced PowerSaver\n"); + return -ENODEV; + } + } + + /* Print voltage and multiplier */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + current_voltage = lo & 0xff; + printk("eps: Current voltage = %dmV\n", current_voltage * 16 + 700); + current_multiplier = (lo >> 8) & 0xff; + printk("eps: Current multiplier = %d\n", current_multiplier); + + /* Print limits */ + max_voltage = hi & 0xff; + printk("eps: Highest voltage = %dmV\n", max_voltage * 16 + 700); + max_multiplier = (hi >> 8) & 0xff; + printk("eps: Highest multiplier = %d\n", max_multiplier); + min_voltage = (hi >> 16) & 0xff; + printk("eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700); + min_multiplier = (hi >> 24) & 0xff; + printk("eps: Lowest multiplier = %d\n", min_multiplier); + + /* Sanity checks */ + if (current_multiplier == 0 || max_multiplier == 0 + || min_multiplier == 0) + return -EINVAL; + if (current_multiplier > max_multiplier + || max_multiplier <= min_multiplier) + return -EINVAL; + if (current_voltage > 0x1c || max_voltage > 0x1c) + return -EINVAL; + if (max_voltage < min_voltage) + return -EINVAL; + + /* Calc FSB speed */ + fsb = cpu_khz / current_multiplier; + /* Calc number of p-states supported */ + if (brand == EPS_BRAND_C7M) + states = max_multiplier - min_multiplier + 1; + else + states = 2; + + /* Allocate private data and frequency table for current cpu */ + centaur = kzalloc(sizeof(struct eps_cpu_data) + + (states + 1) * sizeof(struct cpufreq_frequency_table), + GFP_KERNEL); + if (!centaur) + return -ENOMEM; + eps_cpu[0] = centaur; + + /* Copy basic values */ + centaur->fsb = fsb; + + /* Fill frequency and MSR value table */ + f_table = ¢aur->freq_table[0]; + if (brand != EPS_BRAND_C7M) { + f_table[0].frequency = fsb * min_multiplier; + f_table[0].index = (min_multiplier << 8) | min_voltage; + f_table[1].frequency = fsb * max_multiplier; + f_table[1].index = (max_multiplier << 8) | max_voltage; + f_table[2].frequency = CPUFREQ_TABLE_END; + } else { + k = 0; + step = ((max_voltage - min_voltage) * 256) + / (max_multiplier - min_multiplier); + for (i = min_multiplier; i <= max_multiplier; i++) { + voltage = (k * step) / 256 + min_voltage; + f_table[k].frequency = fsb * i; + f_table[k].index = (i << 8) | voltage; + k++; + } + f_table[k].frequency = CPUFREQ_TABLE_END; + } + + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */ + policy->cur = fsb * current_multiplier; + + ret = cpufreq_frequency_table_cpuinfo(policy, ¢aur->freq_table[0]); + if (ret) { + kfree(centaur); + return ret; + } + + cpufreq_frequency_table_get_attr(¢aur->freq_table[0], policy->cpu); + return 0; +} + +static int eps_cpu_exit(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + struct eps_cpu_data *centaur; + u32 lo, hi; + + if (eps_cpu[cpu] == NULL) + return -ENODEV; + centaur = eps_cpu[cpu]; + + /* Get max frequency */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + /* Set max frequency */ + eps_set_state(centaur, cpu, hi & 0xffff); + /* Bye */ + cpufreq_frequency_table_put_attr(policy->cpu); + kfree(eps_cpu[cpu]); + eps_cpu[cpu] = NULL; + return 0; +} + +static struct freq_attr* eps_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver eps_driver = { + .verify = eps_verify, + .target = eps_target, + .init = eps_cpu_init, + .exit = eps_cpu_exit, + .get = eps_get, + .name = "e_powersaver", + .owner = THIS_MODULE, + .attr = eps_attr, +}; + +static int __init eps_init(void) +{ + struct cpuinfo_x86 *c = cpu_data; + + /* This driver will work only on Centaur C7 processors with + * Enhanced SpeedStep/PowerSaver registers */ + if (c->x86_vendor != X86_VENDOR_CENTAUR + || c->x86 != 6 || c->x86_model != 10) + return -ENODEV; + if (!cpu_has(c, X86_FEATURE_EST)) + return -ENODEV; + + if (cpufreq_register_driver(&eps_driver)) + return -EINVAL; + return 0; +} + +static void __exit eps_exit(void) +{ + cpufreq_unregister_driver(&eps_driver); +} + +MODULE_AUTHOR("Rafa³ Bilski <rafalbilski@interia.pl>"); +MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's."); +MODULE_LICENSE("GPL"); + +module_init(eps_init); +module_exit(eps_exit); diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c index a3db9332d65..b59878a0d9b 100644 --- a/arch/i386/kernel/cpu/cpufreq/longhaul.c +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c @@ -8,12 +8,11 @@ * VIA have currently 3 different versions of Longhaul. * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. - * Version 2 of longhaul is the same as v1, but adds voltage scaling. - * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C) - * voltage scaling support has currently been disabled in this driver - * until we have code that gets it right. + * Version 2 of longhaul is backward compatible with v1, but adds + * LONGHAUL MSR for purpose of both frequency and voltage scaling. + * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C). * Version 3 of longhaul got renamed to Powersaver and redesigned - * to use the POWERSAVER MSR at 0x110a. + * to use only the POWERSAVER MSR at 0x110a. * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. * It's pretty much the same feature wise to longhaul v2, though * there is provision for scaling FSB too, but this doesn't work @@ -51,10 +50,12 @@ #define CPU_EZRA 3 #define CPU_EZRA_T 4 #define CPU_NEHEMIAH 5 +#define CPU_NEHEMIAH_C 6 /* Flags */ #define USE_ACPI_C3 (1 << 1) #define USE_NORTHBRIDGE (1 << 2) +#define USE_VT8235 (1 << 3) static int cpu_model; static unsigned int numscales=16; @@ -63,7 +64,8 @@ static unsigned int fsb; static struct mV_pos *vrm_mV_table; static unsigned char *mV_vrm_table; struct f_msr { - unsigned char vrm; + u8 vrm; + u8 pos; }; static struct f_msr f_msr_table[32]; @@ -73,10 +75,10 @@ static int can_scale_voltage; static struct acpi_processor *pr = NULL; static struct acpi_processor_cx *cx = NULL; static u8 longhaul_flags; +static u8 longhaul_pos; /* Module parameters */ static int scale_voltage; -static int ignore_latency; #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg) @@ -164,26 +166,47 @@ static void do_longhaul1(unsigned int clock_ratio_index) static void do_powersaver(int cx_address, unsigned int clock_ratio_index) { union msr_longhaul longhaul; + u8 dest_pos; u32 t; + dest_pos = f_msr_table[clock_ratio_index].pos; + rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Setup new frequency */ longhaul.bits.RevisionKey = longhaul.bits.RevisionID; longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf; longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4; - longhaul.bits.EnableSoftBusRatio = 1; - - if (can_scale_voltage) { + /* Setup new voltage */ + if (can_scale_voltage) longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm; + /* Sync to timer tick */ + safe_halt(); + /* Raise voltage if necessary */ + if (can_scale_voltage && longhaul_pos < dest_pos) { longhaul.bits.EnableSoftVID = 1; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Change voltage */ + if (!cx_address) { + ACPI_FLUSH_CPU_CACHE(); + halt(); + } else { + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 + * read */ + t = inl(acpi_gbl_FADT.xpm_timer_block.address); + } + longhaul.bits.EnableSoftVID = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + longhaul_pos = dest_pos; } - /* Sync to timer tick */ - safe_halt(); /* Change frequency on next halt or sleep */ + longhaul.bits.EnableSoftBusRatio = 1; wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); if (!cx_address) { ACPI_FLUSH_CPU_CACHE(); - /* Invoke C1 */ halt(); } else { ACPI_FLUSH_CPU_CACHE(); @@ -193,12 +216,29 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index) t = inl(acpi_gbl_FADT.xpm_timer_block.address); } /* Disable bus ratio bit */ - local_irq_disable(); - longhaul.bits.RevisionKey = longhaul.bits.RevisionID; longhaul.bits.EnableSoftBusRatio = 0; - longhaul.bits.EnableSoftBSEL = 0; - longhaul.bits.EnableSoftVID = 0; wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + + /* Reduce voltage if necessary */ + if (can_scale_voltage && longhaul_pos > dest_pos) { + longhaul.bits.EnableSoftVID = 1; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Change voltage */ + if (!cx_address) { + ACPI_FLUSH_CPU_CACHE(); + halt(); + } else { + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 + * read */ + t = inl(acpi_gbl_FADT.xpm_timer_block.address); + } + longhaul.bits.EnableSoftVID = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + longhaul_pos = dest_pos; + } } /** @@ -257,26 +297,19 @@ static void longhaul_setstate(unsigned int clock_ratio_index) /* * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) * Software controlled multipliers only. - * - * *NB* Until we get voltage scaling working v1 & v2 are the same code. - * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5b] and Ezra [C5C] */ case TYPE_LONGHAUL_V1: - case TYPE_LONGHAUL_V2: do_longhaul1(clock_ratio_index); break; /* + * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C] + * * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) - * We can scale voltage with this too, but that's currently - * disabled until we come up with a decent 'match freq to voltage' - * algorithm. - * When we add voltage scaling, we will also need to do the - * voltage/freq setting in order depending on the direction - * of scaling (like we do in powernow-k7.c) * Nehemiah can do FSB scaling too, but this has never been proven * to work in practice. */ + case TYPE_LONGHAUL_V2: case TYPE_POWERSAVER: if (longhaul_flags & USE_ACPI_C3) { /* Don't allow wakeup */ @@ -301,6 +334,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index) local_irq_restore(flags); preempt_enable(); + freqs.new = calc_speed(longhaul_get_cpu_mult()); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -315,31 +349,19 @@ static void longhaul_setstate(unsigned int clock_ratio_index) #define ROUNDING 0xf -static int _guess(int guess, int mult) -{ - int target; - - target = ((mult/10)*guess); - if (mult%10 != 0) - target += (guess/2); - target += ROUNDING/2; - target &= ~ROUNDING; - return target; -} - - static int guess_fsb(int mult) { - int speed = (cpu_khz/1000); + int speed = cpu_khz / 1000; int i; - int speeds[] = { 66, 100, 133, 200 }; - - speed += ROUNDING/2; - speed &= ~ROUNDING; - - for (i=0; i<4; i++) { - if (_guess(speeds[i], mult) == speed) - return speeds[i]; + int speeds[] = { 666, 1000, 1333, 2000 }; + int f_max, f_min; + + for (i = 0; i < 4; i++) { + f_max = ((speeds[i] * mult) + 50) / 100; + f_max += (ROUNDING / 2); + f_min = f_max - ROUNDING; + if ((speed <= f_max) && (speed >= f_min)) + return speeds[i] / 10; } return 0; } @@ -347,67 +369,40 @@ static int guess_fsb(int mult) static int __init longhaul_get_ranges(void) { - unsigned long invalue; - unsigned int ezra_t_multipliers[32]= { - 90, 30, 40, 100, 55, 35, 45, 95, - 50, 70, 80, 60, 120, 75, 85, 65, - -1, 110, 120, -1, 135, 115, 125, 105, - 130, 150, 160, 140, -1, 155, -1, 145 }; unsigned int j, k = 0; - union msr_longhaul longhaul; - int mult = 0; + int mult; - switch (longhaul_version) { - case TYPE_LONGHAUL_V1: - case TYPE_LONGHAUL_V2: - /* Ugh, Longhaul v1 didn't have the min/max MSRs. - Assume min=3.0x & max = whatever we booted at. */ + /* Get current frequency */ + mult = longhaul_get_cpu_mult(); + if (mult == -1) { + printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n"); + return -EINVAL; + } + fsb = guess_fsb(mult); + if (fsb == 0) { + printk(KERN_INFO PFX "Invalid (reserved) FSB!\n"); + return -EINVAL; + } + /* Get max multiplier - as we always did. + * Longhaul MSR is usefull only when voltage scaling is enabled. + * C3 is booting at max anyway. */ + maxmult = mult; + /* Get min multiplier */ + switch (cpu_model) { + case CPU_NEHEMIAH: + minmult = 50; + break; + case CPU_NEHEMIAH_C: + minmult = 40; + break; + default: minmult = 30; - maxmult = mult = longhaul_get_cpu_mult(); break; - - case TYPE_POWERSAVER: - /* Ezra-T */ - if (cpu_model==CPU_EZRA_T) { - minmult = 30; - rdmsrl (MSR_VIA_LONGHAUL, longhaul.val); - invalue = longhaul.bits.MaxMHzBR; - if (longhaul.bits.MaxMHzBR4) - invalue += 16; - maxmult = mult = ezra_t_multipliers[invalue]; - break; - } - - /* Nehemiah */ - if (cpu_model==CPU_NEHEMIAH) { - rdmsrl (MSR_VIA_LONGHAUL, longhaul.val); - - /* - * TODO: This code works, but raises a lot of questions. - * - Some Nehemiah's seem to have broken Min/MaxMHzBR's. - * We get around this by using a hardcoded multiplier of 4.0x - * for the minimimum speed, and the speed we booted up at for the max. - * This is done in longhaul_get_cpu_mult() by reading the EBLCR register. - * - According to some VIA documentation EBLCR is only - * in pre-Nehemiah C3s. How this still works is a mystery. - * We're possibly using something undocumented and unsupported, - * But it works, so we don't grumble. - */ - minmult=40; - maxmult = mult = longhaul_get_cpu_mult(); - break; - } } - fsb = guess_fsb(mult); dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n", minmult/10, minmult%10, maxmult/10, maxmult%10); - if (fsb == 0) { - printk (KERN_INFO PFX "Invalid (reserved) FSB!\n"); - return -EINVAL; - } - highest_speed = calc_speed(maxmult); lowest_speed = calc_speed(minmult); dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, @@ -455,6 +450,7 @@ static void __init longhaul_setup_voltagescaling(void) union msr_longhaul longhaul; struct mV_pos minvid, maxvid; unsigned int j, speed, pos, kHz_step, numvscales; + int min_vid_speed; rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); if (!(longhaul.bits.RevisionID & 1)) { @@ -468,14 +464,14 @@ static void __init longhaul_setup_voltagescaling(void) mV_vrm_table = &mV_vrm85[0]; } else { printk (KERN_INFO PFX "Mobile VRM\n"); + if (cpu_model < CPU_NEHEMIAH) + return; vrm_mV_table = &mobilevrm_mV[0]; mV_vrm_table = &mV_mobilevrm[0]; } minvid = vrm_mV_table[longhaul.bits.MinimumVID]; maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; - numvscales = maxvid.pos - minvid.pos + 1; - kHz_step = (highest_speed - lowest_speed) / numvscales; if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " @@ -491,20 +487,59 @@ static void __init longhaul_setup_voltagescaling(void) return; } - printk(KERN_INFO PFX "Max VID=%d.%03d Min VID=%d.%03d, %d possible voltage scales\n", + /* How many voltage steps */ + numvscales = maxvid.pos - minvid.pos + 1; + printk(KERN_INFO PFX + "Max VID=%d.%03d " + "Min VID=%d.%03d, " + "%d possible voltage scales\n", maxvid.mV/1000, maxvid.mV%1000, minvid.mV/1000, minvid.mV%1000, numvscales); + /* Calculate max frequency at min voltage */ + j = longhaul.bits.MinMHzBR; + if (longhaul.bits.MinMHzBR4) + j += 16; + min_vid_speed = eblcr_table[j]; + if (min_vid_speed == -1) + return; + switch (longhaul.bits.MinMHzFSB) { + case 0: + min_vid_speed *= 13333; + break; + case 1: + min_vid_speed *= 10000; + break; + case 3: + min_vid_speed *= 6666; + break; + default: + return; + break; + } + if (min_vid_speed >= highest_speed) + return; + /* Calculate kHz for one voltage step */ + kHz_step = (highest_speed - min_vid_speed) / numvscales; + + j = 0; while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { speed = longhaul_table[j].frequency; - pos = (speed - lowest_speed) / kHz_step + minvid.pos; + if (speed > min_vid_speed) + pos = (speed - min_vid_speed) / kHz_step + minvid.pos; + else + pos = minvid.pos; f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos]; + f_msr_table[longhaul_table[j].index].pos = pos; j++; } + longhaul_pos = maxvid.pos; can_scale_voltage = 1; + printk(KERN_INFO PFX "Voltage scaling enabled. " + "Use of \"conservative\" governor is highly recommended.\n"); } @@ -573,20 +608,51 @@ static int enable_arbiter_disable(void) if (dev != NULL) { /* Enable access to port 0x22 */ pci_read_config_byte(dev, reg, &pci_cmd); - if ( !(pci_cmd & 1<<7) ) { + if (!(pci_cmd & 1<<7)) { pci_cmd |= 1<<7; pci_write_config_byte(dev, reg, pci_cmd); + pci_read_config_byte(dev, reg, &pci_cmd); + if (!(pci_cmd & 1<<7)) { + printk(KERN_ERR PFX + "Can't enable access to port 0x22.\n"); + return 0; + } } return 1; } return 0; } +static int longhaul_setup_vt8235(void) +{ + struct pci_dev *dev; + u8 pci_cmd; + + /* Find VT8235 southbridge */ + dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); + if (dev != NULL) { + /* Set transition time to max */ + pci_read_config_byte(dev, 0xec, &pci_cmd); + pci_cmd &= ~(1 << 2); + pci_write_config_byte(dev, 0xec, pci_cmd); + pci_read_config_byte(dev, 0xe4, &pci_cmd); + pci_cmd &= ~(1 << 7); + pci_write_config_byte(dev, 0xe4, pci_cmd); + pci_read_config_byte(dev, 0xe5, &pci_cmd); + pci_cmd |= 1 << 7; + pci_write_config_byte(dev, 0xe5, pci_cmd); + return 1; + } + return 0; +} + static int __init longhaul_cpu_init(struct cpufreq_policy *policy) { struct cpuinfo_x86 *c = cpu_data; char *cpuname=NULL; int ret; + u32 lo, hi; + int vt8235_present; /* Check what we have on this motherboard */ switch (c->x86_model) { @@ -599,16 +665,20 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) break; case 7: - longhaul_version = TYPE_LONGHAUL_V1; switch (c->x86_mask) { case 0: + longhaul_version = TYPE_LONGHAUL_V1; cpu_model = CPU_SAMUEL2; cpuname = "C3 'Samuel 2' [C5B]"; - /* Note, this is not a typo, early Samuel2's had Samuel1 ratios. */ - memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio)); - memcpy (eblcr_table, samuel2_eblcr, sizeof(samuel2_eblcr)); + /* Note, this is not a typo, early Samuel2's had + * Samuel1 ratios. */ + memcpy(clock_ratio, samuel1_clock_ratio, + sizeof(samuel1_clock_ratio)); + memcpy(eblcr_table, samuel2_eblcr, + sizeof(samuel2_eblcr)); break; case 1 ... 15: + longhaul_version = TYPE_LONGHAUL_V2; if (c->x86_mask < 8) { cpu_model = CPU_SAMUEL2; cpuname = "C3 'Samuel 2' [C5B]"; @@ -616,8 +686,10 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) cpu_model = CPU_EZRA; cpuname = "C3 'Ezra' [C5C]"; } - memcpy (clock_ratio, ezra_clock_ratio, sizeof(ezra_clock_ratio)); - memcpy (eblcr_table, ezra_eblcr, sizeof(ezra_eblcr)); + memcpy(clock_ratio, ezra_clock_ratio, + sizeof(ezra_clock_ratio)); + memcpy(eblcr_table, ezra_eblcr, + sizeof(ezra_eblcr)); break; } break; @@ -632,24 +704,24 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) break; case 9: - cpu_model = CPU_NEHEMIAH; longhaul_version = TYPE_POWERSAVER; - numscales=32; + numscales = 32; + memcpy(clock_ratio, + nehemiah_clock_ratio, + sizeof(nehemiah_clock_ratio)); + memcpy(eblcr_table, nehemiah_eblcr, sizeof(nehemiah_eblcr)); switch (c->x86_mask) { case 0 ... 1: - cpuname = "C3 'Nehemiah A' [C5N]"; - memcpy (clock_ratio, nehemiah_a_clock_ratio, sizeof(nehemiah_a_clock_ratio)); - memcpy (eblcr_table, nehemiah_a_eblcr, sizeof(nehemiah_a_eblcr)); + cpu_model = CPU_NEHEMIAH; + cpuname = "C3 'Nehemiah A' [C5XLOE]"; break; case 2 ... 4: - cpuname = "C3 'Nehemiah B' [C5N]"; - memcpy (clock_ratio, nehemiah_b_clock_ratio, sizeof(nehemiah_b_clock_ratio)); - memcpy (eblcr_table, nehemiah_b_eblcr, sizeof(nehemiah_b_eblcr)); + cpu_model = CPU_NEHEMIAH; + cpuname = "C3 'Nehemiah B' [C5XLOH]"; break; case 5 ... 15: - cpuname = "C3 'Nehemiah C' [C5N]"; - memcpy (clock_ratio, nehemiah_c_clock_ratio, sizeof(nehemiah_c_clock_ratio)); - memcpy (eblcr_table, nehemiah_c_eblcr, sizeof(nehemiah_c_eblcr)); + cpu_model = CPU_NEHEMIAH_C; + cpuname = "C3 'Nehemiah C' [C5P]"; break; } break; @@ -658,6 +730,13 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) cpuname = "Unknown"; break; } + /* Check Longhaul ver. 2 */ + if (longhaul_version == TYPE_LONGHAUL_V2) { + rdmsr(MSR_VIA_LONGHAUL, lo, hi); + if (lo == 0 && hi == 0) + /* Looks like MSR isn't present */ + longhaul_version = TYPE_LONGHAUL_V1; + } printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname); switch (longhaul_version) { @@ -670,15 +749,18 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) break; }; + /* Doesn't hurt */ + vt8235_present = longhaul_setup_vt8235(); + /* Find ACPI data for processor */ - acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, - &longhaul_walk_callback, NULL, (void *)&pr); + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, &longhaul_walk_callback, + NULL, (void *)&pr); /* Check ACPI support for C3 state */ - if ((pr != NULL) && (longhaul_version == TYPE_POWERSAVER)) { + if (pr != NULL && longhaul_version != TYPE_LONGHAUL_V1) { cx = &pr->power.states[ACPI_STATE_C3]; - if (cx->address > 0 && - (cx->latency <= 1000 || ignore_latency != 0) ) { + if (cx->address > 0 && cx->latency <= 1000) { longhaul_flags |= USE_ACPI_C3; goto print_support_type; } @@ -688,8 +770,11 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) longhaul_flags |= USE_NORTHBRIDGE; goto print_support_type; } - - /* No ACPI C3 or we can't use it */ + /* Use VT8235 southbridge if present */ + if (longhaul_version == TYPE_POWERSAVER && vt8235_present) { + longhaul_flags |= USE_VT8235; + goto print_support_type; + } /* Check ACPI support for bus master arbiter disable */ if ((pr == NULL) || !(pr->flags.bm_control)) { printk(KERN_ERR PFX @@ -698,18 +783,18 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) } print_support_type: - if (!(longhaul_flags & USE_NORTHBRIDGE)) { - printk (KERN_INFO PFX "Using ACPI support.\n"); - } else { + if (longhaul_flags & USE_NORTHBRIDGE) printk (KERN_INFO PFX "Using northbridge support.\n"); - } + else if (longhaul_flags & USE_VT8235) + printk (KERN_INFO PFX "Using VT8235 support.\n"); + else + printk (KERN_INFO PFX "Using ACPI support.\n"); ret = longhaul_get_ranges(); if (ret != 0) return ret; - if ((longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) && - (scale_voltage != 0)) + if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0)) longhaul_setup_voltagescaling(); policy->governor = CPUFREQ_DEFAULT_GOVERNOR; @@ -797,8 +882,6 @@ static void __exit longhaul_exit(void) module_param (scale_voltage, int, 0644); MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); -module_param(ignore_latency, int, 0644); -MODULE_PARM_DESC(ignore_latency, "Skip ACPI C3 latency test"); MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h index bc4682aad69..bb0a04b1d1a 100644 --- a/arch/i386/kernel/cpu/cpufreq/longhaul.h +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h @@ -235,84 +235,14 @@ static int __initdata ezrat_eblcr[32] = { /* * VIA C3 Nehemiah */ -static int __initdata nehemiah_a_clock_ratio[32] = { +static int __initdata nehemiah_clock_ratio[32] = { 100, /* 0000 -> 10.0x */ 160, /* 0001 -> 16.0x */ - -1, /* 0010 -> RESERVED */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - -1, /* 0101 -> RESERVED */ - -1, /* 0110 -> RESERVED */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - 100, /* 0000 -> 10.0x */ - -1, /* 0001 -> RESERVED */ - 120, /* 0010 -> 12.0x */ - 90, /* 0011 -> 9.0x */ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 120, /* 1111 -> 12.0x */ -}; - -static int __initdata nehemiah_b_clock_ratio[32] = { - 100, /* 0000 -> 10.0x */ - 160, /* 0001 -> 16.0x */ - -1, /* 0010 -> RESERVED */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - -1, /* 0101 -> RESERVED */ - -1, /* 0110 -> RESERVED */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - 100, /* 0000 -> 10.0x */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - 90, /* 0011 -> 9.0x */ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 120, /* 1111 -> 12.0x */ -}; - -static int __initdata nehemiah_c_clock_ratio[32] = { - 100, /* 0000 -> 10.0x */ - 160, /* 0001 -> 16.0x */ - 40, /* 0010 -> RESERVED */ + 40, /* 0010 -> 4.0x */ 90, /* 0011 -> 9.0x */ 95, /* 0100 -> 9.5x */ -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> RESERVED */ + 45, /* 0110 -> 4.5x */ 55, /* 0111 -> 5.5x */ 60, /* 1000 -> 6.0x */ 70, /* 1001 -> 7.0x */ @@ -340,84 +270,14 @@ static int __initdata nehemiah_c_clock_ratio[32] = { 120, /* 1111 -> 12.0x */ }; -static int __initdata nehemiah_a_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 160, /* 0001 -> 16.0x */ - -1, /* 0010 -> RESERVED */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - -1, /* 0101 -> RESERVED */ - -1, /* 0110 -> RESERVED */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - 90, /* 0000 -> 9.0x */ - -1, /* 0001 -> RESERVED */ - 120, /* 0010 -> 12.0x */ - 100, /* 0011 -> 10.0x */ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - 120, /* 1100 -> 12.0x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145 /* 1111 -> 14.5x */ - /* end of table */ -}; -static int __initdata nehemiah_b_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 160, /* 0001 -> 16.0x */ - -1, /* 0010 -> RESERVED */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - -1, /* 0101 -> RESERVED */ - -1, /* 0110 -> RESERVED */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - 90, /* 0000 -> 9.0x */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - 100, /* 0011 -> 10.0x */ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - 120, /* 1100 -> 12.0x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145 /* 1111 -> 14.5x */ - /* end of table */ -}; -static int __initdata nehemiah_c_eblcr[32] = { +static int __initdata nehemiah_eblcr[32] = { 50, /* 0000 -> 5.0x */ 160, /* 0001 -> 16.0x */ - 40, /* 0010 -> RESERVED */ + 40, /* 0010 -> 4.0x */ 100, /* 0011 -> 10.0x */ 55, /* 0100 -> 5.5x */ -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> RESERVED */ + 45, /* 0110 -> 4.5x */ 95, /* 0111 -> 9.5x */ 90, /* 1000 -> 9.0x */ 70, /* 1001 -> 7.0x */ @@ -443,7 +303,6 @@ static int __initdata nehemiah_c_eblcr[32] = { 155, /* 1101 -> 15.5x */ -1, /* 1110 -> RESERVED (13.0x) */ 145 /* 1111 -> 14.5x */ - /* end of table */ }; /* diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c index 2d649167255..fe3b67005eb 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -1289,7 +1289,11 @@ static unsigned int powernowk8_get (unsigned int cpu) if (query_current_values_with_pending_wait(data)) goto out; - khz = find_khz_freq_from_fid(data->currfid); + if (cpu_family == CPU_HW_PSTATE) + khz = find_khz_freq_from_fiddid(data->currfid, data->currdid); + else + khz = find_khz_freq_from_fid(data->currfid); + out: set_cpus_allowed(current, oldmask); diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c index 0b29d41322a..e1006b7acc9 100644 --- a/arch/i386/kernel/hpet.c +++ b/arch/i386/kernel/hpet.c @@ -1,4 +1,5 @@ #include <linux/clocksource.h> +#include <linux/clockchips.h> #include <linux/errno.h> #include <linux/hpet.h> #include <linux/init.h> @@ -6,17 +7,278 @@ #include <asm/hpet.h> #include <asm/io.h> +extern struct clock_event_device *global_clock_event; + #define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000 -static void __iomem *hpet_ptr; +/* + * HPET address is set in acpi/boot.c, when an ACPI entry exists + */ +unsigned long hpet_address; +static void __iomem * hpet_virt_address; + +static inline unsigned long hpet_readl(unsigned long a) +{ + return readl(hpet_virt_address + a); +} + +static inline void hpet_writel(unsigned long d, unsigned long a) +{ + writel(d, hpet_virt_address + a); +} + +/* + * HPET command line enable / disable + */ +static int boot_hpet_disable; + +static int __init hpet_setup(char* str) +{ + if (str) { + if (!strncmp("disable", str, 7)) + boot_hpet_disable = 1; + } + return 1; +} +__setup("hpet=", hpet_setup); + +static inline int is_hpet_capable(void) +{ + return (!boot_hpet_disable && hpet_address); +} + +/* + * HPET timer interrupt enable / disable + */ +static int hpet_legacy_int_enabled; + +/** + * is_hpet_enabled - check whether the hpet timer interrupt is enabled + */ +int is_hpet_enabled(void) +{ + return is_hpet_capable() && hpet_legacy_int_enabled; +} + +/* + * When the hpet driver (/dev/hpet) is enabled, we need to reserve + * timer 0 and timer 1 in case of RTC emulation. + */ +#ifdef CONFIG_HPET +static void hpet_reserve_platform_timers(unsigned long id) +{ + struct hpet __iomem *hpet = hpet_virt_address; + struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; + unsigned int nrtimers, i; + struct hpet_data hd; + + nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + + memset(&hd, 0, sizeof (hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet_virt_address; + hd.hd_nirqs = nrtimers; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); + +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + + for (i = 2; i < nrtimers; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + + hpet_alloc(&hd); + +} +#else +static void hpet_reserve_platform_timers(unsigned long id) { } +#endif + +/* + * Common hpet info + */ +static unsigned long hpet_period; + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt); +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt); + +/* + * The hpet clock event device + */ +static struct clock_event_device hpet_clockevent = { + .name = "hpet", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = hpet_set_mode, + .set_next_event = hpet_next_event, + .shift = 32, + .irq = 0, +}; + +static void hpet_start_counter(void) +{ + unsigned long cfg = hpet_readl(HPET_CFG); + + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); +} + +static void hpet_enable_int(void) +{ + unsigned long cfg = hpet_readl(HPET_CFG); + + cfg |= HPET_CFG_LEGACY; + hpet_writel(cfg, HPET_CFG); + hpet_legacy_int_enabled = 1; +} + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long cfg, cmp, now; + uint64_t delta; + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; + delta >>= hpet_clockevent.shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned long) delta; + cfg = hpet_readl(HPET_T0_CFG); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | + HPET_TN_SETVAL | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T0_CFG); + /* + * The first write after writing TN_SETVAL to the + * config register sets the counter value, the second + * write sets the period. + */ + hpet_writel(cmp, HPET_T0_CMP); + udelay(1); + hpet_writel((unsigned long) delta, HPET_T0_CMP); + break; + + case CLOCK_EVT_MODE_ONESHOT: + cfg = hpet_readl(HPET_T0_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T0_CFG); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + cfg = hpet_readl(HPET_T0_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T0_CFG); + break; + } +} + +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + unsigned long cnt; + + cnt = hpet_readl(HPET_COUNTER); + cnt += delta; + hpet_writel(cnt, HPET_T0_CMP); + + return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0); +} + +/* + * Try to setup the HPET timer + */ +int __init hpet_enable(void) +{ + unsigned long id; + uint64_t hpet_freq; + + if (!is_hpet_capable()) + return 0; + + hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + + /* + * Read the period and check for a sane value: + */ + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) + goto out_nohpet; + + /* + * The period is a femto seconds value. We need to calculate the + * scaled math multiplication factor for nanosecond to hpet tick + * conversion. + */ + hpet_freq = 1000000000000000ULL; + do_div(hpet_freq, hpet_period); + hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, + NSEC_PER_SEC, 32); + /* Calculate the min / max delta */ + hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &hpet_clockevent); + hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, + &hpet_clockevent); + + /* + * Read the HPET ID register to retrieve the IRQ routing + * information and the number of channels + */ + id = hpet_readl(HPET_ID); + +#ifdef CONFIG_HPET_EMULATE_RTC + /* + * The legacy routing mode needs at least two channels, tick timer + * and the rtc emulation channel. + */ + if (!(id & HPET_ID_NUMBER)) + goto out_nohpet; +#endif + + /* Start the counter */ + hpet_start_counter(); + + if (id & HPET_ID_LEGSUP) { + hpet_enable_int(); + hpet_reserve_platform_timers(id); + /* + * Start hpet with the boot cpu mask and make it + * global after the IO_APIC has been initialized. + */ + hpet_clockevent.cpumask =cpumask_of_cpu(0); + clockevents_register_device(&hpet_clockevent); + global_clock_event = &hpet_clockevent; + return 1; + } + return 0; +out_nohpet: + iounmap(hpet_virt_address); + hpet_virt_address = NULL; + return 0; +} + +/* + * Clock source related code + */ static cycle_t read_hpet(void) { - return (cycle_t)readl(hpet_ptr); + return (cycle_t)hpet_readl(HPET_COUNTER); } static struct clocksource clocksource_hpet = { @@ -24,28 +286,17 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = HPET_MASK, - .mult = 0, /* set below */ .shift = HPET_SHIFT, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init init_hpet_clocksource(void) { - unsigned long hpet_period; - void __iomem* hpet_base; u64 tmp; - int err; - if (!is_hpet_enabled()) + if (!hpet_virt_address) return -ENODEV; - /* calculate the hpet address: */ - hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); - hpet_ptr = hpet_base + HPET_COUNTER; - - /* calculate the frequency: */ - hpet_period = readl(hpet_base + HPET_PERIOD); - /* * hpet period is in femto seconds per cycle * so we need to convert this to ns/cyc units @@ -61,11 +312,218 @@ static int __init init_hpet_clocksource(void) do_div(tmp, FSEC_PER_NSEC); clocksource_hpet.mult = (u32)tmp; - err = clocksource_register(&clocksource_hpet); - if (err) - iounmap(hpet_base); - - return err; + return clocksource_register(&clocksource_hpet); } module_init(init_hpet_clocksource); + +#ifdef CONFIG_HPET_EMULATE_RTC + +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt + * overhead. (DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher. + */ +#include <linux/mc146818rtc.h> +#include <linux/rtc.h> + +#define DEFAULT_RTC_INT_FREQ 64 +#define DEFAULT_RTC_SHIFT 6 +#define RTC_NUM_INTS 1 + +static unsigned long hpet_rtc_flags; +static unsigned long hpet_prev_update_sec; +static struct rtc_time hpet_alarm_time; +static unsigned long hpet_pie_count; +static unsigned long hpet_t1_cmp; +static unsigned long hpet_default_delta; +static unsigned long hpet_pie_delta; +static unsigned long hpet_pie_limit; + +/* + * Timer 1 for RTC emulation. We use one shot mode, as periodic mode + * is not supported by all HPET implementations for timer 1. + * + * hpet_rtc_timer_init() is called when the rtc is initialized. + */ +int hpet_rtc_timer_init(void) +{ + unsigned long cfg, cnt, delta, flags; + + if (!is_hpet_enabled()) + return 0; + + if (!hpet_default_delta) { + uint64_t clc; + + clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; + hpet_default_delta = (unsigned long) clc; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + local_irq_save(flags); + + cnt = delta + hpet_readl(HPET_COUNTER); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + local_irq_restore(flags); + + return 1; +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. + */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags &= ~bit_mask; + return 1; +} + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + unsigned long oldbits = hpet_rtc_flags; + + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags |= bit_mask; + + if (!oldbits) + hpet_rtc_timer_init(); + + return 1; +} + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, + unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_alarm_time.tm_hour = hrs; + hpet_alarm_time.tm_min = min; + hpet_alarm_time.tm_sec = sec; + + return 1; +} + +int hpet_set_periodic_freq(unsigned long freq) +{ + uint64_t clc; + + if (!is_hpet_enabled()) + return 0; + + if (freq <= DEFAULT_RTC_INT_FREQ) + hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; + else { + clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + do_div(clc, freq); + clc >>= hpet_clockevent.shift; + hpet_pie_delta = (unsigned long) clc; + } + return 1; +} + +int hpet_rtc_dropped_irq(void) +{ + return is_hpet_enabled(); +} + +static void hpet_rtc_timer_reinit(void) +{ + unsigned long cfg, delta; + int lost_ints = -1; + + if (unlikely(!hpet_rtc_flags)) { + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); + return; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + /* + * Increment the comparator value until we are ahead of the + * current count. + */ + do { + hpet_t1_cmp += delta; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + lost_ints++; + } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0); + + if (lost_ints) { + if (hpet_rtc_flags & RTC_PIE) + hpet_pie_count += lost_ints; + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost %d interrupts\n", + lost_ints); + } +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + + hpet_rtc_timer_reinit(); + + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) + rtc_get_rtc_time(&curr_time); + + if (hpet_rtc_flags & RTC_UIE && + curr_time.tm_sec != hpet_prev_update_sec) { + rtc_int_flag = RTC_UF; + hpet_prev_update_sec = curr_time.tm_sec; + } + + if (hpet_rtc_flags & RTC_PIE && + ++hpet_pie_count >= hpet_pie_limit) { + rtc_int_flag |= RTC_PF; + hpet_pie_count = 0; + } + + if (hpet_rtc_flags & RTC_PIE && + (curr_time.tm_sec == hpet_alarm_time.tm_sec) && + (curr_time.tm_min == hpet_alarm_time.tm_min) && + (curr_time.tm_hour == hpet_alarm_time.tm_hour)) + rtc_int_flag |= RTC_AF; + + if (rtc_int_flag) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + rtc_interrupt(rtc_int_flag, dev_id); + } + return IRQ_HANDLED; +} +#endif diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c index 9a0060b92e3..a6bc7bb3883 100644 --- a/arch/i386/kernel/i8253.c +++ b/arch/i386/kernel/i8253.c @@ -2,7 +2,7 @@ * i8253.c 8253/PIT functions * */ -#include <linux/clocksource.h> +#include <linux/clockchips.h> #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/sysdev.h> @@ -19,17 +19,97 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -void setup_pit_timer(void) +/* + * HPET replaces the PIT, when enabled. So we need to know, which of + * the two timers is used + */ +struct clock_event_device *global_clock_event; + +/* + * Initialize the PIT timer. + * + * This is also called after resume to bring the PIT into operation again. + */ +static void init_pit_timer(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + udelay(10); + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + } + spin_unlock_irqrestore(&i8253_lock, flags); +} + +/* + * Program the next event in oneshot mode + * + * Delta is given in PIT ticks + */ +static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ spin_unlock_irqrestore(&i8253_lock, flags); + + return 0; +} + +/* + * On UP the PIT can serve all of the possible timer functions. On SMP systems + * it can be solely used for the global tick. + * + * The profiling and update capabilites are switched off once the local apic is + * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - + * !using_apic_timer decisions in do_timer_interrupt_hook() + */ +struct clock_event_device pit_clockevent = { + .name = "pit", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, + .irq = 0, +}; + +/* + * Initialize the conversion factor and the min/max deltas of the clock event + * structure and register the clock event source with the framework. + */ +void __init setup_pit_timer(void) +{ + /* + * Start pit with the boot cpu mask and make it global after the + * IO_APIC has been initialized. + */ + pit_clockevent.cpumask = cpumask_of_cpu(0); + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + clockevents_register_device(&pit_clockevent); + global_clock_event = &pit_clockevent; } /* @@ -46,7 +126,7 @@ static cycle_t pit_read(void) static u32 old_jifs; spin_lock_irqsave(&i8253_lock, flags); - /* + /* * Although our caller may have the read side of xtime_lock, * this is now a seqlock, and we are cheating in this routine * by having side effects on state that we cannot undo if diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index c8d45821c78..03abfdb1a6e 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -41,6 +41,7 @@ static void mask_and_ack_8259A(unsigned int); static struct irq_chip i8259A_chip = { .name = "XT-PIC", .mask = disable_8259A_irq, + .disable = disable_8259A_irq, .unmask = enable_8259A_irq, .mask_ack = mask_and_ack_8259A, }; @@ -410,12 +411,6 @@ void __init native_init_IRQ(void) intr_init_hook(); /* - * Set the clock to HZ Hz, we already have a valid - * vector now: - */ - setup_pit_timer(); - - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. */ diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index e30ccedad0b..4ccebd454e2 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -482,8 +482,8 @@ static void do_irq_balance(void) package_index = CPU_TO_PACKAGEINDEX(i); for (j = 0; j < NR_IRQS; j++) { unsigned long value_now, delta; - /* Is this an active IRQ? */ - if (!irq_desc[j].action) + /* Is this an active IRQ or balancing disabled ? */ + if (!irq_desc[j].action || irq_balancing_disabled(j)) continue; if ( package_index == i ) IRQ_DELTA(package_index,j) = 0; @@ -1281,11 +1281,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else { - irq_desc[irq].status |= IRQ_DELAYED_DISABLE; + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); - } set_intr_gate(vector, interrupt[irq]); } @@ -1588,7 +1586,7 @@ void /*__init*/ print_local_APIC(void * dummy) v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 5785d84103a..0f2ca590bf2 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -10,7 +10,6 @@ * io_apic.c.) */ -#include <asm/uaccess.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/interrupt.h> @@ -21,19 +20,34 @@ #include <asm/idle.h> +#include <asm/apic.h> +#include <asm/uaccess.h> + DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); -#ifndef CONFIG_X86_LOCAL_APIC /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. */ void ack_bad_irq(unsigned int irq) { - printk("unexpected IRQ trap at vector %02x\n", irq); -} + printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); #endif +} #ifdef CONFIG_4KSTACKS /* diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 5d8a07c2028..821df34d2b3 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -23,6 +23,7 @@ #include <linux/dmi.h> #include <linux/kprobes.h> #include <linux/cpumask.h> +#include <linux/kernel_stat.h> #include <asm/smp.h> #include <asm/nmi.h> @@ -973,9 +974,13 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) cpu_clear(cpu, backtrace_mask); } - sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + /* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); - /* if the apic timer isn't firing, this cpu isn't doing much */ + /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { /* * Ayiee, looks like this CPU is stuck ... diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 7845d480c29..bea304d48cd 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -38,6 +38,7 @@ #include <linux/ptrace.h> #include <linux/random.h> #include <linux/personality.h> +#include <linux/tick.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -211,6 +212,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { + tick_nohz_stop_sched_tick(); while (!need_resched()) { void (*idle)(void); @@ -238,6 +240,7 @@ void cpu_idle(void) idle(); __exit_idle(); } + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index f46a4d095e6..48bfcaa13ec 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -94,12 +94,6 @@ cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; -/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there - * is no way to resync one AP against BP. TBD: for prescott and above, we - * should use IA64's algorithm - */ -static int __devinitdata tsc_sync_disabled; - /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_data); @@ -216,151 +210,6 @@ valid_k7: ; } -/* - * TSC synchronization. - * - * We first check whether all CPUs have their TSC's synchronized, - * then we print a warning if not, and always resync. - */ - -static struct { - atomic_t start_flag; - atomic_t count_start; - atomic_t count_stop; - unsigned long long values[NR_CPUS]; -} tsc __cpuinitdata = { - .start_flag = ATOMIC_INIT(0), - .count_start = ATOMIC_INIT(0), - .count_stop = ATOMIC_INIT(0), -}; - -#define NR_LOOPS 5 - -static void __init synchronize_tsc_bp(void) -{ - int i; - unsigned long long t0; - unsigned long long sum, avg; - long long delta; - unsigned int one_usec; - int buggy = 0; - - printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus()); - - /* convert from kcyc/sec to cyc/usec */ - one_usec = cpu_khz / 1000; - - atomic_set(&tsc.start_flag, 1); - wmb(); - - /* - * We loop a few times to get a primed instruction cache, - * then the last pass is more or less synchronized and - * the BP and APs set their cycle counters to zero all at - * once. This reduces the chance of having random offsets - * between the processors, and guarantees that the maximum - * delay between the cycle counters is never bigger than - * the latency of information-passing (cachelines) between - * two CPUs. - */ - for (i = 0; i < NR_LOOPS; i++) { - /* - * all APs synchronize but they loop on '== num_cpus' - */ - while (atomic_read(&tsc.count_start) != num_booting_cpus()-1) - cpu_relax(); - atomic_set(&tsc.count_stop, 0); - wmb(); - /* - * this lets the APs save their current TSC: - */ - atomic_inc(&tsc.count_start); - - rdtscll(tsc.values[smp_processor_id()]); - /* - * We clear the TSC in the last loop: - */ - if (i == NR_LOOPS-1) - write_tsc(0, 0); - - /* - * Wait for all APs to leave the synchronization point: - */ - while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1) - cpu_relax(); - atomic_set(&tsc.count_start, 0); - wmb(); - atomic_inc(&tsc.count_stop); - } - - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (cpu_isset(i, cpu_callout_map)) { - t0 = tsc.values[i]; - sum += t0; - } - } - avg = sum; - do_div(avg, num_booting_cpus()); - - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - delta = tsc.values[i] - avg; - if (delta < 0) - delta = -delta; - /* - * We report bigger than 2 microseconds clock differences. - */ - if (delta > 2*one_usec) { - long long realdelta; - - if (!buggy) { - buggy = 1; - printk("\n"); - } - realdelta = delta; - do_div(realdelta, one_usec); - if (tsc.values[i] < avg) - realdelta = -realdelta; - - if (realdelta) - printk(KERN_INFO "CPU#%d had %Ld usecs TSC " - "skew, fixed it up.\n", i, realdelta); - } - } - if (!buggy) - printk("passed.\n"); -} - -static void __cpuinit synchronize_tsc_ap(void) -{ - int i; - - /* - * Not every cpu is online at the time - * this gets called, so we first wait for the BP to - * finish SMP initialization: - */ - while (!atomic_read(&tsc.start_flag)) - cpu_relax(); - - for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc.count_start); - while (atomic_read(&tsc.count_start) != num_booting_cpus()) - cpu_relax(); - - rdtscll(tsc.values[smp_processor_id()]); - if (i == NR_LOOPS-1) - write_tsc(0, 0); - - atomic_inc(&tsc.count_stop); - while (atomic_read(&tsc.count_stop) != num_booting_cpus()) - cpu_relax(); - } -} -#undef NR_LOOPS - extern void calibrate_delay(void); static atomic_t init_deasserted; @@ -438,20 +287,12 @@ static void __cpuinit smp_callin(void) /* * Save our processor parameters */ - smp_store_cpu_info(cpuid); - - disable_APIC_timer(); + smp_store_cpu_info(cpuid); /* * Allow the master to continue. */ cpu_set(cpuid, cpu_callin_map); - - /* - * Synchronize the TSC with the BP - */ - if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) - synchronize_tsc_ap(); } static int cpucount; @@ -554,13 +395,17 @@ static void __cpuinit start_secondary(void *unused) smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); + /* + * Check TSC synchronization with the BP: + */ + check_tsc_sync_target(); + setup_secondary_clock(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); enable_NMI_through_LVT0(NULL); enable_8259A_irq(0); } - enable_APIC_timer(); /* * low-memory mappings have been cleared, flush them from * the local TLBs too. @@ -752,7 +597,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) /* * Due to the Pentium erratum 3AP. */ - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) { apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); @@ -849,7 +694,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n",j); @@ -1125,8 +970,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) info.cpu = cpu; INIT_WORK(&info.task, do_warm_boot_cpu); - tsc_sync_disabled = 1; - /* init low mem mapping */ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); @@ -1134,7 +977,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) schedule_work(&info.task); wait_for_completion(&done); - tsc_sync_disabled = 0; zap_low_mappings(); ret = 0; exit: @@ -1331,12 +1173,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus) smpboot_setup_io_apic(); setup_boot_clock(); - - /* - * Synchronize the TSC with the AP - */ - if (cpu_has_tsc && cpucount && cpu_khz) - synchronize_tsc_bp(); } /* These are wrappers to interface to the new boot process. Someone @@ -1471,9 +1307,16 @@ int __cpuinit __cpu_up(unsigned int cpu) } local_irq_enable(); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Unleash the CPU! */ cpu_set(cpu, smp_commenced_mask); + + /* + * Check TSC synchronization with the AP: + */ + check_tsc_sync_source(cpu); + while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index a4f67a6e682..a5350059557 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -159,15 +159,6 @@ EXPORT_SYMBOL(profile_pc); */ irqreturn_t timer_interrupt(int irq, void *dev_id) { - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - #ifdef CONFIG_X86_IO_APIC if (timer_ack) { /* @@ -186,7 +177,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) do_timer_interrupt_hook(); - if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. You can't turn them off, nor would you want to (any attempt to @@ -201,18 +191,11 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ } - write_sequnlock(&xtime_lock); - -#ifdef CONFIG_X86_LOCAL_APIC - if (using_apic_timer) - smp_send_timer_broadcast_ipi(); -#endif - return IRQ_HANDLED; } /* not static: needed by APM */ -unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned long retval; unsigned long flags; @@ -225,7 +208,6 @@ unsigned long get_cmos_time(void) return retval; } -EXPORT_SYMBOL(get_cmos_time); static void sync_cmos_clock(unsigned long dummy); @@ -278,114 +260,16 @@ void notify_arch_cmos_timer(void) mod_timer(&sync_cmos_timer, jiffies + 1); } -static long clock_cmos_diff; -static unsigned long sleep_start; - -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - unsigned long ctime = get_cmos_time(); - - clock_cmos_diff = -ctime; - clock_cmos_diff += get_seconds(); - sleep_start = ctime; - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long ctime = get_cmos_time(); - long sleep_length = (ctime - sleep_start) * HZ; - struct timespec ts; - - if (sleep_length < 0) { - printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n"); - /* The time after the resume must not be earlier than the time - * before the suspend or some nasty things will happen - */ - sleep_length = 0; - ctime = sleep_start; - } -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) - hpet_reenable(); -#endif - setup_pit_timer(); - - sec = ctime + clock_cmos_diff; - ts.tv_sec = sec; - ts.tv_nsec = 0; - do_settimeofday(&ts); - write_seqlock_irqsave(&xtime_lock, flags); - jiffies_64 += sleep_length; - write_sequnlock_irqrestore(&xtime_lock, flags); - touch_softlockup_watchdog(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - -#ifdef CONFIG_HPET_TIMER extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - struct timespec ts; - ts.tv_sec = get_cmos_time(); - ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - - do_settimeofday(&ts); - - if ((hpet_enable() >= 0) && hpet_use_timer) { - printk("Using HPET for base-timer\n"); - } - + if (!hpet_enable()) + setup_pit_timer(); do_time_init(); } -#endif void __init time_init(void) { - struct timespec ts; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_capable()) { - /* - * HPET initialization needs to do memory-mapped io. So, let - * us do a late initialization after mem_init(). - */ - late_time_init = hpet_time_init; - return; - } -#endif - ts.tv_sec = get_cmos_time(); - ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - - do_settimeofday(&ts); - - do_time_init(); + late_time_init = hpet_time_init; } diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 46f752a8bbf..3082a418635 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -60,12 +60,6 @@ static inline int check_tsc_unstable(void) return tsc_unstable; } -void mark_tsc_unstable(void) -{ - tsc_unstable = 1; -} -EXPORT_SYMBOL_GPL(mark_tsc_unstable); - /* Accellerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: @@ -222,34 +216,6 @@ out_no_tsc: #ifdef CONFIG_CPU_FREQ -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(struct work_struct *work) -{ - unsigned int cpu; - - for_each_online_cpu(cpu) - cpufreq_get(cpu); - - cpufreq_delayed_issched = 0; -} - -/* - * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - /* * if the CPU frequency is scaled, TSC-based delays will need a different * loops_per_jiffy value to function properly. @@ -313,17 +279,9 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { - int ret; - - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - - return ret; + return cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); } - core_initcall(cpufreq_tsc); #endif @@ -331,7 +289,6 @@ core_initcall(cpufreq_tsc); /* clock source code */ static unsigned long current_tsc_khz = 0; -static int tsc_update_callback(void); static cycle_t read_tsc(void) { @@ -349,37 +306,28 @@ static struct clocksource clocksource_tsc = { .mask = CLOCKSOURCE_MASK(64), .mult = 0, /* to be set */ .shift = 22, - .update_callback = tsc_update_callback, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, }; -static int tsc_update_callback(void) +void mark_tsc_unstable(void) { - int change = 0; - - /* check to see if we should switch to the safe clocksource: */ - if (clocksource_tsc.rating != 0 && check_tsc_unstable()) { - clocksource_tsc.rating = 0; - clocksource_reselect(); - change = 1; - } - - /* only update if tsc_khz has changed: */ - if (current_tsc_khz != tsc_khz) { - current_tsc_khz = tsc_khz; - clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, - clocksource_tsc.shift); - change = 1; + if (!tsc_unstable) { + tsc_unstable = 1; + /* Can be called before registration */ + if (clocksource_tsc.mult) + clocksource_change_rating(&clocksource_tsc, 0); + else + clocksource_tsc.rating = 0; } - - return change; } +EXPORT_SYMBOL_GPL(mark_tsc_unstable); static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) { printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", d->ident); - mark_tsc_unstable(); + tsc_unstable = 1; return 0; } @@ -396,65 +344,44 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { {} }; -#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */ -static struct timer_list verify_tsc_freq_timer; - -/* XXX - Probably should add locking */ -static void verify_tsc_freq(unsigned long unused) -{ - static u64 last_tsc; - static unsigned long last_jiffies; - - u64 now_tsc, interval_tsc; - unsigned long now_jiffies, interval_jiffies; - - - if (check_tsc_unstable()) - return; - - rdtscll(now_tsc); - now_jiffies = jiffies; - - if (!last_jiffies) { - goto out; - } - - interval_jiffies = now_jiffies - last_jiffies; - interval_tsc = now_tsc - last_tsc; - interval_tsc *= HZ; - do_div(interval_tsc, cpu_khz*1000); - - if (interval_tsc < (interval_jiffies * 3 / 4)) { - printk("TSC appears to be running slowly. " - "Marking it as unstable\n"); - mark_tsc_unstable(); - return; - } - -out: - last_tsc = now_tsc; - last_jiffies = now_jiffies; - /* set us up to go off on the next interval: */ - mod_timer(&verify_tsc_freq_timer, - jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL)); -} - /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ -static __init int unsynchronized_tsc(void) +__cpuinit int unsynchronized_tsc(void) { + if (!cpu_has_tsc || tsc_unstable) + return 1; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return 0; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + /* assume multi socket systems are not synchronized: */ + if (num_possible_cpus() > 1) + tsc_unstable = 1; + } + return tsc_unstable; +} + +/* + * Geode_LX - the OLPC CPU has a possibly a very reliable TSC + */ +#ifdef CONFIG_MGEODE_LX +/* RTSC counts during suspend */ +#define RTSC_SUSP 0x100 + +static void __init check_geode_tsc_reliable(void) +{ + unsigned long val; - /* assume multi socket systems are not synchronized: */ - return num_possible_cpus() > 1; + rdmsrl(MSR_GEODE_BUSCONT_CONF0, val); + if ((val & RTSC_SUSP)) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } +#else +static inline void check_geode_tsc_reliable(void) { } +#endif static int __init init_tsc_clocksource(void) { @@ -463,20 +390,16 @@ static int __init init_tsc_clocksource(void) /* check blacklist */ dmi_check_system(bad_tsc_dmi_table); - if (unsynchronized_tsc()) /* mark unstable if unsynced */ - mark_tsc_unstable(); + unsynchronized_tsc(); + check_geode_tsc_reliable(); current_tsc_khz = tsc_khz; clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, clocksource_tsc.shift); /* lower the rating if we already know its unstable: */ - if (check_tsc_unstable()) + if (check_tsc_unstable()) { clocksource_tsc.rating = 0; - - init_timer(&verify_tsc_freq_timer); - verify_tsc_freq_timer.function = verify_tsc_freq; - verify_tsc_freq_timer.expires = - jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL); - add_timer(&verify_tsc_freq_timer); + clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; + } return clocksource_register(&clocksource_tsc); } diff --git a/arch/i386/kernel/tsc_sync.c b/arch/i386/kernel/tsc_sync.c new file mode 100644 index 00000000000..12424629af8 --- /dev/null +++ b/arch/i386/kernel/tsc_sync.c @@ -0,0 +1 @@ +#include "../../x86_64/kernel/tsc_sync.c" diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c index 2e2d8dbcbd6..76d2adcae5a 100644 --- a/arch/i386/kernel/vmitime.c +++ b/arch/i386/kernel/vmitime.c @@ -115,7 +115,7 @@ static struct clocksource clocksource_vmi = { .mask = CLOCKSOURCE_MASK(64), .mult = 0, /* to be set */ .shift = 22, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; diff --git a/arch/i386/mach-default/setup.c b/arch/i386/mach-default/setup.c index cc2f519b2f7..c7881621070 100644 --- a/arch/i386/mach-default/setup.c +++ b/arch/i386/mach-default/setup.c @@ -79,7 +79,12 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .mask = CPU_MASK_NONE, + .name = "timer" +}; /** * time_init_hook - do any specific initialisations for the system timer. @@ -90,6 +95,7 @@ static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, **/ void __init time_init_hook(void) { + irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index 545fcbc8cea..e5e56bd498d 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -307,7 +307,7 @@ static unsigned int __init calibrate_hpt(void) struct clocksource clocksource_mips = { .name = "MIPS", .mask = 0xffffffff, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static void __init init_mips_clocksource(void) diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 39db1289021..5e5c0e4add9 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -305,8 +305,6 @@ static int pmac_pic_host_map(struct irq_host *h, unsigned int virq, level = !!(level_mask[hw >> 5] & (1UL << (hw & 0x1f))); if (level) desc->status |= IRQ_LEVEL; - else - desc->status |= IRQ_DELAYED_DISABLE; set_irq_chip_and_handler(virq, &pmac_pic, level ? handle_level_irq : handle_edge_irq); return 0; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 3b91f27ab20..ee9fd7b8592 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -312,7 +312,7 @@ static struct clocksource clocksource_tod = { .mask = -1ULL, .mult = 1000, .shift = 12, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index 925a65240cf..b2e1fd8e357 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -97,20 +97,22 @@ static int write_sigio_thread(void *unused) static int need_poll(struct pollfds *polls, int n) { - if(n <= polls->size){ - polls->used = n; + struct pollfd *new; + + if(n <= polls->size) return 0; - } - kfree(polls->poll); - polls->poll = um_kmalloc_atomic(n * sizeof(struct pollfd)); - if(polls->poll == NULL){ + + new = um_kmalloc_atomic(n * sizeof(struct pollfd)); + if(new == NULL){ printk("need_poll : failed to allocate new pollfds\n"); - polls->size = 0; - polls->used = 0; return -ENOMEM; } + + memcpy(new, polls->poll, polls->used * sizeof(struct pollfd)); + kfree(polls->poll); + + polls->poll = new; polls->size = n; - polls->used = n; return 0; } @@ -171,15 +173,15 @@ int add_sigio_fd(int fd) goto out; } - n = current_poll.used + 1; - err = need_poll(&next_poll, n); + n = current_poll.used; + err = need_poll(&next_poll, n + 1); if(err) goto out; - for(i = 0; i < current_poll.used; i++) - next_poll.poll[i] = current_poll.poll[i]; - - next_poll.poll[n - 1] = *p; + memcpy(next_poll.poll, current_poll.poll, + current_poll.used * sizeof(struct pollfd)); + next_poll.poll[n] = *p; + next_poll.used = n + 1; update_thread(); out: sigio_unlock(); @@ -214,6 +216,7 @@ int ignore_sigio_fd(int fd) if(p->fd != fd) next_poll.poll[n++] = *p; } + next_poll.used = current_poll.used - 1; update_thread(); out: @@ -331,10 +334,9 @@ void maybe_sigio_broken(int fd, int read) sigio_lock(); err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1); - if(err){ - printk("maybe_sigio_broken - failed to add pollfd\n"); + if(err) goto out; - } + all_sigio_fds.poll[all_sigio_fds.used++] = ((struct pollfd) { .fd = fd, .events = read ? POLLIN : POLLOUT, diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 7982cbc3bc9..56eb14c9847 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -24,6 +24,14 @@ config X86 bool default y +config GENERIC_TIME + bool + default y + +config GENERIC_TIME_VSYSCALL + bool + default y + config ZONE_DMA32 bool default y diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index ae399458024..bb47e86f3d0 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o @@ -19,7 +19,7 @@ obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o +obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o obj-y += apic.o nmi.o obj-y += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 124b2d27b4a..723417d924c 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -37,6 +37,7 @@ #include <asm/idle.h> #include <asm/proto.h> #include <asm/timex.h> +#include <asm/hpet.h> #include <asm/apic.h> int apic_mapped; @@ -763,7 +764,7 @@ static void setup_APIC_timer(unsigned int clocks) local_irq_save(flags); /* wait for irq slice */ - if (vxtime.hpet_address && hpet_use_timer) { + if (hpet_address && hpet_use_timer) { int trigger = hpet_readl(HPET_T0_CMP); while (hpet_readl(HPET_COUNTER) >= trigger) /* do nothing */ ; @@ -785,7 +786,7 @@ static void setup_APIC_timer(unsigned int clocks) /* Turn off PIT interrupt if we use APIC timer as main timer. Only works with the PM timer right now TBD fix it for HPET too. */ - if (vxtime.mode == VXTIME_PMTMR && + if ((pmtmr_ioport != 0) && smp_processor_id() == boot_cpu_id && apic_runs_main_timer == 1 && !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { diff --git a/arch/i386/kernel/time_hpet.c b/arch/x86_64/kernel/hpet.c index 1e4702dfcd0..65a0edd71a1 100644 --- a/arch/i386/kernel/time_hpet.c +++ b/arch/x86_64/kernel/hpet.c @@ -1,224 +1,138 @@ -/* - * linux/arch/i386/kernel/time_hpet.c - * This code largely copied from arch/x86_64/kernel/time.c - * See that file for credits. - * - * 2003-06-30 Venkatesh Pallipadi - Additional changes for HPET support - */ - -#include <linux/errno.h> #include <linux/kernel.h> -#include <linux/param.h> -#include <linux/string.h> +#include <linux/sched.h> #include <linux/init.h> -#include <linux/smp.h> +#include <linux/mc146818rtc.h> +#include <linux/time.h> +#include <linux/clocksource.h> +#include <linux/ioport.h> +#include <linux/acpi.h> +#include <linux/hpet.h> +#include <asm/pgtable.h> +#include <asm/vsyscall.h> +#include <asm/timex.h> +#include <asm/hpet.h> -#include <asm/timer.h> -#include <asm/fixmap.h> -#include <asm/apic.h> +int nohpet __initdata; -#include <linux/timex.h> +unsigned long hpet_address; +unsigned long hpet_period; /* fsecs / HPET clock */ +unsigned long hpet_tick; /* HPET clocks / interrupt */ -#include <asm/hpet.h> -#include <linux/hpet.h> +int hpet_use_timer; /* Use counter of hpet for time keeping, + * otherwise PIT + */ -static unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* hpet clks count per tick */ -unsigned long hpet_address; /* hpet memory map physical address */ -int hpet_use_timer; +#ifdef CONFIG_HPET +static __init int late_hpet_init(void) +{ + struct hpet_data hd; + unsigned int ntimer; -static int use_hpet; /* can be used for runtime check of hpet */ -static int boot_hpet_disable; /* boottime override for HPET timer */ -static void __iomem * hpet_virt_address; /* hpet kernel virtual address */ + if (!hpet_address) + return 0; -#define FSEC_TO_USEC (1000000000UL) + memset(&hd, 0, sizeof(hd)); -int hpet_readl(unsigned long a) -{ - return readl(hpet_virt_address + a); -} + ntimer = hpet_readl(HPET_ID); + ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + ntimer++; -static void hpet_writel(unsigned long d, unsigned long a) -{ - writel(d, hpet_virt_address + a); -} + /* + * Register with driver. + * Timer0 and Timer1 is used by platform. + */ + hd.hd_phys_address = hpet_address; + hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); + hd.hd_nirqs = ntimer; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + if (ntimer > 2) { + struct hpet *hpet; + struct hpet_timer *timer; + int i; + + hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); + timer = &hpet->hpet_timers[2]; + for (i = 2; i < ntimer; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & + Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; -#ifdef CONFIG_X86_LOCAL_APIC -/* - * HPET counters dont wrap around on every tick. They just change the - * comparator value and continue. Next tick can be caught by checking - * for a change in the comparator value. Used in apic.c. - */ -static void __devinit wait_hpet_tick(void) -{ - unsigned int start_cmp_val, end_cmp_val; + } - start_cmp_val = hpet_readl(HPET_T0_CMP); - do { - end_cmp_val = hpet_readl(HPET_T0_CMP); - } while (start_cmp_val == end_cmp_val); + hpet_alloc(&hd); + return 0; } +fs_initcall(late_hpet_init); #endif -static int hpet_timer_stop_set_go(unsigned long tick) +int hpet_timer_stop_set_go(unsigned long tick) { unsigned int cfg; - /* - * Stop the timers and reset the main counter. - */ +/* + * Stop the timers and reset the main counter. + */ + cfg = hpet_readl(HPET_CFG); - cfg &= ~HPET_CFG_ENABLE; + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); hpet_writel(cfg, HPET_CFG); hpet_writel(0, HPET_COUNTER); hpet_writel(0, HPET_COUNTER + 4); +/* + * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, + * and period also hpet_tick. + */ if (hpet_use_timer) { - /* - * Set up timer 0, as periodic with first interrupt to happen at - * hpet_tick, and period also hpet_tick. - */ - cfg = hpet_readl(HPET_T0_CFG); - cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | - HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); - - /* - * The first write after writing TN_SETVAL to the config register sets - * the counter value, the second write sets the threshold. - */ - hpet_writel(tick, HPET_T0_CMP); - hpet_writel(tick, HPET_T0_CMP); - } - /* - * Go! - */ - cfg = hpet_readl(HPET_CFG); - if (hpet_use_timer) + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT, HPET_T0_CFG); + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ cfg |= HPET_CFG_LEGACY; + } +/* + * Go! + */ + cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); return 0; } -/* - * Check whether HPET was found by ACPI boot parse. If yes setup HPET - * counter 0 for kernel base timer. - */ -int __init hpet_enable(void) +int hpet_arch_init(void) { unsigned int id; - unsigned long tick_fsec_low, tick_fsec_high; /* tick in femto sec */ - unsigned long hpet_tick_rem; - if (boot_hpet_disable) + if (!hpet_address) return -1; + set_fixmap_nocache(FIX_HPET_BASE, hpet_address); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + +/* + * Read the period, compute tick and quotient. + */ - if (!hpet_address) { - return -1; - } - hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); - /* - * Read the period, compute tick and quotient. - */ id = hpet_readl(HPET_ID); - /* - * We are checking for value '1' or more in number field if - * CONFIG_HPET_EMULATE_RTC is set because we will need an - * additional timer for RTC emulation. - * However, we can do with one timer otherwise using the - * the single HPET timer for system time. - */ -#ifdef CONFIG_HPET_EMULATE_RTC - if (!(id & HPET_ID_NUMBER)) { - iounmap(hpet_virt_address); - hpet_virt_address = NULL; + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) return -1; - } -#endif - hpet_period = hpet_readl(HPET_PERIOD); - if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) { - iounmap(hpet_virt_address); - hpet_virt_address = NULL; + if (hpet_period < 100000 || hpet_period > 100000000) return -1; - } - /* - * 64 bit math - * First changing tick into fsec - * Then 64 bit div to find number of hpet clk per tick - */ - ASM_MUL64_REG(tick_fsec_low, tick_fsec_high, - KERNEL_TICK_USEC, FSEC_TO_USEC); - ASM_DIV64_REG(hpet_tick, hpet_tick_rem, - hpet_period, tick_fsec_low, tick_fsec_high); - - if (hpet_tick_rem > (hpet_period >> 1)) - hpet_tick++; /* rounding the result */ - - hpet_use_timer = id & HPET_ID_LEGSUP; - - if (hpet_timer_stop_set_go(hpet_tick)) { - iounmap(hpet_virt_address); - hpet_virt_address = NULL; - return -1; - } + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - use_hpet = 1; + hpet_use_timer = (id & HPET_ID_LEGSUP); -#ifdef CONFIG_HPET - { - struct hpet_data hd; - unsigned int ntimer; - - memset(&hd, 0, sizeof (hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. - */ - hd.hd_phys_address = hpet_address; - hd.hd_address = hpet_virt_address; - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet __iomem *hpet; - struct hpet_timer __iomem *timer; - int i; - - hpet = hpet_virt_address; - - for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer; - timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - - } - - hpet_alloc(&hd); - } -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - if (hpet_use_timer) - wait_timer_tick = wait_hpet_tick; -#endif - return 0; + return hpet_timer_stop_set_go(hpet_tick); } int hpet_reenable(void) @@ -226,28 +140,51 @@ int hpet_reenable(void) return hpet_timer_stop_set_go(hpet_tick); } -int is_hpet_enabled(void) -{ - return use_hpet; -} +/* + * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing + * it to the HPET timer of known frequency. + */ -int is_hpet_capable(void) +#define TICK_COUNT 100000000 +#define TICK_MIN 5000 + +/* + * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none + * occurs between the reads of the hpet & TSC. + */ +static void __init read_hpet_tsc(int *hpet, int *tsc) { - if (!boot_hpet_disable && hpet_address) - return 1; - return 0; + int tsc1, tsc2, hpet1; + + do { + tsc1 = get_cycles_sync(); + hpet1 = hpet_readl(HPET_COUNTER); + tsc2 = get_cycles_sync(); + } while (tsc2 - tsc1 > TICK_MIN); + *hpet = hpet1; + *tsc = tsc2; } -static int __init hpet_setup(char* str) +unsigned int __init hpet_calibrate_tsc(void) { - if (str) { - if (!strncmp("disable", str, 7)) - boot_hpet_disable = 1; - } - return 1; -} + int tsc_start, hpet_start; + int tsc_now, hpet_now; + unsigned long flags; + + local_irq_save(flags); + + read_hpet_tsc(&hpet_start, &tsc_start); -__setup("hpet=", hpet_setup); + do { + local_irq_disable(); + read_hpet_tsc(&hpet_now, &tsc_now); + local_irq_restore(flags); + } while ((tsc_now - tsc_start) < TICK_COUNT && + (hpet_now - hpet_start) < TICK_COUNT); + + return (tsc_now - tsc_start) * 1000000000L + / ((hpet_now - hpet_start) * hpet_period / 1000); +} #ifdef CONFIG_HPET_EMULATE_RTC /* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET @@ -264,7 +201,6 @@ __setup("hpet=", hpet_setup); * For (3), we use interrupts at 64Hz or user specified periodic * frequency, whichever is higher. */ -#include <linux/mc146818rtc.h> #include <linux/rtc.h> #define DEFAULT_RTC_INT_FREQ 64 @@ -283,6 +219,11 @@ static unsigned long PIE_count; static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ static unsigned int hpet_t1_cmp; /* cached comparator register */ +int is_hpet_enabled(void) +{ + return hpet_address != 0; +} + /* * Timer 1 for RTC, we do not use periodic interrupt feature, * even if HPET supports periodic interrupts on Timer 1. @@ -367,8 +308,9 @@ static void hpet_rtc_timer_reinit(void) if (PIE_on) PIE_count += lost_ints; - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); } } @@ -450,7 +392,7 @@ int hpet_rtc_dropped_irq(void) return 1; } -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) { struct rtc_time curr_time; unsigned long rtc_int_flag = 0; @@ -495,3 +437,75 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) } #endif +static int __init nohpet_setup(char *s) +{ + nohpet = 1; + return 1; +} + +__setup("nohpet", nohpet_setup); + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static cycle_t __vsyscall_fn vread_hpet(void) +{ + return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .vread = vread_hpet, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem *hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * aproximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index d73c79e821f..01e2cf0bdeb 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -103,6 +103,7 @@ static void mask_and_ack_8259A(unsigned int); static struct irq_chip i8259A_chip = { .name = "XT-PIC", .mask = disable_8259A_irq, + .disable = disable_8259A_irq, .unmask = enable_8259A_irq, .mask_ack = mask_and_ack_8259A, }; diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 566e64d966c..950682f3576 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -810,11 +810,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else { - irq_desc[irq].status |= IRQ_DELAYED_DISABLE; + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); - } } static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) { diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c index 7554458dc9c..ae8f91214f1 100644 --- a/arch/x86_64/kernel/pmtimer.c +++ b/arch/x86_64/kernel/pmtimer.c @@ -24,15 +24,6 @@ #include <asm/msr.h> #include <asm/vsyscall.h> -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport __read_mostly; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ static inline u32 cyc2us(u32 cycles) @@ -48,38 +39,6 @@ static inline u32 cyc2us(u32 cycles) return (cycles >> 10); } -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - static unsigned pmtimer_wait_tick(void) { u32 a, b; @@ -101,23 +60,6 @@ void pmtimer_wait(unsigned us) } while (cyc2us(b - a) < us); } -void pmtimer_resume(void) -{ - last_pmtmr_tick = inl(pmtmr_ioport); -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - static int __init nopmtimer_setup(char *s) { pmtmr_ioport = 0; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index daf19332f0d..35443729aad 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -148,217 +148,6 @@ static void __cpuinit smp_store_cpu_info(int id) print_cpu_info(c); } -/* - * New Funky TSC sync algorithm borrowed from IA64. - * Main advantage is that it doesn't reset the TSCs fully and - * in general looks more robust and it works better than my earlier - * attempts. I believe it was written by David Mosberger. Some minor - * adjustments for x86-64 by me -AK - * - * Original comment reproduced below. - * - * Synchronize TSC of the current (slave) CPU with the TSC of the - * MASTER CPU (normally the time-keeper CPU). We use a closed loop to - * eliminate the possibility of unaccounted-for errors (such as - * getting a machine check in the middle of a calibration step). The - * basic idea is for the slave to ask the master what itc value it has - * and to read its own itc before and after the master responds. Each - * iteration gives us three timestamps: - * - * slave master - * - * t0 ---\ - * ---\ - * ---> - * tm - * /--- - * /--- - * t1 <--- - * - * - * The goal is to adjust the slave's TSC such that tm falls exactly - * half-way between t0 and t1. If we achieve this, the clocks are - * synchronized provided the interconnect between the slave and the - * master is symmetric. Even if the interconnect were asymmetric, we - * would still know that the synchronization error is smaller than the - * roundtrip latency (t0 - t1). - * - * When the interconnect is quiet and symmetric, this lets us - * synchronize the TSC to within one or two cycles. However, we can - * only *guarantee* that the synchronization is accurate to within a - * round-trip time, which is typically in the range of several hundred - * cycles (e.g., ~500 cycles). In practice, this means that the TSCs - * are usually almost perfectly synchronized, but we shouldn't assume - * that the accuracy is much better than half a micro second or so. - * - * [there are other errors like the latency of RDTSC and of the - * WRMSR. These can also account to hundreds of cycles. So it's - * probably worse. It claims 153 cycles error on a dual Opteron, - * but I suspect the numbers are actually somewhat worse -AK] - */ - -#define MASTER 0 -#define SLAVE (SMP_CACHE_BYTES/8) - -/* Intentionally don't use cpu_relax() while TSC synchronization - because we don't want to go into funky power save modi or cause - hypervisors to schedule us away. Going to sleep would likely affect - latency and low latency is the primary objective here. -AK */ -#define no_cpu_relax() barrier() - -static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); -static volatile __cpuinitdata unsigned long go[SLAVE + 1]; -static int notscsync __cpuinitdata; - -#undef DEBUG_TSC_SYNC - -#define NUM_ROUNDS 64 /* magic value */ -#define NUM_ITERS 5 /* likewise */ - -/* Callback on boot CPU */ -static __cpuinit void sync_master(void *arg) -{ - unsigned long flags, i; - - go[MASTER] = 0; - - local_irq_save(flags); - { - for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { - while (!go[MASTER]) - no_cpu_relax(); - go[MASTER] = 0; - rdtscll(go[SLAVE]); - } - } - local_irq_restore(flags); -} - -/* - * Return the number of cycles by which our tsc differs from the tsc - * on the master (time-keeper) CPU. A positive number indicates our - * tsc is ahead of the master, negative that it is behind. - */ -static inline long -get_delta(long *rt, long *master) -{ - unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; - unsigned long tcenter, t0, t1, tm; - int i; - - for (i = 0; i < NUM_ITERS; ++i) { - rdtscll(t0); - go[MASTER] = 1; - while (!(tm = go[SLAVE])) - no_cpu_relax(); - go[SLAVE] = 0; - rdtscll(t1); - - if (t1 - t0 < best_t1 - best_t0) - best_t0 = t0, best_t1 = t1, best_tm = tm; - } - - *rt = best_t1 - best_t0; - *master = best_tm - best_t0; - - /* average best_t0 and best_t1 without overflow: */ - tcenter = (best_t0/2 + best_t1/2); - if (best_t0 % 2 + best_t1 % 2 == 2) - ++tcenter; - return tcenter - best_tm; -} - -static __cpuinit void sync_tsc(unsigned int master) -{ - int i, done = 0; - long delta, adj, adjust_latency = 0; - unsigned long flags, rt, master_time_stamp, bound; -#ifdef DEBUG_TSC_SYNC - static struct syncdebug { - long rt; /* roundtrip time */ - long master; /* master's timestamp */ - long diff; /* difference between midpoint and master's timestamp */ - long lat; /* estimate of tsc adjustment latency */ - } t[NUM_ROUNDS] __cpuinitdata; -#endif - - printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", - smp_processor_id(), master); - - go[MASTER] = 1; - - /* It is dangerous to broadcast IPI as cpus are coming up, - * as they may not be ready to accept them. So since - * we only need to send the ipi to the boot cpu direct - * the message, and avoid the race. - */ - smp_call_function_single(master, sync_master, NULL, 1, 0); - - while (go[MASTER]) /* wait for master to be ready */ - no_cpu_relax(); - - spin_lock_irqsave(&tsc_sync_lock, flags); - { - for (i = 0; i < NUM_ROUNDS; ++i) { - delta = get_delta(&rt, &master_time_stamp); - if (delta == 0) { - done = 1; /* let's lock on to this... */ - bound = rt; - } - - if (!done) { - unsigned long t; - if (i > 0) { - adjust_latency += -delta; - adj = -delta + adjust_latency/4; - } else - adj = -delta; - - rdtscll(t); - wrmsrl(MSR_IA32_TSC, t + adj); - } -#ifdef DEBUG_TSC_SYNC - t[i].rt = rt; - t[i].master = master_time_stamp; - t[i].diff = delta; - t[i].lat = adjust_latency/4; -#endif - } - } - spin_unlock_irqrestore(&tsc_sync_lock, flags); - -#ifdef DEBUG_TSC_SYNC - for (i = 0; i < NUM_ROUNDS; ++i) - printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", - t[i].rt, t[i].master, t[i].diff, t[i].lat); -#endif - - printk(KERN_INFO - "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " - "maxerr %lu cycles)\n", - smp_processor_id(), master, delta, rt); -} - -static void __cpuinit tsc_sync_wait(void) -{ - /* - * When the CPU has synchronized TSCs assume the BIOS - * or the hardware already synced. Otherwise we could - * mess up a possible perfect synchronization with a - * not-quite-perfect algorithm. - */ - if (notscsync || !cpu_has_tsc || !unsynchronized_tsc()) - return; - sync_tsc(0); -} - -static __init int notscsync_setup(char *s) -{ - notscsync = 1; - return 1; -} -__setup("notscsync", notscsync_setup); - static atomic_t init_deasserted __cpuinitdata; /* @@ -546,6 +335,11 @@ void __cpuinit start_secondary(void) /* otherwise gcc will move up the smp_processor_id before the cpu_init */ barrier(); + /* + * Check TSC sync first: + */ + check_tsc_sync_target(); + Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); setup_secondary_APIC_clock(); @@ -565,14 +359,6 @@ void __cpuinit start_secondary(void) */ set_cpu_sibling_map(smp_processor_id()); - /* - * Wait for TSC sync to not schedule things before. - * We still process interrupts, which could see an inconsistent - * time in that window unfortunately. - * Do this here because TSC sync has global unprotected state. - */ - tsc_sync_wait(); - /* * We need to hold call_lock, so there is no inconsistency * between the time smp_call_function() determines number of @@ -592,6 +378,7 @@ void __cpuinit start_secondary(void) cpu_set(smp_processor_id(), cpu_online_map); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; spin_unlock(&vector_lock); + unlock_ipi_call_lock(); cpu_idle(); @@ -1168,6 +955,11 @@ int __cpuinit __cpu_up(unsigned int cpu) /* Unleash the CPU! */ Dprintk("waiting for cpu %d\n", cpu); + /* + * Make sure and check TSC sync: + */ + check_tsc_sync_source(cpu); + while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); @@ -1190,7 +982,6 @@ void __init smp_cpus_done(unsigned int max_cpus) smp_cleanup_boot(); setup_ioapic_dest(); check_nmi_watchdog(); - time_init_gtod(); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 3cc6886f1fb..a87c5170562 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -42,10 +42,8 @@ #include <linux/cpufreq.h> #include <linux/hpet.h> #include <asm/apic.h> +#include <asm/hpet.h> -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif extern void i8254_timer_resume(void); extern int using_apic_timer; @@ -55,128 +53,7 @@ DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); -int nohpet __initdata = 0; -static int notsc __initdata = 0; - -#define USEC_PER_TICK (USEC_PER_SEC / HZ) -#define NSEC_PER_TICK (NSEC_PER_SEC / HZ) -#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) - -#define NS_SCALE 10 /* 2^10, carefully chosen */ -#define US_SCALE 32 /* 2^32, arbitralrily chosen */ - -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -EXPORT_SYMBOL(cpu_khz); -static unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* HPET clocks / interrupt */ -int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ -unsigned long vxtime_hz = PIT_TICK_RATE; -int report_lost_ticks; /* command line option */ -unsigned long long monotonic_base; - -struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ - volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -struct timespec __xtime __section_xtime; -struct timezone __sys_tz __section_sys_tz; - -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. - */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - t = get_cycles_sync(); - if (t < vxtime.last_tsc) - t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. - */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / NSEC_PER_USEC; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. -AK */ - usec += do_gettimeoffset(); - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / USEC_PER_SEC; - tv->tv_usec = usec % USEC_PER_SEC; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. - */ - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * NSEC_PER_USEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); unsigned long profile_pc(struct pt_regs *regs) { @@ -267,84 +144,9 @@ static void set_rtc_mmss(unsigned long nowtime) } -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -static inline unsigned long long cycles_2_ns(unsigned long long cyc); -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *= NSEC_PER_TICK / hpet_tick; - } else { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - this_offset = get_cycles_sync(); - offset = cycles_2_ns(this_offset - last_offset); - } - return base + offset; -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost) -{ - static long lost_count; - static int warned; - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer tick(s)! ", lost); - print_symbol("rip %s)\n", get_irq_regs()->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", get_irq_regs()->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - Give cpufreq a change to catch up. */ - if ((lost_count+1) % 25 == 0) - cpufreq_delayed_get(); -#endif -} - void main_timer_handler(void) { static unsigned long rtc_update = 0; - unsigned long tsc; - int delay = 0, offset = 0, lost = 0; - /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -354,72 +156,11 @@ void main_timer_handler(void) write_seqlock(&xtime_lock); - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. - */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else if (!pmtmr_ioport) { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - tsc = get_cycles_sync(); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; - - if (offset < 0) - offset = 0; - - if (offset > USEC_PER_TICK) { - lost = offset / USEC_PER_TICK; - offset %= USEC_PER_TICK; - } - - monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc); - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) < offset) - vxtime.last_tsc = tsc - - (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) - handle_lost_ticks(lost); - else - lost = 0; - /* * Do the timer stuff. */ - do_timer(lost + 1); + do_timer(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif @@ -460,40 +201,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static unsigned int cyc2ns_scale __read_mostly; - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; -} - -unsigned long long sched_clock(void) -{ - unsigned long a = 0; - -#if 0 - /* Don't do a HPET read here. Using TSC always is much faster - and HPET may not be mapped yet when the scheduler first runs. - Disadvantage is a small drift between CPUs in some configurations, - but that should be tolerable. */ - if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; -#endif - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - which means it is not completely exact and may not be monotonous between - CPUs. But the errors should be too small to matter for scheduling - purposes. */ - - rdtscll(a); - return cycles_2_ns(a); -} - static unsigned long get_cmos_time(void) { unsigned int year, mon, day, hour, min, sec; @@ -545,164 +252,6 @@ static unsigned long get_cmos_time(void) return mktime(year, mon, day, hour, min, sec); } -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - changes. - - RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - not that important because current Opteron setups do not support - scaling on SMP anyroads. - - Should fix up last_tsc too. Currently gettimeofday in the - first tick after the change will be slightly wrong. */ - -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(struct work_struct *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG - "Losing some ticks... checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long cpu_khz_ref = 0; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data[freq->cpu].loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - cpu_khz_ref = cpu_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - } - - set_cyc2ns_scale(cpu_khz_ref); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - -/* - * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing - * it to the HPET timer of known frequency. - */ - -#define TICK_COUNT 100000000 -#define TICK_MIN 5000 -#define MAX_READ_RETRIES 5 - -/* - * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none - * occurs between the reads of the hpet & TSC. - */ -static void __init read_hpet_tsc(int *hpet, int *tsc) -{ - int tsc1, tsc2, hpet1, retries = 0; - static int msg; - - do { - tsc1 = get_cycles_sync(); - hpet1 = hpet_readl(HPET_COUNTER); - tsc2 = get_cycles_sync(); - } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES); - if (retries >= MAX_READ_RETRIES && !msg++) - printk(KERN_WARNING - "hpet.c: exceeded max retries to read HPET & TSC\n"); - *hpet = hpet1; - *tsc = tsc2; -} - - -static unsigned int __init hpet_calibrate_tsc(void) -{ - int tsc_start, hpet_start; - int tsc_now, hpet_now; - unsigned long flags; - - local_irq_save(flags); - local_irq_disable(); - - read_hpet_tsc(&hpet_start, &tsc_start); - - do { - local_irq_disable(); - read_hpet_tsc(&hpet_now, &tsc_now); - local_irq_restore(flags); - } while ((tsc_now - tsc_start) < TICK_COUNT && - (hpet_now - hpet_start) < TICK_COUNT); - - return (tsc_now - tsc_start) * 1000000000L - / ((hpet_now - hpet_start) * hpet_period / 1000); -} - /* * pit_calibrate_tsc() uses the speaker output (channel 2) of @@ -733,124 +282,6 @@ static unsigned int __init pit_calibrate_tsc(void) return (end - start) / 50; } -#ifdef CONFIG_HPET -static __init int late_hpet_init(void) -{ - struct hpet_data hd; - unsigned int ntimer; - - if (!vxtime.hpet_address) - return 0; - - memset(&hd, 0, sizeof (hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. - */ - hd.hd_phys_address = vxtime.hpet_address; - hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet *hpet; - struct hpet_timer *timer; - int i; - - hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); - timer = &hpet->hpet_timers[2]; - for (i = 2; i < ntimer; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - - } - - hpet_alloc(&hd); - return 0; -} -fs_initcall(late_hpet_init); -#endif - -static int hpet_timer_stop_set_go(unsigned long tick) -{ - unsigned int cfg; - -/* - * Stop the timers and reset the main counter. - */ - - cfg = hpet_readl(HPET_CFG); - cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); - -/* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. - */ - if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ - hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ - cfg |= HPET_CFG_LEGACY; - } -/* - * Go! - */ - - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_init(void) -{ - unsigned int id; - - if (!vxtime.hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); - __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - -/* - * Read the period, compute tick and quotient. - */ - - id = hpet_readl(HPET_ID); - - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) - return -1; - - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; - - hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - - hpet_use_timer = (id & HPET_ID_LEGSUP); - - return hpet_timer_stop_set_go(hpet_tick); -} - -static int hpet_reenable(void) -{ - return hpet_timer_stop_set_go(hpet_tick); -} - #define PIT_MODE 0x43 #define PIT_CH0 0x40 @@ -878,7 +309,7 @@ void __init pit_stop_interrupt(void) void __init stop_timer_interrupt(void) { char *name; - if (vxtime.hpet_address) { + if (hpet_address) { name = "HPET"; hpet_timer_stop_set_go(0); } else { @@ -888,12 +319,6 @@ void __init stop_timer_interrupt(void) printk(KERN_INFO "timer: %s interrupt stopped.\n", name); } -int __init time_setup(char *str) -{ - report_lost_ticks = 1; - return 1; -} - static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; @@ -901,124 +326,41 @@ static struct irqaction irq0 = { void __init time_init(void) { if (nohpet) - vxtime.hpet_address = 0; - + hpet_address = 0; xtime.tv_sec = get_cmos_time(); xtime.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) - vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; - else - vxtime.hpet_address = 0; + if (hpet_arch_init()) + hpet_address = 0; if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport && !vxtime.hpet_address) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } - vxtime.mode = VXTIME_TSC; - vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - vxtime.last_tsc = get_cycles_sync(); - set_cyc2ns_scale(cpu_khz); - setup_irq(0, &irq0); - -#ifndef CONFIG_SMP - time_init_gtod(); -#endif -} - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. - */ -__cpuinit int unsynchronized_tsc(void) -{ -#ifdef CONFIG_SMP - if (apic_is_clustered_box()) - return 1; -#endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { -#ifdef CONFIG_ACPI - /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) - return 1; -#endif - return 0; - } - - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; -} - -/* - * Decide what mode gettimeofday should use. - */ -void time_init_gtod(void) -{ - char *timetype; - if (unsynchronized_tsc()) - notsc = 1; + mark_tsc_unstable(); - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) vgetcpu_mode = VGETCPU_RDTSCP; else vgetcpu_mode = VGETCPU_LSL; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype); + set_cyc2ns_scale(cpu_khz); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - vxtime.last_tsc = get_cycles_sync(); - - set_cyc2ns_scale(cpu_khz); + setup_irq(0, &irq0); } -__setup("report_lost_ticks", time_setup); static long clock_cmos_diff; static unsigned long sleep_start; @@ -1055,7 +397,7 @@ static int timer_resume(struct sys_device *dev) sleep_length = 0; ctime = sleep_start; } - if (vxtime.hpet_address) + if (hpet_address) hpet_reenable(); else i8254_timer_resume(); @@ -1064,20 +406,8 @@ static int timer_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock,flags); xtime.tv_sec = sec; xtime.tv_nsec = 0; - if (vxtime.mode == VXTIME_HPET) { - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - pmtimer_resume(); -#endif - } else - vxtime.last_tsc = get_cycles_sync(); - write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; - monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); + write_sequnlock_irqrestore(&xtime_lock,flags); touch_softlockup_watchdog(); return 0; } @@ -1103,270 +433,3 @@ static int time_init_device(void) } device_initcall(time_init_device); - -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. - * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. - */ -#include <linux/rtc.h> - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ -static unsigned int hpet_t1_cmp; /* cached comparator register */ - -int is_hpet_enabled(void) -{ - return vxtime.hpet_address != 0; -} - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. - */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; - - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - local_irq_restore(flags); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt, ticks_per_int, lost_ints; - - if (unlikely(!(PIE_on | AIE_on | UIE_on))) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; - hpet_t1_cmp += ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - /* - * If the interrupt handler was delayed too long, the write above tries - * to schedule the next interrupt in the past and the hardware would - * not interrupt until the counter had wrapped around. - * So we have to check that the comparator wasn't set to a past time. - */ - cnt = hpet_readl(HPET_COUNTER); - if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { - lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; - /* Make sure that, even with the time needed to execute - * this code, the next scheduled interrupt has been moved - * back to the future: */ - lost_ints++; - - hpet_t1_cmp += lost_ints * ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - if (PIE_on) - PIE_count += lost_ints; - - if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); - } -} - -/* - * The functions below are called from rtc driver. - * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. - */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - (curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id); - } - return IRQ_HANDLED; -} -#endif - -static int __init nohpet_setup(char *s) -{ - nohpet = 1; - return 1; -} - -__setup("nohpet", nohpet_setup); - -int __init notsc_setup(char *s) -{ - notsc = 1; - return 1; -} - -__setup("notsc", notsc_setup); diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c new file mode 100644 index 00000000000..89583186501 --- /dev/null +++ b/arch/x86_64/kernel/tsc.c @@ -0,0 +1,226 @@ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/clocksource.h> +#include <linux/time.h> +#include <linux/acpi.h> +#include <linux/cpufreq.h> + +#include <asm/timex.h> + +static int notsc __initdata = 0; + +unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); + +static unsigned int cyc2ns_scale __read_mostly; + +void set_cyc2ns_scale(unsigned long khz) +{ + cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; +} + +static unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> NS_SCALE; +} + +unsigned long long sched_clock(void) +{ + unsigned long a = 0; + + /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, + * which means it is not completely exact and may not be monotonous + * between CPUs. But the errors should be too small to matter for + * scheduling purposes. + */ + + rdtscll(a); + return cycles_2_ns(a); +} + +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} +#ifdef CONFIG_CPU_FREQ + +/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency + * changes. + * + * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's + * not that important because current Opteron setups do not support + * scaling on SMP anyroads. + * + * Should fix up last_tsc too. Currently gettimeofday in the + * first tick after the change will be slightly wrong. + */ + +#include <linux/workqueue.h> + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(struct work_struct *v) +{ + unsigned int cpu; + for_each_online_cpu(cpu) { + cpufreq_get(cpu); + } + cpufreq_delayed_issched = 0; +} + +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; + +static unsigned long cpu_khz_ref = 0; + +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long *lpj, dummy; + + if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) + return 0; + + lpj = &dummy; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) +#ifdef CONFIG_SMP + lpj = &cpu_data[freq->cpu].loops_per_jiffy; +#else + lpj = &boot_cpu_data.loops_per_jiffy; +#endif + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = *lpj; + cpu_khz_ref = cpu_khz; + } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + *lpj = + cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + mark_tsc_unstable(); + } + + set_cyc2ns_scale(cpu_khz_ref); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); + if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER)) + cpufreq_init = 1; + return 0; +} + +core_initcall(cpufreq_tsc); + +#endif + +static int tsc_unstable = 0; + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +__cpuinit int unsynchronized_tsc(void) +{ + if (tsc_unstable) + return 1; + +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; +#endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) + return 1; +#endif + return 0; + } + + /* Assume multi socket systems are not synchronized */ + return num_present_cpus() > 1; +} + +int __init notsc_setup(char *s) +{ + notsc = 1; + return 1; +} + +__setup("notsc", notsc_setup); + + +/* clock source code: */ +static cycle_t read_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles_sync(); + return ret; +} + +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles_sync(); + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, + .vread = vread_tsc, +}; + +void mark_tsc_unstable(void) +{ + if (!tsc_unstable) { + tsc_unstable = 1; + /* Change only the rating, when not registered */ + if (clocksource_tsc.mult) + clocksource_change_rating(&clocksource_tsc, 0); + else + clocksource_tsc.rating = 0; + } +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +static int __init init_tsc_clocksource(void) +{ + if (!notsc) { + clocksource_tsc.mult = clocksource_khz2mult(cpu_khz, + clocksource_tsc.shift); + if (check_tsc_unstable()) + clocksource_tsc.rating = 0; + + return clocksource_register(&clocksource_tsc); + } + return 0; +} + +module_init(init_tsc_clocksource); diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c new file mode 100644 index 00000000000..014f0db45df --- /dev/null +++ b/arch/x86_64/kernel/tsc_sync.c @@ -0,0 +1,187 @@ +/* + * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization. + * + * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar + * + * We check whether all boot CPUs have their TSC's synchronized, + * print a warning if not and turn off the TSC clock-source. + * + * The warp-check is point-to-point between two CPUs, the CPU + * initiating the bootup is the 'source CPU', the freshly booting + * CPU is the 'target CPU'. + * + * Only two CPUs may participate - they can enter in any order. + * ( The serial nature of the boot logic and the CPU hotplug lock + * protects against more than 2 CPUs entering this code. ) + */ +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/nmi.h> +#include <asm/tsc.h> + +/* + * Entry/exit counters that make sure that both CPUs + * run the measurement code at once: + */ +static __cpuinitdata atomic_t start_count; +static __cpuinitdata atomic_t stop_count; + +/* + * We use a raw spinlock in this exceptional case, because + * we want to have the fastest, inlined, non-debug version + * of a critical section, to be able to prove TSC time-warps: + */ +static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata cycles_t last_tsc; +static __cpuinitdata cycles_t max_warp; +static __cpuinitdata int nr_warps; + +/* + * TSC-warp measurement loop running on both CPUs: + */ +static __cpuinit void check_tsc_warp(void) +{ + cycles_t start, now, prev, end; + int i; + + start = get_cycles_sync(); + /* + * The measurement runs for 20 msecs: + */ + end = start + cpu_khz * 20ULL; + now = start; + + for (i = 0; ; i++) { + /* + * We take the global lock, measure TSC, save the + * previous TSC that was measured (possibly on + * another CPU) and update the previous TSC timestamp. + */ + __raw_spin_lock(&sync_lock); + prev = last_tsc; + now = get_cycles_sync(); + last_tsc = now; + __raw_spin_unlock(&sync_lock); + + /* + * Be nice every now and then (and also check whether + * measurement is done [we also insert a 100 million + * loops safety exit, so we dont lock up in case the + * TSC readout is totally broken]): + */ + if (unlikely(!(i & 7))) { + if (now > end || i > 100000000) + break; + cpu_relax(); + touch_nmi_watchdog(); + } + /* + * Outside the critical section we can now see whether + * we saw a time-warp of the TSC going backwards: + */ + if (unlikely(prev > now)) { + __raw_spin_lock(&sync_lock); + max_warp = max(max_warp, prev - now); + nr_warps++; + __raw_spin_unlock(&sync_lock); + } + + } +} + +/* + * Source CPU calls into this - it waits for the freshly booted + * target CPU to arrive and then starts the measurement: + */ +void __cpuinit check_tsc_sync_source(int cpu) +{ + int cpus = 2; + + /* + * No need to check if we already know that the TSC is not + * synchronized: + */ + if (unsynchronized_tsc()) + return; + + printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); + + /* + * Reset it - in case this is a second bootup: + */ + atomic_set(&stop_count, 0); + + /* + * Wait for the target to arrive: + */ + while (atomic_read(&start_count) != cpus-1) + cpu_relax(); + /* + * Trigger the target to continue into the measurement too: + */ + atomic_inc(&start_count); + + check_tsc_warp(); + + while (atomic_read(&stop_count) != cpus-1) + cpu_relax(); + + /* + * Reset it - just in case we boot another CPU later: + */ + atomic_set(&start_count, 0); + + if (nr_warps) { + printk("\n"); + printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," + " turning off TSC clock.\n", max_warp); + mark_tsc_unstable(); + nr_warps = 0; + max_warp = 0; + last_tsc = 0; + } else { + printk(" passed.\n"); + } + + /* + * Let the target continue with the bootup: + */ + atomic_inc(&stop_count); +} + +/* + * Freshly booted CPUs call into this: + */ +void __cpuinit check_tsc_sync_target(void) +{ + int cpus = 2; + + if (unsynchronized_tsc()) + return; + + /* + * Register this CPU's participation and wait for the + * source CPU to start the measurement: + */ + atomic_inc(&start_count); + while (atomic_read(&start_count) != cpus) + cpu_relax(); + + check_tsc_warp(); + + /* + * Ok, we are done: + */ + atomic_inc(&stop_count); + + /* + * Wait for the source CPU to print stuff: + */ + while (atomic_read(&stop_count) != cpus) + cpu_relax(); +} +#undef NR_LOOPS + diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index c360c422524..b73212c0a55 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -88,31 +88,25 @@ SECTIONS __vsyscall_0 = VSYSCALL_VIRT_ADDR; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } - xtime_lock = VVIRT(.xtime_lock); - - .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } - vxtime = VVIRT(.vxtime); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) + { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } vgetcpu_mode = VVIRT(.vgetcpu_mode); - .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } - sys_tz = VVIRT(.sys_tz); - - .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } - sysctl_vsyscall = VVIRT(.sysctl_vsyscall); - - .xtime : AT(VLOAD(.xtime)) { *(.xtime) } - xtime = VVIRT(.xtime); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } jiffies = VVIRT(.jiffies); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) + { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) + { *(.vsyscall_2) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) + { *(.vsyscall_3) } . = VSYSCALL_VIRT_ADDR + 4096; diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 313dc6ad780..180ff919eaf 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -26,6 +26,7 @@ #include <linux/seqlock.h> #include <linux/jiffies.h> #include <linux/sysctl.h> +#include <linux/clocksource.h> #include <linux/getcpu.h> #include <linux/cpu.h> #include <linux/smp.h> @@ -34,6 +35,7 @@ #include <asm/vsyscall.h> #include <asm/pgtable.h> #include <asm/page.h> +#include <asm/unistd.h> #include <asm/fixmap.h> #include <asm/errno.h> #include <asm/io.h> @@ -44,56 +46,41 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" -int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +struct vsyscall_gtod_data_t { + seqlock_t lock; + int sysctl_enabled; + struct timeval wall_time_tv; + struct timezone sys_tz; + cycle_t offset_base; + struct clocksource clock; +}; int __vgetcpu_mode __section_vgetcpu_mode; -#include <asm/unistd.h> - -static __always_inline void timeval_normalize(struct timeval * tv) +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = { - time_t __sec; - - __sec = tv->tv_usec / 1000000; - if (__sec) { - tv->tv_usec %= 1000000; - tv->tv_sec += __sec; - } -} + .lock = SEQLOCK_UNLOCKED, + .sysctl_enabled = 1, +}; -static __always_inline void do_vgettimeofday(struct timeval * tv) +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) { - long sequence, t; - unsigned long sec, usec; - - do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = __xtime.tv_nsec / 1000; - - if (__vxtime.mode != VXTIME_HPET) { - t = get_cycles_sync(); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void __iomem *) - fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; - } - } while (read_seqretry(&__xtime_lock, sequence)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ + vsyscall_gtod_data.clock = *clock; + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + vsyscall_gtod_data.sys_tz = sys_tz; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ +/* RED-PEN may want to readd seq locking, but then the variable should be + * write-once. + */ static __always_inline void do_get_tz(struct timezone * tz) { - *tz = __sys_tz; + *tz = __vsyscall_gtod_data.sys_tz; } static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -101,7 +88,8 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) int ret; asm volatile("vsysc2: syscall" : "=a" (ret) - : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) + : __syscall_clobber ); return ret; } @@ -114,10 +102,44 @@ static __always_inline long time_syscall(long *t) return secs; } +static __always_inline void do_vgettimeofday(struct timeval * tv) +{ + cycle_t now, base, mask, cycle_delta; + unsigned long seq, mult, shift, nsec_delta; + cycle_t (*vread)(void); + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + vread = __vsyscall_gtod_data.clock.vread; + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { + gettimeofday(tv,0); + return; + } + now = vread(); + base = __vsyscall_gtod_data.clock.cycle_last; + mask = __vsyscall_gtod_data.clock.mask; + mult = __vsyscall_gtod_data.clock.mult; + shift = __vsyscall_gtod_data.clock.shift; + + *tv = __vsyscall_gtod_data.wall_time_tv; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + + /* calculate interval: */ + cycle_delta = (now - base) & mask; + /* convert to nsecs: */ + nsec_delta = (cycle_delta * mult) >> shift; + + /* convert to usecs and add to timespec: */ + tv->tv_usec += nsec_delta / NSEC_PER_USEC; + while (tv->tv_usec > USEC_PER_SEC) { + tv->tv_sec += 1; + tv->tv_usec -= USEC_PER_SEC; + } +} + int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { - if (!__sysctl_vsyscall) - return gettimeofday(tv,tz); if (tv) do_vgettimeofday(tv); if (tz) @@ -129,11 +151,11 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - if (!__sysctl_vsyscall) + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); else if (t) - *t = __xtime.tv_sec; - return __xtime.tv_sec; + *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; + return __vsyscall_gtod_data.wall_time_tv.tv_sec; } /* Fast way to get current CPU and node. @@ -210,7 +232,7 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, ret = -ENOMEM; goto out; } - if (!sysctl_vsyscall) { + if (!vsyscall_gtod_data.sysctl_enabled) { writew(SYSCALL, map1); writew(SYSCALL, map2); } else { @@ -232,7 +254,8 @@ static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, static ctl_table kernel_table2[] = { { .ctl_name = 99, .procname = "vsyscall64", - .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, + .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), + .mode = 0644, .strategy = vsyscall_sysctl_nostrat, .proc_handler = vsyscall_sysctl_change }, {} |