diff options
163 files changed, 9612 insertions, 4547 deletions
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index 09dd510c4a5..576ce463cf4 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt @@ -78,7 +78,8 @@ Identifying GPIOs ----------------- GPIOs are identified by unsigned integers in the range 0..MAX_INT. That reserves "negative" numbers for other purposes like marking signals as -"not available on this board", or indicating faults. +"not available on this board", or indicating faults. Code that doesn't +touch the underlying hardware treats these integers as opaque cookies. Platforms define how they use those integers, and usually #define symbols for the GPIO lines so that board-specific setup code directly corresponds @@ -139,10 +140,10 @@ issues including wire-OR and output latencies. The get/set calls have no error returns because "invalid GPIO" should have been reported earlier in gpio_set_direction(). However, note that not all platforms can read the value of output pins; those that can't should always -return zero. Also, these calls will be ignored for GPIOs that can't safely -be accessed wihtout sleeping (see below). +return zero. Also, using these calls for GPIOs that can't safely be accessed +without sleeping (see below) is an error. -Platform-specific implementations are encouraged to optimise the two +Platform-specific implementations are encouraged to optimize the two calls to access the GPIO value in cases where the GPIO number (and for output, value) are constant. It's normal for them to need only a couple of instructions in such cases (reading or writing a hardware register), @@ -239,7 +240,8 @@ options are part of the IRQ interface, e.g. IRQF_TRIGGER_FALLING, as are system wakeup capabilities. Non-error values returned from irq_to_gpio() would most commonly be used -with gpio_get_value(). +with gpio_get_value(), for example to initialize or update driver state +when the IRQ is edge-triggered. @@ -260,9 +262,10 @@ pullups (or pulldowns) so that the on-chip ones should not be used. There are other system-specific mechanisms that are not specified here, like the aforementioned options for input de-glitching and wire-OR output. Hardware may support reading or writing GPIOs in gangs, but that's usually -configuration dependednt: for GPIOs sharing the same bank. (GPIOs are +configuration dependent: for GPIOs sharing the same bank. (GPIOs are commonly grouped in banks of 16 or 32, with a given SOC having several such -banks.) Code relying on such mechanisms will necessarily be nonportable. +banks.) Some systems can trigger IRQs from output GPIOs. Code relying on +such mechanisms will necessarily be nonportable. Dynamic definition of GPIOs is not currently supported; for example, as a side effect of configuring an add-on board with some GPIO expanders. diff --git a/Documentation/hrtimer/timer_stats.txt b/Documentation/hrtimer/timer_stats.txt new file mode 100644 index 00000000000..27f782e3593 --- /dev/null +++ b/Documentation/hrtimer/timer_stats.txt @@ -0,0 +1,68 @@ +timer_stats - timer usage statistics +------------------------------------ + +timer_stats is a debugging facility to make the timer (ab)usage in a Linux +system visible to kernel and userspace developers. It is not intended for +production usage as it adds significant overhead to the (hr)timer code and the +(hr)timer data structures. + +timer_stats should be used by kernel and userspace developers to verify that +their code does not make unduly use of timers. This helps to avoid unnecessary +wakeups, which should be avoided to optimize power consumption. + +It can be enabled by CONFIG_TIMER_STATS in the "Kernel hacking" configuration +section. + +timer_stats collects information about the timer events which are fired in a +Linux system over a sample period: + +- the pid of the task(process) which initialized the timer +- the name of the process which initialized the timer +- the function where the timer was intialized +- the callback function which is associated to the timer +- the number of events (callbacks) + +timer_stats adds an entry to /proc: /proc/timer_stats + +This entry is used to control the statistics functionality and to read out the +sampled information. + +The timer_stats functionality is inactive on bootup. + +To activate a sample period issue: +# echo 1 >/proc/timer_stats + +To stop a sample period issue: +# echo 0 >/proc/timer_stats + +The statistics can be retrieved by: +# cat /proc/timer_stats + +The readout of /proc/timer_stats automatically disables sampling. The sampled +information is kept until a new sample period is started. This allows multiple +readouts. + +Sample output of /proc/timer_stats: + +Timerstats sample period: 3.888770 s + 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) + 15, 1 swapper hcd_submit_urb (rh_timer_func) + 4, 959 kedac schedule_timeout (process_timeout) + 1, 0 swapper page_writeback_init (wb_timer_fn) + 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) + 22, 2948 IRQ 4 tty_flip_buffer_push (delayed_work_timer_fn) + 3, 3100 bash schedule_timeout (process_timeout) + 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) + 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) + 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) + 1, 2292 ip __netdev_watchdog_up (dev_watchdog) + 1, 23 events/1 do_cache_clean (delayed_work_timer_fn) +90 total events, 30.0 events/sec + +The first column is the number of events, the second column the pid, the third +column is the name of the process. The forth column shows the function which +initialized the timer and in parantheses the callback function which was +executed on expiry. + + Thomas, Ingo + diff --git a/Documentation/hrtimers/highres.txt b/Documentation/hrtimers/highres.txt new file mode 100644 index 00000000000..ce0e9a91e15 --- /dev/null +++ b/Documentation/hrtimers/highres.txt @@ -0,0 +1,249 @@ +High resolution timers and dynamic ticks design notes +----------------------------------------------------- + +Further information can be found in the paper of the OLS 2006 talk "hrtimers +and beyond". The paper is part of the OLS 2006 Proceedings Volume 1, which can +be found on the OLS website: +http://www.linuxsymposium.org/2006/linuxsymposium_procv1.pdf + +The slides to this talk are available from: +http://tglx.de/projects/hrtimers/ols2006-hrtimers.pdf + +The slides contain five figures (pages 2, 15, 18, 20, 22), which illustrate the +changes in the time(r) related Linux subsystems. Figure #1 (p. 2) shows the +design of the Linux time(r) system before hrtimers and other building blocks +got merged into mainline. + +Note: the paper and the slides are talking about "clock event source", while we +switched to the name "clock event devices" in meantime. + +The design contains the following basic building blocks: + +- hrtimer base infrastructure +- timeofday and clock source management +- clock event management +- high resolution timer functionality +- dynamic ticks + + +hrtimer base infrastructure +--------------------------- + +The hrtimer base infrastructure was merged into the 2.6.16 kernel. Details of +the base implementation are covered in Documentation/hrtimers/hrtimer.txt. See +also figure #2 (OLS slides p. 15) + +The main differences to the timer wheel, which holds the armed timer_list type +timers are: + - time ordered enqueueing into a rb-tree + - independent of ticks (the processing is based on nanoseconds) + + +timeofday and clock source management +------------------------------------- + +John Stultz's Generic Time Of Day (GTOD) framework moves a large portion of +code out of the architecture-specific areas into a generic management +framework, as illustrated in figure #3 (OLS slides p. 18). The architecture +specific portion is reduced to the low level hardware details of the clock +sources, which are registered in the framework and selected on a quality based +decision. The low level code provides hardware setup and readout routines and +initializes data structures, which are used by the generic time keeping code to +convert the clock ticks to nanosecond based time values. All other time keeping +related functionality is moved into the generic code. The GTOD base patch got +merged into the 2.6.18 kernel. + +Further information about the Generic Time Of Day framework is available in the +OLS 2005 Proceedings Volume 1: +http://www.linuxsymposium.org/2005/linuxsymposium_procv1.pdf + +The paper "We Are Not Getting Any Younger: A New Approach to Time and +Timers" was written by J. Stultz, D.V. Hart, & N. Aravamudan. + +Figure #3 (OLS slides p.18) illustrates the transformation. + + +clock event management +---------------------- + +While clock sources provide read access to the monotonically increasing time +value, clock event devices are used to schedule the next event +interrupt(s). The next event is currently defined to be periodic, with its +period defined at compile time. The setup and selection of the event device +for various event driven functionalities is hardwired into the architecture +dependent code. This results in duplicated code across all architectures and +makes it extremely difficult to change the configuration of the system to use +event interrupt devices other than those already built into the +architecture. Another implication of the current design is that it is necessary +to touch all the architecture-specific implementations in order to provide new +functionality like high resolution timers or dynamic ticks. + +The clock events subsystem tries to address this problem by providing a generic +solution to manage clock event devices and their usage for the various clock +event driven kernel functionalities. The goal of the clock event subsystem is +to minimize the clock event related architecture dependent code to the pure +hardware related handling and to allow easy addition and utilization of new +clock event devices. It also minimizes the duplicated code across the +architectures as it provides generic functionality down to the interrupt +service handler, which is almost inherently hardware dependent. + +Clock event devices are registered either by the architecture dependent boot +code or at module insertion time. Each clock event device fills a data +structure with clock-specific property parameters and callback functions. The +clock event management decides, by using the specified property parameters, the +set of system functions a clock event device will be used to support. This +includes the distinction of per-CPU and per-system global event devices. + +System-level global event devices are used for the Linux periodic tick. Per-CPU +event devices are used to provide local CPU functionality such as process +accounting, profiling, and high resolution timers. + +The management layer assignes one or more of the folliwing functions to a clock +event device: + - system global periodic tick (jiffies update) + - cpu local update_process_times + - cpu local profiling + - cpu local next event interrupt (non periodic mode) + +The clock event device delegates the selection of those timer interrupt related +functions completely to the management layer. The clock management layer stores +a function pointer in the device description structure, which has to be called +from the hardware level handler. This removes a lot of duplicated code from the +architecture specific timer interrupt handlers and hands the control over the +clock event devices and the assignment of timer interrupt related functionality +to the core code. + +The clock event layer API is rather small. Aside from the clock event device +registration interface it provides functions to schedule the next event +interrupt, clock event device notification service and support for suspend and +resume. + +The framework adds about 700 lines of code which results in a 2KB increase of +the kernel binary size. The conversion of i386 removes about 100 lines of +code. The binary size decrease is in the range of 400 byte. We believe that the +increase of flexibility and the avoidance of duplicated code across +architectures justifies the slight increase of the binary size. + +The conversion of an architecture has no functional impact, but allows to +utilize the high resolution and dynamic tick functionalites without any change +to the clock event device and timer interrupt code. After the conversion the +enabling of high resolution timers and dynamic ticks is simply provided by +adding the kernel/time/Kconfig file to the architecture specific Kconfig and +adding the dynamic tick specific calls to the idle routine (a total of 3 lines +added to the idle function and the Kconfig file) + +Figure #4 (OLS slides p.20) illustrates the transformation. + + +high resolution timer functionality +----------------------------------- + +During system boot it is not possible to use the high resolution timer +functionality, while making it possible would be difficult and would serve no +useful function. The initialization of the clock event device framework, the +clock source framework (GTOD) and hrtimers itself has to be done and +appropriate clock sources and clock event devices have to be registered before +the high resolution functionality can work. Up to the point where hrtimers are +initialized, the system works in the usual low resolution periodic mode. The +clock source and the clock event device layers provide notification functions +which inform hrtimers about availability of new hardware. hrtimers validates +the usability of the registered clock sources and clock event devices before +switching to high resolution mode. This ensures also that a kernel which is +configured for high resolution timers can run on a system which lacks the +necessary hardware support. + +The high resolution timer code does not support SMP machines which have only +global clock event devices. The support of such hardware would involve IPI +calls when an interrupt happens. The overhead would be much larger than the +benefit. This is the reason why we currently disable high resolution and +dynamic ticks on i386 SMP systems which stop the local APIC in C3 power +state. A workaround is available as an idea, but the problem has not been +tackled yet. + +The time ordered insertion of timers provides all the infrastructure to decide +whether the event device has to be reprogrammed when a timer is added. The +decision is made per timer base and synchronized across per-cpu timer bases in +a support function. The design allows the system to utilize separate per-CPU +clock event devices for the per-CPU timer bases, but currently only one +reprogrammable clock event device per-CPU is utilized. + +When the timer interrupt happens, the next event interrupt handler is called +from the clock event distribution code and moves expired timers from the +red-black tree to a separate double linked list and invokes the softirq +handler. An additional mode field in the hrtimer structure allows the system to +execute callback functions directly from the next event interrupt handler. This +is restricted to code which can safely be executed in the hard interrupt +context. This applies, for example, to the common case of a wakeup function as +used by nanosleep. The advantage of executing the handler in the interrupt +context is the avoidance of up to two context switches - from the interrupted +context to the softirq and to the task which is woken up by the expired +timer. + +Once a system has switched to high resolution mode, the periodic tick is +switched off. This disables the per system global periodic clock event device - +e.g. the PIT on i386 SMP systems. + +The periodic tick functionality is provided by an per-cpu hrtimer. The callback +function is executed in the next event interrupt context and updates jiffies +and calls update_process_times and profiling. The implementation of the hrtimer +based periodic tick is designed to be extended with dynamic tick functionality. +This allows to use a single clock event device to schedule high resolution +timer and periodic events (jiffies tick, profiling, process accounting) on UP +systems. This has been proved to work with the PIT on i386 and the Incrementer +on PPC. + +The softirq for running the hrtimer queues and executing the callbacks has been +separated from the tick bound timer softirq to allow accurate delivery of high +resolution timer signals which are used by itimer and POSIX interval +timers. The execution of this softirq can still be delayed by other softirqs, +but the overall latencies have been significantly improved by this separation. + +Figure #5 (OLS slides p.22) illustrates the transformation. + + +dynamic ticks +------------- + +Dynamic ticks are the logical consequence of the hrtimer based periodic tick +replacement (sched_tick). The functionality of the sched_tick hrtimer is +extended by three functions: + +- hrtimer_stop_sched_tick +- hrtimer_restart_sched_tick +- hrtimer_update_jiffies + +hrtimer_stop_sched_tick() is called when a CPU goes into idle state. The code +evaluates the next scheduled timer event (from both hrtimers and the timer +wheel) and in case that the next event is further away than the next tick it +reprograms the sched_tick to this future event, to allow longer idle sleeps +without worthless interruption by the periodic tick. The function is also +called when an interrupt happens during the idle period, which does not cause a +reschedule. The call is necessary as the interrupt handler might have armed a +new timer whose expiry time is before the time which was identified as the +nearest event in the previous call to hrtimer_stop_sched_tick. + +hrtimer_restart_sched_tick() is called when the CPU leaves the idle state before +it calls schedule(). hrtimer_restart_sched_tick() resumes the periodic tick, +which is kept active until the next call to hrtimer_stop_sched_tick(). + +hrtimer_update_jiffies() is called from irq_enter() when an interrupt happens +in the idle period to make sure that jiffies are up to date and the interrupt +handler has not to deal with an eventually stale jiffy value. + +The dynamic tick feature provides statistical values which are exported to +userspace via /proc/stats and can be made available for enhanced power +management control. + +The implementation leaves room for further development like full tickless +systems, where the time slice is controlled by the scheduler, variable +frequency profiling, and a complete removal of jiffies in the future. + + +Aside the current initial submission of i386 support, the patchset has been +extended to x86_64 and ARM already. Initial (work in progress) support is also +available for MIPS and PowerPC. + + Thomas, Ingo + + + diff --git a/Documentation/hrtimers.txt b/Documentation/hrtimers/hrtimers.txt index ce31f65e12e..ce31f65e12e 100644 --- a/Documentation/hrtimers.txt +++ b/Documentation/hrtimers/hrtimers.txt diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 22b19962a1a..abd575cfc75 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -609,6 +609,10 @@ and is between 256 and 4096 characters. It is defined in the file highmem otherwise. This also works to reduce highmem size on bigger boxes. + highres= [KNL] Enable/disable high resolution timer mode. + Valid parameters: "on", "off" + Default: "on" + hisax= [HW,ISDN] See Documentation/isdn/README.HiSax. @@ -1078,6 +1082,10 @@ and is between 256 and 4096 characters. It is defined in the file in certain environments such as networked servers or real-time systems. + nohz= [KNL] Boottime enable/disable dynamic ticks + Valid arguments: on, off + Default: on + noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing noirqdebug [IA-32] Disables the code which attempts to detect and diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index ec01f08f564..e101846ab7d 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -159,8 +159,7 @@ void __init init_IRQ(void) int irq; for (irq = 0; irq < NR_IRQS; irq++) - irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_DELAYED_DISABLE | - IRQ_NOPROBE; + irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE; #ifdef CONFIG_SMP bad_irq_desc.affinity = CPU_MASK_ALL; diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c index 40039b2a90b..2703a730baf 100644 --- a/arch/arm/mach-imx/time.c +++ b/arch/arm/mach-imx/time.c @@ -87,7 +87,7 @@ static struct clocksource clocksource_imx = { .read = imx_get_cycles, .mask = 0xFFFFFFFF, .shift = 20, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init imx_clocksource_init(void) diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 2ec9a9e9a04..45068c3d8dc 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -395,7 +395,7 @@ static struct clocksource clocksource_ixp4xx = { .read = ixp4xx_get_cycles, .mask = CLOCKSOURCE_MASK(32), .shift = 20, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; unsigned long ixp4xx_timer_freq = FREQ; diff --git a/arch/arm/mach-netx/time.c b/arch/arm/mach-netx/time.c index 5773b55ef4a..7e132fcccd4 100644 --- a/arch/arm/mach-netx/time.c +++ b/arch/arm/mach-netx/time.c @@ -62,7 +62,7 @@ static struct clocksource clocksource_netx = { .read = netx_get_cycles, .mask = CLOCKSOURCE_MASK(32), .shift = 20, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; /* diff --git a/arch/arm/mach-pxa/time.c b/arch/arm/mach-pxa/time.c index ee2beb40041..fc3b82a740a 100644 --- a/arch/arm/mach-pxa/time.c +++ b/arch/arm/mach-pxa/time.c @@ -112,7 +112,7 @@ static struct clocksource clocksource_pxa = { .read = pxa_get_cycles, .mask = CLOCKSOURCE_MASK(32), .shift = 20, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static void __init pxa_timer_init(void) diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c index a2f74affaa9..c10833f2ee0 100644 --- a/arch/avr32/kernel/time.c +++ b/arch/avr32/kernel/time.c @@ -37,7 +37,7 @@ static struct clocksource clocksource_avr32 = { .read = read_cycle_count, .mask = CLOCKSOURCE_MASK(32), .shift = 16, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; /* diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 595fb771366..1df4a1f1428 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -18,6 +18,18 @@ config GENERIC_TIME bool default y +config CLOCKSOURCE_WATCHDOG + bool + default y + +config GENERIC_CLOCKEVENTS + bool + default y + +config GENERIC_CLOCKEVENTS_BROADCAST + bool + default y + config LOCKDEP_SUPPORT bool default y @@ -74,6 +86,8 @@ source "init/Kconfig" menu "Processor type and features" +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- @@ -205,7 +219,7 @@ config PARAVIRT config VMI bool "VMI Paravirt-ops support" - depends on PARAVIRT + depends on PARAVIRT && !NO_HZ default y help VMI provides a paravirtualized interface to multiple hypervisors diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index cbe4e601885..4ae3dcf1d2f 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o -obj-$(CONFIG_X86_SMP) += smp.o smpboot.o +obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o @@ -32,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o vsyscall.o obj-$(CONFIG_ACPI_SRAT) += srat.o -obj-$(CONFIG_HPET_TIMER) += time_hpet.o obj-$(CONFIG_EFI) += efi.o efi_stub.o obj-$(CONFIG_DOUBLEFAULT) += doublefault.o obj-$(CONFIG_VM86) += vm86.o diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index e94aff6888c..fb3e72328a5 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -25,6 +25,7 @@ #include <linux/init.h> #include <linux/acpi.h> +#include <linux/acpi_pmtmr.h> #include <linux/efi.h> #include <linux/cpumask.h> #include <linux/module.h> @@ -615,6 +616,7 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table) } #ifdef CONFIG_HPET_TIMER +#include <asm/hpet.h> static int __init acpi_parse_hpet(struct acpi_table_header *table) { @@ -645,24 +647,11 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) hpet_res->end = (1 * 1024) - 1; } -#ifdef CONFIG_X86_64 - vxtime.hpet_address = hpet_tbl->address.address; - + hpet_address = hpet_tbl->address.address; printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, vxtime.hpet_address); - - res_start = vxtime.hpet_address; -#else /* X86 */ - { - extern unsigned long hpet_address; - - hpet_address = hpet_tbl->address.address; - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); + hpet_tbl->id, hpet_address); - res_start = hpet_address; - } -#endif /* X86 */ + res_start = hpet_address; if (hpet_res) { hpet_res->start = res_start; @@ -676,10 +665,6 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) #define acpi_parse_hpet NULL #endif -#ifdef CONFIG_X86_PM_TIMER -extern u32 pmtmr_ioport; -#endif - static int __init acpi_parse_fadt(struct acpi_table_header *table) { diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index f4159e0a7ae..9655c233e6f 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -25,6 +25,8 @@ #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/cpu.h> +#include <linux/clockchips.h> +#include <linux/acpi_pmtmr.h> #include <linux/module.h> #include <asm/atomic.h> @@ -45,128 +47,549 @@ #include "io_ports.h" /* - * cpu_mask that denotes the CPUs that needs timer interrupt coming in as - * IPIs in place of local APIC timers + * Sanity check */ -static cpumask_t timer_bcast_ipi; +#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F +# error SPURIOUS_APIC_VECTOR definition error +#endif /* * Knob to control our willingness to enable the local APIC. + * + * -1=force-disable, +1=force-enable */ -static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ - -static inline void lapic_disable(void) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); -} +static int enable_local_apic __initdata = 0; -static inline void lapic_enable(void) -{ - enable_local_apic = 1; -} +/* Local APIC timer verification ok */ +static int local_apic_timer_verify_ok; /* - * Debug level + * Debug level, exported for io_apic.c */ int apic_verbosity; +static unsigned int calibration_result; +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt); +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt); +static void lapic_timer_broadcast(cpumask_t mask); static void apic_pm_activate(void); +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a seperate chip + */ +static inline int lapic_is_integrated(void) +{ + return APIC_INTEGRATED(lapic_get_version()); +} + +/* + * Check, whether this is a modern or a first generation APIC + */ static int modern_apic(void) { - unsigned int lvr, version; /* AMD systems use old APIC versions, so check the CPU */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) + boot_cpu_data.x86 >= 0xf) return 1; - lvr = apic_read(APIC_LVR); - version = GET_APIC_VERSION(lvr); - return version >= 0x14; + return lapic_get_version() >= 0x14; +} + +/** + * enable_NMI_through_LVT0 - enable NMI through local vector table 0 + */ +void enable_NMI_through_LVT0 (void * dummy) +{ + unsigned int v = APIC_DM_NMI; + + /* Level triggered for 82489DX */ + if (!lapic_is_integrated()) + v |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT0, v); +} + +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + unsigned int v = apic_read(APIC_LVR); + + /* 82489DXs do not report # of LVT entries. */ + return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; } /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. + * Local APIC timer */ -void ack_bad_irq(unsigned int irq) + +/* Clock divisor is set to 16 */ +#define APIC_DIVISOR 16 + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { - printk("unexpected IRQ trap at vector %02x\n", irq); + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write_around(APIC_LVTT, lvtt_value); + /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But only ack when the APIC is enabled -AK + * Divide PICLK by 16 */ - if (cpu_has_apic) - ack_APIC_irq(); + tmp_value = apic_read(APIC_TDCR); + apic_write_around(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); } -void __init apic_intr_init(void) +/* + * Program the next event, relative to now + */ +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write_around(APIC_TMICT, delta); + return 0; +} + +/* + * Setup the lapic timer in periodic or oneshot mode + */ +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + unsigned int v; + + /* Lapic used for broadcast ? */ + if (!local_apic_timer_verify_ok) + return; + + local_irq_save(flags); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + __setup_APIC_LVTT(calibration_result, + mode != CLOCK_EVT_MODE_PERIODIC, 1); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, v); + break; + } + + local_irq_restore(flags); +} + +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(cpumask_t mask) { #ifdef CONFIG_SMP - smp_intr_init(); + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); #endif - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); +} - /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +/* + * Setup the local APIC timer for this CPU. Copy the initilized values + * of the boot CPU and register the clock event in the framework. + */ +static void __devinit setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); - /* thermal monitor LVT interrupt */ -#ifdef CONFIG_X86_MCE_P4THERMAL - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); } -/* Using APIC to generate smp_local_timer_interrupt? */ -int using_apic_timer __read_mostly = 0; +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ -static int enabled_via_apicbase; +#define LAPIC_CAL_LOOPS (HZ/10) -void enable_NMI_through_LVT0 (void * dummy) +static __initdata volatile int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; + +/* + * Temporary interrupt handler. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) { - unsigned int v, ver; + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - v = APIC_DM_NMI; /* unmask and set to NMI */ - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - v |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT0, v); + if (cpu_has_tsc) + rdtscll(tsc); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } } -int get_physical_broadcast(void) +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) { - if (modern_apic()) - return 0xff; - else - return 0xf; + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + const long pm_100ms = PMTMR_TICKS_PER_SEC/10; + const long pm_thresh = pm_100ms/100; + void (*real_handler)(struct clock_event_device *dev); + unsigned long deltaj; + long delta, deltapm; + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + local_irq_disable(); + + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; + + /* + * Setup the APIC counter to 1e9. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(1000000000, 0, 0); + + /* Let the interrupts run */ + local_irq_enable(); + + while(lapic_cal_loops <= LAPIC_CAL_LOOPS); + + local_irq_disable(); + + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + unsigned long mult; + u64 res; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64) deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64) delta ) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long) res, delta); + delta = (long) res; + } + } + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + calibration_result); + + if (cpu_has_tsc) { + delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + calibration_result / (1000000 / HZ), + calibration_result % (1000000 / HZ)); + + + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + local_apic_timer_verify_ok = 1; + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while(lapic_cal_loops <= LAPIC_CAL_LOOPS); + + local_irq_disable(); + + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + + local_irq_enable(); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + local_apic_timer_verify_ok = 0; + + if (deltapm) { + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + /* Check, if the jiffies result is consistent */ + if (deltaj < LAPIC_CAL_LOOPS-2 || + deltaj > LAPIC_CAL_LOOPS+2) { + /* + * Not sure, what we can do about this one. + * When high resultion timers are active + * and the lapic timer does not stop in C3 + * we are fine. Otherwise more trouble might + * be waiting. -- tglx + */ + printk(KERN_WARNING "Global event device %s " + "has wrong frequency " + "(%lu ticks instead of %d)\n", + global_clock_event->name, deltaj, + LAPIC_CAL_LOOPS); + } + local_apic_timer_verify_ok = 1; + } + } else { + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && + deltaj <= LAPIC_CAL_LOOPS+2) { + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + local_apic_timer_verify_ok = 1; + } + } + + if (!local_apic_timer_verify_ok) { + printk(KERN_WARNING + "APIC timer disabled due to verification failure.\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() == 1) + return; + } else + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* Setup the lapic or request the broadcast */ + setup_APIC_timer(); +} + +void __devinit setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(); } -int get_maxlvt(void) +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) { - unsigned int v, ver, maxlvt; + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - v = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(v); - /* 82489DXs do not report # of LVT entries. */ - maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2; - return maxlvt; + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + per_cpu(irq_stat, cpu).apic_timer_irqs++; + + evt->event_handler(evt); +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ + +void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + exit_idle(); + irq_enter(); + local_apic_timer_interrupt(); + irq_exit(); + + set_irq_regs(old_regs); } +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ void clear_local_APIC(void) { - int maxlvt; + int maxlvt = lapic_get_maxlvt(); unsigned long v; - maxlvt = get_maxlvt(); - /* * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. @@ -190,7 +613,7 @@ void clear_local_APIC(void) apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); } -/* lets not touch this if we didn't frob it */ + /* lets not touch this if we didn't frob it */ #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); @@ -212,85 +635,18 @@ void clear_local_APIC(void) if (maxlvt >= 5) apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); #endif - v = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (APIC_INTEGRATED(v)) { /* !82489DX */ - if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } } -void __init connect_bsp_APIC(void) -{ - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. - * connect BSP's local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } - enable_apic_mode(); -} - -void disconnect_bsp_APIC(int virt_wire_setup) -{ - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect - * only on certain older boards). Note that APIC - * interrupts, including IPIs, won't work beyond - * this point! The only exception are INIT IPIs. - */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write_around(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write_around(APIC_LVT0, value); - } - else { - /* Disable LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - } - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); - } -} - +/** + * disable_local_APIC - clear and disable the local APIC + */ void disable_local_APIC(void) { unsigned long value; @@ -305,8 +661,13 @@ void disable_local_APIC(void) value &= ~APIC_SPIV_APIC_ENABLED; apic_write_around(APIC_SPIV, value); + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ if (enabled_via_apicbase) { unsigned int l, h; + rdmsr(MSR_IA32_APICBASE, l, h); l &= ~MSR_IA32_APICBASE_ENABLE; wrmsr(MSR_IA32_APICBASE, l, h); @@ -314,6 +675,28 @@ void disable_local_APIC(void) } /* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ +void lapic_shutdown(void) +{ + unsigned long flags; + + if (!cpu_has_apic) + return; + + local_irq_save(flags); + clear_local_APIC(); + + if (enabled_via_apicbase) + disable_local_APIC(); + + local_irq_restore(flags); +} + +/* * This is to verify that we're looking at a real local APIC. * Check these against your board if the CPUs aren't getting * started for no apparent reason. @@ -345,7 +728,7 @@ int __init verify_local_APIC(void) reg1 = GET_APIC_VERSION(reg0); if (reg1 == 0x00 || reg1 == 0xff) return 0; - reg1 = get_maxlvt(); + reg1 = lapic_get_maxlvt(); if (reg1 < 0x02 || reg1 == 0xff) return 0; @@ -368,10 +751,15 @@ int __init verify_local_APIC(void) return 1; } +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ void __init sync_Arb_IDs(void) { - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 - And not needed on AMD */ + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ if (modern_apic()) return; /* @@ -384,14 +772,12 @@ void __init sync_Arb_IDs(void) | APIC_DM_INIT); } -extern void __error_in_apic_c (void); - /* * An initial setup of the virtual wire mode. */ void __init init_bsp_APIC(void) { - unsigned long value, ver; + unsigned long value; /* * Don't do the setup now if we have a SMP BIOS as the @@ -400,9 +786,6 @@ void __init init_bsp_APIC(void) if (smp_found_config || !cpu_has_apic) return; - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); - /* * Do not trust the local APIC being empty at bootup. */ @@ -414,9 +797,10 @@ void __init init_bsp_APIC(void) value = apic_read(APIC_SPIV); value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; - + /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) value &= ~APIC_SPIV_FOCUS_DISABLED; else value |= APIC_SPIV_FOCUS_DISABLED; @@ -428,14 +812,17 @@ void __init init_bsp_APIC(void) */ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ + if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); } +/** + * setup_local_APIC - setup the local APIC + */ void __devinit setup_local_APIC(void) { - unsigned long oldvalue, value, ver, maxlvt; + unsigned long oldvalue, value, maxlvt, integrated; int i, j; /* Pound the ESR really hard over the head with a big hammer - mbligh */ @@ -446,11 +833,7 @@ void __devinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); - - if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) - __error_in_apic_c(); + integrated = lapic_is_integrated(); /* * Double-check whether this APIC is really registered. @@ -521,13 +904,10 @@ void __devinit setup_local_APIC(void) * like LRU than MRU (the short-term load is more even across CPUs). * See also the comment in end_level_ioapic_irq(). --macro */ -#if 1 + /* Enable focus processor (bit==0) */ value &= ~APIC_SPIV_FOCUS_DISABLED; -#else - /* Disable focus processor (bit==1) */ - value |= APIC_SPIV_FOCUS_DISABLED; -#endif + /* * Set spurious IRQ vector */ @@ -563,17 +943,18 @@ void __devinit setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ + if (!integrated) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); - if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ - maxlvt = get_maxlvt(); + if (integrated && !esr_disable) { /* !82489DX */ + maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); oldvalue = apic_read(APIC_ESR); - value = ERROR_APIC_VECTOR; // enables sending errors + /* enables sending errors */ + value = ERROR_APIC_VECTOR; apic_write_around(APIC_LVTERR, value); /* * spec says clear errors after enabling vector. @@ -586,207 +967,30 @@ void __devinit setup_local_APIC(void) "vector: 0x%08lx after: 0x%08lx\n", oldvalue, value); } else { - if (esr_disable) - /* - * Something untraceble is creating bad interrupts on + if (esr_disable) + /* + * Something untraceble is creating bad interrupts on * secondary quads ... for the moment, just leave the * ESR disabled - we can't do anything useful with the * errors anyway - mbligh */ - printk("Leaving ESR disabled.\n"); - else - printk("No ESR for 82489DX.\n"); + printk(KERN_INFO "Leaving ESR disabled.\n"); + else + printk(KERN_INFO "No ESR for 82489DX.\n"); } + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, value); + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } /* - * If Linux enabled the LAPIC against the BIOS default - * disable it down before re-entering the BIOS on shutdown. - * Otherwise the BIOS may get confused and not power-off. - * Additionally clear all LVT entries before disable_local_APIC - * for the case where Linux didn't enable the LAPIC. - */ -void lapic_shutdown(void) -{ - unsigned long flags; - - if (!cpu_has_apic) - return; - - local_irq_save(flags); - clear_local_APIC(); - - if (enabled_via_apicbase) - disable_local_APIC(); - - local_irq_restore(flags); -} - -#ifdef CONFIG_PM - -static struct { - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = get_maxlvt(); - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - if (maxlvt >= 4) - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -#endif - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = get_maxlvt(); - - local_irq_save(flags); - - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -#endif - if (maxlvt >= 4) - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - local_irq_restore(flags); - return 0; -} - -/* - * This device has no shutdown method - fully functioning local APICs - * are needed on every CPU up until machine_halt/restart/poweroff. + * Detect and initialize APIC */ - -static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __devinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. - */ - -static int __init apic_set_verbosity(char *str) -{ - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - return 1; -} - -__setup("apic=", apic_set_verbosity); - static int __init detect_init_APIC (void) { u32 h, l, features; @@ -798,7 +1002,7 @@ static int __init detect_init_APIC (void) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || - (boot_cpu_data.x86 == 15)) + (boot_cpu_data.x86 == 15)) break; goto no_apic; case X86_VENDOR_INTEL: @@ -812,23 +1016,23 @@ static int __init detect_init_APIC (void) if (!cpu_has_apic) { /* - * Over-ride BIOS and try to enable the local - * APIC only if "lapic" specified. + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. */ if (enable_local_apic <= 0) { - printk("Local APIC disabled by BIOS -- " + printk(KERN_INFO "Local APIC disabled by BIOS -- " "you can enable it with \"lapic\"\n"); return -1; } /* - * Some BIOSes disable the local APIC in the - * APIC_BASE MSR. This can only be done in - * software for Intel P6 or later and AMD K7 - * (Model > 1) or later. + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. */ rdmsr(MSR_IA32_APICBASE, l, h); if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk("Local APIC disabled by BIOS -- reenabling.\n"); + printk(KERN_INFO + "Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; wrmsr(MSR_IA32_APICBASE, l, h); @@ -841,7 +1045,7 @@ static int __init detect_init_APIC (void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - printk("Could not enable APIC!\n"); + printk(KERN_WARNING "Could not enable APIC!\n"); return -1; } set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); @@ -855,17 +1059,20 @@ static int __init detect_init_APIC (void) if (nmi_watchdog != NMI_NONE) nmi_watchdog = NMI_LOCAL_APIC; - printk("Found and enabled local APIC!\n"); + printk(KERN_INFO "Found and enabled local APIC!\n"); apic_pm_activate(); return 0; no_apic: - printk("No local APIC present or hardware disabled\n"); + printk(KERN_INFO "No local APIC present or hardware disabled\n"); return -1; } +/** + * init_apic_mappings - initialize APIC mappings + */ void __init init_apic_mappings(void) { unsigned long apic_phys; @@ -925,385 +1132,92 @@ fake_ioapic_page: } /* - * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts - * per second. We assume that the caller has already set up the local - * APIC. - * - * The APIC timer is not exactly sync with the external timer chip, it - * closely follows bus clocks. - */ - -/* - * The timer chip is already set up at HZ interrupts per second here, - * but we do not accept timer interrupts yet. We only allow the BP - * to calibrate. - */ -static unsigned int __devinit get_8254_timer_count(void) -{ - unsigned long flags; - - unsigned int count; - - spin_lock_irqsave(&i8253_lock, flags); - - outb_p(0x00, PIT_MODE); - count = inb_p(PIT_CH0); - count |= inb_p(PIT_CH0) << 8; - - spin_unlock_irqrestore(&i8253_lock, flags); - - return count; -} - -/* next tick in 8254 can be caught by catching timer wraparound */ -static void __devinit wait_8254_wraparound(void) -{ - unsigned int curr_count, prev_count; - - curr_count = get_8254_timer_count(); - do { - prev_count = curr_count; - curr_count = get_8254_timer_count(); - - /* workaround for broken Mercury/Neptune */ - if (prev_count >= curr_count + 0x100) - curr_count = get_8254_timer_count(); - - } while (prev_count >= curr_count); -} - -/* - * Default initialization for 8254 timers. If we use other timers like HPET, - * we override this later - */ -void (*wait_timer_tick)(void) __devinitdata = wait_8254_wraparound; - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. */ - -#define APIC_DIVISOR 16 - -static void __setup_APIC_LVTT(unsigned int clocks) +int __init APIC_init_uniprocessor (void) { - unsigned int lvtt_value, tmp_value, ver; - int cpu = smp_processor_id(); - - ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; - if (!APIC_INTEGRATED(ver)) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - - if (cpu_isset(cpu, timer_bcast_ipi)) - lvtt_value |= APIC_LVT_MASKED; + if (enable_local_apic < 0) + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - apic_write_around(APIC_LVTT, lvtt_value); + if (!smp_found_config && !cpu_has_apic) + return -1; /* - * Divide PICLK by 16 + * Complain if the BIOS pretends there is one. */ - tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); -} + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return -1; + } -static void __devinit setup_APIC_timer(unsigned int clocks) -{ - unsigned long flags; + verify_local_APIC(); - local_irq_save(flags); + connect_bsp_APIC(); /* - * Wait for IRQ0's slice: + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. */ - wait_timer_tick(); +#ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); +#endif + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); - __setup_APIC_LVTT(clocks); + setup_local_APIC(); - local_irq_restore(flags); +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config) + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + setup_boot_clock(); + + return 0; } /* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. + * APIC command line parameters */ - -static int __init calibrate_APIC_clock(void) -{ - unsigned long long t1 = 0, t2 = 0; - long tt1, tt2; - long result; - int i; - const int LOOPS = HZ/10; - - apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n"); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - */ - __setup_APIC_LVTT(1000000000); - - /* - * The timer chip counts down to zero. Let's wait - * for a wraparound to start exact measurement: - * (the current tick might have been already half done) - */ - - wait_timer_tick(); - - /* - * We wrapped around just now. Let's start: - */ - if (cpu_has_tsc) - rdtscll(t1); - tt1 = apic_read(APIC_TMCCT); - - /* - * Let's wait LOOPS wraprounds: - */ - for (i = 0; i < LOOPS; i++) - wait_timer_tick(); - - tt2 = apic_read(APIC_TMCCT); - if (cpu_has_tsc) - rdtscll(t2); - - /* - * The APIC bus clock counter is 32 bits only, it - * might have overflown, but note that we use signed - * longs, thus no extra care needed. - * - * underflown to be exact, as the timer counts down ;) - */ - - result = (tt1-tt2)*APIC_DIVISOR/LOOPS; - - if (cpu_has_tsc) - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - ((long)(t2-t1)/LOOPS)/(1000000/HZ), - ((long)(t2-t1)/LOOPS)%(1000000/HZ)); - - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%ld.%04ld MHz.\n", - result/(1000000/HZ), - result%(1000000/HZ)); - - return result; -} - -static unsigned int calibration_result; - -void __init setup_boot_APIC_clock(void) -{ - unsigned long flags; - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); - using_apic_timer = 1; - - local_irq_save(flags); - - calibration_result = calibrate_APIC_clock(); - /* - * Now set up the timer for real. - */ - setup_APIC_timer(calibration_result); - - local_irq_restore(flags); -} - -void __devinit setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(calibration_result); -} - -void disable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; - - v = apic_read(APIC_LVTT); - /* - * When an illegal vector value (0-15) is written to an LVT - * entry and delivery mode is Fixed, the APIC may signal an - * illegal vector error, with out regard to whether the mask - * bit is set or whether an interrupt is actually seen on input. - * - * Boot sequence might call this function when the LVTT has - * '0' vector value. So make sure vector field is set to - * valid value. - */ - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, v); - } -} - -void enable_APIC_timer(void) +static int __init parse_lapic(char *arg) { - int cpu = smp_processor_id(); - - if (using_apic_timer && - !cpu_isset(cpu, timer_bcast_ipi)) { - unsigned long v; - - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); - } + enable_local_apic = 1; + return 0; } +early_param("lapic", parse_lapic); -void switch_APIC_timer_to_ipi(void *cpumask) +static int __init parse_nolapic(char *arg) { - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - !cpu_isset(cpu, timer_bcast_ipi)) { - disable_APIC_timer(); - cpu_set(cpu, timer_bcast_ipi); - } + enable_local_apic = -1; + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; } -EXPORT_SYMBOL(switch_APIC_timer_to_ipi); +early_param("nolapic", parse_nolapic); -void switch_ipi_to_APIC_timer(void *cpumask) +static int __init apic_set_verbosity(char *str) { - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - cpu_isset(cpu, timer_bcast_ipi)) { - cpu_clear(cpu, timer_bcast_ipi); - enable_APIC_timer(); - } + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + return 1; } -EXPORT_SYMBOL(switch_ipi_to_APIC_timer); -#undef APIC_DIVISOR - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -inline void smp_local_timer_interrupt(void) -{ - profile_tick(CPU_PROFILING); -#ifdef CONFIG_SMP - update_process_times(user_mode_vm(get_irq_regs())); -#endif +__setup("apic=", apic_set_verbosity); - /* - * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ -} /* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] + * Local APIC interrupts */ -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat, cpu).apic_timer_irqs++; - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - exit_idle(); - irq_enter(); - smp_local_timer_interrupt(); - irq_exit(); - set_irq_regs(old_regs); -} - -#ifndef CONFIG_SMP -static void up_apic_timer_interrupt_call(void) -{ - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat, cpu).apic_timer_irqs++; - - smp_local_timer_interrupt(); -} -#endif - -void smp_send_timer_broadcast_ipi(void) -{ - cpumask_t mask; - - cpus_and(mask, cpu_online_map, timer_bcast_ipi); - if (!cpus_empty(mask)) { -#ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -#else - /* - * We can directly call the apic timer interrupt handler - * in UP case. Minus all irq related functions - */ - up_apic_timer_interrupt_call(); -#endif - } -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -fastcall void smp_spurious_interrupt(struct pt_regs *regs) +void smp_spurious_interrupt(struct pt_regs *regs) { unsigned long v; @@ -1319,16 +1233,15 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs) ack_APIC_irq(); /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", - smp_processor_id()); + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); irq_exit(); } /* * This interrupt should never happen with our APIC/SMP architecture */ - -fastcall void smp_error_interrupt(struct pt_regs *regs) +void smp_error_interrupt(struct pt_regs *regs) { unsigned long v, v1; @@ -1352,69 +1265,261 @@ fastcall void smp_error_interrupt(struct pt_regs *regs) 7: Illegal register address */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", - smp_processor_id(), v , v1); + smp_processor_id(), v , v1); irq_exit(); } /* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. + * Initialize APIC interrupts */ -int __init APIC_init_uniprocessor (void) +void __init apic_intr_init(void) { - if (enable_local_apic < 0) - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); +#ifdef CONFIG_SMP + smp_intr_init(); +#endif + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - if (!smp_found_config && !cpu_has_apic) - return -1; + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* - * Complain if the BIOS pretends there is one. - */ - if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return -1; + /* thermal monitor LVT interrupt */ +#ifdef CONFIG_X86_MCE_P4THERMAL + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +} + +/** + * connect_bsp_APIC - attach the APIC to the interrupt system + */ +void __init connect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); } + enable_apic_mode(); +} - verify_local_APIC(); +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ +void disconnect_bsp_APIC(int virt_wire_setup) +{ + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + } else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; - connect_bsp_APIC(); + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -#ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); -#endif - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } - setup_local_APIC(); + /* + * For LVT1 make it edge triggered, active high, nmi and + * enabled + */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } +} -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); +/* + * Power management + */ +#ifdef CONFIG_PM + +static struct { + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif - setup_boot_clock(); + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); return 0; } -static int __init parse_lapic(char *arg) +static int lapic_resume(struct sys_device *dev) { - lapic_enable(); + unsigned int l, h; + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + local_irq_save(flags); + + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + local_irq_restore(flags); return 0; } -early_param("lapic", parse_lapic); -static int __init parse_nolapic(char *arg) +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + +static struct sysdev_class lapic_sysclass = { + set_kset_name("lapic"), + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __devinit apic_pm_activate(void) { - lapic_disable(); - return 0; + apic_pm_state.active = 1; } -early_param("nolapic", parse_nolapic); +static int __init init_lapic_sysfs(void) +{ + int error; + + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index f9ba0af7ee1..064bbf2861f 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -236,7 +236,6 @@ #include "io_ports.h" -extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) @@ -1176,28 +1175,6 @@ out: spin_unlock(&user_list_lock); } -static void set_time(void) -{ - struct timespec ts; - if (got_clock_diff) { /* Must know time zone in order to set clock */ - ts.tv_sec = get_cmos_time() + clock_cmos_diff; - ts.tv_nsec = 0; - do_settimeofday(&ts); - } -} - -static void get_time_diff(void) -{ -#ifndef CONFIG_APM_RTC_IS_GMT - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - got_clock_diff = 1; -#endif -} - static void reinit_timer(void) { #ifdef INIT_TIMER_AFTER_SUSPEND @@ -1237,19 +1214,6 @@ static int suspend(int vetoable) local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - - /* protect against access to timer chip registers */ - spin_lock(&i8253_lock); - - get_time_diff(); - /* - * Irq spinlock must be dropped around set_system_power_state. - * We'll undo any timer changes due to interrupts below. - */ - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); local_irq_enable(); save_processor_state(); @@ -1258,7 +1222,6 @@ static int suspend(int vetoable) restore_processor_state(); local_irq_disable(); - set_time(); reinit_timer(); if (err == APM_NO_ERROR) @@ -1288,11 +1251,6 @@ static void standby(void) local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - /* If needed, notify drivers here */ - get_time_diff(); - write_sequnlock(&xtime_lock); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1386,7 +1344,6 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - set_time(); device_resume(); pm_send_all(PM_RESUME, (void *)0); queue_event(event, NULL); @@ -1402,7 +1359,6 @@ static void check_events(void) break; case APM_UPDATE_TIME: - set_time(); break; case APM_CRITICAL_SUSPEND: diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig index 5299c5bf445..6c52182ca32 100644 --- a/arch/i386/kernel/cpu/cpufreq/Kconfig +++ b/arch/i386/kernel/cpu/cpufreq/Kconfig @@ -217,6 +217,15 @@ config X86_LONGHAUL If in doubt, say N. +config X86_E_POWERSAVER + tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" + select CPU_FREQ_TABLE + depends on EXPERIMENTAL + help + This adds the CPUFreq driver for VIA C7 processors. + + If in doubt, say N. + comment "shared options" config X86_ACPI_CPUFREQ_PROC_INTF diff --git a/arch/i386/kernel/cpu/cpufreq/Makefile b/arch/i386/kernel/cpu/cpufreq/Makefile index 8de3abe322a..560f7760dae 100644 --- a/arch/i386/kernel/cpu/cpufreq/Makefile +++ b/arch/i386/kernel/cpu/cpufreq/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_LONGHAUL) += longhaul.o +obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o obj-$(CONFIG_X86_LONGRUN) += longrun.o diff --git a/arch/i386/kernel/cpu/cpufreq/e_powersaver.c b/arch/i386/kernel/cpu/cpufreq/e_powersaver.c new file mode 100644 index 00000000000..f43d98e11cc --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/e_powersaver.c @@ -0,0 +1,334 @@ +/* + * Based on documentation provided by Dave Jones. Thanks! + * + * Licensed under the terms of the GNU GPL License version 2. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/ioport.h> +#include <linux/slab.h> + +#include <asm/msr.h> +#include <asm/tsc.h> +#include <asm/timex.h> +#include <asm/io.h> +#include <asm/delay.h> + +#define EPS_BRAND_C7M 0 +#define EPS_BRAND_C7 1 +#define EPS_BRAND_EDEN 2 +#define EPS_BRAND_C3 3 + +struct eps_cpu_data { + u32 fsb; + struct cpufreq_frequency_table freq_table[]; +}; + +static struct eps_cpu_data *eps_cpu[NR_CPUS]; + + +static unsigned int eps_get(unsigned int cpu) +{ + struct eps_cpu_data *centaur; + u32 lo, hi; + + if (cpu) + return 0; + centaur = eps_cpu[cpu]; + if (centaur == NULL) + return 0; + + /* Return current frequency */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + return centaur->fsb * ((lo >> 8) & 0xff); +} + +static int eps_set_state(struct eps_cpu_data *centaur, + unsigned int cpu, + u32 dest_state) +{ + struct cpufreq_freqs freqs; + u32 lo, hi; + int err = 0; + int i; + + freqs.old = eps_get(cpu); + freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff); + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* Wait while CPU is busy */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i = 0; + while (lo & ((1 << 16) | (1 << 17))) { + udelay(16); + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i++; + if (unlikely(i > 64)) { + err = -ENODEV; + goto postchange; + } + } + /* Set new multiplier and voltage */ + wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0); + /* Wait until transition end */ + i = 0; + do { + udelay(16); + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i++; + if (unlikely(i > 64)) { + err = -ENODEV; + goto postchange; + } + } while (lo & ((1 << 16) | (1 << 17))); + + /* Return current frequency */ +postchange: + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + freqs.new = centaur->fsb * ((lo >> 8) & 0xff); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + return err; +} + +static int eps_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct eps_cpu_data *centaur; + unsigned int newstate = 0; + unsigned int cpu = policy->cpu; + unsigned int dest_state; + int ret; + + if (unlikely(eps_cpu[cpu] == NULL)) + return -ENODEV; + centaur = eps_cpu[cpu]; + + if (unlikely(cpufreq_frequency_table_target(policy, + &eps_cpu[cpu]->freq_table[0], + target_freq, + relation, + &newstate))) { + return -EINVAL; + } + + /* Make frequency transition */ + dest_state = centaur->freq_table[newstate].index & 0xffff; + ret = eps_set_state(centaur, cpu, dest_state); + if (ret) + printk(KERN_ERR "eps: Timeout!\n"); + return ret; +} + +static int eps_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, + &eps_cpu[policy->cpu]->freq_table[0]); +} + +static int eps_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int i; + u32 lo, hi; + u64 val; + u8 current_multiplier, current_voltage; + u8 max_multiplier, max_voltage; + u8 min_multiplier, min_voltage; + u8 brand; + u32 fsb; + struct eps_cpu_data *centaur; + struct cpufreq_frequency_table *f_table; + int k, step, voltage; + int ret; + int states; + + if (policy->cpu != 0) + return -ENODEV; + + /* Check brand */ + printk("eps: Detected VIA "); + rdmsr(0x1153, lo, hi); + brand = (((lo >> 2) ^ lo) >> 18) & 3; + switch(brand) { + case EPS_BRAND_C7M: + printk("C7-M\n"); + break; + case EPS_BRAND_C7: + printk("C7\n"); + break; + case EPS_BRAND_EDEN: + printk("Eden\n"); + break; + case EPS_BRAND_C3: + printk("C3\n"); + return -ENODEV; + break; + } + /* Enable Enhanced PowerSaver */ + rdmsrl(MSR_IA32_MISC_ENABLE, val); + if (!(val & 1 << 16)) { + val |= 1 << 16; + wrmsrl(MSR_IA32_MISC_ENABLE, val); + /* Can be locked at 0 */ + rdmsrl(MSR_IA32_MISC_ENABLE, val); + if (!(val & 1 << 16)) { + printk("eps: Can't enable Enhanced PowerSaver\n"); + return -ENODEV; + } + } + + /* Print voltage and multiplier */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + current_voltage = lo & 0xff; + printk("eps: Current voltage = %dmV\n", current_voltage * 16 + 700); + current_multiplier = (lo >> 8) & 0xff; + printk("eps: Current multiplier = %d\n", current_multiplier); + + /* Print limits */ + max_voltage = hi & 0xff; + printk("eps: Highest voltage = %dmV\n", max_voltage * 16 + 700); + max_multiplier = (hi >> 8) & 0xff; + printk("eps: Highest multiplier = %d\n", max_multiplier); + min_voltage = (hi >> 16) & 0xff; + printk("eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700); + min_multiplier = (hi >> 24) & 0xff; + printk("eps: Lowest multiplier = %d\n", min_multiplier); + + /* Sanity checks */ + if (current_multiplier == 0 || max_multiplier == 0 + || min_multiplier == 0) + return -EINVAL; + if (current_multiplier > max_multiplier + || max_multiplier <= min_multiplier) + return -EINVAL; + if (current_voltage > 0x1c || max_voltage > 0x1c) + return -EINVAL; + if (max_voltage < min_voltage) + return -EINVAL; + + /* Calc FSB speed */ + fsb = cpu_khz / current_multiplier; + /* Calc number of p-states supported */ + if (brand == EPS_BRAND_C7M) + states = max_multiplier - min_multiplier + 1; + else + states = 2; + + /* Allocate private data and frequency table for current cpu */ + centaur = kzalloc(sizeof(struct eps_cpu_data) + + (states + 1) * sizeof(struct cpufreq_frequency_table), + GFP_KERNEL); + if (!centaur) + return -ENOMEM; + eps_cpu[0] = centaur; + + /* Copy basic values */ + centaur->fsb = fsb; + + /* Fill frequency and MSR value table */ + f_table = ¢aur->freq_table[0]; + if (brand != EPS_BRAND_C7M) { + f_table[0].frequency = fsb * min_multiplier; + f_table[0].index = (min_multiplier << 8) | min_voltage; + f_table[1].frequency = fsb * max_multiplier; + f_table[1].index = (max_multiplier << 8) | max_voltage; + f_table[2].frequency = CPUFREQ_TABLE_END; + } else { + k = 0; + step = ((max_voltage - min_voltage) * 256) + / (max_multiplier - min_multiplier); + for (i = min_multiplier; i <= max_multiplier; i++) { + voltage = (k * step) / 256 + min_voltage; + f_table[k].frequency = fsb * i; + f_table[k].index = (i << 8) | voltage; + k++; + } + f_table[k].frequency = CPUFREQ_TABLE_END; + } + + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */ + policy->cur = fsb * current_multiplier; + + ret = cpufreq_frequency_table_cpuinfo(policy, ¢aur->freq_table[0]); + if (ret) { + kfree(centaur); + return ret; + } + + cpufreq_frequency_table_get_attr(¢aur->freq_table[0], policy->cpu); + return 0; +} + +static int eps_cpu_exit(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + struct eps_cpu_data *centaur; + u32 lo, hi; + + if (eps_cpu[cpu] == NULL) + return -ENODEV; + centaur = eps_cpu[cpu]; + + /* Get max frequency */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + /* Set max frequency */ + eps_set_state(centaur, cpu, hi & 0xffff); + /* Bye */ + cpufreq_frequency_table_put_attr(policy->cpu); + kfree(eps_cpu[cpu]); + eps_cpu[cpu] = NULL; + return 0; +} + +static struct freq_attr* eps_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver eps_driver = { + .verify = eps_verify, + .target = eps_target, + .init = eps_cpu_init, + .exit = eps_cpu_exit, + .get = eps_get, + .name = "e_powersaver", + .owner = THIS_MODULE, + .attr = eps_attr, +}; + +static int __init eps_init(void) +{ + struct cpuinfo_x86 *c = cpu_data; + + /* This driver will work only on Centaur C7 processors with + * Enhanced SpeedStep/PowerSaver registers */ + if (c->x86_vendor != X86_VENDOR_CENTAUR + || c->x86 != 6 || c->x86_model != 10) + return -ENODEV; + if (!cpu_has(c, X86_FEATURE_EST)) + return -ENODEV; + + if (cpufreq_register_driver(&eps_driver)) + return -EINVAL; + return 0; +} + +static void __exit eps_exit(void) +{ + cpufreq_unregister_driver(&eps_driver); +} + +MODULE_AUTHOR("Rafa³ Bilski <rafalbilski@interia.pl>"); +MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's."); +MODULE_LICENSE("GPL"); + +module_init(eps_init); +module_exit(eps_exit); diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c index a3db9332d65..b59878a0d9b 100644 --- a/arch/i386/kernel/cpu/cpufreq/longhaul.c +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c @@ -8,12 +8,11 @@ * VIA have currently 3 different versions of Longhaul. * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. - * Version 2 of longhaul is the same as v1, but adds voltage scaling. - * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C) - * voltage scaling support has currently been disabled in this driver - * until we have code that gets it right. + * Version 2 of longhaul is backward compatible with v1, but adds + * LONGHAUL MSR for purpose of both frequency and voltage scaling. + * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C). * Version 3 of longhaul got renamed to Powersaver and redesigned - * to use the POWERSAVER MSR at 0x110a. + * to use only the POWERSAVER MSR at 0x110a. * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. * It's pretty much the same feature wise to longhaul v2, though * there is provision for scaling FSB too, but this doesn't work @@ -51,10 +50,12 @@ #define CPU_EZRA 3 #define CPU_EZRA_T 4 #define CPU_NEHEMIAH 5 +#define CPU_NEHEMIAH_C 6 /* Flags */ #define USE_ACPI_C3 (1 << 1) #define USE_NORTHBRIDGE (1 << 2) +#define USE_VT8235 (1 << 3) static int cpu_model; static unsigned int numscales=16; @@ -63,7 +64,8 @@ static unsigned int fsb; static struct mV_pos *vrm_mV_table; static unsigned char *mV_vrm_table; struct f_msr { - unsigned char vrm; + u8 vrm; + u8 pos; }; static struct f_msr f_msr_table[32]; @@ -73,10 +75,10 @@ static int can_scale_voltage; static struct acpi_processor *pr = NULL; static struct acpi_processor_cx *cx = NULL; static u8 longhaul_flags; +static u8 longhaul_pos; /* Module parameters */ static int scale_voltage; -static int ignore_latency; #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg) @@ -164,26 +166,47 @@ static void do_longhaul1(unsigned int clock_ratio_index) static void do_powersaver(int cx_address, unsigned int clock_ratio_index) { union msr_longhaul longhaul; + u8 dest_pos; u32 t; + dest_pos = f_msr_table[clock_ratio_index].pos; + rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Setup new frequency */ longhaul.bits.RevisionKey = longhaul.bits.RevisionID; longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf; longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4; - longhaul.bits.EnableSoftBusRatio = 1; - - if (can_scale_voltage) { + /* Setup new voltage */ + if (can_scale_voltage) longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm; + /* Sync to timer tick */ + safe_halt(); + /* Raise voltage if necessary */ + if (can_scale_voltage && longhaul_pos < dest_pos) { longhaul.bits.EnableSoftVID = 1; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Change voltage */ + if (!cx_address) { + ACPI_FLUSH_CPU_CACHE(); + halt(); + } else { + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 + * read */ + t = inl(acpi_gbl_FADT.xpm_timer_block.address); + } + longhaul.bits.EnableSoftVID = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + longhaul_pos = dest_pos; } - /* Sync to timer tick */ - safe_halt(); /* Change frequency on next halt or sleep */ + longhaul.bits.EnableSoftBusRatio = 1; wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); if (!cx_address) { ACPI_FLUSH_CPU_CACHE(); - /* Invoke C1 */ halt(); } else { ACPI_FLUSH_CPU_CACHE(); @@ -193,12 +216,29 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index) t = inl(acpi_gbl_FADT.xpm_timer_block.address); } /* Disable bus ratio bit */ - local_irq_disable(); - longhaul.bits.RevisionKey = longhaul.bits.RevisionID; longhaul.bits.EnableSoftBusRatio = 0; - longhaul.bits.EnableSoftBSEL = 0; - longhaul.bits.EnableSoftVID = 0; wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + + /* Reduce voltage if necessary */ + if (can_scale_voltage && longhaul_pos > dest_pos) { + longhaul.bits.EnableSoftVID = 1; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Change voltage */ + if (!cx_address) { + ACPI_FLUSH_CPU_CACHE(); + halt(); + } else { + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 + * read */ + t = inl(acpi_gbl_FADT.xpm_timer_block.address); + } + longhaul.bits.EnableSoftVID = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + longhaul_pos = dest_pos; + } } /** @@ -257,26 +297,19 @@ static void longhaul_setstate(unsigned int clock_ratio_index) /* * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) * Software controlled multipliers only. - * - * *NB* Until we get voltage scaling working v1 & v2 are the same code. - * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5b] and Ezra [C5C] */ case TYPE_LONGHAUL_V1: - case TYPE_LONGHAUL_V2: do_longhaul1(clock_ratio_index); break; /* + * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C] + * * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) - * We can scale voltage with this too, but that's currently - * disabled until we come up with a decent 'match freq to voltage' - * algorithm. - * When we add voltage scaling, we will also need to do the - * voltage/freq setting in order depending on the direction - * of scaling (like we do in powernow-k7.c) * Nehemiah can do FSB scaling too, but this has never been proven * to work in practice. */ + case TYPE_LONGHAUL_V2: case TYPE_POWERSAVER: if (longhaul_flags & USE_ACPI_C3) { /* Don't allow wakeup */ @@ -301,6 +334,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index) local_irq_restore(flags); preempt_enable(); + freqs.new = calc_speed(longhaul_get_cpu_mult()); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -315,31 +349,19 @@ static void longhaul_setstate(unsigned int clock_ratio_index) #define ROUNDING 0xf -static int _guess(int guess, int mult) -{ - int target; - - target = ((mult/10)*guess); - if (mult%10 != 0) - target += (guess/2); - target += ROUNDING/2; - target &= ~ROUNDING; - return target; -} - - static int guess_fsb(int mult) { - int speed = (cpu_khz/1000); + int speed = cpu_khz / 1000; int i; - int speeds[] = { 66, 100, 133, 200 }; - - speed += ROUNDING/2; - speed &= ~ROUNDING; - - for (i=0; i<4; i++) { - if (_guess(speeds[i], mult) == speed) - return speeds[i]; + int speeds[] = { 666, 1000, 1333, 2000 }; + int f_max, f_min; + + for (i = 0; i < 4; i++) { + f_max = ((speeds[i] * mult) + 50) / 100; + f_max += (ROUNDING / 2); + f_min = f_max - ROUNDING; + if ((speed <= f_max) && (speed >= f_min)) + return speeds[i] / 10; } return 0; } @@ -347,67 +369,40 @@ static int guess_fsb(int mult) static int __init longhaul_get_ranges(void) { - unsigned long invalue; - unsigned int ezra_t_multipliers[32]= { - 90, 30, 40, 100, 55, 35, 45, 95, - 50, 70, 80, 60, 120, 75, 85, 65, - -1, 110, 120, -1, 135, 115, 125, 105, - 130, 150, 160, 140, -1, 155, -1, 145 }; unsigned int j, k = 0; - union msr_longhaul longhaul; - int mult = 0; + int mult; - switch (longhaul_version) { - case TYPE_LONGHAUL_V1: - case TYPE_LONGHAUL_V2: - /* Ugh, Longhaul v1 didn't have the min/max MSRs. - Assume min=3.0x & max = whatever we booted at. */ + /* Get current frequency */ + mult = longhaul_get_cpu_mult(); + if (mult == -1) { + printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n"); + return -EINVAL; + } + fsb = guess_fsb(mult); + if (fsb == 0) { + printk(KERN_INFO PFX "Invalid (reserved) FSB!\n"); + return -EINVAL; + } + /* Get max multiplier - as we always did. + * Longhaul MSR is usefull only when voltage scaling is enabled. + * C3 is booting at max anyway. */ + maxmult = mult; + /* Get min multiplier */ + switch (cpu_model) { + case CPU_NEHEMIAH: + minmult = 50; + break; + case CPU_NEHEMIAH_C: + minmult = 40; + break; + default: minmult = 30; - maxmult = mult = longhaul_get_cpu_mult(); break; - - case TYPE_POWERSAVER: - /* Ezra-T */ - if (cpu_model==CPU_EZRA_T) { - minmult = 30; - rdmsrl (MSR_VIA_LONGHAUL, longhaul.val); - invalue = longhaul.bits.MaxMHzBR; - if (longhaul.bits.MaxMHzBR4) - invalue += 16; - maxmult = mult = ezra_t_multipliers[invalue]; - break; - } - - /* Nehemiah */ - if (cpu_model==CPU_NEHEMIAH) { - rdmsrl (MSR_VIA_LONGHAUL, longhaul.val); - - /* - * TODO: This code works, but raises a lot of questions. - * - Some Nehemiah's seem to have broken Min/MaxMHzBR's. - * We get around this by using a hardcoded multiplier of 4.0x - * for the minimimum speed, and the speed we booted up at for the max. - * This is done in longhaul_get_cpu_mult() by reading the EBLCR register. - * - According to some VIA documentation EBLCR is only - * in pre-Nehemiah C3s. How this still works is a mystery. - * We're possibly using something undocumented and unsupported, - * But it works, so we don't grumble. - */ - minmult=40; - maxmult = mult = longhaul_get_cpu_mult(); - break; - } } - fsb = guess_fsb(mult); dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n", minmult/10, minmult%10, maxmult/10, maxmult%10); - if (fsb == 0) { - printk (KERN_INFO PFX "Invalid (reserved) FSB!\n"); - return -EINVAL; - } - highest_speed = calc_speed(maxmult); lowest_speed = calc_speed(minmult); dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, @@ -455,6 +450,7 @@ static void __init longhaul_setup_voltagescaling(void) union msr_longhaul longhaul; struct mV_pos minvid, maxvid; unsigned int j, speed, pos, kHz_step, numvscales; + int min_vid_speed; rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); if (!(longhaul.bits.RevisionID & 1)) { @@ -468,14 +464,14 @@ static void __init longhaul_setup_voltagescaling(void) mV_vrm_table = &mV_vrm85[0]; } else { printk (KERN_INFO PFX "Mobile VRM\n"); + if (cpu_model < CPU_NEHEMIAH) + return; vrm_mV_table = &mobilevrm_mV[0]; mV_vrm_table = &mV_mobilevrm[0]; } minvid = vrm_mV_table[longhaul.bits.MinimumVID]; maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; - numvscales = maxvid.pos - minvid.pos + 1; - kHz_step = (highest_speed - lowest_speed) / numvscales; if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " @@ -491,20 +487,59 @@ static void __init longhaul_setup_voltagescaling(void) return; } - printk(KERN_INFO PFX "Max VID=%d.%03d Min VID=%d.%03d, %d possible voltage scales\n", + /* How many voltage steps */ + numvscales = maxvid.pos - minvid.pos + 1; + printk(KERN_INFO PFX + "Max VID=%d.%03d " + "Min VID=%d.%03d, " + "%d possible voltage scales\n", maxvid.mV/1000, maxvid.mV%1000, minvid.mV/1000, minvid.mV%1000, numvscales); + /* Calculate max frequency at min voltage */ + j = longhaul.bits.MinMHzBR; + if (longhaul.bits.MinMHzBR4) + j += 16; + min_vid_speed = eblcr_table[j]; + if (min_vid_speed == -1) + return; + switch (longhaul.bits.MinMHzFSB) { + case 0: + min_vid_speed *= 13333; + break; + case 1: + min_vid_speed *= 10000; + break; + case 3: + min_vid_speed *= 6666; + break; + default: + return; + break; + } + if (min_vid_speed >= highest_speed) + return; + /* Calculate kHz for one voltage step */ + kHz_step = (highest_speed - min_vid_speed) / numvscales; + + j = 0; while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { speed = longhaul_table[j].frequency; - pos = (speed - lowest_speed) / kHz_step + minvid.pos; + if (speed > min_vid_speed) + pos = (speed - min_vid_speed) / kHz_step + minvid.pos; + else + pos = minvid.pos; f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos]; + f_msr_table[longhaul_table[j].index].pos = pos; j++; } + longhaul_pos = maxvid.pos; can_scale_voltage = 1; + printk(KERN_INFO PFX "Voltage scaling enabled. " + "Use of \"conservative\" governor is highly recommended.\n"); } @@ -573,20 +608,51 @@ static int enable_arbiter_disable(void) if (dev != NULL) { /* Enable access to port 0x22 */ pci_read_config_byte(dev, reg, &pci_cmd); - if ( !(pci_cmd & 1<<7) ) { + if (!(pci_cmd & 1<<7)) { pci_cmd |= 1<<7; pci_write_config_byte(dev, reg, pci_cmd); + pci_read_config_byte(dev, reg, &pci_cmd); + if (!(pci_cmd & 1<<7)) { + printk(KERN_ERR PFX + "Can't enable access to port 0x22.\n"); + return 0; + } } return 1; } return 0; } +static int longhaul_setup_vt8235(void) +{ + struct pci_dev *dev; + u8 pci_cmd; + + /* Find VT8235 southbridge */ + dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); + if (dev != NULL) { + /* Set transition time to max */ + pci_read_config_byte(dev, 0xec, &pci_cmd); + pci_cmd &= ~(1 << 2); + pci_write_config_byte(dev, 0xec, pci_cmd); + pci_read_config_byte(dev, 0xe4, &pci_cmd); + pci_cmd &= ~(1 << 7); + pci_write_config_byte(dev, 0xe4, pci_cmd); + pci_read_config_byte(dev, 0xe5, &pci_cmd); + pci_cmd |= 1 << 7; + pci_write_config_byte(dev, 0xe5, pci_cmd); + return 1; + } + return 0; +} + static int __init longhaul_cpu_init(struct cpufreq_policy *policy) { struct cpuinfo_x86 *c = cpu_data; char *cpuname=NULL; int ret; + u32 lo, hi; + int vt8235_present; /* Check what we have on this motherboard */ switch (c->x86_model) { @@ -599,16 +665,20 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) break; case 7: - longhaul_version = TYPE_LONGHAUL_V1; switch (c->x86_mask) { case 0: + longhaul_version = TYPE_LONGHAUL_V1; cpu_model = CPU_SAMUEL2; cpuname = "C3 'Samuel 2' [C5B]"; - /* Note, this is not a typo, early Samuel2's had Samuel1 ratios. */ - memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio)); - memcpy (eblcr_table, samuel2_eblcr, sizeof(samuel2_eblcr)); + /* Note, this is not a typo, early Samuel2's had + * Samuel1 ratios. */ + memcpy(clock_ratio, samuel1_clock_ratio, + sizeof(samuel1_clock_ratio)); + memcpy(eblcr_table, samuel2_eblcr, + sizeof(samuel2_eblcr)); break; case 1 ... 15: + longhaul_version = TYPE_LONGHAUL_V2; if (c->x86_mask < 8) { cpu_model = CPU_SAMUEL2; cpuname = "C3 'Samuel 2' [C5B]"; @@ -616,8 +686,10 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) cpu_model = CPU_EZRA; cpuname = "C3 'Ezra' [C5C]"; } - memcpy (clock_ratio, ezra_clock_ratio, sizeof(ezra_clock_ratio)); - memcpy (eblcr_table, ezra_eblcr, sizeof(ezra_eblcr)); + memcpy(clock_ratio, ezra_clock_ratio, + sizeof(ezra_clock_ratio)); + memcpy(eblcr_table, ezra_eblcr, + sizeof(ezra_eblcr)); break; } break; @@ -632,24 +704,24 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) break; case 9: - cpu_model = CPU_NEHEMIAH; longhaul_version = TYPE_POWERSAVER; - numscales=32; + numscales = 32; + memcpy(clock_ratio, + nehemiah_clock_ratio, + sizeof(nehemiah_clock_ratio)); + memcpy(eblcr_table, nehemiah_eblcr, sizeof(nehemiah_eblcr)); switch (c->x86_mask) { case 0 ... 1: - cpuname = "C3 'Nehemiah A' [C5N]"; - memcpy (clock_ratio, nehemiah_a_clock_ratio, sizeof(nehemiah_a_clock_ratio)); - memcpy (eblcr_table, nehemiah_a_eblcr, sizeof(nehemiah_a_eblcr)); + cpu_model = CPU_NEHEMIAH; + cpuname = "C3 'Nehemiah A' [C5XLOE]"; break; case 2 ... 4: - cpuname = "C3 'Nehemiah B' [C5N]"; - memcpy (clock_ratio, nehemiah_b_clock_ratio, sizeof(nehemiah_b_clock_ratio)); - memcpy (eblcr_table, nehemiah_b_eblcr, sizeof(nehemiah_b_eblcr)); + cpu_model = CPU_NEHEMIAH; + cpuname = "C3 'Nehemiah B' [C5XLOH]"; break; case 5 ... 15: - cpuname = "C3 'Nehemiah C' [C5N]"; - memcpy (clock_ratio, nehemiah_c_clock_ratio, sizeof(nehemiah_c_clock_ratio)); - memcpy (eblcr_table, nehemiah_c_eblcr, sizeof(nehemiah_c_eblcr)); + cpu_model = CPU_NEHEMIAH_C; + cpuname = "C3 'Nehemiah C' [C5P]"; break; } break; @@ -658,6 +730,13 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) cpuname = "Unknown"; break; } + /* Check Longhaul ver. 2 */ + if (longhaul_version == TYPE_LONGHAUL_V2) { + rdmsr(MSR_VIA_LONGHAUL, lo, hi); + if (lo == 0 && hi == 0) + /* Looks like MSR isn't present */ + longhaul_version = TYPE_LONGHAUL_V1; + } printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname); switch (longhaul_version) { @@ -670,15 +749,18 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) break; }; + /* Doesn't hurt */ + vt8235_present = longhaul_setup_vt8235(); + /* Find ACPI data for processor */ - acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, - &longhaul_walk_callback, NULL, (void *)&pr); + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, &longhaul_walk_callback, + NULL, (void *)&pr); /* Check ACPI support for C3 state */ - if ((pr != NULL) && (longhaul_version == TYPE_POWERSAVER)) { + if (pr != NULL && longhaul_version != TYPE_LONGHAUL_V1) { cx = &pr->power.states[ACPI_STATE_C3]; - if (cx->address > 0 && - (cx->latency <= 1000 || ignore_latency != 0) ) { + if (cx->address > 0 && cx->latency <= 1000) { longhaul_flags |= USE_ACPI_C3; goto print_support_type; } @@ -688,8 +770,11 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) longhaul_flags |= USE_NORTHBRIDGE; goto print_support_type; } - - /* No ACPI C3 or we can't use it */ + /* Use VT8235 southbridge if present */ + if (longhaul_version == TYPE_POWERSAVER && vt8235_present) { + longhaul_flags |= USE_VT8235; + goto print_support_type; + } /* Check ACPI support for bus master arbiter disable */ if ((pr == NULL) || !(pr->flags.bm_control)) { printk(KERN_ERR PFX @@ -698,18 +783,18 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) } print_support_type: - if (!(longhaul_flags & USE_NORTHBRIDGE)) { - printk (KERN_INFO PFX "Using ACPI support.\n"); - } else { + if (longhaul_flags & USE_NORTHBRIDGE) printk (KERN_INFO PFX "Using northbridge support.\n"); - } + else if (longhaul_flags & USE_VT8235) + printk (KERN_INFO PFX "Using VT8235 support.\n"); + else + printk (KERN_INFO PFX "Using ACPI support.\n"); ret = longhaul_get_ranges(); if (ret != 0) return ret; - if ((longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) && - (scale_voltage != 0)) + if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0)) longhaul_setup_voltagescaling(); policy->governor = CPUFREQ_DEFAULT_GOVERNOR; @@ -797,8 +882,6 @@ static void __exit longhaul_exit(void) module_param (scale_voltage, int, 0644); MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); -module_param(ignore_latency, int, 0644); -MODULE_PARM_DESC(ignore_latency, "Skip ACPI C3 latency test"); MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h index bc4682aad69..bb0a04b1d1a 100644 --- a/arch/i386/kernel/cpu/cpufreq/longhaul.h +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h @@ -235,84 +235,14 @@ static int __initdata ezrat_eblcr[32] = { /* * VIA C3 Nehemiah */ -static int __initdata nehemiah_a_clock_ratio[32] = { +static int __initdata nehemiah_clock_ratio[32] = { 100, /* 0000 -> 10.0x */ 160, /* 0001 -> 16.0x */ - -1, /* 0010 -> RESERVED */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - -1, /* 0101 -> RESERVED */ - -1, /* 0110 -> RESERVED */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - 100, /* 0000 -> 10.0x */ - -1, /* 0001 -> RESERVED */ - 120, /* 0010 -> 12.0x */ - 90, /* 0011 -> 9.0x */ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 120, /* 1111 -> 12.0x */ -}; - -static int __initdata nehemiah_b_clock_ratio[32] = { - 100, /* 0000 -> 10.0x */ - 160, /* 0001 -> 16.0x */ - -1, /* 0010 -> RESERVED */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - -1, /* 0101 -> RESERVED */ - -1, /* 0110 -> RESERVED */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - 100, /* 0000 -> 10.0x */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - 90, /* 0011 -> 9.0x */ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 120, /* 1111 -> 12.0x */ -}; - -static int __initdata nehemiah_c_clock_ratio[32] = { - 100, /* 0000 -> 10.0x */ - 160, /* 0001 -> 16.0x */ - 40, /* 0010 -> RESERVED */ + 40, /* 0010 -> 4.0x */ 90, /* 0011 -> 9.0x */ 95, /* 0100 -> 9.5x */ -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> RESERVED */ + 45, /* 0110 -> 4.5x */ 55, /* 0111 -> 5.5x */ 60, /* 1000 -> 6.0x */ 70, /* 1001 -> 7.0x */ @@ -340,84 +270,14 @@ static int __initdata nehemiah_c_clock_ratio[32] = { 120, /* 1111 -> 12.0x */ }; -static int __initdata nehemiah_a_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 160, /* 0001 -> 16.0x */ - -1, /* 0010 -> RESERVED */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - -1, /* 0101 -> RESERVED */ - -1, /* 0110 -> RESERVED */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - 90, /* 0000 -> 9.0x */ - -1, /* 0001 -> RESERVED */ - 120, /* 0010 -> 12.0x */ - 100, /* 0011 -> 10.0x */ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - 120, /* 1100 -> 12.0x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145 /* 1111 -> 14.5x */ - /* end of table */ -}; -static int __initdata nehemiah_b_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 160, /* 0001 -> 16.0x */ - -1, /* 0010 -> RESERVED */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - -1, /* 0101 -> RESERVED */ - -1, /* 0110 -> RESERVED */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - 90, /* 0000 -> 9.0x */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - 100, /* 0011 -> 10.0x */ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - 120, /* 1100 -> 12.0x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145 /* 1111 -> 14.5x */ - /* end of table */ -}; -static int __initdata nehemiah_c_eblcr[32] = { +static int __initdata nehemiah_eblcr[32] = { 50, /* 0000 -> 5.0x */ 160, /* 0001 -> 16.0x */ - 40, /* 0010 -> RESERVED */ + 40, /* 0010 -> 4.0x */ 100, /* 0011 -> 10.0x */ 55, /* 0100 -> 5.5x */ -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> RESERVED */ + 45, /* 0110 -> 4.5x */ 95, /* 0111 -> 9.5x */ 90, /* 1000 -> 9.0x */ 70, /* 1001 -> 7.0x */ @@ -443,7 +303,6 @@ static int __initdata nehemiah_c_eblcr[32] = { 155, /* 1101 -> 15.5x */ -1, /* 1110 -> RESERVED (13.0x) */ 145 /* 1111 -> 14.5x */ - /* end of table */ }; /* diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c index 2d649167255..fe3b67005eb 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -1289,7 +1289,11 @@ static unsigned int powernowk8_get (unsigned int cpu) if (query_current_values_with_pending_wait(data)) goto out; - khz = find_khz_freq_from_fid(data->currfid); + if (cpu_family == CPU_HW_PSTATE) + khz = find_khz_freq_from_fiddid(data->currfid, data->currdid); + else + khz = find_khz_freq_from_fid(data->currfid); + out: set_cpus_allowed(current, oldmask); diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c index 0b29d41322a..e1006b7acc9 100644 --- a/arch/i386/kernel/hpet.c +++ b/arch/i386/kernel/hpet.c @@ -1,4 +1,5 @@ #include <linux/clocksource.h> +#include <linux/clockchips.h> #include <linux/errno.h> #include <linux/hpet.h> #include <linux/init.h> @@ -6,17 +7,278 @@ #include <asm/hpet.h> #include <asm/io.h> +extern struct clock_event_device *global_clock_event; + #define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000 -static void __iomem *hpet_ptr; +/* + * HPET address is set in acpi/boot.c, when an ACPI entry exists + */ +unsigned long hpet_address; +static void __iomem * hpet_virt_address; + +static inline unsigned long hpet_readl(unsigned long a) +{ + return readl(hpet_virt_address + a); +} + +static inline void hpet_writel(unsigned long d, unsigned long a) +{ + writel(d, hpet_virt_address + a); +} + +/* + * HPET command line enable / disable + */ +static int boot_hpet_disable; + +static int __init hpet_setup(char* str) +{ + if (str) { + if (!strncmp("disable", str, 7)) + boot_hpet_disable = 1; + } + return 1; +} +__setup("hpet=", hpet_setup); + +static inline int is_hpet_capable(void) +{ + return (!boot_hpet_disable && hpet_address); +} + +/* + * HPET timer interrupt enable / disable + */ +static int hpet_legacy_int_enabled; + +/** + * is_hpet_enabled - check whether the hpet timer interrupt is enabled + */ +int is_hpet_enabled(void) +{ + return is_hpet_capable() && hpet_legacy_int_enabled; +} + +/* + * When the hpet driver (/dev/hpet) is enabled, we need to reserve + * timer 0 and timer 1 in case of RTC emulation. + */ +#ifdef CONFIG_HPET +static void hpet_reserve_platform_timers(unsigned long id) +{ + struct hpet __iomem *hpet = hpet_virt_address; + struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; + unsigned int nrtimers, i; + struct hpet_data hd; + + nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + + memset(&hd, 0, sizeof (hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet_virt_address; + hd.hd_nirqs = nrtimers; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); + +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + + for (i = 2; i < nrtimers; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + + hpet_alloc(&hd); + +} +#else +static void hpet_reserve_platform_timers(unsigned long id) { } +#endif + +/* + * Common hpet info + */ +static unsigned long hpet_period; + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt); +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt); + +/* + * The hpet clock event device + */ +static struct clock_event_device hpet_clockevent = { + .name = "hpet", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = hpet_set_mode, + .set_next_event = hpet_next_event, + .shift = 32, + .irq = 0, +}; + +static void hpet_start_counter(void) +{ + unsigned long cfg = hpet_readl(HPET_CFG); + + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); +} + +static void hpet_enable_int(void) +{ + unsigned long cfg = hpet_readl(HPET_CFG); + + cfg |= HPET_CFG_LEGACY; + hpet_writel(cfg, HPET_CFG); + hpet_legacy_int_enabled = 1; +} + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long cfg, cmp, now; + uint64_t delta; + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; + delta >>= hpet_clockevent.shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned long) delta; + cfg = hpet_readl(HPET_T0_CFG); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | + HPET_TN_SETVAL | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T0_CFG); + /* + * The first write after writing TN_SETVAL to the + * config register sets the counter value, the second + * write sets the period. + */ + hpet_writel(cmp, HPET_T0_CMP); + udelay(1); + hpet_writel((unsigned long) delta, HPET_T0_CMP); + break; + + case CLOCK_EVT_MODE_ONESHOT: + cfg = hpet_readl(HPET_T0_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T0_CFG); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + cfg = hpet_readl(HPET_T0_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T0_CFG); + break; + } +} + +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + unsigned long cnt; + + cnt = hpet_readl(HPET_COUNTER); + cnt += delta; + hpet_writel(cnt, HPET_T0_CMP); + + return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0); +} + +/* + * Try to setup the HPET timer + */ +int __init hpet_enable(void) +{ + unsigned long id; + uint64_t hpet_freq; + + if (!is_hpet_capable()) + return 0; + + hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + + /* + * Read the period and check for a sane value: + */ + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) + goto out_nohpet; + + /* + * The period is a femto seconds value. We need to calculate the + * scaled math multiplication factor for nanosecond to hpet tick + * conversion. + */ + hpet_freq = 1000000000000000ULL; + do_div(hpet_freq, hpet_period); + hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, + NSEC_PER_SEC, 32); + /* Calculate the min / max delta */ + hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &hpet_clockevent); + hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, + &hpet_clockevent); + + /* + * Read the HPET ID register to retrieve the IRQ routing + * information and the number of channels + */ + id = hpet_readl(HPET_ID); + +#ifdef CONFIG_HPET_EMULATE_RTC + /* + * The legacy routing mode needs at least two channels, tick timer + * and the rtc emulation channel. + */ + if (!(id & HPET_ID_NUMBER)) + goto out_nohpet; +#endif + + /* Start the counter */ + hpet_start_counter(); + + if (id & HPET_ID_LEGSUP) { + hpet_enable_int(); + hpet_reserve_platform_timers(id); + /* + * Start hpet with the boot cpu mask and make it + * global after the IO_APIC has been initialized. + */ + hpet_clockevent.cpumask =cpumask_of_cpu(0); + clockevents_register_device(&hpet_clockevent); + global_clock_event = &hpet_clockevent; + return 1; + } + return 0; +out_nohpet: + iounmap(hpet_virt_address); + hpet_virt_address = NULL; + return 0; +} + +/* + * Clock source related code + */ static cycle_t read_hpet(void) { - return (cycle_t)readl(hpet_ptr); + return (cycle_t)hpet_readl(HPET_COUNTER); } static struct clocksource clocksource_hpet = { @@ -24,28 +286,17 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = HPET_MASK, - .mult = 0, /* set below */ .shift = HPET_SHIFT, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init init_hpet_clocksource(void) { - unsigned long hpet_period; - void __iomem* hpet_base; u64 tmp; - int err; - if (!is_hpet_enabled()) + if (!hpet_virt_address) return -ENODEV; - /* calculate the hpet address: */ - hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); - hpet_ptr = hpet_base + HPET_COUNTER; - - /* calculate the frequency: */ - hpet_period = readl(hpet_base + HPET_PERIOD); - /* * hpet period is in femto seconds per cycle * so we need to convert this to ns/cyc units @@ -61,11 +312,218 @@ static int __init init_hpet_clocksource(void) do_div(tmp, FSEC_PER_NSEC); clocksource_hpet.mult = (u32)tmp; - err = clocksource_register(&clocksource_hpet); - if (err) - iounmap(hpet_base); - - return err; + return clocksource_register(&clocksource_hpet); } module_init(init_hpet_clocksource); + +#ifdef CONFIG_HPET_EMULATE_RTC + +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt + * overhead. (DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher. + */ +#include <linux/mc146818rtc.h> +#include <linux/rtc.h> + +#define DEFAULT_RTC_INT_FREQ 64 +#define DEFAULT_RTC_SHIFT 6 +#define RTC_NUM_INTS 1 + +static unsigned long hpet_rtc_flags; +static unsigned long hpet_prev_update_sec; +static struct rtc_time hpet_alarm_time; +static unsigned long hpet_pie_count; +static unsigned long hpet_t1_cmp; +static unsigned long hpet_default_delta; +static unsigned long hpet_pie_delta; +static unsigned long hpet_pie_limit; + +/* + * Timer 1 for RTC emulation. We use one shot mode, as periodic mode + * is not supported by all HPET implementations for timer 1. + * + * hpet_rtc_timer_init() is called when the rtc is initialized. + */ +int hpet_rtc_timer_init(void) +{ + unsigned long cfg, cnt, delta, flags; + + if (!is_hpet_enabled()) + return 0; + + if (!hpet_default_delta) { + uint64_t clc; + + clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; + hpet_default_delta = (unsigned long) clc; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + local_irq_save(flags); + + cnt = delta + hpet_readl(HPET_COUNTER); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + local_irq_restore(flags); + + return 1; +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. + */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags &= ~bit_mask; + return 1; +} + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + unsigned long oldbits = hpet_rtc_flags; + + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags |= bit_mask; + + if (!oldbits) + hpet_rtc_timer_init(); + + return 1; +} + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, + unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_alarm_time.tm_hour = hrs; + hpet_alarm_time.tm_min = min; + hpet_alarm_time.tm_sec = sec; + + return 1; +} + +int hpet_set_periodic_freq(unsigned long freq) +{ + uint64_t clc; + + if (!is_hpet_enabled()) + return 0; + + if (freq <= DEFAULT_RTC_INT_FREQ) + hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; + else { + clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + do_div(clc, freq); + clc >>= hpet_clockevent.shift; + hpet_pie_delta = (unsigned long) clc; + } + return 1; +} + +int hpet_rtc_dropped_irq(void) +{ + return is_hpet_enabled(); +} + +static void hpet_rtc_timer_reinit(void) +{ + unsigned long cfg, delta; + int lost_ints = -1; + + if (unlikely(!hpet_rtc_flags)) { + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); + return; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + /* + * Increment the comparator value until we are ahead of the + * current count. + */ + do { + hpet_t1_cmp += delta; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + lost_ints++; + } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0); + + if (lost_ints) { + if (hpet_rtc_flags & RTC_PIE) + hpet_pie_count += lost_ints; + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost %d interrupts\n", + lost_ints); + } +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + + hpet_rtc_timer_reinit(); + + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) + rtc_get_rtc_time(&curr_time); + + if (hpet_rtc_flags & RTC_UIE && + curr_time.tm_sec != hpet_prev_update_sec) { + rtc_int_flag = RTC_UF; + hpet_prev_update_sec = curr_time.tm_sec; + } + + if (hpet_rtc_flags & RTC_PIE && + ++hpet_pie_count >= hpet_pie_limit) { + rtc_int_flag |= RTC_PF; + hpet_pie_count = 0; + } + + if (hpet_rtc_flags & RTC_PIE && + (curr_time.tm_sec == hpet_alarm_time.tm_sec) && + (curr_time.tm_min == hpet_alarm_time.tm_min) && + (curr_time.tm_hour == hpet_alarm_time.tm_hour)) + rtc_int_flag |= RTC_AF; + + if (rtc_int_flag) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + rtc_interrupt(rtc_int_flag, dev_id); + } + return IRQ_HANDLED; +} +#endif diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c index 9a0060b92e3..a6bc7bb3883 100644 --- a/arch/i386/kernel/i8253.c +++ b/arch/i386/kernel/i8253.c @@ -2,7 +2,7 @@ * i8253.c 8253/PIT functions * */ -#include <linux/clocksource.h> +#include <linux/clockchips.h> #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/sysdev.h> @@ -19,17 +19,97 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -void setup_pit_timer(void) +/* + * HPET replaces the PIT, when enabled. So we need to know, which of + * the two timers is used + */ +struct clock_event_device *global_clock_event; + +/* + * Initialize the PIT timer. + * + * This is also called after resume to bring the PIT into operation again. + */ +static void init_pit_timer(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + udelay(10); + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + } + spin_unlock_irqrestore(&i8253_lock, flags); +} + +/* + * Program the next event in oneshot mode + * + * Delta is given in PIT ticks + */ +static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ spin_unlock_irqrestore(&i8253_lock, flags); + + return 0; +} + +/* + * On UP the PIT can serve all of the possible timer functions. On SMP systems + * it can be solely used for the global tick. + * + * The profiling and update capabilites are switched off once the local apic is + * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - + * !using_apic_timer decisions in do_timer_interrupt_hook() + */ +struct clock_event_device pit_clockevent = { + .name = "pit", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, + .irq = 0, +}; + +/* + * Initialize the conversion factor and the min/max deltas of the clock event + * structure and register the clock event source with the framework. + */ +void __init setup_pit_timer(void) +{ + /* + * Start pit with the boot cpu mask and make it global after the + * IO_APIC has been initialized. + */ + pit_clockevent.cpumask = cpumask_of_cpu(0); + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + clockevents_register_device(&pit_clockevent); + global_clock_event = &pit_clockevent; } /* @@ -46,7 +126,7 @@ static cycle_t pit_read(void) static u32 old_jifs; spin_lock_irqsave(&i8253_lock, flags); - /* + /* * Although our caller may have the read side of xtime_lock, * this is now a seqlock, and we are cheating in this routine * by having side effects on state that we cannot undo if diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index c8d45821c78..03abfdb1a6e 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -41,6 +41,7 @@ static void mask_and_ack_8259A(unsigned int); static struct irq_chip i8259A_chip = { .name = "XT-PIC", .mask = disable_8259A_irq, + .disable = disable_8259A_irq, .unmask = enable_8259A_irq, .mask_ack = mask_and_ack_8259A, }; @@ -410,12 +411,6 @@ void __init native_init_IRQ(void) intr_init_hook(); /* - * Set the clock to HZ Hz, we already have a valid - * vector now: - */ - setup_pit_timer(); - - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. */ diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index e30ccedad0b..4ccebd454e2 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -482,8 +482,8 @@ static void do_irq_balance(void) package_index = CPU_TO_PACKAGEINDEX(i); for (j = 0; j < NR_IRQS; j++) { unsigned long value_now, delta; - /* Is this an active IRQ? */ - if (!irq_desc[j].action) + /* Is this an active IRQ or balancing disabled ? */ + if (!irq_desc[j].action || irq_balancing_disabled(j)) continue; if ( package_index == i ) IRQ_DELTA(package_index,j) = 0; @@ -1281,11 +1281,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else { - irq_desc[irq].status |= IRQ_DELAYED_DISABLE; + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); - } set_intr_gate(vector, interrupt[irq]); } @@ -1588,7 +1586,7 @@ void /*__init*/ print_local_APIC(void * dummy) v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 5785d84103a..0f2ca590bf2 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -10,7 +10,6 @@ * io_apic.c.) */ -#include <asm/uaccess.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/interrupt.h> @@ -21,19 +20,34 @@ #include <asm/idle.h> +#include <asm/apic.h> +#include <asm/uaccess.h> + DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); -#ifndef CONFIG_X86_LOCAL_APIC /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. */ void ack_bad_irq(unsigned int irq) { - printk("unexpected IRQ trap at vector %02x\n", irq); -} + printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); #endif +} #ifdef CONFIG_4KSTACKS /* diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 5d8a07c2028..821df34d2b3 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -23,6 +23,7 @@ #include <linux/dmi.h> #include <linux/kprobes.h> #include <linux/cpumask.h> +#include <linux/kernel_stat.h> #include <asm/smp.h> #include <asm/nmi.h> @@ -973,9 +974,13 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) cpu_clear(cpu, backtrace_mask); } - sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + /* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); - /* if the apic timer isn't firing, this cpu isn't doing much */ + /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { /* * Ayiee, looks like this CPU is stuck ... diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 7845d480c29..bea304d48cd 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -38,6 +38,7 @@ #include <linux/ptrace.h> #include <linux/random.h> #include <linux/personality.h> +#include <linux/tick.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -211,6 +212,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { + tick_nohz_stop_sched_tick(); while (!need_resched()) { void (*idle)(void); @@ -238,6 +240,7 @@ void cpu_idle(void) idle(); __exit_idle(); } + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index f46a4d095e6..48bfcaa13ec 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -94,12 +94,6 @@ cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; -/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there - * is no way to resync one AP against BP. TBD: for prescott and above, we - * should use IA64's algorithm - */ -static int __devinitdata tsc_sync_disabled; - /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_data); @@ -216,151 +210,6 @@ valid_k7: ; } -/* - * TSC synchronization. - * - * We first check whether all CPUs have their TSC's synchronized, - * then we print a warning if not, and always resync. - */ - -static struct { - atomic_t start_flag; - atomic_t count_start; - atomic_t count_stop; - unsigned long long values[NR_CPUS]; -} tsc __cpuinitdata = { - .start_flag = ATOMIC_INIT(0), - .count_start = ATOMIC_INIT(0), - .count_stop = ATOMIC_INIT(0), -}; - -#define NR_LOOPS 5 - -static void __init synchronize_tsc_bp(void) -{ - int i; - unsigned long long t0; - unsigned long long sum, avg; - long long delta; - unsigned int one_usec; - int buggy = 0; - - printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus()); - - /* convert from kcyc/sec to cyc/usec */ - one_usec = cpu_khz / 1000; - - atomic_set(&tsc.start_flag, 1); - wmb(); - - /* - * We loop a few times to get a primed instruction cache, - * then the last pass is more or less synchronized and - * the BP and APs set their cycle counters to zero all at - * once. This reduces the chance of having random offsets - * between the processors, and guarantees that the maximum - * delay between the cycle counters is never bigger than - * the latency of information-passing (cachelines) between - * two CPUs. - */ - for (i = 0; i < NR_LOOPS; i++) { - /* - * all APs synchronize but they loop on '== num_cpus' - */ - while (atomic_read(&tsc.count_start) != num_booting_cpus()-1) - cpu_relax(); - atomic_set(&tsc.count_stop, 0); - wmb(); - /* - * this lets the APs save their current TSC: - */ - atomic_inc(&tsc.count_start); - - rdtscll(tsc.values[smp_processor_id()]); - /* - * We clear the TSC in the last loop: - */ - if (i == NR_LOOPS-1) - write_tsc(0, 0); - - /* - * Wait for all APs to leave the synchronization point: - */ - while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1) - cpu_relax(); - atomic_set(&tsc.count_start, 0); - wmb(); - atomic_inc(&tsc.count_stop); - } - - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (cpu_isset(i, cpu_callout_map)) { - t0 = tsc.values[i]; - sum += t0; - } - } - avg = sum; - do_div(avg, num_booting_cpus()); - - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - delta = tsc.values[i] - avg; - if (delta < 0) - delta = -delta; - /* - * We report bigger than 2 microseconds clock differences. - */ - if (delta > 2*one_usec) { - long long realdelta; - - if (!buggy) { - buggy = 1; - printk("\n"); - } - realdelta = delta; - do_div(realdelta, one_usec); - if (tsc.values[i] < avg) - realdelta = -realdelta; - - if (realdelta) - printk(KERN_INFO "CPU#%d had %Ld usecs TSC " - "skew, fixed it up.\n", i, realdelta); - } - } - if (!buggy) - printk("passed.\n"); -} - -static void __cpuinit synchronize_tsc_ap(void) -{ - int i; - - /* - * Not every cpu is online at the time - * this gets called, so we first wait for the BP to - * finish SMP initialization: - */ - while (!atomic_read(&tsc.start_flag)) - cpu_relax(); - - for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc.count_start); - while (atomic_read(&tsc.count_start) != num_booting_cpus()) - cpu_relax(); - - rdtscll(tsc.values[smp_processor_id()]); - if (i == NR_LOOPS-1) - write_tsc(0, 0); - - atomic_inc(&tsc.count_stop); - while (atomic_read(&tsc.count_stop) != num_booting_cpus()) - cpu_relax(); - } -} -#undef NR_LOOPS - extern void calibrate_delay(void); static atomic_t init_deasserted; @@ -438,20 +287,12 @@ static void __cpuinit smp_callin(void) /* * Save our processor parameters */ - smp_store_cpu_info(cpuid); - - disable_APIC_timer(); + smp_store_cpu_info(cpuid); /* * Allow the master to continue. */ cpu_set(cpuid, cpu_callin_map); - - /* - * Synchronize the TSC with the BP - */ - if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) - synchronize_tsc_ap(); } static int cpucount; @@ -554,13 +395,17 @@ static void __cpuinit start_secondary(void *unused) smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); + /* + * Check TSC synchronization with the BP: + */ + check_tsc_sync_target(); + setup_secondary_clock(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); enable_NMI_through_LVT0(NULL); enable_8259A_irq(0); } - enable_APIC_timer(); /* * low-memory mappings have been cleared, flush them from * the local TLBs too. @@ -752,7 +597,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) /* * Due to the Pentium erratum 3AP. */ - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) { apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); @@ -849,7 +694,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n",j); @@ -1125,8 +970,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) info.cpu = cpu; INIT_WORK(&info.task, do_warm_boot_cpu); - tsc_sync_disabled = 1; - /* init low mem mapping */ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); @@ -1134,7 +977,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) schedule_work(&info.task); wait_for_completion(&done); - tsc_sync_disabled = 0; zap_low_mappings(); ret = 0; exit: @@ -1331,12 +1173,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus) smpboot_setup_io_apic(); setup_boot_clock(); - - /* - * Synchronize the TSC with the AP - */ - if (cpu_has_tsc && cpucount && cpu_khz) - synchronize_tsc_bp(); } /* These are wrappers to interface to the new boot process. Someone @@ -1471,9 +1307,16 @@ int __cpuinit __cpu_up(unsigned int cpu) } local_irq_enable(); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Unleash the CPU! */ cpu_set(cpu, smp_commenced_mask); + + /* + * Check TSC synchronization with the AP: + */ + check_tsc_sync_source(cpu); + while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index a4f67a6e682..a5350059557 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -159,15 +159,6 @@ EXPORT_SYMBOL(profile_pc); */ irqreturn_t timer_interrupt(int irq, void *dev_id) { - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - #ifdef CONFIG_X86_IO_APIC if (timer_ack) { /* @@ -186,7 +177,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) do_timer_interrupt_hook(); - if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. You can't turn them off, nor would you want to (any attempt to @@ -201,18 +191,11 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ } - write_sequnlock(&xtime_lock); - -#ifdef CONFIG_X86_LOCAL_APIC - if (using_apic_timer) - smp_send_timer_broadcast_ipi(); -#endif - return IRQ_HANDLED; } /* not static: needed by APM */ -unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned long retval; unsigned long flags; @@ -225,7 +208,6 @@ unsigned long get_cmos_time(void) return retval; } -EXPORT_SYMBOL(get_cmos_time); static void sync_cmos_clock(unsigned long dummy); @@ -278,114 +260,16 @@ void notify_arch_cmos_timer(void) mod_timer(&sync_cmos_timer, jiffies + 1); } -static long clock_cmos_diff; -static unsigned long sleep_start; - -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - unsigned long ctime = get_cmos_time(); - - clock_cmos_diff = -ctime; - clock_cmos_diff += get_seconds(); - sleep_start = ctime; - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long ctime = get_cmos_time(); - long sleep_length = (ctime - sleep_start) * HZ; - struct timespec ts; - - if (sleep_length < 0) { - printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n"); - /* The time after the resume must not be earlier than the time - * before the suspend or some nasty things will happen - */ - sleep_length = 0; - ctime = sleep_start; - } -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) - hpet_reenable(); -#endif - setup_pit_timer(); - - sec = ctime + clock_cmos_diff; - ts.tv_sec = sec; - ts.tv_nsec = 0; - do_settimeofday(&ts); - write_seqlock_irqsave(&xtime_lock, flags); - jiffies_64 += sleep_length; - write_sequnlock_irqrestore(&xtime_lock, flags); - touch_softlockup_watchdog(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - -#ifdef CONFIG_HPET_TIMER extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - struct timespec ts; - ts.tv_sec = get_cmos_time(); - ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - - do_settimeofday(&ts); - - if ((hpet_enable() >= 0) && hpet_use_timer) { - printk("Using HPET for base-timer\n"); - } - + if (!hpet_enable()) + setup_pit_timer(); do_time_init(); } -#endif void __init time_init(void) { - struct timespec ts; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_capable()) { - /* - * HPET initialization needs to do memory-mapped io. So, let - * us do a late initialization after mem_init(). - */ - late_time_init = hpet_time_init; - return; - } -#endif - ts.tv_sec = get_cmos_time(); - ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - - do_settimeofday(&ts); - - do_time_init(); + late_time_init = hpet_time_init; } diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 46f752a8bbf..3082a418635 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -60,12 +60,6 @@ static inline int check_tsc_unstable(void) return tsc_unstable; } -void mark_tsc_unstable(void) -{ - tsc_unstable = 1; -} -EXPORT_SYMBOL_GPL(mark_tsc_unstable); - /* Accellerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: @@ -222,34 +216,6 @@ out_no_tsc: #ifdef CONFIG_CPU_FREQ -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(struct work_struct *work) -{ - unsigned int cpu; - - for_each_online_cpu(cpu) - cpufreq_get(cpu); - - cpufreq_delayed_issched = 0; -} - -/* - * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - /* * if the CPU frequency is scaled, TSC-based delays will need a different * loops_per_jiffy value to function properly. @@ -313,17 +279,9 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { - int ret; - - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - - return ret; + return cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); } - core_initcall(cpufreq_tsc); #endif @@ -331,7 +289,6 @@ core_initcall(cpufreq_tsc); /* clock source code */ static unsigned long current_tsc_khz = 0; -static int tsc_update_callback(void); static cycle_t read_tsc(void) { @@ -349,37 +306,28 @@ static struct clocksource clocksource_tsc = { .mask = CLOCKSOURCE_MASK(64), .mult = 0, /* to be set */ .shift = 22, - .update_callback = tsc_update_callback, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, }; -static int tsc_update_callback(void) +void mark_tsc_unstable(void) { - int change = 0; - - /* check to see if we should switch to the safe clocksource: */ - if (clocksource_tsc.rating != 0 && check_tsc_unstable()) { - clocksource_tsc.rating = 0; - clocksource_reselect(); - change = 1; - } - - /* only update if tsc_khz has changed: */ - if (current_tsc_khz != tsc_khz) { - current_tsc_khz = tsc_khz; - clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, - clocksource_tsc.shift); - change = 1; + if (!tsc_unstable) { + tsc_unstable = 1; + /* Can be called before registration */ + if (clocksource_tsc.mult) + clocksource_change_rating(&clocksource_tsc, 0); + else + clocksource_tsc.rating = 0; } - - return change; } +EXPORT_SYMBOL_GPL(mark_tsc_unstable); static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) { printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", d->ident); - mark_tsc_unstable(); + tsc_unstable = 1; return 0; } @@ -396,65 +344,44 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { {} }; -#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */ -static struct timer_list verify_tsc_freq_timer; - -/* XXX - Probably should add locking */ -static void verify_tsc_freq(unsigned long unused) -{ - static u64 last_tsc; - static unsigned long last_jiffies; - - u64 now_tsc, interval_tsc; - unsigned long now_jiffies, interval_jiffies; - - - if (check_tsc_unstable()) - return; - - rdtscll(now_tsc); - now_jiffies = jiffies; - - if (!last_jiffies) { - goto out; - } - - interval_jiffies = now_jiffies - last_jiffies; - interval_tsc = now_tsc - last_tsc; - interval_tsc *= HZ; - do_div(interval_tsc, cpu_khz*1000); - - if (interval_tsc < (interval_jiffies * 3 / 4)) { - printk("TSC appears to be running slowly. " - "Marking it as unstable\n"); - mark_tsc_unstable(); - return; - } - -out: - last_tsc = now_tsc; - last_jiffies = now_jiffies; - /* set us up to go off on the next interval: */ - mod_timer(&verify_tsc_freq_timer, - jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL)); -} - /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ -static __init int unsynchronized_tsc(void) +__cpuinit int unsynchronized_tsc(void) { + if (!cpu_has_tsc || tsc_unstable) + return 1; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return 0; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + /* assume multi socket systems are not synchronized: */ + if (num_possible_cpus() > 1) + tsc_unstable = 1; + } + return tsc_unstable; +} + +/* + * Geode_LX - the OLPC CPU has a possibly a very reliable TSC + */ +#ifdef CONFIG_MGEODE_LX +/* RTSC counts during suspend */ +#define RTSC_SUSP 0x100 + +static void __init check_geode_tsc_reliable(void) +{ + unsigned long val; - /* assume multi socket systems are not synchronized: */ - return num_possible_cpus() > 1; + rdmsrl(MSR_GEODE_BUSCONT_CONF0, val); + if ((val & RTSC_SUSP)) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } +#else +static inline void check_geode_tsc_reliable(void) { } +#endif static int __init init_tsc_clocksource(void) { @@ -463,20 +390,16 @@ static int __init init_tsc_clocksource(void) /* check blacklist */ dmi_check_system(bad_tsc_dmi_table); - if (unsynchronized_tsc()) /* mark unstable if unsynced */ - mark_tsc_unstable(); + unsynchronized_tsc(); + check_geode_tsc_reliable(); current_tsc_khz = tsc_khz; clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, clocksource_tsc.shift); /* lower the rating if we already know its unstable: */ - if (check_tsc_unstable()) + if (check_tsc_unstable()) { clocksource_tsc.rating = 0; - - init_timer(&verify_tsc_freq_timer); - verify_tsc_freq_timer.function = verify_tsc_freq; - verify_tsc_freq_timer.expires = - jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL); - add_timer(&verify_tsc_freq_timer); + clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; + } return clocksource_register(&clocksource_tsc); } diff --git a/arch/i386/kernel/tsc_sync.c b/arch/i386/kernel/tsc_sync.c new file mode 100644 index 00000000000..12424629af8 --- /dev/null +++ b/arch/i386/kernel/tsc_sync.c @@ -0,0 +1 @@ +#include "../../x86_64/kernel/tsc_sync.c" diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c index 2e2d8dbcbd6..76d2adcae5a 100644 --- a/arch/i386/kernel/vmitime.c +++ b/arch/i386/kernel/vmitime.c @@ -115,7 +115,7 @@ static struct clocksource clocksource_vmi = { .mask = CLOCKSOURCE_MASK(64), .mult = 0, /* to be set */ .shift = 22, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; diff --git a/arch/i386/mach-default/setup.c b/arch/i386/mach-default/setup.c index cc2f519b2f7..c7881621070 100644 --- a/arch/i386/mach-default/setup.c +++ b/arch/i386/mach-default/setup.c @@ -79,7 +79,12 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .mask = CPU_MASK_NONE, + .name = "timer" +}; /** * time_init_hook - do any specific initialisations for the system timer. @@ -90,6 +95,7 @@ static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, **/ void __init time_init_hook(void) { + irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index 545fcbc8cea..e5e56bd498d 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -307,7 +307,7 @@ static unsigned int __init calibrate_hpt(void) struct clocksource clocksource_mips = { .name = "MIPS", .mask = 0xffffffff, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static void __init init_mips_clocksource(void) diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 39db1289021..5e5c0e4add9 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -305,8 +305,6 @@ static int pmac_pic_host_map(struct irq_host *h, unsigned int virq, level = !!(level_mask[hw >> 5] & (1UL << (hw & 0x1f))); if (level) desc->status |= IRQ_LEVEL; - else - desc->status |= IRQ_DELAYED_DISABLE; set_irq_chip_and_handler(virq, &pmac_pic, level ? handle_level_irq : handle_edge_irq); return 0; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 3b91f27ab20..ee9fd7b8592 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -312,7 +312,7 @@ static struct clocksource clocksource_tod = { .mask = -1ULL, .mult = 1000, .shift = 12, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index 925a65240cf..b2e1fd8e357 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -97,20 +97,22 @@ static int write_sigio_thread(void *unused) static int need_poll(struct pollfds *polls, int n) { - if(n <= polls->size){ - polls->used = n; + struct pollfd *new; + + if(n <= polls->size) return 0; - } - kfree(polls->poll); - polls->poll = um_kmalloc_atomic(n * sizeof(struct pollfd)); - if(polls->poll == NULL){ + + new = um_kmalloc_atomic(n * sizeof(struct pollfd)); + if(new == NULL){ printk("need_poll : failed to allocate new pollfds\n"); - polls->size = 0; - polls->used = 0; return -ENOMEM; } + + memcpy(new, polls->poll, polls->used * sizeof(struct pollfd)); + kfree(polls->poll); + + polls->poll = new; polls->size = n; - polls->used = n; return 0; } @@ -171,15 +173,15 @@ int add_sigio_fd(int fd) goto out; } - n = current_poll.used + 1; - err = need_poll(&next_poll, n); + n = current_poll.used; + err = need_poll(&next_poll, n + 1); if(err) goto out; - for(i = 0; i < current_poll.used; i++) - next_poll.poll[i] = current_poll.poll[i]; - - next_poll.poll[n - 1] = *p; + memcpy(next_poll.poll, current_poll.poll, + current_poll.used * sizeof(struct pollfd)); + next_poll.poll[n] = *p; + next_poll.used = n + 1; update_thread(); out: sigio_unlock(); @@ -214,6 +216,7 @@ int ignore_sigio_fd(int fd) if(p->fd != fd) next_poll.poll[n++] = *p; } + next_poll.used = current_poll.used - 1; update_thread(); out: @@ -331,10 +334,9 @@ void maybe_sigio_broken(int fd, int read) sigio_lock(); err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1); - if(err){ - printk("maybe_sigio_broken - failed to add pollfd\n"); + if(err) goto out; - } + all_sigio_fds.poll[all_sigio_fds.used++] = ((struct pollfd) { .fd = fd, .events = read ? POLLIN : POLLOUT, diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 7982cbc3bc9..56eb14c9847 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -24,6 +24,14 @@ config X86 bool default y +config GENERIC_TIME + bool + default y + +config GENERIC_TIME_VSYSCALL + bool + default y + config ZONE_DMA32 bool default y diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index ae399458024..bb47e86f3d0 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o @@ -19,7 +19,7 @@ obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o +obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o obj-y += apic.o nmi.o obj-y += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 124b2d27b4a..723417d924c 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -37,6 +37,7 @@ #include <asm/idle.h> #include <asm/proto.h> #include <asm/timex.h> +#include <asm/hpet.h> #include <asm/apic.h> int apic_mapped; @@ -763,7 +764,7 @@ static void setup_APIC_timer(unsigned int clocks) local_irq_save(flags); /* wait for irq slice */ - if (vxtime.hpet_address && hpet_use_timer) { + if (hpet_address && hpet_use_timer) { int trigger = hpet_readl(HPET_T0_CMP); while (hpet_readl(HPET_COUNTER) >= trigger) /* do nothing */ ; @@ -785,7 +786,7 @@ static void setup_APIC_timer(unsigned int clocks) /* Turn off PIT interrupt if we use APIC timer as main timer. Only works with the PM timer right now TBD fix it for HPET too. */ - if (vxtime.mode == VXTIME_PMTMR && + if ((pmtmr_ioport != 0) && smp_processor_id() == boot_cpu_id && apic_runs_main_timer == 1 && !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { diff --git a/arch/i386/kernel/time_hpet.c b/arch/x86_64/kernel/hpet.c index 1e4702dfcd0..65a0edd71a1 100644 --- a/arch/i386/kernel/time_hpet.c +++ b/arch/x86_64/kernel/hpet.c @@ -1,224 +1,138 @@ -/* - * linux/arch/i386/kernel/time_hpet.c - * This code largely copied from arch/x86_64/kernel/time.c - * See that file for credits. - * - * 2003-06-30 Venkatesh Pallipadi - Additional changes for HPET support - */ - -#include <linux/errno.h> #include <linux/kernel.h> -#include <linux/param.h> -#include <linux/string.h> +#include <linux/sched.h> #include <linux/init.h> -#include <linux/smp.h> +#include <linux/mc146818rtc.h> +#include <linux/time.h> +#include <linux/clocksource.h> +#include <linux/ioport.h> +#include <linux/acpi.h> +#include <linux/hpet.h> +#include <asm/pgtable.h> +#include <asm/vsyscall.h> +#include <asm/timex.h> +#include <asm/hpet.h> -#include <asm/timer.h> -#include <asm/fixmap.h> -#include <asm/apic.h> +int nohpet __initdata; -#include <linux/timex.h> +unsigned long hpet_address; +unsigned long hpet_period; /* fsecs / HPET clock */ +unsigned long hpet_tick; /* HPET clocks / interrupt */ -#include <asm/hpet.h> -#include <linux/hpet.h> +int hpet_use_timer; /* Use counter of hpet for time keeping, + * otherwise PIT + */ -static unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* hpet clks count per tick */ -unsigned long hpet_address; /* hpet memory map physical address */ -int hpet_use_timer; +#ifdef CONFIG_HPET +static __init int late_hpet_init(void) +{ + struct hpet_data hd; + unsigned int ntimer; -static int use_hpet; /* can be used for runtime check of hpet */ -static int boot_hpet_disable; /* boottime override for HPET timer */ -static void __iomem * hpet_virt_address; /* hpet kernel virtual address */ + if (!hpet_address) + return 0; -#define FSEC_TO_USEC (1000000000UL) + memset(&hd, 0, sizeof(hd)); -int hpet_readl(unsigned long a) -{ - return readl(hpet_virt_address + a); -} + ntimer = hpet_readl(HPET_ID); + ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + ntimer++; -static void hpet_writel(unsigned long d, unsigned long a) -{ - writel(d, hpet_virt_address + a); -} + /* + * Register with driver. + * Timer0 and Timer1 is used by platform. + */ + hd.hd_phys_address = hpet_address; + hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); + hd.hd_nirqs = ntimer; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + if (ntimer > 2) { + struct hpet *hpet; + struct hpet_timer *timer; + int i; + + hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); + timer = &hpet->hpet_timers[2]; + for (i = 2; i < ntimer; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & + Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; -#ifdef CONFIG_X86_LOCAL_APIC -/* - * HPET counters dont wrap around on every tick. They just change the - * comparator value and continue. Next tick can be caught by checking - * for a change in the comparator value. Used in apic.c. - */ -static void __devinit wait_hpet_tick(void) -{ - unsigned int start_cmp_val, end_cmp_val; + } - start_cmp_val = hpet_readl(HPET_T0_CMP); - do { - end_cmp_val = hpet_readl(HPET_T0_CMP); - } while (start_cmp_val == end_cmp_val); + hpet_alloc(&hd); + return 0; } +fs_initcall(late_hpet_init); #endif -static int hpet_timer_stop_set_go(unsigned long tick) +int hpet_timer_stop_set_go(unsigned long tick) { unsigned int cfg; - /* - * Stop the timers and reset the main counter. - */ +/* + * Stop the timers and reset the main counter. + */ + cfg = hpet_readl(HPET_CFG); - cfg &= ~HPET_CFG_ENABLE; + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); hpet_writel(cfg, HPET_CFG); hpet_writel(0, HPET_COUNTER); hpet_writel(0, HPET_COUNTER + 4); +/* + * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, + * and period also hpet_tick. + */ if (hpet_use_timer) { - /* - * Set up timer 0, as periodic with first interrupt to happen at - * hpet_tick, and period also hpet_tick. - */ - cfg = hpet_readl(HPET_T0_CFG); - cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | - HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); - - /* - * The first write after writing TN_SETVAL to the config register sets - * the counter value, the second write sets the threshold. - */ - hpet_writel(tick, HPET_T0_CMP); - hpet_writel(tick, HPET_T0_CMP); - } - /* - * Go! - */ - cfg = hpet_readl(HPET_CFG); - if (hpet_use_timer) + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT, HPET_T0_CFG); + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ cfg |= HPET_CFG_LEGACY; + } +/* + * Go! + */ + cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); return 0; } -/* - * Check whether HPET was found by ACPI boot parse. If yes setup HPET - * counter 0 for kernel base timer. - */ -int __init hpet_enable(void) +int hpet_arch_init(void) { unsigned int id; - unsigned long tick_fsec_low, tick_fsec_high; /* tick in femto sec */ - unsigned long hpet_tick_rem; - if (boot_hpet_disable) + if (!hpet_address) return -1; + set_fixmap_nocache(FIX_HPET_BASE, hpet_address); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + +/* + * Read the period, compute tick and quotient. + */ - if (!hpet_address) { - return -1; - } - hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); - /* - * Read the period, compute tick and quotient. - */ id = hpet_readl(HPET_ID); - /* - * We are checking for value '1' or more in number field if - * CONFIG_HPET_EMULATE_RTC is set because we will need an - * additional timer for RTC emulation. - * However, we can do with one timer otherwise using the - * the single HPET timer for system time. - */ -#ifdef CONFIG_HPET_EMULATE_RTC - if (!(id & HPET_ID_NUMBER)) { - iounmap(hpet_virt_address); - hpet_virt_address = NULL; + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) return -1; - } -#endif - hpet_period = hpet_readl(HPET_PERIOD); - if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) { - iounmap(hpet_virt_address); - hpet_virt_address = NULL; + if (hpet_period < 100000 || hpet_period > 100000000) return -1; - } - /* - * 64 bit math - * First changing tick into fsec - * Then 64 bit div to find number of hpet clk per tick - */ - ASM_MUL64_REG(tick_fsec_low, tick_fsec_high, - KERNEL_TICK_USEC, FSEC_TO_USEC); - ASM_DIV64_REG(hpet_tick, hpet_tick_rem, - hpet_period, tick_fsec_low, tick_fsec_high); - - if (hpet_tick_rem > (hpet_period >> 1)) - hpet_tick++; /* rounding the result */ - - hpet_use_timer = id & HPET_ID_LEGSUP; - - if (hpet_timer_stop_set_go(hpet_tick)) { - iounmap(hpet_virt_address); - hpet_virt_address = NULL; - return -1; - } + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - use_hpet = 1; + hpet_use_timer = (id & HPET_ID_LEGSUP); -#ifdef CONFIG_HPET - { - struct hpet_data hd; - unsigned int ntimer; - - memset(&hd, 0, sizeof (hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. - */ - hd.hd_phys_address = hpet_address; - hd.hd_address = hpet_virt_address; - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet __iomem *hpet; - struct hpet_timer __iomem *timer; - int i; - - hpet = hpet_virt_address; - - for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer; - timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - - } - - hpet_alloc(&hd); - } -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - if (hpet_use_timer) - wait_timer_tick = wait_hpet_tick; -#endif - return 0; + return hpet_timer_stop_set_go(hpet_tick); } int hpet_reenable(void) @@ -226,28 +140,51 @@ int hpet_reenable(void) return hpet_timer_stop_set_go(hpet_tick); } -int is_hpet_enabled(void) -{ - return use_hpet; -} +/* + * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing + * it to the HPET timer of known frequency. + */ -int is_hpet_capable(void) +#define TICK_COUNT 100000000 +#define TICK_MIN 5000 + +/* + * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none + * occurs between the reads of the hpet & TSC. + */ +static void __init read_hpet_tsc(int *hpet, int *tsc) { - if (!boot_hpet_disable && hpet_address) - return 1; - return 0; + int tsc1, tsc2, hpet1; + + do { + tsc1 = get_cycles_sync(); + hpet1 = hpet_readl(HPET_COUNTER); + tsc2 = get_cycles_sync(); + } while (tsc2 - tsc1 > TICK_MIN); + *hpet = hpet1; + *tsc = tsc2; } -static int __init hpet_setup(char* str) +unsigned int __init hpet_calibrate_tsc(void) { - if (str) { - if (!strncmp("disable", str, 7)) - boot_hpet_disable = 1; - } - return 1; -} + int tsc_start, hpet_start; + int tsc_now, hpet_now; + unsigned long flags; + + local_irq_save(flags); + + read_hpet_tsc(&hpet_start, &tsc_start); -__setup("hpet=", hpet_setup); + do { + local_irq_disable(); + read_hpet_tsc(&hpet_now, &tsc_now); + local_irq_restore(flags); + } while ((tsc_now - tsc_start) < TICK_COUNT && + (hpet_now - hpet_start) < TICK_COUNT); + + return (tsc_now - tsc_start) * 1000000000L + / ((hpet_now - hpet_start) * hpet_period / 1000); +} #ifdef CONFIG_HPET_EMULATE_RTC /* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET @@ -264,7 +201,6 @@ __setup("hpet=", hpet_setup); * For (3), we use interrupts at 64Hz or user specified periodic * frequency, whichever is higher. */ -#include <linux/mc146818rtc.h> #include <linux/rtc.h> #define DEFAULT_RTC_INT_FREQ 64 @@ -283,6 +219,11 @@ static unsigned long PIE_count; static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ static unsigned int hpet_t1_cmp; /* cached comparator register */ +int is_hpet_enabled(void) +{ + return hpet_address != 0; +} + /* * Timer 1 for RTC, we do not use periodic interrupt feature, * even if HPET supports periodic interrupts on Timer 1. @@ -367,8 +308,9 @@ static void hpet_rtc_timer_reinit(void) if (PIE_on) PIE_count += lost_ints; - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); } } @@ -450,7 +392,7 @@ int hpet_rtc_dropped_irq(void) return 1; } -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) { struct rtc_time curr_time; unsigned long rtc_int_flag = 0; @@ -495,3 +437,75 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) } #endif +static int __init nohpet_setup(char *s) +{ + nohpet = 1; + return 1; +} + +__setup("nohpet", nohpet_setup); + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static cycle_t __vsyscall_fn vread_hpet(void) +{ + return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .vread = vread_hpet, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem *hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * aproximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index d73c79e821f..01e2cf0bdeb 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -103,6 +103,7 @@ static void mask_and_ack_8259A(unsigned int); static struct irq_chip i8259A_chip = { .name = "XT-PIC", .mask = disable_8259A_irq, + .disable = disable_8259A_irq, .unmask = enable_8259A_irq, .mask_ack = mask_and_ack_8259A, }; diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 566e64d966c..950682f3576 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -810,11 +810,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else { - irq_desc[irq].status |= IRQ_DELAYED_DISABLE; + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); - } } static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) { diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c index 7554458dc9c..ae8f91214f1 100644 --- a/arch/x86_64/kernel/pmtimer.c +++ b/arch/x86_64/kernel/pmtimer.c @@ -24,15 +24,6 @@ #include <asm/msr.h> #include <asm/vsyscall.h> -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport __read_mostly; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ static inline u32 cyc2us(u32 cycles) @@ -48,38 +39,6 @@ static inline u32 cyc2us(u32 cycles) return (cycles >> 10); } -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - static unsigned pmtimer_wait_tick(void) { u32 a, b; @@ -101,23 +60,6 @@ void pmtimer_wait(unsigned us) } while (cyc2us(b - a) < us); } -void pmtimer_resume(void) -{ - last_pmtmr_tick = inl(pmtmr_ioport); -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - static int __init nopmtimer_setup(char *s) { pmtmr_ioport = 0; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index daf19332f0d..35443729aad 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -148,217 +148,6 @@ static void __cpuinit smp_store_cpu_info(int id) print_cpu_info(c); } -/* - * New Funky TSC sync algorithm borrowed from IA64. - * Main advantage is that it doesn't reset the TSCs fully and - * in general looks more robust and it works better than my earlier - * attempts. I believe it was written by David Mosberger. Some minor - * adjustments for x86-64 by me -AK - * - * Original comment reproduced below. - * - * Synchronize TSC of the current (slave) CPU with the TSC of the - * MASTER CPU (normally the time-keeper CPU). We use a closed loop to - * eliminate the possibility of unaccounted-for errors (such as - * getting a machine check in the middle of a calibration step). The - * basic idea is for the slave to ask the master what itc value it has - * and to read its own itc before and after the master responds. Each - * iteration gives us three timestamps: - * - * slave master - * - * t0 ---\ - * ---\ - * ---> - * tm - * /--- - * /--- - * t1 <--- - * - * - * The goal is to adjust the slave's TSC such that tm falls exactly - * half-way between t0 and t1. If we achieve this, the clocks are - * synchronized provided the interconnect between the slave and the - * master is symmetric. Even if the interconnect were asymmetric, we - * would still know that the synchronization error is smaller than the - * roundtrip latency (t0 - t1). - * - * When the interconnect is quiet and symmetric, this lets us - * synchronize the TSC to within one or two cycles. However, we can - * only *guarantee* that the synchronization is accurate to within a - * round-trip time, which is typically in the range of several hundred - * cycles (e.g., ~500 cycles). In practice, this means that the TSCs - * are usually almost perfectly synchronized, but we shouldn't assume - * that the accuracy is much better than half a micro second or so. - * - * [there are other errors like the latency of RDTSC and of the - * WRMSR. These can also account to hundreds of cycles. So it's - * probably worse. It claims 153 cycles error on a dual Opteron, - * but I suspect the numbers are actually somewhat worse -AK] - */ - -#define MASTER 0 -#define SLAVE (SMP_CACHE_BYTES/8) - -/* Intentionally don't use cpu_relax() while TSC synchronization - because we don't want to go into funky power save modi or cause - hypervisors to schedule us away. Going to sleep would likely affect - latency and low latency is the primary objective here. -AK */ -#define no_cpu_relax() barrier() - -static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); -static volatile __cpuinitdata unsigned long go[SLAVE + 1]; -static int notscsync __cpuinitdata; - -#undef DEBUG_TSC_SYNC - -#define NUM_ROUNDS 64 /* magic value */ -#define NUM_ITERS 5 /* likewise */ - -/* Callback on boot CPU */ -static __cpuinit void sync_master(void *arg) -{ - unsigned long flags, i; - - go[MASTER] = 0; - - local_irq_save(flags); - { - for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { - while (!go[MASTER]) - no_cpu_relax(); - go[MASTER] = 0; - rdtscll(go[SLAVE]); - } - } - local_irq_restore(flags); -} - -/* - * Return the number of cycles by which our tsc differs from the tsc - * on the master (time-keeper) CPU. A positive number indicates our - * tsc is ahead of the master, negative that it is behind. - */ -static inline long -get_delta(long *rt, long *master) -{ - unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; - unsigned long tcenter, t0, t1, tm; - int i; - - for (i = 0; i < NUM_ITERS; ++i) { - rdtscll(t0); - go[MASTER] = 1; - while (!(tm = go[SLAVE])) - no_cpu_relax(); - go[SLAVE] = 0; - rdtscll(t1); - - if (t1 - t0 < best_t1 - best_t0) - best_t0 = t0, best_t1 = t1, best_tm = tm; - } - - *rt = best_t1 - best_t0; - *master = best_tm - best_t0; - - /* average best_t0 and best_t1 without overflow: */ - tcenter = (best_t0/2 + best_t1/2); - if (best_t0 % 2 + best_t1 % 2 == 2) - ++tcenter; - return tcenter - best_tm; -} - -static __cpuinit void sync_tsc(unsigned int master) -{ - int i, done = 0; - long delta, adj, adjust_latency = 0; - unsigned long flags, rt, master_time_stamp, bound; -#ifdef DEBUG_TSC_SYNC - static struct syncdebug { - long rt; /* roundtrip time */ - long master; /* master's timestamp */ - long diff; /* difference between midpoint and master's timestamp */ - long lat; /* estimate of tsc adjustment latency */ - } t[NUM_ROUNDS] __cpuinitdata; -#endif - - printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", - smp_processor_id(), master); - - go[MASTER] = 1; - - /* It is dangerous to broadcast IPI as cpus are coming up, - * as they may not be ready to accept them. So since - * we only need to send the ipi to the boot cpu direct - * the message, and avoid the race. - */ - smp_call_function_single(master, sync_master, NULL, 1, 0); - - while (go[MASTER]) /* wait for master to be ready */ - no_cpu_relax(); - - spin_lock_irqsave(&tsc_sync_lock, flags); - { - for (i = 0; i < NUM_ROUNDS; ++i) { - delta = get_delta(&rt, &master_time_stamp); - if (delta == 0) { - done = 1; /* let's lock on to this... */ - bound = rt; - } - - if (!done) { - unsigned long t; - if (i > 0) { - adjust_latency += -delta; - adj = -delta + adjust_latency/4; - } else - adj = -delta; - - rdtscll(t); - wrmsrl(MSR_IA32_TSC, t + adj); - } -#ifdef DEBUG_TSC_SYNC - t[i].rt = rt; - t[i].master = master_time_stamp; - t[i].diff = delta; - t[i].lat = adjust_latency/4; -#endif - } - } - spin_unlock_irqrestore(&tsc_sync_lock, flags); - -#ifdef DEBUG_TSC_SYNC - for (i = 0; i < NUM_ROUNDS; ++i) - printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", - t[i].rt, t[i].master, t[i].diff, t[i].lat); -#endif - - printk(KERN_INFO - "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " - "maxerr %lu cycles)\n", - smp_processor_id(), master, delta, rt); -} - -static void __cpuinit tsc_sync_wait(void) -{ - /* - * When the CPU has synchronized TSCs assume the BIOS - * or the hardware already synced. Otherwise we could - * mess up a possible perfect synchronization with a - * not-quite-perfect algorithm. - */ - if (notscsync || !cpu_has_tsc || !unsynchronized_tsc()) - return; - sync_tsc(0); -} - -static __init int notscsync_setup(char *s) -{ - notscsync = 1; - return 1; -} -__setup("notscsync", notscsync_setup); - static atomic_t init_deasserted __cpuinitdata; /* @@ -546,6 +335,11 @@ void __cpuinit start_secondary(void) /* otherwise gcc will move up the smp_processor_id before the cpu_init */ barrier(); + /* + * Check TSC sync first: + */ + check_tsc_sync_target(); + Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); setup_secondary_APIC_clock(); @@ -565,14 +359,6 @@ void __cpuinit start_secondary(void) */ set_cpu_sibling_map(smp_processor_id()); - /* - * Wait for TSC sync to not schedule things before. - * We still process interrupts, which could see an inconsistent - * time in that window unfortunately. - * Do this here because TSC sync has global unprotected state. - */ - tsc_sync_wait(); - /* * We need to hold call_lock, so there is no inconsistency * between the time smp_call_function() determines number of @@ -592,6 +378,7 @@ void __cpuinit start_secondary(void) cpu_set(smp_processor_id(), cpu_online_map); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; spin_unlock(&vector_lock); + unlock_ipi_call_lock(); cpu_idle(); @@ -1168,6 +955,11 @@ int __cpuinit __cpu_up(unsigned int cpu) /* Unleash the CPU! */ Dprintk("waiting for cpu %d\n", cpu); + /* + * Make sure and check TSC sync: + */ + check_tsc_sync_source(cpu); + while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); @@ -1190,7 +982,6 @@ void __init smp_cpus_done(unsigned int max_cpus) smp_cleanup_boot(); setup_ioapic_dest(); check_nmi_watchdog(); - time_init_gtod(); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 3cc6886f1fb..a87c5170562 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -42,10 +42,8 @@ #include <linux/cpufreq.h> #include <linux/hpet.h> #include <asm/apic.h> +#include <asm/hpet.h> -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif extern void i8254_timer_resume(void); extern int using_apic_timer; @@ -55,128 +53,7 @@ DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); -int nohpet __initdata = 0; -static int notsc __initdata = 0; - -#define USEC_PER_TICK (USEC_PER_SEC / HZ) -#define NSEC_PER_TICK (NSEC_PER_SEC / HZ) -#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) - -#define NS_SCALE 10 /* 2^10, carefully chosen */ -#define US_SCALE 32 /* 2^32, arbitralrily chosen */ - -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -EXPORT_SYMBOL(cpu_khz); -static unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* HPET clocks / interrupt */ -int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ -unsigned long vxtime_hz = PIT_TICK_RATE; -int report_lost_ticks; /* command line option */ -unsigned long long monotonic_base; - -struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ - volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -struct timespec __xtime __section_xtime; -struct timezone __sys_tz __section_sys_tz; - -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. - */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - t = get_cycles_sync(); - if (t < vxtime.last_tsc) - t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. - */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / NSEC_PER_USEC; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. -AK */ - usec += do_gettimeoffset(); - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / USEC_PER_SEC; - tv->tv_usec = usec % USEC_PER_SEC; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. - */ - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * NSEC_PER_USEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); unsigned long profile_pc(struct pt_regs *regs) { @@ -267,84 +144,9 @@ static void set_rtc_mmss(unsigned long nowtime) } -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -static inline unsigned long long cycles_2_ns(unsigned long long cyc); -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *= NSEC_PER_TICK / hpet_tick; - } else { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - this_offset = get_cycles_sync(); - offset = cycles_2_ns(this_offset - last_offset); - } - return base + offset; -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost) -{ - static long lost_count; - static int warned; - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer tick(s)! ", lost); - print_symbol("rip %s)\n", get_irq_regs()->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", get_irq_regs()->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - Give cpufreq a change to catch up. */ - if ((lost_count+1) % 25 == 0) - cpufreq_delayed_get(); -#endif -} - void main_timer_handler(void) { static unsigned long rtc_update = 0; - unsigned long tsc; - int delay = 0, offset = 0, lost = 0; - /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -354,72 +156,11 @@ void main_timer_handler(void) write_seqlock(&xtime_lock); - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. - */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else if (!pmtmr_ioport) { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - tsc = get_cycles_sync(); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; - - if (offset < 0) - offset = 0; - - if (offset > USEC_PER_TICK) { - lost = offset / USEC_PER_TICK; - offset %= USEC_PER_TICK; - } - - monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc); - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) < offset) - vxtime.last_tsc = tsc - - (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) - handle_lost_ticks(lost); - else - lost = 0; - /* * Do the timer stuff. */ - do_timer(lost + 1); + do_timer(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif @@ -460,40 +201,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static unsigned int cyc2ns_scale __read_mostly; - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; -} - -unsigned long long sched_clock(void) -{ - unsigned long a = 0; - -#if 0 - /* Don't do a HPET read here. Using TSC always is much faster - and HPET may not be mapped yet when the scheduler first runs. - Disadvantage is a small drift between CPUs in some configurations, - but that should be tolerable. */ - if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; -#endif - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - which means it is not completely exact and may not be monotonous between - CPUs. But the errors should be too small to matter for scheduling - purposes. */ - - rdtscll(a); - return cycles_2_ns(a); -} - static unsigned long get_cmos_time(void) { unsigned int year, mon, day, hour, min, sec; @@ -545,164 +252,6 @@ static unsigned long get_cmos_time(void) return mktime(year, mon, day, hour, min, sec); } -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - changes. - - RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - not that important because current Opteron setups do not support - scaling on SMP anyroads. - - Should fix up last_tsc too. Currently gettimeofday in the - first tick after the change will be slightly wrong. */ - -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(struct work_struct *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG - "Losing some ticks... checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long cpu_khz_ref = 0; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data[freq->cpu].loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - cpu_khz_ref = cpu_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - } - - set_cyc2ns_scale(cpu_khz_ref); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - -/* - * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing - * it to the HPET timer of known frequency. - */ - -#define TICK_COUNT 100000000 -#define TICK_MIN 5000 -#define MAX_READ_RETRIES 5 - -/* - * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none - * occurs between the reads of the hpet & TSC. - */ -static void __init read_hpet_tsc(int *hpet, int *tsc) -{ - int tsc1, tsc2, hpet1, retries = 0; - static int msg; - - do { - tsc1 = get_cycles_sync(); - hpet1 = hpet_readl(HPET_COUNTER); - tsc2 = get_cycles_sync(); - } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES); - if (retries >= MAX_READ_RETRIES && !msg++) - printk(KERN_WARNING - "hpet.c: exceeded max retries to read HPET & TSC\n"); - *hpet = hpet1; - *tsc = tsc2; -} - - -static unsigned int __init hpet_calibrate_tsc(void) -{ - int tsc_start, hpet_start; - int tsc_now, hpet_now; - unsigned long flags; - - local_irq_save(flags); - local_irq_disable(); - - read_hpet_tsc(&hpet_start, &tsc_start); - - do { - local_irq_disable(); - read_hpet_tsc(&hpet_now, &tsc_now); - local_irq_restore(flags); - } while ((tsc_now - tsc_start) < TICK_COUNT && - (hpet_now - hpet_start) < TICK_COUNT); - - return (tsc_now - tsc_start) * 1000000000L - / ((hpet_now - hpet_start) * hpet_period / 1000); -} - /* * pit_calibrate_tsc() uses the speaker output (channel 2) of @@ -733,124 +282,6 @@ static unsigned int __init pit_calibrate_tsc(void) return (end - start) / 50; } -#ifdef CONFIG_HPET -static __init int late_hpet_init(void) -{ - struct hpet_data hd; - unsigned int ntimer; - - if (!vxtime.hpet_address) - return 0; - - memset(&hd, 0, sizeof (hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. - */ - hd.hd_phys_address = vxtime.hpet_address; - hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet *hpet; - struct hpet_timer *timer; - int i; - - hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); - timer = &hpet->hpet_timers[2]; - for (i = 2; i < ntimer; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - - } - - hpet_alloc(&hd); - return 0; -} -fs_initcall(late_hpet_init); -#endif - -static int hpet_timer_stop_set_go(unsigned long tick) -{ - unsigned int cfg; - -/* - * Stop the timers and reset the main counter. - */ - - cfg = hpet_readl(HPET_CFG); - cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); - -/* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. - */ - if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ - hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ - cfg |= HPET_CFG_LEGACY; - } -/* - * Go! - */ - - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_init(void) -{ - unsigned int id; - - if (!vxtime.hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); - __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - -/* - * Read the period, compute tick and quotient. - */ - - id = hpet_readl(HPET_ID); - - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) - return -1; - - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; - - hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - - hpet_use_timer = (id & HPET_ID_LEGSUP); - - return hpet_timer_stop_set_go(hpet_tick); -} - -static int hpet_reenable(void) -{ - return hpet_timer_stop_set_go(hpet_tick); -} - #define PIT_MODE 0x43 #define PIT_CH0 0x40 @@ -878,7 +309,7 @@ void __init pit_stop_interrupt(void) void __init stop_timer_interrupt(void) { char *name; - if (vxtime.hpet_address) { + if (hpet_address) { name = "HPET"; hpet_timer_stop_set_go(0); } else { @@ -888,12 +319,6 @@ void __init stop_timer_interrupt(void) printk(KERN_INFO "timer: %s interrupt stopped.\n", name); } -int __init time_setup(char *str) -{ - report_lost_ticks = 1; - return 1; -} - static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; @@ -901,124 +326,41 @@ static struct irqaction irq0 = { void __init time_init(void) { if (nohpet) - vxtime.hpet_address = 0; - + hpet_address = 0; xtime.tv_sec = get_cmos_time(); xtime.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) - vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; - else - vxtime.hpet_address = 0; + if (hpet_arch_init()) + hpet_address = 0; if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport && !vxtime.hpet_address) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } - vxtime.mode = VXTIME_TSC; - vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - vxtime.last_tsc = get_cycles_sync(); - set_cyc2ns_scale(cpu_khz); - setup_irq(0, &irq0); - -#ifndef CONFIG_SMP - time_init_gtod(); -#endif -} - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. - */ -__cpuinit int unsynchronized_tsc(void) -{ -#ifdef CONFIG_SMP - if (apic_is_clustered_box()) - return 1; -#endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { -#ifdef CONFIG_ACPI - /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) - return 1; -#endif - return 0; - } - - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; -} - -/* - * Decide what mode gettimeofday should use. - */ -void time_init_gtod(void) -{ - char *timetype; - if (unsynchronized_tsc()) - notsc = 1; + mark_tsc_unstable(); - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) vgetcpu_mode = VGETCPU_RDTSCP; else vgetcpu_mode = VGETCPU_LSL; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype); + set_cyc2ns_scale(cpu_khz); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - vxtime.last_tsc = get_cycles_sync(); - - set_cyc2ns_scale(cpu_khz); + setup_irq(0, &irq0); } -__setup("report_lost_ticks", time_setup); static long clock_cmos_diff; static unsigned long sleep_start; @@ -1055,7 +397,7 @@ static int timer_resume(struct sys_device *dev) sleep_length = 0; ctime = sleep_start; } - if (vxtime.hpet_address) + if (hpet_address) hpet_reenable(); else i8254_timer_resume(); @@ -1064,20 +406,8 @@ static int timer_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock,flags); xtime.tv_sec = sec; xtime.tv_nsec = 0; - if (vxtime.mode == VXTIME_HPET) { - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - pmtimer_resume(); -#endif - } else - vxtime.last_tsc = get_cycles_sync(); - write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; - monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); + write_sequnlock_irqrestore(&xtime_lock,flags); touch_softlockup_watchdog(); return 0; } @@ -1103,270 +433,3 @@ static int time_init_device(void) } device_initcall(time_init_device); - -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. - * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. - */ -#include <linux/rtc.h> - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ -static unsigned int hpet_t1_cmp; /* cached comparator register */ - -int is_hpet_enabled(void) -{ - return vxtime.hpet_address != 0; -} - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. - */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; - - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - local_irq_restore(flags); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt, ticks_per_int, lost_ints; - - if (unlikely(!(PIE_on | AIE_on | UIE_on))) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; - hpet_t1_cmp += ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - /* - * If the interrupt handler was delayed too long, the write above tries - * to schedule the next interrupt in the past and the hardware would - * not interrupt until the counter had wrapped around. - * So we have to check that the comparator wasn't set to a past time. - */ - cnt = hpet_readl(HPET_COUNTER); - if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { - lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; - /* Make sure that, even with the time needed to execute - * this code, the next scheduled interrupt has been moved - * back to the future: */ - lost_ints++; - - hpet_t1_cmp += lost_ints * ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - if (PIE_on) - PIE_count += lost_ints; - - if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); - } -} - -/* - * The functions below are called from rtc driver. - * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. - */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - (curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id); - } - return IRQ_HANDLED; -} -#endif - -static int __init nohpet_setup(char *s) -{ - nohpet = 1; - return 1; -} - -__setup("nohpet", nohpet_setup); - -int __init notsc_setup(char *s) -{ - notsc = 1; - return 1; -} - -__setup("notsc", notsc_setup); diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c new file mode 100644 index 00000000000..89583186501 --- /dev/null +++ b/arch/x86_64/kernel/tsc.c @@ -0,0 +1,226 @@ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/clocksource.h> +#include <linux/time.h> +#include <linux/acpi.h> +#include <linux/cpufreq.h> + +#include <asm/timex.h> + +static int notsc __initdata = 0; + +unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); + +static unsigned int cyc2ns_scale __read_mostly; + +void set_cyc2ns_scale(unsigned long khz) +{ + cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; +} + +static unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> NS_SCALE; +} + +unsigned long long sched_clock(void) +{ + unsigned long a = 0; + + /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, + * which means it is not completely exact and may not be monotonous + * between CPUs. But the errors should be too small to matter for + * scheduling purposes. + */ + + rdtscll(a); + return cycles_2_ns(a); +} + +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} +#ifdef CONFIG_CPU_FREQ + +/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency + * changes. + * + * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's + * not that important because current Opteron setups do not support + * scaling on SMP anyroads. + * + * Should fix up last_tsc too. Currently gettimeofday in the + * first tick after the change will be slightly wrong. + */ + +#include <linux/workqueue.h> + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(struct work_struct *v) +{ + unsigned int cpu; + for_each_online_cpu(cpu) { + cpufreq_get(cpu); + } + cpufreq_delayed_issched = 0; +} + +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; + +static unsigned long cpu_khz_ref = 0; + +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long *lpj, dummy; + + if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) + return 0; + + lpj = &dummy; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) +#ifdef CONFIG_SMP + lpj = &cpu_data[freq->cpu].loops_per_jiffy; +#else + lpj = &boot_cpu_data.loops_per_jiffy; +#endif + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = *lpj; + cpu_khz_ref = cpu_khz; + } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + *lpj = + cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + mark_tsc_unstable(); + } + + set_cyc2ns_scale(cpu_khz_ref); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); + if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER)) + cpufreq_init = 1; + return 0; +} + +core_initcall(cpufreq_tsc); + +#endif + +static int tsc_unstable = 0; + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +__cpuinit int unsynchronized_tsc(void) +{ + if (tsc_unstable) + return 1; + +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; +#endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) + return 1; +#endif + return 0; + } + + /* Assume multi socket systems are not synchronized */ + return num_present_cpus() > 1; +} + +int __init notsc_setup(char *s) +{ + notsc = 1; + return 1; +} + +__setup("notsc", notsc_setup); + + +/* clock source code: */ +static cycle_t read_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles_sync(); + return ret; +} + +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles_sync(); + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, + .vread = vread_tsc, +}; + +void mark_tsc_unstable(void) +{ + if (!tsc_unstable) { + tsc_unstable = 1; + /* Change only the rating, when not registered */ + if (clocksource_tsc.mult) + clocksource_change_rating(&clocksource_tsc, 0); + else + clocksource_tsc.rating = 0; + } +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +static int __init init_tsc_clocksource(void) +{ + if (!notsc) { + clocksource_tsc.mult = clocksource_khz2mult(cpu_khz, + clocksource_tsc.shift); + if (check_tsc_unstable()) + clocksource_tsc.rating = 0; + + return clocksource_register(&clocksource_tsc); + } + return 0; +} + +module_init(init_tsc_clocksource); diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c new file mode 100644 index 00000000000..014f0db45df --- /dev/null +++ b/arch/x86_64/kernel/tsc_sync.c @@ -0,0 +1,187 @@ +/* + * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization. + * + * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar + * + * We check whether all boot CPUs have their TSC's synchronized, + * print a warning if not and turn off the TSC clock-source. + * + * The warp-check is point-to-point between two CPUs, the CPU + * initiating the bootup is the 'source CPU', the freshly booting + * CPU is the 'target CPU'. + * + * Only two CPUs may participate - they can enter in any order. + * ( The serial nature of the boot logic and the CPU hotplug lock + * protects against more than 2 CPUs entering this code. ) + */ +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/nmi.h> +#include <asm/tsc.h> + +/* + * Entry/exit counters that make sure that both CPUs + * run the measurement code at once: + */ +static __cpuinitdata atomic_t start_count; +static __cpuinitdata atomic_t stop_count; + +/* + * We use a raw spinlock in this exceptional case, because + * we want to have the fastest, inlined, non-debug version + * of a critical section, to be able to prove TSC time-warps: + */ +static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata cycles_t last_tsc; +static __cpuinitdata cycles_t max_warp; +static __cpuinitdata int nr_warps; + +/* + * TSC-warp measurement loop running on both CPUs: + */ +static __cpuinit void check_tsc_warp(void) +{ + cycles_t start, now, prev, end; + int i; + + start = get_cycles_sync(); + /* + * The measurement runs for 20 msecs: + */ + end = start + cpu_khz * 20ULL; + now = start; + + for (i = 0; ; i++) { + /* + * We take the global lock, measure TSC, save the + * previous TSC that was measured (possibly on + * another CPU) and update the previous TSC timestamp. + */ + __raw_spin_lock(&sync_lock); + prev = last_tsc; + now = get_cycles_sync(); + last_tsc = now; + __raw_spin_unlock(&sync_lock); + + /* + * Be nice every now and then (and also check whether + * measurement is done [we also insert a 100 million + * loops safety exit, so we dont lock up in case the + * TSC readout is totally broken]): + */ + if (unlikely(!(i & 7))) { + if (now > end || i > 100000000) + break; + cpu_relax(); + touch_nmi_watchdog(); + } + /* + * Outside the critical section we can now see whether + * we saw a time-warp of the TSC going backwards: + */ + if (unlikely(prev > now)) { + __raw_spin_lock(&sync_lock); + max_warp = max(max_warp, prev - now); + nr_warps++; + __raw_spin_unlock(&sync_lock); + } + + } +} + +/* + * Source CPU calls into this - it waits for the freshly booted + * target CPU to arrive and then starts the measurement: + */ +void __cpuinit check_tsc_sync_source(int cpu) +{ + int cpus = 2; + + /* + * No need to check if we already know that the TSC is not + * synchronized: + */ + if (unsynchronized_tsc()) + return; + + printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); + + /* + * Reset it - in case this is a second bootup: + */ + atomic_set(&stop_count, 0); + + /* + * Wait for the target to arrive: + */ + while (atomic_read(&start_count) != cpus-1) + cpu_relax(); + /* + * Trigger the target to continue into the measurement too: + */ + atomic_inc(&start_count); + + check_tsc_warp(); + + while (atomic_read(&stop_count) != cpus-1) + cpu_relax(); + + /* + * Reset it - just in case we boot another CPU later: + */ + atomic_set(&start_count, 0); + + if (nr_warps) { + printk("\n"); + printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," + " turning off TSC clock.\n", max_warp); + mark_tsc_unstable(); + nr_warps = 0; + max_warp = 0; + last_tsc = 0; + } else { + printk(" passed.\n"); + } + + /* + * Let the target continue with the bootup: + */ + atomic_inc(&stop_count); +} + +/* + * Freshly booted CPUs call into this: + */ +void __cpuinit check_tsc_sync_target(void) +{ + int cpus = 2; + + if (unsynchronized_tsc()) + return; + + /* + * Register this CPU's participation and wait for the + * source CPU to start the measurement: + */ + atomic_inc(&start_count); + while (atomic_read(&start_count) != cpus) + cpu_relax(); + + check_tsc_warp(); + + /* + * Ok, we are done: + */ + atomic_inc(&stop_count); + + /* + * Wait for the source CPU to print stuff: + */ + while (atomic_read(&stop_count) != cpus) + cpu_relax(); +} +#undef NR_LOOPS + diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index c360c422524..b73212c0a55 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -88,31 +88,25 @@ SECTIONS __vsyscall_0 = VSYSCALL_VIRT_ADDR; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } - xtime_lock = VVIRT(.xtime_lock); - - .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } - vxtime = VVIRT(.vxtime); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) + { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } vgetcpu_mode = VVIRT(.vgetcpu_mode); - .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } - sys_tz = VVIRT(.sys_tz); - - .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } - sysctl_vsyscall = VVIRT(.sysctl_vsyscall); - - .xtime : AT(VLOAD(.xtime)) { *(.xtime) } - xtime = VVIRT(.xtime); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } jiffies = VVIRT(.jiffies); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) + { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) + { *(.vsyscall_2) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) + { *(.vsyscall_3) } . = VSYSCALL_VIRT_ADDR + 4096; diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 313dc6ad780..180ff919eaf 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -26,6 +26,7 @@ #include <linux/seqlock.h> #include <linux/jiffies.h> #include <linux/sysctl.h> +#include <linux/clocksource.h> #include <linux/getcpu.h> #include <linux/cpu.h> #include <linux/smp.h> @@ -34,6 +35,7 @@ #include <asm/vsyscall.h> #include <asm/pgtable.h> #include <asm/page.h> +#include <asm/unistd.h> #include <asm/fixmap.h> #include <asm/errno.h> #include <asm/io.h> @@ -44,56 +46,41 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" -int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +struct vsyscall_gtod_data_t { + seqlock_t lock; + int sysctl_enabled; + struct timeval wall_time_tv; + struct timezone sys_tz; + cycle_t offset_base; + struct clocksource clock; +}; int __vgetcpu_mode __section_vgetcpu_mode; -#include <asm/unistd.h> - -static __always_inline void timeval_normalize(struct timeval * tv) +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = { - time_t __sec; - - __sec = tv->tv_usec / 1000000; - if (__sec) { - tv->tv_usec %= 1000000; - tv->tv_sec += __sec; - } -} + .lock = SEQLOCK_UNLOCKED, + .sysctl_enabled = 1, +}; -static __always_inline void do_vgettimeofday(struct timeval * tv) +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) { - long sequence, t; - unsigned long sec, usec; - - do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = __xtime.tv_nsec / 1000; - - if (__vxtime.mode != VXTIME_HPET) { - t = get_cycles_sync(); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void __iomem *) - fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; - } - } while (read_seqretry(&__xtime_lock, sequence)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ + vsyscall_gtod_data.clock = *clock; + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + vsyscall_gtod_data.sys_tz = sys_tz; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ +/* RED-PEN may want to readd seq locking, but then the variable should be + * write-once. + */ static __always_inline void do_get_tz(struct timezone * tz) { - *tz = __sys_tz; + *tz = __vsyscall_gtod_data.sys_tz; } static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -101,7 +88,8 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) int ret; asm volatile("vsysc2: syscall" : "=a" (ret) - : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) + : __syscall_clobber ); return ret; } @@ -114,10 +102,44 @@ static __always_inline long time_syscall(long *t) return secs; } +static __always_inline void do_vgettimeofday(struct timeval * tv) +{ + cycle_t now, base, mask, cycle_delta; + unsigned long seq, mult, shift, nsec_delta; + cycle_t (*vread)(void); + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + vread = __vsyscall_gtod_data.clock.vread; + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { + gettimeofday(tv,0); + return; + } + now = vread(); + base = __vsyscall_gtod_data.clock.cycle_last; + mask = __vsyscall_gtod_data.clock.mask; + mult = __vsyscall_gtod_data.clock.mult; + shift = __vsyscall_gtod_data.clock.shift; + + *tv = __vsyscall_gtod_data.wall_time_tv; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + + /* calculate interval: */ + cycle_delta = (now - base) & mask; + /* convert to nsecs: */ + nsec_delta = (cycle_delta * mult) >> shift; + + /* convert to usecs and add to timespec: */ + tv->tv_usec += nsec_delta / NSEC_PER_USEC; + while (tv->tv_usec > USEC_PER_SEC) { + tv->tv_sec += 1; + tv->tv_usec -= USEC_PER_SEC; + } +} + int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { - if (!__sysctl_vsyscall) - return gettimeofday(tv,tz); if (tv) do_vgettimeofday(tv); if (tz) @@ -129,11 +151,11 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - if (!__sysctl_vsyscall) + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); else if (t) - *t = __xtime.tv_sec; - return __xtime.tv_sec; + *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; + return __vsyscall_gtod_data.wall_time_tv.tv_sec; } /* Fast way to get current CPU and node. @@ -210,7 +232,7 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, ret = -ENOMEM; goto out; } - if (!sysctl_vsyscall) { + if (!vsyscall_gtod_data.sysctl_enabled) { writew(SYSCALL, map1); writew(SYSCALL, map2); } else { @@ -232,7 +254,8 @@ static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, static ctl_table kernel_table2[] = { { .ctl_name = 99, .procname = "vsyscall64", - .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, + .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), + .mode = 0644, .strategy = vsyscall_sysctl_nostrat, .proc_handler = vsyscall_sysctl_change }, {} diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 6c6751b1405..8206fc1ecc5 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -39,6 +39,17 @@ #include <linux/moduleparam.h> #include <linux/sched.h> /* need_resched() */ #include <linux/latency.h> +#include <linux/clockchips.h> + +/* + * Include the apic definitions for x86 to have the APIC timer related defines + * available also for UP (on SMP it gets magically included via linux/smp.h). + * asm/acpi.h is not an option, as it would require more include magic. Also + * creating an empty asm-ia64/apic.h would just trade pest vs. cholera. + */ +#ifdef CONFIG_X86 +#include <asm/apic.h> +#endif #include <asm/io.h> #include <asm/uaccess.h> @@ -238,6 +249,81 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) } } +#ifdef ARCH_APICTIMER_STOPS_ON_C3 + +/* + * Some BIOS implementations switch to C3 in the published C2 state. + * This seems to be a common problem on AMD boxen, but other vendors + * are affected too. We pick the most conservative approach: we assume + * that the local APIC stops in both C2 and C3. + */ +static void acpi_timer_check_state(int state, struct acpi_processor *pr, + struct acpi_processor_cx *cx) +{ + struct acpi_processor_power *pwr = &pr->power; + + /* + * Check, if one of the previous states already marked the lapic + * unstable + */ + if (pwr->timer_broadcast_on_state < state) + return; + + if (cx->type >= ACPI_STATE_C2) + pr->power.timer_broadcast_on_state = state; +} + +static void acpi_propagate_timer_broadcast(struct acpi_processor *pr) +{ +#ifdef CONFIG_GENERIC_CLOCKEVENTS + unsigned long reason; + + reason = pr->power.timer_broadcast_on_state < INT_MAX ? + CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF; + + clockevents_notify(reason, &pr->id); +#else + cpumask_t mask = cpumask_of_cpu(pr->id); + + if (pr->power.timer_broadcast_on_state < INT_MAX) + on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1); + else + on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1); +#endif +} + +/* Power(C) State timer broadcast control */ +static void acpi_state_timer_broadcast(struct acpi_processor *pr, + struct acpi_processor_cx *cx, + int broadcast) +{ +#ifdef CONFIG_GENERIC_CLOCKEVENTS + + int state = cx - pr->power.states; + + if (state >= pr->power.timer_broadcast_on_state) { + unsigned long reason; + + reason = broadcast ? CLOCK_EVT_NOTIFY_BROADCAST_ENTER : + CLOCK_EVT_NOTIFY_BROADCAST_EXIT; + clockevents_notify(reason, &pr->id); + } +#endif +} + +#else + +static void acpi_timer_check_state(int state, struct acpi_processor *pr, + struct acpi_processor_cx *cstate) { } +static void acpi_propagate_timer_broadcast(struct acpi_processor *pr) { } +static void acpi_state_timer_broadcast(struct acpi_processor *pr, + struct acpi_processor_cx *cx, + int broadcast) +{ +} + +#endif + static void acpi_processor_idle(void) { struct acpi_processor *pr = NULL; @@ -382,6 +468,7 @@ static void acpi_processor_idle(void) /* Get start time (ticks) */ t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); /* Invoke C2 */ + acpi_state_timer_broadcast(pr, cx, 1); acpi_cstate_enter(cx); /* Get end time (ticks) */ t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); @@ -396,6 +483,7 @@ static void acpi_processor_idle(void) /* Compute time (ticks) that we were actually asleep */ sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; + acpi_state_timer_broadcast(pr, cx, 0); break; case ACPI_STATE_C3: @@ -417,6 +505,7 @@ static void acpi_processor_idle(void) /* Get start time (ticks) */ t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); /* Invoke C3 */ + acpi_state_timer_broadcast(pr, cx, 1); acpi_cstate_enter(cx); /* Get end time (ticks) */ t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); @@ -436,6 +525,7 @@ static void acpi_processor_idle(void) /* Compute time (ticks) that we were actually asleep */ sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; + acpi_state_timer_broadcast(pr, cx, 0); break; default: @@ -904,11 +994,7 @@ static int acpi_processor_power_verify(struct acpi_processor *pr) unsigned int i; unsigned int working = 0; -#ifdef ARCH_APICTIMER_STOPS_ON_C3 - int timer_broadcast = 0; - cpumask_t mask = cpumask_of_cpu(pr->id); - on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1); -#endif + pr->power.timer_broadcast_on_state = INT_MAX; for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { struct acpi_processor_cx *cx = &pr->power.states[i]; @@ -920,21 +1006,14 @@ static int acpi_processor_power_verify(struct acpi_processor *pr) case ACPI_STATE_C2: acpi_processor_power_verify_c2(cx); -#ifdef ARCH_APICTIMER_STOPS_ON_C3 - /* Some AMD systems fake C3 as C2, but still - have timer troubles */ - if (cx->valid && - boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - timer_broadcast++; -#endif + if (cx->valid) + acpi_timer_check_state(i, pr, cx); break; case ACPI_STATE_C3: acpi_processor_power_verify_c3(pr, cx); -#ifdef ARCH_APICTIMER_STOPS_ON_C3 if (cx->valid) - timer_broadcast++; -#endif + acpi_timer_check_state(i, pr, cx); break; } @@ -942,10 +1021,7 @@ static int acpi_processor_power_verify(struct acpi_processor *pr) working++; } -#ifdef ARCH_APICTIMER_STOPS_ON_C3 - if (timer_broadcast) - on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1); -#endif + acpi_propagate_timer_broadcast(pr); return (working); } diff --git a/drivers/char/agp/Makefile b/drivers/char/agp/Makefile index 3e581603d0a..a0d04a23dac 100644 --- a/drivers/char/agp/Makefile +++ b/drivers/char/agp/Makefile @@ -1,6 +1,7 @@ agpgart-y := backend.o frontend.o generic.o isoch.o obj-$(CONFIG_AGP) += agpgart.o +obj-$(CONFIG_COMPAT) += compat_ioctl.o obj-$(CONFIG_AGP_ALI) += ali-agp.o obj-$(CONFIG_AGP_ATI) += ati-agp.o obj-$(CONFIG_AGP_AMD) += amd-k7-agp.o diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index 1d59e2a5b9a..9bd68d9f0f5 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -114,6 +114,7 @@ struct agp_bridge_driver { void (*free_by_type)(struct agp_memory *); void *(*agp_alloc_page)(struct agp_bridge_data *); void (*agp_destroy_page)(void *); + int (*agp_type_to_mask_type) (struct agp_bridge_data *, int); }; struct agp_bridge_data { @@ -218,6 +219,7 @@ struct agp_bridge_data { #define I810_PTE_MAIN_UNCACHED 0x00000000 #define I810_PTE_LOCAL 0x00000002 #define I810_PTE_VALID 0x00000001 +#define I830_PTE_SYSTEM_CACHED 0x00000006 #define I810_SMRAM_MISCC 0x70 #define I810_GFX_MEM_WIN_SIZE 0x00010000 #define I810_GFX_MEM_WIN_32M 0x00010000 @@ -270,8 +272,16 @@ void global_cache_flush(void); void get_agp_version(struct agp_bridge_data *bridge); unsigned long agp_generic_mask_memory(struct agp_bridge_data *bridge, unsigned long addr, int type); +int agp_generic_type_to_mask_type(struct agp_bridge_data *bridge, + int type); struct agp_bridge_data *agp_generic_find_bridge(struct pci_dev *pdev); +/* generic functions for user-populated AGP memory types */ +struct agp_memory *agp_generic_alloc_user(size_t page_count, int type); +void agp_alloc_page_array(size_t size, struct agp_memory *mem); +void agp_free_page_array(struct agp_memory *mem); + + /* generic routines for agp>=3 */ int agp3_generic_fetch_size(void); void agp3_generic_tlbflush(struct agp_memory *mem); @@ -288,6 +298,8 @@ extern struct aper_size_info_16 agp3_generic_sizes[]; extern int agp_off; extern int agp_try_unsupported_boot; +long compat_agp_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + /* Chipset independant registers (from AGP Spec) */ #define AGP_APBASE 0x10 diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c index 5a31ec7c62f..98177a93076 100644 --- a/drivers/char/agp/ali-agp.c +++ b/drivers/char/agp/ali-agp.c @@ -214,6 +214,7 @@ static struct agp_bridge_driver ali_generic_bridge = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = ali_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver ali_m1541_bridge = { @@ -237,6 +238,7 @@ static struct agp_bridge_driver ali_m1541_bridge = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = m1541_alloc_page, .agp_destroy_page = m1541_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c index b4e00a343da..b0acf41c0db 100644 --- a/drivers/char/agp/alpha-agp.c +++ b/drivers/char/agp/alpha-agp.c @@ -91,6 +91,9 @@ static int alpha_core_agp_insert_memory(struct agp_memory *mem, off_t pg_start, int num_entries, status; void *temp; + if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES) + return -EINVAL; + temp = agp_bridge->current_size; num_entries = A_SIZE_FIX(temp)->num_entries; if ((pg_start + mem->page_count) > num_entries) @@ -142,6 +145,7 @@ struct agp_bridge_driver alpha_core_agp_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; struct agp_bridge_data *alpha_bridge; diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c index c85c8cadb6d..3d8d448bf39 100644 --- a/drivers/char/agp/amd-k7-agp.c +++ b/drivers/char/agp/amd-k7-agp.c @@ -381,6 +381,7 @@ static struct agp_bridge_driver amd_irongate_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_device_ids amd_agp_device_ids[] __devinitdata = diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 93d2209fee4..636d984ed4a 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -62,12 +62,18 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) { int i, j, num_entries; long long tmp; + int mask_type; + struct agp_bridge_data *bridge = mem->bridge; u32 pte; num_entries = agp_num_entries(); - if (type != 0 || mem->type != 0) + if (type != mem->type) return -EINVAL; + mask_type = bridge->driver->agp_type_to_mask_type(bridge, type); + if (mask_type != 0) + return -EINVAL; + /* Make sure we can fit the range in the gatt table. */ /* FIXME: could wrap */ @@ -90,7 +96,7 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { tmp = agp_bridge->driver->mask_memory(agp_bridge, - mem->memory[i], mem->type); + mem->memory[i], mask_type); BUG_ON(tmp & 0xffffff0000000ffcULL); pte = (tmp & 0x000000ff00000000ULL) >> 28; @@ -247,6 +253,7 @@ static struct agp_bridge_driver amd_8151_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; /* Some basic sanity checks for the aperture. */ diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index 9987dc2e0c3..77c9ad68fba 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -431,6 +431,7 @@ static struct agp_bridge_driver ati_generic_bridge = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index d59e037ddd1..ebdd6dd66ed 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -43,7 +43,7 @@ * fix some real stupidity. It's only by chance we can bump * past 0.99 at all due to some boolean logic error. */ #define AGPGART_VERSION_MAJOR 0 -#define AGPGART_VERSION_MINOR 101 +#define AGPGART_VERSION_MINOR 102 static const struct agp_version agp_current_version = { .major = AGPGART_VERSION_MAJOR, diff --git a/drivers/char/agp/compat_ioctl.c b/drivers/char/agp/compat_ioctl.c new file mode 100644 index 00000000000..fcb4b1bf0d4 --- /dev/null +++ b/drivers/char/agp/compat_ioctl.c @@ -0,0 +1,282 @@ +/* + * AGPGART driver frontend compatibility ioctls + * Copyright (C) 2004 Silicon Graphics, Inc. + * Copyright (C) 2002-2003 Dave Jones + * Copyright (C) 1999 Jeff Hartmann + * Copyright (C) 1999 Precision Insight, Inc. + * Copyright (C) 1999 Xi Graphics, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * JEFF HARTMANN, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/agpgart.h> +#include <asm/uaccess.h> +#include "agp.h" +#include "compat_ioctl.h" + +static int compat_agpioc_info_wrap(struct agp_file_private *priv, void __user *arg) +{ + struct agp_info32 userinfo; + struct agp_kern_info kerninfo; + + agp_copy_info(agp_bridge, &kerninfo); + + userinfo.version.major = kerninfo.version.major; + userinfo.version.minor = kerninfo.version.minor; + userinfo.bridge_id = kerninfo.device->vendor | + (kerninfo.device->device << 16); + userinfo.agp_mode = kerninfo.mode; + userinfo.aper_base = (compat_long_t)kerninfo.aper_base; + userinfo.aper_size = kerninfo.aper_size; + userinfo.pg_total = userinfo.pg_system = kerninfo.max_memory; + userinfo.pg_used = kerninfo.current_memory; + + if (copy_to_user(arg, &userinfo, sizeof(userinfo))) + return -EFAULT; + + return 0; +} + +static int compat_agpioc_reserve_wrap(struct agp_file_private *priv, void __user *arg) +{ + struct agp_region32 ureserve; + struct agp_region kreserve; + struct agp_client *client; + struct agp_file_private *client_priv; + + DBG(""); + if (copy_from_user(&ureserve, arg, sizeof(ureserve))) + return -EFAULT; + + if ((unsigned) ureserve.seg_count >= ~0U/sizeof(struct agp_segment32)) + return -EFAULT; + + kreserve.pid = ureserve.pid; + kreserve.seg_count = ureserve.seg_count; + + client = agp_find_client_by_pid(kreserve.pid); + + if (kreserve.seg_count == 0) { + /* remove a client */ + client_priv = agp_find_private(kreserve.pid); + + if (client_priv != NULL) { + set_bit(AGP_FF_IS_CLIENT, &client_priv->access_flags); + set_bit(AGP_FF_IS_VALID, &client_priv->access_flags); + } + if (client == NULL) { + /* client is already removed */ + return 0; + } + return agp_remove_client(kreserve.pid); + } else { + struct agp_segment32 *usegment; + struct agp_segment *ksegment; + int seg; + + if (ureserve.seg_count >= 16384) + return -EINVAL; + + usegment = kmalloc(sizeof(*usegment) * ureserve.seg_count, GFP_KERNEL); + if (!usegment) + return -ENOMEM; + + ksegment = kmalloc(sizeof(*ksegment) * kreserve.seg_count, GFP_KERNEL); + if (!ksegment) { + kfree(usegment); + return -ENOMEM; + } + + if (copy_from_user(usegment, (void __user *) ureserve.seg_list, + sizeof(*usegment) * ureserve.seg_count)) { + kfree(usegment); + kfree(ksegment); + return -EFAULT; + } + + for (seg = 0; seg < ureserve.seg_count; seg++) { + ksegment[seg].pg_start = usegment[seg].pg_start; + ksegment[seg].pg_count = usegment[seg].pg_count; + ksegment[seg].prot = usegment[seg].prot; + } + + kfree(usegment); + kreserve.seg_list = ksegment; + + if (client == NULL) { + /* Create the client and add the segment */ + client = agp_create_client(kreserve.pid); + + if (client == NULL) { + kfree(ksegment); + return -ENOMEM; + } + client_priv = agp_find_private(kreserve.pid); + + if (client_priv != NULL) { + set_bit(AGP_FF_IS_CLIENT, &client_priv->access_flags); + set_bit(AGP_FF_IS_VALID, &client_priv->access_flags); + } + } + return agp_create_segment(client, &kreserve); + } + /* Will never really happen */ + return -EINVAL; +} + +static int compat_agpioc_allocate_wrap(struct agp_file_private *priv, void __user *arg) +{ + struct agp_memory *memory; + struct agp_allocate32 alloc; + + DBG(""); + if (copy_from_user(&alloc, arg, sizeof(alloc))) + return -EFAULT; + + memory = agp_allocate_memory_wrap(alloc.pg_count, alloc.type); + + if (memory == NULL) + return -ENOMEM; + + alloc.key = memory->key; + alloc.physical = memory->physical; + + if (copy_to_user(arg, &alloc, sizeof(alloc))) { + agp_free_memory_wrap(memory); + return -EFAULT; + } + return 0; +} + +static int compat_agpioc_bind_wrap(struct agp_file_private *priv, void __user *arg) +{ + struct agp_bind32 bind_info; + struct agp_memory *memory; + + DBG(""); + if (copy_from_user(&bind_info, arg, sizeof(bind_info))) + return -EFAULT; + + memory = agp_find_mem_by_key(bind_info.key); + + if (memory == NULL) + return -EINVAL; + + return agp_bind_memory(memory, bind_info.pg_start); +} + +static int compat_agpioc_unbind_wrap(struct agp_file_private *priv, void __user *arg) +{ + struct agp_memory *memory; + struct agp_unbind32 unbind; + + DBG(""); + if (copy_from_user(&unbind, arg, sizeof(unbind))) + return -EFAULT; + + memory = agp_find_mem_by_key(unbind.key); + + if (memory == NULL) + return -EINVAL; + + return agp_unbind_memory(memory); +} + +long compat_agp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct agp_file_private *curr_priv = file->private_data; + int ret_val = -ENOTTY; + + mutex_lock(&(agp_fe.agp_mutex)); + + if ((agp_fe.current_controller == NULL) && + (cmd != AGPIOC_ACQUIRE32)) { + ret_val = -EINVAL; + goto ioctl_out; + } + if ((agp_fe.backend_acquired != TRUE) && + (cmd != AGPIOC_ACQUIRE32)) { + ret_val = -EBUSY; + goto ioctl_out; + } + if (cmd != AGPIOC_ACQUIRE32) { + if (!(test_bit(AGP_FF_IS_CONTROLLER, &curr_priv->access_flags))) { + ret_val = -EPERM; + goto ioctl_out; + } + /* Use the original pid of the controller, + * in case it's threaded */ + + if (agp_fe.current_controller->pid != curr_priv->my_pid) { + ret_val = -EBUSY; + goto ioctl_out; + } + } + + switch (cmd) { + case AGPIOC_INFO32: + ret_val = compat_agpioc_info_wrap(curr_priv, (void __user *) arg); + break; + + case AGPIOC_ACQUIRE32: + ret_val = agpioc_acquire_wrap(curr_priv); + break; + + case AGPIOC_RELEASE32: + ret_val = agpioc_release_wrap(curr_priv); + break; + + case AGPIOC_SETUP32: + ret_val = agpioc_setup_wrap(curr_priv, (void __user *) arg); + break; + + case AGPIOC_RESERVE32: + ret_val = compat_agpioc_reserve_wrap(curr_priv, (void __user *) arg); + break; + + case AGPIOC_PROTECT32: + ret_val = agpioc_protect_wrap(curr_priv); + break; + + case AGPIOC_ALLOCATE32: + ret_val = compat_agpioc_allocate_wrap(curr_priv, (void __user *) arg); + break; + + case AGPIOC_DEALLOCATE32: + ret_val = agpioc_deallocate_wrap(curr_priv, (int) arg); + break; + + case AGPIOC_BIND32: + ret_val = compat_agpioc_bind_wrap(curr_priv, (void __user *) arg); + break; + + case AGPIOC_UNBIND32: + ret_val = compat_agpioc_unbind_wrap(curr_priv, (void __user *) arg); + break; + } + +ioctl_out: + DBG("ioctl returns %d\n", ret_val); + mutex_unlock(&(agp_fe.agp_mutex)); + return ret_val; +} + diff --git a/drivers/char/agp/compat_ioctl.h b/drivers/char/agp/compat_ioctl.h new file mode 100644 index 00000000000..71939d63723 --- /dev/null +++ b/drivers/char/agp/compat_ioctl.h @@ -0,0 +1,105 @@ +/* + * Copyright (C) 1999 Jeff Hartmann + * Copyright (C) 1999 Precision Insight, Inc. + * Copyright (C) 1999 Xi Graphics, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * JEFF HARTMANN, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef _AGP_COMPAT_IOCTL_H +#define _AGP_COMPAT_IOCTL_H + +#include <linux/compat.h> +#include <linux/agpgart.h> + +#define AGPIOC_INFO32 _IOR (AGPIOC_BASE, 0, compat_uptr_t) +#define AGPIOC_ACQUIRE32 _IO (AGPIOC_BASE, 1) +#define AGPIOC_RELEASE32 _IO (AGPIOC_BASE, 2) +#define AGPIOC_SETUP32 _IOW (AGPIOC_BASE, 3, compat_uptr_t) +#define AGPIOC_RESERVE32 _IOW (AGPIOC_BASE, 4, compat_uptr_t) +#define AGPIOC_PROTECT32 _IOW (AGPIOC_BASE, 5, compat_uptr_t) +#define AGPIOC_ALLOCATE32 _IOWR(AGPIOC_BASE, 6, compat_uptr_t) +#define AGPIOC_DEALLOCATE32 _IOW (AGPIOC_BASE, 7, compat_int_t) +#define AGPIOC_BIND32 _IOW (AGPIOC_BASE, 8, compat_uptr_t) +#define AGPIOC_UNBIND32 _IOW (AGPIOC_BASE, 9, compat_uptr_t) + +struct agp_info32 { + struct agp_version version; /* version of the driver */ + u32 bridge_id; /* bridge vendor/device */ + u32 agp_mode; /* mode info of bridge */ + compat_long_t aper_base; /* base of aperture */ + compat_size_t aper_size; /* size of aperture */ + compat_size_t pg_total; /* max pages (swap + system) */ + compat_size_t pg_system; /* max pages (system) */ + compat_size_t pg_used; /* current pages used */ +}; + +/* + * The "prot" down below needs still a "sleep" flag somehow ... + */ +struct agp_segment32 { + compat_off_t pg_start; /* starting page to populate */ + compat_size_t pg_count; /* number of pages */ + compat_int_t prot; /* prot flags for mmap */ +}; + +struct agp_region32 { + compat_pid_t pid; /* pid of process */ + compat_size_t seg_count; /* number of segments */ + struct agp_segment32 *seg_list; +}; + +struct agp_allocate32 { + compat_int_t key; /* tag of allocation */ + compat_size_t pg_count; /* number of pages */ + u32 type; /* 0 == normal, other devspec */ + u32 physical; /* device specific (some devices + * need a phys address of the + * actual page behind the gatt + * table) */ +}; + +struct agp_bind32 { + compat_int_t key; /* tag of allocation */ + compat_off_t pg_start; /* starting page to populate */ +}; + +struct agp_unbind32 { + compat_int_t key; /* tag of allocation */ + u32 priority; /* priority for paging out */ +}; + +extern struct agp_front_data agp_fe; + +int agpioc_acquire_wrap(struct agp_file_private *priv); +int agpioc_release_wrap(struct agp_file_private *priv); +int agpioc_protect_wrap(struct agp_file_private *priv); +int agpioc_setup_wrap(struct agp_file_private *priv, void __user *arg); +int agpioc_deallocate_wrap(struct agp_file_private *priv, int arg); +struct agp_file_private *agp_find_private(pid_t pid); +struct agp_client *agp_create_client(pid_t id); +int agp_remove_client(pid_t id); +int agp_create_segment(struct agp_client *client, struct agp_region *region); +void agp_free_memory_wrap(struct agp_memory *memory); +struct agp_memory *agp_allocate_memory_wrap(size_t pg_count, u32 type); +struct agp_memory *agp_find_mem_by_key(int key); +struct agp_client *agp_find_client_by_pid(pid_t id); + +#endif /* _AGP_COMPAT_H */ diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c index 30f730ff81c..658cb1a72d2 100644 --- a/drivers/char/agp/efficeon-agp.c +++ b/drivers/char/agp/efficeon-agp.c @@ -335,6 +335,7 @@ static struct agp_bridge_driver efficeon_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static int __devinit agp_efficeon_probe(struct pci_dev *pdev, diff --git a/drivers/char/agp/frontend.c b/drivers/char/agp/frontend.c index 0f2ed2aa2d8..679d7f97243 100644 --- a/drivers/char/agp/frontend.c +++ b/drivers/char/agp/frontend.c @@ -41,9 +41,9 @@ #include <asm/pgtable.h> #include "agp.h" -static struct agp_front_data agp_fe; +struct agp_front_data agp_fe; -static struct agp_memory *agp_find_mem_by_key(int key) +struct agp_memory *agp_find_mem_by_key(int key) { struct agp_memory *curr; @@ -159,7 +159,7 @@ static pgprot_t agp_convert_mmap_flags(int prot) return vm_get_page_prot(prot_bits); } -static int agp_create_segment(struct agp_client *client, struct agp_region *region) +int agp_create_segment(struct agp_client *client, struct agp_region *region) { struct agp_segment_priv **ret_seg; struct agp_segment_priv *seg; @@ -211,7 +211,7 @@ static void agp_insert_into_pool(struct agp_memory * temp) /* File private list routines */ -static struct agp_file_private *agp_find_private(pid_t pid) +struct agp_file_private *agp_find_private(pid_t pid) { struct agp_file_private *curr; @@ -266,13 +266,13 @@ static void agp_remove_file_private(struct agp_file_private * priv) * Wrappers for agp_free_memory & agp_allocate_memory * These make sure that internal lists are kept updated. */ -static void agp_free_memory_wrap(struct agp_memory *memory) +void agp_free_memory_wrap(struct agp_memory *memory) { agp_remove_from_pool(memory); agp_free_memory(memory); } -static struct agp_memory *agp_allocate_memory_wrap(size_t pg_count, u32 type) +struct agp_memory *agp_allocate_memory_wrap(size_t pg_count, u32 type) { struct agp_memory *memory; @@ -484,7 +484,7 @@ static struct agp_controller *agp_find_controller_for_client(pid_t id) return NULL; } -static struct agp_client *agp_find_client_by_pid(pid_t id) +struct agp_client *agp_find_client_by_pid(pid_t id) { struct agp_client *temp; @@ -509,7 +509,7 @@ static void agp_insert_client(struct agp_client *client) agp_fe.current_controller->num_clients++; } -static struct agp_client *agp_create_client(pid_t id) +struct agp_client *agp_create_client(pid_t id) { struct agp_client *new_client; @@ -522,7 +522,7 @@ static struct agp_client *agp_create_client(pid_t id) return new_client; } -static int agp_remove_client(pid_t id) +int agp_remove_client(pid_t id) { struct agp_client *client; struct agp_client *prev_client; @@ -746,7 +746,7 @@ static int agpioc_info_wrap(struct agp_file_private *priv, void __user *arg) return 0; } -static int agpioc_acquire_wrap(struct agp_file_private *priv) +int agpioc_acquire_wrap(struct agp_file_private *priv) { struct agp_controller *controller; @@ -789,14 +789,14 @@ static int agpioc_acquire_wrap(struct agp_file_private *priv) return 0; } -static int agpioc_release_wrap(struct agp_file_private *priv) +int agpioc_release_wrap(struct agp_file_private *priv) { DBG(""); agp_controller_release_current(agp_fe.current_controller, priv); return 0; } -static int agpioc_setup_wrap(struct agp_file_private *priv, void __user *arg) +int agpioc_setup_wrap(struct agp_file_private *priv, void __user *arg) { struct agp_setup mode; @@ -876,7 +876,7 @@ static int agpioc_reserve_wrap(struct agp_file_private *priv, void __user *arg) return -EINVAL; } -static int agpioc_protect_wrap(struct agp_file_private *priv) +int agpioc_protect_wrap(struct agp_file_private *priv) { DBG(""); /* This function is not currently implemented */ @@ -892,6 +892,9 @@ static int agpioc_allocate_wrap(struct agp_file_private *priv, void __user *arg) if (copy_from_user(&alloc, arg, sizeof(struct agp_allocate))) return -EFAULT; + if (alloc.type >= AGP_USER_TYPES) + return -EINVAL; + memory = agp_allocate_memory_wrap(alloc.pg_count, alloc.type); if (memory == NULL) @@ -907,7 +910,7 @@ static int agpioc_allocate_wrap(struct agp_file_private *priv, void __user *arg) return 0; } -static int agpioc_deallocate_wrap(struct agp_file_private *priv, int arg) +int agpioc_deallocate_wrap(struct agp_file_private *priv, int arg) { struct agp_memory *memory; @@ -1043,6 +1046,9 @@ static const struct file_operations agp_fops = .read = agp_read, .write = agp_write, .ioctl = agp_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_agp_ioctl, +#endif .mmap = agp_mmap, .open = agp_open, .release = agp_release, diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 3491d6f84bc..7923337c3d2 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -101,6 +101,63 @@ static int agp_get_key(void) return -1; } +/* + * Use kmalloc if possible for the page list. Otherwise fall back to + * vmalloc. This speeds things up and also saves memory for small AGP + * regions. + */ + +void agp_alloc_page_array(size_t size, struct agp_memory *mem) +{ + mem->memory = NULL; + mem->vmalloc_flag = 0; + + if (size <= 2*PAGE_SIZE) + mem->memory = kmalloc(size, GFP_KERNEL | __GFP_NORETRY); + if (mem->memory == NULL) { + mem->memory = vmalloc(size); + mem->vmalloc_flag = 1; + } +} +EXPORT_SYMBOL(agp_alloc_page_array); + +void agp_free_page_array(struct agp_memory *mem) +{ + if (mem->vmalloc_flag) { + vfree(mem->memory); + } else { + kfree(mem->memory); + } +} +EXPORT_SYMBOL(agp_free_page_array); + + +static struct agp_memory *agp_create_user_memory(unsigned long num_agp_pages) +{ + struct agp_memory *new; + unsigned long alloc_size = num_agp_pages*sizeof(struct page *); + + new = kzalloc(sizeof(struct agp_memory), GFP_KERNEL); + if (new == NULL) + return NULL; + + new->key = agp_get_key(); + + if (new->key < 0) { + kfree(new); + return NULL; + } + + agp_alloc_page_array(alloc_size, new); + + if (new->memory == NULL) { + agp_free_key(new->key); + kfree(new); + return NULL; + } + new->num_scratch_pages = 0; + return new; +} struct agp_memory *agp_create_memory(int scratch_pages) { @@ -116,7 +173,8 @@ struct agp_memory *agp_create_memory(int scratch_pages) kfree(new); return NULL; } - new->memory = vmalloc(PAGE_SIZE * scratch_pages); + + agp_alloc_page_array(PAGE_SIZE * scratch_pages, new); if (new->memory == NULL) { agp_free_key(new->key); @@ -124,6 +182,7 @@ struct agp_memory *agp_create_memory(int scratch_pages) return NULL; } new->num_scratch_pages = scratch_pages; + new->type = AGP_NORMAL_MEMORY; return new; } EXPORT_SYMBOL(agp_create_memory); @@ -146,6 +205,11 @@ void agp_free_memory(struct agp_memory *curr) if (curr->is_bound == TRUE) agp_unbind_memory(curr); + if (curr->type >= AGP_USER_TYPES) { + agp_generic_free_by_type(curr); + return; + } + if (curr->type != 0) { curr->bridge->driver->free_by_type(curr); return; @@ -157,7 +221,7 @@ void agp_free_memory(struct agp_memory *curr) flush_agp_mappings(); } agp_free_key(curr->key); - vfree(curr->memory); + agp_free_page_array(curr); kfree(curr); } EXPORT_SYMBOL(agp_free_memory); @@ -188,6 +252,13 @@ struct agp_memory *agp_allocate_memory(struct agp_bridge_data *bridge, if ((atomic_read(&bridge->current_memory_agp) + page_count) > bridge->max_memory_agp) return NULL; + if (type >= AGP_USER_TYPES) { + new = agp_generic_alloc_user(page_count, type); + if (new) + new->bridge = bridge; + return new; + } + if (type != 0) { new = bridge->driver->alloc_by_type(page_count, type); if (new) @@ -960,6 +1031,7 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) off_t j; void *temp; struct agp_bridge_data *bridge; + int mask_type; bridge = mem->bridge; if (!bridge) @@ -995,7 +1067,11 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) num_entries -= agp_memory_reserved/PAGE_SIZE; if (num_entries < 0) num_entries = 0; - if (type != 0 || mem->type != 0) { + if (type != mem->type) + return -EINVAL; + + mask_type = bridge->driver->agp_type_to_mask_type(bridge, type); + if (mask_type != 0) { /* The generic routines know nothing of memory types */ return -EINVAL; } @@ -1018,7 +1094,8 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { - writel(bridge->driver->mask_memory(bridge, mem->memory[i], mem->type), bridge->gatt_table+j); + writel(bridge->driver->mask_memory(bridge, mem->memory[i], mask_type), + bridge->gatt_table+j); } readl(bridge->gatt_table+j-1); /* PCI Posting. */ @@ -1032,6 +1109,7 @@ int agp_generic_remove_memory(struct agp_memory *mem, off_t pg_start, int type) { size_t i; struct agp_bridge_data *bridge; + int mask_type; bridge = mem->bridge; if (!bridge) @@ -1040,7 +1118,11 @@ int agp_generic_remove_memory(struct agp_memory *mem, off_t pg_start, int type) if (mem->page_count == 0) return 0; - if (type != 0 || mem->type != 0) { + if (type != mem->type) + return -EINVAL; + + mask_type = bridge->driver->agp_type_to_mask_type(bridge, type); + if (mask_type != 0) { /* The generic routines know nothing of memory types */ return -EINVAL; } @@ -1056,22 +1138,40 @@ int agp_generic_remove_memory(struct agp_memory *mem, off_t pg_start, int type) } EXPORT_SYMBOL(agp_generic_remove_memory); - struct agp_memory *agp_generic_alloc_by_type(size_t page_count, int type) { return NULL; } EXPORT_SYMBOL(agp_generic_alloc_by_type); - void agp_generic_free_by_type(struct agp_memory *curr) { - vfree(curr->memory); + agp_free_page_array(curr); agp_free_key(curr->key); kfree(curr); } EXPORT_SYMBOL(agp_generic_free_by_type); +struct agp_memory *agp_generic_alloc_user(size_t page_count, int type) +{ + struct agp_memory *new; + int i; + int pages; + + pages = (page_count + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE; + new = agp_create_user_memory(page_count); + if (new == NULL) + return NULL; + + for (i = 0; i < page_count; i++) + new->memory[i] = 0; + new->page_count = 0; + new->type = type; + new->num_scratch_pages = pages; + + return new; +} +EXPORT_SYMBOL(agp_generic_alloc_user); /* * Basic Page Allocation Routines - @@ -1165,6 +1265,15 @@ unsigned long agp_generic_mask_memory(struct agp_bridge_data *bridge, } EXPORT_SYMBOL(agp_generic_mask_memory); +int agp_generic_type_to_mask_type(struct agp_bridge_data *bridge, + int type) +{ + if (type >= AGP_USER_TYPES) + return 0; + return type; +} +EXPORT_SYMBOL(agp_generic_type_to_mask_type); + /* * These functions are implemented according to the AGPv3 spec, * which covers implementation details that had previously been diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c index 907fb66ec4a..847deabf7f9 100644 --- a/drivers/char/agp/hp-agp.c +++ b/drivers/char/agp/hp-agp.c @@ -438,6 +438,7 @@ struct agp_bridge_driver hp_zx1_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, .cant_use_aperture = 1, }; diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c index 91769443d8f..3e7618653ab 100644 --- a/drivers/char/agp/i460-agp.c +++ b/drivers/char/agp/i460-agp.c @@ -293,6 +293,9 @@ static int i460_insert_memory_small_io_page (struct agp_memory *mem, pr_debug("i460_insert_memory_small_io_page(mem=%p, pg_start=%ld, type=%d, paddr0=0x%lx)\n", mem, pg_start, type, mem->memory[0]); + if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES) + return -EINVAL; + io_pg_start = I460_IOPAGES_PER_KPAGE * pg_start; temp = agp_bridge->current_size; @@ -396,6 +399,9 @@ static int i460_insert_memory_large_io_page (struct agp_memory *mem, struct lp_desc *start, *end, *lp; void *temp; + if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES) + return -EINVAL; + temp = agp_bridge->current_size; num_entries = A_SIZE_8(temp)->num_entries; @@ -572,6 +578,7 @@ struct agp_bridge_driver intel_i460_driver = { #endif .alloc_by_type = agp_generic_alloc_by_type, .free_by_type = agp_generic_free_by_type, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, .cant_use_aperture = 1, }; diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index a3011de51f7..06b0bb6d982 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/pci.h> #include <linux/init.h> +#include <linux/kernel.h> #include <linux/pagemap.h> #include <linux/agp_backend.h> #include "agp.h" @@ -24,6 +25,9 @@ agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_82965G_HB) +extern int agp_memory_reserved; + + /* Intel 815 register */ #define INTEL_815_APCONT 0x51 #define INTEL_815_ATTBASE_MASK ~0x1FFFFFFF @@ -68,12 +72,15 @@ static struct aper_size_info_fixed intel_i810_sizes[] = #define AGP_DCACHE_MEMORY 1 #define AGP_PHYS_MEMORY 2 +#define INTEL_AGP_CACHED_MEMORY 3 static struct gatt_mask intel_i810_masks[] = { {.mask = I810_PTE_VALID, .type = 0}, {.mask = (I810_PTE_VALID | I810_PTE_LOCAL), .type = AGP_DCACHE_MEMORY}, - {.mask = I810_PTE_VALID, .type = 0} + {.mask = I810_PTE_VALID, .type = 0}, + {.mask = I810_PTE_VALID | I830_PTE_SYSTEM_CACHED, + .type = INTEL_AGP_CACHED_MEMORY} }; static struct _intel_i810_private { @@ -117,13 +124,15 @@ static int intel_i810_configure(void) current_size = A_SIZE_FIX(agp_bridge->current_size); - pci_read_config_dword(intel_i810_private.i810_dev, I810_MMADDR, &temp); - temp &= 0xfff80000; - - intel_i810_private.registers = ioremap(temp, 128 * 4096); if (!intel_i810_private.registers) { - printk(KERN_ERR PFX "Unable to remap memory.\n"); - return -ENOMEM; + pci_read_config_dword(intel_i810_private.i810_dev, I810_MMADDR, &temp); + temp &= 0xfff80000; + + intel_i810_private.registers = ioremap(temp, 128 * 4096); + if (!intel_i810_private.registers) { + printk(KERN_ERR PFX "Unable to remap memory.\n"); + return -ENOMEM; + } } if ((readl(intel_i810_private.registers+I810_DRAM_CTL) @@ -201,62 +210,79 @@ static void i8xx_destroy_pages(void *addr) atomic_dec(&agp_bridge->current_memory_agp); } +static int intel_i830_type_to_mask_type(struct agp_bridge_data *bridge, + int type) +{ + if (type < AGP_USER_TYPES) + return type; + else if (type == AGP_USER_CACHED_MEMORY) + return INTEL_AGP_CACHED_MEMORY; + else + return 0; +} + static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start, int type) { int i, j, num_entries; void *temp; + int ret = -EINVAL; + int mask_type; if (mem->page_count == 0) - return 0; + goto out; temp = agp_bridge->current_size; num_entries = A_SIZE_FIX(temp)->num_entries; if ((pg_start + mem->page_count) > num_entries) - return -EINVAL; + goto out_err; - for (j = pg_start; j < (pg_start + mem->page_count); j++) { - if (!PGE_EMPTY(agp_bridge, readl(agp_bridge->gatt_table+j))) - return -EBUSY; - } - if (type != 0 || mem->type != 0) { - if ((type == AGP_DCACHE_MEMORY) && (mem->type == AGP_DCACHE_MEMORY)) { - /* special insert */ - if (!mem->is_flushed) { - global_cache_flush(); - mem->is_flushed = TRUE; - } - - for (i = pg_start; i < (pg_start + mem->page_count); i++) { - writel((i*4096)|I810_PTE_LOCAL|I810_PTE_VALID, intel_i810_private.registers+I810_PTE_BASE+(i*4)); - } - readl(intel_i810_private.registers+I810_PTE_BASE+((i-1)*4)); /* PCI Posting. */ - - agp_bridge->driver->tlb_flush(mem); - return 0; + for (j = pg_start; j < (pg_start + mem->page_count); j++) { + if (!PGE_EMPTY(agp_bridge, readl(agp_bridge->gatt_table+j))) { + ret = -EBUSY; + goto out_err; } - if ((type == AGP_PHYS_MEMORY) && (mem->type == AGP_PHYS_MEMORY)) - goto insert; - return -EINVAL; } -insert: - if (!mem->is_flushed) { - global_cache_flush(); - mem->is_flushed = TRUE; - } + if (type != mem->type) + goto out_err; - for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { - writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->memory[i], mem->type), - intel_i810_private.registers+I810_PTE_BASE+(j*4)); + mask_type = agp_bridge->driver->agp_type_to_mask_type(agp_bridge, type); + + switch (mask_type) { + case AGP_DCACHE_MEMORY: + if (!mem->is_flushed) + global_cache_flush(); + for (i = pg_start; i < (pg_start + mem->page_count); i++) { + writel((i*4096)|I810_PTE_LOCAL|I810_PTE_VALID, + intel_i810_private.registers+I810_PTE_BASE+(i*4)); + } + readl(intel_i810_private.registers+I810_PTE_BASE+((i-1)*4)); + break; + case AGP_PHYS_MEMORY: + case AGP_NORMAL_MEMORY: + if (!mem->is_flushed) + global_cache_flush(); + for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { + writel(agp_bridge->driver->mask_memory(agp_bridge, + mem->memory[i], + mask_type), + intel_i810_private.registers+I810_PTE_BASE+(j*4)); + } + readl(intel_i810_private.registers+I810_PTE_BASE+((j-1)*4)); + break; + default: + goto out_err; } - readl(intel_i810_private.registers+I810_PTE_BASE+((j-1)*4)); /* PCI Posting. */ agp_bridge->driver->tlb_flush(mem); - return 0; +out: + ret = 0; +out_err: + mem->is_flushed = 1; + return ret; } static int intel_i810_remove_entries(struct agp_memory *mem, off_t pg_start, @@ -337,12 +363,11 @@ static struct agp_memory *intel_i810_alloc_by_type(size_t pg_count, int type) new->type = AGP_DCACHE_MEMORY; new->page_count = pg_count; new->num_scratch_pages = 0; - vfree(new->memory); + agp_free_page_array(new); return new; } if (type == AGP_PHYS_MEMORY) return alloc_agpphysmem_i8xx(pg_count, type); - return NULL; } @@ -357,7 +382,7 @@ static void intel_i810_free_by_type(struct agp_memory *curr) gart_to_virt(curr->memory[0])); global_flush_tlb(); } - vfree(curr->memory); + agp_free_page_array(curr); } kfree(curr); } @@ -619,9 +644,11 @@ static int intel_i830_insert_entries(struct agp_memory *mem,off_t pg_start, int { int i,j,num_entries; void *temp; + int ret = -EINVAL; + int mask_type; if (mem->page_count == 0) - return 0; + goto out; temp = agp_bridge->current_size; num_entries = A_SIZE_FIX(temp)->num_entries; @@ -631,34 +658,41 @@ static int intel_i830_insert_entries(struct agp_memory *mem,off_t pg_start, int pg_start,intel_i830_private.gtt_entries); printk (KERN_INFO PFX "Trying to insert into local/stolen memory\n"); - return -EINVAL; + goto out_err; } if ((pg_start + mem->page_count) > num_entries) - return -EINVAL; + goto out_err; /* The i830 can't check the GTT for entries since its read only, * depend on the caller to make the correct offset decisions. */ - if ((type != 0 && type != AGP_PHYS_MEMORY) || - (mem->type != 0 && mem->type != AGP_PHYS_MEMORY)) - return -EINVAL; + if (type != mem->type) + goto out_err; + + mask_type = agp_bridge->driver->agp_type_to_mask_type(agp_bridge, type); - if (!mem->is_flushed) { + if (mask_type != 0 && mask_type != AGP_PHYS_MEMORY && + mask_type != INTEL_AGP_CACHED_MEMORY) + goto out_err; + + if (!mem->is_flushed) global_cache_flush(); - mem->is_flushed = TRUE; - } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->memory[i], mem->type), - intel_i830_private.registers+I810_PTE_BASE+(j*4)); + mem->memory[i], mask_type), + intel_i830_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_i830_private.registers+I810_PTE_BASE+((j-1)*4)); - agp_bridge->driver->tlb_flush(mem); - return 0; + +out: + ret = 0; +out_err: + mem->is_flushed = 1; + return ret; } static int intel_i830_remove_entries(struct agp_memory *mem,off_t pg_start, @@ -687,7 +721,6 @@ static struct agp_memory *intel_i830_alloc_by_type(size_t pg_count,int type) { if (type == AGP_PHYS_MEMORY) return alloc_agpphysmem_i8xx(pg_count, type); - /* always return NULL for other allocation types for now */ return NULL; } @@ -734,9 +767,11 @@ static int intel_i915_insert_entries(struct agp_memory *mem,off_t pg_start, { int i,j,num_entries; void *temp; + int ret = -EINVAL; + int mask_type; if (mem->page_count == 0) - return 0; + goto out; temp = agp_bridge->current_size; num_entries = A_SIZE_FIX(temp)->num_entries; @@ -746,33 +781,41 @@ static int intel_i915_insert_entries(struct agp_memory *mem,off_t pg_start, pg_start,intel_i830_private.gtt_entries); printk (KERN_INFO PFX "Trying to insert into local/stolen memory\n"); - return -EINVAL; + goto out_err; } if ((pg_start + mem->page_count) > num_entries) - return -EINVAL; + goto out_err; - /* The i830 can't check the GTT for entries since its read only, + /* The i915 can't check the GTT for entries since its read only, * depend on the caller to make the correct offset decisions. */ - if ((type != 0 && type != AGP_PHYS_MEMORY) || - (mem->type != 0 && mem->type != AGP_PHYS_MEMORY)) - return -EINVAL; + if (type != mem->type) + goto out_err; + + mask_type = agp_bridge->driver->agp_type_to_mask_type(agp_bridge, type); - if (!mem->is_flushed) { + if (mask_type != 0 && mask_type != AGP_PHYS_MEMORY && + mask_type != INTEL_AGP_CACHED_MEMORY) + goto out_err; + + if (!mem->is_flushed) global_cache_flush(); - mem->is_flushed = TRUE; - } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->memory[i], mem->type), intel_i830_private.gtt+j); + mem->memory[i], mask_type), intel_i830_private.gtt+j); } - readl(intel_i830_private.gtt+j-1); + readl(intel_i830_private.gtt+j-1); agp_bridge->driver->tlb_flush(mem); - return 0; + + out: + ret = 0; + out_err: + mem->is_flushed = 1; + return ret; } static int intel_i915_remove_entries(struct agp_memory *mem,off_t pg_start, @@ -803,7 +846,7 @@ static int intel_i915_remove_entries(struct agp_memory *mem,off_t pg_start, */ static int intel_i9xx_fetch_size(void) { - int num_sizes = sizeof(intel_i830_sizes) / sizeof(*intel_i830_sizes); + int num_sizes = ARRAY_SIZE(intel_i830_sizes); int aper_size; /* size in megabytes */ int i; @@ -1384,6 +1427,7 @@ static struct agp_bridge_driver intel_generic_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver intel_810_driver = { @@ -1408,6 +1452,7 @@ static struct agp_bridge_driver intel_810_driver = { .free_by_type = intel_i810_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver intel_815_driver = { @@ -1431,6 +1476,7 @@ static struct agp_bridge_driver intel_815_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver intel_830_driver = { @@ -1455,6 +1501,7 @@ static struct agp_bridge_driver intel_830_driver = { .free_by_type = intel_i810_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = intel_i830_type_to_mask_type, }; static struct agp_bridge_driver intel_820_driver = { @@ -1478,6 +1525,7 @@ static struct agp_bridge_driver intel_820_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver intel_830mp_driver = { @@ -1501,6 +1549,7 @@ static struct agp_bridge_driver intel_830mp_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver intel_840_driver = { @@ -1524,6 +1573,7 @@ static struct agp_bridge_driver intel_840_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver intel_845_driver = { @@ -1547,6 +1597,7 @@ static struct agp_bridge_driver intel_845_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver intel_850_driver = { @@ -1570,6 +1621,7 @@ static struct agp_bridge_driver intel_850_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver intel_860_driver = { @@ -1593,6 +1645,7 @@ static struct agp_bridge_driver intel_860_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver intel_915_driver = { @@ -1617,6 +1670,7 @@ static struct agp_bridge_driver intel_915_driver = { .free_by_type = intel_i810_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = intel_i830_type_to_mask_type, }; static struct agp_bridge_driver intel_i965_driver = { @@ -1641,6 +1695,7 @@ static struct agp_bridge_driver intel_i965_driver = { .free_by_type = intel_i810_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = intel_i830_type_to_mask_type, }; static struct agp_bridge_driver intel_7505_driver = { @@ -1664,6 +1719,7 @@ static struct agp_bridge_driver intel_7505_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static int find_i810(u16 device) diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index df7f37b2739..2563286b2fc 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -310,6 +310,7 @@ static struct agp_bridge_driver nvidia_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static int __devinit agp_nvidia_probe(struct pci_dev *pdev, diff --git a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c index 17c50b0f83f..b7b4590673a 100644 --- a/drivers/char/agp/parisc-agp.c +++ b/drivers/char/agp/parisc-agp.c @@ -228,6 +228,7 @@ struct agp_bridge_driver parisc_agp_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, .cant_use_aperture = 1, }; diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c index 902648db7ef..92d1dc45b9b 100644 --- a/drivers/char/agp/sgi-agp.c +++ b/drivers/char/agp/sgi-agp.c @@ -265,6 +265,7 @@ struct agp_bridge_driver sgi_tioca_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = sgi_tioca_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, .cant_use_aperture = 1, .needs_scratch_page = 0, .num_aperture_sizes = 1, diff --git a/drivers/char/agp/sis-agp.c b/drivers/char/agp/sis-agp.c index a00fd48a6f0..60342b70815 100644 --- a/drivers/char/agp/sis-agp.c +++ b/drivers/char/agp/sis-agp.c @@ -140,6 +140,7 @@ static struct agp_bridge_driver sis_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_device_ids sis_agp_device_ids[] __devinitdata = diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c index 4f2d7d99902..9f5ae7714f8 100644 --- a/drivers/char/agp/sworks-agp.c +++ b/drivers/char/agp/sworks-agp.c @@ -444,6 +444,7 @@ static struct agp_bridge_driver sworks_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static int __devinit agp_serverworks_probe(struct pci_dev *pdev, diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index dffc19382f7..6c45702e542 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -510,6 +510,7 @@ struct agp_bridge_driver uninorth_agp_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, .cant_use_aperture = 1, }; @@ -534,6 +535,7 @@ struct agp_bridge_driver u3_agp_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, .cant_use_aperture = 1, .needs_scratch_page = 1, }; diff --git a/drivers/char/agp/via-agp.c b/drivers/char/agp/via-agp.c index 2ded7a280d7..2e7c04370cd 100644 --- a/drivers/char/agp/via-agp.c +++ b/drivers/char/agp/via-agp.c @@ -191,6 +191,7 @@ static struct agp_bridge_driver via_agp3_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_bridge_driver via_driver = { @@ -214,6 +215,7 @@ static struct agp_bridge_driver via_driver = { .free_by_type = agp_generic_free_by_type, .agp_alloc_page = agp_generic_alloc_page, .agp_destroy_page = agp_generic_destroy_page, + .agp_type_to_mask_type = agp_generic_type_to_mask_type, }; static struct agp_device_ids via_agp_device_ids[] __devinitdata = diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c index 1aa93a752a9..ae76a9ffe89 100644 --- a/drivers/char/hangcheck-timer.c +++ b/drivers/char/hangcheck-timer.c @@ -117,7 +117,7 @@ __setup("hcheck_reboot", hangcheck_parse_reboot); __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); #endif /* not MODULE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_S390) +#if defined(CONFIG_S390) # define HAVE_MONOTONIC # define TIMER_FREQ 1000000000ULL #elif defined(CONFIG_IA64) diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index be73c80d699..1d8c4ae6155 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -36,6 +36,7 @@ #include <linux/workqueue.h> #include <linux/kexec.h> #include <linux/irq.h> +#include <linux/hrtimer.h> #include <asm/ptrace.h> #include <asm/irq_regs.h> @@ -158,6 +159,17 @@ static struct sysrq_key_op sysrq_sync_op = { .enable_mask = SYSRQ_ENABLE_SYNC, }; +static void sysrq_handle_show_timers(int key, struct tty_struct *tty) +{ + sysrq_timer_list_show(); +} + +static struct sysrq_key_op sysrq_show_timers_op = { + .handler = sysrq_handle_show_timers, + .help_msg = "show-all-timers(Q)", + .action_msg = "Show Pending Timers", +}; + static void sysrq_handle_mountro(int key, struct tty_struct *tty) { emergency_remount(); @@ -335,7 +347,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = { /* o: This will often be registered as 'Off' at init time */ NULL, /* o */ &sysrq_showregs_op, /* p */ - NULL, /* q */ + &sysrq_show_timers_op, /* q */ &sysrq_unraw_op, /* r */ &sysrq_sync_op, /* s */ &sysrq_showstate_op, /* t */ diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index b6bcdbbf57b..ccaa6a39cb4 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -16,15 +16,13 @@ * This file is licensed under the GPL v2. */ +#include <linux/acpi_pmtmr.h> #include <linux/clocksource.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/pci.h> #include <asm/io.h> -/* Number of PMTMR ticks expected during calibration run */ -#define PMTMR_TICKS_PER_SEC 3579545 - /* * The I/O port the PMTMR resides at. * The location is detected during setup_arch(), @@ -32,15 +30,13 @@ */ u32 pmtmr_ioport __read_mostly; -#define ACPI_PM_MASK CLOCKSOURCE_MASK(24) /* limit it to 24 bits */ - static inline u32 read_pmtmr(void) { /* mask the output to 24 bits */ return inl(pmtmr_ioport) & ACPI_PM_MASK; } -static cycle_t acpi_pm_read_verified(void) +u32 acpi_pm_read_verified(void) { u32 v1 = 0, v2 = 0, v3 = 0; @@ -57,7 +53,12 @@ static cycle_t acpi_pm_read_verified(void) } while (unlikely((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) || (v3 > v1 && v3 < v2))); - return (cycle_t)v2; + return v2; +} + +static cycle_t acpi_pm_read_slow(void) +{ + return (cycle_t)acpi_pm_read_verified(); } static cycle_t acpi_pm_read(void) @@ -72,7 +73,8 @@ static struct clocksource clocksource_acpi_pm = { .mask = (cycle_t)ACPI_PM_MASK, .mult = 0, /*to be caluclated*/ .shift = 22, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + }; @@ -87,7 +89,7 @@ __setup("acpi_pm_good", acpi_pm_good_setup); static inline void acpi_pm_need_workaround(void) { - clocksource_acpi_pm.read = acpi_pm_read_verified; + clocksource_acpi_pm.read = acpi_pm_read_slow; clocksource_acpi_pm.rating = 110; } diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c index bf4d3d50d1c..4f3925ceb36 100644 --- a/drivers/clocksource/cyclone.c +++ b/drivers/clocksource/cyclone.c @@ -31,7 +31,7 @@ static struct clocksource clocksource_cyclone = { .mask = CYCLONE_TIMER_MASK, .mult = 10, .shift = 0, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init init_cyclone_clocksource(void) diff --git a/drivers/clocksource/scx200_hrt.c b/drivers/clocksource/scx200_hrt.c index 22915cc46ba..b92da677aa5 100644 --- a/drivers/clocksource/scx200_hrt.c +++ b/drivers/clocksource/scx200_hrt.c @@ -57,7 +57,7 @@ static struct clocksource cs_hrt = { .rating = 250, .read = read_hrt, .mask = CLOCKSOURCE_MASK(32), - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, /* mult, shift are set based on mhz27 flag */ }; diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 491779af8d5..d155e81b5c9 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -16,7 +16,7 @@ config CPU_FREQ if CPU_FREQ config CPU_FREQ_TABLE - def_tristate m + tristate config CPU_FREQ_DEBUG bool "Enable CPUfreq debugging" diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a45cc89e387..f52facc570f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -41,8 +41,67 @@ static struct cpufreq_driver *cpufreq_driver; static struct cpufreq_policy *cpufreq_cpu_data[NR_CPUS]; static DEFINE_SPINLOCK(cpufreq_driver_lock); +/* + * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure + * all cpufreq/hotplug/workqueue/etc related lock issues. + * + * The rules for this semaphore: + * - Any routine that wants to read from the policy structure will + * do a down_read on this semaphore. + * - Any routine that will write to the policy structure and/or may take away + * the policy altogether (eg. CPU hotplug), will hold this lock in write + * mode before doing so. + * + * Additional rules: + * - All holders of the lock should check to make sure that the CPU they + * are concerned with are online after they get the lock. + * - Governor routines that can be called in cpufreq hotplug path should not + * take this sem as top level hotplug notifier handler takes this. + */ +static DEFINE_PER_CPU(int, policy_cpu); +static DEFINE_PER_CPU(struct rw_semaphore, cpu_policy_rwsem); + +#define lock_policy_rwsem(mode, cpu) \ +int lock_policy_rwsem_##mode \ +(int cpu) \ +{ \ + int policy_cpu = per_cpu(policy_cpu, cpu); \ + BUG_ON(policy_cpu == -1); \ + down_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu)); \ + if (unlikely(!cpu_online(cpu))) { \ + up_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu)); \ + return -1; \ + } \ + \ + return 0; \ +} + +lock_policy_rwsem(read, cpu); +EXPORT_SYMBOL_GPL(lock_policy_rwsem_read); + +lock_policy_rwsem(write, cpu); +EXPORT_SYMBOL_GPL(lock_policy_rwsem_write); + +void unlock_policy_rwsem_read(int cpu) +{ + int policy_cpu = per_cpu(policy_cpu, cpu); + BUG_ON(policy_cpu == -1); + up_read(&per_cpu(cpu_policy_rwsem, policy_cpu)); +} +EXPORT_SYMBOL_GPL(unlock_policy_rwsem_read); + +void unlock_policy_rwsem_write(int cpu) +{ + int policy_cpu = per_cpu(policy_cpu, cpu); + BUG_ON(policy_cpu == -1); + up_write(&per_cpu(cpu_policy_rwsem, policy_cpu)); +} +EXPORT_SYMBOL_GPL(unlock_policy_rwsem_write); + + /* internal prototypes */ static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event); +static unsigned int __cpufreq_get(unsigned int cpu); static void handle_update(struct work_struct *work); /** @@ -415,12 +474,8 @@ static ssize_t store_##file_name \ if (ret != 1) \ return -EINVAL; \ \ - lock_cpu_hotplug(); \ - mutex_lock(&policy->lock); \ ret = __cpufreq_set_policy(policy, &new_policy); \ policy->user_policy.object = policy->object; \ - mutex_unlock(&policy->lock); \ - unlock_cpu_hotplug(); \ \ return ret ? ret : count; \ } @@ -434,7 +489,7 @@ store_one(scaling_max_freq,max); static ssize_t show_cpuinfo_cur_freq (struct cpufreq_policy * policy, char *buf) { - unsigned int cur_freq = cpufreq_get(policy->cpu); + unsigned int cur_freq = __cpufreq_get(policy->cpu); if (!cur_freq) return sprintf(buf, "<unknown>"); return sprintf(buf, "%u\n", cur_freq); @@ -479,18 +534,12 @@ static ssize_t store_scaling_governor (struct cpufreq_policy * policy, &new_policy.governor)) return -EINVAL; - lock_cpu_hotplug(); - /* Do not use cpufreq_set_policy here or the user_policy.max will be wrongly overridden */ - mutex_lock(&policy->lock); ret = __cpufreq_set_policy(policy, &new_policy); policy->user_policy.policy = policy->policy; policy->user_policy.governor = policy->governor; - mutex_unlock(&policy->lock); - - unlock_cpu_hotplug(); if (ret) return ret; @@ -595,11 +644,17 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr ,char * buf) policy = cpufreq_cpu_get(policy->cpu); if (!policy) return -EINVAL; + + if (lock_policy_rwsem_read(policy->cpu) < 0) + return -EINVAL; + if (fattr->show) ret = fattr->show(policy, buf); else ret = -EIO; + unlock_policy_rwsem_read(policy->cpu); + cpufreq_cpu_put(policy); return ret; } @@ -613,11 +668,17 @@ static ssize_t store(struct kobject * kobj, struct attribute * attr, policy = cpufreq_cpu_get(policy->cpu); if (!policy) return -EINVAL; + + if (lock_policy_rwsem_write(policy->cpu) < 0) + return -EINVAL; + if (fattr->store) ret = fattr->store(policy, buf, count); else ret = -EIO; + unlock_policy_rwsem_write(policy->cpu); + cpufreq_cpu_put(policy); return ret; } @@ -691,8 +752,10 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) policy->cpu = cpu; policy->cpus = cpumask_of_cpu(cpu); - mutex_init(&policy->lock); - mutex_lock(&policy->lock); + /* Initially set CPU itself as the policy_cpu */ + per_cpu(policy_cpu, cpu) = cpu; + lock_policy_rwsem_write(cpu); + init_completion(&policy->kobj_unregister); INIT_WORK(&policy->update, handle_update); @@ -702,7 +765,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) ret = cpufreq_driver->init(policy); if (ret) { dprintk("initialization failed\n"); - mutex_unlock(&policy->lock); + unlock_policy_rwsem_write(cpu); goto err_out; } @@ -716,6 +779,14 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) */ managed_policy = cpufreq_cpu_get(j); if (unlikely(managed_policy)) { + + /* Set proper policy_cpu */ + unlock_policy_rwsem_write(cpu); + per_cpu(policy_cpu, cpu) = managed_policy->cpu; + + if (lock_policy_rwsem_write(cpu) < 0) + goto err_out_driver_exit; + spin_lock_irqsave(&cpufreq_driver_lock, flags); managed_policy->cpus = policy->cpus; cpufreq_cpu_data[cpu] = managed_policy; @@ -726,13 +797,13 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) &managed_policy->kobj, "cpufreq"); if (ret) { - mutex_unlock(&policy->lock); + unlock_policy_rwsem_write(cpu); goto err_out_driver_exit; } cpufreq_debug_enable_ratelimit(); - mutex_unlock(&policy->lock); ret = 0; + unlock_policy_rwsem_write(cpu); goto err_out_driver_exit; /* call driver->exit() */ } } @@ -746,7 +817,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) ret = kobject_register(&policy->kobj); if (ret) { - mutex_unlock(&policy->lock); + unlock_policy_rwsem_write(cpu); goto err_out_driver_exit; } /* set up files for this cpu device */ @@ -761,8 +832,10 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); spin_lock_irqsave(&cpufreq_driver_lock, flags); - for_each_cpu_mask(j, policy->cpus) + for_each_cpu_mask(j, policy->cpus) { cpufreq_cpu_data[j] = policy; + per_cpu(policy_cpu, j) = policy->cpu; + } spin_unlock_irqrestore(&cpufreq_driver_lock, flags); /* symlink affected CPUs */ @@ -778,14 +851,14 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj, "cpufreq"); if (ret) { - mutex_unlock(&policy->lock); + unlock_policy_rwsem_write(cpu); goto err_out_unregister; } } policy->governor = NULL; /* to assure that the starting sequence is * run in cpufreq_set_policy */ - mutex_unlock(&policy->lock); + unlock_policy_rwsem_write(cpu); /* set default policy */ ret = cpufreq_set_policy(&new_policy); @@ -826,11 +899,13 @@ module_out: /** - * cpufreq_remove_dev - remove a CPU device + * __cpufreq_remove_dev - remove a CPU device * * Removes the cpufreq interface for a CPU device. + * Caller should already have policy_rwsem in write mode for this CPU. + * This routine frees the rwsem before returning. */ -static int cpufreq_remove_dev (struct sys_device * sys_dev) +static int __cpufreq_remove_dev (struct sys_device * sys_dev) { unsigned int cpu = sys_dev->id; unsigned long flags; @@ -849,6 +924,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) if (!data) { spin_unlock_irqrestore(&cpufreq_driver_lock, flags); cpufreq_debug_enable_ratelimit(); + unlock_policy_rwsem_write(cpu); return -EINVAL; } cpufreq_cpu_data[cpu] = NULL; @@ -865,6 +941,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) sysfs_remove_link(&sys_dev->kobj, "cpufreq"); cpufreq_cpu_put(data); cpufreq_debug_enable_ratelimit(); + unlock_policy_rwsem_write(cpu); return 0; } #endif @@ -873,6 +950,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) if (!kobject_get(&data->kobj)) { spin_unlock_irqrestore(&cpufreq_driver_lock, flags); cpufreq_debug_enable_ratelimit(); + unlock_policy_rwsem_write(cpu); return -EFAULT; } @@ -906,10 +984,10 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) spin_unlock_irqrestore(&cpufreq_driver_lock, flags); #endif - mutex_lock(&data->lock); if (cpufreq_driver->target) __cpufreq_governor(data, CPUFREQ_GOV_STOP); - mutex_unlock(&data->lock); + + unlock_policy_rwsem_write(cpu); kobject_unregister(&data->kobj); @@ -933,6 +1011,18 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) } +static int cpufreq_remove_dev (struct sys_device * sys_dev) +{ + unsigned int cpu = sys_dev->id; + int retval; + if (unlikely(lock_policy_rwsem_write(cpu))) + BUG(); + + retval = __cpufreq_remove_dev(sys_dev); + return retval; +} + + static void handle_update(struct work_struct *work) { struct cpufreq_policy *policy = @@ -980,9 +1070,12 @@ unsigned int cpufreq_quick_get(unsigned int cpu) unsigned int ret_freq = 0; if (policy) { - mutex_lock(&policy->lock); + if (unlikely(lock_policy_rwsem_read(cpu))) + return ret_freq; + ret_freq = policy->cur; - mutex_unlock(&policy->lock); + + unlock_policy_rwsem_read(cpu); cpufreq_cpu_put(policy); } @@ -991,24 +1084,13 @@ unsigned int cpufreq_quick_get(unsigned int cpu) EXPORT_SYMBOL(cpufreq_quick_get); -/** - * cpufreq_get - get the current CPU frequency (in kHz) - * @cpu: CPU number - * - * Get the CPU current (static) CPU frequency - */ -unsigned int cpufreq_get(unsigned int cpu) +static unsigned int __cpufreq_get(unsigned int cpu) { - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cpufreq_policy *policy = cpufreq_cpu_data[cpu]; unsigned int ret_freq = 0; - if (!policy) - return 0; - if (!cpufreq_driver->get) - goto out; - - mutex_lock(&policy->lock); + return (ret_freq); ret_freq = cpufreq_driver->get(cpu); @@ -1022,11 +1104,33 @@ unsigned int cpufreq_get(unsigned int cpu) } } - mutex_unlock(&policy->lock); + return (ret_freq); +} -out: - cpufreq_cpu_put(policy); +/** + * cpufreq_get - get the current CPU frequency (in kHz) + * @cpu: CPU number + * + * Get the CPU current (static) CPU frequency + */ +unsigned int cpufreq_get(unsigned int cpu) +{ + unsigned int ret_freq = 0; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + + if (!policy) + goto out; + + if (unlikely(lock_policy_rwsem_read(cpu))) + goto out_policy; + + ret_freq = __cpufreq_get(cpu); + unlock_policy_rwsem_read(cpu); + +out_policy: + cpufreq_cpu_put(policy); +out: return (ret_freq); } EXPORT_SYMBOL(cpufreq_get); @@ -1278,7 +1382,6 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier); *********************************************************************/ -/* Must be called with lock_cpu_hotplug held */ int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) @@ -1304,20 +1407,19 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, if (!policy) return -EINVAL; - lock_cpu_hotplug(); - mutex_lock(&policy->lock); + if (unlikely(lock_policy_rwsem_write(policy->cpu))) + return -EINVAL; ret = __cpufreq_driver_target(policy, target_freq, relation); - mutex_unlock(&policy->lock); - unlock_cpu_hotplug(); + unlock_policy_rwsem_write(policy->cpu); cpufreq_cpu_put(policy); return ret; } EXPORT_SYMBOL_GPL(cpufreq_driver_target); -int cpufreq_driver_getavg(struct cpufreq_policy *policy) +int __cpufreq_driver_getavg(struct cpufreq_policy *policy) { int ret = 0; @@ -1325,20 +1427,15 @@ int cpufreq_driver_getavg(struct cpufreq_policy *policy) if (!policy) return -EINVAL; - mutex_lock(&policy->lock); - if (cpu_online(policy->cpu) && cpufreq_driver->getavg) ret = cpufreq_driver->getavg(policy->cpu); - mutex_unlock(&policy->lock); - cpufreq_cpu_put(policy); return ret; } -EXPORT_SYMBOL_GPL(cpufreq_driver_getavg); +EXPORT_SYMBOL_GPL(__cpufreq_driver_getavg); /* - * Locking: Must be called with the lock_cpu_hotplug() lock held * when "event" is CPUFREQ_GOV_LIMITS */ @@ -1420,9 +1517,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu) if (!cpu_policy) return -EINVAL; - mutex_lock(&cpu_policy->lock); memcpy(policy, cpu_policy, sizeof(struct cpufreq_policy)); - mutex_unlock(&cpu_policy->lock); cpufreq_cpu_put(cpu_policy); return 0; @@ -1433,7 +1528,6 @@ EXPORT_SYMBOL(cpufreq_get_policy); /* * data : current policy. * policy : policy to be set. - * Locking: Must be called with the lock_cpu_hotplug() lock held */ static int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_policy *policy) @@ -1539,10 +1633,9 @@ int cpufreq_set_policy(struct cpufreq_policy *policy) if (!data) return -EINVAL; - lock_cpu_hotplug(); + if (unlikely(lock_policy_rwsem_write(policy->cpu))) + return -EINVAL; - /* lock this CPU */ - mutex_lock(&data->lock); ret = __cpufreq_set_policy(data, policy); data->user_policy.min = data->min; @@ -1550,9 +1643,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy) data->user_policy.policy = data->policy; data->user_policy.governor = data->governor; - mutex_unlock(&data->lock); + unlock_policy_rwsem_write(policy->cpu); - unlock_cpu_hotplug(); cpufreq_cpu_put(data); return ret; @@ -1576,8 +1668,8 @@ int cpufreq_update_policy(unsigned int cpu) if (!data) return -ENODEV; - lock_cpu_hotplug(); - mutex_lock(&data->lock); + if (unlikely(lock_policy_rwsem_write(cpu))) + return -EINVAL; dprintk("updating policy for CPU %u\n", cpu); memcpy(&policy, data, sizeof(struct cpufreq_policy)); @@ -1602,8 +1694,8 @@ int cpufreq_update_policy(unsigned int cpu) ret = __cpufreq_set_policy(data, &policy); - mutex_unlock(&data->lock); - unlock_cpu_hotplug(); + unlock_policy_rwsem_write(cpu); + cpufreq_cpu_put(data); return ret; } @@ -1613,31 +1705,28 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct cpufreq_policy *policy; struct sys_device *sys_dev; + struct cpufreq_policy *policy; sys_dev = get_cpu_sysdev(cpu); - if (sys_dev) { switch (action) { case CPU_ONLINE: cpufreq_add_dev(sys_dev); break; case CPU_DOWN_PREPARE: - /* - * We attempt to put this cpu in lowest frequency - * possible before going down. This will permit - * hardware-managed P-State to switch other related - * threads to min or higher speeds if possible. - */ + if (unlikely(lock_policy_rwsem_write(cpu))) + BUG(); + policy = cpufreq_cpu_data[cpu]; if (policy) { - cpufreq_driver_target(policy, policy->min, + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_H); } + __cpufreq_remove_dev(sys_dev); break; - case CPU_DEAD: - cpufreq_remove_dev(sys_dev); + case CPU_DOWN_FAILED: + cpufreq_add_dev(sys_dev); break; } } @@ -1751,3 +1840,16 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) return 0; } EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); + +static int __init cpufreq_core_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + per_cpu(policy_cpu, cpu) = -1; + init_rwsem(&per_cpu(cpu_policy_rwsem, cpu)); + } + return 0; +} + +core_initcall(cpufreq_core_init); diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 05d6c22ba07..26f440ccc3f 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -429,14 +429,12 @@ static void dbs_check_cpu(int cpu) static void do_dbs_timer(struct work_struct *work) { int i; - lock_cpu_hotplug(); mutex_lock(&dbs_mutex); for_each_online_cpu(i) dbs_check_cpu(i); schedule_delayed_work(&dbs_work, usecs_to_jiffies(dbs_tuners_ins.sampling_rate)); mutex_unlock(&dbs_mutex); - unlock_cpu_hotplug(); } static inline void dbs_timer_init(void) diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index f697449327c..d60bcb9d14c 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -52,19 +52,20 @@ static unsigned int def_sampling_rate; static void do_dbs_timer(struct work_struct *work); /* Sampling types */ -enum dbs_sample {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE}; +enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE}; struct cpu_dbs_info_s { cputime64_t prev_cpu_idle; cputime64_t prev_cpu_wall; struct cpufreq_policy *cur_policy; struct delayed_work work; - enum dbs_sample sample_type; - unsigned int enable; struct cpufreq_frequency_table *freq_table; unsigned int freq_lo; unsigned int freq_lo_jiffies; unsigned int freq_hi_jiffies; + int cpu; + unsigned int enable:1, + sample_type:1; }; static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); @@ -402,7 +403,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) if (load < (dbs_tuners_ins.up_threshold - 10)) { unsigned int freq_next, freq_cur; - freq_cur = cpufreq_driver_getavg(policy); + freq_cur = __cpufreq_driver_getavg(policy); if (!freq_cur) freq_cur = policy->cur; @@ -423,9 +424,11 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) static void do_dbs_timer(struct work_struct *work) { - unsigned int cpu = smp_processor_id(); - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu); - enum dbs_sample sample_type = dbs_info->sample_type; + struct cpu_dbs_info_s *dbs_info = + container_of(work, struct cpu_dbs_info_s, work.work); + unsigned int cpu = dbs_info->cpu; + int sample_type = dbs_info->sample_type; + /* We want all CPUs to do sampling nearly on same jiffy */ int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); @@ -434,15 +437,19 @@ static void do_dbs_timer(struct work_struct *work) delay -= jiffies % delay; - if (!dbs_info->enable) + if (lock_policy_rwsem_write(cpu) < 0) + return; + + if (!dbs_info->enable) { + unlock_policy_rwsem_write(cpu); return; + } + /* Common NORMAL_SAMPLE setup */ dbs_info->sample_type = DBS_NORMAL_SAMPLE; if (!dbs_tuners_ins.powersave_bias || sample_type == DBS_NORMAL_SAMPLE) { - lock_cpu_hotplug(); dbs_check_cpu(dbs_info); - unlock_cpu_hotplug(); if (dbs_info->freq_lo) { /* Setup timer for SUB_SAMPLE */ dbs_info->sample_type = DBS_SUB_SAMPLE; @@ -454,26 +461,27 @@ static void do_dbs_timer(struct work_struct *work) CPUFREQ_RELATION_H); } queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay); + unlock_policy_rwsem_write(cpu); } -static inline void dbs_timer_init(unsigned int cpu) +static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) { - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu); /* We want all CPUs to do sampling nearly on same jiffy */ int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); delay -= jiffies % delay; + dbs_info->enable = 1; ondemand_powersave_bias_init(); - INIT_DELAYED_WORK_NAR(&dbs_info->work, do_dbs_timer); dbs_info->sample_type = DBS_NORMAL_SAMPLE; - queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay); + INIT_DELAYED_WORK_NAR(&dbs_info->work, do_dbs_timer); + queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work, + delay); } static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) { dbs_info->enable = 0; cancel_delayed_work(&dbs_info->work); - flush_workqueue(kondemand_wq); } static int cpufreq_governor_dbs(struct cpufreq_policy *policy, @@ -502,21 +510,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, mutex_lock(&dbs_mutex); dbs_enable++; - if (dbs_enable == 1) { - kondemand_wq = create_workqueue("kondemand"); - if (!kondemand_wq) { - printk(KERN_ERR - "Creation of kondemand failed\n"); - dbs_enable--; - mutex_unlock(&dbs_mutex); - return -ENOSPC; - } - } rc = sysfs_create_group(&policy->kobj, &dbs_attr_group); if (rc) { - if (dbs_enable == 1) - destroy_workqueue(kondemand_wq); dbs_enable--; mutex_unlock(&dbs_mutex); return rc; @@ -530,7 +526,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j); j_dbs_info->prev_cpu_wall = get_jiffies_64(); } - this_dbs_info->enable = 1; + this_dbs_info->cpu = cpu; /* * Start the timerschedule work, when this governor * is used for first time @@ -550,7 +546,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, dbs_tuners_ins.sampling_rate = def_sampling_rate; } - dbs_timer_init(policy->cpu); + dbs_timer_init(this_dbs_info); mutex_unlock(&dbs_mutex); break; @@ -560,9 +556,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, dbs_timer_exit(this_dbs_info); sysfs_remove_group(&policy->kobj, &dbs_attr_group); dbs_enable--; - if (dbs_enable == 0) - destroy_workqueue(kondemand_wq); - mutex_unlock(&dbs_mutex); break; @@ -591,12 +584,18 @@ static struct cpufreq_governor cpufreq_gov_dbs = { static int __init cpufreq_gov_dbs_init(void) { + kondemand_wq = create_workqueue("kondemand"); + if (!kondemand_wq) { + printk(KERN_ERR "Creation of kondemand failed\n"); + return -EFAULT; + } return cpufreq_register_governor(&cpufreq_gov_dbs); } static void __exit cpufreq_gov_dbs_exit(void) { cpufreq_unregister_governor(&cpufreq_gov_dbs); + destroy_workqueue(kondemand_wq); } @@ -608,3 +607,4 @@ MODULE_LICENSE("GPL"); module_init(cpufreq_gov_dbs_init); module_exit(cpufreq_gov_dbs_exit); + diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 91ad342a605..d1c7cac9316 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -370,12 +370,10 @@ __exit cpufreq_stats_exit(void) cpufreq_unregister_notifier(¬ifier_trans_block, CPUFREQ_TRANSITION_NOTIFIER); unregister_hotcpu_notifier(&cpufreq_stat_cpu_notifier); - lock_cpu_hotplug(); for_each_online_cpu(cpu) { cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_DEAD, (void *)(long)cpu); } - unlock_cpu_hotplug(); } MODULE_AUTHOR ("Zou Nan hai <nanhai.zou@intel.com>"); diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index 2a4eb0bfaf3..860345c7799 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -71,7 +71,6 @@ static int cpufreq_set(unsigned int freq, struct cpufreq_policy *policy) dprintk("cpufreq_set for cpu %u, freq %u kHz\n", policy->cpu, freq); - lock_cpu_hotplug(); mutex_lock(&userspace_mutex); if (!cpu_is_managed[policy->cpu]) goto err; @@ -94,7 +93,6 @@ static int cpufreq_set(unsigned int freq, struct cpufreq_policy *policy) err: mutex_unlock(&userspace_mutex); - unlock_cpu_hotplug(); return ret; } diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index cd251efda41..0a26e066354 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -546,7 +546,7 @@ static void ads7846_rx(void *ads) ts->spi->dev.bus_id, ts->tc.ignore, Rt); #endif hrtimer_start(&ts->timer, ktime_set(0, TS_POLL_PERIOD), - HRTIMER_REL); + HRTIMER_MODE_REL); return; } @@ -578,7 +578,8 @@ static void ads7846_rx(void *ads) #endif } - hrtimer_start(&ts->timer, ktime_set(0, TS_POLL_PERIOD), HRTIMER_REL); + hrtimer_start(&ts->timer, ktime_set(0, TS_POLL_PERIOD), + HRTIMER_MODE_REL); } static int ads7846_debounce(void *ads, int data_idx, int *val) @@ -667,7 +668,7 @@ static void ads7846_rx_val(void *ads) status); } -static int ads7846_timer(struct hrtimer *handle) +static enum hrtimer_restart ads7846_timer(struct hrtimer *handle) { struct ads7846 *ts = container_of(handle, struct ads7846, timer); int status = 0; @@ -724,7 +725,7 @@ static irqreturn_t ads7846_irq(int irq, void *handle) disable_irq(ts->spi->irq); ts->pending = 1; hrtimer_start(&ts->timer, ktime_set(0, TS_POLL_DELAY), - HRTIMER_REL); + HRTIMER_MODE_REL); } } spin_unlock_irqrestore(&ts->lock, flags); @@ -862,7 +863,7 @@ static int __devinit ads7846_probe(struct spi_device *spi) ts->spi = spi; ts->input = input_dev; - hrtimer_init(&ts->timer, CLOCK_MONOTONIC, HRTIMER_REL); + hrtimer_init(&ts->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ts->timer.function = ads7846_timer; spin_lock_init(&ts->lock); diff --git a/drivers/isdn/gigaset/Makefile b/drivers/isdn/gigaset/Makefile index 835b806a9de..077e297d8c7 100644 --- a/drivers/isdn/gigaset/Makefile +++ b/drivers/isdn/gigaset/Makefile @@ -5,4 +5,4 @@ ser_gigaset-y := ser-gigaset.o asyncdata.o obj-$(CONFIG_GIGASET_M105) += usb_gigaset.o gigaset.o obj-$(CONFIG_GIGASET_BASE) += bas_gigaset.o gigaset.o -obj-$(CONFIG_GIGASET_M105) += ser_gigaset.o gigaset.o +obj-$(CONFIG_GIGASET_M101) += ser_gigaset.o gigaset.o diff --git a/drivers/video/s3c2410fb.c b/drivers/video/s3c2410fb.c index ccef56d0c15..ed3426062a8 100644 --- a/drivers/video/s3c2410fb.c +++ b/drivers/video/s3c2410fb.c @@ -791,6 +791,8 @@ static int __init s3c2410fb_probe(struct platform_device *pdev) info = fbinfo->par; info->fb = fbinfo; + info->dev = &pdev->dev; + platform_set_drvdata(pdev, fbinfo); dprintk("devinit\n"); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index b3609b7cdf1..403e3bad145 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -467,6 +467,7 @@ extern struct kmem_cache *ecryptfs_header_cache_1; extern struct kmem_cache *ecryptfs_header_cache_2; extern struct kmem_cache *ecryptfs_xattr_cache; extern struct kmem_cache *ecryptfs_lower_page_cache; +extern struct kmem_cache *ecryptfs_key_record_cache; int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 81156e95ef8..b550dea8eee 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1638,6 +1638,8 @@ out: return rc; } +struct kmem_cache *ecryptfs_key_record_cache; + /** * ecryptfs_generate_key_packet_set * @dest: Virtual address from which to write the key record set @@ -1664,50 +1666,55 @@ ecryptfs_generate_key_packet_set(char *dest_base, &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; size_t written; - struct ecryptfs_key_record key_rec; + struct ecryptfs_key_record *key_rec; int rc = 0; (*len) = 0; + key_rec = kmem_cache_alloc(ecryptfs_key_record_cache, GFP_KERNEL); + if (!key_rec) { + rc = -ENOMEM; + goto out; + } if (mount_crypt_stat->global_auth_tok) { auth_tok = mount_crypt_stat->global_auth_tok; if (auth_tok->token_type == ECRYPTFS_PASSWORD) { rc = write_tag_3_packet((dest_base + (*len)), max, auth_tok, - crypt_stat, &key_rec, + crypt_stat, key_rec, &written); if (rc) { ecryptfs_printk(KERN_WARNING, "Error " "writing tag 3 packet\n"); - goto out; + goto out_free; } (*len) += written; /* Write auth tok signature packet */ rc = write_tag_11_packet( (dest_base + (*len)), (max - (*len)), - key_rec.sig, ECRYPTFS_SIG_SIZE, &written); + key_rec->sig, ECRYPTFS_SIG_SIZE, &written); if (rc) { ecryptfs_printk(KERN_ERR, "Error writing " "auth tok signature packet\n"); - goto out; + goto out_free; } (*len) += written; } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { rc = write_tag_1_packet(dest_base + (*len), max, auth_tok, crypt_stat,mount_crypt_stat, - &key_rec, &written); + key_rec, &written); if (rc) { ecryptfs_printk(KERN_WARNING, "Error " "writing tag 1 packet\n"); - goto out; + goto out_free; } (*len) += written; } else { ecryptfs_printk(KERN_WARNING, "Unsupported " "authentication token type\n"); rc = -EINVAL; - goto out; + goto out_free; } } else BUG(); @@ -1717,6 +1724,9 @@ ecryptfs_generate_key_packet_set(char *dest_base, ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n"); rc = -EIO; } + +out_free: + kmem_cache_free(ecryptfs_key_record_cache, key_rec); out: if (rc) (*len) = 0; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 26fe405a576..80044d196fe 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -651,6 +651,11 @@ static struct ecryptfs_cache_info { .name = "ecryptfs_lower_page_cache", .size = PAGE_CACHE_SIZE, }, + { + .cache = &ecryptfs_key_record_cache, + .name = "ecryptfs_key_record_cache", + .size = sizeof(struct ecryptfs_key_record), + }, }; static void ecryptfs_free_kmem_caches(void) diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 47d7e7b611f..3baf253be95 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -169,7 +169,8 @@ int ecryptfs_process_helo(unsigned int transport, uid_t uid, pid_t pid) if (!new_id) { rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Failed to allocate memory; unable " - "to register daemon [%d] for user\n", pid, uid); + "to register daemon [%d] for user [%d]\n", + pid, uid); goto unlock; } if (!ecryptfs_find_daemon_id(uid, &old_id)) { diff --git a/fs/namei.c b/fs/namei.c index 161e2225c75..ee60cc4d345 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2688,10 +2688,11 @@ int __page_symlink(struct inode *inode, const char *symname, int len, { struct address_space *mapping = inode->i_mapping; struct page *page; - int err = -ENOMEM; + int err; char *kaddr; retry: + err = -ENOMEM; page = find_or_create_page(mapping, 0, gfp_mask); if (!page) goto fail; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 5d94555cdc8..832673b1458 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -61,9 +61,11 @@ /* flags used to simulate posix default ACLs */ #define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ - | NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE) + | NFS4_ACE_DIRECTORY_INHERIT_ACE) -#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS | NFS4_ACE_IDENTIFIER_GROUP) +#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS \ + | NFS4_ACE_INHERIT_ONLY_ACE \ + | NFS4_ACE_IDENTIFIER_GROUP) #define MASK_EQUAL(mask1, mask2) \ ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) ) @@ -87,12 +89,19 @@ mask_from_posix(unsigned short perm, unsigned int flags) } static u32 -deny_mask(u32 allow_mask, unsigned int flags) +deny_mask_from_posix(unsigned short perm, u32 flags) { - u32 ret = ~allow_mask & ~NFS4_MASK_UNSUPP; - if (!(flags & NFS4_ACL_DIR)) - ret &= ~NFS4_ACE_DELETE_CHILD; - return ret; + u32 mask = 0; + + if (perm & ACL_READ) + mask |= NFS4_READ_MODE; + if (perm & ACL_WRITE) + mask |= NFS4_WRITE_MODE; + if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) + mask |= NFS4_ACE_DELETE_CHILD; + if (perm & ACL_EXECUTE) + mask |= NFS4_EXECUTE_MODE; + return mask; } /* XXX: modify functions to return NFS errors; they're only ever @@ -126,108 +135,151 @@ struct ace_container { }; static short ace2type(struct nfs4_ace *); -static int _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); -static struct posix_acl *_nfsv4_to_posix_one(struct nfs4_acl *, unsigned int); -int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); -static int nfs4_acl_split(struct nfs4_acl *, struct nfs4_acl *); +static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, + unsigned int); +void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); struct nfs4_acl * nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, unsigned int flags) { struct nfs4_acl *acl; - int error = -EINVAL; + int size = 0; - if ((pacl != NULL && - (posix_acl_valid(pacl) < 0 || pacl->a_count == 0)) || - (dpacl != NULL && - (posix_acl_valid(dpacl) < 0 || dpacl->a_count == 0))) - goto out_err; - - acl = nfs4_acl_new(); - if (acl == NULL) { - error = -ENOMEM; - goto out_err; + if (pacl) { + if (posix_acl_valid(pacl) < 0) + return ERR_PTR(-EINVAL); + size += 2*pacl->a_count; } - - if (pacl != NULL) { - error = _posix_to_nfsv4_one(pacl, acl, - flags & ~NFS4_ACL_TYPE_DEFAULT); - if (error < 0) - goto out_acl; + if (dpacl) { + if (posix_acl_valid(dpacl) < 0) + return ERR_PTR(-EINVAL); + size += 2*dpacl->a_count; } - if (dpacl != NULL) { - error = _posix_to_nfsv4_one(dpacl, acl, - flags | NFS4_ACL_TYPE_DEFAULT); - if (error < 0) - goto out_acl; - } + /* Allocate for worst case: one (deny, allow) pair each: */ + acl = nfs4_acl_new(size); + if (acl == NULL) + return ERR_PTR(-ENOMEM); - return acl; + if (pacl) + _posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT); -out_acl: - nfs4_acl_free(acl); -out_err: - acl = ERR_PTR(error); + if (dpacl) + _posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT); return acl; } -static int -nfs4_acl_add_pair(struct nfs4_acl *acl, int eflag, u32 mask, int whotype, - uid_t owner, unsigned int flags) +struct posix_acl_summary { + unsigned short owner; + unsigned short users; + unsigned short group; + unsigned short groups; + unsigned short other; + unsigned short mask; +}; + +static void +summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas) { - int error; - - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, - eflag, mask, whotype, owner); - if (error < 0) - return error; - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - eflag, deny_mask(mask, flags), whotype, owner); - return error; + struct posix_acl_entry *pa, *pe; + pas->users = 0; + pas->groups = 0; + pas->mask = 07; + + pe = acl->a_entries + acl->a_count; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + pas->owner = pa->e_perm; + break; + case ACL_GROUP_OBJ: + pas->group = pa->e_perm; + break; + case ACL_USER: + pas->users |= pa->e_perm; + break; + case ACL_GROUP: + pas->groups |= pa->e_perm; + break; + case ACL_OTHER: + pas->other = pa->e_perm; + break; + case ACL_MASK: + pas->mask = pa->e_perm; + break; + } + } + /* We'll only care about effective permissions: */ + pas->users &= pas->mask; + pas->group &= pas->mask; + pas->groups &= pas->mask; } /* We assume the acl has been verified with posix_acl_valid. */ -static int +static void _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, unsigned int flags) { - struct posix_acl_entry *pa, *pe, *group_owner_entry; - int error = -EINVAL; - u32 mask, mask_mask; + struct posix_acl_entry *pa, *group_owner_entry; + struct nfs4_ace *ace; + struct posix_acl_summary pas; + unsigned short deny; int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ? NFS4_INHERITANCE_FLAGS : 0); BUG_ON(pacl->a_count < 3); - pe = pacl->a_entries + pacl->a_count; - pa = pe - 2; /* if mask entry exists, it's second from the last. */ - if (pa->e_tag == ACL_MASK) - mask_mask = deny_mask(mask_from_posix(pa->e_perm, flags), flags); - else - mask_mask = 0; + summarize_posix_acl(pacl, &pas); pa = pacl->a_entries; - BUG_ON(pa->e_tag != ACL_USER_OBJ); - mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER); - error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_OWNER, 0, flags); - if (error < 0) - goto out; - pa++; + ace = acl->aces + acl->naces; - while (pa->e_tag == ACL_USER) { - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - eflag, mask_mask, NFS4_ACL_WHO_NAMED, pa->e_id); - if (error < 0) - goto out; + /* We could deny everything not granted by the owner: */ + deny = ~pas.owner; + /* + * but it is equivalent (and simpler) to deny only what is not + * granted by later entries: + */ + deny &= pas.users | pas.group | pas.groups | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + } + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + pa++; - error = nfs4_acl_add_pair(acl, eflag, mask, - NFS4_ACL_WHO_NAMED, pa->e_id, flags); - if (error < 0) - goto out; + while (pa->e_tag == ACL_USER) { + deny = ~(pa->e_perm & pas.mask); + deny &= pas.groups | pas.group | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + } + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; pa++; } @@ -236,67 +288,65 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, /* allow ACEs */ - if (pacl->a_count > 3) { - BUG_ON(pa->e_tag != ACL_GROUP_OBJ); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask, - NFS4_ACL_WHO_GROUP, 0); - if (error < 0) - goto out; - } group_owner_entry = pa; - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, mask, - NFS4_ACL_WHO_GROUP, 0); - if (error < 0) - goto out; + + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pas.group, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; pa++; while (pa->e_tag == ACL_GROUP) { - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask, - NFS4_ACL_WHO_NAMED, pa->e_id); - if (error < 0) - goto out; - - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, mask, - NFS4_ACL_WHO_NAMED, pa->e_id); - if (error < 0) - goto out; + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; pa++; } /* deny ACEs */ pa = group_owner_entry; - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, - deny_mask(mask, flags), NFS4_ACL_WHO_GROUP, 0); - if (error < 0) - goto out; + + deny = ~pas.group & pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; + } pa++; + while (pa->e_tag == ACL_GROUP) { - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, - NFS4_ACE_IDENTIFIER_GROUP | eflag, - deny_mask(mask, flags), NFS4_ACL_WHO_NAMED, pa->e_id); - if (error < 0) - goto out; + deny = ~(pa->e_perm & pas.mask); + deny &= pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who = pa->e_id; + ace++; + acl->naces++; + } pa++; } if (pa->e_tag == ACL_MASK) pa++; - BUG_ON(pa->e_tag != ACL_OTHER); - mask = mask_from_posix(pa->e_perm, flags); - error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_EVERYONE, 0, flags); - -out: - return error; + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags); + ace->whotype = NFS4_ACL_WHO_EVERYONE; + acl->naces++; } static void @@ -342,46 +392,6 @@ sort_pacl(struct posix_acl *pacl) return; } -int -nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, - struct posix_acl **dpacl, unsigned int flags) -{ - struct nfs4_acl *dacl; - int error = -ENOMEM; - - *pacl = NULL; - *dpacl = NULL; - - dacl = nfs4_acl_new(); - if (dacl == NULL) - goto out; - - error = nfs4_acl_split(acl, dacl); - if (error) - goto out_acl; - - *pacl = _nfsv4_to_posix_one(acl, flags); - if (IS_ERR(*pacl)) { - error = PTR_ERR(*pacl); - *pacl = NULL; - goto out_acl; - } - - *dpacl = _nfsv4_to_posix_one(dacl, flags); - if (IS_ERR(*dpacl)) { - error = PTR_ERR(*dpacl); - *dpacl = NULL; - } -out_acl: - if (error) { - posix_acl_release(*pacl); - *pacl = NULL; - } - nfs4_acl_free(dacl); -out: - return error; -} - /* * While processing the NFSv4 ACE, this maintains bitmasks representing * which permission bits have been allowed and which denied to a given @@ -406,6 +416,7 @@ struct posix_ace_state_array { * calculated so far: */ struct posix_acl_state { + int empty; struct posix_ace_state owner; struct posix_ace_state group; struct posix_ace_state other; @@ -421,6 +432,7 @@ init_state(struct posix_acl_state *state, int cnt) int alloc; memset(state, 0, sizeof(struct posix_acl_state)); + state->empty = 1; /* * In the worst case, each individual acl could be for a distinct * named user or group, but we don't no which, so we allocate @@ -488,6 +500,20 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) int nace; int i, error = 0; + /* + * ACLs with no ACEs are treated differently in the inheritable + * and effective cases: when there are no inheritable ACEs, we + * set a zero-length default posix acl: + */ + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) { + pacl = posix_acl_alloc(0, GFP_KERNEL); + return pacl ? pacl : ERR_PTR(-ENOMEM); + } + /* + * When there are no effective ACEs, the following will end + * up setting a 3-element effective posix ACL with all + * permissions zero. + */ nace = 4 + state->users->n + state->groups->n; pacl = posix_acl_alloc(nace, GFP_KERNEL); if (!pacl) @@ -603,6 +629,8 @@ static void process_one_v4_ace(struct posix_acl_state *state, u32 mask = ace->access_mask; int i; + state->empty = 0; + switch (ace2type(ace)) { case ACL_USER_OBJ: if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { @@ -666,75 +694,62 @@ static void process_one_v4_ace(struct posix_acl_state *state, } } -static struct posix_acl * -_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags) +int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, + struct posix_acl **dpacl, unsigned int flags) { - struct posix_acl_state state; - struct posix_acl *pacl; + struct posix_acl_state effective_acl_state, default_acl_state; struct nfs4_ace *ace; int ret; - ret = init_state(&state, n4acl->naces); + ret = init_state(&effective_acl_state, acl->naces); if (ret) - return ERR_PTR(ret); - - list_for_each_entry(ace, &n4acl->ace_head, l_ace) - process_one_v4_ace(&state, ace); - - pacl = posix_state_to_acl(&state, flags); - - free_state(&state); - - if (!IS_ERR(pacl)) - sort_pacl(pacl); - return pacl; -} - -static int -nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl) -{ - struct list_head *h, *n; - struct nfs4_ace *ace; - int error = 0; - - list_for_each_safe(h, n, &acl->ace_head) { - ace = list_entry(h, struct nfs4_ace, l_ace); - + return ret; + ret = init_state(&default_acl_state, acl->naces); + if (ret) + goto out_estate; + ret = -EINVAL; + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE && ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) - return -EINVAL; - + goto out_dstate; if (ace->flag & ~NFS4_SUPPORTED_FLAGS) - return -EINVAL; - - switch (ace->flag & NFS4_INHERITANCE_FLAGS) { - case 0: - /* Leave this ace in the effective acl: */ + goto out_dstate; + if ((ace->flag & NFS4_INHERITANCE_FLAGS) == 0) { + process_one_v4_ace(&effective_acl_state, ace); continue; - case NFS4_INHERITANCE_FLAGS: - /* Add this ace to the default acl and remove it - * from the effective acl: */ - error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who); - if (error) - return error; - list_del(h); - kfree(ace); - acl->naces--; - break; - case NFS4_INHERITANCE_FLAGS & ~NFS4_ACE_INHERIT_ONLY_ACE: - /* Add this ace to the default, but leave it in - * the effective acl as well: */ - error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who); - if (error) - return error; - break; - default: - return -EINVAL; } + if (!(flags & NFS4_ACL_DIR)) + goto out_dstate; + /* + * Note that when only one of FILE_INHERIT or DIRECTORY_INHERIT + * is set, we're effectively turning on the other. That's OK, + * according to rfc 3530. + */ + process_one_v4_ace(&default_acl_state, ace); + + if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) + process_one_v4_ace(&effective_acl_state, ace); } - return 0; + *pacl = posix_state_to_acl(&effective_acl_state, flags); + if (IS_ERR(*pacl)) { + ret = PTR_ERR(*pacl); + goto out_dstate; + } + *dpacl = posix_state_to_acl(&default_acl_state, + flags | NFS4_ACL_TYPE_DEFAULT); + if (IS_ERR(*dpacl)) { + ret = PTR_ERR(*dpacl); + posix_acl_release(*pacl); + goto out_dstate; + } + sort_pacl(*pacl); + sort_pacl(*dpacl); + ret = 0; +out_dstate: + free_state(&default_acl_state); +out_estate: + free_state(&effective_acl_state); + return ret; } static short @@ -759,48 +774,22 @@ EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4); EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix); struct nfs4_acl * -nfs4_acl_new(void) +nfs4_acl_new(int n) { struct nfs4_acl *acl; - if ((acl = kmalloc(sizeof(*acl), GFP_KERNEL)) == NULL) + acl = kmalloc(sizeof(*acl) + n*sizeof(struct nfs4_ace), GFP_KERNEL); + if (acl == NULL) return NULL; - acl->naces = 0; - INIT_LIST_HEAD(&acl->ace_head); - return acl; } void -nfs4_acl_free(struct nfs4_acl *acl) -{ - struct list_head *h; - struct nfs4_ace *ace; - - if (!acl) - return; - - while (!list_empty(&acl->ace_head)) { - h = acl->ace_head.next; - list_del(h); - ace = list_entry(h, struct nfs4_ace, l_ace); - kfree(ace); - } - - kfree(acl); - - return; -} - -int nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, int whotype, uid_t who) { - struct nfs4_ace *ace; - - if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL) - return -ENOMEM; + struct nfs4_ace *ace = acl->aces + acl->naces; ace->type = type; ace->flag = flag; @@ -808,10 +797,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, ace->whotype = whotype; ace->who = who; - list_add_tail(&ace->l_ace, &acl->ace_head); acl->naces++; - - return 0; } static struct { @@ -865,7 +851,6 @@ nfs4_acl_write_who(int who, char *p) } EXPORT_SYMBOL(nfs4_acl_new); -EXPORT_SYMBOL(nfs4_acl_free); EXPORT_SYMBOL(nfs4_acl_add_ace); EXPORT_SYMBOL(nfs4_acl_get_whotype); EXPORT_SYMBOL(nfs4_acl_write_who); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index f57655a7a2b..fb14d68eaca 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -387,7 +387,6 @@ nfsd4_probe_callback(struct nfs4_client *clp) .address = (struct sockaddr *)&addr, .addrsize = sizeof(addr), .timeout = &timeparms, - .servername = clp->cl_name.data, .program = program, .version = nfs_cb_version[1]->number, .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ @@ -397,6 +396,7 @@ nfsd4_probe_callback(struct nfs4_client *clp) .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], .rpc_argp = clp, }; + char clientname[16]; int status; if (atomic_read(&cb->cb_set)) @@ -419,6 +419,11 @@ nfsd4_probe_callback(struct nfs4_client *clp) memset(program->stats, 0, sizeof(cb->cb_stat)); program->stats->program = program; + /* Just here to make some printk's more useful: */ + snprintf(clientname, sizeof(clientname), + "%u.%u.%u.%u", NIPQUAD(addr.sin_addr)); + args.servername = clientname; + /* Create RPC client */ cb->cb_client = rpc_create(&args); if (IS_ERR(cb->cb_client)) { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0efba557fb5..5d090f11f2b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -199,24 +199,22 @@ defer_free(struct nfsd4_compoundargs *argp, static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { - void *new = NULL; if (p == argp->tmp) { - new = kmalloc(nbytes, GFP_KERNEL); - if (!new) return NULL; - p = new; + p = kmalloc(nbytes, GFP_KERNEL); + if (!p) + return NULL; memcpy(p, argp->tmp, nbytes); } else { BUG_ON(p != argp->tmpp); argp->tmpp = NULL; } if (defer_free(argp, kfree, p)) { - kfree(new); + kfree(p); return NULL; } else return (char *)p; } - static __be32 nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) { @@ -255,7 +253,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia return status; /* - * According to spec, unsupported attributes return ERR_NOTSUPP; + * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; * read-only attributes return ERR_INVAL. */ if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) @@ -273,42 +271,42 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { - int nace, i; - struct nfs4_ace ace; + int nace; + struct nfs4_ace *ace; READ_BUF(4); len += 4; READ32(nace); - *acl = nfs4_acl_new(); + if (nace > NFS4_ACL_MAX) + return nfserr_resource; + + *acl = nfs4_acl_new(nace); if (*acl == NULL) { host_err = -ENOMEM; goto out_nfserr; } - defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl); + defer_free(argp, kfree, *acl); - for (i = 0; i < nace; i++) { + (*acl)->naces = nace; + for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { READ_BUF(16); len += 16; - READ32(ace.type); - READ32(ace.flag); - READ32(ace.access_mask); + READ32(ace->type); + READ32(ace->flag); + READ32(ace->access_mask); READ32(dummy32); READ_BUF(dummy32); len += XDR_QUADLEN(dummy32) << 2; READMEM(buf, dummy32); - ace.whotype = nfs4_acl_get_whotype(buf, dummy32); + ace->whotype = nfs4_acl_get_whotype(buf, dummy32); host_err = 0; - if (ace.whotype != NFS4_ACL_WHO_NAMED) - ace.who = 0; - else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP) + if (ace->whotype != NFS4_ACL_WHO_NAMED) + ace->who = 0; + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) host_err = nfsd_map_name_to_gid(argp->rqstp, - buf, dummy32, &ace.who); + buf, dummy32, &ace->who); else host_err = nfsd_map_name_to_uid(argp->rqstp, - buf, dummy32, &ace.who); - if (host_err) - goto out_nfserr; - host_err = nfs4_acl_add_ace(*acl, ace.type, ace.flag, - ace.access_mask, ace.whotype, ace.who); + buf, dummy32, &ace->who); if (host_err) goto out_nfserr; } @@ -1596,7 +1594,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, } if (bmval0 & FATTR4_WORD0_ACL) { struct nfs4_ace *ace; - struct list_head *h; if (acl == NULL) { if ((buflen -= 4) < 0) @@ -1609,9 +1606,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_resource; WRITE32(acl->naces); - list_for_each(h, &acl->ace_head) { - ace = list_entry(h, struct nfs4_ace, l_ace); - + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { if ((buflen -= 4*3) < 0) goto out_resource; WRITE32(ace->type); @@ -1821,7 +1816,7 @@ out_acl: status = nfs_ok; out: - nfs4_acl_free(acl); + kfree(acl); if (fhp == &tempfh) fh_put(&tempfh); return status; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8283236c6a0..7e6aa245b5d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -466,7 +466,10 @@ out: posix_acl_release(dpacl); return (error); out_nfserr: - error = nfserrno(host_error); + if (host_error == -EOPNOTSUPP) + error = nfserr_attrnotsupp; + else + error = nfserrno(host_error); goto out; } diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 7798d2a9f79..916c0102db5 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -79,6 +79,7 @@ struct acpi_processor_power { u32 bm_activity; int count; struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER]; + int timer_broadcast_on_state; }; /* Performance Management */ diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h index 3a61206fd10..cc6b1652249 100644 --- a/include/asm-i386/apic.h +++ b/include/asm-i386/apic.h @@ -95,9 +95,7 @@ static inline void ack_APIC_irq(void) apic_write_around(APIC_EOI, 0); } -extern void (*wait_timer_tick)(void); - -extern int get_maxlvt(void); +extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void connect_bsp_APIC (void); extern void disconnect_bsp_APIC (int virt_wire_setup); @@ -113,14 +111,9 @@ extern void smp_local_timer_interrupt (void); extern void setup_boot_APIC_clock (void); extern void setup_secondary_APIC_clock (void); extern int APIC_init_uniprocessor (void); -extern void disable_APIC_timer(void); -extern void enable_APIC_timer(void); extern void enable_NMI_through_LVT0 (void * dummy); -void smp_send_timer_broadcast_ipi(void); -void switch_APIC_timer_to_ipi(void *cpumask); -void switch_ipi_to_APIC_timer(void *cpumask); #define ARCH_APICTIMER_STOPS_ON_C3 1 extern int timer_over_8254; diff --git a/include/asm-i386/hpet.h b/include/asm-i386/hpet.h index e47be9a56cc..fc03cf9de5c 100644 --- a/include/asm-i386/hpet.h +++ b/include/asm-i386/hpet.h @@ -90,16 +90,19 @@ #define HPET_MIN_PERIOD (100000UL) #define HPET_TICK_RATE (HZ * 100000UL) -extern unsigned long hpet_tick; /* hpet clks count per tick */ extern unsigned long hpet_address; /* hpet memory map physical address */ -extern int hpet_use_timer; +extern int is_hpet_enabled(void); +#ifdef CONFIG_X86_64 +extern unsigned long hpet_tick; /* hpet clks count per tick */ +extern int hpet_use_timer; extern int hpet_rtc_timer_init(void); extern int hpet_enable(void); -extern int hpet_reenable(void); -extern int is_hpet_enabled(void); extern int is_hpet_capable(void); extern int hpet_readl(unsigned long a); +#else +extern int hpet_enable(void); +#endif #ifdef CONFIG_HPET_EMULATE_RTC extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask); @@ -110,5 +113,10 @@ extern int hpet_rtc_dropped_irq(void); extern int hpet_rtc_timer_init(void); extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id); #endif /* CONFIG_HPET_EMULATE_RTC */ + +#else + +static inline int hpet_enable(void) { return 0; } + #endif /* CONFIG_HPET_TIMER */ #endif /* _I386_HPET_H */ diff --git a/include/asm-i386/i8253.h b/include/asm-i386/i8253.h index 015d8df0769..6cb0dd4dcdd 100644 --- a/include/asm-i386/i8253.h +++ b/include/asm-i386/i8253.h @@ -1,6 +1,21 @@ #ifndef __ASM_I8253_H__ #define __ASM_I8253_H__ +#include <linux/clockchips.h> + extern spinlock_t i8253_lock; +extern struct clock_event_device *global_clock_event; + +/** + * pit_interrupt_hook - hook into timer tick + * @regs: standard registers from interrupt + * + * Call the global clock event handler. + **/ +static inline void pit_interrupt_hook(void) +{ + global_clock_event->event_handler(global_clock_event); +} + #endif /* __ASM_I8253_H__ */ diff --git a/include/asm-i386/mach-default/do_timer.h b/include/asm-i386/mach-default/do_timer.h index 7d606e3364a..56e5689863a 100644 --- a/include/asm-i386/mach-default/do_timer.h +++ b/include/asm-i386/mach-default/do_timer.h @@ -1,86 +1,16 @@ /* defines for inline arch setup functions */ +#include <linux/clockchips.h> -#include <asm/apic.h> #include <asm/i8259.h> +#include <asm/i8253.h> /** * do_timer_interrupt_hook - hook into timer tick - * @regs: standard registers from interrupt * - * Description: - * This hook is called immediately after the timer interrupt is ack'd. - * It's primary purpose is to allow architectures that don't possess - * individual per CPU clocks (like the CPU APICs supply) to broadcast the - * timer interrupt as a means of triggering reschedules etc. + * Call the pit clock event handler. see asm/i8253.h **/ static inline void do_timer_interrupt_hook(void) { - do_timer(1); -#ifndef CONFIG_SMP - update_process_times(user_mode_vm(get_irq_regs())); -#endif -/* - * In the SMP case we use the local APIC timer interrupt to do the - * profiling, except when we simulate SMP mode on a uniprocessor - * system, in that case we have to call the local interrupt handler. - */ -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING); -#else - if (!using_apic_timer) - smp_local_timer_interrupt(); -#endif -} - - -/* you can safely undefine this if you don't have the Neptune chipset */ - -#define BUGGY_NEPTUN_TIMER - -/** - * do_timer_overflow - process a detected timer overflow condition - * @count: hardware timer interrupt count on overflow - * - * Description: - * This call is invoked when the jiffies count has not incremented but - * the hardware timer interrupt has. It means that a timer tick interrupt - * came along while the previous one was pending, thus a tick was missed - **/ -static inline int do_timer_overflow(int count) -{ - int i; - - spin_lock(&i8259A_lock); - /* - * This is tricky when I/O APICs are used; - * see do_timer_interrupt(). - */ - i = inb(0x20); - spin_unlock(&i8259A_lock); - - /* assumption about timer being IRQ0 */ - if (i & 0x01) { - /* - * We cannot detect lost timer interrupts ... - * well, that's why we call them lost, don't we? :) - * [hmm, on the Pentium and Alpha we can ... sort of] - */ - count -= LATCH; - } else { -#ifdef BUGGY_NEPTUN_TIMER - /* - * for the Neptun bug we know that the 'latch' - * command doesn't latch the high and low value - * of the counter atomically. Thus we have to - * substract 256 from the counter - * ... funny, isnt it? :) - */ - - count -= 256; -#else - printk("do_slow_gettimeoffset(): hardware timer problem?\n"); -#endif - } - return count; + pit_interrupt_hook(); } diff --git a/include/asm-i386/mach-voyager/do_timer.h b/include/asm-i386/mach-voyager/do_timer.h index 04e69c104a7..60f9dcc15d5 100644 --- a/include/asm-i386/mach-voyager/do_timer.h +++ b/include/asm-i386/mach-voyager/do_timer.h @@ -1,25 +1,18 @@ /* defines for inline arch setup functions */ +#include <linux/clockchips.h> + #include <asm/voyager.h> +#include <asm/i8253.h> +/** + * do_timer_interrupt_hook - hook into timer tick + * @regs: standard registers from interrupt + * + * Call the pit clock event handler. see asm/i8253.h + **/ static inline void do_timer_interrupt_hook(void) { - do_timer(1); -#ifndef CONFIG_SMP - update_process_times(user_mode_vm(irq_regs)); -#endif - + pit_interrupt_hook(); voyager_timer_interrupt(); } -static inline int do_timer_overflow(int count) -{ - /* can't read the ISR, just assume 1 tick - overflow */ - if(count > LATCH || count < 0) { - printk(KERN_ERR "VOYAGER PROBLEM: count is %d, latch is %d\n", count, LATCH); - count = LATCH; - } - count -= LATCH; - - return count; -} diff --git a/include/asm-i386/mpspec.h b/include/asm-i386/mpspec.h index 770bf6da8c3..f21349399d1 100644 --- a/include/asm-i386/mpspec.h +++ b/include/asm-i386/mpspec.h @@ -23,7 +23,6 @@ extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES]; extern int mpc_default_type; extern unsigned long mp_lapic_addr; extern int pic_mode; -extern int using_apic_timer; #ifdef CONFIG_ACPI extern void mp_register_lapic (u8 id, u8 enabled); diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h index 609a3899475..6db40d0583f 100644 --- a/include/asm-i386/msr.h +++ b/include/asm-i386/msr.h @@ -307,4 +307,7 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val) #define MSR_CORE_PERF_GLOBAL_CTRL 0x38f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x390 +/* Geode defined MSRs */ +#define MSR_GEODE_BUSCONT_CONF0 0x1900 + #endif /* __ASM_MSR_H */ diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h index c13933185c1..e997891cc7c 100644 --- a/include/asm-i386/tsc.h +++ b/include/asm-i386/tsc.h @@ -1,48 +1 @@ -/* - * linux/include/asm-i386/tsc.h - * - * i386 TSC related functions - */ -#ifndef _ASM_i386_TSC_H -#define _ASM_i386_TSC_H - -#include <asm/processor.h> - -/* - * Standard way to access the cycle counter on i586+ CPUs. - * Currently only used on SMP. - * - * If you really have a SMP machine with i486 chips or older, - * compile for that, and this will just always return zero. - * That's ok, it just means that the nicer scheduling heuristics - * won't work for you. - * - * We only use the low 32 bits, and we'd simply better make sure - * that we reschedule before that wraps. Scheduling at least every - * four billion cycles just basically sounds like a good idea, - * regardless of how fast the machine is. - */ -typedef unsigned long long cycles_t; - -extern unsigned int cpu_khz; -extern unsigned int tsc_khz; - -static inline cycles_t get_cycles(void) -{ - unsigned long long ret = 0; - -#ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) - return 0; -#endif - -#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) - rdtscll(ret); -#endif - return ret; -} - -extern void tsc_init(void); -extern void mark_tsc_unstable(void); - -#endif +#include <asm-x86_64/tsc.h> diff --git a/include/asm-x86_64/hpet.h b/include/asm-x86_64/hpet.h index b39098408b6..59a66f08461 100644 --- a/include/asm-x86_64/hpet.h +++ b/include/asm-x86_64/hpet.h @@ -56,8 +56,15 @@ extern int is_hpet_enabled(void); extern int hpet_rtc_timer_init(void); extern int apic_is_clustered_box(void); +extern int hpet_arch_init(void); +extern int hpet_timer_stop_set_go(unsigned long tick); +extern int hpet_reenable(void); +extern unsigned int hpet_calibrate_tsc(void); extern int hpet_use_timer; +extern unsigned long hpet_address; +extern unsigned long hpet_period; +extern unsigned long hpet_tick; #ifdef CONFIG_HPET_EMULATE_RTC extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask); diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index a6d2ff5c69b..f54f3abf93c 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -45,11 +45,7 @@ extern u32 pmtmr_ioport; #else #define pmtmr_ioport 0 #endif -extern unsigned long long monotonic_base; -extern int sysctl_vsyscall; extern int nohpet; -extern unsigned long vxtime_hz; -extern void time_init_gtod(void); extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2))); @@ -91,8 +87,6 @@ extern void check_efer(void); extern int unhandled_signal(struct task_struct *tsk, int sig); -extern int unsynchronized_tsc(void); - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern unsigned long table_start, table_end; diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h index b9e5320b762..8c6808a3fba 100644 --- a/include/asm-x86_64/timex.h +++ b/include/asm-x86_64/timex.h @@ -12,38 +12,21 @@ #include <asm/hpet.h> #include <asm/system.h> #include <asm/processor.h> +#include <asm/tsc.h> #include <linux/compiler.h> #define CLOCK_TICK_RATE PIT_TICK_RATE /* Underlying HZ */ -typedef unsigned long long cycles_t; - -static inline cycles_t get_cycles (void) -{ - unsigned long long ret; - - rdtscll(ret); - return ret; -} - -/* Like get_cycles, but make sure the CPU is synchronized. */ -static __always_inline cycles_t get_cycles_sync(void) -{ - unsigned long long ret; - unsigned eax; - /* Don't do an additional sync on CPUs where we know - RDTSC is already synchronous. */ - alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC, - "=a" (eax), "0" (1) : "ebx","ecx","edx","memory"); - rdtscll(ret); - return ret; -} - -extern unsigned int cpu_khz; - extern int read_current_timer(unsigned long *timer_value); #define ARCH_HAS_READ_CURRENT_TIMER 1 -extern struct vxtime_data vxtime; +#define USEC_PER_TICK (USEC_PER_SEC / HZ) +#define NSEC_PER_TICK (NSEC_PER_SEC / HZ) +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) + +#define NS_SCALE 10 /* 2^10, carefully chosen */ +#define US_SCALE 32 /* 2^32, arbitralrily chosen */ +extern void mark_tsc_unstable(void); +extern void set_cyc2ns_scale(unsigned long khz); #endif diff --git a/include/asm-x86_64/tsc.h b/include/asm-x86_64/tsc.h new file mode 100644 index 00000000000..9a0a368852c --- /dev/null +++ b/include/asm-x86_64/tsc.h @@ -0,0 +1,66 @@ +/* + * linux/include/asm-x86_64/tsc.h + * + * x86_64 TSC related functions + */ +#ifndef _ASM_x86_64_TSC_H +#define _ASM_x86_64_TSC_H + +#include <asm/processor.h> + +/* + * Standard way to access the cycle counter. + */ +typedef unsigned long long cycles_t; + +extern unsigned int cpu_khz; +extern unsigned int tsc_khz; + +static inline cycles_t get_cycles(void) +{ + unsigned long long ret = 0; + +#ifndef CONFIG_X86_TSC + if (!cpu_has_tsc) + return 0; +#endif + +#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) + rdtscll(ret); +#endif + return ret; +} + +/* Like get_cycles, but make sure the CPU is synchronized. */ +static __always_inline cycles_t get_cycles_sync(void) +{ + unsigned long long ret; +#ifdef X86_FEATURE_SYNC_RDTSC + unsigned eax; + + /* + * Don't do an additional sync on CPUs where we know + * RDTSC is already synchronous: + */ + alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC, + "=a" (eax), "0" (1) : "ebx","ecx","edx","memory"); +#else + sync_core(); +#endif + rdtscll(ret); + + return ret; +} + +extern void tsc_init(void); +extern void mark_tsc_unstable(void); +extern int unsynchronized_tsc(void); + +/* + * Boot-time check whether the TSCs are synchronized across + * all CPUs/cores: + */ +extern void check_tsc_sync_source(int cpu); +extern void check_tsc_sync_target(void); + +#endif diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h index 0c7847165ea..82b4afe65c9 100644 --- a/include/asm-x86_64/vsyscall.h +++ b/include/asm-x86_64/vsyscall.h @@ -16,46 +16,27 @@ enum vsyscall_num { #ifdef __KERNEL__ #include <linux/seqlock.h> -#define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16))) #define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16))) #define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) -#define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16))) -#define __section_sysctl_vsyscall __attribute__ ((unused, __section__ (".sysctl_vsyscall"), aligned(16))) -#define __section_xtime __attribute__ ((unused, __section__ (".xtime"), aligned(16))) -#define __section_xtime_lock __attribute__ ((unused, __section__ (".xtime_lock"), aligned(16))) -#define VXTIME_TSC 1 -#define VXTIME_HPET 2 -#define VXTIME_PMTMR 3 +/* Definitions for CONFIG_GENERIC_TIME definitions */ +#define __section_vsyscall_gtod_data __attribute__ \ + ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) +#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 -struct vxtime_data { - long hpet_address; /* HPET base address */ - int last; - unsigned long last_tsc; - long quot; - long tsc_quot; - int mode; -}; - #define hpet_readl(a) readl((const void __iomem *)fix_to_virt(FIX_HPET_BASE) + a) #define hpet_writel(d,a) writel(d, (void __iomem *)fix_to_virt(FIX_HPET_BASE) + a) -/* vsyscall space (readonly) */ -extern struct vxtime_data __vxtime; extern int __vgetcpu_mode; -extern struct timespec __xtime; extern volatile unsigned long __jiffies; -extern struct timezone __sys_tz; -extern seqlock_t __xtime_lock; /* kernel space (writeable) */ -extern struct vxtime_data vxtime; extern int vgetcpu_mode; extern struct timezone sys_tz; -extern int sysctl_vsyscall; +extern struct vsyscall_gtod_data_t vsyscall_gtod_data; #endif /* __KERNEL__ */ diff --git a/include/linux/acpi_pmtmr.h b/include/linux/acpi_pmtmr.h new file mode 100644 index 00000000000..1d0ef1ae803 --- /dev/null +++ b/include/linux/acpi_pmtmr.h @@ -0,0 +1,38 @@ +#ifndef _ACPI_PMTMR_H_ +#define _ACPI_PMTMR_H_ + +#include <linux/clocksource.h> + +/* Number of PMTMR ticks expected during calibration run */ +#define PMTMR_TICKS_PER_SEC 3579545 + +/* limit it to 24 bits */ +#define ACPI_PM_MASK CLOCKSOURCE_MASK(24) + +/* Overrun value */ +#define ACPI_PM_OVRRUN (1<<24) + +#ifdef CONFIG_X86_PM_TIMER + +extern u32 acpi_pm_read_verified(void); +extern u32 pmtmr_ioport; + +static inline u32 acpi_pm_read_early(void) +{ + if (!pmtmr_ioport) + return 0; + /* mask the output to 24 bits */ + return acpi_pm_read_verified() & ACPI_PM_MASK; +} + +#else + +static inline u32 acpi_pm_read_early(void) +{ + return 0; +} + +#endif + +#endif + diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h index a5c8bb5d80b..abc521cfb08 100644 --- a/include/linux/agp_backend.h +++ b/include/linux/agp_backend.h @@ -87,10 +87,15 @@ struct agp_memory { u32 physical; u8 is_bound; u8 is_flushed; + u8 vmalloc_flag; }; #define AGP_NORMAL_MEMORY 0 +#define AGP_USER_TYPES (1 << 16) +#define AGP_USER_MEMORY (AGP_USER_TYPES) +#define AGP_USER_CACHED_MEMORY (AGP_USER_TYPES + 1) + extern struct agp_bridge_data *agp_bridge; extern struct list_head agp_bridges; diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h new file mode 100644 index 00000000000..4ea7e7bcfaf --- /dev/null +++ b/include/linux/clockchips.h @@ -0,0 +1,142 @@ +/* linux/include/linux/clockchips.h + * + * This file contains the structure definitions for clockchips. + * + * If you are not a clockchip, or the time of day code, you should + * not be including this file! + */ +#ifndef _LINUX_CLOCKCHIPS_H +#define _LINUX_CLOCKCHIPS_H + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + +#include <linux/clocksource.h> +#include <linux/cpumask.h> +#include <linux/ktime.h> +#include <linux/notifier.h> + +struct clock_event_device; + +/* Clock event mode commands */ +enum clock_event_mode { + CLOCK_EVT_MODE_UNUSED = 0, + CLOCK_EVT_MODE_SHUTDOWN, + CLOCK_EVT_MODE_PERIODIC, + CLOCK_EVT_MODE_ONESHOT, +}; + +/* Clock event notification values */ +enum clock_event_nofitiers { + CLOCK_EVT_NOTIFY_ADD, + CLOCK_EVT_NOTIFY_BROADCAST_ON, + CLOCK_EVT_NOTIFY_BROADCAST_OFF, + CLOCK_EVT_NOTIFY_BROADCAST_ENTER, + CLOCK_EVT_NOTIFY_BROADCAST_EXIT, + CLOCK_EVT_NOTIFY_SUSPEND, + CLOCK_EVT_NOTIFY_RESUME, + CLOCK_EVT_NOTIFY_CPU_DEAD, +}; + +/* + * Clock event features + */ +#define CLOCK_EVT_FEAT_PERIODIC 0x000001 +#define CLOCK_EVT_FEAT_ONESHOT 0x000002 +/* + * x86(64) specific misfeatures: + * + * - Clockevent source stops in C3 State and needs broadcast support. + * - Local APIC timer is used as a dummy device. + */ +#define CLOCK_EVT_FEAT_C3STOP 0x000004 +#define CLOCK_EVT_FEAT_DUMMY 0x000008 + +/** + * struct clock_event_device - clock event device descriptor + * @name: ptr to clock event name + * @hints: usage hints + * @max_delta_ns: maximum delta value in ns + * @min_delta_ns: minimum delta value in ns + * @mult: nanosecond to cycles multiplier + * @shift: nanoseconds to cycles divisor (power of two) + * @rating: variable to rate clock event devices + * @irq: irq number (only for non cpu local devices) + * @cpumask: cpumask to indicate for which cpus this device works + * @set_next_event: set next event + * @set_mode: set mode function + * @evthandler: Assigned by the framework to be called by the low + * level handler of the event source + * @broadcast: function to broadcast events + * @list: list head for the management code + * @mode: operating mode assigned by the management code + * @next_event: local storage for the next event in oneshot mode + */ +struct clock_event_device { + const char *name; + unsigned int features; + unsigned long max_delta_ns; + unsigned long min_delta_ns; + unsigned long mult; + int shift; + int rating; + int irq; + cpumask_t cpumask; + int (*set_next_event)(unsigned long evt, + struct clock_event_device *); + void (*set_mode)(enum clock_event_mode mode, + struct clock_event_device *); + void (*event_handler)(struct clock_event_device *); + void (*broadcast)(cpumask_t mask); + struct list_head list; + enum clock_event_mode mode; + ktime_t next_event; +}; + +/* + * Calculate a multiplication factor for scaled math, which is used to convert + * nanoseconds based values to clock ticks: + * + * clock_ticks = (nanoseconds * factor) >> shift. + * + * div_sc is the rearranged equation to calculate a factor from a given clock + * ticks / nanoseconds ratio: + * + * factor = (clock_ticks << shift) / nanoseconds + */ +static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec, + int shift) +{ + uint64_t tmp = ((uint64_t)ticks) << shift; + + do_div(tmp, nsec); + return (unsigned long) tmp; +} + +/* Clock event layer functions */ +extern unsigned long clockevent_delta2ns(unsigned long latch, + struct clock_event_device *evt); +extern void clockevents_register_device(struct clock_event_device *dev); + +extern void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new); +extern +struct clock_event_device *clockevents_request_device(unsigned int features, + cpumask_t cpumask); +extern void clockevents_release_device(struct clock_event_device *dev); +extern void clockevents_set_mode(struct clock_event_device *dev, + enum clock_event_mode mode); +extern int clockevents_register_notifier(struct notifier_block *nb); +extern void clockevents_unregister_notifier(struct notifier_block *nb); +extern int clockevents_program_event(struct clock_event_device *dev, + ktime_t expires, ktime_t now); + +extern void clockevents_notify(unsigned long reason, void *arg); + +#else + +static inline void clockevents_resume_events(void) { } +#define clockevents_notify(reason, arg) do { } while (0) + +#endif + +#endif diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 1622d23a8dc..daa4940cc0f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -12,11 +12,13 @@ #include <linux/timex.h> #include <linux/time.h> #include <linux/list.h> +#include <linux/timer.h> #include <asm/div64.h> #include <asm/io.h> /* clocksource cycle base type */ typedef u64 cycle_t; +struct clocksource; /** * struct clocksource - hardware abstraction for a free running counter @@ -44,8 +46,8 @@ typedef u64 cycle_t; * subtraction of non 64 bit counters * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) - * @update_callback: called when safe to alter clocksource values - * @is_continuous: defines if clocksource is free-running. + * @flags: flags describing special properties + * @vread: vsyscall based read * @cycle_interval: Used internally by timekeeping core, please ignore. * @xtime_interval: Used internally by timekeeping core, please ignore. */ @@ -57,15 +59,30 @@ struct clocksource { cycle_t mask; u32 mult; u32 shift; - int (*update_callback)(void); - int is_continuous; + unsigned long flags; + cycle_t (*vread)(void); /* timekeeping specific data, ignore */ cycle_t cycle_last, cycle_interval; u64 xtime_nsec, xtime_interval; s64 error; + +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG + /* Watchdog related data, used by the framework */ + struct list_head wd_list; + cycle_t wd_last; +#endif }; +/* + * Clock source flags bits:: + */ +#define CLOCK_SOURCE_IS_CONTINUOUS 0x01 +#define CLOCK_SOURCE_MUST_VERIFY 0x02 + +#define CLOCK_SOURCE_WATCHDOG 0x10 +#define CLOCK_SOURCE_VALID_FOR_HRES 0x20 + /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) (cycle_t)(bits<64 ? ((1ULL<<bits)-1) : -1) @@ -178,8 +195,16 @@ static inline void clocksource_calculate_interval(struct clocksource *c, /* used to install a new clocksource */ -int clocksource_register(struct clocksource*); -void clocksource_reselect(void); -struct clocksource* clocksource_get_next(void); +extern int clocksource_register(struct clocksource*); +extern struct clocksource* clocksource_get_next(void); +extern void clocksource_change_rating(struct clocksource *cs, int rating); + +#ifdef CONFIG_GENERIC_TIME_VSYSCALL +extern void update_vsyscall(struct timespec *ts, struct clocksource *c); +#else +static inline void update_vsyscall(struct timespec *ts, struct clocksource *c) +{ +} +#endif #endif /* _LINUX_CLOCKSOURCE_H */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7f008f6bfdc..0899e2cdcdd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -84,9 +84,6 @@ struct cpufreq_policy { unsigned int policy; /* see above */ struct cpufreq_governor *governor; /* see below */ - struct mutex lock; /* CPU ->setpolicy or ->target may - only be called once a time */ - struct work_struct update; /* if update_policy() needs to be * called, but you're in IRQ context */ @@ -172,11 +169,16 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int relation); -extern int cpufreq_driver_getavg(struct cpufreq_policy *policy); +extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy); int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); +int lock_policy_rwsem_read(int cpu); +int lock_policy_rwsem_write(int cpu); +void unlock_policy_rwsem_read(int cpu); +void unlock_policy_rwsem_write(int cpu); + /********************************************************************* * CPUFREQ DRIVER INTERFACE * diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 612472aaa79..7803014f3a1 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -106,7 +106,7 @@ static inline void account_system_vtime(struct task_struct *tsk) * always balanced, so the interrupted value of ->hardirq_context * will always be restored. */ -#define irq_enter() \ +#define __irq_enter() \ do { \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ @@ -114,6 +114,11 @@ static inline void account_system_vtime(struct task_struct *tsk) } while (0) /* + * Enter irq context (on NO_HZ, update jiffies): + */ +extern void irq_enter(void); + +/* * Exit irq context without processing softirqs: */ #define __irq_exit() \ @@ -128,7 +133,7 @@ static inline void account_system_vtime(struct task_struct *tsk) */ extern void irq_exit(void); -#define nmi_enter() do { lockdep_off(); irq_enter(); } while (0) +#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0) #define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index fca93025ab5..37f9279192a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -21,22 +21,72 @@ #include <linux/list.h> #include <linux/wait.h> +struct hrtimer_clock_base; +struct hrtimer_cpu_base; + /* * Mode arguments of xxx_hrtimer functions: */ enum hrtimer_mode { - HRTIMER_ABS, /* Time value is absolute */ - HRTIMER_REL, /* Time value is relative to now */ + HRTIMER_MODE_ABS, /* Time value is absolute */ + HRTIMER_MODE_REL, /* Time value is relative to now */ }; +/* + * Return values for the callback function + */ enum hrtimer_restart { - HRTIMER_NORESTART, - HRTIMER_RESTART, + HRTIMER_NORESTART, /* Timer is not restarted */ + HRTIMER_RESTART, /* Timer must be restarted */ }; -#define HRTIMER_INACTIVE ((void *)1UL) +/* + * hrtimer callback modes: + * + * HRTIMER_CB_SOFTIRQ: Callback must run in softirq context + * HRTIMER_CB_IRQSAFE: Callback may run in hardirq context + * HRTIMER_CB_IRQSAFE_NO_RESTART: Callback may run in hardirq context and + * does not restart the timer + * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: Callback must run in softirq context + * Special mode for tick emultation + */ +enum hrtimer_cb_mode { + HRTIMER_CB_SOFTIRQ, + HRTIMER_CB_IRQSAFE, + HRTIMER_CB_IRQSAFE_NO_RESTART, + HRTIMER_CB_IRQSAFE_NO_SOFTIRQ, +}; -struct hrtimer_base; +/* + * Values to track state of the timer + * + * Possible states: + * + * 0x00 inactive + * 0x01 enqueued into rbtree + * 0x02 callback function running + * 0x04 callback pending (high resolution mode) + * + * Special case: + * 0x03 callback function running and enqueued + * (was requeued on another CPU) + * The "callback function running and enqueued" status is only possible on + * SMP. It happens for example when a posix timer expired and the callback + * queued a signal. Between dropping the lock which protects the posix timer + * and reacquiring the base lock of the hrtimer, another CPU can deliver the + * signal and rearm the timer. We have to preserve the callback running state, + * as otherwise the timer could be removed before the softirq code finishes the + * the handling of the timer. + * + * The HRTIMER_STATE_ENQUEUE bit is always or'ed to the current state to + * preserve the HRTIMER_STATE_CALLBACK bit in the above scenario. + * + * All state transitions are protected by cpu_base->lock. + */ +#define HRTIMER_STATE_INACTIVE 0x00 +#define HRTIMER_STATE_ENQUEUED 0x01 +#define HRTIMER_STATE_CALLBACK 0x02 +#define HRTIMER_STATE_PENDING 0x04 /** * struct hrtimer - the basic hrtimer structure @@ -46,14 +96,34 @@ struct hrtimer_base; * which the timer is based. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) + * @state: state information (See bit values above) + * @cb_mode: high resolution timer feature to select the callback execution + * mode + * @cb_entry: list head to enqueue an expired timer into the callback list + * @start_site: timer statistics field to store the site where the timer + * was started + * @start_comm: timer statistics field to store the name of the process which + * started the timer + * @start_pid: timer statistics field to store the pid of the task which + * started the timer * - * The hrtimer structure must be initialized by init_hrtimer_#CLOCKTYPE() + * The hrtimer structure must be initialized by hrtimer_init() */ struct hrtimer { - struct rb_node node; - ktime_t expires; - int (*function)(struct hrtimer *); - struct hrtimer_base *base; + struct rb_node node; + ktime_t expires; + enum hrtimer_restart (*function)(struct hrtimer *); + struct hrtimer_clock_base *base; + unsigned long state; +#ifdef CONFIG_HIGH_RES_TIMERS + enum hrtimer_cb_mode cb_mode; + struct list_head cb_entry; +#endif +#ifdef CONFIG_TIMER_STATS + void *start_site; + char start_comm[16]; + int start_pid; +#endif }; /** @@ -70,37 +140,114 @@ struct hrtimer_sleeper { /** * struct hrtimer_base - the timer base for a specific clock - * @index: clock type index for per_cpu support when moving a timer - * to a base on another cpu. - * @lock: lock protecting the base and associated timers + * @index: clock type index for per_cpu support when moving a + * timer to a base on another cpu. * @active: red black tree root node for the active timers * @first: pointer to the timer node which expires first * @resolution: the resolution of the clock, in nanoseconds * @get_time: function to retrieve the current time of the clock * @get_softirq_time: function to retrieve the current time from the softirq - * @curr_timer: the timer which is executing a callback right now * @softirq_time: the time when running the hrtimer queue in the softirq - * @lock_key: the lock_class_key for use with lockdep + * @cb_pending: list of timers where the callback is pending + * @offset: offset of this clock to the monotonic base + * @reprogram: function to reprogram the timer event */ -struct hrtimer_base { +struct hrtimer_clock_base { + struct hrtimer_cpu_base *cpu_base; clockid_t index; - spinlock_t lock; struct rb_root active; struct rb_node *first; ktime_t resolution; ktime_t (*get_time)(void); ktime_t (*get_softirq_time)(void); - struct hrtimer *curr_timer; ktime_t softirq_time; - struct lock_class_key lock_key; +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t offset; + int (*reprogram)(struct hrtimer *t, + struct hrtimer_clock_base *b, + ktime_t n); +#endif +}; + +#define HRTIMER_MAX_CLOCK_BASES 2 + +/* + * struct hrtimer_cpu_base - the per cpu clock bases + * @lock: lock protecting the base and associated clock bases + * and timers + * @lock_key: the lock_class_key for use with lockdep + * @clock_base: array of clock bases for this cpu + * @curr_timer: the timer which is executing a callback right now + * @expires_next: absolute time of the next event which was scheduled + * via clock_set_next_event() + * @hres_active: State of high resolution mode + * @check_clocks: Indictator, when set evaluate time source and clock + * event devices whether high resolution mode can be + * activated. + * @cb_pending: Expired timers are moved from the rbtree to this + * list in the timer interrupt. The list is processed + * in the softirq. + * @nr_events: Total number of timer interrupt events + */ +struct hrtimer_cpu_base { + spinlock_t lock; + struct lock_class_key lock_key; + struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires_next; + int hres_active; + struct list_head cb_pending; + unsigned long nr_events; +#endif }; +#ifdef CONFIG_HIGH_RES_TIMERS +struct clock_event_device; + +extern void clock_was_set(void); +extern void hrtimer_interrupt(struct clock_event_device *dev); + +/* + * In high resolution mode the time reference must be read accurate + */ +static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) +{ + return timer->base->get_time(); +} + +/* + * The resolution of the clocks. The resolution value is returned in + * the clock_getres() system call to give application programmers an + * idea of the (in)accuracy of timers. Timer values are rounded up to + * this resolution values. + */ +# define KTIME_HIGH_RES (ktime_t) { .tv64 = 1 } +# define KTIME_MONOTONIC_RES KTIME_HIGH_RES + +#else + +# define KTIME_MONOTONIC_RES KTIME_LOW_RES + /* * clock_was_set() is a NOP for non- high-resolution systems. The * time-sorted order guarantees that a timer does not expire early and * is expired in the next softirq when the clock was advanced. */ -#define clock_was_set() do { } while (0) +static inline void clock_was_set(void) { } + +/* + * In non high resolution mode the time reference is taken from + * the base softirq time variable. + */ +static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) +{ + return timer->base->softirq_time; +} + +#endif + +extern ktime_t ktime_get(void); +extern ktime_t ktime_get_real(void); /* Exported timer functions: */ @@ -114,19 +261,33 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); -#define hrtimer_restart(timer) hrtimer_start((timer), (timer)->expires, HRTIMER_ABS) +static inline int hrtimer_restart(struct hrtimer *timer) +{ + return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); +} /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); -#ifdef CONFIG_NO_IDLE_HZ extern ktime_t hrtimer_get_next_event(void); -#endif +/* + * A timer is active, when it is enqueued into the rbtree or the callback + * function is running. + */ static inline int hrtimer_active(const struct hrtimer *timer) { - return rb_parent(&timer->node) != &timer->node; + return timer->state != HRTIMER_STATE_INACTIVE; +} + +/* + * Helper function to check, whether the timer is on one of the queues + */ +static inline int hrtimer_is_queued(struct hrtimer *timer) +{ + return timer->state & + (HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING); } /* Forward a hrtimer so it expires after now: */ @@ -149,4 +310,53 @@ extern void hrtimer_run_queues(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); +#if BITS_PER_LONG < 64 +extern unsigned long ktime_divns(const ktime_t kt, s64 div); +#else /* BITS_PER_LONG < 64 */ +# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) +#endif + +/* Show pending timers: */ +extern void sysrq_timer_list_show(void); + +/* + * Timer-statistics info: + */ +#ifdef CONFIG_TIMER_STATS + +extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf, + void *timerf, char * comm); + +static inline void timer_stats_account_hrtimer(struct hrtimer *timer) +{ + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm); +} + +extern void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, + void *addr); + +static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) +{ + __timer_stats_hrtimer_set_start_info(timer, __builtin_return_address(0)); +} + +static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) +{ + timer->start_site = NULL; +} +#else +static inline void timer_stats_account_hrtimer(struct hrtimer *timer) +{ +} + +static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) +{ +} + +static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) +{ +} +#endif + #endif diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5a8ba0b8ccb..e5ea1411050 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -42,6 +42,8 @@ * IRQF_SHARED - allow sharing the irq among several devices * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur * IRQF_TIMER - Flag to mark this interrupt as timer interrupt + * IRQF_PERCPU - Interrupt is per cpu + * IRQF_NOBALANCING - Flag to exclude this interrupt from irq balancing */ #define IRQF_DISABLED 0x00000020 #define IRQF_SAMPLE_RANDOM 0x00000040 @@ -49,6 +51,7 @@ #define IRQF_PROBE_SHARED 0x00000100 #define IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 +#define IRQF_NOBALANCING 0x00000800 /* * Migration helpers. Scheduled for removal in 1/2007 @@ -239,6 +242,9 @@ enum BLOCK_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, +#ifdef CONFIG_HIGH_RES_TIMERS + HRTIMER_SOFTIRQ, +#endif }; /* softirq mask and active fields moved to irq_cpustat_t in diff --git a/include/linux/irq.h b/include/linux/irq.h index 5504b671357..1939d42c21d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -31,7 +31,7 @@ typedef void fastcall (*irq_flow_handler_t)(unsigned int irq, /* * IRQ line status. * - * Bits 0-16 are reserved for the IRQF_* bits in linux/interrupt.h + * Bits 0-7 are reserved for the IRQF_* bits in linux/interrupt.h * * IRQ types */ @@ -45,28 +45,30 @@ typedef void fastcall (*irq_flow_handler_t)(unsigned int irq, #define IRQ_TYPE_PROBE 0x00000010 /* Probing in progress */ /* Internal flags */ -#define IRQ_INPROGRESS 0x00010000 /* IRQ handler active - do not enter! */ -#define IRQ_DISABLED 0x00020000 /* IRQ disabled - do not enter! */ -#define IRQ_PENDING 0x00040000 /* IRQ pending - replay on enable */ -#define IRQ_REPLAY 0x00080000 /* IRQ has been replayed but not acked yet */ -#define IRQ_AUTODETECT 0x00100000 /* IRQ is being autodetected */ -#define IRQ_WAITING 0x00200000 /* IRQ not yet seen - for autodetection */ -#define IRQ_LEVEL 0x00400000 /* IRQ level triggered */ -#define IRQ_MASKED 0x00800000 /* IRQ masked - shouldn't be seen again */ -#define IRQ_PER_CPU 0x01000000 /* IRQ is per CPU */ +#define IRQ_INPROGRESS 0x00000100 /* IRQ handler active - do not enter! */ +#define IRQ_DISABLED 0x00000200 /* IRQ disabled - do not enter! */ +#define IRQ_PENDING 0x00000400 /* IRQ pending - replay on enable */ +#define IRQ_REPLAY 0x00000800 /* IRQ has been replayed but not acked yet */ +#define IRQ_AUTODETECT 0x00001000 /* IRQ is being autodetected */ +#define IRQ_WAITING 0x00002000 /* IRQ not yet seen - for autodetection */ +#define IRQ_LEVEL 0x00004000 /* IRQ level triggered */ +#define IRQ_MASKED 0x00008000 /* IRQ masked - shouldn't be seen again */ +#define IRQ_PER_CPU 0x00010000 /* IRQ is per CPU */ +#define IRQ_NOPROBE 0x00020000 /* IRQ is not valid for probing */ +#define IRQ_NOREQUEST 0x00040000 /* IRQ cannot be requested */ +#define IRQ_NOAUTOEN 0x00080000 /* IRQ will not be enabled on request irq */ +#define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */ +#define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */ +#define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ + #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) +# define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) #else # define CHECK_IRQ_PER_CPU(var) 0 +# define IRQ_NO_BALANCING_MASK IRQ_NO_BALANCING #endif -#define IRQ_NOPROBE 0x02000000 /* IRQ is not valid for probing */ -#define IRQ_NOREQUEST 0x04000000 /* IRQ cannot be requested */ -#define IRQ_NOAUTOEN 0x08000000 /* IRQ will not be enabled on request irq */ -#define IRQ_DELAYED_DISABLE 0x10000000 /* IRQ disable (masking) happens delayed. */ -#define IRQ_WAKEUP 0x20000000 /* IRQ triggers system wakeup */ -#define IRQ_MOVE_PENDING 0x40000000 /* need to re-target IRQ destination */ - struct proc_dir_entry; struct msi_desc; @@ -127,6 +129,7 @@ struct irq_chip { * * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] * @chip: low level interrupt hardware access + * @msi_desc: MSI descriptor * @handler_data: per-IRQ data for the irq_chip methods * @chip_data: platform-specific per-chip private data for the chip * methods, to allow shared chip implementations @@ -235,11 +238,21 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask) #endif /* CONFIG_GENERIC_PENDING_IRQ */ +extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask); +extern int irq_can_set_affinity(unsigned int irq); + #else /* CONFIG_SMP */ #define move_native_irq(x) #define move_masked_irq(x) +static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +{ + return -EINVAL; +} + +static inline int irq_can_set_affinity(unsigned int irq) { return 0; } + #endif /* CONFIG_SMP */ #ifdef CONFIG_IRQBALANCE @@ -261,6 +274,11 @@ static inline int select_smp_affinity(unsigned int irq) extern int no_irq_affinity; +static inline int irq_balancing_disabled(unsigned int irq) +{ + return irq_desc[irq].status & IRQ_NO_BALANCING_MASK; +} + /* Handle irq action chains: */ extern int handle_IRQ_event(unsigned int irq, struct irqaction *action); diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 0ec6e28bccd..c080f61fb02 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -142,13 +142,13 @@ static inline u64 get_jiffies_64(void) * * And some not so obvious. * - * Note that we don't want to return MAX_LONG, because + * Note that we don't want to return LONG_MAX, because * for various timeout reasons we often end up having * to wait "jiffies+1" in order to guarantee that we wait * at _least_ "jiffies" - so "jiffies+1" had better still * be positive. */ -#define MAX_JIFFY_OFFSET ((~0UL >> 1)-1) +#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1) /* * We want to do realistic conversions of time so we need to use the same @@ -259,207 +259,23 @@ static inline u64 get_jiffies_64(void) #endif /* - * Convert jiffies to milliseconds and back. - * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -static inline unsigned int jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else - return (j * MSEC_PER_SEC) / HZ; -#endif -} - -static inline unsigned int jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else - return (j * USEC_PER_SEC) / HZ; -#endif -} - -static inline unsigned long msecs_to_jiffies(const unsigned int m) -{ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return m * (HZ / MSEC_PER_SEC); -#else - return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; -#endif -} - -static inline unsigned long usecs_to_jiffies(const unsigned int u) -{ - if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; -#endif -} - -/* - * The TICK_NSEC - 1 rounds up the value to the next resolution. Note - * that a remainder subtract here would not do the right thing as the - * resolution values don't fall on second boundries. I.e. the line: - * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. - * - * Rather, we just shift the bits off the right. - * - * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec - * value to a scaled second value. - */ -static __inline__ unsigned long -timespec_to_jiffies(const struct timespec *value) -{ - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - nsec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)nsec * NSEC_CONVERSION) >> - (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; - -} - -static __inline__ void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u64 nsec = (u64)jiffies * TICK_NSEC; - value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); -} - -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. - */ -static __inline__ unsigned long -timeval_to_jiffies(const struct timeval *value) -{ - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; -} - -static __inline__ void -jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u64 nsec = (u64)jiffies * TICK_NSEC; - long tv_usec; - - value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); - tv_usec /= NSEC_PER_USEC; - value->tv_usec = tv_usec; -} - -/* - * Convert jiffies/jiffies_64 to clock_t and back. + * Convert various time units to each other: */ -static inline clock_t jiffies_to_clock_t(long x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 - return x / (HZ / USER_HZ); -#else - u64 tmp = (u64)x * TICK_NSEC; - do_div(tmp, (NSEC_PER_SEC / USER_HZ)); - return (long)tmp; -#endif -} - -static inline unsigned long clock_t_to_jiffies(unsigned long x) -{ -#if (HZ % USER_HZ)==0 - if (x >= ~0UL / (HZ / USER_HZ)) - return ~0UL; - return x * (HZ / USER_HZ); -#else - u64 jif; - - /* Don't worry about loss of precision here .. */ - if (x >= ~0UL / HZ * USER_HZ) - return ~0UL; - - /* .. but do try to contain it here */ - jif = x * (u64) HZ; - do_div(jif, USER_HZ); - return jif; -#endif -} - -static inline u64 jiffies_64_to_clock_t(u64 x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 - do_div(x, HZ / USER_HZ); -#else - /* - * There are better ways that don't overflow early, - * but even this doesn't overflow in hundreds of years - * in 64 bits, so.. - */ - x *= TICK_NSEC; - do_div(x, (NSEC_PER_SEC / USER_HZ)); -#endif - return x; -} - -static inline u64 nsec_to_clock_t(u64 x) -{ -#if (NSEC_PER_SEC % USER_HZ) == 0 - do_div(x, (NSEC_PER_SEC / USER_HZ)); -#elif (USER_HZ % 512) == 0 - x *= USER_HZ/512; - do_div(x, (NSEC_PER_SEC / 512)); -#else - /* - * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, - * overflow after 64.99 years. - * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... - */ - x *= 9; - do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) - / USER_HZ)); -#endif - return x; -} +extern unsigned int jiffies_to_msecs(const unsigned long j); +extern unsigned int jiffies_to_usecs(const unsigned long j); +extern unsigned long msecs_to_jiffies(const unsigned int m); +extern unsigned long usecs_to_jiffies(const unsigned int u); +extern unsigned long timespec_to_jiffies(const struct timespec *value); +extern void jiffies_to_timespec(const unsigned long jiffies, + struct timespec *value); +extern unsigned long timeval_to_jiffies(const struct timeval *value); +extern void jiffies_to_timeval(const unsigned long jiffies, + struct timeval *value); +extern clock_t jiffies_to_clock_t(long x); +extern unsigned long clock_t_to_jiffies(unsigned long x); +extern u64 jiffies_64_to_clock_t(u64 x); +extern u64 nsec_to_clock_t(u64 x); + +#define TIMESTAMP_SIZE 30 #endif diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 7444a632623..c68c7ac6b23 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -261,8 +261,7 @@ static inline s64 ktime_to_ns(const ktime_t kt) * idea of the (in)accuracy of timers. Timer values are rounded up to * this resolution values. */ -#define KTIME_REALTIME_RES (ktime_t){ .tv64 = TICK_NSEC } -#define KTIME_MONOTONIC_RES (ktime_t){ .tv64 = TICK_NSEC } +#define KTIME_LOW_RES (ktime_t){ .tv64 = TICK_NSEC } /* Get the monotonic time in timespec format: */ extern void ktime_get_ts(struct timespec *ts); diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index db05182ca0e..1be5be88deb 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -105,12 +105,11 @@ struct nfs4_ace { uint32_t access_mask; int whotype; uid_t who; - struct list_head l_ace; }; struct nfs4_acl { uint32_t naces; - struct list_head ace_head; + struct nfs4_ace aces[0]; }; typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; diff --git a/include/linux/nfs4_acl.h b/include/linux/nfs4_acl.h index 22aff4d01f2..409b6e02f33 100644 --- a/include/linux/nfs4_acl.h +++ b/include/linux/nfs4_acl.h @@ -39,9 +39,12 @@ #include <linux/posix_acl.h> -struct nfs4_acl *nfs4_acl_new(void); -void nfs4_acl_free(struct nfs4_acl *); -int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); +/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to + * fit in a page: */ +#define NFS4_ACL_MAX 170 + +struct nfs4_acl *nfs4_acl_new(int); +void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); int nfs4_acl_get_whotype(char *, u32); int nfs4_acl_write_who(int who, char *p); int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, diff --git a/include/linux/tick.h b/include/linux/tick.h new file mode 100644 index 00000000000..9a7252e089b --- /dev/null +++ b/include/linux/tick.h @@ -0,0 +1,109 @@ +/* linux/include/linux/tick.h + * + * This file contains the structure definitions for tick related functions + * + */ +#ifndef _LINUX_TICK_H +#define _LINUX_TICK_H + +#include <linux/clockchips.h> + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + +enum tick_device_mode { + TICKDEV_MODE_PERIODIC, + TICKDEV_MODE_ONESHOT, +}; + +struct tick_device { + struct clock_event_device *evtdev; + enum tick_device_mode mode; +}; + +enum tick_nohz_mode { + NOHZ_MODE_INACTIVE, + NOHZ_MODE_LOWRES, + NOHZ_MODE_HIGHRES, +}; + +/** + * struct tick_sched - sched tick emulation and no idle tick control/stats + * @sched_timer: hrtimer to schedule the periodic tick in high + * resolution mode + * @idle_tick: Store the last idle tick expiry time when the tick + * timer is modified for idle sleeps. This is necessary + * to resume the tick timer operation in the timeline + * when the CPU returns from idle + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_jiffies: jiffies at the entry to idle for idle time accounting + * @idle_calls: Total number of idle calls + * @idle_sleeps: Number of idle calls, where the sched tick was stopped + * @idle_entrytime: Time when the idle call was entered + * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + */ +struct tick_sched { + struct hrtimer sched_timer; + unsigned long check_clocks; + enum tick_nohz_mode nohz_mode; + ktime_t idle_tick; + int tick_stopped; + unsigned long idle_jiffies; + unsigned long idle_calls; + unsigned long idle_sleeps; + ktime_t idle_entrytime; + ktime_t idle_sleeptime; + unsigned long last_jiffies; + unsigned long next_jiffies; + ktime_t idle_expires; +}; + +extern void __init tick_init(void); +extern int tick_is_oneshot_available(void); +extern struct tick_device *tick_get_device(int cpu); + +# ifdef CONFIG_HIGH_RES_TIMERS +extern int tick_init_highres(void); +extern int tick_program_event(ktime_t expires, int force); +extern void tick_setup_sched_timer(void); +extern void tick_cancel_sched_timer(int cpu); +# else +static inline void tick_cancel_sched_timer(int cpu) { } +# endif /* HIGHRES */ + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern struct tick_device *tick_get_broadcast_device(void); +extern cpumask_t *tick_get_broadcast_mask(void); + +# ifdef CONFIG_TICK_ONESHOT +extern cpumask_t *tick_get_broadcast_oneshot_mask(void); +# endif + +# endif /* BROADCAST */ + +# ifdef CONFIG_TICK_ONESHOT +extern void tick_clock_notify(void); +extern int tick_check_oneshot_change(int allow_nohz); +extern struct tick_sched *tick_get_tick_sched(int cpu); +# else +static inline void tick_clock_notify(void) { } +static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +# endif + +#else /* CONFIG_GENERIC_CLOCKEVENTS */ +static inline void tick_init(void) { } +static inline void tick_cancel_sched_timer(int cpu) { } +static inline void tick_clock_notify(void) { } +static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +#endif /* !CONFIG_GENERIC_CLOCKEVENTS */ + +# ifdef CONFIG_NO_HZ +extern void tick_nohz_stop_sched_tick(void); +extern void tick_nohz_restart_sched_tick(void); +extern void tick_nohz_update_jiffies(void); +# else +static inline void tick_nohz_stop_sched_tick(void) { } +static inline void tick_nohz_restart_sched_tick(void) { } +static inline void tick_nohz_update_jiffies(void) { } +# endif /* !NO_HZ */ + +#endif diff --git a/include/linux/time.h b/include/linux/time.h index eceb1a59b07..8ea8dea713c 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -92,6 +92,7 @@ extern struct timespec xtime; extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock __attribute__((weak)); +extern unsigned long read_persistent_clock(void); void timekeeping_init(void); static inline unsigned long get_seconds(void) diff --git a/include/linux/timer.h b/include/linux/timer.h index fb5edaaf0eb..719113b652d 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -2,6 +2,7 @@ #define _LINUX_TIMER_H #include <linux/list.h> +#include <linux/ktime.h> #include <linux/spinlock.h> #include <linux/stddef.h> @@ -15,6 +16,11 @@ struct timer_list { unsigned long data; struct tvec_t_base_s *base; +#ifdef CONFIG_TIMER_STATS + void *start_site; + char start_comm[16]; + int start_pid; +#endif }; extern struct tvec_t_base_s boot_tvec_bases; @@ -61,7 +67,65 @@ extern int del_timer(struct timer_list * timer); extern int __mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer(struct timer_list *timer, unsigned long expires); +/* + * Return when the next timer-wheel timeout occurs (in absolute jiffies), + * locks the timer base: + */ extern unsigned long next_timer_interrupt(void); +/* + * Return when the next timer-wheel timeout occurs (in absolute jiffies), + * locks the timer base and does the comparison against the given + * jiffie. + */ +extern unsigned long get_next_timer_interrupt(unsigned long now); + +/* + * Timer-statistics info: + */ +#ifdef CONFIG_TIMER_STATS + +extern void init_timer_stats(void); + +extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf, + void *timerf, char * comm); + +static inline void timer_stats_account_timer(struct timer_list *timer) +{ + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm); +} + +extern void __timer_stats_timer_set_start_info(struct timer_list *timer, + void *addr); + +static inline void timer_stats_timer_set_start_info(struct timer_list *timer) +{ + __timer_stats_timer_set_start_info(timer, __builtin_return_address(0)); +} + +static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) +{ + timer->start_site = NULL; +} +#else +static inline void init_timer_stats(void) +{ +} + +static inline void timer_stats_account_timer(struct timer_list *timer) +{ +} + +static inline void timer_stats_timer_set_start_info(struct timer_list *timer) +{ +} + +static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) +{ +} +#endif + +extern void delayed_work_timer_fn(unsigned long __data); /** * add_timer - start a timer @@ -96,7 +160,7 @@ static inline void add_timer(struct timer_list *timer) extern void init_timers(void); extern void run_local_timers(void); struct hrtimer; -extern int it_real_fn(struct hrtimer *); +extern enum hrtimer_restart it_real_fn(struct hrtimer *); unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); diff --git a/include/linux/timex.h b/include/linux/timex.h index 9a24e500c31..da929dbbea2 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -286,6 +286,13 @@ static inline void time_interpolator_update(long delta_nsec) #define TICK_LENGTH_SHIFT 32 +#ifdef CONFIG_NO_HZ +#define NTP_INTERVAL_FREQ (2) +#else +#define NTP_INTERVAL_FREQ (HZ) +#endif +#define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ) + /* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */ extern u64 current_tick_length(void); diff --git a/init/main.c b/init/main.c index 2421e154412..953500b02ac 100644 --- a/init/main.c +++ b/init/main.c @@ -40,6 +40,7 @@ #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/efi.h> +#include <linux/tick.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/unistd.h> @@ -515,6 +516,7 @@ asmlinkage void __init start_kernel(void) * enable them */ lock_kernel(); + tick_init(); boot_cpu_init(); page_address_init(); printk(KERN_NOTICE); diff --git a/kernel/fork.c b/kernel/fork.c index 0b6293d94d9..d154cc78648 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); + hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; sig->tsk = tsk; diff --git a/kernel/futex.c b/kernel/futex.c index 5a737de857d..e749e7df14b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (sec != MAX_SCHEDULE_TIMEOUT) { to = &timeout; - hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); to->timer.expires = ktime_set(sec, nsec); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f44e499e8fc..476cb0c0b4a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1,8 +1,9 @@ /* * linux/kernel/hrtimer.c * - * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> - * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * @@ -31,12 +32,17 @@ */ #include <linux/cpu.h> +#include <linux/irq.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> +#include <linux/kallsyms.h> #include <linux/interrupt.h> +#include <linux/tick.h> +#include <linux/seq_file.h> +#include <linux/err.h> #include <asm/uaccess.h> @@ -45,7 +51,7 @@ * * returns the time in ktime_t format */ -static ktime_t ktime_get(void) +ktime_t ktime_get(void) { struct timespec now; @@ -59,7 +65,7 @@ static ktime_t ktime_get(void) * * returns the time in ktime_t format */ -static ktime_t ktime_get_real(void) +ktime_t ktime_get_real(void) { struct timespec now; @@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real); * This ensures that we capture erroneous accesses to these clock ids * rather than moving them into the range of valid clock id's. */ - -#define MAX_HRTIMER_BASES 2 - -static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = +DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { + + .clock_base = { - .index = CLOCK_REALTIME, - .get_time = &ktime_get_real, - .resolution = KTIME_REALTIME_RES, - }, - { - .index = CLOCK_MONOTONIC, - .get_time = &ktime_get, - .resolution = KTIME_MONOTONIC_RES, - }, + { + .index = CLOCK_REALTIME, + .get_time = &ktime_get_real, + .resolution = KTIME_LOW_RES, + }, + { + .index = CLOCK_MONOTONIC, + .get_time = &ktime_get, + .resolution = KTIME_LOW_RES, + }, + } }; /** @@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. */ -static void hrtimer_get_softirq_time(struct hrtimer_base *base) +static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; + struct timespec xts; unsigned long seq; do { seq = read_seqbegin(&xtime_lock); - xtim = timespec_to_ktime(xtime); - tomono = timespec_to_ktime(wall_to_monotonic); - +#ifdef CONFIG_NO_HZ + getnstimeofday(&xts); +#else + xts = xtime; +#endif } while (read_seqretry(&xtime_lock, seq)); - base[CLOCK_REALTIME].softirq_time = xtim; - base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); + xtim = timespec_to_ktime(xts); + tomono = timespec_to_ktime(wall_to_monotonic); + base->clock_base[CLOCK_REALTIME].softirq_time = xtim; + base->clock_base[CLOCK_MONOTONIC].softirq_time = + ktime_add(xtim, tomono); +} + +/* + * Helper function to check, whether the timer is running the callback + * function + */ +static inline int hrtimer_callback_running(struct hrtimer *timer) +{ + return timer->state & HRTIMER_STATE_CALLBACK; } /* @@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base) */ #ifdef CONFIG_SMP -#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) - /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base) * possible to set timer->base = NULL and drop the lock: the timer remains * locked. */ -static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) +static +struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; for (;;) { base = timer->base; if (likely(base != NULL)) { - spin_lock_irqsave(&base->lock, *flags); + spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ - spin_unlock_irqrestore(&base->lock, *flags); + spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } @@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, /* * Switch the timer base to the current CPU when possible. */ -static inline struct hrtimer_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) +static inline struct hrtimer_clock_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct hrtimer_base *new_base; + struct hrtimer_clock_base *new_base; + struct hrtimer_cpu_base *new_cpu_base; - new_base = &__get_cpu_var(hrtimer_bases)[base->index]; + new_cpu_base = &__get_cpu_var(hrtimer_bases); + new_base = &new_cpu_base->clock_base[base->index]; if (base != new_base) { /* @@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ - if (unlikely(base->curr_timer == timer)) + if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_timer_base() */ timer->base = NULL; - spin_unlock(&base->lock); - spin_lock(&new_base->lock); + spin_unlock(&base->cpu_base->lock); + spin_lock(&new_base->cpu_base->lock); timer->base = new_base; } return new_base; @@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) #else /* CONFIG_SMP */ -#define set_curr_timer(b, t) do { } while (0) - -static inline struct hrtimer_base * +static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - struct hrtimer_base *base = timer->base; + struct hrtimer_clock_base *base = timer->base; - spin_lock_irqsave(&base->lock, *flags); + spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } -#define switch_hrtimer_base(t, b) (b) +# define switch_hrtimer_base(t, b) (b) #endif /* !CONFIG_SMP */ @@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) return ktime_add(kt, tmp); } - -#else /* CONFIG_KTIME_SCALAR */ - # endif /* !CONFIG_KTIME_SCALAR */ /* * Divide a ktime value by a nanosecond value */ -static unsigned long ktime_divns(const ktime_t kt, s64 div) +unsigned long ktime_divns(const ktime_t kt, s64 div) { u64 dclc, inc, dns; int sft = 0; @@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div) return (unsigned long) dclc; } - -#else /* BITS_PER_LONG < 64 */ -# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) #endif /* BITS_PER_LONG >= 64 */ +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer enabled ? + */ +static int hrtimer_hres_enabled __read_mostly = 1; + +/* + * Enable / Disable high resolution mode + */ +static int __init setup_hrtimer_hres(char *str) +{ + if (!strcmp(str, "off")) + hrtimer_hres_enabled = 0; + else if (!strcmp(str, "on")) + hrtimer_hres_enabled = 1; + else + return 0; + return 1; +} + +__setup("highres=", setup_hrtimer_hres); + +/* + * hrtimer_high_res_enabled - query, if the highres mode is enabled + */ +static inline int hrtimer_is_hres_enabled(void) +{ + return hrtimer_hres_enabled; +} + +/* + * Is the high resolution mode active ? + */ +static inline int hrtimer_hres_active(void) +{ + return __get_cpu_var(hrtimer_bases).hres_active; +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) +{ + int i; + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires; + + cpu_base->expires_next.tv64 = KTIME_MAX; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + + if (!base->first) + continue; + timer = rb_entry(base->first, struct hrtimer, node); + expires = ktime_sub(timer->expires, base->offset); + if (expires.tv64 < cpu_base->expires_next.tv64) + cpu_base->expires_next = expires; + } + + if (cpu_base->expires_next.tv64 != KTIME_MAX) + tick_program_event(cpu_base->expires_next, 1); +} + +/* + * Shared reprogramming for clock_realtime and clock_monotonic + * + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. + * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; + ktime_t expires = ktime_sub(timer->expires, base->offset); + int res; + + /* + * When the callback is running, we do not reprogram the clock event + * device. The timer callback is either running on a different CPU or + * the callback is executed in the hrtimer_interupt context. The + * reprogramming is handled either by the softirq, which called the + * callback or at the end of the hrtimer_interrupt. + */ + if (hrtimer_callback_running(timer)) + return 0; + + if (expires.tv64 >= expires_next->tv64) + return 0; + + /* + * Clockevents returns -ETIME, when the event was in the past. + */ + res = tick_program_event(expires, 0); + if (!IS_ERR_VALUE(res)) + *expires_next = expires; + return res; +} + + +/* + * Retrigger next event is called after clock was set + * + * Called with interrupts disabled via on_each_cpu() + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base; + struct timespec realtime_offset; + unsigned long seq; + + if (!hrtimer_hres_active()) + return; + + do { + seq = read_seqbegin(&xtime_lock); + set_normalized_timespec(&realtime_offset, + -wall_to_monotonic.tv_sec, + -wall_to_monotonic.tv_nsec); + } while (read_seqretry(&xtime_lock, seq)); + + base = &__get_cpu_var(hrtimer_bases); + + /* Adjust CLOCK_REALTIME offset */ + spin_lock(&base->lock); + base->clock_base[CLOCK_REALTIME].offset = + timespec_to_ktime(realtime_offset); + + hrtimer_force_reprogram(base); + spin_unlock(&base->lock); +} + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + */ +void clock_was_set(void) +{ + /* Retrigger the CPU local events everywhere */ + on_each_cpu(retrigger_next_event, NULL, 0, 1); +} + +/* + * Check, whether the timer is on the callback pending list + */ +static inline int hrtimer_cb_pending(const struct hrtimer *timer) +{ + return timer->state & HRTIMER_STATE_PENDING; +} + +/* + * Remove a timer from the callback pending list + */ +static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) +{ + list_del_init(&timer->cb_entry); +} + +/* + * Initialize the high resolution related parts of cpu_base + */ +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) +{ + base->expires_next.tv64 = KTIME_MAX; + base->hres_active = 0; + INIT_LIST_HEAD(&base->cb_pending); +} + +/* + * Initialize the high resolution related parts of a hrtimer + */ +static inline void hrtimer_init_timer_hres(struct hrtimer *timer) +{ + INIT_LIST_HEAD(&timer->cb_entry); +} + +/* + * When High resolution timers are active, try to reprogram. Note, that in case + * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry + * check happens. The timer gets enqueued into the rbtree. The reprogramming + * and expiry check is done in the hrtimer_interrupt or in the softirq. + */ +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { + + /* Timer is expired, act upon the callback mode */ + switch(timer->cb_mode) { + case HRTIMER_CB_IRQSAFE_NO_RESTART: + /* + * We can call the callback from here. No restart + * happens, so no danger of recursion + */ + BUG_ON(timer->function(timer) != HRTIMER_NORESTART); + return 1; + case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: + /* + * This is solely for the sched tick emulation with + * dynamic tick support to ensure that we do not + * restart the tick right on the edge and end up with + * the tick timer in the softirq ! The calling site + * takes care of this. + */ + return 1; + case HRTIMER_CB_IRQSAFE: + case HRTIMER_CB_SOFTIRQ: + /* + * Move everything else into the softirq pending list ! + */ + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + timer->state = HRTIMER_STATE_PENDING; + raise_softirq(HRTIMER_SOFTIRQ); + return 1; + default: + BUG(); + } + } + return 0; +} + +/* + * Switch to high resolution mode + */ +static void hrtimer_switch_to_hres(void) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + unsigned long flags; + + if (base->hres_active) + return; + + local_irq_save(flags); + + if (tick_init_highres()) { + local_irq_restore(flags); + return; + } + base->hres_active = 1; + base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; + base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; + + tick_setup_sched_timer(); + + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); + printk(KERN_INFO "Switched to high resolution mode on CPU %d\n", + smp_processor_id()); +} + +#else + +static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline void hrtimer_switch_to_hres(void) { } +static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} +static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } +static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } + +#endif /* CONFIG_HIGH_RES_TIMERS */ + +#ifdef CONFIG_TIMER_STATS +void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif + /* * Counterpart to lock_timer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - spin_unlock_irqrestore(&timer->base->lock, *flags); + spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** @@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. */ -static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +static void enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, int reprogram) { struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; @@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - rb_link_node(&timer->node, parent, link); - rb_insert_color(&timer->node, &base->active); - if (!base->first || timer->expires.tv64 < - rb_entry(base->first, struct hrtimer, node)->expires.tv64) + rb_entry(base->first, struct hrtimer, node)->expires.tv64) { + /* + * Reprogram the clock event device. When the timer is already + * expired hrtimer_enqueue_reprogram has either called the + * callback or added it to the pending list and raised the + * softirq. + * + * This is a NOP for !HIGHRES + */ + if (reprogram && hrtimer_enqueue_reprogram(timer, base)) + return; + base->first = &timer->node; + } + + rb_link_node(&timer->node, parent, link); + rb_insert_color(&timer->node, &base->active); + /* + * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the + * state of a possibly running callback. + */ + timer->state |= HRTIMER_STATE_ENQUEUED; } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. + * + * High resolution timer mode reprograms the clock event device when the + * timer is the one which expires next. The caller can disable this by setting + * reprogram to zero. This is useful, when the context does a reprogramming + * anyway (e.g. timer interrupt) */ -static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +static void __remove_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + unsigned long newstate, int reprogram) { - /* - * Remove the timer from the rbtree and replace the - * first entry pointer if necessary. - */ - if (base->first == &timer->node) - base->first = rb_next(&timer->node); - rb_erase(&timer->node, &base->active); - rb_set_parent(&timer->node, &timer->node); + /* High res. callback list. NOP for !HIGHRES */ + if (hrtimer_cb_pending(timer)) + hrtimer_remove_cb_pending(timer); + else { + /* + * Remove the timer from the rbtree and replace the + * first entry pointer if necessary. + */ + if (base->first == &timer->node) { + base->first = rb_next(&timer->node); + /* Reprogram the clock event device. if enabled */ + if (reprogram && hrtimer_hres_active()) + hrtimer_force_reprogram(base->cpu_base); + } + rb_erase(&timer->node, &base->active); + } + timer->state = newstate; } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { - if (hrtimer_active(timer)) { - __remove_hrtimer(timer, base); + if (hrtimer_is_queued(timer)) { + int reprogram; + + /* + * Remove the timer and force reprogramming when high + * resolution mode is active and the timer is on the current + * CPU. If we remove a timer on another CPU, reprogramming is + * skipped. The interrupt event on this CPU is fired and + * reprogramming happens in the interrupt handler. This is a + * rare case and less expensive than a smp call. + */ + timer_stats_hrtimer_clear_start_info(timer); + reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, + reprogram); return 1; } return 0; @@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - struct hrtimer_base *base, *new_base; + struct hrtimer_clock_base *base, *new_base; unsigned long flags; int ret; @@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base); - if (mode == HRTIMER_REL) { + if (mode == HRTIMER_MODE_REL) { tim = ktime_add(tim, new_base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures @@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) } timer->expires = tim; - enqueue_hrtimer(timer, new_base); + timer_stats_hrtimer_set_start_info(timer); + + enqueue_hrtimer(timer, new_base, base == new_base); unlock_hrtimer_base(timer, &flags); @@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); */ int hrtimer_try_to_cancel(struct hrtimer *timer) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; base = lock_hrtimer_base(timer, &flags); - if (base->curr_timer != timer) + if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base); unlock_hrtimer_base(timer, &flags); @@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; unsigned long flags; ktime_t rem; base = lock_hrtimer_base(timer, &flags); - rem = ktime_sub(timer->expires, timer->base->get_time()); + rem = ktime_sub(timer->expires, base->get_time()); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#ifdef CONFIG_NO_IDLE_HZ +#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) /** * hrtimer_get_next_event - get the time until next expiry event * @@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); */ ktime_t hrtimer_get_next_event(void) { - struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { - struct hrtimer *timer; + spin_lock_irqsave(&cpu_base->lock, flags); - spin_lock_irqsave(&base->lock, flags); - if (!base->first) { - spin_unlock_irqrestore(&base->lock, flags); - continue; + if (!hrtimer_hres_active()) { + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + + if (!base->first) + continue; + + timer = rb_entry(base->first, struct hrtimer, node); + delta.tv64 = timer->expires.tv64; + delta = ktime_sub(delta, base->get_time()); + if (delta.tv64 < mindelta.tv64) + mindelta.tv64 = delta.tv64; } - timer = rb_entry(base->first, struct hrtimer, node); - delta.tv64 = timer->expires.tv64; - spin_unlock_irqrestore(&base->lock, flags); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; } + + spin_unlock_irqrestore(&cpu_base->lock, flags); + if (mindelta.tv64 < 0) mindelta.tv64 = 0; return mindelta; @@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void) void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - struct hrtimer_base *bases; + struct hrtimer_cpu_base *cpu_base; memset(timer, 0, sizeof(struct hrtimer)); - bases = __raw_get_cpu_var(hrtimer_bases); + cpu_base = &__raw_get_cpu_var(hrtimer_bases); - if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) + if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &bases[clock_id]; - rb_set_parent(&timer->node, &timer->node); + timer->base = &cpu_base->clock_base[clock_id]; + hrtimer_init_timer_hres(timer); + +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -588,21 +959,159 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { - struct hrtimer_base *bases; + struct hrtimer_cpu_base *cpu_base; - bases = __raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(bases[which_clock].resolution); + cpu_base = &__raw_get_cpu_var(hrtimer_bases); + *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); return 0; } EXPORT_SYMBOL_GPL(hrtimer_get_res); +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base; + ktime_t expires_next, now; + int i, raise = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + retry: + now = ktime_get(); + + expires_next.tv64 = KTIME_MAX; + + base = cpu_base->clock_base; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + ktime_t basenow; + struct rb_node *node; + + spin_lock(&cpu_base->lock); + + basenow = ktime_add(now, base->offset); + + while ((node = base->first)) { + struct hrtimer *timer; + + timer = rb_entry(node, struct hrtimer, node); + + if (basenow.tv64 < timer->expires.tv64) { + ktime_t expires; + + expires = ktime_sub(timer->expires, + base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + break; + } + + /* Move softirq callbacks to the pending list */ + if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + raise = 1; + continue; + } + + __remove_hrtimer(timer, base, + HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + + /* + * Note: We clear the CALLBACK bit after + * enqueue_hrtimer to avoid reprogramming of + * the event hardware. This happens at the end + * of this function anyway. + */ + if (timer->function(timer) != HRTIMER_NORESTART) { + BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + enqueue_hrtimer(timer, base, 0); + } + timer->state &= ~HRTIMER_STATE_CALLBACK; + } + spin_unlock(&cpu_base->lock); + base++; + } + + cpu_base->expires_next = expires_next; + + /* Reprogramming necessary ? */ + if (expires_next.tv64 != KTIME_MAX) { + if (tick_program_event(expires_next, 0)) + goto retry; + } + + /* Raise softirq ? */ + if (raise) + raise_softirq(HRTIMER_SOFTIRQ); +} + +static void run_hrtimer_softirq(struct softirq_action *h) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + spin_lock_irq(&cpu_base->lock); + + while (!list_empty(&cpu_base->cb_pending)) { + enum hrtimer_restart (*fn)(struct hrtimer *); + struct hrtimer *timer; + int restart; + + timer = list_entry(cpu_base->cb_pending.next, + struct hrtimer, cb_entry); + + timer_stats_account_hrtimer(timer); + + fn = timer->function; + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); + spin_unlock_irq(&cpu_base->lock); + + restart = fn(timer); + + spin_lock_irq(&cpu_base->lock); + + timer->state &= ~HRTIMER_STATE_CALLBACK; + if (restart == HRTIMER_RESTART) { + BUG_ON(hrtimer_active(timer)); + /* + * Enqueue the timer, allow reprogramming of the event + * device + */ + enqueue_hrtimer(timer, timer->base, 1); + } else if (hrtimer_active(timer)) { + /* + * If the timer was rearmed on another CPU, reprogram + * the event device. + */ + if (timer->base->first == &timer->node) + hrtimer_reprogram(timer, timer->base); + } + } + spin_unlock_irq(&cpu_base->lock); +} + +#endif /* CONFIG_HIGH_RES_TIMERS */ + /* * Expire the per base hrtimer-queue: */ -static inline void run_hrtimer_queue(struct hrtimer_base *base) +static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, + int index) { struct rb_node *node; + struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; if (!base->first) return; @@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) if (base->get_softirq_time) base->softirq_time = base->get_softirq_time(); - spin_lock_irq(&base->lock); + spin_lock_irq(&cpu_base->lock); while ((node = base->first)) { struct hrtimer *timer; - int (*fn)(struct hrtimer *); + enum hrtimer_restart (*fn)(struct hrtimer *); int restart; timer = rb_entry(node, struct hrtimer, node); if (base->softirq_time.tv64 <= timer->expires.tv64) break; + timer_stats_account_hrtimer(timer); + fn = timer->function; - set_curr_timer(base, timer); - __remove_hrtimer(timer, base); - spin_unlock_irq(&base->lock); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + spin_unlock_irq(&cpu_base->lock); restart = fn(timer); - spin_lock_irq(&base->lock); + spin_lock_irq(&cpu_base->lock); + timer->state &= ~HRTIMER_STATE_CALLBACK; if (restart != HRTIMER_NORESTART) { BUG_ON(hrtimer_active(timer)); - enqueue_hrtimer(timer, base); + enqueue_hrtimer(timer, base, 0); } } - set_curr_timer(base, NULL); - spin_unlock_irq(&base->lock); + spin_unlock_irq(&cpu_base->lock); } /* * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT its the fall back code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. */ void hrtimer_run_queues(void) { - struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); int i; - hrtimer_get_softirq_time(base); + if (hrtimer_hres_active()) + return; + + /* + * This _is_ ugly: We have to check in the softirq context, + * whether we can switch to highres and / or nohz mode. The + * clocksource switch happens in the timer interrupt with + * xtime_lock held. Notification from there only sets the + * check bit in the tick_oneshot code, otherwise we might + * deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + hrtimer_switch_to_hres(); - for (i = 0; i < MAX_HRTIMER_BASES; i++) - run_hrtimer_queue(&base[i]); + hrtimer_get_softirq_time(cpu_base); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + run_hrtimer_queue(cpu_base, i); } /* * Sleep related functions: */ -static int hrtimer_wakeup(struct hrtimer *timer) +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); @@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; sl->task = task; +#ifdef CONFIG_HIGH_RES_TIMERS + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; +#endif } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) @@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod set_current_state(TASK_INTERRUPTIBLE); hrtimer_start(&t->timer, t->timer.expires, mode); - schedule(); + if (likely(t->task)) + schedule(); hrtimer_cancel(&t->timer); - mode = HRTIMER_ABS; + mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); @@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) restart->fn = do_no_restart_syscall; - hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); + hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; - if (do_nanosleep(&t, HRTIMER_ABS)) + if (do_nanosleep(&t, HRTIMER_MODE_ABS)) return 0; rmtp = (struct timespec __user *) restart->arg1; @@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, return 0; /* Absolute timers do not update the rmtp value and restart: */ - if (mode == HRTIMER_ABS) + if (mode == HRTIMER_MODE_ABS) return -ERESTARTNOHAND; if (rmtp) { @@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) if (!timespec_valid(&tu)) return -EINVAL; - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* @@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) */ static void __devinit init_hrtimers_cpu(int cpu) { - struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { - spin_lock_init(&base->lock); - lockdep_set_class(&base->lock, &base->lock_key); - } + spin_lock_init(&cpu_base->lock); + lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + cpu_base->clock_base[i].cpu_base = cpu_base; + + hrtimer_init_hres(cpu_base); } #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_base *old_base, - struct hrtimer_base *new_base) +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct rb_node *node; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); - __remove_hrtimer(timer, old_base); + BUG_ON(hrtimer_callback_running(timer)); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); timer->base = new_base; - enqueue_hrtimer(timer, new_base); + /* + * Enqueue the timer. Allow reprogramming of the event device + */ + enqueue_hrtimer(timer, new_base, 1); } } static void migrate_hrtimers(int cpu) { - struct hrtimer_base *old_base, *new_base; + struct hrtimer_cpu_base *old_base, *new_base; int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu(hrtimer_bases, cpu); - new_base = get_cpu_var(hrtimer_bases); - - local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, cpu); + new_base = &get_cpu_var(hrtimer_bases); - for (i = 0; i < MAX_HRTIMER_BASES; i++) { + tick_cancel_sched_timer(cpu); - spin_lock(&new_base->lock); - spin_lock(&old_base->lock); - - BUG_ON(old_base->curr_timer); + local_irq_disable(); - migrate_hrtimer_list(old_base, new_base); + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - old_base++; - new_base++; + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i]); } + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); @@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); migrate_hrtimers(cpu); break; #endif @@ -868,5 +1405,8 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); +#endif } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 475e8a71bcd..0133f4f9e9f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -168,7 +168,7 @@ EXPORT_SYMBOL(set_irq_data); /** * set_irq_data - set irq type data for an irq * @irq: Interrupt number - * @data: Pointer to interrupt specific data + * @entry: Pointer to MSI descriptor data * * Set the hardware irq controller data for an irq */ @@ -230,10 +230,6 @@ static void default_enable(unsigned int irq) */ static void default_disable(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; - - if (!(desc->status & IRQ_DELAYED_DISABLE)) - desc->chip->mask(irq); } /* @@ -298,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_cpu(cpu).irqs[irq]++; action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!action || (desc->status & IRQ_DISABLED))) { + if (desc->chip->mask) + desc->chip->mask(irq); + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->status |= IRQ_PENDING; goto out_unlock; + } + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING); desc->status |= IRQ_INPROGRESS; spin_unlock(&desc->lock); @@ -396,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) /* * If its disabled or no action available - * keep it masked and get out of here + * then mask it and get out of here: */ action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; + if (desc->chip->mask) + desc->chip->mask(irq); goto out; } @@ -562,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) { - desc->chip->mask(irq); - desc->chip->ack(irq); - } + if (desc->chip != &no_irq_chip) + mask_ack_irq(desc, irq); desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index acc5d9fe462..5597c157442 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq) } EXPORT_SYMBOL(synchronize_irq); +/** + * irq_can_set_affinity - Check if the affinity of a given irq can be set + * @irq: Interrupt to check + * + */ +int irq_can_set_affinity(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || + !desc->chip->set_affinity) + return 0; + + return 1; +} + +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + */ +int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +{ + struct irq_desc *desc = irq_desc + irq; + + if (!desc->chip->set_affinity) + return -EINVAL; + + set_balance_irq_affinity(irq, cpumask); + +#ifdef CONFIG_GENERIC_PENDING_IRQ + set_pending_irq(irq, cpumask); +#else + desc->affinity = cpumask; + desc->chip->set_affinity(irq, cpumask); +#endif + return 0; +} + #endif /** @@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (new->flags & IRQF_PERCPU) desc->status |= IRQ_PER_CPU; #endif + /* Exclude IRQ from balancing */ + if (new->flags & IRQF_NOBALANCING) + desc->status |= IRQ_NO_BALANCING; + if (!shared) { irq_chip_set_defaults(desc->chip); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6d3be06e8ce..2db91eb54ad 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -#ifdef CONFIG_GENERIC_PENDING_IRQ -void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) -{ - set_balance_irq_affinity(irq, mask_val); - - /* - * Save these away for later use. Re-progam when the - * interrupt is pending - */ - set_pending_irq(irq, mask_val); -} -#else -void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) -{ - set_balance_irq_affinity(irq, mask_val); - irq_desc[irq].affinity = mask_val; - irq_desc[irq].chip->set_affinity(irq, mask_val); -} -#endif - static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, cpumask_t new_value, tmp; if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || - CHECK_IRQ_PER_CPU(irq_desc[irq].status)) + irq_balancing_disabled(irq)) return -EIO; err = cpumask_parse_user(buffer, count, new_value); @@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, code to set default SMP affinity. */ return select_smp_affinity(irq) ? -EINVAL : full_count; - proc_set_irq_affinity(irq, new_value); + irq_set_affinity(irq, new_value); return full_count; } diff --git a/kernel/itimer.c b/kernel/itimer.c index 204ed7939e7..307c6a632ef 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) /* * The timer is automagically restarted, when interval != 0 */ -int it_real_fn(struct hrtimer *timer) +enum hrtimer_restart it_real_fn(struct hrtimer *timer) { struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); - if (sig->it_real_incr.tv64 != 0) { - hrtimer_forward(timer, timer->base->softirq_time, - sig->it_real_incr); - return HRTIMER_RESTART; - } return HRTIMER_NORESTART; } @@ -231,11 +226,14 @@ again: spin_unlock_irq(&tsk->sighand->siglock); goto again; } - tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); expires = timeval_to_ktime(value->it_value); - if (expires.tv64 != 0) - hrtimer_start(timer, expires, HRTIMER_REL); + if (expires.tv64 != 0) { + tsk->signal->it_real_incr = + timeval_to_ktime(value->it_interval); + hrtimer_start(timer, expires, HRTIMER_MODE_REL); + } else + tsk->signal->it_real_incr.tv64 = 0; + spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 7c3e1e6dfb5..657f7769741 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) * should be able to see it. */ struct task_struct *p; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_pid(pid); if (p) { if (CPUCLOCK_PERTHREAD(which_clock)) { @@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) error = cpu_clock_sample(which_clock, p, &rtn); } - } else if (p->tgid == pid && p->signal) { - error = cpu_clock_sample_group(which_clock, - p, &rtn); + } else { + read_lock(&tasklist_lock); + if (p->tgid == pid && p->signal) { + error = + cpu_clock_sample_group(which_clock, + p, &rtn); + } + read_unlock(&tasklist_lock); } } - read_unlock(&tasklist_lock); + rcu_read_unlock(); } if (error) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a1bf6161783..44318ca7197 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); static int common_timer_del(struct k_itimer *timer); -static int posix_timer_fn(struct hrtimer *data); +static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); @@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event); * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. */ -static int posix_timer_fn(struct hrtimer *timer) +static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { struct k_itimer *timr; unsigned long flags; int si_private = 0; - int ret = HRTIMER_NORESTART; + enum hrtimer_restart ret = HRTIMER_NORESTART; timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); @@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer) if (timr->it.real.interval.tv64 != 0) { timr->it_overrun += hrtimer_forward(timer, - timer->base->softirq_time, + hrtimer_cb_get_time(timer), timr->it.real.interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; @@ -722,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags, if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; + mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; @@ -734,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags, /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_REL) + if (mode == HRTIMER_MODE_REL) timer->expires = ktime_add(timer->expires, timer->base->get_time()); return 0; @@ -950,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags, struct timespec *tsave, struct timespec __user *rmtp) { return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? - HRTIMER_ABS : HRTIMER_REL, which_clock); + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } asmlinkage long diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 4ab17da46fd..180978cb2f7 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, /* Setup the timer, when timeout != NULL */ if (unlikely(timeout)) hrtimer_start(&timeout->timer, timeout->timer.expires, - HRTIMER_ABS); + HRTIMER_MODE_ABS); for (;;) { /* Try to acquire the lock: */ diff --git a/kernel/signal.c b/kernel/signal.c index 8072e568bbe..e2a7d4bf7d5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { int signr = __dequeue_signal(&tsk->pending, mask, info); - if (!signr) + if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); + /* + * itimer signal ? + * + * itimers are process shared and we restart periodic + * itimers in the signal delivery path to prevent DoS + * attacks in the high resolution timer case. This is + * compliant with the old way of self restarting + * itimers, as the SIGALRM is a legacy signal and only + * queued once. Changing the restart behaviour to + * restart the timer in the signal dequeue path is + * reducing the timer noise on heavy loaded !highres + * systems too. + */ + if (unlikely(signr == SIGALRM)) { + struct hrtimer *tmr = &tsk->signal->real_timer; + + if (!hrtimer_is_queued(tmr) && + tsk->signal->it_real_incr.tv64 != 0) { + hrtimer_forward(tmr, tmr->base->get_time(), + tsk->signal->it_real_incr); + hrtimer_restart(tmr); + } + } + } recalc_sigpending_tsk(tsk); - if (signr && unlikely(sig_kernel_stop(signr))) { - /* - * Set a marker that we have dequeued a stop signal. Our - * caller might release the siglock and then the pending - * stop signal it is about to process is no longer in the - * pending bitmasks, but must still be cleared by a SIGCONT - * (and overruled by a SIGKILL). So those cases clear this - * shared flag after we've set it. Note that this flag may - * remain set after the signal we return is ignored or - * handled. That doesn't matter because its only purpose - * is to alert stop-signal processing code when another - * processor has come along and cleared the flag. - */ - if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; - } + if (signr && unlikely(sig_kernel_stop(signr))) { + /* + * Set a marker that we have dequeued a stop signal. Our + * caller might release the siglock and then the pending + * stop signal it is about to process is no longer in the + * pending bitmasks, but must still be cleared by a SIGCONT + * (and overruled by a SIGKILL). So those cases clear this + * shared flag after we've set it. Note that this flag may + * remain set after the signal we return is ignored or + * handled. That doesn't matter because its only purpose + * is to alert stop-signal processing code when another + * processor has come along and cleared the flag. + */ + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) + tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + } if ( signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && info->si_sys_private){ diff --git a/kernel/softirq.c b/kernel/softirq.c index 918e52df090..8b75008e2bd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -17,6 +17,7 @@ #include <linux/kthread.h> #include <linux/rcupdate.h> #include <linux/smp.h> +#include <linux/tick.h> #include <asm/irq.h> /* @@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq); #endif +/* + * Enter an interrupt context. + */ +void irq_enter(void) +{ + __irq_enter(); +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) + tick_nohz_update_jiffies(); +#endif +} + #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED # define invoke_softirq() __do_softirq() #else @@ -289,6 +302,12 @@ void irq_exit(void) sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + +#ifdef CONFIG_NO_HZ + /* Make sure that timer wheel updates are propagated */ + if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) + tick_nohz_stop_sched_tick(); +#endif preempt_enable_no_resched(); } diff --git a/kernel/time.c b/kernel/time.c index 0e017bff4c1..c6c80ea5d0e 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec) return tv; } +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + +/* + * When we convert to jiffies then we interpret incoming values + * the following way: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor + * + * We must also be careful about 32-bit overflows. + */ +unsigned long msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + /* + * HZ is equal to or smaller than 1000, and 1000 is a nice + * round multiple of HZ, divide with the factor between them, + * but round upwards: + */ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + /* + * HZ is larger than 1000, and HZ is a nice round multiple of + * 1000 - simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot + * overflow: + */ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return m * (HZ / MSEC_PER_SEC); +#else + /* + * Generic case - multiply, round and divide. But first + * check that if we are doing a net multiplication, that + * we wouldnt overflow: + */ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; +#endif +} +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return u * (HZ / USEC_PER_SEC); +#else + return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; +#endif +} +EXPORT_SYMBOL(usecs_to_jiffies); + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + */ +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + unsigned long sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} +EXPORT_SYMBOL(timespec_to_jiffies); + +void +jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC; + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); +} +EXPORT_SYMBOL(jiffies_to_timespec); + +/* Same for "timeval" + * + * Well, almost. The problem here is that the real system resolution is + * in nanoseconds and the value being converted is in micro seconds. + * Also for some machines (those that use HZ = 1024, in-particular), + * there is a LARGE error in the tick size in microseconds. + + * The solution we use is to do the rounding AFTER we convert the + * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. + * Instruction wise, this should cost only an additional add with carry + * instruction above the way it was done above. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + unsigned long sec = value->tv_sec; + long usec = value->tv_usec; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + usec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> + (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; +} + +void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC; + long tv_usec; + + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); + tv_usec /= NSEC_PER_USEC; + value->tv_usec = tv_usec; +} + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + return x / (HZ / USER_HZ); +#else + u64 tmp = (u64)x * TICK_NSEC; + do_div(tmp, (NSEC_PER_SEC / USER_HZ)); + return (long)tmp; +#endif +} +EXPORT_SYMBOL(jiffies_to_clock_t); + +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + u64 jif; + + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + jif = x * (u64) HZ; + do_div(jif, USER_HZ); + return jif; +#endif +} +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + do_div(x, HZ / USER_HZ); +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x *= TICK_NSEC; + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} + +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#elif (USER_HZ % 512) == 0 + x *= USER_HZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) / + USER_HZ)); +#endif + return x; +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig new file mode 100644 index 00000000000..f6635112654 --- /dev/null +++ b/kernel/time/Kconfig @@ -0,0 +1,25 @@ +# +# Timer subsystem related configuration options +# +config TICK_ONESHOT + bool + default n + +config NO_HZ + bool "Tickless System (Dynamic Ticks)" + depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables a tickless system: timer interrupts will + only trigger on an as-needed basis both when the system is + busy and when the system is idle. + +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. + diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 61a3907d16f..93bccba1f26 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1 +1,8 @@ -obj-y += ntp.o clocksource.o jiffies.o +obj-y += ntp.o clocksource.o jiffies.o timer_list.o + +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o +obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o +obj-$(CONFIG_TIMER_STATS) += timer_stats.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c new file mode 100644 index 00000000000..67932ea78c1 --- /dev/null +++ b/kernel/time/clockevents.c @@ -0,0 +1,345 @@ +/* + * linux/kernel/time/clockevents.c + * + * This file contains functions which manage clock event devices. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ + +#include <linux/clockchips.h> +#include <linux/hrtimer.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/sysdev.h> + +/* The registered clock event devices */ +static LIST_HEAD(clockevent_devices); +static LIST_HEAD(clockevents_released); + +/* Notification for clock events */ +static RAW_NOTIFIER_HEAD(clockevents_chain); + +/* Protection for the above */ +static DEFINE_SPINLOCK(clockevents_lock); + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +unsigned long clockevent_delta2ns(unsigned long latch, + struct clock_event_device *evt) +{ + u64 clc = ((u64) latch << evt->shift); + + do_div(clc, evt->mult); + if (clc < 1000) + clc = 1000; + if (clc > LONG_MAX) + clc = LONG_MAX; + + return (unsigned long) clc; +} + +/** + * clockevents_set_mode - set the operating mode of a clock event device + * @dev: device to modify + * @mode: new mode + * + * Must be called with interrupts disabled ! + */ +void clockevents_set_mode(struct clock_event_device *dev, + enum clock_event_mode mode) +{ + if (dev->mode != mode) { + dev->set_mode(mode, dev); + dev->mode = mode; + } +} + +/** + * clockevents_program_event - Reprogram the clock event device. + * @expires: absolute expiry time (monotonic clock) + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, + ktime_t now) +{ + unsigned long long clc; + int64_t delta; + + delta = ktime_to_ns(ktime_sub(expires, now)); + + if (delta <= 0) + return -ETIME; + + dev->next_event = expires; + + if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + return 0; + + if (delta > dev->max_delta_ns) + delta = dev->max_delta_ns; + if (delta < dev->min_delta_ns) + delta = dev->min_delta_ns; + + clc = delta * dev->mult; + clc >>= dev->shift; + + return dev->set_next_event((unsigned long) clc, dev); +} + +/** + * clockevents_register_notifier - register a clock events change listener + */ +int clockevents_register_notifier(struct notifier_block *nb) +{ + int ret; + + spin_lock(&clockevents_lock); + ret = raw_notifier_chain_register(&clockevents_chain, nb); + spin_unlock(&clockevents_lock); + + return ret; +} + +/** + * clockevents_unregister_notifier - unregister a clock events change listener + */ +void clockevents_unregister_notifier(struct notifier_block *nb) +{ + spin_lock(&clockevents_lock); + raw_notifier_chain_unregister(&clockevents_chain, nb); + spin_unlock(&clockevents_lock); +} + +/* + * Notify about a clock event change. Called with clockevents_lock + * held. + */ +static void clockevents_do_notify(unsigned long reason, void *dev) +{ + raw_notifier_call_chain(&clockevents_chain, reason, dev); +} + +/* + * Called after a notify add to make devices availble which were + * released from the notifier call. + */ +static void clockevents_notify_released(void) +{ + struct clock_event_device *dev; + + while (!list_empty(&clockevents_released)) { + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + list_add(&dev->list, &clockevent_devices); + clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + } +} + +/** + * clockevents_register_device - register a clock event device + * @dev: device to register + */ +void clockevents_register_device(struct clock_event_device *dev) +{ + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + + spin_lock(&clockevents_lock); + + list_add(&dev->list, &clockevent_devices); + clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + clockevents_notify_released(); + + spin_unlock(&clockevents_lock); +} + +/* + * Noop handler when we shut down an event device + */ +static void clockevents_handle_noop(struct clock_event_device *dev) +{ +} + +/** + * clockevents_exchange_device - release and request clock devices + * @old: device to release (can be NULL) + * @new: device to request (can be NULL) + * + * Called from the notifier chain. clockevents_lock is held already + */ +void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new) +{ + unsigned long flags; + + local_irq_save(flags); + /* + * Caller releases a clock event device. We queue it into the + * released list and do a notify add later. + */ + if (old) { + old->event_handler = clockevents_handle_noop; + clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); + list_del(&old->list); + list_add(&old->list, &clockevents_released); + } + + if (new) { + BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); + clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + } + local_irq_restore(flags); +} + +/** + * clockevents_request_device + */ +struct clock_event_device *clockevents_request_device(unsigned int features, + cpumask_t cpumask) +{ + struct clock_event_device *cur, *dev = NULL; + struct list_head *tmp; + + spin_lock(&clockevents_lock); + + list_for_each(tmp, &clockevent_devices) { + cur = list_entry(tmp, struct clock_event_device, list); + + if ((cur->features & features) == features && + cpus_equal(cpumask, cur->cpumask)) { + if (!dev || dev->rating < cur->rating) + dev = cur; + } + } + + clockevents_exchange_device(NULL, dev); + + spin_unlock(&clockevents_lock); + + return dev; +} + +/** + * clockevents_release_device + */ +void clockevents_release_device(struct clock_event_device *dev) +{ + spin_lock(&clockevents_lock); + + clockevents_exchange_device(dev, NULL); + clockevents_notify_released(); + + spin_unlock(&clockevents_lock); +} + +/** + * clockevents_notify - notification about relevant events + */ +void clockevents_notify(unsigned long reason, void *arg) +{ + spin_lock(&clockevents_lock); + clockevents_do_notify(reason, arg); + + switch (reason) { + case CLOCK_EVT_NOTIFY_CPU_DEAD: + /* + * Unregister the clock event devices which were + * released from the users in the notify chain. + */ + while (!list_empty(&clockevents_released)) { + struct clock_event_device *dev; + + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + } + break; + default: + break; + } + spin_unlock(&clockevents_lock); +} +EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS + +/** + * clockevents_show_registered - sysfs interface for listing clockevents + * @dev: unused + * @buf: char buffer to be filled with clock events list + * + * Provides sysfs interface for listing registered clock event devices + */ +static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf) +{ + struct list_head *tmp; + char *p = buf; + int cpu; + + spin_lock(&clockevents_lock); + + list_for_each(tmp, &clockevent_devices) { + struct clock_event_device *ce; + + ce = list_entry(tmp, struct clock_event_device, list); + p += sprintf(p, "%-20s F:%04x M:%d", ce->name, + ce->features, ce->mode); + p += sprintf(p, " C:"); + if (!cpus_equal(ce->cpumask, cpu_possible_map)) { + for_each_cpu_mask(cpu, ce->cpumask) + p += sprintf(p, " %d", cpu); + } else { + /* + * FIXME: Add the cpu which is handling this sucker + */ + } + p += sprintf(p, "\n"); + } + + spin_unlock(&clockevents_lock); + + return p - buf; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(registered, 0600, + clockevents_show_registered, NULL); + +static struct sysdev_class clockevents_sysclass = { + set_kset_name("clockevents"), +}; + +static struct sys_device clockevents_sys_device = { + .id = 0, + .cls = &clockevents_sysclass, +}; + +static int __init clockevents_sysfs_init(void) +{ + int error = sysdev_class_register(&clockevents_sysclass); + + if (!error) + error = sysdev_register(&clockevents_sys_device); + if (!error) + error = sysdev_create_file( + &clockevents_sys_device, + &attr_registered); + return error; +} +device_initcall(clockevents_sysfs_init); +#endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d9ef176c4e0..193a0793af9 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -29,6 +29,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ +#include <linux/tick.h> /* XXX - Would like a better way for initializing curr_clocksource */ extern struct clocksource clocksource_jiffies; @@ -48,6 +49,7 @@ extern struct clocksource clocksource_jiffies; */ static struct clocksource *curr_clocksource = &clocksource_jiffies; static struct clocksource *next_clocksource; +static struct clocksource *clocksource_override; static LIST_HEAD(clocksource_list); static DEFINE_SPINLOCK(clocksource_lock); static char override_name[32]; @@ -62,9 +64,123 @@ static int __init clocksource_done_booting(void) finished_booting = 1; return 0; } - late_initcall(clocksource_done_booting); +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG +static LIST_HEAD(watchdog_list); +static struct clocksource *watchdog; +static struct timer_list watchdog_timer; +static DEFINE_SPINLOCK(watchdog_lock); +static cycle_t watchdog_last; +/* + * Interval: 0.5sec Treshold: 0.0625s + */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) + +static void clocksource_ratewd(struct clocksource *cs, int64_t delta) +{ + if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) + return; + + printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", + cs->name, delta); + cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); + clocksource_change_rating(cs, 0); + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + list_del(&cs->wd_list); +} + +static void clocksource_watchdog(unsigned long data) +{ + struct clocksource *cs, *tmp; + cycle_t csnow, wdnow; + int64_t wd_nsec, cs_nsec; + + spin_lock(&watchdog_lock); + + wdnow = watchdog->read(); + wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); + watchdog_last = wdnow; + + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { + csnow = cs->read(); + /* Initialized ? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + /* + * We just marked the clocksource as + * highres-capable, notify the rest of the + * system as well so that we transition + * into high-res mode: + */ + tick_clock_notify(); + } + cs->flags |= CLOCK_SOURCE_WATCHDOG; + cs->wd_last = csnow; + } else { + cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); + cs->wd_last = csnow; + /* Check the delta. Might remove from the list ! */ + clocksource_ratewd(cs, cs_nsec - wd_nsec); + } + } + + if (!list_empty(&watchdog_list)) { + __mod_timer(&watchdog_timer, + watchdog_timer.expires + WATCHDOG_INTERVAL); + } + spin_unlock(&watchdog_lock); +} +static void clocksource_check_watchdog(struct clocksource *cs) +{ + struct clocksource *cse; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + int started = !list_empty(&watchdog_list); + + list_add(&cs->wd_list, &watchdog_list); + if (!started && watchdog) { + watchdog_last = watchdog->read(); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer(&watchdog_timer); + } + } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + if (!watchdog || cs->rating > watchdog->rating) { + if (watchdog) + del_timer(&watchdog_timer); + watchdog = cs; + init_timer(&watchdog_timer); + watchdog_timer.function = clocksource_watchdog; + + /* Reset watchdog cycles */ + list_for_each_entry(cse, &watchdog_list, wd_list) + cse->flags &= ~CLOCK_SOURCE_WATCHDOG; + /* Start if list is not empty */ + if (!list_empty(&watchdog_list)) { + watchdog_last = watchdog->read(); + watchdog_timer.expires = + jiffies + WATCHDOG_INTERVAL; + add_timer(&watchdog_timer); + } + } + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} +#else +static void clocksource_check_watchdog(struct clocksource *cs) +{ + if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; +} +#endif + /** * clocksource_get_next - Returns the selected clocksource * @@ -84,60 +200,54 @@ struct clocksource *clocksource_get_next(void) } /** - * select_clocksource - Finds the best registered clocksource. + * select_clocksource - Selects the best registered clocksource. * * Private function. Must hold clocksource_lock when called. * - * Looks through the list of registered clocksources, returning - * the one with the highest rating value. If there is a clocksource - * name that matches the override string, it returns that clocksource. + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. */ static struct clocksource *select_clocksource(void) { - struct clocksource *best = NULL; - struct list_head *tmp; + struct clocksource *next; - list_for_each(tmp, &clocksource_list) { - struct clocksource *src; + if (list_empty(&clocksource_list)) + return NULL; - src = list_entry(tmp, struct clocksource, list); - if (!best) - best = src; - - /* check for override: */ - if (strlen(src->name) == strlen(override_name) && - !strcmp(src->name, override_name)) { - best = src; - break; - } - /* pick the highest rating: */ - if (src->rating > best->rating) - best = src; - } + if (clocksource_override) + next = clocksource_override; + else + next = list_entry(clocksource_list.next, struct clocksource, + list); + + if (next == curr_clocksource) + return NULL; - return best; + return next; } -/** - * is_registered_source - Checks if clocksource is registered - * @c: pointer to a clocksource - * - * Private helper function. Must hold clocksource_lock when called. - * - * Returns one if the clocksource is already registered, zero otherwise. +/* + * Enqueue the clocksource sorted by rating */ -static int is_registered_source(struct clocksource *c) +static int clocksource_enqueue(struct clocksource *c) { - int len = strlen(c->name); - struct list_head *tmp; + struct list_head *tmp, *entry = &clocksource_list; list_for_each(tmp, &clocksource_list) { - struct clocksource *src; - - src = list_entry(tmp, struct clocksource, list); - if (strlen(src->name) == len && !strcmp(src->name, c->name)) - return 1; + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (cs == c) + return -EBUSY; + /* Keep track of the place, where to insert */ + if (cs->rating >= c->rating) + entry = tmp; } + list_add(&c->list, entry); + + if (strlen(c->name) == strlen(override_name) && + !strcmp(c->name, override_name)) + clocksource_override = c; return 0; } @@ -150,42 +260,35 @@ static int is_registered_source(struct clocksource *c) */ int clocksource_register(struct clocksource *c) { - int ret = 0; unsigned long flags; + int ret; spin_lock_irqsave(&clocksource_lock, flags); - /* check if clocksource is already registered */ - if (is_registered_source(c)) { - printk("register_clocksource: Cannot register %s. " - "Already registered!", c->name); - ret = -EBUSY; - } else { - /* register it */ - list_add(&c->list, &clocksource_list); - /* scan the registered clocksources, and pick the best one */ + ret = clocksource_enqueue(c); + if (!ret) next_clocksource = select_clocksource(); - } spin_unlock_irqrestore(&clocksource_lock, flags); + if (!ret) + clocksource_check_watchdog(c); return ret; } EXPORT_SYMBOL(clocksource_register); /** - * clocksource_reselect - Rescan list for next clocksource + * clocksource_change_rating - Change the rating of a registered clocksource * - * A quick helper function to be used if a clocksource changes its - * rating. Forces the clocksource list to be re-scanned for the best - * clocksource. */ -void clocksource_reselect(void) +void clocksource_change_rating(struct clocksource *cs, int rating) { unsigned long flags; spin_lock_irqsave(&clocksource_lock, flags); + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); next_clocksource = select_clocksource(); spin_unlock_irqrestore(&clocksource_lock, flags); } -EXPORT_SYMBOL(clocksource_reselect); #ifdef CONFIG_SYSFS /** @@ -221,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf) static ssize_t sysfs_override_clocksource(struct sys_device *dev, const char *buf, size_t count) { + struct clocksource *ovr = NULL; + struct list_head *tmp; size_t ret = count; + int len; + /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(override_name)) return -EINVAL; @@ -229,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, /* strip of \n: */ if (buf[count-1] == '\n') count--; - if (count < 1) - return -EINVAL; spin_lock_irq(&clocksource_lock); - /* copy the name given: */ - memcpy(override_name, buf, count); + if (count > 0) + memcpy(override_name, buf, count); override_name[count] = 0; - /* try to select it: */ - next_clocksource = select_clocksource(); + len = strlen(override_name); + if (len) { + ovr = clocksource_override; + /* try to select it: */ + list_for_each(tmp, &clocksource_list) { + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (strlen(cs->name) == len && + !strcmp(cs->name, override_name)) + ovr = cs; + } + } + + /* Reselect, when the override name has changed */ + if (ovr != clocksource_override) { + clocksource_override = ovr; + next_clocksource = select_clocksource(); + } spin_unlock_irq(&clocksource_lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a99b2a6e6a0..3be8da8fed7 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = { .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, - .is_continuous = 0, /* tick based, not free running */ }; static int __init init_jiffies_clocksource(void) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3afeaa3a73f..eb12509e00b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base; #define MAX_TICKADJ 500 /* microsecs */ #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ - TICK_LENGTH_SHIFT) / HZ) + TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) /* * phase-lock loop variables @@ -46,13 +46,17 @@ long time_adjust; static void ntp_update_frequency(void) { - tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; - tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; - tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); + u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) + << TICK_LENGTH_SHIFT; + second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); - do_div(tick_length_base, HZ); + tick_length_base = second_length; - tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; + do_div(second_length, HZ); + tick_nsec = second_length >> TICK_LENGTH_SHIFT; + + do_div(tick_length_base, NTP_INTERVAL_FREQ); } /** @@ -162,7 +166,7 @@ void second_overflow(void) tick_length -= MAX_TICKADJ_SCALED; } else { tick_length += (s64)(time_adjust * NSEC_PER_USEC / - HZ) << TICK_LENGTH_SHIFT; + NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; time_adjust = 0; } } @@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc) result = -EINVAL; goto leave; } - time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); + time_freq = ((s64)txc->freq * NSEC_PER_USEC) + >> (SHIFT_USEC - SHIFT_NSEC); } if (txc->modes & ADJ_MAXERROR) { @@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc) freq_adj += time_freq; freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); - time_offset = (time_offset / HZ) << SHIFT_UPDATE; + time_offset = (time_offset / NTP_INTERVAL_FREQ) + << SHIFT_UPDATE; } /* STA_PLL */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) @@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; else - txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; - txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); + txc->offset = shift_right(time_offset, SHIFT_UPDATE) + * NTP_INTERVAL_FREQ / 1000; + txc->freq = (time_freq / NSEC_PER_USEC) + << (SHIFT_USEC - SHIFT_NSEC); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c new file mode 100644 index 00000000000..12b3efeb9f6 --- /dev/null +++ b/kernel/time/tick-broadcast.c @@ -0,0 +1,480 @@ +/* + * linux/kernel/time/tick-broadcast.c + * + * This file contains functions which emulate a local clock-event + * device via a broadcast event source. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Broadcast support for broken x86 hardware, where the local apic + * timer stops in C3 state. + */ + +struct tick_device tick_broadcast_device; +static cpumask_t tick_broadcast_mask; +static DEFINE_SPINLOCK(tick_broadcast_lock); + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_broadcast_device(void) +{ + return &tick_broadcast_device; +} + +cpumask_t *tick_get_broadcast_mask(void) +{ + return &tick_broadcast_mask; +} + +/* + * Start the device in periodic mode + */ +static void tick_broadcast_start_periodic(struct clock_event_device *bc) +{ + if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) + tick_setup_periodic(bc, 1); +} + +/* + * Check, if the device can be utilized as broadcast device: + */ +int tick_check_broadcast_device(struct clock_event_device *dev) +{ + if (tick_broadcast_device.evtdev || + (dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + clockevents_exchange_device(NULL, dev); + tick_broadcast_device.evtdev = dev; + if (!cpus_empty(tick_broadcast_mask)) + tick_broadcast_start_periodic(dev); + return 1; +} + +/* + * Check, if the device is the broadcast device + */ +int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return (dev && tick_broadcast_device.evtdev == dev); +} + +/* + * Check, if the device is disfunctional and a place holder, which + * needs to be handled by the broadcast device. + */ +int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Devices might be registered with both periodic and oneshot + * mode disabled. This signals, that the device needs to be + * operated from the broadcast device and is a placeholder for + * the cpu local device. + */ + if (!tick_device_is_functional(dev)) { + dev->event_handler = tick_handle_periodic; + cpu_set(cpu, tick_broadcast_mask); + tick_broadcast_start_periodic(tick_broadcast_device.evtdev); + ret = 1; + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; +} + +/* + * Broadcast the event to the cpus, which are set in the mask + */ +int tick_do_broadcast(cpumask_t mask) +{ + int ret = 0, cpu = smp_processor_id(); + struct tick_device *td; + + /* + * Check, if the current cpu is in the mask + */ + if (cpu_isset(cpu, mask)) { + cpu_clear(cpu, mask); + td = &per_cpu(tick_cpu_device, cpu); + td->evtdev->event_handler(td->evtdev); + ret = 1; + } + + if (!cpus_empty(mask)) { + /* + * It might be necessary to actually check whether the devices + * have different broadcast functions. For now, just use the + * one of the first device. This works as long as we have this + * misfeature only on x86 (lapic) + */ + cpu = first_cpu(mask); + td = &per_cpu(tick_cpu_device, cpu); + td->evtdev->broadcast(mask); + ret = 1; + } + return ret; +} + +/* + * Periodic broadcast: + * - invoke the broadcast handlers + */ +static void tick_do_periodic_broadcast(void) +{ + cpumask_t mask; + + spin_lock(&tick_broadcast_lock); + + cpus_and(mask, cpu_online_map, tick_broadcast_mask); + tick_do_broadcast(mask); + + spin_unlock(&tick_broadcast_lock); +} + +/* + * Event handler for periodic broadcast ticks + */ +static void tick_handle_periodic_broadcast(struct clock_event_device *dev) +{ + dev->next_event.tv64 = KTIME_MAX; + + tick_do_periodic_broadcast(); + + /* + * The device is in periodic mode. No reprogramming necessary: + */ + if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + return; + + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + for (;;) { + ktime_t next = ktime_add(dev->next_event, tick_period); + + if (!clockevents_program_event(dev, next, ktime_get())) + return; + tick_do_periodic_broadcast(); + } +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop + */ +static void tick_do_broadcast_on_off(void *why) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + unsigned long flags, *reason = why; + int cpu; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + bc = tick_broadcast_device.evtdev; + + /* + * Is the device in broadcast mode forever or is it not + * affected by the powerstate ? + */ + if (!dev || !tick_device_is_functional(dev) || + !(dev->features & CLOCK_EVT_FEAT_C3STOP)) + goto out; + + if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { + if (!cpu_isset(cpu, tick_broadcast_mask)) { + cpu_set(cpu, tick_broadcast_mask); + if (td->mode == TICKDEV_MODE_PERIODIC) + clockevents_set_mode(dev, + CLOCK_EVT_MODE_SHUTDOWN); + } + } else { + if (cpu_isset(cpu, tick_broadcast_mask)) { + cpu_clear(cpu, tick_broadcast_mask); + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(dev, 0); + } + } + + if (cpus_empty(tick_broadcast_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + else { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } +out: + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop. + */ +void tick_broadcast_on_off(unsigned long reason, int *oncpu) +{ + int cpu = get_cpu(); + + if (cpu == *oncpu) + tick_do_broadcast_on_off(&reason); + else + smp_call_function_single(*oncpu, tick_do_broadcast_on_off, + &reason, 1, 1); + put_cpu(); +} + +/* + * Set the periodic handler depending on broadcast on/off + */ +void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + if (!broadcast) + dev->event_handler = tick_handle_periodic; + else + dev->event_handler = tick_handle_periodic_broadcast; +} + +/* + * Remove a CPU from broadcasting + */ +void tick_shutdown_broadcast(unsigned int *cpup) +{ + struct clock_event_device *bc; + unsigned long flags; + unsigned int cpu = *cpup; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + cpu_clear(cpu, tick_broadcast_mask); + + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + if (bc && cpus_empty(tick_broadcast_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#ifdef CONFIG_TICK_ONESHOT + +static cpumask_t tick_broadcast_oneshot_mask; + +/* + * Debugging: see timer_list.c + */ +cpumask_t *tick_get_broadcast_oneshot_mask(void) +{ + return &tick_broadcast_oneshot_mask; +} + +static int tick_broadcast_set_event(ktime_t expires, int force) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + ktime_t now = ktime_get(); + int res; + + for(;;) { + res = clockevents_program_event(bc, expires, now); + if (!res || !force) + return res; + now = ktime_get(); + expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); + } +} + +/* + * Reprogram the broadcast device: + * + * Called with tick_broadcast_lock held and interrupts disabled. + */ +static int tick_broadcast_reprogram(void) +{ + ktime_t expires = { .tv64 = KTIME_MAX }; + struct tick_device *td; + int cpu; + + /* + * Find the event which expires next: + */ + for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; + cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event.tv64 < expires.tv64) + expires = td->evtdev->next_event; + } + + if (expires.tv64 == KTIME_MAX) + return 0; + + return tick_broadcast_set_event(expires, 0); +} + +/* + * Handle oneshot mode broadcasting + */ +static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) +{ + struct tick_device *td; + cpumask_t mask; + ktime_t now; + int cpu; + + spin_lock(&tick_broadcast_lock); +again: + dev->next_event.tv64 = KTIME_MAX; + mask = CPU_MASK_NONE; + now = ktime_get(); + /* Find all expired events */ + for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; + cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event.tv64 <= now.tv64) + cpu_set(cpu, mask); + } + + /* + * Wakeup the cpus which have an expired event. The broadcast + * device is reprogrammed in the return from idle code. + */ + if (!tick_do_broadcast(mask)) { + /* + * The global event did not expire any CPU local + * events. This happens in dyntick mode, as the + * maximum PIT delta is quite small. + */ + if (tick_broadcast_reprogram()) + goto again; + } + spin_unlock(&tick_broadcast_lock); +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop + */ +void tick_broadcast_oneshot_control(unsigned long reason) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + unsigned long flags; + int cpu; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Periodic mode does not care about the enter/exit of power + * states + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + goto out; + + bc = tick_broadcast_device.evtdev; + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + goto out; + + if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { + if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + cpu_set(cpu, tick_broadcast_oneshot_mask); + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + if (dev->next_event.tv64 < bc->next_event.tv64) + tick_broadcast_set_event(dev->next_event, 1); + } + } else { + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + cpu_clear(cpu, tick_broadcast_oneshot_mask); + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + if (dev->next_event.tv64 != KTIME_MAX) + tick_program_event(dev->next_event, 1); + } + } + +out: + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/** + * tick_broadcast_setup_highres - setup the broadcast device for highres + */ +void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; + } +} + +/* + * Select oneshot operating mode for the broadcast device + */ +void tick_broadcast_switch_to_oneshot(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; + bc = tick_broadcast_device.evtdev; + if (bc) + tick_broadcast_setup_oneshot(bc); + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + + +/* + * Remove a dead CPU from broadcasting + */ +void tick_shutdown_broadcast_oneshot(unsigned int *cpup) +{ + struct clock_event_device *bc; + unsigned long flags; + unsigned int cpu = *cpup; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + cpu_clear(cpu, tick_broadcast_oneshot_mask); + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { + if (bc && cpus_empty(tick_broadcast_oneshot_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c new file mode 100644 index 00000000000..4500e347f1b --- /dev/null +++ b/kernel/time/tick-common.c @@ -0,0 +1,346 @@ +/* + * linux/kernel/time/tick-common.c + * + * This file contains the base functions to manage periodic tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Tick devices + */ +DEFINE_PER_CPU(struct tick_device, tick_cpu_device); +/* + * Tick next event: keeps track of the tick time + */ +ktime_t tick_next_period; +ktime_t tick_period; +static int tick_do_timer_cpu = -1; +DEFINE_SPINLOCK(tick_device_lock); + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_device(int cpu) +{ + return &per_cpu(tick_cpu_device, cpu); +} + +/** + * tick_is_oneshot_available - check for a oneshot capable event device + */ +int tick_is_oneshot_available(void) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + + return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); +} + +/* + * Periodic tick + */ +static void tick_periodic(int cpu) +{ + if (tick_do_timer_cpu == cpu) { + write_seqlock(&xtime_lock); + + /* Keep track of the next tick event */ + tick_next_period = ktime_add(tick_next_period, tick_period); + + do_timer(1); + write_sequnlock(&xtime_lock); + } + + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} + +/* + * Event handler for periodic ticks + */ +void tick_handle_periodic(struct clock_event_device *dev) +{ + int cpu = smp_processor_id(); + + tick_periodic(cpu); + + if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + return; + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + for (;;) { + ktime_t next = ktime_add(dev->next_event, tick_period); + + if (!clockevents_program_event(dev, next, ktime_get())) + return; + tick_periodic(cpu); + } +} + +/* + * Setup the device for a periodic tick + */ +void tick_setup_periodic(struct clock_event_device *dev, int broadcast) +{ + tick_set_periodic_handler(dev, broadcast); + + /* Broadcast setup ? */ + if (!tick_device_is_functional(dev)) + return; + + if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); + } else { + unsigned long seq; + ktime_t next; + + do { + seq = read_seqbegin(&xtime_lock); + next = tick_next_period; + } while (read_seqretry(&xtime_lock, seq)); + + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + + for (;;) { + if (!clockevents_program_event(dev, next, ktime_get())) + return; + next = ktime_add(next, tick_period); + } + } +} + +/* + * Setup the tick device + */ +static void tick_setup_device(struct tick_device *td, + struct clock_event_device *newdev, int cpu, + cpumask_t cpumask) +{ + ktime_t next_event; + void (*handler)(struct clock_event_device *) = NULL; + + /* + * First device setup ? + */ + if (!td->evtdev) { + /* + * If no cpu took the do_timer update, assign it to + * this cpu: + */ + if (tick_do_timer_cpu == -1) { + tick_do_timer_cpu = cpu; + tick_next_period = ktime_get(); + tick_period = ktime_set(0, NSEC_PER_SEC / HZ); + } + + /* + * Startup in periodic mode first. + */ + td->mode = TICKDEV_MODE_PERIODIC; + } else { + handler = td->evtdev->event_handler; + next_event = td->evtdev->next_event; + } + + td->evtdev = newdev; + + /* + * When the device is not per cpu, pin the interrupt to the + * current cpu: + */ + if (!cpus_equal(newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); + + /* + * When global broadcasting is active, check if the current + * device is registered as a placeholder for broadcast mode. + * This allows us to handle this x86 misfeature in a generic + * way. + */ + if (tick_device_uses_broadcast(newdev, cpu)) + return; + + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(newdev, 0); + else + tick_setup_oneshot(newdev, handler, next_event); +} + +/* + * Check, if the new registered device should be used. + */ +static int tick_check_new_device(struct clock_event_device *newdev) +{ + struct clock_event_device *curdev; + struct tick_device *td; + int cpu, ret = NOTIFY_OK; + unsigned long flags; + cpumask_t cpumask; + + spin_lock_irqsave(&tick_device_lock, flags); + + cpu = smp_processor_id(); + if (!cpu_isset(cpu, newdev->cpumask)) + goto out; + + td = &per_cpu(tick_cpu_device, cpu); + curdev = td->evtdev; + cpumask = cpumask_of_cpu(cpu); + + /* cpu local device ? */ + if (!cpus_equal(newdev->cpumask, cpumask)) { + + /* + * If the cpu affinity of the device interrupt can not + * be set, ignore it. + */ + if (!irq_can_set_affinity(newdev->irq)) + goto out_bc; + + /* + * If we have a cpu local device already, do not replace it + * by a non cpu local device + */ + if (curdev && cpus_equal(curdev->cpumask, cpumask)) + goto out_bc; + } + + /* + * If we have an active device, then check the rating and the oneshot + * feature. + */ + if (curdev) { + /* + * Prefer one shot capable devices ! + */ + if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + goto out_bc; + /* + * Check the rating + */ + if (curdev->rating >= newdev->rating) + goto out_bc; + } + + /* + * Replace the eventually existing device by the new + * device. If the current device is the broadcast device, do + * not give it back to the clockevents layer ! + */ + if (tick_is_broadcast_device(curdev)) { + clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + curdev = NULL; + } + clockevents_exchange_device(curdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); + + spin_unlock_irqrestore(&tick_device_lock, flags); + return NOTIFY_STOP; + +out_bc: + /* + * Can the new device be used as a broadcast device ? + */ + if (tick_check_broadcast_device(newdev)) + ret = NOTIFY_STOP; +out: + spin_unlock_irqrestore(&tick_device_lock, flags); + + return ret; +} + +/* + * Shutdown an event device on a given cpu: + * + * This is called on a life CPU, when a CPU is dead. So we cannot + * access the hardware device itself. + * We just set the mode and remove it from the lists. + */ +static void tick_shutdown(unsigned int *cpup) +{ + struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); + struct clock_event_device *dev = td->evtdev; + unsigned long flags; + + spin_lock_irqsave(&tick_device_lock, flags); + td->mode = TICKDEV_MODE_PERIODIC; + if (dev) { + /* + * Prevent that the clock events layer tries to call + * the set mode function! + */ + dev->mode = CLOCK_EVT_MODE_UNUSED; + clockevents_exchange_device(dev, NULL); + td->evtdev = NULL; + } + spin_unlock_irqrestore(&tick_device_lock, flags); +} + +/* + * Notification about clock event devices + */ +static int tick_notify(struct notifier_block *nb, unsigned long reason, + void *dev) +{ + switch (reason) { + + case CLOCK_EVT_NOTIFY_ADD: + return tick_check_new_device(dev); + + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + tick_broadcast_on_off(reason, dev); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(dev); + tick_shutdown_broadcast(dev); + tick_shutdown(dev); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block tick_notifier = { + .notifier_call = tick_notify, +}; + +/** + * tick_init - initialize the tick control + * + * Register the notifier with the clockevents framework + */ +void __init tick_init(void) +{ + clockevents_register_notifier(&tick_notifier); +} diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h new file mode 100644 index 00000000000..54861a0f29f --- /dev/null +++ b/kernel/time/tick-internal.h @@ -0,0 +1,110 @@ +/* + * tick internal variable and functions used by low/high res code + */ +DECLARE_PER_CPU(struct tick_device, tick_cpu_device); +extern spinlock_t tick_device_lock; +extern ktime_t tick_next_period; +extern ktime_t tick_period; + +extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); +extern void tick_handle_periodic(struct clock_event_device *dev); + +/* + * NO_HZ / high resolution timer shared code + */ +#ifdef CONFIG_TICK_ONESHOT +extern void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt); +extern int tick_program_event(ktime_t expires, int force); +extern void tick_oneshot_notify(void); +extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +extern void tick_broadcast_oneshot_control(unsigned long reason); +extern void tick_broadcast_switch_to_oneshot(void); +extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); +# else /* BROADCAST */ +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + BUG(); +} +static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline void tick_broadcast_switch_to_oneshot(void) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +# endif /* !BROADCAST */ + +#else /* !ONESHOT */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) +{ + BUG(); +} +static inline int tick_program_event(ktime_t expires, int force) +{ + return 0; +} +static inline void tick_oneshot_notify(void) { } +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + BUG(); +} +static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +#endif /* !TICK_ONESHOT */ + +/* + * Broadcasting support + */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int tick_do_broadcast(cpumask_t mask); + +extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); +extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern int tick_is_broadcast_device(struct clock_event_device *dev); +extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); +extern void tick_shutdown_broadcast(unsigned int *cpup); + +extern void +tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); + +#else /* !BROADCAST */ + +static inline int tick_check_broadcast_device(struct clock_event_device *dev) +{ + return 0; +} + +static inline int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return 0; +} +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, + int cpu) +{ + return 0; +} +static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } +static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } +static inline void tick_shutdown_broadcast(unsigned int *cpup) { } + +/* + * Set the periodic handler in non broadcast mode + */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, + int broadcast) +{ + dev->event_handler = tick_handle_periodic; +} +#endif /* !BROADCAST */ + +/* + * Check, if the device is functional or a dummy for broadcast + */ +static inline int tick_device_is_functional(struct clock_event_device *dev) +{ + return !(dev->features & CLOCK_EVT_FEAT_DUMMY); +} diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c new file mode 100644 index 00000000000..2e8b7ff863c --- /dev/null +++ b/kernel/time/tick-oneshot.c @@ -0,0 +1,84 @@ +/* + * linux/kernel/time/tick-oneshot.c + * + * This file contains functions which manage high resolution tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + ktime_t now = ktime_get(); + + while (1) { + int ret = clockevents_program_event(dev, expires, now); + + if (!ret || !force) + return ret; + now = ktime_get(); + expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); + } +} + +/** + * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + */ +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t next_event) +{ + newdev->event_handler = handler; + clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); + clockevents_program_event(newdev, next_event, ktime_get()); +} + +/** + * tick_switch_to_oneshot - switch to oneshot mode + */ +int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct clock_event_device *dev = td->evtdev; + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || + !tick_device_is_functional(dev)) + return -EINVAL; + + td->mode = TICKDEV_MODE_ONESHOT; + dev->event_handler = handler; + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + tick_broadcast_switch_to_oneshot(); + return 0; +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * tick_init_highres - switch to high resolution mode + * + * Called with interrupts disabled. + */ +int tick_init_highres(void) +{ + return tick_switch_to_oneshot(hrtimer_interrupt); +} +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c new file mode 100644 index 00000000000..95e41f7f850 --- /dev/null +++ b/kernel/time/tick-sched.c @@ -0,0 +1,563 @@ +/* + * linux/kernel/time/tick-sched.c + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * No idle tick implementation for low and high resolution timers + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Per cpu nohz control structure + */ +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); + +/* + * The time, when the last jiffy update happened. Protected by xtime_lock. + */ +static ktime_t last_jiffies_update; + +struct tick_sched *tick_get_tick_sched(int cpu) +{ + return &per_cpu(tick_cpu_sched, cpu); +} + +/* + * Must be called with interrupts disabled ! + */ +static void tick_do_update_jiffies64(ktime_t now) +{ + unsigned long ticks = 0; + ktime_t delta; + + /* Reevalute with xtime_lock held */ + write_seqlock(&xtime_lock); + + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 >= tick_period.tv64) { + + delta = ktime_sub(delta, tick_period); + last_jiffies_update = ktime_add(last_jiffies_update, + tick_period); + + /* Slow path for long timeouts */ + if (unlikely(delta.tv64 >= tick_period.tv64)) { + s64 incr = ktime_to_ns(tick_period); + + ticks = ktime_divns(delta, incr); + + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); + } + do_timer(++ticks); + } + write_sequnlock(&xtime_lock); +} + +/* + * Initialize and return retrieve the jiffies update. + */ +static ktime_t tick_init_jiffy_update(void) +{ + ktime_t period; + + write_seqlock(&xtime_lock); + /* Did we start the jiffies update yet ? */ + if (last_jiffies_update.tv64 == 0) + last_jiffies_update = tick_next_period; + period = last_jiffies_update; + write_sequnlock(&xtime_lock); + return period; +} + +/* + * NOHZ - aka dynamic tick functionality + */ +#ifdef CONFIG_NO_HZ +/* + * NO HZ enabled ? + */ +static int tick_nohz_enabled __read_mostly = 1; + +/* + * Enable / Disable tickless mode + */ +static int __init setup_tick_nohz(char *str) +{ + if (!strcmp(str, "off")) + tick_nohz_enabled = 0; + else if (!strcmp(str, "on")) + tick_nohz_enabled = 1; + else + return 0; + return 1; +} + +__setup("nohz=", setup_tick_nohz); + +/** + * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * + * Called from interrupt entry when the CPU was idle + * + * In case the sched_tick was stopped on this CPU, we have to check if jiffies + * must be updated. Otherwise an interrupt handler could use a stale jiffy + * value. We do this unconditionally on any cpu, as we don't know whether the + * cpu, which has the update task assigned is in a long sleep. + */ +void tick_nohz_update_jiffies(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + unsigned long flags; + ktime_t now; + + if (!ts->tick_stopped) + return; + + cpu_clear(cpu, nohz_cpu_mask); + now = ktime_get(); + + local_irq_save(flags); + tick_do_update_jiffies64(now); + local_irq_restore(flags); +} + +/** + * tick_nohz_stop_sched_tick - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called either from the idle loop or from irq_exit() when an idle period was + * just interrupted by an interrupt which did not cause a reschedule. + */ +void tick_nohz_stop_sched_tick(void) +{ + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; + struct tick_sched *ts; + ktime_t last_update, expires, now, delta; + int cpu; + + local_irq_save(flags); + + cpu = smp_processor_id(); + ts = &per_cpu(tick_cpu_sched, cpu); + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + goto end; + + if (need_resched()) + goto end; + + cpu = smp_processor_id(); + BUG_ON(local_softirq_pending()); + + now = ktime_get(); + /* + * When called from irq_exit we need to account the idle sleep time + * correctly. + */ + if (ts->tick_stopped) { + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + } + + ts->idle_entrytime = now; + ts->idle_calls++; + + /* Read jiffies and the time when jiffies were updated last */ + do { + seq = read_seqbegin(&xtime_lock); + last_update = last_jiffies_update; + last_jiffies = jiffies; + } while (read_seqretry(&xtime_lock, seq)); + + /* Get the next timer wheel timer */ + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + + /* + * Do not stop the tick, if we are only one off + * or if the cpu is required for rcu + */ + if (!ts->tick_stopped && (delta_jiffies == 1 || rcu_needs_cpu(cpu))) + goto out; + + /* Schedule the tick, if we are at least one jiffie off */ + if ((long)delta_jiffies >= 1) { + + if (rcu_needs_cpu(cpu)) + delta_jiffies = 1; + else + cpu_set(cpu, nohz_cpu_mask); + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + ts->idle_tick = ts->sched_timer.expires; + ts->tick_stopped = 1; + ts->idle_jiffies = last_jiffies; + } + /* + * calculate the expiry time for the next timer wheel + * timer + */ + expires = ktime_add_ns(last_update, tick_period.tv64 * + delta_jiffies); + ts->idle_expires = expires; + ts->idle_sleeps++; + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + goto out; + } else if(!tick_program_event(expires, 0)) + goto out; + /* + * We are past the event already. So we crossed a + * jiffie boundary. Update jiffies and raise the + * softirq. + */ + tick_do_update_jiffies64(ktime_get()); + cpu_clear(cpu, nohz_cpu_mask); + } + raise_softirq_irqoff(TIMER_SOFTIRQ); +out: + ts->next_jiffies = next_jiffies; + ts->last_jiffies = last_jiffies; +end: + local_irq_restore(flags); +} + +/** + * nohz_restart_sched_tick - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + */ +void tick_nohz_restart_sched_tick(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + unsigned long ticks; + ktime_t now, delta; + + if (!ts->tick_stopped) + return; + + /* Update jiffies first */ + now = ktime_get(); + + local_irq_disable(); + tick_do_update_jiffies64(now); + cpu_clear(cpu, nohz_cpu_mask); + + /* Account the idle time */ + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + + /* + * We stopped the tick in idle. Update process times would miss the + * time we slept as update_process_times does only a 1 tick + * accounting. Enforce that this is accounted to idle ! + */ + ticks = jiffies - ts->idle_jiffies; + /* + * We might be one off. Do not randomly account a huge number of ticks! + */ + if (ticks && ticks < LONG_MAX) { + add_preempt_count(HARDIRQ_OFFSET); + account_system_time(current, HARDIRQ_OFFSET, + jiffies_to_cputime(ticks)); + sub_preempt_count(HARDIRQ_OFFSET); + } + + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } + local_irq_enable(); +} + +static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) +{ + hrtimer_forward(&ts->sched_timer, now, tick_period); + return tick_program_event(ts->sched_timer.expires, 0); +} + +/* + * The nohz low res interrupt handler + */ +static void tick_nohz_handler(struct clock_event_device *dev) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + dev->next_event.tv64 = KTIME_MAX; + + /* Check, if the jiffies need an update */ + tick_do_update_jiffies64(now); + + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start + * of idle" jiffy stamp so the idle accounting adjustment we + * do when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + ts->idle_jiffies++; + } + + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); + + /* Do not restart, when we are in the idle loop */ + if (ts->tick_stopped) + return; + + while (tick_nohz_reprogram(ts, now)) { + now = ktime_get(); + tick_do_update_jiffies64(now); + } +} + +/** + * tick_nohz_switch_to_nohz - switch to nohz mode + */ +static void tick_nohz_switch_to_nohz(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t next; + + if (!tick_nohz_enabled) + return; + + local_irq_disable(); + if (tick_switch_to_oneshot(tick_nohz_handler)) { + local_irq_enable(); + return; + } + + ts->nohz_mode = NOHZ_MODE_LOWRES; + + /* + * Recycle the hrtimer in ts, so we can share the + * hrtimer_forward with the highres code. + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + /* Get the next period */ + next = tick_init_jiffy_update(); + + for (;;) { + ts->sched_timer.expires = next; + if (!tick_program_event(next, 0)) + break; + next = ktime_add(next, tick_period); + } + local_irq_enable(); + + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", + smp_processor_id()); +} + +#else + +static inline void tick_nohz_switch_to_nohz(void) { } + +#endif /* NO_HZ */ + +/* + * High resolution timer specific code + */ +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * We rearm the timer until we get disabled by the idle code + * Called with interrupts disabled and timer->base->cpu_base->lock held. + */ +static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) +{ + struct tick_sched *ts = + container_of(timer, struct tick_sched, sched_timer); + struct hrtimer_cpu_base *base = timer->base->cpu_base; + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + /* Check, if the jiffies need an update */ + tick_do_update_jiffies64(now); + + /* + * Do not call, when we are not in irq context and have + * no valid regs pointer + */ + if (regs) { + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + ts->idle_jiffies++; + } + /* + * update_process_times() might take tasklist_lock, hence + * drop the base lock. sched-tick hrtimers are per-CPU and + * never accessible by userspace APIs, so this is safe to do. + */ + spin_unlock(&base->lock); + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); + spin_lock(&base->lock); + } + + /* Do not restart, when we are in the idle loop */ + if (ts->tick_stopped) + return HRTIMER_NORESTART; + + hrtimer_forward(timer, now, tick_period); + + return HRTIMER_RESTART; +} + +/** + * tick_setup_sched_timer - setup the tick emulation timer + */ +void tick_setup_sched_timer(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t now = ktime_get(); + + /* + * Emulate tick processing via per-CPU hrtimers: + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.function = tick_sched_timer; + ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + + /* Get the next period */ + ts->sched_timer.expires = tick_init_jiffy_update(); + + for (;;) { + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + now = ktime_get(); + } + +#ifdef CONFIG_NO_HZ + if (tick_nohz_enabled) + ts->nohz_mode = NOHZ_MODE_HIGHRES; +#endif +} + +void tick_cancel_sched_timer(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (ts->sched_timer.base) + hrtimer_cancel(&ts->sched_timer); + ts->tick_stopped = 0; + ts->nohz_mode = NOHZ_MODE_INACTIVE; +} +#endif /* HIGH_RES_TIMERS */ + +/** + * Async notification about clocksource changes + */ +void tick_clock_notify(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); +} + +/* + * Async notification about clock event changes + */ +void tick_oneshot_notify(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + set_bit(0, &ts->check_clocks); +} + +/** + * Check, if a change happened, which makes oneshot possible. + * + * Called cyclic from the hrtimer softirq (driven by the timer + * softirq) allow_nohz signals, that we can switch into low-res nohz + * mode, because high resolution timers are disabled (either compile + * or runtime). + */ +int tick_check_oneshot_change(int allow_nohz) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (!test_and_clear_bit(0, &ts->check_clocks)) + return 0; + + if (ts->nohz_mode != NOHZ_MODE_INACTIVE) + return 0; + + if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) + return 0; + + if (!allow_nohz) + return 1; + + tick_nohz_switch_to_nohz(); + return 0; +} diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c new file mode 100644 index 00000000000..f82c635c3d5 --- /dev/null +++ b/kernel/time/timer_list.c @@ -0,0 +1,287 @@ +/* + * kernel/time/timer_list.c + * + * List pending timers + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/tick.h> + +#include <asm/uaccess.h> + +typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +/* + * This allows printing both to /proc/timer_list and + * to the console (on SysRq-Q): + */ +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) + +static void print_name_offset(struct seq_file *m, void *sym) +{ + unsigned long addr = (unsigned long)sym; + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + SEQ_printf(m, "%s", sym_name); + else + SEQ_printf(m, "<%p>", sym); +} + +static void +print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +{ +#ifdef CONFIG_TIMER_STATS + char tmp[TASK_COMM_LEN + 1]; +#endif + SEQ_printf(m, " #%d: ", idx); + print_name_offset(m, timer); + SEQ_printf(m, ", "); + print_name_offset(m, timer->function); + SEQ_printf(m, ", S:%02lx", timer->state); +#ifdef CONFIG_TIMER_STATS + SEQ_printf(m, ", "); + print_name_offset(m, timer->start_site); + memcpy(tmp, timer->start_comm, TASK_COMM_LEN); + tmp[TASK_COMM_LEN] = 0; + SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); +#endif + SEQ_printf(m, "\n"); + SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", + (unsigned long long)ktime_to_ns(timer->expires), + (unsigned long long)(ktime_to_ns(timer->expires) - now)); +} + +static void +print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, + u64 now) +{ + struct hrtimer *timer, tmp; + unsigned long next = 0, i; + struct rb_node *curr; + unsigned long flags; + +next_one: + i = 0; + spin_lock_irqsave(&base->cpu_base->lock, flags); + + curr = base->first; + /* + * Crude but we have to do this O(N*N) thing, because + * we have to unlock the base when printing: + */ + while (curr && i < next) { + curr = rb_next(curr); + i++; + } + + if (curr) { + + timer = rb_entry(curr, struct hrtimer, node); + tmp = *timer; + spin_unlock_irqrestore(&base->cpu_base->lock, flags); + + print_timer(m, &tmp, i, now); + next++; + goto next_one; + } + spin_unlock_irqrestore(&base->cpu_base->lock, flags); +} + +static void +print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) +{ + SEQ_printf(m, " .index: %d\n", + base->index); + SEQ_printf(m, " .resolution: %Ld nsecs\n", + (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .get_time: "); + print_name_offset(m, base->get_time); + SEQ_printf(m, "\n"); +#ifdef CONFIG_HIGH_RES_TIMERS + SEQ_printf(m, " .offset: %Ld nsecs\n", + ktime_to_ns(base->offset)); +#endif + SEQ_printf(m, "active timers:\n"); + print_active_timers(m, base, now); +} + +static void print_cpu(struct seq_file *m, int cpu, u64 now) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + SEQ_printf(m, "\ncpu: %d\n", cpu); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + SEQ_printf(m, " clock %d:\n", i); + print_base(m, cpu_base->clock_base + i, now); + } +#define P(x) \ + SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) +#define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ + (u64)(ktime_to_ns(cpu_base->x))) + +#ifdef CONFIG_HIGH_RES_TIMERS + P_ns(expires_next); + P(hres_active); + P(nr_events); +#endif +#undef P +#undef P_ns + +#ifdef CONFIG_TICK_ONESHOT +# define P(x) \ + SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) +# define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ + (u64)(ktime_to_ns(ts->x))) + { + struct tick_sched *ts = tick_get_tick_sched(cpu); + P(nohz_mode); + P_ns(idle_tick); + P(tick_stopped); + P(idle_jiffies); + P(idle_calls); + P(idle_sleeps); + P_ns(idle_entrytime); + P_ns(idle_sleeptime); + P(last_jiffies); + P(next_jiffies); + P_ns(idle_expires); + SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); + } +#endif + +#undef P +#undef P_ns +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +static void +print_tickdevice(struct seq_file *m, struct tick_device *td) +{ + struct clock_event_device *dev = td->evtdev; + + SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode); + + SEQ_printf(m, "Clock Event Device: "); + if (!dev) { + SEQ_printf(m, "<NULL>\n"); + return; + } + SEQ_printf(m, "%s\n", dev->name); + SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns); + SEQ_printf(m, " mult: %ld\n", dev->mult); + SEQ_printf(m, " shift: %d\n", dev->shift); + SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " next_event: %Ld nsecs\n", + (unsigned long long) ktime_to_ns(dev->next_event)); + + SEQ_printf(m, " set_next_event: "); + print_name_offset(m, dev->set_next_event); + SEQ_printf(m, "\n"); + + SEQ_printf(m, " set_mode: "); + print_name_offset(m, dev->set_mode); + SEQ_printf(m, "\n"); + + SEQ_printf(m, " event_handler: "); + print_name_offset(m, dev->event_handler); + SEQ_printf(m, "\n"); +} + +static void timer_list_show_tickdevices(struct seq_file *m) +{ + int cpu; + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + print_tickdevice(m, tick_get_broadcast_device()); + SEQ_printf(m, "tick_broadcast_mask: %08lx\n", + tick_get_broadcast_mask()->bits[0]); +#ifdef CONFIG_TICK_ONESHOT + SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", + tick_get_broadcast_oneshot_mask()->bits[0]); +#endif + SEQ_printf(m, "\n"); +#endif + for_each_online_cpu(cpu) + print_tickdevice(m, tick_get_device(cpu)); + SEQ_printf(m, "\n"); +} +#else +static void timer_list_show_tickdevices(struct seq_file *m) { } +#endif + +static int timer_list_show(struct seq_file *m, void *v) +{ + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + + for_each_online_cpu(cpu) + print_cpu(m, cpu, now); + + SEQ_printf(m, "\n"); + timer_list_show_tickdevices(m); + + return 0; +} + +void sysrq_timer_list_show(void) +{ + timer_list_show(NULL, NULL); +} + +static int timer_list_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timer_list_show, NULL); +} + +static struct file_operations timer_list_fops = { + .open = timer_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init init_timer_list_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("timer_list", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &timer_list_fops; + + return 0; +} +__initcall(init_timer_list_procfs); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c new file mode 100644 index 00000000000..1bc4882e28e --- /dev/null +++ b/kernel/time/timer_stats.c @@ -0,0 +1,411 @@ +/* + * kernel/time/timer_stats.c + * + * Collect timer usage statistics. + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * timer_stats is based on timer_top, a similar functionality which was part of + * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the + * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based + * on dynamic allocation of the statistics entries and linear search based + * lookup combined with a global lock, rather than the static array, hash + * and per-CPU locking which is used by timer_stats. It was written for the + * pre hrtimer kernel code and therefore did not take hrtimers into account. + * Nevertheless it provided the base for the timer_stats implementation and + * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks + * for this effort. + * + * timer_top.c is + * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus + * Written by Daniel Petrini <d.pensator@gmail.com> + * timer_top.c was released under the GNU General Public License version 2 + * + * We export the addresses and counting of timer functions being called, + * the pid and cmdline from the owner process if applicable. + * + * Start/stop data collection: + * # echo 1[0] >/proc/timer_stats + * + * Display the information collected so far: + * # cat /proc/timer_stats + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> + +#include <asm/uaccess.h> + +/* + * This is our basic unit of interest: a timer expiry event identified + * by the timer, its start/expire functions and the PID of the task that + * started the timer. We count the number of times an event happens: + */ +struct entry { + /* + * Hash list: + */ + struct entry *next; + + /* + * Hash keys: + */ + void *timer; + void *start_func; + void *expire_func; + pid_t pid; + + /* + * Number of timeout events: + */ + unsigned long count; + + /* + * We save the command-line string to preserve + * this information past task exit: + */ + char comm[TASK_COMM_LEN + 1]; + +} ____cacheline_aligned_in_smp; + +/* + * Spinlock protecting the tables - not taken during lookup: + */ +static DEFINE_SPINLOCK(table_lock); + +/* + * Per-CPU lookup locks for fast hash lookup: + */ +static DEFINE_PER_CPU(spinlock_t, lookup_lock); + +/* + * Mutex to serialize state changes with show-stats activities: + */ +static DEFINE_MUTEX(show_mutex); + +/* + * Collection status, active/inactive: + */ +static int __read_mostly active; + +/* + * Beginning/end timestamps of measurement: + */ +static ktime_t time_start, time_stop; + +/* + * tstat entry structs only get allocated while collection is + * active and never freed during that time - this simplifies + * things quite a bit. + * + * They get freed when a new collection period is started. + */ +#define MAX_ENTRIES_BITS 10 +#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) + +static unsigned long nr_entries; +static struct entry entries[MAX_ENTRIES]; + +static atomic_t overflow_count; + +static void reset_entries(void) +{ + nr_entries = 0; + memset(entries, 0, sizeof(entries)); + atomic_set(&overflow_count, 0); +} + +static struct entry *alloc_entry(void) +{ + if (nr_entries >= MAX_ENTRIES) + return NULL; + + return entries + nr_entries++; +} + +/* + * The entries are in a hash-table, for fast lookup: + */ +#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) +#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) +#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) + +#define __tstat_hashfn(entry) \ + (((unsigned long)(entry)->timer ^ \ + (unsigned long)(entry)->start_func ^ \ + (unsigned long)(entry)->expire_func ^ \ + (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) + +#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) + +static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; + +static int match_entries(struct entry *entry1, struct entry *entry2) +{ + return entry1->timer == entry2->timer && + entry1->start_func == entry2->start_func && + entry1->expire_func == entry2->expire_func && + entry1->pid == entry2->pid; +} + +/* + * Look up whether an entry matching this item is present + * in the hash already. Must be called with irqs off and the + * lookup lock held: + */ +static struct entry *tstat_lookup(struct entry *entry, char *comm) +{ + struct entry **head, *curr, *prev; + + head = tstat_hashentry(entry); + curr = *head; + + /* + * The fastpath is when the entry is already hashed, + * we do this with the lookup lock held, but with the + * table lock not held: + */ + while (curr) { + if (match_entries(curr, entry)) + return curr; + + curr = curr->next; + } + /* + * Slowpath: allocate, set up and link a new hash entry: + */ + prev = NULL; + curr = *head; + + spin_lock(&table_lock); + /* + * Make sure we have not raced with another CPU: + */ + while (curr) { + if (match_entries(curr, entry)) + goto out_unlock; + + prev = curr; + curr = curr->next; + } + + curr = alloc_entry(); + if (curr) { + *curr = *entry; + curr->count = 0; + memcpy(curr->comm, comm, TASK_COMM_LEN); + if (prev) + prev->next = curr; + else + *head = curr; + curr->next = NULL; + } + out_unlock: + spin_unlock(&table_lock); + + return curr; +} + +/** + * timer_stats_update_stats - Update the statistics for a timer. + * @timer: pointer to either a timer_list or a hrtimer + * @pid: the pid of the task which set up the timer + * @startf: pointer to the function which did the timer setup + * @timerf: pointer to the timer callback function of the timer + * @comm: name of the process which set up the timer + * + * When the timer is already registered, then the event counter is + * incremented. Otherwise the timer is registered in a free slot. + */ +void timer_stats_update_stats(void *timer, pid_t pid, void *startf, + void *timerf, char * comm) +{ + /* + * It doesnt matter which lock we take: + */ + spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id()); + struct entry *entry, input; + unsigned long flags; + + input.timer = timer; + input.start_func = startf; + input.expire_func = timerf; + input.pid = pid; + + spin_lock_irqsave(lock, flags); + if (!active) + goto out_unlock; + + entry = tstat_lookup(&input, comm); + if (likely(entry)) + entry->count++; + else + atomic_inc(&overflow_count); + + out_unlock: + spin_unlock_irqrestore(lock, flags); +} + +static void print_name_offset(struct seq_file *m, unsigned long addr) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s", sym_name); + else + seq_printf(m, "<%p>", (void *)addr); +} + +static int tstats_show(struct seq_file *m, void *v) +{ + struct timespec period; + struct entry *entry; + unsigned long ms; + long events = 0; + ktime_t time; + int i; + + mutex_lock(&show_mutex); + /* + * If still active then calculate up to now: + */ + if (active) + time_stop = ktime_get(); + + time = ktime_sub(time_stop, time_start); + + period = ktime_to_timespec(time); + ms = period.tv_nsec / 1000000; + + seq_puts(m, "Timer Stats Version: v0.1\n"); + seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + if (atomic_read(&overflow_count)) + seq_printf(m, "Overflow: %d entries\n", + atomic_read(&overflow_count)); + + for (i = 0; i < nr_entries; i++) { + entry = entries + i; + seq_printf(m, "%4lu, %5d %-16s ", + entry->count, entry->pid, entry->comm); + + print_name_offset(m, (unsigned long)entry->start_func); + seq_puts(m, " ("); + print_name_offset(m, (unsigned long)entry->expire_func); + seq_puts(m, ")\n"); + + events += entry->count; + } + + ms += period.tv_sec * 1000; + if (!ms) + ms = 1; + + if (events && period.tv_sec) + seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, + events / period.tv_sec, events * 1000 / ms); + else + seq_printf(m, "%ld total events\n", events); + + mutex_unlock(&show_mutex); + + return 0; +} + +/* + * After a state change, make sure all concurrent lookup/update + * activities have stopped: + */ +static void sync_access(void) +{ + unsigned long flags; + int cpu; + + for_each_online_cpu(cpu) { + spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); + /* nothing */ + spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); + } +} + +static ssize_t tstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + char ctl[2]; + + if (count != 2 || *offs) + return -EINVAL; + + if (copy_from_user(ctl, buf, count)) + return -EFAULT; + + mutex_lock(&show_mutex); + switch (ctl[0]) { + case '0': + if (active) { + active = 0; + time_stop = ktime_get(); + sync_access(); + } + break; + case '1': + if (!active) { + reset_entries(); + time_start = ktime_get(); + active = 1; + } + break; + default: + count = -EINVAL; + } + mutex_unlock(&show_mutex); + + return count; +} + +static int tstats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, tstats_show, NULL); +} + +static struct file_operations tstats_fops = { + .open = tstats_open, + .read = seq_read, + .write = tstats_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +void __init init_timer_stats(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + spin_lock_init(&per_cpu(lookup_lock, cpu)); +} + +static int __init init_tstats_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("timer_stats", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &tstats_fops; + + return 0; +} +__initcall(init_tstats_procfs); diff --git a/kernel/timer.c b/kernel/timer.c index 4902181e10e..cb1b86a9c52 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -34,6 +34,8 @@ #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/delay.h> +#include <linux/tick.h> +#include <linux/kallsyms.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +#ifdef CONFIG_TIMER_STATS +void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif + /** * init_timer - initialize a timer. * @timer: the timer to be initialized @@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL(init_timer); static inline void detach_timer(struct timer_list *timer, - int clear_pending) + int clear_pending) { struct list_head *entry = &timer->entry; @@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) unsigned long flags; int ret = 0; + timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); base = lock_timer_base(timer, &flags); @@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu) tvec_base_t *base = per_cpu(tvec_bases, cpu); unsigned long flags; + timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer->base = base; @@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) { BUG_ON(!timer->function); + timer_stats_timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; + timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { @@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base) fn = timer->function; data = timer->data; + timer_stats_account_timer(timer); + set_running_timer(base, timer); detach_timer(timer, 1); spin_unlock_irq(&base->lock); @@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base) spin_unlock_irq(&base->lock); } -#ifdef CONFIG_NO_IDLE_HZ +#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a cpus is idle. * This functions needs to be called disabled. */ -unsigned long next_timer_interrupt(void) +static unsigned long __next_timer_interrupt(tvec_base_t *base) { - tvec_base_t *base; - struct list_head *list; + unsigned long timer_jiffies = base->timer_jiffies; + unsigned long expires = timer_jiffies + (LONG_MAX >> 1); + int index, slot, array, found = 0; struct timer_list *nte; - unsigned long expires; - unsigned long hr_expires = MAX_JIFFY_OFFSET; - ktime_t hr_delta; tvec_t *varray[4]; - int i, j; - - hr_delta = hrtimer_get_next_event(); - if (hr_delta.tv64 != KTIME_MAX) { - struct timespec tsdelta; - tsdelta = ktime_to_timespec(hr_delta); - hr_expires = timespec_to_jiffies(&tsdelta); - if (hr_expires < 3) - return hr_expires + jiffies; - } - hr_expires += jiffies; - - base = __get_cpu_var(tvec_bases); - spin_lock(&base->lock); - expires = base->timer_jiffies + (LONG_MAX >> 1); - list = NULL; /* Look for timer events in tv1. */ - j = base->timer_jiffies & TVR_MASK; + index = slot = timer_jiffies & TVR_MASK; do { - list_for_each_entry(nte, base->tv1.vec + j, entry) { + list_for_each_entry(nte, base->tv1.vec + slot, entry) { + found = 1; expires = nte->expires; - if (j < (base->timer_jiffies & TVR_MASK)) - list = base->tv2.vec + (INDEX(0)); - goto found; + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + goto cascade; + return expires; } - j = (j + 1) & TVR_MASK; - } while (j != (base->timer_jiffies & TVR_MASK)); + slot = (slot + 1) & TVR_MASK; + } while (slot != index); + +cascade: + /* Calculate the next cascade event */ + if (index) + timer_jiffies += TVR_SIZE - index; + timer_jiffies >>= TVR_BITS; /* Check tv2-tv5. */ varray[0] = &base->tv2; varray[1] = &base->tv3; varray[2] = &base->tv4; varray[3] = &base->tv5; - for (i = 0; i < 4; i++) { - j = INDEX(i); + + for (array = 0; array < 4; array++) { + tvec_t *varp = varray[array]; + + index = slot = timer_jiffies & TVN_MASK; do { - if (list_empty(varray[i]->vec + j)) { - j = (j + 1) & TVN_MASK; - continue; - } - list_for_each_entry(nte, varray[i]->vec + j, entry) + list_for_each_entry(nte, varp->vec + slot, entry) { + found = 1; if (time_before(nte->expires, expires)) expires = nte->expires; - if (j < (INDEX(i)) && i < 3) - list = varray[i + 1]->vec + (INDEX(i + 1)); - goto found; - } while (j != (INDEX(i))); - } -found: - if (list) { - /* - * The search wrapped. We need to look at the next list - * from next tv element that would cascade into tv element - * where we found the timer element. - */ - list_for_each_entry(nte, list, entry) { - if (time_before(nte->expires, expires)) - expires = nte->expires; - } + } + /* + * Do we still search for the first timer or are + * we looking up the cascade buckets ? + */ + if (found) { + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + break; + return expires; + } + slot = (slot + 1) & TVN_MASK; + } while (slot != index); + + if (index) + timer_jiffies += TVN_SIZE - index; + timer_jiffies >>= TVN_BITS; } - spin_unlock(&base->lock); + return expires; +} - /* - * It can happen that other CPUs service timer IRQs and increment - * jiffies, but we have not yet got a local timer tick to process - * the timer wheels. In that case, the expiry time can be before - * jiffies, but since the high-resolution timer here is relative to - * jiffies, the default expression when high-resolution timers are - * not active, - * - * time_before(MAX_JIFFY_OFFSET + jiffies, expires) - * - * would falsely evaluate to true. If that is the case, just - * return jiffies so that we can immediately fire the local timer - */ - if (time_before(expires, jiffies)) - return jiffies; +/* + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +static unsigned long cmp_next_hrtimer_event(unsigned long now, + unsigned long expires) +{ + ktime_t hr_delta = hrtimer_get_next_event(); + struct timespec tsdelta; + + if (hr_delta.tv64 == KTIME_MAX) + return expires; - if (time_before(hr_expires, expires)) - return hr_expires; + if (hr_delta.tv64 <= TICK_NSEC) + return now; + tsdelta = ktime_to_timespec(hr_delta); + now += timespec_to_jiffies(&tsdelta); + if (time_before(now, expires)) + return now; return expires; } + +/** + * next_timer_interrupt - return the jiffy of the next pending timer + */ +unsigned long get_next_timer_interrupt(unsigned long now) +{ + tvec_base_t *base = __get_cpu_var(tvec_bases); + unsigned long expires; + + spin_lock(&base->lock); + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); + + if (time_before_eq(expires, now)) + return now; + + return cmp_next_hrtimer_event(now, expires); +} + +#ifdef CONFIG_NO_IDLE_HZ +unsigned long next_timer_interrupt(void) +{ + return get_next_timer_interrupt(jiffies); +} +#endif + #endif /******************************************************************/ @@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday); * * Accumulates current time interval and initializes new clocksource */ -static int change_clocksource(void) +static void change_clocksource(void) { struct clocksource *new; cycle_t now; u64 nsec; + new = clocksource_get_next(); - if (clock != new) { - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); - - clock = new; - clock->cycle_last = now; - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - return 1; - } else if (clock->update_callback) { - return clock->update_callback(); - } - return 0; + + if (clock == new) + return; + + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + clock->cycle_last = now; + + clock->error = 0; + clock->xtime_nsec = 0; + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + + tick_clock_notify(); + + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); } #else -static inline int change_clocksource(void) -{ - return 0; -} +static inline void change_clocksource(void) { } #endif /** @@ -871,33 +918,56 @@ int timekeeping_is_continuous(void) do { seq = read_seqbegin(&xtime_lock); - ret = clock->is_continuous; + ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqretry(&xtime_lock, seq)); return ret; } +/** + * read_persistent_clock - Return time in seconds from the persistent clock. + * + * Weak dummy function for arches that do not yet support it. + * Returns seconds from epoch using the battery backed persistent clock. + * Returns zero if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +unsigned long __attribute__((weak)) read_persistent_clock(void) +{ + return 0; +} + /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { unsigned long flags; + unsigned long sec = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); ntp_clear(); clock = clocksource_get_next(); - clocksource_calculate_interval(clock, tick_nsec); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); clock->cycle_last = clocksource_read(clock); + xtime.tv_sec = sec; + xtime.tv_nsec = 0; + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + write_sequnlock_irqrestore(&xtime_lock, flags); } - +/* flag for if timekeeping is suspended */ static int timekeeping_suspended; +/* time in seconds when suspend began */ +static unsigned long timekeeping_suspend_time; + /** * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused @@ -909,13 +979,26 @@ static int timekeeping_suspended; static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; + unsigned long now = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - /* restart the last cycle value */ + + if (now && (now > timekeeping_suspend_time)) { + unsigned long sleep_length = now - timekeeping_suspend_time; + + xtime.tv_sec += sleep_length; + wall_to_monotonic.tv_sec -= sleep_length; + } + /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); clock->error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); + + touch_softlockup_watchdog(); + /* Resume hrtimers */ + clock_was_set(); + return 0; } @@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) write_seqlock_irqsave(&xtime_lock, flags); timekeeping_suspended = 1; + timekeeping_suspend_time = read_persistent_clock(); write_sequnlock_irqrestore(&xtime_lock, flags); return 0; } @@ -1089,11 +1173,8 @@ static void update_wall_time(void) clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; /* check to see if there is a new clocksource to use */ - if (change_clocksource()) { - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, tick_nsec); - } + change_clocksource(); + update_vsyscall(&xtime, clock); } /* @@ -1173,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h) { tvec_base_t *base = __get_cpu_var(tvec_bases); - hrtimer_run_queues(); + hrtimer_run_queues(); + if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } @@ -1619,6 +1701,8 @@ void __init init_timers(void) int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + init_timer_stats(); + BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index baacc369141..658f638c402 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -22,8 +22,6 @@ #include <linux/acct.h> #include <linux/jiffies.h> - -#define USEC_PER_TICK (USEC_PER_SEC/HZ) /* * fill in basic accounting fields */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 020d1fff57d..b6fa5e63085 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) } EXPORT_SYMBOL_GPL(queue_work); -static void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct workqueue_struct *wq = get_wq_data(&dwork->work); @@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + timer_stats_timer_set_start_info(timer); if (delay == 0) return queue_work(wq, work); @@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work); * After waiting for a given time this puts a job in the kernel-global * workqueue. */ -int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) +int fastcall schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) { + timer_stats_timer_set_start_info(&dwork->timer); return queue_delayed_work(keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 63f04c15e6f..4448f91b865 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -134,6 +134,17 @@ config SCHEDSTATS application, you can say N to avoid the very slight overhead this adds. +config TIMER_STATS + bool "Collect kernel timers statistics" + depends on DEBUG_KERNEL && PROC_FS + help + If you say Y here, additional code will be inserted into the + timer routines to collect statistics about kernel timers being + reprogrammed. The statistics can be read from /proc/timer_stats. + The statistics collection is started by writing 1 to /proc/timer_stats, + writing 0 stops it. This feature is useful to collect information + about timer usage patterns in kernel and userspace. + config DEBUG_SLAB bool "Debug slab memory allocations" depends on DEBUG_KERNEL && SLAB diff --git a/lib/devres.c b/lib/devres.c index 2a668dd7cac..eb38849aa71 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -274,21 +274,21 @@ int pcim_iomap_regions(struct pci_dev *pdev, u16 mask, const char *name) rc = pci_request_region(pdev, i, name); if (rc) - goto err_region; + goto err_inval; rc = -ENOMEM; if (!pcim_iomap(pdev, i, 0)) - goto err_iomap; + goto err_region; } return 0; - err_iomap: - pcim_iounmap(pdev, iomap[i]); err_region: pci_release_region(pdev, i); err_inval: while (--i >= 0) { + if (!(mask & (1 << i))) + continue; pcim_iounmap(pdev, iomap[i]); pci_release_region(pdev, i); } diff --git a/mm/filemap.c b/mm/filemap.c index 00414849a86..d1060b8d3cd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2079,21 +2079,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, /* Limit the size of the copy to the caller's write size */ bytes = min(bytes, count); - /* - * Limit the size of the copy to that of the current segment, - * because fault_in_pages_readable() doesn't know how to walk - * segments. + /* We only need to worry about prefaulting when writes are from + * user-space. NFSd uses vfs_writev with several non-aligned + * segments in the vector, and limiting to one segment a time is + * a noticeable performance for re-write */ - bytes = min(bytes, cur_iov->iov_len - iov_base); - - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - fault_in_pages_readable(buf, bytes); + if (!segment_eq(get_fs(), KERNEL_DS)) { + /* + * Limit the size of the copy to that of the current + * segment, because fault_in_pages_readable() doesn't + * know how to walk segments. + */ + bytes = min(bytes, cur_iov->iov_len - iov_base); + /* + * Bring in the user page that we will copy from + * _first_. Otherwise there's a nasty deadlock on + * copying from the same page as we're writing to, + * without it being marked up-to-date. + */ + fault_in_pages_readable(buf, bytes); + } page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { status = -ENOMEM; |