diff options
579 files changed, 13309 insertions, 9481 deletions
diff --git a/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt b/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt new file mode 100644 index 00000000000..569b1624851 --- /dev/null +++ b/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt @@ -0,0 +1,93 @@ +CE4100 I2C +---------- + +CE4100 has one PCI device which is described as the I2C-Controller. This +PCI device has three PCI-bars, each bar contains a complete I2C +controller. So we have a total of three independent I2C-Controllers +which share only an interrupt line. +The driver is probed via the PCI-ID and is gathering the information of +attached devices from the devices tree. +Grant Likely recommended to use the ranges property to map the PCI-Bar +number to its physical address and to use this to find the child nodes +of the specific I2C controller. This were his exact words: + + Here's where the magic happens. Each entry in + ranges describes how the parent pci address space + (middle group of 3) is translated to the local + address space (first group of 2) and the size of + each range (last cell). In this particular case, + the first cell of the local address is chosen to be + 1:1 mapped to the BARs, and the second is the + offset from be base of the BAR (which would be + non-zero if you had 2 or more devices mapped off + the same BAR) + + ranges allows the address mapping to be described + in a way that the OS can interpret without + requiring custom device driver code. + +This is an example which is used on FalconFalls: +------------------------------------------------ + i2c-controller@b,2 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "pci8086,2e68.2", + "pci8086,2e68", + "pciclass,ff0000", + "pciclass,ff00"; + + reg = <0x15a00 0x0 0x0 0x0 0x0>; + interrupts = <16 1>; + + /* as described by Grant, the first number in the group of + * three is the bar number followed by the 64bit bar address + * followed by size of the mapping. The bar address + * requires also a valid translation in parents ranges + * property. + */ + ranges = <0 0 0x02000000 0 0xdffe0500 0x100 + 1 0 0x02000000 0 0xdffe0600 0x100 + 2 0 0x02000000 0 0xdffe0700 0x100>; + + i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + + /* The first number in the reg property is the + * number of the bar + */ + reg = <0 0 0x100>; + + /* This I2C controller has no devices */ + }; + + i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <1 0 0x100>; + + /* This I2C controller has one gpio controller */ + gpio@26 { + #gpio-cells = <2>; + compatible = "ti,pcf8575"; + reg = <0x26>; + gpio-controller; + }; + }; + + i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <2 0 0x100>; + + gpio@26 { + #gpio-cells = <2>; + compatible = "ti,pcf8575"; + reg = <0x26>; + gpio-controller; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/rtc/rtc-cmos.txt b/Documentation/devicetree/bindings/rtc/rtc-cmos.txt new file mode 100644 index 00000000000..7382989b305 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/rtc-cmos.txt @@ -0,0 +1,28 @@ + Motorola mc146818 compatible RTC +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Required properties: + - compatible : "motorola,mc146818" + - reg : should contain registers location and length. + +Optional properties: + - interrupts : should contain interrupt. + - interrupt-parent : interrupt source phandle. + - ctrl-reg : Contains the initial value of the control register also + called "Register B". + - freq-reg : Contains the initial value of the frequency register also + called "Regsiter A". + +"Register A" and "B" are usually initialized by the firmware (BIOS for +instance). If this is not done, it can be performed by the driver. + +ISA Example: + + rtc@70 { + compatible = "motorola,mc146818"; + interrupts = <8 3>; + interrupt-parent = <&ioapic1>; + ctrl-reg = <2>; + freq-reg = <0x26>; + reg = <1 0x70 2>; + }; diff --git a/Documentation/devicetree/bindings/x86/ce4100.txt b/Documentation/devicetree/bindings/x86/ce4100.txt new file mode 100644 index 00000000000..b49ae593a60 --- /dev/null +++ b/Documentation/devicetree/bindings/x86/ce4100.txt @@ -0,0 +1,38 @@ +CE4100 Device Tree Bindings +--------------------------- + +The CE4100 SoC uses for in core peripherals the following compatible +format: <vendor>,<chip>-<device>. +Many of the "generic" devices like HPET or IO APIC have the ce4100 +name in their compatible property because they first appeared in this +SoC. + +The CPU node +------------ + cpu@0 { + device_type = "cpu"; + compatible = "intel,ce4100"; + reg = <0>; + lapic = <&lapic0>; + }; + +The reg property describes the CPU number. The lapic property points to +the local APIC timer. + +The SoC node +------------ + +This node describes the in-core peripherals. Required property: + compatible = "intel,ce4100-cp"; + +The PCI node +------------ +This node describes the PCI bus on the SoC. Its property should be + compatible = "intel,ce4100-pci", "pci"; + +If the OS is using the IO-APIC for interrupt routing then the reported +interrupt numbers for devices is no longer true. In order to obtain the +correct interrupt number, the child node which represents the device has +to contain the interrupt property. Besides the interrupt property it has +to contain at least the reg property containing the PCI bus address and +compatible property according to "PCI Bus Binding Revision 2.1". diff --git a/Documentation/devicetree/bindings/x86/interrupt.txt b/Documentation/devicetree/bindings/x86/interrupt.txt new file mode 100644 index 00000000000..7d19f494f19 --- /dev/null +++ b/Documentation/devicetree/bindings/x86/interrupt.txt @@ -0,0 +1,26 @@ +Interrupt chips +--------------- + +* Intel I/O Advanced Programmable Interrupt Controller (IO APIC) + + Required properties: + -------------------- + compatible = "intel,ce4100-ioapic"; + #interrupt-cells = <2>; + + Device's interrupt property: + + interrupts = <P S>; + + The first number (P) represents the interrupt pin which is wired to the + IO APIC. The second number (S) represents the sense of interrupt which + should be configured and can be one of: + 0 - Edge Rising + 1 - Level Low + 2 - Level High + 3 - Edge Falling + +* Local APIC + Required property: + + compatible = "intel,ce4100-lapic"; diff --git a/Documentation/devicetree/bindings/x86/timer.txt b/Documentation/devicetree/bindings/x86/timer.txt new file mode 100644 index 00000000000..c688af58e3b --- /dev/null +++ b/Documentation/devicetree/bindings/x86/timer.txt @@ -0,0 +1,6 @@ +Timers +------ + +* High Precision Event Timer (HPET) + Required property: + compatible = "intel,ce4100-hpet"; diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt index 28b1c9d3d35..55fd2623445 100644 --- a/Documentation/devicetree/booting-without-of.txt +++ b/Documentation/devicetree/booting-without-of.txt @@ -13,6 +13,7 @@ Table of Contents I - Introduction 1) Entry point for arch/powerpc + 2) Entry point for arch/x86 II - The DT block format 1) Header @@ -225,6 +226,25 @@ it with special cases. cannot support both configurations with Book E and configurations with classic Powerpc architectures. +2) Entry point for arch/x86 +------------------------------- + + There is one single 32bit entry point to the kernel at code32_start, + the decompressor (the real mode entry point goes to the same 32bit + entry point once it switched into protected mode). That entry point + supports one calling convention which is documented in + Documentation/x86/boot.txt + The physical pointer to the device-tree block (defined in chapter II) + is passed via setup_data which requires at least boot protocol 2.09. + The type filed is defined as + + #define SETUP_DTB 2 + + This device-tree is used as an extension to the "boot page". As such it + does not parse / consider data which is already covered by the boot + page. This includes memory size, reserved ranges, command line arguments + or initrd address. It simply holds information which can not be retrieved + otherwise like interrupt routing or a list of devices behind an I2C bus. II - The DT block format ======================== diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f4a04c0c7ed..738c6fda3fb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2444,6 +2444,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. <deci-seconds>: poll all this frequency 0: no polling (default) + threadirqs [KNL] + Force threading of all interrupt handlers except those + marked explicitely IRQF_NO_THREAD. + topology= [S390] Format: {off | on} Specify if the kernel should make use of the cpu diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt index 9104c106208..250160469d8 100644 --- a/Documentation/rtc.txt +++ b/Documentation/rtc.txt @@ -178,38 +178,29 @@ RTC class framework, but can't be supported by the older driver. setting the longer alarm time and enabling its IRQ using a single request (using the same model as EFI firmware). - * RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, it probably - also offers update IRQs whenever the "seconds" counter changes. - If needed, the RTC framework can emulate this mechanism. + * RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, the RTC framework + will emulate this mechanism. - * RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... another - feature often accessible with an IRQ line is a periodic IRQ, issued - at settable frequencies (usually 2^N Hz). + * RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... these icotls + are emulated via a kernel hrtimer. In many cases, the RTC alarm can be a system wake event, used to force Linux out of a low power sleep state (or hibernation) back to a fully operational state. For example, a system could enter a deep power saving state until it's time to execute some scheduled tasks. -Note that many of these ioctls need not actually be implemented by your -driver. The common rtc-dev interface handles many of these nicely if your -driver returns ENOIOCTLCMD. Some common examples: +Note that many of these ioctls are handled by the common rtc-dev interface. +Some common examples: * RTC_RD_TIME, RTC_SET_TIME: the read_time/set_time functions will be called with appropriate values. - * RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: the - set_alarm/read_alarm functions will be called. + * RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: gets or sets + the alarm rtc_timer. May call the set_alarm driver function. - * RTC_IRQP_SET, RTC_IRQP_READ: the irq_set_freq function will be called - to set the frequency while the framework will handle the read for you - since the frequency is stored in the irq_freq member of the rtc_device - structure. Your driver needs to initialize the irq_freq member during - init. Make sure you check the requested frequency is in range of your - hardware in the irq_set_freq function. If it isn't, return -EINVAL. If - you cannot actually change the frequency, do not define irq_set_freq. + * RTC_IRQP_SET, RTC_IRQP_READ: These are emulated by the generic code. - * RTC_PIE_ON, RTC_PIE_OFF: the irq_set_state function will be called. + * RTC_PIE_ON, RTC_PIE_OFF: These are also emulated by the generic code. If all else fails, check out the rtc-test.c driver! diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt index 178c831b907..2e3c64b1a6a 100644 --- a/Documentation/spinlocks.txt +++ b/Documentation/spinlocks.txt @@ -86,7 +86,7 @@ to change the variables it has to get an exclusive write lock. The routines look the same as above: - rwlock_t xxx_lock = RW_LOCK_UNLOCKED; + rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock); unsigned long flags; @@ -196,25 +196,3 @@ appropriate: For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or __SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate. - -SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED are deprecated. These interfere -with lockdep state tracking. - -Most of the time, you can simply turn: - static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED; -into: - static DEFINE_SPINLOCK(xxx_lock); - -Static structure member variables go from: - - struct foo bar { - .lock = SPIN_LOCK_UNLOCKED; - }; - -to: - - struct foo bar { - .lock = __SPIN_LOCK_UNLOCKED(bar.lock); - }; - -Declaration of static rw_locks undergo a similar transformation. diff --git a/MAINTAINERS b/MAINTAINERS index 560ecce38ff..f1bc3dc6b36 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4292,10 +4292,7 @@ S: Maintained F: net/sched/sch_netem.c NETERION 10GbE DRIVERS (s2io/vxge) -M: Ramkrishna Vepa <ramkrishna.vepa@exar.com> -M: Sivakumar Subramani <sivakumar.subramani@exar.com> -M: Sreenivasa Honnur <sreenivasa.honnur@exar.com> -M: Jon Mason <jon.mason@exar.com> +M: Jon Mason <jdmason@kudzu.us> L: netdev@vger.kernel.org W: http://trac.neterion.com/cgi-bin/trac.cgi/wiki/Linux?Anonymous W: http://trac.neterion.com/cgi-bin/trac.cgi/wiki/X3100Linux?Anonymous @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 38 -EXTRAVERSION = -rc8 +EXTRAVERSION = NAME = Flesh-Eating Bats with Fangs # *DOCUMENTATION* diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h index 945de222ab9..e8a761aee08 100644 --- a/arch/alpha/include/asm/futex.h +++ b/arch/alpha/include/asm/futex.h @@ -29,7 +29,7 @@ : "r" (uaddr), "r"(oparg) \ : "memory") -static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -39,7 +39,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -81,21 +81,23 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int prev, cmp; + int ret = 0, cmp; + u32 prev; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; __asm__ __volatile__ ( __ASM_SMP_MB - "1: ldl_l %0,0(%2)\n" - " cmpeq %0,%3,%1\n" - " beq %1,3f\n" - " mov %4,%1\n" - "2: stl_c %1,0(%2)\n" - " beq %1,4f\n" + "1: ldl_l %1,0(%3)\n" + " cmpeq %1,%4,%2\n" + " beq %2,3f\n" + " mov %5,%2\n" + "2: stl_c %2,0(%3)\n" + " beq %2,4f\n" "3: .subsection 2\n" "4: br 1b\n" " .previous\n" @@ -105,11 +107,12 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) " .long 2b-.\n" " lda $31,3b-2b(%0)\n" " .previous\n" - : "=&r"(prev), "=&r"(cmp) + : "+r"(ret), "=&r"(prev), "=&r"(cmp) : "r"(uaddr), "r"((long)oldval), "r"(newval) : "memory"); - return prev; + *uval = prev; + return ret; } #endif /* __KERNEL__ */ diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index 1570c0b5433..a83bbea62c6 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -13,44 +13,13 @@ #ifdef __KERNEL__ #include <linux/compiler.h> -#include <linux/list.h> -#include <linux/spinlock.h> -struct rwsem_waiter; - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -/* - * the semaphore definition - */ -struct rw_semaphore { - long count; #define RWSEM_UNLOCKED_VALUE 0x0000000000000000L #define RWSEM_ACTIVE_BIAS 0x0000000000000001L #define RWSEM_ACTIVE_MASK 0x00000000ffffffffL #define RWSEM_WAITING_BIAS (-0x0000000100000000L) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; - struct list_head wait_list; -}; - -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ - LIST_HEAD_INIT((name).wait_list) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -static inline void init_rwsem(struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} static inline void __down_read(struct rw_semaphore *sem) { @@ -250,10 +219,5 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem) #endif } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ALPHA_RWSEM_H */ diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index fe698b5045e..376f2213079 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -230,44 +230,24 @@ linux_to_osf_statfs(struct kstatfs *linux_stat, struct osf_statfs __user *osf_st return copy_to_user(osf_stat, &tmp_stat, bufsiz) ? -EFAULT : 0; } -static int -do_osf_statfs(struct path *path, struct osf_statfs __user *buffer, - unsigned long bufsiz) +SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname, + struct osf_statfs __user *, buffer, unsigned long, bufsiz) { struct kstatfs linux_stat; - int error = vfs_statfs(path, &linux_stat); + int error = user_statfs(pathname, &linux_stat); if (!error) error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); return error; } -SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname, - struct osf_statfs __user *, buffer, unsigned long, bufsiz) -{ - struct path path; - int retval; - - retval = user_path(pathname, &path); - if (!retval) { - retval = do_osf_statfs(&path, buffer, bufsiz); - path_put(&path); - } - return retval; -} - SYSCALL_DEFINE3(osf_fstatfs, unsigned long, fd, struct osf_statfs __user *, buffer, unsigned long, bufsiz) { - struct file *file; - int retval; - - retval = -EBADF; - file = fget(fd); - if (file) { - retval = do_osf_statfs(&file->f_path, buffer, bufsiz); - fput(file); - } - return retval; + struct kstatfs linux_stat; + int error = fd_statfs(fd, &linux_stat); + if (!error) + error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); + return error; } /* diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index f6c108a3d67..8c13a0c7783 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -149,6 +149,7 @@ static int titan_set_irq_affinity(struct irq_data *d, const struct cpumask *affinity, bool force) { + unsigned int irq = d->irq; spin_lock(&titan_irq_lock); titan_cpu_set_irq_affinity(irq - 16, *affinity); titan_update_irq_hw(titan_cached_irq_mask); diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index c1f3e7cb82a..a58e84f1a63 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -159,7 +159,7 @@ void read_persistent_clock(struct timespec *ts) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ irqreturn_t timer_interrupt(int irq, void *dev) { @@ -172,8 +172,6 @@ irqreturn_t timer_interrupt(int irq, void *dev) profile_tick(CPU_PROFILING); #endif - write_seqlock(&xtime_lock); - /* * Calculate how many ticks have passed since the last update, * including any previous partial leftover. Save any resulting @@ -187,9 +185,7 @@ irqreturn_t timer_interrupt(int irq, void *dev) nticks = delta >> FIX_SHIFT; if (nticks) - do_timer(nticks); - - write_sequnlock(&xtime_lock); + xtime_update(nticks); if (test_irq_work_pending()) { clear_irq_work_pending(); diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index b33fe7065b3..199a6b6de7f 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -35,7 +35,7 @@ : "cc", "memory") static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -46,7 +46,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); /* implies preempt_disable() */ @@ -88,36 +88,35 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int val; + int ret = 0; + u32 val; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - pagefault_disable(); /* implies preempt_disable() */ - __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" - "1: " T(ldr) " %0, [%3]\n" - " teq %0, %1\n" + "1: " T(ldr) " %1, [%4]\n" + " teq %1, %2\n" " it eq @ explicit IT needed for the 2b label\n" - "2: " T(streq) " %2, [%3]\n" + "2: " T(streq) " %3, [%4]\n" "3:\n" " .pushsection __ex_table,\"a\"\n" " .align 3\n" " .long 1b, 4f, 2b, 4f\n" " .popsection\n" " .pushsection .fixup,\"ax\"\n" - "4: mov %0, %4\n" + "4: mov %0, %5\n" " b 3b\n" " .popsection" - : "=&r" (val) + : "+r" (ret), "=&r" (val) : "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT) : "cc", "memory"); - pagefault_enable(); /* subsumes preempt_enable() */ - - return val; + *uval = val; + return ret; } #endif /* !SMP */ diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 3d76bf23373..1ff46cabc7e 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -107,9 +107,7 @@ void timer_tick(void) { profile_tick(CPU_PROFILING); do_leds(); - write_seqlock(&xtime_lock); - do_timer(1); - write_sequnlock(&xtime_lock); + xtime_update(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif diff --git a/arch/arm/mach-clps711x/include/mach/time.h b/arch/arm/mach-clps711x/include/mach/time.h index 8fe283ccd1f..61fef9129c6 100644 --- a/arch/arm/mach-clps711x/include/mach/time.h +++ b/arch/arm/mach-clps711x/include/mach/time.h @@ -30,7 +30,7 @@ p720t_timer_interrupt(int irq, void *dev_id) { struct pt_regs *regs = get_irq_regs(); do_leds(); - do_timer(1); + xtime_update(1); #ifndef CONFIG_SMP update_process_times(user_mode(regs)); #endif diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c index c9113619029..8d73724c009 100644 --- a/arch/blackfin/kernel/time.c +++ b/arch/blackfin/kernel/time.c @@ -114,16 +114,14 @@ u32 arch_gettimeoffset(void) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ #ifdef CONFIG_CORE_TIMER_IRQ_L1 __attribute__((l1_text)) #endif irqreturn_t timer_interrupt(int irq, void *dummy) { - write_seqlock(&xtime_lock); - do_timer(1); - write_sequnlock(&xtime_lock); + xtime_update(1); #ifdef CONFIG_IPIPE update_root_process_times(get_irq_regs()); diff --git a/arch/cris/arch-v10/kernel/time.c b/arch/cris/arch-v10/kernel/time.c index 00eb36f8deb..20c85b5dc7d 100644 --- a/arch/cris/arch-v10/kernel/time.c +++ b/arch/cris/arch-v10/kernel/time.c @@ -140,7 +140,7 @@ stop_watchdog(void) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ //static unsigned short myjiff; /* used by our debug routine print_timestamp */ @@ -176,7 +176,7 @@ timer_interrupt(int irq, void *dev_id) /* call the real timer interrupt handler */ - do_timer(1); + xtime_update(1); cris_do_profile(regs); /* Save profiling information */ return IRQ_HANDLED; diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 84fed3b4b07..4c9e3e1ba5d 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -26,7 +26,9 @@ #define FLUSH_ALL (void*)0xffffffff /* Vector of locks used for various atomic operations */ -spinlock_t cris_atomic_locks[] = { [0 ... LOCK_COUNT - 1] = SPIN_LOCK_UNLOCKED}; +spinlock_t cris_atomic_locks[] = { + [0 ... LOCK_COUNT - 1] = __SPIN_LOCK_UNLOCKED(cris_atomic_locks) +}; /* CPU masks */ cpumask_t phys_cpu_present_map = CPU_MASK_NONE; diff --git a/arch/cris/arch-v32/kernel/time.c b/arch/cris/arch-v32/kernel/time.c index a545211e999..bb978ede898 100644 --- a/arch/cris/arch-v32/kernel/time.c +++ b/arch/cris/arch-v32/kernel/time.c @@ -183,7 +183,7 @@ void handle_watchdog_bite(struct pt_regs *regs) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick. + * as well as call the "xtime_update()" routine every clocktick. */ extern void cris_do_profile(struct pt_regs *regs); @@ -216,9 +216,7 @@ static inline irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; /* Call the real timer interrupt handler */ - write_seqlock(&xtime_lock); - do_timer(1); - write_sequnlock(&xtime_lock); + xtime_update(1); return IRQ_HANDLED; } diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h index 08b3d1da358..4bea27f50a7 100644 --- a/arch/frv/include/asm/futex.h +++ b/arch/frv/include/asm/futex.h @@ -7,10 +7,11 @@ #include <asm/errno.h> #include <asm/uaccess.h> -extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr); +extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { return -ENOSYS; } diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c index 14f64b054c7..d155ca9e509 100644 --- a/arch/frv/kernel/futex.c +++ b/arch/frv/kernel/futex.c @@ -18,7 +18,7 @@ * the various futex operations; MMU fault checking is ignored under no-MMU * conditions */ -static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, int *_oldval) +static inline int atomic_futex_op_xchg_set(int oparg, u32 __user *uaddr, int *_oldval) { int oldval, ret; @@ -50,7 +50,7 @@ static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, int *_o return ret; } -static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, int *_oldval) +static inline int atomic_futex_op_xchg_add(int oparg, u32 __user *uaddr, int *_oldval) { int oldval, ret; @@ -83,7 +83,7 @@ static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, int *_o return ret; } -static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, int *_oldval) +static inline int atomic_futex_op_xchg_or(int oparg, u32 __user *uaddr, int *_oldval) { int oldval, ret; @@ -116,7 +116,7 @@ static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, int *_ol return ret; } -static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, int *_oldval) +static inline int atomic_futex_op_xchg_and(int oparg, u32 __user *uaddr, int *_oldval) { int oldval, ret; @@ -149,7 +149,7 @@ static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, int *_o return ret; } -static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, int *_oldval) +static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_oldval) { int oldval, ret; @@ -186,7 +186,7 @@ static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, int *_o /* * do the futex operations */ -int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -197,7 +197,7 @@ int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c index 0ddbbae83cb..b457de496b7 100644 --- a/arch/frv/kernel/time.c +++ b/arch/frv/kernel/time.c @@ -50,21 +50,13 @@ static struct irqaction timer_irq = { /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ static irqreturn_t timer_interrupt(int irq, void *dummy) { profile_tick(CPU_PROFILING); - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don't need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - do_timer(1); + xtime_update(1); #ifdef CONFIG_HEARTBEAT static unsigned short n; @@ -72,8 +64,6 @@ static irqreturn_t timer_interrupt(int irq, void *dummy) __set_LEDS(n); #endif /* CONFIG_HEARTBEAT */ - write_sequnlock(&xtime_lock); - update_process_times(user_mode(get_irq_regs())); return IRQ_HANDLED; diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c index 165005aff9d..32263a138aa 100644 --- a/arch/h8300/kernel/time.c +++ b/arch/h8300/kernel/time.c @@ -35,9 +35,7 @@ void h8300_timer_tick(void) { if (current->pid) profile_tick(CPU_PROFILING); - write_seqlock(&xtime_lock); - do_timer(1); - write_sequnlock(&xtime_lock); + xtime_update(1); update_process_times(user_mode(get_irq_regs())); } diff --git a/arch/h8300/kernel/timer/timer8.c b/arch/h8300/kernel/timer/timer8.c index 3946c0fa837..7a1533fad47 100644 --- a/arch/h8300/kernel/timer/timer8.c +++ b/arch/h8300/kernel/timer/timer8.c @@ -61,7 +61,7 @@ /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ static irqreturn_t timer_interrupt(int irq, void *dev_id) diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h index c7f0f062239..8428525ddb2 100644 --- a/arch/ia64/include/asm/futex.h +++ b/arch/ia64/include/asm/futex.h @@ -46,7 +46,7 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -56,7 +56,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -100,23 +100,26 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; { - register unsigned long r8 __asm ("r8"); + register unsigned long r8 __asm ("r8") = 0; + unsigned long prev; __asm__ __volatile__( " mf;; \n" " mov ar.ccv=%3;; \n" "[1:] cmpxchg4.acq %0=[%1],%2,ar.ccv \n" " .xdata4 \"__ex_table\", 1b-., 2f-. \n" "[2:]" - : "=r" (r8) + : "=r" (prev) : "r" (uaddr), "r" (newval), "rO" ((long) (unsigned) oldval) : "memory"); + *uval = prev; return r8; } } diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 215d5454c7d..3027e7516d8 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -25,20 +25,8 @@ #error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead." #endif -#include <linux/list.h> -#include <linux/spinlock.h> - #include <asm/intrinsics.h> -/* - * the semaphore definition - */ -struct rw_semaphore { - signed long count; - spinlock_t wait_lock; - struct list_head wait_list; -}; - #define RWSEM_UNLOCKED_VALUE __IA64_UL_CONST(0x0000000000000000) #define RWSEM_ACTIVE_BIAS (1L) #define RWSEM_ACTIVE_MASK (0xffffffffL) @@ -46,26 +34,6 @@ struct rw_semaphore { #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -static inline void -init_rwsem (struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - /* * lock for reading */ @@ -174,9 +142,4 @@ __downgrade_write (struct rw_semaphore *sem) #define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) #define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* _ASM_IA64_RWSEM_H */ diff --git a/arch/ia64/include/asm/xen/hypercall.h b/arch/ia64/include/asm/xen/hypercall.h index 96fc62366aa..ed28bcd5bb8 100644 --- a/arch/ia64/include/asm/xen/hypercall.h +++ b/arch/ia64/include/asm/xen/hypercall.h @@ -107,7 +107,7 @@ extern unsigned long __hypercall(unsigned long a1, unsigned long a2, static inline int xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg) { - return _hypercall2(int, sched_op_new, cmd, arg); + return _hypercall2(int, sched_op, cmd, arg); } static inline long diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 9702fa92489..156ad803d5b 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -190,19 +190,10 @@ timer_interrupt (int irq, void *dev_id) new_itm += local_cpu_data->itm_delta; - if (smp_processor_id() == time_keeper_id) { - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. - */ - write_seqlock(&xtime_lock); - do_timer(1); - local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); - } else - local_cpu_data->itm_next = new_itm; + if (smp_processor_id() == time_keeper_id) + xtime_update(1); + + local_cpu_data->itm_next = new_itm; if (time_after(new_itm, ia64_get_itc())) break; @@ -222,7 +213,7 @@ skip_process_time_accounting: * comfort, we increase the safety margin by * intentionally dropping the next tick(s). We do NOT * update itm.next because that would force us to call - * do_timer() which in turn would let our clock run + * xtime_update() which in turn would let our clock run * too fast (with the potentially devastating effect * of losing monotony of time). */ diff --git a/arch/ia64/xen/suspend.c b/arch/ia64/xen/suspend.c index fd66b048c6f..419c8620945 100644 --- a/arch/ia64/xen/suspend.c +++ b/arch/ia64/xen/suspend.c @@ -37,19 +37,14 @@ xen_mm_unpin_all(void) /* nothing */ } -void xen_pre_device_suspend(void) -{ - /* nothing */ -} - void -xen_pre_suspend() +xen_arch_pre_suspend() { /* nothing */ } void -xen_post_suspend(int suspend_cancelled) +xen_arch_post_suspend(int suspend_cancelled) { if (suspend_cancelled) return; diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c index c1c544513e8..1f8244a78be 100644 --- a/arch/ia64/xen/time.c +++ b/arch/ia64/xen/time.c @@ -139,14 +139,11 @@ consider_steal_time(unsigned long new_itm) run_posix_cpu_timers(p); delta_itm += local_cpu_data->itm_delta * (stolen + blocked); - if (cpu == time_keeper_id) { - write_seqlock(&xtime_lock); - do_timer(stolen + blocked); - local_cpu_data->itm_next = delta_itm + new_itm; - write_sequnlock(&xtime_lock); - } else { - local_cpu_data->itm_next = delta_itm + new_itm; - } + if (cpu == time_keeper_id) + xtime_update(stolen + blocked); + + local_cpu_data->itm_next = delta_itm + new_itm; + per_cpu(xen_stolen_time, cpu) += NS_PER_TICK * stolen; per_cpu(xen_blocked_time, cpu) += NS_PER_TICK * blocked; } diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c index bda86820bff..84dd04048db 100644 --- a/arch/m32r/kernel/time.c +++ b/arch/m32r/kernel/time.c @@ -107,15 +107,14 @@ u32 arch_gettimeoffset(void) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { #ifndef CONFIG_SMP profile_tick(CPU_PROFILING); #endif - /* XXX FIXME. Uh, the xtime_lock should be held here, no? */ - do_timer(1); + xtime_update(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c index 9fe6fefb5e1..1edd95095cb 100644 --- a/arch/m68k/bvme6000/config.c +++ b/arch/m68k/bvme6000/config.c @@ -45,8 +45,8 @@ extern int bvme6000_set_clock_mmss (unsigned long); extern void bvme6000_reset (void); void bvme6000_set_vectors (void); -/* Save tick handler routine pointer, will point to do_timer() in - * kernel/sched.c, called via bvme6000_process_int() */ +/* Save tick handler routine pointer, will point to xtime_update() in + * kernel/timer/timekeeping.c, called via bvme6000_process_int() */ static irq_handler_t tick_handler; diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c index 06438dac08f..18b34ee5db3 100644 --- a/arch/m68k/kernel/time.c +++ b/arch/m68k/kernel/time.c @@ -37,11 +37,11 @@ static inline int set_rtc_mmss(unsigned long nowtime) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ static irqreturn_t timer_interrupt(int irq, void *dummy) { - do_timer(1); + xtime_update(1); update_process_times(user_mode(get_irq_regs())); profile_tick(CPU_PROFILING); diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c index 100baaa692a..6cb9c3a9b6c 100644 --- a/arch/m68k/mvme147/config.c +++ b/arch/m68k/mvme147/config.c @@ -46,8 +46,8 @@ extern void mvme147_reset (void); static int bcd2int (unsigned char b); -/* Save tick handler routine pointer, will point to do_timer() in - * kernel/sched.c, called via mvme147_process_int() */ +/* Save tick handler routine pointer, will point to xtime_update() in + * kernel/time/timekeeping.c, called via mvme147_process_int() */ irq_handler_t tick_handler; diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c index 11edf61cc2c..0b28e262165 100644 --- a/arch/m68k/mvme16x/config.c +++ b/arch/m68k/mvme16x/config.c @@ -51,8 +51,8 @@ extern void mvme16x_reset (void); int bcd2int (unsigned char b); -/* Save tick handler routine pointer, will point to do_timer() in - * kernel/sched.c, called via mvme16x_process_int() */ +/* Save tick handler routine pointer, will point to xtime_update() in + * kernel/time/timekeeping.c, called via mvme16x_process_int() */ static irq_handler_t tick_handler; diff --git a/arch/m68k/sun3/sun3ints.c b/arch/m68k/sun3/sun3ints.c index 2d9e21bd313..6464ad3ae3e 100644 --- a/arch/m68k/sun3/sun3ints.c +++ b/arch/m68k/sun3/sun3ints.c @@ -66,7 +66,7 @@ static irqreturn_t sun3_int5(int irq, void *dev_id) #ifdef CONFIG_SUN3 intersil_clear(); #endif - do_timer(1); + xtime_update(1); update_process_times(user_mode(get_irq_regs())); if (!(kstat_cpu(0).irqs[irq] % 20)) sun3_leds(led_pattern[(kstat_cpu(0).irqs[irq] % 160) / 20]); diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index d6ac2a43453..6623909f70e 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -36,7 +36,7 @@ static inline int set_rtc_mmss(unsigned long nowtime) #ifndef CONFIG_GENERIC_CLOCKEVENTS /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ irqreturn_t arch_timer_interrupt(int irq, void *dummy) { @@ -44,11 +44,7 @@ irqreturn_t arch_timer_interrupt(int irq, void *dummy) if (current->pid) profile_tick(CPU_PROFILING); - write_seqlock(&xtime_lock); - - do_timer(1); - - write_sequnlock(&xtime_lock); + xtime_update(1); update_process_times(user_mode(get_irq_regs())); diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h index ad3fd61b2fe..b0526d2716f 100644 --- a/arch/microblaze/include/asm/futex.h +++ b/arch/microblaze/include/asm/futex.h @@ -29,7 +29,7 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -39,7 +39,7 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -94,31 +94,34 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int prev, cmp; + int ret = 0, cmp; + u32 prev; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - __asm__ __volatile__ ("1: lwx %0, %2, r0; \ - cmp %1, %0, %3; \ - beqi %1, 3f; \ - 2: swx %4, %2, r0; \ - addic %1, r0, 0; \ - bnei %1, 1b; \ + __asm__ __volatile__ ("1: lwx %1, %3, r0; \ + cmp %2, %1, %4; \ + beqi %2, 3f; \ + 2: swx %5, %3, r0; \ + addic %2, r0, 0; \ + bnei %2, 1b; \ 3: \ .section .fixup,\"ax\"; \ 4: brid 3b; \ - addik %0, r0, %5; \ + addik %0, r0, %6; \ .previous; \ .section __ex_table,\"a\"; \ .word 1b,4b,2b,4b; \ .previous;" \ - : "=&r" (prev), "=&r"(cmp) \ + : "+r" (ret), "=&r" (prev), "=&r"(cmp) \ : "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT)); - return prev; + *uval = prev; + return ret; } #endif /* __KERNEL__ */ diff --git a/arch/microblaze/include/asm/pci-bridge.h b/arch/microblaze/include/asm/pci-bridge.h index 0c68764ab54..10717669e0c 100644 --- a/arch/microblaze/include/asm/pci-bridge.h +++ b/arch/microblaze/include/asm/pci-bridge.h @@ -104,11 +104,22 @@ struct pci_controller { int global_number; /* PCI domain number */ }; +#ifdef CONFIG_PCI static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) { return bus->sysdata; } +static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) +{ + struct pci_controller *host; + + if (bus->self) + return pci_device_to_OF_node(bus->self); + host = pci_bus_to_host(bus); + return host ? host->dn : NULL; +} + static inline int isa_vaddr_is_ioport(void __iomem *address) { /* No specific ISA handling on ppc32 at this stage, it @@ -116,6 +127,7 @@ static inline int isa_vaddr_is_ioport(void __iomem *address) */ return 0; } +#endif /* CONFIG_PCI */ /* These are used for config access before all the PCI probing has been done. */ diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index 2e72af078b0..d0890d36ef6 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -64,21 +64,6 @@ extern void kdump_move_device_tree(void); /* CPU OF node matching */ struct device_node *of_get_cpu_node(int cpu, unsigned int *thread); -/** - * of_irq_map_pci - Resolve the interrupt for a PCI device - * @pdev: the device whose interrupt is to be resolved - * @out_irq: structure of_irq filled by this function - * - * This function resolves the PCI interrupt for a given PCI device. If a - * device-node exists for a given pci_dev, it will use normal OF tree - * walking. If not, it will implement standard swizzling and walk up the - * PCI tree until an device-node is found, at which point it will finish - * resolving using the OF tree walking. - */ -struct pci_dev; -struct of_irq; -extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c index 9ae24f4b882..47187cc2cf0 100644 --- a/arch/microblaze/kernel/prom_parse.c +++ b/arch/microblaze/kernel/prom_parse.c @@ -2,88 +2,11 @@ #include <linux/kernel.h> #include <linux/string.h> -#include <linux/pci_regs.h> #include <linux/module.h> #include <linux/ioport.h> #include <linux/etherdevice.h> #include <linux/of_address.h> #include <asm/prom.h> -#include <asm/pci-bridge.h> - -#ifdef CONFIG_PCI -int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) -{ - struct device_node *dn, *ppnode; - struct pci_dev *ppdev; - u32 lspec; - u32 laddr[3]; - u8 pin; - int rc; - - /* Check if we have a device node, if yes, fallback to standard OF - * parsing - */ - dn = pci_device_to_OF_node(pdev); - if (dn) - return of_irq_map_one(dn, 0, out_irq); - - /* Ok, we don't, time to have fun. Let's start by building up an - * interrupt spec. we assume #interrupt-cells is 1, which is standard - * for PCI. If you do different, then don't use that routine. - */ - rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin); - if (rc != 0) - return rc; - /* No pin, exit */ - if (pin == 0) - return -ENODEV; - - /* Now we walk up the PCI tree */ - lspec = pin; - for (;;) { - /* Get the pci_dev of our parent */ - ppdev = pdev->bus->self; - - /* Ouch, it's a host bridge... */ - if (ppdev == NULL) { - struct pci_controller *host; - host = pci_bus_to_host(pdev->bus); - ppnode = host ? host->dn : NULL; - /* No node for host bridge ? give up */ - if (ppnode == NULL) - return -EINVAL; - } else - /* We found a P2P bridge, check if it has a node */ - ppnode = pci_device_to_OF_node(ppdev); - - /* Ok, we have found a parent with a device-node, hand over to - * the OF parsing code. - * We build a unit address from the linux device to be used for - * resolution. Note that we use the linux bus number which may - * not match your firmware bus numbering. - * Fortunately, in most cases, interrupt-map-mask doesn't - * include the bus number as part of the matching. - * You should still be careful about that though if you intend - * to rely on this function (you ship a firmware that doesn't - * create device nodes for all PCI devices). - */ - if (ppnode) - break; - - /* We can only get here if we hit a P2P bridge with no node, - * let's do standard swizzling and try again - */ - lspec = pci_swizzle_interrupt_pin(pdev, lspec); - pdev = ppdev; - } - - laddr[0] = (pdev->bus->number << 16) - | (pdev->devfn << 8); - laddr[1] = laddr[2] = 0; - return of_irq_map_raw(ppnode, &lspec, 1, laddr, out_irq); -} -EXPORT_SYMBOL_GPL(of_irq_map_pci); -#endif /* CONFIG_PCI */ void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, unsigned long *busno, unsigned long *phys, unsigned long *size) diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index e363615d679..1e01a125363 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -29,6 +29,7 @@ #include <linux/slab.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/of_pci.h> #include <asm/processor.h> #include <asm/io.h> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index f5ecc0566bc..d88983516e2 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -4,6 +4,7 @@ config MIPS select HAVE_GENERIC_DMA_COHERENT select HAVE_IDE select HAVE_OPROFILE + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select PERF_USE_VMALLOC select HAVE_ARCH_KGDB @@ -208,6 +209,7 @@ config MACH_JZ4740 select ARCH_REQUIRE_GPIOLIB select SYS_HAS_EARLY_PRINTK select HAVE_PWM + select HAVE_CLK config LASAT bool "LASAT Networks platforms" @@ -333,6 +335,8 @@ config PNX8550_STB810 config PMC_MSP bool "PMC-Sierra MSP chipsets" depends on EXPERIMENTAL + select CEVT_R4K + select CSRC_R4K select DMA_NONCOHERENT select SWAP_IO_SPACE select NO_EXCEPT_FILL diff --git a/arch/mips/alchemy/mtx-1/board_setup.c b/arch/mips/alchemy/mtx-1/board_setup.c index 6398fa95905..40b84b99119 100644 --- a/arch/mips/alchemy/mtx-1/board_setup.c +++ b/arch/mips/alchemy/mtx-1/board_setup.c @@ -54,8 +54,8 @@ int mtx1_pci_idsel(unsigned int devsel, int assert); static void mtx1_reset(char *c) { - /* Hit BCSR.SYSTEM_CONTROL[SW_RST] */ - au_writel(0x00000000, 0xAE00001C); + /* Jump to the reset vector */ + __asm__ __volatile__("jr\t%0"::"r"(0xbfc00000)); } static void mtx1_power_off(void) diff --git a/arch/mips/alchemy/mtx-1/platform.c b/arch/mips/alchemy/mtx-1/platform.c index e30e42add69..956f946218c 100644 --- a/arch/mips/alchemy/mtx-1/platform.c +++ b/arch/mips/alchemy/mtx-1/platform.c @@ -28,6 +28,8 @@ #include <linux/mtd/physmap.h> #include <mtd/mtd-abi.h> +#include <asm/mach-au1x00/au1xxx_eth.h> + static struct gpio_keys_button mtx1_gpio_button[] = { { .gpio = 207, @@ -140,10 +142,17 @@ static struct __initdata platform_device * mtx1_devs[] = { &mtx1_mtd, }; +static struct au1000_eth_platform_data mtx1_au1000_eth0_pdata = { + .phy_search_highest_addr = 1, + .phy1_search_mac0 = 1, +}; + static int __init mtx1_register_devices(void) { int rc; + au1xxx_override_eth_cfg(0, &mtx1_au1000_eth0_pdata); + rc = gpio_request(mtx1_gpio_button[0].gpio, mtx1_gpio_button[0].desc); if (rc < 0) { diff --git a/arch/mips/alchemy/xxs1500/board_setup.c b/arch/mips/alchemy/xxs1500/board_setup.c index b43c918925d..80c521e5290 100644 --- a/arch/mips/alchemy/xxs1500/board_setup.c +++ b/arch/mips/alchemy/xxs1500/board_setup.c @@ -36,8 +36,8 @@ static void xxs1500_reset(char *c) { - /* Hit BCSR.SYSTEM_CONTROL[SW_RST] */ - au_writel(0x00000000, 0xAE00001C); + /* Jump to the reset vector */ + __asm__ __volatile__("jr\t%0"::"r"(0xbfc00000)); } static void xxs1500_power_off(void) diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h index b9cce90346c..6ebf1734b41 100644 --- a/arch/mips/include/asm/futex.h +++ b/arch/mips/include/asm/futex.h @@ -75,7 +75,7 @@ } static inline int -futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -85,7 +85,7 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -132,11 +132,13 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int retval; + int ret = 0; + u32 val; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; if (cpu_has_llsc && R10000_LLSC_WAR) { @@ -145,25 +147,25 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) " .set push \n" " .set noat \n" " .set mips3 \n" - "1: ll %0, %2 \n" - " bne %0, %z3, 3f \n" + "1: ll %1, %3 \n" + " bne %1, %z4, 3f \n" " .set mips0 \n" - " move $1, %z4 \n" + " move $1, %z5 \n" " .set mips3 \n" - "2: sc $1, %1 \n" + "2: sc $1, %2 \n" " beqzl $1, 1b \n" __WEAK_LLSC_MB "3: \n" " .set pop \n" " .section .fixup,\"ax\" \n" - "4: li %0, %5 \n" + "4: li %0, %6 \n" " j 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" " "__UA_ADDR "\t1b, 4b \n" " "__UA_ADDR "\t2b, 4b \n" " .previous \n" - : "=&r" (retval), "=R" (*uaddr) + : "+r" (ret), "=&r" (val), "=R" (*uaddr) : "R" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT) : "memory"); } else if (cpu_has_llsc) { @@ -172,31 +174,32 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) " .set push \n" " .set noat \n" " .set mips3 \n" - "1: ll %0, %2 \n" - " bne %0, %z3, 3f \n" + "1: ll %1, %3 \n" + " bne %1, %z4, 3f \n" " .set mips0 \n" - " move $1, %z4 \n" + " move $1, %z5 \n" " .set mips3 \n" - "2: sc $1, %1 \n" + "2: sc $1, %2 \n" " beqz $1, 1b \n" __WEAK_LLSC_MB "3: \n" " .set pop \n" " .section .fixup,\"ax\" \n" - "4: li %0, %5 \n" + "4: li %0, %6 \n" " j 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" " "__UA_ADDR "\t1b, 4b \n" " "__UA_ADDR "\t2b, 4b \n" " .previous \n" - : "=&r" (retval), "=R" (*uaddr) + : "+r" (ret), "=&r" (val), "=R" (*uaddr) : "R" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT) : "memory"); } else return -ENOSYS; - return retval; + *uval = val; + return ret; } #endif diff --git a/arch/mips/include/asm/perf_event.h b/arch/mips/include/asm/perf_event.h index e00007cf816..d0c77496c72 100644 --- a/arch/mips/include/asm/perf_event.h +++ b/arch/mips/include/asm/perf_event.h @@ -11,15 +11,5 @@ #ifndef __MIPS_PERF_EVENT_H__ #define __MIPS_PERF_EVENT_H__ - -/* - * MIPS performance counters do not raise NMI upon overflow, a regular - * interrupt will be signaled. Hence we can do the pending perf event - * work at the tail of the irq handler. - */ -static inline void -set_perf_event_pending(void) -{ -} - +/* Leave it empty here. The file is required by linux/perf_event.h */ #endif /* __MIPS_PERF_EVENT_H__ */ diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c index 5a84a1f1123..94ca2b018af 100644 --- a/arch/mips/kernel/ftrace.c +++ b/arch/mips/kernel/ftrace.c @@ -17,29 +17,13 @@ #include <asm/cacheflush.h> #include <asm/uasm.h> -/* - * If the Instruction Pointer is in module space (0xc0000000), return true; - * otherwise, it is in kernel space (0x80000000), return false. - * - * FIXME: This will not work when the kernel space and module space are the - * same. If they are the same, we need to modify scripts/recordmcount.pl, - * ftrace_make_nop/call() and the other related parts to ensure the - * enabling/disabling of the calling site to _mcount is right for both kernel - * and module. - */ - -static inline int in_module(unsigned long ip) -{ - return ip & 0x40000000; -} +#include <asm-generic/sections.h> #ifdef CONFIG_DYNAMIC_FTRACE #define JAL 0x0c000000 /* jump & link: ip --> ra, jump to target */ #define ADDR_MASK 0x03ffffff /* op_code|addr : 31...26|25 ....0 */ -#define INSN_B_1F_4 0x10000004 /* b 1f; offset = 4 */ -#define INSN_B_1F_5 0x10000005 /* b 1f; offset = 5 */ #define INSN_NOP 0x00000000 /* nop */ #define INSN_JAL(addr) \ ((unsigned int)(JAL | (((addr) >> 2) & ADDR_MASK))) @@ -69,6 +53,20 @@ static inline void ftrace_dyn_arch_init_insns(void) #endif } +/* + * Check if the address is in kernel space + * + * Clone core_kernel_text() from kernel/extable.c, but doesn't call + * init_kernel_text() for Ftrace doesn't trace functions in init sections. + */ +static inline int in_kernel_space(unsigned long ip) +{ + if (ip >= (unsigned long)_stext && + ip <= (unsigned long)_etext) + return 1; + return 0; +} + static int ftrace_modify_code(unsigned long ip, unsigned int new_code) { int faulted; @@ -84,6 +82,42 @@ static int ftrace_modify_code(unsigned long ip, unsigned int new_code) return 0; } +/* + * The details about the calling site of mcount on MIPS + * + * 1. For kernel: + * + * move at, ra + * jal _mcount --> nop + * + * 2. For modules: + * + * 2.1 For KBUILD_MCOUNT_RA_ADDRESS and CONFIG_32BIT + * + * lui v1, hi_16bit_of_mcount --> b 1f (0x10000005) + * addiu v1, v1, low_16bit_of_mcount + * move at, ra + * move $12, ra_address + * jalr v1 + * sub sp, sp, 8 + * 1: offset = 5 instructions + * 2.2 For the Other situations + * + * lui v1, hi_16bit_of_mcount --> b 1f (0x10000004) + * addiu v1, v1, low_16bit_of_mcount + * move at, ra + * jalr v1 + * nop | move $12, ra_address | sub sp, sp, 8 + * 1: offset = 4 instructions + */ + +#if defined(KBUILD_MCOUNT_RA_ADDRESS) && defined(CONFIG_32BIT) +#define MCOUNT_OFFSET_INSNS 5 +#else +#define MCOUNT_OFFSET_INSNS 4 +#endif +#define INSN_B_1F (0x10000000 | MCOUNT_OFFSET_INSNS) + int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { @@ -91,39 +125,11 @@ int ftrace_make_nop(struct module *mod, unsigned long ip = rec->ip; /* - * We have compiled module with -mlong-calls, but compiled the kernel - * without it, we need to cope with them respectively. + * If ip is in kernel space, no long call, otherwise, long call is + * needed. */ - if (in_module(ip)) { -#if defined(KBUILD_MCOUNT_RA_ADDRESS) && defined(CONFIG_32BIT) - /* - * lui v1, hi_16bit_of_mcount --> b 1f (0x10000005) - * addiu v1, v1, low_16bit_of_mcount - * move at, ra - * move $12, ra_address - * jalr v1 - * sub sp, sp, 8 - * 1: offset = 5 instructions - */ - new = INSN_B_1F_5; -#else - /* - * lui v1, hi_16bit_of_mcount --> b 1f (0x10000004) - * addiu v1, v1, low_16bit_of_mcount - * move at, ra - * jalr v1 - * nop | move $12, ra_address | sub sp, sp, 8 - * 1: offset = 4 instructions - */ - new = INSN_B_1F_4; -#endif - } else { - /* - * move at, ra - * jal _mcount --> nop - */ - new = INSN_NOP; - } + new = in_kernel_space(ip) ? INSN_NOP : INSN_B_1F; + return ftrace_modify_code(ip, new); } @@ -132,8 +138,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) unsigned int new; unsigned long ip = rec->ip; - /* ip, module: 0xc0000000, kernel: 0x80000000 */ - new = in_module(ip) ? insn_lui_v1_hi16_mcount : insn_jal_ftrace_caller; + new = in_kernel_space(ip) ? insn_jal_ftrace_caller : + insn_lui_v1_hi16_mcount; return ftrace_modify_code(ip, new); } @@ -190,29 +196,25 @@ int ftrace_disable_ftrace_graph_caller(void) #define S_R_SP (0xafb0 << 16) /* s{d,w} R, offset(sp) */ #define OFFSET_MASK 0xffff /* stack offset range: 0 ~ PT_SIZE */ -unsigned long ftrace_get_parent_addr(unsigned long self_addr, - unsigned long parent, - unsigned long parent_addr, - unsigned long fp) +unsigned long ftrace_get_parent_ra_addr(unsigned long self_ra, unsigned long + old_parent_ra, unsigned long parent_ra_addr, unsigned long fp) { - unsigned long sp, ip, ra; + unsigned long sp, ip, tmp; unsigned int code; int faulted; /* - * For module, move the ip from calling site of mcount to the - * instruction "lui v1, hi_16bit_of_mcount"(offset is 20), but for - * kernel, move to the instruction "move ra, at"(offset is 12) + * For module, move the ip from the return address after the + * instruction "lui v1, hi_16bit_of_mcount"(offset is 24), but for + * kernel, move after the instruction "move ra, at"(offset is 16) */ - ip = self_addr - (in_module(self_addr) ? 20 : 12); + ip = self_ra - (in_kernel_space(self_ra) ? 16 : 24); /* * search the text until finding the non-store instruction or "s{d,w} * ra, offset(sp)" instruction */ do { - ip -= 4; - /* get the code at "ip": code = *(unsigned int *)ip; */ safe_load_code(code, ip, faulted); @@ -224,18 +226,20 @@ unsigned long ftrace_get_parent_addr(unsigned long self_addr, * store the ra on the stack */ if ((code & S_R_SP) != S_R_SP) - return parent_addr; + return parent_ra_addr; - } while (((code & S_RA_SP) != S_RA_SP)); + /* Move to the next instruction */ + ip -= 4; + } while ((code & S_RA_SP) != S_RA_SP); sp = fp + (code & OFFSET_MASK); - /* ra = *(unsigned long *)sp; */ - safe_load_stack(ra, sp, faulted); + /* tmp = *(unsigned long *)sp; */ + safe_load_stack(tmp, sp, faulted); if (unlikely(faulted)) return 0; - if (ra == parent) + if (tmp == old_parent_ra) return sp; return 0; } @@ -246,21 +250,21 @@ unsigned long ftrace_get_parent_addr(unsigned long self_addr, * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, +void prepare_ftrace_return(unsigned long *parent_ra_addr, unsigned long self_ra, unsigned long fp) { - unsigned long old; + unsigned long old_parent_ra; struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) &return_to_handler; - int faulted; + int faulted, insns; if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; /* - * "parent" is the stack address saved the return address of the caller - * of _mcount. + * "parent_ra_addr" is the stack address saved the return address of + * the caller of _mcount. * * if the gcc < 4.5, a leaf function does not save the return address * in the stack address, so, we "emulate" one in _mcount's stack space, @@ -275,37 +279,44 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, * do it in ftrace_graph_caller of mcount.S. */ - /* old = *parent; */ - safe_load_stack(old, parent, faulted); + /* old_parent_ra = *parent_ra_addr; */ + safe_load_stack(old_parent_ra, parent_ra_addr, faulted); if (unlikely(faulted)) goto out; #ifndef KBUILD_MCOUNT_RA_ADDRESS - parent = (unsigned long *)ftrace_get_parent_addr(self_addr, old, - (unsigned long)parent, fp); + parent_ra_addr = (unsigned long *)ftrace_get_parent_ra_addr(self_ra, + old_parent_ra, (unsigned long)parent_ra_addr, fp); /* * If fails when getting the stack address of the non-leaf function's * ra, stop function graph tracer and return */ - if (parent == 0) + if (parent_ra_addr == 0) goto out; #endif - /* *parent = return_hooker; */ - safe_store_stack(return_hooker, parent, faulted); + /* *parent_ra_addr = return_hooker; */ + safe_store_stack(return_hooker, parent_ra_addr, faulted); if (unlikely(faulted)) goto out; - if (ftrace_push_return_trace(old, self_addr, &trace.depth, fp) == - -EBUSY) { - *parent = old; + if (ftrace_push_return_trace(old_parent_ra, self_ra, &trace.depth, fp) + == -EBUSY) { + *parent_ra_addr = old_parent_ra; return; } - trace.func = self_addr; + /* + * Get the recorded ip of the current mcount calling site in the + * __mcount_loc section, which will be used to filter the function + * entries configured through the tracing/set_graph_function interface. + */ + + insns = in_kernel_space(self_ra) ? 2 : MCOUNT_OFFSET_INSNS + 1; + trace.func = self_ra - (MCOUNT_INSN_SIZE * insns); /* Only trace if the calling function expects to */ if (!ftrace_graph_entry(&trace)) { current->curr_ret_stack--; - *parent = old; + *parent_ra_addr = old_parent_ra; } return; out: diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index 2b7f3f703b8..a8244854d3d 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -161,41 +161,6 @@ mipspmu_event_set_period(struct perf_event *event, return ret; } -static int mipspmu_enable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int idx; - int err = 0; - - /* To look for a free counter for this event. */ - idx = mipspmu->alloc_counter(cpuc, hwc); - if (idx < 0) { - err = idx; - goto out; - } - - /* - * If there is an event in the counter we are going to use then - * make sure it is disabled. - */ - event->hw.idx = idx; - mipspmu->disable_event(idx); - cpuc->events[idx] = event; - - /* Set the period for the event. */ - mipspmu_event_set_period(event, hwc, idx); - - /* Enable the event. */ - mipspmu->enable_event(hwc, idx); - - /* Propagate our changes to the userspace mapping. */ - perf_event_update_userpage(event); - -out: - return err; -} - static void mipspmu_event_update(struct perf_event *event, struct hw_perf_event *hwc, int idx) @@ -204,7 +169,7 @@ static void mipspmu_event_update(struct perf_event *event, unsigned long flags; int shift = 64 - TOTAL_BITS; s64 prev_raw_count, new_raw_count; - s64 delta; + u64 delta; again: prev_raw_count = local64_read(&hwc->prev_count); @@ -231,32 +196,90 @@ again: return; } -static void mipspmu_disable(struct perf_event *event) +static void mipspmu_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!mipspmu) + return; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; + + /* Set the period for the event. */ + mipspmu_event_set_period(event, hwc, hwc->idx); + + /* Enable the event. */ + mipspmu->enable_event(hwc, hwc->idx); +} + +static void mipspmu_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!mipspmu) + return; + + if (!(hwc->state & PERF_HES_STOPPED)) { + /* We are working on a local event. */ + mipspmu->disable_event(hwc->idx); + barrier(); + mipspmu_event_update(event, hwc, hwc->idx); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + } +} + +static int mipspmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; + int idx; + int err = 0; + perf_pmu_disable(event->pmu); - WARN_ON(idx < 0 || idx >= mipspmu->num_counters); + /* To look for a free counter for this event. */ + idx = mipspmu->alloc_counter(cpuc, hwc); + if (idx < 0) { + err = idx; + goto out; + } - /* We are working on a local event. */ + /* + * If there is an event in the counter we are going to use then + * make sure it is disabled. + */ + event->hw.idx = idx; mipspmu->disable_event(idx); + cpuc->events[idx] = event; - barrier(); - - mipspmu_event_update(event, hwc, idx); - cpuc->events[idx] = NULL; - clear_bit(idx, cpuc->used_mask); + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + if (flags & PERF_EF_START) + mipspmu_start(event, PERF_EF_RELOAD); + /* Propagate our changes to the userspace mapping. */ perf_event_update_userpage(event); + +out: + perf_pmu_enable(event->pmu); + return err; } -static void mipspmu_unthrottle(struct perf_event *event) +static void mipspmu_del(struct perf_event *event, int flags) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; - mipspmu->enable_event(hwc, hwc->idx); + WARN_ON(idx < 0 || idx >= mipspmu->num_counters); + + mipspmu_stop(event, PERF_EF_UPDATE); + cpuc->events[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); } static void mipspmu_read(struct perf_event *event) @@ -270,12 +293,17 @@ static void mipspmu_read(struct perf_event *event) mipspmu_event_update(event, hwc, hwc->idx); } -static struct pmu pmu = { - .enable = mipspmu_enable, - .disable = mipspmu_disable, - .unthrottle = mipspmu_unthrottle, - .read = mipspmu_read, -}; +static void mipspmu_enable(struct pmu *pmu) +{ + if (mipspmu) + mipspmu->start(); +} + +static void mipspmu_disable(struct pmu *pmu) +{ + if (mipspmu) + mipspmu->stop(); +} static atomic_t active_events = ATOMIC_INIT(0); static DEFINE_MUTEX(pmu_reserve_mutex); @@ -318,6 +346,82 @@ static void mipspmu_free_irq(void) perf_irq = save_perf_irq; } +/* + * mipsxx/rm9000/loongson2 have different performance counters, they have + * specific low-level init routines. + */ +static void reset_counters(void *arg); +static int __hw_perf_event_init(struct perf_event *event); + +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (atomic_dec_and_mutex_lock(&active_events, + &pmu_reserve_mutex)) { + /* + * We must not call the destroy function with interrupts + * disabled. + */ + on_each_cpu(reset_counters, + (void *)(long)mipspmu->num_counters, 1); + mipspmu_free_irq(); + mutex_unlock(&pmu_reserve_mutex); + } +} + +static int mipspmu_event_init(struct perf_event *event) +{ + int err = 0; + + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + if (!mipspmu || event->cpu >= nr_cpumask_bits || + (event->cpu >= 0 && !cpu_online(event->cpu))) + return -ENODEV; + + if (!atomic_inc_not_zero(&active_events)) { + if (atomic_read(&active_events) > MIPS_MAX_HWEVENTS) { + atomic_dec(&active_events); + return -ENOSPC; + } + + mutex_lock(&pmu_reserve_mutex); + if (atomic_read(&active_events) == 0) + err = mipspmu_get_irq(); + + if (!err) + atomic_inc(&active_events); + mutex_unlock(&pmu_reserve_mutex); + } + + if (err) + return err; + + err = __hw_perf_event_init(event); + if (err) + hw_perf_event_destroy(event); + + return err; +} + +static struct pmu pmu = { + .pmu_enable = mipspmu_enable, + .pmu_disable = mipspmu_disable, + .event_init = mipspmu_event_init, + .add = mipspmu_add, + .del = mipspmu_del, + .start = mipspmu_start, + .stop = mipspmu_stop, + .read = mipspmu_read, +}; + static inline unsigned int mipspmu_perf_event_encode(const struct mips_perf_event *pev) { @@ -382,8 +486,9 @@ static int validate_event(struct cpu_hw_events *cpuc, { struct hw_perf_event fake_hwc = event->hw; - if (event->pmu && event->pmu != &pmu) - return 0; + /* Allow mixed event group. So return 1 to pass validation. */ + if (event->pmu != &pmu || event->state <= PERF_EVENT_STATE_OFF) + return 1; return mipspmu->alloc_counter(cpuc, &fake_hwc) >= 0; } @@ -409,73 +514,6 @@ static int validate_group(struct perf_event *event) return 0; } -/* - * mipsxx/rm9000/loongson2 have different performance counters, they have - * specific low-level init routines. - */ -static void reset_counters(void *arg); -static int __hw_perf_event_init(struct perf_event *event); - -static void hw_perf_event_destroy(struct perf_event *event) -{ - if (atomic_dec_and_mutex_lock(&active_events, - &pmu_reserve_mutex)) { - /* - * We must not call the destroy function with interrupts - * disabled. - */ - on_each_cpu(reset_counters, - (void *)(long)mipspmu->num_counters, 1); - mipspmu_free_irq(); - mutex_unlock(&pmu_reserve_mutex); - } -} - -const struct pmu *hw_perf_event_init(struct perf_event *event) -{ - int err = 0; - - if (!mipspmu || event->cpu >= nr_cpumask_bits || - (event->cpu >= 0 && !cpu_online(event->cpu))) - return ERR_PTR(-ENODEV); - - if (!atomic_inc_not_zero(&active_events)) { - if (atomic_read(&active_events) > MIPS_MAX_HWEVENTS) { - atomic_dec(&active_events); - return ERR_PTR(-ENOSPC); - } - - mutex_lock(&pmu_reserve_mutex); - if (atomic_read(&active_events) == 0) - err = mipspmu_get_irq(); - - if (!err) - atomic_inc(&active_events); - mutex_unlock(&pmu_reserve_mutex); - } - - if (err) - return ERR_PTR(err); - - err = __hw_perf_event_init(event); - if (err) - hw_perf_event_destroy(event); - - return err ? ERR_PTR(err) : &pmu; -} - -void hw_perf_enable(void) -{ - if (mipspmu) - mipspmu->start(); -} - -void hw_perf_disable(void) -{ - if (mipspmu) - mipspmu->stop(); -} - /* This is needed by specific irq handlers in perf_event_*.c */ static void handle_associated_event(struct cpu_hw_events *cpuc, @@ -496,21 +534,13 @@ handle_associated_event(struct cpu_hw_events *cpuc, #include "perf_event_mipsxx.c" /* Callchain handling code. */ -static inline void -callchain_store(struct perf_callchain_entry *entry, - u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} /* * Leave userspace callchain empty for now. When we find a way to trace * the user stack callchains, we add here. */ -static void -perf_callchain_user(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) { } @@ -523,23 +553,21 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry, while (!kstack_end(sp)) { addr = *sp++; if (__kernel_text_address(addr)) { - callchain_store(entry, addr); + perf_callchain_store(entry, addr); if (entry->nr >= PERF_MAX_STACK_DEPTH) break; } } } -static void -perf_callchain_kernel(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long sp = regs->regs[29]; #ifdef CONFIG_KALLSYMS unsigned long ra = regs->regs[31]; unsigned long pc = regs->cp0_epc; - callchain_store(entry, PERF_CONTEXT_KERNEL); if (raw_show_trace || !__kernel_text_address(pc)) { unsigned long stack_page = (unsigned long)task_stack_page(current); @@ -549,53 +577,12 @@ perf_callchain_kernel(struct pt_regs *regs, return; } do { - callchain_store(entry, pc); + perf_callchain_store(entry, pc); if (entry->nr >= PERF_MAX_STACK_DEPTH) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); #else - callchain_store(entry, PERF_CONTEXT_KERNEL); save_raw_perf_callchain(entry, sp); #endif } - -static void -perf_do_callchain(struct pt_regs *regs, - struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (!current || !current->pid) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - if (!is_user) { - perf_callchain_kernel(regs, entry); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - if (regs) - perf_callchain_user(regs, entry); -} - -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); - -struct perf_callchain_entry * -perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - perf_do_callchain(regs, entry); - return entry; -} diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 183e0d22666..d9a7db78ed6 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -696,7 +696,7 @@ static int mipsxx_pmu_handle_shared_irq(void) * interrupt, not NMI. */ if (handled == IRQ_HANDLED) - perf_event_do_pending(); + irq_work_run(); #ifdef CONFIG_MIPS_MT_SMP read_unlock(&pmuint_rwlock); @@ -1045,6 +1045,8 @@ init_hw_perf_events(void) "CPU, irq %d%s\n", mipspmu->name, counters, irq, irq < 0 ? " (share with timer interrupt)" : ""); + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + return 0; } early_initcall(init_hw_perf_events); diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 5922342bca3..dbbe0ce48d8 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -84,7 +84,7 @@ static int protected_save_fp_context(struct sigcontext __user *sc) static int protected_restore_fp_context(struct sigcontext __user *sc) { - int err, tmp; + int err, tmp __maybe_unused; while (1) { lock_fpu_owner(); own_fpu_inatomic(0); diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index a0ed0e052b2..aae98661379 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -115,7 +115,7 @@ static int protected_save_fp_context32(struct sigcontext32 __user *sc) static int protected_restore_fp_context32(struct sigcontext32 __user *sc) { - int err, tmp; + int err, tmp __maybe_unused; while (1) { lock_fpu_owner(); own_fpu_inatomic(0); diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 383aeb95cb4..32a25610108 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -193,6 +193,22 @@ void __devinit smp_prepare_boot_cpu(void) */ static struct task_struct *cpu_idle_thread[NR_CPUS]; +struct create_idle { + struct work_struct work; + struct task_struct *idle; + struct completion done; + int cpu; +}; + +static void __cpuinit do_fork_idle(struct work_struct *work) +{ + struct create_idle *c_idle = + container_of(work, struct create_idle, work); + + c_idle->idle = fork_idle(c_idle->cpu); + complete(&c_idle->done); +} + int __cpuinit __cpu_up(unsigned int cpu) { struct task_struct *idle; @@ -203,8 +219,19 @@ int __cpuinit __cpu_up(unsigned int cpu) * Linux can schedule processes on this slave. */ if (!cpu_idle_thread[cpu]) { - idle = fork_idle(cpu); - cpu_idle_thread[cpu] = idle; + /* + * Schedule work item to avoid forking user task + * Ported from arch/x86/kernel/smpboot.c + */ + struct create_idle c_idle = { + .cpu = cpu, + .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), + }; + + INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); + schedule_work(&c_idle.work); + wait_for_completion(&c_idle.done); + idle = cpu_idle_thread[cpu] = c_idle.idle; if (IS_ERR(idle)) panic(KERN_ERR "Fork failed for CPU %d", cpu); diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 1dc6edff45e..58beabf50b3 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -383,12 +383,11 @@ save_static_function(sys_sysmips); static int __used noinline _sys_sysmips(nabi_no_regargs struct pt_regs regs) { - long cmd, arg1, arg2, arg3; + long cmd, arg1, arg2; cmd = regs.regs[4]; arg1 = regs.regs[5]; arg2 = regs.regs[6]; - arg3 = regs.regs[7]; switch (cmd) { case MIPS_ATOMIC_SET: @@ -405,7 +404,7 @@ _sys_sysmips(nabi_no_regargs struct pt_regs regs) if (arg1 & 2) set_thread_flag(TIF_LOGADE); else - clear_thread_flag(TIF_FIXADE); + clear_thread_flag(TIF_LOGADE); return 0; diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c index 6a1fdfef8fd..ab52b7cf3b6 100644 --- a/arch/mips/kernel/vpe.c +++ b/arch/mips/kernel/vpe.c @@ -148,9 +148,9 @@ struct { spinlock_t tc_list_lock; struct list_head tc_list; /* Thread contexts */ } vpecontrol = { - .vpe_list_lock = SPIN_LOCK_UNLOCKED, + .vpe_list_lock = __SPIN_LOCK_UNLOCKED(vpe_list_lock), .vpe_list = LIST_HEAD_INIT(vpecontrol.vpe_list), - .tc_list_lock = SPIN_LOCK_UNLOCKED, + .tc_list_lock = __SPIN_LOCK_UNLOCKED(tc_list_lock), .tc_list = LIST_HEAD_INIT(vpecontrol.tc_list) }; diff --git a/arch/mips/loongson/Kconfig b/arch/mips/loongson/Kconfig index 6e1b77fec7e..aca93eed877 100644 --- a/arch/mips/loongson/Kconfig +++ b/arch/mips/loongson/Kconfig @@ -1,6 +1,7 @@ +if MACH_LOONGSON + choice prompt "Machine Type" - depends on MACH_LOONGSON config LEMOTE_FULOONG2E bool "Lemote Fuloong(2e) mini-PC" @@ -87,3 +88,5 @@ config LOONGSON_UART_BASE config LOONGSON_MC146818 bool default n + +endif # MACH_LOONGSON diff --git a/arch/mips/loongson/common/cmdline.c b/arch/mips/loongson/common/cmdline.c index 1a06defc4f7..353e1d2e41a 100644 --- a/arch/mips/loongson/common/cmdline.c +++ b/arch/mips/loongson/common/cmdline.c @@ -44,10 +44,5 @@ void __init prom_init_cmdline(void) strcat(arcs_cmdline, " "); } - if ((strstr(arcs_cmdline, "console=")) == NULL) - strcat(arcs_cmdline, " console=ttyS0,115200"); - if ((strstr(arcs_cmdline, "root=")) == NULL) - strcat(arcs_cmdline, " root=/dev/hda1"); - prom_init_machtype(); } diff --git a/arch/mips/loongson/common/machtype.c b/arch/mips/loongson/common/machtype.c index 81fbe6b73f9..2efd5d9dee2 100644 --- a/arch/mips/loongson/common/machtype.c +++ b/arch/mips/loongson/common/machtype.c @@ -41,7 +41,7 @@ void __weak __init mach_prom_init_machtype(void) void __init prom_init_machtype(void) { - char *p, str[MACHTYPE_LEN]; + char *p, str[MACHTYPE_LEN + 1]; int machtype = MACH_LEMOTE_FL2E; mips_machtype = LOONGSON_MACHTYPE; @@ -53,6 +53,7 @@ void __init prom_init_machtype(void) } p += strlen("machtype="); strncpy(str, p, MACHTYPE_LEN); + str[MACHTYPE_LEN] = '\0'; p = strstr(str, " "); if (p) *p = '\0'; diff --git a/arch/mips/math-emu/ieee754int.h b/arch/mips/math-emu/ieee754int.h index 2701d950095..2a7d43f4f16 100644 --- a/arch/mips/math-emu/ieee754int.h +++ b/arch/mips/math-emu/ieee754int.h @@ -70,7 +70,7 @@ #define COMPXSP \ - unsigned xm; int xe; int xs; int xc + unsigned xm; int xe; int xs __maybe_unused; int xc #define COMPYSP \ unsigned ym; int ye; int ys; int yc @@ -104,7 +104,7 @@ #define COMPXDP \ -u64 xm; int xe; int xs; int xc +u64 xm; int xe; int xs __maybe_unused; int xc #define COMPYDP \ u64 ym; int ye; int ys; int yc diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 2efcbd24c82..279599e9a77 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -324,7 +324,7 @@ int page_is_ram(unsigned long pagenr) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long lastpfn; + unsigned long lastpfn __maybe_unused; pagetable_init(); diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 083d3412d0b..04f9e17db9d 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -109,6 +109,8 @@ static bool scratchpad_available(void) static int scratchpad_offset(int i) { BUG(); + /* Really unreachable, but evidently some GCC want this. */ + return 0; } #endif /* diff --git a/arch/mips/pci/ops-pmcmsp.c b/arch/mips/pci/ops-pmcmsp.c index b7c03d80c88..68798f869c0 100644 --- a/arch/mips/pci/ops-pmcmsp.c +++ b/arch/mips/pci/ops-pmcmsp.c @@ -308,7 +308,7 @@ static struct resource pci_mem_resource = { * RETURNS: PCIBIOS_SUCCESSFUL - success * ****************************************************************************/ -static int bpci_interrupt(int irq, void *dev_id) +static irqreturn_t bpci_interrupt(int irq, void *dev_id) { struct msp_pci_regs *preg = (void *)PCI_BASE_REG; unsigned int stat = preg->if_status; @@ -326,7 +326,7 @@ static int bpci_interrupt(int irq, void *dev_id) /* write to clear all asserted interrupts */ preg->if_status = stat; - return PCIBIOS_SUCCESSFUL; + return IRQ_HANDLED; } /***************************************************************************** diff --git a/arch/mips/pmc-sierra/Kconfig b/arch/mips/pmc-sierra/Kconfig index c139988bb85..8d798497c61 100644 --- a/arch/mips/pmc-sierra/Kconfig +++ b/arch/mips/pmc-sierra/Kconfig @@ -4,15 +4,11 @@ choice config PMC_MSP4200_EVAL bool "PMC-Sierra MSP4200 Eval Board" - select CEVT_R4K - select CSRC_R4K select IRQ_MSP_SLP select HW_HAS_PCI config PMC_MSP4200_GW bool "PMC-Sierra MSP4200 VoIP Gateway" - select CEVT_R4K - select CSRC_R4K select IRQ_MSP_SLP select HW_HAS_PCI diff --git a/arch/mips/pmc-sierra/msp71xx/msp_time.c b/arch/mips/pmc-sierra/msp71xx/msp_time.c index cca64e15f57..01df84ce31e 100644 --- a/arch/mips/pmc-sierra/msp71xx/msp_time.c +++ b/arch/mips/pmc-sierra/msp71xx/msp_time.c @@ -81,7 +81,7 @@ void __init plat_time_init(void) mips_hpt_frequency = cpu_rate/2; } -unsigned int __init get_c0_compare_int(void) +unsigned int __cpuinit get_c0_compare_int(void) { return MSP_INT_VPE0_TIMER; } diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h index 92d2f9298e3..9d773a63951 100644 --- a/arch/mn10300/include/asm/atomic.h +++ b/arch/mn10300/include/asm/atomic.h @@ -139,7 +139,7 @@ static inline unsigned long __cmpxchg(volatile unsigned long *m, * Atomically reads the value of @v. Note that the guaranteed * useful range of an atomic_t is only 24 bits. */ -#define atomic_read(v) ((v)->counter) +#define atomic_read(v) (ACCESS_ONCE((v)->counter)) /** * atomic_set - set atomic variable diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h index 679dee0bbd0..3d6e60dad9d 100644 --- a/arch/mn10300/include/asm/uaccess.h +++ b/arch/mn10300/include/asm/uaccess.h @@ -160,9 +160,10 @@ struct __large_struct { unsigned long buf[100]; }; #define __get_user_check(x, ptr, size) \ ({ \ + const __typeof__(ptr) __guc_ptr = (ptr); \ int _e; \ - if (likely(__access_ok((unsigned long) (ptr), (size)))) \ - _e = __get_user_nocheck((x), (ptr), (size)); \ + if (likely(__access_ok((unsigned long) __guc_ptr, (size)))) \ + _e = __get_user_nocheck((x), __guc_ptr, (size)); \ else { \ _e = -EFAULT; \ (x) = (__typeof__(x))0; \ diff --git a/arch/mn10300/kernel/time.c b/arch/mn10300/kernel/time.c index 75da468090b..5b955000626 100644 --- a/arch/mn10300/kernel/time.c +++ b/arch/mn10300/kernel/time.c @@ -104,8 +104,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) unsigned tsc, elapse; irqreturn_t ret; - write_seqlock(&xtime_lock); - while (tsc = get_cycles(), elapse = tsc - mn10300_last_tsc, /* time elapsed since last * tick */ @@ -114,11 +112,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) mn10300_last_tsc += MN10300_TSC_PER_HZ; /* advance the kernel's time tracking system */ - do_timer(1); + xtime_update(1); } - write_sequnlock(&xtime_lock); - ret = local_timer_interrupt(); #ifdef CONFIG_SMP send_IPI_allbutself(LOCAL_TIMER_IPI); diff --git a/arch/mn10300/mm/cache-inv-icache.c b/arch/mn10300/mm/cache-inv-icache.c index a8933a60b2d..a6b63dde603 100644 --- a/arch/mn10300/mm/cache-inv-icache.c +++ b/arch/mn10300/mm/cache-inv-icache.c @@ -69,7 +69,7 @@ static void flush_icache_page_range(unsigned long start, unsigned long end) /* invalidate the icache coverage on that region */ mn10300_local_icache_inv_range2(addr + off, size); - smp_cache_call(SMP_ICACHE_INV_FLUSH_RANGE, start, end); + smp_cache_call(SMP_ICACHE_INV_RANGE, start, end); } /** @@ -101,7 +101,7 @@ void flush_icache_range(unsigned long start, unsigned long end) * directly */ start_page = (start >= 0x80000000UL) ? start : 0x80000000UL; mn10300_icache_inv_range(start_page, end); - smp_cache_call(SMP_ICACHE_INV_FLUSH_RANGE, start, end); + smp_cache_call(SMP_ICACHE_INV_RANGE, start, end); if (start_page == start) goto done; end = start_page; diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c index 30394081d9b..6ab9580b0b0 100644 --- a/arch/parisc/hpux/sys_hpux.c +++ b/arch/parisc/hpux/sys_hpux.c @@ -185,26 +185,21 @@ struct hpux_statfs { int16_t f_pad; }; -static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf) +static int do_statfs_hpux(struct kstatfs *st, struct hpux_statfs __user *p) { - struct kstatfs st; - int retval; - - retval = vfs_statfs(path, &st); - if (retval) - return retval; - - memset(buf, 0, sizeof(*buf)); - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid[0] = st.f_fsid.val[0]; - buf->f_fsid[1] = st.f_fsid.val[1]; - + struct hpux_statfs buf; + memset(&buf, 0, sizeof(buf)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid[0] = st->f_fsid.val[0]; + buf.f_fsid[1] = st->f_fsid.val[1]; + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } @@ -212,35 +207,19 @@ static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf) asmlinkage long hpux_statfs(const char __user *pathname, struct hpux_statfs __user *buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct hpux_statfs tmp; - error = do_statfs_hpux(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_hpux(&st, buf); return error; } asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf) { - struct file *file; - struct hpux_statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs_hpux(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); - out: + struct kstatfs st; + int error = fd_statfs(fd, &st); + if (!error) + error = do_statfs_hpux(&st, buf); return error; } diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index 0c705c3a55e..67a33cc27ef 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -8,7 +8,7 @@ #include <asm/errno.h> static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -18,7 +18,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -51,10 +51,10 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) /* Non-atomic version */ static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int err = 0; - int uval; + u32 val; /* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is * our gateway page, and causes no end of trouble... @@ -62,15 +62,15 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) if (segment_eq(KERNEL_DS, get_fs()) && !uaddr) return -EFAULT; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - err = get_user(uval, uaddr); - if (err) return -EFAULT; - if (uval == oldval) - err = put_user(newval, uaddr); - if (err) return -EFAULT; - return uval; + if (get_user(val, uaddr)) + return -EFAULT; + if (val == oldval && put_user(newval, uaddr)) + return -EFAULT; + *uval = val; + return 0; } #endif /*__KERNEL__*/ diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 05511ccb61d..45b7389d77a 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -162,11 +162,8 @@ irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id) update_process_times(user_mode(get_irq_regs())); } - if (cpu == 0) { - write_seqlock(&xtime_lock); - do_timer(ticks_elapsed); - write_sequnlock(&xtime_lock); - } + if (cpu == 0) + xtime_update(ticks_elapsed); return IRQ_HANDLED; } diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index 7c589ef81fb..c94e4a3fe2e 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -30,7 +30,7 @@ : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \ : "cr0", "memory") -static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -40,7 +40,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -82,35 +82,38 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - int prev; + int ret = 0; + u32 prev; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; __asm__ __volatile__ ( PPC_RELEASE_BARRIER -"1: lwarx %0,0,%2 # futex_atomic_cmpxchg_inatomic\n\ - cmpw 0,%0,%3\n\ +"1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\ + cmpw 0,%1,%4\n\ bne- 3f\n" - PPC405_ERR77(0,%2) -"2: stwcx. %4,0,%2\n\ + PPC405_ERR77(0,%3) +"2: stwcx. %5,0,%3\n\ bne- 1b\n" PPC_ACQUIRE_BARRIER "3: .section .fixup,\"ax\"\n\ -4: li %0,%5\n\ +4: li %0,%6\n\ b 3b\n\ .previous\n\ .section __ex_table,\"a\"\n\ .align 3\n\ " PPC_LONG "1b,4b,2b,4b\n\ .previous" \ - : "=&r" (prev), "+m" (*uaddr) + : "+r" (ret), "=&r" (prev), "+m" (*uaddr) : "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT) : "cc", "memory"); - return prev; + *uval = prev; + return ret; } #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index 380d48bacd1..26b8c807f8f 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -33,9 +33,25 @@ // //---------------------------------------------------------------------------- #include <linux/cache.h> +#include <linux/threads.h> #include <asm/types.h> #include <asm/mmu.h> +/* + * We only have to have statically allocated lppaca structs on + * legacy iSeries, which supports at most 64 cpus. + */ +#ifdef CONFIG_PPC_ISERIES +#if NR_CPUS < 64 +#define NR_LPPACAS NR_CPUS +#else +#define NR_LPPACAS 64 +#endif +#else /* not iSeries */ +#define NR_LPPACAS 1 +#endif + + /* The Hypervisor barfs if the lppaca crosses a page boundary. A 1k * alignment is sufficient to prevent this */ struct lppaca { diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 51e9e6f90d1..edeb80fdd2c 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -171,6 +171,16 @@ static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) return bus->sysdata; } +static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) +{ + struct pci_controller *host; + + if (bus->self) + return pci_device_to_OF_node(bus->self); + host = pci_bus_to_host(bus); + return host ? host->dn : NULL; +} + static inline int isa_vaddr_is_ioport(void __iomem *address) { /* No specific ISA handling on ppc32 at this stage, it diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index d7275758559..c189aa5fe1f 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -70,21 +70,6 @@ static inline int of_node_to_nid(struct device_node *device) { return 0; } #endif #define of_node_to_nid of_node_to_nid -/** - * of_irq_map_pci - Resolve the interrupt for a PCI device - * @pdev: the device whose interrupt is to be resolved - * @out_irq: structure of_irq filled by this function - * - * This function resolves the PCI interrupt for a given PCI device. If a - * device-node exists for a given pci_dev, it will use normal OF tree - * walking. If not, it will implement standard swizzling and walk up the - * PCI tree until an device-node is found, at which point it will finish - * resolving using the OF tree walking. - */ -struct pci_dev; -struct of_irq; -extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); - extern void of_instantiate_rtc(void); /* These includes are put at the bottom because they may contain things diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h index 8447d89fbe7..bb1e2cdeb9b 100644 --- a/arch/powerpc/include/asm/rwsem.h +++ b/arch/powerpc/include/asm/rwsem.h @@ -13,11 +13,6 @@ * by Paul Mackerras <paulus@samba.org>. */ -#include <linux/list.h> -#include <linux/spinlock.h> -#include <asm/atomic.h> -#include <asm/system.h> - /* * the semaphore definition */ @@ -33,47 +28,6 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -struct rw_semaphore { - long count; - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ -{ \ - RWSEM_UNLOCKED_VALUE, \ - __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) \ -} - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ - do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ - } while (0) - /* * lock for reading */ @@ -174,10 +128,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return atomic_long_add_return(delta, (atomic_long_t *)&sem->count); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return sem->count != 0; -} - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_RWSEM_H */ diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index ebf9846f3c3..f4adf89d761 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -27,20 +27,6 @@ extern unsigned long __toc_start; #ifdef CONFIG_PPC_BOOK3S /* - * We only have to have statically allocated lppaca structs on - * legacy iSeries, which supports at most 64 cpus. - */ -#ifdef CONFIG_PPC_ISERIES -#if NR_CPUS < 64 -#define NR_LPPACAS NR_CPUS -#else -#define NR_LPPACAS 64 -#endif -#else /* not iSeries */ -#define NR_LPPACAS 1 -#endif - -/* * The structure which the hypervisor knows about - this structure * should not cross a page boundary. The vpa_init/register_vpa call * is now known to fail if the lppaca structure crosses a page diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 10a44e68ef1..eb341be9a4d 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -22,6 +22,7 @@ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/of_address.h> +#include <linux/of_pci.h> #include <linux/mm.h> #include <linux/list.h> #include <linux/syscalls.h> diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c index c2b7a07cc3d..47187cc2cf0 100644 --- a/arch/powerpc/kernel/prom_parse.c +++ b/arch/powerpc/kernel/prom_parse.c @@ -2,95 +2,11 @@ #include <linux/kernel.h> #include <linux/string.h> -#include <linux/pci_regs.h> #include <linux/module.h> #include <linux/ioport.h> #include <linux/etherdevice.h> #include <linux/of_address.h> #include <asm/prom.h> -#include <asm/pci-bridge.h> - -#ifdef CONFIG_PCI -int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) -{ - struct device_node *dn, *ppnode; - struct pci_dev *ppdev; - u32 lspec; - u32 laddr[3]; - u8 pin; - int rc; - - /* Check if we have a device node, if yes, fallback to standard OF - * parsing - */ - dn = pci_device_to_OF_node(pdev); - if (dn) { - rc = of_irq_map_one(dn, 0, out_irq); - if (!rc) - return rc; - } - - /* Ok, we don't, time to have fun. Let's start by building up an - * interrupt spec. we assume #interrupt-cells is 1, which is standard - * for PCI. If you do different, then don't use that routine. - */ - rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin); - if (rc != 0) - return rc; - /* No pin, exit */ - if (pin == 0) - return -ENODEV; - - /* Now we walk up the PCI tree */ - lspec = pin; - for (;;) { - /* Get the pci_dev of our parent */ - ppdev = pdev->bus->self; - - /* Ouch, it's a host bridge... */ - if (ppdev == NULL) { -#ifdef CONFIG_PPC64 - ppnode = pci_bus_to_OF_node(pdev->bus); -#else - struct pci_controller *host; - host = pci_bus_to_host(pdev->bus); - ppnode = host ? host->dn : NULL; -#endif - /* No node for host bridge ? give up */ - if (ppnode == NULL) - return -EINVAL; - } else - /* We found a P2P bridge, check if it has a node */ - ppnode = pci_device_to_OF_node(ppdev); - - /* Ok, we have found a parent with a device-node, hand over to - * the OF parsing code. - * We build a unit address from the linux device to be used for - * resolution. Note that we use the linux bus number which may - * not match your firmware bus numbering. - * Fortunately, in most cases, interrupt-map-mask doesn't include - * the bus number as part of the matching. - * You should still be careful about that though if you intend - * to rely on this function (you ship a firmware that doesn't - * create device nodes for all PCI devices). - */ - if (ppnode) - break; - - /* We can only get here if we hit a P2P bridge with no node, - * let's do standard swizzling and try again - */ - lspec = pci_swizzle_interrupt_pin(pdev, lspec); - pdev = ppdev; - } - - laddr[0] = (pdev->bus->number << 16) - | (pdev->devfn << 8); - laddr[1] = laddr[2] = 0; - return of_irq_map_raw(ppnode, &lspec, 1, laddr, out_irq); -} -EXPORT_SYMBOL_GPL(of_irq_map_pci); -#endif /* CONFIG_PCI */ void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, unsigned long *busno, unsigned long *phys, unsigned long *size) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index fd481232957..0dc95c0aa3b 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1516,7 +1516,8 @@ int start_topology_update(void) { int rc = 0; - if (firmware_has_feature(FW_FEATURE_VPHN) && + /* Disabled until races with load balancing are fixed */ + if (0 && firmware_has_feature(FW_FEATURE_VPHN) && get_lppaca()->shared_proc) { vphn_enabled = 1; setup_cpu_associativity_change_counters(); diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 187a7d32f86..a3d2ce54ea2 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -70,7 +70,7 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, if (!IS_ERR(tmp)) { struct nameidata nd; - ret = path_lookup(tmp, LOOKUP_PARENT, &nd); + ret = kern_path_parent(tmp, &nd); if (!ret) { nd.flags |= LOOKUP_OPEN | LOOKUP_CREATE; ret = spufs_create(&nd, flags, mode, neighbor); diff --git a/arch/powerpc/platforms/iseries/dt.c b/arch/powerpc/platforms/iseries/dt.c index fdb7384c0c4..f0491cc2890 100644 --- a/arch/powerpc/platforms/iseries/dt.c +++ b/arch/powerpc/platforms/iseries/dt.c @@ -242,8 +242,8 @@ static void __init dt_cpus(struct iseries_flat_dt *dt) pft_size[0] = 0; /* NUMA CEC cookie, 0 for non NUMA */ pft_size[1] = __ilog2(HvCallHpt_getHptPages() * HW_PAGE_SIZE); - for (i = 0; i < NR_CPUS; i++) { - if (lppaca_of(i).dyn_proc_status >= 2) + for (i = 0; i < NR_LPPACAS; i++) { + if (lppaca[i].dyn_proc_status >= 2) continue; snprintf(p, 32 - (p - buf), "@%d", i); @@ -251,7 +251,7 @@ static void __init dt_cpus(struct iseries_flat_dt *dt) dt_prop_str(dt, "device_type", device_type_cpu); - index = lppaca_of(i).dyn_hv_phys_proc_index; + index = lppaca[i].dyn_hv_phys_proc_index; d = &xIoHriProcessorVpd[index]; dt_prop_u32(dt, "i-cache-size", d->xInstCacheSize * 1024); diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index b0863410517..2946ae10fbf 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -680,6 +680,7 @@ void * __init iSeries_early_setup(void) * on but calling this function multiple times is fine. */ identify_cpu(0, mfspr(SPRN_PVR)); + initialise_paca(&boot_paca, 0); powerpc_firmware_features |= FW_FEATURE_ISERIES; powerpc_firmware_features |= FW_FEATURE_LPAR; diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index 5c5d02de49e..81cf36b691f 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -7,7 +7,7 @@ #include <linux/uaccess.h> #include <asm/errno.h> -static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -18,7 +18,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -39,13 +39,13 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, - int oldval, int newval) +static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - return uaccess.futex_atomic_cmpxchg(uaddr, oldval, newval); + return uaccess.futex_atomic_cmpxchg(uval, uaddr, oldval, newval); } #endif /* __KERNEL__ */ diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index 423fdda2322..d0eb4653ceb 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -43,29 +43,6 @@ #ifdef __KERNEL__ -#include <linux/list.h> -#include <linux/spinlock.h> - -struct rwsem_waiter; - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *); - -/* - * the semaphore definition - */ -struct rw_semaphore { - signed long count; - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - #ifndef __s390x__ #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 @@ -81,41 +58,6 @@ struct rw_semaphore { #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) /* - * initialisation - */ - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait.lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -static inline void init_rwsem(struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) - - -/* * lock for reading */ static inline void __down_read(struct rw_semaphore *sem) @@ -377,10 +319,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return new; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _S390_RWSEM_H */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index d6b1ed0ec52..2d9ea11f919 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -83,8 +83,8 @@ struct uaccess_ops { size_t (*clear_user)(size_t, void __user *); size_t (*strnlen_user)(size_t, const char __user *); size_t (*strncpy_from_user)(size_t, const char __user *, char *); - int (*futex_atomic_op)(int op, int __user *, int oparg, int *old); - int (*futex_atomic_cmpxchg)(int __user *, int old, int new); + int (*futex_atomic_op)(int op, u32 __user *, int oparg, int *old); + int (*futex_atomic_cmpxchg)(u32 *, u32 __user *, u32 old, u32 new); }; extern struct uaccess_ops uaccess; diff --git a/arch/s390/lib/uaccess.h b/arch/s390/lib/uaccess.h index 126011df14f..1d2536cb630 100644 --- a/arch/s390/lib/uaccess.h +++ b/arch/s390/lib/uaccess.h @@ -12,12 +12,12 @@ extern size_t copy_from_user_std(size_t, const void __user *, void *); extern size_t copy_to_user_std(size_t, void __user *, const void *); extern size_t strnlen_user_std(size_t, const char __user *); extern size_t strncpy_from_user_std(size_t, const char __user *, char *); -extern int futex_atomic_cmpxchg_std(int __user *, int, int); -extern int futex_atomic_op_std(int, int __user *, int, int *); +extern int futex_atomic_cmpxchg_std(u32 *, u32 __user *, u32, u32); +extern int futex_atomic_op_std(int, u32 __user *, int, int *); extern size_t copy_from_user_pt(size_t, const void __user *, void *); extern size_t copy_to_user_pt(size_t, void __user *, const void *); -extern int futex_atomic_op_pt(int, int __user *, int, int *); -extern int futex_atomic_cmpxchg_pt(int __user *, int, int); +extern int futex_atomic_op_pt(int, u32 __user *, int, int *); +extern int futex_atomic_cmpxchg_pt(u32 *, u32 __user *, u32, u32); #endif /* __ARCH_S390_LIB_UACCESS_H */ diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index 404f2de296d..74833831417 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -302,7 +302,7 @@ fault: : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ "m" (*uaddr) : "cc" ); -static int __futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) +static int __futex_atomic_op_pt(int op, u32 __user *uaddr, int oparg, int *old) { int oldval = 0, newval, ret; @@ -335,7 +335,7 @@ static int __futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) return ret; } -int futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) +int futex_atomic_op_pt(int op, u32 __user *uaddr, int oparg, int *old) { int ret; @@ -354,26 +354,29 @@ int futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old) return ret; } -static int __futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval) +static int __futex_atomic_cmpxchg_pt(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { int ret; asm volatile("0: cs %1,%4,0(%5)\n" - "1: lr %0,%1\n" + "1: la %0,0\n" "2:\n" EX_TABLE(0b,2b) EX_TABLE(1b,2b) : "=d" (ret), "+d" (oldval), "=m" (*uaddr) : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) : "cc", "memory" ); + *uval = oldval; return ret; } -int futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval) +int futex_atomic_cmpxchg_pt(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { int ret; if (segment_eq(get_fs(), KERNEL_DS)) - return __futex_atomic_cmpxchg_pt(uaddr, oldval, newval); + return __futex_atomic_cmpxchg_pt(uval, uaddr, oldval, newval); spin_lock(¤t->mm->page_table_lock); uaddr = (int __user *) __dat_user_addr((unsigned long) uaddr); if (!uaddr) { @@ -382,7 +385,7 @@ int futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval) } get_page(virt_to_page(uaddr)); spin_unlock(¤t->mm->page_table_lock); - ret = __futex_atomic_cmpxchg_pt(uaddr, oldval, newval); + ret = __futex_atomic_cmpxchg_pt(uval, uaddr, oldval, newval); put_page(virt_to_page(uaddr)); return ret; } diff --git a/arch/s390/lib/uaccess_std.c b/arch/s390/lib/uaccess_std.c index a6c4f7ed24a..bb1a7eed42c 100644 --- a/arch/s390/lib/uaccess_std.c +++ b/arch/s390/lib/uaccess_std.c @@ -255,7 +255,7 @@ size_t strncpy_from_user_std(size_t size, const char __user *src, char *dst) : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ "m" (*uaddr) : "cc"); -int futex_atomic_op_std(int op, int __user *uaddr, int oparg, int *old) +int futex_atomic_op_std(int op, u32 __user *uaddr, int oparg, int *old) { int oldval = 0, newval, ret; @@ -287,19 +287,21 @@ int futex_atomic_op_std(int op, int __user *uaddr, int oparg, int *old) return ret; } -int futex_atomic_cmpxchg_std(int __user *uaddr, int oldval, int newval) +int futex_atomic_cmpxchg_std(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { int ret; asm volatile( " sacf 256\n" "0: cs %1,%4,0(%5)\n" - "1: lr %0,%1\n" + "1: la %0,0\n" "2: sacf 0\n" EX_TABLE(0b,2b) EX_TABLE(1b,2b) : "=d" (ret), "+d" (oldval), "=m" (*uaddr) : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) : "cc", "memory" ); + *uval = oldval; return ret; } diff --git a/arch/sh/include/asm/futex-irq.h b/arch/sh/include/asm/futex-irq.h index a9f16a7f9ae..6cb9f193a95 100644 --- a/arch/sh/include/asm/futex-irq.h +++ b/arch/sh/include/asm/futex-irq.h @@ -3,7 +3,7 @@ #include <asm/system.h> -static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, +static inline int atomic_futex_op_xchg_set(int oparg, u32 __user *uaddr, int *oldval) { unsigned long flags; @@ -20,7 +20,7 @@ static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, +static inline int atomic_futex_op_xchg_add(int oparg, u32 __user *uaddr, int *oldval) { unsigned long flags; @@ -37,7 +37,7 @@ static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, +static inline int atomic_futex_op_xchg_or(int oparg, u32 __user *uaddr, int *oldval) { unsigned long flags; @@ -54,7 +54,7 @@ static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, +static inline int atomic_futex_op_xchg_and(int oparg, u32 __user *uaddr, int *oldval) { unsigned long flags; @@ -71,7 +71,7 @@ static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, +static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *oldval) { unsigned long flags; @@ -88,11 +88,13 @@ static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, return ret; } -static inline int atomic_futex_op_cmpxchg_inatomic(int __user *uaddr, - int oldval, int newval) +static inline int atomic_futex_op_cmpxchg_inatomic(u32 *uval, + u32 __user *uaddr, + u32 oldval, u32 newval) { unsigned long flags; - int ret, prev = 0; + int ret; + u32 prev = 0; local_irq_save(flags); @@ -102,10 +104,8 @@ static inline int atomic_futex_op_cmpxchg_inatomic(int __user *uaddr, local_irq_restore(flags); - if (ret) - return ret; - - return prev; + *uval = prev; + return ret; } #endif /* __ASM_SH_FUTEX_IRQ_H */ diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h index 68256ec5fa3..7be39a646fb 100644 --- a/arch/sh/include/asm/futex.h +++ b/arch/sh/include/asm/futex.h @@ -10,7 +10,7 @@ /* XXX: UP variants, fix for SH-4A and SMP.. */ #include <asm/futex-irq.h> -static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -21,7 +21,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -65,12 +65,13 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - return atomic_futex_op_cmpxchg_inatomic(uaddr, oldval, newval); + return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); } #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h index 06e2251a5e4..edab5726529 100644 --- a/arch/sh/include/asm/rwsem.h +++ b/arch/sh/include/asm/rwsem.h @@ -11,64 +11,13 @@ #endif #ifdef __KERNEL__ -#include <linux/list.h> -#include <linux/spinlock.h> -#include <asm/atomic.h> -#include <asm/system.h> -/* - * the semaphore definition - */ -struct rw_semaphore { - long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 #define RWSEM_ACTIVE_MASK 0x0000ffff #define RWSEM_WAITING_BIAS (-0x00010000) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) - -static inline void init_rwsem(struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} /* * lock for reading @@ -179,10 +128,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ASM_SH_RWSEM_H */ diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h index 47f95839dc6..444e7bea23b 100644 --- a/arch/sparc/include/asm/futex_64.h +++ b/arch/sparc/include/asm/futex_64.h @@ -30,7 +30,7 @@ : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -38,7 +38,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))) + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) return -EFAULT; if (unlikely((((unsigned long) uaddr) & 0x3UL))) return -EINVAL; @@ -85,26 +85,30 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { + int ret = 0; + __asm__ __volatile__( - "\n1: casa [%3] %%asi, %2, %0\n" + "\n1: casa [%4] %%asi, %3, %1\n" "2:\n" " .section .fixup,#alloc,#execinstr\n" " .align 4\n" "3: sethi %%hi(2b), %0\n" " jmpl %0 + %%lo(2b), %%g0\n" - " mov %4, %0\n" + " mov %5, %0\n" " .previous\n" " .section __ex_table,\"a\"\n" " .align 4\n" " .word 1b, 3b\n" " .previous\n" - : "=r" (newval) - : "0" (newval), "r" (oldval), "r" (uaddr), "i" (-EFAULT) + : "+r" (ret), "=r" (newval) + : "1" (newval), "r" (oldval), "r" (uaddr), "i" (-EFAULT) : "memory"); - return newval; + *uval = newval; + return ret; } #endif /* !(_SPARC64_FUTEX_H) */ diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h index a2b4302869b..069bf4d663a 100644 --- a/arch/sparc/include/asm/rwsem.h +++ b/arch/sparc/include/asm/rwsem.h @@ -13,53 +13,12 @@ #ifdef __KERNEL__ -#include <linux/list.h> -#include <linux/spinlock.h> - -struct rwsem_waiter; - -struct rw_semaphore { - signed long count; #define RWSEM_UNLOCKED_VALUE 0x00000000L #define RWSEM_ACTIVE_BIAS 0x00000001L #define RWSEM_ACTIVE_MASK 0xffffffffL #define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ -{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) /* * lock for reading @@ -160,11 +119,6 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return atomic64_add_return(delta, (atomic64_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _SPARC64_RWSEM_H */ diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index aeaa09a3c65..2cdc131b50a 100644 --- a/arch/sparc/kernel/pcic.c +++ b/arch/sparc/kernel/pcic.c @@ -700,10 +700,8 @@ static void pcic_clear_clock_irq(void) static irqreturn_t pcic_timer_handler (int irq, void *h) { - write_seqlock(&xtime_lock); /* Dummy, to show that we remember */ pcic_clear_clock_irq(); - do_timer(1); - write_sequnlock(&xtime_lock); + xtime_update(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 9c743b1886f..4211bfc9bca 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -85,7 +85,7 @@ int update_persistent_clock(struct timespec now) /* * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * as well as call the "xtime_update()" routine every clocktick */ #define TICK_SIZE (tick_nsec / 1000) @@ -96,14 +96,9 @@ static irqreturn_t timer_interrupt(int dummy, void *dev_id) profile_tick(CPU_PROFILING); #endif - /* Protect counter clear so that do_gettimeoffset works */ - write_seqlock(&xtime_lock); - clear_clock_irq(); - do_timer(1); - - write_sequnlock(&xtime_lock); + xtime_update(1); #ifndef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c index cbddeb38ffd..d3c7a12ad87 100644 --- a/arch/sparc/lib/atomic32.c +++ b/arch/sparc/lib/atomic32.c @@ -16,7 +16,7 @@ #define ATOMIC_HASH(a) (&__atomic_hash[(((unsigned long)a)>>8) & (ATOMIC_HASH_SIZE-1)]) spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] = { - [0 ... (ATOMIC_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED + [0 ... (ATOMIC_HASH_SIZE-1)] = __SPIN_LOCK_UNLOCKED(__atomic_hash) }; #else /* SMP */ diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h index fe0d10dcae5..d03ec124a59 100644 --- a/arch/tile/include/asm/futex.h +++ b/arch/tile/include/asm/futex.h @@ -29,16 +29,16 @@ #include <linux/uaccess.h> #include <linux/errno.h> -extern struct __get_user futex_set(int __user *v, int i); -extern struct __get_user futex_add(int __user *v, int n); -extern struct __get_user futex_or(int __user *v, int n); -extern struct __get_user futex_andn(int __user *v, int n); -extern struct __get_user futex_cmpxchg(int __user *v, int o, int n); +extern struct __get_user futex_set(u32 __user *v, int i); +extern struct __get_user futex_add(u32 __user *v, int n); +extern struct __get_user futex_or(u32 __user *v, int n); +extern struct __get_user futex_andn(u32 __user *v, int n); +extern struct __get_user futex_cmpxchg(u32 __user *v, int o, int n); #ifndef __tilegx__ -extern struct __get_user futex_xor(int __user *v, int n); +extern struct __get_user futex_xor(u32 __user *v, int n); #else -static inline struct __get_user futex_xor(int __user *uaddr, int n) +static inline struct __get_user futex_xor(u32 __user *uaddr, int n) { struct __get_user asm_ret = __get_user_4(uaddr); if (!asm_ret.err) { @@ -53,7 +53,7 @@ static inline struct __get_user futex_xor(int __user *uaddr, int n) } #endif -static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -65,7 +65,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -119,16 +119,17 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, - int newval) +static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { struct __get_user asm_ret; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; asm_ret = futex_cmpxchg(uaddr, oldval, newval); - return asm_ret.err ? asm_ret.err : asm_ret.val; + *uval = asm_ret.val; + return asm_ret.err; } #ifndef __tilegx__ diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index e351e14b433..1e78940218c 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -7,6 +7,7 @@ config UML bool default y select HAVE_GENERIC_HARDIRQS + select GENERIC_HARDIRQS_NO_DEPRECATED config MMU bool diff --git a/arch/um/Kconfig.x86 b/arch/um/Kconfig.x86 index 5ee328099c6..02fb017fed4 100644 --- a/arch/um/Kconfig.x86 +++ b/arch/um/Kconfig.x86 @@ -10,6 +10,8 @@ endmenu config UML_X86 def_bool y + select GENERIC_FIND_FIRST_BIT + select GENERIC_FIND_NEXT_BIT config 64BIT bool @@ -19,6 +21,9 @@ config X86_32 def_bool !64BIT select HAVE_AOUT +config X86_64 + def_bool 64BIT + config RWSEM_XCHGADD_ALGORITHM def_bool X86_XADD diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 975613b23dc..c70e047eed7 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -124,35 +124,18 @@ void mconsole_log(struct mc_request *req) #if 0 void mconsole_proc(struct mc_request *req) { - struct nameidata nd; struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; struct file *file; - int n, err; + int n; char *ptr = req->request.data, *buf; mm_segment_t old_fs = get_fs(); ptr += strlen("proc"); ptr = skip_spaces(ptr); - err = vfs_path_lookup(mnt->mnt_root, mnt, ptr, LOOKUP_FOLLOW, &nd); - if (err) { - mconsole_reply(req, "Failed to look up file", 1, 0); - goto out; - } - - err = may_open(&nd.path, MAY_READ, O_RDONLY); - if (result) { - mconsole_reply(req, "Failed to open file", 1, 0); - path_put(&nd.path); - goto out; - } - - file = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY, - current_cred()); - err = PTR_ERR(file); + file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); - path_put(&nd.path); goto out; } diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index ba4a98ba39c..620f5b70957 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -185,7 +185,7 @@ struct ubd { .no_cow = 0, \ .shared = 0, \ .cow = DEFAULT_COW, \ - .lock = SPIN_LOCK_UNLOCKED, \ + .lock = __SPIN_LOCK_UNLOCKED(ubd_devs.lock), \ .request = NULL, \ .start_sg = 0, \ .end_sg = 0, \ diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 3f0ac9e0c96..64cfea80cfe 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -35,8 +35,10 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - raw_spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; + struct irq_desc *desc = irq_to_desc(i); + + raw_spin_lock_irqsave(&desc->lock, flags); + action = desc->action; if (!action) goto skip; seq_printf(p, "%3d: ",i); @@ -46,7 +48,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif - seq_printf(p, " %14s", irq_desc[i].chip->name); + seq_printf(p, " %14s", get_irq_desc_chip(desc)->name); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) @@ -54,7 +56,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - raw_spin_unlock_irqrestore(&irq_desc[i].lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } else if (i == NR_IRQS) seq_putc(p, '\n'); @@ -360,10 +362,10 @@ EXPORT_SYMBOL(um_request_irq); EXPORT_SYMBOL(reactivate_fd); /* - * irq_chip must define (startup || enable) && - * (shutdown || disable) && end + * irq_chip must define at least enable/disable and ack when + * the edge handler is used. */ -static void dummy(unsigned int irq) +static void dummy(struct irq_data *d) { } @@ -371,20 +373,17 @@ static void dummy(unsigned int irq) static struct irq_chip normal_irq_type = { .name = "SIGIO", .release = free_irq_by_irq_and_dev, - .disable = dummy, - .enable = dummy, - .ack = dummy, - .end = dummy + .irq_disable = dummy, + .irq_enable = dummy, + .irq_ack = dummy, }; static struct irq_chip SIGVTALRM_irq_type = { .name = "SIGVTALRM", .release = free_irq_by_irq_and_dev, - .shutdown = dummy, /* never called */ - .disable = dummy, - .enable = dummy, - .ack = dummy, - .end = dummy + .irq_disable = dummy, + .irq_enable = dummy, + .irq_ack = dummy, }; void __init init_IRQ(void) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d5ed94d30aa..f8958b01b97 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -64,8 +64,12 @@ config X86 select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS select HAVE_SPARSE_IRQ + select GENERIC_FIND_FIRST_BIT + select GENERIC_FIND_NEXT_BIT select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP + select GENERIC_IRQ_SHOW + select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP config INSTRUCTION_DECODER @@ -382,6 +386,8 @@ config X86_INTEL_CE depends on X86_32 depends on X86_EXTENDED_PLATFORM select X86_REBOOTFIXUPS + select OF + select OF_EARLY_FLATTREE ---help--- Select for the Intel CE media processor (CE4100) SOC. This option compiles in support for the CE4100 SOC for settop @@ -811,7 +817,7 @@ config X86_LOCAL_APIC config X86_IO_APIC def_bool y - depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC config X86_VISWS_APIC def_bool y @@ -1705,7 +1711,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID depends on NUMA config USE_PERCPU_NUMA_NODE_ID - def_bool X86_64 + def_bool y depends on NUMA menu "Power management and ACPI options" @@ -2066,9 +2072,10 @@ config SCx200HR_TIMER config OLPC bool "One Laptop Per Child support" + depends on !X86_PAE select GPIOLIB - select OLPC_OPENFIRMWARE - depends on !X86_64 && !X86_PAE + select OF + select OF_PROMTREE if PROC_DEVICETREE ---help--- Add support for detecting the unique features of the OLPC XO hardware. @@ -2079,21 +2086,6 @@ config OLPC_XO1 ---help--- Add support for non-essential features of the OLPC XO-1 laptop. -config OLPC_OPENFIRMWARE - bool "Support for OLPC's Open Firmware" - depends on !X86_64 && !X86_PAE - default n - select OF - help - This option adds support for the implementation of Open Firmware - that is used on the OLPC XO-1 Children's Machine. - If unsure, say N here. - -config OLPC_OPENFIRMWARE_DT - bool - default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE - select OF_PROMTREE - endif # X86_32 config AMD_NB diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 283c5a6a03a..ed47e6e1747 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -294,11 +294,6 @@ config X86_GENERIC endif -config X86_CPU - def_bool y - select GENERIC_FIND_FIRST_BIT - select GENERIC_FIND_NEXT_BIT - # # Define implied options from the CPU selection here config X86_INTERNODE_CACHE_SHIFT diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index 646aa78ba5f..46a82388243 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -62,7 +62,12 @@ int main(int argc, char *argv[]) if (fseek(f, -4L, SEEK_END)) { perror(argv[1]); } - fread(&olen, sizeof olen, 1, f); + + if (fread(&olen, sizeof(olen), 1, f) != 1) { + perror(argv[1]); + return 1; + } + ilen = ftell(f); olen = getle32(&olen); fclose(f); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index f729b2e9679..430312ba6e3 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -128,26 +128,20 @@ ENTRY(ia32_sysenter_target) */ ENABLE_INTERRUPTS(CLBR_NONE) movl %ebp,%ebp /* zero extension */ - pushq $__USER32_DS - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__USER32_DS /*CFI_REL_OFFSET ss,0*/ - pushq %rbp - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rbp CFI_REL_OFFSET rsp,0 - pushfq - CFI_ADJUST_CFA_OFFSET 8 + pushfq_cfi /*CFI_REL_OFFSET rflags,0*/ movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d CFI_REGISTER rip,r10 - pushq $__USER32_CS - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ movl %eax, %eax - pushq %r10 - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %r10 CFI_REL_OFFSET rip,0 - pushq %rax - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax cld SAVE_ARGS 0,0,1 /* no need to do an access_ok check here because rbp has been @@ -184,11 +178,9 @@ sysexit_from_sys_call: xorq %r9,%r9 xorq %r10,%r10 xorq %r11,%r11 - popfq - CFI_ADJUST_CFA_OFFSET -8 + popfq_cfi /*CFI_RESTORE rflags*/ - popq %rcx /* User %esp */ - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rcx /* User %esp */ CFI_REGISTER rsp,rcx TRACE_IRQS_ON ENABLE_INTERRUPTS_SYSEXIT32 @@ -423,8 +415,7 @@ ENTRY(ia32_syscall) */ ENABLE_INTERRUPTS(CLBR_NONE) movl %eax,%eax - pushq %rax - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax cld /* note the registers are not zero extended to the sf. this could be a problem. */ @@ -853,4 +844,7 @@ ia32_sys_call_table: .quad sys_fanotify_init .quad sys32_fanotify_mark .quad sys_prlimit64 /* 340 */ + .quad sys_name_to_handle_at + .quad compat_sys_open_by_handle_at + .quad compat_sys_clock_adjtime ia32_syscall_end: diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 4ea15ca89b2..b964ec45754 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -186,15 +186,7 @@ struct bootnode; #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; -extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, - unsigned long end); -extern int acpi_scan_nodes(unsigned long start, unsigned long end); -#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) - -#ifdef CONFIG_NUMA_EMU -extern void acpi_fake_nodes(const struct bootnode *fake_nodes, - int num_nodes); -#endif +extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ #define acpi_unlazy_tlb(x) leave_mm(x) diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 64dc82ee19f..e264ae5a144 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -9,23 +9,20 @@ struct amd_nb_bus_dev_range { u8 dev_limit; }; -extern struct pci_device_id amd_nb_misc_ids[]; +extern const struct pci_device_id amd_nb_misc_ids[]; extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; struct bootnode; extern int early_is_amd_nb(u32 value); extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); -extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); -extern int amd_scan_nodes(void); - -#ifdef CONFIG_NUMA_EMU -extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes); -extern void amd_get_nodes(struct bootnode *nodes); -#endif +extern int amd_numa_init(void); +extern int amd_get_subcaches(int); +extern int amd_set_subcaches(int, int); struct amd_northbridge { struct pci_dev *misc; + struct pci_dev *link; }; struct amd_northbridge_info { @@ -37,6 +34,7 @@ extern struct amd_northbridge_info amd_northbridges; #define AMD_NB_GART 0x1 #define AMD_NB_L3_INDEX_DISABLE 0x2 +#define AMD_NB_L3_PARTITIONING 0x4 #ifdef CONFIG_AMD_NB diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3c896946f4c..a279d98ea95 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -220,7 +220,6 @@ extern void enable_IR_x2apic(void); extern int get_physical_broadcast(void); -extern void apic_disable(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void connect_bsp_APIC(void); @@ -228,7 +227,6 @@ extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void lapic_shutdown(void); extern int verify_local_APIC(void); -extern void cache_APIC_registers(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); extern void setup_local_APIC(void); @@ -239,8 +237,7 @@ void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); -extern void enable_NMI_through_LVT0(void); -extern int apic_force_enable(void); +extern int apic_force_enable(unsigned long addr); /* * On 32bit this is mach-xxx local @@ -261,7 +258,6 @@ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } -static inline void apic_disable(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop #endif /* !CONFIG_X86_LOCAL_APIC */ @@ -307,8 +303,6 @@ struct apic { void (*setup_apic_routing)(void); int (*multi_timer_check)(int apic, int irq); - int (*apicid_to_node)(int logical_apicid); - int (*cpu_to_logical_apicid)(int cpu); int (*cpu_present_to_apicid)(int mps_cpu); void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); void (*setup_portio_remap)(void); @@ -356,6 +350,23 @@ struct apic { void (*icr_write)(u32 low, u32 high); void (*wait_icr_idle)(void); u32 (*safe_wait_icr_idle)(void); + +#ifdef CONFIG_X86_32 + /* + * Called very early during boot from get_smp_config(). It should + * return the logical apicid. x86_[bios]_cpu_to_apicid is + * initialized before this function is called. + * + * If logical apicid can't be determined that early, the function + * may return BAD_APICID. Logical apicid will be configured after + * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity + * won't be applied properly during early boot in this case. + */ + int (*x86_32_early_logical_apicid)(int cpu); + + /* determine CPU -> NUMA node mapping */ + int (*x86_32_numa_cpu_node)(int cpu); +#endif }; /* @@ -503,6 +514,11 @@ extern struct apic apic_noop; extern struct apic apic_default; +static inline int noop_x86_32_early_logical_apicid(int cpu) +{ + return BAD_APICID; +} + /* * Set up the logical destination ID. * @@ -522,7 +538,7 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) return cpuid_apic >> index_msb; } -extern int default_apicid_to_node(int logical_apicid); +extern int default_x86_32_numa_cpu_node(int cpu); #endif @@ -558,12 +574,6 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma *retmap = *phys_map; } -/* Mapping from cpu number to logical apicid */ -static inline int default_cpu_to_logical_apicid(int cpu) -{ - return 1 << cpu; -} - static inline int __default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) @@ -596,8 +606,4 @@ extern int default_check_phys_apicid_present(int phys_apicid); #endif /* CONFIG_X86_LOCAL_APIC */ -#ifdef CONFIG_X86_32 -extern u8 cpu_2_logical_apicid[NR_CPUS]; -#endif - #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 47a30ff8e51..d87988bacf3 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -426,4 +426,16 @@ struct local_apic { #else #define BAD_APICID 0xFFFFu #endif + +enum ioapic_irq_destination_types { + dest_Fixed = 0, + dest_LowestPrio = 1, + dest_SMI = 2, + dest__reserved_1 = 3, + dest_NMI = 4, + dest_INIT = 5, + dest__reserved_2 = 6, + dest_ExtINT = 7 +}; + #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index c8bfe63a06d..e020d88ec02 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -12,6 +12,7 @@ /* setup data types */ #define SETUP_NONE 0 #define SETUP_E820_EXT 1 +#define SETUP_DTB 2 /* extensible setup data list node */ struct setup_data { diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h new file mode 100644 index 00000000000..e656ad8c0a2 --- /dev/null +++ b/arch/x86/include/asm/ce4100.h @@ -0,0 +1,6 @@ +#ifndef _ASM_CE4100_H_ +#define _ASM_CE4100_H_ + +int ce4100_pci_init(void); + +#endif diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index e99d55d74df..908b96957d8 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -96,7 +96,7 @@ extern void e820_setup_gap(void); extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, unsigned long start_addr, unsigned long long end_addr); struct setup_data; -extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); +extern void parse_e820_ext(struct setup_data *data); #if defined(CONFIG_X86_64) || \ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 57650ab4a5f..1cd6d26a0a8 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -16,10 +16,13 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) -.irpc idx, "01234567" +.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +.if NUM_INVALIDATE_TLB_VECTORS > \idx BUILD_INTERRUPT3(invalidate_interrupt\idx, (INVALIDATE_TLB_VECTOR_START)+\idx, smp_invalidate_interrupt) +.endif .endr #endif diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 06850a7194e..2c6fc9e6281 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -7,14 +7,12 @@ frame pointer later */ #ifdef CONFIG_FRAME_POINTER .macro FRAME - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp,0 movl %esp,%ebp .endm .macro ENDFRAME - popl %ebp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebp CFI_RESTORE ebp .endm #else diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 1f11ce44e95..d09bb03653f 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -37,7 +37,7 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) @@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) return ret; } -static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, - int newval) +static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { + int ret = 0; #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) /* Real i386 machines have no cmpxchg instruction */ @@ -119,21 +120,22 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, return -ENOSYS; #endif - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" + asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" "2:\t.section .fixup, \"ax\"\n" - "3:\tmov %2, %0\n" + "3:\tmov %3, %0\n" "\tjmp 2b\n" "\t.previous\n" _ASM_EXTABLE(1b, 3b) - : "=a" (oldval), "+m" (*uaddr) - : "i" (-EFAULT), "r" (newval), "0" (oldval) + : "+r" (ret), "=a" (oldval), "+m" (*uaddr) + : "i" (-EFAULT), "r" (newval), "1" (oldval) : "memory" ); - return oldval; + *uval = oldval; + return ret; } #endif diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 0274ec5a7e6..bb9efe8706e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -45,6 +45,30 @@ extern void invalidate_interrupt4(void); extern void invalidate_interrupt5(void); extern void invalidate_interrupt6(void); extern void invalidate_interrupt7(void); +extern void invalidate_interrupt8(void); +extern void invalidate_interrupt9(void); +extern void invalidate_interrupt10(void); +extern void invalidate_interrupt11(void); +extern void invalidate_interrupt12(void); +extern void invalidate_interrupt13(void); +extern void invalidate_interrupt14(void); +extern void invalidate_interrupt15(void); +extern void invalidate_interrupt16(void); +extern void invalidate_interrupt17(void); +extern void invalidate_interrupt18(void); +extern void invalidate_interrupt19(void); +extern void invalidate_interrupt20(void); +extern void invalidate_interrupt21(void); +extern void invalidate_interrupt22(void); +extern void invalidate_interrupt23(void); +extern void invalidate_interrupt24(void); +extern void invalidate_interrupt25(void); +extern void invalidate_interrupt26(void); +extern void invalidate_interrupt27(void); +extern void invalidate_interrupt28(void); +extern void invalidate_interrupt29(void); +extern void invalidate_interrupt30(void); +extern void invalidate_interrupt31(void); extern void irq_move_cleanup_interrupt(void); extern void reboot_interrupt(void); diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 36fb1a6a510..8dbe353e41e 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -11,8 +11,8 @@ kernel_physical_mapping_init(unsigned long start, unsigned long page_size_mask); -extern unsigned long __initdata e820_table_start; -extern unsigned long __meminitdata e820_table_end; -extern unsigned long __meminitdata e820_table_top; +extern unsigned long __initdata pgt_buf_start; +extern unsigned long __meminitdata pgt_buf_end; +extern unsigned long __meminitdata pgt_buf_top; #endif /* _ASM_X86_INIT_32_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index f327d386d6c..c4bd267dfc5 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -63,17 +63,6 @@ union IO_APIC_reg_03 { } __attribute__ ((packed)) bits; }; -enum ioapic_irq_destination_types { - dest_Fixed = 0, - dest_LowestPrio = 1, - dest_SMI = 2, - dest__reserved_1 = 3, - dest_NMI = 4, - dest_INIT = 5, - dest__reserved_2 = 6, - dest_ExtINT = 7 -}; - struct IO_APIC_route_entry { __u32 vector : 8, delivery_mode : 3, /* 000: FIXED @@ -106,6 +95,10 @@ struct IR_IO_APIC_route_entry { index : 15; } __attribute__ ((packed)); +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + #ifdef CONFIG_X86_IO_APIC /* @@ -150,11 +143,6 @@ extern int timer_through_8259; #define io_apic_assign_pci_irqs \ (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) -extern u8 io_apic_unique_id(u8 id); -extern int io_apic_get_unique_id(int ioapic, int apic_id); -extern int io_apic_get_version(int ioapic); -extern int io_apic_get_redir_entries(int ioapic); - struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); @@ -162,6 +150,8 @@ void setup_IO_APIC_irq_extra(u32 gsi); extern void ioapic_and_gsi_init(void); extern void ioapic_insert_resources(void); +int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr); + extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); @@ -186,6 +176,8 @@ extern void __init pre_init_apic_IRQ0(void); extern void mp_save_irq(struct mpc_intsrc *m); +extern void disable_ioapic_support(void); + #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 @@ -199,6 +191,26 @@ static inline int mp_find_ioapic(u32 gsi) { return 0; } struct io_apic_irq_attr; static inline int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { return 0; } + +static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void) +{ + return NULL; +} + +static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { } +static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent) +{ + return -ENOMEM; +} + +static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { } +static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent) +{ + return -ENOMEM; +} + +static inline void mp_save_irq(struct mpc_intsrc *m) { }; +static inline void disable_ioapic_support(void) { } #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 0b7228268a6..615fa9061b5 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -123,10 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector); -extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, - int vector); -extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, - int vector); /* Avoid include hell */ #define NMI_VECTOR 0x02 @@ -150,6 +146,10 @@ static inline void __default_local_send_IPI_all(int vector) } #ifdef CONFIG_X86_32 +extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, + int vector); +extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, + int vector); extern void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); extern void default_send_IPI_allbutself(int vector); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index c704b38c57a..ba870bb6dd8 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -10,9 +10,6 @@ #include <asm/apicdef.h> #include <asm/irq_vectors.h> -/* Even though we don't support this, supply it to appease OF */ -static inline void irq_dispose_mapping(unsigned int virq) { } - static inline int irq_canonicalize(int irq) { return ((irq == 2) ? 9 : irq); diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h new file mode 100644 index 00000000000..423bbbddf36 --- /dev/null +++ b/arch/x86/include/asm/irq_controller.h @@ -0,0 +1,12 @@ +#ifndef __IRQ_CONTROLLER__ +#define __IRQ_CONTROLLER__ + +struct irq_domain { + int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize, + u32 *out_hwirq, u32 *out_type); + void *priv; + struct device_node *controller; + struct list_head l; +}; + +#endif diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6af0894dafb..6e976ee3b3e 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_IRQ_VECTORS_H #define _ASM_X86_IRQ_VECTORS_H +#include <linux/threads.h> /* * Linux IRQ vector layout. * @@ -16,8 +17,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... 237 : device interrupts - * Vectors 238 ... 255 : special interrupts + * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts + * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * @@ -96,37 +97,43 @@ #define THRESHOLD_APIC_VECTOR 0xf9 #define REBOOT_VECTOR 0xf8 -/* f0-f7 used for spreading out TLB flushes: */ -#define INVALIDATE_TLB_VECTOR_END 0xf7 -#define INVALIDATE_TLB_VECTOR_START 0xf0 -#define NUM_INVALIDATE_TLB_VECTORS 8 - -/* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. - */ -#define LOCAL_TIMER_VECTOR 0xef - /* * Generic system vector for platform specific use */ -#define X86_PLATFORM_IPI_VECTOR 0xed +#define X86_PLATFORM_IPI_VECTOR 0xf7 /* * IRQ work vector: */ -#define IRQ_WORK_VECTOR 0xec +#define IRQ_WORK_VECTOR 0xf6 -#define UV_BAU_MESSAGE 0xea +#define UV_BAU_MESSAGE 0xf5 /* * Self IPI vector for machine checks */ -#define MCE_SELF_VECTOR 0xeb +#define MCE_SELF_VECTOR 0xf4 /* Xen vector callback to receive events in a HVM domain */ -#define XEN_HVM_EVTCHN_CALLBACK 0xe9 +#define XEN_HVM_EVTCHN_CALLBACK 0xf3 + +/* + * Local APIC timer IRQ vector is on a different priority level, + * to work around the 'lost local interrupt if more than 2 IRQ + * sources per level' errata. + */ +#define LOCAL_TIMER_VECTOR 0xef + +/* up to 32 vectors used for spreading out TLB flushes: */ +#if NR_CPUS <= 32 +# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS) +#else +# define NUM_INVALIDATE_TLB_VECTORS (32) +#endif + +#define INVALIDATE_TLB_VECTOR_END (0xee) +#define INVALIDATE_TLB_VECTOR_START \ + (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1) #define NR_VECTORS 256 diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 0c90dd9f050..9c7d95f6174 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -25,7 +25,6 @@ extern int pic_mode; #define MAX_IRQ_SOURCES 256 extern unsigned int def_to_bigsmp; -extern u8 apicid_2_node[]; #ifdef CONFIG_X86_NUMAQ extern int mp_bus_id_to_node[MAX_MP_BUSSES]; @@ -33,8 +32,6 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES]; extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; #endif -#define MAX_APICID 256 - #else /* CONFIG_X86_64: */ #define MAX_MP_BUSSES 256 diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 27da400d313..3d4dab43c99 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,5 +1,57 @@ +#ifndef _ASM_X86_NUMA_H +#define _ASM_X86_NUMA_H + +#include <asm/topology.h> +#include <asm/apicdef.h> + +#ifdef CONFIG_NUMA + +#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) + +/* + * __apicid_to_node[] stores the raw mapping between physical apicid and + * node and is used to initialize cpu_to_node mapping. + * + * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus + * should be accessed by the accessors - set_apicid_to_node() and + * numa_cpu_node(). + */ +extern s16 __apicid_to_node[MAX_LOCAL_APIC]; + +static inline void set_apicid_to_node(int apicid, s16 node) +{ + __apicid_to_node[apicid] = node; +} +#else /* CONFIG_NUMA */ +static inline void set_apicid_to_node(int apicid, s16 node) +{ +} +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_X86_32 # include "numa_32.h" #else # include "numa_64.h" #endif + +#ifdef CONFIG_NUMA +extern void __cpuinit numa_set_node(int cpu, int node); +extern void __cpuinit numa_clear_node(int cpu); +extern void __init numa_init_array(void); +extern void __init init_cpu_to_node(void); +extern void __cpuinit numa_add_cpu(int cpu); +extern void __cpuinit numa_remove_cpu(int cpu); +#else /* CONFIG_NUMA */ +static inline void numa_set_node(int cpu, int node) { } +static inline void numa_clear_node(int cpu) { } +static inline void numa_init_array(void) { } +static inline void init_cpu_to_node(void) { } +static inline void numa_add_cpu(int cpu) { } +static inline void numa_remove_cpu(int cpu) { } +#endif /* CONFIG_NUMA */ + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable); +#endif + +#endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index b0ef2b449a9..c6beed1ef10 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -4,7 +4,12 @@ extern int numa_off; extern int pxm_to_nid(int pxm); -extern void numa_remove_cpu(int cpu); + +#ifdef CONFIG_NUMA +extern int __cpuinit numa_cpu_node(int cpu); +#else /* CONFIG_NUMA */ +static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } +#endif /* CONFIG_NUMA */ #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 0493be39607..344eb1790b4 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -2,23 +2,16 @@ #define _ASM_X86_NUMA_64_H #include <linux/nodemask.h> -#include <asm/apicdef.h> struct bootnode { u64 start; u64 end; }; -extern int compute_hash_shift(struct bootnode *nodes, int numblks, - int *nodeids); - #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) -extern void numa_init_array(void); extern int numa_off; -extern s16 apicid_to_node[MAX_LOCAL_APIC]; - extern unsigned long numa_free_all_bootmem(void); extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); @@ -31,11 +24,11 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, */ #define NODE_MIN_SIZE (4*1024*1024) -extern void __init init_cpu_to_node(void); -extern void __cpuinit numa_set_node(int cpu, int node); -extern void __cpuinit numa_clear_node(int cpu); -extern void __cpuinit numa_add_cpu(int cpu); -extern void __cpuinit numa_remove_cpu(int cpu); +extern nodemask_t numa_nodes_parsed __initdata; + +extern int __cpuinit numa_cpu_node(int cpu); +extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); +extern void __init numa_set_distance(int from, int to, int distance); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) @@ -43,11 +36,7 @@ extern void __cpuinit numa_remove_cpu(int cpu); void numa_emu_cmdline(char *); #endif /* CONFIG_NUMA_EMU */ #else -static inline void init_cpu_to_node(void) { } -static inline void numa_set_node(int cpu, int node) { } -static inline void numa_clear_node(int cpu) { } -static inline void numa_add_cpu(int cpu, int node) { } -static inline void numa_remove_cpu(int cpu) { } +static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } #endif #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 641988efe06..c5d3a5abbb9 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -6,7 +6,7 @@ #define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ -#ifdef CONFIG_OLPC_OPENFIRMWARE +#ifdef CONFIG_OLPC extern bool olpc_ofw_is_installed(void); @@ -26,19 +26,15 @@ extern void setup_olpc_ofw_pgd(void); /* check if OFW was detected during boot */ extern bool olpc_ofw_present(void); -#else /* !CONFIG_OLPC_OPENFIRMWARE */ - -static inline bool olpc_ofw_is_installed(void) { return false; } +#else /* !CONFIG_OLPC */ static inline void olpc_ofw_detect(void) { } static inline void setup_olpc_ofw_pgd(void) { } -static inline bool olpc_ofw_present(void) { return false; } - -#endif /* !CONFIG_OLPC_OPENFIRMWARE */ +#endif /* !CONFIG_OLPC */ -#ifdef CONFIG_OLPC_OPENFIRMWARE_DT +#ifdef CONFIG_OF_PROMTREE extern void olpc_dt_build_devicetree(void); #else static inline void olpc_dt_build_devicetree(void) { } -#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */ +#endif #endif /* _ASM_X86_OLPC_OFW_H */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 1df66211fd1..bce688d54c1 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -2,6 +2,7 @@ #define _ASM_X86_PAGE_DEFS_H #include <linux/const.h> +#include <linux/types.h> /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -45,11 +46,15 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +static inline phys_addr_t get_max_mapped(void) +{ + return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; +} + extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); -extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8); +extern void initmem_init(void); extern void free_initmem(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 45636cefa18..4c25ab48257 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -94,10 +94,6 @@ struct cpuinfo_x86 { int x86_cache_alignment; /* In bytes */ int x86_power; unsigned long loops_per_jiffy; -#ifdef CONFIG_SMP - /* cpus sharing the last level cache: */ - cpumask_var_t llc_shared_map; -#endif /* cpuid returned max cores value: */ u16 x86_max_cores; u16 apicid; diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index b4ec95f0751..971e0b46446 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -1 +1,69 @@ -/* dummy prom.h; here to make linux/of.h's #includes happy */ +/* + * Definitions for Device tree / OpenFirmware handling on X86 + * + * based on arch/powerpc/include/asm/prom.h which is + * Copyright (C) 1996-2005 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ASM_X86_PROM_H +#define _ASM_X86_PROM_H +#ifndef __ASSEMBLY__ + +#include <linux/of.h> +#include <linux/types.h> +#include <linux/pci.h> + +#include <asm/irq.h> +#include <asm/atomic.h> +#include <asm/setup.h> +#include <asm/irq_controller.h> + +#ifdef CONFIG_OF +extern int of_ioapic; +extern u64 initial_dtb; +extern void add_dtb(u64 data); +extern void x86_add_irq_domains(void); +void __cpuinit x86_of_pci_init(void); +void x86_dtb_init(void); + +static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev) +{ + return pdev ? pdev->dev.of_node : NULL; +} + +static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus) +{ + return pci_device_to_OF_node(bus->self); +} + +#else +static inline void add_dtb(u64 data) { } +static inline void x86_add_irq_domains(void) { } +static inline void x86_of_pci_init(void) { } +static inline void x86_dtb_init(void) { } +#define of_ioapic 0 +#endif + +extern char cmd_line[COMMAND_LINE_SIZE]; + +#define pci_address_to_pio pci_address_to_pio +unsigned long pci_address_to_pio(phys_addr_t addr); + +/** + * irq_dispose_mapping - Unmap an interrupt + * @virq: linux virq number of the interrupt to unmap + * + * FIXME: We really should implement proper virq handling like power, + * but that's going to be major surgery. + */ +static inline void irq_dispose_mapping(unsigned int virq) { } + +#define HAVE_ARCH_DEVTREE_FIXUPS + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index d1e41b0f9b6..df4cd32b4cc 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -37,26 +37,9 @@ #endif #ifdef __KERNEL__ - -#include <linux/list.h> -#include <linux/spinlock.h> -#include <linux/lockdep.h> #include <asm/asm.h> -struct rwsem_waiter; - -extern asmregparm struct rw_semaphore * - rwsem_down_read_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_down_write_failed(struct rw_semaphore *sem); -extern asmregparm struct rw_semaphore * - rwsem_wake(struct rw_semaphore *); -extern asmregparm struct rw_semaphore * - rwsem_downgrade_wake(struct rw_semaphore *sem); - /* - * the semaphore definition - * * The bias values and the counter type limits the number of * potential readers/writers to 32767 for 32 bits and 2147483647 * for 64 bits. @@ -74,43 +57,6 @@ extern asmregparm struct rw_semaphore * #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -typedef signed long rwsem_count_t; - -struct rw_semaphore { - rwsem_count_t count; - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - - -#define __RWSEM_INITIALIZER(name) \ -{ \ - RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \ -} - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) - /* * lock for reading */ @@ -133,7 +79,7 @@ static inline void __down_read(struct rw_semaphore *sem) */ static inline int __down_read_trylock(struct rw_semaphore *sem) { - rwsem_count_t result, tmp; + long result, tmp; asm volatile("# beginning __down_read_trylock\n\t" " mov %0,%1\n\t" "1:\n\t" @@ -155,7 +101,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { - rwsem_count_t tmp; + long tmp; asm volatile("# beginning down_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* adds 0xffff0001, returns the old value */ @@ -180,9 +126,8 @@ static inline void __down_write(struct rw_semaphore *sem) */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - rwsem_count_t ret = cmpxchg(&sem->count, - RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; return 0; @@ -193,7 +138,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) */ static inline void __up_read(struct rw_semaphore *sem) { - rwsem_count_t tmp; + long tmp; asm volatile("# beginning __up_read\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ @@ -211,7 +156,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { - rwsem_count_t tmp; + long tmp; asm volatile("# beginning __up_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 0xffff0001, returns the old value */ @@ -247,8 +192,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(rwsem_count_t delta, - struct rw_semaphore *sem) +static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) { asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" : "+m" (sem->count) @@ -258,10 +202,9 @@ static inline void rwsem_atomic_add(rwsem_count_t delta, /* * implement exchange and add functionality */ -static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, - struct rw_semaphore *sem) +static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) { - rwsem_count_t tmp = delta; + long tmp = delta; asm volatile(LOCK_PREFIX "xadd %0,%1" : "+r" (tmp), "+m" (sem->count) @@ -270,10 +213,5 @@ static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, return tmp + delta; } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* __KERNEL__ */ #endif /* _ASM_X86_RWSEM_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index c1bbfa89a0e..73b11bc0ae6 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -33,6 +33,8 @@ static inline bool cpu_has_ht_siblings(void) DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); +/* cpus sharing the last level cache: */ +DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU(u16, cpu_llc_id); DECLARE_PER_CPU(int, cpu_number); @@ -46,8 +48,16 @@ static inline struct cpumask *cpu_core_mask(int cpu) return per_cpu(cpu_core_map, cpu); } +static inline struct cpumask *cpu_llc_shared_mask(int cpu) +{ + return per_cpu(cpu_llc_shared_map, cpu); +} + DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) +DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); +#endif /* Static state in head.S used to set up a CPU */ extern unsigned long stack_start; /* Initial stack pointer address */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 33ecc3ea878..12569e691ce 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -98,8 +98,6 @@ do { \ */ #define HAVE_DISABLE_HLT #else -#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" -#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" /* frame pointer must be last for get_wchan */ #define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 21899cc31e5..910a7084f7f 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -47,21 +47,6 @@ #include <asm/mpspec.h> -#ifdef CONFIG_X86_32 - -/* Mappings between logical cpu number and node number */ -extern int cpu_to_node_map[]; - -/* Returns the number of the node containing CPU 'cpu' */ -static inline int __cpu_to_node(int cpu) -{ - return cpu_to_node_map[cpu]; -} -#define early_cpu_to_node __cpu_to_node -#define cpu_to_node __cpu_to_node - -#else /* CONFIG_X86_64 */ - /* Mappings between logical cpu number and node number */ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); @@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu) #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#endif /* CONFIG_X86_64 */ - /* Mappings between node number and cpus on that node. */ extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; @@ -155,7 +138,7 @@ extern unsigned long node_remap_size[]; .balance_interval = 1, \ } -#ifdef CONFIG_X86_64_ACPI_NUMA +#ifdef CONFIG_X86_64 extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) #endif diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index b766a5e8ba0..ffaf183c619 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -346,10 +346,13 @@ #define __NR_fanotify_init 338 #define __NR_fanotify_mark 339 #define __NR_prlimit64 340 +#define __NR_name_to_handle_at 341 +#define __NR_open_by_handle_at 342 +#define __NR_clock_adjtime 343 #ifdef __KERNEL__ -#define NR_syscalls 341 +#define NR_syscalls 344 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 363e9b8a715..5466bea670e 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -669,6 +669,12 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init) __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #define __NR_prlimit64 302 __SYSCALL(__NR_prlimit64, sys_prlimit64) +#define __NR_name_to_handle_at 303 +__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) +#define __NR_open_by_handle_at 304 +__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) +#define __NR_clock_adjtime 305 +__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index ce1d54c8a43..3e094af443c 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -176,7 +176,7 @@ struct bau_msg_payload { struct bau_msg_header { unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ - unsigned int base_dest_nodeid:15; /* nasid (pnode<<1) of */ + unsigned int base_dest_nodeid:15; /* nasid of the */ /* bits 20:6 */ /* first bit in uvhub map */ unsigned int command:8; /* message type */ /* bits 28:21 */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 64642ad019f..643ebf2e2ad 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -83,11 +83,13 @@ struct x86_init_paging { * boot cpu * @tsc_pre_init: platform function called before TSC init * @timer_init: initialize the platform timer (default PIT/HPET) + * @wallclock_init: init the wallclock device */ struct x86_init_timers { void (*setup_percpu_clockev)(void); void (*tsc_pre_init)(void); void (*timer_init)(void); + void (*wallclock_init)(void); }; /** diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a3c28ae4025..8508bfe5229 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set) static inline int HYPERVISOR_sched_op(int cmd, void *arg) { - return _hypercall2(int, sched_op_new, cmd, arg); + return _hypercall2(int, sched_op, cmd, arg); } static inline long @@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value) #endif static inline int -HYPERVISOR_suspend(unsigned long srec) +HYPERVISOR_suspend(unsigned long start_info_mfn) { - return _hypercall3(int, sched_op, SCHEDOP_shutdown, - SHUTDOWN_suspend, srec); + struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; + + /* + * For a PV guest the tools require that the start_info mfn be + * present in rdx/edx when the hypercall is made. Per the + * hypercall calling convention this is the third hypercall + * argument, which is start_info_mfn here. + */ + return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn); } static inline int diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index f25bdf238a3..c61934fbf22 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -29,8 +29,10 @@ typedef struct xpaddr { /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ #define INVALID_P2M_ENTRY (~0UL) -#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1)) +#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2)) #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) +#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) /* Maximum amount of memory we can handle in a domain in pages */ #define MAX_DOMAIN_PAGES \ @@ -41,12 +43,18 @@ extern unsigned int machine_to_phys_order; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern unsigned long set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); extern int m2p_add_override(unsigned long mfn, struct page *page); extern int m2p_remove_override(struct page *page); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); +#ifdef CONFIG_XEN_DEBUG_FS +extern int p2m_dump_show(struct seq_file *m, void *v); +#endif static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; @@ -57,7 +65,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) mfn = get_phys_to_machine(pfn); if (mfn != INVALID_P2M_ENTRY) - mfn &= ~FOREIGN_FRAME_BIT; + mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); return mfn; } @@ -73,25 +81,44 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn) static inline unsigned long mfn_to_pfn(unsigned long mfn) { unsigned long pfn; + int ret = 0; if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; + if (unlikely((mfn >> machine_to_phys_order) != 0)) { + pfn = ~0; + goto try_override; + } pfn = 0; /* * The array access can fail (e.g., device space beyond end of RAM). * In such cases it doesn't matter what we return (we return garbage), * but we must handle the fault without crashing! */ - __get_user(pfn, &machine_to_phys_mapping[mfn]); - - /* - * If this appears to be a foreign mfn (because the pfn - * doesn't map back to the mfn), then check the local override - * table to see if there's a better pfn to use. + ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); +try_override: + /* ret might be < 0 if there are no entries in the m2p for mfn */ + if (ret < 0) + pfn = ~0; + else if (get_phys_to_machine(pfn) != mfn) + /* + * If this appears to be a foreign mfn (because the pfn + * doesn't map back to the mfn), then check the local override + * table to see if there's a better pfn to use. + * + * m2p_find_override_pfn returns ~0 if it doesn't find anything. + */ + pfn = m2p_find_override_pfn(mfn, ~0); + + /* + * pfn is ~0 if there are no entries in the m2p for mfn or if the + * entry doesn't map back to the mfn and m2p_override doesn't have a + * valid entry for it. */ - if (get_phys_to_machine(pfn) != mfn) - pfn = m2p_find_override_pfn(mfn, pfn); + if (pfn == ~0 && + get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn)) + pfn = mfn; return pfn; } diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 2329b3eaf8d..aa862098916 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -27,16 +27,16 @@ static inline void __init xen_setup_pirqs(void) * its own functions. */ struct xen_pci_frontend_ops { - int (*enable_msi)(struct pci_dev *dev, int **vectors); + int (*enable_msi)(struct pci_dev *dev, int vectors[]); void (*disable_msi)(struct pci_dev *dev); - int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec); + int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec); void (*disable_msix)(struct pci_dev *dev); }; extern struct xen_pci_frontend_ops *xen_pci_frontend; static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, - int **vectors) + int vectors[]) { if (xen_pci_frontend && xen_pci_frontend->enable_msi) return xen_pci_frontend->enable_msi(dev, vectors); @@ -48,7 +48,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev) xen_pci_frontend->disable_msi(dev); } static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, - int **vectors, int nvec) + int vectors[], int nvec) { if (xen_pci_frontend && xen_pci_frontend->enable_msix) return xen_pci_frontend->enable_msix(dev, vectors, nvec); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 34244b2cd88..62445ba2f8a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -66,9 +66,9 @@ obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o +obj-$(CONFIG_SMP) += smpboot.o +obj-$(CONFIG_SMP) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o -obj-$(CONFIG_X86_64_SMP) += tsc_sync.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ @@ -109,6 +109,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_OF) += devicetree.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3e6e2d68f76..9a966c579af 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -595,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) nid = acpi_get_node(handle); if (nid == -1 || !node_online(nid)) return; -#ifdef CONFIG_X86_64 - apicid_to_node[physid] = nid; + set_apicid_to_node(physid, nid); numa_set_node(cpu, nid); -#else /* CONFIG_X86_32 */ - apicid_2_node[physid] = nid; - cpu_to_node_map[cpu] = nid; -#endif - #endif } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 0a99f7198bc..ed3c2e5b714 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -12,7 +12,7 @@ static u32 *flush_words; -struct pci_device_id amd_nb_misc_ids[] = { +const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, @@ -20,6 +20,11 @@ struct pci_device_id amd_nb_misc_ids[] = { }; EXPORT_SYMBOL(amd_nb_misc_ids); +static struct pci_device_id amd_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_LINK) }, + {} +}; + const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { { 0x00, 0x18, 0x20 }, { 0xff, 0x00, 0x20 }, @@ -31,7 +36,7 @@ struct amd_northbridge_info amd_northbridges; EXPORT_SYMBOL(amd_northbridges); static struct pci_dev *next_northbridge(struct pci_dev *dev, - struct pci_device_id *ids) + const struct pci_device_id *ids) { do { dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); @@ -45,7 +50,7 @@ int amd_cache_northbridges(void) { int i = 0; struct amd_northbridge *nb; - struct pci_dev *misc; + struct pci_dev *misc, *link; if (amd_nb_num()) return 0; @@ -64,10 +69,12 @@ int amd_cache_northbridges(void) amd_northbridges.nb = nb; amd_northbridges.num = i; - misc = NULL; + link = misc = NULL; for (i = 0; i != amd_nb_num(); i++) { node_to_amd_nb(i)->misc = misc = next_northbridge(misc, amd_nb_misc_ids); + node_to_amd_nb(i)->link = link = + next_northbridge(link, amd_nb_link_ids); } /* some CPU families (e.g. family 0x11) do not support GART */ @@ -85,6 +92,13 @@ int amd_cache_northbridges(void) boot_cpu_data.x86_mask >= 0x1)) amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + + /* L3 cache partitioning is supported on family 0x15 */ + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_PARTITIONING; + return 0; } EXPORT_SYMBOL_GPL(amd_cache_northbridges); @@ -93,8 +107,9 @@ EXPORT_SYMBOL_GPL(amd_cache_northbridges); they're useless anyways */ int __init early_is_amd_nb(u32 device) { - struct pci_device_id *id; + const struct pci_device_id *id; u32 vendor = device & 0xffff; + device >>= 16; for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) @@ -102,6 +117,65 @@ int __init early_is_amd_nb(u32 device) return 0; } +int amd_get_subcaches(int cpu) +{ + struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; + unsigned int mask; + int cuid = 0; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return 0; + + pci_read_config_dword(link, 0x1d4, &mask); + +#ifdef CONFIG_SMP + cuid = cpu_data(cpu).compute_unit_id; +#endif + return (mask >> (4 * cuid)) & 0xf; +} + +int amd_set_subcaches(int cpu, int mask) +{ + static unsigned int reset, ban; + struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); + unsigned int reg; + int cuid = 0; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) + return -EINVAL; + + /* if necessary, collect reset state of L3 partitioning and BAN mode */ + if (reset == 0) { + pci_read_config_dword(nb->link, 0x1d4, &reset); + pci_read_config_dword(nb->misc, 0x1b8, &ban); + ban &= 0x180000; + } + + /* deactivate BAN mode if any subcaches are to be disabled */ + if (mask != 0xf) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); + } + +#ifdef CONFIG_SMP + cuid = cpu_data(cpu).compute_unit_id; +#endif + mask <<= 4 * cuid; + mask |= (0xf ^ (1 << cuid)) << 26; + + pci_write_config_dword(nb->link, 0x1d4, mask); + + /* reset BAN mode if L3 partitioning returned to reset state */ + pci_read_config_dword(nb->link, 0x1d4, ®); + if (reg == reset) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + reg &= ~0x180000; + pci_write_config_dword(nb->misc, 0x1b8, reg | ban); + } + + return 0; +} + int amd_cache_gart(void) { int i; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 51d4e166306..1293c709ee8 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -508,64 +508,12 @@ static int apbt_next_event(unsigned long delta, return 0; } -/* - * APB timer clock is not in sync with pclk on Langwell, which translates to - * unreliable read value caused by sampling error. the error does not add up - * overtime and only happens when sampling a 0 as a 1 by mistake. so the time - * would go backwards. the following code is trying to prevent time traveling - * backwards. little bit paranoid. - */ static cycle_t apbt_read_clocksource(struct clocksource *cs) { - unsigned long t0, t1, t2; - static unsigned long last_read; - -bad_count: - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if (unlikely(t1 < t2)) { - pr_debug("APBT: read current count error %lx:%lx:%lx\n", - t1, t2, t2 - t1); - goto bad_count; - } - /* - * check against cached last read, makes sure time does not go back. - * it could be a normal rollover but we will do tripple check anyway - */ - if (unlikely(t2 > last_read)) { - /* check if we have a normal rollover */ - unsigned long raw_intr_status = - apbt_readl_reg(APBTMRS_RAW_INT_STATUS); - /* - * cs timer interrupt is masked but raw intr bit is set if - * rollover occurs. then we read EOI reg to clear it. - */ - if (raw_intr_status & (1 << phy_cs_timer_id)) { - apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); - goto out; - } - pr_debug("APB CS going back %lx:%lx:%lx ", - t2, last_read, t2 - last_read); -bad_count_x3: - pr_debug("triple check enforced\n"); - t0 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if ((t2 > t1) || (t1 > t0)) { - printk(KERN_ERR "Error: APB CS tripple check failed\n"); - goto bad_count_x3; - } - } -out: - last_read = t2; - return (cycle_t)~t2; + unsigned long current_count; + + current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE); + return (cycle_t)~current_count; } static int apbt_clocksource_register(void) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 5955a7800a9..7b1e8e10b89 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -13,7 +13,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/mmzone.h> #include <linux/pci_ids.h> #include <linux/pci.h> @@ -57,7 +57,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) static u32 __init allocate_aperture(void) { u32 aper_size; - void *p; + unsigned long addr; /* aper_size should <= 1G */ if (fallback_aper_order > 5) @@ -83,27 +83,26 @@ static u32 __init allocate_aperture(void) * so don't use 512M below as gart iommu, leave the space for kernel * code for safe */ - p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); + addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20); + if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) { + printk(KERN_ERR + "Cannot allocate aperture memory hole (%lx,%uK)\n", + addr, aper_size>>10); + return 0; + } + memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); /* * Kmemleak should not scan this block as it may not be mapped via the * kernel direct mapping. */ - kmemleak_ignore(p); - if (!p || __pa(p)+aper_size > 0xffffffff) { - printk(KERN_ERR - "Cannot allocate aperture memory hole (%p,%uK)\n", - p, aper_size>>10); - if (p) - free_bootmem(__pa(p), aper_size); - return 0; - } + kmemleak_ignore(phys_to_virt(addr)); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); - insert_aperture_resource((u32)__pa(p), aper_size); - register_nosave_region((u32)__pa(p) >> PAGE_SHIFT, - (u32)__pa(p+aper_size) >> PAGE_SHIFT); + aper_size >> 10, addr); + insert_aperture_resource((u32)addr, aper_size); + register_nosave_region(addr >> PAGE_SHIFT, + (addr+aper_size) >> PAGE_SHIFT); - return (u32)__pa(p); + return (u32)addr; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 76b96d74978..966673f4414 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -43,6 +43,7 @@ #include <asm/i8259.h> #include <asm/proto.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> #include <asm/idle.h> @@ -78,12 +79,21 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); #ifdef CONFIG_X86_32 + +/* + * On x86_32, the mapping between cpu and logical apicid may vary + * depending on apic in use. The following early percpu variable is + * used for the mapping. This is where the behaviors of x86_64 and 32 + * actually diverge. Let's keep it ugly for now. + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); + /* * Knob to control our willingness to enable the local APIC. * * +1=force-enable */ -static int force_enable_local_apic; +static int force_enable_local_apic __initdata; /* * APIC command line parameters */ @@ -153,7 +163,7 @@ early_param("nox2apic", setup_nox2apic); unsigned long mp_lapic_addr; int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; +static int disable_apic_timer __initdata; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); @@ -177,29 +187,8 @@ static struct resource lapic_resource = { static unsigned int calibration_result; -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt); -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt); -static void lapic_timer_broadcast(const struct cpumask *mask); static void apic_pm_activate(void); -/* - * The local apic timer can be used for any function which is CPU local. - */ -static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, -}; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); - static unsigned long apic_phys; /* @@ -238,7 +227,7 @@ static int modern_apic(void) * right after this call apic become NOOP driven * so apic->write/read doesn't do anything */ -void apic_disable(void) +static void __init apic_disable(void) { pr_info("APIC: switched to apic NOOP\n"); apic = &apic_noop; @@ -282,23 +271,6 @@ u64 native_apic_icr_read(void) return icr1 | ((u64)icr2 << 32); } -/** - * enable_NMI_through_LVT0 - enable NMI through local vector table 0 - */ -void __cpuinit enable_NMI_through_LVT0(void) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - - /* Level triggered for 82489DX (32bit mode) */ - if (!lapic_is_integrated()) - v |= APIC_LVT_LEVEL_TRIGGER; - - apic_write(APIC_LVT0, v); -} - #ifdef CONFIG_X86_32 /** * get_physical_broadcast - Get number of physical broadcast IDs @@ -508,6 +480,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask) #endif } + +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + /* * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. @@ -1209,7 +1198,7 @@ void __cpuinit setup_local_APIC(void) rdtscll(tsc); if (disable_apic) { - arch_disable_smp_support(); + disable_ioapic_support(); return; } @@ -1237,6 +1226,19 @@ void __cpuinit setup_local_APIC(void) */ apic->init_apic_ldr(); +#ifdef CONFIG_X86_32 + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches the + * actual value. + */ + i = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + WARN_ON(i != BAD_APICID && i != logical_smp_processor_id()); + /* always use the value from LDR */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + logical_smp_processor_id(); +#endif + /* * Set Task Priority to 'accept all'. We never change this * later on. @@ -1448,7 +1450,7 @@ int __init enable_IR(void) void __init enable_IR_x2apic(void) { unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries = NULL; + struct IO_APIC_route_entry **ioapic_entries; int ret, x2apic_enabled = 0; int dmar_table_init_ret; @@ -1537,7 +1539,7 @@ static int __init detect_init_APIC(void) } #else -static int apic_verify(void) +static int __init apic_verify(void) { u32 features, h, l; @@ -1562,7 +1564,7 @@ static int apic_verify(void) return 0; } -int apic_force_enable(void) +int __init apic_force_enable(unsigned long addr) { u32 h, l; @@ -1578,7 +1580,7 @@ int apic_force_enable(void) if (!(l & MSR_IA32_APICBASE_ENABLE)) { pr_info("Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + l |= MSR_IA32_APICBASE_ENABLE | addr; wrmsr(MSR_IA32_APICBASE, l, h); enabled_via_apicbase = 1; } @@ -1619,7 +1621,7 @@ static int __init detect_init_APIC(void) "you can enable it with \"lapic\"\n"); return -1; } - if (apic_force_enable()) + if (apic_force_enable(APIC_DEFAULT_PHYS_BASE)) return -1; } else { if (apic_verify()) @@ -1930,17 +1932,6 @@ void __cpuinit generic_processor_info(int apicid, int version) { int cpu; - /* - * Validate version - */ - if (version == 0x0) { - pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); - version = 0x10; - } - apic_version[apicid] = version; - if (num_processors >= nr_cpu_ids) { int max = nr_cpu_ids; int thiscpu = max + disabled_cpus; @@ -1954,22 +1945,34 @@ void __cpuinit generic_processor_info(int apicid, int version) } num_processors++; - cpu = cpumask_next_zero(-1, cpu_present_mask); - - if (version != apic_version[boot_cpu_physical_apicid]) - WARN_ONCE(1, - "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); - - physid_set(apicid, phys_cpu_present_map); if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. + * boot_cpu_init() already hold bit 0 in cpu_present_mask + * for BSP. */ cpu = 0; + } else + cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* + * Validate version + */ + if (version == 0x0) { + pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); + version = 0x10; } + apic_version[apicid] = version; + + if (version != apic_version[boot_cpu_physical_apicid]) { + pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + apic_version[boot_cpu_physical_apicid], cpu, version); + } + + physid_set(apicid, phys_cpu_present_map); if (apicid > max_physical_apicid) max_physical_apicid = apicid; @@ -1977,7 +1980,10 @@ void __cpuinit generic_processor_info(int apicid, int version) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; #endif - +#ifdef CONFIG_X86_32 + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + apic->x86_32_early_logical_apicid(cpu); +#endif set_cpu_possible(cpu, true); set_cpu_present(cpu, true); } @@ -1998,10 +2004,14 @@ void default_init_apic_ldr(void) } #ifdef CONFIG_X86_32 -int default_apicid_to_node(int logical_apicid) +int default_x86_32_numa_cpu_node(int cpu) { -#ifdef CONFIG_SMP - return apicid_2_node[hard_smp_processor_id()]; +#ifdef CONFIG_NUMA + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; #else return 0; #endif diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 09d3b17ce0c..5652d31fe10 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -185,8 +185,6 @@ struct apic apic_flat = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, @@ -337,8 +335,6 @@ struct apic apic_physflat = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index e31b9ffe25f..f1baa2dc087 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void) return 0; } -static int noop_cpu_to_logical_apicid(int cpu) -{ - return 0; -} - static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; @@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -int noop_apicid_to_node(int logical_apicid) -{ - /* we're always on node 0 */ - return 0; -} - static u32 noop_apic_read(u32 reg) { WARN_ON_ONCE((cpu_has_apic && !disable_apic)); @@ -130,6 +119,14 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(cpu_has_apic && !disable_apic); } +#ifdef CONFIG_X86_32 +static int noop_x86_32_numa_cpu_node(int cpu) +{ + /* we're always on node 0 */ + return 0; +} +#endif + struct apic apic_noop = { .name = "noop", .probe = noop_probe, @@ -153,9 +150,7 @@ struct apic apic_noop = { .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = noop_apicid_to_node, - .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, @@ -197,4 +192,9 @@ struct apic apic_noop = { .icr_write = noop_apic_icr_write, .wait_icr_idle = noop_apic_wait_icr_idle, .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, + +#ifdef CONFIG_X86_32 + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, + .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node, +#endif }; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index cb804c5091b..541a2e43165 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit) return 1; } +static int bigsmp_early_logical_apicid(int cpu) +{ + /* on bigsmp, logical apicid is the same as physical */ + return early_per_cpu(x86_cpu_to_apicid, cpu); +} + static inline unsigned long calculate_ldr(int cpu) { unsigned long val, id; @@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void) nr_ioapics); } -static int bigsmp_apicid_to_node(int logical_apicid) -{ - return apicid_2_node[hard_smp_processor_id()]; -} - static int bigsmp_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids) @@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -/* Mapping from cpu number to logical apicid */ -static inline int bigsmp_cpu_to_logical_apicid(int cpu) -{ - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_physical_id(cpu); -} - static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ @@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid) /* As we are using single CPU as destination, pick only one CPU here */ static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) { - return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); + int cpu = cpumask_first(cpumask); + + if (cpu < nr_cpu_ids) + return cpu_physical_id(cpu); + return BAD_APICID; } static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, @@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, */ for_each_cpu_and(cpu, cpumask, andmask) { if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; + return cpu_physical_id(cpu); } - return bigsmp_cpu_to_logical_apicid(cpu); + return BAD_APICID; } static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) @@ -219,8 +216,6 @@ struct apic apic_bigsmp = { .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = bigsmp_apicid_to_node, - .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, @@ -256,4 +251,7 @@ struct apic apic_bigsmp = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, + .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 8593582d802..3e9de4854c5 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } +static int es7000_early_logical_apicid(int cpu) +{ + /* on es7000, logical apicid is the same as physical */ + return early_per_cpu(x86_bios_cpu_apicid, cpu); +} + static unsigned long calculate_ldr(int cpu) { unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); @@ -504,12 +510,11 @@ static void es7000_setup_apic_routing(void) nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); } -static int es7000_apicid_to_node(int logical_apicid) +static int es7000_numa_cpu_node(int cpu) { return 0; } - static int es7000_cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) @@ -528,18 +533,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) ++cpu_id; } -/* Mapping from cpu number to logical apicid */ -static int es7000_cpu_to_logical_apicid(int cpu) -{ -#ifdef CONFIG_SMP - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_2_logical_apicid[cpu]; -#else - return logical_smp_processor_id(); -#endif -} - static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ @@ -561,7 +554,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) * The cpus in the mask must all be on the apic cluster. */ for_each_cpu(cpu, cpumask) { - int new_apicid = es7000_cpu_to_logical_apicid(cpu); + int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { WARN(1, "Not a valid mask!"); @@ -578,7 +571,7 @@ static unsigned int es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask) { - int apicid = es7000_cpu_to_logical_apicid(0); + int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) @@ -655,8 +648,6 @@ struct apic __refdata apic_es7000_cluster = { .ioapic_phys_id_map = es7000_ioapic_phys_id_map, .setup_apic_routing = es7000_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = es7000_apicid_to_node, - .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, .cpu_present_to_apicid = es7000_cpu_present_to_apicid, .apicid_to_cpu_present = es7000_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -695,6 +686,9 @@ struct apic __refdata apic_es7000_cluster = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = es7000_early_logical_apicid, + .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; struct apic __refdata apic_es7000 = { @@ -720,8 +714,6 @@ struct apic __refdata apic_es7000 = { .ioapic_phys_id_map = es7000_ioapic_phys_id_map, .setup_apic_routing = es7000_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = es7000_apicid_to_node, - .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, .cpu_present_to_apicid = es7000_cpu_present_to_apicid, .apicid_to_cpu_present = es7000_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -758,4 +750,7 @@ struct apic __refdata apic_es7000 = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = es7000_early_logical_apicid, + .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 79fd43ca6f9..c4e557a1ebb 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -83,7 +83,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, arch_spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); - dump_stack(); arch_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return NOTIFY_STOP; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ca9e2a3545a..4b5ebd26f56 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -108,7 +108,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int skip_ioapic_setup; -void arch_disable_smp_support(void) +/** + * disable_ioapic_support() - disables ioapic support at runtime + */ +void disable_ioapic_support(void) { #ifdef CONFIG_PCI noioapicquirk = 1; @@ -120,11 +123,14 @@ void arch_disable_smp_support(void) static int __init parse_noapic(char *str) { /* disable IO-APIC */ - arch_disable_smp_support(); + disable_ioapic_support(); return 0; } early_param("noapic", parse_noapic); +static int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr); + /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) { @@ -181,7 +187,7 @@ int __init arch_early_irq_init(void) irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); for (i = 0; i < count; i++) { - set_irq_chip_data(i, &cfg[i]); + irq_set_chip_data(i, &cfg[i]); zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* @@ -200,7 +206,7 @@ int __init arch_early_irq_init(void) #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg *irq_cfg(unsigned int irq) { - return get_irq_chip_data(irq); + return irq_get_chip_data(irq); } static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) @@ -226,7 +232,7 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { if (!cfg) return; - set_irq_chip_data(at, NULL); + irq_set_chip_data(at, NULL); free_cpumask_var(cfg->domain); free_cpumask_var(cfg->old_domain); kfree(cfg); @@ -256,14 +262,14 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) if (res < 0) { if (res != -EEXIST) return NULL; - cfg = get_irq_chip_data(at); + cfg = irq_get_chip_data(at); if (cfg) return cfg; } cfg = alloc_irq_cfg(at, node); if (cfg) - set_irq_chip_data(at, cfg); + irq_set_chip_data(at, cfg); else irq_free_desc(at); return cfg; @@ -818,7 +824,7 @@ static int EISA_ELCR(unsigned int irq) #define default_MCA_trigger(idx) (1) #define default_MCA_polarity(idx) default_ISA_polarity(idx) -static int MPBIOS_polarity(int idx) +static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; int polarity; @@ -860,7 +866,7 @@ static int MPBIOS_polarity(int idx) return polarity; } -static int MPBIOS_trigger(int idx) +static int irq_trigger(int idx) { int bus = mp_irqs[idx].srcbus; int trigger; @@ -932,16 +938,6 @@ static int MPBIOS_trigger(int idx) return trigger; } -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - static int pin_2_irq(int idx, int apic, int pin) { int irq; @@ -1189,7 +1185,7 @@ void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (!cfg) continue; /* @@ -1220,10 +1216,6 @@ void __setup_vector_irq(int cpu) static struct irq_chip ioapic_chip; static struct irq_chip ir_ioapic_chip; -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { @@ -1248,35 +1240,31 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(unsigned int irq, unsigned long trigger) +static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, + unsigned long trigger) { + struct irq_chip *chip = &ioapic_chip; + irq_flow_handler_t hdl; + bool fasteoi; if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) + trigger == IOAPIC_LEVEL) { irq_set_status_flags(irq, IRQ_LEVEL); - else + fasteoi = true; + } else { irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(cfg)) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (trigger) - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_edge_irq, "edge"); - return; + chip = &ir_ioapic_chip; + fasteoi = trigger != 0; } - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + irq_set_chip_and_handler_name(irq, chip, hdl, + fasteoi ? "fasteoi" : "edge"); } static int setup_ioapic_entry(int apic_id, int irq, @@ -1374,7 +1362,7 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, return; } - ioapic_register_intr(irq, trigger); + ioapic_register_intr(irq, cfg, trigger); if (irq < legacy_pic->nr_legacy_irqs) legacy_pic->mask(irq); @@ -1385,33 +1373,26 @@ static struct { DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); } mp_ioapic_routing[MAX_IO_APICS]; -static void __init setup_IO_APIC_irqs(void) +static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) { - int apic_id, pin, idx, irq, notcon = 0; - int node = cpu_to_node(0); - struct irq_cfg *cfg; + if (idx != -1) + return false; - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", + mp_ioapics[apic_id].apicid, pin); + return true; +} + +static void __init __io_apic_setup_irqs(unsigned int apic_id) +{ + int idx, node = cpu_to_node(0); + struct io_apic_irq_attr attr; + unsigned int pin, irq; - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); + if (io_apic_pin_not_connected(idx, apic_id, pin)) continue; - } - if (notcon) { - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } irq = pin_2_irq(idx, apic_id, pin); @@ -1423,25 +1404,24 @@ static void __init setup_IO_APIC_irqs(void) * installed and if it returns 1: */ if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) + apic->multi_timer_check(apic_id, irq)) continue; - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - continue; + set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), + irq_polarity(idx)); - add_pin_to_irq_node(cfg, node, apic_id, pin); - /* - * don't mark it in pin_programmed, so later acpi could - * set it correctly when irq < 16 - */ - setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx), - irq_polarity(idx)); + io_apic_setup_irq_pin(irq, node, &attr); } +} - if (notcon) - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); +static void __init setup_IO_APIC_irqs(void) +{ + unsigned int apic_id; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic_id = 0; apic_id < nr_ioapics; apic_id++) + __io_apic_setup_irqs(apic_id); } /* @@ -1452,7 +1432,7 @@ static void __init setup_IO_APIC_irqs(void) void setup_IO_APIC_irq_extra(u32 gsi) { int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); - struct irq_cfg *cfg; + struct io_apic_irq_attr attr; /* * Convert 'gsi' to 'ioapic.pin'. @@ -1472,21 +1452,10 @@ void setup_IO_APIC_irq_extra(u32 gsi) if (apic_id == 0 || irq < NR_IRQS_LEGACY) return; - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - return; - - add_pin_to_irq_node(cfg, node, apic_id, pin); - - if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[apic_id].apicid, pin); - return; - } - set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), + irq_polarity(idx)); - setup_ioapic_irq(apic_id, pin, irq, cfg, - irq_trigger(idx), irq_polarity(idx)); + io_apic_setup_irq_pin_once(irq, node, &attr); } /* @@ -1518,7 +1487,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, * The timer IRQ doesn't have to know that behind the * scene we may have a 8259A-master in AEOI mode ... */ - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); /* * Add it to the IO-APIC irq-routing table: @@ -1625,7 +1595,7 @@ __apicdebuginit(void) print_IO_APIC(void) for_each_active_irq(irq) { struct irq_pin_list *entry; - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -2391,7 +2361,7 @@ static void irq_complete_move(struct irq_cfg *cfg) void irq_force_complete_move(int irq) { - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); if (!cfg) return; @@ -2405,7 +2375,7 @@ static inline void irq_complete_move(struct irq_cfg *cfg) { } static void ack_apic_edge(struct irq_data *data) { irq_complete_move(data->chip_data); - move_native_irq(data->irq); + irq_move_irq(data); ack_APIC_irq(); } @@ -2462,7 +2432,7 @@ static void ack_apic_level(struct irq_data *data) irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + if (unlikely(irqd_is_setaffinity_pending(data))) { do_unmask_irq = 1; mask_ioapic(cfg); } @@ -2551,7 +2521,7 @@ static void ack_apic_level(struct irq_data *data) * and you can go talk to the chipset vendor about it. */ if (!io_apic_level_ack_pending(cfg)) - move_masked_irq(irq); + irq_move_masked_irq(data); unmask_ioapic(cfg); } } @@ -2614,7 +2584,7 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for_each_active_irq(irq) { - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, @@ -2625,7 +2595,7 @@ static inline void init_IO_APIC_traps(void) legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ - set_irq_chip(irq, &no_irq_chip); + irq_set_chip(irq, &no_irq_chip); } } } @@ -2665,7 +2635,7 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { irq_clear_status_flags(irq, IRQ_LEVEL); - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -2749,7 +2719,7 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = get_irq_chip_data(0); + struct irq_cfg *cfg = irq_get_chip_data(0); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -3060,7 +3030,7 @@ unsigned int create_irq_nr(unsigned int from, int node) raw_spin_unlock_irqrestore(&vector_lock, flags); if (ret) { - set_irq_chip_data(irq, cfg); + irq_set_chip_data(irq, cfg); irq_clear_status_flags(irq, IRQ_NOREQUEST); } else { free_irq_at(irq, cfg); @@ -3085,7 +3055,7 @@ int create_irq(void) void destroy_irq(unsigned int irq) { - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); unsigned long flags; irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); @@ -3119,7 +3089,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(cfg)) { struct irte irte; int ir_index; u16 sub_handle; @@ -3291,6 +3261,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { + struct irq_chip *chip = &msi_chip; struct msi_msg msg; int ret; @@ -3298,14 +3269,15 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) if (ret < 0) return ret; - set_irq_msi(irq, msidesc); + irq_set_msi_desc(irq, msidesc); write_msi_msg(irq, &msg); - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(irq_get_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); - } else - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + chip = &msi_ir_chip; + } + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); @@ -3423,8 +3395,8 @@ int arch_setup_dmar_msi(unsigned int irq) if (ret < 0) return ret; dmar_msi_write(irq, &msg); - set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); return 0; } #endif @@ -3482,6 +3454,7 @@ static struct irq_chip hpet_msi_type = { int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { + struct irq_chip *chip = &hpet_msi_type; struct msi_msg msg; int ret; @@ -3501,15 +3474,12 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) if (ret < 0) return ret; - hpet_msi_write(get_irq_data(irq), &msg); + hpet_msi_write(irq_get_handler_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (irq_remapped(get_irq_chip_data(irq))) - set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, - handle_edge_irq, "edge"); - else - set_irq_chip_and_handler_name(irq, &hpet_msi_type, - handle_edge_irq, "edge"); + if (irq_remapped(irq_get_chip_data(irq))) + chip = &ir_hpet_msi_type; + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; } #endif @@ -3596,7 +3566,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) write_ht_irq_msg(irq, &msg); - set_irq_chip_and_handler_name(irq, &ht_irq_chip, + irq_set_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); @@ -3605,7 +3575,40 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -int __init io_apic_get_redir_entries (int ioapic) +int +io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) +{ + struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); + int ret; + + if (!cfg) + return -EINVAL; + ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); + if (!ret) + setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg, + attr->trigger, attr->polarity); + return ret; +} + +static int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr) +{ + unsigned int id = attr->ioapic, pin = attr->ioapic_pin; + int ret; + + /* Avoid redundant programming */ + if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[id].apicid, pin); + return 0; + } + ret = io_apic_setup_irq_pin(irq, node, attr); + if (!ret) + set_bit(pin, mp_ioapic_routing[id].pin_programmed); + return ret; +} + +static int __init io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3659,96 +3662,24 @@ int __init arch_probe_nr_irqs(void) } #endif -static int __io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { - struct irq_cfg *cfg; int node; - int ioapic, pin; - int trigger, polarity; - ioapic = irq_attr->ioapic; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); + irq_attr->ioapic); return -EINVAL; } - if (dev) - node = dev_to_node(dev); - else - node = cpu_to_node(0); - - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - return 0; - - pin = irq_attr->ioapic_pin; - trigger = irq_attr->trigger; - polarity = irq_attr->polarity; + node = dev ? dev_to_node(dev) : cpu_to_node(0); - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= legacy_pic->nr_legacy_irqs) { - if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) { - printk(KERN_INFO "can not add pin %d for irq %d\n", - pin, irq); - return 0; - } - } - - setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity); - - return 0; + return io_apic_setup_irq_pin_once(irq, node, irq_attr); } -int io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) -{ - int ioapic, pin; - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - ioapic = irq_attr->ioapic; - pin = irq_attr->ioapic_pin; - if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[ioapic].apicid, pin); - return 0; - } - set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); - - return __io_apic_set_pci_routing(dev, irq, irq_attr); -} - -u8 __init io_apic_unique_id(u8 id) -{ #ifdef CONFIG_X86_32 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return io_apic_get_unique_id(nr_ioapics, id); - else - return id; -#else - int i; - DECLARE_BITMAP(used, 256); - - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -#endif -} - -#ifdef CONFIG_X86_32 -int __init io_apic_get_unique_id(int ioapic, int apic_id) +static int __init io_apic_get_unique_id(int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; static physid_mask_t apic_id_map = PHYSID_MASK_NONE; @@ -3821,9 +3752,33 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } + +static u8 __init io_apic_unique_id(u8 id) +{ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +} +#else +static u8 __init io_apic_unique_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} #endif -int __init io_apic_get_version(int ioapic) +static int __init io_apic_get_version(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3868,8 +3823,8 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; - struct irq_desc *desc; const struct cpumask *mask; + struct irq_data *idata; if (skip_ioapic_setup == 1) return; @@ -3884,21 +3839,20 @@ void __init setup_ioapic_dest(void) if ((ioapic > 0) && (irq > 16)) continue; - desc = irq_to_desc(irq); + idata = irq_get_irq_data(irq); /* * Honour affinities which have been set in early boot */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->irq_data.affinity; + if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) + mask = idata->affinity; else mask = apic->target_cpus(); if (intr_remapping_enabled) - ir_ioapic_set_affinity(&desc->irq_data, mask, false); + ir_ioapic_set_affinity(idata, mask, false); else - ioapic_set_affinity(&desc->irq_data, mask, false); + ioapic_set_affinity(idata, mask, false); } } @@ -4026,7 +3980,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) return gsi - mp_gsi_routing[ioapic].gsi_base; } -static int bad_ioapic(unsigned long address) +static __init int bad_ioapic(unsigned long address) { if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " @@ -4086,20 +4040,16 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) /* Enable IOAPIC early just for system timer */ void __init pre_init_apic_IRQ0(void) { - struct irq_cfg *cfg; + struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); #endif - /* Make sure the irq descriptor is set up */ - cfg = alloc_irq_and_cfg_at(0, 0); - setup_local_APIC(); - add_pin_to_irq_node(cfg, 0, 0, 0); - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - - setup_ioapic_irq(0, 0, 0, cfg, 0, 0); + io_apic_setup_irq_pin(0, 0, &attr); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 08385e090a6..cce91bf2667 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, local_irq_restore(flags); } +#ifdef CONFIG_X86_32 + void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector) { @@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, local_irq_save(flags); for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( - apic->cpu_to_logical_apicid(query_cpu), vector, - apic->dest_logical); + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); local_irq_restore(flags); } @@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, if (query_cpu == this_cpu) continue; __default_send_IPI_dest_field( - apic->cpu_to_logical_apicid(query_cpu), vector, - apic->dest_logical); + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); } local_irq_restore(flags); } -#ifdef CONFIG_X86_32 - /* * This is only used on smaller machines. */ diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 960f26ab5c9..6273eee5134 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -373,13 +373,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask return physids_promote(0xFUL, retmap); } -static inline int numaq_cpu_to_logical_apicid(int cpu) -{ - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_2_logical_apicid[cpu]; -} - /* * Supporting over 60 cpus on NUMA-Q requires a locality-dependent * cpu to APIC ID relation to properly interact with the intelligent @@ -398,6 +391,15 @@ static inline int numaq_apicid_to_node(int logical_apicid) return logical_apicid >> 4; } +static int numaq_numa_cpu_node(int cpu) +{ + int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + + if (logical_apicid != BAD_APICID) + return numaq_apicid_to_node(logical_apicid); + return NUMA_NO_NODE; +} + static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) { int node = numaq_apicid_to_node(logical_apicid); @@ -508,8 +510,6 @@ struct apic __refdata apic_numaq = { .ioapic_phys_id_map = numaq_ioapic_phys_id_map, .setup_apic_routing = numaq_setup_apic_routing, .multi_timer_check = numaq_multi_timer_check, - .apicid_to_node = numaq_apicid_to_node, - .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid, .cpu_present_to_apicid = numaq_cpu_present_to_apicid, .apicid_to_cpu_present = numaq_apicid_to_cpu_present, .setup_portio_remap = numaq_setup_portio_remap, @@ -547,4 +547,7 @@ struct apic __refdata apic_numaq = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, + .x86_32_numa_cpu_node = numaq_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 99d2fe01608..fc84c7b6110 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -77,6 +77,11 @@ void __init default_setup_apic_routing(void) apic->setup_apic_routing(); } +static int default_x86_32_early_logical_apicid(int cpu) +{ + return 1 << cpu; +} + static void setup_apic_flat_routing(void) { #ifdef CONFIG_X86_IO_APIC @@ -130,8 +135,6 @@ struct apic apic_default = { .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, - .apicid_to_node = default_apicid_to_node, - .cpu_to_logical_apicid = default_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, @@ -167,6 +170,9 @@ struct apic apic_default = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, + .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; extern struct apic apic_numaq; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9b419263d90..e4b8059b414 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit) return 1; } -static void summit_init_apic_ldr(void) +static int summit_early_logical_apicid(int cpu) { - unsigned long val, id; int count = 0; - u8 my_id = (u8)hard_smp_processor_id(); + u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu); u8 my_cluster = APIC_CLUSTER(my_id); #ifdef CONFIG_SMP u8 lid; @@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void) /* Create logical APIC IDs by counting CPUs already in cluster. */ for (count = 0, i = nr_cpu_ids; --i >= 0; ) { - lid = cpu_2_logical_apicid[i]; + lid = early_per_cpu(x86_cpu_to_logical_apicid, i); if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) ++count; } @@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void) /* We only have a 4 wide bitmap in cluster mode. If a deranged * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); - id = my_cluster | (1UL << count); + return my_cluster | (1UL << count); +} + +static void summit_init_apic_ldr(void) +{ + int cpu = smp_processor_id(); + unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + unsigned long val; + apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; val |= SET_APIC_LOGICAL_ID(id); @@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void) nr_ioapics); } -static int summit_apicid_to_node(int logical_apicid) -{ -#ifdef CONFIG_SMP - return apicid_2_node[hard_smp_processor_id()]; -#else - return 0; -#endif -} - -/* Mapping from cpu number to logical apicid */ -static inline int summit_cpu_to_logical_apicid(int cpu) -{ -#ifdef CONFIG_SMP - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_2_logical_apicid[cpu]; -#else - return logical_smp_processor_id(); -#endif -} - static int summit_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids) @@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) * The cpus in the mask must all be on the apic cluster. */ for_each_cpu(cpu, cpumask) { - int new_apicid = summit_cpu_to_logical_apicid(cpu); + int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { printk("%s: Not a valid mask!\n", __func__); @@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask) { - int apicid = summit_cpu_to_logical_apicid(0); + int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) @@ -528,8 +514,6 @@ struct apic apic_summit = { .ioapic_phys_id_map = summit_ioapic_phys_id_map, .setup_apic_routing = summit_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = summit_apicid_to_node, - .cpu_to_logical_apicid = summit_cpu_to_logical_apicid, .cpu_present_to_apicid = summit_cpu_present_to_apicid, .apicid_to_cpu_present = summit_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -565,4 +549,7 @@ struct apic apic_summit = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = summit_early_logical_apicid, + .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cf69c59f491..90949bbd566 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -206,8 +206,6 @@ struct apic apic_x2apic_cluster = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 8972f38c5ce..c7e6d6645bf 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -195,8 +195,6 @@ struct apic apic_x2apic_phys = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index bd16b58b885..3c289281394 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -338,8 +338,6 @@ struct apic __refdata apic_x2apic_uv_x = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cfa82c899f4..4f13fafc526 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -1,5 +1,70 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ +#define COMPILE_OFFSETS + +#include <linux/crypto.h> +#include <linux/sched.h> +#include <linux/stddef.h> +#include <linux/hardirq.h> +#include <linux/suspend.h> +#include <linux/kbuild.h> +#include <asm/processor.h> +#include <asm/thread_info.h> +#include <asm/sigframe.h> +#include <asm/bootparam.h> +#include <asm/suspend.h> + +#ifdef CONFIG_XEN +#include <xen/interface/xen.h> +#endif + #ifdef CONFIG_X86_32 # include "asm-offsets_32.c" #else # include "asm-offsets_64.c" #endif + +void common(void) { + BLANK(); + OFFSET(TI_flags, thread_info, flags); + OFFSET(TI_status, thread_info, status); + OFFSET(TI_addr_limit, thread_info, addr_limit); + OFFSET(TI_preempt_count, thread_info, preempt_count); + + BLANK(); + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + + BLANK(); + OFFSET(pbe_address, pbe, address); + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); + OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); +#endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif + + BLANK(); + OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_loadflags, boot_params, hdr.loadflags); + OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); + OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); +} diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a4088dda37..c29d631af6f 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,26 +1,4 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed - * to extract and format the required data. - */ - -#include <linux/crypto.h> -#include <linux/sched.h> -#include <linux/signal.h> -#include <linux/personality.h> -#include <linux/suspend.h> -#include <linux/kbuild.h> #include <asm/ucontext.h> -#include <asm/sigframe.h> -#include <asm/pgtable.h> -#include <asm/fixmap.h> -#include <asm/processor.h> -#include <asm/thread_info.h> -#include <asm/bootparam.h> -#include <asm/elf.h> -#include <asm/suspend.h> - -#include <xen/interface/xen.h> #include <linux/lguest.h> #include "../../../drivers/lguest/lg.h" @@ -51,21 +29,10 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_task, thread_info, task); - OFFSET(TI_exec_domain, thread_info, exec_domain); - OFFSET(TI_flags, thread_info, flags); - OFFSET(TI_status, thread_info, status); - OFFSET(TI_preempt_count, thread_info, preempt_count); - OFFSET(TI_addr_limit, thread_info, addr_limit); - OFFSET(TI_restart_block, thread_info, restart_block); OFFSET(TI_sysenter_return, thread_info, sysenter_return); OFFSET(TI_cpu, thread_info, cpu); BLANK(); - OFFSET(GDS_size, desc_ptr, size); - OFFSET(GDS_address, desc_ptr, address); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); @@ -85,42 +52,13 @@ void foo(void) OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); - OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); - OFFSET(pbe_address, pbe, address); - OFFSET(pbe_orig_address, pbe, orig_address); - OFFSET(pbe_next, pbe, next); - /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - sizeof(struct tss_struct)); - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); - DEFINE(THREAD_SIZE_asm, THREAD_SIZE); - - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); - -#ifdef CONFIG_PARAVIRT - BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); - OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); - OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); - OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); - OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); - OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); -#endif - -#ifdef CONFIG_XEN - BLANK(); - OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); - OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); -#endif - #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); @@ -139,11 +77,4 @@ void foo(void) OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); #endif - - BLANK(); - OFFSET(BP_scratch, boot_params, scratch); - OFFSET(BP_loadflags, boot_params, hdr.loadflags); - OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); - OFFSET(BP_version, boot_params, hdr.version); - OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 4a6aeedcd96..e72a1194af2 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,27 +1,4 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed to extract - * and format the required data. - */ -#define COMPILE_OFFSETS - -#include <linux/crypto.h> -#include <linux/sched.h> -#include <linux/stddef.h> -#include <linux/errno.h> -#include <linux/hardirq.h> -#include <linux/suspend.h> -#include <linux/kbuild.h> -#include <asm/processor.h> -#include <asm/segment.h> -#include <asm/thread_info.h> #include <asm/ia32.h> -#include <asm/bootparam.h> -#include <asm/suspend.h> - -#include <xen/interface/xen.h> - -#include <asm/sigframe.h> #define __NO_STUBS 1 #undef __SYSCALL @@ -33,41 +10,19 @@ static char syscalls[] = { int main(void) { -#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) - ENTRY(state); - ENTRY(flags); - ENTRY(pid); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry)) - ENTRY(flags); - ENTRY(addr_limit); - ENTRY(preempt_count); - ENTRY(status); -#ifdef CONFIG_IA32_EMULATION - ENTRY(sysenter_return); -#endif - BLANK(); -#undef ENTRY #ifdef CONFIG_PARAVIRT - BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); - OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); - OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); - OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); - OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); - OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); + BLANK(); #endif - #ifdef CONFIG_IA32_EMULATION -#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + BLANK(); + +#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry) ENTRY(ax); ENTRY(bx); ENTRY(cx); @@ -79,15 +34,12 @@ int main(void) ENTRY(ip); BLANK(); #undef ENTRY - DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); + + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); BLANK(); #endif - DEFINE(pbe_address, offsetof(struct pbe, address)); - DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); - DEFINE(pbe_next, offsetof(struct pbe, next)); - BLANK(); -#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) + +#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(bx); ENTRY(cx); @@ -107,7 +59,8 @@ int main(void) ENTRY(flags); BLANK(); #undef ENTRY -#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) + +#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry) ENTRY(cr0); ENTRY(cr2); ENTRY(cr3); @@ -115,26 +68,11 @@ int main(void) ENTRY(cr8); BLANK(); #undef ENTRY - DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist)); - BLANK(); - DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); - BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); - OFFSET(BP_scratch, boot_params, scratch); - OFFSET(BP_loadflags, boot_params, hdr.loadflags); - OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); - OFFSET(BP_version, boot_params, hdr.version); - OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); - BLANK(); - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); -#ifdef CONFIG_XEN - BLANK(); - OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); - OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); -#undef ENTRY -#endif + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + return 0; } diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 13a38917951..452932d3473 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -106,8 +106,8 @@ void __init setup_bios_corruption_check(void) addr += size; } - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", - num_scan_areas); + if (num_scan_areas) + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas); } @@ -143,12 +143,12 @@ static void check_corruption(struct work_struct *dummy) { check_for_bios_corruption(); schedule_delayed_work(&bios_check_work, - round_jiffies_relative(corruption_check_period*HZ)); + round_jiffies_relative(corruption_check_period*HZ)); } static int start_periodic_check_for_corruption(void) { - if (!memory_corruption_check || corruption_check_period == 0) + if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0) return 0; printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7c7bedb83c5..f771ab6b49e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) } #endif -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA +/* + * To workaround broken NUMA config. Read the comment in + * srat_detect_node(). + */ static int __cpuinit nearby_node(int apicid) { int i, node; for (i = apicid - 1; i >= 0; i--) { - node = apicid_to_node[i]; + node = __apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - node = apicid_to_node[i]; + node = __apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } @@ -261,7 +265,7 @@ static int __cpuinit nearby_node(int apicid) #ifdef CONFIG_X86_HT static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) { - u32 nodes; + u32 nodes, cores_per_cu = 1; u8 node_id; int cpu = smp_processor_id(); @@ -276,6 +280,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) /* get compute unit information */ smp_num_siblings = ((ebx >> 8) & 3) + 1; c->compute_unit_id = ebx & 0xff; + cores_per_cu += ((ebx >> 8) & 3); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; @@ -288,15 +293,18 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) /* fixup multi-node processor information */ if (nodes > 1) { u32 cores_per_node; + u32 cus_per_node; set_cpu_cap(c, X86_FEATURE_AMD_DCM); cores_per_node = c->x86_max_cores / nodes; + cus_per_node = cores_per_node / cores_per_cu; /* store NodeID, use llc_shared_map to store sibling info */ per_cpu(cpu_llc_id, cpu) = node_id; - /* core id to be in range from 0 to (cores_per_node - 1) */ - c->cpu_core_id = c->cpu_core_id % cores_per_node; + /* core id has to be in the [0 .. cores_per_node - 1] range */ + c->cpu_core_id %= cores_per_node; + c->compute_unit_id %= cus_per_node; } } #endif @@ -334,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id); static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; unsigned apicid = c->apicid; - node = per_cpu(cpu_llc_id, cpu); + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = per_cpu(cpu_llc_id, cpu); - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs + * which the K8 northbridge parsing fills in. Assume + * they are all increased by a constant offset, but in + * the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming certain relationship + * between APIC ID, HT node ID and NUMA topology. As going + * through CPU mapping may alter the outcome, directly + * access __apicid_to_node[]. + */ int ht_nodeid = c->initial_apicid; if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; + __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; /* Pick a nearby node */ if (!node_online(node)) node = nearby_node(apicid); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1d59834396b..e2ced0074a4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -675,7 +675,7 @@ void __init early_cpu_init(void) const struct cpu_dev *const *cdev; int count = 0; -#ifdef PROCESSOR_SELECT +#ifdef CONFIG_PROCESSOR_SELECT printk(KERN_INFO "KERNEL supported cpus:\n"); #endif @@ -687,7 +687,7 @@ void __init early_cpu_init(void) cpu_devs[count] = cpudev; count++; -#ifdef PROCESSOR_SELECT +#ifdef CONFIG_PROCESSOR_SELECT { unsigned int j; @@ -869,7 +869,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) select_idle_routine(c); -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif } diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 4f6f679f279..4a5a42b842a 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -195,7 +195,7 @@ static unsigned int pcc_get_freq(unsigned int cpu) cmd_incomplete: iowrite16(0, &pcch_hdr->status); spin_unlock(&pcc_lock); - return -EINVAL; + return 0; } static int pcc_cpufreq_target(struct cpufreq_policy *policy, diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d16c2c53d6b..df86bc8c859 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -276,14 +276,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA unsigned node; int cpu = smp_processor_id(); - int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. */ - node = apicid_to_node[apicid]; + node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE || !node_online(node)) { /* reuse the value from init_cpu_to_node() */ node = cpu_to_node(cpu); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index ec2c19a7b8e..1ce1af2899d 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -304,8 +304,9 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, struct _cache_attr { struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); + ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count, + unsigned int); }; #ifdef CONFIG_AMD_NB @@ -400,7 +401,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, #define SHOW_CACHE_DISABLE(slot) \ static ssize_t \ -show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ +show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ { \ return show_cache_disable(this_leaf, buf, slot); \ } @@ -512,7 +514,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ + const char *buf, size_t count, \ + unsigned int cpu) \ { \ return store_cache_disable(this_leaf, buf, count, slot); \ } @@ -524,6 +527,39 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); +static ssize_t +show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) +{ + if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t +store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, + unsigned int cpu) +{ + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + if (strict_strtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static struct _cache_attr subcaches = + __ATTR(subcaches, 0644, show_subcaches, store_subcaches); + #else /* CONFIG_AMD_NB */ #define amd_init_l3_cache(x, y) #endif /* CONFIG_AMD_NB */ @@ -532,9 +568,9 @@ static int __cpuinit cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { @@ -732,11 +768,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) struct cpuinfo_x86 *c = &cpu_data(cpu); if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { - for_each_cpu(i, c->llc_shared_map) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, c->llc_shared_map) { + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; set_bit(sibling, this_leaf->shared_cpu_map); @@ -870,8 +906,8 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ -static ssize_t show_##file_name \ - (struct _cpuid4_info *this_leaf, char *buf) \ +static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ { \ return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } @@ -882,7 +918,8 @@ show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1); show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); -static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) { return sprintf(buf, "%luK\n", this_leaf->size / 1024); } @@ -906,17 +943,20 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, return n; } -static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) +static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) { return show_shared_cpu_map_func(leaf, 0, buf); } -static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) +static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) { return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) { switch (this_leaf->eax.split.type) { case CACHE_TYPE_DATA: @@ -974,6 +1014,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void) if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); if (attrs == NULL) return attrs = default_attrs; @@ -986,6 +1029,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void) attrs[n++] = &cache_disable_1.attr; } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + attrs[n++] = &subcaches.attr; + return attrs; } #endif @@ -998,7 +1044,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) ret = fattr->show ? fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf) : + buf, this_leaf->cpu) : 0; return ret; } @@ -1012,7 +1058,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, ret = fattr->store ? fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf, count) : + buf, count, this_leaf->cpu) : 0; return ret; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5bf2fac52ac..167f97b5596 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -527,15 +527,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int i, err = 0; struct threshold_bank *b = NULL; char name[32]; -#ifdef CONFIG_SMP - struct cpuinfo_x86 *c = &cpu_data(cpu); -#endif sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(c->llc_shared_map); + i = cpumask_first(cpu_llc_shared_mask(cpu)); /* first core not up yet */ if (cpu_data(i).cpu_core_id) @@ -555,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - cpumask_copy(b->cpus, c->llc_shared_map); + cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu)); per_cpu(threshold_banks, cpu)[bank] = b; goto out; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c new file mode 100644 index 00000000000..7a8cebc9ff2 --- /dev/null +++ b/arch/x86/kernel/devicetree.c @@ -0,0 +1,441 @@ +/* + * Architecture specific OF callbacks. + */ +#include <linux/bootmem.h> +#include <linux/io.h> +#include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/of_irq.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/of_pci.h> + +#include <asm/hpet.h> +#include <asm/irq_controller.h> +#include <asm/apic.h> +#include <asm/pci_x86.h> + +__initdata u64 initial_dtb; +char __initdata cmd_line[COMMAND_LINE_SIZE]; +static LIST_HEAD(irq_domains); +static DEFINE_RAW_SPINLOCK(big_irq_lock); + +int __initdata of_ioapic; + +#ifdef CONFIG_X86_IO_APIC +static void add_interrupt_host(struct irq_domain *ih) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&big_irq_lock, flags); + list_add(&ih->l, &irq_domains); + raw_spin_unlock_irqrestore(&big_irq_lock, flags); +} +#endif + +static struct irq_domain *get_ih_from_node(struct device_node *controller) +{ + struct irq_domain *ih, *found = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&big_irq_lock, flags); + list_for_each_entry(ih, &irq_domains, l) { + if (ih->controller == controller) { + found = ih; + break; + } + } + raw_spin_unlock_irqrestore(&big_irq_lock, flags); + return found; +} + +unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, unsigned int intsize) +{ + struct irq_domain *ih; + u32 virq, type; + int ret; + + ih = get_ih_from_node(controller); + if (!ih) + return 0; + ret = ih->xlate(ih, intspec, intsize, &virq, &type); + if (ret) + return ret; + if (type == IRQ_TYPE_NONE) + return virq; + /* set the mask if it is different from current */ + if (type == (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK)) + set_irq_type(virq, type); + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_of_mapping); + +unsigned long pci_address_to_pio(phys_addr_t address) +{ + /* + * The ioport address can be directly used by inX / outX + */ + BUG_ON(address >= (1 << 16)); + return (unsigned long)address; +} +EXPORT_SYMBOL_GPL(pci_address_to_pio); + +void __init early_init_dt_scan_chosen_arch(unsigned long node) +{ + BUG(); +} + +void __init early_init_dt_add_memory_arch(u64 base, u64 size) +{ + BUG(); +} + +void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) +{ + return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)); +} + +void __init add_dtb(u64 data) +{ + initial_dtb = data + offsetof(struct setup_data, data); +} + +/* + * CE4100 ids. Will be moved to machine_device_initcall() once we have it. + */ +static struct of_device_id __initdata ce4100_ids[] = { + { .compatible = "intel,ce4100-cp", }, + { .compatible = "isa", }, + { .compatible = "pci", }, + {}, +}; + +static int __init add_bus_probe(void) +{ + if (!of_have_populated_dt()) + return 0; + + return of_platform_bus_probe(NULL, ce4100_ids, NULL); +} +module_init(add_bus_probe); + +#ifdef CONFIG_PCI +static int x86_of_pci_irq_enable(struct pci_dev *dev) +{ + struct of_irq oirq; + u32 virq; + int ret; + u8 pin; + + ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (ret) + return ret; + if (!pin) + return 0; + + ret = of_irq_map_pci(dev, &oirq); + if (ret) + return ret; + + virq = irq_create_of_mapping(oirq.controller, oirq.specifier, + oirq.size); + if (virq == 0) + return -EINVAL; + dev->irq = virq; + return 0; +} + +static void x86_of_pci_irq_disable(struct pci_dev *dev) +{ +} + +void __cpuinit x86_of_pci_init(void) +{ + struct device_node *np; + + pcibios_enable_irq = x86_of_pci_irq_enable; + pcibios_disable_irq = x86_of_pci_irq_disable; + + for_each_node_by_type(np, "pci") { + const void *prop; + struct pci_bus *bus; + unsigned int bus_min; + struct device_node *child; + + prop = of_get_property(np, "bus-range", NULL); + if (!prop) + continue; + bus_min = be32_to_cpup(prop); + + bus = pci_find_bus(0, bus_min); + if (!bus) { + printk(KERN_ERR "Can't find a node for bus %s.\n", + np->full_name); + continue; + } + + if (bus->self) + bus->self->dev.of_node = np; + else + bus->dev.of_node = np; + + for_each_child_of_node(np, child) { + struct pci_dev *dev; + u32 devfn; + + prop = of_get_property(child, "reg", NULL); + if (!prop) + continue; + + devfn = (be32_to_cpup(prop) >> 8) & 0xff; + dev = pci_get_slot(bus, devfn); + if (!dev) + continue; + dev->dev.of_node = child; + pci_dev_put(dev); + } + } +} +#endif + +static void __init dtb_setup_hpet(void) +{ +#ifdef CONFIG_HPET_TIMER + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet"); + if (!dn) + return; + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + WARN_ON(1); + return; + } + hpet_address = r.start; +#endif +} + +static void __init dtb_lapic_setup(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic"); + if (!dn) + return; + + ret = of_address_to_resource(dn, 0, &r); + if (WARN_ON(ret)) + return; + + /* Did the boot loader setup the local APIC ? */ + if (!cpu_has_apic) { + if (apic_force_enable(r.start)) + return; + } + smp_found_config = 1; + pic_mode = 1; + register_lapic_address(r.start); + generic_processor_info(boot_cpu_physical_apicid, + GET_APIC_VERSION(apic_read(APIC_LVR))); +#endif +} + +#ifdef CONFIG_X86_IO_APIC +static unsigned int ioapic_id; + +static void __init dtb_add_ioapic(struct device_node *dn) +{ + struct resource r; + int ret; + + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + printk(KERN_ERR "Can't obtain address from node %s.\n", + dn->full_name); + return; + } + mp_register_ioapic(++ioapic_id, r.start, gsi_top); +} + +static void __init dtb_ioapic_setup(void) +{ + struct device_node *dn; + + for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") + dtb_add_ioapic(dn); + + if (nr_ioapics) { + of_ioapic = 1; + return; + } + printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); +} +#else +static void __init dtb_ioapic_setup(void) {} +#endif + +static void __init dtb_apic_setup(void) +{ + dtb_lapic_setup(); + dtb_ioapic_setup(); +} + +#ifdef CONFIG_OF_FLATTREE +static void __init x86_flattree_get_config(void) +{ + u32 size, map_len; + void *new_dtb; + + if (!initial_dtb) + return; + + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), + (u64)sizeof(struct boot_param_header)); + + initial_boot_params = early_memremap(initial_dtb, map_len); + size = be32_to_cpu(initial_boot_params->totalsize); + if (map_len < size) { + early_iounmap(initial_boot_params, map_len); + initial_boot_params = early_memremap(initial_dtb, size); + map_len = size; + } + + new_dtb = alloc_bootmem(size); + memcpy(new_dtb, initial_boot_params, size); + early_iounmap(initial_boot_params, map_len); + + initial_boot_params = new_dtb; + + /* root level address cells */ + of_scan_flat_dt(early_init_dt_scan_root, NULL); + + unflatten_device_tree(); +} +#else +static inline void x86_flattree_get_config(void) { } +#endif + +void __init x86_dtb_init(void) +{ + x86_flattree_get_config(); + + if (!of_have_populated_dt()) + return; + + dtb_setup_hpet(); + dtb_apic_setup(); +} + +#ifdef CONFIG_X86_IO_APIC + +struct of_ioapic_type { + u32 out_type; + u32 trigger; + u32 polarity; +}; + +static struct of_ioapic_type of_ioapic_type[] = +{ + { + .out_type = IRQ_TYPE_EDGE_RISING, + .trigger = IOAPIC_EDGE, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_LEVEL_LOW, + .trigger = IOAPIC_LEVEL, + .polarity = 0, + }, + { + .out_type = IRQ_TYPE_LEVEL_HIGH, + .trigger = IOAPIC_LEVEL, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_EDGE_FALLING, + .trigger = IOAPIC_EDGE, + .polarity = 0, + }, +}; + +static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, + u32 *out_hwirq, u32 *out_type) +{ + struct io_apic_irq_attr attr; + struct of_ioapic_type *it; + u32 line, idx, type; + + if (intsize < 2) + return -EINVAL; + + line = *intspec; + idx = (u32) id->priv; + *out_hwirq = line + mp_gsi_routing[idx].gsi_base; + + intspec++; + type = *intspec; + + if (type >= ARRAY_SIZE(of_ioapic_type)) + return -EINVAL; + + it = of_ioapic_type + type; + *out_type = it->out_type; + + set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); + + return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr); +} + +static void __init ioapic_add_ofnode(struct device_node *np) +{ + struct resource r; + int i, ret; + + ret = of_address_to_resource(np, 0, &r); + if (ret) { + printk(KERN_ERR "Failed to obtain address for %s\n", + np->full_name); + return; + } + + for (i = 0; i < nr_ioapics; i++) { + if (r.start == mp_ioapics[i].apicaddr) { + struct irq_domain *id; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + BUG_ON(!id); + id->controller = np; + id->xlate = ioapic_xlate; + id->priv = (void *)i; + add_interrupt_host(id); + return; + } + } + printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name); +} + +void __init x86_add_irq_domains(void) +{ + struct device_node *dp; + + if (!of_have_populated_dt()) + return; + + for_each_node_with_property(dp, "interrupt-controller") { + if (of_device_is_compatible(dp, "intel,ce4100-ioapic")) + ioapic_add_ofnode(dp); + } +} +#else +void __init x86_add_irq_domains(void) { } +#endif diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 294f26da0c0..cdf5bfd9d4d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -667,21 +667,15 @@ __init void e820_setup_gap(void) * boot_params.e820_map, others are passed via SETUP_E820_EXT node of * linked list of struct setup_data, which is parsed here. */ -void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) +void __init parse_e820_ext(struct setup_data *sdata) { - u32 map_len; int entries; struct e820entry *extmap; entries = sdata->len / sizeof(struct e820entry); - map_len = sdata->len + sizeof(struct setup_data); - if (map_len > PAGE_SIZE) - sdata = early_ioremap(pa_data, map_len); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - if (map_len > PAGE_SIZE) - early_iounmap(sdata, map_len); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("extended"); } @@ -847,15 +841,21 @@ static int __init parse_memopt(char *p) if (!p) return -EINVAL; -#ifdef CONFIG_X86_32 if (!strcmp(p, "nopentium")) { +#ifdef CONFIG_X86_32 setup_clear_cpu_cap(X86_FEATURE_PSE); return 0; - } +#else + printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + return -EINVAL; #endif + } userdef = 1; mem_size = memparse(p, &p); + /* don't remove all of memory when handling "mem={invalid}" param */ + if (mem_size == 0) + return -EINVAL; e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f5accf8eaa7..fa41f7298c8 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -397,7 +397,7 @@ sysenter_past_esp: * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp) + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax @@ -1411,8 +1411,7 @@ END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) RING0_EC_FRAME - pushl $do_async_page_fault - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_async_page_fault jmp error_code CFI_ENDPROC END(apf_page_fault) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0a0ed794edb..b72b4a6466a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -977,9 +977,12 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -.irpc idx, "01234567" +.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +.if NUM_INVALIDATE_TLB_VECTORS > \idx apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ invalidate_interrupt\idx smp_invalidate_interrupt +.endif .endr #endif @@ -1250,7 +1253,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) decl PER_CPU_VAR(irq_count) jmp error_exit CFI_ENDPROC -END(do_hypervisor_callback) +END(xen_do_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 767d6c43de3..ce0be7cd085 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -73,7 +73,7 @@ MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT */ KERNEL_PAGES = LOWMEM_PAGES -INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm +INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE RESERVE_BRK(pagetables, INIT_MAP_SIZE) /* @@ -137,7 +137,7 @@ ENTRY(startup_32) movsl 1: -#ifdef CONFIG_OLPC_OPENFIRMWARE +#ifdef CONFIG_OLPC /* save OFW's pgdir table for later use when calling into OFW */ movl %cr3, %eax movl %eax, pa(olpc_ofw_pgd) @@ -623,7 +623,7 @@ ENTRY(initial_code) * BSS section */ __PAGE_ALIGNED_BSS - .align PAGE_SIZE_asm + .align PAGE_SIZE #ifdef CONFIG_X86_PAE initial_pg_pmd: .fill 1024*KPMDS,4,0 @@ -644,7 +644,7 @@ ENTRY(swapper_pg_dir) #ifdef CONFIG_X86_PAE __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ - .align PAGE_SIZE_asm + .align PAGE_SIZE ENTRY(initial_page_table) .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 @@ -662,7 +662,7 @@ ENTRY(initial_page_table) # else # error "Kernel PMDs should be 1, 2 or 3" # endif - .align PAGE_SIZE_asm /* needs to be page-sized too */ + .align PAGE_SIZE /* needs to be page-sized too */ #endif .data diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 4ff5968f12d..bfe8f729e08 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -503,7 +503,7 @@ static int hpet_assign_irq(struct hpet_dev *dev) if (!irq) return -EINVAL; - set_irq_data(irq, dev); + irq_set_handler_data(irq, dev); if (hpet_setup_msi_irq(irq)) return -EINVAL; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 20757cb2efa..d9ca749c123 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -112,7 +112,7 @@ static void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); - set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, + irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, i8259A_chip.name); enable_irq(irq); } diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 8eec0ec59af..8c968974253 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -14,22 +14,9 @@ #include <linux/slab.h> #include <linux/thread_info.h> #include <linux/syscalls.h> +#include <linux/bitmap.h> #include <asm/syscalls.h> -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, - unsigned int extent, int new_value) -{ - unsigned int i; - - for (i = base; i < base + extent; i++) { - if (new_value) - __set_bit(i, bitmap); - else - __clear_bit(i, bitmap); - } -} - /* * this changes the io permissions bitmap in the current task. */ @@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) */ tss = &per_cpu(init_tss, get_cpu()); - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + if (turn_on) + bitmap_clear(t->io_bitmap_ptr, from, num); + else + bitmap_set(t->io_bitmap_ptr, from, num); /* * Search for a (possibly new) maximum. This is simple and stupid, diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 387b6a0c9e8..948a31eae75 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -44,9 +44,9 @@ void ack_bad_irq(unsigned int irq) #define irq_stats(x) (&per_cpu(irq_stat, x)) /* - * /proc/interrupts printing: + * /proc/interrupts printing for arch specific interrupts */ -static int show_other_interrupts(struct seq_file *p, int prec) +int arch_show_interrupts(struct seq_file *p, int prec) { int j; @@ -122,59 +122,6 @@ static int show_other_interrupts(struct seq_file *p, int prec) return 0; } -int show_interrupts(struct seq_file *p, void *v) -{ - unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j, prec; - struct irqaction *action; - struct irq_desc *desc; - - if (i > nr_irqs) - return 0; - - for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) - j *= 10; - - if (i == nr_irqs) - return show_other_interrupts(p, prec); - - /* print header */ - if (i == 0) { - seq_printf(p, "%*s", prec + 8, ""); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d", j); - seq_putc(p, '\n'); - } - - desc = irq_to_desc(i); - if (!desc) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); - action = desc->action; - if (!action && !any_count) - goto out; - - seq_printf(p, "%*d: ", prec, i); - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->irq_data.chip->name); - seq_printf(p, "-%-8s", desc->name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - - seq_putc(p, '\n'); -out: - raw_spin_unlock_irqrestore(&desc->lock, flags); - return 0; -} - /* * /proc/stat helpers */ @@ -276,15 +223,6 @@ void smp_x86_platform_ipi(struct pt_regs *regs) EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); -#ifdef CONFIG_OF -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - return intspec[0]; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); -#endif - #ifdef CONFIG_HOTPLUG_CPU /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) @@ -293,6 +231,7 @@ void fixup_irqs(void) static int warned; struct irq_desc *desc; struct irq_data *data; + struct irq_chip *chip; for_each_irq_desc(irq, desc) { int break_affinity = 0; @@ -307,10 +246,10 @@ void fixup_irqs(void) /* interrupt's are disabled at this point */ raw_spin_lock(&desc->lock); - data = &desc->irq_data; + data = irq_desc_get_irq_data(desc); affinity = data->affinity; if (!irq_has_action(irq) || - cpumask_equal(affinity, cpu_online_mask)) { + cpumask_subset(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); continue; } @@ -327,16 +266,17 @@ void fixup_irqs(void) affinity = cpu_all_mask; } - if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask) - data->chip->irq_mask(data); + chip = irq_data_get_irq_chip(data); + if (!irqd_can_move_in_process_context(data) && chip->irq_mask) + chip->irq_mask(data); - if (data->chip->irq_set_affinity) - data->chip->irq_set_affinity(data, affinity, true); + if (chip->irq_set_affinity) + chip->irq_set_affinity(data, affinity, true); else if (!(warned++)) set_affinity = 0; - if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask) - data->chip->irq_unmask(data); + if (!irqd_can_move_in_process_context(data) && chip->irq_unmask) + chip->irq_unmask(data); raw_spin_unlock(&desc->lock); @@ -368,10 +308,11 @@ void fixup_irqs(void) irq = __this_cpu_read(vector_irq[vector]); desc = irq_to_desc(irq); - data = &desc->irq_data; + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); raw_spin_lock(&desc->lock); - if (data->chip->irq_retrigger) - data->chip->irq_retrigger(data); + if (chip->irq_retrigger) + chip->irq_retrigger(data); raw_spin_unlock(&desc->lock); } } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c752e973958..f470e4ef993 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -25,6 +25,7 @@ #include <asm/setup.h> #include <asm/i8259.h> #include <asm/traps.h> +#include <asm/prom.h> /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: @@ -71,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", + .flags = IRQF_NO_THREAD, }; #endif @@ -80,6 +82,7 @@ static struct irqaction fpu_irq = { static struct irqaction irq2 = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { @@ -110,7 +113,7 @@ void __init init_ISA_irqs(void) legacy_pic->init(0); for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) - set_irq_chip_and_handler_name(i, chip, handle_level_irq, name); + irq_set_chip_and_handler_name(i, chip, handle_level_irq, name); } void __init init_IRQ(void) @@ -118,6 +121,12 @@ void __init init_IRQ(void) int i; /* + * We probably need a better place for this, but it works for + * now ... + */ + x86_add_irq_domains(); + + /* * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If @@ -164,14 +173,77 @@ static void __init smp_intr_init(void) alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); +#define ALLOC_INVTLB_VEC(NR) \ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \ + invalidate_interrupt##NR) + + switch (NUM_INVALIDATE_TLB_VECTORS) { + default: + ALLOC_INVTLB_VEC(31); + case 31: + ALLOC_INVTLB_VEC(30); + case 30: + ALLOC_INVTLB_VEC(29); + case 29: + ALLOC_INVTLB_VEC(28); + case 28: + ALLOC_INVTLB_VEC(27); + case 27: + ALLOC_INVTLB_VEC(26); + case 26: + ALLOC_INVTLB_VEC(25); + case 25: + ALLOC_INVTLB_VEC(24); + case 24: + ALLOC_INVTLB_VEC(23); + case 23: + ALLOC_INVTLB_VEC(22); + case 22: + ALLOC_INVTLB_VEC(21); + case 21: + ALLOC_INVTLB_VEC(20); + case 20: + ALLOC_INVTLB_VEC(19); + case 19: + ALLOC_INVTLB_VEC(18); + case 18: + ALLOC_INVTLB_VEC(17); + case 17: + ALLOC_INVTLB_VEC(16); + case 16: + ALLOC_INVTLB_VEC(15); + case 15: + ALLOC_INVTLB_VEC(14); + case 14: + ALLOC_INVTLB_VEC(13); + case 13: + ALLOC_INVTLB_VEC(12); + case 12: + ALLOC_INVTLB_VEC(11); + case 11: + ALLOC_INVTLB_VEC(10); + case 10: + ALLOC_INVTLB_VEC(9); + case 9: + ALLOC_INVTLB_VEC(8); + case 8: + ALLOC_INVTLB_VEC(7); + case 7: + ALLOC_INVTLB_VEC(6); + case 6: + ALLOC_INVTLB_VEC(5); + case 5: + ALLOC_INVTLB_VEC(4); + case 4: + ALLOC_INVTLB_VEC(3); + case 3: + ALLOC_INVTLB_VEC(2); + case 2: + ALLOC_INVTLB_VEC(1); + case 1: + ALLOC_INVTLB_VEC(0); + break; + } /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); @@ -243,7 +315,7 @@ void __init native_init_IRQ(void) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } - if (!acpi_ioapic) + if (!acpi_ioapic && !of_ioapic) setup_irq(2, &irq2); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 0fe6d1a66c3..c5610384ab1 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -66,7 +66,6 @@ struct microcode_amd { unsigned int mpb[0]; }; -#define UCODE_MAX_SIZE 2048 #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 @@ -77,20 +76,20 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - memset(csig, 0, sizeof(*csig)); if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " - "supported\n", cpu, c->x86); + pr_warning("CPU%d: family %d not supported\n", cpu, c->x86); return -1; } + rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); + return 0; } -static int get_matching_microcode(int cpu, void *mc, int rev) +static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, + int rev) { - struct microcode_header_amd *mc_header = mc; unsigned int current_cpu_id; u16 equiv_cpu_id = 0; unsigned int i = 0; @@ -109,17 +108,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev) if (!equiv_cpu_id) return 0; - if (mc_header->processor_rev_id != equiv_cpu_id) + if (mc_hdr->processor_rev_id != equiv_cpu_id) return 0; /* ucode might be chipset specific -- currently we don't support this */ - if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - pr_err("CPU%d: loading of chipset specific code not yet supported\n", + if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { + pr_err("CPU%d: chipset specific code not yet supported\n", cpu); return 0; } - if (mc_header->patch_id <= rev) + if (mc_hdr->patch_id <= rev) return 0; return 1; @@ -144,71 +143,93 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - pr_err("CPU%d: update failed (for patch_level=0x%x)\n", + pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); return -1; } - pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; } -static void * -get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) +static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) { - unsigned int total_size; - u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; - void *mc; + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int max_size, actual_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + + switch (c->x86) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } - get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR); + actual_size = buf[4] + (buf[5] << 8); - if (section_hdr[0] != UCODE_UCODE_TYPE) { - pr_err("error: invalid type field in container file section header\n"); - return NULL; + if (actual_size > size || actual_size > max_size) { + pr_err("section size mismatch\n"); + return 0; } - total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); + return actual_size; +} - if (total_size > size || total_size > UCODE_MAX_SIZE) { - pr_err("error: size mismatch\n"); - return NULL; +static struct microcode_header_amd * +get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) +{ + struct microcode_header_amd *mc = NULL; + unsigned int actual_size = 0; + + if (buf[0] != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + goto out; } - mc = vzalloc(UCODE_MAX_SIZE); + actual_size = verify_ucode_size(cpu, buf, size); + if (!actual_size) + goto out; + + mc = vzalloc(actual_size); if (!mc) - return NULL; + goto out; - get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size); - *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; + get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size); + *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR; +out: return mc; } static int install_equiv_cpu_table(const u8 *buf) { - u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; - unsigned int *buf_pos = (unsigned int *)container_hdr; - unsigned long size; - - get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE); - - size = buf_pos[2]; - - if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - pr_err("error: invalid type field in container file section header\n"); - return 0; + unsigned int *ibuf = (unsigned int *)buf; + unsigned int type = ibuf[1]; + unsigned int size = ibuf[2]; + + if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { + pr_err("empty section/" + "invalid type field in container file section header\n"); + return -EINVAL; } equiv_cpu_table = vmalloc(size); if (!equiv_cpu_table) { pr_err("failed to allocate equivalent CPU table\n"); - return 0; + return -ENOMEM; } - buf += UCODE_CONTAINER_HEADER_SIZE; - get_ucode_data(equiv_cpu_table, buf, size); + get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size); return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ } @@ -223,16 +244,16 @@ static enum ucode_state generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct microcode_header_amd *mc_hdr = NULL; + unsigned int mc_size, leftover; + int offset; const u8 *ucode_ptr = data; void *new_mc = NULL; - void *mc; - int new_rev = uci->cpu_sig.rev; - unsigned int leftover; - unsigned long offset; + unsigned int new_rev = uci->cpu_sig.rev; enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); - if (!offset) { + if (offset < 0) { pr_err("failed to create equivalent cpu table\n"); return UCODE_ERROR; } @@ -241,64 +262,65 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) leftover = size - offset; while (leftover) { - unsigned int uninitialized_var(mc_size); - struct microcode_header_amd *mc_header; - - mc = get_next_ucode(ucode_ptr, leftover, &mc_size); - if (!mc) + mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); + if (!mc_hdr) break; - mc_header = (struct microcode_header_amd *)mc; - if (get_matching_microcode(cpu, mc, new_rev)) { + if (get_matching_microcode(cpu, mc_hdr, new_rev)) { vfree(new_mc); - new_rev = mc_header->patch_id; - new_mc = mc; + new_rev = mc_hdr->patch_id; + new_mc = mc_hdr; } else - vfree(mc); + vfree(mc_hdr); ucode_ptr += mc_size; leftover -= mc_size; } - if (new_mc) { - if (!leftover) { - vfree(uci->mc); - uci->mc = new_mc; - pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); - } else { - vfree(new_mc); - state = UCODE_ERROR; - } - } else + if (!new_mc) { state = UCODE_NFOUND; + goto free_table; + } + if (!leftover) { + vfree(uci->mc); + uci->mc = new_mc; + pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", + cpu, uci->cpu_sig.rev, new_rev); + } else { + vfree(new_mc); + state = UCODE_ERROR; + } + +free_table: free_equiv_cpu_table(); return state; } -static enum ucode_state request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_amd(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; - const struct firmware *firmware; - enum ucode_state ret; + const struct firmware *fw; + enum ucode_state ret = UCODE_NFOUND; - if (request_firmware(&firmware, fw_name, device)) { - printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); - return UCODE_NFOUND; + if (request_firmware(&fw, fw_name, device)) { + pr_err("failed to load file %s\n", fw_name); + goto out; } - if (*(u32 *)firmware->data != UCODE_MAGIC) { - pr_err("invalid UCODE_MAGIC (0x%08x)\n", - *(u32 *)firmware->data); - return UCODE_ERROR; + ret = UCODE_ERROR; + if (*(u32 *)fw->data != UCODE_MAGIC) { + pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data); + goto fw_release; } - ret = generic_load_microcode(cpu, firmware->data, firmware->size); + ret = generic_load_microcode(cpu, fw->data, fw->size); - release_firmware(firmware); +fw_release: + release_firmware(fw); +out: return ret; } @@ -319,7 +341,7 @@ static void microcode_fini_cpu_amd(int cpu) static struct microcode_ops microcode_amd_ops = { .request_microcode_user = request_microcode_user, - .request_microcode_fw = request_microcode_fw, + .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, .microcode_fini_cpu = microcode_fini_cpu_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 1cca374a2ba..87af68e0e1e 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -417,8 +417,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev) if (err) return err; - if (microcode_init_cpu(cpu) == UCODE_ERROR) - err = -EINVAL; + if (microcode_init_cpu(cpu) == UCODE_ERROR) { + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return -EINVAL; + } return err; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ff455419898..99fa3adf014 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -110,12 +110,9 @@ void show_regs_common(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk(KERN_CONT " "); - printk(KERN_CONT "%s %s", vendor, product); - if (board) { - printk(KERN_CONT "/"); - printk(KERN_CONT "%s", board); - } + printk(KERN_CONT " %s %s", vendor, product); + if (board) + printk(KERN_CONT "/%s", board); printk(KERN_CONT "\n"); } diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 6f39cab052d..3f2ad2640d8 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -6,6 +6,7 @@ #include <linux/acpi.h> #include <linux/bcd.h> #include <linux/pnp.h> +#include <linux/of.h> #include <asm/vsyscall.h> #include <asm/x86_init.h> @@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void) } } #endif + if (of_have_populated_dt()) + return 0; platform_device_register(&rtc_device); dev_info(&rtc_device.dev, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3cfe26c025..b176f2b1f45 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -113,6 +113,7 @@ #endif #include <asm/mce.h> #include <asm/alternative.h> +#include <asm/prom.h> /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -293,10 +294,32 @@ static void __init init_gbpages(void) else direct_gbpages = 0; } + +static void __init cleanup_highmap_brk_end(void) +{ + pud_t *pud; + pmd_t *pmd; + + mmu_cr4_features = read_cr4(); + + /* + * _brk_end cannot change anymore, but it and _end may be + * located on different 2M pages. cleanup_highmap(), however, + * can only consider _end when it runs, so destroy any + * mappings beyond _brk_end here. + */ + pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); + pmd = pmd_offset(pud, _brk_end - 1); + while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) + pmd_clear(pmd); +} #else static inline void init_gbpages(void) { } +static inline void cleanup_highmap_brk_end(void) +{ +} #endif static void __init reserve_brk(void) @@ -307,6 +330,8 @@ static void __init reserve_brk(void) /* Mark brk area as locked down and no longer taking any new allocations */ _brk_start = 0; + + cleanup_highmap_brk_end(); } #ifdef CONFIG_BLK_DEV_INITRD @@ -429,16 +454,30 @@ static void __init parse_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_memremap(pa_data, PAGE_SIZE); + u32 data_len, map_len; + + map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), + (u64)sizeof(struct setup_data)); + data = early_memremap(pa_data, map_len); + data_len = data->len + sizeof(struct setup_data); + if (data_len > map_len) { + early_iounmap(data, map_len); + data = early_memremap(pa_data, data_len); + map_len = data_len; + } + switch (data->type) { case SETUP_E820_EXT: - parse_e820_ext(data, pa_data); + parse_e820_ext(data); + break; + case SETUP_DTB: + add_dtb(pa_data); break; default: break; } pa_data = data->next; - early_iounmap(data, PAGE_SIZE); + early_iounmap(data, map_len); } } @@ -680,15 +719,6 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); -static u64 __init get_max_mapped(void) -{ - u64 end = max_pfn_mapped; - - end <<= PAGE_SHIFT; - - return end; -} - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -704,8 +734,6 @@ static u64 __init get_max_mapped(void) void __init setup_arch(char **cmdline_p) { - int acpi = 0; - int amd = 0; unsigned long flags; #ifdef CONFIG_X86_32 @@ -984,19 +1012,7 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. - */ - acpi = acpi_numa_init(); -#endif - -#ifdef CONFIG_AMD_NUMA - if (!acpi) - amd = !amd_numa_init(0, max_pfn); -#endif - - initmem_init(0, max_pfn, acpi, amd); + initmem_init(); memblock_find_dma_reserve(); dma32_reserve_bootmem(); @@ -1029,8 +1045,8 @@ void __init setup_arch(char **cmdline_p) * Read APIC and some other early information from ACPI tables. */ acpi_boot_init(); - sfi_init(); + x86_dtb_init(); /* * get boot-time SMP configuration: @@ -1040,9 +1056,7 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); -#ifdef CONFIG_X86_64 init_cpu_to_node(); -#endif init_apic_mappings(); ioapic_and_gsi_init(); @@ -1066,6 +1080,8 @@ void __init setup_arch(char **cmdline_p) #endif x86_init.oem.banner(); + x86_init.timers.wallclock_init(); + mcheck_init(); local_irq_save(flags); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 002b79685f7..71f4727da37 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -225,10 +225,15 @@ void __init setup_per_cpu_areas(void) per_cpu(x86_bios_cpu_apicid, cpu) = early_per_cpu_map(x86_bios_cpu_apicid, cpu); #endif +#ifdef CONFIG_X86_32 + per_cpu(x86_cpu_to_logical_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); +#endif #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; +#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); @@ -242,7 +247,6 @@ void __init setup_per_cpu_areas(void) */ set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); #endif -#endif /* * Up to this point, the boot CPU has been using .init.data * area. Reload any changed state for the boot CPU. @@ -256,7 +260,10 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) +#ifdef CONFIG_X86_32 + early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; +#endif +#ifdef CONFIG_NUMA early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 08776a95348..e9efdfd51c8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -64,6 +64,7 @@ #include <asm/mtrr.h> #include <asm/mwait.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -71,10 +72,6 @@ #include <asm/smpboot_hooks.h> #include <asm/i8259.h> -#ifdef CONFIG_X86_32 -u8 apicid_2_node[MAX_APICID]; -#endif - /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -130,68 +127,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); + /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) -/* which node each logical CPU is on */ -int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; -EXPORT_SYMBOL(cpu_to_node_map); - -/* set up a mapping between cpu and node. */ -static void map_cpu_to_node(int cpu, int node) -{ - printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); - cpumask_set_cpu(cpu, node_to_cpumask_map[node]); - cpu_to_node_map[cpu] = node; -} - -/* undo a mapping between cpu and node. */ -static void unmap_cpu_to_node(int cpu) -{ - int node; - - printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); - for (node = 0; node < MAX_NUMNODES; node++) - cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); - cpu_to_node_map[cpu] = 0; -} -#else /* !(CONFIG_NUMA && CONFIG_X86_32) */ -#define map_cpu_to_node(cpu, node) ({}) -#define unmap_cpu_to_node(cpu) ({}) -#endif - -#ifdef CONFIG_X86_32 -static int boot_cpu_logical_apicid; - -u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = - { [0 ... NR_CPUS-1] = BAD_APICID }; - -static void map_cpu_to_logical_apicid(void) -{ - int cpu = smp_processor_id(); - int apicid = logical_smp_processor_id(); - int node = apic->apicid_to_node(apicid); - - if (!node_online(node)) - node = first_online_node; - - cpu_2_logical_apicid[cpu] = apicid; - map_cpu_to_node(cpu, node); -} - -void numa_remove_cpu(int cpu) -{ - cpu_2_logical_apicid[cpu] = BAD_APICID; - unmap_cpu_to_node(cpu); -} -#else -#define map_cpu_to_logical_apicid() do {} while (0) -#endif - /* * Report back to the Boot Processor. * Running on AP. @@ -259,7 +202,6 @@ static void __cpuinit smp_callin(void) apic->smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); - map_cpu_to_logical_apicid(); /* * Need to setup vector mappings before we enable interrupts. @@ -355,23 +297,6 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } -#ifdef CONFIG_CPUMASK_OFFSTACK -/* In this case, llc_shared_map is a pointer to a cpumask. */ -static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, - const struct cpuinfo_x86 *src) -{ - struct cpumask *llc = dst->llc_shared_map; - *dst = *src; - dst->llc_shared_map = llc; -} -#else -static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, - const struct cpuinfo_x86 *src) -{ - *dst = *src; -} -#endif /* CONFIG_CPUMASK_OFFSTACK */ - /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU @@ -381,7 +306,7 @@ void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); - copy_cpuinfo_x86(c, &boot_cpu_data); + *c = boot_cpu_data; c->cpu_index = id; if (id != 0) identify_secondary_cpu(c); @@ -389,15 +314,12 @@ void __cpuinit smp_store_cpu_info(int id) static void __cpuinit link_thread_siblings(int cpu1, int cpu2) { - struct cpuinfo_x86 *c1 = &cpu_data(cpu1); - struct cpuinfo_x86 *c2 = &cpu_data(cpu2); - cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); - cpumask_set_cpu(cpu1, c2->llc_shared_map); - cpumask_set_cpu(cpu2, c1->llc_shared_map); + cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); } @@ -414,6 +336,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) if (cpu_has(c, X86_FEATURE_TOPOEXT)) { if (c->phys_proc_id == o->phys_proc_id && + per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && c->compute_unit_id == o->compute_unit_id) link_thread_siblings(cpu, i); } else if (c->phys_proc_id == o->phys_proc_id && @@ -425,7 +348,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); } - cpumask_set_cpu(cpu, c->llc_shared_map); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); @@ -436,8 +359,8 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { - cpumask_set_cpu(i, c->llc_shared_map); - cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); + cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); @@ -476,7 +399,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu) !(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else - return c->llc_shared_map; + return cpu_llc_shared_mask(cpu); } static void impress_friends(void) @@ -945,6 +868,14 @@ int __cpuinit native_cpu_up(unsigned int cpu) return 0; } +/** + * arch_disable_smp_support() - disables SMP support for x86 at runtime + */ +void arch_disable_smp_support(void) +{ + disable_ioapic_support(); +} + /* * Fall back to non SMP mode after errors. * @@ -960,7 +891,6 @@ static __init void disable_smp(void) physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); else physid_set_mask_of_physid(0, &phys_cpu_present_map); - map_cpu_to_logical_apicid(); cpumask_set_cpu(0, cpu_sibling_mask(0)); cpumask_set_cpu(0, cpu_core_mask(0)); } @@ -1045,7 +975,7 @@ static int __init smp_sanity_check(unsigned max_cpus) "(tell your hw vendor)\n"); } smpboot_clear_io_apic(); - arch_disable_smp_support(); + disable_ioapic_support(); return -1; } @@ -1089,21 +1019,19 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) preempt_disable(); smp_cpu_index_default(); - memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info)); - cpumask_copy(cpu_callin_mask, cpumask_of(0)); - mb(); + /* * Setup boot CPU information */ smp_store_cpu_info(0); /* Final full version of the data */ -#ifdef CONFIG_X86_32 - boot_cpu_logical_apicid = logical_smp_processor_id(); -#endif + cpumask_copy(cpu_callin_mask, cpumask_of(0)); + mb(); + current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } set_cpu_sibling_map(0); @@ -1139,8 +1067,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) bsp_end_local_APIC_setup(); - map_cpu_to_logical_apicid(); - if (apic->setup_portio_remap) apic->setup_portio_remap(); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index b35786dc9b8..5f181742e8f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -340,3 +340,6 @@ ENTRY(sys_call_table) .long sys_fanotify_init .long sys_fanotify_mark .long sys_prlimit64 /* 340 */ + .long sys_name_to_handle_at + .long sys_open_by_handle_at + .long sys_clock_adjtime diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 6d4341d5c52..0381e1f3bae 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -306,7 +306,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(THREAD_SIZE) + PERCPU(PAGE_SIZE) #endif . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 1b950d151e5..9796c2f3d07 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ceb2911aa43..c11514e9128 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = { .setup_percpu_clockev = setup_boot_APIC_clock, .tsc_pre_init = x86_init_noop, .timer_init = hpet_time_init, + .wallclock_init = x86_init_noop, }, .iommu = { diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index eba687f0cc0..b9ec1c74943 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -847,7 +847,7 @@ static void __init lguest_init_IRQ(void) void lguest_setup_irq(unsigned int irq) { irq_alloc_desc_at(irq, 0); - set_irq_chip_and_handler_name(irq, &lguest_irq_controller, + irq_set_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); } @@ -995,7 +995,7 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ - set_irq_handler(0, lguest_time_irq); + irq_set_handler(0, lguest_time_irq); clocksource_register(&lguest_clock); diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 2cda60a06e6..e8e7e0d06f4 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -15,14 +15,12 @@ /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg - pushfl - CFI_ADJUST_CFA_OFFSET 4 + pushfl_cfi cli .endm .macro UNLOCK reg - popfl - CFI_ADJUST_CFA_OFFSET -4 + popfl_cfi .endm #define BEGIN(op) \ diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 71e080de335..391a083674b 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -14,14 +14,12 @@ #include <asm/dwarf2.h> .macro SAVE reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %\reg CFI_REL_OFFSET \reg, 0 .endm .macro RESTORE reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %\reg CFI_RESTORE \reg .endm diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index adbccd0bbb7..78d16a554db 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) */ ENTRY(csum_partial) CFI_STARTPROC - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len @@ -132,11 +130,9 @@ ENTRY(csum_partial) jz 8f roll $8, %eax 8: - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi ret CFI_ENDPROC @@ -148,11 +144,9 @@ ENDPROC(csum_partial) ENTRY(csum_partial) CFI_STARTPROC - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len @@ -260,11 +254,9 @@ ENTRY(csum_partial) jz 90f roll $8, %eax 90: - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi ret CFI_ENDPROC @@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic) CFI_STARTPROC subl $4,%esp CFI_ADJUST_CFA_OFFSET 4 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len @@ -426,17 +415,13 @@ DST( movb %cl, (%edi) ) .previous - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ecx # equivalent to addl $4,%esp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx # equivalent to addl $4,%esp ret CFI_ENDPROC ENDPROC(csum_partial_copy_generic) @@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic) ENTRY(csum_partial_copy_generic) CFI_STARTPROC - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst @@ -527,14 +509,11 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx ret CFI_ENDPROC diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S new file mode 100644 index 00000000000..0ecb8433e5a --- /dev/null +++ b/arch/x86/lib/memmove_64.S @@ -0,0 +1,197 @@ +/* + * Normally compiler builtins are used, but sometimes the compiler calls out + * of line code. Based on asm-i386/string.h. + * + * This assembly file is re-written from memmove_64.c file. + * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> + */ +#define _STRING_C +#include <linux/linkage.h> +#include <asm/dwarf2.h> + +#undef memmove + +/* + * Implement memmove(). This can handle overlap between src and dst. + * + * Input: + * rdi: dest + * rsi: src + * rdx: count + * + * Output: + * rax: dest + */ +ENTRY(memmove) + CFI_STARTPROC + /* Handle more 32bytes in loop */ + mov %rdi, %rax + cmp $0x20, %rdx + jb 1f + + /* Decide forward/backward copy mode */ + cmp %rdi, %rsi + jb 2f + + /* + * movsq instruction have many startup latency + * so we handle small size by general register. + */ + cmp $680, %rdx + jb 3f + /* + * movsq instruction is only good for aligned case. + */ + + cmpb %dil, %sil + je 4f +3: + sub $0x20, %rdx + /* + * We gobble 32byts forward in each loop. + */ +5: + sub $0x20, %rdx + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq 2*8(%rsi), %r9 + movq 3*8(%rsi), %r8 + leaq 4*8(%rsi), %rsi + + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, 2*8(%rdi) + movq %r8, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae 5b + addq $0x20, %rdx + jmp 1f + /* + * Handle data forward by movsq. + */ + .p2align 4 +4: + movq %rdx, %rcx + movq -8(%rsi, %rdx), %r11 + lea -8(%rdi, %rdx), %r10 + shrq $3, %rcx + rep movsq + movq %r11, (%r10) + jmp 13f + /* + * Handle data backward by movsq. + */ + .p2align 4 +7: + movq %rdx, %rcx + movq (%rsi), %r11 + movq %rdi, %r10 + leaq -8(%rsi, %rdx), %rsi + leaq -8(%rdi, %rdx), %rdi + shrq $3, %rcx + std + rep movsq + cld + movq %r11, (%r10) + jmp 13f + + /* + * Start to prepare for backward copy. + */ + .p2align 4 +2: + cmp $680, %rdx + jb 6f + cmp %dil, %sil + je 7b +6: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * We gobble 32byts backward in each loop. + */ +8: + subq $0x20, %rdx + movq -1*8(%rsi), %r11 + movq -2*8(%rsi), %r10 + movq -3*8(%rsi), %r9 + movq -4*8(%rsi), %r8 + leaq -4*8(%rsi), %rsi + + movq %r11, -1*8(%rdi) + movq %r10, -2*8(%rdi) + movq %r9, -3*8(%rdi) + movq %r8, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae 8b + /* + * Calculate copy position to head. + */ + addq $0x20, %rdx + subq %rdx, %rsi + subq %rdx, %rdi +1: + cmpq $16, %rdx + jb 9f + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq -2*8(%rsi, %rdx), %r9 + movq -1*8(%rsi, %rdx), %r8 + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, -2*8(%rdi, %rdx) + movq %r8, -1*8(%rdi, %rdx) + jmp 13f + .p2align 4 +9: + cmpq $8, %rdx + jb 10f + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r11 + movq -1*8(%rsi, %rdx), %r10 + movq %r11, 0*8(%rdi) + movq %r10, -1*8(%rdi, %rdx) + jmp 13f +10: + cmpq $4, %rdx + jb 11f + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %r11d + movl -4(%rsi, %rdx), %r10d + movl %r11d, (%rdi) + movl %r10d, -4(%rdi, %rdx) + jmp 13f +11: + cmp $2, %rdx + jb 12f + /* + * Move data from 2 bytes to 3 bytes. + */ + movw (%rsi), %r11w + movw -2(%rsi, %rdx), %r10w + movw %r11w, (%rdi) + movw %r10w, -2(%rdi, %rdx) + jmp 13f +12: + cmp $1, %rdx + jb 13f + /* + * Move data for 1 byte. + */ + movb (%rsi), %r11b + movb %r11b, (%rdi) +13: + retq + CFI_ENDPROC +ENDPROC(memmove) diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c deleted file mode 100644 index 6d0f0ec41b3..00000000000 --- a/arch/x86/lib/memmove_64.c +++ /dev/null @@ -1,192 +0,0 @@ -/* Normally compiler builtins are used, but sometimes the compiler calls out - of line code. Based on asm-i386/string.h. - */ -#define _STRING_C -#include <linux/string.h> -#include <linux/module.h> - -#undef memmove -void *memmove(void *dest, const void *src, size_t count) -{ - unsigned long d0,d1,d2,d3,d4,d5,d6,d7; - char *ret; - - __asm__ __volatile__( - /* Handle more 32bytes in loop */ - "mov %2, %3\n\t" - "cmp $0x20, %0\n\t" - "jb 1f\n\t" - - /* Decide forward/backward copy mode */ - "cmp %2, %1\n\t" - "jb 2f\n\t" - - /* - * movsq instruction have many startup latency - * so we handle small size by general register. - */ - "cmp $680, %0\n\t" - "jb 3f\n\t" - /* - * movsq instruction is only good for aligned case. - */ - "cmpb %%dil, %%sil\n\t" - "je 4f\n\t" - "3:\n\t" - "sub $0x20, %0\n\t" - /* - * We gobble 32byts forward in each loop. - */ - "5:\n\t" - "sub $0x20, %0\n\t" - "movq 0*8(%1), %4\n\t" - "movq 1*8(%1), %5\n\t" - "movq 2*8(%1), %6\n\t" - "movq 3*8(%1), %7\n\t" - "leaq 4*8(%1), %1\n\t" - - "movq %4, 0*8(%2)\n\t" - "movq %5, 1*8(%2)\n\t" - "movq %6, 2*8(%2)\n\t" - "movq %7, 3*8(%2)\n\t" - "leaq 4*8(%2), %2\n\t" - "jae 5b\n\t" - "addq $0x20, %0\n\t" - "jmp 1f\n\t" - /* - * Handle data forward by movsq. - */ - ".p2align 4\n\t" - "4:\n\t" - "movq %0, %8\n\t" - "movq -8(%1, %0), %4\n\t" - "lea -8(%2, %0), %5\n\t" - "shrq $3, %8\n\t" - "rep movsq\n\t" - "movq %4, (%5)\n\t" - "jmp 13f\n\t" - /* - * Handle data backward by movsq. - */ - ".p2align 4\n\t" - "7:\n\t" - "movq %0, %8\n\t" - "movq (%1), %4\n\t" - "movq %2, %5\n\t" - "leaq -8(%1, %0), %1\n\t" - "leaq -8(%2, %0), %2\n\t" - "shrq $3, %8\n\t" - "std\n\t" - "rep movsq\n\t" - "cld\n\t" - "movq %4, (%5)\n\t" - "jmp 13f\n\t" - - /* - * Start to prepare for backward copy. - */ - ".p2align 4\n\t" - "2:\n\t" - "cmp $680, %0\n\t" - "jb 6f \n\t" - "cmp %%dil, %%sil\n\t" - "je 7b \n\t" - "6:\n\t" - /* - * Calculate copy position to tail. - */ - "addq %0, %1\n\t" - "addq %0, %2\n\t" - "subq $0x20, %0\n\t" - /* - * We gobble 32byts backward in each loop. - */ - "8:\n\t" - "subq $0x20, %0\n\t" - "movq -1*8(%1), %4\n\t" - "movq -2*8(%1), %5\n\t" - "movq -3*8(%1), %6\n\t" - "movq -4*8(%1), %7\n\t" - "leaq -4*8(%1), %1\n\t" - - "movq %4, -1*8(%2)\n\t" - "movq %5, -2*8(%2)\n\t" - "movq %6, -3*8(%2)\n\t" - "movq %7, -4*8(%2)\n\t" - "leaq -4*8(%2), %2\n\t" - "jae 8b\n\t" - /* - * Calculate copy position to head. - */ - "addq $0x20, %0\n\t" - "subq %0, %1\n\t" - "subq %0, %2\n\t" - "1:\n\t" - "cmpq $16, %0\n\t" - "jb 9f\n\t" - /* - * Move data from 16 bytes to 31 bytes. - */ - "movq 0*8(%1), %4\n\t" - "movq 1*8(%1), %5\n\t" - "movq -2*8(%1, %0), %6\n\t" - "movq -1*8(%1, %0), %7\n\t" - "movq %4, 0*8(%2)\n\t" - "movq %5, 1*8(%2)\n\t" - "movq %6, -2*8(%2, %0)\n\t" - "movq %7, -1*8(%2, %0)\n\t" - "jmp 13f\n\t" - ".p2align 4\n\t" - "9:\n\t" - "cmpq $8, %0\n\t" - "jb 10f\n\t" - /* - * Move data from 8 bytes to 15 bytes. - */ - "movq 0*8(%1), %4\n\t" - "movq -1*8(%1, %0), %5\n\t" - "movq %4, 0*8(%2)\n\t" - "movq %5, -1*8(%2, %0)\n\t" - "jmp 13f\n\t" - "10:\n\t" - "cmpq $4, %0\n\t" - "jb 11f\n\t" - /* - * Move data from 4 bytes to 7 bytes. - */ - "movl (%1), %4d\n\t" - "movl -4(%1, %0), %5d\n\t" - "movl %4d, (%2)\n\t" - "movl %5d, -4(%2, %0)\n\t" - "jmp 13f\n\t" - "11:\n\t" - "cmp $2, %0\n\t" - "jb 12f\n\t" - /* - * Move data from 2 bytes to 3 bytes. - */ - "movw (%1), %4w\n\t" - "movw -2(%1, %0), %5w\n\t" - "movw %4w, (%2)\n\t" - "movw %5w, -2(%2, %0)\n\t" - "jmp 13f\n\t" - "12:\n\t" - "cmp $1, %0\n\t" - "jb 13f\n\t" - /* - * Move data for 1 byte. - */ - "movb (%1), %4b\n\t" - "movb %4b, (%2)\n\t" - "13:\n\t" - : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) , - "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7) - :"0" (count), - "1" (src), - "2" (dest) - :"memory"); - - return ret; - -} -EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S index 41fcf00e49d..67743977398 100644 --- a/arch/x86/lib/rwsem_64.S +++ b/arch/x86/lib/rwsem_64.S @@ -23,43 +23,50 @@ #include <asm/dwarf2.h> #define save_common_regs \ - pushq %rdi; \ - pushq %rsi; \ - pushq %rcx; \ - pushq %r8; \ - pushq %r9; \ - pushq %r10; \ - pushq %r11 + pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ + pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ + pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ + pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ + pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ + pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ + pushq_cfi %r11; CFI_REL_OFFSET r11, 0 #define restore_common_regs \ - popq %r11; \ - popq %r10; \ - popq %r9; \ - popq %r8; \ - popq %rcx; \ - popq %rsi; \ - popq %rdi + popq_cfi %r11; CFI_RESTORE r11; \ + popq_cfi %r10; CFI_RESTORE r10; \ + popq_cfi %r9; CFI_RESTORE r9; \ + popq_cfi %r8; CFI_RESTORE r8; \ + popq_cfi %rcx; CFI_RESTORE rcx; \ + popq_cfi %rsi; CFI_RESTORE rsi; \ + popq_cfi %rdi; CFI_RESTORE rdi /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) + CFI_STARTPROC save_common_regs - pushq %rdx + pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 movq %rax,%rdi call rwsem_down_read_failed - popq %rdx + popq_cfi %rdx + CFI_RESTORE rdx restore_common_regs ret - ENDPROC(call_rwsem_down_read_failed) + CFI_ENDPROC +ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) + CFI_STARTPROC save_common_regs movq %rax,%rdi call rwsem_down_write_failed restore_common_regs ret - ENDPROC(call_rwsem_down_write_failed) + CFI_ENDPROC +ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) + CFI_STARTPROC decl %edx /* do nothing if still outstanding active readers */ jnz 1f save_common_regs @@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake) call rwsem_wake restore_common_regs 1: ret - ENDPROC(call_rwsem_wake) + CFI_ENDPROC +ENDPROC(call_rwsem_wake) /* Fix up special calling conventions */ ENTRY(call_rwsem_downgrade_wake) + CFI_STARTPROC save_common_regs - pushq %rdx + pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 movq %rax,%rdi call rwsem_downgrade_wake - popq %rdx + popq_cfi %rdx + CFI_RESTORE rdx restore_common_regs ret - ENDPROC(call_rwsem_downgrade_wake) + CFI_ENDPROC +ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S index 648fe474178..06691daa410 100644 --- a/arch/x86/lib/semaphore_32.S +++ b/arch/x86/lib/semaphore_32.S @@ -36,7 +36,7 @@ */ #ifdef CONFIG_SMP ENTRY(__write_lock_failed) - CFI_STARTPROC simple + CFI_STARTPROC FRAME 2: LOCK_PREFIX addl $ RW_LOCK_BIAS,(%eax) @@ -74,29 +74,23 @@ ENTRY(__read_lock_failed) /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 - push %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx,0 call rwsem_down_read_failed - pop %edx - CFI_ADJUST_CFA_OFFSET -4 - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx + popl_cfi %ecx ret CFI_ENDPROC ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 calll rwsem_down_write_failed - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx ret CFI_ENDPROC ENDPROC(call_rwsem_down_write_failed) @@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake) CFI_STARTPROC decw %dx /* do nothing if still outstanding active readers */ jnz 1f - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 call rwsem_wake - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx 1: ret CFI_ENDPROC ENDPROC(call_rwsem_wake) @@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake) /* Fix up special calling conventions */ ENTRY(call_rwsem_downgrade_wake) CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx,0 - push %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx,0 call rwsem_downgrade_wake - pop %edx - CFI_ADJUST_CFA_OFFSET -4 - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx + popl_cfi %ecx ret CFI_ENDPROC ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index 650b11e00ec..2930ae05d77 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S @@ -7,24 +7,6 @@ #include <linux/linkage.h> -#define ARCH_TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#define ARCH_TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - #ifdef CONFIG_TRACE_IRQFLAGS /* put return address in eax (arg1) */ .macro thunk_ra name,func diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index bf9a7d5a542..782b082c9ff 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -22,26 +22,6 @@ CFI_ENDPROC .endm - /* rdi: arg1 ... normal C conventions. rax is passed from C. */ - .macro thunk_retrax name,func - .globl \name -\name: - CFI_STARTPROC - SAVE_ARGS - call \func - jmp restore_norax - CFI_ENDPROC - .endm - - - .section .sched.text, "ax" -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM - thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed - thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed - thunk rwsem_wake_thunk,rwsem_wake - thunk rwsem_downgrade_thunk,rwsem_downgrade_wake -#endif - #ifdef CONFIG_TRACE_IRQFLAGS /* put return address in rdi (arg1) */ .macro thunk_ra name,func @@ -72,10 +52,3 @@ restore: RESTORE_ARGS ret CFI_ENDPROC - - CFI_STARTPROC - SAVE_ARGS -restore_norax: - RESTORE_ARGS 1 - ret - CFI_ENDPROC diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 09df2f9a3d6..3e608edf995 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index f21962c435e..0919c26820d 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -26,9 +26,7 @@ #include <asm/apic.h> #include <asm/amd_nb.h> -static struct bootnode __initdata nodes[8]; static unsigned char __initdata nodeids[8]; -static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; static __init int find_northbridge(void) { @@ -51,7 +49,7 @@ static __init int find_northbridge(void) return num; } - return -1; + return -ENOENT; } static __init void early_get_boot_cpu_id(void) @@ -69,17 +67,18 @@ static __init void early_get_boot_cpu_id(void) #endif } -int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) +int __init amd_numa_init(void) { - unsigned long start = PFN_PHYS(start_pfn); - unsigned long end = PFN_PHYS(end_pfn); + unsigned long start = PFN_PHYS(0); + unsigned long end = PFN_PHYS(max_pfn); unsigned numnodes; unsigned long prevbase; - int i, nb, found = 0; + int i, j, nb; u32 nodeid, reg; + unsigned int bits, cores, apicid_base; if (!early_pci_allowed()) - return -1; + return -EINVAL; nb = find_northbridge(); if (nb < 0) @@ -90,7 +89,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; if (numnodes <= 1) - return -1; + return -ENOENT; pr_info("Number of physical nodes %d\n", numnodes); @@ -121,9 +120,9 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) if ((base >> 8) & 3 || (limit >> 8) & 3) { pr_err("Node %d using interleaving mode %lx/%lx\n", nodeid, (base >> 8) & 3, (limit >> 8) & 3); - return -1; + return -EINVAL; } - if (node_isset(nodeid, nodes_parsed)) { + if (node_isset(nodeid, numa_nodes_parsed)) { pr_info("Node %d already present, skipping\n", nodeid); continue; @@ -160,117 +159,28 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) if (prevbase > base) { pr_err("Node map not sorted %lx,%lx\n", prevbase, base); - return -1; + return -EINVAL; } pr_info("Node %d MemBase %016lx Limit %016lx\n", nodeid, base, limit); - found++; - - nodes[nodeid].start = base; - nodes[nodeid].end = limit; - prevbase = base; - - node_set(nodeid, nodes_parsed); - } - - if (!found) - return -1; - return 0; -} - -#ifdef CONFIG_NUMA_EMU -static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; - -void __init amd_get_nodes(struct bootnode *physnodes) -{ - int i; - - for_each_node_mask(i, nodes_parsed) { - physnodes[i].start = nodes[i].start; - physnodes[i].end = nodes[i].end; + numa_add_memblk(nodeid, base, limit); + node_set(nodeid, numa_nodes_parsed); } -} - -static int __init find_node_by_addr(unsigned long addr) -{ - int ret = NUMA_NO_NODE; - int i; - - for (i = 0; i < 8; i++) - if (addr >= nodes[i].start && addr < nodes[i].end) { - ret = i; - break; - } - return ret; -} -/* - * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be - * setup to represent the physical topology but reflect the emulated - * environment. For each emulated node, the real node which it appears on is - * found and a fake pxm to nid mapping is created which mirrors the actual - * locality. node_distance() then represents the correct distances between - * emulated nodes by using the fake acpi mappings to pxms. - */ -void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) -{ - unsigned int bits; - unsigned int cores; - unsigned int apicid_base = 0; - int i; + if (!nodes_weight(numa_nodes_parsed)) + return -ENOENT; + /* + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the coreid bits from early_identify_cpu. + */ bits = boot_cpu_data.x86_coreid_bits; cores = 1 << bits; - early_get_boot_cpu_id(); - if (boot_cpu_physical_apicid > 0) - apicid_base = boot_cpu_physical_apicid; - - for (i = 0; i < nr_nodes; i++) { - int index; - int nid; - int j; - - nid = find_node_by_addr(nodes[i].start); - if (nid == NUMA_NO_NODE) - continue; - - index = nodeids[nid] << bits; - if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE) - for (j = apicid_base; j < cores + apicid_base; j++) - fake_apicid_to_node[index + j] = i; -#ifdef CONFIG_ACPI_NUMA - __acpi_map_pxm_to_node(nid, i); -#endif - } - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); -} -#endif /* CONFIG_NUMA_EMU */ - -int __init amd_scan_nodes(void) -{ - unsigned int bits; - unsigned int cores; - unsigned int apicid_base; - int i; - - BUG_ON(nodes_empty(nodes_parsed)); - node_possible_map = nodes_parsed; - memnode_shift = compute_hash_shift(nodes, 8, NULL); - if (memnode_shift < 0) { - pr_err("No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - pr_info("Using node hash shift of %d\n", memnode_shift); - - /* use the coreid bits from early_identify_cpu */ - bits = boot_cpu_data.x86_coreid_bits; - cores = (1<<bits); apicid_base = 0; + /* get the APIC ID of the BSP early for systems with apicid lifting */ early_get_boot_cpu_id(); if (boot_cpu_physical_apicid > 0) { @@ -278,17 +188,9 @@ int __init amd_scan_nodes(void) apicid_base = boot_cpu_physical_apicid; } - for_each_node_mask(i, node_possible_map) { - int j; - - memblock_x86_register_active_regions(i, - nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); + for_each_node_mask(i, numa_nodes_parsed) for (j = apicid_base; j < cores + apicid_base; j++) - apicid_to_node[(i << bits) + j] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } + set_apicid_to_node((i << bits) + j, i); - numa_init_array(); return 0; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7d90ceb882a..20e3f8702d1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -229,15 +229,14 @@ void vmalloc_sync_all(void) for (address = VMALLOC_START & PMD_MASK; address >= TASK_SIZE && address < FIXADDR_TOP; address += PMD_SIZE) { - - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; pmd_t *ret; + /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); @@ -247,7 +246,7 @@ void vmalloc_sync_all(void) if (!ret) break; } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } } @@ -828,6 +827,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address); + return; + } + out_of_memory(regs, error_code, address); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 947f42abe82..286d289b039 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -18,9 +18,9 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -unsigned long __initdata e820_table_start; -unsigned long __meminitdata e820_table_end; -unsigned long __meminitdata e820_table_top; +unsigned long __initdata pgt_buf_start; +unsigned long __meminitdata pgt_buf_end; +unsigned long __meminitdata pgt_buf_top; int after_bootmem; @@ -33,7 +33,7 @@ int direct_gbpages static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { - unsigned long puds, pmds, ptes, tables, start; + unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; phys_addr_t base; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; @@ -65,29 +65,20 @@ static void __init find_early_table_space(unsigned long end, int use_pse, #ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. - */ -#ifdef CONFIG_X86_32 - start = 0x7000; -#else - start = 0x8000; + good_end = max_pfn_mapped << PAGE_SHIFT; #endif - base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, - tables, PAGE_SIZE); + + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (base == MEMBLOCK_ERROR) panic("Cannot find space for the kernel page tables"); - e820_table_start = base >> PAGE_SHIFT; - e820_table_end = e820_table_start; - e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); + end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); } struct map_range { @@ -279,30 +270,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, load_cr3(swapper_pg_dir); #endif -#ifdef CONFIG_X86_64 - if (!after_bootmem && !start) { - pud_t *pud; - pmd_t *pmd; - - mmu_cr4_features = read_cr4(); - - /* - * _brk_end cannot change anymore, but it and _end may be - * located on different 2M pages. cleanup_highmap(), however, - * can only consider _end when it runs, so destroy any - * mappings beyond _brk_end here. - */ - pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); - pmd = pmd_offset(pud, _brk_end - 1); - while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) - pmd_clear(pmd); - } -#endif __flush_tlb_all(); - if (!after_bootmem && e820_table_end > e820_table_start) - memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, - e820_table_end << PAGE_SHIFT, "PGTABLE"); + if (!after_bootmem && pgt_buf_end > pgt_buf_start) + memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, + pgt_buf_end << PAGE_SHIFT, "PGTABLE"); if (!after_bootmem) early_memtest(start, end); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c821074b7f0..73ad7ebd6e9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false; static __init void *alloc_low_page(void) { - unsigned long pfn = e820_table_end++; + unsigned long pfn = pgt_buf_end++; void *adr; - if (pfn >= e820_table_top) + if (pfn >= pgt_buf_top) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); @@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start - || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { + && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start + || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { pte_t *newpte; int i; @@ -644,8 +644,7 @@ void __init find_low_pfn_range(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8) +void __init initmem_init(void) { #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 71a59296af8..a08a62cb136 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -105,18 +105,18 @@ void sync_global_pgds(unsigned long start, unsigned long end) for (address = start; address <= end; address += PGDIR_SIZE) { const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; spinlock_t *pgt_lock; pgd = (pgd_t *)page_address(page) + pgd_index(address); + /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); @@ -128,7 +128,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_unlock(pgt_lock); } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } } @@ -314,7 +314,7 @@ void __init cleanup_highmap(void) static __ref void *alloc_low_page(unsigned long *phys) { - unsigned long pfn = e820_table_end++; + unsigned long pfn = pgt_buf_end++; void *adr; if (after_bootmem) { @@ -324,7 +324,7 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= e820_table_top) + if (pfn >= pgt_buf_top) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); @@ -333,12 +333,28 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } +static __ref void *map_low_page(void *virt) +{ + void *adr; + unsigned long phys, left; + + if (after_bootmem) + return virt; + + phys = __pa(virt); + left = phys & (PAGE_SIZE - 1); + adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); + adr = (void *)(((unsigned long)adr) | left); + + return adr; +} + static __ref void unmap_low_page(void *adr) { if (after_bootmem) return; - early_iounmap(adr, PAGE_SIZE); + early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); } static unsigned long __meminit @@ -386,15 +402,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, } static unsigned long __meminit -phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, - pgprot_t prot) -{ - pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); - - return phys_pte_init(pte, address, end, prot); -} - -static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, unsigned long page_size_mask, pgprot_t prot) { @@ -420,8 +427,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); - last_map_addr = phys_pte_update(pmd, address, + pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); + last_map_addr = phys_pte_init(pte, address, end, prot); + unmap_low_page(pte); spin_unlock(&init_mm.page_table_lock); continue; } @@ -468,18 +477,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, } static unsigned long __meminit -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, - unsigned long page_size_mask, pgprot_t prot) -{ - pmd_t *pmd = pmd_offset(pud, 0); - unsigned long last_map_addr; - - last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); - __flush_tlb_all(); - return last_map_addr; -} - -static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, unsigned long page_size_mask) { @@ -504,8 +501,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (pud_val(*pud)) { if (!pud_large(*pud)) { - last_map_addr = phys_pmd_update(pud, addr, end, + pmd = map_low_page(pmd_offset(pud, 0)); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); + unmap_low_page(pmd); + __flush_tlb_all(); continue; } /* @@ -553,17 +553,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, return last_map_addr; } -static unsigned long __meminit -phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, - unsigned long page_size_mask) -{ - pud_t *pud; - - pud = (pud_t *)pgd_page_vaddr(*pgd); - - return phys_pud_init(pud, addr, end, page_size_mask); -} - unsigned long __meminit kernel_physical_mapping_init(unsigned long start, unsigned long end, @@ -587,8 +576,10 @@ kernel_physical_mapping_init(unsigned long start, next = end; if (pgd_val(*pgd)) { - last_map_addr = phys_pud_update(pgd, __pa(start), + pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); + unmap_low_page(pud); continue; } @@ -612,10 +603,9 @@ kernel_physical_mapping_init(unsigned long start, } #ifndef CONFIG_NUMA -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8) +void __init initmem_init(void) { - memblock_x86_register_active_regions(0, start_pfn, end_pfn); + memblock_x86_register_active_regions(0, 0, max_pfn); } #endif diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ebf6d7887a3..9559d360fde 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -26,12 +26,50 @@ static __init int numa_setup(char *opt) early_param("numa", numa_setup); /* - * Which logical CPUs are on which nodes + * apicid, cpu, node mappings */ +s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); /* + * Map cpu index to node index + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + if (node != NUMA_NO_NODE) + set_cpu_numa_node(cpu, node); +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +/* * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * @@ -57,7 +95,174 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } -#ifdef CONFIG_DEBUG_PER_CPU_MAPS +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ +void __init numa_init_array(void) +{ + int rr, i; + + rr = first_node(node_online_map); + for (i = 0; i < nr_cpu_ids; i++) { + if (early_cpu_to_node(i) != NUMA_NO_NODE) + continue; + numa_set_node(i, rr); + rr = next_node(rr, node_online_map); + if (rr == MAX_NUMNODES) + rr = first_node(node_online_map); + } +} + +static __init int find_near_online_node(int node) +{ + int n, val; + int min_val = INT_MAX; + int best_node = -1; + + for_each_online_node(n) { + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + return best_node; +} + +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. + */ +void __init init_cpu_to_node(void) +{ + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { + int node = numa_cpu_node(cpu); + + if (node == NUMA_NO_NODE) + continue; + if (!node_online(node)) + node = find_near_online_node(node); + numa_set_node(cpu, node); + } +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +# ifndef CONFIG_NUMA_EMU +void __cpuinit numa_add_cpu(int cpu) +{ + cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} +# endif /* !CONFIG_NUMA_EMU */ + +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +int __cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(__cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!cpu_possible(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + +struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) +{ + int node = early_cpu_to_node(cpu); + struct cpumask *mask; + char buf[64]; + + if (node == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return NULL; + } + mask = node_to_cpumask_map[node]; + if (!mask) { + pr_err("node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); + return NULL; + } + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", + cpu, node, buf); + return mask; +} + +# ifndef CONFIG_NUMA_EMU +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + struct cpumask *mask; + + mask = debug_cpumask_set_cpu(cpu, enable); + if (!mask) + return; + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} +# endif /* !CONFIG_NUMA_EMU */ + /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ @@ -80,4 +285,5 @@ const struct cpumask *cpumask_of_node(int node) return node_to_cpumask_map[node]; } EXPORT_SYMBOL(cpumask_of_node); -#endif + +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 84a3e4c9f27..bde3906420d 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); static unsigned long kva_start_pfn; static unsigned long kva_pages; + +int __cpuinit numa_cpu_node(int cpu) +{ + return apic->x86_32_numa_cpu_node(cpu); +} + /* * FLAT - support for basic PC memory model with discontig enabled, essentially * a single node with all available processors in it with a flat @@ -346,8 +352,7 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, - int acpi, int k8) +void __init initmem_init(void) { int nid; long kva_target_pfn; @@ -361,6 +366,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, */ get_memcfg_numa(); + numa_init_array(); kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 95ea1551eeb..9ec0f209a6a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -13,31 +13,30 @@ #include <linux/module.h> #include <linux/nodemask.h> #include <linux/sched.h> +#include <linux/acpi.h> #include <asm/e820.h> #include <asm/proto.h> #include <asm/dma.h> -#include <asm/numa.h> #include <asm/acpi.h> #include <asm/amd_nb.h> +#include "numa_internal.h" + struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -struct memnode memnode; +nodemask_t numa_nodes_parsed __initdata; -s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; +struct memnode memnode; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; -/* - * Map cpu index to node index - */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); +static struct numa_meminfo numa_meminfo __initdata; + +static int numa_distance_cnt; +static u8 *numa_distance; /* * Given a shift value, try to populate memnodemap[] @@ -46,16 +45,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); * 0 if memnodmap[] too small (of shift too small) * -1 if node overlap or lost ram (shift too big) */ -static int __init populate_memnodemap(const struct bootnode *nodes, - int numnodes, int shift, int *nodeids) +static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift) { unsigned long addr, end; int i, res = -1; memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); - for (i = 0; i < numnodes; i++) { - addr = nodes[i].start; - end = nodes[i].end; + for (i = 0; i < mi->nr_blks; i++) { + addr = mi->blk[i].start; + end = mi->blk[i].end; if (addr >= end) continue; if ((end >> shift) >= memnodemapsize) @@ -63,12 +61,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes, do { if (memnodemap[addr >> shift] != NUMA_NO_NODE) return -1; - - if (!nodeids) - memnodemap[addr >> shift] = i; - else - memnodemap[addr >> shift] = nodeids[i]; - + memnodemap[addr >> shift] = mi->blk[i].nid; addr += (1UL << shift); } while (addr < end); res = 1; @@ -86,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void) addr = 0x8000; nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); - nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, + nodemap_addr = memblock_find_in_range(addr, get_max_mapped(), nodemap_size, L1_CACHE_BYTES); if (nodemap_addr == MEMBLOCK_ERROR) { printk(KERN_ERR @@ -106,16 +99,15 @@ static int __init allocate_cachealigned_memnodemap(void) * The LSB of all start and end addresses in the node map is the value of the * maximum possible shift. */ -static int __init extract_lsb_from_nodes(const struct bootnode *nodes, - int numnodes) +static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi) { int i, nodes_used = 0; unsigned long start, end; unsigned long bitfield = 0, memtop = 0; - for (i = 0; i < numnodes; i++) { - start = nodes[i].start; - end = nodes[i].end; + for (i = 0; i < mi->nr_blks; i++) { + start = mi->blk[i].start; + end = mi->blk[i].end; if (start >= end) continue; bitfield |= start; @@ -131,18 +123,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes, return i; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes, - int *nodeids) +static int __init compute_hash_shift(const struct numa_meminfo *mi) { int shift; - shift = extract_lsb_from_nodes(nodes, numnodes); + shift = extract_lsb_from_nodes(mi); if (allocate_cachealigned_memnodemap()) return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); - if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { + if (populate_memnodemap(mi, shift) != 1) { printk(KERN_INFO "Your memory is not aligned you need to " "rebuild your kernel with a bigger NODEMAPSIZE " "shift=%d\n", shift); @@ -188,6 +179,63 @@ static void * __init early_node_mem(int nodeid, unsigned long start, return NULL; } +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", + nid, start, end); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) @@ -234,696 +282,386 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) node_set_online(nodeid); } -/* - * There are unfortunately some poorly designed mainboards around that - * only connect memory to a single CPU. This breaks the 1:1 cpu->node - * mapping. To avoid this fill in the mapping for all possible CPUs, - * as the number of CPUs is not known yet. We round robin the existing - * nodes. +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unncessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. */ -void __init numa_init_array(void) +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) { - int rr, i; + const u64 low = 0; + const u64 high = (u64)max_pfn << PAGE_SHIFT; + int i, j, k; - rr = first_node(node_online_map); - for (i = 0; i < nr_cpu_ids; i++) { - if (early_cpu_to_node(i) != NUMA_NO_NODE) - continue; - numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); - } -} - -#ifdef CONFIG_NUMA_EMU -/* Numa emulation */ -static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; -static char *cmdline __initdata; + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; -void __init numa_emu_cmdline(char *str) -{ - cmdline = str; -} + /* make sure all blocks are inside the limits */ + bi->start = max(bi->start, low); + bi->end = min(bi->end, high); -static int __init setup_physnodes(unsigned long start, unsigned long end, - int acpi, int amd) -{ - int ret = 0; - int i; - - memset(physnodes, 0, sizeof(physnodes)); -#ifdef CONFIG_ACPI_NUMA - if (acpi) - acpi_get_nodes(physnodes, start, end); -#endif -#ifdef CONFIG_AMD_NUMA - if (amd) - amd_get_nodes(physnodes); -#endif - /* - * Basic sanity checking on the physical node map: there may be errors - * if the SRAT or AMD code incorrectly reported the topology or the mem= - * kernel parameter is used. - */ - for (i = 0; i < MAX_NUMNODES; i++) { - if (physnodes[i].start == physnodes[i].end) - continue; - if (physnodes[i].start > end) { - physnodes[i].end = physnodes[i].start; - continue; - } - if (physnodes[i].end < start) { - physnodes[i].start = physnodes[i].end; + /* and there's no empty block */ + if (bi->start == bi->end) { + numa_remove_memblk_from(i--, mi); continue; } - if (physnodes[i].start < start) - physnodes[i].start = start; - if (physnodes[i].end > end) - physnodes[i].end = end; - ret++; - } - - /* - * If no physical topology was detected, a single node is faked to cover - * the entire address space. - */ - if (!ret) { - physnodes[ret].start = start; - physnodes[ret].end = end; - ret = 1; - } - return ret; -} - -static void __init fake_physnodes(int acpi, int amd, int nr_nodes) -{ - int i; - - BUG_ON(acpi && amd); -#ifdef CONFIG_ACPI_NUMA - if (acpi) - acpi_fake_nodes(nodes, nr_nodes); -#endif -#ifdef CONFIG_AMD_NUMA - if (amd) - amd_fake_nodes(nodes, nr_nodes); -#endif - if (!acpi && !amd) - for (i = 0; i < nr_cpu_ids; i++) - numa_set_node(i, 0); -} - -/* - * Setups up nid to range from addr to addr + size. If the end - * boundary is greater than max_addr, then max_addr is used instead. - * The return value is 0 if there is additional memory left for - * allocation past addr and -1 otherwise. addr is adjusted to be at - * the end of the node. - */ -static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) -{ - int ret = 0; - nodes[nid].start = *addr; - *addr += size; - if (*addr >= max_addr) { - *addr = max_addr; - ret = -1; - } - nodes[nid].end = *addr; - node_set(nid, node_possible_map); - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, - nodes[nid].start, nodes[nid].end, - (nodes[nid].end - nodes[nid].start) >> 20); - return ret; -} - -/* - * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. The return value is the number of nodes allocated. - */ -static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) -{ - nodemask_t physnode_mask = NODE_MASK_NONE; - u64 size; - int big; - int ret = 0; - int i; - - if (nr_nodes <= 0) - return -1; - if (nr_nodes > MAX_NUMNODES) { - pr_info("numa=fake=%d too large, reducing to %d\n", - nr_nodes, MAX_NUMNODES); - nr_nodes = MAX_NUMNODES; - } - - size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the remainder. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / - FAKE_NODE_MIN_SIZE; - - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - pr_err("Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - for (i = 0; i < MAX_NUMNODES; i++) - if (physnodes[i].start != physnodes[i].end) - node_set(i, physnode_mask); - - /* - * Continue to fill physical nodes with fake nodes until there is no - * memory left on any of them. - */ - while (nodes_weight(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 end = physnodes[i].start + size; - u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); - - if (ret < big) - end += FAKE_NODE_MIN_SIZE; + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + unsigned long start, end; /* - * Continue to add memory to this fake node if its - * non-reserved memory is less than the per-node size. + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. */ - while (end - physnodes[i].start - - memblock_x86_hole_size(physnodes[i].start, end) < size) { - end += FAKE_NODE_MIN_SIZE; - if (end > physnodes[i].end) { - end = physnodes[i].end; - break; + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->nid, bj->start, bj->end); + return -EINVAL; } + pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->start, bj->end); } /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. */ - if (end < dma32_end && dma32_end - end - - memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end = dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. - */ - if (physnodes[i].end - end - - memblock_x86_hole_size(end, physnodes[i].end) < size) - end = physnodes[i].end; - - /* - * Avoid allocating more nodes than requested, which can - * happen as a result of rounding down each node's size - * to FAKE_NODE_MIN_SIZE. - */ - if (nodes_weight(physnode_mask) + ret >= nr_nodes) - end = physnodes[i].end; - - if (setup_node_range(ret++, &physnodes[i].start, - end - physnodes[i].start, - physnodes[i].end) < 0) - node_clear(i, physnode_mask); + if (bi->nid != bj->nid) + continue; + start = max(min(bi->start, bj->start), low); + end = min(max(bi->end, bj->end), high); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", + bi->nid, bi->start, bi->end, bj->start, bj->end, + start, end); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); } } - return ret; -} -/* - * Returns the end address of a node so that there is at least `size' amount of - * non-reserved memory or `max_addr' is reached. - */ -static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) -{ - u64 end = start + size; - - while (end - start - memblock_x86_hole_size(start, end) < size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; } - return end; + + return 0; } /* - * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. The return value is the number of nodes allocated. + * Set nodes, which have memory in @mi, in *@nodemask. */ -static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) { - nodemask_t physnode_mask = NODE_MASK_NONE; - u64 min_size; - int ret = 0; int i; - if (!size) - return -1; - /* - * The limit on emulated nodes is MAX_NUMNODES, so the size per node is - * increased accordingly if the requested size is too small. This - * creates a uniform distribution of node sizes across the entire - * machine (but not necessarily over physical nodes). - */ - min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / - MAX_NUMNODES; - min_size = max(min_size, FAKE_NODE_MIN_SIZE); - if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) - min_size = (min_size + FAKE_NODE_MIN_SIZE) & - FAKE_NODE_MIN_HASH_MASK; - if (size < min_size) { - pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", - size >> 20, min_size >> 20); - size = min_size; - } - size &= FAKE_NODE_MIN_HASH_MASK; - - for (i = 0; i < MAX_NUMNODES; i++) - if (physnodes[i].start != physnodes[i].end) - node_set(i, physnode_mask); - /* - * Fill physical nodes with fake nodes of size until there is no memory - * left on any of them. - */ - while (nodes_weight(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; - u64 end; - - end = find_end_of_node(physnodes[i].start, - physnodes[i].end, size); - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. - */ - if (end < dma32_end && dma32_end - end - - memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end = dma32_end; + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. - */ - if (physnodes[i].end - end - - memblock_x86_hole_size(end, physnodes[i].end) < size) - end = physnodes[i].end; +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - /* - * Setup the fake node that will be allocated as bootmem - * later. If setup_node_range() returns non-zero, there - * is no more memory available on this physical node. - */ - if (setup_node_range(ret++, &physnodes[i].start, - end - physnodes[i].start, - physnodes[i].end) < 0) - node_clear(i, physnode_mask); - } - } - return ret; + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_x86_free_range(__pa(numa_distance), + __pa(numa_distance) + size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ } -/* - * Sets up the system RAM area from start_pfn to last_pfn according to the - * numa=fake command-line option. - */ -static int __init numa_emulation(unsigned long start_pfn, - unsigned long last_pfn, int acpi, int amd) +static int __init numa_alloc_distance(void) { - u64 addr = start_pfn << PAGE_SHIFT; - u64 max_addr = last_pfn << PAGE_SHIFT; - int num_nodes; - int i; + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; - /* - * If the numa=fake command-line contains a 'M' or 'G', it represents - * the fixed node size. Otherwise, if it is just a single number N, - * split the system RAM into N fake nodes. - */ - if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { - u64 size; + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); - size = memparse(cmdline, &cmdline); - num_nodes = split_nodes_size_interleave(addr, max_addr, size); - } else { - unsigned long n; + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); - n = simple_strtoul(cmdline, NULL, 0); - num_nodes = split_nodes_interleave(addr, max_addr, n); + phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; } + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); - if (num_nodes < 0) - return num_nodes; - memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. NUMA emulation " - "disabled.\n"); - return -1; - } + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); - /* - * We need to vacate all active ranges that may have been registered for - * the e820 memory map. - */ - remove_all_active_ranges(); - for_each_node_mask(i, node_possible_map) { - memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } - setup_physnodes(addr, max_addr, acpi, amd); - fake_physnodes(acpi, amd, num_nodes); - numa_init_array(); return 0; } -#endif /* CONFIG_NUMA_EMU */ -void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, - int acpi, int amd) +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accomodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node at the time of + * table creation or @distance doesn't make sense, the call is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) { - int i; - - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - -#ifdef CONFIG_NUMA_EMU - setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, - acpi, amd); - if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) + if (!numa_distance && numa_alloc_distance() < 0) return; - setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, - acpi, amd); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); -#endif -#ifdef CONFIG_ACPI_NUMA - if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - last_pfn << PAGE_SHIFT)) + if (from >= numa_distance_cnt || to >= numa_distance_cnt) { + printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + from, to, distance); return; - nodes_clear(node_possible_map); - nodes_clear(node_online_map); -#endif + } -#ifdef CONFIG_AMD_NUMA - if (!numa_off && amd && !amd_scan_nodes()) + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); return; - nodes_clear(node_possible_map); - nodes_clear(node_online_map); -#endif - printk(KERN_INFO "%s\n", - numa_off ? "NUMA turned off" : "No NUMA configuration found"); + } - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", - start_pfn << PAGE_SHIFT, - last_pfn << PAGE_SHIFT); - /* setup dummy node covering all memory */ - memnode_shift = 63; - memnodemap = memnode.embedded_map; - memnodemap[0] = 0; - node_set_online(0); - node_set(0, node_possible_map); - for (i = 0; i < nr_cpu_ids; i++) - numa_set_node(i, 0); - memblock_x86_register_active_regions(0, start_pfn, last_pfn); - setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); + numa_distance[from * numa_distance_cnt + to] = distance; } -unsigned long __init numa_free_all_bootmem(void) +int __node_distance(int from, int to) { - unsigned long pages = 0; - int i; + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); - for_each_online_node(i) - pages += free_all_bootmem_node(NODE_DATA(i)); +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + unsigned long numaram, e820ram; + int i; - pages += free_all_memory_core_early(MAX_NUMNODES); + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + unsigned long s = mi->blk[i].start >> PAGE_SHIFT; + unsigned long e = mi->blk[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((long)numaram < 0) + numaram = 0; + } - return pages; + e820ram = max_pfn - (memblock_x86_hole_size(0, + max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return false; + } + return true; } -#ifdef CONFIG_NUMA - -static __init int find_near_online_node(int node) +static int __init numa_register_memblks(struct numa_meminfo *mi) { - int n, val; - int min_val = INT_MAX; - int best_node = -1; + int i, nid; - for_each_online_node(n) { - val = node_distance(node, n); + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + memnode_shift = compute_hash_shift(mi); + if (memnode_shift < 0) { + printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); + return -EINVAL; + } - if (val < min_val) { - min_val = val; - best_node = n; + for (i = 0; i < mi->nr_blks; i++) + memblock_x86_register_active_regions(mi->blk[i].nid, + mi->blk[i].start >> PAGE_SHIFT, + mi->blk[i].end >> PAGE_SHIFT); + + /* for out of order entries */ + sort_node_map(); + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes. */ + for_each_node_mask(nid, node_possible_map) { + u64 start = (u64)max_pfn << PAGE_SHIFT; + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); } + + if (start < end) + setup_node_bootmem(nid, start, end); } - return best_node; + return 0; } -/* - * Setup early cpu_to_node. +/** + * dummy_numma_init - Fallback dummy NUMA init * - * Populate cpu_to_node[] only if x86_cpu_to_apicid[], - * and apicid_to_node[] tables have valid entries for a CPU. - * This means we skip cpu_to_node[] initialisation for NUMA - * emulation and faking node case (when running a kernel compiled - * for NUMA on a non NUMA box), which is OK as cpu_to_node[] - * is already initialized in a round robin manner at numa_init_array, - * prior to this call, and this initialization is good enough - * for the fake NUMA cases. + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. * - * Called before the per_cpu areas are setup. + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. */ -void __init init_cpu_to_node(void) +static int __init dummy_numa_init(void) { - int cpu; - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - - BUG_ON(cpu_to_apicid == NULL); + printk(KERN_INFO "%s\n", + numa_off ? "NUMA turned off" : "No NUMA configuration found"); + printk(KERN_INFO "Faking a node at %016lx-%016lx\n", + 0LU, max_pfn << PAGE_SHIFT); - for_each_possible_cpu(cpu) { - int node; - u16 apicid = cpu_to_apicid[cpu]; + node_set(0, numa_nodes_parsed); + numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); - if (apicid == BAD_APICID) - continue; - node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE) - continue; - if (!node_online(node)) - node = find_near_online_node(node); - numa_set_node(cpu, node); - } + return 0; } -#endif - -void __cpuinit numa_set_node(int cpu, int node) +static int __init numa_init(int (*init_func)(void)) { - int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - - /* early setting, no percpu area yet */ - if (cpu_to_node_map) { - cpu_to_node_map[cpu] = node; - return; - } - -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { - printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); - dump_stack(); - return; - } -#endif - per_cpu(x86_cpu_to_node_map, cpu) = node; + int i; + int ret; - if (node != NUMA_NO_NODE) - set_cpu_numa_node(cpu, node); -} + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); -void __cpuinit numa_clear_node(int cpu) -{ - numa_set_node(cpu, NUMA_NO_NODE); -} + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + remove_all_active_ranges(); + numa_reset_distance(); -#ifndef CONFIG_DEBUG_PER_CPU_MAPS + ret = init_func(); + if (ret < 0) + return ret; + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; -#ifndef CONFIG_NUMA_EMU -void __cpuinit numa_add_cpu(int cpu) -{ - cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} + numa_emulation(&numa_meminfo, numa_distance_cnt); -void __cpuinit numa_remove_cpu(int cpu) -{ - cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} -#else -void __cpuinit numa_add_cpu(int cpu) -{ - unsigned long addr; - u16 apicid; - int physnid; - int nid = NUMA_NO_NODE; + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; - apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - if (apicid != BAD_APICID) - nid = apicid_to_node[apicid]; - if (nid == NUMA_NO_NODE) - nid = early_cpu_to_node(cpu); - BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); - - /* - * Use the starting address of the emulated node to find which physical - * node it is allocated on. - */ - addr = node_start_pfn(nid) << PAGE_SHIFT; - for (physnid = 0; physnid < MAX_NUMNODES; physnid++) - if (addr >= physnodes[physnid].start && - addr < physnodes[physnid].end) - break; + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); - /* - * Map the cpu to each emulated node that is allocated on the physical - * node of the cpu's apic id. - */ - for_each_online_node(nid) { - addr = node_start_pfn(nid) << PAGE_SHIFT; - if (addr >= physnodes[physnid].start && - addr < physnodes[physnid].end) - cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(i); } + numa_init_array(); + return 0; } -void __cpuinit numa_remove_cpu(int cpu) +void __init initmem_init(void) { - int i; + int ret; - for_each_online_node(i) - cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); -} -#endif /* !CONFIG_NUMA_EMU */ - -#else /* CONFIG_DEBUG_PER_CPU_MAPS */ -static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) -{ - int node = early_cpu_to_node(cpu); - struct cpumask *mask; - char buf[64]; - - mask = node_to_cpumask_map[node]; - if (!mask) { - pr_err("node_to_cpumask_map[%i] NULL\n", node); - dump_stack(); - return NULL; + if (!numa_off) { +#ifdef CONFIG_ACPI_NUMA + ret = numa_init(x86_acpi_numa_init); + if (!ret) + return; +#endif +#ifdef CONFIG_AMD_NUMA + ret = numa_init(amd_numa_init); + if (!ret) + return; +#endif } - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? "numa_add_cpu" : "numa_remove_cpu", - cpu, node, buf); - return mask; + numa_init(dummy_numa_init); } -/* - * --------- debug versions of the numa functions --------- - */ -#ifndef CONFIG_NUMA_EMU -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - struct cpumask *mask; - - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); -} -#else -static void __cpuinit numa_set_cpumask(int cpu, int enable) +unsigned long __init numa_free_all_bootmem(void) { - int node = early_cpu_to_node(cpu); - struct cpumask *mask; + unsigned long pages = 0; int i; - for_each_online_node(i) { - unsigned long addr; - - addr = node_start_pfn(i) << PAGE_SHIFT; - if (addr < physnodes[node].start || - addr >= physnodes[node].end) - continue; - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); - } -} -#endif /* CONFIG_NUMA_EMU */ + for_each_online_node(i) + pages += free_all_bootmem_node(NODE_DATA(i)); -void __cpuinit numa_add_cpu(int cpu) -{ - numa_set_cpumask(cpu, 1); -} + pages += free_all_memory_core_early(MAX_NUMNODES); -void __cpuinit numa_remove_cpu(int cpu) -{ - numa_set_cpumask(cpu, 0); + return pages; } -int __cpu_to_node(int cpu) +int __cpuinit numa_cpu_node(int cpu) { - if (early_per_cpu_ptr(x86_cpu_to_node_map)) { - printk(KERN_WARNING - "cpu_to_node(%d): usage too early!\n", cpu); - dump_stack(); - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} -EXPORT_SYMBOL(__cpu_to_node); + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); -/* - * Same function as cpu_to_node() but used if called before the - * per_cpu areas are setup. - */ -int early_cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - - if (!cpu_possible(cpu)) { - printk(KERN_WARNING - "early_cpu_to_node(%d): no per_cpu area!\n", cpu); - dump_stack(); - return NUMA_NO_NODE; - } - return per_cpu(x86_cpu_to_node_map, cpu); + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; } - -/* - * --------- end of debug versions of the numa functions --------- - */ - -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c new file mode 100644 index 00000000000..ad091e4cff1 --- /dev/null +++ b/arch/x86/mm/numa_emulation.c @@ -0,0 +1,494 @@ +/* + * NUMA emulation + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/topology.h> +#include <linux/memblock.h> +#include <asm/dma.h> + +#include "numa_internal.h" + +static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; +static char *emu_cmdline __initdata; + +void __init numa_emu_cmdline(char *str) +{ + emu_cmdline = str; +} + +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].nid == nid) + return i; + return -ENOENT; +} + +/* + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. + */ +static int __init emu_setup_memblk(struct numa_meminfo *ei, + struct numa_meminfo *pi, + int nid, int phys_blk, u64 size) +{ + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + struct numa_memblk *pb = &pi->blk[phys_blk]; + + if (ei->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; + } + + ei->nr_blks++; + eb->start = pb->start; + eb->end = pb->start + size; + eb->nid = nid; + + if (emu_nid_to_phys[nid] == NUMA_NO_NODE) + emu_nid_to_phys[nid] = pb->nid; + + pb->start += size; + if (pb->start >= pb->end) { + WARN_ON_ONCE(pb->start > pb->end); + numa_remove_memblk_from(phys_blk, pi); + } + + printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, + eb->start, eb->end, (eb->end - eb->start) >> 20); + return 0; +} + +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. The return value is the number of nodes allocated. + */ +static int __init split_nodes_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, int nr_nodes) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 size; + int big; + int nid = 0; + int i, ret; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + for (i = 0; i < pi->nr_blks; i++) + node_set(pi->blk[i].nid, physnode_mask); + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + end = start + size; + + if (nid < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - start - + memblock_x86_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > limit) { + end = limit; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - + memblock_x86_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - memblock_x86_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. The return value is the number of nodes allocated. + */ +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 min_size; + int nid = 0; + int i, ret; + + if (!size) + return -1; + /* + * The limit on emulated nodes is MAX_NUMNODES, so the size per node is + * increased accordingly if the requested size is too small. This + * creates a uniform distribution of node sizes across the entire + * machine (but not necessarily over physical nodes). + */ + min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / + MAX_NUMNODES; + min_size = max(min_size, FAKE_NODE_MIN_SIZE); + if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) + min_size = (min_size + FAKE_NODE_MIN_SIZE) & + FAKE_NODE_MIN_HASH_MASK; + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size &= FAKE_NODE_MIN_HASH_MASK; + + for (i = 0; i < pi->nr_blks; i++) + node_set(pi->blk[i].nid, physnode_mask); + + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + + end = find_end_of_node(start, limit, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - + memblock_x86_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/** + * numa_emulation - Emulate NUMA nodes + * @numa_meminfo: NUMA configuration to massage + * @numa_dist_cnt: The size of the physical NUMA distance table + * + * Emulate NUMA nodes according to the numa=fake kernel parameter. + * @numa_meminfo contains the physical memory configuration and is modified + * to reflect the emulated configuration on success. @numa_dist_cnt is + * used to determine the size of the physical distance table. + * + * On success, the following modifications are made. + * + * - @numa_meminfo is updated to reflect the emulated nodes. + * + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the + * emulated nodes. + * + * - NUMA distance table is rebuilt to represent distances between emulated + * nodes. The distances are determined considering how emulated nodes + * are mapped to physical nodes and match the actual distances. + * + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical + * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). + * + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with + * identity mapping and no other modification is made. + */ +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) +{ + static struct numa_meminfo ei __initdata; + static struct numa_meminfo pi __initdata; + const u64 max_addr = max_pfn << PAGE_SHIFT; + u8 *phys_dist = NULL; + size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); + int max_emu_nid, dfl_phys_nid; + int i, j, ret; + + if (!emu_cmdline) + goto no_emu; + + memset(&ei, 0, sizeof(ei)); + pi = *numa_meminfo; + + for (i = 0; i < MAX_NUMNODES; i++) + emu_nid_to_phys[i] = NUMA_NO_NODE; + + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. + */ + if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + u64 size; + + size = memparse(emu_cmdline, &emu_cmdline); + ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); + } else { + unsigned long n; + + n = simple_strtoul(emu_cmdline, NULL, 0); + ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); + } + + if (ret < 0) + goto no_emu; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + goto no_emu; + } + + /* copy the physical distance table */ + if (numa_dist_cnt) { + u64 phys; + + phys = memblock_find_in_range(0, + (u64)max_pfn_mapped << PAGE_SHIFT, + phys_size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + goto no_emu; + } + memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST"); + phys_dist = __va(phys); + + for (i = 0; i < numa_dist_cnt; i++) + for (j = 0; j < numa_dist_cnt; j++) + phys_dist[i * numa_dist_cnt + j] = + node_distance(i, j); + } + + /* + * Determine the max emulated nid and the default phys nid to use + * for unmapped nodes. + */ + max_emu_nid = 0; + dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (dfl_phys_nid == NUMA_NO_NODE) + dfl_phys_nid = emu_nid_to_phys[i]; + } + } + if (dfl_phys_nid == NUMA_NO_NODE) { + pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); + goto no_emu; + } + + /* commit */ + *numa_meminfo = ei; + + /* + * Transform __apicid_to_node table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. + */ + for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { + if (__apicid_to_node[i] == NUMA_NO_NODE) + continue; + for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + if (__apicid_to_node[i] == emu_nid_to_phys[j]) + break; + __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; + } + + /* make sure all emulated nodes are mapped to a physical node */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + if (emu_nid_to_phys[i] == NUMA_NO_NODE) + emu_nid_to_phys[i] = dfl_phys_nid; + + /* transform distance table */ + numa_reset_distance(); + for (i = 0; i < max_emu_nid + 1; i++) { + for (j = 0; j < max_emu_nid + 1; j++) { + int physi = emu_nid_to_phys[i]; + int physj = emu_nid_to_phys[j]; + int dist; + + if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + dist = physi == physj ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + else + dist = phys_dist[physi * numa_dist_cnt + physj]; + + numa_set_distance(i, j, dist); + } + } + + /* free the copied physical distance table */ + if (phys_dist) + memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size); + return; + +no_emu: + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + emu_nid_to_phys[i] = i; +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS +void __cpuinit numa_add_cpu(int cpu) +{ + int physnid, nid; + + nid = early_cpu_to_node(cpu); + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + + physnid = emu_nid_to_phys[nid]; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. + */ + for_each_online_node(nid) + if (emu_nid_to_phys[nid] == physnid) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + struct cpumask *mask; + int nid, physnid, i; + + nid = early_cpu_to_node(cpu); + if (nid == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + + physnid = emu_nid_to_phys[nid]; + + for_each_online_node(i) { + if (emu_nid_to_phys[nid] != physnid) + continue; + + mask = debug_cpumask_set_cpu(cpu, enable); + if (!mask) + return; + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + } +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h new file mode 100644 index 00000000000..ef2d97377d7 --- /dev/null +++ b/arch/x86/mm/numa_internal.h @@ -0,0 +1,31 @@ +#ifndef __X86_MM_NUMA_INTERNAL_H +#define __X86_MM_NUMA_INTERNAL_H + +#include <linux/types.h> +#include <asm/numa.h> + +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +void __init numa_reset_distance(void); + +#ifdef CONFIG_NUMA_EMU +void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +#endif + +#endif /* __X86_MM_NUMA_INTERNAL_H */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index d343b3c81f3..90825f2eb0f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -57,12 +57,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM]; void update_page_count(int level, unsigned long pages) { - unsigned long flags; - /* Protect against CPA */ - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); direct_pages_count[level] += pages; - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } static void split_page_count(int level) @@ -394,7 +392,7 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; @@ -403,7 +401,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, if (cpa->force_split) return 1; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up already: @@ -498,14 +496,14 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, } out_unlock: - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return do_split; } static int split_large_page(pte_t *kpte, unsigned long address) { - unsigned long flags, pfn, pfninc = 1; + unsigned long pfn, pfninc = 1; unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; @@ -519,7 +517,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) if (!base) return -ENOMEM; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already: @@ -591,7 +589,7 @@ out_unlock: */ if (base) __free_page(base); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return 0; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 500242d3c96..0113d19c8aa 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -121,14 +121,12 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) static void pgd_dtor(pgd_t *pgd) { - unsigned long flags; /* can be called from interrupt context */ - if (SHARED_KERNEL_PMD) return; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } /* @@ -260,7 +258,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *pmds[PREALLOCATED_PMDS]; - unsigned long flags; pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); @@ -280,12 +277,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return pgd; diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index ae96e7b8051..48651c6f657 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -57,7 +57,7 @@ struct node_memory_chunk_s { static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; static int __initdata num_memory_chunks; /* total number of memory chunks */ -static u8 __initdata apicid_to_pxm[MAX_APICID]; +static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC]; int acpi_numa __initdata; @@ -254,8 +254,8 @@ int __init get_memcfg_from_srat(void) printk(KERN_DEBUG "Number of memory chunks in system = %d\n", num_memory_chunks); - for (i = 0; i < MAX_APICID; i++) - apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i])); for (j = 0; j < num_memory_chunks; j++){ struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 603d285d1da..8e9d3394f6d 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -26,88 +26,34 @@ int acpi_numa __initdata; -static struct acpi_table_slit *acpi_slit; - -static nodemask_t nodes_parsed __initdata; -static nodemask_t cpu_nodes_parsed __initdata; -static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode nodes_add[MAX_NUMNODES]; -static int num_node_memblks __initdata; -static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; -static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); } -static __init int conflicting_memblks(unsigned long start, unsigned long end) -{ - int i; - for (i = 0; i < num_node_memblks; i++) { - struct bootnode *nd = &node_memblk_range[i]; - if (nd->start == nd->end) - continue; - if (nd->end > start && nd->start < end) - return memblk_nodeid[i]; - if (nd->end == end && nd->start == start) - return memblk_nodeid[i]; - } - return -1; -} - -static __init void cutoff_node(int i, unsigned long start, unsigned long end) -{ - struct bootnode *nd = &nodes[i]; - - if (nd->start < start) { - nd->start = start; - if (nd->end < nd->start) - nd->start = nd->end; - } - if (nd->end > end) { - nd->end = end; - if (nd->start > nd->end) - nd->start = nd->end; - } -} - static __init void bad_srat(void) { - int i; printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; - for (i = 0; i < MAX_LOCAL_APIC; i++) - apicid_to_node[i] = NUMA_NO_NODE; - for (i = 0; i < MAX_NUMNODES; i++) { - nodes[i].start = nodes[i].end = 0; - nodes_add[i].start = nodes_add[i].end = 0; - } - remove_all_active_ranges(); + memset(nodes_add, 0, sizeof(nodes_add)); } static __init inline int srat_disabled(void) { - return numa_off || acpi_numa < 0; + return acpi_numa < 0; } /* Callback for SLIT parsing */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { - unsigned length; - unsigned long phys; - - length = slit->header.length; - phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length, - PAGE_SIZE); - - if (phys == MEMBLOCK_ERROR) - panic(" Can not save slit!\n"); + int i, j; - acpi_slit = __va(phys); - memcpy(acpi_slit, slit, length); - memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); + for (i = 0; i < slit->locality_count; i++) + for (j = 0; j < slit->locality_count; j++) + numa_set_distance(pxm_to_node(i), pxm_to_node(j), + slit->entry[slit->locality_count * i + j]); } /* Callback for Proximity Domain -> x2APIC mapping */ @@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); return; } - apicid_to_node[apic_id] = node; - node_set(node, cpu_nodes_parsed); + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); @@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) return; } - apicid_to_node[apic_id] = node; - node_set(node, cpu_nodes_parsed); + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); @@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end) } if (changed) { - node_set(node, cpu_nodes_parsed); + node_set(node, numa_nodes_parsed); printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); } @@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end) void __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { - struct bootnode *nd, oldnode; unsigned long start, end; int node, pxm; - int i; if (srat_disabled()) return; @@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) bad_srat(); return; } - i = conflicting_memblks(start, end); - if (i == node) { - printk(KERN_WARNING - "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", - pxm, start, end, nodes[i].start, nodes[i].end); - } else if (i >= 0) { - printk(KERN_ERR - "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", - pxm, start, end, node_to_pxm(i), - nodes[i].start, nodes[i].end); + + if (numa_add_memblk(node, start, end) < 0) { bad_srat(); return; } - nd = &nodes[node]; - oldnode = *nd; - if (!node_test_and_set(node, nodes_parsed)) { - nd->start = start; - nd->end = end; - } else { - if (start < nd->start) - nd->start = start; - if (nd->end < end) - nd->end = end; - } printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, start, end); - if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { + if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) update_nodes_add(node, start, end); - /* restore nodes[node] */ - *nd = oldnode; - if ((nd->start | nd->end) == 0) - node_clear(node, nodes_parsed); - } - - node_memblk_range[num_node_memblks].start = start; - node_memblk_range[num_node_memblks].end = end; - memblk_nodeid[num_node_memblks] = node; - num_node_memblks++; -} - -/* Sanity check to catch more bad SRATs (they are amazingly common). - Make sure the PXMs cover all memory. */ -static int __init nodes_cover_memory(const struct bootnode *nodes) -{ - int i; - unsigned long pxmram, e820ram; - - pxmram = 0; - for_each_node_mask(i, nodes_parsed) { - unsigned long s = nodes[i].start >> PAGE_SHIFT; - unsigned long e = nodes[i].end >> PAGE_SHIFT; - pxmram += e - s; - pxmram -= __absent_pages_in_range(i, s, e); - if ((long)pxmram < 0) - pxmram = 0; - } - - e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { - printk(KERN_ERR - "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", - (pxmram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return 0; - } - return 1; } void __init acpi_numa_arch_fixup(void) {} -#ifdef CONFIG_NUMA_EMU -void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, - unsigned long end) -{ - int i; - - for_each_node_mask(i, nodes_parsed) { - cutoff_node(i, start, end); - physnodes[i].start = nodes[i].start; - physnodes[i].end = nodes[i].end; - } -} -#endif /* CONFIG_NUMA_EMU */ - -/* Use the information discovered above to actually set up the nodes. */ -int __init acpi_scan_nodes(unsigned long start, unsigned long end) +int __init x86_acpi_numa_init(void) { - int i; - - if (acpi_numa <= 0) - return -1; - - /* First clean up the node list */ - for (i = 0; i < MAX_NUMNODES; i++) - cutoff_node(i, start, end); - - /* - * Join together blocks on the same node, holes between - * which don't overlap with memory on other nodes. - */ - for (i = 0; i < num_node_memblks; ++i) { - int j, k; - - for (j = i + 1; j < num_node_memblks; ++j) { - unsigned long start, end; - - if (memblk_nodeid[i] != memblk_nodeid[j]) - continue; - start = min(node_memblk_range[i].end, - node_memblk_range[j].end); - end = max(node_memblk_range[i].start, - node_memblk_range[j].start); - for (k = 0; k < num_node_memblks; ++k) { - if (memblk_nodeid[i] == memblk_nodeid[k]) - continue; - if (start < node_memblk_range[k].end && - end > node_memblk_range[k].start) - break; - } - if (k < num_node_memblks) - continue; - start = min(node_memblk_range[i].start, - node_memblk_range[j].start); - end = max(node_memblk_range[i].end, - node_memblk_range[j].end); - printk(KERN_INFO "SRAT: Node %d " - "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", - memblk_nodeid[i], - node_memblk_range[i].start, - node_memblk_range[i].end, - node_memblk_range[j].start, - node_memblk_range[j].end, - start, end); - node_memblk_range[i].start = start; - node_memblk_range[i].end = end; - k = --num_node_memblks - j; - memmove(memblk_nodeid + j, memblk_nodeid + j+1, - k * sizeof(*memblk_nodeid)); - memmove(node_memblk_range + j, node_memblk_range + j+1, - k * sizeof(*node_memblk_range)); - --j; - } - } - - memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, - memblk_nodeid); - if (memnode_shift < 0) { - printk(KERN_ERR - "SRAT: No NUMA node hash function found. Contact maintainer\n"); - bad_srat(); - return -1; - } - - for (i = 0; i < num_node_memblks; i++) - memblock_x86_register_active_regions(memblk_nodeid[i], - node_memblk_range[i].start >> PAGE_SHIFT, - node_memblk_range[i].end >> PAGE_SHIFT); - - /* for out of order entries in SRAT */ - sort_node_map(); - if (!nodes_cover_memory(nodes)) { - bad_srat(); - return -1; - } + int ret; - /* Account for nodes with cpus and no memory */ - nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); - - /* Finally register nodes */ - for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - /* Try again in case setup_node_bootmem missed one due - to missing bootmem */ - for_each_node_mask(i, node_possible_map) - if (!node_online(i)) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - - for (i = 0; i < nr_cpu_ids; i++) { - int node = early_cpu_to_node(i); - - if (node == NUMA_NO_NODE) - continue; - if (!node_online(node)) - numa_clear_node(i); - } - numa_init_array(); - return 0; -} - -#ifdef CONFIG_NUMA_EMU -static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { - [0 ... MAX_NUMNODES-1] = PXM_INVAL -}; -static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; -static int __init find_node_by_addr(unsigned long addr) -{ - int ret = NUMA_NO_NODE; - int i; - - for_each_node_mask(i, nodes_parsed) { - /* - * Find the real node that this emulated node appears on. For - * the sake of simplicity, we only use a real node's starting - * address to determine which emulated node it appears on. - */ - if (addr >= nodes[i].start && addr < nodes[i].end) { - ret = i; - break; - } - } - return ret; + ret = acpi_numa_init(); + if (ret < 0) + return ret; + return srat_disabled() ? -EINVAL : 0; } -/* - * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID - * mappings that respect the real ACPI topology but reflect our emulated - * environment. For each emulated node, we find which real node it appears on - * and create PXM to NID mappings for those fake nodes which mirror that - * locality. SLIT will now represent the correct distances between emulated - * nodes as a result of the real topology. - */ -void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) -{ - int i, j; - - for (i = 0; i < num_nodes; i++) { - int nid, pxm; - - nid = find_node_by_addr(fake_nodes[i].start); - if (nid == NUMA_NO_NODE) - continue; - pxm = node_to_pxm(nid); - if (pxm == PXM_INVAL) - continue; - fake_node_to_pxm_map[i] = pxm; - /* - * For each apicid_to_node mapping that exists for this real - * node, it must now point to the fake node ID. - */ - for (j = 0; j < MAX_LOCAL_APIC; j++) - if (apicid_to_node[j] == nid && - fake_apicid_to_node[j] == NUMA_NO_NODE) - fake_apicid_to_node[j] = i; - } - - /* - * If there are apicid-to-node mappings for physical nodes that do not - * have a corresponding emulated node, it should default to a guaranteed - * value. - */ - for (i = 0; i < MAX_LOCAL_APIC; i++) - if (apicid_to_node[i] != NUMA_NO_NODE && - fake_apicid_to_node[i] == NUMA_NO_NODE) - fake_apicid_to_node[i] = 0; - - for (i = 0; i < num_nodes; i++) - __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); - - nodes_clear(nodes_parsed); - for (i = 0; i < num_nodes; i++) - if (fake_nodes[i].start != fake_nodes[i].end) - node_set(i, nodes_parsed); -} - -static int null_slit_node_compare(int a, int b) -{ - return node_to_pxm(a) == node_to_pxm(b); -} -#else -static int null_slit_node_compare(int a, int b) -{ - return a == b; -} -#endif /* CONFIG_NUMA_EMU */ - -int __node_distance(int a, int b) -{ - int index; - - if (!acpi_slit) - return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : - REMOTE_DISTANCE; - index = acpi_slit->locality_count * node_to_pxm(a); - return acpi_slit->entry[index + node_to_pxm(b)]; -} - -EXPORT_SYMBOL(__node_distance); - #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) int memory_add_physaddr_to_nid(u64 start) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6acc724d5d8..d6c0418c3e4 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, sender = this_cpu_read(tlb_vector_offset); f = &flush_state[sender]; - /* - * Could avoid this lock when - * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is - * probably not worth checking this for a cache-hot lock. - */ - raw_spin_lock(&f->tlbstate_lock); + if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) + raw_spin_lock(&f->tlbstate_lock); f->flush_mm = mm; f->flush_va = va; @@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, f->flush_mm = NULL; f->flush_va = 0; - raw_spin_unlock(&f->tlbstate_lock); + if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) + raw_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, @@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask, if (is_uv_system()) { unsigned int cpu; - cpu = get_cpu(); + cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); if (cpumask) flush_tlb_others_ipi(cpumask, mm, va); - put_cpu(); return; } flush_tlb_others_ipi(cpumask, mm, va); diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index e27dffbbb1a..026e4931d16 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -350,7 +350,7 @@ static int __init early_fill_mp_bus_info(void) #define ENABLE_CF8_EXT_CFG (1ULL << 46) -static void enable_pci_io_ecs(void *unused) +static void __cpuinit enable_pci_io_ecs(void *unused) { u64 reg; rdmsrl(MSR_AMD64_NB_CFG, reg); diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 85b68ef5e80..67858be4b52 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -34,6 +34,7 @@ #include <linux/pci.h> #include <linux/init.h> +#include <asm/ce4100.h> #include <asm/pci_x86.h> struct sim_reg { @@ -254,7 +255,7 @@ int bridge_read(unsigned int devfn, int reg, int len, u32 *value) static int ce4100_conf_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { - int i, retval = 1; + int i; if (bus == 1) { for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { @@ -306,10 +307,10 @@ struct pci_raw_ops ce4100_pci_conf = { .write = ce4100_conf_write, }; -static int __init ce4100_pci_init(void) +int __init ce4100_pci_init(void) { init_sim_regs(); raw_pci_ops = &ce4100_pci_conf; - return 0; + /* Indicate caller that it should invoke pci_legacy_init() */ + return 1; } -subsys_initcall(ce4100_pci_init); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 25cd4a07d09..8c4085a95ef 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -20,7 +20,8 @@ #include <asm/xen/pci.h> #ifdef CONFIG_ACPI -static int xen_hvm_register_pirq(u32 gsi, int triggering) +static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, + int trigger, int polarity) { int rc, irq; struct physdev_map_pirq map_irq; @@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering) return -1; } - if (triggering == ACPI_EDGE_SENSITIVE) { + if (trigger == ACPI_EDGE_SENSITIVE) { shareable = 0; name = "ioapic-edge"; } else { @@ -55,12 +56,6 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering) return irq; } - -static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, - int trigger, int polarity) -{ - return xen_hvm_register_pirq(gsi, trigger); -} #endif #if defined(CONFIG_PCI_MSI) @@ -91,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, pirq, ret = 0; + int irq, pirq; struct msi_desc *msidesc; struct msi_msg msg; @@ -99,39 +94,32 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) __read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); - if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { - xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); - if (irq < 0) + if (msg.data != XEN_PIRQ_MSI_DATA || + xen_irq_from_pirq(pirq) < 0) { + pirq = xen_allocate_pirq_msi(dev, msidesc); + if (pirq < 0) goto error; - ret = set_irq_msi(irq, msidesc); - if (ret < 0) - goto error_while; - printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" - " pirq=%d\n", irq, pirq); - return 0; + xen_msi_compose_msg(dev, pirq, &msg); + __write_msi_msg(msidesc, &msg); + dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); + } else { + dev_dbg(&dev->dev, + "xen: msi already bound to pirq=%d\n", pirq); } - xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ)); - if (irq < 0 || pirq < 0) + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0, + (type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi"); + if (irq < 0) goto error; - printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); - xen_msi_compose_msg(dev, pirq, &msg); - ret = set_irq_msi(irq, msidesc); - if (ret < 0) - goto error_while; - write_msi_msg(irq, &msg); + dev_dbg(&dev->dev, + "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq); } return 0; -error_while: - unbind_from_irqhandler(irq, NULL); error: - if (ret == -ENODEV) - dev_err(&dev->dev, "Xen PCI frontend has not registered" \ - " MSI/MSI-X support!\n"); - - return ret; + dev_err(&dev->dev, + "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + return -ENODEV; } /* @@ -150,35 +138,26 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return -ENOMEM; if (type == PCI_CAP_ID_MSIX) - ret = xen_pci_frontend_enable_msix(dev, &v, nvec); + ret = xen_pci_frontend_enable_msix(dev, v, nvec); else - ret = xen_pci_frontend_enable_msi(dev, &v); + ret = xen_pci_frontend_enable_msi(dev, v); if (ret) goto error; i = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_allocate_pirq(v[i], 0, /* not sharable */ - (type == PCI_CAP_ID_MSIX) ? - "pcifront-msi-x" : "pcifront-msi"); - if (irq < 0) { - ret = -1; + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, + (type == PCI_CAP_ID_MSIX) ? + "pcifront-msi-x" : + "pcifront-msi"); + if (irq < 0) goto free; - } - - ret = set_irq_msi(irq, msidesc); - if (ret) - goto error_while; i++; } kfree(v); return 0; -error_while: - unbind_from_irqhandler(irq, NULL); error: - if (ret == -ENODEV) - dev_err(&dev->dev, "Xen PCI frontend has not registered" \ - " MSI/MSI-X support!\n"); + dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); free: kfree(v); return ret; @@ -193,6 +172,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) xen_pci_frontend_disable_msix(dev); else xen_pci_frontend_disable_msi(dev); + + /* Free the IRQ's and the msidesc using the generic code. */ + default_teardown_msi_irqs(dev); } static void xen_teardown_msi_irq(unsigned int irq) @@ -200,47 +182,82 @@ static void xen_teardown_msi_irq(unsigned int irq) xen_destroy_irq(irq); } +#ifdef CONFIG_XEN_DOM0 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, ret; + int ret = 0; struct msi_desc *msidesc; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_create_msi_irq(dev, msidesc, type); - if (irq < 0) - return -1; + struct physdev_map_pirq map_irq; - ret = set_irq_msi(irq, msidesc); - if (ret) - goto error; - } - return 0; + memset(&map_irq, 0, sizeof(map_irq)); + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_MSI; + map_irq.index = -1; + map_irq.pirq = -1; + map_irq.bus = dev->bus->number; + map_irq.devfn = dev->devfn; -error: - xen_destroy_irq(irq); + if (type == PCI_CAP_ID_MSIX) { + int pos; + u32 table_offset, bir; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + + pci_read_config_dword(dev, pos + PCI_MSIX_TABLE, + &table_offset); + bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); + + map_irq.table_base = pci_resource_start(dev, bir); + map_irq.entry_nr = msidesc->msi_attrib.entry_nr; + } + + ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (ret) { + dev_warn(&dev->dev, "xen map irq failed %d\n", ret); + goto out; + } + + ret = xen_bind_pirq_msi_to_irq(dev, msidesc, + map_irq.pirq, map_irq.index, + (type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi"); + if (ret < 0) + goto out; + } + ret = 0; +out: return ret; } #endif +#endif static int xen_pcifront_enable_irq(struct pci_dev *dev) { int rc; int share = 1; + u8 gsi; - dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); - - if (dev->irq < 0) - return -EINVAL; + rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n", + rc); + return rc; + } - if (dev->irq < NR_IRQS_LEGACY) + if (gsi < NR_IRQS_LEGACY) share = 0; - rc = xen_allocate_pirq(dev->irq, share, "pcifront"); + rc = xen_allocate_pirq(gsi, share, "pcifront"); if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", - dev->irq, rc); + dev_warn(&dev->dev, "Xen PCI: failed to register GSI%d: %d\n", + gsi, rc); return rc; } + + dev->irq = rc; + dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq); return 0; } diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index d2c0d51a717..28071bb31db 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -15,21 +15,20 @@ #include <linux/serial_reg.h> #include <linux/serial_8250.h> +#include <asm/ce4100.h> +#include <asm/prom.h> #include <asm/setup.h> +#include <asm/i8259.h> #include <asm/io.h> +#include <asm/io_apic.h> static int ce4100_i8042_detect(void) { return 0; } -static void __init sdv_find_smp_config(void) -{ -} - #ifdef CONFIG_SERIAL_8250 - static unsigned int mem_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; @@ -118,6 +117,15 @@ static void __init sdv_arch_setup(void) sdv_serial_fixup(); } +#ifdef CONFIG_X86_IO_APIC +static void __cpuinit sdv_pci_init(void) +{ + x86_of_pci_init(); + /* We can't set this earlier, because we need to calibrate the timer */ + legacy_pic = &null_legacy_pic; +} +#endif + /* * CE4100 specific x86_init function overrides and early setup * calls. @@ -128,5 +136,11 @@ void __init x86_ce4100_early_setup(void) x86_platform.i8042_detect = ce4100_i8042_detect; x86_init.resources.probe_roms = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; - x86_init.mpparse.find_smp_config = sdv_find_smp_config; + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.pci.init = ce4100_pci_init; + +#ifdef CONFIG_X86_IO_APIC + x86_init.pci.init_irq = sdv_pci_init; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; +#endif } diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts new file mode 100644 index 00000000000..dc701ea5854 --- /dev/null +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -0,0 +1,428 @@ +/* + * CE4100 on Falcon Falls + * + * (c) Copyright 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License. + */ +/dts-v1/; +/ { + model = "intel,falconfalls"; + compatible = "intel,falconfalls"; + #address-cells = <1>; + #size-cells = <1>; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "intel,ce4100"; + reg = <0>; + lapic = <&lapic0>; + }; + }; + + soc@0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "intel,ce4100-cp"; + ranges; + + ioapic1: interrupt-controller@fec00000 { + #interrupt-cells = <2>; + compatible = "intel,ce4100-ioapic"; + interrupt-controller; + reg = <0xfec00000 0x1000>; + }; + + timer@fed00000 { + compatible = "intel,ce4100-hpet"; + reg = <0xfed00000 0x200>; + }; + + lapic0: interrupt-controller@fee00000 { + compatible = "intel,ce4100-lapic"; + reg = <0xfee00000 0x1000>; + }; + + pci@3fc { + #address-cells = <3>; + #size-cells = <2>; + compatible = "intel,ce4100-pci", "pci"; + device_type = "pci"; + bus-range = <0 0>; + ranges = <0x2000000 0 0xbffff000 0xbffff000 0 0x1000 + 0x2000000 0 0xdffe0000 0xdffe0000 0 0x1000 + 0x0000000 0 0x0 0x0 0 0x100>; + + /* Secondary IO-APIC */ + ioapic2: interrupt-controller@0,1 { + #interrupt-cells = <2>; + compatible = "intel,ce4100-ioapic"; + interrupt-controller; + reg = <0x100 0x0 0x0 0x0 0x0>; + assigned-addresses = <0x02000000 0x0 0xbffff000 0x0 0x1000>; + }; + + pci@1,0 { + #address-cells = <3>; + #size-cells = <2>; + compatible = "intel,ce4100-pci", "pci"; + device_type = "pci"; + bus-range = <1 1>; + ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>; + + interrupt-parent = <&ioapic2>; + + display@2,0 { + compatible = "pci8086,2e5b.2", + "pci8086,2e5b", + "pciclass038000", + "pciclass0380"; + + reg = <0x11000 0x0 0x0 0x0 0x0>; + interrupts = <0 1>; + }; + + multimedia@3,0 { + compatible = "pci8086,2e5c.2", + "pci8086,2e5c", + "pciclass048000", + "pciclass0480"; + + reg = <0x11800 0x0 0x0 0x0 0x0>; + interrupts = <2 1>; + }; + + multimedia@4,0 { + compatible = "pci8086,2e5d.2", + "pci8086,2e5d", + "pciclass048000", + "pciclass0480"; + + reg = <0x12000 0x0 0x0 0x0 0x0>; + interrupts = <4 1>; + }; + + multimedia@4,1 { + compatible = "pci8086,2e5e.2", + "pci8086,2e5e", + "pciclass048000", + "pciclass0480"; + + reg = <0x12100 0x0 0x0 0x0 0x0>; + interrupts = <5 1>; + }; + + sound@6,0 { + compatible = "pci8086,2e5f.2", + "pci8086,2e5f", + "pciclass040100", + "pciclass0401"; + + reg = <0x13000 0x0 0x0 0x0 0x0>; + interrupts = <6 1>; + }; + + sound@6,1 { + compatible = "pci8086,2e5f.2", + "pci8086,2e5f", + "pciclass040100", + "pciclass0401"; + + reg = <0x13100 0x0 0x0 0x0 0x0>; + interrupts = <7 1>; + }; + + sound@6,2 { + compatible = "pci8086,2e60.2", + "pci8086,2e60", + "pciclass040100", + "pciclass0401"; + + reg = <0x13200 0x0 0x0 0x0 0x0>; + interrupts = <8 1>; + }; + + display@8,0 { + compatible = "pci8086,2e61.2", + "pci8086,2e61", + "pciclass038000", + "pciclass0380"; + + reg = <0x14000 0x0 0x0 0x0 0x0>; + interrupts = <9 1>; + }; + + display@8,1 { + compatible = "pci8086,2e62.2", + "pci8086,2e62", + "pciclass038000", + "pciclass0380"; + + reg = <0x14100 0x0 0x0 0x0 0x0>; + interrupts = <10 1>; + }; + + multimedia@8,2 { + compatible = "pci8086,2e63.2", + "pci8086,2e63", + "pciclass048000", + "pciclass0480"; + + reg = <0x14200 0x0 0x0 0x0 0x0>; + interrupts = <11 1>; + }; + + entertainment-encryption@9,0 { + compatible = "pci8086,2e64.2", + "pci8086,2e64", + "pciclass101000", + "pciclass1010"; + + reg = <0x14800 0x0 0x0 0x0 0x0>; + interrupts = <12 1>; + }; + + localbus@a,0 { + compatible = "pci8086,2e65.2", + "pci8086,2e65", + "pciclassff0000", + "pciclassff00"; + + reg = <0x15000 0x0 0x0 0x0 0x0>; + }; + + serial@b,0 { + compatible = "pci8086,2e66.2", + "pci8086,2e66", + "pciclass070003", + "pciclass0700"; + + reg = <0x15800 0x0 0x0 0x0 0x0>; + interrupts = <14 1>; + }; + + gpio@b,1 { + compatible = "pci8086,2e67.2", + "pci8086,2e67", + "pciclassff0000", + "pciclassff00"; + + #gpio-cells = <2>; + reg = <0x15900 0x0 0x0 0x0 0x0>; + interrupts = <15 1>; + gpio-controller; + }; + + i2c-controller@b,2 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "pci8086,2e68.2", + "pci8086,2e68", + "pciclass,ff0000", + "pciclass,ff00"; + + reg = <0x15a00 0x0 0x0 0x0 0x0>; + interrupts = <16 1>; + ranges = <0 0 0x02000000 0 0xdffe0500 0x100 + 1 0 0x02000000 0 0xdffe0600 0x100 + 2 0 0x02000000 0 0xdffe0700 0x100>; + + i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <0 0 0x100>; + }; + + i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <1 0 0x100>; + + gpio@26 { + #gpio-cells = <2>; + compatible = "ti,pcf8575"; + reg = <0x26>; + gpio-controller; + }; + }; + + i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "intel,ce4100-i2c-controller"; + reg = <2 0 0x100>; + + gpio@26 { + #gpio-cells = <2>; + compatible = "ti,pcf8575"; + reg = <0x26>; + gpio-controller; + }; + }; + }; + + smard-card@b,3 { + compatible = "pci8086,2e69.2", + "pci8086,2e69", + "pciclass070500", + "pciclass0705"; + + reg = <0x15b00 0x0 0x0 0x0 0x0>; + interrupts = <15 1>; + }; + + spi-controller@b,4 { + #address-cells = <1>; + #size-cells = <0>; + compatible = + "pci8086,2e6a.2", + "pci8086,2e6a", + "pciclass,ff0000", + "pciclass,ff00"; + + reg = <0x15c00 0x0 0x0 0x0 0x0>; + interrupts = <15 1>; + + dac@0 { + compatible = "ti,pcm1755"; + reg = <0>; + spi-max-frequency = <115200>; + }; + + dac@1 { + compatible = "ti,pcm1609a"; + reg = <1>; + spi-max-frequency = <115200>; + }; + + eeprom@2 { + compatible = "atmel,at93c46"; + reg = <2>; + spi-max-frequency = <115200>; + }; + }; + + multimedia@b,7 { + compatible = "pci8086,2e6d.2", + "pci8086,2e6d", + "pciclassff0000", + "pciclassff00"; + + reg = <0x15f00 0x0 0x0 0x0 0x0>; + }; + + ethernet@c,0 { + compatible = "pci8086,2e6e.2", + "pci8086,2e6e", + "pciclass020000", + "pciclass0200"; + + reg = <0x16000 0x0 0x0 0x0 0x0>; + interrupts = <21 1>; + }; + + clock@c,1 { + compatible = "pci8086,2e6f.2", + "pci8086,2e6f", + "pciclassff0000", + "pciclassff00"; + + reg = <0x16100 0x0 0x0 0x0 0x0>; + interrupts = <3 1>; + }; + + usb@d,0 { + compatible = "pci8086,2e70.2", + "pci8086,2e70", + "pciclass0c0320", + "pciclass0c03"; + + reg = <0x16800 0x0 0x0 0x0 0x0>; + interrupts = <22 3>; + }; + + usb@d,1 { + compatible = "pci8086,2e70.2", + "pci8086,2e70", + "pciclass0c0320", + "pciclass0c03"; + + reg = <0x16900 0x0 0x0 0x0 0x0>; + interrupts = <22 3>; + }; + + sata@e,0 { + compatible = "pci8086,2e71.0", + "pci8086,2e71", + "pciclass010601", + "pciclass0106"; + + reg = <0x17000 0x0 0x0 0x0 0x0>; + interrupts = <23 3>; + }; + + flash@f,0 { + compatible = "pci8086,701.1", + "pci8086,701", + "pciclass050100", + "pciclass0501"; + + reg = <0x17800 0x0 0x0 0x0 0x0>; + interrupts = <13 1>; + }; + + entertainment-encryption@10,0 { + compatible = "pci8086,702.1", + "pci8086,702", + "pciclass101000", + "pciclass1010"; + + reg = <0x18000 0x0 0x0 0x0 0x0>; + }; + + co-processor@11,0 { + compatible = "pci8086,703.1", + "pci8086,703", + "pciclass0b4000", + "pciclass0b40"; + + reg = <0x18800 0x0 0x0 0x0 0x0>; + interrupts = <1 1>; + }; + + multimedia@12,0 { + compatible = "pci8086,704.0", + "pci8086,704", + "pciclass048000", + "pciclass0480"; + + reg = <0x19000 0x0 0x0 0x0 0x0>; + }; + }; + + isa@1f,0 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "isa"; + ranges = <1 0 0 0 0 0x100>; + + rtc@70 { + compatible = "intel,ce4100-rtc", "motorola,mc146818"; + interrupts = <8 3>; + interrupt-parent = <&ioapic1>; + ctrl-reg = <2>; + freq-reg = <0x26>; + reg = <1 0x70 2>; + }; + }; + }; + }; +}; diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index ea6529e93c6..5c0207bf959 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -31,6 +31,7 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/mrst.h> +#include <asm/mrst-vrtc.h> #include <asm/io.h> #include <asm/i8259.h> #include <asm/intel_scu_ipc.h> @@ -268,6 +269,7 @@ void __init x86_mrst_early_setup(void) x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_platform.i8042_detect = mrst_i8042_detect; + x86_init.timers.wallclock_init = mrst_rtc_init; x86_init.pci.init = pci_mrst_init; x86_init.pci.fixup_irqs = x86_init_noop; diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 32cd7edd71a..04cf645feb9 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -100,22 +100,14 @@ int vrtc_set_mmss(unsigned long nowtime) void __init mrst_rtc_init(void) { - unsigned long rtc_paddr; - void __iomem *virt_base; + unsigned long vrtc_paddr = sfi_mrtc_array[0].phys_addr; sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); - if (!sfi_mrtc_num) + if (!sfi_mrtc_num || !vrtc_paddr) return; - rtc_paddr = sfi_mrtc_array[0].phys_addr; - - /* vRTC's register address may not be page aligned */ - set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr); - - virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC); - virt_base += rtc_paddr & ~PAGE_MASK; - vrtc_virt_base = virt_base; - + vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC, + vrtc_paddr); x86_platform.get_wallclock = vrtc_get_time; x86_platform.set_wallclock = vrtc_set_mmss; } diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index e797428b163..c2a8cab65e5 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_OLPC) += olpc.o obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o -obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o -obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o +obj-$(CONFIG_OLPC) += olpc_ofw.o +obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index df58e9cad96..a7b38d35c29 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1364,11 +1364,11 @@ uv_activation_descriptor_init(int node, int pnode) memset(bd2, 0, sizeof(struct bau_desc)); bd2->header.sw_ack_flag = 1; /* - * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub + * base_dest_nodeid is the nasid of the first uvhub * in the partition. The bit map will indicate uvhub numbers, * which are 0-N in a partition. Pnodes are unique system-wide. */ - bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; + bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode); bd2->header.dest_subnodeid = 0x10; /* the LB */ bd2->header.command = UV_NET_ENDPOINT_INTD; bd2->header.int_both = 1; diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 7b24460917d..374a05d8ad2 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -131,7 +131,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; int mmr_pnode, err; @@ -148,7 +148,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, else irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, irq_name); mmr_value = 0; diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index 63203767174..fe4cf829487 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -569,11 +569,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NO_THREAD, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; static inline void set_piix4_virtual_irq_type(void) @@ -606,7 +608,7 @@ static void __init visws_pre_intr_init(void) chip = &cobalt_irq_type; if (chip) - set_irq_chip(i, chip); + irq_set_chip(i, chip); } setup_irq(CO_IRQ_8259, &master_action); diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 5b54892e4bc..e4343fe488e 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -48,3 +48,11 @@ config XEN_DEBUG_FS help Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. + +config XEN_DEBUG + bool "Enable Xen debug checks" + depends on XEN + default n + help + Enable various WARN_ON checks in the Xen MMU code. + Enabling this option WILL incur a significant performance overhead. diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 50542efe45f..49dbd78ec3c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1284,15 +1284,14 @@ static int init_hvm_pv_info(int *major, int *minor) xen_setup_features(); - pv_info = xen_info; - pv_info.kernel_rpl = 0; + pv_info.name = "Xen HVM"; xen_domain_type = XEN_HVM_DOMAIN; return 0; } -void xen_hvm_init_shared_info(void) +void __ref xen_hvm_init_shared_info(void) { int cpu; struct xen_add_to_physmap xatp; @@ -1331,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + if (xen_have_vector_callback) + xen_init_lock_cpu(cpu); break; default: break; @@ -1355,6 +1356,7 @@ static void __init xen_hvm_guest_init(void) if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; + xen_hvm_smp_init(); register_cpu_notifier(&xen_hvm_cpu_notifier); xen_unplug_emulated_devices(); have_vcpu_info_placement = 0; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5e92b61ad57..3f6f3347aa1 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -46,6 +46,7 @@ #include <linux/module.h> #include <linux/gfp.h> #include <linux/memblock.h> +#include <linux/seq_file.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -416,8 +417,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; - unsigned long mfn = pfn_to_mfn(pfn); + unsigned long mfn; + if (!xen_feature(XENFEAT_auto_translated_physmap)) + mfn = get_phys_to_machine(pfn); + else + mfn = pfn; /* * If there's no mfn for the pfn, then just create an * empty non-present pte. Unfortunately this loses @@ -427,8 +432,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (unlikely(mfn == INVALID_P2M_ENTRY)) { mfn = 0; flags = 0; + } else { + /* + * Paramount to do this test _after_ the + * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY & + * IDENTITY_FRAME_BIT resolves to true. + */ + mfn &= ~FOREIGN_FRAME_BIT; + if (mfn & IDENTITY_FRAME_BIT) { + mfn &= ~IDENTITY_FRAME_BIT; + flags |= _PAGE_IOMAP; + } } - val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } @@ -532,6 +547,41 @@ pte_t xen_make_pte(pteval_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); +#ifdef CONFIG_XEN_DEBUG +pte_t xen_make_pte_debug(pteval_t pte) +{ + phys_addr_t addr = (pte & PTE_PFN_MASK); + phys_addr_t other_addr; + bool io_page = false; + pte_t _pte; + + if (pte & _PAGE_IOMAP) + io_page = true; + + _pte = xen_make_pte(pte); + + if (!addr) + return _pte; + + if (io_page && + (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { + other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT; + WARN(addr != other_addr, + "0x%lx is using VM_IO, but it is 0x%lx!\n", + (unsigned long)addr, (unsigned long)other_addr); + } else { + pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP; + other_addr = (_pte.pte & PTE_PFN_MASK); + WARN((addr == other_addr) && (!io_page) && (!iomap_set), + "0x%lx is missing VM_IO (and wasn't fixed)!\n", + (unsigned long)addr); + } + + return _pte; +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug); +#endif + pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); @@ -986,10 +1036,9 @@ static void xen_pgd_pin(struct mm_struct *mm) */ void xen_mm_pin_all(void) { - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { if (!PagePinned(page)) { @@ -998,7 +1047,7 @@ void xen_mm_pin_all(void) } } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } /* @@ -1099,10 +1148,9 @@ static void xen_pgd_unpin(struct mm_struct *mm) */ void xen_mm_unpin_all(void) { - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { @@ -1112,7 +1160,7 @@ void xen_mm_unpin_all(void) } } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -1443,7 +1491,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) * early_ioremap fixmap slot, make sure it is RO. */ if (!is_early_ioremap_ptep(ptep) && - pfn >= e820_table_start && pfn < e820_table_end) + pfn >= pgt_buf_start && pfn < pgt_buf_end) pte = pte_wrprotect(pte); return pte; @@ -1942,6 +1990,9 @@ __init void xen_ident_map_ISA(void) static __init void xen_post_allocator_init(void) { +#ifdef CONFIG_XEN_DEBUG + pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); +#endif pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; @@ -2074,7 +2125,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, in_frames[i] = virt_to_mfn(vaddr); MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); - set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); if (out_frames) out_frames[i] = virt_to_pfn(vaddr); @@ -2353,6 +2404,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); #ifdef CONFIG_XEN_DEBUG_FS +static int p2m_dump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, p2m_dump_show, NULL); +} + +static const struct file_operations p2m_dump_fops = { + .open = p2m_dump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static struct dentry *d_mmu_debug; static int __init xen_mmu_debugfs(void) @@ -2408,6 +2471,7 @@ static int __init xen_mmu_debugfs(void) debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, &mmu_stats.prot_commit_batched); + debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops); return 0; } fs_initcall(xen_mmu_debugfs); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index fd12d7ce7ff..215a3ce6106 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -23,6 +23,129 @@ * P2M_PER_PAGE depends on the architecture, as a mfn is always * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to * 512 and 1024 entries respectively. + * + * In short, these structures contain the Machine Frame Number (MFN) of the PFN. + * + * However not all entries are filled with MFNs. Specifically for all other + * leaf entries, or for the top root, or middle one, for which there is a void + * entry, we assume it is "missing". So (for example) + * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. + * + * We also have the possibility of setting 1-1 mappings on certain regions, so + * that: + * pfn_to_mfn(0xc0000)=0xc0000 + * + * The benefit of this is, that we can assume for non-RAM regions (think + * PCI BARs, or ACPI spaces), we can create mappings easily b/c we + * get the PFN value to match the MFN. + * + * For this to work efficiently we have one new page p2m_identity and + * allocate (via reserved_brk) any other pages we need to cover the sides + * (1GB or 4MB boundary violations). All entries in p2m_identity are set to + * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs, + * no other fancy value). + * + * On lookup we spot that the entry points to p2m_identity and return the + * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. + * If the entry points to an allocated page, we just proceed as before and + * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in + * appropriate functions (pfn_to_mfn). + * + * The reason for having the IDENTITY_FRAME_BIT instead of just returning the + * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a + * non-identity pfn. To protect ourselves against we elect to set (and get) the + * IDENTITY_FRAME_BIT on all identity mapped PFNs. + * + * This simplistic diagram is used to explain the more subtle piece of code. + * There is also a digram of the P2M at the end that can help. + * Imagine your E820 looking as so: + * + * 1GB 2GB + * /-------------------+---------\/----\ /----------\ /---+-----\ + * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | + * \-------------------+---------/\----/ \----------/ \---+-----/ + * ^- 1029MB ^- 2001MB + * + * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100), + * 2048MB = 524288 (0x80000)] + * + * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB + * is actually not present (would have to kick the balloon driver to put it in). + * + * When we are told to set the PFNs for identity mapping (see patch: "xen/setup: + * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start + * of the PFN and the end PFN (263424 and 512256 respectively). The first step + * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page + * covers 512^2 of page estate (1GB) and in case the start or end PFN is not + * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn + * to end pfn. We reserve_brk top leaf pages if they are missing (means they + * point to p2m_mid_missing). + * + * With the E820 example above, 263424 is not 1GB aligned so we allocate a + * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. + * Each entry in the allocate page is "missing" (points to p2m_missing). + * + * Next stage is to determine if we need to do a more granular boundary check + * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. + * We check if the start pfn and end pfn violate that boundary check, and if + * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer + * granularity of setting which PFNs are missing and which ones are identity. + * In our example 263424 and 512256 both fail the check so we reserve_brk two + * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" + * values) and assign them to p2m[1][2] and p2m[1][488] respectively. + * + * At this point we would at minimum reserve_brk one page, but could be up to + * three. Each call to set_phys_range_identity has at maximum a three page + * cost. If we were to query the P2M at this stage, all those entries from + * start PFN through end PFN (so 1029MB -> 2001MB) would return + * INVALID_P2M_ENTRY ("missing"). + * + * The next step is to walk from the start pfn to the end pfn setting + * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. + * If we find that the middle leaf is pointing to p2m_missing we can swap it + * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this + * point we do not need to worry about boundary aligment (so no need to + * reserve_brk a middle page, figure out which PFNs are "missing" and which + * ones are identity), as that has been done earlier. If we find that the + * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference + * that page (which covers 512 PFNs) and set the appropriate PFN with + * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we + * set from p2m[1][2][256->511] and p2m[1][488][0->256] with + * IDENTITY_FRAME_BIT set. + * + * All other regions that are void (or not filled) either point to p2m_missing + * (considered missing) or have the default value of INVALID_P2M_ENTRY (also + * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] + * contain the INVALID_P2M_ENTRY value and are considered "missing." + * + * This is what the p2m ends up looking (for the E820 above) with this + * fabulous drawing: + * + * p2m /--------------\ + * /-----\ | &mfn_list[0],| /-----------------\ + * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. | + * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] | + * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] | + * |-----| \ | [p2m_identity]+\\ | .... | + * | 2 |--\ \-------------------->| ... | \\ \----------------/ + * |-----| \ \---------------/ \\ + * | 3 |\ \ \\ p2m_identity + * |-----| \ \-------------------->/---------------\ /-----------------\ + * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... | + * \-----/ / | [p2m_identity]+-->| ..., ~0 | + * / /---------------\ | .... | \-----------------/ + * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. | + * / | IDENTITY[@256]|<----/ \---------------/ + * / | ~0, ~0, .... | + * | \---------------/ + * | + * p2m_missing p2m_missing + * /------------------\ /------------\ + * | [p2m_mid_missing]+---->| ~0, ~0, ~0 | + * | [p2m_mid_missing]+---->| ..., ~0 | + * \------------------/ \------------/ + * + * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ #include <linux/init.h> @@ -30,6 +153,7 @@ #include <linux/list.h> #include <linux/hash.h> #include <linux/sched.h> +#include <linux/seq_file.h> #include <asm/cache.h> #include <asm/setup.h> @@ -59,9 +183,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); + RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +/* We might hit two boundary violations at the start and end, at max each + * boundary violation will require three middle nodes. */ +RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -136,7 +266,7 @@ static void p2m_init(unsigned long *p2m) * - After resume we're called from within stop_machine, but the mfn * tree should alreay be completely allocated. */ -void xen_build_mfn_list_list(void) +void __ref xen_build_mfn_list_list(void) { unsigned long pfn; @@ -221,6 +351,9 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_init(p2m_top); + p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_identity); + /* * The domain builder gives us a pre-constructed p2m array in * mfn_list for all the pages initially given to us, so we just @@ -266,6 +399,14 @@ unsigned long get_phys_to_machine(unsigned long pfn) mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); + /* + * The INVALID_P2M_ENTRY is filled in both p2m_*identity + * and in p2m_*missing, so returning the INVALID_P2M_ENTRY + * would be wrong. + */ + if (p2m_top[topidx][mididx] == p2m_identity) + return IDENTITY_FRAME(pfn); + return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); @@ -335,9 +476,11 @@ static bool alloc_p2m(unsigned long pfn) p2m_top_mfn_p[topidx] = mid_mfn; } - if (p2m_top[topidx][mididx] == p2m_missing) { + if (p2m_top[topidx][mididx] == p2m_identity || + p2m_top[topidx][mididx] == p2m_missing) { /* p2m leaf page is missing */ unsigned long *p2m; + unsigned long *p2m_orig = p2m_top[topidx][mididx]; p2m = alloc_p2m_page(); if (!p2m) @@ -345,7 +488,7 @@ static bool alloc_p2m(unsigned long pfn) p2m_init(p2m); - if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig) free_p2m_page(p2m); else mid_mfn[mididx] = virt_to_mfn(p2m); @@ -354,11 +497,91 @@ static bool alloc_p2m(unsigned long pfn) return true; } +bool __early_alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx, idx; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* Pfff.. No boundary cross-over, lets get out. */ + if (!idx) + return false; + + WARN(p2m_top[topidx][mididx] == p2m_identity, + "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n", + topidx, mididx); + + /* + * Could be done by xen_build_dynamic_phys_to_machine.. + */ + if (p2m_top[topidx][mididx] != p2m_missing) + return false; + + /* Boundary cross-over for the edges: */ + if (idx) { + unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_init(p2m); + + p2m_top[topidx][mididx] = p2m; + + } + return idx != 0; +} +unsigned long set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e) +{ + unsigned long pfn; + + if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN)) + return 0; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return pfn_e - pfn_s; + + if (pfn_s > pfn_e) + return 0; + + for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1)); + pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); + pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) + { + unsigned topidx = p2m_top_index(pfn); + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + } + } + + __early_alloc_p2m(pfn_s); + __early_alloc_p2m(pfn_e); + + for (pfn = pfn_s; pfn < pfn_e; pfn++) + if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) + break; + + if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), + "Identity mapping failed. We are %ld short of 1-1 mappings!\n", + (pfn_e - pfn_s) - (pfn - pfn_s))) + printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn); + + return pfn - pfn_s; +} + /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { unsigned topidx, mididx, idx; + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return true; + } if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; @@ -368,6 +591,21 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); + /* For sparse holes were the p2m leaf has real PFN along with + * PCI holes, stick in the PFN as the MFN value. + */ + if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { + if (p2m_top[topidx][mididx] == p2m_identity) + return true; + + /* Swap over from MISSING to IDENTITY if needed. */ + if (p2m_top[topidx][mididx] == p2m_missing) { + WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing, + p2m_identity) != p2m_missing); + return true; + } + } + if (p2m_top[topidx][mididx] == p2m_missing) return mfn == INVALID_P2M_ENTRY; @@ -378,11 +616,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return true; - } - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!alloc_p2m(pfn)) return false; @@ -421,7 +654,7 @@ int m2p_add_override(unsigned long mfn, struct page *page) { unsigned long flags; unsigned long pfn; - unsigned long address; + unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; @@ -455,7 +688,7 @@ int m2p_remove_override(struct page *page) unsigned long flags; unsigned long mfn; unsigned long pfn; - unsigned long address; + unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; @@ -520,3 +753,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) return ret; } EXPORT_SYMBOL_GPL(m2p_find_override_pfn); + +#ifdef CONFIG_XEN_DEBUG_FS + +int p2m_dump_show(struct seq_file *m, void *v) +{ + static const char * const level_name[] = { "top", "middle", + "entry", "abnormal" }; + static const char * const type_name[] = { "identity", "missing", + "pfn", "abnormal"}; +#define TYPE_IDENTITY 0 +#define TYPE_MISSING 1 +#define TYPE_PFN 2 +#define TYPE_UNKNOWN 3 + unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; + unsigned int uninitialized_var(prev_level); + unsigned int uninitialized_var(prev_type); + + if (!p2m_top) + return 0; + + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned idx = p2m_index(pfn); + unsigned lvl, type; + + lvl = 4; + type = TYPE_UNKNOWN; + if (p2m_top[topidx] == p2m_mid_missing) { + lvl = 0; type = TYPE_MISSING; + } else if (p2m_top[topidx] == NULL) { + lvl = 0; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx] == NULL) { + lvl = 1; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx] == p2m_identity) { + lvl = 1; type = TYPE_IDENTITY; + } else if (p2m_top[topidx][mididx] == p2m_missing) { + lvl = 1; type = TYPE_MISSING; + } else if (p2m_top[topidx][mididx][idx] == 0) { + lvl = 2; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) { + lvl = 2; type = TYPE_IDENTITY; + } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) { + lvl = 2; type = TYPE_MISSING; + } else if (p2m_top[topidx][mididx][idx] == pfn) { + lvl = 2; type = TYPE_PFN; + } else if (p2m_top[topidx][mididx][idx] != pfn) { + lvl = 2; type = TYPE_PFN; + } + if (pfn == 0) { + prev_level = lvl; + prev_type = type; + } + if (pfn == MAX_DOMAIN_PAGES-1) { + lvl = 3; + type = TYPE_UNKNOWN; + } + if (prev_type != type) { + seq_printf(m, " [0x%lx->0x%lx] %s\n", + prev_pfn_type, pfn, type_name[prev_type]); + prev_pfn_type = pfn; + prev_type = type; + } + if (prev_level != lvl) { + seq_printf(m, " [0x%lx->0x%lx] level %s\n", + prev_pfn_level, pfn, level_name[prev_level]); + prev_pfn_level = pfn; + prev_level = lvl; + } + } + return 0; +#undef TYPE_IDENTITY +#undef TYPE_MISSING +#undef TYPE_PFN +#undef TYPE_UNKNOWN +} +#endif diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a8a66a50d44..fa0269a9937 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; static __init void xen_add_extra_mem(unsigned long pages) { + unsigned long pfn; + u64 size = (u64)pages * PAGE_SIZE; u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; @@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages) xen_extra_mem_size += size; xen_max_p2m_pfn = PFN_DOWN(extra_start + size); + + for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++) + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } static unsigned long __init xen_release_chunk(phys_addr_t start_addr, @@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", start, end, ret); if (ret == 1) { - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); len++; } } @@ -138,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, return released; } +static unsigned long __init xen_set_identity(const struct e820entry *list, + ssize_t map_size) +{ + phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS; + phys_addr_t start_pci = last; + const struct e820entry *entry; + unsigned long identity = 0; + int i; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + phys_addr_t start = entry->addr; + phys_addr_t end = start + entry->size; + + if (start < last) + start = last; + + if (end <= start) + continue; + + /* Skip over the 1MB region. */ + if (last > end) + continue; + + if (entry->type == E820_RAM) { + if (start > start_pci) + identity += set_phys_range_identity( + PFN_UP(start_pci), PFN_DOWN(start)); + + /* Without saving 'last' we would gooble RAM too + * at the end of the loop. */ + last = end; + start_pci = end; + continue; + } + start_pci = min(start, start_pci); + last = end; + } + if (last > start_pci) + identity += set_phys_range_identity( + PFN_UP(start_pci), PFN_DOWN(last)); + return identity; +} /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ char * __init xen_memory_setup(void) { static struct e820entry map[E820MAX] __initdata; + static struct e820entry map_raw[E820MAX] __initdata; unsigned long max_pfn = xen_start_info->nr_pages; unsigned long long mem_end; @@ -151,6 +199,7 @@ char * __init xen_memory_setup(void) struct xen_memory_map memmap; unsigned long extra_pages = 0; unsigned long extra_limit; + unsigned long identity_pages = 0; int i; int op; @@ -176,6 +225,7 @@ char * __init xen_memory_setup(void) } BUG_ON(rc); + memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { @@ -194,6 +244,15 @@ char * __init xen_memory_setup(void) end -= delta; extra_pages += PFN_DOWN(delta); + /* + * Set RAM below 4GB that is not for us to be unusable. + * This prevents "System RAM" address space from being + * used as potential resource for I/O address (happens + * when 'allocate_resource' is called). + */ + if (delta && + (xen_initial_domain() && end < 0x100000000ULL)) + e820_add_region(end, delta, E820_UNUSABLE); } if (map[i].size > 0 && end > xen_extra_mem_start) @@ -251,6 +310,13 @@ char * __init xen_memory_setup(void) xen_add_extra_mem(extra_pages); + /* + * Set P2M for all non-RAM pages and E820 gaps to be identity + * type PFNs. We supply it with the non-sanitized version + * of the E820. + */ + identity_pages = xen_set_identity(map_raw, memmap.nr_entries); + printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages); return "Xen"; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 72a4c795904..30612441ed9 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -509,3 +509,41 @@ void __init xen_smp_init(void) xen_fill_possible_map(); xen_init_spinlocks(); } + +static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) +{ + native_smp_prepare_cpus(max_cpus); + WARN_ON(xen_smp_intr_init(0)); + + if (!xen_have_vector_callback) + return; + xen_init_lock_cpu(0); + xen_init_spinlocks(); +} + +static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) +{ + int rc; + rc = native_cpu_up(cpu); + WARN_ON (xen_smp_intr_init(cpu)); + return rc; +} + +static void xen_hvm_cpu_die(unsigned int cpu) +{ + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + native_cpu_die(cpu); +} + +void __init xen_hvm_smp_init(void) +{ + smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; + smp_ops.smp_send_reschedule = xen_smp_send_reschedule; + smp_ops.cpu_up = xen_hvm_cpu_up; + smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; + smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; +} diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 9bbd63a129b..45329c8c226 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -12,7 +12,7 @@ #include "xen-ops.h" #include "mmu.h" -void xen_pre_suspend(void) +void xen_arch_pre_suspend(void) { xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); xen_start_info->console.domU.mfn = @@ -26,8 +26,9 @@ void xen_pre_suspend(void) BUG(); } -void xen_hvm_post_suspend(int suspend_cancelled) +void xen_arch_hvm_post_suspend(int suspend_cancelled) { +#ifdef CONFIG_XEN_PVHVM int cpu; xen_hvm_init_shared_info(); xen_callback_vector(); @@ -37,9 +38,10 @@ void xen_hvm_post_suspend(int suspend_cancelled) xen_setup_runstate_info(cpu); } } +#endif } -void xen_post_suspend(int suspend_cancelled) +void xen_arch_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 067759e3d6a..2e2d370a47b 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -397,7 +397,9 @@ void xen_setup_timer(int cpu) name = "<timer kasprintf failed>"; irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, + IRQF_DISABLED|IRQF_PERCPU| + IRQF_NOBALANCING|IRQF_TIMER| + IRQF_FORCE_RESUME, name, NULL); evt = &per_cpu(xen_clock_events, cpu); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 1a5ff24e29c..aaa7291c925 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -28,9 +28,9 @@ ENTRY(startup_xen) __FINIT .pushsection .text - .align PAGE_SIZE_asm + .align PAGE_SIZE ENTRY(hypercall_page) - .skip PAGE_SIZE_asm + .skip PAGE_SIZE .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9d41bf98575..3112f55638c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP void xen_smp_init(void); +void __init xen_hvm_smp_init(void); extern cpumask_var_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} +static inline void xen_hvm_smp_init(void) {} #endif #ifdef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h index e39edf5c86f..249619e7e7f 100644 --- a/arch/xtensa/include/asm/rwsem.h +++ b/arch/xtensa/include/asm/rwsem.h @@ -17,44 +17,12 @@ #error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead." #endif -#include <linux/list.h> -#include <linux/spinlock.h> -#include <asm/atomic.h> -#include <asm/system.h> - -/* - * the semaphore definition - */ -struct rw_semaphore { - signed long count; #define RWSEM_UNLOCKED_VALUE 0x00000000 #define RWSEM_ACTIVE_BIAS 0x00000001 #define RWSEM_ACTIVE_MASK 0x0000ffff #define RWSEM_WAITING_BIAS (-0x00010000) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - spinlock_t wait_lock; - struct list_head wait_list; -}; - -#define __RWSEM_INITIALIZER(name) \ - { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \ - LIST_HEAD_INIT((name).wait_list) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -static inline void init_rwsem(struct rw_semaphore *sem) -{ - sem->count = RWSEM_UNLOCKED_VALUE; - spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} /* * lock for reading @@ -160,9 +128,4 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return atomic_add_return(delta, (atomic_t *)(&sem->count)); } -static inline int rwsem_is_locked(struct rw_semaphore *sem) -{ - return (sem->count != 0); -} - #endif /* _XTENSA_RWSEM_H */ diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 19df764f639..f3e5eb43f71 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -96,16 +96,12 @@ again: update_process_times(user_mode(get_irq_regs())); #endif - write_seqlock(&xtime_lock); - - do_timer(1); /* Linux handler in kernel/timer.c */ + xtime_update(1); /* Linux handler in kernel/time/timekeeping */ /* Note that writing CCOMPARE clears the interrupt. */ next += CCOUNT_PER_JIFFY; set_linux_timer(next); - - write_sequnlock(&xtime_lock); } /* Allow platform to do something useful (Wdog). */ diff --git a/block/blk-lib.c b/block/blk-lib.c index eec78becb35..bd3e8df4d5e 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -109,7 +109,6 @@ struct bio_batch atomic_t done; unsigned long flags; struct completion *wait; - bio_end_io_t *end_io; }; static void bio_batch_end_io(struct bio *bio, int err) @@ -122,12 +121,9 @@ static void bio_batch_end_io(struct bio *bio, int err) else clear_bit(BIO_UPTODATE, &bb->flags); } - if (bb) { - if (bb->end_io) - bb->end_io(bio, err); - atomic_inc(&bb->done); - complete(bb->wait); - } + if (bb) + if (atomic_dec_and_test(&bb->done)) + complete(bb->wait); bio_put(bio); } @@ -150,13 +146,12 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, int ret; struct bio *bio; struct bio_batch bb; - unsigned int sz, issued = 0; + unsigned int sz; DECLARE_COMPLETION_ONSTACK(wait); - atomic_set(&bb.done, 0); + atomic_set(&bb.done, 1); bb.flags = 1 << BIO_UPTODATE; bb.wait = &wait; - bb.end_io = NULL; submit: ret = 0; @@ -185,12 +180,12 @@ submit: break; } ret = 0; - issued++; + atomic_inc(&bb.done); submit_bio(WRITE, bio); } /* Wait for bios in-flight */ - while (issued != atomic_read(&bb.done)) + if (!atomic_dec_and_test(&bb.done)) wait_for_completion(&wait); if (!test_bit(BIO_UPTODATE, &bb.flags)) diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 5eb25eb3ea4..3b5c3189fd9 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -274,7 +274,7 @@ acpi_table_parse_srat(enum acpi_srat_type id, int __init acpi_numa_init(void) { - int ret = 0; + int cnt = 0; /* * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= @@ -288,7 +288,7 @@ int __init acpi_numa_init(void) acpi_parse_x2apic_affinity, 0); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, acpi_parse_processor_affinity, 0); - ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, + cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); } @@ -297,7 +297,10 @@ int __init acpi_numa_init(void) acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); acpi_numa_arch_fixup(); - return ret; + + if (cnt <= 0) + return cnt ?: -ENOENT; + return 0; } int acpi_get_pxm(acpi_handle h) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d7aa39e349a..9cb8668ff5f 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -120,6 +120,10 @@ static DEFINE_SPINLOCK(minor_lock); #define EXTENDED (1<<EXT_SHIFT) #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED)) #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED)) +#define EMULATED_HD_DISK_MINOR_OFFSET (0) +#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256) +#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16)) +#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4) #define DEV_NAME "xvd" /* name in /dev */ @@ -281,7 +285,7 @@ static int blkif_queue_request(struct request *req) info->shadow[id].request = req; ring_req->id = id; - ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); ring_req->handle = info->handle; ring_req->operation = rq_data_dir(req) ? @@ -317,7 +321,7 @@ static int blkif_queue_request(struct request *req) rq_data_dir(req) ); info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); - ring_req->seg[i] = + ring_req->u.rw.seg[i] = (struct blkif_request_segment) { .gref = ref, .first_sect = fsect, @@ -434,6 +438,65 @@ static void xlvbd_flush(struct blkfront_info *info) info->feature_flush ? "enabled" : "disabled"); } +static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) +{ + int major; + major = BLKIF_MAJOR(vdevice); + *minor = BLKIF_MINOR(vdevice); + switch (major) { + case XEN_IDE0_MAJOR: + *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET; + *minor = ((*minor / 64) * PARTS_PER_DISK) + + EMULATED_HD_DISK_MINOR_OFFSET; + break; + case XEN_IDE1_MAJOR: + *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET; + *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) + + EMULATED_HD_DISK_MINOR_OFFSET; + break; + case XEN_SCSI_DISK0_MAJOR: + *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET; + *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET; + break; + case XEN_SCSI_DISK1_MAJOR: + case XEN_SCSI_DISK2_MAJOR: + case XEN_SCSI_DISK3_MAJOR: + case XEN_SCSI_DISK4_MAJOR: + case XEN_SCSI_DISK5_MAJOR: + case XEN_SCSI_DISK6_MAJOR: + case XEN_SCSI_DISK7_MAJOR: + *offset = (*minor / PARTS_PER_DISK) + + ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) + + EMULATED_SD_DISK_NAME_OFFSET; + *minor = *minor + + ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) + + EMULATED_SD_DISK_MINOR_OFFSET; + break; + case XEN_SCSI_DISK8_MAJOR: + case XEN_SCSI_DISK9_MAJOR: + case XEN_SCSI_DISK10_MAJOR: + case XEN_SCSI_DISK11_MAJOR: + case XEN_SCSI_DISK12_MAJOR: + case XEN_SCSI_DISK13_MAJOR: + case XEN_SCSI_DISK14_MAJOR: + case XEN_SCSI_DISK15_MAJOR: + *offset = (*minor / PARTS_PER_DISK) + + ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) + + EMULATED_SD_DISK_NAME_OFFSET; + *minor = *minor + + ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) + + EMULATED_SD_DISK_MINOR_OFFSET; + break; + case XENVBD_MAJOR: + *offset = *minor / PARTS_PER_DISK; + break; + default: + printk(KERN_WARNING "blkfront: your disk configuration is " + "incorrect, please use an xvd device instead\n"); + return -ENODEV; + } + return 0; +} static int xlvbd_alloc_gendisk(blkif_sector_t capacity, struct blkfront_info *info, @@ -441,7 +504,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, { struct gendisk *gd; int nr_minors = 1; - int err = -ENODEV; + int err; unsigned int offset; int minor; int nr_parts; @@ -456,12 +519,20 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, } if (!VDEV_IS_EXTENDED(info->vdevice)) { - minor = BLKIF_MINOR(info->vdevice); - nr_parts = PARTS_PER_DISK; + err = xen_translate_vdev(info->vdevice, &minor, &offset); + if (err) + return err; + nr_parts = PARTS_PER_DISK; } else { minor = BLKIF_MINOR_EXT(info->vdevice); nr_parts = PARTS_PER_EXT_DISK; + offset = minor / nr_parts; + if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4) + printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with " + "emulated IDE disks,\n\t choose an xvd device name" + "from xvde on\n", info->vdevice); } + err = -ENODEV; if ((minor % nr_parts) == 0) nr_minors = nr_parts; @@ -475,8 +546,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (gd == NULL) goto release; - offset = minor / nr_parts; - if (nr_minors > 1) { if (offset < 26) sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset); @@ -615,7 +684,7 @@ static void blkif_completion(struct blk_shadow *s) { int i; for (i = 0; i < s->req.nr_segments; i++) - gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL); + gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL); } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -932,7 +1001,7 @@ static int blkif_recover(struct blkfront_info *info) /* Rewrite any grant references invalidated by susp/resume. */ for (j = 0; j < req->nr_segments; j++) gnttab_grant_foreign_access_ref( - req->seg[j].gref, + req->u.rw.seg[j].gref, info->xbdev->otherend_id, pfn_to_mfn(info->shadow[req->id].frame[j]), rq_data_dir(info->shadow[req->id].request)); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 7855f9f45b8..62787e30d50 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -900,6 +900,14 @@ static void sender(void *send_info, printk("**Enqueue: %d.%9.9d\n", t.tv_sec, t.tv_usec); #endif + /* + * last_timeout_jiffies is updated here to avoid + * smi_timeout() handler passing very large time_diff + * value to smi_event_handler() that causes + * the send command to abort. + */ + smi_info->last_timeout_jiffies = jiffies; + mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES); if (smi_info->thread) diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index e6d75627c6c..33dc2298af7 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -53,6 +53,8 @@ MODULE_LICENSE("GPL"); #define RTC_BITS 55 /* 55 bits for this implementation */ +static struct k_clock sgi_clock; + extern unsigned long sn_rtc_cycles_per_second; #define RTC_COUNTER_ADDR ((long *)LOCAL_MMR_ADDR(SH_RTC)) @@ -487,7 +489,7 @@ static int sgi_clock_get(clockid_t clockid, struct timespec *tp) return 0; }; -static int sgi_clock_set(clockid_t clockid, struct timespec *tp) +static int sgi_clock_set(const clockid_t clockid, const struct timespec *tp) { u64 nsec; @@ -763,15 +765,21 @@ static int sgi_timer_set(struct k_itimer *timr, int flags, return err; } +static int sgi_clock_getres(const clockid_t which_clock, struct timespec *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = sgi_clock_period; + return 0; +} + static struct k_clock sgi_clock = { - .res = 0, - .clock_set = sgi_clock_set, - .clock_get = sgi_clock_get, - .timer_create = sgi_timer_create, - .nsleep = do_posix_clock_nonanosleep, - .timer_set = sgi_timer_set, - .timer_del = sgi_timer_del, - .timer_get = sgi_timer_get + .clock_set = sgi_clock_set, + .clock_get = sgi_clock_get, + .clock_getres = sgi_clock_getres, + .timer_create = sgi_timer_create, + .timer_set = sgi_timer_set, + .timer_del = sgi_timer_del, + .timer_get = sgi_timer_get }; /** @@ -831,8 +839,8 @@ static int __init mmtimer_init(void) (unsigned long) node); } - sgi_clock_period = sgi_clock.res = NSEC_PER_SEC / sn_rtc_cycles_per_second; - register_posix_clock(CLOCK_SGI_CYCLE, &sgi_clock); + sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second; + posix_timers_register_clock(CLOCK_SGI_CYCLE, &sgi_clock); printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION, sn_rtc_cycles_per_second/(unsigned long)1E6); diff --git a/drivers/gpio/ml_ioh_gpio.c b/drivers/gpio/ml_ioh_gpio.c index cead8e6ff34..7f6f01a4b14 100644 --- a/drivers/gpio/ml_ioh_gpio.c +++ b/drivers/gpio/ml_ioh_gpio.c @@ -326,6 +326,7 @@ static DEFINE_PCI_DEVICE_TABLE(ioh_gpio_pcidev_id) = { { PCI_DEVICE(PCI_VENDOR_ID_ROHM, 0x802E) }, { 0, } }; +MODULE_DEVICE_TABLE(pci, ioh_gpio_pcidev_id); static struct pci_driver ioh_gpio_driver = { .name = "ml_ioh_gpio", diff --git a/drivers/gpio/pch_gpio.c b/drivers/gpio/pch_gpio.c index 0eba0a75c80..2c6af870510 100644 --- a/drivers/gpio/pch_gpio.c +++ b/drivers/gpio/pch_gpio.c @@ -286,6 +286,7 @@ static DEFINE_PCI_DEVICE_TABLE(pch_gpio_pcidev_id) = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x8803) }, { 0, } }; +MODULE_DEVICE_TABLE(pci, pch_gpio_pcidev_id); static struct pci_driver pch_gpio_driver = { .name = "pch_gpio", diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 3e6f486f460..2abe240dae5 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -1553,7 +1553,17 @@ /* Backlight control */ #define BLC_PWM_CTL 0x61254 +#define BACKLIGHT_MODULATION_FREQ_SHIFT (17) #define BLC_PWM_CTL2 0x61250 /* 965+ only */ +#define BLM_COMBINATION_MODE (1 << 30) +/* + * This is the most significant 15 bits of the number of backlight cycles in a + * complete cycle of the modulated backlight control. + * + * The actual value is this field multiplied by two. + */ +#define BACKLIGHT_MODULATION_FREQ_MASK (0x7fff << 17) +#define BLM_LEGACY_MODE (1 << 16) /* * This is the number of cycles out of the backlight modulation cycle for which * the backlight is on. diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c index d860abeda70..f8f86e57df2 100644 --- a/drivers/gpu/drm/i915/intel_panel.c +++ b/drivers/gpu/drm/i915/intel_panel.c @@ -30,6 +30,8 @@ #include "intel_drv.h" +#define PCI_LBPC 0xf4 /* legacy/combination backlight modes */ + void intel_fixed_panel_mode(struct drm_display_mode *fixed_mode, struct drm_display_mode *adjusted_mode) @@ -110,6 +112,19 @@ done: dev_priv->pch_pf_size = (width << 16) | height; } +static int is_backlight_combination_mode(struct drm_device *dev) +{ + struct drm_i915_private *dev_priv = dev->dev_private; + + if (INTEL_INFO(dev)->gen >= 4) + return I915_READ(BLC_PWM_CTL2) & BLM_COMBINATION_MODE; + + if (IS_GEN2(dev)) + return I915_READ(BLC_PWM_CTL) & BLM_LEGACY_MODE; + + return 0; +} + static u32 i915_read_blc_pwm_ctl(struct drm_i915_private *dev_priv) { u32 val; @@ -166,6 +181,9 @@ u32 intel_panel_get_max_backlight(struct drm_device *dev) if (INTEL_INFO(dev)->gen < 4) max &= ~1; } + + if (is_backlight_combination_mode(dev)) + max *= 0xff; } DRM_DEBUG_DRIVER("max backlight PWM = %d\n", max); @@ -183,6 +201,14 @@ u32 intel_panel_get_backlight(struct drm_device *dev) val = I915_READ(BLC_PWM_CTL) & BACKLIGHT_DUTY_CYCLE_MASK; if (IS_PINEVIEW(dev)) val >>= 1; + + if (is_backlight_combination_mode(dev)){ + u8 lbpc; + + val &= ~1; + pci_read_config_byte(dev->pdev, PCI_LBPC, &lbpc); + val *= lbpc; + } } DRM_DEBUG_DRIVER("get backlight PWM = %d\n", val); @@ -205,6 +231,16 @@ void intel_panel_set_backlight(struct drm_device *dev, u32 level) if (HAS_PCH_SPLIT(dev)) return intel_pch_panel_set_backlight(dev, level); + + if (is_backlight_combination_mode(dev)){ + u32 max = intel_panel_get_max_backlight(dev); + u8 lbpc; + + lbpc = level * 0xfe / max + 1; + level /= lbpc; + pci_write_config_byte(dev->pdev, PCI_LBPC, lbpc); + } + tmp = I915_READ(BLC_PWM_CTL); if (IS_PINEVIEW(dev)) { tmp &= ~(BACKLIGHT_DUTY_CYCLE_MASK - 1); diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index d270b3ff896..6140ea1de45 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -2194,7 +2194,6 @@ int evergreen_mc_init(struct radeon_device *rdev) rdev->mc.real_vram_size = RREG32(CONFIG_MEMSIZE) * 1024 * 1024; } rdev->mc.visible_vram_size = rdev->mc.aper_size; - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; r700_vram_gtt_location(rdev, &rdev->mc); radeon_update_bandwidth_info(rdev); @@ -2934,7 +2933,7 @@ static int evergreen_startup(struct radeon_device *rdev) /* XXX: ontario has problems blitting to gart at the moment */ if (rdev->family == CHIP_PALM) { rdev->asic->copy = NULL; - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; + radeon_ttm_set_active_vram_size(rdev, rdev->mc.visible_vram_size); } /* allocate wb buffer */ diff --git a/drivers/gpu/drm/radeon/evergreen_blit_kms.c b/drivers/gpu/drm/radeon/evergreen_blit_kms.c index 2adfb03f479..2be698e78ff 100644 --- a/drivers/gpu/drm/radeon/evergreen_blit_kms.c +++ b/drivers/gpu/drm/radeon/evergreen_blit_kms.c @@ -623,7 +623,7 @@ done: dev_err(rdev->dev, "(%d) pin blit object failed\n", r); return r; } - rdev->mc.active_vram_size = rdev->mc.real_vram_size; + radeon_ttm_set_active_vram_size(rdev, rdev->mc.real_vram_size); return 0; } @@ -631,7 +631,7 @@ void evergreen_blit_fini(struct radeon_device *rdev) { int r; - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; + radeon_ttm_set_active_vram_size(rdev, rdev->mc.visible_vram_size); if (rdev->r600_blit.shader_obj == NULL) return; /* If we can't reserve the bo, unref should be enough to destroy diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c index 93fa735c8c1..e372f9e1e5c 100644 --- a/drivers/gpu/drm/radeon/r100.c +++ b/drivers/gpu/drm/radeon/r100.c @@ -70,23 +70,6 @@ MODULE_FIRMWARE(FIRMWARE_R520); void r100_pre_page_flip(struct radeon_device *rdev, int crtc) { - struct radeon_crtc *radeon_crtc = rdev->mode_info.crtcs[crtc]; - u32 tmp; - - /* make sure flip is at vb rather than hb */ - tmp = RREG32(RADEON_CRTC_OFFSET_CNTL + radeon_crtc->crtc_offset); - tmp &= ~RADEON_CRTC_OFFSET_FLIP_CNTL; - /* make sure pending bit is asserted */ - tmp |= RADEON_CRTC_GUI_TRIG_OFFSET_LEFT_EN; - WREG32(RADEON_CRTC_OFFSET_CNTL + radeon_crtc->crtc_offset, tmp); - - /* set pageflip to happen as late as possible in the vblank interval. - * same field for crtc1/2 - */ - tmp = RREG32(RADEON_CRTC_GEN_CNTL); - tmp &= ~RADEON_CRTC_VSTAT_MODE_MASK; - WREG32(RADEON_CRTC_GEN_CNTL, tmp); - /* enable the pflip int */ radeon_irq_kms_pflip_irq_get(rdev, crtc); } @@ -1041,7 +1024,7 @@ int r100_cp_init(struct radeon_device *rdev, unsigned ring_size) return r; } rdev->cp.ready = true; - rdev->mc.active_vram_size = rdev->mc.real_vram_size; + radeon_ttm_set_active_vram_size(rdev, rdev->mc.real_vram_size); return 0; } @@ -1059,7 +1042,7 @@ void r100_cp_fini(struct radeon_device *rdev) void r100_cp_disable(struct radeon_device *rdev) { /* Disable ring */ - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; + radeon_ttm_set_active_vram_size(rdev, rdev->mc.visible_vram_size); rdev->cp.ready = false; WREG32(RADEON_CP_CSQ_MODE, 0); WREG32(RADEON_CP_CSQ_CNTL, 0); @@ -2329,7 +2312,6 @@ void r100_vram_init_sizes(struct radeon_device *rdev) /* FIXME we don't use the second aperture yet when we could use it */ if (rdev->mc.visible_vram_size > rdev->mc.aper_size) rdev->mc.visible_vram_size = rdev->mc.aper_size; - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; config_aper_size = RREG32(RADEON_CONFIG_APER_SIZE); if (rdev->flags & RADEON_IS_IGP) { uint32_t tom; diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index de88624d5f8..9b3fad23b76 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -1255,7 +1255,6 @@ int r600_mc_init(struct radeon_device *rdev) rdev->mc.mc_vram_size = RREG32(CONFIG_MEMSIZE); rdev->mc.real_vram_size = RREG32(CONFIG_MEMSIZE); rdev->mc.visible_vram_size = rdev->mc.aper_size; - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; r600_vram_gtt_location(rdev, &rdev->mc); if (rdev->flags & RADEON_IS_IGP) { @@ -1937,7 +1936,7 @@ void r600_pciep_wreg(struct radeon_device *rdev, u32 reg, u32 v) */ void r600_cp_stop(struct radeon_device *rdev) { - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; + radeon_ttm_set_active_vram_size(rdev, rdev->mc.visible_vram_size); WREG32(R_0086D8_CP_ME_CNTL, S_0086D8_CP_ME_HALT(1)); WREG32(SCRATCH_UMSK, 0); } diff --git a/drivers/gpu/drm/radeon/r600_blit_kms.c b/drivers/gpu/drm/radeon/r600_blit_kms.c index 41f7aafc97c..df68d91e819 100644 --- a/drivers/gpu/drm/radeon/r600_blit_kms.c +++ b/drivers/gpu/drm/radeon/r600_blit_kms.c @@ -558,7 +558,7 @@ done: dev_err(rdev->dev, "(%d) pin blit object failed\n", r); return r; } - rdev->mc.active_vram_size = rdev->mc.real_vram_size; + radeon_ttm_set_active_vram_size(rdev, rdev->mc.real_vram_size); return 0; } @@ -566,7 +566,7 @@ void r600_blit_fini(struct radeon_device *rdev) { int r; - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; + radeon_ttm_set_active_vram_size(rdev, rdev->mc.visible_vram_size); if (rdev->r600_blit.shader_obj == NULL) return; /* If we can't reserve the bo, unref should be enough to destroy diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 56c48b67ef3..6b342949511 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -345,7 +345,6 @@ struct radeon_mc { * about vram size near mc fb location */ u64 mc_vram_size; u64 visible_vram_size; - u64 active_vram_size; u64 gtt_size; u64 gtt_start; u64 gtt_end; @@ -1448,6 +1447,7 @@ extern void radeon_vram_location(struct radeon_device *rdev, struct radeon_mc *m extern void radeon_gtt_location(struct radeon_device *rdev, struct radeon_mc *mc); extern int radeon_resume_kms(struct drm_device *dev); extern int radeon_suspend_kms(struct drm_device *dev, pm_message_t state); +extern void radeon_ttm_set_active_vram_size(struct radeon_device *rdev, u64 size); /* r600, rv610, rv630, rv620, rv635, rv670, rs780, rs880 */ extern bool r600_card_posted(struct radeon_device *rdev); diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c index e75d63b8e21..793c5e6026a 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.c +++ b/drivers/gpu/drm/radeon/radeon_asic.c @@ -834,6 +834,9 @@ static struct radeon_asic sumo_asic = { .pm_finish = &evergreen_pm_finish, .pm_init_profile = &rs780_pm_init_profile, .pm_get_dynpm_state = &r600_pm_get_dynpm_state, + .pre_page_flip = &evergreen_pre_page_flip, + .page_flip = &evergreen_page_flip, + .post_page_flip = &evergreen_post_page_flip, }; static struct radeon_asic btc_asic = { diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index df95eb83dac..1fe95dfe48c 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -156,9 +156,12 @@ int radeon_gem_info_ioctl(struct drm_device *dev, void *data, { struct radeon_device *rdev = dev->dev_private; struct drm_radeon_gem_info *args = data; + struct ttm_mem_type_manager *man; + + man = &rdev->mman.bdev.man[TTM_PL_VRAM]; args->vram_size = rdev->mc.real_vram_size; - args->vram_visible = rdev->mc.real_vram_size; + args->vram_visible = (u64)man->size << PAGE_SHIFT; if (rdev->stollen_vga_memory) args->vram_visible -= radeon_bo_size(rdev->stollen_vga_memory); args->vram_visible -= radeon_fbdev_total_size(rdev); diff --git a/drivers/gpu/drm/radeon/radeon_legacy_crtc.c b/drivers/gpu/drm/radeon/radeon_legacy_crtc.c index cf0638c3b7c..78968b738e8 100644 --- a/drivers/gpu/drm/radeon/radeon_legacy_crtc.c +++ b/drivers/gpu/drm/radeon/radeon_legacy_crtc.c @@ -443,7 +443,7 @@ int radeon_crtc_do_set_base(struct drm_crtc *crtc, (target_fb->bits_per_pixel * 8)); crtc_pitch |= crtc_pitch << 16; - + crtc_offset_cntl |= RADEON_CRTC_GUI_TRIG_OFFSET_LEFT_EN; if (tiling_flags & RADEON_TILING_MACRO) { if (ASIC_IS_R300(rdev)) crtc_offset_cntl |= (R300_CRTC_X_Y_MODE_EN | @@ -502,6 +502,7 @@ int radeon_crtc_do_set_base(struct drm_crtc *crtc, gen_cntl_val = RREG32(gen_cntl_reg); gen_cntl_val &= ~(0xf << 8); gen_cntl_val |= (format << 8); + gen_cntl_val &= ~RADEON_CRTC_VSTAT_MODE_MASK; WREG32(gen_cntl_reg, gen_cntl_val); crtc_offset = (u32)base; diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index e5b2cf10cbf..8389b4c63d1 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -589,6 +589,20 @@ void radeon_ttm_fini(struct radeon_device *rdev) DRM_INFO("radeon: ttm finalized\n"); } +/* this should only be called at bootup or when userspace + * isn't running */ +void radeon_ttm_set_active_vram_size(struct radeon_device *rdev, u64 size) +{ + struct ttm_mem_type_manager *man; + + if (!rdev->mman.initialized) + return; + + man = &rdev->mman.bdev.man[TTM_PL_VRAM]; + /* this just adjusts TTM size idea, which sets lpfn to the correct value */ + man->size = size >> PAGE_SHIFT; +} + static struct vm_operations_struct radeon_ttm_vm_ops; static const struct vm_operations_struct *ttm_vm_ops = NULL; diff --git a/drivers/gpu/drm/radeon/rs600.c b/drivers/gpu/drm/radeon/rs600.c index 5afe294ed51..8af4679db23 100644 --- a/drivers/gpu/drm/radeon/rs600.c +++ b/drivers/gpu/drm/radeon/rs600.c @@ -751,7 +751,6 @@ void rs600_mc_init(struct radeon_device *rdev) rdev->mc.real_vram_size = RREG32(RADEON_CONFIG_MEMSIZE); rdev->mc.mc_vram_size = rdev->mc.real_vram_size; rdev->mc.visible_vram_size = rdev->mc.aper_size; - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; rdev->mc.igp_sideport_enabled = radeon_atombios_sideport_present(rdev); base = RREG32_MC(R_000004_MC_FB_LOCATION); base = G_000004_MC_FB_START(base) << 16; diff --git a/drivers/gpu/drm/radeon/rs690.c b/drivers/gpu/drm/radeon/rs690.c index 6638c8e4c81..66c949b7c18 100644 --- a/drivers/gpu/drm/radeon/rs690.c +++ b/drivers/gpu/drm/radeon/rs690.c @@ -157,7 +157,6 @@ void rs690_mc_init(struct radeon_device *rdev) rdev->mc.aper_base = pci_resource_start(rdev->pdev, 0); rdev->mc.aper_size = pci_resource_len(rdev->pdev, 0); rdev->mc.visible_vram_size = rdev->mc.aper_size; - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; base = RREG32_MC(R_000100_MCCFG_FB_LOCATION); base = G_000100_MC_FB_START(base) << 16; rdev->mc.igp_sideport_enabled = radeon_atombios_sideport_present(rdev); diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c index d8ba6769065..714ad45757d 100644 --- a/drivers/gpu/drm/radeon/rv770.c +++ b/drivers/gpu/drm/radeon/rv770.c @@ -307,7 +307,7 @@ static void rv770_mc_program(struct radeon_device *rdev) */ void r700_cp_stop(struct radeon_device *rdev) { - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; + radeon_ttm_set_active_vram_size(rdev, rdev->mc.visible_vram_size); WREG32(CP_ME_CNTL, (CP_ME_HALT | CP_PFP_HALT)); WREG32(SCRATCH_UMSK, 0); } @@ -1123,7 +1123,6 @@ int rv770_mc_init(struct radeon_device *rdev) rdev->mc.mc_vram_size = RREG32(CONFIG_MEMSIZE); rdev->mc.real_vram_size = RREG32(CONFIG_MEMSIZE); rdev->mc.visible_vram_size = rdev->mc.aper_size; - rdev->mc.active_vram_size = rdev->mc.visible_vram_size; r700_vram_gtt_location(rdev, &rdev->mc); radeon_update_bandwidth_info(rdev); diff --git a/drivers/hwmon/f71882fg.c b/drivers/hwmon/f71882fg.c index 3f49dd376f0..6e06019015a 100644 --- a/drivers/hwmon/f71882fg.c +++ b/drivers/hwmon/f71882fg.c @@ -37,7 +37,7 @@ #define SIO_F71858FG_LD_HWM 0x02 /* Hardware monitor logical device */ #define SIO_F71882FG_LD_HWM 0x04 /* Hardware monitor logical device */ #define SIO_UNLOCK_KEY 0x87 /* Key to enable Super-I/O */ -#define SIO_LOCK_KEY 0xAA /* Key to diasble Super-I/O */ +#define SIO_LOCK_KEY 0xAA /* Key to disable Super-I/O */ #define SIO_REG_LDSEL 0x07 /* Logical device select */ #define SIO_REG_DEVID 0x20 /* Device ID (2 bytes) */ @@ -2111,7 +2111,6 @@ static int f71882fg_remove(struct platform_device *pdev) int nr_fans = (data->type == f71882fg) ? 4 : 3; u8 start_reg = f71882fg_read8(data, F71882FG_REG_START); - platform_set_drvdata(pdev, NULL); if (data->hwmon_dev) hwmon_device_unregister(data->hwmon_dev); @@ -2178,6 +2177,7 @@ static int f71882fg_remove(struct platform_device *pdev) } } + platform_set_drvdata(pdev, NULL); kfree(data); return 0; diff --git a/drivers/i2c/busses/i2c-eg20t.c b/drivers/i2c/busses/i2c-eg20t.c index 2e067dd2ee5..50ea1f43bdc 100644 --- a/drivers/i2c/busses/i2c-eg20t.c +++ b/drivers/i2c/busses/i2c-eg20t.c @@ -29,6 +29,7 @@ #include <linux/pci.h> #include <linux/mutex.h> #include <linux/ktime.h> +#include <linux/slab.h> #define PCH_EVENT_SET 0 /* I2C Interrupt Event Set Status */ #define PCH_EVENT_NONE 1 /* I2C Interrupt Event Clear Status */ diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index ef3bcb1ce86..1b46a9d9f90 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -249,7 +249,7 @@ static struct i2c_adapter ocores_adapter = { static int ocores_i2c_of_probe(struct platform_device* pdev, struct ocores_i2c* i2c) { - __be32* val; + const __be32* val; val = of_get_property(pdev->dev.of_node, "regstep", NULL); if (!val) { @@ -330,9 +330,7 @@ static int __devinit ocores_i2c_probe(struct platform_device *pdev) i2c->adap = ocores_adapter; i2c_set_adapdata(&i2c->adap, i2c); i2c->adap.dev.parent = &pdev->dev; -#ifdef CONFIG_OF i2c->adap.dev.of_node = pdev->dev.of_node; -#endif /* add i2c adapter to i2c tree */ ret = i2c_add_adapter(&i2c->adap); @@ -390,15 +388,11 @@ static int ocores_i2c_resume(struct platform_device *pdev) #define ocores_i2c_resume NULL #endif -#ifdef CONFIG_OF static struct of_device_id ocores_i2c_match[] = { - { - .compatible = "opencores,i2c-ocores", - }, - {}, + { .compatible = "opencores,i2c-ocores", }, + {}, }; MODULE_DEVICE_TABLE(of, ocores_i2c_match); -#endif /* work with hotplug and coldplug */ MODULE_ALIAS("platform:ocores-i2c"); @@ -411,9 +405,7 @@ static struct platform_driver ocores_i2c_driver = { .driver = { .owner = THIS_MODULE, .name = "ocores-i2c", -#ifdef CONFIG_OF - .of_match_table = ocores_i2c_match, -#endif + .of_match_table = ocores_i2c_match, }, }; diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c index 829a2a1029f..58a58c7eaa1 100644 --- a/drivers/i2c/busses/i2c-omap.c +++ b/drivers/i2c/busses/i2c-omap.c @@ -378,9 +378,7 @@ static int omap_i2c_init(struct omap_i2c_dev *dev) * REVISIT: Some wkup sources might not be needed. */ dev->westate = OMAP_I2C_WE_ALL; - if (dev->rev < OMAP_I2C_REV_ON_4430) - omap_i2c_write_reg(dev, OMAP_I2C_WE_REG, - dev->westate); + omap_i2c_write_reg(dev, OMAP_I2C_WE_REG, dev->westate); } } omap_i2c_write_reg(dev, OMAP_I2C_CON_REG, 0); diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index f0bd5bcdf56..045ba6efea4 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -537,9 +537,7 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info) client->dev.parent = &client->adapter->dev; client->dev.bus = &i2c_bus_type; client->dev.type = &i2c_client_type; -#ifdef CONFIG_OF client->dev.of_node = info->of_node; -#endif dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap), client->addr); diff --git a/drivers/media/common/tuners/tda8290.c b/drivers/media/common/tuners/tda8290.c index bc6a67768af..8c4852114ee 100644 --- a/drivers/media/common/tuners/tda8290.c +++ b/drivers/media/common/tuners/tda8290.c @@ -658,13 +658,13 @@ static int tda8290_probe(struct tuner_i2c_props *i2c_props) #define TDA8290_ID 0x89 u8 reg = 0x1f, id; struct i2c_msg msg_read[] = { - { .addr = 0x4b, .flags = 0, .len = 1, .buf = ® }, - { .addr = 0x4b, .flags = I2C_M_RD, .len = 1, .buf = &id }, + { .addr = i2c_props->addr, .flags = 0, .len = 1, .buf = ® }, + { .addr = i2c_props->addr, .flags = I2C_M_RD, .len = 1, .buf = &id }, }; /* detect tda8290 */ if (i2c_transfer(i2c_props->adap, msg_read, 2) != 2) { - printk(KERN_WARNING "%s: tda8290 couldn't read register 0x%02x\n", + printk(KERN_WARNING "%s: couldn't read register 0x%02x\n", __func__, reg); return -ENODEV; } @@ -685,13 +685,13 @@ static int tda8295_probe(struct tuner_i2c_props *i2c_props) #define TDA8295C2_ID 0x8b u8 reg = 0x2f, id; struct i2c_msg msg_read[] = { - { .addr = 0x4b, .flags = 0, .len = 1, .buf = ® }, - { .addr = 0x4b, .flags = I2C_M_RD, .len = 1, .buf = &id }, + { .addr = i2c_props->addr, .flags = 0, .len = 1, .buf = ® }, + { .addr = i2c_props->addr, .flags = I2C_M_RD, .len = 1, .buf = &id }, }; - /* detect tda8290 */ + /* detect tda8295 */ if (i2c_transfer(i2c_props->adap, msg_read, 2) != 2) { - printk(KERN_WARNING "%s: tda8290 couldn't read register 0x%02x\n", + printk(KERN_WARNING "%s: couldn't read register 0x%02x\n", __func__, reg); return -ENODEV; } diff --git a/drivers/media/dvb/dvb-usb/dib0700_devices.c b/drivers/media/dvb/dvb-usb/dib0700_devices.c index defd83964ce..193cdb77b76 100644 --- a/drivers/media/dvb/dvb-usb/dib0700_devices.c +++ b/drivers/media/dvb/dvb-usb/dib0700_devices.c @@ -870,6 +870,23 @@ static int dib7070p_tuner_attach(struct dvb_usb_adapter *adap) return 0; } +static int stk7700p_pid_filter(struct dvb_usb_adapter *adapter, int index, + u16 pid, int onoff) +{ + struct dib0700_state *st = adapter->dev->priv; + if (st->is_dib7000pc) + return dib7000p_pid_filter(adapter->fe, index, pid, onoff); + return dib7000m_pid_filter(adapter->fe, index, pid, onoff); +} + +static int stk7700p_pid_filter_ctrl(struct dvb_usb_adapter *adapter, int onoff) +{ + struct dib0700_state *st = adapter->dev->priv; + if (st->is_dib7000pc) + return dib7000p_pid_filter_ctrl(adapter->fe, onoff); + return dib7000m_pid_filter_ctrl(adapter->fe, onoff); +} + static int stk70x0p_pid_filter(struct dvb_usb_adapter *adapter, int index, u16 pid, int onoff) { return dib7000p_pid_filter(adapter->fe, index, pid, onoff); @@ -1875,8 +1892,8 @@ struct dvb_usb_device_properties dib0700_devices[] = { { .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 32, - .pid_filter = stk70x0p_pid_filter, - .pid_filter_ctrl = stk70x0p_pid_filter_ctrl, + .pid_filter = stk7700p_pid_filter, + .pid_filter_ctrl = stk7700p_pid_filter_ctrl, .frontend_attach = stk7700p_frontend_attach, .tuner_attach = stk7700p_tuner_attach, diff --git a/drivers/media/dvb/dvb-usb/lmedm04.c b/drivers/media/dvb/dvb-usb/lmedm04.c index 9eea4188303..46ccd01a769 100644 --- a/drivers/media/dvb/dvb-usb/lmedm04.c +++ b/drivers/media/dvb/dvb-usb/lmedm04.c @@ -659,7 +659,7 @@ static int lme2510_download_firmware(struct usb_device *dev, } /* Default firmware for LME2510C */ -const char lme_firmware[50] = "dvb-usb-lme2510c-s7395.fw"; +char lme_firmware[50] = "dvb-usb-lme2510c-s7395.fw"; static void lme_coldreset(struct usb_device *dev) { @@ -1006,7 +1006,7 @@ static struct dvb_usb_device_properties lme2510c_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .download_firmware = lme2510_download_firmware, - .firmware = lme_firmware, + .firmware = (const char *)&lme_firmware, .size_of_priv = sizeof(struct lme2510_state), .num_adapters = 1, .adapter = { @@ -1109,5 +1109,5 @@ module_exit(lme2510_module_exit); MODULE_AUTHOR("Malcolm Priestley <tvboxspy@gmail.com>"); MODULE_DESCRIPTION("LME2510(C) DVB-S USB2.0"); -MODULE_VERSION("1.74"); +MODULE_VERSION("1.75"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/dvb/frontends/dib7000m.c b/drivers/media/dvb/frontends/dib7000m.c index c7f5ccf54aa..289a79837f2 100644 --- a/drivers/media/dvb/frontends/dib7000m.c +++ b/drivers/media/dvb/frontends/dib7000m.c @@ -1285,6 +1285,25 @@ struct i2c_adapter * dib7000m_get_i2c_master(struct dvb_frontend *demod, enum di } EXPORT_SYMBOL(dib7000m_get_i2c_master); +int dib7000m_pid_filter_ctrl(struct dvb_frontend *fe, u8 onoff) +{ + struct dib7000m_state *state = fe->demodulator_priv; + u16 val = dib7000m_read_word(state, 294 + state->reg_offs) & 0xffef; + val |= (onoff & 0x1) << 4; + dprintk("PID filter enabled %d", onoff); + return dib7000m_write_word(state, 294 + state->reg_offs, val); +} +EXPORT_SYMBOL(dib7000m_pid_filter_ctrl); + +int dib7000m_pid_filter(struct dvb_frontend *fe, u8 id, u16 pid, u8 onoff) +{ + struct dib7000m_state *state = fe->demodulator_priv; + dprintk("PID filter: index %x, PID %d, OnOff %d", id, pid, onoff); + return dib7000m_write_word(state, 300 + state->reg_offs + id, + onoff ? (1 << 13) | pid : 0); +} +EXPORT_SYMBOL(dib7000m_pid_filter); + #if 0 /* used with some prototype boards */ int dib7000m_i2c_enumeration(struct i2c_adapter *i2c, int no_of_demods, diff --git a/drivers/media/dvb/frontends/dib7000m.h b/drivers/media/dvb/frontends/dib7000m.h index 113819ce9f0..81fcf2241c6 100644 --- a/drivers/media/dvb/frontends/dib7000m.h +++ b/drivers/media/dvb/frontends/dib7000m.h @@ -46,6 +46,8 @@ extern struct dvb_frontend *dib7000m_attach(struct i2c_adapter *i2c_adap, extern struct i2c_adapter *dib7000m_get_i2c_master(struct dvb_frontend *, enum dibx000_i2c_interface, int); +extern int dib7000m_pid_filter(struct dvb_frontend *, u8 id, u16 pid, u8 onoff); +extern int dib7000m_pid_filter_ctrl(struct dvb_frontend *fe, u8 onoff); #else static inline struct dvb_frontend *dib7000m_attach(struct i2c_adapter *i2c_adap, @@ -63,6 +65,19 @@ struct i2c_adapter *dib7000m_get_i2c_master(struct dvb_frontend *demod, printk(KERN_WARNING "%s: driver disabled by Kconfig\n", __func__); return NULL; } +static inline int dib7000m_pid_filter(struct dvb_frontend *fe, u8 id, + u16 pid, u8 onoff) +{ + printk(KERN_WARNING "%s: driver disabled by Kconfig\n", __func__); + return -ENODEV; +} + +static inline int dib7000m_pid_filter_ctrl(struct dvb_frontend *fe, + uint8_t onoff) +{ + printk(KERN_WARNING "%s: driver disabled by Kconfig\n", __func__); + return -ENODEV; +} #endif /* TODO diff --git a/drivers/media/dvb/mantis/mantis_pci.c b/drivers/media/dvb/mantis/mantis_pci.c index 59feeb84aec..10a432a79d0 100644 --- a/drivers/media/dvb/mantis/mantis_pci.c +++ b/drivers/media/dvb/mantis/mantis_pci.c @@ -22,7 +22,6 @@ #include <linux/moduleparam.h> #include <linux/kernel.h> #include <asm/io.h> -#include <asm/pgtable.h> #include <asm/page.h> #include <linux/kmod.h> #include <linux/vmalloc.h> diff --git a/drivers/media/rc/ir-raw.c b/drivers/media/rc/ir-raw.c index 73230ff93b8..01f258a2a57 100644 --- a/drivers/media/rc/ir-raw.c +++ b/drivers/media/rc/ir-raw.c @@ -112,7 +112,7 @@ int ir_raw_event_store_edge(struct rc_dev *dev, enum raw_event_type type) { ktime_t now; s64 delta; /* ns */ - struct ir_raw_event ev; + DEFINE_IR_RAW_EVENT(ev); int rc = 0; if (!dev->raw) @@ -125,7 +125,6 @@ int ir_raw_event_store_edge(struct rc_dev *dev, enum raw_event_type type) * being called for the first time, note that delta can't * possibly be negative. */ - ev.duration = 0; if (delta > IR_MAX_DURATION || !dev->raw->last_type) type |= IR_START_EVENT; else diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 6df0a498064..e4f8eac7f71 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -148,6 +148,7 @@ enum mceusb_model_type { MCE_GEN2_TX_INV, POLARIS_EVK, CX_HYBRID_TV, + MULTIFUNCTION, }; struct mceusb_model { @@ -155,9 +156,10 @@ struct mceusb_model { u32 mce_gen2:1; u32 mce_gen3:1; u32 tx_mask_normal:1; - u32 is_polaris:1; u32 no_tx:1; + int ir_intfnum; + const char *rc_map; /* Allow specify a per-board map */ const char *name; /* per-board name */ }; @@ -179,7 +181,6 @@ static const struct mceusb_model mceusb_model[] = { .tx_mask_normal = 1, }, [POLARIS_EVK] = { - .is_polaris = 1, /* * In fact, the EVK is shipped without * remotes, but we should have something handy, @@ -189,10 +190,13 @@ static const struct mceusb_model mceusb_model[] = { .name = "Conexant Hybrid TV (cx231xx) MCE IR", }, [CX_HYBRID_TV] = { - .is_polaris = 1, .no_tx = 1, /* tx isn't wired up at all */ .name = "Conexant Hybrid TV (cx231xx) MCE IR", }, + [MULTIFUNCTION] = { + .mce_gen2 = 1, + .ir_intfnum = 2, + }, }; static struct usb_device_id mceusb_dev_table[] = { @@ -216,8 +220,9 @@ static struct usb_device_id mceusb_dev_table[] = { { USB_DEVICE(VENDOR_PHILIPS, 0x206c) }, /* Philips/Spinel plus IR transceiver for ASUS */ { USB_DEVICE(VENDOR_PHILIPS, 0x2088) }, - /* Realtek MCE IR Receiver */ - { USB_DEVICE(VENDOR_REALTEK, 0x0161) }, + /* Realtek MCE IR Receiver and card reader */ + { USB_DEVICE(VENDOR_REALTEK, 0x0161), + .driver_info = MULTIFUNCTION }, /* SMK/Toshiba G83C0004D410 */ { USB_DEVICE(VENDOR_SMK, 0x031d), .driver_info = MCE_GEN2_TX_INV }, @@ -1101,7 +1106,7 @@ static int __devinit mceusb_dev_probe(struct usb_interface *intf, bool is_gen3; bool is_microsoft_gen1; bool tx_mask_normal; - bool is_polaris; + int ir_intfnum; dev_dbg(&intf->dev, "%s called\n", __func__); @@ -1110,13 +1115,11 @@ static int __devinit mceusb_dev_probe(struct usb_interface *intf, is_gen3 = mceusb_model[model].mce_gen3; is_microsoft_gen1 = mceusb_model[model].mce_gen1; tx_mask_normal = mceusb_model[model].tx_mask_normal; - is_polaris = mceusb_model[model].is_polaris; + ir_intfnum = mceusb_model[model].ir_intfnum; - if (is_polaris) { - /* Interface 0 is IR */ - if (idesc->desc.bInterfaceNumber) - return -ENODEV; - } + /* There are multi-function devices with non-IR interfaces */ + if (idesc->desc.bInterfaceNumber != ir_intfnum) + return -ENODEV; /* step through the endpoints to find first bulk in and out endpoint */ for (i = 0; i < idesc->desc.bNumEndpoints; ++i) { diff --git a/drivers/media/rc/nuvoton-cir.c b/drivers/media/rc/nuvoton-cir.c index 273d9d67479..d4d64492a05 100644 --- a/drivers/media/rc/nuvoton-cir.c +++ b/drivers/media/rc/nuvoton-cir.c @@ -385,8 +385,9 @@ static void nvt_cir_regs_init(struct nvt_dev *nvt) static void nvt_cir_wake_regs_init(struct nvt_dev *nvt) { - /* set number of bytes needed for wake key comparison (default 67) */ - nvt_cir_wake_reg_write(nvt, CIR_WAKE_FIFO_LEN, CIR_WAKE_FIFO_CMP_DEEP); + /* set number of bytes needed for wake from s3 (default 65) */ + nvt_cir_wake_reg_write(nvt, CIR_WAKE_FIFO_CMP_BYTES, + CIR_WAKE_FIFO_CMP_DEEP); /* set tolerance/variance allowed per byte during wake compare */ nvt_cir_wake_reg_write(nvt, CIR_WAKE_CMP_TOLERANCE, diff --git a/drivers/media/rc/nuvoton-cir.h b/drivers/media/rc/nuvoton-cir.h index 1df82351cb0..048135eea70 100644 --- a/drivers/media/rc/nuvoton-cir.h +++ b/drivers/media/rc/nuvoton-cir.h @@ -305,8 +305,11 @@ struct nvt_dev { #define CIR_WAKE_IRFIFOSTS_RX_EMPTY 0x20 #define CIR_WAKE_IRFIFOSTS_RX_FULL 0x10 -/* CIR Wake FIFO buffer is 67 bytes long */ -#define CIR_WAKE_FIFO_LEN 67 +/* + * The CIR Wake FIFO buffer is 67 bytes long, but the stock remote wakes + * the system comparing only 65 bytes (fails with this set to 67) + */ +#define CIR_WAKE_FIFO_CMP_BYTES 65 /* CIR Wake byte comparison tolerance */ #define CIR_WAKE_CMP_TOLERANCE 5 diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 512a2f4ada0..5b4422ef4e6 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -850,7 +850,7 @@ static ssize_t store_protocols(struct device *device, count++; } else { for (i = 0; i < ARRAY_SIZE(proto_names); i++) { - if (!strncasecmp(tmp, proto_names[i].name, strlen(proto_names[i].name))) { + if (!strcasecmp(tmp, proto_names[i].name)) { tmp += strlen(proto_names[i].name); mask = proto_names[i].type; break; diff --git a/drivers/media/video/au0828/au0828-video.c b/drivers/media/video/au0828/au0828-video.c index e41e4ad5cc4..9c475c600fc 100644 --- a/drivers/media/video/au0828/au0828-video.c +++ b/drivers/media/video/au0828/au0828-video.c @@ -1758,7 +1758,12 @@ static int vidioc_reqbufs(struct file *file, void *priv, if (rc < 0) return rc; - return videobuf_reqbufs(&fh->vb_vidq, rb); + if (fh->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) + rc = videobuf_reqbufs(&fh->vb_vidq, rb); + else if (fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) + rc = videobuf_reqbufs(&fh->vb_vbiq, rb); + + return rc; } static int vidioc_querybuf(struct file *file, void *priv, @@ -1772,7 +1777,12 @@ static int vidioc_querybuf(struct file *file, void *priv, if (rc < 0) return rc; - return videobuf_querybuf(&fh->vb_vidq, b); + if (fh->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) + rc = videobuf_querybuf(&fh->vb_vidq, b); + else if (fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) + rc = videobuf_querybuf(&fh->vb_vbiq, b); + + return rc; } static int vidioc_qbuf(struct file *file, void *priv, struct v4l2_buffer *b) @@ -1785,7 +1795,12 @@ static int vidioc_qbuf(struct file *file, void *priv, struct v4l2_buffer *b) if (rc < 0) return rc; - return videobuf_qbuf(&fh->vb_vidq, b); + if (fh->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) + rc = videobuf_qbuf(&fh->vb_vidq, b); + else if (fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) + rc = videobuf_qbuf(&fh->vb_vbiq, b); + + return rc; } static int vidioc_dqbuf(struct file *file, void *priv, struct v4l2_buffer *b) @@ -1806,7 +1821,12 @@ static int vidioc_dqbuf(struct file *file, void *priv, struct v4l2_buffer *b) dev->greenscreen_detected = 0; } - return videobuf_dqbuf(&fh->vb_vidq, b, file->f_flags & O_NONBLOCK); + if (fh->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) + rc = videobuf_dqbuf(&fh->vb_vidq, b, file->f_flags & O_NONBLOCK); + else if (fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) + rc = videobuf_dqbuf(&fh->vb_vbiq, b, file->f_flags & O_NONBLOCK); + + return rc; } static struct v4l2_file_operations au0828_v4l_fops = { diff --git a/drivers/media/video/cx18/cx18-cards.c b/drivers/media/video/cx18/cx18-cards.c index 87177733cf9..68ad1963f42 100644 --- a/drivers/media/video/cx18/cx18-cards.c +++ b/drivers/media/video/cx18/cx18-cards.c @@ -95,6 +95,53 @@ static const struct cx18_card cx18_card_hvr1600_esmt = { .i2c = &cx18_i2c_std, }; +static const struct cx18_card cx18_card_hvr1600_s5h1411 = { + .type = CX18_CARD_HVR_1600_S5H1411, + .name = "Hauppauge HVR-1600", + .comment = "Simultaneous Digital and Analog TV capture supported\n", + .v4l2_capabilities = CX18_CAP_ENCODER, + .hw_audio_ctrl = CX18_HW_418_AV, + .hw_muxer = CX18_HW_CS5345, + .hw_all = CX18_HW_TVEEPROM | CX18_HW_418_AV | CX18_HW_TUNER | + CX18_HW_CS5345 | CX18_HW_DVB | CX18_HW_GPIO_RESET_CTRL | + CX18_HW_Z8F0811_IR_HAUP, + .video_inputs = { + { CX18_CARD_INPUT_VID_TUNER, 0, CX18_AV_COMPOSITE7 }, + { CX18_CARD_INPUT_SVIDEO1, 1, CX18_AV_SVIDEO1 }, + { CX18_CARD_INPUT_COMPOSITE1, 1, CX18_AV_COMPOSITE3 }, + { CX18_CARD_INPUT_SVIDEO2, 2, CX18_AV_SVIDEO2 }, + { CX18_CARD_INPUT_COMPOSITE2, 2, CX18_AV_COMPOSITE4 }, + }, + .audio_inputs = { + { CX18_CARD_INPUT_AUD_TUNER, + CX18_AV_AUDIO8, CS5345_IN_1 | CS5345_MCLK_1_5 }, + { CX18_CARD_INPUT_LINE_IN1, + CX18_AV_AUDIO_SERIAL1, CS5345_IN_2 }, + { CX18_CARD_INPUT_LINE_IN2, + CX18_AV_AUDIO_SERIAL1, CS5345_IN_3 }, + }, + .radio_input = { CX18_CARD_INPUT_AUD_TUNER, + CX18_AV_AUDIO_SERIAL1, CS5345_IN_4 }, + .ddr = { + /* ESMT M13S128324A-5B memory */ + .chip_config = 0x003, + .refresh = 0x30c, + .timing1 = 0x44220e82, + .timing2 = 0x08, + .tune_lane = 0, + .initial_emrs = 0, + }, + .gpio_init.initial_value = 0x3001, + .gpio_init.direction = 0x3001, + .gpio_i2c_slave_reset = { + .active_lo_mask = 0x3001, + .msecs_asserted = 10, + .msecs_recovery = 40, + .ir_reset_mask = 0x0001, + }, + .i2c = &cx18_i2c_std, +}; + static const struct cx18_card cx18_card_hvr1600_samsung = { .type = CX18_CARD_HVR_1600_SAMSUNG, .name = "Hauppauge HVR-1600 (Preproduction)", @@ -523,7 +570,8 @@ static const struct cx18_card *cx18_card_list[] = { &cx18_card_toshiba_qosmio_dvbt, &cx18_card_leadtek_pvr2100, &cx18_card_leadtek_dvr3100h, - &cx18_card_gotview_dvd3 + &cx18_card_gotview_dvd3, + &cx18_card_hvr1600_s5h1411 }; const struct cx18_card *cx18_get_card(u16 index) diff --git a/drivers/media/video/cx18/cx18-driver.c b/drivers/media/video/cx18/cx18-driver.c index 944af8adbe0..b1c3cbd9274 100644 --- a/drivers/media/video/cx18/cx18-driver.c +++ b/drivers/media/video/cx18/cx18-driver.c @@ -157,6 +157,7 @@ MODULE_PARM_DESC(cardtype, "\t\t\t 7 = Leadtek WinFast PVR2100\n" "\t\t\t 8 = Leadtek WinFast DVR3100 H\n" "\t\t\t 9 = GoTView PCI DVD3 Hybrid\n" + "\t\t\t 10 = Hauppauge HVR 1600 (S5H1411)\n" "\t\t\t 0 = Autodetect (default)\n" "\t\t\t-1 = Ignore this card\n\t\t"); MODULE_PARM_DESC(pal, "Set PAL standard: B, G, H, D, K, I, M, N, Nc, 60"); @@ -337,6 +338,7 @@ void cx18_read_eeprom(struct cx18 *cx, struct tveeprom *tv) switch (cx->card->type) { case CX18_CARD_HVR_1600_ESMT: case CX18_CARD_HVR_1600_SAMSUNG: + case CX18_CARD_HVR_1600_S5H1411: tveeprom_hauppauge_analog(&c, tv, eedata); break; case CX18_CARD_YUAN_MPC718: @@ -365,7 +367,25 @@ static void cx18_process_eeprom(struct cx18 *cx) from the model number. Use the cardtype module option if you have one of these preproduction models. */ switch (tv.model) { - case 74000 ... 74999: + case 74301: /* Retail models */ + case 74321: + case 74351: /* OEM models */ + case 74361: + /* Digital side is s5h1411/tda18271 */ + cx->card = cx18_get_card(CX18_CARD_HVR_1600_S5H1411); + break; + case 74021: /* Retail models */ + case 74031: + case 74041: + case 74141: + case 74541: /* OEM models */ + case 74551: + case 74591: + case 74651: + case 74691: + case 74751: + case 74891: + /* Digital side is s5h1409/mxl5005s */ cx->card = cx18_get_card(CX18_CARD_HVR_1600_ESMT); break; case 0x718: @@ -377,7 +397,8 @@ static void cx18_process_eeprom(struct cx18 *cx) CX18_ERR("Invalid EEPROM\n"); return; default: - CX18_ERR("Unknown model %d, defaulting to HVR-1600\n", tv.model); + CX18_ERR("Unknown model %d, defaulting to original HVR-1600 " + "(cardtype=1)\n", tv.model); cx->card = cx18_get_card(CX18_CARD_HVR_1600_ESMT); break; } diff --git a/drivers/media/video/cx18/cx18-driver.h b/drivers/media/video/cx18/cx18-driver.h index 306caac6d3f..f736679d251 100644 --- a/drivers/media/video/cx18/cx18-driver.h +++ b/drivers/media/video/cx18/cx18-driver.h @@ -85,7 +85,8 @@ #define CX18_CARD_LEADTEK_PVR2100 6 /* Leadtek WinFast PVR2100 */ #define CX18_CARD_LEADTEK_DVR3100H 7 /* Leadtek WinFast DVR3100 H */ #define CX18_CARD_GOTVIEW_PCI_DVD3 8 /* GoTView PCI DVD3 Hybrid */ -#define CX18_CARD_LAST 8 +#define CX18_CARD_HVR_1600_S5H1411 9 /* Hauppauge HVR 1600 s5h1411/tda18271*/ +#define CX18_CARD_LAST 9 #define CX18_ENC_STREAM_TYPE_MPG 0 #define CX18_ENC_STREAM_TYPE_TS 1 diff --git a/drivers/media/video/cx18/cx18-dvb.c b/drivers/media/video/cx18/cx18-dvb.c index f0381d62518..f41922bd402 100644 --- a/drivers/media/video/cx18/cx18-dvb.c +++ b/drivers/media/video/cx18/cx18-dvb.c @@ -29,6 +29,8 @@ #include "cx18-gpio.h" #include "s5h1409.h" #include "mxl5005s.h" +#include "s5h1411.h" +#include "tda18271.h" #include "zl10353.h" #include <linux/firmware.h> @@ -77,6 +79,32 @@ static struct s5h1409_config hauppauge_hvr1600_config = { }; /* + * CX18_CARD_HVR_1600_S5H1411 + */ +static struct s5h1411_config hcw_s5h1411_config = { + .output_mode = S5H1411_SERIAL_OUTPUT, + .gpio = S5H1411_GPIO_OFF, + .vsb_if = S5H1411_IF_44000, + .qam_if = S5H1411_IF_4000, + .inversion = S5H1411_INVERSION_ON, + .status_mode = S5H1411_DEMODLOCKING, + .mpeg_timing = S5H1411_MPEGTIMING_CONTINOUS_NONINVERTING_CLOCK, +}; + +static struct tda18271_std_map hauppauge_tda18271_std_map = { + .atsc_6 = { .if_freq = 5380, .agc_mode = 3, .std = 3, + .if_lvl = 6, .rfagc_top = 0x37 }, + .qam_6 = { .if_freq = 4000, .agc_mode = 3, .std = 0, + .if_lvl = 6, .rfagc_top = 0x37 }, +}; + +static struct tda18271_config hauppauge_tda18271_config = { + .std_map = &hauppauge_tda18271_std_map, + .gate = TDA18271_GATE_DIGITAL, + .output_opt = TDA18271_OUTPUT_LT_OFF, +}; + +/* * CX18_CARD_LEADTEK_DVR3100H */ /* Information/confirmation of proper config values provided by Terry Wu */ @@ -244,6 +272,7 @@ static int cx18_dvb_start_feed(struct dvb_demux_feed *feed) switch (cx->card->type) { case CX18_CARD_HVR_1600_ESMT: case CX18_CARD_HVR_1600_SAMSUNG: + case CX18_CARD_HVR_1600_S5H1411: v = cx18_read_reg(cx, CX18_REG_DMUX_NUM_PORT_0_CONTROL); v |= 0x00400000; /* Serial Mode */ v |= 0x00002000; /* Data Length - Byte */ @@ -455,6 +484,15 @@ static int dvb_register(struct cx18_stream *stream) ret = 0; } break; + case CX18_CARD_HVR_1600_S5H1411: + dvb->fe = dvb_attach(s5h1411_attach, + &hcw_s5h1411_config, + &cx->i2c_adap[0]); + if (dvb->fe != NULL) + dvb_attach(tda18271_attach, dvb->fe, + 0x60, &cx->i2c_adap[0], + &hauppauge_tda18271_config); + break; case CX18_CARD_LEADTEK_DVR3100H: dvb->fe = dvb_attach(zl10353_attach, &leadtek_dvr3100h_demod, diff --git a/drivers/media/video/cx23885/cx23885-i2c.c b/drivers/media/video/cx23885/cx23885-i2c.c index ed3d8f55029..307ff543c25 100644 --- a/drivers/media/video/cx23885/cx23885-i2c.c +++ b/drivers/media/video/cx23885/cx23885-i2c.c @@ -122,10 +122,6 @@ static int i2c_sendbytes(struct i2c_adapter *i2c_adap, if (!i2c_wait_done(i2c_adap)) goto eio; - if (!i2c_slave_did_ack(i2c_adap)) { - retval = -ENXIO; - goto err; - } if (i2c_debug) { printk(" <W %02x %02x", msg->addr << 1, msg->buf[0]); if (!(ctrl & I2C_NOSTOP)) @@ -158,7 +154,6 @@ static int i2c_sendbytes(struct i2c_adapter *i2c_adap, eio: retval = -EIO; - err: if (i2c_debug) printk(KERN_ERR " ERR: %d\n", retval); return retval; @@ -209,10 +204,6 @@ static int i2c_readbytes(struct i2c_adapter *i2c_adap, if (!i2c_wait_done(i2c_adap)) goto eio; - if (cnt == 0 && !i2c_slave_did_ack(i2c_adap)) { - retval = -ENXIO; - goto err; - } msg->buf[cnt] = cx_read(bus->reg_rdata) & 0xff; if (i2c_debug) { dprintk(1, " %02x", msg->buf[cnt]); @@ -224,7 +215,6 @@ static int i2c_readbytes(struct i2c_adapter *i2c_adap, eio: retval = -EIO; - err: if (i2c_debug) printk(KERN_ERR " ERR: %d\n", retval); return retval; diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c index 6fc09dd41b9..35796e03524 100644 --- a/drivers/media/video/cx25840/cx25840-core.c +++ b/drivers/media/video/cx25840/cx25840-core.c @@ -2015,7 +2015,8 @@ static int cx25840_probe(struct i2c_client *client, kfree(state); return err; } - v4l2_ctrl_cluster(2, &state->volume); + if (!is_cx2583x(state)) + v4l2_ctrl_cluster(2, &state->volume); v4l2_ctrl_handler_setup(&state->hdl); if (client->dev.platform_data) { diff --git a/drivers/media/video/ivtv/ivtv-irq.c b/drivers/media/video/ivtv/ivtv-irq.c index 9b4faf00919..9c29e964d40 100644 --- a/drivers/media/video/ivtv/ivtv-irq.c +++ b/drivers/media/video/ivtv/ivtv-irq.c @@ -628,22 +628,66 @@ static void ivtv_irq_enc_pio_complete(struct ivtv *itv) static void ivtv_irq_dma_err(struct ivtv *itv) { u32 data[CX2341X_MBOX_MAX_DATA]; + u32 status; del_timer(&itv->dma_timer); + ivtv_api_get_data(&itv->enc_mbox, IVTV_MBOX_DMA_END, 2, data); + status = read_reg(IVTV_REG_DMASTATUS); IVTV_DEBUG_WARN("DMA ERROR %08x %08x %08x %d\n", data[0], data[1], - read_reg(IVTV_REG_DMASTATUS), itv->cur_dma_stream); - write_reg(read_reg(IVTV_REG_DMASTATUS) & 3, IVTV_REG_DMASTATUS); + status, itv->cur_dma_stream); + /* + * We do *not* write back to the IVTV_REG_DMASTATUS register to + * clear the error status, if either the encoder write (0x02) or + * decoder read (0x01) bus master DMA operation do not indicate + * completed. We can race with the DMA engine, which may have + * transitioned to completed status *after* we read the register. + * Setting a IVTV_REG_DMASTATUS flag back to "busy" status, after the + * DMA engine has completed, will cause the DMA engine to stop working. + */ + status &= 0x3; + if (status == 0x3) + write_reg(status, IVTV_REG_DMASTATUS); + if (!test_bit(IVTV_F_I_UDMA, &itv->i_flags) && itv->cur_dma_stream >= 0 && itv->cur_dma_stream < IVTV_MAX_STREAMS) { struct ivtv_stream *s = &itv->streams[itv->cur_dma_stream]; - /* retry */ - if (s->type >= IVTV_DEC_STREAM_TYPE_MPG) + if (s->type >= IVTV_DEC_STREAM_TYPE_MPG) { + /* retry */ + /* + * FIXME - handle cases of DMA error similar to + * encoder below, except conditioned on status & 0x1 + */ ivtv_dma_dec_start(s); - else - ivtv_dma_enc_start(s); - return; + return; + } else { + if ((status & 0x2) == 0) { + /* + * CX2341x Bus Master DMA write is ongoing. + * Reset the timer and let it complete. + */ + itv->dma_timer.expires = + jiffies + msecs_to_jiffies(600); + add_timer(&itv->dma_timer); + return; + } + + if (itv->dma_retries < 3) { + /* + * CX2341x Bus Master DMA write has ended. + * Retry the write, starting with the first + * xfer segment. Just retrying the current + * segment is not sufficient. + */ + s->sg_processed = 0; + itv->dma_retries++; + ivtv_dma_enc_start_xfer(s); + return; + } + /* Too many retries, give up on this one */ + } + } if (test_bit(IVTV_F_I_UDMA, &itv->i_flags)) { ivtv_udma_start(itv); diff --git a/drivers/media/video/mem2mem_testdev.c b/drivers/media/video/mem2mem_testdev.c index c179041d91f..e7e717800ee 100644 --- a/drivers/media/video/mem2mem_testdev.c +++ b/drivers/media/video/mem2mem_testdev.c @@ -1011,7 +1011,6 @@ static int m2mtest_remove(struct platform_device *pdev) v4l2_m2m_release(dev->m2m_dev); del_timer_sync(&dev->timer); video_unregister_device(dev->vfd); - video_device_release(dev->vfd); v4l2_device_unregister(&dev->v4l2_dev); kfree(dev); diff --git a/drivers/media/video/s2255drv.c b/drivers/media/video/s2255drv.c index b63f8cafa67..561909b65ce 100644 --- a/drivers/media/video/s2255drv.c +++ b/drivers/media/video/s2255drv.c @@ -57,7 +57,7 @@ #include <linux/usb.h> #define S2255_MAJOR_VERSION 1 -#define S2255_MINOR_VERSION 20 +#define S2255_MINOR_VERSION 21 #define S2255_RELEASE 0 #define S2255_VERSION KERNEL_VERSION(S2255_MAJOR_VERSION, \ S2255_MINOR_VERSION, \ @@ -312,9 +312,9 @@ struct s2255_fh { }; /* current cypress EEPROM firmware version */ -#define S2255_CUR_USB_FWVER ((3 << 8) | 6) +#define S2255_CUR_USB_FWVER ((3 << 8) | 11) /* current DSP FW version */ -#define S2255_CUR_DSP_FWVER 8 +#define S2255_CUR_DSP_FWVER 10102 /* Need DSP version 5+ for video status feature */ #define S2255_MIN_DSP_STATUS 5 #define S2255_MIN_DSP_COLORFILTER 8 @@ -492,9 +492,11 @@ static void planar422p_to_yuv_packed(const unsigned char *in, static void s2255_reset_dsppower(struct s2255_dev *dev) { - s2255_vendor_req(dev, 0x40, 0x0b0b, 0x0b0b, NULL, 0, 1); + s2255_vendor_req(dev, 0x40, 0x0b0b, 0x0b01, NULL, 0, 1); msleep(10); s2255_vendor_req(dev, 0x50, 0x0000, 0x0000, NULL, 0, 1); + msleep(600); + s2255_vendor_req(dev, 0x10, 0x0000, 0x0000, NULL, 0, 1); return; } diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 6625c057be0..150b5f3cd40 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1529,7 +1529,7 @@ void mmc_rescan(struct work_struct *work) * still present */ if (host->bus_ops && host->bus_ops->detect && !host->bus_dead - && mmc_card_is_removable(host)) + && !(host->caps & MMC_CAP_NONREMOVABLE)) host->bus_ops->detect(host); /* diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index fd877f633dd..2f7fc0c5146 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -1516,21 +1516,17 @@ static int __devexit mmc_spi_remove(struct spi_device *spi) return 0; } -#if defined(CONFIG_OF) static struct of_device_id mmc_spi_of_match_table[] __devinitdata = { { .compatible = "mmc-spi-slot", }, {}, }; -#endif static struct spi_driver mmc_spi_driver = { .driver = { .name = "mmc_spi", .bus = &spi_bus_type, .owner = THIS_MODULE, -#if defined(CONFIG_OF) .of_match_table = mmc_spi_of_match_table, -#endif }, .probe = mmc_spi_probe, .remove = __devexit_p(mmc_spi_remove), diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c index a8c3e1c9b02..4aaa88f8ab5 100644 --- a/drivers/mtd/chips/cfi_cmdset_0001.c +++ b/drivers/mtd/chips/cfi_cmdset_0001.c @@ -1230,10 +1230,32 @@ static int inval_cache_and_wait_for_operation( sleep_time = chip_op_time / 2; for (;;) { + if (chip->state != chip_state) { + /* Someone's suspended the operation: sleep */ + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&chip->wq, &wait); + mutex_unlock(&chip->mutex); + schedule(); + remove_wait_queue(&chip->wq, &wait); + mutex_lock(&chip->mutex); + continue; + } + status = map_read(map, cmd_adr); if (map_word_andequal(map, status, status_OK, status_OK)) break; + if (chip->erase_suspended && chip_state == FL_ERASING) { + /* Erase suspend occured while sleep: reset timeout */ + timeo = reset_timeo; + chip->erase_suspended = 0; + } + if (chip->write_suspended && chip_state == FL_WRITING) { + /* Write suspend occured while sleep: reset timeout */ + timeo = reset_timeo; + chip->write_suspended = 0; + } if (!timeo) { map_write(map, CMD(0x70), cmd_adr); chip->state = FL_STATUS; @@ -1257,27 +1279,6 @@ static int inval_cache_and_wait_for_operation( timeo--; } mutex_lock(&chip->mutex); - - while (chip->state != chip_state) { - /* Someone's suspended the operation: sleep */ - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&chip->wq, &wait); - mutex_unlock(&chip->mutex); - schedule(); - remove_wait_queue(&chip->wq, &wait); - mutex_lock(&chip->mutex); - } - if (chip->erase_suspended && chip_state == FL_ERASING) { - /* Erase suspend occured while sleep: reset timeout */ - timeo = reset_timeo; - chip->erase_suspended = 0; - } - if (chip->write_suspended && chip_state == FL_WRITING) { - /* Write suspend occured while sleep: reset timeout */ - timeo = reset_timeo; - chip->write_suspended = 0; - } } /* Done and happy. */ diff --git a/drivers/mtd/chips/jedec_probe.c b/drivers/mtd/chips/jedec_probe.c index d72a5fb2d04..4e1be51cc12 100644 --- a/drivers/mtd/chips/jedec_probe.c +++ b/drivers/mtd/chips/jedec_probe.c @@ -1935,14 +1935,14 @@ static void jedec_reset(u32 base, struct map_info *map, struct cfi_private *cfi) } -static int cfi_jedec_setup(struct cfi_private *p_cfi, int index) +static int cfi_jedec_setup(struct map_info *map, struct cfi_private *cfi, int index) { int i,num_erase_regions; uint8_t uaddr; - if (! (jedec_table[index].devtypes & p_cfi->device_type)) { + if (!(jedec_table[index].devtypes & cfi->device_type)) { DEBUG(MTD_DEBUG_LEVEL1, "Rejecting potential %s with incompatible %d-bit device type\n", - jedec_table[index].name, 4 * (1<<p_cfi->device_type)); + jedec_table[index].name, 4 * (1<<cfi->device_type)); return 0; } @@ -1950,27 +1950,28 @@ static int cfi_jedec_setup(struct cfi_private *p_cfi, int index) num_erase_regions = jedec_table[index].nr_regions; - p_cfi->cfiq = kmalloc(sizeof(struct cfi_ident) + num_erase_regions * 4, GFP_KERNEL); - if (!p_cfi->cfiq) { + cfi->cfiq = kmalloc(sizeof(struct cfi_ident) + num_erase_regions * 4, GFP_KERNEL); + if (!cfi->cfiq) { //xx printk(KERN_WARNING "%s: kmalloc failed for CFI ident structure\n", map->name); return 0; } - memset(p_cfi->cfiq,0,sizeof(struct cfi_ident)); + memset(cfi->cfiq, 0, sizeof(struct cfi_ident)); - p_cfi->cfiq->P_ID = jedec_table[index].cmd_set; - p_cfi->cfiq->NumEraseRegions = jedec_table[index].nr_regions; - p_cfi->cfiq->DevSize = jedec_table[index].dev_size; - p_cfi->cfi_mode = CFI_MODE_JEDEC; + cfi->cfiq->P_ID = jedec_table[index].cmd_set; + cfi->cfiq->NumEraseRegions = jedec_table[index].nr_regions; + cfi->cfiq->DevSize = jedec_table[index].dev_size; + cfi->cfi_mode = CFI_MODE_JEDEC; + cfi->sector_erase_cmd = CMD(0x30); for (i=0; i<num_erase_regions; i++){ - p_cfi->cfiq->EraseRegionInfo[i] = jedec_table[index].regions[i]; + cfi->cfiq->EraseRegionInfo[i] = jedec_table[index].regions[i]; } - p_cfi->cmdset_priv = NULL; + cfi->cmdset_priv = NULL; /* This may be redundant for some cases, but it doesn't hurt */ - p_cfi->mfr = jedec_table[index].mfr_id; - p_cfi->id = jedec_table[index].dev_id; + cfi->mfr = jedec_table[index].mfr_id; + cfi->id = jedec_table[index].dev_id; uaddr = jedec_table[index].uaddr; @@ -1978,8 +1979,8 @@ static int cfi_jedec_setup(struct cfi_private *p_cfi, int index) our brains explode when we see the datasheets talking about address lines numbered from A-1 to A18. The CFI table has unlock addresses in device-words according to the mode the device is connected in */ - p_cfi->addr_unlock1 = unlock_addrs[uaddr].addr1 / p_cfi->device_type; - p_cfi->addr_unlock2 = unlock_addrs[uaddr].addr2 / p_cfi->device_type; + cfi->addr_unlock1 = unlock_addrs[uaddr].addr1 / cfi->device_type; + cfi->addr_unlock2 = unlock_addrs[uaddr].addr2 / cfi->device_type; return 1; /* ok */ } @@ -2175,7 +2176,7 @@ static int jedec_probe_chip(struct map_info *map, __u32 base, "MTD %s(): matched device 0x%x,0x%x unlock_addrs: 0x%.4x 0x%.4x\n", __func__, cfi->mfr, cfi->id, cfi->addr_unlock1, cfi->addr_unlock2 ); - if (!cfi_jedec_setup(cfi, i)) + if (!cfi_jedec_setup(map, cfi, i)) return 0; goto ok_out; } diff --git a/drivers/mtd/maps/amd76xrom.c b/drivers/mtd/maps/amd76xrom.c index 77d64ce19e9..92de7e3a49a 100644 --- a/drivers/mtd/maps/amd76xrom.c +++ b/drivers/mtd/maps/amd76xrom.c @@ -151,6 +151,7 @@ static int __devinit amd76xrom_init_one (struct pci_dev *pdev, printk(KERN_ERR MOD_NAME " %s(): Unable to register resource %pR - kernel bug?\n", __func__, &window->rsrc); + return -EBUSY; } diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index cb20c67995d..e0a2373bf0e 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -413,7 +413,6 @@ error3: error2: list_del(&new->list); error1: - kfree(new); return ret; } diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c index 15682ec8530..28af71c6183 100644 --- a/drivers/mtd/nand/omap2.c +++ b/drivers/mtd/nand/omap2.c @@ -968,6 +968,6 @@ static void __exit omap_nand_exit(void) module_init(omap_nand_init); module_exit(omap_nand_exit); -MODULE_ALIAS(DRIVER_NAME); +MODULE_ALIAS("platform:" DRIVER_NAME); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Glue layer for NAND flash on TI OMAP boards"); diff --git a/drivers/mtd/onenand/generic.c b/drivers/mtd/onenand/generic.c index e78914938c5..ac08750748a 100644 --- a/drivers/mtd/onenand/generic.c +++ b/drivers/mtd/onenand/generic.c @@ -131,7 +131,7 @@ static struct platform_driver generic_onenand_driver = { .remove = __devexit_p(generic_onenand_remove), }; -MODULE_ALIAS(DRIVER_NAME); +MODULE_ALIAS("platform:" DRIVER_NAME); static int __init generic_onenand_init(void) { diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c index ac31f461cc1..c849cacf4b2 100644 --- a/drivers/mtd/onenand/omap2.c +++ b/drivers/mtd/onenand/omap2.c @@ -860,7 +860,7 @@ static void __exit omap2_onenand_exit(void) module_init(omap2_onenand_init); module_exit(omap2_onenand_exit); -MODULE_ALIAS(DRIVER_NAME); +MODULE_ALIAS("platform:" DRIVER_NAME); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jarkko Lavinen <jarkko.lavinen@nokia.com>"); MODULE_DESCRIPTION("Glue layer for OneNAND flash on OMAP2 / OMAP3"); diff --git a/drivers/net/ariadne.c b/drivers/net/ariadne.c index 39214e51245..7ca0eded256 100644 --- a/drivers/net/ariadne.c +++ b/drivers/net/ariadne.c @@ -425,11 +425,6 @@ static irqreturn_t ariadne_interrupt(int irq, void *data) int csr0, boguscnt; int handled = 0; - if (dev == NULL) { - printk(KERN_WARNING "ariadne_interrupt(): irq for unknown device.\n"); - return IRQ_NONE; - } - lance->RAP = CSR0; /* PCnet-ISA Controller Status */ if (!(lance->RDP & INTR)) /* Check if any interrupt has been */ diff --git a/drivers/net/bnx2x/bnx2x.h b/drivers/net/bnx2x/bnx2x.h index 7897d114b29..8849699c66c 100644 --- a/drivers/net/bnx2x/bnx2x.h +++ b/drivers/net/bnx2x/bnx2x.h @@ -1211,6 +1211,7 @@ struct bnx2x { /* DCBX Negotation results */ struct dcbx_features dcbx_local_feat; u32 dcbx_error; + u32 pending_max; }; /** @@ -1616,8 +1617,8 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg, u32 expected, int ms, /* CMNG constants, as derived from system spec calculations */ /* default MIN rate in case VNIC min rate is configured to zero - 100Mbps */ #define DEF_MIN_RATE 100 -/* resolution of the rate shaping timer - 100 usec */ -#define RS_PERIODIC_TIMEOUT_USEC 100 +/* resolution of the rate shaping timer - 400 usec */ +#define RS_PERIODIC_TIMEOUT_USEC 400 /* number of bytes in single QM arbitration cycle - * coefficient for calculating the fairness timer */ #define QM_ARB_BYTES 160000 diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c index 93798129061..a71b3294053 100644 --- a/drivers/net/bnx2x/bnx2x_cmn.c +++ b/drivers/net/bnx2x/bnx2x_cmn.c @@ -996,6 +996,23 @@ void bnx2x_free_skbs(struct bnx2x *bp) bnx2x_free_rx_skbs(bp); } +void bnx2x_update_max_mf_config(struct bnx2x *bp, u32 value) +{ + /* load old values */ + u32 mf_cfg = bp->mf_config[BP_VN(bp)]; + + if (value != bnx2x_extract_max_cfg(bp, mf_cfg)) { + /* leave all but MAX value */ + mf_cfg &= ~FUNC_MF_CFG_MAX_BW_MASK; + + /* set new MAX value */ + mf_cfg |= (value << FUNC_MF_CFG_MAX_BW_SHIFT) + & FUNC_MF_CFG_MAX_BW_MASK; + + bnx2x_fw_command(bp, DRV_MSG_CODE_SET_MF_BW, mf_cfg); + } +} + static void bnx2x_free_msix_irqs(struct bnx2x *bp) { int i, offset = 1; @@ -1464,6 +1481,11 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode) bnx2x_set_eth_mac(bp, 1); + if (bp->pending_max) { + bnx2x_update_max_mf_config(bp, bp->pending_max); + bp->pending_max = 0; + } + if (bp->port.pmf) bnx2x_initial_phy_init(bp, load_mode); diff --git a/drivers/net/bnx2x/bnx2x_cmn.h b/drivers/net/bnx2x/bnx2x_cmn.h index 326ba44b3de..85ea7f26b51 100644 --- a/drivers/net/bnx2x/bnx2x_cmn.h +++ b/drivers/net/bnx2x/bnx2x_cmn.h @@ -341,6 +341,15 @@ void bnx2x_dcbx_init(struct bnx2x *bp); */ int bnx2x_set_power_state(struct bnx2x *bp, pci_power_t state); +/** + * Updates MAX part of MF configuration in HW + * (if required) + * + * @param bp + * @param value + */ +void bnx2x_update_max_mf_config(struct bnx2x *bp, u32 value); + /* dev_close main block */ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode); diff --git a/drivers/net/bnx2x/bnx2x_ethtool.c b/drivers/net/bnx2x/bnx2x_ethtool.c index ef2919987a1..7e92f9d0dcf 100644 --- a/drivers/net/bnx2x/bnx2x_ethtool.c +++ b/drivers/net/bnx2x/bnx2x_ethtool.c @@ -238,7 +238,7 @@ static int bnx2x_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) speed |= (cmd->speed_hi << 16); if (IS_MF_SI(bp)) { - u32 param = 0, part; + u32 part; u32 line_speed = bp->link_vars.line_speed; /* use 10G if no link detected */ @@ -251,24 +251,22 @@ static int bnx2x_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) REQ_BC_VER_4_SET_MF_BW); return -EINVAL; } + part = (speed * 100) / line_speed; + if (line_speed < speed || !part) { BNX2X_DEV_INFO("Speed setting should be in a range " "from 1%% to 100%% " "of actual line speed\n"); return -EINVAL; } - /* load old values */ - param = bp->mf_config[BP_VN(bp)]; - /* leave only MIN value */ - param &= FUNC_MF_CFG_MIN_BW_MASK; - - /* set new MAX value */ - param |= (part << FUNC_MF_CFG_MAX_BW_SHIFT) - & FUNC_MF_CFG_MAX_BW_MASK; + if (bp->state != BNX2X_STATE_OPEN) + /* store value for following "load" */ + bp->pending_max = part; + else + bnx2x_update_max_mf_config(bp, part); - bnx2x_fw_command(bp, DRV_MSG_CODE_SET_MF_BW, param); return 0; } diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c index 032ae184b60..aa032339e32 100644 --- a/drivers/net/bnx2x/bnx2x_main.c +++ b/drivers/net/bnx2x/bnx2x_main.c @@ -2092,8 +2092,9 @@ static void bnx2x_cmng_fns_init(struct bnx2x *bp, u8 read_cfg, u8 cmng_type) bnx2x_calc_vn_weight_sum(bp); /* calculate and set min-max rate for each vn */ - for (vn = VN_0; vn < E1HVN_MAX; vn++) - bnx2x_init_vn_minmax(bp, vn); + if (bp->port.pmf) + for (vn = VN_0; vn < E1HVN_MAX; vn++) + bnx2x_init_vn_minmax(bp, vn); /* always enable rate shaping and fairness */ bp->cmng.flags.cmng_enables |= @@ -2162,13 +2163,6 @@ static void bnx2x_link_attn(struct bnx2x *bp) bnx2x_stats_handle(bp, STATS_EVENT_LINK_UP); } - /* indicate link status only if link status actually changed */ - if (prev_link_status != bp->link_vars.link_status) - bnx2x_link_report(bp); - - if (IS_MF(bp)) - bnx2x_link_sync_notify(bp); - if (bp->link_vars.link_up && bp->link_vars.line_speed) { int cmng_fns = bnx2x_get_cmng_fns_mode(bp); @@ -2180,6 +2174,13 @@ static void bnx2x_link_attn(struct bnx2x *bp) DP(NETIF_MSG_IFUP, "single function mode without fairness\n"); } + + if (IS_MF(bp)) + bnx2x_link_sync_notify(bp); + + /* indicate link status only if link status actually changed */ + if (prev_link_status != bp->link_vars.link_status) + bnx2x_link_report(bp); } void bnx2x__link_status_update(struct bnx2x *bp) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 1024ae15822..a5d5d0b5b15 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -281,23 +281,23 @@ static inline int __check_agg_selection_timer(struct port *port) } /** - * __get_rx_machine_lock - lock the port's RX machine + * __get_state_machine_lock - lock the port's state machines * @port: the port we're looking at * */ -static inline void __get_rx_machine_lock(struct port *port) +static inline void __get_state_machine_lock(struct port *port) { - spin_lock_bh(&(SLAVE_AD_INFO(port->slave).rx_machine_lock)); + spin_lock_bh(&(SLAVE_AD_INFO(port->slave).state_machine_lock)); } /** - * __release_rx_machine_lock - unlock the port's RX machine + * __release_state_machine_lock - unlock the port's state machines * @port: the port we're looking at * */ -static inline void __release_rx_machine_lock(struct port *port) +static inline void __release_state_machine_lock(struct port *port) { - spin_unlock_bh(&(SLAVE_AD_INFO(port->slave).rx_machine_lock)); + spin_unlock_bh(&(SLAVE_AD_INFO(port->slave).state_machine_lock)); } /** @@ -388,14 +388,14 @@ static u8 __get_duplex(struct port *port) } /** - * __initialize_port_locks - initialize a port's RX machine spinlock + * __initialize_port_locks - initialize a port's STATE machine spinlock * @port: the port we're looking at * */ static inline void __initialize_port_locks(struct port *port) { // make sure it isn't called twice - spin_lock_init(&(SLAVE_AD_INFO(port->slave).rx_machine_lock)); + spin_lock_init(&(SLAVE_AD_INFO(port->slave).state_machine_lock)); } //conversions @@ -1025,9 +1025,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) { rx_states_t last_state; - // Lock to prevent 2 instances of this function to run simultaneously(rx interrupt and periodic machine callback) - __get_rx_machine_lock(port); - // keep current State Machine state to compare later if it was changed last_state = port->sm_rx_state; @@ -1133,7 +1130,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) pr_err("%s: An illegal loopback occurred on adapter (%s).\n" "Check the configuration to verify that all adapters are connected to 802.3ad compliant switch ports\n", port->slave->dev->master->name, port->slave->dev->name); - __release_rx_machine_lock(port); return; } __update_selected(lacpdu, port); @@ -1153,7 +1149,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) break; } } - __release_rx_machine_lock(port); } /** @@ -2155,6 +2150,12 @@ void bond_3ad_state_machine_handler(struct work_struct *work) goto re_arm; } + /* Lock around state machines to protect data accessed + * by all (e.g., port->sm_vars). ad_rx_machine may run + * concurrently due to incoming LACPDU. + */ + __get_state_machine_lock(port); + ad_rx_machine(NULL, port); ad_periodic_machine(port); ad_port_selection_logic(port); @@ -2164,6 +2165,8 @@ void bond_3ad_state_machine_handler(struct work_struct *work) // turn off the BEGIN bit, since we already handled it if (port->sm_vars & AD_PORT_BEGIN) port->sm_vars &= ~AD_PORT_BEGIN; + + __release_state_machine_lock(port); } re_arm: @@ -2200,7 +2203,10 @@ static void bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave, u case AD_TYPE_LACPDU: pr_debug("Received LACPDU on port %d\n", port->actor_port_number); + /* Protect against concurrent state machines */ + __get_state_machine_lock(port); ad_rx_machine(lacpdu, port); + __release_state_machine_lock(port); break; case AD_TYPE_MARKER: diff --git a/drivers/net/bonding/bond_3ad.h b/drivers/net/bonding/bond_3ad.h index 2c46a154f2c..b28baff7086 100644 --- a/drivers/net/bonding/bond_3ad.h +++ b/drivers/net/bonding/bond_3ad.h @@ -264,7 +264,8 @@ struct ad_bond_info { struct ad_slave_info { struct aggregator aggregator; // 802.3ad aggregator structure struct port port; // 802.3ad port structure - spinlock_t rx_machine_lock; // To avoid race condition between callback and receive interrupt + spinlock_t state_machine_lock; /* mutex state machines vs. + incoming LACPDU */ u16 id; }; diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c index b79d7e1555d..db0290f05bd 100644 --- a/drivers/net/ethoc.c +++ b/drivers/net/ethoc.c @@ -1163,15 +1163,11 @@ static int ethoc_resume(struct platform_device *pdev) # define ethoc_resume NULL #endif -#ifdef CONFIG_OF static struct of_device_id ethoc_match[] = { - { - .compatible = "opencores,ethoc", - }, + { .compatible = "opencores,ethoc", }, {}, }; MODULE_DEVICE_TABLE(of, ethoc_match); -#endif static struct platform_driver ethoc_driver = { .probe = ethoc_probe, @@ -1181,9 +1177,7 @@ static struct platform_driver ethoc_driver = { .driver = { .name = "ethoc", .owner = THIS_MODULE, -#ifdef CONFIG_OF .of_match_table = ethoc_match, -#endif }, }; diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 5933621ac3f..fc27a9926d9 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -528,8 +528,9 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, vnet_hdr_len = q->vnet_hdr_sz; err = -EINVAL; - if ((len -= vnet_hdr_len) < 0) + if (len < vnet_hdr_len) goto err; + len -= vnet_hdr_len; err = memcpy_fromiovecend((void *)&vnet_hdr, iv, 0, sizeof(vnet_hdr)); diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c index 27e6f6d43ca..e3ebd90ae65 100644 --- a/drivers/net/r6040.c +++ b/drivers/net/r6040.c @@ -49,8 +49,8 @@ #include <asm/processor.h> #define DRV_NAME "r6040" -#define DRV_VERSION "0.26" -#define DRV_RELDATE "30May2010" +#define DRV_VERSION "0.27" +#define DRV_RELDATE "23Feb2011" /* PHY CHIP Address */ #define PHY1_ADDR 1 /* For MAC1 */ @@ -69,6 +69,8 @@ /* MAC registers */ #define MCR0 0x00 /* Control register 0 */ +#define MCR0_PROMISC 0x0020 /* Promiscuous mode */ +#define MCR0_HASH_EN 0x0100 /* Enable multicast hash table function */ #define MCR1 0x04 /* Control register 1 */ #define MAC_RST 0x0001 /* Reset the MAC */ #define MBCR 0x08 /* Bus control */ @@ -851,77 +853,92 @@ static void r6040_multicast_list(struct net_device *dev) { struct r6040_private *lp = netdev_priv(dev); void __iomem *ioaddr = lp->base; - u16 *adrp; - u16 reg; unsigned long flags; struct netdev_hw_addr *ha; int i; + u16 *adrp; + u16 hash_table[4] = { 0 }; + + spin_lock_irqsave(&lp->lock, flags); - /* MAC Address */ + /* Keep our MAC Address */ adrp = (u16 *)dev->dev_addr; iowrite16(adrp[0], ioaddr + MID_0L); iowrite16(adrp[1], ioaddr + MID_0M); iowrite16(adrp[2], ioaddr + MID_0H); - /* Promiscous Mode */ - spin_lock_irqsave(&lp->lock, flags); - /* Clear AMCP & PROM bits */ - reg = ioread16(ioaddr) & ~0x0120; - if (dev->flags & IFF_PROMISC) { - reg |= 0x0020; - lp->mcr0 |= 0x0020; - } - /* Too many multicast addresses - * accept all traffic */ - else if ((netdev_mc_count(dev) > MCAST_MAX) || - (dev->flags & IFF_ALLMULTI)) - reg |= 0x0020; + lp->mcr0 = ioread16(ioaddr + MCR0) & ~(MCR0_PROMISC | MCR0_HASH_EN); - iowrite16(reg, ioaddr); - spin_unlock_irqrestore(&lp->lock, flags); + /* Promiscuous mode */ + if (dev->flags & IFF_PROMISC) + lp->mcr0 |= MCR0_PROMISC; - /* Build the hash table */ - if (netdev_mc_count(dev) > MCAST_MAX) { - u16 hash_table[4]; - u32 crc; + /* Enable multicast hash table function to + * receive all multicast packets. */ + else if (dev->flags & IFF_ALLMULTI) { + lp->mcr0 |= MCR0_HASH_EN; - for (i = 0; i < 4; i++) - hash_table[i] = 0; + for (i = 0; i < MCAST_MAX ; i++) { + iowrite16(0, ioaddr + MID_1L + 8 * i); + iowrite16(0, ioaddr + MID_1M + 8 * i); + iowrite16(0, ioaddr + MID_1H + 8 * i); + } + for (i = 0; i < 4; i++) + hash_table[i] = 0xffff; + } + /* Use internal multicast address registers if the number of + * multicast addresses is not greater than MCAST_MAX. */ + else if (netdev_mc_count(dev) <= MCAST_MAX) { + i = 0; netdev_for_each_mc_addr(ha, dev) { - char *addrs = ha->addr; + u16 *adrp = (u16 *) ha->addr; + iowrite16(adrp[0], ioaddr + MID_1L + 8 * i); + iowrite16(adrp[1], ioaddr + MID_1M + 8 * i); + iowrite16(adrp[2], ioaddr + MID_1H + 8 * i); + i++; + } + while (i < MCAST_MAX) { + iowrite16(0, ioaddr + MID_1L + 8 * i); + iowrite16(0, ioaddr + MID_1M + 8 * i); + iowrite16(0, ioaddr + MID_1H + 8 * i); + i++; + } + } + /* Otherwise, Enable multicast hash table function. */ + else { + u32 crc; - if (!(*addrs & 1)) - continue; + lp->mcr0 |= MCR0_HASH_EN; + + for (i = 0; i < MCAST_MAX ; i++) { + iowrite16(0, ioaddr + MID_1L + 8 * i); + iowrite16(0, ioaddr + MID_1M + 8 * i); + iowrite16(0, ioaddr + MID_1H + 8 * i); + } - crc = ether_crc_le(6, addrs); + /* Build multicast hash table */ + netdev_for_each_mc_addr(ha, dev) { + u8 *addrs = ha->addr; + + crc = ether_crc(ETH_ALEN, addrs); crc >>= 26; - hash_table[crc >> 4] |= 1 << (15 - (crc & 0xf)); + hash_table[crc >> 4] |= 1 << (crc & 0xf); } - /* Fill the MAC hash tables with their values */ + } + + iowrite16(lp->mcr0, ioaddr + MCR0); + + /* Fill the MAC hash tables with their values */ + if (lp->mcr0 && MCR0_HASH_EN) { iowrite16(hash_table[0], ioaddr + MAR0); iowrite16(hash_table[1], ioaddr + MAR1); iowrite16(hash_table[2], ioaddr + MAR2); iowrite16(hash_table[3], ioaddr + MAR3); } - /* Multicast Address 1~4 case */ - i = 0; - netdev_for_each_mc_addr(ha, dev) { - if (i >= MCAST_MAX) - break; - adrp = (u16 *) ha->addr; - iowrite16(adrp[0], ioaddr + MID_1L + 8 * i); - iowrite16(adrp[1], ioaddr + MID_1M + 8 * i); - iowrite16(adrp[2], ioaddr + MID_1H + 8 * i); - i++; - } - while (i < MCAST_MAX) { - iowrite16(0xffff, ioaddr + MID_1L + 8 * i); - iowrite16(0xffff, ioaddr + MID_1M + 8 * i); - iowrite16(0xffff, ioaddr + MID_1H + 8 * i); - i++; - } + + spin_unlock_irqrestore(&lp->lock, flags); } static void netdev_get_drvinfo(struct net_device *dev, diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c index 64bfdae5956..d70bde95460 100644 --- a/drivers/net/smsc911x.c +++ b/drivers/net/smsc911x.c @@ -1178,6 +1178,11 @@ static int smsc911x_open(struct net_device *dev) smsc911x_reg_write(pdata, HW_CFG, 0x00050000); smsc911x_reg_write(pdata, AFC_CFG, 0x006E3740); + /* Increase the legal frame size of VLAN tagged frames to 1522 bytes */ + spin_lock_irq(&pdata->mac_lock); + smsc911x_mac_write(pdata, VLAN1, ETH_P_8021Q); + spin_unlock_irq(&pdata->mac_lock); + /* Make sure EEPROM has finished loading before setting GPIO_CFG */ timeout = 50; while ((smsc911x_reg_read(pdata, E2P_CMD) & E2P_CMD_EPC_BUSY_) && diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index 3c6e100a3ad..d06a6374ed6 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -69,4 +69,10 @@ config OF_MDIO help OpenFirmware MDIO bus (Ethernet PHY) accessors +config OF_PCI + def_tristate PCI + depends on PCI && (PPC || MICROBLAZE || X86) + help + OpenFirmware PCI bus accessors + endmenu # OF diff --git a/drivers/of/Makefile b/drivers/of/Makefile index 3ab21a0a490..f7861ed2f28 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_OF_I2C) += of_i2c.o obj-$(CONFIG_OF_NET) += of_net.o obj-$(CONFIG_OF_SPI) += of_spi.o obj-$(CONFIG_OF_MDIO) += of_mdio.o +obj-$(CONFIG_OF_PCI) += of_pci.o diff --git a/drivers/of/of_pci.c b/drivers/of/of_pci.c new file mode 100644 index 00000000000..ac1ec54e4fd --- /dev/null +++ b/drivers/of/of_pci.c @@ -0,0 +1,92 @@ +#include <linux/kernel.h> +#include <linux/of_pci.h> +#include <linux/of_irq.h> +#include <asm/prom.h> + +/** + * of_irq_map_pci - Resolve the interrupt for a PCI device + * @pdev: the device whose interrupt is to be resolved + * @out_irq: structure of_irq filled by this function + * + * This function resolves the PCI interrupt for a given PCI device. If a + * device-node exists for a given pci_dev, it will use normal OF tree + * walking. If not, it will implement standard swizzling and walk up the + * PCI tree until an device-node is found, at which point it will finish + * resolving using the OF tree walking. + */ +int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq) +{ + struct device_node *dn, *ppnode; + struct pci_dev *ppdev; + u32 lspec; + __be32 lspec_be; + __be32 laddr[3]; + u8 pin; + int rc; + + /* Check if we have a device node, if yes, fallback to standard + * device tree parsing + */ + dn = pci_device_to_OF_node(pdev); + if (dn) { + rc = of_irq_map_one(dn, 0, out_irq); + if (!rc) + return rc; + } + + /* Ok, we don't, time to have fun. Let's start by building up an + * interrupt spec. we assume #interrupt-cells is 1, which is standard + * for PCI. If you do different, then don't use that routine. + */ + rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin); + if (rc != 0) + return rc; + /* No pin, exit */ + if (pin == 0) + return -ENODEV; + + /* Now we walk up the PCI tree */ + lspec = pin; + for (;;) { + /* Get the pci_dev of our parent */ + ppdev = pdev->bus->self; + + /* Ouch, it's a host bridge... */ + if (ppdev == NULL) { + ppnode = pci_bus_to_OF_node(pdev->bus); + + /* No node for host bridge ? give up */ + if (ppnode == NULL) + return -EINVAL; + } else { + /* We found a P2P bridge, check if it has a node */ + ppnode = pci_device_to_OF_node(ppdev); + } + + /* Ok, we have found a parent with a device-node, hand over to + * the OF parsing code. + * We build a unit address from the linux device to be used for + * resolution. Note that we use the linux bus number which may + * not match your firmware bus numbering. + * Fortunately, in most cases, interrupt-map-mask doesn't + * include the bus number as part of the matching. + * You should still be careful about that though if you intend + * to rely on this function (you ship a firmware that doesn't + * create device nodes for all PCI devices). + */ + if (ppnode) + break; + + /* We can only get here if we hit a P2P bridge with no node, + * let's do standard swizzling and try again + */ + lspec = pci_swizzle_interrupt_pin(pdev, lspec); + pdev = ppdev; + } + + lspec_be = cpu_to_be32(lspec); + laddr[0] = cpu_to_be32((pdev->bus->number << 16) | (pdev->devfn << 8)); + laddr[1] = laddr[2] = cpu_to_be32(0); + return of_irq_map_raw(ppnode, &lspec_be, 1, laddr, out_irq); +} +EXPORT_SYMBOL_GPL(of_irq_map_pci); diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 3a5a6fcc0ea..492b7d807fe 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -243,7 +243,7 @@ struct pci_ops pcifront_bus_ops = { #ifdef CONFIG_PCI_MSI static int pci_frontend_enable_msix(struct pci_dev *dev, - int **vector, int nvec) + int vector[], int nvec) { int err; int i; @@ -277,18 +277,24 @@ static int pci_frontend_enable_msix(struct pci_dev *dev, if (likely(!err)) { if (likely(!op.value)) { /* we get the result */ - for (i = 0; i < nvec; i++) - *(*vector+i) = op.msix_entries[i].vector; - return 0; + for (i = 0; i < nvec; i++) { + if (op.msix_entries[i].vector <= 0) { + dev_warn(&dev->dev, "MSI-X entry %d is invalid: %d!\n", + i, op.msix_entries[i].vector); + err = -EINVAL; + vector[i] = -1; + continue; + } + vector[i] = op.msix_entries[i].vector; + } } else { printk(KERN_DEBUG "enable msix get value %x\n", op.value); - return op.value; } } else { dev_err(&dev->dev, "enable msix get err %x\n", err); - return err; } + return err; } static void pci_frontend_disable_msix(struct pci_dev *dev) @@ -310,7 +316,7 @@ static void pci_frontend_disable_msix(struct pci_dev *dev) dev_err(&dev->dev, "pci_disable_msix get err %x\n", err); } -static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector) +static int pci_frontend_enable_msi(struct pci_dev *dev, int vector[]) { int err; struct xen_pci_op op = { @@ -324,7 +330,13 @@ static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector) err = do_pci_op(pdev, &op); if (likely(!err)) { - *(*vector) = op.value; + vector[0] = op.value; + if (op.value <= 0) { + dev_warn(&dev->dev, "MSI entry is invalid: %d!\n", + op.value); + err = -EINVAL; + vector[0] = -1; + } } else { dev_err(&dev->dev, "pci frontend enable msi failed for dev " "%x:%x\n", op.bus, op.devfn); @@ -733,8 +745,7 @@ static void free_pdev(struct pcifront_device *pdev) pcifront_free_roots(pdev); - /*For PCIE_AER error handling job*/ - flush_scheduled_work(); + cancel_work_sync(&pdev->op_work); if (pdev->irq >= 0) unbind_from_irqhandler(pdev->irq, pdev); diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index c404b61386b..09b4437b3e6 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -117,6 +117,7 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, struct module *owner) { struct rtc_device *rtc; + struct rtc_wkalrm alrm; int id, err; if (idr_pre_get(&rtc_idr, GFP_KERNEL) == 0) { @@ -166,6 +167,12 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, rtc->pie_timer.function = rtc_pie_update_irq; rtc->pie_enabled = 0; + /* Check to see if there is an ALARM already set in hw */ + err = __rtc_read_alarm(rtc, &alrm); + + if (!err && !rtc_valid_tm(&alrm.time)) + rtc_set_alarm(rtc, &alrm); + strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE); dev_set_name(&rtc->dev, "rtc%d", id); diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index cb2f0728fd7..8ec6b069a7f 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -116,6 +116,186 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs) } EXPORT_SYMBOL_GPL(rtc_set_mmss); +static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *alarm) +{ + int err; + + err = mutex_lock_interruptible(&rtc->ops_lock); + if (err) + return err; + + if (rtc->ops == NULL) + err = -ENODEV; + else if (!rtc->ops->read_alarm) + err = -EINVAL; + else { + memset(alarm, 0, sizeof(struct rtc_wkalrm)); + err = rtc->ops->read_alarm(rtc->dev.parent, alarm); + } + + mutex_unlock(&rtc->ops_lock); + return err; +} + +int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) +{ + int err; + struct rtc_time before, now; + int first_time = 1; + unsigned long t_now, t_alm; + enum { none, day, month, year } missing = none; + unsigned days; + + /* The lower level RTC driver may return -1 in some fields, + * creating invalid alarm->time values, for reasons like: + * + * - The hardware may not be capable of filling them in; + * many alarms match only on time-of-day fields, not + * day/month/year calendar data. + * + * - Some hardware uses illegal values as "wildcard" match + * values, which non-Linux firmware (like a BIOS) may try + * to set up as e.g. "alarm 15 minutes after each hour". + * Linux uses only oneshot alarms. + * + * When we see that here, we deal with it by using values from + * a current RTC timestamp for any missing (-1) values. The + * RTC driver prevents "periodic alarm" modes. + * + * But this can be racey, because some fields of the RTC timestamp + * may have wrapped in the interval since we read the RTC alarm, + * which would lead to us inserting inconsistent values in place + * of the -1 fields. + * + * Reading the alarm and timestamp in the reverse sequence + * would have the same race condition, and not solve the issue. + * + * So, we must first read the RTC timestamp, + * then read the RTC alarm value, + * and then read a second RTC timestamp. + * + * If any fields of the second timestamp have changed + * when compared with the first timestamp, then we know + * our timestamp may be inconsistent with that used by + * the low-level rtc_read_alarm_internal() function. + * + * So, when the two timestamps disagree, we just loop and do + * the process again to get a fully consistent set of values. + * + * This could all instead be done in the lower level driver, + * but since more than one lower level RTC implementation needs it, + * then it's probably best best to do it here instead of there.. + */ + + /* Get the "before" timestamp */ + err = rtc_read_time(rtc, &before); + if (err < 0) + return err; + do { + if (!first_time) + memcpy(&before, &now, sizeof(struct rtc_time)); + first_time = 0; + + /* get the RTC alarm values, which may be incomplete */ + err = rtc_read_alarm_internal(rtc, alarm); + if (err) + return err; + + /* full-function RTCs won't have such missing fields */ + if (rtc_valid_tm(&alarm->time) == 0) + return 0; + + /* get the "after" timestamp, to detect wrapped fields */ + err = rtc_read_time(rtc, &now); + if (err < 0) + return err; + + /* note that tm_sec is a "don't care" value here: */ + } while ( before.tm_min != now.tm_min + || before.tm_hour != now.tm_hour + || before.tm_mon != now.tm_mon + || before.tm_year != now.tm_year); + + /* Fill in the missing alarm fields using the timestamp; we + * know there's at least one since alarm->time is invalid. + */ + if (alarm->time.tm_sec == -1) + alarm->time.tm_sec = now.tm_sec; + if (alarm->time.tm_min == -1) + alarm->time.tm_min = now.tm_min; + if (alarm->time.tm_hour == -1) + alarm->time.tm_hour = now.tm_hour; + + /* For simplicity, only support date rollover for now */ + if (alarm->time.tm_mday == -1) { + alarm->time.tm_mday = now.tm_mday; + missing = day; + } + if (alarm->time.tm_mon == -1) { + alarm->time.tm_mon = now.tm_mon; + if (missing == none) + missing = month; + } + if (alarm->time.tm_year == -1) { + alarm->time.tm_year = now.tm_year; + if (missing == none) + missing = year; + } + + /* with luck, no rollover is needed */ + rtc_tm_to_time(&now, &t_now); + rtc_tm_to_time(&alarm->time, &t_alm); + if (t_now < t_alm) + goto done; + + switch (missing) { + + /* 24 hour rollover ... if it's now 10am Monday, an alarm that + * that will trigger at 5am will do so at 5am Tuesday, which + * could also be in the next month or year. This is a common + * case, especially for PCs. + */ + case day: + dev_dbg(&rtc->dev, "alarm rollover: %s\n", "day"); + t_alm += 24 * 60 * 60; + rtc_time_to_tm(t_alm, &alarm->time); + break; + + /* Month rollover ... if it's the 31th, an alarm on the 3rd will + * be next month. An alarm matching on the 30th, 29th, or 28th + * may end up in the month after that! Many newer PCs support + * this type of alarm. + */ + case month: + dev_dbg(&rtc->dev, "alarm rollover: %s\n", "month"); + do { + if (alarm->time.tm_mon < 11) + alarm->time.tm_mon++; + else { + alarm->time.tm_mon = 0; + alarm->time.tm_year++; + } + days = rtc_month_days(alarm->time.tm_mon, + alarm->time.tm_year); + } while (days < alarm->time.tm_mday); + break; + + /* Year rollover ... easy except for leap years! */ + case year: + dev_dbg(&rtc->dev, "alarm rollover: %s\n", "year"); + do { + alarm->time.tm_year++; + } while (rtc_valid_tm(&alarm->time) != 0); + break; + + default: + dev_warn(&rtc->dev, "alarm rollover not handled\n"); + } + +done: + return 0; +} + int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 26d1cf5d19a..518a76ec71c 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -183,33 +183,6 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) return 0; } -/* - * Handle commands from user-space - */ -static int at91_rtc_ioctl(struct device *dev, unsigned int cmd, - unsigned long arg) -{ - int ret = 0; - - pr_debug("%s(): cmd=%08x, arg=%08lx.\n", __func__, cmd, arg); - - /* important: scrub old status before enabling IRQs */ - switch (cmd) { - case RTC_UIE_OFF: /* update off */ - at91_sys_write(AT91_RTC_IDR, AT91_RTC_SECEV); - break; - case RTC_UIE_ON: /* update on */ - at91_sys_write(AT91_RTC_SCCR, AT91_RTC_SECEV); - at91_sys_write(AT91_RTC_IER, AT91_RTC_SECEV); - break; - default: - ret = -ENOIOCTLCMD; - break; - } - - return ret; -} - static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { pr_debug("%s(): cmd=%08x\n", __func__, enabled); @@ -269,7 +242,6 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *dev_id) } static const struct rtc_class_ops at91_rtc_ops = { - .ioctl = at91_rtc_ioctl, .read_time = at91_rtc_readtime, .set_time = at91_rtc_settime, .read_alarm = at91_rtc_readalarm, diff --git a/drivers/rtc/rtc-at91sam9.c b/drivers/rtc/rtc-at91sam9.c index 5469c52cba3..a3ad957507d 100644 --- a/drivers/rtc/rtc-at91sam9.c +++ b/drivers/rtc/rtc-at91sam9.c @@ -216,33 +216,6 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) return 0; } -/* - * Handle commands from user-space - */ -static int at91_rtc_ioctl(struct device *dev, unsigned int cmd, - unsigned long arg) -{ - struct sam9_rtc *rtc = dev_get_drvdata(dev); - int ret = 0; - u32 mr = rtt_readl(rtc, MR); - - dev_dbg(dev, "ioctl: cmd=%08x, arg=%08lx, mr %08x\n", cmd, arg, mr); - - switch (cmd) { - case RTC_UIE_OFF: /* update off */ - rtt_writel(rtc, MR, mr & ~AT91_RTT_RTTINCIEN); - break; - case RTC_UIE_ON: /* update on */ - rtt_writel(rtc, MR, mr | AT91_RTT_RTTINCIEN); - break; - default: - ret = -ENOIOCTLCMD; - break; - } - - return ret; -} - static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct sam9_rtc *rtc = dev_get_drvdata(dev); @@ -303,7 +276,6 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *_rtc) } static const struct rtc_class_ops at91_rtc_ops = { - .ioctl = at91_rtc_ioctl, .read_time = at91_rtc_readtime, .set_time = at91_rtc_settime, .read_alarm = at91_rtc_readalarm, diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c index 17971d93354..ca9cff85ab8 100644 --- a/drivers/rtc/rtc-bfin.c +++ b/drivers/rtc/rtc-bfin.c @@ -240,32 +240,6 @@ static void bfin_rtc_int_set_alarm(struct bfin_rtc *rtc) */ bfin_rtc_int_set(rtc->rtc_alarm.tm_yday == -1 ? RTC_ISTAT_ALARM : RTC_ISTAT_ALARM_DAY); } -static int bfin_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) -{ - struct bfin_rtc *rtc = dev_get_drvdata(dev); - int ret = 0; - - dev_dbg_stamp(dev); - - bfin_rtc_sync_pending(dev); - - switch (cmd) { - case RTC_UIE_ON: - dev_dbg_stamp(dev); - bfin_rtc_int_set(RTC_ISTAT_SEC); - break; - case RTC_UIE_OFF: - dev_dbg_stamp(dev); - bfin_rtc_int_clear(~RTC_ISTAT_SEC); - break; - - default: - dev_dbg_stamp(dev); - ret = -ENOIOCTLCMD; - } - - return ret; -} static int bfin_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { @@ -358,7 +332,6 @@ static int bfin_rtc_proc(struct device *dev, struct seq_file *seq) } static struct rtc_class_ops bfin_rtc_ops = { - .ioctl = bfin_rtc_ioctl, .read_time = bfin_rtc_read_time, .set_time = bfin_rtc_set_time, .read_alarm = bfin_rtc_read_alarm, diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index c7ff8df347e..911e75cdc12 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -37,6 +37,8 @@ #include <linux/mod_devicetable.h> #include <linux/log2.h> #include <linux/pm.h> +#include <linux/of.h> +#include <linux/of_platform.h> /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */ #include <asm-generic/rtc.h> @@ -375,50 +377,6 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) return 0; } -static int cmos_irq_set_freq(struct device *dev, int freq) -{ - struct cmos_rtc *cmos = dev_get_drvdata(dev); - int f; - unsigned long flags; - - if (!is_valid_irq(cmos->irq)) - return -ENXIO; - - if (!is_power_of_2(freq)) - return -EINVAL; - /* 0 = no irqs; 1 = 2^15 Hz ... 15 = 2^0 Hz */ - f = ffs(freq); - if (f-- > 16) - return -EINVAL; - f = 16 - f; - - spin_lock_irqsave(&rtc_lock, flags); - hpet_set_periodic_freq(freq); - CMOS_WRITE(RTC_REF_CLCK_32KHZ | f, RTC_FREQ_SELECT); - spin_unlock_irqrestore(&rtc_lock, flags); - - return 0; -} - -static int cmos_irq_set_state(struct device *dev, int enabled) -{ - struct cmos_rtc *cmos = dev_get_drvdata(dev); - unsigned long flags; - - if (!is_valid_irq(cmos->irq)) - return -ENXIO; - - spin_lock_irqsave(&rtc_lock, flags); - - if (enabled) - cmos_irq_enable(cmos, RTC_PIE); - else - cmos_irq_disable(cmos, RTC_PIE); - - spin_unlock_irqrestore(&rtc_lock, flags); - return 0; -} - static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct cmos_rtc *cmos = dev_get_drvdata(dev); @@ -438,25 +396,6 @@ static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int cmos_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct cmos_rtc *cmos = dev_get_drvdata(dev); - unsigned long flags; - - if (!is_valid_irq(cmos->irq)) - return -EINVAL; - - spin_lock_irqsave(&rtc_lock, flags); - - if (enabled) - cmos_irq_enable(cmos, RTC_UIE); - else - cmos_irq_disable(cmos, RTC_UIE); - - spin_unlock_irqrestore(&rtc_lock, flags); - return 0; -} - #if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE) static int cmos_procfs(struct device *dev, struct seq_file *seq) @@ -501,10 +440,7 @@ static const struct rtc_class_ops cmos_rtc_ops = { .read_alarm = cmos_read_alarm, .set_alarm = cmos_set_alarm, .proc = cmos_procfs, - .irq_set_freq = cmos_irq_set_freq, - .irq_set_state = cmos_irq_set_state, .alarm_irq_enable = cmos_alarm_irq_enable, - .update_irq_enable = cmos_update_irq_enable, }; /*----------------------------------------------------------------*/ @@ -1123,6 +1059,47 @@ static struct pnp_driver cmos_pnp_driver = { #endif /* CONFIG_PNP */ +#ifdef CONFIG_OF +static const struct of_device_id of_cmos_match[] = { + { + .compatible = "motorola,mc146818", + }, + { }, +}; +MODULE_DEVICE_TABLE(of, of_cmos_match); + +static __init void cmos_of_init(struct platform_device *pdev) +{ + struct device_node *node = pdev->dev.of_node; + struct rtc_time time; + int ret; + const __be32 *val; + + if (!node) + return; + + val = of_get_property(node, "ctrl-reg", NULL); + if (val) + CMOS_WRITE(be32_to_cpup(val), RTC_CONTROL); + + val = of_get_property(node, "freq-reg", NULL); + if (val) + CMOS_WRITE(be32_to_cpup(val), RTC_FREQ_SELECT); + + get_rtc_time(&time); + ret = rtc_valid_tm(&time); + if (ret) { + struct rtc_time def_time = { + .tm_year = 1, + .tm_mday = 1, + }; + set_rtc_time(&def_time); + } +} +#else +static inline void cmos_of_init(struct platform_device *pdev) {} +#define of_cmos_match NULL +#endif /*----------------------------------------------------------------*/ /* Platform setup should have set up an RTC device, when PNP is @@ -1131,6 +1108,7 @@ static struct pnp_driver cmos_pnp_driver = { static int __init cmos_platform_probe(struct platform_device *pdev) { + cmos_of_init(pdev); cmos_wake_setup(&pdev->dev); return cmos_do_probe(&pdev->dev, platform_get_resource(pdev, IORESOURCE_IO, 0), @@ -1162,6 +1140,7 @@ static struct platform_driver cmos_platform_driver = { #ifdef CONFIG_PM .pm = &cmos_pm_ops, #endif + .of_match_table = of_cmos_match, } }; diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index 34647fc1ee9..8d46838dff8 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -231,10 +231,6 @@ davinci_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) case RTC_WIE_OFF: rtc_ctrl &= ~PRTCSS_RTC_CTRL_WEN; break; - case RTC_UIE_OFF: - case RTC_UIE_ON: - ret = -ENOTTY; - break; default: ret = -ENOIOCTLCMD; } @@ -473,55 +469,6 @@ static int davinci_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) return 0; } -static int davinci_rtc_irq_set_state(struct device *dev, int enabled) -{ - struct davinci_rtc *davinci_rtc = dev_get_drvdata(dev); - unsigned long flags; - u8 rtc_ctrl; - - spin_lock_irqsave(&davinci_rtc_lock, flags); - - rtc_ctrl = rtcss_read(davinci_rtc, PRTCSS_RTC_CTRL); - - if (enabled) { - while (rtcss_read(davinci_rtc, PRTCSS_RTC_CTRL) - & PRTCSS_RTC_CTRL_WDTBUS) - cpu_relax(); - - rtc_ctrl |= PRTCSS_RTC_CTRL_TE; - rtcss_write(davinci_rtc, rtc_ctrl, PRTCSS_RTC_CTRL); - - rtcss_write(davinci_rtc, 0x0, PRTCSS_RTC_CLKC_CNT); - - rtc_ctrl |= PRTCSS_RTC_CTRL_TIEN | - PRTCSS_RTC_CTRL_TMMD | - PRTCSS_RTC_CTRL_TMRFLG; - } else - rtc_ctrl &= ~PRTCSS_RTC_CTRL_TIEN; - - rtcss_write(davinci_rtc, rtc_ctrl, PRTCSS_RTC_CTRL); - - spin_unlock_irqrestore(&davinci_rtc_lock, flags); - - return 0; -} - -static int davinci_rtc_irq_set_freq(struct device *dev, int freq) -{ - struct davinci_rtc *davinci_rtc = dev_get_drvdata(dev); - unsigned long flags; - u16 tmr_counter = (0x8000 >> (ffs(freq) - 1)); - - spin_lock_irqsave(&davinci_rtc_lock, flags); - - rtcss_write(davinci_rtc, tmr_counter & 0xFF, PRTCSS_RTC_TMR0); - rtcss_write(davinci_rtc, (tmr_counter & 0xFF00) >> 8, PRTCSS_RTC_TMR1); - - spin_unlock_irqrestore(&davinci_rtc_lock, flags); - - return 0; -} - static struct rtc_class_ops davinci_rtc_ops = { .ioctl = davinci_rtc_ioctl, .read_time = davinci_rtc_read_time, @@ -529,8 +476,6 @@ static struct rtc_class_ops davinci_rtc_ops = { .alarm_irq_enable = davinci_rtc_alarm_irq_enable, .read_alarm = davinci_rtc_read_alarm, .set_alarm = davinci_rtc_set_alarm, - .irq_set_state = davinci_rtc_irq_set_state, - .irq_set_freq = davinci_rtc_irq_set_freq, }; static int __init davinci_rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 37268e97de4..3fffd708711 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -397,29 +397,12 @@ static int ds1511_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int ds1511_rtc_update_irq_enable(struct device *dev, - unsigned int enabled) -{ - struct platform_device *pdev = to_platform_device(dev); - struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - - if (pdata->irq <= 0) - return -EINVAL; - if (enabled) - pdata->irqen |= RTC_UF; - else - pdata->irqen &= ~RTC_UF; - ds1511_rtc_update_alarm(pdata); - return 0; -} - static const struct rtc_class_ops ds1511_rtc_ops = { .read_time = ds1511_rtc_read_time, .set_time = ds1511_rtc_set_time, .read_alarm = ds1511_rtc_read_alarm, .set_alarm = ds1511_rtc_set_alarm, .alarm_irq_enable = ds1511_rtc_alarm_irq_enable, - .update_irq_enable = ds1511_rtc_update_irq_enable, }; static ssize_t diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index ff432e2ca27..fee41b97c9e 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -227,29 +227,12 @@ static int ds1553_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int ds1553_rtc_update_irq_enable(struct device *dev, - unsigned int enabled) -{ - struct platform_device *pdev = to_platform_device(dev); - struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - - if (pdata->irq <= 0) - return -EINVAL; - if (enabled) - pdata->irqen |= RTC_UF; - else - pdata->irqen &= ~RTC_UF; - ds1553_rtc_update_alarm(pdata); - return 0; -} - static const struct rtc_class_ops ds1553_rtc_ops = { .read_time = ds1553_rtc_read_time, .set_time = ds1553_rtc_set_time, .read_alarm = ds1553_rtc_read_alarm, .set_alarm = ds1553_rtc_set_alarm, .alarm_irq_enable = ds1553_rtc_alarm_irq_enable, - .update_irq_enable = ds1553_rtc_update_irq_enable, }; static ssize_t ds1553_nvram_read(struct file *filp, struct kobject *kobj, diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c index 950735415a7..27b7bf672ac 100644 --- a/drivers/rtc/rtc-ds3232.c +++ b/drivers/rtc/rtc-ds3232.c @@ -339,23 +339,6 @@ static int ds3232_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int ds3232_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct i2c_client *client = to_i2c_client(dev); - struct ds3232 *ds3232 = i2c_get_clientdata(client); - - if (client->irq <= 0) - return -EINVAL; - - if (enabled) - ds3232->rtc->irq_data |= RTC_UF; - else - ds3232->rtc->irq_data &= ~RTC_UF; - - ds3232_update_alarm(client); - return 0; -} - static irqreturn_t ds3232_irq(int irq, void *dev_id) { struct i2c_client *client = dev_id; @@ -406,7 +389,6 @@ static const struct rtc_class_ops ds3232_rtc_ops = { .read_alarm = ds3232_read_alarm, .set_alarm = ds3232_set_alarm, .alarm_irq_enable = ds3232_alarm_irq_enable, - .update_irq_enable = ds3232_update_irq_enable, }; static int __devinit ds3232_probe(struct i2c_client *client, diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index 2e16f72c905..b6473631d18 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -168,12 +168,6 @@ static int jz4740_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) return ret; } -static int jz4740_rtc_update_irq_enable(struct device *dev, unsigned int enable) -{ - struct jz4740_rtc *rtc = dev_get_drvdata(dev); - return jz4740_rtc_ctrl_set_bits(rtc, JZ_RTC_CTRL_1HZ_IRQ, enable); -} - static int jz4740_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) { struct jz4740_rtc *rtc = dev_get_drvdata(dev); @@ -185,7 +179,6 @@ static struct rtc_class_ops jz4740_rtc_ops = { .set_mmss = jz4740_rtc_set_mmss, .read_alarm = jz4740_rtc_read_alarm, .set_alarm = jz4740_rtc_set_alarm, - .update_irq_enable = jz4740_rtc_update_irq_enable, .alarm_irq_enable = jz4740_rtc_alarm_irq_enable, }; diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 5314b153bfb..c4200646955 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -282,12 +282,6 @@ static irqreturn_t mc13xxx_rtc_update_handler(int irq, void *dev) return IRQ_HANDLED; } -static int mc13xxx_rtc_update_irq_enable(struct device *dev, - unsigned int enabled) -{ - return mc13xxx_rtc_irq_enable(dev, enabled, MC13XXX_IRQ_1HZ); -} - static int mc13xxx_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { @@ -300,7 +294,6 @@ static const struct rtc_class_ops mc13xxx_rtc_ops = { .read_alarm = mc13xxx_rtc_read_alarm, .set_alarm = mc13xxx_rtc_set_alarm, .alarm_irq_enable = mc13xxx_rtc_alarm_irq_enable, - .update_irq_enable = mc13xxx_rtc_update_irq_enable, }; static irqreturn_t mc13xxx_rtc_reset_handler(int irq, void *dev) diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c index dfcdf0901d2..b40c1ff1ebc 100644 --- a/drivers/rtc/rtc-mpc5121.c +++ b/drivers/rtc/rtc-mpc5121.c @@ -240,32 +240,12 @@ static int mpc5121_rtc_alarm_irq_enable(struct device *dev, return 0; } -static int mpc5121_rtc_update_irq_enable(struct device *dev, - unsigned int enabled) -{ - struct mpc5121_rtc_data *rtc = dev_get_drvdata(dev); - struct mpc5121_rtc_regs __iomem *regs = rtc->regs; - int val; - - val = in_8(®s->int_enable); - - if (enabled) - val = (val & ~0x8) | 0x1; - else - val &= ~0x1; - - out_8(®s->int_enable, val); - - return 0; -} - static const struct rtc_class_ops mpc5121_rtc_ops = { .read_time = mpc5121_rtc_read_time, .set_time = mpc5121_rtc_set_time, .read_alarm = mpc5121_rtc_read_alarm, .set_alarm = mpc5121_rtc_set_alarm, .alarm_irq_enable = mpc5121_rtc_alarm_irq_enable, - .update_irq_enable = mpc5121_rtc_update_irq_enable, }; static int __devinit mpc5121_rtc_probe(struct platform_device *op, diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index 1db62db8469..b86bc328463 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -62,6 +62,17 @@ static inline int is_intr(u8 rtc_intr) return rtc_intr & RTC_IRQMASK; } +static inline unsigned char vrtc_is_updating(void) +{ + unsigned char uip; + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + uip = (vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP); + spin_unlock_irqrestore(&rtc_lock, flags); + return uip; +} + /* * rtc_time's year contains the increment over 1900, but vRTC's YEAR * register can't be programmed to value larger than 0x64, so vRTC @@ -76,7 +87,7 @@ static int mrst_read_time(struct device *dev, struct rtc_time *time) { unsigned long flags; - if (rtc_is_updating()) + if (vrtc_is_updating()) mdelay(20); spin_lock_irqsave(&rtc_lock, flags); @@ -236,25 +247,6 @@ static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t) return 0; } -static int mrst_irq_set_state(struct device *dev, int enabled) -{ - struct mrst_rtc *mrst = dev_get_drvdata(dev); - unsigned long flags; - - if (!mrst->irq) - return -ENXIO; - - spin_lock_irqsave(&rtc_lock, flags); - - if (enabled) - mrst_irq_enable(mrst, RTC_PIE); - else - mrst_irq_disable(mrst, RTC_PIE); - - spin_unlock_irqrestore(&rtc_lock, flags); - return 0; -} - /* Currently, the vRTC doesn't support UIE ON/OFF */ static int mrst_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { @@ -301,7 +293,6 @@ static const struct rtc_class_ops mrst_rtc_ops = { .read_alarm = mrst_read_alarm, .set_alarm = mrst_set_alarm, .proc = mrst_procfs, - .irq_set_state = mrst_irq_set_state, .alarm_irq_enable = mrst_rtc_alarm_irq_enable, }; diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 0b06c1e03fd..826ab64a8fa 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -274,12 +274,6 @@ static int mxc_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int mxc_rtc_update_irq_enable(struct device *dev, unsigned int enabled) -{ - mxc_rtc_irq_enable(dev, RTC_1HZ_BIT, enabled); - return 0; -} - /* * This function reads the current RTC time into tm in Gregorian date. */ @@ -368,7 +362,6 @@ static struct rtc_class_ops mxc_rtc_ops = { .read_alarm = mxc_rtc_read_alarm, .set_alarm = mxc_rtc_set_alarm, .alarm_irq_enable = mxc_rtc_alarm_irq_enable, - .update_irq_enable = mxc_rtc_update_irq_enable, }; static int __init mxc_rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c index ddb0857e15a..781068d62f2 100644 --- a/drivers/rtc/rtc-nuc900.c +++ b/drivers/rtc/rtc-nuc900.c @@ -134,20 +134,6 @@ static void nuc900_rtc_bin2bcd(struct device *dev, struct rtc_time *settm, gettm->bcd_hour = bin2bcd(settm->tm_hour) << 16; } -static int nuc900_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct nuc900_rtc *rtc = dev_get_drvdata(dev); - - if (enabled) - __raw_writel(__raw_readl(rtc->rtc_reg + REG_RTC_RIER)| - (TICKINTENB), rtc->rtc_reg + REG_RTC_RIER); - else - __raw_writel(__raw_readl(rtc->rtc_reg + REG_RTC_RIER)& - (~TICKINTENB), rtc->rtc_reg + REG_RTC_RIER); - - return 0; -} - static int nuc900_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct nuc900_rtc *rtc = dev_get_drvdata(dev); @@ -234,7 +220,6 @@ static struct rtc_class_ops nuc900_rtc_ops = { .read_alarm = nuc900_rtc_read_alarm, .set_alarm = nuc900_rtc_set_alarm, .alarm_irq_enable = nuc900_alarm_irq_enable, - .update_irq_enable = nuc900_update_irq_enable, }; static int __devinit nuc900_rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index b4dbf3a319b..de0dd7b1f14 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -135,44 +135,6 @@ static irqreturn_t rtc_irq(int irq, void *rtc) return IRQ_HANDLED; } -#ifdef CONFIG_RTC_INTF_DEV - -static int -omap_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) -{ - u8 reg; - - switch (cmd) { - case RTC_UIE_OFF: - case RTC_UIE_ON: - break; - default: - return -ENOIOCTLCMD; - } - - local_irq_disable(); - rtc_wait_not_busy(); - reg = rtc_read(OMAP_RTC_INTERRUPTS_REG); - switch (cmd) { - /* UIE = Update Interrupt Enable (1/second) */ - case RTC_UIE_OFF: - reg &= ~OMAP_RTC_INTERRUPTS_IT_TIMER; - break; - case RTC_UIE_ON: - reg |= OMAP_RTC_INTERRUPTS_IT_TIMER; - break; - } - rtc_wait_not_busy(); - rtc_write(reg, OMAP_RTC_INTERRUPTS_REG); - local_irq_enable(); - - return 0; -} - -#else -#define omap_rtc_ioctl NULL -#endif - static int omap_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { u8 reg; @@ -313,7 +275,6 @@ static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) } static struct rtc_class_ops omap_rtc_ops = { - .ioctl = omap_rtc_ioctl, .read_time = omap_rtc_read_time, .set_time = omap_rtc_set_time, .read_alarm = omap_rtc_read_alarm, diff --git a/drivers/rtc/rtc-pcap.c b/drivers/rtc/rtc-pcap.c index 25c0b3fd44f..a633abc4289 100644 --- a/drivers/rtc/rtc-pcap.c +++ b/drivers/rtc/rtc-pcap.c @@ -131,18 +131,12 @@ static int pcap_rtc_alarm_irq_enable(struct device *dev, unsigned int en) return pcap_rtc_irq_enable(dev, PCAP_IRQ_TODA, en); } -static int pcap_rtc_update_irq_enable(struct device *dev, unsigned int en) -{ - return pcap_rtc_irq_enable(dev, PCAP_IRQ_1HZ, en); -} - static const struct rtc_class_ops pcap_rtc_ops = { .read_time = pcap_rtc_read_time, .read_alarm = pcap_rtc_read_alarm, .set_alarm = pcap_rtc_set_alarm, .set_mmss = pcap_rtc_set_mmss, .alarm_irq_enable = pcap_rtc_alarm_irq_enable, - .update_irq_enable = pcap_rtc_update_irq_enable, }; static int __devinit pcap_rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-pcf50633.c b/drivers/rtc/rtc-pcf50633.c index 16edf94ab42..f90c574f9d0 100644 --- a/drivers/rtc/rtc-pcf50633.c +++ b/drivers/rtc/rtc-pcf50633.c @@ -106,25 +106,6 @@ pcf50633_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int -pcf50633_rtc_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct pcf50633_rtc *rtc = dev_get_drvdata(dev); - int err; - - if (enabled) - err = pcf50633_irq_unmask(rtc->pcf, PCF50633_IRQ_SECOND); - else - err = pcf50633_irq_mask(rtc->pcf, PCF50633_IRQ_SECOND); - - if (err < 0) - return err; - - rtc->second_enabled = enabled; - - return 0; -} - static int pcf50633_rtc_read_time(struct device *dev, struct rtc_time *tm) { struct pcf50633_rtc *rtc; @@ -262,8 +243,7 @@ static struct rtc_class_ops pcf50633_rtc_ops = { .set_time = pcf50633_rtc_set_time, .read_alarm = pcf50633_rtc_read_alarm, .set_alarm = pcf50633_rtc_set_alarm, - .alarm_irq_enable = pcf50633_rtc_alarm_irq_enable, - .update_irq_enable = pcf50633_rtc_update_irq_enable, + .alarm_irq_enable = pcf50633_rtc_alarm_irq_enable, }; static void pcf50633_rtc_irq(int irq, void *data) diff --git a/drivers/rtc/rtc-pl030.c b/drivers/rtc/rtc-pl030.c index bbdb2f02798..d554368c9f5 100644 --- a/drivers/rtc/rtc-pl030.c +++ b/drivers/rtc/rtc-pl030.c @@ -35,11 +35,6 @@ static irqreturn_t pl030_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int pl030_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) -{ - return -ENOIOCTLCMD; -} - static int pl030_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct pl030_rtc *rtc = dev_get_drvdata(dev); @@ -96,7 +91,6 @@ static int pl030_set_time(struct device *dev, struct rtc_time *tm) } static const struct rtc_class_ops pl030_ops = { - .ioctl = pl030_ioctl, .read_time = pl030_read_time, .set_time = pl030_set_time, .read_alarm = pl030_read_alarm, diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c index b7a6690e5b3..d829ea63c4f 100644 --- a/drivers/rtc/rtc-pl031.c +++ b/drivers/rtc/rtc-pl031.c @@ -293,57 +293,6 @@ static int pl031_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) return ret; } -/* Periodic interrupt is only available in ST variants. */ -static int pl031_irq_set_state(struct device *dev, int enabled) -{ - struct pl031_local *ldata = dev_get_drvdata(dev); - - if (enabled == 1) { - /* Clear any pending timer interrupt. */ - writel(RTC_BIT_PI, ldata->base + RTC_ICR); - - writel(readl(ldata->base + RTC_IMSC) | RTC_BIT_PI, - ldata->base + RTC_IMSC); - - /* Now start the timer */ - writel(readl(ldata->base + RTC_TCR) | RTC_TCR_EN, - ldata->base + RTC_TCR); - - } else { - writel(readl(ldata->base + RTC_IMSC) & (~RTC_BIT_PI), - ldata->base + RTC_IMSC); - - /* Also stop the timer */ - writel(readl(ldata->base + RTC_TCR) & (~RTC_TCR_EN), - ldata->base + RTC_TCR); - } - /* Wait at least 1 RTC32 clock cycle to ensure next access - * to RTC_TCR will succeed. - */ - udelay(40); - - return 0; -} - -static int pl031_irq_set_freq(struct device *dev, int freq) -{ - struct pl031_local *ldata = dev_get_drvdata(dev); - - /* Cant set timer if it is already enabled */ - if (readl(ldata->base + RTC_TCR) & RTC_TCR_EN) { - dev_err(dev, "can't change frequency while timer enabled\n"); - return -EINVAL; - } - - /* If self start bit in RTC_TCR is set timer will start here, - * but we never set that bit. Instead we start the timer when - * set_state is called with enabled == 1. - */ - writel(RTC_TIMER_FREQ / freq, ldata->base + RTC_TLR); - - return 0; -} - static int pl031_remove(struct amba_device *adev) { struct pl031_local *ldata = dev_get_drvdata(&adev->dev); @@ -440,8 +389,6 @@ static struct rtc_class_ops stv1_pl031_ops = { .read_alarm = pl031_read_alarm, .set_alarm = pl031_set_alarm, .alarm_irq_enable = pl031_alarm_irq_enable, - .irq_set_state = pl031_irq_set_state, - .irq_set_freq = pl031_irq_set_freq, }; /* And the second ST derivative */ @@ -451,8 +398,6 @@ static struct rtc_class_ops stv2_pl031_ops = { .read_alarm = pl031_stv2_read_alarm, .set_alarm = pl031_stv2_set_alarm, .alarm_irq_enable = pl031_alarm_irq_enable, - .irq_set_state = pl031_irq_set_state, - .irq_set_freq = pl031_irq_set_freq, }; static struct amba_id pl031_ids[] = { diff --git a/drivers/rtc/rtc-proc.c b/drivers/rtc/rtc-proc.c index 242bbf86c74..0a59fda5c09 100644 --- a/drivers/rtc/rtc-proc.c +++ b/drivers/rtc/rtc-proc.c @@ -69,6 +69,14 @@ static int rtc_proc_show(struct seq_file *seq, void *offset) alrm.enabled ? "yes" : "no"); seq_printf(seq, "alrm_pending\t: %s\n", alrm.pending ? "yes" : "no"); + seq_printf(seq, "update IRQ enabled\t: %s\n", + (rtc->uie_rtctimer.enabled) ? "yes" : "no"); + seq_printf(seq, "periodic IRQ enabled\t: %s\n", + (rtc->pie_enabled) ? "yes" : "no"); + seq_printf(seq, "periodic IRQ frequency\t: %d\n", + rtc->irq_freq); + seq_printf(seq, "max user IRQ frequency\t: %d\n", + rtc->max_user_freq); } seq_printf(seq, "24hr\t\t: yes\n"); diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c index 29e867a1aaa..fc9f4991574 100644 --- a/drivers/rtc/rtc-pxa.c +++ b/drivers/rtc/rtc-pxa.c @@ -209,32 +209,6 @@ static void pxa_rtc_release(struct device *dev) free_irq(pxa_rtc->irq_1Hz, dev); } -static int pxa_periodic_irq_set_freq(struct device *dev, int freq) -{ - struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); - int period_ms; - - if (freq < 1 || freq > MAXFREQ_PERIODIC) - return -EINVAL; - - period_ms = 1000 / freq; - rtc_writel(pxa_rtc, PIAR, period_ms); - - return 0; -} - -static int pxa_periodic_irq_set_state(struct device *dev, int enabled) -{ - struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); - - if (enabled) - rtsr_set_bits(pxa_rtc, RTSR_PIALE | RTSR_PICE); - else - rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_PICE); - - return 0; -} - static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); @@ -250,21 +224,6 @@ static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int pxa_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); - - spin_lock_irq(&pxa_rtc->lock); - - if (enabled) - rtsr_set_bits(pxa_rtc, RTSR_HZE); - else - rtsr_clear_bits(pxa_rtc, RTSR_HZE); - - spin_unlock_irq(&pxa_rtc->lock); - return 0; -} - static int pxa_rtc_read_time(struct device *dev, struct rtc_time *tm) { struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); @@ -346,10 +305,7 @@ static const struct rtc_class_ops pxa_rtc_ops = { .read_alarm = pxa_rtc_read_alarm, .set_alarm = pxa_rtc_set_alarm, .alarm_irq_enable = pxa_alarm_irq_enable, - .update_irq_enable = pxa_update_irq_enable, .proc = pxa_rtc_proc, - .irq_set_state = pxa_periodic_irq_set_state, - .irq_set_freq = pxa_periodic_irq_set_freq, }; static int __init pxa_rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c index 6aaa1550e3b..85c1b848dd7 100644 --- a/drivers/rtc/rtc-rs5c372.c +++ b/drivers/rtc/rtc-rs5c372.c @@ -281,57 +281,6 @@ static int rs5c372_rtc_set_time(struct device *dev, struct rtc_time *tm) return rs5c372_set_datetime(to_i2c_client(dev), tm); } -#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE) - -static int -rs5c_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) -{ - struct i2c_client *client = to_i2c_client(dev); - struct rs5c372 *rs5c = i2c_get_clientdata(client); - unsigned char buf; - int status, addr; - - buf = rs5c->regs[RS5C_REG_CTRL1]; - switch (cmd) { - case RTC_UIE_OFF: - case RTC_UIE_ON: - /* some 327a modes use a different IRQ pin for 1Hz irqs */ - if (rs5c->type == rtc_rs5c372a - && (buf & RS5C372A_CTRL1_SL1)) - return -ENOIOCTLCMD; - default: - return -ENOIOCTLCMD; - } - - status = rs5c_get_regs(rs5c); - if (status < 0) - return status; - - addr = RS5C_ADDR(RS5C_REG_CTRL1); - switch (cmd) { - case RTC_UIE_OFF: /* update off */ - buf &= ~RS5C_CTRL1_CT_MASK; - break; - case RTC_UIE_ON: /* update on */ - buf &= ~RS5C_CTRL1_CT_MASK; - buf |= RS5C_CTRL1_CT4; - break; - } - - if (i2c_smbus_write_byte_data(client, addr, buf) < 0) { - printk(KERN_WARNING "%s: can't update alarm\n", - rs5c->rtc->name); - status = -EIO; - } else - rs5c->regs[RS5C_REG_CTRL1] = buf; - - return status; -} - -#else -#define rs5c_rtc_ioctl NULL -#endif - static int rs5c_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { @@ -480,7 +429,6 @@ static int rs5c372_rtc_proc(struct device *dev, struct seq_file *seq) static const struct rtc_class_ops rs5c372_rtc_ops = { .proc = rs5c372_rtc_proc, - .ioctl = rs5c_rtc_ioctl, .read_time = rs5c372_rtc_read_time, .set_time = rs5c372_rtc_set_time, .read_alarm = rs5c_read_alarm, diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c index af32a62e12a..fde172fb2ab 100644 --- a/drivers/rtc/rtc-rx8025.c +++ b/drivers/rtc/rtc-rx8025.c @@ -424,37 +424,12 @@ static int rx8025_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int rx8025_irq_set_state(struct device *dev, int enabled) -{ - struct i2c_client *client = to_i2c_client(dev); - struct rx8025_data *rx8025 = i2c_get_clientdata(client); - int ctrl1; - int err; - - if (client->irq <= 0) - return -ENXIO; - - ctrl1 = rx8025->ctrl1 & ~RX8025_BIT_CTRL1_CT; - if (enabled) - ctrl1 |= RX8025_BIT_CTRL1_CT_1HZ; - if (ctrl1 != rx8025->ctrl1) { - rx8025->ctrl1 = ctrl1; - err = rx8025_write_reg(rx8025->client, RX8025_REG_CTRL1, - rx8025->ctrl1); - if (err) - return err; - } - - return 0; -} - static struct rtc_class_ops rx8025_rtc_ops = { .read_time = rx8025_get_time, .set_time = rx8025_set_time, .read_alarm = rx8025_read_alarm, .set_alarm = rx8025_set_alarm, .alarm_irq_enable = rx8025_alarm_irq_enable, - .irq_set_state = rx8025_irq_set_state, }; /* diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index b80fa288240..714964913e5 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -93,37 +93,6 @@ static int s3c_rtc_setaie(struct device *dev, unsigned int enabled) return 0; } -static int s3c_rtc_setpie(struct device *dev, int enabled) -{ - unsigned int tmp; - - pr_debug("%s: pie=%d\n", __func__, enabled); - - spin_lock_irq(&s3c_rtc_pie_lock); - - if (s3c_rtc_cpu_type == TYPE_S3C64XX) { - tmp = readw(s3c_rtc_base + S3C2410_RTCCON); - tmp &= ~S3C64XX_RTCCON_TICEN; - - if (enabled) - tmp |= S3C64XX_RTCCON_TICEN; - - writew(tmp, s3c_rtc_base + S3C2410_RTCCON); - } else { - tmp = readb(s3c_rtc_base + S3C2410_TICNT); - tmp &= ~S3C2410_TICNT_ENABLE; - - if (enabled) - tmp |= S3C2410_TICNT_ENABLE; - - writeb(tmp, s3c_rtc_base + S3C2410_TICNT); - } - - spin_unlock_irq(&s3c_rtc_pie_lock); - - return 0; -} - static int s3c_rtc_setfreq(struct device *dev, int freq) { struct platform_device *pdev = to_platform_device(dev); @@ -379,8 +348,6 @@ static const struct rtc_class_ops s3c_rtcops = { .set_time = s3c_rtc_settime, .read_alarm = s3c_rtc_getalarm, .set_alarm = s3c_rtc_setalarm, - .irq_set_freq = s3c_rtc_setfreq, - .irq_set_state = s3c_rtc_setpie, .proc = s3c_rtc_proc, .alarm_irq_enable = s3c_rtc_setaie, }; diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index 5dfe5ffcb0d..0b40bb88a88 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -43,7 +43,6 @@ #define RTC_DEF_TRIM 0 static const unsigned long RTC_FREQ = 1024; -static unsigned long timer_freq; static struct rtc_time rtc_alarm; static DEFINE_SPINLOCK(sa1100_rtc_lock); @@ -156,114 +155,11 @@ static irqreturn_t sa1100_rtc_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int sa1100_irq_set_freq(struct device *dev, int freq) -{ - if (freq < 1 || freq > timer_freq) { - return -EINVAL; - } else { - struct rtc_device *rtc = (struct rtc_device *)dev; - - rtc->irq_freq = freq; - - return 0; - } -} - -static int rtc_timer1_count; - -static int sa1100_irq_set_state(struct device *dev, int enabled) -{ - spin_lock_irq(&sa1100_rtc_lock); - if (enabled) { - struct rtc_device *rtc = (struct rtc_device *)dev; - - OSMR1 = timer_freq / rtc->irq_freq + OSCR; - OIER |= OIER_E1; - rtc_timer1_count = 1; - } else { - OIER &= ~OIER_E1; - } - spin_unlock_irq(&sa1100_rtc_lock); - - return 0; -} - -static inline int sa1100_timer1_retrigger(struct rtc_device *rtc) -{ - unsigned long diff; - unsigned long period = timer_freq / rtc->irq_freq; - - spin_lock_irq(&sa1100_rtc_lock); - - do { - OSMR1 += period; - diff = OSMR1 - OSCR; - /* If OSCR > OSMR1, diff is a very large number (unsigned - * math). This means we have a lost interrupt. */ - } while (diff > period); - OIER |= OIER_E1; - - spin_unlock_irq(&sa1100_rtc_lock); - - return 0; -} - -static irqreturn_t timer1_interrupt(int irq, void *dev_id) -{ - struct platform_device *pdev = to_platform_device(dev_id); - struct rtc_device *rtc = platform_get_drvdata(pdev); - - /* - * If we match for the first time, rtc_timer1_count will be 1. - * Otherwise, we wrapped around (very unlikely but - * still possible) so compute the amount of missed periods. - * The match reg is updated only when the data is actually retrieved - * to avoid unnecessary interrupts. - */ - OSSR = OSSR_M1; /* clear match on timer1 */ - - rtc_update_irq(rtc, rtc_timer1_count, RTC_PF | RTC_IRQF); - - if (rtc_timer1_count == 1) - rtc_timer1_count = - (rtc->irq_freq * ((1 << 30) / (timer_freq >> 2))); - - /* retrigger. */ - sa1100_timer1_retrigger(rtc); - - return IRQ_HANDLED; -} - -static int sa1100_rtc_read_callback(struct device *dev, int data) -{ - if (data & RTC_PF) { - struct rtc_device *rtc = (struct rtc_device *)dev; - - /* interpolate missed periods and set match for the next */ - unsigned long period = timer_freq / rtc->irq_freq; - unsigned long oscr = OSCR; - unsigned long osmr1 = OSMR1; - unsigned long missed = (oscr - osmr1)/period; - data += missed << 8; - OSSR = OSSR_M1; /* clear match on timer 1 */ - OSMR1 = osmr1 + (missed + 1)*period; - /* Ensure we didn't miss another match in the mean time. - * Here we compare (match - OSCR) 8 instead of 0 -- - * see comment in pxa_timer_interrupt() for explanation. - */ - while ((signed long)((osmr1 = OSMR1) - OSCR) <= 8) { - data += 0x100; - OSSR = OSSR_M1; /* clear match on timer 1 */ - OSMR1 = osmr1 + period; - } - } - return data; -} - static int sa1100_rtc_open(struct device *dev) { int ret; - struct rtc_device *rtc = (struct rtc_device *)dev; + struct platform_device *plat_dev = to_platform_device(dev); + struct rtc_device *rtc = platform_get_drvdata(plat_dev); ret = request_irq(IRQ_RTC1Hz, sa1100_rtc_interrupt, IRQF_DISABLED, "rtc 1Hz", dev); @@ -277,19 +173,11 @@ static int sa1100_rtc_open(struct device *dev) dev_err(dev, "IRQ %d already in use.\n", IRQ_RTCAlrm); goto fail_ai; } - ret = request_irq(IRQ_OST1, timer1_interrupt, IRQF_DISABLED, - "rtc timer", dev); - if (ret) { - dev_err(dev, "IRQ %d already in use.\n", IRQ_OST1); - goto fail_pi; - } rtc->max_user_freq = RTC_FREQ; - sa1100_irq_set_freq(dev, RTC_FREQ); + rtc_irq_set_freq(rtc, NULL, RTC_FREQ); return 0; - fail_pi: - free_irq(IRQ_RTCAlrm, dev); fail_ai: free_irq(IRQ_RTC1Hz, dev); fail_ui: @@ -304,30 +192,10 @@ static void sa1100_rtc_release(struct device *dev) OSSR = OSSR_M1; spin_unlock_irq(&sa1100_rtc_lock); - free_irq(IRQ_OST1, dev); free_irq(IRQ_RTCAlrm, dev); free_irq(IRQ_RTC1Hz, dev); } - -static int sa1100_rtc_ioctl(struct device *dev, unsigned int cmd, - unsigned long arg) -{ - switch (cmd) { - case RTC_UIE_OFF: - spin_lock_irq(&sa1100_rtc_lock); - RTSR &= ~RTSR_HZE; - spin_unlock_irq(&sa1100_rtc_lock); - return 0; - case RTC_UIE_ON: - spin_lock_irq(&sa1100_rtc_lock); - RTSR |= RTSR_HZE; - spin_unlock_irq(&sa1100_rtc_lock); - return 0; - } - return -ENOIOCTLCMD; -} - static int sa1100_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { spin_lock_irq(&sa1100_rtc_lock); @@ -386,31 +254,20 @@ static int sa1100_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) static int sa1100_rtc_proc(struct device *dev, struct seq_file *seq) { - struct rtc_device *rtc = (struct rtc_device *)dev; - - seq_printf(seq, "trim/divider\t: 0x%08x\n", (u32) RTTR); - seq_printf(seq, "update_IRQ\t: %s\n", - (RTSR & RTSR_HZE) ? "yes" : "no"); - seq_printf(seq, "periodic_IRQ\t: %s\n", - (OIER & OIER_E1) ? "yes" : "no"); - seq_printf(seq, "periodic_freq\t: %d\n", rtc->irq_freq); - seq_printf(seq, "RTSR\t\t: 0x%08x\n", (u32)RTSR); + seq_printf(seq, "trim/divider\t\t: 0x%08x\n", (u32) RTTR); + seq_printf(seq, "RTSR\t\t\t: 0x%08x\n", (u32)RTSR); return 0; } static const struct rtc_class_ops sa1100_rtc_ops = { .open = sa1100_rtc_open, - .read_callback = sa1100_rtc_read_callback, .release = sa1100_rtc_release, - .ioctl = sa1100_rtc_ioctl, .read_time = sa1100_rtc_read_time, .set_time = sa1100_rtc_set_time, .read_alarm = sa1100_rtc_read_alarm, .set_alarm = sa1100_rtc_set_alarm, .proc = sa1100_rtc_proc, - .irq_set_freq = sa1100_irq_set_freq, - .irq_set_state = sa1100_irq_set_state, .alarm_irq_enable = sa1100_rtc_alarm_irq_enable, }; @@ -418,8 +275,6 @@ static int sa1100_rtc_probe(struct platform_device *pdev) { struct rtc_device *rtc; - timer_freq = get_clock_tick_rate(); - /* * According to the manual we should be able to let RTTR be zero * and then a default diviser for a 32.768KHz clock is used. @@ -445,11 +300,6 @@ static int sa1100_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc); - /* Set the irq_freq */ - /*TODO: Find out who is messing with this value after we initialize - * it here.*/ - rtc->irq_freq = RTC_FREQ; - /* Fix for a nasty initialization problem the in SA11xx RTSR register. * See also the comments in sa1100_rtc_interrupt(). * diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index 93314a9e7fa..e55dc1ac83a 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -344,27 +344,6 @@ static inline void sh_rtc_setcie(struct device *dev, unsigned int enable) spin_unlock_irq(&rtc->lock); } -static int sh_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) -{ - struct sh_rtc *rtc = dev_get_drvdata(dev); - unsigned int ret = 0; - - switch (cmd) { - case RTC_UIE_OFF: - rtc->periodic_freq &= ~PF_OXS; - sh_rtc_setcie(dev, 0); - break; - case RTC_UIE_ON: - rtc->periodic_freq |= PF_OXS; - sh_rtc_setcie(dev, 1); - break; - default: - ret = -ENOIOCTLCMD; - } - - return ret; -} - static int sh_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { sh_rtc_setaie(dev, enabled); @@ -598,13 +577,10 @@ static int sh_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) } static struct rtc_class_ops sh_rtc_ops = { - .ioctl = sh_rtc_ioctl, .read_time = sh_rtc_read_time, .set_time = sh_rtc_set_time, .read_alarm = sh_rtc_read_alarm, .set_alarm = sh_rtc_set_alarm, - .irq_set_state = sh_rtc_irq_set_state, - .irq_set_freq = sh_rtc_irq_set_freq, .proc = sh_rtc_proc, .alarm_irq_enable = sh_rtc_alarm_irq_enable, }; diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 7e7d0c806f2..572e9534b59 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -115,19 +115,6 @@ static int stmp3xxx_alarm_irq_enable(struct device *dev, unsigned int enabled) return 0; } -static int stmp3xxx_update_irq_enable(struct device *dev, unsigned int enabled) -{ - struct stmp3xxx_rtc_data *rtc_data = dev_get_drvdata(dev); - - if (enabled) - stmp3xxx_setl(BM_RTC_CTRL_ONEMSEC_IRQ_EN, - rtc_data->io + HW_RTC_CTRL); - else - stmp3xxx_clearl(BM_RTC_CTRL_ONEMSEC_IRQ_EN, - rtc_data->io + HW_RTC_CTRL); - return 0; -} - static int stmp3xxx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) { struct stmp3xxx_rtc_data *rtc_data = dev_get_drvdata(dev); @@ -149,8 +136,6 @@ static int stmp3xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) static struct rtc_class_ops stmp3xxx_rtc_ops = { .alarm_irq_enable = stmp3xxx_alarm_irq_enable, - .update_irq_enable = - stmp3xxx_update_irq_enable, .read_time = stmp3xxx_rtc_gettime, .set_mmss = stmp3xxx_rtc_set_mmss, .read_alarm = stmp3xxx_rtc_read_alarm, diff --git a/drivers/rtc/rtc-test.c b/drivers/rtc/rtc-test.c index a82d6fe9707..7e96254bd36 100644 --- a/drivers/rtc/rtc-test.c +++ b/drivers/rtc/rtc-test.c @@ -78,11 +78,16 @@ static ssize_t test_irq_store(struct device *dev, struct rtc_device *rtc = platform_get_drvdata(plat_dev); retval = count; - if (strncmp(buf, "tick", 4) == 0) + if (strncmp(buf, "tick", 4) == 0 && rtc->pie_enabled) rtc_update_irq(rtc, 1, RTC_PF | RTC_IRQF); - else if (strncmp(buf, "alarm", 5) == 0) - rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF); - else if (strncmp(buf, "update", 6) == 0) + else if (strncmp(buf, "alarm", 5) == 0) { + struct rtc_wkalrm alrm; + int err = rtc_read_alarm(rtc, &alrm); + + if (!err && alrm.enabled) + rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF); + + } else if (strncmp(buf, "update", 6) == 0 && rtc->uie_rtctimer.enabled) rtc_update_irq(rtc, 1, RTC_UF | RTC_IRQF); else retval = -EINVAL; diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c index ed1b8682812..f9a2799c44d 100644 --- a/drivers/rtc/rtc-twl.c +++ b/drivers/rtc/rtc-twl.c @@ -213,18 +213,6 @@ static int twl_rtc_alarm_irq_enable(struct device *dev, unsigned enabled) return ret; } -static int twl_rtc_update_irq_enable(struct device *dev, unsigned enabled) -{ - int ret; - - if (enabled) - ret = set_rtc_irq_bit(BIT_RTC_INTERRUPTS_REG_IT_TIMER_M); - else - ret = mask_rtc_irq_bit(BIT_RTC_INTERRUPTS_REG_IT_TIMER_M); - - return ret; -} - /* * Gets current TWL RTC time and date parameters. * @@ -433,7 +421,6 @@ static struct rtc_class_ops twl_rtc_ops = { .read_alarm = twl_rtc_read_alarm, .set_alarm = twl_rtc_set_alarm, .alarm_irq_enable = twl_rtc_alarm_irq_enable, - .update_irq_enable = twl_rtc_update_irq_enable, }; /*----------------------------------------------------------------------*/ diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index 769190ac6d1..c5698cda366 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -207,36 +207,6 @@ static int vr41xx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) return 0; } -static int vr41xx_rtc_irq_set_freq(struct device *dev, int freq) -{ - u64 count; - - if (!is_power_of_2(freq)) - return -EINVAL; - count = RTC_FREQUENCY; - do_div(count, freq); - - spin_lock_irq(&rtc_lock); - - periodic_count = count; - rtc1_write(RTCL1LREG, periodic_count); - rtc1_write(RTCL1HREG, periodic_count >> 16); - - spin_unlock_irq(&rtc_lock); - - return 0; -} - -static int vr41xx_rtc_irq_set_state(struct device *dev, int enabled) -{ - if (enabled) - enable_irq(pie_irq); - else - disable_irq(pie_irq); - - return 0; -} - static int vr41xx_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -308,8 +278,6 @@ static const struct rtc_class_ops vr41xx_rtc_ops = { .set_time = vr41xx_rtc_set_time, .read_alarm = vr41xx_rtc_read_alarm, .set_alarm = vr41xx_rtc_set_alarm, - .irq_set_freq = vr41xx_rtc_irq_set_freq, - .irq_set_state = vr41xx_rtc_irq_set_state, }; static int __devinit rtc_probe(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c index 82931dc65c0..bdc909bd56d 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -315,21 +315,6 @@ static int wm831x_rtc_alarm_irq_enable(struct device *dev, return wm831x_rtc_stop_alarm(wm831x_rtc); } -static int wm831x_rtc_update_irq_enable(struct device *dev, - unsigned int enabled) -{ - struct wm831x_rtc *wm831x_rtc = dev_get_drvdata(dev); - int val; - - if (enabled) - val = 1 << WM831X_RTC_PINT_FREQ_SHIFT; - else - val = 0; - - return wm831x_set_bits(wm831x_rtc->wm831x, WM831X_RTC_CONTROL, - WM831X_RTC_PINT_FREQ_MASK, val); -} - static irqreturn_t wm831x_alm_irq(int irq, void *data) { struct wm831x_rtc *wm831x_rtc = data; @@ -354,7 +339,6 @@ static const struct rtc_class_ops wm831x_rtc_ops = { .read_alarm = wm831x_rtc_readalarm, .set_alarm = wm831x_rtc_setalarm, .alarm_irq_enable = wm831x_rtc_alarm_irq_enable, - .update_irq_enable = wm831x_rtc_update_irq_enable, }; #ifdef CONFIG_PM diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c index 3d0dc76b38a..66421426e40 100644 --- a/drivers/rtc/rtc-wm8350.c +++ b/drivers/rtc/rtc-wm8350.c @@ -302,26 +302,6 @@ static int wm8350_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) return ret; } -static int wm8350_rtc_update_irq_enable(struct device *dev, - unsigned int enabled) -{ - struct wm8350 *wm8350 = dev_get_drvdata(dev); - - /* Suppress duplicate changes since genirq nests enable and - * disable calls. */ - if (enabled == wm8350->rtc.update_enabled) - return 0; - - if (enabled) - wm8350_unmask_irq(wm8350, WM8350_IRQ_RTC_SEC); - else - wm8350_mask_irq(wm8350, WM8350_IRQ_RTC_SEC); - - wm8350->rtc.update_enabled = enabled; - - return 0; -} - static irqreturn_t wm8350_rtc_alarm_handler(int irq, void *data) { struct wm8350 *wm8350 = data; @@ -357,7 +337,6 @@ static const struct rtc_class_ops wm8350_rtc_ops = { .read_alarm = wm8350_rtc_readalarm, .set_alarm = wm8350_rtc_setalarm, .alarm_irq_enable = wm8350_rtc_alarm_irq_enable, - .update_irq_enable = wm8350_rtc_update_irq_enable, }; #ifdef CONFIG_PM diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c index 95928833855..a429b01d028 100644 --- a/drivers/spi/pxa2xx_spi.c +++ b/drivers/spi/pxa2xx_spi.c @@ -1557,9 +1557,7 @@ static int __devinit pxa2xx_spi_probe(struct platform_device *pdev) drv_data->ssp = ssp; master->dev.parent = &pdev->dev; -#ifdef CONFIG_OF master->dev.of_node = pdev->dev.of_node; -#endif /* the spi->mode bits understood by this driver: */ master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; diff --git a/drivers/spi/pxa2xx_spi_pci.c b/drivers/spi/pxa2xx_spi_pci.c index 19752b09e15..378e504f89e 100644 --- a/drivers/spi/pxa2xx_spi_pci.c +++ b/drivers/spi/pxa2xx_spi_pci.c @@ -89,9 +89,7 @@ static int __devinit ce4100_spi_probe(struct pci_dev *dev, goto err_nomem; pdev->dev.parent = &dev->dev; -#ifdef CONFIG_OF pdev->dev.of_node = dev->dev.of_node; -#endif ssp = &spi_info->ssp; ssp->phys_base = pci_resource_start(dev, 0); ssp->mmio_base = ioremap(phys_beg, phys_len); diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c index 7adaef62a99..4d2c75df886 100644 --- a/drivers/spi/xilinx_spi.c +++ b/drivers/spi/xilinx_spi.c @@ -351,14 +351,12 @@ static irqreturn_t xilinx_spi_irq(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef CONFIG_OF static const struct of_device_id xilinx_spi_of_match[] = { { .compatible = "xlnx,xps-spi-2.00.a", }, { .compatible = "xlnx,xps-spi-2.00.b", }, {} }; MODULE_DEVICE_TABLE(of, xilinx_spi_of_match); -#endif struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem, u32 irq, s16 bus_num, int num_cs, int little_endian, int bits_per_word) @@ -394,9 +392,7 @@ struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem, master->bus_num = bus_num; master->num_chipselect = num_cs; -#ifdef CONFIG_OF master->dev.of_node = dev->of_node; -#endif xspi->mem = *mem; xspi->irq = irq; @@ -539,9 +535,7 @@ static struct platform_driver xilinx_spi_driver = { .driver = { .name = XILINX_SPI_NAME, .owner = THIS_MODULE, -#ifdef CONFIG_OF .of_match_table = xilinx_spi_of_match, -#endif }, }; diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c index 158cecbec71..4a109835e42 100644 --- a/drivers/target/target_core_tmr.c +++ b/drivers/target/target_core_tmr.c @@ -282,6 +282,9 @@ int core_tmr_lun_reset( atomic_set(&task->task_active, 0); atomic_set(&task->task_stop, 0); + } else { + if (atomic_read(&task->task_execute_queue) != 0) + transport_remove_task_from_execute_queue(task, dev); } __transport_stop_task_timer(task, &flags); @@ -301,6 +304,7 @@ int core_tmr_lun_reset( DEBUG_LR("LUN_RESET: got t_transport_active = 1 for" " task: %p, t_fe_count: %d dev: %p\n", task, fe_count, dev); + atomic_set(&T_TASK(cmd)->t_transport_aborted, 1); spin_unlock_irqrestore(&T_TASK(cmd)->t_state_lock, flags); core_tmr_handle_tas_abort(tmr_nacl, cmd, tas, fe_count); @@ -310,6 +314,7 @@ int core_tmr_lun_reset( } DEBUG_LR("LUN_RESET: Got t_transport_active = 0 for task: %p," " t_fe_count: %d dev: %p\n", task, fe_count, dev); + atomic_set(&T_TASK(cmd)->t_transport_aborted, 1); spin_unlock_irqrestore(&T_TASK(cmd)->t_state_lock, flags); core_tmr_handle_tas_abort(tmr_nacl, cmd, tas, fe_count); diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 236e22d8cfa..4bbf6c147f8 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1207,7 +1207,7 @@ transport_get_task_from_execute_queue(struct se_device *dev) * * */ -static void transport_remove_task_from_execute_queue( +void transport_remove_task_from_execute_queue( struct se_task *task, struct se_device *dev) { @@ -5549,7 +5549,8 @@ static void transport_generic_wait_for_tasks( atomic_set(&T_TASK(cmd)->transport_lun_stop, 0); } - if (!atomic_read(&T_TASK(cmd)->t_transport_active)) + if (!atomic_read(&T_TASK(cmd)->t_transport_active) || + atomic_read(&T_TASK(cmd)->t_transport_aborted)) goto remove; atomic_set(&T_TASK(cmd)->t_transport_stop, 1); @@ -5956,6 +5957,9 @@ static void transport_processing_shutdown(struct se_device *dev) atomic_set(&task->task_active, 0); atomic_set(&task->task_stop, 0); + } else { + if (atomic_read(&task->task_execute_queue) != 0) + transport_remove_task_from_execute_queue(task, dev); } __transport_stop_task_timer(task, &flags); diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c index eca855a55c0..3de4ba0260a 100644 --- a/drivers/watchdog/cpwd.c +++ b/drivers/watchdog/cpwd.c @@ -646,7 +646,7 @@ static int __devexit cpwd_remove(struct platform_device *op) struct cpwd *p = dev_get_drvdata(&op->dev); int i; - for (i = 0; i < 4; i++) { + for (i = 0; i < WD_NUMDEVS; i++) { misc_deregister(&p->devs[i].misc); if (!p->enabled) { diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 24b966d5061..204a5603c4a 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -710,7 +710,7 @@ static int __devinit hpwdt_init_nmi_decoding(struct pci_dev *dev) return 0; } -static void __devexit hpwdt_exit_nmi_decoding(void) +static void hpwdt_exit_nmi_decoding(void) { unregister_die_notifier(&die_notifier); if (cru_rom_addr) @@ -726,7 +726,7 @@ static int __devinit hpwdt_init_nmi_decoding(struct pci_dev *dev) return 0; } -static void __devexit hpwdt_exit_nmi_decoding(void) +static void hpwdt_exit_nmi_decoding(void) { } #endif /* CONFIG_HPWDT_NMI_DECODING */ diff --git a/drivers/watchdog/sbc_fitpc2_wdt.c b/drivers/watchdog/sbc_fitpc2_wdt.c index c7d67e9a746..79906255eeb 100644 --- a/drivers/watchdog/sbc_fitpc2_wdt.c +++ b/drivers/watchdog/sbc_fitpc2_wdt.c @@ -201,11 +201,14 @@ static struct miscdevice fitpc2_wdt_miscdev = { static int __init fitpc2_wdt_init(void) { int err; + const char *brd_name; - if (!strstr(dmi_get_system_info(DMI_BOARD_NAME), "SBC-FITPC2")) + brd_name = dmi_get_system_info(DMI_BOARD_NAME); + + if (!brd_name || !strstr(brd_name, "SBC-FITPC2")) return -ENODEV; - pr_info("%s found\n", dmi_get_system_info(DMI_BOARD_NAME)); + pr_info("%s found\n", brd_name); if (!request_region(COMMAND_PORT, 1, WATCHDOG_NAME)) { pr_err("I/O address 0x%04x already in use\n", COMMAND_PORT); diff --git a/drivers/watchdog/sch311x_wdt.c b/drivers/watchdog/sch311x_wdt.c index 0461858e07d..b61ab1c5429 100644 --- a/drivers/watchdog/sch311x_wdt.c +++ b/drivers/watchdog/sch311x_wdt.c @@ -508,7 +508,7 @@ static int __init sch311x_detect(int sio_config_port, unsigned short *addr) sch311x_sio_outb(sio_config_port, 0x07, 0x0a); /* Check if Logical Device Register is currently active */ - if (sch311x_sio_inb(sio_config_port, 0x30) && 0x01 == 0) + if ((sch311x_sio_inb(sio_config_port, 0x30) & 0x01) == 0) printk(KERN_INFO PFX "Seems that LDN 0x0a is not active...\n"); /* Get the base address of the runtime registers */ diff --git a/drivers/watchdog/w83697ug_wdt.c b/drivers/watchdog/w83697ug_wdt.c index a6c12dec91a..df2a64dc967 100644 --- a/drivers/watchdog/w83697ug_wdt.c +++ b/drivers/watchdog/w83697ug_wdt.c @@ -109,7 +109,7 @@ static int w83697ug_select_wd_register(void) outb_p(0x08, WDT_EFDR); /* select logical device 8 (GPIO2) */ outb_p(0x30, WDT_EFER); /* select CR30 */ c = inb_p(WDT_EFDR); - outb_p(c || 0x01, WDT_EFDR); /* set bit 0 to activate GPIO2 */ + outb_p(c | 0x01, WDT_EFDR); /* set bit 0 to activate GPIO2 */ return 0; } diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 43f9f02c7db..718050ace08 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -232,7 +232,7 @@ static int increase_reservation(unsigned long nr_pages) set_phys_to_machine(pfn, frame_list[i]); /* Link back into the page tables if not highmem. */ - if (pfn < max_low_pfn) { + if (!xen_hvm_domain() && pfn < max_low_pfn) { int ret; ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), @@ -280,7 +280,7 @@ static int decrease_reservation(unsigned long nr_pages) scrub_page(page); - if (!PageHighMem(page)) { + if (!xen_hvm_domain() && !PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), __pte_ma(0), 0); @@ -296,7 +296,7 @@ static int decrease_reservation(unsigned long nr_pages) /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); balloon_append(pfn_to_page(pfn)); } @@ -392,15 +392,19 @@ static struct notifier_block xenstore_notifier; static int __init balloon_init(void) { - unsigned long pfn, extra_pfn_end; + unsigned long pfn, nr_pages, extra_pfn_end; struct page *page; - if (!xen_pv_domain()) + if (!xen_domain()) return -ENODEV; pr_info("xen_balloon: Initialising balloon driver.\n"); - balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); + if (xen_pv_domain()) + nr_pages = xen_start_info->nr_pages; + else + nr_pages = max_pfn; + balloon_stats.current_pages = min(nr_pages, max_pfn); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 74681478100..0ad1699a1b3 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -114,7 +114,7 @@ struct cpu_evtchn_s { static __initdata struct cpu_evtchn_s init_evtchn_mask = { .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul, }; -static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask; +static struct cpu_evtchn_s __refdata *cpu_evtchn_mask_p = &init_evtchn_mask; static inline unsigned long *cpu_evtchn_mask(int cpu) { @@ -277,7 +277,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) BUG_ON(irq == -1); #ifdef CONFIG_SMP - cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); + cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); #endif clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); @@ -294,7 +294,7 @@ static void init_evtchn_cpu_bindings(void) /* By default all event channels notify CPU#0. */ for_each_irq_desc(i, desc) { - cpumask_copy(desc->affinity, cpumask_of(0)); + cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); } #endif @@ -376,81 +376,69 @@ static void unmask_evtchn(int port) put_cpu(); } -static int get_nr_hw_irqs(void) +static int xen_allocate_irq_dynamic(void) { - int ret = 1; + int first = 0; + int irq; #ifdef CONFIG_X86_IO_APIC - ret = get_nr_irqs_gsi(); + /* + * For an HVM guest or domain 0 which see "real" (emulated or + * actual repectively) GSIs we allocate dynamic IRQs + * e.g. those corresponding to event channels or MSIs + * etc. from the range above those "real" GSIs to avoid + * collisions. + */ + if (xen_initial_domain() || xen_hvm_domain()) + first = get_nr_irqs_gsi(); #endif - return ret; -} +retry: + irq = irq_alloc_desc_from(first, -1); -static int find_unbound_pirq(int type) -{ - int rc, i; - struct physdev_get_free_pirq op_get_free_pirq; - op_get_free_pirq.type = type; + if (irq == -ENOMEM && first > NR_IRQS_LEGACY) { + printk(KERN_ERR "Out of dynamic IRQ space and eating into GSI space. You should increase nr_irqs\n"); + first = max(NR_IRQS_LEGACY, first - NR_IRQS_LEGACY); + goto retry; + } - rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); - if (!rc) - return op_get_free_pirq.pirq; + if (irq < 0) + panic("No available IRQ to bind to: increase nr_irqs!\n"); - for (i = 0; i < nr_irqs; i++) { - if (pirq_to_irq[i] < 0) - return i; - } - return -1; + return irq; } -static int find_unbound_irq(void) +static int xen_allocate_irq_gsi(unsigned gsi) { - struct irq_data *data; - int irq, res; - int bottom = get_nr_hw_irqs(); - int top = nr_irqs-1; - - if (bottom == nr_irqs) - goto no_irqs; + int irq; - /* This loop starts from the top of IRQ space and goes down. - * We need this b/c if we have a PCI device in a Xen PV guest - * we do not have an IO-APIC (though the backend might have them) - * mapped in. To not have a collision of physical IRQs with the Xen - * event channels start at the top of the IRQ space for virtual IRQs. + /* + * A PV guest has no concept of a GSI (since it has no ACPI + * nor access to/knowledge of the physical APICs). Therefore + * all IRQs are dynamically allocated from the entire IRQ + * space. */ - for (irq = top; irq > bottom; irq--) { - data = irq_get_irq_data(irq); - /* only 15->0 have init'd desc; handle irq > 16 */ - if (!data) - break; - if (data->chip == &no_irq_chip) - break; - if (data->chip != &xen_dynamic_chip) - continue; - if (irq_info[irq].type == IRQT_UNBOUND) - return irq; - } - - if (irq == bottom) - goto no_irqs; + if (xen_pv_domain() && !xen_initial_domain()) + return xen_allocate_irq_dynamic(); - res = irq_alloc_desc_at(irq, -1); + /* Legacy IRQ descriptors are already allocated by the arch. */ + if (gsi < NR_IRQS_LEGACY) + return gsi; - if (WARN_ON(res != irq)) - return -1; + irq = irq_alloc_desc_at(gsi, -1); + if (irq < 0) + panic("Unable to allocate to IRQ%d (%d)\n", gsi, irq); return irq; - -no_irqs: - panic("No available IRQ to bind to: increase nr_irqs!\n"); } -static bool identity_mapped_irq(unsigned irq) +static void xen_free_irq(unsigned irq) { - /* identity map all the hardware irqs */ - return irq < get_nr_hw_irqs(); + /* Legacy IRQ descriptors are managed by the arch. */ + if (irq < NR_IRQS_LEGACY) + return; + + irq_free_desc(irq); } static void pirq_unmask_notify(int irq) @@ -486,7 +474,7 @@ static bool probing_irq(int irq) return desc && desc->action == NULL; } -static unsigned int startup_pirq(unsigned int irq) +static unsigned int __startup_pirq(unsigned int irq) { struct evtchn_bind_pirq bind_pirq; struct irq_info *info = info_for_irq(irq); @@ -524,9 +512,15 @@ out: return 0; } -static void shutdown_pirq(unsigned int irq) +static unsigned int startup_pirq(struct irq_data *data) +{ + return __startup_pirq(data->irq); +} + +static void shutdown_pirq(struct irq_data *data) { struct evtchn_close close; + unsigned int irq = data->irq; struct irq_info *info = info_for_irq(irq); int evtchn = evtchn_from_irq(irq); @@ -546,20 +540,20 @@ static void shutdown_pirq(unsigned int irq) info->evtchn = 0; } -static void enable_pirq(unsigned int irq) +static void enable_pirq(struct irq_data *data) { - startup_pirq(irq); + startup_pirq(data); } -static void disable_pirq(unsigned int irq) +static void disable_pirq(struct irq_data *data) { } -static void ack_pirq(unsigned int irq) +static void ack_pirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); - move_native_irq(irq); + move_native_irq(data->irq); if (VALID_EVTCHN(evtchn)) { mask_evtchn(evtchn); @@ -567,23 +561,6 @@ static void ack_pirq(unsigned int irq) } } -static void end_pirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - struct irq_desc *desc = irq_to_desc(irq); - - if (WARN_ON(!desc)) - return; - - if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) == - (IRQ_DISABLED|IRQ_PENDING)) { - shutdown_pirq(irq); - } else if (VALID_EVTCHN(evtchn)) { - unmask_evtchn(evtchn); - pirq_unmask_notify(irq); - } -} - static int find_irq_by_gsi(unsigned gsi) { int irq; @@ -638,14 +615,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) goto out; /* XXX need refcount? */ } - /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore - * we are using the !xen_initial_domain() to drop in the function.*/ - if (identity_mapped_irq(gsi) || (!xen_initial_domain() && - xen_pv_domain())) { - irq = gsi; - irq_alloc_desc_at(irq, -1); - } else - irq = find_unbound_irq(); + irq = xen_allocate_irq_gsi(gsi); set_irq_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, name); @@ -658,7 +628,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) * this in the priv domain. */ if (xen_initial_domain() && HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { - irq_free_desc(irq); + xen_free_irq(irq); irq = -ENOSPC; goto out; } @@ -674,87 +644,46 @@ out: } #ifdef CONFIG_PCI_MSI -#include <linux/msi.h> -#include "../pci/msi.h" - -void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc) +int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) { - spin_lock(&irq_mapping_update_lock); - - if (alloc & XEN_ALLOC_IRQ) { - *irq = find_unbound_irq(); - if (*irq == -1) - goto out; - } - - if (alloc & XEN_ALLOC_PIRQ) { - *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI); - if (*pirq == -1) - goto out; - } + int rc; + struct physdev_get_free_pirq op_get_free_pirq; - set_irq_chip_and_handler_name(*irq, &xen_pirq_chip, - handle_level_irq, name); + op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); - irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0); - pirq_to_irq[*pirq] = *irq; + WARN_ONCE(rc == -ENOSYS, + "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n"); -out: - spin_unlock(&irq_mapping_update_lock); + return rc ? -1 : op_get_free_pirq.pirq; } -int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) +int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, + int pirq, int vector, const char *name) { - int irq = -1; - struct physdev_map_pirq map_irq; - int rc; - int pos; - u32 table_offset, bir; - - memset(&map_irq, 0, sizeof(map_irq)); - map_irq.domid = DOMID_SELF; - map_irq.type = MAP_PIRQ_TYPE_MSI; - map_irq.index = -1; - map_irq.pirq = -1; - map_irq.bus = dev->bus->number; - map_irq.devfn = dev->devfn; - - if (type == PCI_CAP_ID_MSIX) { - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - - pci_read_config_dword(dev, msix_table_offset_reg(pos), - &table_offset); - bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); - - map_irq.table_base = pci_resource_start(dev, bir); - map_irq.entry_nr = msidesc->msi_attrib.entry_nr; - } + int irq, ret; spin_lock(&irq_mapping_update_lock); - irq = find_unbound_irq(); - + irq = xen_allocate_irq_dynamic(); if (irq == -1) goto out; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); - if (rc) { - printk(KERN_WARNING "xen map irq failed %d\n", rc); - - irq_free_desc(irq); - - irq = -1; - goto out; - } - irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index); - set_irq_chip_and_handler_name(irq, &xen_pirq_chip, - handle_level_irq, - (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi"); + handle_level_irq, name); + irq_info[irq] = mk_pirq_info(0, pirq, 0, vector); + pirq_to_irq[pirq] = irq; + ret = irq_set_msi_desc(irq, msidesc); + if (ret < 0) + goto error_irq; out: spin_unlock(&irq_mapping_update_lock); return irq; +error_irq: + spin_unlock(&irq_mapping_update_lock); + xen_free_irq(irq); + return -1; } #endif @@ -779,11 +708,12 @@ int xen_destroy_irq(int irq) printk(KERN_WARNING "unmap irq failed %d\n", rc); goto out; } - pirq_to_irq[info->u.pirq.pirq] = -1; } + pirq_to_irq[info->u.pirq.pirq] = -1; + irq_info[irq] = mk_unbound_info(); - irq_free_desc(irq); + xen_free_irq(irq); out: spin_unlock(&irq_mapping_update_lock); @@ -814,7 +744,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) irq = evtchn_to_irq[evtchn]; if (irq == -1) { - irq = find_unbound_irq(); + irq = xen_allocate_irq_dynamic(); set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, handle_fasteoi_irq, "event"); @@ -839,7 +769,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) irq = per_cpu(ipi_to_irq, cpu)[ipi]; if (irq == -1) { - irq = find_unbound_irq(); + irq = xen_allocate_irq_dynamic(); if (irq < 0) goto out; @@ -875,7 +805,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) irq = per_cpu(virq_to_irq, cpu)[virq]; if (irq == -1) { - irq = find_unbound_irq(); + irq = xen_allocate_irq_dynamic(); set_irq_chip_and_handler_name(irq, &xen_percpu_chip, handle_percpu_irq, "virq"); @@ -934,7 +864,7 @@ static void unbind_from_irq(unsigned int irq) if (irq_info[irq].type != IRQT_UNBOUND) { irq_info[irq] = mk_unbound_info(); - irq_free_desc(irq); + xen_free_irq(irq); } spin_unlock(&irq_mapping_update_lock); @@ -990,7 +920,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, if (irq < 0) return irq; - irqflags |= IRQF_NO_SUSPEND; + irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -1234,11 +1164,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) return 0; } -static int set_affinity_irq(unsigned irq, const struct cpumask *dest) +static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, + bool force) { unsigned tcpu = cpumask_first(dest); - return rebind_irq_to_cpu(irq, tcpu); + return rebind_irq_to_cpu(data->irq, tcpu); } int resend_irq_on_evtchn(unsigned int irq) @@ -1257,35 +1188,35 @@ int resend_irq_on_evtchn(unsigned int irq) return 1; } -static void enable_dynirq(unsigned int irq) +static void enable_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); if (VALID_EVTCHN(evtchn)) unmask_evtchn(evtchn); } -static void disable_dynirq(unsigned int irq) +static void disable_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); if (VALID_EVTCHN(evtchn)) mask_evtchn(evtchn); } -static void ack_dynirq(unsigned int irq) +static void ack_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); - move_masked_irq(irq); + move_masked_irq(data->irq); if (VALID_EVTCHN(evtchn)) unmask_evtchn(evtchn); } -static int retrigger_dynirq(unsigned int irq) +static int retrigger_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(irq); + int evtchn = evtchn_from_irq(data->irq); struct shared_info *sh = HYPERVISOR_shared_info; int ret = 0; @@ -1334,7 +1265,7 @@ static void restore_cpu_pirqs(void) printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); - startup_pirq(irq); + __startup_pirq(irq); } } @@ -1445,7 +1376,6 @@ void xen_poll_irq(int irq) void xen_irq_resume(void) { unsigned int cpu, irq, evtchn; - struct irq_desc *desc; init_evtchn_cpu_bindings(); @@ -1465,66 +1395,48 @@ void xen_irq_resume(void) restore_cpu_ipis(cpu); } - /* - * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These - * are not handled by the IRQ core. - */ - for_each_irq_desc(irq, desc) { - if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND)) - continue; - if (desc->status & IRQ_DISABLED) - continue; - - evtchn = evtchn_from_irq(irq); - if (evtchn == -1) - continue; - - unmask_evtchn(evtchn); - } - restore_cpu_pirqs(); } static struct irq_chip xen_dynamic_chip __read_mostly = { - .name = "xen-dyn", + .name = "xen-dyn", - .disable = disable_dynirq, - .mask = disable_dynirq, - .unmask = enable_dynirq, + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, - .eoi = ack_dynirq, - .set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, + .irq_eoi = ack_dynirq, + .irq_set_affinity = set_affinity_irq, + .irq_retrigger = retrigger_dynirq, }; static struct irq_chip xen_pirq_chip __read_mostly = { - .name = "xen-pirq", + .name = "xen-pirq", - .startup = startup_pirq, - .shutdown = shutdown_pirq, + .irq_startup = startup_pirq, + .irq_shutdown = shutdown_pirq, - .enable = enable_pirq, - .unmask = enable_pirq, + .irq_enable = enable_pirq, + .irq_unmask = enable_pirq, - .disable = disable_pirq, - .mask = disable_pirq, + .irq_disable = disable_pirq, + .irq_mask = disable_pirq, - .ack = ack_pirq, - .end = end_pirq, + .irq_ack = ack_pirq, - .set_affinity = set_affinity_irq, + .irq_set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, + .irq_retrigger = retrigger_dynirq, }; static struct irq_chip xen_percpu_chip __read_mostly = { - .name = "xen-percpu", + .name = "xen-percpu", - .disable = disable_dynirq, - .mask = disable_dynirq, - .unmask = enable_dynirq, + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, - .ack = ack_dynirq, + .irq_ack = ack_dynirq, }; int xen_set_callback_via(uint64_t via) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 24177272bcb..ebb292859b5 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -34,42 +34,38 @@ enum shutdown_state { /* Ignore multiple shutdown requests. */ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; -#ifdef CONFIG_PM_SLEEP -static int xen_hvm_suspend(void *data) -{ - int err; - struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; - int *cancelled = data; - - BUG_ON(!irqs_disabled()); - - err = sysdev_suspend(PMSG_SUSPEND); - if (err) { - printk(KERN_ERR "xen_hvm_suspend: sysdev_suspend failed: %d\n", - err); - return err; - } - - *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); +struct suspend_info { + int cancelled; + unsigned long arg; /* extra hypercall argument */ + void (*pre)(void); + void (*post)(int cancelled); +}; - xen_hvm_post_suspend(*cancelled); +static void xen_hvm_post_suspend(int cancelled) +{ + xen_arch_hvm_post_suspend(cancelled); gnttab_resume(); +} - if (!*cancelled) { - xen_irq_resume(); - xen_console_resume(); - xen_timer_resume(); - } - - sysdev_resume(); +static void xen_pre_suspend(void) +{ + xen_mm_pin_all(); + gnttab_suspend(); + xen_arch_pre_suspend(); +} - return 0; +static void xen_post_suspend(int cancelled) +{ + xen_arch_post_suspend(cancelled); + gnttab_resume(); + xen_mm_unpin_all(); } +#ifdef CONFIG_PM_SLEEP static int xen_suspend(void *data) { + struct suspend_info *si = data; int err; - int *cancelled = data; BUG_ON(!irqs_disabled()); @@ -80,22 +76,20 @@ static int xen_suspend(void *data) return err; } - xen_mm_pin_all(); - gnttab_suspend(); - xen_pre_suspend(); + if (si->pre) + si->pre(); /* * This hypercall returns 1 if suspend was cancelled * or the domain was merely checkpointed, and 0 if it * is resuming in a new domain. */ - *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); + si->cancelled = HYPERVISOR_suspend(si->arg); - xen_post_suspend(*cancelled); - gnttab_resume(); - xen_mm_unpin_all(); + if (si->post) + si->post(si->cancelled); - if (!*cancelled) { + if (!si->cancelled) { xen_irq_resume(); xen_console_resume(); xen_timer_resume(); @@ -109,7 +103,7 @@ static int xen_suspend(void *data) static void do_suspend(void) { int err; - int cancelled = 1; + struct suspend_info si; shutting_down = SHUTDOWN_SUSPEND; @@ -139,20 +133,29 @@ static void do_suspend(void) goto out_resume; } - if (xen_hvm_domain()) - err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0)); - else - err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); + si.cancelled = 1; + + if (xen_hvm_domain()) { + si.arg = 0UL; + si.pre = NULL; + si.post = &xen_hvm_post_suspend; + } else { + si.arg = virt_to_mfn(xen_start_info); + si.pre = &xen_pre_suspend; + si.post = &xen_post_suspend; + } + + err = stop_machine(xen_suspend, &si, cpumask_of(0)); dpm_resume_noirq(PMSG_RESUME); if (err) { printk(KERN_ERR "failed to start xen_suspend: %d\n", err); - cancelled = 1; + si.cancelled = 1; } out_resume: - if (!cancelled) { + if (!si.cancelled) { xen_arch_resume(); xs_resume(); } else @@ -172,12 +175,39 @@ out: } #endif /* CONFIG_PM_SLEEP */ +struct shutdown_handler { + const char *command; + void (*cb)(void); +}; + +static void do_poweroff(void) +{ + shutting_down = SHUTDOWN_POWEROFF; + orderly_poweroff(false); +} + +static void do_reboot(void) +{ + shutting_down = SHUTDOWN_POWEROFF; /* ? */ + ctrl_alt_del(); +} + static void shutdown_handler(struct xenbus_watch *watch, const char **vec, unsigned int len) { char *str; struct xenbus_transaction xbt; int err; + static struct shutdown_handler handlers[] = { + { "poweroff", do_poweroff }, + { "halt", do_poweroff }, + { "reboot", do_reboot }, +#ifdef CONFIG_PM_SLEEP + { "suspend", do_suspend }, +#endif + {NULL, NULL}, + }; + static struct shutdown_handler *handler; if (shutting_down != SHUTDOWN_INVALID) return; @@ -194,7 +224,14 @@ static void shutdown_handler(struct xenbus_watch *watch, return; } - xenbus_write(xbt, "control", "shutdown", ""); + for (handler = &handlers[0]; handler->command; handler++) { + if (strcmp(str, handler->command) == 0) + break; + } + + /* Only acknowledge commands which we are prepared to handle. */ + if (handler->cb) + xenbus_write(xbt, "control", "shutdown", ""); err = xenbus_transaction_end(xbt, 0); if (err == -EAGAIN) { @@ -202,17 +239,8 @@ static void shutdown_handler(struct xenbus_watch *watch, goto again; } - if (strcmp(str, "poweroff") == 0 || - strcmp(str, "halt") == 0) { - shutting_down = SHUTDOWN_POWEROFF; - orderly_poweroff(false); - } else if (strcmp(str, "reboot") == 0) { - shutting_down = SHUTDOWN_POWEROFF; /* ? */ - ctrl_alt_del(); -#ifdef CONFIG_PM_SLEEP - } else if (strcmp(str, "suspend") == 0) { - do_suspend(); -#endif + if (handler->cb) { + handler->cb(); } else { printk(KERN_INFO "Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; @@ -291,27 +319,18 @@ static int shutdown_event(struct notifier_block *notifier, return NOTIFY_DONE; } -static int __init __setup_shutdown_event(void) -{ - /* Delay initialization in the PV on HVM case */ - if (xen_hvm_domain()) - return 0; - - if (!xen_pv_domain()) - return -ENODEV; - - return xen_setup_shutdown_event(); -} - int xen_setup_shutdown_event(void) { static struct notifier_block xenstore_notifier = { .notifier_call = shutdown_event }; + + if (!xen_domain()) + return -ENODEV; register_xenstore_notifier(&xenstore_notifier); return 0; } EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); -subsys_initcall(__setup_shutdown_event); +subsys_initcall(xen_setup_shutdown_event); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index afbe041f42c..319dd0a94d5 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -156,9 +156,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, if (ret) goto out; xenbus_probe(NULL); - ret = xen_setup_shutdown_event(); - if (ret) - goto out; return 0; out: diff --git a/fs/Kconfig b/fs/Kconfig index 3db9caa57ed..7cb53aafac1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,7 +47,7 @@ config FS_POSIX_ACL def_bool n config EXPORTFS - tristate + bool config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT diff --git a/fs/Makefile b/fs/Makefile index a7f7cef0c0c..ba01202844c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o +obj-$(CONFIG_FHANDLE) += fhandle.o + obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f820fa23df..7f78cc78fdd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -729,6 +729,15 @@ struct btrfs_space_info { u64 disk_total; /* total bytes on disk, takes mirrors into account */ + /* + * we bump reservation progress every time we decrement + * bytes_reserved. This way people waiting for reservations + * know something good has happened and they can check + * for progress. The number here isn't to be trusted, it + * just shows reclaim activity + */ + unsigned long reservation_progress; + int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index ff27d7a477b..b4ffad859ad 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, int len = *max_len; int type; - if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || - (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; return 255; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return 255; + } len = BTRFS_FID_SIZE_NON_CONNECTABLE; type = FILEID_BTRFS_WITHOUT_PARENT; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 588ff984987..7b3089b5c2d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3342,15 +3342,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 max_reclaim; u64 reclaimed = 0; long time_left; - int pause = 1; int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; + unsigned long progress; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); reserved = space_info->bytes_reserved; + progress = space_info->reservation_progress; if (reserved == 0) return 0; @@ -3365,31 +3366,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) { - loops = 0; + if (reserved > space_info->bytes_reserved) reclaimed += reserved - space_info->bytes_reserved; - } else { - loops++; - } reserved = space_info->bytes_reserved; spin_unlock(&space_info->lock); + loops++; + if (reserved == 0 || reclaimed >= max_reclaim) break; if (trans && trans->transaction->blocked) return -EAGAIN; - __set_current_state(TASK_INTERRUPTIBLE); - time_left = schedule_timeout(pause); + time_left = schedule_timeout_interruptible(1); /* We were interrupted, exit */ if (time_left) break; - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; + /* we've kicked the IO a few times, if anything has been freed, + * exit. There is no sense in looping here for a long time + * when we really need to commit the transaction, or there are + * just too many writers without enough free space + */ + + if (loops > 3) { + smp_mb(); + if (progress != space_info->reservation_progress) + break; + } } return reclaimed >= to_reclaim; @@ -3612,6 +3618,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; spin_unlock(&space_info->lock); } } @@ -3844,6 +3851,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_reserved -= num_bytes; + sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } @@ -4005,7 +4013,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve = 0; } spin_unlock(&BTRFS_I(inode)->accounting_lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) @@ -4133,6 +4140,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4184,6 +4192,7 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4234,6 +4243,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -4712,6 +4722,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (ret) { spin_lock(&cache->space_info->lock); cache->space_info->bytes_reserved -= buf->len; + cache->space_info->reservation_progress++; spin_unlock(&cache->space_info->lock); } goto out; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fd3f172e94e..714adc4ac4c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3046,17 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - off = extent_map_end(em); - if (off >= max) - end = 1; + u64 offset_in_extent; + + /* break if the extent we found is outside the range */ + if (em->start >= max || extent_map_end(em) < off) + break; + + /* + * get_extent may return an extent that starts before our + * requested range. We have to make sure the ranges + * we return to fiemap always move forward and don't + * overlap, so adjust the offsets here + */ + em_start = max(em->start, off); - em_start = em->start; - em_len = em->len; + /* + * record the offset from the start of the extent + * for adjusting the disk offset below + */ + offset_in_extent = em_start - em->start; em_end = extent_map_end(em); + em_len = em_end - em_start; emflags = em->flags; disko = 0; flags = 0; + /* + * bump off for our next call to get_extent + */ + off = extent_map_end(em); + if (off >= max) + end = 1; + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; @@ -3067,7 +3088,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else { - disko = em->block_start; + disko = em->block_start + offset_in_extent; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7084140d594..f447b783bb8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -70,6 +70,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, /* Flush processor's dcache for this page */ flush_dcache_page(page); + + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. + */ + if (!PageUptodate(page) && copied < count) + copied = 0; + iov_iter_advance(i, copied); write_bytes -= copied; total_copied += copied; @@ -763,6 +776,27 @@ out: } /* + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 + */ +static int prepare_uptodate_page(struct page *page, u64 pos) +{ + int ret = 0; + + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + } + return 0; +} + +/* * this gets pages into the page cache and locks them down, it also properly * waits for data=ordered extents to finish before allowing the pages to be * modified. @@ -777,6 +811,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; int err = 0; + int faili = 0; u64 start_pos; u64 last_pos; @@ -794,15 +829,24 @@ again: for (i = 0; i < num_pages; i++) { pages[i] = grab_cache_page(inode->i_mapping, index + i); if (!pages[i]) { - int c; - for (c = i - 1; c >= 0; c--) { - unlock_page(pages[c]); - page_cache_release(pages[c]); - } - return -ENOMEM; + faili = i - 1; + err = -ENOMEM; + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(pages[i], pos); + if (i == num_pages - 1) + err = prepare_uptodate_page(pages[i], + pos + write_bytes); + if (err) { + page_cache_release(pages[i]); + faili = i - 1; + goto fail; } wait_on_page_writeback(pages[i]); } + err = 0; if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, @@ -842,6 +886,14 @@ again: WARN_ON(!PageLocked(pages[i])); } return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + } static ssize_t btrfs_file_aio_write(struct kiocb *iocb, @@ -851,7 +903,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct page *pinned[2]; struct page **pages = NULL; struct iov_iter i; loff_t *ppos = &iocb->ki_pos; @@ -872,9 +923,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); - pinned[0] = NULL; - pinned[1] = NULL; - start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); @@ -962,32 +1010,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; - /* - * there are lots of better ways to do this, but this code - * makes sure the first and last page in the file range are - * up to date and ready for cow - */ - if ((pos & (PAGE_CACHE_SIZE - 1))) { - pinned[0] = grab_cache_page(inode->i_mapping, first_index); - if (!PageUptodate(pinned[0])) { - ret = btrfs_readpage(NULL, pinned[0]); - BUG_ON(ret); - wait_on_page_locked(pinned[0]); - } else { - unlock_page(pinned[0]); - } - } - if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { - pinned[1] = grab_cache_page(inode->i_mapping, last_index); - if (!PageUptodate(pinned[1])) { - ret = btrfs_readpage(NULL, pinned[1]); - BUG_ON(ret); - wait_on_page_locked(pinned[1]); - } else { - unlock_page(pinned[1]); - } - } - while (iov_iter_count(&i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(&i), @@ -1024,8 +1046,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, &i); - dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + + /* + * if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; + + if (copied == 0) + dirty_pages = 0; + else + dirty_pages = (copied + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; if (num_pages > dirty_pages) { if (copied > 0) @@ -1069,10 +1103,6 @@ out: err = ret; kfree(pages); - if (pinned[0]) - page_cache_release(pinned[0]); - if (pinned[1]) - page_cache_release(pinned[1]); *ppos = pos; /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0efdb65953c..4a0107e1874 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4806,9 +4806,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int err; int drop_inode = 0; - if (inode->i_nlink == 0) - return -ENOENT; - /* do not allow sys_link's with other subvols of the same device */ if (root->objectid != BTRFS_I(inode)->root->objectid) return -EPERM; @@ -4821,10 +4818,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; /* - * 1 item for inode ref + * 2 items for inode and inode ref * 2 items for dir items + * 1 item for parent inode */ - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto fail; @@ -6056,6 +6054,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, if (!skip_sum) { dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); if (!dip->csums) { + kfree(dip); ret = -ENOMEM; goto free_ordered; } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 099a58615b9..ebafa65a29b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -993,7 +993,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; dir = dentry->d_parent->d_inode; diff --git a/fs/compat.c b/fs/compat.c index f6fd0a00e6c..c6d31a3bab8 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * */ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct kstatfs tmp; - error = vfs_statfs(&path, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - path_put(&path); - } + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); return error; } asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) { - struct file * file; struct kstatfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs(&file->f_path, &tmp); + int error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); - fput(file); -out: return error; } @@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct path path; + struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct kstatfs tmp; - error = vfs_statfs(&path, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - path_put(&path); - } + error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); return error; } asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct file * file; struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs(&file->f_path, &tmp); + error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); - fput(file); -out: return error; } @@ -1228,7 +1200,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_readv(file, vec, vlen, &pos); + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = compat_readv(file, vec, vlen, &pos); fput_light(file, fput_needed); return ret; } @@ -1285,7 +1259,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_writev(file, vec, vlen, &pos); + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = compat_writev(file, vec, vlen, &pos); fput_light(file, fput_needed); return ret; } @@ -2308,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd, } #endif /* CONFIG_TIMERFD */ + +#ifdef CONFIG_FHANDLE +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. + */ +asmlinkage long +compat_sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, int flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/fs/dcache.c b/fs/dcache.c index 2a6bd9a4ae9..a39fe47c466 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) __releases(parent->d_lock) __releases(dentry->d_inode->i_lock) { - dentry->d_parent = NULL; list_del(&dentry->d_u.d_child); + /* + * Inform try_to_ascend() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DISCONNECTED; if (parent) spin_unlock(&parent->d_lock); dentry_iput(dentry); @@ -1012,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb) } /* + * This tries to ascend one level of parenthood, but + * we can race with renaming, so we need to re-check + * the parenthood after dropping the lock and check + * that the sequence number still matches. + */ +static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) +{ + struct dentry *new = old->d_parent; + + rcu_read_lock(); + spin_unlock(&old->d_lock); + spin_lock(&new->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (new != old->d_parent || + (old->d_flags & DCACHE_DISCONNECTED) || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&new->d_lock); + new = NULL; + } + rcu_read_unlock(); + return new; +} + + +/* * Search for at least 1 mount point in the dentry's subdirs. * We descend to the next level whenever the d_subdirs * list is non-empty and continue searching. @@ -1066,24 +1099,10 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -1181,24 +1200,10 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -1523,6 +1528,28 @@ struct dentry * d_alloc_root(struct inode * root_inode) } EXPORT_SYMBOL(d_alloc_root); +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (list_empty(&inode->i_dentry)) + return NULL; + alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + __dget(alias); + return alias; +} + +static struct dentry * d_find_any_alias(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + de = __d_find_any_alias(inode); + spin_unlock(&inode->i_lock); + return de; +} + + /** * d_obtain_alias - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for @@ -1552,7 +1579,7 @@ struct dentry *d_obtain_alias(struct inode *inode) if (IS_ERR(inode)) return ERR_CAST(inode); - res = d_find_alias(inode); + res = d_find_any_alias(inode); if (res) goto out_iput; @@ -1565,7 +1592,7 @@ struct dentry *d_obtain_alias(struct inode *inode) spin_lock(&inode->i_lock); - res = __d_find_alias(inode, 0); + res = __d_find_any_alias(inode); if (res) { spin_unlock(&inode->i_lock); dput(tmp); @@ -2920,28 +2947,14 @@ resume: spin_unlock(&dentry->d_lock); } if (this_parent != root) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; + struct dentry *child = this_parent; if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { this_parent->d_flags |= DCACHE_GENOCIDE; this_parent->d_count--; } - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } diff --git a/fs/exec.c b/fs/exec.c index 52a447d9b6a..ba99e1abb1a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct file *file; char *tmp = getname(library); int error = PTR_ERR(tmp); + static const struct open_flags uselib_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; if (IS_ERR(tmp)) goto out; - file = do_filp_open(AT_FDCWD, tmp, - O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0, - MAY_READ | MAY_EXEC | MAY_OPEN); + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) @@ -721,10 +724,13 @@ struct file *open_exec(const char *name) { struct file *file; int err; + static const struct open_flags open_exec_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; - file = do_filp_open(AT_FDCWD, name, - O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0, - MAY_EXEC | MAY_OPEN); + file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW); if (IS_ERR(file)) goto out; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 4b6825740dd..b05acb79613 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid, struct inode * inode = dentry->d_inode; int len = *max_len; int type = FILEID_INO32_GEN; - - if (len < 2 || (connectable && len < 4)) + + if (connectable && (len < 4)) { + *max_len = 4; + return 255; + } else if (len < 2) { + *max_len = 2; return 255; + } len = 2; fid->i32.ino = inode->i_ino; @@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, /* * Try to get any dentry for the given file handle from the filesystem. */ + if (!nop || !nop->fh_to_dentry) + return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); if (!result) result = ERR_PTR(-ESTALE); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index b27ba71810e..561f6925626 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 85c8cc8f247..9cc19a1dea8 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1936,6 +1936,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->s_qcop = &ext3_qctl_operations; sb->dq_op = &ext3_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5485390d32c..e781b7ea563 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f6a318f836b..5977b356a43 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3415,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &ext4_qctl_operations; sb->dq_op = &ext4_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 86753fe10bd..0e277ec4b61 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) struct inode *inode = de->d_inode; u32 ipos_h, ipos_m, ipos_l; - if (len < 5) + if (len < 5) { + *lenp = 5; return 255; /* no room */ + } ipos_h = MSDOS_I(inode)->i_pos >> 8; ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index f88f752babd..adae3fb7451 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* This is not negative dentry. Always valid. */ @@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* diff --git a/fs/fcntl.c b/fs/fcntl.c index cb1026181bd..6c82e5bac03 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; - struct file *file = fget(fildes); + struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd(); @@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, return err; } +static int check_fcntl_cmd(unsigned cmd) +{ + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: + case F_GETFD: + case F_SETFD: + case F_GETFL: + return 1; + } + return 0; +} + SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { struct file *filp; long err = -EBADF; - filp = fget(fd); + filp = fget_raw(fd); if (!filp) goto out; + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + err = security_file_fcntl(filp, cmd, arg); if (err) { fput(filp); @@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, long err; err = -EBADF; - filp = fget(fd); + filp = fget_raw(fd); if (!filp) goto out; + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + err = security_file_fcntl(filp, cmd, arg); if (err) { fput(filp); @@ -808,14 +835,14 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC + __FMODE_EXEC | O_PATH )); fasync_cache = kmem_cache_create("fasync_cache", diff --git a/fs/fhandle.c b/fs/fhandle.c new file mode 100644 index 00000000000..bf93ad2bee0 --- /dev/null +++ b/fs/fhandle.c @@ -0,0 +1,265 @@ +#include <linux/syscalls.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/exportfs.h> +#include <linux/fs_struct.h> +#include <linux/fsnotify.h> +#include <asm/uaccess.h> +#include "internal.h" + +static long do_sys_name_to_handle(struct path *path, + struct file_handle __user *ufh, + int __user *mnt_id) +{ + long retval; + struct file_handle f_handle; + int handle_dwords, handle_bytes; + struct file_handle *handle = NULL; + + /* + * We need t make sure wether the file system + * support decoding of the file handle + */ + if (!path->mnt->mnt_sb->s_export_op || + !path->mnt->mnt_sb->s_export_op->fh_to_dentry) + return -EOPNOTSUPP; + + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return -EFAULT; + + if (f_handle.handle_bytes > MAX_HANDLE_SZ) + return -EINVAL; + + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) + return -ENOMEM; + + /* convert handle size to multiple of sizeof(u32) */ + handle_dwords = f_handle.handle_bytes >> 2; + + /* we ask for a non connected handle */ + retval = exportfs_encode_fh(path->dentry, + (struct fid *)handle->f_handle, + &handle_dwords, 0); + handle->handle_type = retval; + /* convert handle size to bytes */ + handle_bytes = handle_dwords * sizeof(u32); + handle->handle_bytes = handle_bytes; + if ((handle->handle_bytes > f_handle.handle_bytes) || + (retval == 255) || (retval == -ENOSPC)) { + /* As per old exportfs_encode_fh documentation + * we could return ENOSPC to indicate overflow + * But file system returned 255 always. So handle + * both the values + */ + /* + * set the handle size to zero so we copy only + * non variable part of the file_handle + */ + handle_bytes = 0; + retval = -EOVERFLOW; + } else + retval = 0; + /* copy the mount id */ + if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) || + copy_to_user(ufh, handle, + sizeof(struct file_handle) + handle_bytes)) + retval = -EFAULT; + kfree(handle); + return retval; +} + +/** + * sys_name_to_handle_at: convert name to handle + * @dfd: directory relative to which name is interpreted if not absolute + * @name: name that should be converted to handle. + * @handle: resulting file handle + * @mnt_id: mount id of the file system containing the file + * @flag: flag value to indicate whether to follow symlink or not + * + * @handle->handle_size indicate the space available to store the + * variable part of the file handle in bytes. If there is not + * enough space, the field is updated to return the minimum + * value required. + */ +SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, + struct file_handle __user *, handle, int __user *, mnt_id, + int, flag) +{ + struct path path; + int lookup_flags; + int err; + + if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + err = user_path_at(dfd, name, lookup_flags, &path); + if (!err) { + err = do_sys_name_to_handle(&path, handle, mnt_id); + path_put(&path); + } + return err; +} + +static struct vfsmount *get_vfsmount_from_fd(int fd) +{ + struct path path; + + if (fd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + spin_lock(&fs->lock); + path = fs->pwd; + mntget(path.mnt); + spin_unlock(&fs->lock); + } else { + int fput_needed; + struct file *file = fget_light(fd, &fput_needed); + if (!file) + return ERR_PTR(-EBADF); + path = file->f_path; + mntget(path.mnt); + fput_light(file, fput_needed); + } + return path.mnt; +} + +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) +{ + return 1; +} + +static int do_handle_to_path(int mountdirfd, struct file_handle *handle, + struct path *path) +{ + int retval = 0; + int handle_dwords; + + path->mnt = get_vfsmount_from_fd(mountdirfd); + if (IS_ERR(path->mnt)) { + retval = PTR_ERR(path->mnt); + goto out_err; + } + /* change the handle size to multiple of sizeof(u32) */ + handle_dwords = handle->handle_bytes >> 2; + path->dentry = exportfs_decode_fh(path->mnt, + (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + vfs_dentry_acceptable, NULL); + if (IS_ERR(path->dentry)) { + retval = PTR_ERR(path->dentry); + goto out_mnt; + } + return 0; +out_mnt: + mntput(path->mnt); +out_err: + return retval; +} + +static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, + struct path *path) +{ + int retval = 0; + struct file_handle f_handle; + struct file_handle *handle = NULL; + + /* + * With handle we don't look at the execute bit on the + * the directory. Ideally we would like CAP_DAC_SEARCH. + * But we don't have that + */ + if (!capable(CAP_DAC_READ_SEARCH)) { + retval = -EPERM; + goto out_err; + } + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { + retval = -EFAULT; + goto out_err; + } + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) { + retval = -EINVAL; + goto out_err; + } + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) { + retval = -ENOMEM; + goto out_err; + } + /* copy the full handle */ + if (copy_from_user(handle, ufh, + sizeof(struct file_handle) + + f_handle.handle_bytes)) { + retval = -EFAULT; + goto out_handle; + } + + retval = do_handle_to_path(mountdirfd, handle, path); + +out_handle: + kfree(handle); +out_err: + return retval; +} + +long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag) +{ + long retval = 0; + struct path path; + struct file *file; + int fd; + + retval = handle_to_path(mountdirfd, ufh, &path); + if (retval) + return retval; + + fd = get_unused_fd_flags(open_flag); + if (fd < 0) { + path_put(&path); + return fd; + } + file = file_open_root(path.dentry, path.mnt, "", open_flag); + if (IS_ERR(file)) { + put_unused_fd(fd); + retval = PTR_ERR(file); + } else { + retval = fd; + fsnotify_open(file); + fd_install(fd, file); + } + path_put(&path); + return retval; +} + +/** + * sys_open_by_handle_at: Open the file handle + * @mountdirfd: directory file descriptor + * @handle: file handle to be opened + * @flag: open flags. + * + * @mountdirfd indicate the directory file descriptor + * of the mount point. file handle is decoded relative + * to the vfsmount pointed by the @mountdirfd. @flags + * value is same as the open(2) flags. + */ +SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, + int, flags) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_handle_open(mountdirfd, handle, flags); + return ret; +} diff --git a/fs/file_table.c b/fs/file_table.c index eb36b6b17e2..74a9544ac77 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -276,11 +276,10 @@ struct file *fget(unsigned int fd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!atomic_long_inc_not_zero(&file->f_count)) { - /* File object ref couldn't be taken */ - rcu_read_unlock(); - return NULL; - } + /* File object ref couldn't be taken */ + if (file->f_mode & FMODE_PATH || + !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; } rcu_read_unlock(); @@ -289,6 +288,25 @@ struct file *fget(unsigned int fd) EXPORT_SYMBOL(fget); +struct file *fget_raw(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (!atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +EXPORT_SYMBOL(fget_raw); + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * @@ -313,6 +331,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed) *fput_needed = 0; if (atomic_read(&files->count) == 1) { file = fcheck_files(files, fd); + if (file && (file->f_mode & FMODE_PATH)) + file = NULL; + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (!(file->f_mode & FMODE_PATH) && + atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} + +struct file *fget_raw_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); } else { rcu_read_lock(); file = fcheck_files(files, fd); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 83543b5ff94..8bd0ef9286c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -158,7 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) { struct inode *inode; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; inode = entry->d_inode; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9e3f68cc1bd..051b1a08452 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, u64 nodeid; u32 generation; - if (*max_len < len) + if (*max_len < len) { + *max_len = len; return 255; + } nodeid = get_fuse_inode(inode)->nodeid; generation = inode->i_generation; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 4a456338b87..0da8da2c991 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) int error; int had_lock = 0; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; parent = dget_parent(dentry); diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9023db8184f..b5a5e60df0d 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, struct super_block *sb = inode->i_sb; struct gfs2_inode *ip = GFS2_I(inode); - if (*len < GFS2_SMALL_FH_SIZE || - (connectable && *len < GFS2_LARGE_FH_SIZE)) + if (connectable && (*len < GFS2_LARGE_FH_SIZE)) { + *len = GFS2_LARGE_FH_SIZE; return 255; + } else if (*len < GFS2_SMALL_FH_SIZE) { + *len = GFS2_SMALL_FH_SIZE; + return 255; + } fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); diff --git a/fs/internal.h b/fs/internal.h index 9b976b57d7f..f3d15de44b1 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -106,6 +106,19 @@ extern void put_super(struct super_block *sb); struct nameidata; extern struct file *nameidata_to_filp(struct nameidata *); extern void release_open_intent(struct nameidata *); +struct open_flags { + int open_flag; + int mode; + int acc_mode; + int intent; +}; +extern struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int lookup_flags); +extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, + const char *, const struct open_flags *, int lookup_flags); + +extern long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag); /* * inode.c diff --git a/fs/isofs/export.c b/fs/isofs/export.c index ed752cb3847..dd4687ff30d 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry, * offset of the inode and the upper 16 bits of fh32[1] to * hold the offset of the parent. */ - - if (len < 3 || (connectable && len < 5)) + if (connectable && (len < 5)) { + *max_len = 5; + return 255; + } else if (len < 3) { + *max_len = 3; return 255; + } len = 3; fh32[0] = ei->i_iget5_block; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 81ead850ddb..3f04a180493 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == JFS_LINK_MAX) return -EMLINK; - if (ip->i_nlink == 0) - return -ENOENT; - dquot_initialize(dir); tid = txBegin(ip->i_sb, 0); @@ -1600,7 +1597,7 @@ out: static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* * This is not negative dentry. Always valid. diff --git a/fs/namei.c b/fs/namei.c index 0087cf9c2c6..0a601cae23d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page) return retval; } -char * getname(const char __user * filename) +static char *getname_flags(const char __user * filename, int flags) { char *tmp, *result; @@ -147,14 +147,21 @@ char * getname(const char __user * filename) result = tmp; if (retval < 0) { - __putname(tmp); - result = ERR_PTR(retval); + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(tmp); + result = ERR_PTR(retval); + } } } audit_getname(result); return result; } +char *getname(const char __user * filename) +{ + return getname_flags(filename, 0); +} + #ifdef CONFIG_AUDITSYSCALL void putname(const char *name) { @@ -401,9 +408,11 @@ static int nameidata_drop_rcu(struct nameidata *nd) { struct fs_struct *fs = current->fs; struct dentry *dentry = nd->path.dentry; + int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; spin_lock(&fs->lock); if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) @@ -414,7 +423,7 @@ static int nameidata_drop_rcu(struct nameidata *nd) goto err; BUG_ON(nd->inode != dentry->d_inode); spin_unlock(&dentry->d_lock); - if (nd->root.mnt) { + if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); } @@ -427,7 +436,7 @@ static int nameidata_drop_rcu(struct nameidata *nd) err: spin_unlock(&dentry->d_lock); err_root: - if (nd->root.mnt) + if (want_root) spin_unlock(&fs->lock); return -ECHILD; } @@ -454,9 +463,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; + int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; spin_lock(&fs->lock); if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) @@ -476,7 +487,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry parent->d_count++; spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - if (nd->root.mnt) { + if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); } @@ -490,7 +501,7 @@ err: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); err_root: - if (nd->root.mnt) + if (want_root) spin_unlock(&fs->lock); return -ECHILD; } @@ -498,8 +509,16 @@ err_root: /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) { - if (nd->flags & LOOKUP_RCU) - return nameidata_dentry_drop_rcu(nd, dentry); + if (nd->flags & LOOKUP_RCU) { + if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; + } + } return 0; } @@ -518,7 +537,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; spin_lock(&dentry->d_lock); if (!__d_rcu_to_refcount(dentry, nd->seq)) goto err_unlock; @@ -539,14 +559,6 @@ err_unlock: return -ECHILD; } -/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ -static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) -{ - if (likely(nd->flags & LOOKUP_RCU)) - return nameidata_drop_rcu_last(nd); - return 0; -} - /** * release_open_intent - free up open intent resources * @nd: pointer to nameidata @@ -590,42 +602,8 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) return dentry; } -static inline struct dentry * -do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd) -{ - int status = d_revalidate(dentry, nd); - if (likely(status > 0)) - return dentry; - if (status == -ECHILD) { - if (nameidata_dentry_drop_rcu(nd, dentry)) - return ERR_PTR(-ECHILD); - return do_revalidate(dentry, nd); - } - if (status < 0) - return ERR_PTR(status); - /* Don't d_invalidate in rcu-walk mode */ - if (nameidata_dentry_drop_rcu(nd, dentry)) - return ERR_PTR(-ECHILD); - if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - return dentry; -} - -static inline int need_reval_dot(struct dentry *dentry) -{ - if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) - return 0; - - if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) - return 0; - - return 1; -} - /* - * force_reval_path - force revalidation of a dentry + * handle_reval_path - force revalidation of a dentry * * In some situations the path walking code will trust dentries without * revalidating them. This causes problems for filesystems that depend on @@ -639,27 +617,28 @@ static inline int need_reval_dot(struct dentry *dentry) * invalidate the dentry. It's up to the caller to handle putting references * to the path if necessary. */ -static int -force_reval_path(struct path *path, struct nameidata *nd) +static inline int handle_reval_path(struct nameidata *nd) { + struct dentry *dentry = nd->path.dentry; int status; - struct dentry *dentry = path->dentry; - /* - * only check on filesystems where it's possible for the dentry to - * become stale. - */ - if (!need_reval_dot(dentry)) + if (likely(!(nd->flags & LOOKUP_JUMPED))) + return 0; + + if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) + return 0; + + if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) return 0; + /* Note: we do not d_invalidate() */ status = d_revalidate(dentry, nd); if (status > 0) return 0; - if (!status) { - d_invalidate(dentry); + if (!status) status = -ESTALE; - } + return status; } @@ -728,6 +707,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l path_put(&nd->path); nd->path = nd->root; path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; } nd->inode = nd->path.dentry->d_inode; @@ -757,20 +737,44 @@ static inline void path_to_nameidata(const struct path *path, nd->path.dentry = path->dentry; } +static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +{ + struct inode *inode = link->dentry->d_inode; + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(link->dentry, nd, cookie); + path_put(link); +} + static __always_inline int -__do_follow_link(const struct path *link, struct nameidata *nd, void **p) +follow_link(struct path *link, struct nameidata *nd, void **p) { int error; struct dentry *dentry = link->dentry; BUG_ON(nd->flags & LOOKUP_RCU); + if (unlikely(current->total_link_count >= 40)) { + *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ + path_put_conditional(link, nd); + path_put(&nd->path); + return -ELOOP; + } + cond_resched(); + current->total_link_count++; + touch_atime(link->mnt, dentry); nd_set_link(nd, NULL); if (link->mnt == nd->path.mnt) mntget(link->mnt); + error = security_inode_follow_link(link->dentry, nd); + if (error) { + *p = ERR_PTR(error); /* no ->put_link(), please */ + path_put(&nd->path); + return error; + } + nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(*p); @@ -780,56 +784,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) if (s) error = __vfs_follow_link(nd, s); else if (nd->last_type == LAST_BIND) { - error = force_reval_path(&nd->path, nd); - if (error) + nd->flags |= LOOKUP_JUMPED; + nd->inode = nd->path.dentry->d_inode; + if (nd->inode->i_op->follow_link) { + /* stepped on a _really_ weird one */ path_put(&nd->path); + error = -ELOOP; + } } } return error; } -/* - * This limits recursive symlink follows to 8, while - * limiting consecutive symlinks to 40. - * - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ -static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd) -{ - void *cookie; - int err = -ELOOP; - - /* We drop rcu-walk here */ - if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) - return -ECHILD; - BUG_ON(inode != path->dentry->d_inode); - - if (current->link_count >= MAX_NESTED_LINKS) - goto loop; - if (current->total_link_count >= 40) - goto loop; - BUG_ON(nd->depth >= MAX_NESTED_LINKS); - cond_resched(); - err = security_inode_follow_link(path->dentry, nd); - if (err) - goto loop; - current->link_count++; - current->total_link_count++; - nd->depth++; - err = __do_follow_link(path, nd, &cookie); - if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link) - path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie); - path_put(path); - current->link_count--; - nd->depth--; - return err; -loop: - path_put_conditional(path, nd); - path_put(&nd->path); - return err; -} - static int follow_up_rcu(struct path *path) { struct vfsmount *parent; @@ -1068,7 +1034,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) - return -ECHILD; + goto failed; inode = parent->d_inode; nd->path.dentry = parent; nd->seq = seq; @@ -1081,8 +1047,15 @@ static int follow_dotdot_rcu(struct nameidata *nd) } __follow_mount_rcu(nd, &nd->path, &inode, true); nd->inode = inode; - return 0; + +failed: + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; } /* @@ -1216,68 +1189,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; - struct inode *dir; + int need_reval = 1; + int status = 1; int err; /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - err = parent->d_op->d_hash(parent, nd->inode, name); - if (err < 0) - return err; - } - - /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, we're going to * do the non-racy lookup, below. */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - *inode = nd->inode; dentry = __d_lookup_rcu(parent, name, &seq, inode); - if (!dentry) { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - goto need_lookup; - } + if (!dentry) + goto unlazy; + /* Memory barrier in read_seqcount_begin of child is enough */ if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return -ECHILD; - nd->seq = seq; + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - dentry = do_revalidate_rcu(dentry, nd); - if (!dentry) - goto need_lookup; - if (IS_ERR(dentry)) - goto fail; - if (!(nd->flags & LOOKUP_RCU)) - goto done; + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status != -ECHILD) + need_reval = 0; + goto unlazy; + } } path->mnt = mnt; path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, false))) return 0; - if (nameidata_drop_rcu(nd)) - return -ECHILD; - /* fallthru */ +unlazy: + if (dentry) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return -ECHILD; + } else { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } + } else { + dentry = __d_lookup(parent, name); } - dentry = __d_lookup(parent, name); - if (!dentry) - goto need_lookup; -found: - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - dentry = do_revalidate(dentry, nd); - if (!dentry) - goto need_lookup; - if (IS_ERR(dentry)) - goto fail; + +retry: + if (unlikely(!dentry)) { + struct inode *dir = parent->d_inode; + BUG_ON(nd->inode != dir); + + mutex_lock(&dir->i_mutex); + dentry = d_lookup(parent, name); + if (likely(!dentry)) { + dentry = d_alloc_and_lookup(parent, name, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); + } + /* known good */ + need_reval = 0; + status = 1; + } + mutex_unlock(&dir->i_mutex); } -done: + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status < 0) { + dput(dentry); + return status; + } + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + need_reval = 1; + goto retry; + } + } + path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1287,39 +1277,113 @@ done: } *inode = path->dentry->d_inode; return 0; +} -need_lookup: - dir = parent->d_inode; - BUG_ON(nd->inode != dir); +static inline int may_lookup(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + int err = exec_permission(nd->inode, IPERM_FLAG_RCU); + if (err != -ECHILD) + return err; + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } + return exec_permission(nd->inode, 0); +} - mutex_lock(&dir->i_mutex); - /* - * First re-do the cached lookup just in case it was created - * while we waited for the directory semaphore, or the first - * lookup failed due to an unrelated rename. - * - * This could use version numbering or similar to avoid unnecessary - * cache lookups, but then we'd have to do the first lookup in the - * non-racy way. However in the common case here, everything should - * be hot in cache, so would it be a big win? - */ - dentry = d_lookup(parent, name); - if (likely(!dentry)) { - dentry = d_alloc_and_lookup(parent, name, nd); - mutex_unlock(&dir->i_mutex); - if (IS_ERR(dentry)) - goto fail; - goto done; +static inline int handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); + } + return 0; +} + +static void terminate_walk(struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + path_put(&nd->path); + } else { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); } +} + +static inline int walk_component(struct nameidata *nd, struct path *path, + struct qstr *name, int type, int follow) +{ + struct inode *inode; + int err; /* - * Uhhuh! Nasty case: the cache was re-populated while - * we waited on the semaphore. Need to revalidate. + * "." and ".." are special - ".." especially so because it has + * to be able to know about the current root directory and + * parent relationships. */ - mutex_unlock(&dir->i_mutex); - goto found; + if (unlikely(type != LAST_NORM)) + return handle_dots(nd, type); + err = do_lookup(nd, name, path, &inode); + if (unlikely(err)) { + terminate_walk(nd); + return err; + } + if (!inode) { + path_to_nameidata(path, nd); + terminate_walk(nd); + return -ENOENT; + } + if (unlikely(inode->i_op->follow_link) && follow) { + if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) + return -ECHILD; + BUG_ON(inode != path->dentry->d_inode); + return 1; + } + path_to_nameidata(path, nd); + nd->inode = inode; + return 0; +} -fail: - return PTR_ERR(dentry); +/* + * This limits recursive symlink follows to 8, while + * limiting consecutive symlinks to 40. + * + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. + */ +static inline int nested_symlink(struct path *path, struct nameidata *nd) +{ + int res; + + BUG_ON(nd->depth >= MAX_NESTED_LINKS); + if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { + path_put_conditional(path, nd); + path_put(&nd->path); + return -ELOOP; + } + + nd->depth++; + current->link_count++; + + do { + struct path link = *path; + void *cookie; + + res = follow_link(&link, nd, &cookie); + if (!res) + res = walk_component(nd, path, &nd->last, + nd->last_type, LOOKUP_FOLLOW); + put_link(nd, &link, cookie); + } while (res > 0); + + current->link_count--; + nd->depth--; + return res; } /* @@ -1339,30 +1403,18 @@ static int link_path_walk(const char *name, struct nameidata *nd) while (*name=='/') name++; if (!*name) - goto return_reval; - - if (nd->depth) - lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); + return 0; /* At this point we know we have a real path component. */ for(;;) { - struct inode *inode; unsigned long hash; struct qstr this; unsigned int c; + int type; nd->flags |= LOOKUP_CONTINUE; - if (nd->flags & LOOKUP_RCU) { - err = exec_permission(nd->inode, IPERM_FLAG_RCU); - if (err == -ECHILD) { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - goto exec_again; - } - } else { -exec_again: - err = exec_permission(nd->inode, 0); - } + + err = may_lookup(nd); if (err) break; @@ -1378,52 +1430,43 @@ exec_again: this.len = name - (const char *) this.name; this.hash = end_name_hash(hash); + type = LAST_NORM; + if (this.name[0] == '.') switch (this.len) { + case 2: + if (this.name[1] == '.') { + type = LAST_DOTDOT; + nd->flags |= LOOKUP_JUMPED; + } + break; + case 1: + type = LAST_DOT; + } + if (likely(type == LAST_NORM)) { + struct dentry *parent = nd->path.dentry; + nd->flags &= ~LOOKUP_JUMPED; + if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + err = parent->d_op->d_hash(parent, nd->inode, + &this); + if (err < 0) + break; + } + } + /* remove trailing slashes? */ if (!c) goto last_component; while (*++name == '/'); if (!*name) - goto last_with_slashes; + goto last_component; - /* - * "." and ".." are special - ".." especially so because it has - * to be able to know about the current root directory and - * parent relationships. - */ - if (this.name[0] == '.') switch (this.len) { - default: - break; - case 2: - if (this.name[1] != '.') - break; - if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); - /* fallthrough */ - case 1: - continue; - } - /* This does the actual lookups.. */ - err = do_lookup(nd, &this, &next, &inode); - if (err) - break; - err = -ENOENT; - if (!inode) - goto out_dput; + err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); + if (err < 0) + return err; - if (inode->i_op->follow_link) { - err = do_follow_link(inode, &next, nd); + if (err) { + err = nested_symlink(&next, nd); if (err) - goto return_err; - nd->inode = nd->path.dentry->d_inode; - err = -ENOENT; - if (!nd->inode) - break; - } else { - path_to_nameidata(&next, nd); - nd->inode = inode; + return err; } err = -ENOTDIR; if (!nd->inode->i_op->lookup) @@ -1431,209 +1474,109 @@ exec_again: continue; /* here ends the main loop */ -last_with_slashes: - lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; last_component: /* Clear LOOKUP_CONTINUE iff it was previously unset */ nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; - if (lookup_flags & LOOKUP_PARENT) - goto lookup_parent; - if (this.name[0] == '.') switch (this.len) { - default: - break; - case 2: - if (this.name[1] != '.') - break; - if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); - /* fallthrough */ - case 1: - goto return_reval; - } - err = do_lookup(nd, &this, &next, &inode); - if (err) - break; - if (inode && unlikely(inode->i_op->follow_link) && - (lookup_flags & LOOKUP_FOLLOW)) { - err = do_follow_link(inode, &next, nd); - if (err) - goto return_err; - nd->inode = nd->path.dentry->d_inode; - } else { - path_to_nameidata(&next, nd); - nd->inode = inode; - } - err = -ENOENT; - if (!nd->inode) - break; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; - if (!nd->inode->i_op->lookup) - break; - } - goto return_base; -lookup_parent: nd->last = this; - nd->last_type = LAST_NORM; - if (this.name[0] != '.') - goto return_base; - if (this.len == 1) - nd->last_type = LAST_DOT; - else if (this.len == 2 && this.name[1] == '.') - nd->last_type = LAST_DOTDOT; - else - goto return_base; -return_reval: - /* - * We bypassed the ordinary revalidation routines. - * We may need to check the cached dentry for staleness. - */ - if (need_reval_dot(nd->path.dentry)) { - if (nameidata_drop_rcu_last_maybe(nd)) - return -ECHILD; - /* Note: we do not d_invalidate() */ - err = d_revalidate(nd->path.dentry, nd); - if (!err) - err = -ESTALE; - if (err < 0) - break; - return 0; - } -return_base: - if (nameidata_drop_rcu_last_maybe(nd)) - return -ECHILD; + nd->last_type = type; return 0; -out_dput: - if (!(nd->flags & LOOKUP_RCU)) - path_put_conditional(&next, nd); - break; } - if (!(nd->flags & LOOKUP_RCU)) - path_put(&nd->path); -return_err: + terminate_walk(nd); return err; } -static inline int path_walk_rcu(const char *name, struct nameidata *nd) -{ - current->total_link_count = 0; - - return link_path_walk(name, nd); -} - -static inline int path_walk_simple(const char *name, struct nameidata *nd) -{ - current->total_link_count = 0; - - return link_path_walk(name, nd); -} - -static int path_walk(const char *name, struct nameidata *nd) -{ - struct path save = nd->path; - int result; - - current->total_link_count = 0; - - /* make sure the stuff we saved doesn't go away */ - path_get(&save); - - result = link_path_walk(name, nd); - if (result == -ESTALE) { - /* nd->path had been dropped */ - current->total_link_count = 0; - nd->path = save; - path_get(&nd->path); - nd->flags |= LOOKUP_REVAL; - result = link_path_walk(name, nd); - } - - path_put(&save); - - return result; -} - -static void path_finish_rcu(struct nameidata *nd) -{ - if (nd->flags & LOOKUP_RCU) { - /* RCU dangling. Cancel it. */ - nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - } - if (nd->file) - fput(nd->file); -} - -static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +static int path_init(int dfd, const char *name, unsigned int flags, + struct nameidata *nd, struct file **fp) { int retval = 0; int fput_needed; struct file *file; nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_RCU; + nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; + if (flags & LOOKUP_ROOT) { + struct inode *inode = nd->root.dentry->d_inode; + if (*name) { + if (!inode->i_op->lookup) + return -ENOTDIR; + retval = inode_permission(inode, MAY_EXEC); + if (retval) + return retval; + } + nd->path = nd->root; + nd->inode = inode; + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } else { + path_get(&nd->path); + } + return 0; + } + nd->root.mnt = NULL; - nd->file = NULL; if (*name=='/') { - struct fs_struct *fs = current->fs; - unsigned seq; - - br_read_lock(vfsmount_lock); - rcu_read_lock(); - - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->path = nd->root; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + set_root_rcu(nd); + } else { + set_root(nd); + path_get(&nd->root); + } + nd->path = nd->root; } else if (dfd == AT_FDCWD) { - struct fs_struct *fs = current->fs; - unsigned seq; - - br_read_lock(vfsmount_lock); - rcu_read_lock(); + if (flags & LOOKUP_RCU) { + struct fs_struct *fs = current->fs; + unsigned seq; - do { - seq = read_seqcount_begin(&fs->seq); - nd->path = fs->pwd; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + do { + seq = read_seqcount_begin(&fs->seq); + nd->path = fs->pwd; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_pwd(current->fs, &nd->path); + } } else { struct dentry *dentry; - file = fget_light(dfd, &fput_needed); + file = fget_raw_light(dfd, &fput_needed); retval = -EBADF; if (!file) goto out_fail; dentry = file->f_path.dentry; - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; + if (*name) { + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_fail; - retval = file_permission(file, MAY_EXEC); - if (retval) - goto fput_fail; + retval = file_permission(file, MAY_EXEC); + if (retval) + goto fput_fail; + } nd->path = file->f_path; - if (fput_needed) - nd->file = file; - - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - br_read_lock(vfsmount_lock); - rcu_read_lock(); + if (flags & LOOKUP_RCU) { + if (fput_needed) + *fp = file; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + } else { + path_get(&file->f_path); + fput_light(file, fput_needed); + } } + nd->inode = nd->path.dentry->d_inode; return 0; @@ -1643,60 +1586,23 @@ out_fail: return retval; } -static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +static inline int lookup_last(struct nameidata *nd, struct path *path) { - int retval = 0; - int fput_needed; - struct file *file; + if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags; - nd->depth = 0; - nd->root.mnt = NULL; - - if (*name=='/') { - set_root(nd); - nd->path = nd->root; - path_get(&nd->root); - } else if (dfd == AT_FDCWD) { - get_fs_pwd(current->fs, &nd->path); - } else { - struct dentry *dentry; - - file = fget_light(dfd, &fput_needed); - retval = -EBADF; - if (!file) - goto out_fail; - - dentry = file->f_path.dentry; - - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; - - retval = file_permission(file, MAY_EXEC); - if (retval) - goto fput_fail; - - nd->path = file->f_path; - path_get(&file->f_path); - - fput_light(file, fput_needed); - } - nd->inode = nd->path.dentry->d_inode; - return 0; - -fput_fail: - fput_light(file, fput_needed); -out_fail: - return retval; + nd->flags &= ~LOOKUP_PARENT; + return walk_component(nd, path, &nd->last, nd->last_type, + nd->flags & LOOKUP_FOLLOW); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ -static int do_path_lookup(int dfd, const char *name, +static int path_lookupat(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - int retval; + struct file *base = NULL; + struct path path; + int err; /* * Path walking is largely split up into 2 different synchronisation @@ -1712,44 +1618,75 @@ static int do_path_lookup(int dfd, const char *name, * be handled by restarting a traditional ref-walk (which will always * be able to complete). */ - retval = path_init_rcu(dfd, name, flags, nd); - if (unlikely(retval)) - return retval; - retval = path_walk_rcu(name, nd); - path_finish_rcu(nd); - if (nd->root.mnt) { - path_put(&nd->root); - nd->root.mnt = NULL; + err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); + + if (unlikely(err)) + return err; + + current->total_link_count = 0; + err = link_path_walk(name, nd); + + if (!err && !(flags & LOOKUP_PARENT)) { + err = lookup_last(nd, &path); + while (err > 0) { + void *cookie; + struct path link = path; + nd->flags |= LOOKUP_PARENT; + err = follow_link(&link, nd, &cookie); + if (!err) + err = lookup_last(nd, &path); + put_link(nd, &link, cookie); + } } - if (unlikely(retval == -ECHILD || retval == -ESTALE)) { - /* slower, locked walk */ - if (retval == -ESTALE) - flags |= LOOKUP_REVAL; - retval = path_init(dfd, name, flags, nd); - if (unlikely(retval)) - return retval; - retval = path_walk(name, nd); - if (nd->root.mnt) { - path_put(&nd->root); - nd->root.mnt = NULL; + if (nd->flags & LOOKUP_RCU) { + /* went all way through without dropping RCU */ + BUG_ON(err); + if (nameidata_drop_rcu_last(nd)) + err = -ECHILD; + } + + if (!err) + err = handle_reval_path(nd); + + if (!err && nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) { + path_put(&nd->path); + return -ENOTDIR; } } + if (base) + fput(base); + + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + return err; +} + +static int do_path_lookup(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + if (unlikely(retval == -ECHILD)) + retval = path_lookupat(dfd, name, flags, nd); + if (unlikely(retval == -ESTALE)) + retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); + if (likely(!retval)) { if (unlikely(!audit_dummy_context())) { if (nd->path.dentry && nd->inode) audit_inode(name, nd->path.dentry); } } - return retval; } -int path_lookup(const char *name, unsigned int flags, - struct nameidata *nd) +int kern_path_parent(const char *name, struct nameidata *nd) { - return do_path_lookup(AT_FDCWD, name, flags, nd); + return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd); } int kern_path(const char *name, unsigned int flags, struct path *path) @@ -1773,29 +1710,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct nameidata *nd) { - int retval; - - /* same as do_path_lookup */ - nd->last_type = LAST_ROOT; - nd->flags = flags; - nd->depth = 0; - - nd->path.dentry = dentry; - nd->path.mnt = mnt; - path_get(&nd->path); - nd->root = nd->path; - path_get(&nd->root); - nd->inode = nd->path.dentry->d_inode; - - retval = path_walk(name, nd); - if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && - nd->inode)) - audit_inode(name, nd->path.dentry); - - path_put(&nd->root); - nd->root.mnt = NULL; - - return retval; + nd->root.dentry = dentry; + nd->root.mnt = mnt; + /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ + return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd); } static struct dentry *__lookup_hash(struct qstr *name, @@ -1810,17 +1728,6 @@ static struct dentry *__lookup_hash(struct qstr *name, return ERR_PTR(err); /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (base->d_flags & DCACHE_OP_HASH) { - err = base->d_op->d_hash(base, inode, name); - dentry = ERR_PTR(err); - if (err < 0) - goto out; - } - - /* * Don't bother with __d_lookup: callers are for creat as * well as unlink, so a lot of the time it would cost * a double lookup. @@ -1832,7 +1739,7 @@ static struct dentry *__lookup_hash(struct qstr *name, if (!dentry) dentry = d_alloc_and_lookup(base, name, nd); -out: + return dentry; } @@ -1846,28 +1753,6 @@ static struct dentry *lookup_hash(struct nameidata *nd) return __lookup_hash(&nd->last, nd->path.dentry, nd); } -static int __lookup_one_len(const char *name, struct qstr *this, - struct dentry *base, int len) -{ - unsigned long hash; - unsigned int c; - - this->name = name; - this->len = len; - if (!len) - return -EACCES; - - hash = init_name_hash(); - while (len--) { - c = *(const unsigned char *)name++; - if (c == '/' || c == '\0') - return -EACCES; - hash = partial_name_hash(c, hash); - } - this->hash = end_name_hash(hash); - return 0; -} - /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup @@ -1881,14 +1766,34 @@ static int __lookup_one_len(const char *name, struct qstr *this, */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { - int err; struct qstr this; + unsigned long hash; + unsigned int c; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); - err = __lookup_one_len(name, &this, base, len); - if (err) - return ERR_PTR(err); + this.name = name; + this.len = len; + if (!len) + return ERR_PTR(-EACCES); + + hash = init_name_hash(); + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return ERR_PTR(-EACCES); + hash = partial_name_hash(c, hash); + } + this.hash = end_name_hash(hash); + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, base->d_inode, &this); + if (err < 0) + return ERR_PTR(err); + } return __lookup_hash(&this, base, NULL); } @@ -1897,7 +1802,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { struct nameidata nd; - char *tmp = getname(name); + char *tmp = getname_flags(name, flags); int err = PTR_ERR(tmp); if (!IS_ERR(tmp)) { @@ -2077,12 +1982,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, return error; } -int may_open(struct path *path, int acc_mode, int flag) +static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; + /* O_PATH? */ + if (!acc_mode) + return 0; + if (!inode) return -ENOENT; @@ -2151,34 +2060,6 @@ static int handle_truncate(struct file *filp) } /* - * Be careful about ever adding any more callers of this - * function. Its flags must be in the namei format, not - * what get passed to sys_open(). - */ -static int __open_namei_create(struct nameidata *nd, struct path *path, - int open_flag, int mode) -{ - int error; - struct dentry *dir = nd->path.dentry; - - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); - error = security_path_mknod(&nd->path, path->dentry, mode, 0); - if (error) - goto out_unlock; - error = vfs_create(dir->d_inode, path->dentry, mode, nd); -out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); - dput(nd->path.dentry); - nd->path.dentry = path->dentry; - - if (error) - return error; - /* Don't check for write permission, don't truncate */ - return may_open(&nd->path, 0, open_flag & ~O_TRUNC); -} - -/* * Note that while the flag value (low two bits) for sys_open means: * 00 - read-only * 01 - write-only @@ -2202,126 +2083,115 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int open_will_truncate(int flag, struct inode *inode) -{ - /* - * We'll never write to the fs underlying - * a device file. - */ - if (special_file(inode->i_mode)) - return 0; - return (flag & O_TRUNC); -} - -static struct file *finish_open(struct nameidata *nd, - int open_flag, int acc_mode) -{ - struct file *filp; - int will_truncate; - int error; - - will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode); - if (will_truncate) { - error = mnt_want_write(nd->path.mnt); - if (error) - goto exit; - } - error = may_open(&nd->path, acc_mode, open_flag); - if (error) { - if (will_truncate) - mnt_drop_write(nd->path.mnt); - goto exit; - } - filp = nameidata_to_filp(nd); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - if (!IS_ERR(filp)) { - if (will_truncate) { - error = handle_truncate(filp); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - } - /* - * It is now safe to drop the mnt write - * because the filp has had a write taken - * on its behalf. - */ - if (will_truncate) - mnt_drop_write(nd->path.mnt); - path_put(&nd->path); - return filp; - -exit: - path_put(&nd->path); - return ERR_PTR(error); -} - /* - * Handle O_CREAT case for do_filp_open + * Handle the last step of open() */ static struct file *do_last(struct nameidata *nd, struct path *path, - int open_flag, int acc_mode, - int mode, const char *pathname) + const struct open_flags *op, const char *pathname) { struct dentry *dir = nd->path.dentry; + struct dentry *dentry; + int open_flag = op->open_flag; + int will_truncate = open_flag & O_TRUNC; + int want_write = 0; + int acc_mode = op->acc_mode; struct file *filp; - int error = -EISDIR; + int error; + + nd->flags &= ~LOOKUP_PARENT; + nd->flags |= op->intent; switch (nd->last_type) { case LAST_DOTDOT: - follow_dotdot(nd); - dir = nd->path.dentry; case LAST_DOT: - if (need_reval_dot(dir)) { - int status = d_revalidate(nd->path.dentry, nd); - if (!status) - status = -ESTALE; - if (status < 0) { - error = status; - goto exit; - } - } + error = handle_dots(nd, nd->last_type); + if (error) + return ERR_PTR(error); /* fallthrough */ case LAST_ROOT: - goto exit; + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + error = handle_reval_path(nd); + if (error) + goto exit; + audit_inode(pathname, nd->path.dentry); + if (open_flag & O_CREAT) { + error = -EISDIR; + goto exit; + } + goto ok; case LAST_BIND: + /* can't be RCU mode here */ + error = handle_reval_path(nd); + if (error) + goto exit; audit_inode(pathname, dir); goto ok; } + if (!(open_flag & O_CREAT)) { + int symlink_ok = 0; + if (nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) + symlink_ok = 1; + /* we _can_ be in RCU mode here */ + error = walk_component(nd, path, &nd->last, LAST_NORM, + !symlink_ok); + if (error < 0) + return ERR_PTR(error); + if (error) /* symlink */ + return NULL; + /* sayonara */ + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + + error = -ENOTDIR; + if (nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) + goto exit; + } + audit_inode(pathname, nd->path.dentry); + goto ok; + } + + /* create side of things */ + + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + + audit_inode(pathname, dir); + error = -EISDIR; /* trailing slashes? */ if (nd->last.name[nd->last.len]) goto exit; mutex_lock(&dir->d_inode->i_mutex); - path->dentry = lookup_hash(nd); - path->mnt = nd->path.mnt; - - error = PTR_ERR(path->dentry); - if (IS_ERR(path->dentry)) { + dentry = lookup_hash(nd); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { mutex_unlock(&dir->d_inode->i_mutex); goto exit; } - if (IS_ERR(nd->intent.open.file)) { - error = PTR_ERR(nd->intent.open.file); - goto exit_mutex_unlock; - } + path->dentry = dentry; + path->mnt = nd->path.mnt; /* Negative dentry, just create the file */ - if (!path->dentry->d_inode) { + if (!dentry->d_inode) { + int mode = op->mode; + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); /* * This write is needed to ensure that a - * ro->rw transition does not occur between + * rw->ro transition does not occur between * the time when the file is created and when * a permanent write count is taken through * the 'struct file' in nameidata_to_filp(). @@ -2329,22 +2199,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = mnt_want_write(nd->path.mnt); if (error) goto exit_mutex_unlock; - error = __open_namei_create(nd, path, open_flag, mode); - if (error) { - mnt_drop_write(nd->path.mnt); - goto exit; - } - filp = nameidata_to_filp(nd); - mnt_drop_write(nd->path.mnt); - path_put(&nd->path); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - return filp; + want_write = 1; + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + will_truncate = 0; + acc_mode = MAY_OPEN; + error = security_path_mknod(&nd->path, dentry, mode, 0); + if (error) + goto exit_mutex_unlock; + error = vfs_create(dir->d_inode, dentry, mode, nd); + if (error) + goto exit_mutex_unlock; + mutex_unlock(&dir->d_inode->i_mutex); + dput(nd->path.dentry); + nd->path.dentry = dentry; + goto common; } /* @@ -2374,7 +2243,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: - filp = finish_open(nd, open_flag, acc_mode); + if (!S_ISREG(nd->inode->i_mode)) + will_truncate = 0; + + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit; + want_write = 1; + } +common: + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto exit; + filp = nameidata_to_filp(nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, op->acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (will_truncate) { + error = handle_truncate(filp); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } +out: + if (want_write) + mnt_drop_write(nd->path.mnt); + path_put(&nd->path); return filp; exit_mutex_unlock: @@ -2382,197 +2284,103 @@ exit_mutex_unlock: exit_dput: path_put_conditional(path, nd); exit: - path_put(&nd->path); - return ERR_PTR(error); + filp = ERR_PTR(error); + goto out; } -/* - * Note that the low bits of the passed in "open_flag" - * are not the same as in the local variable "flag". See - * open_to_namei_flags() for more details. - */ -struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode, int acc_mode) +static struct file *path_openat(int dfd, const char *pathname, + struct nameidata *nd, const struct open_flags *op, int flags) { + struct file *base = NULL; struct file *filp; - struct nameidata nd; - int error; struct path path; - int count = 0; - int flag = open_to_namei_flags(open_flag); - int flags; - - if (!(open_flag & O_CREAT)) - mode = 0; - - /* Must never be set by userspace */ - open_flag &= ~FMODE_NONOTIFY; - - /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. - */ - if (open_flag & __O_SYNC) - open_flag |= O_DSYNC; - - if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(open_flag); - - /* O_TRUNC implies we need access checks for write permissions */ - if (open_flag & O_TRUNC) - acc_mode |= MAY_WRITE; - - /* Allow the LSM permission hook to distinguish append - access from general write access. */ - if (open_flag & O_APPEND) - acc_mode |= MAY_APPEND; - - flags = LOOKUP_OPEN; - if (open_flag & O_CREAT) { - flags |= LOOKUP_CREATE; - if (open_flag & O_EXCL) - flags |= LOOKUP_EXCL; - } - if (open_flag & O_DIRECTORY) - flags |= LOOKUP_DIRECTORY; - if (!(open_flag & O_NOFOLLOW)) - flags |= LOOKUP_FOLLOW; + int error; filp = get_empty_filp(); if (!filp) return ERR_PTR(-ENFILE); - filp->f_flags = open_flag; - nd.intent.open.file = filp; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = mode; - - if (open_flag & O_CREAT) - goto creat; + filp->f_flags = op->open_flag; + nd->intent.open.file = filp; + nd->intent.open.flags = open_to_namei_flags(op->open_flag); + nd->intent.open.create_mode = op->mode; - /* !O_CREAT, simple open */ - error = do_path_lookup(dfd, pathname, flags, &nd); + error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); if (unlikely(error)) goto out_filp; - error = -ELOOP; - if (!(nd.flags & LOOKUP_FOLLOW)) { - if (nd.inode->i_op->follow_link) - goto out_path; - } - error = -ENOTDIR; - if (nd.flags & LOOKUP_DIRECTORY) { - if (!nd.inode->i_op->lookup) - goto out_path; - } - audit_inode(pathname, nd.path.dentry); - filp = finish_open(&nd, open_flag, acc_mode); - release_open_intent(&nd); - return filp; -creat: - /* OK, have to create the file. Find the parent. */ - error = path_init_rcu(dfd, pathname, - LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); - if (error) - goto out_filp; - error = path_walk_rcu(pathname, &nd); - path_finish_rcu(&nd); - if (unlikely(error == -ECHILD || error == -ESTALE)) { - /* slower, locked walk */ - if (error == -ESTALE) { -reval: - flags |= LOOKUP_REVAL; - } - error = path_init(dfd, pathname, - LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); - if (error) - goto out_filp; - - error = path_walk_simple(pathname, &nd); - } + current->total_link_count = 0; + error = link_path_walk(pathname, nd); if (unlikely(error)) goto out_filp; - if (unlikely(!audit_dummy_context())) - audit_inode(pathname, nd.path.dentry); - /* - * We have the parent and last component. - */ - nd.flags = flags; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); + filp = do_last(nd, &path, op, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; - struct inode *linki = link.dentry->d_inode; void *cookie; - error = -ELOOP; - if (!(nd.flags & LOOKUP_FOLLOW)) - goto exit_dput; - if (count++ == 32) - goto exit_dput; - /* - * This is subtle. Instead of calling do_follow_link() we do - * the thing by hands. The reason is that this way we have zero - * link_count and path_walk() (called from ->follow_link) - * honoring LOOKUP_PARENT. After that we have the parent and - * last component, i.e. we are in the same situation as after - * the first path_walk(). Well, almost - if the last component - * is normal we get its copy stored in nd->last.name and we will - * have to putname() it when we are done. Procfs-like symlinks - * just set LAST_BIND. - */ - nd.flags |= LOOKUP_PARENT; - error = security_inode_follow_link(link.dentry, &nd); - if (error) - goto exit_dput; - error = __do_follow_link(&link, &nd, &cookie); - if (unlikely(error)) { - if (!IS_ERR(cookie) && linki->i_op->put_link) - linki->i_op->put_link(link.dentry, &nd, cookie); - /* nd.path had been dropped */ - nd.path = link; - goto out_path; + if (!(nd->flags & LOOKUP_FOLLOW)) { + path_put_conditional(&path, nd); + path_put(&nd->path); + filp = ERR_PTR(-ELOOP); + break; } - nd.flags &= ~LOOKUP_PARENT; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (linki->i_op->put_link) - linki->i_op->put_link(link.dentry, &nd, cookie); - path_put(&link); + nd->flags |= LOOKUP_PARENT; + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + error = follow_link(&link, nd, &cookie); + if (unlikely(error)) + filp = ERR_PTR(error); + else + filp = do_last(nd, &path, op, pathname); + put_link(nd, &link, cookie); } out: - if (nd.root.mnt) - path_put(&nd.root); - if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) - goto reval; - release_open_intent(&nd); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) + path_put(&nd->root); + if (base) + fput(base); + release_open_intent(nd); return filp; -exit_dput: - path_put_conditional(&path, &nd); -out_path: - path_put(&nd.path); out_filp: filp = ERR_PTR(error); goto out; } -/** - * filp_open - open file and return file pointer - * - * @filename: path to open - * @flags: open flags as per the open(2) second argument - * @mode: mode for the new file if O_CREAT is set, else ignored - * - * This is the helper to open a file from kernelspace if you really - * have to. But in generally you should not do this, so please move - * along, nothing to see here.. - */ -struct file *filp_open(const char *filename, int flags, int mode) +struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int flags) +{ + struct nameidata nd; + struct file *filp; + + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + if (unlikely(filp == ERR_PTR(-ECHILD))) + filp = path_openat(dfd, pathname, &nd, op, flags); + if (unlikely(filp == ERR_PTR(-ESTALE))) + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + return filp; +} + +struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *name, const struct open_flags *op, int flags) { - return do_filp_open(AT_FDCWD, filename, flags, mode, 0); + struct nameidata nd; + struct file *file; + + nd.root.mnt = mnt; + nd.root.dentry = dentry; + + flags |= LOOKUP_ROOT; + + if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + return ERR_PTR(-ELOOP); + + file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); + if (unlikely(file == ERR_PTR(-ECHILD))) + file = path_openat(-1, name, &nd, op, flags); + if (unlikely(file == ERR_PTR(-ESTALE))) + file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); + return file; } -EXPORT_SYMBOL(filp_open); /** * lookup_create - lookup a dentry, creating it if it doesn't exist @@ -3111,7 +2919,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return error; mutex_lock(&inode->i_mutex); - error = dir->i_op->link(old_dentry, dir, new_dentry); + /* Make sure we don't allow creating hardlink to an unlinked file */ + if (inode->i_nlink == 0) + error = -ENOENT; + else + error = dir->i_op->link(old_dentry, dir, new_dentry); mutex_unlock(&inode->i_mutex); if (!error) fsnotify_link(dir, inode, new_dentry); @@ -3133,15 +2945,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, struct dentry *new_dentry; struct nameidata nd; struct path old_path; + int how = 0; int error; char *to; - if ((flags & ~AT_SYMLINK_FOLLOW) != 0) + if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; + /* + * To use null names we require CAP_DAC_READ_SEARCH + * This ensures that not everyone will be able to create + * handlink using the passed filedescriptor. + */ + if (flags & AT_EMPTY_PATH) { + if (!capable(CAP_DAC_READ_SEARCH)) + return -ENOENT; + how = LOOKUP_EMPTY; + } + + if (flags & AT_SYMLINK_FOLLOW) + how |= LOOKUP_FOLLOW; - error = user_path_at(olddfd, oldname, - flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, - &old_path); + error = user_path_at(olddfd, oldname, how, &old_path); if (error) return error; @@ -3578,7 +3402,7 @@ EXPORT_SYMBOL(page_readlink); EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(path_lookup); +EXPORT_SYMBOL(kern_path_parent); EXPORT_SYMBOL(kern_path); EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); diff --git a/fs/namespace.c b/fs/namespace.c index d1edf26025d..dffe6f49ab9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1002,6 +1002,18 @@ const struct seq_operations mounts_op = { .show = show_vfsmnt }; +static int uuid_is_nil(u8 *uuid) +{ + int i; + u8 *cp = (u8 *)uuid; + + for (i = 0; i < 16; i++) { + if (*cp++) + return 0; + } + return 1; +} + static int show_mountinfo(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; @@ -1040,6 +1052,10 @@ static int show_mountinfo(struct seq_file *m, void *v) if (IS_MNT_UNBINDABLE(mnt)) seq_puts(m, " unbindable"); + if (!uuid_is_nil(mnt->mnt_sb->s_uuid)) + /* print the uuid */ + seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid); + /* Filesystem specific data */ seq_puts(m, " - "); show_type(m, sb); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1cc600e77bb..2f8e61816d7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -37,6 +37,7 @@ #include <linux/inet.h> #include <linux/nfs_xdr.h> #include <linux/slab.h> +#include <linux/compat.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word) */ u64 nfs_compat_user_ino64(u64 fileid) { - int ino; +#ifdef CONFIG_COMPAT + compat_ulong_t ino; +#else + unsigned long ino; +#endif if (enable_ino64) return fileid; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 7a747407314..1be36cf65bf 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -298,6 +298,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); #if defined(CONFIG_NFS_V4_1) struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); +extern void nfs4_schedule_session_recovery(struct nfs4_session *); +#else +static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ +} #endif /* CONFIG_NFS_V4_1 */ extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); @@ -307,10 +312,9 @@ extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); -extern void nfs4_schedule_state_recovery(struct nfs_client *); +extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); -extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); -extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state); +extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index f5c9b125e8c..b73c34375f6 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -219,6 +219,10 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) goto out_err; } buf = kmalloc(rlen + 1, GFP_KERNEL); + if (!buf) { + dprintk("%s: Not enough memory\n", __func__); + goto out_err; + } buf[rlen] = '\0'; memcpy(buf, r_addr, rlen); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1ff76acc7e9..0a07e353a96 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -51,7 +51,6 @@ #include <linux/sunrpc/bc_xprt.h> #include <linux/xattr.h> #include <linux/utsname.h> -#include <linux/mm.h> #include "nfs4_fs.h" #include "delegation.h" @@ -257,12 +256,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_state_mark_reclaim_nograce(clp, state); - goto do_state_recovery; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: - goto do_state_recovery; + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -273,7 +273,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR: %d Reset session\n", __func__, errorcode); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_session_recovery(clp->cl_session); exception->retry = 1; break; #endif /* defined(CONFIG_NFS_V4_1) */ @@ -296,8 +296,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, } /* We failed to handle the error */ return nfs4_map_errors(ret); -do_state_recovery: - nfs4_schedule_state_recovery(clp); +wait_on_recovery: ret = nfs4_wait_clnt_recover(clp); if (ret == 0) exception->retry = 1; @@ -436,8 +435,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * clp = res->sr_session->clp; do_renew_lease(clp, timestamp); /* Check sequence flags */ - if (atomic_read(&clp->cl_count) > 1) - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (res->sr_status_flags != 0) + nfs4_schedule_lease_recovery(clp); break; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and @@ -1256,14 +1255,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_state_recovery( - server->nfs_client); + nfs4_schedule_session_recovery(server->nfs_client->cl_session); goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ - nfs4_schedule_state_recovery(server->nfs_client); + nfs4_schedule_lease_recovery(server->nfs_client); goto out; case -ERESTARTSYS: /* @@ -1272,7 +1270,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - nfs4_state_mark_reclaim_nograce(server->nfs_client, state); + nfs4_schedule_stateid_recovery(server, state); case -EKEYEXPIRED: /* * User RPCSEC_GSS context has expired. @@ -1588,7 +1586,7 @@ static int nfs4_recover_expired_lease(struct nfs_server *server) if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) break; - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); ret = -EIO; } return ret; @@ -3179,7 +3177,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) if (task->tk_status < 0) { /* Unless we're shutting down, schedule state recovery! */ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); return; } do_renew_lease(clp, timestamp); @@ -3262,7 +3260,7 @@ static int buf_to_pages_noslab(const void *buf, size_t buflen, spages = pages; do { - len = min(PAGE_CACHE_SIZE, buflen); + len = min_t(size_t, PAGE_CACHE_SIZE, buflen); newpage = alloc_page(GFP_KERNEL); if (newpage == NULL) @@ -3504,12 +3502,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_state_mark_reclaim_nograce(clp, state); - goto do_state_recovery; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: - goto do_state_recovery; + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -3520,7 +3519,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_session_recovery(clp->cl_session); task->tk_status = 0; return -EAGAIN; #endif /* CONFIG_NFS_V4_1 */ @@ -3537,9 +3536,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, } task->tk_status = nfs4_map_errors(task->tk_status); return 0; -do_state_recovery: +wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); - nfs4_schedule_state_recovery(clp); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); task->tk_status = 0; @@ -4150,7 +4148,7 @@ static void nfs4_lock_release(void *calldata) task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, data->arg.lock_seqid); if (!IS_ERR(task)) - rpc_put_task(task); + rpc_put_task_async(task); dprintk("%s: cancelling lock!\n", __func__); } else nfs_free_seqid(data->arg.lock_seqid); @@ -4174,23 +4172,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = { static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) { - struct nfs_client *clp = server->nfs_client; - struct nfs4_state *state = lsp->ls_state; - switch (error) { case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; if (new_lock_owner != 0 || (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) - nfs4_state_mark_reclaim_nograce(clp, state); - lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + nfs4_schedule_stateid_recovery(server, lsp->ls_state); break; case -NFS4ERR_STALE_STATEID: - if (new_lock_owner != 0 || - (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) - nfs4_state_mark_reclaim_reboot(clp, state); lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + case -NFS4ERR_EXPIRED: + nfs4_schedule_lease_recovery(server->nfs_client); }; } @@ -4406,12 +4399,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_EXPIRED: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + nfs4_schedule_lease_recovery(server->nfs_client); + goto out; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_state_recovery(server->nfs_client); + nfs4_schedule_session_recovery(server->nfs_client->cl_session); goto out; case -ERESTARTSYS: /* @@ -4421,7 +4416,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OPENMODE: - nfs4_state_mark_reclaim_nograce(server->nfs_client, state); + nfs4_schedule_stateid_recovery(server, state); err = 0; goto out; case -EKEYEXPIRED: @@ -5028,10 +5023,20 @@ int nfs4_proc_create_session(struct nfs_client *clp) int status; unsigned *ptr; struct nfs4_session *session = clp->cl_session; + long timeout = 0; + int err; dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); - status = _nfs4_proc_create_session(clp); + do { + status = _nfs4_proc_create_session(clp); + if (status == -NFS4ERR_DELAY) { + err = nfs4_delay(clp->cl_rpcclient, &timeout); + if (err) + status = err; + } + } while (status == -NFS4ERR_DELAY); + if (status) goto out; @@ -5140,7 +5145,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); } return 0; } @@ -5227,7 +5232,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if (IS_ERR(task)) ret = PTR_ERR(task); else - rpc_put_task(task); + rpc_put_task_async(task); dprintk("<-- %s status=%d\n", __func__, ret); return ret; } @@ -5243,8 +5248,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) goto out; } ret = rpc_wait_for_completion_task(task); - if (!ret) + if (!ret) { + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + + if (task->tk_status == 0) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ret = task->tk_status; + } rpc_put_task(task); out: dprintk("<-- %s status=%d\n", __func__, ret); @@ -5281,7 +5291,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); } return 0; } @@ -5349,6 +5359,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) status = PTR_ERR(task); goto out; } + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; rpc_put_task(task); return 0; out: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e6742b57a04..0592288f9f0 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1007,9 +1007,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) } /* - * Schedule a state recovery attempt + * Schedule a lease recovery attempt */ -void nfs4_schedule_state_recovery(struct nfs_client *clp) +void nfs4_schedule_lease_recovery(struct nfs_client *clp) { if (!clp) return; @@ -1018,7 +1018,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } -int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1032,7 +1032,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st return 1; } -int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1041,6 +1041,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s return 1; } +void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs_client *clp = server->nfs_client; + + nfs4_state_mark_reclaim_nograce(clp, state); + nfs4_schedule_state_manager(clp); +} + static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) { struct inode *inode = state->inode; @@ -1436,10 +1444,15 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) } #ifdef CONFIG_NFS_V4_1 +void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ + nfs4_schedule_lease_recovery(session->clp); +} + void nfs41_handle_recall_slot(struct nfs_client *clp) { set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } static void nfs4_reset_all_state(struct nfs_client *clp) @@ -1447,7 +1460,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp) if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { clp->cl_boot_time = CURRENT_TIME; nfs4_state_start_reclaim_nograce(clp); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } } @@ -1455,7 +1468,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { nfs4_state_start_reclaim_reboot(clp); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } } @@ -1475,7 +1488,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp) { nfs_expire_all_delegations(clp); if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4e2c168b6ee..94d50e86a12 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1660,7 +1660,7 @@ static void encode_create_session(struct xdr_stream *xdr, p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); *p++ = cpu_to_be32(OP_CREATE_SESSION); - p = xdr_encode_hyper(p, clp->cl_ex_clid); + p = xdr_encode_hyper(p, clp->cl_clientid); *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ *p++ = cpu_to_be32(args->flags); /*flags */ @@ -4694,7 +4694,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - xdr_decode_hyper(p, &clp->cl_ex_clid); + xdr_decode_hyper(p, &clp->cl_clientid); p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 903908a2002..c541093a5bf 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -86,11 +86,14 @@ /* Default path we try to mount. "%s" gets replaced by our IP address */ #define NFS_ROOT "/tftpboot/%s" +/* Default NFSROOT mount options. */ +#define NFS_DEF_OPTIONS "udp" + /* Parameters passed from the kernel command line */ static char nfs_root_parms[256] __initdata = ""; /* Text-based mount options passed to super.c */ -static char nfs_root_options[256] __initdata = ""; +static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS; /* Address of NFS server */ static __be32 servaddr __initdata = htonl(INADDR_NONE); @@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src, } static int __init root_nfs_cat(char *dest, const char *src, - const size_t destlen) + const size_t destlen) { + size_t len = strlen(dest); + + if (len && dest[len - 1] != ',') + if (strlcat(dest, ",", destlen) > destlen) + return -1; + if (strlcat(dest, src, destlen) > destlen) return -1; return 0; @@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath, if (root_nfs_cat(nfs_root_options, incoming, sizeof(nfs_root_options))) return -1; - - /* - * Possibly prepare for more options to be appended - */ - if (nfs_root_options[0] != '\0' && - nfs_root_options[strlen(nfs_root_options)] != ',') - if (root_nfs_cat(nfs_root_options, ",", - sizeof(nfs_root_options))) - return -1; - return 0; } @@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath, */ static int __init root_nfs_data(char *cmdline) { - char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; + char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; int len, retval = -1; char *tmp = NULL; const size_t tmplen = sizeof(nfs_export_path); @@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline) * Append mandatory options for nfsroot so they override * what has come before */ - snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", + snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4", &servaddr); - if (root_nfs_cat(nfs_root_options, addr_option, + if (root_nfs_cat(nfs_root_options, mand_options, sizeof(nfs_root_options))) goto out_optionstoolong; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index e313a51acdd..6481d537d69 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -180,7 +180,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n task_setup_data.rpc_client = NFS_CLIENT(dir); task = rpc_run_task(&task_setup_data); if (!IS_ERR(task)) - rpc_put_task(task); + rpc_put_task_async(task); return 1; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c8278f4046c..42b92d7a9cc 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1292,6 +1292,8 @@ static int nfs_commit_rpcsetup(struct list_head *head, task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); rpc_put_task(task); return 0; } diff --git a/fs/nfsctl.c b/fs/nfsctl.c index bf9cbd242dd..124e8fcb0dd 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -22,30 +22,17 @@ static struct file *do_open(char *name, int flags) { - struct nameidata nd; struct vfsmount *mnt; - int error; + struct file *file; mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); if (IS_ERR(mnt)) return (struct file *)mnt; - error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); - mntput(mnt); /* drop do_kern_mount reference */ - if (error) - return ERR_PTR(error); - - if (flags == O_RDWR) - error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags); - else - error = may_open(&nd.path, MAY_WRITE, flags); + file = file_open_root(mnt->mnt_root, mnt, name, flags); - if (!error) - return dentry_open(nd.path.dentry, nd.path.mnt, flags, - current_cred()); - - path_put(&nd.path); - return ERR_PTR(error); + mntput(mnt); /* drop do_kern_mount reference */ + return file; } static struct { diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index cde36cb0f34..02eb4edf0ec 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes. */ - p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); if (unlikely(p == NULL)) goto out_overflow; memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 54b60bfceb8..7b566ec14e1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2445,15 +2445,16 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) static struct nfs4_delegation * find_delegation_file(struct nfs4_file *fp, stateid_t *stid) { - struct nfs4_delegation *dp = NULL; + struct nfs4_delegation *dp; spin_lock(&recall_lock); - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { - if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) - break; - } + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { + spin_unlock(&recall_lock); + return dp; + } spin_unlock(&recall_lock); - return dp; + return NULL; } int share_access_to_flags(u32 share_access) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1275b865507..615f0a9f060 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1142,7 +1142,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, u32 dummy; char *machine_name; - int i; + int i, j; int nr_secflavs; READ_BUF(16); @@ -1215,7 +1215,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, READ_BUF(4); READ32(dummy); READ_BUF(dummy * 4); - for (i = 0; i < dummy; ++i) + for (j = 0; j < dummy; ++j) READ32(dummy); break; case RPC_AUTH_GSS: diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 6d80ecc7834..7eb90403fc8 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -56,7 +56,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, int ret = 0; /* if all else fails, just return false */ struct ocfs2_super *osb; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 5dbc3062b4f..254652a9b54 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, dentry->d_name.len, dentry->d_name.name, fh, len, connectable); - if (len < 3 || (connectable && len < 6)) { - mlog(ML_ERROR, "fh buffer is too small for encoding\n"); + if (connectable && (len < 6)) { + *max_len = 6; + type = 255; + goto bail; + } else if (len < 3) { + *max_len = 3; type = 255; goto bail; } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19ebc5aad39..29623da133c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4379,7 +4379,7 @@ static int ocfs2_user_path_parent(const char __user *path, if (IS_ERR(s)) return PTR_ERR(s); - error = path_lookup(s, LOOKUP_PARENT, nd); + error = kern_path_parent(s, nd); if (error) putname(s); else diff --git a/fs/open.c b/fs/open.c index 5a2c6ebc22b..3cac0bda46d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(file->f_mode & FMODE_WRITE)) return -EBADF; + + /* It's not possible punch hole on append only file */ + if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) + return -EPERM; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. @@ -565,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, { struct path path; int error = -EINVAL; - int follow; + int lookup_flags; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; - follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - error = user_path_at(dfd, filename, follow, &path); + lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); @@ -661,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, int (*open)(struct inode *, struct file *), const struct cred *cred) { + static const struct file_operations empty_fops = {}; struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + + if (unlikely(f->f_flags & O_PATH)) + f->f_mode = FMODE_PATH; + inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, mnt); @@ -679,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, f->f_path.dentry = dentry; f->f_path.mnt = mnt; f->f_pos = 0; - f->f_op = fops_get(inode->i_fop); file_sb_list_add(f, inode->i_sb); + if (unlikely(f->f_mode & FMODE_PATH)) { + f->f_op = &empty_fops; + return f; + } + + f->f_op = fops_get(inode->i_fop); + error = security_dentry_open(f, cred); if (error) goto cleanup_all; @@ -882,15 +903,110 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); +static inline int build_open_flags(int flags, int mode, struct open_flags *op) +{ + int lookup_flags = 0; + int acc_mode; + + if (!(flags & O_CREAT)) + mode = 0; + op->mode = mode; + + /* Must never be set by userspace */ + flags &= ~FMODE_NONOTIFY; + + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ + if (flags & O_PATH) { + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + acc_mode = 0; + } else { + acc_mode = MAY_OPEN | ACC_MODE(flags); + } + + op->open_flag = flags; + + /* O_TRUNC implies we need access checks for write permissions */ + if (flags & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. */ + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; + + op->acc_mode = acc_mode; + + op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; + + if (flags & O_CREAT) { + op->intent |= LOOKUP_CREATE; + if (flags & O_EXCL) + op->intent |= LOOKUP_EXCL; + } + + if (flags & O_DIRECTORY) + lookup_flags |= LOOKUP_DIRECTORY; + if (!(flags & O_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + return lookup_flags; +} + +/** + * filp_open - open file and return file pointer + * + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *filp_open(const char *filename, int flags, int mode) +{ + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); + return do_filp_open(AT_FDCWD, filename, &op, lookup); +} +EXPORT_SYMBOL(filp_open); + +struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *filename, int flags) +{ + struct open_flags op; + int lookup = build_open_flags(flags, 0, &op); + if (flags & O_CREAT) + return ERR_PTR(-EINVAL); + if (!filename && (flags & O_DIRECTORY)) + if (!dentry->d_inode->i_op->lookup) + return ERR_PTR(-ENOTDIR); + return do_file_open_root(dentry, mnt, filename, &op, lookup); +} +EXPORT_SYMBOL(file_open_root); + long do_sys_open(int dfd, const char __user *filename, int flags, int mode) { + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); char *tmp = getname(filename); int fd = PTR_ERR(tmp); if (!IS_ERR(tmp)) { fd = get_unused_fd_flags(flags); if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); + struct file *f = do_filp_open(dfd, tmp, &op, lookup); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); @@ -960,8 +1076,10 @@ int filp_close(struct file *filp, fl_owner_t id) if (filp->f_op && filp->f_op->flush) retval = filp->f_op->flush(filp, id); - dnotify_flush(filp, id); - locks_remove_posix(filp, id); + if (likely(!(filp->f_mode & FMODE_PATH))) { + dnotify_flush(filp, id); + locks_remove_posix(filp, id); + } fput(filp); return retval; } diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index 48cec7cbca1..be03a0b08b4 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -10,10 +10,13 @@ #include "check.h" #include "osf.h" +#define MAX_OSF_PARTITIONS 8 + int osf_partition(struct parsed_partitions *state) { int i; int slot = 1; + unsigned int npartitions; Sector sect; unsigned char *data; struct disklabel { @@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state) u8 p_fstype; u8 p_frag; __le16 p_cpg; - } d_partitions[8]; + } d_partitions[MAX_OSF_PARTITIONS]; } * label; struct d_partition * partition; @@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state) put_dev_sector(sect); return 0; } - for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) { + npartitions = le16_to_cpu(label->d_npartitions); + if (npartitions > MAX_OSF_PARTITIONS) { + put_dev_sector(sect); + return 0; + } + for (i = 0 ; i < npartitions; i++, partition++) { if (slot == state->limit) break; if (le32_to_cpu(partition->p_size)) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9d096e82b20..d49c4b5d2c3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2620,35 +2620,6 @@ static const struct pid_entry proc_base_stuff[] = { &proc_self_inode_operations, NULL, {}), }; -/* - * Exceptional case: normally we are not allowed to unhash a busy - * directory. In this case, however, we can do it - no aliasing problems - * due to the way we treat inodes. - */ -static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - struct inode *inode; - struct task_struct *task; - - if (nd->flags & LOOKUP_RCU) - return -ECHILD; - - inode = dentry->d_inode; - task = get_proc_task(inode); - if (task) { - put_task_struct(task); - return 1; - } - d_drop(dentry); - return 0; -} - -static const struct dentry_operations proc_base_dentry_operations = -{ - .d_revalidate = proc_base_revalidate, - .d_delete = pid_delete_dentry, -}; - static struct dentry *proc_base_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -2685,7 +2656,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; - d_set_d_op(dentry, &proc_base_dentry_operations); d_add(dentry, inode); error = NULL; out: diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 176ce4cda68..d6a7ca1fdac 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -27,6 +27,7 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; + struct ctl_table_header *head; truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); @@ -38,8 +39,11 @@ static void proc_evict_inode(struct inode *inode) de = PROC_I(inode)->pde; if (de) pde_put(de); - if (PROC_I(inode)->sysctl) - sysctl_head_put(PROC_I(inode)->sysctl); + head = PROC_I(inode)->sysctl; + if (head) { + rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + sysctl_head_put(head); + } } struct vfsmount *proc_mnt; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 09a1f92a34e..8eb2522111c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -408,15 +408,18 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, const struct inode *inode, unsigned int len, const char *str, const struct qstr *name) { + struct ctl_table_header *head; /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ + /* AV: can it, indeed? */ if (!inode) - return 0; + return 1; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; - return !sysctl_is_seen(PROC_I(inode)->sysctl); + head = rcu_dereference(PROC_I(inode)->sysctl); + return !head || !sysctl_is_seen(head); } static const struct dentry_operations proc_sys_dentry_operations = { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0bae036831e..1bba24bad82 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, struct inode *inode = dentry->d_inode; int maxlen = *lenp; - if (maxlen < 3) + if (need_parent && (maxlen < 5)) { + *lenp = 5; return 255; + } else if (maxlen < 3) { + *lenp = 3; + return 255; + } data[0] = inode->i_ino; data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 68fdf45cc6c..4b2eb564fda 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1122,10 +1122,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, reiserfs_write_unlock(dir->i_sb); return -EMLINK; } - if (inode->i_nlink == 0) { - reiserfs_write_unlock(dir->i_sb); - return -ENOENT; - } /* inc before scheduling so reiserfs_unlink knows we are here */ inc_nlink(inode); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 3cfb2e93364..5c11ca82b78 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) - return -ECHILD; return -EPERM; } diff --git a/fs/stat.c b/fs/stat.c index d5c61cf2b70..961039121cb 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; int lookup_flags = 0; - if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH)) != 0) goto out; if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (flag & AT_NO_AUTOMOUNT) lookup_flags |= LOOKUP_NO_AUTOMOUNT; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, if (bufsiz <= 0) return -EINVAL; - error = user_path_at(dfd, pathname, 0, &path); + error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); if (!error) { struct inode *inode = path.dentry->d_inode; diff --git a/fs/statfs.c b/fs/statfs.c index 30ea8c8a996..8244924dec5 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf) } EXPORT_SYMBOL(vfs_statfs); -static int do_statfs_native(struct path *path, struct statfs *buf) +int user_statfs(const char __user *pathname, struct kstatfs *st) { - struct kstatfs st; - int retval; + struct path path; + int error = user_path(pathname, &path); + if (!error) { + error = vfs_statfs(&path, st); + path_put(&path); + } + return error; +} - retval = vfs_statfs(path, &st); - if (retval) - return retval; +int fd_statfs(int fd, struct kstatfs *st) +{ + struct file *file = fget(fd); + int error = -EBADF; + if (file) { + error = vfs_statfs(&file->f_path, st); + fput(file); + } + return error; +} - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); +static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) +{ + struct statfs buf; + + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); else { - if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail | - st.f_bsize | st.f_frsize) & + if (sizeof buf.f_blocks == 4) { + if ((st->f_blocks | st->f_bfree | st->f_bavail | + st->f_bsize | st->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* * f_files and f_ffree may be -1; it's okay to stuff * that into 32 bits */ - if (st.f_files != -1 && - (st.f_files & 0xffffffff00000000ULL)) + if (st->f_files != -1 && + (st->f_files & 0xffffffff00000000ULL)) return -EOVERFLOW; - if (st.f_ffree != -1 && - (st.f_ffree & 0xffffffff00000000ULL)) + if (st->f_ffree != -1 && + (st->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - buf->f_flags = st.f_flags; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } -static int do_statfs64(struct path *path, struct statfs64 *buf) +static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p) { - struct kstatfs st; - int retval; - - retval = vfs_statfs(path, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); + struct statfs64 buf; + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); else { - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - buf->f_flags = st.f_flags; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct statfs tmp; - error = do_statfs_native(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) { - struct path path; - long error; - + struct kstatfs st; + int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct statfs64 tmp; - error = do_statfs64(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + error = user_statfs(pathname, &st); + if (!error) + error = do_statfs64(&st, buf); return error; } SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) { - struct file *file; - struct statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs_native(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: + struct kstatfs st; + int error = fd_statfs(fd, &st); + if (!error) + error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) { - struct file *file; - struct statfs64 tmp; + struct kstatfs st; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs64(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: + error = fd_statfs(fd, &st); + if (!error) + error = do_statfs64(&st, buf); return error; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 14f64b689d7..7217d67a80a 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - * - * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and - * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not - * lock 'dirA->i_mutex', so this is possible. Both of the functions - * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes - * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this - * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA' - * to the list of orphans. After this, 'vfs_link()' will link - * 'dirB/fileB' to 'inodeA'. This is a problem because, for example, - * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode - * to the list of orphans. - */ - if (inode->i_nlink == 0) - return -ENOENT; - err = dbg_check_synced_i_size(inode); if (err) return err; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index b7c338d5e9d..f1dce848ef9 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1286,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, struct fid *fid = (struct fid *)fh; int type = FILEID_UDF_WITHOUT_PARENT; - if (len < 3 || (connectable && len < 5)) + if (connectable && (len < 5)) { + *lenp = 5; + return 255; + } else if (len < 3) { + *lenp = 3; return 255; + } *lenp = 3; fid->udf.block = location.logicalBlockNum; diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index fc0114da7fd..f4f878fc008 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -89,8 +89,10 @@ xfs_fs_encode_fh( * seven combinations work. The real answer is "don't use v2". */ len = xfs_fileid_length(fileid_type); - if (*max_len < len) + if (*max_len < len) { + *max_len = len; return 255; + } *max_len = len; switch (fileid_type) { diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 2bcc5c7c22a..61e03dd7939 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -30,6 +30,9 @@ typedef u64 cputime64_t; #define cputime64_to_jiffies64(__ct) (__ct) #define jiffies64_to_cputime64(__jif) (__jif) #define cputime_to_cputime64(__ct) ((u64) __ct) +#define cputime64_gt(__a, __b) ((__a) > (__b)) + +#define nsecs_to_cputime64(__ct) nsecs_to_jiffies64(__ct) /* diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h index 0fc16e3f0bf..84793c7025e 100644 --- a/include/asm-generic/fcntl.h +++ b/include/asm-generic/fcntl.h @@ -80,6 +80,10 @@ #define O_SYNC (__O_SYNC|O_DSYNC) #endif +#ifndef O_PATH +#define O_PATH 010000000 +#endif + #ifndef O_NDELAY #define O_NDELAY O_NONBLOCK #endif diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h index 3c2344f4813..01f227e1425 100644 --- a/include/asm-generic/futex.h +++ b/include/asm-generic/futex.h @@ -6,7 +6,7 @@ #include <asm/errno.h> static inline int -futex_atomic_op_inuser (int encoded_op, int __user *uaddr) +futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; @@ -16,7 +16,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int))) + if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; pagefault_disable(); @@ -48,7 +48,8 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr) } static inline int -futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) { return -ENOSYS; } diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index b969770196c..57af0338d27 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -646,9 +646,13 @@ __SYSCALL(__NR_prlimit64, sys_prlimit64) __SYSCALL(__NR_fanotify_init, sys_fanotify_init) #define __NR_fanotify_mark 263 __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) +#define __NR_name_to_handle_at 264 +__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) +#define __NR_open_by_handle_at 265 +__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) #undef __NR_syscalls -#define __NR_syscalls 264 +#define __NR_syscalls 266 /* * All syscalls below here should go away really, diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 597692f1fc8..65970b811e2 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -34,7 +34,10 @@ struct debug_obj { /** * struct debug_obj_descr - object type specific debug description structure + * * @name: name of the object typee + * @debug_hint: function returning address, which have associated + * kernel symbol, to allow identify the object * @fixup_init: fixup function, which is called when the init check * fails * @fixup_activate: fixup function, which is called when the activate check @@ -46,7 +49,7 @@ struct debug_obj { */ struct debug_obj_descr { const char *name; - + void *(*debug_hint) (void *addr); int (*fixup_init) (void *addr, enum debug_obj_state state); int (*fixup_activate) (void *addr, enum debug_obj_state state); int (*fixup_destroy) (void *addr, enum debug_obj_state state); diff --git a/include/linux/device.h b/include/linux/device.h index 1bf5cf0b451..ca5d25225aa 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -128,9 +128,7 @@ struct device_driver { bool suppress_bind_attrs; /* disables bind/unbind via sysfs */ -#if defined(CONFIG_OF) const struct of_device_id *of_match_table; -#endif int (*probe) (struct device *dev); int (*remove) (struct device *dev); @@ -441,9 +439,8 @@ struct device { override */ /* arch specific additions */ struct dev_archdata archdata; -#ifdef CONFIG_OF - struct device_node *of_node; -#endif + + struct device_node *of_node; /* associated device tree node */ dev_t devt; /* dev_t, creates the sysfs "dev" */ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 28028988c86..33a42f24b27 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -8,6 +8,9 @@ struct inode; struct super_block; struct vfsmount; +/* limit the handle size to NFSv4 handle size now */ +#define MAX_HANDLE_SZ 128 + /* * The fileid_type identifies how the file within the filesystem is encoded. * In theory this is freely set and parsed by the filesystem, but we try to @@ -121,8 +124,10 @@ struct fid { * set, the encode_fh() should store sufficient information so that a good * attempt can be made to find not only the file but also it's place in the * filesystem. This typically means storing a reference to de->d_parent in - * the filehandle fragment. encode_fh() should return the number of bytes - * stored or a negative error code such as %-ENOSPC + * the filehandle fragment. encode_fh() should return the fileid_type on + * success and on error returns 255 (if the space needed to encode fh is + * greater than @max_len*4 bytes). On error @max_len contains the minimum + * size(in 4 byte unit) needed to encode the file handle. * * fh_to_dentry: * @fh_to_dentry is given a &struct super_block (@sb) and a file handle diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index a562fa5fb4e..f550f894ba1 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -46,6 +46,7 @@ unlinking file. */ #define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */ #define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ +#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ #ifdef __KERNEL__ diff --git a/include/linux/file.h b/include/linux/file.h index e85baebf627..21a79958541 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -29,6 +29,8 @@ static inline void fput_light(struct file *file, int fput_needed) extern struct file *fget(unsigned int fd); extern struct file *fget_light(unsigned int fd, int *fput_needed); +extern struct file *fget_raw(unsigned int fd); +extern struct file *fget_raw_light(unsigned int fd, int *fput_needed); extern void set_close_on_exec(unsigned int fd, int flag); extern void put_filp(struct file *); extern int alloc_fd(unsigned start, unsigned flags); diff --git a/include/linux/fs.h b/include/linux/fs.h index e38b50a4b9d..13df14e2c42 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -102,6 +102,9 @@ struct inodes_stat_t { /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */ #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +/* File is opened with O_PATH; almost nothing can be done with it */ +#define FMODE_PATH ((__force fmode_t)0x4000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) @@ -978,6 +981,13 @@ struct file { #endif }; +struct file_handle { + __u32 handle_bytes; + int handle_type; + /* file identifier */ + unsigned char f_handle[0]; +}; + #define get_file(x) atomic_long_inc(&(x)->f_count) #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) #define file_count(x) atomic_long_read(&(x)->f_count) @@ -1401,6 +1411,7 @@ struct super_block { wait_queue_head_t s_wait_unfrozen; char s_id[32]; /* Informational name */ + u8 s_uuid[16]; /* UUID */ void *s_fs_info; /* Filesystem private info */ fmode_t s_mode; @@ -1874,6 +1885,8 @@ extern void drop_collected_mounts(struct vfsmount *); extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, struct vfsmount *); extern int vfs_statfs(struct path *, struct kstatfs *); +extern int user_statfs(const char __user *, struct kstatfs *); +extern int fd_statfs(int, struct kstatfs *); extern int statfs_by_dentry(struct dentry *, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); @@ -1990,6 +2003,8 @@ extern int do_fallocate(struct file *file, int mode, loff_t offset, extern long do_sys_open(int dfd, const char __user *filename, int flags, int mode); extern struct file *filp_open(const char *, int, int); +extern struct file *file_open_root(struct dentry *, struct vfsmount *, + const char *, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); @@ -2205,10 +2220,6 @@ extern struct file *create_read_pipe(struct file *f, int flags); extern struct file *create_write_pipe(int flags); extern void free_write_pipe(struct file *); -extern struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode, int acc_mode); -extern int may_open(struct path *, int, int); - extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern struct file * open_exec(const char *); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index f376ddc64c4..62f500c724f 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -54,11 +54,13 @@ enum hrtimer_restart { * 0x00 inactive * 0x01 enqueued into rbtree * 0x02 callback function running + * 0x04 timer is migrated to another cpu * * Special cases: * 0x03 callback function running and enqueued * (was requeued on another CPU) - * 0x09 timer was migrated on CPU hotunplug + * 0x05 timer was migrated on CPU hotunplug + * * The "callback function running and enqueued" status is only possible on * SMP. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer @@ -67,8 +69,11 @@ enum hrtimer_restart { * as otherwise the timer could be removed before the softirq code finishes the * the handling of the timer. * - * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state to - * preserve the HRTIMER_STATE_CALLBACK bit in the above scenario. + * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state + * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This + * also affects HRTIMER_STATE_MIGRATE where the preservation is not + * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is + * enqueued on the new cpu. * * All state transitions are protected by cpu_base->lock. */ @@ -148,7 +153,12 @@ struct hrtimer_clock_base { #endif }; -#define HRTIMER_MAX_CLOCK_BASES 2 +enum hrtimer_base_type { + HRTIMER_BASE_REALTIME, + HRTIMER_BASE_MONOTONIC, + HRTIMER_BASE_BOOTTIME, + HRTIMER_MAX_CLOCK_BASES, +}; /* * struct hrtimer_cpu_base - the per cpu clock bases @@ -308,6 +318,7 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); +extern ktime_t ktime_get_boottime(void); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); @@ -370,8 +381,9 @@ extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); extern ktime_t hrtimer_get_next_event(void); /* - * A timer is active, when it is enqueued into the rbtree or the callback - * function is running. + * A timer is active, when it is enqueued into the rbtree or the + * callback function is running or it's in the state of being migrated + * to another cpu. */ static inline int hrtimer_active(const struct hrtimer *timer) { diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 903576df88d..06a8d9c7de9 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -258,9 +258,7 @@ struct i2c_board_info { unsigned short addr; void *platform_data; struct dev_archdata *archdata; -#ifdef CONFIG_OF struct device_node *of_node; -#endif int irq; }; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 55e0d4253e4..59b72ca1c5d 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -14,6 +14,8 @@ #include <linux/smp.h> #include <linux/percpu.h> #include <linux/hrtimer.h> +#include <linux/kref.h> +#include <linux/workqueue.h> #include <asm/atomic.h> #include <asm/ptrace.h> @@ -55,7 +57,8 @@ * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend - * + * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set + * IRQF_NO_THREAD - Interrupt cannot be threaded */ #define IRQF_DISABLED 0x00000020 #define IRQF_SAMPLE_RANDOM 0x00000040 @@ -67,22 +70,10 @@ #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 #define IRQF_NO_SUSPEND 0x00004000 +#define IRQF_FORCE_RESUME 0x00008000 +#define IRQF_NO_THREAD 0x00010000 -#define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND) - -/* - * Bits used by threaded handlers: - * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run - * IRQTF_DIED - handler thread died - * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed - * IRQTF_AFFINITY - irq thread is requested to adjust affinity - */ -enum { - IRQTF_RUNTHREAD, - IRQTF_DIED, - IRQTF_WARNED, - IRQTF_AFFINITY, -}; +#define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) /* * These values can be returned by request_any_context_irq() and @@ -110,6 +101,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @thread_fn: interupt handler function for threaded interrupts * @thread: thread pointer for threaded interrupts * @thread_flags: flags related to @thread + * @thread_mask: bitmask for keeping track of @thread activity */ struct irqaction { irq_handler_t handler; @@ -120,6 +112,7 @@ struct irqaction { irq_handler_t thread_fn; struct task_struct *thread; unsigned long thread_flags; + unsigned long thread_mask; const char *name; struct proc_dir_entry *dir; } ____cacheline_internodealigned_in_smp; @@ -240,6 +233,35 @@ extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); + +/** + * struct irq_affinity_notify - context for notification of IRQ affinity changes + * @irq: Interrupt to which notification applies + * @kref: Reference count, for internal use + * @work: Work item, for internal use + * @notify: Function to be called on change. This will be + * called in process context. + * @release: Function to be called on release. This will be + * called in process context. Once registered, the + * structure must only be freed when this function is + * called or later. + */ +struct irq_affinity_notify { + unsigned int irq; + struct kref kref; + struct work_struct work; + void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); + void (*release)(struct kref *ref); +}; + +extern int +irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); + +static inline void irq_run_affinity_notifiers(void) +{ + flush_scheduled_work(); +} + #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) @@ -255,7 +277,7 @@ static inline int irq_can_set_affinity(unsigned int irq) static inline int irq_select_affinity(unsigned int irq) { return 0; } static inline int irq_set_affinity_hint(unsigned int irq, - const struct cpumask *m) + const struct cpumask *m) { return -EINVAL; } @@ -314,16 +336,24 @@ static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long } /* IRQ wakeup (PM) control: */ -extern int set_irq_wake(unsigned int irq, unsigned int on); +extern int irq_set_irq_wake(unsigned int irq, unsigned int on); + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +/* Please do not use: Use the replacement functions instead */ +static inline int set_irq_wake(unsigned int irq, unsigned int on) +{ + return irq_set_irq_wake(irq, on); +} +#endif static inline int enable_irq_wake(unsigned int irq) { - return set_irq_wake(irq, 1); + return irq_set_irq_wake(irq, 1); } static inline int disable_irq_wake(unsigned int irq) { - return set_irq_wake(irq, 0); + return irq_set_irq_wake(irq, 0); } #else /* !CONFIG_GENERIC_HARDIRQS */ @@ -353,6 +383,13 @@ static inline int disable_irq_wake(unsigned int irq) } #endif /* CONFIG_GENERIC_HARDIRQS */ + +#ifdef CONFIG_IRQ_FORCED_THREADING +extern bool force_irqthreads; +#else +#define force_irqthreads (0) +#endif + #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) #define or_softirq_pending(x) (local_softirq_pending() |= (x)) @@ -426,6 +463,13 @@ extern void raise_softirq(unsigned int nr); */ DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +DECLARE_PER_CPU(struct task_struct *, ksoftirqd); + +static inline struct task_struct *this_cpu_ksoftirqd(void) +{ + return this_cpu_read(ksoftirqd); +} + /* Try to send a softirq to a remote cpu. If this cannot be done, the * work will be queued to the local cpu. */ @@ -645,6 +689,7 @@ static inline void init_irq_proc(void) struct seq_file; int show_interrupts(struct seq_file *p, void *v); +int arch_show_interrupts(struct seq_file *p, int prec); extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); diff --git a/include/linux/irq.h b/include/linux/irq.h index 80fcb53057b..1d3577f30d4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -29,61 +29,104 @@ #include <asm/irq_regs.h> struct irq_desc; +struct irq_data; typedef void (*irq_flow_handler_t)(unsigned int irq, struct irq_desc *desc); - +typedef void (*irq_preflow_handler_t)(struct irq_data *data); /* * IRQ line status. * - * Bits 0-7 are reserved for the IRQF_* bits in linux/interrupt.h + * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h + * + * IRQ_TYPE_NONE - default, unspecified type + * IRQ_TYPE_EDGE_RISING - rising edge triggered + * IRQ_TYPE_EDGE_FALLING - falling edge triggered + * IRQ_TYPE_EDGE_BOTH - rising and falling edge triggered + * IRQ_TYPE_LEVEL_HIGH - high level triggered + * IRQ_TYPE_LEVEL_LOW - low level triggered + * IRQ_TYPE_LEVEL_MASK - Mask to filter out the level bits + * IRQ_TYPE_SENSE_MASK - Mask for all the above bits + * IRQ_TYPE_PROBE - Special flag for probing in progress + * + * Bits which can be modified via irq_set/clear/modify_status_flags() + * IRQ_LEVEL - Interrupt is level type. Will be also + * updated in the code when the above trigger + * bits are modified via set_irq_type() + * IRQ_PER_CPU - Mark an interrupt PER_CPU. Will protect + * it from affinity setting + * IRQ_NOPROBE - Interrupt cannot be probed by autoprobing + * IRQ_NOREQUEST - Interrupt cannot be requested via + * request_irq() + * IRQ_NOAUTOEN - Interrupt is not automatically enabled in + * request/setup_irq() + * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set) + * IRQ_MOVE_PCNTXT - Interrupt can be migrated from process context + * IRQ_NESTED_TRHEAD - Interrupt nests into another thread + * + * Deprecated bits. They are kept updated as long as + * CONFIG_GENERIC_HARDIRQS_NO_COMPAT is not set. Will go away soon. These bits + * are internal state of the core code and if you really need to acces + * them then talk to the genirq maintainer instead of hacking + * something weird. * - * IRQ types */ -#define IRQ_TYPE_NONE 0x00000000 /* Default, unspecified type */ -#define IRQ_TYPE_EDGE_RISING 0x00000001 /* Edge rising type */ -#define IRQ_TYPE_EDGE_FALLING 0x00000002 /* Edge falling type */ -#define IRQ_TYPE_EDGE_BOTH (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING) -#define IRQ_TYPE_LEVEL_HIGH 0x00000004 /* Level high type */ -#define IRQ_TYPE_LEVEL_LOW 0x00000008 /* Level low type */ -#define IRQ_TYPE_SENSE_MASK 0x0000000f /* Mask of the above */ -#define IRQ_TYPE_PROBE 0x00000010 /* Probing in progress */ - -/* Internal flags */ -#define IRQ_INPROGRESS 0x00000100 /* IRQ handler active - do not enter! */ -#define IRQ_DISABLED 0x00000200 /* IRQ disabled - do not enter! */ -#define IRQ_PENDING 0x00000400 /* IRQ pending - replay on enable */ -#define IRQ_REPLAY 0x00000800 /* IRQ has been replayed but not acked yet */ -#define IRQ_AUTODETECT 0x00001000 /* IRQ is being autodetected */ -#define IRQ_WAITING 0x00002000 /* IRQ not yet seen - for autodetection */ -#define IRQ_LEVEL 0x00004000 /* IRQ level triggered */ -#define IRQ_MASKED 0x00008000 /* IRQ masked - shouldn't be seen again */ -#define IRQ_PER_CPU 0x00010000 /* IRQ is per CPU */ -#define IRQ_NOPROBE 0x00020000 /* IRQ is not valid for probing */ -#define IRQ_NOREQUEST 0x00040000 /* IRQ cannot be requested */ -#define IRQ_NOAUTOEN 0x00080000 /* IRQ will not be enabled on request irq */ -#define IRQ_WAKEUP 0x00100000 /* IRQ triggers system wakeup */ -#define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */ -#define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ -#define IRQ_SPURIOUS_DISABLED 0x00800000 /* IRQ was disabled by the spurious trap */ -#define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ -#define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ -#define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ -#define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */ -#define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ +enum { + IRQ_TYPE_NONE = 0x00000000, + IRQ_TYPE_EDGE_RISING = 0x00000001, + IRQ_TYPE_EDGE_FALLING = 0x00000002, + IRQ_TYPE_EDGE_BOTH = (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING), + IRQ_TYPE_LEVEL_HIGH = 0x00000004, + IRQ_TYPE_LEVEL_LOW = 0x00000008, + IRQ_TYPE_LEVEL_MASK = (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH), + IRQ_TYPE_SENSE_MASK = 0x0000000f, + + IRQ_TYPE_PROBE = 0x00000010, + + IRQ_LEVEL = (1 << 8), + IRQ_PER_CPU = (1 << 9), + IRQ_NOPROBE = (1 << 10), + IRQ_NOREQUEST = (1 << 11), + IRQ_NOAUTOEN = (1 << 12), + IRQ_NO_BALANCING = (1 << 13), + IRQ_MOVE_PCNTXT = (1 << 14), + IRQ_NESTED_THREAD = (1 << 15), + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT + IRQ_INPROGRESS = (1 << 16), + IRQ_REPLAY = (1 << 17), + IRQ_WAITING = (1 << 18), + IRQ_DISABLED = (1 << 19), + IRQ_PENDING = (1 << 20), + IRQ_MASKED = (1 << 21), + IRQ_MOVE_PENDING = (1 << 22), + IRQ_AFFINITY_SET = (1 << 23), + IRQ_WAKEUP = (1 << 24), +#endif +}; #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ - IRQ_PER_CPU) + IRQ_PER_CPU | IRQ_NESTED_THREAD) -#ifdef CONFIG_IRQ_PER_CPU -# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) -# define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) -#else -# define CHECK_IRQ_PER_CPU(var) 0 -# define IRQ_NO_BALANCING_MASK IRQ_NO_BALANCING -#endif +#define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) + +static inline __deprecated bool CHECK_IRQ_PER_CPU(unsigned int status) +{ + return status & IRQ_PER_CPU; +} + +/* + * Return value for chip->irq_set_affinity() + * + * IRQ_SET_MASK_OK - OK, core updates irq_data.affinity + * IRQ_SET_MASK_NOCPY - OK, chip did update irq_data.affinity + */ +enum { + IRQ_SET_MASK_OK = 0, + IRQ_SET_MASK_OK_NOCOPY, +}; struct msi_desc; @@ -91,6 +134,8 @@ struct msi_desc; * struct irq_data - per irq and irq chip data passed down to chip functions * @irq: interrupt number * @node: node index useful for balancing + * @state_use_accessor: status information for irq chip functions. + * Use accessor functions to deal with it * @chip: low level interrupt hardware access * @handler_data: per-IRQ data for the irq_chip methods * @chip_data: platform-specific per-chip private data for the chip @@ -105,6 +150,7 @@ struct msi_desc; struct irq_data { unsigned int irq; unsigned int node; + unsigned int state_use_accessors; struct irq_chip *chip; void *handler_data; void *chip_data; @@ -114,6 +160,80 @@ struct irq_data { #endif }; +/* + * Bit masks for irq_data.state + * + * IRQD_TRIGGER_MASK - Mask for the trigger type bits + * IRQD_SETAFFINITY_PENDING - Affinity setting is pending + * IRQD_NO_BALANCING - Balancing disabled for this IRQ + * IRQD_PER_CPU - Interrupt is per cpu + * IRQD_AFFINITY_SET - Interrupt affinity was set + * IRQD_LEVEL - Interrupt is level triggered + * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup + * from suspend + * IRDQ_MOVE_PCNTXT - Interrupt can be moved in process + * context + */ +enum { + IRQD_TRIGGER_MASK = 0xf, + IRQD_SETAFFINITY_PENDING = (1 << 8), + IRQD_NO_BALANCING = (1 << 10), + IRQD_PER_CPU = (1 << 11), + IRQD_AFFINITY_SET = (1 << 12), + IRQD_LEVEL = (1 << 13), + IRQD_WAKEUP_STATE = (1 << 14), + IRQD_MOVE_PCNTXT = (1 << 15), +}; + +static inline bool irqd_is_setaffinity_pending(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_SETAFFINITY_PENDING; +} + +static inline bool irqd_is_per_cpu(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_PER_CPU; +} + +static inline bool irqd_can_balance(struct irq_data *d) +{ + return !(d->state_use_accessors & (IRQD_PER_CPU | IRQD_NO_BALANCING)); +} + +static inline bool irqd_affinity_was_set(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_AFFINITY_SET; +} + +static inline u32 irqd_get_trigger_type(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_TRIGGER_MASK; +} + +/* + * Must only be called inside irq_chip.irq_set_type() functions. + */ +static inline void irqd_set_trigger_type(struct irq_data *d, u32 type) +{ + d->state_use_accessors &= ~IRQD_TRIGGER_MASK; + d->state_use_accessors |= type & IRQD_TRIGGER_MASK; +} + +static inline bool irqd_is_level_type(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_LEVEL; +} + +static inline bool irqd_is_wakeup_set(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_WAKEUP_STATE; +} + +static inline bool irqd_can_move_in_process_context(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_MOVE_PCNTXT; +} + /** * struct irq_chip - hardware interrupt chip descriptor * @@ -150,6 +270,7 @@ struct irq_data { * @irq_set_wake: enable/disable power-management wake-on of an IRQ * @irq_bus_lock: function to lock access to slow bus (i2c) chips * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips + * @flags: chip specific flags * * @release: release function solely used by UML */ @@ -196,12 +317,27 @@ struct irq_chip { void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); + unsigned long flags; + /* Currently used only by UML, might disappear one day.*/ #ifdef CONFIG_IRQ_RELEASE_METHOD void (*release)(unsigned int irq, void *dev_id); #endif }; +/* + * irq_chip specific flags + * + * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() + * IRQCHIP_EOI_IF_HANDLED: Only issue irq_eoi() when irq was handled + * IRQCHIP_MASK_ON_SUSPEND: Mask non wake irqs in the suspend path + */ +enum { + IRQCHIP_SET_TYPE_MASKED = (1 << 0), + IRQCHIP_EOI_IF_HANDLED = (1 << 1), + IRQCHIP_MASK_ON_SUSPEND = (1 << 2), +}; + /* This include will go away once we isolated irq_desc usage to core code */ #include <linux/irqdesc.h> @@ -218,7 +354,7 @@ struct irq_chip { # define ARCH_IRQ_INIT_FLAGS 0 #endif -#define IRQ_DEFAULT_INIT_FLAGS (IRQ_DISABLED | ARCH_IRQ_INIT_FLAGS) +#define IRQ_DEFAULT_INIT_FLAGS ARCH_IRQ_INIT_FLAGS struct irqaction; extern int setup_irq(unsigned int irq, struct irqaction *new); @@ -229,9 +365,13 @@ extern void remove_irq(unsigned int irq, struct irqaction *act); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void move_native_irq(int irq); void move_masked_irq(int irq); +void irq_move_irq(struct irq_data *data); +void irq_move_masked_irq(struct irq_data *data); #else static inline void move_native_irq(int irq) { } static inline void move_masked_irq(int irq) { } +static inline void irq_move_irq(struct irq_data *data) { } +static inline void irq_move_masked_irq(struct irq_data *data) { } #endif extern int no_irq_affinity; @@ -267,23 +407,23 @@ extern struct irq_chip no_irq_chip; extern struct irq_chip dummy_irq_chip; extern void -set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle); -extern void -set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name); +static inline void irq_set_chip_and_handler(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle) +{ + irq_set_chip_and_handler_name(irq, chip, handle, NULL); +} + extern void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name); -/* - * Set a highlevel flow handler for a given IRQ: - */ static inline void -set_irq_handler(unsigned int irq, irq_flow_handler_t handle) +irq_set_handler(unsigned int irq, irq_flow_handler_t handle) { - __set_irq_handler(irq, handle, 0, NULL); + __irq_set_handler(irq, handle, 0, NULL); } /* @@ -292,14 +432,11 @@ set_irq_handler(unsigned int irq, irq_flow_handler_t handle) * IRQ_NOREQUEST and IRQ_NOPROBE) */ static inline void -set_irq_chained_handler(unsigned int irq, - irq_flow_handler_t handle) +irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle) { - __set_irq_handler(irq, handle, 1, NULL); + __irq_set_handler(irq, handle, 1, NULL); } -extern void set_irq_nested_thread(unsigned int irq, int nest); - void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); static inline void irq_set_status_flags(unsigned int irq, unsigned long set) @@ -312,16 +449,24 @@ static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr) irq_modify_status(irq, clr, 0); } -static inline void set_irq_noprobe(unsigned int irq) +static inline void irq_set_noprobe(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOPROBE); } -static inline void set_irq_probe(unsigned int irq) +static inline void irq_set_probe(unsigned int irq) { irq_modify_status(irq, IRQ_NOPROBE, 0); } +static inline void irq_set_nested_thread(unsigned int irq, bool nest) +{ + if (nest) + irq_set_status_flags(irq, IRQ_NESTED_THREAD); + else + irq_clear_status_flags(irq, IRQ_NESTED_THREAD); +} + /* Handle dynamic irq creation and destruction */ extern unsigned int create_irq_nr(unsigned int irq_want, int node); extern int create_irq(void); @@ -338,14 +483,14 @@ static inline void dynamic_irq_init(unsigned int irq) } /* Set/get chip/data for an IRQ: */ -extern int set_irq_chip(unsigned int irq, struct irq_chip *chip); -extern int set_irq_data(unsigned int irq, void *data); -extern int set_irq_chip_data(unsigned int irq, void *data); -extern int set_irq_type(unsigned int irq, unsigned int type); -extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); +extern int irq_set_chip(unsigned int irq, struct irq_chip *chip); +extern int irq_set_handler_data(unsigned int irq, void *data); +extern int irq_set_chip_data(unsigned int irq, void *data); +extern int irq_set_irq_type(unsigned int irq, unsigned int type); +extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); -static inline struct irq_chip *get_irq_chip(unsigned int irq) +static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip : NULL; @@ -356,7 +501,7 @@ static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d) return d->chip; } -static inline void *get_irq_chip_data(unsigned int irq) +static inline void *irq_get_chip_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip_data : NULL; @@ -367,18 +512,18 @@ static inline void *irq_data_get_irq_chip_data(struct irq_data *d) return d->chip_data; } -static inline void *get_irq_data(unsigned int irq) +static inline void *irq_get_handler_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->handler_data : NULL; } -static inline void *irq_data_get_irq_data(struct irq_data *d) +static inline void *irq_data_get_irq_handler_data(struct irq_data *d) { return d->handler_data; } -static inline struct msi_desc *get_irq_msi(unsigned int irq) +static inline struct msi_desc *irq_get_msi_desc(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->msi_desc : NULL; @@ -389,6 +534,89 @@ static inline struct msi_desc *irq_data_get_msi(struct irq_data *d) return d->msi_desc; } +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +/* Please do not use: Use the replacement functions instead */ +static inline int set_irq_chip(unsigned int irq, struct irq_chip *chip) +{ + return irq_set_chip(irq, chip); +} +static inline int set_irq_data(unsigned int irq, void *data) +{ + return irq_set_handler_data(irq, data); +} +static inline int set_irq_chip_data(unsigned int irq, void *data) +{ + return irq_set_chip_data(irq, data); +} +static inline int set_irq_type(unsigned int irq, unsigned int type) +{ + return irq_set_irq_type(irq, type); +} +static inline int set_irq_msi(unsigned int irq, struct msi_desc *entry) +{ + return irq_set_msi_desc(irq, entry); +} +static inline struct irq_chip *get_irq_chip(unsigned int irq) +{ + return irq_get_chip(irq); +} +static inline void *get_irq_chip_data(unsigned int irq) +{ + return irq_get_chip_data(irq); +} +static inline void *get_irq_data(unsigned int irq) +{ + return irq_get_handler_data(irq); +} +static inline void *irq_data_get_irq_data(struct irq_data *d) +{ + return irq_data_get_irq_handler_data(d); +} +static inline struct msi_desc *get_irq_msi(unsigned int irq) +{ + return irq_get_msi_desc(irq); +} +static inline void set_irq_noprobe(unsigned int irq) +{ + irq_set_noprobe(irq); +} +static inline void set_irq_probe(unsigned int irq) +{ + irq_set_probe(irq); +} +static inline void set_irq_nested_thread(unsigned int irq, int nest) +{ + irq_set_nested_thread(irq, nest); +} +static inline void +set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle, const char *name) +{ + irq_set_chip_and_handler_name(irq, chip, handle, name); +} +static inline void +set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle) +{ + irq_set_chip_and_handler(irq, chip, handle); +} +static inline void +__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) +{ + __irq_set_handler(irq, handle, is_chained, name); +} +static inline void set_irq_handler(unsigned int irq, irq_flow_handler_t handle) +{ + irq_set_handler(irq, handle); +} +static inline void +set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle) +{ + irq_set_chained_handler(irq, handle); +} +#endif + int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node); void irq_free_descs(unsigned int irq, unsigned int cnt); int irq_reserve_irqs(unsigned int from, unsigned int cnt); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index c1a95b7b58d..00218371518 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -8,6 +8,7 @@ * For now it's included from <linux/irq.h> */ +struct irq_affinity_notify; struct proc_dir_entry; struct timer_rand_state; /** @@ -18,13 +19,16 @@ struct timer_rand_state; * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] * @action: the irq action chain * @status: status information + * @core_internal_state__do_not_mess_with_it: core internal status information * @depth: disable-depth, for nested irq_disable() calls * @wake_depth: enable depth, for multiple set_irq_wake() callers * @irq_count: stats field to detect stalled irqs * @last_unhandled: aging timer for unhandled count * @irqs_unhandled: stats field for spurious unhandled interrupts * @lock: locking for SMP + * @affinity_notify: context for notification of affinity changes * @pending_mask: pending rebalanced interrupts + * @threads_oneshot: bitfield to handle shared oneshot threads * @threads_active: number of irqaction threads currently running * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers * @dir: /proc/irq/ procfs entry @@ -45,6 +49,7 @@ struct irq_desc { struct { unsigned int irq; unsigned int node; + unsigned int pad_do_not_even_think_about_it; struct irq_chip *chip; void *handler_data; void *chip_data; @@ -59,9 +64,16 @@ struct irq_desc { struct timer_rand_state *timer_rand_state; unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI + irq_preflow_handler_t preflow_handler; +#endif struct irqaction *action; /* IRQ action list */ +#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT + unsigned int status_use_accessors; +#else unsigned int status; /* IRQ status */ - +#endif + unsigned int core_internal_state__do_not_mess_with_it; unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ unsigned int irq_count; /* For detecting broken IRQs */ @@ -70,10 +82,12 @@ struct irq_desc { raw_spinlock_t lock; #ifdef CONFIG_SMP const struct cpumask *affinity_hint; + struct irq_affinity_notify *affinity_notify; #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_var_t pending_mask; #endif #endif + unsigned long threads_oneshot; atomic_t threads_active; wait_queue_head_t wait_for_threads; #ifdef CONFIG_PROC_FS @@ -95,10 +109,51 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) #ifdef CONFIG_GENERIC_HARDIRQS -#define get_irq_desc_chip(desc) ((desc)->irq_data.chip) -#define get_irq_desc_chip_data(desc) ((desc)->irq_data.chip_data) -#define get_irq_desc_data(desc) ((desc)->irq_data.handler_data) -#define get_irq_desc_msi(desc) ((desc)->irq_data.msi_desc) +static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) +{ + return &desc->irq_data; +} + +static inline struct irq_chip *irq_desc_get_chip(struct irq_desc *desc) +{ + return desc->irq_data.chip; +} + +static inline void *irq_desc_get_chip_data(struct irq_desc *desc) +{ + return desc->irq_data.chip_data; +} + +static inline void *irq_desc_get_handler_data(struct irq_desc *desc) +{ + return desc->irq_data.handler_data; +} + +static inline struct msi_desc *irq_desc_get_msi_desc(struct irq_desc *desc) +{ + return desc->irq_data.msi_desc; +} + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +static inline struct irq_chip *get_irq_desc_chip(struct irq_desc *desc) +{ + return irq_desc_get_chip(desc); +} +static inline void *get_irq_desc_data(struct irq_desc *desc) +{ + return irq_desc_get_handler_data(desc); +} + +static inline void *get_irq_desc_chip_data(struct irq_desc *desc) +{ + return irq_desc_get_chip_data(desc); +} + +static inline struct msi_desc *get_irq_desc_msi(struct irq_desc *desc) +{ + return irq_desc_get_msi_desc(desc); +} +#endif /* * Architectures call this to let the generic IRQ layer @@ -123,6 +178,7 @@ static inline int irq_has_action(unsigned int irq) return desc->action != NULL; } +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT static inline int irq_balancing_disabled(unsigned int irq) { struct irq_desc *desc; @@ -130,6 +186,7 @@ static inline int irq_balancing_disabled(unsigned int irq) desc = irq_to_desc(irq); return desc->status & IRQ_NO_BALANCING_MASK; } +#endif /* caller has locked the irq_desc and both params are valid */ static inline void __set_irq_handler_unlocked(int irq, @@ -140,6 +197,17 @@ static inline void __set_irq_handler_unlocked(int irq, desc = irq_to_desc(irq); desc->handle_irq = handler; } + +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI +static inline void +__irq_set_preflow_handler(unsigned int irq, irq_preflow_handler_t handler) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->preflow_handler = handler; +} +#endif #endif #endif diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 6811f4bfc6e..922aa313c9f 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x); extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); +extern u64 nsecs_to_jiffies64(u64 n); extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 diff --git a/include/linux/kthread.h b/include/linux/kthread.h index ce0775aa64c..7ff16f7d3ed 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -64,7 +64,7 @@ struct kthread_work { }; #define KTHREAD_WORKER_INIT(worker) { \ - .lock = SPIN_LOCK_UNLOCKED, \ + .lock = __SPIN_LOCK_UNLOCKED((worker).lock), \ .work_list = LIST_HEAD_INIT((worker).work_list), \ } diff --git a/include/linux/mm.h b/include/linux/mm.h index f6385fc17ad..679300c050f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1309,8 +1309,6 @@ int add_from_early_node_map(struct range *range, int az, int nr_range, int nid); u64 __init find_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit); -void *__alloc_memory_core_early(int nodeid, u64 size, u64 align, - u64 goal, u64 limit); typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); diff --git a/include/linux/namei.h b/include/linux/namei.h index f276d4fa01f..9c8603872c3 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -19,7 +19,6 @@ struct nameidata { struct path path; struct qstr last; struct path root; - struct file *file; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags; unsigned seq; @@ -63,6 +62,10 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_EXCL 0x0400 #define LOOKUP_RENAME_TARGET 0x0800 +#define LOOKUP_JUMPED 0x1000 +#define LOOKUP_ROOT 0x2000 +#define LOOKUP_EMPTY 0x4000 + extern int user_path_at(int, const char __user *, unsigned, struct path *); #define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path) @@ -72,7 +75,7 @@ extern int user_path_at(int, const char __user *, unsigned, struct path *); extern int kern_path(const char *, unsigned, struct path *); -extern int path_lookup(const char *, unsigned, struct nameidata *); +extern int kern_path_parent(const char *, struct nameidata *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct nameidata *); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d971346b034..71caf7a5e6c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2392,6 +2392,9 @@ extern int netdev_notice(const struct net_device *dev, const char *format, ...) extern int netdev_info(const struct net_device *dev, const char *format, ...) __attribute__ ((format (printf, 2, 3))); +#define MODULE_ALIAS_NETDEV(device) \ + MODULE_ALIAS("netdev-" device) + #if defined(DEBUG) #define netdev_dbg(__dev, format, args...) \ netdev_printk(KERN_DEBUG, __dev, format, ##args) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b197563913b..3e112de12d8 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -68,11 +68,7 @@ struct nfs_client { unsigned char cl_id_uniquifier; u32 cl_cb_ident; /* v4.0 callback identifier */ const struct nfs4_minor_version_ops *cl_mvops; -#endif /* CONFIG_NFS_V4 */ -#ifdef CONFIG_NFS_V4_1 - /* clientid returned from EXCHANGE_ID, used by session operations */ - u64 cl_ex_clid; /* The sequence id to use for the next CREATE_SESSION */ u32 cl_seqid; /* The flags used for obtaining the clientid during EXCHANGE_ID */ @@ -80,7 +76,7 @@ struct nfs_client { struct nfs4_session *cl_session; /* sharred session */ struct list_head cl_layouts; struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ @@ -185,7 +181,7 @@ struct nfs_server { /* maximum number of slots to use */ #define NFS4_MAX_SLOT_TABLE RPC_MAX_SLOT_TABLE -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_NFS_V4) /* Sessions */ #define SLOT_TABLE_SZ (NFS4_MAX_SLOT_TABLE/(8*sizeof(long))) @@ -225,5 +221,5 @@ struct nfs4_session { struct nfs_client *clp; }; -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_NFS_V4 */ #endif diff --git a/include/linux/of.h b/include/linux/of.h index cad7cf0ab27..266db1d0baa 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -23,8 +23,6 @@ #include <asm/byteorder.h> -#ifdef CONFIG_OF - typedef u32 phandle; typedef u32 ihandle; @@ -65,11 +63,18 @@ struct device_node { #endif }; +#ifdef CONFIG_OF + /* Pointer for first entry in chain of all nodes. */ extern struct device_node *allnodes; extern struct device_node *of_chosen; extern rwlock_t devtree_lock; +static inline bool of_have_populated_dt(void) +{ + return allnodes != NULL; +} + static inline bool of_node_is_root(const struct device_node *node) { return node && (node->parent == NULL); @@ -222,5 +227,12 @@ extern void of_attach_node(struct device_node *); extern void of_detach_node(struct device_node *); #endif +#else + +static inline bool of_have_populated_dt(void) +{ + return false; +} + #endif /* CONFIG_OF */ #endif /* _LINUX_OF_H */ diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h new file mode 100644 index 00000000000..85a27b650d7 --- /dev/null +++ b/include/linux/of_pci.h @@ -0,0 +1,9 @@ +#ifndef __OF_PCI_H +#define __OF_PCI_H + +#include <linux/pci.h> + +struct pci_dev; +struct of_irq; +int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq); +#endif diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 3adb06ebf84..580de67f318 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -518,6 +518,7 @@ #define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303 #define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304 #define PCI_DEVICE_ID_AMD_15H_NB_MISC 0x1603 +#define PCI_DEVICE_ID_AMD_15H_NB_LINK 0x1604 #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 diff --git a/include/linux/plist.h b/include/linux/plist.h index 7254eda078e..c9b9f322c8d 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -31,15 +31,17 @@ * * Simple ASCII art explanation: * - * |HEAD | - * | | - * |prio_list.prev|<------------------------------------| - * |prio_list.next|<->|pl|<->|pl|<--------------->|pl|<-| - * |10 | |10| |21| |21| |21| |40| (prio) - * | | | | | | | | | | | | - * | | | | | | | | | | | | - * |node_list.next|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-| - * |node_list.prev|<------------------------------------| + * pl:prio_list (only for plist_node) + * nl:node_list + * HEAD| NODE(S) + * | + * ||------------------------------------| + * ||->|pl|<->|pl|<--------------->|pl|<-| + * | |10| |21| |21| |21| |40| (prio) + * | | | | | | | | | | | + * | | | | | | | | | | | + * |->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-| + * |-------------------------------------------| * * The nodes on the prio_list list are sorted by priority to simplify * the insertion of new nodes. There are no nodes with duplicate @@ -78,7 +80,6 @@ #include <linux/spinlock_types.h> struct plist_head { - struct list_head prio_list; struct list_head node_list; #ifdef CONFIG_DEBUG_PI_LIST raw_spinlock_t *rawlock; @@ -88,7 +89,8 @@ struct plist_head { struct plist_node { int prio; - struct plist_head plist; + struct list_head prio_list; + struct list_head node_list; }; #ifdef CONFIG_DEBUG_PI_LIST @@ -100,7 +102,6 @@ struct plist_node { #endif #define _PLIST_HEAD_INIT(head) \ - .prio_list = LIST_HEAD_INIT((head).prio_list), \ .node_list = LIST_HEAD_INIT((head).node_list) /** @@ -133,7 +134,8 @@ struct plist_node { #define PLIST_NODE_INIT(node, __prio) \ { \ .prio = (__prio), \ - .plist = { _PLIST_HEAD_INIT((node).plist) }, \ + .prio_list = LIST_HEAD_INIT((node).prio_list), \ + .node_list = LIST_HEAD_INIT((node).node_list), \ } /** @@ -144,7 +146,6 @@ struct plist_node { static inline void plist_head_init(struct plist_head *head, spinlock_t *lock) { - INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); #ifdef CONFIG_DEBUG_PI_LIST head->spinlock = lock; @@ -160,7 +161,6 @@ plist_head_init(struct plist_head *head, spinlock_t *lock) static inline void plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock) { - INIT_LIST_HEAD(&head->prio_list); INIT_LIST_HEAD(&head->node_list); #ifdef CONFIG_DEBUG_PI_LIST head->rawlock = lock; @@ -176,7 +176,8 @@ plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock) static inline void plist_node_init(struct plist_node *node, int prio) { node->prio = prio; - plist_head_init(&node->plist, NULL); + INIT_LIST_HEAD(&node->prio_list); + INIT_LIST_HEAD(&node->node_list); } extern void plist_add(struct plist_node *node, struct plist_head *head); @@ -188,7 +189,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); * @head: the head for your list */ #define plist_for_each(pos, head) \ - list_for_each_entry(pos, &(head)->node_list, plist.node_list) + list_for_each_entry(pos, &(head)->node_list, node_list) /** * plist_for_each_safe - iterate safely over a plist of given type @@ -199,7 +200,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); * Iterate over a plist of given type, safe against removal of list entry. */ #define plist_for_each_safe(pos, n, head) \ - list_for_each_entry_safe(pos, n, &(head)->node_list, plist.node_list) + list_for_each_entry_safe(pos, n, &(head)->node_list, node_list) /** * plist_for_each_entry - iterate over list of given type @@ -208,7 +209,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); * @mem: the name of the list_struct within the struct */ #define plist_for_each_entry(pos, head, mem) \ - list_for_each_entry(pos, &(head)->node_list, mem.plist.node_list) + list_for_each_entry(pos, &(head)->node_list, mem.node_list) /** * plist_for_each_entry_safe - iterate safely over list of given type @@ -220,7 +221,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); * Iterate over list of given type, safe against removal of list entry. */ #define plist_for_each_entry_safe(pos, n, head, m) \ - list_for_each_entry_safe(pos, n, &(head)->node_list, m.plist.node_list) + list_for_each_entry_safe(pos, n, &(head)->node_list, m.node_list) /** * plist_head_empty - return !0 if a plist_head is empty @@ -237,7 +238,7 @@ static inline int plist_head_empty(const struct plist_head *head) */ static inline int plist_node_empty(const struct plist_node *node) { - return plist_head_empty(&node->plist); + return list_empty(&node->node_list); } /* All functions below assume the plist_head is not empty. */ @@ -285,7 +286,7 @@ static inline int plist_node_empty(const struct plist_node *node) static inline struct plist_node *plist_first(const struct plist_head *head) { return list_entry(head->node_list.next, - struct plist_node, plist.node_list); + struct plist_node, node_list); } /** @@ -297,7 +298,7 @@ static inline struct plist_node *plist_first(const struct plist_head *head) static inline struct plist_node *plist_last(const struct plist_head *head) { return list_entry(head->node_list.prev, - struct plist_node, plist.node_list); + struct plist_node, node_list); } #endif diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h new file mode 100644 index 00000000000..369e19d3750 --- /dev/null +++ b/include/linux/posix-clock.h @@ -0,0 +1,150 @@ +/* + * posix-clock.h - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#ifndef _LINUX_POSIX_CLOCK_H_ +#define _LINUX_POSIX_CLOCK_H_ + +#include <linux/cdev.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/posix-timers.h> + +struct posix_clock; + +/** + * struct posix_clock_operations - functional interface to the clock + * + * Every posix clock is represented by a character device. Drivers may + * optionally offer extended capabilities by implementing the + * character device methods. The character device file operations are + * first handled by the clock device layer, then passed on to the + * driver by calling these functions. + * + * @owner: The clock driver should set to THIS_MODULE + * @clock_adjtime: Adjust the clock + * @clock_gettime: Read the current time + * @clock_getres: Get the clock resolution + * @clock_settime: Set the current time value + * @timer_create: Create a new timer + * @timer_delete: Remove a previously created timer + * @timer_gettime: Get remaining time and interval of a timer + * @timer_setttime: Set a timer's initial expiration and interval + * @fasync: Optional character device fasync method + * @mmap: Optional character device mmap method + * @open: Optional character device open method + * @release: Optional character device release method + * @ioctl: Optional character device ioctl method + * @read: Optional character device read method + * @poll: Optional character device poll method + */ +struct posix_clock_operations { + struct module *owner; + + int (*clock_adjtime)(struct posix_clock *pc, struct timex *tx); + + int (*clock_gettime)(struct posix_clock *pc, struct timespec *ts); + + int (*clock_getres) (struct posix_clock *pc, struct timespec *ts); + + int (*clock_settime)(struct posix_clock *pc, + const struct timespec *ts); + + int (*timer_create) (struct posix_clock *pc, struct k_itimer *kit); + + int (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit); + + void (*timer_gettime)(struct posix_clock *pc, + struct k_itimer *kit, struct itimerspec *tsp); + + int (*timer_settime)(struct posix_clock *pc, + struct k_itimer *kit, int flags, + struct itimerspec *tsp, struct itimerspec *old); + /* + * Optional character device methods: + */ + int (*fasync) (struct posix_clock *pc, + int fd, struct file *file, int on); + + long (*ioctl) (struct posix_clock *pc, + unsigned int cmd, unsigned long arg); + + int (*mmap) (struct posix_clock *pc, + struct vm_area_struct *vma); + + int (*open) (struct posix_clock *pc, fmode_t f_mode); + + uint (*poll) (struct posix_clock *pc, + struct file *file, poll_table *wait); + + int (*release) (struct posix_clock *pc); + + ssize_t (*read) (struct posix_clock *pc, + uint flags, char __user *buf, size_t cnt); +}; + +/** + * struct posix_clock - represents a dynamic posix clock + * + * @ops: Functional interface to the clock + * @cdev: Character device instance for this clock + * @kref: Reference count. + * @mutex: Protects the 'zombie' field from concurrent access. + * @zombie: If 'zombie' is true, then the hardware has disappeared. + * @release: A function to free the structure when the reference count reaches + * zero. May be NULL if structure is statically allocated. + * + * Drivers should embed their struct posix_clock within a private + * structure, obtaining a reference to it during callbacks using + * container_of(). + */ +struct posix_clock { + struct posix_clock_operations ops; + struct cdev cdev; + struct kref kref; + struct mutex mutex; + bool zombie; + void (*release)(struct posix_clock *clk); +}; + +/** + * posix_clock_register() - register a new clock + * @clk: Pointer to the clock. Caller must provide 'ops' and 'release' + * @devid: Allocated device id + * + * A clock driver calls this function to register itself with the + * clock device subsystem. If 'clk' points to dynamically allocated + * memory, then the caller must provide a 'release' function to free + * that memory. + * + * Returns zero on success, non-zero otherwise. + */ +int posix_clock_register(struct posix_clock *clk, dev_t devid); + +/** + * posix_clock_unregister() - unregister a clock + * @clk: Clock instance previously registered via posix_clock_register() + * + * A clock driver calls this function to remove itself from the clock + * device subsystem. The posix_clock itself will remain (in an + * inactive state) until its reference count drops to zero, at which + * point it will be deallocated with its 'release' method. + */ +void posix_clock_unregister(struct posix_clock *clk); + +#endif diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 3e23844a699..d51243ae072 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -4,6 +4,7 @@ #include <linux/spinlock.h> #include <linux/list.h> #include <linux/sched.h> +#include <linux/timex.h> union cpu_time_count { cputime_t cpu; @@ -17,10 +18,21 @@ struct cpu_timer_list { int firing; }; +/* + * Bit fields within a clockid: + * + * The most significant 29 bits hold either a pid or a file descriptor. + * + * Bit 2 indicates whether a cpu clock refers to a thread or a process. + * + * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3. + * + * A clockid is invalid if bits 2, 1, and 0 are all set. + */ #define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) #define CPUCLOCK_PERTHREAD(clock) \ (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) -#define CPUCLOCK_PID_MASK 7 + #define CPUCLOCK_PERTHREAD_MASK 4 #define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK) #define CPUCLOCK_CLOCK_MASK 3 @@ -28,12 +40,17 @@ struct cpu_timer_list { #define CPUCLOCK_VIRT 1 #define CPUCLOCK_SCHED 2 #define CPUCLOCK_MAX 3 +#define CLOCKFD CPUCLOCK_MAX +#define CLOCKFD_MASK (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK) #define MAKE_PROCESS_CPUCLOCK(pid, clock) \ ((~(clockid_t) (pid) << 3) | (clockid_t) (clock)) #define MAKE_THREAD_CPUCLOCK(tid, clock) \ MAKE_PROCESS_CPUCLOCK((tid), (clock) | CPUCLOCK_PERTHREAD_MASK) +#define FD_TO_CLOCKID(fd) ((~(clockid_t) (fd) << 3) | CLOCKFD) +#define CLOCKID_TO_FD(clk) ((unsigned int) ~((clk) >> 3)) + /* POSIX.1b interval timer structure. */ struct k_itimer { struct list_head list; /* free/ allocate list */ @@ -67,10 +84,11 @@ struct k_itimer { }; struct k_clock { - int res; /* in nanoseconds */ int (*clock_getres) (const clockid_t which_clock, struct timespec *tp); - int (*clock_set) (const clockid_t which_clock, struct timespec * tp); + int (*clock_set) (const clockid_t which_clock, + const struct timespec *tp); int (*clock_get) (const clockid_t which_clock, struct timespec * tp); + int (*clock_adj) (const clockid_t which_clock, struct timex *tx); int (*timer_create) (struct k_itimer *timer); int (*nsleep) (const clockid_t which_clock, int flags, struct timespec *, struct timespec __user *); @@ -84,28 +102,14 @@ struct k_clock { struct itimerspec * cur_setting); }; -void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock); +extern struct k_clock clock_posix_cpu; +extern struct k_clock clock_posix_dynamic; -/* error handlers for timer_create, nanosleep and settime */ -int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *, - struct timespec __user *); -int do_posix_clock_nosettime(const clockid_t, struct timespec *tp); +void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock); /* function to call to trigger timer event */ int posix_timer_event(struct k_itimer *timr, int si_private); -int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *ts); -int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *ts); -int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *ts); -int posix_cpu_timer_create(struct k_itimer *timer); -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp); -long posix_cpu_nsleep_restart(struct restart_block *restart_block); -int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *new, struct itimerspec *old); -int posix_cpu_timer_del(struct k_itimer *timer); -void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp); - void posix_cpu_timer_schedule(struct k_itimer *timer); void run_posix_cpu_timers(struct task_struct *task); diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 89c3e518299..2ca7e8a7806 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -133,7 +133,6 @@ extern struct class *rtc_class; * The (current) exceptions are mostly filesystem hooks: * - the proc() hook for procfs * - non-ioctl() chardev hooks: open(), release(), read_callback() - * - periodic irq calls: irq_set_state(), irq_set_freq() * * REVISIT those periodic irq calls *do* have ops_lock when they're * issued through ioctl() ... @@ -148,11 +147,8 @@ struct rtc_class_ops { int (*set_alarm)(struct device *, struct rtc_wkalrm *); int (*proc)(struct device *, struct seq_file *); int (*set_mmss)(struct device *, unsigned long secs); - int (*irq_set_state)(struct device *, int enabled); - int (*irq_set_freq)(struct device *, int freq); int (*read_callback)(struct device *, int data); int (*alarm_irq_enable)(struct device *, unsigned int enabled); - int (*update_irq_enable)(struct device *, unsigned int enabled); }; #define RTC_DEVICE_NAME_SIZE 20 @@ -227,6 +223,7 @@ extern void rtc_device_unregister(struct rtc_device *rtc); extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs); +int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); extern int rtc_set_alarm(struct rtc_device *rtc, diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index bd31808c7d8..cc0072e93e3 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -43,14 +43,6 @@ typedef struct { RW_DEP_MAP_INIT(lockname) } #endif -/* - * RW_LOCK_UNLOCKED defeat lockdep state tracking and is hence - * deprecated. - * - * Please use DEFINE_RWLOCK() or __RW_LOCK_UNLOCKED() as appropriate. - */ -#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) - #define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) #endif /* __LINUX_RWLOCK_TYPES_H */ diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index bdfcc252797..34701241b67 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -12,15 +12,7 @@ #error "please don't include linux/rwsem-spinlock.h directly, use linux/rwsem.h instead" #endif -#include <linux/spinlock.h> -#include <linux/list.h> - #ifdef __KERNEL__ - -#include <linux/types.h> - -struct rwsem_waiter; - /* * the rw-semaphore definition * - if activity is 0 then there are no active readers or writers @@ -37,28 +29,7 @@ struct rw_semaphore { #endif }; -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } -#else -# define __RWSEM_DEP_MAP_INIT(lockname) -#endif - -#define __RWSEM_INITIALIZER(name) \ -{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } - -#define DECLARE_RWSEM(name) \ - struct rw_semaphore name = __RWSEM_INITIALIZER(name) - -extern void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key); - -#define init_rwsem(sem) \ -do { \ - static struct lock_class_key __key; \ - \ - __init_rwsem((sem), #sem, &__key); \ -} while (0) +#define RWSEM_UNLOCKED_VALUE 0x00000000 extern void __down_read(struct rw_semaphore *sem); extern int __down_read_trylock(struct rw_semaphore *sem); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index efd348fe8ca..a8afe9cd000 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -11,6 +11,9 @@ #include <linux/types.h> #include <linux/kernel.h> +#include <linux/list.h> +#include <linux/spinlock.h> + #include <asm/system.h> #include <asm/atomic.h> @@ -19,9 +22,57 @@ struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK #include <linux/rwsem-spinlock.h> /* use a generic implementation */ #else -#include <asm/rwsem.h> /* use an arch-specific implementation */ +/* All arch specific implementations share the same struct */ +struct rw_semaphore { + long count; + spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); +extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); + +/* Include the arch specific part */ +#include <asm/rwsem.h> + +/* In all implementations count != 0 means locked */ +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return sem->count != 0; +} + +#endif + +/* Common initializer macros and functions */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname } +#else +# define __RWSEM_DEP_MAP_INIT(lockname) #endif +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED(name.wait_lock), \ + LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + +#define DECLARE_RWSEM(name) \ + struct rw_semaphore name = __RWSEM_INITIALIZER(name) + +extern void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key); + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + /* * lock for reading */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c57e5278df8..c15936fe998 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1058,6 +1058,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); @@ -1084,12 +1085,10 @@ struct sched_class { void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); void (*task_fork) (struct task_struct *p); - void (*switched_from) (struct rq *this_rq, struct task_struct *task, - int running); - void (*switched_to) (struct rq *this_rq, struct task_struct *task, - int running); + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio, int running); + int oldprio); unsigned int (*get_rr_interval) (struct rq *rq, struct task_struct *task); @@ -1715,7 +1714,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * Per process flags */ -#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */ #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ @@ -1945,8 +1943,6 @@ int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -extern unsigned int sysctl_sched_compat_yield; - #ifdef CONFIG_SCHED_AUTOGROUP extern unsigned int sysctl_sched_autogroup_enabled; @@ -1977,6 +1973,7 @@ static inline int rt_mutex_getprio(struct task_struct *p) # define rt_mutex_adjust_pi(p) do { } while (0) #endif +extern bool yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); extern int task_nice(const struct task_struct *p); @@ -2049,7 +2046,7 @@ extern void release_uids(struct user_namespace *ns); #include <asm/current.h> -extern void do_timer(unsigned long ticks); +extern void xtime_update(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); diff --git a/include/linux/security.h b/include/linux/security.h index b2b7f9749f5..debbd97db7a 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -53,7 +53,7 @@ struct audit_krule; */ extern int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap, int audit); -extern int cap_settime(struct timespec *ts, struct timezone *tz); +extern int cap_settime(const struct timespec *ts, const struct timezone *tz); extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); @@ -1387,7 +1387,7 @@ struct security_operations { int (*quotactl) (int cmds, int type, int id, struct super_block *sb); int (*quota_on) (struct dentry *dentry); int (*syslog) (int type); - int (*settime) (struct timespec *ts, struct timezone *tz); + int (*settime) (const struct timespec *ts, const struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); int (*bprm_set_creds) (struct linux_binprm *bprm); @@ -1669,7 +1669,7 @@ int security_sysctl(struct ctl_table *table, int op); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); int security_syslog(int type); -int security_settime(struct timespec *ts, struct timezone *tz); +int security_settime(const struct timespec *ts, const struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_vm_enough_memory_kern(long pages); @@ -1904,7 +1904,8 @@ static inline int security_syslog(int type) return 0; } -static inline int security_settime(struct timespec *ts, struct timezone *tz) +static inline int security_settime(const struct timespec *ts, + const struct timezone *tz) { return cap_settime(ts, tz); } diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 851b7783720..73548eb13a5 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -81,14 +81,6 @@ typedef struct spinlock { #define __SPIN_LOCK_UNLOCKED(lockname) \ (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) -/* - * SPIN_LOCK_UNLOCKED defeats lockdep state tracking and is hence - * deprecated. - * Please use DEFINE_SPINLOCK() or __SPIN_LOCK_UNLOCKED() as - * appropriate. - */ -#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) - #define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) #include <linux/rwlock_types.h> diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 88513fd8e20..d81db8012c6 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -212,6 +212,7 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *); struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, const struct rpc_call_ops *ops); void rpc_put_task(struct rpc_task *); +void rpc_put_task_async(struct rpc_task *); void rpc_exit_task(struct rpc_task *); void rpc_exit(struct rpc_task *, int); void rpc_release_calldata(const struct rpc_call_ops *, void *); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a17fcea2ca5..1f5c18e6f4f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -62,6 +62,7 @@ struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; struct perf_event_attr; +struct file_handle; #include <linux/types.h> #include <linux/aio_abi.h> @@ -315,6 +316,8 @@ asmlinkage long sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp); asmlinkage long sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp); +asmlinkage long sys_clock_adjtime(clockid_t which_clock, + struct timex __user *tx); asmlinkage long sys_clock_getres(clockid_t which_clock, struct timespec __user *tp); asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, @@ -834,5 +837,10 @@ asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg); - +asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, + struct file_handle __user *handle, + int __user *mnt_id, int flag); +asmlinkage long sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, + int flags); #endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 7bb5cb64f3b..11684d9e6bd 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -930,6 +930,7 @@ enum #ifdef __KERNEL__ #include <linux/list.h> +#include <linux/rcupdate.h> /* For the /proc/sys support */ struct ctl_table; @@ -1037,10 +1038,15 @@ struct ctl_table_root { struct ctl_table trees. */ struct ctl_table_header { - struct ctl_table *ctl_table; - struct list_head ctl_entry; - int used; - int count; + union { + struct { + struct ctl_table *ctl_table; + struct list_head ctl_entry; + int used; + int count; + }; + struct rcu_head rcu; + }; struct completion *unregistering; struct ctl_table *ctl_table_arg; struct ctl_table_root *root; diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index c9069654417..20fc303947d 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -18,9 +18,6 @@ struct compat_timespec; struct restart_block { long (*fn)(struct restart_block *); union { - struct { - unsigned long arg0, arg1, arg2, arg3; - }; /* For futex_wait and futex_wait_requeue_pi */ struct { u32 __user *uaddr; diff --git a/include/linux/time.h b/include/linux/time.h index 1e6d3b59238..454a2620578 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -113,8 +113,6 @@ static inline struct timespec timespec_sub(struct timespec lhs, #define timespec_valid(ts) \ (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) -extern seqlock_t xtime_lock; - extern void read_persistent_clock(struct timespec *ts); extern void read_boot_clock(struct timespec *ts); extern int update_persistent_clock(struct timespec now); @@ -125,8 +123,9 @@ extern int timekeeping_suspended; unsigned long get_seconds(void); struct timespec current_kernel_time(void); struct timespec __current_kernel_time(void); /* does not take xtime_lock */ -struct timespec __get_wall_to_monotonic(void); /* does not take xtime_lock */ struct timespec get_monotonic_coarse(void); +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + struct timespec *wtom, struct timespec *sleep); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) @@ -147,8 +146,9 @@ static inline u32 arch_gettimeoffset(void) { return 0; } #endif extern void do_gettimeofday(struct timeval *tv); -extern int do_settimeofday(struct timespec *tv); -extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz); +extern int do_settimeofday(const struct timespec *tv); +extern int do_sys_settimeofday(const struct timespec *tv, + const struct timezone *tz); #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts) extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags); struct itimerval; @@ -162,12 +162,13 @@ extern void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real); extern void getboottime(struct timespec *ts); extern void monotonic_to_bootbased(struct timespec *ts); +extern void get_monotonic_boottime(struct timespec *ts); extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); -extern void update_wall_time(void); extern void timekeeping_leap_insert(int leapsecond); +extern int timekeeping_inject_offset(struct timespec *ts); struct tms; extern void do_sys_times(struct tms *); @@ -292,6 +293,7 @@ struct itimerval { #define CLOCK_MONOTONIC_RAW 4 #define CLOCK_REALTIME_COARSE 5 #define CLOCK_MONOTONIC_COARSE 6 +#define CLOCK_BOOTTIME 7 /* * The IDs of various hardware clocks: diff --git a/include/linux/timex.h b/include/linux/timex.h index d23999f9499..aa60fe7b6ed 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -73,7 +73,7 @@ struct timex { long tolerance; /* clock frequency tolerance (ppm) * (read only) */ - struct timeval time; /* (read only) */ + struct timeval time; /* (read only, except for ADJ_SETOFFSET) */ long tick; /* (modified) usecs between clock ticks */ long ppsfreq; /* pps frequency (scaled ppm) (ro) */ @@ -102,6 +102,7 @@ struct timex { #define ADJ_STATUS 0x0010 /* clock status */ #define ADJ_TIMECONST 0x0020 /* pll time constant */ #define ADJ_TAI 0x0080 /* set TAI offset */ +#define ADJ_SETOFFSET 0x0100 /* add 'time' to current time */ #define ADJ_MICRO 0x1000 /* select microsecond resolution */ #define ADJ_NANO 0x2000 /* select nanosecond resolution */ #define ADJ_TICK 0x4000 /* tick value */ diff --git a/include/target/target_core_transport.h b/include/target/target_core_transport.h index 24694051157..2e8ec51f061 100644 --- a/include/target/target_core_transport.h +++ b/include/target/target_core_transport.h @@ -135,6 +135,8 @@ extern void transport_complete_task(struct se_task *, int); extern void transport_add_task_to_execute_queue(struct se_task *, struct se_task *, struct se_device *); +extern void transport_remove_task_from_execute_queue(struct se_task *, + struct se_device *); unsigned char *transport_dump_cmd_direction(struct se_cmd *); extern void transport_dump_dev_state(struct se_device *, char *, int *); extern void transport_dump_dev_info(struct se_device *, struct se_lun *, diff --git a/include/xen/events.h b/include/xen/events.h index 00f53ddcc06..962da2ced5b 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -75,11 +75,9 @@ int xen_allocate_pirq(unsigned gsi, int shareable, char *name); int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name); #ifdef CONFIG_PCI_MSI -/* Allocate an irq and a pirq to be used with MSIs. */ -#define XEN_ALLOC_PIRQ (1 << 0) -#define XEN_ALLOC_IRQ (1 << 1) -void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_mask); -int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type); +int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc); +int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, + int pirq, int vector, const char *name); #endif /* De-allocates the above mentioned physical interrupt. */ diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index c2d1fa4dc1e..61e523af3c4 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -51,11 +51,7 @@ typedef uint64_t blkif_sector_t; */ #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 -struct blkif_request { - uint8_t operation; /* BLKIF_OP_??? */ - uint8_t nr_segments; /* number of segments */ - blkif_vdev_t handle; /* only for read/write requests */ - uint64_t id; /* private guest value, echoed in resp */ +struct blkif_request_rw { blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ struct blkif_request_segment { grant_ref_t gref; /* reference to I/O buffer frame */ @@ -65,6 +61,16 @@ struct blkif_request { } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; +struct blkif_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + union { + struct blkif_request_rw rw; + } u; +}; + struct blkif_response { uint64_t id; /* copied from request */ uint8_t operation; /* copied from request */ @@ -91,4 +97,25 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response); #define VDISK_REMOVABLE 0x2 #define VDISK_READONLY 0x4 +/* Xen-defined major numbers for virtual disks, they look strangely + * familiar */ +#define XEN_IDE0_MAJOR 3 +#define XEN_IDE1_MAJOR 22 +#define XEN_SCSI_DISK0_MAJOR 8 +#define XEN_SCSI_DISK1_MAJOR 65 +#define XEN_SCSI_DISK2_MAJOR 66 +#define XEN_SCSI_DISK3_MAJOR 67 +#define XEN_SCSI_DISK4_MAJOR 68 +#define XEN_SCSI_DISK5_MAJOR 69 +#define XEN_SCSI_DISK6_MAJOR 70 +#define XEN_SCSI_DISK7_MAJOR 71 +#define XEN_SCSI_DISK8_MAJOR 128 +#define XEN_SCSI_DISK9_MAJOR 129 +#define XEN_SCSI_DISK10_MAJOR 130 +#define XEN_SCSI_DISK11_MAJOR 131 +#define XEN_SCSI_DISK12_MAJOR 132 +#define XEN_SCSI_DISK13_MAJOR 133 +#define XEN_SCSI_DISK14_MAJOR 134 +#define XEN_SCSI_DISK15_MAJOR 135 + #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 2befa3e2f1b..b33257bc7e8 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -30,7 +30,7 @@ #define __HYPERVISOR_stack_switch 3 #define __HYPERVISOR_set_callbacks 4 #define __HYPERVISOR_fpu_taskswitch 5 -#define __HYPERVISOR_sched_op 6 +#define __HYPERVISOR_sched_op_compat 6 #define __HYPERVISOR_dom0_op 7 #define __HYPERVISOR_set_debugreg 8 #define __HYPERVISOR_get_debugreg 9 @@ -52,7 +52,7 @@ #define __HYPERVISOR_mmuext_op 26 #define __HYPERVISOR_acm_op 27 #define __HYPERVISOR_nmi_op 28 -#define __HYPERVISOR_sched_op_new 29 +#define __HYPERVISOR_sched_op 29 #define __HYPERVISOR_callback_op 30 #define __HYPERVISOR_xenoprof_op 31 #define __HYPERVISOR_event_channel_op 32 diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 98b92154a26..03c85d7387f 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -5,9 +5,9 @@ DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); -void xen_pre_suspend(void); -void xen_post_suspend(int suspend_cancelled); -void xen_hvm_post_suspend(int suspend_cancelled); +void xen_arch_pre_suspend(void); +void xen_arch_post_suspend(int suspend_cancelled); +void xen_arch_hvm_post_suspend(int suspend_cancelled); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); diff --git a/init/Kconfig b/init/Kconfig index 4c4edf2ec4a..5721d27af62 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -287,6 +287,18 @@ config BSD_PROCESS_ACCT_V3 for processing it. A preliminary version of these tools is available at <http://www.gnu.org/software/acct/>. +config FHANDLE + bool "open by fhandle syscalls" + select EXPORTFS + help + If you say Y here, a user level program will be able to map + file names to handle and then later use the handle for + different file system operations. This is useful in implementing + userspace file servers, which now track files using handles instead + of names. The handle would remain the same even if file names + get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) + syscalls. + config TASKSTATS bool "Export task/process statistics through netlink (EXPERIMENTAL)" depends on NET diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d2e3c786646..e683869365d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) } /* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) +static struct audit_parent *audit_init_parent(struct path *path) { - struct inode *inode = ndp->path.dentry->d_inode; + struct inode *inode = path->dentry->d_inode; struct audit_parent *parent; int ret; @@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent) } /* Get path information necessary for adding watches. */ -static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata *ndparent, *ndwatch; + struct nameidata nd; + struct dentry *d; int err; - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; + err = kern_path_parent(watch->path, &nd); + if (err) + return err; - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; + if (nd.last_type != LAST_NORM) { + path_put(&nd.path); + return -EINVAL; } - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); - return err; + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); + if (IS_ERR(d)) { + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + return PTR_ERR(d); } - - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; + if (d->d_inode) { + /* update watch filter fields */ + watch->dev = d->d_inode->i_sb->s_dev; + watch->ino = d->d_inode->i_ino; } + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - *ndp = ndparent; - *ndw = ndwatch; - + *parent = nd.path; + dput(d); return 0; } -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - /* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, @@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; struct audit_parent *parent; - struct nameidata *ndp = NULL, *ndw = NULL; + struct path parent_path; int h, ret = 0; mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. */ - ret = audit_get_nd(watch->path, &ndp, &ndw); - if (ret) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - goto error; - } + ret = audit_get_nd(watch, &parent_path); + /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } + if (ret) + return ret; /* either find an old parent or attach a new one */ - parent = audit_find_parent(ndp->path.dentry->d_inode); + parent = audit_find_parent(parent_path.dentry->d_inode); if (!parent) { - parent = audit_init_parent(ndp); + parent = audit_init_parent(&parent_path); if (IS_ERR(parent)) { ret = PTR_ERR(parent); goto error; @@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: - audit_put_nd(ndp, ndw); /* NULL args OK */ + path_put(&parent_path); return ret; - } void audit_remove_watch_rule(struct audit_krule *krule) diff --git a/kernel/compat.c b/kernel/compat.c index c9e2ec0b34a..38b1d2c1cbe 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o, put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; } +static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) +{ + memset(txc, 0, sizeof(struct timex)); + + if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || + __get_user(txc->modes, &utp->modes) || + __get_user(txc->offset, &utp->offset) || + __get_user(txc->freq, &utp->freq) || + __get_user(txc->maxerror, &utp->maxerror) || + __get_user(txc->esterror, &utp->esterror) || + __get_user(txc->status, &utp->status) || + __get_user(txc->constant, &utp->constant) || + __get_user(txc->precision, &utp->precision) || + __get_user(txc->tolerance, &utp->tolerance) || + __get_user(txc->time.tv_sec, &utp->time.tv_sec) || + __get_user(txc->time.tv_usec, &utp->time.tv_usec) || + __get_user(txc->tick, &utp->tick) || + __get_user(txc->ppsfreq, &utp->ppsfreq) || + __get_user(txc->jitter, &utp->jitter) || + __get_user(txc->shift, &utp->shift) || + __get_user(txc->stabil, &utp->stabil) || + __get_user(txc->jitcnt, &utp->jitcnt) || + __get_user(txc->calcnt, &utp->calcnt) || + __get_user(txc->errcnt, &utp->errcnt) || + __get_user(txc->stbcnt, &utp->stbcnt)) + return -EFAULT; + + return 0; +} + +static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) +{ + if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || + __put_user(txc->modes, &utp->modes) || + __put_user(txc->offset, &utp->offset) || + __put_user(txc->freq, &utp->freq) || + __put_user(txc->maxerror, &utp->maxerror) || + __put_user(txc->esterror, &utp->esterror) || + __put_user(txc->status, &utp->status) || + __put_user(txc->constant, &utp->constant) || + __put_user(txc->precision, &utp->precision) || + __put_user(txc->tolerance, &utp->tolerance) || + __put_user(txc->time.tv_sec, &utp->time.tv_sec) || + __put_user(txc->time.tv_usec, &utp->time.tv_usec) || + __put_user(txc->tick, &utp->tick) || + __put_user(txc->ppsfreq, &utp->ppsfreq) || + __put_user(txc->jitter, &utp->jitter) || + __put_user(txc->shift, &utp->shift) || + __put_user(txc->stabil, &utp->stabil) || + __put_user(txc->jitcnt, &utp->jitcnt) || + __put_user(txc->calcnt, &utp->calcnt) || + __put_user(txc->errcnt, &utp->errcnt) || + __put_user(txc->stbcnt, &utp->stbcnt) || + __put_user(txc->tai, &utp->tai)) + return -EFAULT; + return 0; +} + asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) { @@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock, return err; } +long compat_sys_clock_adjtime(clockid_t which_clock, + struct compat_timex __user *utp) +{ + struct timex txc; + mm_segment_t oldfs; + int err, ret; + + err = compat_get_timex(&txc, utp); + if (err) + return err; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); + set_fs(oldfs); + + err = compat_put_timex(utp, &txc); + if (err) + return err; + + return ret; +} + long compat_sys_clock_getres(clockid_t which_clock, struct compat_timespec __user *tp) { @@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) { struct timex txc; - int ret; - - memset(&txc, 0, sizeof(struct timex)); + int err, ret; - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc.modes, &utp->modes) || - __get_user(txc.offset, &utp->offset) || - __get_user(txc.freq, &utp->freq) || - __get_user(txc.maxerror, &utp->maxerror) || - __get_user(txc.esterror, &utp->esterror) || - __get_user(txc.status, &utp->status) || - __get_user(txc.constant, &utp->constant) || - __get_user(txc.precision, &utp->precision) || - __get_user(txc.tolerance, &utp->tolerance) || - __get_user(txc.time.tv_sec, &utp->time.tv_sec) || - __get_user(txc.time.tv_usec, &utp->time.tv_usec) || - __get_user(txc.tick, &utp->tick) || - __get_user(txc.ppsfreq, &utp->ppsfreq) || - __get_user(txc.jitter, &utp->jitter) || - __get_user(txc.shift, &utp->shift) || - __get_user(txc.stabil, &utp->stabil) || - __get_user(txc.jitcnt, &utp->jitcnt) || - __get_user(txc.calcnt, &utp->calcnt) || - __get_user(txc.errcnt, &utp->errcnt) || - __get_user(txc.stbcnt, &utp->stbcnt)) - return -EFAULT; + err = compat_get_timex(&txc, utp); + if (err) + return err; ret = do_adjtimex(&txc); - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc.modes, &utp->modes) || - __put_user(txc.offset, &utp->offset) || - __put_user(txc.freq, &utp->freq) || - __put_user(txc.maxerror, &utp->maxerror) || - __put_user(txc.esterror, &utp->esterror) || - __put_user(txc.status, &utp->status) || - __put_user(txc.constant, &utp->constant) || - __put_user(txc.precision, &utp->precision) || - __put_user(txc.tolerance, &utp->tolerance) || - __put_user(txc.time.tv_sec, &utp->time.tv_sec) || - __put_user(txc.time.tv_usec, &utp->time.tv_usec) || - __put_user(txc.tick, &utp->tick) || - __put_user(txc.ppsfreq, &utp->ppsfreq) || - __put_user(txc.jitter, &utp->jitter) || - __put_user(txc.shift, &utp->shift) || - __put_user(txc.stabil, &utp->stabil) || - __put_user(txc.jitcnt, &utp->jitcnt) || - __put_user(txc.calcnt, &utp->calcnt) || - __put_user(txc.errcnt, &utp->errcnt) || - __put_user(txc.stbcnt, &utp->stbcnt) || - __put_user(txc.tai, &utp->tai)) - ret = -EFAULT; + err = compat_put_timex(utp, &txc); + if (err) + return err; return ret; } diff --git a/kernel/cred.c b/kernel/cred.c index 3a9d6dd53a6..2343c132c5a 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar; static struct thread_group_cred init_tgcred = { .usage = ATOMIC_INIT(2), .tgid = 0, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), }; #endif diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd..bda41571538 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, return NULL; } -static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) +static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, + u32 uval, u32 newval) { - u32 curval; + int ret; pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); - return curval; + return ret; } static int get_futex_value_locked(u32 *dest, u32 __user *from) @@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { int lock_taken, ret, ownerdied = 0; - u32 uval, newval, curval; + u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: ret = lock_taken = 0; @@ -684,19 +685,17 @@ retry: * (by doing a 0 -> TID atomic cmpxchg), while holding all * the locks. It will most likely not succeed. */ - newval = task_pid_vnr(task); + newval = vpid; if (set_waiters) newval |= FUTEX_WAITERS; - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) return -EFAULT; /* * Detect deadlocks. */ - if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; /* @@ -723,14 +722,12 @@ retry: */ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + newval = (curval & ~FUTEX_TID_MASK) | vpid; ownerdied = 0; lock_taken = 1; } - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; if (unlikely(curval != uval)) goto retry; @@ -775,6 +772,24 @@ retry: return ret; } +/** + * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be NULL and must be held by the caller. + */ +static void __unqueue_futex(struct futex_q *q) +{ + struct futex_hash_bucket *hb; + + if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) + || plist_node_empty(&q->list))) + return; + + hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); + plist_del(&q->list, &hb->chain); +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. @@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q) */ get_task_struct(p); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as * q->lock_ptr = NULL is written, without taking any locks. A @@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) ret = -EINVAL; @@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. We are the owner: */ - oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); - - if (oldval == -EFAULT) - return oldval; + if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) + return -EFAULT; if (oldval != uval) return -EAGAIN; @@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_del(&q->list, &hb1->chain); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb2->lock; -#endif } get_futex_key_refs(key2); q->key = *key2; @@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, get_futex_key_refs(key); q->key = *key; - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; q->lock_ptr = &hb->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif wake_up_state(q->task, TASK_NORMAL); } @@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); @@ -1504,8 +1505,7 @@ retry: spin_unlock(lock_ptr); goto retry; } - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(q->pi_state); @@ -1525,8 +1525,7 @@ retry: static void unqueue_me_pi(struct futex_q *q) __releases(q->lock_ptr) { - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(!q->pi_state); free_pi_state(q->pi_state); @@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the - * pending owner or we are the pending owner which failed to - * get the rtmutex. We have to replace the pending owner TID - * in the user space variable. This must be atomic as we have - * to preserve the owner died bit here. + * previous highest priority waiter or we are the highest priority + * waiter but failed to get the rtmutex the first time. + * We have to replace the newowner TID in the user space variable. + * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state * because we can fault here. Imagine swapped out pages or a fork @@ -1578,9 +1577,7 @@ retry: while (1) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) goto handle_fault; if (curval == uval) break; @@ -1608,8 +1605,8 @@ retry: /* * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the pending - * owner itself or the task which stole the rtmutex) the + * lock here. That gives the other task (either the highest priority + * waiter itself or the task which stole the rtmutex) the * chance to try the fixup of the pi_state. So once we are * back from handling the fault we need to check the pi_state * after reacquiring the hash bucket lock and before trying to @@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) /* * pi_state is incorrect, some other task did a lock steal and * we returned due to timeout or signal without taking the - * rt_mutex. Too late. We can access the rt_mutex_owner without - * locking, as the other task is now blocked on the hash bucket - * lock. Fix the state up. + * rt_mutex. Too late. */ + raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); + if (!owner) + owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); + raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } /* * Paranoia check. If we did not take the lock, then we should not be - * the owner, nor the pending owner, of the rt_mutex. + * the owner of the rt_mutex. */ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " @@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for - * any cond. If we queued after testing *uaddr, that would open - * a race condition where we could block indefinitely with + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * - * A consequence is that futex_wait() can return zero and absorb - * a wakeup when *uaddr != val on entry to the syscall. This is - * rare, but normal. + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); @@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - u32 uval; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; + u32 uval, vpid = task_pid_vnr(current); int ret; retry: @@ -2057,7 +2057,7 @@ retry: /* * We release only a lock we actually own: */ - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) + if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); @@ -2072,17 +2072,14 @@ retry: * again. If it succeeds then we can return without waking * anyone else up: */ - if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); - - - if (unlikely(uval == -EFAULT)) + if (!(uval & FUTEX_OWNER_DIED) && + cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* * Rare case: we managed to release the lock atomically, * no need to wake anyone else up: */ - if (unlikely(uval == task_pid_vnr(current))) + if (unlikely(uval == vpid)) goto out_unlock; /* @@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * We were woken prior to requeue by a timeout or a signal. * Unqueue the futex_q and determine which it was. */ - plist_del(&q->list, &q->list.plist); + plist_del(&q->list, &hb->chain); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; @@ -2463,11 +2460,20 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); - - if (nval == -EFAULT) - return -1; - + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if (fault_in_user_writeable(uaddr)) + return -1; + goto retry; + } if (nval != uval) goto retry; @@ -2678,8 +2684,7 @@ static int __init futex_init(void) * implementation, the non-functional ones will return * -ENOSYS. */ - curval = cmpxchg_futex_value_locked(NULL, 0, 0); - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c04861..9017478c5d4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -53,11 +53,10 @@ /* * The timer bases: * - * Note: If we want to add new timer bases, we have to skip the two - * clock ids captured by the cpu-timers. We do this by holding empty - * entries rather than doing math adjustment of the clock ids. - * This ensures that we capture erroneous accesses to these clock ids - * rather than moving them into the range of valid clock id's. + * There are more clockids then hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { @@ -74,30 +73,39 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get, .resolution = KTIME_LOW_RES, }, + { + .index = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + .resolution = KTIME_LOW_RES, + }, } }; +static int hrtimer_clock_to_base_table[MAX_CLOCKS]; + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + return hrtimer_clock_to_base_table[clock_id]; +} + + /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. */ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { - ktime_t xtim, tomono; - struct timespec xts, tom; - unsigned long seq; + ktime_t xtim, mono, boot; + struct timespec xts, tom, slp; - do { - seq = read_seqbegin(&xtime_lock); - xts = __current_kernel_time(); - tom = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); xtim = timespec_to_ktime(xts); - tomono = timespec_to_ktime(tom); - base->clock_base[CLOCK_REALTIME].softirq_time = xtim; - base->clock_base[CLOCK_MONOTONIC].softirq_time = - ktime_add(xtim, tomono); + mono = ktime_add(xtim, timespec_to_ktime(tom)); + boot = ktime_add(mono, timespec_to_ktime(slp)); + base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; + base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; } /* @@ -184,10 +192,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); int cpu = hrtimer_get_target(this_cpu, pinned); + int basenum = hrtimer_clockid_to_base(base->index); again: new_cpu_base = &per_cpu(hrtimer_bases, cpu); - new_base = &new_cpu_base->clock_base[base->index]; + new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* @@ -334,6 +343,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); static struct debug_obj_descr hrtimer_debug_descr; +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -393,6 +407,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, @@ -611,24 +626,23 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset, wtm; - unsigned long seq; + struct timespec realtime_offset, wtm, sleep; if (!hrtimer_hres_active()) return; - do { - seq = read_seqbegin(&xtime_lock); - wtm = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, + &sleep); set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[CLOCK_REALTIME].offset = + base->clock_base[HRTIMER_BASE_REALTIME].offset = timespec_to_ktime(realtime_offset); + base->clock_base[HRTIMER_BASE_BOOTTIME].offset = + timespec_to_ktime(sleep); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); @@ -673,14 +687,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) } /* - * Initialize the high resolution related parts of a hrtimer - */ -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) -{ -} - - -/* * When High resolution timers are active, try to reprogram. Note, that in case * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry * check happens. The timer gets enqueued into the rbtree. The reprogramming @@ -725,8 +731,9 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; - base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -750,7 +757,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1121,6 +1127,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { struct hrtimer_cpu_base *cpu_base; + int base; memset(timer, 0, sizeof(struct hrtimer)); @@ -1129,8 +1136,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &cpu_base->clock_base[clock_id]; - hrtimer_init_timer_hres(timer); + base = hrtimer_clockid_to_base(clock_id); + timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS @@ -1165,9 +1172,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init); int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_cpu_base *cpu_base; + int base = hrtimer_clockid_to_base(which_clock); cpu_base = &__raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); + *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); return 0; } @@ -1714,6 +1722,10 @@ static struct notifier_block __cpuinitdata hrtimers_nb = { void __init hrtimers_init(void) { + hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; + hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; + hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME; + hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8e42fec7686..09bef82d74c 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,5 +1,6 @@ +# Select this to activate the generic irq options below config HAVE_GENERIC_HARDIRQS - def_bool n + bool if HAVE_GENERIC_HARDIRQS menu "IRQ subsystem" @@ -11,26 +12,44 @@ config GENERIC_HARDIRQS # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED - def_bool n + bool + +config GENERIC_HARDIRQS_NO_COMPAT + bool # Options selectable by the architecture code + +# Make sparse irq Kconfig switch below available config HAVE_SPARSE_IRQ - def_bool n + bool +# Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE - def_bool n + bool + +# Use the generic /proc/interrupts implementation +config GENERIC_IRQ_SHOW + bool +# Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ - def_bool n + bool +# Alpha specific irq affinity mechanism config AUTO_IRQ_AFFINITY - def_bool n - -config IRQ_PER_CPU - def_bool n + bool +# Tasklet based software resend for pending interrupts on enable_irq() config HARDIRQS_SW_RESEND - def_bool n + bool + +# Preflow handler support for fasteoi (sparc64) +config IRQ_PREFLOW_FASTEOI + bool + +# Support forced irq threading +config IRQ_FORCED_THREADING + bool config SPARSE_IRQ bool "Support sparse irq numbering" diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 505798f86c3..394784c5706 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -17,7 +17,7 @@ /* * Autodetection depends on the fact that any interrupt that * comes in on to an unassigned handler will get stuck with - * "IRQ_WAITING" cleared and the interrupt disabled. + * "IRQS_WAITING" cleared and the interrupt disabled. */ static DEFINE_MUTEX(probing_active); @@ -32,7 +32,6 @@ unsigned long probe_irq_on(void) { struct irq_desc *desc; unsigned long mask = 0; - unsigned int status; int i; /* @@ -46,13 +45,7 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - /* - * An old-style architecture might still have - * the handle_bad_irq handler there: - */ - compat_irq_chip_set_default_handler(desc); - + if (!desc->action && irq_settings_can_probe(desc)) { /* * Some chips need to know about probing in * progress: @@ -60,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_startup(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -75,10 +68,12 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->irq_data.chip->irq_startup(&desc->irq_data)) - desc->status |= IRQ_PENDING; + if (!desc->action && irq_settings_can_probe(desc)) { + desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; + if (irq_startup(desc)) { + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; + } } raw_spin_unlock_irq(&desc->lock); } @@ -93,13 +88,12 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { + if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. */ - if (!(status & IRQ_WAITING)) { - desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (!(desc->istate & IRQS_WAITING)) { + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); } else if (i < 32) mask |= 1 << i; @@ -125,20 +119,18 @@ EXPORT_SYMBOL(probe_irq_on); */ unsigned int probe_irq_mask(unsigned long val) { - unsigned int status, mask = 0; + unsigned int mask = 0; struct irq_desc *desc; int i; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - if (i < 16 && !(status & IRQ_WAITING)) + if (desc->istate & IRQS_AUTODETECT) { + if (i < 16 && !(desc->istate & IRQS_WAITING)) mask |= 1 << i; - desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -169,20 +161,18 @@ int probe_irq_off(unsigned long val) { int i, irq_found = 0, nr_of_irqs = 0; struct irq_desc *desc; - unsigned int status; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { - if (!(status & IRQ_WAITING)) { + if (desc->istate & IRQS_AUTODETECT) { + if (!(desc->istate & IRQS_WAITING)) { if (!nr_of_irqs) irq_found = i; nr_of_irqs++; } - desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index baa5c4acad8..c9c0601f061 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -19,140 +19,110 @@ #include "internals.h" /** - * set_irq_chip - set the irq chip for an irq + * irq_set_chip - set the irq chip for an irq * @irq: irq number * @chip: pointer to irq chip description structure */ -int set_irq_chip(unsigned int irq, struct irq_chip *chip) +int irq_set_chip(unsigned int irq, struct irq_chip *chip) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } if (!chip) chip = &no_irq_chip; - raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->irq_data.chip = chip; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } -EXPORT_SYMBOL(set_irq_chip); +EXPORT_SYMBOL(irq_set_chip); /** - * set_irq_type - set the irq trigger type for an irq + * irq_set_type - set the irq trigger type for an irq * @irq: irq number * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ -int set_irq_type(unsigned int irq, unsigned int type) +int irq_set_irq_type(unsigned int irq, unsigned int type) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - int ret = -ENXIO; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + int ret = 0; - if (!desc) { - printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); - return -ENODEV; - } + if (!desc) + return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - if (type == IRQ_TYPE_NONE) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_trigger(desc, irq, type); - raw_spin_unlock_irqrestore(&desc->lock, flags); + if (type != IRQ_TYPE_NONE) + ret = __irq_set_trigger(desc, irq, type); + irq_put_desc_busunlock(desc, flags); return ret; } -EXPORT_SYMBOL(set_irq_type); +EXPORT_SYMBOL(irq_set_irq_type); /** - * set_irq_data - set irq type data for an irq + * irq_set_handler_data - set irq handler data for an irq * @irq: Interrupt number * @data: Pointer to interrupt specific data * * Set the hardware irq controller data for an irq */ -int set_irq_data(unsigned int irq, void *data) +int irq_set_handler_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install controller data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.handler_data = data; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } -EXPORT_SYMBOL(set_irq_data); +EXPORT_SYMBOL(irq_set_handler_data); /** - * set_irq_msi - set MSI descriptor data for an irq + * irq_set_msi_desc - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * * Set the MSI descriptor entry for an irq */ -int set_irq_msi(unsigned int irq, struct msi_desc *entry) +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install msi data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.msi_desc = entry; if (entry) entry->irq = irq; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } /** - * set_irq_chip_data - set irq chip data for an irq + * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data * * Set the hardware irq chip data for an irq */ -int set_irq_chip_data(unsigned int irq, void *data) +int irq_set_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install chip data for IRQ%d\n", irq); - return -EINVAL; - } - - if (!desc->irq_data.chip) { - printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.chip_data = data; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } -EXPORT_SYMBOL(set_irq_chip_data); +EXPORT_SYMBOL(irq_set_chip_data); struct irq_data *irq_get_irq_data(unsigned int irq) { @@ -162,72 +132,75 @@ struct irq_data *irq_get_irq_data(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_get_irq_data); -/** - * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq - * - * @irq: Interrupt number - * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag - * - * The IRQ_NESTED_THREAD flag indicates that on - * request_threaded_irq() no separate interrupt thread should be - * created for the irq as the handler are called nested in the - * context of a demultiplexing interrupt handler thread. - */ -void set_irq_nested_thread(unsigned int irq, int nest) +static void irq_state_clr_disabled(struct irq_desc *desc) { - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - raw_spin_lock_irqsave(&desc->lock, flags); - if (nest) - desc->status |= IRQ_NESTED_THREAD; - else - desc->status &= ~IRQ_NESTED_THREAD; - raw_spin_unlock_irqrestore(&desc->lock, flags); + desc->istate &= ~IRQS_DISABLED; + irq_compat_clr_disabled(desc); } -EXPORT_SYMBOL_GPL(set_irq_nested_thread); -/* - * default enable function - */ -static void default_enable(struct irq_data *data) +static void irq_state_set_disabled(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); + desc->istate |= IRQS_DISABLED; + irq_compat_set_disabled(desc); +} - desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; +static void irq_state_clr_masked(struct irq_desc *desc) +{ + desc->istate &= ~IRQS_MASKED; + irq_compat_clr_masked(desc); } -/* - * default disable function - */ -static void default_disable(struct irq_data *data) +static void irq_state_set_masked(struct irq_desc *desc) { + desc->istate |= IRQS_MASKED; + irq_compat_set_masked(desc); } -/* - * default startup function - */ -static unsigned int default_startup(struct irq_data *data) +int irq_startup(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); + irq_state_clr_disabled(desc); + desc->depth = 0; + + if (desc->irq_data.chip->irq_startup) { + int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_state_clr_masked(desc); + return ret; + } - desc->irq_data.chip->irq_enable(data); + irq_enable(desc); return 0; } -/* - * default shutdown function - */ -static void default_shutdown(struct irq_data *data) +void irq_shutdown(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); + irq_state_set_disabled(desc); + desc->depth = 1; + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (desc->irq_data.chip->irq_disable) + desc->irq_data.chip->irq_disable(&desc->irq_data); + else + desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_state_set_masked(desc); +} - desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; +void irq_enable(struct irq_desc *desc) +{ + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) + desc->irq_data.chip->irq_enable(&desc->irq_data); + else + desc->irq_data.chip->irq_unmask(&desc->irq_data); + irq_state_clr_masked(desc); +} + +void irq_disable(struct irq_desc *desc) +{ + irq_state_set_disabled(desc); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); + } } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -315,10 +288,6 @@ static void compat_bus_sync_unlock(struct irq_data *data) void irq_chip_set_defaults(struct irq_chip *chip) { #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - /* - * Compat fixup functions need to be before we set the - * defaults for enable/disable/startup/shutdown - */ if (chip->enable) chip->irq_enable = compat_irq_enable; if (chip->disable) @@ -327,33 +296,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_shutdown = compat_irq_shutdown; if (chip->startup) chip->irq_startup = compat_irq_startup; -#endif - /* - * The real defaults - */ - if (!chip->irq_enable) - chip->irq_enable = default_enable; - if (!chip->irq_disable) - chip->irq_disable = default_disable; - if (!chip->irq_startup) - chip->irq_startup = default_startup; - /* - * We use chip->irq_disable, when the user provided its own. When - * we have default_disable set for chip->irq_disable, then we need - * to use default_shutdown, otherwise the irq line is not - * disabled on free_irq(): - */ - if (!chip->irq_shutdown) - chip->irq_shutdown = chip->irq_disable != default_disable ? - chip->irq_disable : default_shutdown; - -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED if (!chip->end) chip->end = dummy_irq_chip.end; - - /* - * Now fix up the remaining compat handlers - */ if (chip->bus_lock) chip->irq_bus_lock = compat_bus_lock; if (chip->bus_sync_unlock) @@ -388,22 +332,22 @@ static inline void mask_ack_irq(struct irq_desc *desc) if (desc->irq_data.chip->irq_ack) desc->irq_data.chip->irq_ack(&desc->irq_data); } - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } -static inline void mask_irq(struct irq_desc *desc) +void mask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } } -static inline void unmask_irq(struct irq_desc *desc) +void unmask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); } } @@ -428,10 +372,11 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!action || (desc->istate & IRQS_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; + irq_compat_set_progress(desc); + desc->istate |= IRQS_INPROGRESS; raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -439,13 +384,21 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + desc->istate &= ~IRQS_INPROGRESS; + irq_compat_clr_progress(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); } EXPORT_SYMBOL_GPL(handle_nested_irq); +static bool irq_check_poll(struct irq_desc *desc) +{ + if (!(desc->istate & IRQS_POLL_INPROGRESS)) + return false; + return irq_wait_for_poll(desc); +} + /** * handle_simple_irq - Simple and software-decoded IRQs. * @irq: the interrupt number @@ -461,29 +414,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq); void handle_simple_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (!irq_check_poll(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); + handle_irq_event(desc); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } @@ -501,42 +445,42 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (!irq_check_poll(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available * keep it masked and get out of here */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + handle_irq_event(desc); - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; - - if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI +static inline void preflow_handler(struct irq_desc *desc) +{ + if (desc->preflow_handler) + desc->preflow_handler(&desc->irq_data); +} +#else +static inline void preflow_handler(struct irq_desc *desc) { } +#endif + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -550,42 +494,41 @@ EXPORT_SYMBOL_GPL(handle_level_irq); void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out; + if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (!irq_check_poll(desc)) + goto out; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available * then mask it and get out of here: */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { - desc->status |= IRQ_PENDING; + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } - desc->status |= IRQ_INPROGRESS; - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + preflow_handler(desc); + handle_irq_event(desc); - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; -out: +out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); - +out_unlock: raw_spin_unlock(&desc->lock); + return; +out: + if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) + goto out_eoi; + goto out_unlock; } /** @@ -609,32 +552,28 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If we're currently running this IRQ, or its disabled, * we shouldn't process the IRQ. Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || - !desc->action)) { - desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc); - goto out_unlock; + if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || + !desc->action))) { + if (!irq_check_poll(desc)) { + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; + mask_ack_irq(desc); + goto out_unlock; + } } kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); - /* Mark the IRQ currently in progress.*/ - desc->status |= IRQ_INPROGRESS; - do { - struct irqaction *action = desc->action; - irqreturn_t action_ret; - - if (unlikely(!action)) { + if (unlikely(!desc->action)) { mask_irq(desc); goto out_unlock; } @@ -644,22 +583,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * one, we could have masked the irq. * Renable it, if it was not disabled in meantime. */ - if (unlikely((desc->status & - (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == - (IRQ_PENDING | IRQ_MASKED))) { - unmask_irq(desc); + if (unlikely(desc->istate & IRQS_PENDING)) { + if (!(desc->istate & IRQS_DISABLED) && + (desc->istate & IRQS_MASKED)) + unmask_irq(desc); } - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - raw_spin_lock(&desc->lock); + handle_irq_event(desc); - } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); + } while ((desc->istate & IRQS_PENDING) && + !(desc->istate & IRQS_DISABLED)); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } @@ -674,103 +608,84 @@ out_unlock: void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { - irqreturn_t action_ret; + struct irq_chip *chip = irq_desc_get_chip(desc); kstat_incr_irqs_this_cpu(irq, desc); - if (desc->irq_data.chip->irq_ack) - desc->irq_data.chip->irq_ack(&desc->irq_data); + if (chip->irq_ack) + chip->irq_ack(&desc->irq_data); - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + handle_irq_event_percpu(desc, desc->action); - if (desc->irq_data.chip->irq_eoi) - desc->irq_data.chip->irq_eoi(&desc->irq_data); + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); } void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install type control for IRQ%d\n", irq); + if (!desc) return; - } - if (!handle) + if (!handle) { handle = handle_bad_irq; - else if (desc->irq_data.chip == &no_irq_chip) { - printk(KERN_WARNING "Trying to install %sinterrupt handler " - "for IRQ%d\n", is_chained ? "chained " : "", irq); - /* - * Some ARM implementations install a handler for really dumb - * interrupt hardware without setting an irq_chip. This worked - * with the ARM no_irq_chip but the check in setup_irq would - * prevent us to setup the interrupt at all. Switch it to - * dummy_irq_chip for easy transition. - */ - desc->irq_data.chip = &dummy_irq_chip; + } else { + if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + goto out; } - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - /* Uninstall? */ if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - desc->status |= IRQ_DISABLED; + irq_compat_set_disabled(desc); + desc->istate |= IRQS_DISABLED; desc->depth = 1; } desc->handle_irq = handle; desc->name = name; if (handle != handle_bad_irq && is_chained) { - desc->status &= ~IRQ_DISABLED; - desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; - desc->depth = 0; - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_settings_set_noprobe(desc); + irq_settings_set_norequest(desc); + irq_startup(desc); } - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); -} -EXPORT_SYMBOL_GPL(__set_irq_handler); - -void -set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle) -{ - set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0, NULL); +out: + irq_put_desc_busunlock(desc, flags); } +EXPORT_SYMBOL_GPL(__irq_set_handler); void -set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { - set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0, name); + irq_set_chip(irq, chip); + __irq_set_handler(irq, handle, 0, name); } void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); if (!desc) return; + irq_settings_clr_and_set(desc, clr, set); + + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | + IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); + if (irq_settings_has_no_balance_set(desc)) + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + if (irq_settings_is_per_cpu(desc)) + irqd_set(&desc->irq_data, IRQD_PER_CPU); + if (irq_settings_can_move_pcntxt(desc)) + irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); - /* Sanitize flags */ - set &= IRQF_MODIFY_MASK; - clr &= IRQF_MODIFY_MASK; + irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~clr; - desc->status |= set; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); } diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h new file mode 100644 index 00000000000..6bbaf66aca8 --- /dev/null +++ b/kernel/irq/compat.h @@ -0,0 +1,72 @@ +/* + * Compat layer for transition period + */ +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +static inline void irq_compat_set_progress(struct irq_desc *desc) +{ + desc->status |= IRQ_INPROGRESS; +} + +static inline void irq_compat_clr_progress(struct irq_desc *desc) +{ + desc->status &= ~IRQ_INPROGRESS; +} +static inline void irq_compat_set_disabled(struct irq_desc *desc) +{ + desc->status |= IRQ_DISABLED; +} +static inline void irq_compat_clr_disabled(struct irq_desc *desc) +{ + desc->status &= ~IRQ_DISABLED; +} +static inline void irq_compat_set_pending(struct irq_desc *desc) +{ + desc->status |= IRQ_PENDING; +} + +static inline void irq_compat_clr_pending(struct irq_desc *desc) +{ + desc->status &= ~IRQ_PENDING; +} +static inline void irq_compat_set_masked(struct irq_desc *desc) +{ + desc->status |= IRQ_MASKED; +} + +static inline void irq_compat_clr_masked(struct irq_desc *desc) +{ + desc->status &= ~IRQ_MASKED; +} +static inline void irq_compat_set_move_pending(struct irq_desc *desc) +{ + desc->status |= IRQ_MOVE_PENDING; +} + +static inline void irq_compat_clr_move_pending(struct irq_desc *desc) +{ + desc->status &= ~IRQ_MOVE_PENDING; +} +static inline void irq_compat_set_affinity(struct irq_desc *desc) +{ + desc->status |= IRQ_AFFINITY_SET; +} + +static inline void irq_compat_clr_affinity(struct irq_desc *desc) +{ + desc->status &= ~IRQ_AFFINITY_SET; +} +#else +static inline void irq_compat_set_progress(struct irq_desc *desc) { } +static inline void irq_compat_clr_progress(struct irq_desc *desc) { } +static inline void irq_compat_set_disabled(struct irq_desc *desc) { } +static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } +static inline void irq_compat_set_pending(struct irq_desc *desc) { } +static inline void irq_compat_clr_pending(struct irq_desc *desc) { } +static inline void irq_compat_set_masked(struct irq_desc *desc) { } +static inline void irq_compat_clr_masked(struct irq_desc *desc) { } +static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } +static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } +static inline void irq_compat_set_affinity(struct irq_desc *desc) { } +static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } +#endif + diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h new file mode 100644 index 00000000000..d1a33b7fa61 --- /dev/null +++ b/kernel/irq/debug.h @@ -0,0 +1,40 @@ +/* + * Debugging printout: + */ + +#include <linux/kallsyms.h> + +#define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) + +static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", + irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); + printk("->handle_irq(): %p, ", desc->handle_irq); + print_symbol("%s\n", (unsigned long)desc->handle_irq); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->action(): %p\n", desc->action); + if (desc->action) { + printk("->action->handler(): %p, ", desc->action->handler); + print_symbol("%s\n", (unsigned long)desc->action->handler); + } + + P(IRQ_LEVEL); + P(IRQ_PER_CPU); + P(IRQ_NOPROBE); + P(IRQ_NOREQUEST); + P(IRQ_NOAUTOEN); + + PS(IRQS_AUTODETECT); + PS(IRQS_INPROGRESS); + PS(IRQS_REPLAY); + PS(IRQS_WAITING); + PS(IRQS_DISABLED); + PS(IRQS_PENDING); + PS(IRQS_MASKED); +} + +#undef P +#undef PS diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3540a719012..517561fc731 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +{ + /* + * Wake up the handler thread for this action. In case the + * thread crashed and was killed we just pretend that we + * handled the interrupt. The hardirq handler has disabled the + * device interrupt, so no irq storm is lurking. If the + * RUNTHREAD bit is already set, nothing to do. + */ + if (test_bit(IRQTF_DIED, &action->thread_flags) || + test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + return; + + /* + * It's safe to OR the mask lockless here. We have only two + * places which write to threads_oneshot: This code and the + * irq thread. + * + * This code is the hard irq context and can never run on two + * cpus in parallel. If it ever does we have more serious + * problems than this bitmask. + * + * The irq threads of this irq which clear their "running" bit + * in threads_oneshot are serialized via desc->lock against + * each other and they are serialized against this code by + * IRQS_INPROGRESS. + * + * Hard irq handler: + * + * spin_lock(desc->lock); + * desc->state |= IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + * desc->threads_oneshot |= mask; + * spin_lock(desc->lock); + * desc->state &= ~IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * + * irq thread: + * + * again: + * spin_lock(desc->lock); + * if (desc->state & IRQS_INPROGRESS) { + * spin_unlock(desc->lock); + * while(desc->state & IRQS_INPROGRESS) + * cpu_relax(); + * goto again; + * } + * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + * desc->threads_oneshot &= ~mask; + * spin_unlock(desc->lock); + * + * So either the thread waits for us to clear IRQS_INPROGRESS + * or we are waiting in the flow handler for desc->lock to be + * released before we reach this point. The thread also checks + * IRQTF_RUNTHREAD under desc->lock. If set it leaves + * threads_oneshot untouched and runs the thread another time. + */ + desc->threads_oneshot |= action->thread_mask; + wake_up_process(action->thread); +} + +irqreturn_t +handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { - irqreturn_t ret, retval = IRQ_NONE; - unsigned int status = 0; + irqreturn_t retval = IRQ_NONE; + unsigned int random = 0, irq = desc->irq_data.irq; do { + irqreturn_t res; + trace_irq_handler_entry(irq, action); - ret = action->handler(irq, action->dev_id); - trace_irq_handler_exit(irq, action, ret); + res = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, res); - switch (ret) { + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + irq, action->handler)) + local_irq_disable(); + + switch (res) { case IRQ_WAKE_THREAD: /* * Set result to handled so the spurious check * does not trigger. */ - ret = IRQ_HANDLED; + res = IRQ_HANDLED; /* * Catch drivers which return WAKE_THREAD but @@ -85,36 +147,56 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) break; } - /* - * Wake up the handler thread for this - * action. In case the thread crashed and was - * killed we just pretend that we handled the - * interrupt. The hardirq handler above has - * disabled the device interrupt, so no irq - * storm is lurking. - */ - if (likely(!test_bit(IRQTF_DIED, - &action->thread_flags))) { - set_bit(IRQTF_RUNTHREAD, &action->thread_flags); - wake_up_process(action->thread); - } + irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: - status |= action->flags; + random |= action->flags; break; default: break; } - retval |= ret; + retval |= res; action = action->next; } while (action); - if (status & IRQF_SAMPLE_RANDOM) + if (random & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); - local_irq_disable(); + if (!noirqdebug) + note_interrupt(irq, desc, retval); return retval; } + +irqreturn_t handle_irq_event(struct irq_desc *desc) +{ + struct irqaction *action = desc->action; + irqreturn_t ret; + + irq_compat_clr_pending(desc); + desc->istate &= ~IRQS_PENDING; + irq_compat_set_progress(desc); + desc->istate |= IRQS_INPROGRESS; + raw_spin_unlock(&desc->lock); + + ret = handle_irq_event_percpu(desc, action); + + raw_spin_lock(&desc->lock); + desc->istate &= ~IRQS_INPROGRESS; + irq_compat_clr_progress(desc); + return ret; +} + +/** + * handle_IRQ_event - irq action chain handler + * @irq: the interrupt number + * @action: the interrupt action chain for this irq + * + * Handles the action chain of an irq event + */ +irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +{ + return handle_irq_event_percpu(irq_to_desc(irq), action); +} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 99c3bc8a6fb..6c6ec9a4902 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,5 +1,9 @@ /* * IRQ subsystem internal functions and variables: + * + * Do not ever include this file from anything else than + * kernel/irq/. Do not even think about using any information outside + * of this file for your non core code. */ #include <linux/irqdesc.h> @@ -9,25 +13,89 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#define istate core_internal_state__do_not_mess_with_it + +#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +# define status status_use_accessors +#endif + extern int noirqdebug; +/* + * Bits used by threaded handlers: + * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run + * IRQTF_DIED - handler thread died + * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed + * IRQTF_AFFINITY - irq thread is requested to adjust affinity + * IRQTF_FORCED_THREAD - irq action is force threaded + */ +enum { + IRQTF_RUNTHREAD, + IRQTF_DIED, + IRQTF_WARNED, + IRQTF_AFFINITY, + IRQTF_FORCED_THREAD, +}; + +/* + * Bit masks for desc->state + * + * IRQS_AUTODETECT - autodetection in progress + * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt + * detection + * IRQS_POLL_INPROGRESS - polling in progress + * IRQS_INPROGRESS - Interrupt in progress + * IRQS_ONESHOT - irq is not unmasked in primary handler + * IRQS_REPLAY - irq is replayed + * IRQS_WAITING - irq is waiting + * IRQS_DISABLED - irq is disabled + * IRQS_PENDING - irq is pending and replayed later + * IRQS_MASKED - irq is masked + * IRQS_SUSPENDED - irq is suspended + */ +enum { + IRQS_AUTODETECT = 0x00000001, + IRQS_SPURIOUS_DISABLED = 0x00000002, + IRQS_POLL_INPROGRESS = 0x00000008, + IRQS_INPROGRESS = 0x00000010, + IRQS_ONESHOT = 0x00000020, + IRQS_REPLAY = 0x00000040, + IRQS_WAITING = 0x00000080, + IRQS_DISABLED = 0x00000100, + IRQS_PENDING = 0x00000200, + IRQS_MASKED = 0x00000400, + IRQS_SUSPENDED = 0x00000800, +}; + +#include "compat.h" +#include "debug.h" +#include "settings.h" + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ extern void irq_chip_set_defaults(struct irq_chip *chip); -/* Set default handler: */ -extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); +extern int irq_startup(struct irq_desc *desc); +extern void irq_shutdown(struct irq_desc *desc); +extern void irq_enable(struct irq_desc *desc); +extern void irq_disable(struct irq_desc *desc); +extern void mask_irq(struct irq_desc *desc); +extern void unmask_irq(struct irq_desc *desc); + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); +irqreturn_t handle_irq_event(struct irq_desc *desc); + /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); +bool irq_wait_for_poll(struct irq_desc *desc); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); @@ -43,20 +111,10 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif -extern int irq_select_affinity_usr(unsigned int irq); +extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static inline void irq_end(unsigned int irq, struct irq_desc *desc) -{ - if (desc->irq_data.chip && desc->irq_data.chip->end) - desc->irq_data.chip->end(irq); -} -#else -static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } -#endif - /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { @@ -70,43 +128,60 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); + +static inline struct irq_desc * +irq_get_desc_buslock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, true); +} + +static inline void +irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, true); +} + +static inline struct irq_desc * +irq_get_desc_lock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, false); +} + +static inline void +irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, false); +} + /* - * Debugging printout: + * Manipulation functions for irq_data.state */ +static inline void irqd_set_move_pending(struct irq_data *d) +{ + d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; + irq_compat_set_move_pending(irq_data_to_desc(d)); +} -#include <linux/kallsyms.h> - -#define P(f) if (desc->status & f) printk("%14s set\n", #f) +static inline void irqd_clr_move_pending(struct irq_data *d) +{ + d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; + irq_compat_clr_move_pending(irq_data_to_desc(d)); +} -static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +static inline void irqd_clear(struct irq_data *d, unsigned int mask) { - printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", - irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); - printk("->handle_irq(): %p, ", desc->handle_irq); - print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->irq_data.chip(): %p, ", desc->irq_data.chip); - print_symbol("%s\n", (unsigned long)desc->irq_data.chip); - printk("->action(): %p\n", desc->action); - if (desc->action) { - printk("->action->handler(): %p, ", desc->action->handler); - print_symbol("%s\n", (unsigned long)desc->action->handler); - } - - P(IRQ_INPROGRESS); - P(IRQ_DISABLED); - P(IRQ_PENDING); - P(IRQ_REPLAY); - P(IRQ_AUTODETECT); - P(IRQ_WAITING); - P(IRQ_LEVEL); - P(IRQ_MASKED); -#ifdef CONFIG_IRQ_PER_CPU - P(IRQ_PER_CPU); -#endif - P(IRQ_NOPROBE); - P(IRQ_NOREQUEST); - P(IRQ_NOAUTOEN); + d->state_use_accessors &= ~mask; } -#undef P +static inline void irqd_set(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors |= mask; +} +static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) +{ + return d->state_use_accessors & mask; +} diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2039bea31bd..dbccc799407 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.chip_data = NULL; desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; - desc->status = IRQ_DEFAULT_INIT_FLAGS; + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); + desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -206,6 +207,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) return NULL; } +static int irq_expand_nr_irqs(unsigned int nr) +{ + if (nr > IRQ_BITMAP_BITS) + return -ENOMEM; + nr_irqs = nr; + return 0; +} + int __init early_irq_init(void) { int i, initcnt, node = first_online_node; @@ -238,7 +247,7 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .status = IRQ_DEFAULT_INIT_FLAGS, + .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), @@ -260,8 +269,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - /* TODO : do this allocation on-demand ... */ desc[i].kstat_irqs = alloc_percpu(unsigned int); + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -286,24 +295,14 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { -#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) - struct irq_desc *desc; - unsigned int i; - - for (i = 0; i < cnt; i++) { - desc = irq_to_desc(start + i); - if (desc && !desc->kstat_irqs) { - unsigned int __percpu *stats = alloc_percpu(unsigned int); - - if (!stats) - return -1; - if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) - free_percpu(stats); - } - } -#endif return start; } + +static int irq_expand_nr_irqs(unsigned int nr) +{ + return -ENOMEM; +} + #endif /* !CONFIG_SPARSE_IRQ */ /* Dynamic interrupt handling */ @@ -347,14 +346,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, + from, cnt, 0); ret = -EEXIST; if (irq >=0 && start != irq) goto err; - ret = -ENOMEM; - if (start >= nr_irqs) - goto err; + if (start + cnt > nr_irqs) { + ret = irq_expand_nr_irqs(start + cnt); + if (ret) + goto err; + } bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); @@ -401,6 +403,26 @@ unsigned int irq_get_next_irq(unsigned int offset) return find_next_bit(allocated_irqs, nr_irqs, offset); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + if (bus) + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, *flags); + } + return desc; +} + +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) +{ + raw_spin_unlock_irqrestore(&desc->lock, flags); + if (bus) + chip_bus_sync_unlock(desc); +} + /** * dynamic_irq_cleanup - cleanup a dynamically allocated irq * @irq: irq number to initialize diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9033c1c7082..acd599a43bf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,17 @@ #include "internals.h" +#ifdef CONFIG_IRQ_FORCED_THREADING +__read_mostly bool force_irqthreads; + +static int __init setup_forced_irqthreads(char *arg) +{ + force_irqthreads = true; + return 0; +} +early_param("threadirqs", setup_forced_irqthreads); +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -30,7 +41,7 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int status; + unsigned int state; if (!desc) return; @@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->status & IRQ_INPROGRESS) + while (desc->istate & IRQS_INPROGRESS) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); - status = desc->status; + state = desc->istate; raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ - } while (status & IRQ_INPROGRESS); + } while (state & IRQS_INPROGRESS); /* * We made sure that no hardirq handler is running. Now verify @@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || - !desc->irq_data.chip->irq_set_affinity) + if (!desc || !irqd_can_balance(&desc->irq_data) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return 0; return 1; @@ -100,67 +111,169 @@ void irq_set_thread_affinity(struct irq_desc *desc) } } +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline bool irq_can_move_pcntxt(struct irq_desc *desc) +{ + return irq_settings_can_move_pcntxt(desc); +} +static inline bool irq_move_pending(struct irq_desc *desc) +{ + return irqd_is_setaffinity_pending(&desc->irq_data); +} +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) +{ + cpumask_copy(desc->pending_mask, mask); +} +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) +{ + cpumask_copy(mask, desc->pending_mask); +} +#else +static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } +static inline bool irq_move_pending(struct irq_desc *desc) { return false; } +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } +#endif + /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * */ -int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_chip *chip = desc->irq_data.chip; unsigned long flags; + int ret = 0; if (!chip->irq_set_affinity) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) { - if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { - cpumask_copy(desc->irq_data.affinity, cpumask); + if (irq_can_move_pcntxt(desc)) { + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); + ret = 0; } + } else { + irqd_set_move_pending(&desc->irq_data); + irq_copy_pending(desc, mask); } - else { - desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(desc->pending_mask, cpumask); - } -#else - if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { - cpumask_copy(desc->irq_data.affinity, cpumask); - irq_set_thread_affinity(desc); + + if (desc->affinity_notify) { + kref_get(&desc->affinity_notify->kref); + schedule_work(&desc->affinity_notify->work); } -#endif - desc->status |= IRQ_AFFINITY_SET; + irq_compat_set_affinity(desc); + irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); raw_spin_unlock_irqrestore(&desc->lock, flags); - return 0; + return ret; } int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + + if (!desc) + return -EINVAL; + desc->affinity_hint = m; + irq_put_desc_unlock(desc, flags); + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_hint); + +static void irq_affinity_notify(struct work_struct *work) +{ + struct irq_affinity_notify *notify = + container_of(work, struct irq_affinity_notify, work); + struct irq_desc *desc = irq_to_desc(notify->irq); + cpumask_var_t cpumask; + unsigned long flags; + + if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); + if (irq_move_pending(desc)) + irq_get_pending(cpumask, desc); + else + cpumask_copy(cpumask, desc->irq_data.affinity); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + notify->notify(notify, cpumask); + + free_cpumask_var(cpumask); +out: + kref_put(¬ify->kref, notify->release); +} + +/** + * irq_set_affinity_notifier - control notification of IRQ affinity changes + * @irq: Interrupt for which to enable/disable notification + * @notify: Context for notification, or %NULL to disable + * notification. Function pointers must be initialised; + * the other fields will be initialised by this function. + * + * Must be called in process context. Notification may only be enabled + * after the IRQ is allocated and must be disabled before the IRQ is + * freed using free_irq(). + */ +int +irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) +{ struct irq_desc *desc = irq_to_desc(irq); + struct irq_affinity_notify *old_notify; unsigned long flags; + /* The release function is promised process context */ + might_sleep(); + if (!desc) return -EINVAL; + /* Complete initialisation of *notify */ + if (notify) { + notify->irq = irq; + kref_init(¬ify->kref); + INIT_WORK(¬ify->work, irq_affinity_notify); + } + raw_spin_lock_irqsave(&desc->lock, flags); - desc->affinity_hint = m; + old_notify = desc->affinity_notify; + desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); + if (old_notify) + kref_put(&old_notify->kref, old_notify->release); + return 0; } -EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. */ -static int setup_affinity(unsigned int irq, struct irq_desc *desc) +static int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { + struct irq_chip *chip = irq_desc_get_chip(desc); + struct cpumask *set = irq_default_affinity; + int ret; + + /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) return 0; @@ -168,22 +281,29 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ - if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) - < nr_cpu_ids) - goto set_affinity; - else - desc->status &= ~IRQ_AFFINITY_SET; + if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { + if (cpumask_intersects(desc->irq_data.affinity, + cpu_online_mask)) + set = desc->irq_data.affinity; + else { + irq_compat_clr_affinity(desc); + irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); + } } - cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); -set_affinity: - desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); - + cpumask_and(mask, cpu_online_mask, set); + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + } return 0; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *d) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) { return irq_select_affinity(irq); } @@ -192,23 +312,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d) /* * Called when affinity is set via /proc/irq */ -int irq_select_affinity_usr(unsigned int irq) +int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(irq, desc); - if (!ret) - irq_set_thread_affinity(desc); + ret = setup_affinity(irq, desc, mask); raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { return 0; } @@ -219,13 +337,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (suspend) { if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; - desc->status |= IRQ_SUSPENDED; + desc->istate |= IRQS_SUSPENDED; } - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; - desc->irq_data.chip->irq_disable(&desc->irq_data); - } + if (!desc->depth++) + irq_disable(desc); +} + +static int __disable_irq_nosync(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + + if (!desc) + return -EINVAL; + __disable_irq(desc, irq, false); + irq_put_desc_busunlock(desc, flags); + return 0; } /** @@ -241,17 +369,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - __disable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); @@ -269,21 +387,24 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc) - return; - - disable_irq_nosync(irq); - if (desc->action) + if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { - if (resume) - desc->status &= ~IRQ_SUSPENDED; + if (resume) { + if (!(desc->istate & IRQS_SUSPENDED)) { + if (!desc->action) + return; + if (!(desc->action->flags & IRQF_FORCE_RESUME)) + return; + /* Pretend that it got disabled ! */ + desc->depth++; + } + desc->istate &= ~IRQS_SUSPENDED; + } switch (desc->depth) { case 0: @@ -291,12 +412,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ - desc->status = status | IRQ_NOPROBE; + irq_settings_set_noprobe(desc); + irq_enable(desc); check_irq_resend(desc, irq); /* fall-through */ } @@ -318,21 +438,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); if (!desc) return; + if (WARN(!desc->irq_data.chip, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + goto out; - if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, - KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) - return; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); +out: + irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL(enable_irq); @@ -348,7 +465,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) } /** - * set_irq_wake - control irq power management wakeup + * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * @@ -359,23 +476,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) * Wakeup mode lets this IRQ wake the system from sleep * states like "suspend to RAM". */ -int set_irq_wake(unsigned int irq, unsigned int on) +int irq_set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); int ret = 0; /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ - raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 0; else - desc->status |= IRQ_WAKEUP; + irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { @@ -385,14 +501,13 @@ int set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 1; else - desc->status &= ~IRQ_WAKEUP; + irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } - - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_busunlock(desc, flags); return ret; } -EXPORT_SYMBOL(set_irq_wake); +EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a @@ -401,43 +516,27 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + int canrequest = 0; if (!desc) return 0; - if (desc->status & IRQ_NOREQUEST) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - if (action) - if (irqflags & action->flags & IRQF_SHARED) - action = NULL; - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return !action; -} - -void compat_irq_chip_set_default_handler(struct irq_desc *desc) -{ - /* - * If the architecture still has not overriden - * the flow handler then zap the default. This - * should catch incorrect flow-type setting. - */ - if (desc->handle_irq == &handle_bad_irq) - desc->handle_irq = NULL; + if (irq_settings_can_request(desc)) { + if (desc->action) + if (irqflags & desc->action->flags & IRQF_SHARED) + canrequest =1; + } + irq_put_desc_unlock(desc, flags); + return canrequest; } int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { - int ret; struct irq_chip *chip = desc->irq_data.chip; + int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* @@ -449,23 +548,43 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return 0; } + flags &= IRQ_TYPE_SENSE_MASK; + + if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { + if (!(desc->istate & IRQS_MASKED)) + mask_irq(desc); + if (!(desc->istate & IRQS_DISABLED)) + unmask = 1; + } + /* caller masked out all except trigger mode flags */ ret = chip->irq_set_type(&desc->irq_data, flags); - if (ret) - pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", - flags, irq, chip->irq_set_type); - else { - if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) - flags |= IRQ_LEVEL; - /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ - desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); - desc->status |= flags; + switch (ret) { + case IRQ_SET_MASK_OK: + irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); + irqd_set(&desc->irq_data, flags); + + case IRQ_SET_MASK_OK_NOCOPY: + flags = irqd_get_trigger_type(&desc->irq_data); + irq_settings_set_trigger_mask(desc, flags); + irqd_clear(&desc->irq_data, IRQD_LEVEL); + irq_settings_clr_level(desc); + if (flags & IRQ_TYPE_LEVEL_MASK) { + irq_settings_set_level(desc); + irqd_set(&desc->irq_data, IRQD_LEVEL); + } if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); + ret = 0; + break; + default: + pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + flags, irq, chip->irq_set_type); } - + if (unmask) + unmask_irq(desc); return ret; } @@ -509,8 +628,11 @@ static int irq_wait_for_interrupt(struct irqaction *action) * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ -static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) +static void irq_finalize_oneshot(struct irq_desc *desc, + struct irqaction *action, bool force) { + if (!(desc->istate & IRQS_ONESHOT)) + return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); @@ -522,26 +644,44 @@ again: * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due - * to IRQ_INPROGRESS and the irq line is masked forever. + * to IRQS_INPROGRESS and the irq line is masked forever. + * + * This also serializes the state of shared oneshot handlers + * versus "desc->threads_onehsot |= action->thread_mask;" in + * irq_wake_thread(). See the comment there which explains the + * serialization. */ - if (unlikely(desc->status & IRQ_INPROGRESS)) { + if (unlikely(desc->istate & IRQS_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); goto again; } - if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { - desc->status &= ~IRQ_MASKED; + /* + * Now check again, whether the thread should run. Otherwise + * we would clear the threads_oneshot bit of this thread which + * was just set. + */ + if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + goto out_unlock; + + desc->threads_oneshot &= ~action->thread_mask; + + if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && + (desc->istate & IRQS_MASKED)) { + irq_compat_clr_masked(desc); + desc->istate &= ~IRQS_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } +out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } #ifdef CONFIG_SMP /* - * Check whether we need to change the affinity of the interrupt thread. + * Check whether we need to chasnge the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) @@ -573,6 +713,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif /* + * Interrupts which are not explicitely requested as threaded + * interrupts rely on the implicit bh/preempt disable of the hard irq + * context. So we need to disable bh here to avoid deadlocks and other + * side effects. + */ +static void +irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + local_bh_disable(); + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); + local_bh_enable(); +} + +/* + * Interrupts explicitely requested as threaded interupts want to be + * preemtible - many of them need to sleep and wait for slow busses to + * complete. + */ +static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); +} + +/* * Interrupt handler thread */ static int irq_thread(void *data) @@ -582,7 +748,14 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake, oneshot = desc->status & IRQ_ONESHOT; + void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + int wake; + + if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; + else + handler_fn = irq_thread_fn; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -594,23 +767,20 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->status & IRQ_DISABLED)) { + if (unlikely(desc->istate & IRQS_DISABLED)) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which * retriggers the thread in check_irq_resend() - * but AFAICT IRQ_PENDING should be fine as it + * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { raw_spin_unlock_irq(&desc->lock); - - action->thread_fn(action->irq, action->dev_id); - - if (oneshot) - irq_finalize_oneshot(action->irq, desc); + handler_fn(desc, action); } wake = atomic_dec_and_test(&desc->threads_active); @@ -619,6 +789,9 @@ static int irq_thread(void *data) wake_up(&desc->wait_for_threads); } + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action, true); + /* * Clear irqaction. Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. @@ -633,6 +806,7 @@ static int irq_thread(void *data) void exit_irq_thread(void) { struct task_struct *tsk = current; + struct irq_desc *desc; if (!tsk->irqaction) return; @@ -641,6 +815,14 @@ void exit_irq_thread(void) "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + desc = irq_to_desc(tsk->irqaction->irq); + + /* + * Prevent a stale desc->threads_oneshot. Must be called + * before setting the IRQTF_DIED flag. + */ + irq_finalize_oneshot(desc, tsk->irqaction, true); + /* * Set the THREAD DIED flag to prevent further wakeups of the * soon to be gone threaded handler. @@ -648,6 +830,22 @@ void exit_irq_thread(void) set_bit(IRQTF_DIED, &tsk->irqaction->flags); } +static void irq_setup_forced_threading(struct irqaction *new) +{ + if (!force_irqthreads) + return; + if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) + return; + + new->flags |= IRQF_ONESHOT; + + if (!new->thread_fn) { + set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; + } +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -657,9 +855,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; const char *old_name = NULL; - unsigned long flags; - int nested, shared = 0; - int ret; + unsigned long flags, thread_mask = 0; + int ret, nested, shared = 0; + cpumask_var_t mask; if (!desc) return -EINVAL; @@ -683,15 +881,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } - /* Oneshot interrupts are not allowed with shared */ - if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) - return -EINVAL; - /* * Check whether the interrupt nests into another interrupt * thread. */ - nested = desc->status & IRQ_NESTED_THREAD; + nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) return -EINVAL; @@ -701,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; + } else { + irq_setup_forced_threading(new); } /* @@ -724,6 +920,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->thread = t; } + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out_thread; + } + /* * The following block of code has to be executed atomically */ @@ -735,29 +936,40 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which - * set the trigger type must match. + * set the trigger type must match. Also all must + * agree on ONESHOT. */ if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + ((old->flags ^ new->flags) & IRQF_ONESHOT)) { old_name = old->name; goto mismatch; } -#if defined(CONFIG_IRQ_PER_CPU) /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; -#endif /* add new interrupt at end of irq queue */ do { + thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } + /* + * Setup the thread mask for this irqaction. Unlikely to have + * 32 resp 64 irqs sharing one line, but who knows. + */ + if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { + ret = -EBUSY; + goto out_mask; + } + new->thread_mask = 1 << ffz(thread_mask); + if (!shared) { irq_chip_set_defaults(desc->irq_data.chip); @@ -769,42 +981,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags & IRQF_TRIGGER_MASK); if (ret) - goto out_thread; - } else - compat_irq_chip_set_default_handler(desc); -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif + goto out_mask; + } - desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | - IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ + IRQS_INPROGRESS | IRQS_ONESHOT | \ + IRQS_WAITING); + + if (new->flags & IRQF_PERCPU) { + irqd_set(&desc->irq_data, IRQD_PER_CPU); + irq_settings_set_per_cpu(desc); + } if (new->flags & IRQF_ONESHOT) - desc->status |= IRQ_ONESHOT; + desc->istate |= IRQS_ONESHOT; - if (!(desc->status & IRQ_NOAUTOEN)) { - desc->depth = 0; - desc->status &= ~IRQ_DISABLED; - desc->irq_data.chip->irq_startup(&desc->irq_data); - } else + if (irq_settings_can_autoenable(desc)) + irq_startup(desc); + else /* Undo nested disables: */ desc->depth = 1; /* Exclude IRQ from balancing if requested */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; + if (new->flags & IRQF_NOBALANCING) { + irq_settings_set_no_balancing(desc); + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + } /* Set default affinity mask once everything is setup */ - setup_affinity(irq, desc); - - } else if ((new->flags & IRQF_TRIGGER_MASK) - && (new->flags & IRQF_TRIGGER_MASK) - != (desc->status & IRQ_TYPE_SENSE_MASK)) { - /* hope the handler works with the actual trigger mode... */ - pr_warning("IRQ %d uses trigger mode %d; requested %d\n", - irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), - (int)(new->flags & IRQF_TRIGGER_MASK)); + setup_affinity(irq, desc, mask); + + } else if (new->flags & IRQF_TRIGGER_MASK) { + unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; + unsigned int omsk = irq_settings_get_trigger_mask(desc); + + if (nmsk != omsk) + /* hope the handler works with current trigger mode */ + pr_warning("IRQ %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } new->irq = irq; @@ -818,8 +1032,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. */ - if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { - desc->status &= ~IRQ_SPURIOUS_DISABLED; + if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { + desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc, irq, false); } @@ -849,6 +1063,9 @@ mismatch: #endif ret = -EBUSY; +out_mask: + free_cpumask_var(mask); + out_thread: raw_spin_unlock_irqrestore(&desc->lock, flags); if (new->thread) { @@ -871,9 +1088,14 @@ out_thread: */ int setup_irq(unsigned int irq, struct irqaction *act) { + int retval; struct irq_desc *desc = irq_to_desc(irq); - return __setup_irq(irq, desc, act); + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, act); + chip_bus_sync_unlock(desc); + + return retval; } EXPORT_SYMBOL_GPL(setup_irq); @@ -924,13 +1146,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) { - desc->status |= IRQ_DISABLED; - if (desc->irq_data.chip->irq_shutdown) - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - else - desc->irq_data.chip->irq_disable(&desc->irq_data); - } + if (!desc->action) + irq_shutdown(desc); #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ @@ -1004,6 +1221,11 @@ void free_irq(unsigned int irq, void *dev_id) if (!desc) return; +#ifdef CONFIG_SMP + if (WARN_ON(desc->affinity_notify)) + desc->affinity_notify = NULL; +#endif + chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); chip_bus_sync_unlock(desc); @@ -1074,7 +1296,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NOREQUEST) + if (!irq_settings_can_request(desc)) return -EINVAL; if (!handler) { @@ -1149,7 +1371,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NESTED_THREAD) { + if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 441fd629ff0..ec4806d4778 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,23 +4,23 @@ #include "internals.h" -void move_masked_irq(int irq) +void irq_move_masked_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_desc *desc = irq_data_to_desc(idata); + struct irq_chip *chip = idata->chip; - if (likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (CHECK_IRQ_PER_CPU(desc->status)) { + if (!irqd_can_balance(&desc->irq_data)) { WARN_ON(1); return; } - desc->status &= ~IRQ_MOVE_PENDING; + irqd_clr_move_pending(&desc->irq_data); if (unlikely(cpumask_empty(desc->pending_mask))) return; @@ -53,15 +53,20 @@ void move_masked_irq(int irq) cpumask_clear(desc->pending_mask); } -void move_native_irq(int irq) +void move_masked_irq(int irq) +{ + irq_move_masked_irq(irq_get_irq_data(irq)); +} + +void irq_move_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(idata); bool masked; - if (likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(idata))) return; - if (unlikely(desc->status & IRQ_DISABLED)) + if (unlikely(desc->istate & IRQS_DISABLED)) return; /* @@ -69,10 +74,15 @@ void move_native_irq(int irq) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. */ - masked = desc->status & IRQ_MASKED; + masked = desc->istate & IRQS_MASKED; if (!masked) - desc->irq_data.chip->irq_mask(&desc->irq_data); - move_masked_irq(irq); + idata->chip->irq_mask(idata); + irq_move_masked_irq(idata); if (!masked) - desc->irq_data.chip->irq_unmask(&desc->irq_data); + idata->chip->irq_unmask(idata); +} + +void move_native_irq(int irq) +{ + irq_move_irq(irq_get_irq_data(irq)); } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0d4005d85b0..f76fc00c987 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -18,7 +18,7 @@ * During system-wide suspend or hibernation device drivers need to be prevented * from receiving interrupts and this function is provided for this purpose. * It marks all interrupt lines in use, except for the timer ones, as disabled - * and sets the IRQ_SUSPENDED flag for each of them. + * and sets the IRQS_SUSPENDED flag for each of them. */ void suspend_device_irqs(void) { @@ -34,7 +34,7 @@ void suspend_device_irqs(void) } for_each_irq_desc(irq, desc) - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) synchronize_irq(irq); } EXPORT_SYMBOL_GPL(suspend_device_irqs); @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all interrupt lines previously disabled by suspend_device_irqs() that - * have the IRQ_SUSPENDED flag set. + * have the IRQS_SUSPENDED flag set. */ void resume_device_irqs(void) { @@ -53,9 +53,6 @@ void resume_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - if (!(desc->status & IRQ_SUSPENDED)) - continue; - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -71,9 +68,24 @@ int check_wakeup_irqs(void) struct irq_desc *desc; int irq; - for_each_irq_desc(irq, desc) - if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) - return -EBUSY; + for_each_irq_desc(irq, desc) { + if (irqd_is_wakeup_set(&desc->irq_data)) { + if (desc->istate & IRQS_PENDING) + return -EBUSY; + continue; + } + /* + * Check the non wakeup interrupts whether they need + * to be masked before finally going into suspend + * state. That's for hardware which has no wakeup + * source configuration facility. The chip + * implementation indicates that with + * IRQCHIP_MASK_ON_SUSPEND. + */ + if (desc->istate & IRQS_SUSPENDED && + irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + mask_irq(desc); + } return 0; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c8a2a9f8a7..4cc2e5ed0be 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/interrupt.h> +#include <linux/kernel_stat.h> #include "internals.h" @@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) const struct cpumask *mask = desc->irq_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PENDING) + if (irqd_is_setaffinity_pending(&desc->irq_data)) mask = desc->pending_mask; #endif seq_cpumask(m, mask); @@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || - irq_balancing_disabled(irq)) + if (!irq_can_set_affinity(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) @@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - err = irq_select_affinity_usr(irq) ? -EINVAL : count; + err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; } else { irq_set_affinity(irq, new_value); err = count; @@ -357,3 +357,65 @@ void init_irq_proc(void) } } +#ifdef CONFIG_GENERIC_IRQ_SHOW + +int __weak arch_show_interrupts(struct seq_file *p, int prec) +{ + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + static int prec; + + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + if (i == nr_irqs) + return arch_show_interrupts(p, prec); + + /* print header and calculate the width of the first column */ + if (i == 0) { + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + + seq_printf(p, "%*s", prec + 8, ""); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + if (!desc) + return 0; + + raw_spin_lock_irqsave(&desc->lock, flags); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%*d: ", prec, i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, " %8s", desc->irq_data.chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + raw_spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +#endif diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index dc49358b73f..ad683a99b1e 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -55,20 +55,19 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); */ void check_irq_resend(struct irq_desc *desc, unsigned int irq) { - unsigned int status = desc->status; - - /* - * Make sure the interrupt is enabled, before resending it: - */ - desc->irq_data.chip->irq_enable(&desc->irq_data); - /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still * active. */ - if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; + if (irq_settings_is_level(desc)) + return; + if (desc->istate & IRQS_REPLAY) + return; + if (desc->istate & IRQS_PENDING) { + irq_compat_clr_pending(desc); + desc->istate &= ~IRQS_PENDING; + desc->istate |= IRQS_REPLAY; if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h new file mode 100644 index 00000000000..0227ad35827 --- /dev/null +++ b/kernel/irq/settings.h @@ -0,0 +1,138 @@ +/* + * Internal header to deal with irq_desc->status which will be renamed + * to irq_desc->settings. + */ +enum { + _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, + _IRQ_PER_CPU = IRQ_PER_CPU, + _IRQ_LEVEL = IRQ_LEVEL, + _IRQ_NOPROBE = IRQ_NOPROBE, + _IRQ_NOREQUEST = IRQ_NOREQUEST, + _IRQ_NOAUTOEN = IRQ_NOAUTOEN, + _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, + _IRQ_NO_BALANCING = IRQ_NO_BALANCING, + _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, + _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, +}; + +#define IRQ_INPROGRESS GOT_YOU_MORON +#define IRQ_REPLAY GOT_YOU_MORON +#define IRQ_WAITING GOT_YOU_MORON +#define IRQ_DISABLED GOT_YOU_MORON +#define IRQ_PENDING GOT_YOU_MORON +#define IRQ_MASKED GOT_YOU_MORON +#define IRQ_WAKEUP GOT_YOU_MORON +#define IRQ_MOVE_PENDING GOT_YOU_MORON +#define IRQ_PER_CPU GOT_YOU_MORON +#define IRQ_NO_BALANCING GOT_YOU_MORON +#define IRQ_AFFINITY_SET GOT_YOU_MORON +#define IRQ_LEVEL GOT_YOU_MORON +#define IRQ_NOPROBE GOT_YOU_MORON +#define IRQ_NOREQUEST GOT_YOU_MORON +#define IRQ_NOAUTOEN GOT_YOU_MORON +#define IRQ_NESTED_THREAD GOT_YOU_MORON +#undef IRQF_MODIFY_MASK +#define IRQF_MODIFY_MASK GOT_YOU_MORON + +static inline void +irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) +{ + desc->status &= ~(clr & _IRQF_MODIFY_MASK); + desc->status |= (set & _IRQF_MODIFY_MASK); +} + +static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) +{ + return desc->status & _IRQ_PER_CPU; +} + +static inline void irq_settings_set_per_cpu(struct irq_desc *desc) +{ + desc->status |= _IRQ_PER_CPU; +} + +static inline void irq_settings_set_no_balancing(struct irq_desc *desc) +{ + desc->status |= _IRQ_NO_BALANCING; +} + +static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) +{ + return desc->status & _IRQ_NO_BALANCING; +} + +static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) +{ + return desc->status & IRQ_TYPE_SENSE_MASK; +} + +static inline void +irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) +{ + desc->status &= ~IRQ_TYPE_SENSE_MASK; + desc->status |= mask & IRQ_TYPE_SENSE_MASK; +} + +static inline bool irq_settings_is_level(struct irq_desc *desc) +{ + return desc->status & _IRQ_LEVEL; +} + +static inline void irq_settings_clr_level(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_LEVEL; +} + +static inline void irq_settings_set_level(struct irq_desc *desc) +{ + desc->status |= _IRQ_LEVEL; +} + +static inline bool irq_settings_can_request(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOREQUEST); +} + +static inline void irq_settings_clr_norequest(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_NOREQUEST; +} + +static inline void irq_settings_set_norequest(struct irq_desc *desc) +{ + desc->status |= _IRQ_NOREQUEST; +} + +static inline bool irq_settings_can_probe(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOPROBE); +} + +static inline void irq_settings_clr_noprobe(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_NOPROBE; +} + +static inline void irq_settings_set_noprobe(struct irq_desc *desc) +{ + desc->status |= _IRQ_NOPROBE; +} + +static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) +{ + return desc->status & _IRQ_MOVE_PCNTXT; +} + +static inline bool irq_settings_can_autoenable(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOAUTOEN); +} + +static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) +{ + return desc->status & _IRQ_NESTED_THREAD; +} + +/* Nothing should touch desc->status from now on */ +#undef status +#define status USE_THE_PROPER_WRAPPERS_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3089d3b9d5f..dd586ebf9c8 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -21,70 +21,94 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(unsigned long dummy); static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +static int irq_poll_cpu; +static atomic_t irq_poll_active; + +/* + * We wait here for a poller to finish. + * + * If the poll runs on this CPU, then we yell loudly and return + * false. That will leave the interrupt line disabled in the worst + * case, but it should never happen. + * + * We wait until the poller is done and then recheck disabled and + * action (about to be disabled). Only if it's still active, we return + * true and let the handler run. + */ +bool irq_wait_for_poll(struct irq_desc *desc) +{ + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), + "irq poll in progress on cpu %d for irq %d\n", + smp_processor_id(), desc->irq_data.irq)) + return false; + +#ifdef CONFIG_SMP + do { + raw_spin_unlock(&desc->lock); + while (desc->istate & IRQS_INPROGRESS) + cpu_relax(); + raw_spin_lock(&desc->lock); + } while (desc->istate & IRQS_INPROGRESS); + /* Might have been disabled in meantime */ + return !(desc->istate & IRQS_DISABLED) && desc->action; +#else + return false; +#endif +} + /* * Recovery handler for misrouted interrupts. */ -static int try_one_irq(int irq, struct irq_desc *desc) +static int try_one_irq(int irq, struct irq_desc *desc, bool force) { + irqreturn_t ret = IRQ_NONE; struct irqaction *action; - int ok = 0, work = 0; raw_spin_lock(&desc->lock); - /* Already running on another processor */ - if (desc->status & IRQ_INPROGRESS) { - /* - * Already running: If it is shared get the other - * CPU to go looking for our mystery interrupt too - */ - if (desc->action && (desc->action->flags & IRQF_SHARED)) - desc->status |= IRQ_PENDING; - raw_spin_unlock(&desc->lock); - return ok; - } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; - action = desc->action; - raw_spin_unlock(&desc->lock); - while (action) { - /* Only shared IRQ handlers are safe to call */ - if (action->flags & IRQF_SHARED) { - if (action->handler(irq, action->dev_id) == - IRQ_HANDLED) - ok = 1; - } - action = action->next; - } - local_irq_disable(); - /* Now clean up the flags */ - raw_spin_lock(&desc->lock); - action = desc->action; + /* PER_CPU and nested thread interrupts are never polled */ + if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) + goto out; /* - * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk: + * Do not poll disabled interrupts unless the spurious + * disabled poller asks explicitely. */ - while ((desc->status & IRQ_PENDING) && action) { + if ((desc->istate & IRQS_DISABLED) && !force) + goto out; + + /* + * All handlers must agree on IRQF_SHARED, so we test just the + * first. Check for action->next as well. + */ + action = desc->action; + if (!action || !(action->flags & IRQF_SHARED) || + (action->flags & __IRQF_TIMER) || !action->next) + goto out; + + /* Already running on another processor */ + if (desc->istate & IRQS_INPROGRESS) { /* - * Perform real IRQ processing for the IRQ we deferred + * Already running: If it is shared get the other + * CPU to go looking for our mystery interrupt too */ - work = 1; - raw_spin_unlock(&desc->lock); - handle_IRQ_event(irq, action); - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; + goto out; } - desc->status &= ~IRQ_INPROGRESS; - /* - * If we did actual work for the real IRQ line we must let the - * IRQ controller clean up too - */ - if (work) - irq_end(irq, desc); - raw_spin_unlock(&desc->lock); - return ok; + /* Mark it poll in progress */ + desc->istate |= IRQS_POLL_INPROGRESS; + do { + if (handle_irq_event(desc) == IRQ_HANDLED) + ret = IRQ_HANDLED; + action = desc->action; + } while ((desc->istate & IRQS_PENDING) && action); + desc->istate &= ~IRQS_POLL_INPROGRESS; +out: + raw_spin_unlock(&desc->lock); + return ret == IRQ_HANDLED; } static int misrouted_irq(int irq) @@ -92,6 +116,11 @@ static int misrouted_irq(int irq) struct irq_desc *desc; int i, ok = 0; + if (atomic_inc_return(&irq_poll_active) == 1) + goto out; + + irq_poll_cpu = smp_processor_id(); + for_each_irq_desc(i, desc) { if (!i) continue; @@ -99,9 +128,11 @@ static int misrouted_irq(int irq) if (i == irq) /* Already tried */ continue; - if (try_one_irq(i, desc)) + if (try_one_irq(i, desc, false)) ok = 1; } +out: + atomic_dec(&irq_poll_active); /* So the caller can adjust the irq error counts */ return ok; } @@ -111,23 +142,28 @@ static void poll_spurious_irqs(unsigned long dummy) struct irq_desc *desc; int i; + if (atomic_inc_return(&irq_poll_active) != 1) + goto out; + irq_poll_cpu = smp_processor_id(); + for_each_irq_desc(i, desc) { - unsigned int status; + unsigned int state; if (!i) continue; /* Racy but it doesn't matter */ - status = desc->status; + state = desc->istate; barrier(); - if (!(status & IRQ_SPURIOUS_DISABLED)) + if (!(state & IRQS_SPURIOUS_DISABLED)) continue; local_irq_disable(); - try_one_irq(i, desc); + try_one_irq(i, desc, true); local_irq_enable(); } - +out: + atomic_dec(&irq_poll_active); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } @@ -139,15 +175,13 @@ static void poll_spurious_irqs(unsigned long dummy) * * (The other 100-of-100,000 interrupts may have been a correctly * functioning device sharing an IRQ with the failing one) - * - * Called under desc->lock */ - static void __report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { struct irqaction *action; + unsigned long flags; if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { printk(KERN_ERR "irq event %d: bogus return value %x\n", @@ -159,6 +193,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, dump_stack(); printk(KERN_ERR "handlers:\n"); + /* + * We need to take desc->lock here. note_interrupt() is called + * w/o desc->lock held, but IRQ_PROGRESS set. We might race + * with something else removing an action. It's ok to take + * desc->lock here. See synchronize_irq(). + */ + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; while (action) { printk(KERN_ERR "[<%p>]", action->handler); @@ -167,6 +208,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, printk("\n"); action = action->next; } + raw_spin_unlock_irqrestore(&desc->lock, flags); } static void @@ -218,6 +260,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { + if (desc->istate & IRQS_POLL_INPROGRESS) + return; + if (unlikely(action_ret != IRQ_HANDLED)) { /* * If we are seeing only the odd spurious IRQ caused by @@ -254,9 +299,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; + desc->istate |= IRQS_SPURIOUS_DISABLED; desc->depth++; - desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_disable(desc); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 05bb7173850..67fea9d25d5 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p) return p->utime; } -int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { int error = check_clock(which_clock); if (!error) { @@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) return error; } -int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +static int +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) { /* * You can never reset a CPU clock, but we check for other errors @@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, } -int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; @@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) * This is called from sys_timer_create() and do_cpu_nanosleep() with the * new timer already all-zeros initialized. */ -int posix_cpu_timer_create(struct k_itimer *new_timer) +static int posix_cpu_timer_create(struct k_itimer *new_timer) { int ret = 0; const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); @@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -int posix_cpu_timer_del(struct k_itimer *timer) +static int posix_cpu_timer_del(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; int ret = 0; @@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock, * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *new, struct itimerspec *old) +static int posix_cpu_timer_set(struct k_itimer *timer, int flags, + struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; union cpu_time_count old_expires, new_expires, old_incr, val; @@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, return ret; } -void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { union cpu_time_count now; struct task_struct *p = timer->it.cpu.task; @@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, return error; } -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = - ¤t_thread_info()->restart_block; + ¤t_thread_info()->restart_block; struct itimerspec it; int error; @@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, if (error == -ERESTART_RESTARTBLOCK) { - if (flags & TIMER_ABSTIME) + if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = rqtp->tv_sec; - restart_block->arg3 = rqtp->tv_nsec; + restart_block->nanosleep.index = which_clock; + restart_block->nanosleep.rmtp = rmtp; + restart_block->nanosleep.expires = timespec_to_ns(rqtp); } return error; } -long posix_cpu_nsleep_restart(struct restart_block *restart_block) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; - struct timespec __user *rmtp; + clockid_t which_clock = restart_block->nanosleep.index; struct timespec t; struct itimerspec it; int error; - rmtp = (struct timespec __user *) restart_block->arg1; - t.tv_sec = restart_block->arg2; - t.tv_nsec = restart_block->arg3; + t = ns_to_timespec(restart_block->nanosleep.expires); - restart_block->fn = do_no_restart_syscall; error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); if (error == -ERESTART_RESTARTBLOCK) { + struct timespec __user *rmtp = restart_block->nanosleep.rmtp; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; - restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = t.tv_sec; - restart_block->arg3 = t.tv_nsec; + restart_block->nanosleep.expires = timespec_to_ns(&t); } return error; } - #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) @@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer) timer->it_clock = THREAD_CLOCK; return posix_cpu_timer_create(timer); } -static int thread_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) -{ - return -EINVAL; -} -static long thread_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} + +struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .nsleep_restart = posix_cpu_nsleep_restart, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, +}; static __init int init_posix_cpu_timers(void) { struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, }; struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = thread_cpu_timer_create, - .nsleep = thread_cpu_nsleep, - .nsleep_restart = thread_cpu_nsleep_restart, + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, }; struct timespec ts; - register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); cputime_to_timespec(cputime_one_jiffy, &ts); onecputick = ts.tv_nsec; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 93bd2eb2bc5..4c0124919f9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -41,6 +41,7 @@ #include <linux/init.h> #include <linux/compiler.h> #include <linux/idr.h> +#include <linux/posix-clock.h> #include <linux/posix-timers.h> #include <linux/syscalls.h> #include <linux/wait.h> @@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock); #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif +/* + * parisc wants ENOTSUP instead of EOPNOTSUPP + */ +#ifndef ENOTSUP +# define ENANOSLEEP_NOTSUP EOPNOTSUPP +#else +# define ENANOSLEEP_NOTSUP ENOTSUP +#endif /* * The timer ID is turned into a timer address by idr_find(). @@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock); /* * CLOCKs: The POSIX standard calls for a couple of clocks and allows us * to implement others. This structure defines the various - * clocks and allows the possibility of adding others. We - * provide an interface to add clocks to the table and expect - * the "arch" code to add at least one clock that is high - * resolution. Here we define the standard CLOCK_REALTIME as a - * 1/HZ resolution clock. + * clocks. * * RESOLUTION: Clock resolution is used to round up timer and interval * times, NOT to report clock times, which are reported with as @@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock); * necessary code is written. The standard says we should say * something about this issue in the documentation... * - * FUNCTIONS: The CLOCKs structure defines possible functions to handle - * various clock functions. For clocks that use the standard - * system timer code these entries should be NULL. This will - * allow dispatch without the overhead of indirect function - * calls. CLOCKS that depend on other sources (e.g. WWV or GPS) - * must supply functions here, even if the function just returns - * ENOSYS. The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for the - * timer. 2.) The list, it_lock, it_clock, it_id and it_pid - * fields are not modified by timer code. + * FUNCTIONS: The CLOCKs structure defines possible functions to + * handle various clock functions. * - * At this time all functions EXCEPT clock_nanosleep can be - * redirected by the CLOCKS structure. Clock_nanosleep is in - * there, but the code ignores it. + * The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for + * the timer. 2.) The list, it_lock, it_clock, it_id and + * it_pid fields are not modified by timer code. * * Permissions: It is assumed that the clock_settime() function defined * for each clock will take care of permission checks. Some @@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; */ static int common_nsleep(const clockid_t, int flags, struct timespec *t, struct timespec __user *rmtp); +static int common_timer_create(struct k_itimer *new_timer); static void common_timer_get(struct k_itimer *, struct itimerspec *); static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); @@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) spin_unlock_irqrestore(&timr->it_lock, flags); } -/* - * Call the k_clock hook function if non-null, or the default function. - */ -#define CLOCK_DISPATCH(clock, call, arglist) \ - ((clock) < 0 ? posix_cpu_##call arglist : \ - (posix_clocks[clock].call != NULL \ - ? (*posix_clocks[clock].call) arglist : common_##call arglist)) - -/* - * Default clock hook functions when the struct k_clock passed - * to register_posix_clock leaves a function pointer null. - * - * The function common_CALL is the default implementation for - * the function pointer CALL in struct k_clock. - */ - -static inline int common_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = posix_clocks[which_clock].res; - return 0; -} - -/* - * Get real time for posix timers - */ -static int common_clock_get(clockid_t which_clock, struct timespec *tp) +/* Get clock_realtime */ +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) { ktime_get_real_ts(tp); return 0; } -static inline int common_clock_set(const clockid_t which_clock, - struct timespec *tp) +/* Set clock_realtime */ +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec *tp) { return do_sys_settimeofday(tp, NULL); } -static int common_timer_create(struct k_itimer *new_timer) -{ - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); - return 0; -} - -static int no_timer_create(struct k_itimer *new_timer) -{ - return -EOPNOTSUPP; -} - -static int no_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) -{ - return -EOPNOTSUPP; -} - -/* - * Return nonzero if we know a priori this clockid_t value is bogus. - */ -static inline int invalid_clockid(const clockid_t which_clock) +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct timex *t) { - if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ - return 0; - if ((unsigned) which_clock >= MAX_CLOCKS) - return 1; - if (posix_clocks[which_clock].clock_getres != NULL) - return 0; - if (posix_clocks[which_clock].res != 0) - return 0; - return 1; + return do_adjtimex(t); } /* @@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) } /* - * Get monotonic time for posix timers + * Get monotonic-raw time for posix timers */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) { @@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp *tp = ktime_to_timespec(KTIME_LOW_RES); return 0; } + +static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +{ + get_monotonic_boottime(tp); + return 0; +} + + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = hrtimer_get_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_ktime_get_ts, - .clock_set = do_posix_clock_nosettime, + .clock_getres = hrtimer_get_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_get_monotonic_raw, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, }; struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, }; struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + }; + struct k_clock clock_boottime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; - register_posix_clock(CLOCK_REALTIME, &clock_realtime); - register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); - register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); + posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); + posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event) return task_pid(rtn); } -void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) +void posix_timers_register_clock(const clockid_t clock_id, + struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { - printk("POSIX clock register failed for clock_id %d\n", + printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + clock_id); + return; + } + + if (!new_clock->clock_get) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + clock_id); + return; + } + if (!new_clock->clock_getres) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", clock_id); return; } posix_clocks[clock_id] = *new_clock; } -EXPORT_SYMBOL_GPL(register_posix_clock); +EXPORT_SYMBOL_GPL(posix_timers_register_clock); static struct k_itimer * alloc_posix_timer(void) { @@ -523,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) kmem_cache_free(posix_timers_cache, tmr); } +static struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + + if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + return NULL; + return &posix_clocks[id]; +} + +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; +} + /* Create a POSIX.1b interval timer. */ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; sigevent_t event; int it_id_set = IT_ID_NOT_SET; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) @@ -597,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } - error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); + error = kc->timer_create(new_timer); if (error) goto out; @@ -607,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, spin_unlock_irq(¤t->sighand->siglock); return 0; - /* + /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify @@ -709,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { - struct k_itimer *timr; struct itimerspec cur_setting; + struct k_itimer *timr; + struct k_clock *kc; unsigned long flags; + int ret = 0; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; - CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, &cur_setting); unlock_timer(timr, flags); - if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) return -EFAULT; - return 0; + return ret; } /* @@ -813,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, int error = 0; unsigned long flag; struct itimerspec *rtn = old_setting ? &old_spec : NULL; + struct k_clock *kc; if (!new_setting) return -EINVAL; @@ -828,8 +838,11 @@ retry: if (!timr) return -EINVAL; - error = CLOCK_DISPATCH(timr->it_clock, timer_set, - (timr, flags, &new_spec, rtn)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, flags, &new_spec, rtn); unlock_timer(timr, flag); if (error == TIMER_RETRY) { @@ -844,7 +857,7 @@ retry: return error; } -static inline int common_timer_del(struct k_itimer *timer) +static int common_timer_del(struct k_itimer *timer) { timer->it.real.interval.tv64 = 0; @@ -855,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); + struct k_clock *kc = clockid_to_kclock(timer->it_clock); + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); } /* Delete a POSIX.1b interval timer. */ @@ -927,69 +944,76 @@ void exit_itimers(struct signal_struct *sig) } } -/* Not available / possible... functions */ -int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); - -int do_posix_clock_nonanosleep(const clockid_t clock, int flags, - struct timespec *t, struct timespec __user *r) -{ -#ifndef ENOTSUP - return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ -#else /* parisc does define it separately. */ - return -ENOTSUP; -#endif -} -EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); - SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec new_tp; - if (invalid_clockid(which_clock)) + if (!kc || !kc->clock_set) return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); + return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec kernel_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_get, - (which_clock, &kernel_tp)); + + error = kc->clock_get(which_clock, &kernel_tp); + if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; return error; +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct timex __user *, utx) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + if (copy_from_user(&ktx, utx, sizeof(ktx))) + return -EFAULT; + + err = kc->clock_adj(which_clock, &ktx); + + if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + return err; } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec rtn_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_getres, - (which_clock, &rtn_tp)); + error = kc->clock_getres(which_clock, &rtn_tp); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) error = -EFAULT; - } return error; } @@ -1009,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec t; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; @@ -1020,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (!timespec_valid(&t)) return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep, - (which_clock, flags, &t, rmtp)); -} - -/* - * nanosleep_restart for monotonic and realtime clocks - */ -static int common_nsleep_restart(struct restart_block *restart_block) -{ - return hrtimer_nanosleep_restart(restart_block); + return kc->nsleep(which_clock, flags, &t, rmtp); } /* * This will restart clock_nanosleep. This is required only by * compat_clock_nanosleep_restart for now. */ -long -clock_nanosleep_restart(struct restart_block *restart_block) +long clock_nanosleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; + clockid_t which_clock = restart_block->nanosleep.index; + struct k_clock *kc = clockid_to_kclock(which_clock); + + if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep_restart, - (restart_block)); + return kc->nsleep_restart(restart_block); } diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index ddabb54bb5c..3c7cbc2c33b 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) put_pid(waiter->deadlock_task_pid); TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 66cb89bc5ef..5c9ccd38096 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -9,7 +9,6 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/sysdev.h> #include <linux/timer.h> @@ -27,7 +26,6 @@ struct test_thread_data { int opcode; int opdata; int mutexes[MAX_RT_TEST_MUTEXES]; - int bkl; int event; struct sys_device sysdev; }; @@ -46,9 +44,8 @@ enum test_opcodes { RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Lock BKL */ - RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ - RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ + /* 9, 10 - reserved for BKL commemoration */ + RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ RTTEST_RESET = 99, /* 99 Reset all pending operations */ }; @@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) td->mutexes[i] = 0; } } - - if (!lockwakeup && td->bkl == 4) { -#ifdef CONFIG_LOCK_KERNEL - unlock_kernel(); -#endif - td->bkl = 0; - } return 0; case RTTEST_RESETEVENT: @@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) td->mutexes[id] = 0; return 0; - case RTTEST_LOCKBKL: - if (td->bkl) - return 0; - td->bkl = 1; -#ifdef CONFIG_LOCK_KERNEL - lock_kernel(); -#endif - td->bkl = 4; - return 0; - - case RTTEST_UNLOCKBKL: - if (td->bkl != 4) - break; -#ifdef CONFIG_LOCK_KERNEL - unlock_kernel(); -#endif - td->bkl = 0; - return 0; - default: break; } @@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) td->event = atomic_add_return(1, &rttest_event); break; - case RTTEST_LOCKBKL: default: break; } @@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) td->event = atomic_add_return(1, &rttest_event); return; - case RTTEST_LOCKBKL: - return; default: return; } @@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute spin_lock(&rttest_lock); curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", + "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", td->opcode, td->event, tsk->state, (MAX_RT_PRIO - 1) - tsk->prio, (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on, td->bkl); + tsk->pi_blocked_on); for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) curr += sprintf(curr, "%d", td->mutexes[i]); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a9604815786..ab449117aaf 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -20,41 +20,34 @@ /* * lock->owner state tracking: * - * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 - * are used to keep track of the "owner is pending" and "lock has - * waiters" state. + * lock->owner holds the task_struct pointer of the owner. Bit 0 + * is used to keep track of the "lock has waiters" state. * - * owner bit1 bit0 - * NULL 0 0 lock is free (fast acquire possible) - * NULL 0 1 invalid state - * NULL 1 0 Transitional State* - * NULL 1 1 invalid state - * taskpointer 0 0 lock is held (fast release possible) - * taskpointer 0 1 task is pending owner - * taskpointer 1 0 lock is held and has waiters - * taskpointer 1 1 task is pending owner and lock has more waiters - * - * Pending ownership is assigned to the top (highest priority) - * waiter of the lock, when the lock is released. The thread is woken - * up and can now take the lock. Until the lock is taken (bit 0 - * cleared) a competing higher priority thread can steal the lock - * which puts the woken up thread back on the waiters list. + * owner bit0 + * NULL 0 lock is free (fast acquire possible) + * NULL 1 lock is free and has waiters and the top waiter + * is going to take the lock* + * taskpointer 0 lock is held (fast release possible) + * taskpointer 1 lock is held and has waiters** * * The fast atomic compare exchange based acquire and release is only - * possible when bit 0 and 1 of lock->owner are 0. + * possible when bit 0 of lock->owner is 0. + * + * (*) It also can be a transitional state when grabbing the lock + * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, + * we need to set the bit0 before looking at the lock, and the owner may be + * NULL in this small time, hence this can be a transitional state. * - * (*) There's a small time where the owner can be NULL and the - * "lock has waiters" bit is set. This can happen when grabbing the lock. - * To prevent a cmpxchg of the owner releasing the lock, we need to set this - * bit before looking at the lock, hence the reason this is a transitional - * state. + * (**) There is a small time when bit 0 is set but there are no + * waiters. This can happen when grabbing the lock in the slow path. + * To prevent a cmpxchg of the owner releasing the lock, we need to + * set this bit before looking at the lock. */ static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, - unsigned long mask) +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | mask; + unsigned long val = (unsigned long)owner; if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; @@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. */ - if (!waiter || !waiter->task) + if (!waiter) goto out_unlock_pi; /* * Check the orig_waiter state. After we dropped the locks, - * the previous owner of the lock might have released the lock - * and made us the pending owner: + * the previous owner of the lock might have released the lock. */ - if (orig_waiter && !orig_waiter->task) + if (orig_waiter && !rt_mutex_owner(orig_lock)) goto out_unlock_pi; /* @@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!rt_mutex_owner(lock)) { + /* + * If the requeue above changed the top waiter, then we need + * to wake the new top waiter up to try to get the lock. + */ + + if (top_waiter != rt_mutex_top_waiter(lock)) + wake_up_process(rt_mutex_top_waiter(lock)->task); + raw_spin_unlock(&lock->wait_lock); + goto out_put_task; + } put_task_struct(task); /* Grab the next task */ @@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* - * Optimization: check if we can steal the lock from the - * assigned pending owner [which might not have taken the - * lock yet]: - */ -static inline int try_to_steal_lock(struct rt_mutex *lock, - struct task_struct *task) -{ - struct task_struct *pendowner = rt_mutex_owner(lock); - struct rt_mutex_waiter *next; - unsigned long flags; - - if (!rt_mutex_owner_pending(lock)) - return 0; - - if (pendowner == task) - return 1; - - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - if (task->prio >= pendowner->prio) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 0; - } - - /* - * Check if a waiter is enqueued on the pending owners - * pi_waiters list. Remove it and readjust pending owners - * priority. - */ - if (likely(!rt_mutex_has_waiters(lock))) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 1; - } - - /* No chain handling, pending owner is not blocked on anything: */ - next = rt_mutex_top_waiter(lock); - plist_del(&next->pi_list_entry, &pendowner->pi_waiters); - __rt_mutex_adjust_prio(pendowner); - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - /* - * We are going to steal the lock and a waiter was - * enqueued on the pending owners pi_waiters queue. So - * we have to enqueue this waiter into - * task->pi_waiters list. This covers the case, - * where task is boosted because it holds another - * lock and gets unboosted because the booster is - * interrupted, so we would delay a waiter with higher - * priority as task->normal_prio. - * - * Note: in the rare case of a SCHED_OTHER task changing - * its priority and thus stealing the lock, next->task - * might be task: - */ - if (likely(next->task != task)) { - raw_spin_lock_irqsave(&task->pi_lock, flags); - plist_add(&next->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - } - return 1; -} - -/* * Try to take an rt-mutex * - * This fails - * - when the lock has a real owner - * - when a different pending owner exists and has higher priority than current - * * Must be called with lock->wait_lock held. + * + * @lock: the lock to be acquired. + * @task: the task which wants to acquire the lock + * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) */ -static int try_to_take_rt_mutex(struct rt_mutex *lock) +static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) { /* * We have to be careful here if the atomic speedups are @@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) + if (rt_mutex_owner(lock)) return 0; + /* + * It will get the lock because of one of these conditions: + * 1) there is no waiter + * 2) higher priority than waiters + * 3) it is top waiter + */ + if (rt_mutex_has_waiters(lock)) { + if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (!waiter || waiter != rt_mutex_top_waiter(lock)) + return 0; + } + } + + if (waiter || rt_mutex_has_waiters(lock)) { + unsigned long flags; + struct rt_mutex_waiter *top; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* remove the queued waiter. */ + if (waiter) { + plist_del(&waiter->list_entry, &lock->wait_list); + task->pi_blocked_on = NULL; + } + + /* + * We have to enqueue the top waiter(if it exists) into + * task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) { + top = rt_mutex_top_waiter(lock); + top->pi_list_entry.prio = top->list_entry.prio; + plist_add(&top->pi_list_entry, &task->pi_waiters); + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + } + /* We got the lock. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, current, 0); + rt_mutex_set_owner(lock, task); - rt_mutex_deadlock_account_lock(lock, current); + rt_mutex_deadlock_account_lock(lock, task); return 1; } @@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!owner) + return 0; + if (waiter == rt_mutex_top_waiter(lock)) { raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); @@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and from - * the lock waiter list. Set it as pending owner. Then wake it up. + * Remove the top waiter from the current tasks waiter list and wake it up. * * Called with lock->wait_lock held. */ static void wakeup_next_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - struct task_struct *pendowner; unsigned long flags; raw_spin_lock_irqsave(¤t->pi_lock, flags); waiter = rt_mutex_top_waiter(lock); - plist_del(&waiter->list_entry, &lock->wait_list); /* * Remove it from current->pi_waiters. We do not adjust a @@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * lock->wait_lock. */ plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); - pendowner = waiter->task; - waiter->task = NULL; - rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + rt_mutex_set_owner(lock, NULL); raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - /* - * Clear the pi_blocked_on variable and enqueue a possible - * waiter into the pi_waiters list of the pending owner. This - * prevents that in case the pending owner gets unboosted a - * waiter with higher priority than pending-owner->normal_prio - * is blocked on the unboosted (pending) owner. - */ - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - - WARN_ON(!pendowner->pi_blocked_on); - WARN_ON(pendowner->pi_blocked_on != waiter); - WARN_ON(pendowner->pi_blocked_on->lock != lock); - - pendowner->pi_blocked_on = NULL; - - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &pendowner->pi_waiters); - } - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - wake_up_process(pendowner); + wake_up_process(waiter->task); } /* - * Remove a waiter from a lock + * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held + * Must be called with lock->wait_lock held and + * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) @@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); - waiter->task = NULL; current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - if (first && owner != current) { + if (!owner) + return; + + if (first) { raw_spin_lock_irqsave(&owner->pi_lock, flags); @@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task) * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: passed to task_blocks_on_rt_mutex * * lock->wait_lock must be held by the caller. */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter, - int detect_deadlock) + struct rt_mutex_waiter *waiter) { int ret = 0; for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock)) + if (try_to_take_rt_mutex(lock, current, waiter)) break; /* @@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - /* - * waiter->task is NULL the first time we come here and - * when we have been woken up by the previous owner - * but the lock got stolen by a higher prio task. - */ - if (!waiter->task) { - ret = task_blocks_on_rt_mutex(lock, waiter, current, - detect_deadlock); - /* - * If we got woken up by the owner then start loop - * all over without going into schedule to try - * to get the lock now: - */ - if (unlikely(!waiter->task)) { - /* - * Reset the return value. We might - * have returned with -EDEADLK and the - * owner released the lock while we - * were walking the pi chain. - */ - ret = 0; - continue; - } - if (unlikely(ret)) - break; - } - raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); - if (waiter->task) - schedule_rt_mutex(lock); + schedule_rt_mutex(lock); raw_spin_lock(&lock->wait_lock); set_current_state(state); @@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, int ret = 0; debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; raw_spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { + if (try_to_take_rt_mutex(lock, current, NULL)) { raw_spin_unlock(&lock->wait_lock); return 0; } @@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, - detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + + if (likely(!ret)) + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter.task)) + if (unlikely(ret)) remove_waiter(lock, &waiter); /* @@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(timeout)) hrtimer_cancel(&timeout->timer); - /* - * Readjust priority, when we did not get the lock. We might - * have been the pending owner and boosted. Since we did not - * take the lock, the PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - debug_rt_mutex_free_waiter(&waiter); return ret; @@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) if (likely(rt_mutex_owner(lock) != current)) { - ret = try_to_take_rt_mutex(lock); + ret = try_to_take_rt_mutex(lock, current, NULL); /* * try_to_take_rt_mutex() sets the lock waiters * bit unconditionally. Clean this up. @@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, { __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner, 0); + rt_mutex_set_owner(lock, proxy_owner); rt_mutex_deadlock_account_lock(lock, proxy_owner); } @@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) { debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL, 0); + rt_mutex_set_owner(lock, NULL); rt_mutex_deadlock_account_unlock(proxy_owner); } @@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, raw_spin_lock(&lock->wait_lock); - mark_rt_mutex_waiters(lock); - - if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { - /* We got the lock for task. */ - debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); + if (try_to_take_rt_mutex(lock, task, NULL)) { raw_spin_unlock(&lock->wait_lock); - rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { + if (ret && !rt_mutex_owner(lock)) { /* * Reset the return value. We might have * returned with -EDEADLK and the owner @@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } + + if (unlikely(ret)) + remove_waiter(lock, waiter); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, - detect_deadlock); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter->task)) + if (unlikely(ret)) remove_waiter(lock, waiter); /* @@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - /* - * Readjust priority, when we did not get the lock. We might have been - * the pending owner and boosted. Since we did not take the lock, the - * PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - return ret; } diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 97a2f81866a..53a66c85261 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p) /* * lock->owner state tracking: */ -#define RT_MUTEX_OWNER_PENDING 1UL -#define RT_MUTEX_HAS_WAITERS 2UL -#define RT_MUTEX_OWNER_MASKALL 3UL +#define RT_MUTEX_HAS_WAITERS 1UL +#define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { @@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); } -static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) -{ - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); -} - -static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) -{ - return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; -} - /* * PI-futex support (proxy locking functions, etc.): */ diff --git a/kernel/sched.c b/kernel/sched.c index 79e611cd83d..c8e40b7005c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -324,7 +324,7 @@ struct cfs_rq { * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr, *next, *last; + struct sched_entity *curr, *next, *last, *skip; unsigned int nr_spread_over; @@ -1683,6 +1683,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + #endif static void calc_load_account_idle(struct rq *this_rq); @@ -1877,7 +1910,7 @@ void account_system_vtime(struct task_struct *curr) */ if (hardirq_count()) __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); @@ -1917,8 +1950,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) sched_rt_avg_update(rq, irq_delta); } +static int irqtime_account_hi_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + #else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#define sched_clock_irqtime (0) + static void update_rq_clock_task(struct rq *rq, s64 delta) { rq->clock_task += delta; @@ -2022,14 +2087,14 @@ inline int task_curr(const struct task_struct *p) static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, - int oldprio, int running) + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) - prev_class->switched_from(rq, p, running); - p->sched_class->switched_to(rq, p, running); - } else - p->sched_class->prio_changed(rq, p, oldprio, running); + prev_class->switched_from(rq, p); + p->sched_class->switched_to(rq, p); + } else if (oldprio != p->prio) + p->sched_class->prio_changed(rq, p, oldprio); } static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) @@ -2221,7 +2286,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. */ if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); continue; } @@ -2542,6 +2610,7 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; + p->se.vruntime = 0; #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -3547,6 +3616,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, } /* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, cputime64_t *target_cputime64) +{ + cputime64_t tmp = cputime_to_cputime64(cputime); + + /* Add system time to process. */ + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + *target_cputime64 = cputime64_add(*target_cputime64, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3557,36 +3652,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; + cputime64_t *target_cputime64; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { account_guest_time(p, cputime, cputime_scaled); return; } - /* Add system time to process. */ - p->stime = cputime_add(p->stime, cputime); - p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); + target_cputime64 = &cpustat->irq; else if (in_serving_softirq()) - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + target_cputime64 = &cpustat->softirq; else - cpustat->system = cputime64_add(cpustat->system, tmp); + target_cputime64 = &cpustat->system; - cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); - - /* Account for system time used */ - acct_update_integrals(p); + __account_system_time(p, cputime, cputime_scaled, target_cputime64); } /* * Account for involuntary wait time. - * @steal: the cpu time spent in involuntary wait + * @cputime: the cpu time spent in involuntary wait */ void account_steal_time(cputime_t cputime) { @@ -3614,6 +3699,73 @@ void account_idle_time(cputime_t cputime) #ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + if (irqtime_account_hi_update()) { + cpustat->irq = cputime64_add(cpustat->irq, tmp); + } else if (irqtime_account_si_update()) { + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. + */ + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->softirq); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->system); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + /* * Account a single tick of cpu time. * @p: the process that the cpu time gets accounted to @@ -3624,6 +3776,11 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + if (user_tick) account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) @@ -3649,6 +3806,12 @@ void account_steal_ticks(unsigned long ticks) */ void account_idle_ticks(unsigned long ticks) { + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + account_idle_time(jiffies_to_cputime(ticks)); } @@ -4189,6 +4352,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { __wake_up_common(q, mode, 1, 0, key); } +EXPORT_SYMBOL_GPL(__wake_up_locked_key); /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. @@ -4546,11 +4710,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } + check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, &flags); } @@ -4798,12 +4961,15 @@ recheck: param->sched_priority > rlim_rtprio) return -EPERM; } + /* - * Like positive nice levels, dont allow tasks to - * move out of SCHED_IDLE either: + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) - return -EPERM; + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (!can_nice(p, TASK_NICE(p))) + return -EPERM; + } /* can't change other user's priorities */ if (!check_same_owner(p)) @@ -4878,11 +5044,10 @@ recheck: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) activate_task(rq, p, 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } + check_class_changed(rq, p, prev_class, oldprio); __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -5299,6 +5464,65 @@ void __sched yield(void) } EXPORT_SYMBOL(yield); +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Returns true if we indeed boosted the target task. + */ +bool __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + unsigned long flags; + bool yielded = 0; + + local_irq_save(flags); + rq = this_rq(); + +again: + p_rq = task_rq(p); + double_rq_lock(rq, p_rq); + while (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; + } + + if (!curr->sched_class->yield_to_task) + goto out; + + if (curr->sched_class != p->sched_class) + goto out; + + if (task_running(p_rq, p) || p->state) + goto out; + + yielded = curr->sched_class->yield_to_task(rq, p, preempt); + if (yielded) { + schedstat_inc(rq, yld_count); + /* + * Make p's CPU reschedule; pick_next_entity takes care of + * fairness. + */ + if (preempt && rq != p_rq) + resched_task(p_rq->curr); + } + +out: + double_rq_unlock(rq, p_rq); + local_irq_restore(flags); + + if (yielded) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. @@ -7772,6 +7996,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; + /* allow initial update_cfs_load() to truncate */ +#ifdef CONFIG_SMP + cfs_rq->load_stamp = 1; +#endif #endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } @@ -8085,6 +8313,8 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ static void normalize_task(struct rq *rq, struct task_struct *p) { + const struct sched_class *prev_class = p->sched_class; + int old_prio = p->prio; int on_rq; on_rq = p->se.on_rq; @@ -8095,6 +8325,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) activate_task(rq, p, 0); resched_task(rq->curr); } + + check_class_changed(rq, p, prev_class, old_prio); } void normalize_rt_tasks(void) @@ -8486,7 +8718,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se), 0); + update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); } diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 9fb65628315..5946ac51560 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr; static void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; - root_task_group.autogroup = &autogroup_default; kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; @@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) static inline bool task_group_is_autogroup(struct task_group *tg) { - return tg != &root_task_group && tg->autogroup; + return !!tg->autogroup; } static inline struct task_group * @@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); + if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + t = p; do { sched_move_task(t); } while_each_thread(p, t); +out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } @@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) { struct autogroup *ag = autogroup_task_get(p); + if (!task_group_is_autogroup(ag->tg)) + goto out; + down_read(&ag->lock); seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); up_read(&ag->lock); +out: autogroup_kref_put(ag); } #endif /* CONFIG_PROC_FS */ @@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (!enabled || !tg->autogroup) + if (!task_group_is_autogroup(tg)) return 0; return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 7b859ffe5da..05577055cfc 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -1,6 +1,11 @@ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index eb6cb8edd07..7bacd83a415 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) raw_spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c26e2df450..3f7ec9e27ee 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8; unsigned int sysctl_sched_child_runs_first __read_mostly; /* - * sys_sched_yield() compat mode - * - * This option switches the agressive yield implementation of the - * old scheduler back on. - */ -unsigned int __read_mostly sysctl_sched_compat_yield; - -/* * SCHED_OTHER wake-up granularity. * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * @@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) return rb_entry(left, struct sched_entity, run_node); } +static struct sched_entity *__pick_next_entity(struct sched_entity *se) +{ + struct rb_node *next = rb_next(&se->run_node); + + if (!next) + return NULL; + + return rb_entry(next, struct sched_entity, run_node); +} + +#ifdef CONFIG_SCHED_DEBUG static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ -#ifdef CONFIG_SCHED_DEBUG int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -540,7 +542,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); +static void update_cfs_shares(struct cfs_rq *cfs_rq); /* * Update the current task's runtime statistics. Skip current tasks that @@ -733,6 +735,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) now - cfs_rq->load_last > 4 * period) { cfs_rq->load_period = 0; cfs_rq->load_avg = 0; + delta = period - 1; } cfs_rq->load_stamp = now; @@ -763,16 +766,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { long load_weight, load, shares; - load = cfs_rq->load.weight + weight_delta; + load = cfs_rq->load.weight; load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; load_weight += load; + load_weight -= cfs_rq->load_contribution; shares = (tg->shares * load); if (load_weight) @@ -790,7 +792,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq) { if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } } # else /* CONFIG_SMP */ @@ -798,8 +800,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } @@ -824,7 +825,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; struct sched_entity *se; @@ -838,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) if (likely(se->load.weight == tg->shares)) return; #endif - shares = calc_cfs_shares(cfs_rq, tg, weight_delta); + shares = calc_cfs_shares(cfs_rq, tg); reweight_entity(cfs_rq_of(se), se, shares); } @@ -847,7 +848,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } @@ -978,8 +979,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -996,19 +997,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) list_add_leaf_cfs_rq(cfs_rq); } -static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __clear_buddies_last(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->last == se) + cfs_rq->last = NULL; + else + break; + } +} + +static void __clear_buddies_next(struct sched_entity *se) { - if (!se || cfs_rq->last == se) - cfs_rq->last = NULL; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->next == se) + cfs_rq->next = NULL; + else + break; + } +} - if (!se || cfs_rq->next == se) - cfs_rq->next = NULL; +static void __clear_buddies_skip(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->skip == se) + cfs_rq->skip = NULL; + else + break; + } } static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - for_each_sched_entity(se) - __clear_buddies(cfs_rq_of(se), se); + if (cfs_rq->last == se) + __clear_buddies_last(se); + + if (cfs_rq->next == se) + __clear_buddies_next(se); + + if (cfs_rq->skip == se) + __clear_buddies_skip(se); } static void @@ -1041,7 +1072,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1084,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (cfs_rq->nr_running > 1) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; if (delta < 0) @@ -1128,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +/* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups + * 2) pick the "next" process, since someone really wants that to run + * 3) pick the "last" process, for cache locality + * 4) do not run the "skip" process, if something else is available + */ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; + /* + * Avoid running the skip buddy, if running something else can + * be done without getting too unfair. + */ + if (cfs_rq->skip == se) { + struct sched_entity *second = __pick_next_entity(se); + if (second && wakeup_preempt_entity(second, left) < 1) + se = second; + } /* * Prefer last buddy, try to return the CPU to a preempted task. @@ -1142,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) se = cfs_rq->last; + /* + * Someone really wants this to run. If it's not unfair, run it. + */ + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; + clear_buddies(cfs_rq, se); return se; @@ -1282,7 +1333,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); @@ -1312,58 +1363,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); } -/* - * sched_yield() support is very simple - we dequeue and enqueue. - * - * If compat_yield is turned on then we requeue to the end of the tree. - */ -static void yield_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *rightmost, *se = &curr->se; - - /* - * Are we the only task in the tree? - */ - if (unlikely(cfs_rq->nr_running == 1)) - return; - - clear_buddies(cfs_rq, se); - - if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - return; - } - /* - * Find the rightmost entry in the rbtree: - */ - rightmost = __pick_last_entity(cfs_rq); - /* - * Already in the rightmost position? - */ - if (unlikely(!rightmost || entity_before(rightmost, se))) - return; - - /* - * Minimally necessary key value to be last in the tree: - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - se->vruntime = rightmost->vruntime + 1; -} - #ifdef CONFIG_SMP static void task_waking_fair(struct rq *rq, struct task_struct *p) @@ -1834,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se) } } +static void set_skip_buddy(struct sched_entity *se) +{ + if (likely(task_of(se)->policy != SCHED_IDLE)) { + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; + } +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1857,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (test_tsk_need_resched(curr)) return; + /* Idle tasks are by definition preempted by non-idle tasks. */ + if (unlikely(curr->policy == SCHED_IDLE) && + likely(p->policy != SCHED_IDLE)) + goto preempt; + /* - * Batch and idle tasks do not preempt (their preemption is driven by - * the tick): + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): */ if (unlikely(p->policy != SCHED_NORMAL)) return; - /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) - goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; @@ -1932,6 +1947,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) } } +/* + * sched_yield() is very simple + * + * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ +static void yield_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; + + /* + * Are we the only task in the tree? + */ + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + } + + set_skip_buddy(se); +} + +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +{ + struct sched_entity *se = &p->se; + + if (!se->on_rq) + return false; + + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); + + yield_task_fair(rq); + + return true; +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: @@ -2123,7 +2183,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) * We need to update shares after updating tg->load_weight in * order to adjust the weight of groups with long running tasks. */ - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -2610,7 +2670,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @sd_idle: Idle status of the sched_domain containing group. * @local_group: Does group contain this_cpu. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. @@ -2618,7 +2677,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) */ static inline void update_sg_lb_stats(struct sched_domain *sd, struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, int *sd_idle, + enum cpu_idle_type idle, int load_idx, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { @@ -2638,9 +2697,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - /* Bias balancing toward cpus of our domain */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { @@ -2685,7 +2741,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. + * than the average weight of a task. * * APZ: with cgroup the avg task weight can vary wildly and * might not be a suitable number - should we keep a @@ -2695,7 +2751,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); @@ -2755,15 +2811,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, int *sd_idle, - const struct cpumask *cpus, int *balance, - struct sd_lb_stats *sds) + enum cpu_idle_type idle, const struct cpumask *cpus, + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; struct sched_group *sg = sd->groups; @@ -2781,7 +2835,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) @@ -3033,7 +3087,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * @imbalance: Variable which stores amount of weighted load which should * be moved to restore balance/put a group to idle. * @idle: The idle status of this_cpu. - * @sd_idle: The idleness of sd * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. @@ -3046,7 +3099,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) + const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; @@ -3056,22 +3109,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, - balance, &sds); - - /* Cases where imbalance does not exist from POV of this_cpu */ - /* 1) this_cpu is not the appropriate cpu to perform load balancing - * at this level. - * 2) There is no busy sibling group to pull from. - * 3) This group is the busiest group. - * 4) This group is more busy than the avg busieness at this - * sched_domain. - * 5) The imbalance is within the specified limit. - * - * Note: when doing newidle balance, if the local group has excess - * capacity (i.e. nr_running < group_capacity) and the busiest group - * does not have any capacity, we force a load balance to pull tasks - * to the local group. In this case, we skip past checks 3, 4 and 5. + update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); + + /* + * this_cpu is not the appropriate cpu to perform load balancing at + * this level. */ if (!(*balance)) goto ret; @@ -3080,41 +3122,55 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, check_asym_packing(sd, &sds, this_cpu, imbalance)) return sds.busiest; + /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + /* + * If the busiest group is imbalanced the below checks don't + * work because they assumes all things are equal, which typically + * isn't true due to cpus_allowed constraints and the like. + */ + if (sds.group_imb) + goto force_balance; + + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) goto force_balance; + /* + * If the local group is more busy than the selected busiest group + * don't try and pull any tasks. + */ if (sds.this_load >= sds.max_load) goto out_balanced; + /* + * Don't pull any tasks if this group is already above the domain + * average load. + */ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (sds.this_load >= sds.avg_load) goto out_balanced; - /* - * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. - * And to check for busy balance use !idle_cpu instead of - * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE - * even when they are idle. - */ - if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; - } else { + if (idle == CPU_IDLE) { /* * This cpu is idle. If the busiest group load doesn't * have more tasks than the number of available cpu's and * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && sds.busiest_nr_running <= sds.busiest_group_weight) goto out_balanced; + } else { + /* + * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use + * imbalance_pct to be conservative. + */ + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; } force_balance: @@ -3193,7 +3249,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, +static int need_active_balance(struct sched_domain *sd, int idle, int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { @@ -3225,10 +3281,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, * move_tasks() will succeed. ld_moved will be true and this * active balance code will not be triggered. */ - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return 0; - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) return 0; } @@ -3246,7 +3298,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -3255,20 +3307,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. - */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + group = find_busiest_group(sd, this_cpu, &imbalance, idle, cpus, balance); if (*balance == 0) @@ -3330,8 +3372,7 @@ redo: if (idle != CPU_NEWLY_IDLE) sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), - this_cpu)) { + if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3386,10 +3427,6 @@ redo: sd->balance_interval *= 2; } - if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - goto out; out_balanced: @@ -3403,11 +3440,7 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; + ld_moved = 0; out: return ld_moved; } @@ -3831,8 +3864,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no - * longer idle, or one of our SMT siblings is - * not idle. + * longer idle. */ idle = CPU_NOT_IDLE; } @@ -4079,33 +4111,62 @@ static void task_fork_fair(struct task_struct *p) * Priority of the task has changed. Check to see if we preempt * the current task. */ -static void prio_changed_fair(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { + if (!p->se.on_rq) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (rq->curr == p) { if (p->prio > oldprio) resched_task(rq->curr); } else check_preempt_curr(rq, p, 0); } +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Ensure the task's vruntime is normalized, so that when its + * switched back to the fair class the enqueue_entity(.flags=0) will + * do the right thing. + * + * If it was on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it was !on_rq, then only when + * the task is sleeping will it still have non-normalized vruntime. + */ + if (!se->on_rq && p->state != TASK_RUNNING) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } +} + /* * We switched to the sched_fair class. */ -static void switched_to_fair(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_fair(struct rq *rq, struct task_struct *p) { + if (!p->se.on_rq) + return; + /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (running) + if (rq->curr == p) resched_task(rq->curr); else check_preempt_curr(rq, p, 0); @@ -4171,6 +4232,7 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, .check_preempt_curr = check_preempt_wakeup, @@ -4191,6 +4253,7 @@ static const struct sched_class fair_sched_class = { .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, .switched_to = switched_to_fair, .get_rr_interval = get_rr_interval_fair, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9fa0f402c87..c82f26c1b7c 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq) { } -static void switched_to_idle(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_idle(struct rq *rq, struct task_struct *p) { - /* Can this actually happen?? */ - if (running) - resched_task(rq->curr); - else - check_preempt_curr(rq, p, 0); + BUG(); } -static void prio_changed_idle(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) { - /* This can happen for hot plug CPUS */ - - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else - check_preempt_curr(rq, p, 0); + BUG(); } static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ad6267714c8..db308cb08b7 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -210,11 +210,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - int this_cpu = smp_processor_id(); struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se; - rt_se = rt_rq->tg->rt_se[this_cpu]; + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + + rt_se = rt_rq->tg->rt_se[cpu]; if (rt_rq->rt_nr_running) { if (rt_se && !on_rt_rq(rt_se)) @@ -226,10 +227,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { - int this_cpu = smp_processor_id(); struct sched_rt_entity *rt_se; + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); - rt_se = rt_rq->tg->rt_se[this_cpu]; + rt_se = rt_rq->tg->rt_se[cpu]; if (rt_se && on_rt_rq(rt_se)) dequeue_rt_entity(rt_se); @@ -565,8 +566,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); - } else if (rt_rq->rt_nr_running) + } else if (rt_rq->rt_nr_running) { idle = 0; + if (!rt_rq_throttled(rt_rq)) + enqueue = 1; + } if (enqueue) sched_rt_rq_enqueue(rt_rq); @@ -1595,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq) * When switch from the rt queue, we bring ourselves to a position * that we might want to pull RT tasks from other runqueues. */ -static void switched_from_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_from_rt(struct rq *rq, struct task_struct *p) { /* * If there are other RT tasks then we will reschedule @@ -1605,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, * we may need to handle the pulling of RT tasks * now. */ - if (!rq->rt.rt_nr_running) + if (p->se.on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1624,8 +1627,7 @@ static inline void init_sched_rt_class(void) * with RT tasks. In this case we try to push them off to * other runqueues. */ -static void switched_to_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_rt(struct rq *rq, struct task_struct *p) { int check_resched = 1; @@ -1636,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (!running) { + if (p->se.on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1652,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * Priority of the task has changed. This may cause * us to initiate a push or pull. */ -static void prio_changed_rt(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (running) { + if (!p->se.on_rq) + return; + + if (rq->curr == p) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 2bf6b47058c..84ec9bcf82d 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } -static void prio_changed_stop(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) { BUG(); /* how!?, what priority? */ } diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec38..56e5dec837f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", @@ -311,9 +311,21 @@ void irq_enter(void) } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -# define invoke_softirq() __do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + __do_softirq(); + else + wakeup_softirqd(); +} #else -# define invoke_softirq() do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + do_softirq(); + else + wakeup_softirqd(); +} #endif /* @@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); - current->flags |= PF_KSOFTIRQD; while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { @@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu) don't process */ if (cpu_is_offline((long)__bind_cpu)) goto wait_to_die; - do_softirq(); + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); preempt_enable_no_resched(); cond_resched(); preempt_disable(); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c782fe9924c..25cc41cd8f3 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open); /* fanotify! */ cond_syscall(sys_fanotify_init); cond_syscall(sys_fanotify_mark); + +/* open by handle */ +cond_syscall(sys_name_to_handle_at); +cond_syscall(sys_open_by_handle_at); +cond_syscall(compat_sys_open_by_handle_at); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index daef911cbad..51054fea5d9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -194,9 +194,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { - .count = 1, + {{.count = 1, .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }; @@ -361,20 +361,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, - { - .procname = "sched_compat_yield", - .data = &sysctl_sched_compat_yield, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, @@ -1567,11 +1560,16 @@ void sysctl_head_get(struct ctl_table_header *head) spin_unlock(&sysctl_lock); } +static void free_head(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct ctl_table_header, rcu)); +} + void sysctl_head_put(struct ctl_table_header *head) { spin_lock(&sysctl_lock); if (!--head->count) - kfree(head); + call_rcu(&head->rcu, free_head); spin_unlock(&sysctl_lock); } @@ -1948,10 +1946,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) start_unregistering(header); if (!--header->parent->count) { WARN_ON(1); - kfree(header->parent); + call_rcu(&header->parent->rcu, free_head); } if (!--header->count) - kfree(header); + call_rcu(&header->rcu, free_head); spin_unlock(&sysctl_lock); } diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bedf7c9..3b8e028b960 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { const struct bin_table *table = NULL; - struct nameidata nd; struct vfsmount *mnt; struct file *file; ssize_t result; char *pathname; int flags; - int acc_mode; pathname = sysctl_getname(name, nlen, &table); result = PTR_ERR(pathname); @@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen, /* How should the sysctl be accessed? */ if (oldval && oldlen && newval && newlen) { flags = O_RDWR; - acc_mode = MAY_READ | MAY_WRITE; } else if (newval && newlen) { flags = O_WRONLY; - acc_mode = MAY_WRITE; } else if (oldval && oldlen) { flags = O_RDONLY; - acc_mode = MAY_READ; } else { result = 0; goto out_putname; } mnt = current->nsproxy->pid_ns->proc_mnt; - result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); - if (result) - goto out_putname; - - result = may_open(&nd.path, acc_mode, flags); - if (result) - goto out_putpath; - - file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; @@ -1370,10 +1357,6 @@ out_putname: putname(pathname); out: return result; - -out_putpath: - path_put(&nd.path); - goto out_putname; } diff --git a/kernel/time.c b/kernel/time.c index 32174359576..8e8dc6d705c 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -150,7 +150,7 @@ static inline void warp_clock(void) * various programs will get confused when the clock gets warped. */ -int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) +int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; @@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x) } /** - * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * @@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x) * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years */ -unsigned long nsecs_to_jiffies(u64 n) +u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ @@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n) #endif } -#if (BITS_PER_LONG < 64) -u64 get_jiffies_64(void) +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) { - unsigned long seq; - u64 ret; - - do { - seq = read_seqbegin(&xtime_lock); - ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); - return ret; + return (unsigned long)nsecs_to_jiffies64(n); } -EXPORT_SYMBOL(get_jiffies_64); -#endif - -EXPORT_SYMBOL(jiffies); /* * Add two timespec values and do a safety check for overflow. diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06..b0425991e9a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,5 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timeconv.o posix-clock.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d7395fdfb9f..0d74b9ba90c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -18,7 +18,6 @@ #include <linux/notifier.h> #include <linux/smp.h> #include <linux/sysdev.h> -#include <linux/tick.h> #include "tick-internal.h" diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a845690..b2fa506667c 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -22,8 +22,11 @@ ************************************************************************/ #include <linux/clocksource.h> #include <linux/jiffies.h> +#include <linux/module.h> #include <linux/init.h> +#include "tick-internal.h" + /* The Jiffies based clocksource is the lowest common * denominator clock source which should function on * all systems. It has the same coarse resolution as @@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&xtime_lock); + ret = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + static int __init init_jiffies_clocksource(void) { return clocksource_register(&clocksource_jiffies); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5c00242fa92..5f1bb8e2008 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -16,6 +16,8 @@ #include <linux/mm.h> #include <linux/module.h> +#include "tick-internal.h" + /* * NTP timekeeping variables: */ @@ -646,6 +648,17 @@ int do_adjtimex(struct timex *txc) hrtimer_cancel(&leap_timer); } + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + result = timekeeping_inject_offset(&delta); + if (result) + return result; + } + getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 00000000000..25028dd4fa1 --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,451 @@ +/* + * posix-clock.c - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/device.h> +#include <linux/file.h> +#include <linux/mutex.h> +#include <linux/posix-clock.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> + +static void delete_clock(struct kref *kref); + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. + */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + mutex_lock(&clk->mutex); + + if (!clk->zombie) + return clk; + + mutex_unlock(&clk->mutex); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + mutex_unlock(&clk->mutex); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + int result = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static int posix_clock_fasync(int fd, struct file *fp, int on) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.fasync) + err = clk->ops.fasync(clk, fd, fp, on); + + put_posix_clock(clk); + + return err; +} + +static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENODEV; + + if (!clk) + return -ENODEV; + + if (clk->ops.mmap) + err = clk->ops.mmap(clk, vma); + + put_posix_clock(clk); + + return err; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + mutex_lock(&clk->mutex); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + kref_get(&clk->kref); + fp->private_data = clk; + } +out: + mutex_unlock(&clk->mutex); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + kref_put(&clk->kref, delete_clock); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + .open = posix_clock_open, + .release = posix_clock_release, + .fasync = posix_clock_fasync, + .mmap = posix_clock_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, dev_t devid) +{ + int err; + + kref_init(&clk->kref); + mutex_init(&clk->mutex); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + clk->cdev.owner = clk->ops.owner; + err = cdev_add(&clk->cdev, devid, 1); + if (err) + goto no_cdev; + + return err; +no_cdev: + mutex_destroy(&clk->mutex); + return err; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +static void delete_clock(struct kref *kref) +{ + struct posix_clock *clk = container_of(kref, struct posix_clock, kref); + mutex_destroy(&clk->mutex); + if (clk->release) + clk->release(clk); +} + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_del(&clk->cdev); + + mutex_lock(&clk->mutex); + clk->zombie = true; + mutex_unlock(&clk->mutex); + + kref_put(&clk->kref, delete_clock); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { + struct file *fp; + struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ + struct file *fp = fget(CLOCKID_TO_FD(id)); + int err = -EINVAL; + + if (!fp) + return err; + + if (fp->f_op->open != posix_clock_open || !fp->private_data) + goto out; + + cd->fp = fp; + cd->clk = get_posix_clock(fp); + + err = cd->clk ? 0 : -ENODEV; +out: + if (err) + fput(fp); + return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ + put_posix_clock(cd->clk); + fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct timex *tx) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_adjtime) + err = cd.clk->ops.clock_adjtime(cd.clk, tx); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_settime) + err = cd.clk->ops.clock_settime(cd.clk, ts); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_create(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_create) + err = cd.clk->ops.timer_create(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_delete(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_delete) + err = cd.clk->ops.timer_delete(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + + if (get_clock_desc(id, &cd)) + return; + + if (cd.clk->ops.timer_gettime) + cd.clk->ops.timer_gettime(cd.clk, kit, ts); + + put_clock_desc(&cd); +} + +static int pc_timer_settime(struct k_itimer *kit, int flags, + struct itimerspec *ts, struct itimerspec *old) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_settime) + err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +struct k_clock clock_posix_dynamic = { + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, + .timer_create = pc_timer_create, + .timer_set = pc_timer_settime, + .timer_del = pc_timer_delete, + .timer_get = pc_timer_gettime, +}; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index a3b5aff6260..da800ffa810 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,7 +18,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include "tick-internal.h" diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index ed228ef6f6b..119528de823 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,7 +18,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include <asm/irq_regs.h> diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f65d3a723a6..1009b06d6f8 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,10 @@ /* * tick internal variable and functions used by low/high res code */ +#include <linux/hrtimer.h> +#include <linux/tick.h> + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -135,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) { return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } + +#endif + +extern void do_timer(unsigned long ticks); +extern seqlock_t xtime_lock; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5cbc101f908..2d04411a5f0 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,7 +18,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include "tick-internal.h" diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c55ea243347..d5097c44b40 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -19,7 +19,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include <linux/module.h> #include <asm/irq_regs.h> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d27c7562902..3bd7e3d5c63 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday); * * Sets the time of day to the new time and update NTP and notify hrtimers */ -int do_settimeofday(struct timespec *tv) +int do_settimeofday(const struct timespec *tv) { struct timespec ts_delta; unsigned long flags; @@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv) EXPORT_SYMBOL(do_settimeofday); + +/** + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @tv: pointer to the timespec variable containing the offset + * + * Adds or subtracts an offset value from the current time. + */ +int timekeeping_inject_offset(struct timespec *ts) +{ + unsigned long flags; + + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + timekeeping_forward_now(); + + xtime = timespec_add(xtime, *ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); + + timekeeper.ntp_error = 0; + ntp_clear(); + + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} +EXPORT_SYMBOL(timekeeping_inject_offset); + /** * change_clocksource - Swaps clocksources if a new one is available * @@ -779,7 +815,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) * * Called from the timer interrupt, must hold a write on xtime_lock. */ -void update_wall_time(void) +static void update_wall_time(void) { struct clocksource *clock; cycle_t offset; @@ -871,7 +907,7 @@ void update_wall_time(void) * getboottime - Return the real time of system boot. * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. + * Returns the wall-time of boot in a timespec. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which @@ -889,6 +925,55 @@ void getboottime(struct timespec *ts) } EXPORT_SYMBOL_GPL(getboottime); + +/** + * get_monotonic_boottime - Returns monotonic time since boot + * @ts: pointer to the timespec to be set + * + * Returns the monotonic time since boot in a timespec. + * + * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also + * includes the time spent in suspend. + */ +void get_monotonic_boottime(struct timespec *ts) +{ + struct timespec tomono, sleep; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + sleep = total_sleep_time; + nsecs = timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, + ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(get_monotonic_boottime); + +/** + * ktime_get_boottime - Returns monotonic time since boot in a ktime + * + * Returns the monotonic time since boot in a ktime + * + * This is similar to CLOCK_MONTONIC/ktime_get, but also + * includes the time spent in suspend. + */ +ktime_t ktime_get_boottime(void) +{ + struct timespec ts; + + get_monotonic_boottime(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL_GPL(ktime_get_boottime); + /** * monotonic_to_bootbased - Convert the monotonic time to boot based. * @ts: pointer to the timespec to be converted @@ -910,11 +995,6 @@ struct timespec __current_kernel_time(void) return xtime; } -struct timespec __get_wall_to_monotonic(void) -{ - return wall_to_monotonic; -} - struct timespec current_kernel_time(void) { struct timespec now; @@ -946,3 +1026,48 @@ struct timespec get_monotonic_coarse(void) now.tv_nsec + mono.tv_nsec); return now; } + +/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without sampling the sequence number in xtime_lock. + * jiffies is defined in the linker script... + */ +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + update_wall_time(); + calc_global_load(ticks); +} + +/** + * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, + * and sleep offsets. + * @xtim: pointer to timespec to be set with xtime + * @wtom: pointer to timespec to be set with wall_to_monotonic + * @sleep: pointer to timespec to be set with time in suspend + */ +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + struct timespec *wtom, struct timespec *sleep) +{ + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + *xtim = xtime; + *wtom = wall_to_monotonic; + *sleep = total_sleep_time; + } while (read_seqretry(&xtime_lock, seq)); +} + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ + write_seqlock(&xtime_lock); + do_timer(ticks); + write_sequnlock(&xtime_lock); +} diff --git a/kernel/timer.c b/kernel/timer.c index d6459923d24..fd6198692b5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {} static struct debug_obj_descr timer_debug_descr; +static void *timer_debug_hint(void *addr) +{ + return ((struct timer_list *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", + .debug_hint = timer_debug_hint, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, @@ -964,6 +970,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. * + * Note: You must not hold locks that are held in interrupt context + * while calling this function. Even if the lock has nothing to do + * with the timer in question. Here's why: + * + * CPU0 CPU1 + * ---- ---- + * <SOFTIRQ> + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * <IRQ> + * spin_lock(somelock); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now del_timer_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but + * it has interrupted the softirq that CPU0 is waiting to finish. + * * The function returns whether it has deactivated a pending timer or not. */ int del_timer_sync(struct timer_list *timer) @@ -971,6 +996,10 @@ int del_timer_sync(struct timer_list *timer) #ifdef CONFIG_LOCKDEP unsigned long flags; + /* + * If lockdep gives a backtrace here, please reference + * the synchronization rules above. + */ local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); @@ -1295,19 +1324,6 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -/* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... - */ - -void do_timer(unsigned long ticks) -{ - jiffies_64 += ticks; - update_wall_time(); - calc_global_load(ticks); -} - #ifdef __ARCH_WANT_SYS_ALARM /* diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee6578b578a..b5fe4c00eb3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -316,6 +316,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, static struct debug_obj_descr work_debug_descr; +static void *work_debug_hint(void *addr) +{ + return ((struct work_struct *) addr)->func; +} + /* * fixup_init is called when: * - an active object is initialized @@ -387,6 +392,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", + .debug_hint = work_debug_hint, .fixup_init = work_fixup_init, .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, diff --git a/lib/debugobjects.c b/lib/debugobjects.c index deebcc57d4e..9d86e45086f 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -249,14 +249,17 @@ static struct debug_bucket *get_bucket(unsigned long addr) static void debug_print_object(struct debug_obj *obj, char *msg) { + struct debug_obj_descr *descr = obj->descr; static int limit; - if (limit < 5 && obj->descr != descr_test) { + if (limit < 5 && descr != descr_test) { + void *hint = descr->debug_hint ? + descr->debug_hint(obj->object) : NULL; limit++; WARN(1, KERN_ERR "ODEBUG: %s %s (active state %u) " - "object type: %s\n", + "object type: %s hint: %pS\n", msg, obj_states[obj->state], obj->astate, - obj->descr->name); + descr->name, hint); } debug_objects_warnings++; } diff --git a/lib/plist.c b/lib/plist.c index 1471988d919..0ae7e643172 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -28,6 +28,8 @@ #ifdef CONFIG_DEBUG_PI_LIST +static struct plist_head test_head; + static void plist_check_prev_next(struct list_head *t, struct list_head *p, struct list_head *n) { @@ -54,12 +56,13 @@ static void plist_check_list(struct list_head *top) static void plist_check_head(struct plist_head *head) { - WARN_ON(!head->rawlock && !head->spinlock); + WARN_ON(head != &test_head && !head->rawlock && !head->spinlock); if (head->rawlock) WARN_ON_SMP(!raw_spin_is_locked(head->rawlock)); if (head->spinlock) WARN_ON_SMP(!spin_is_locked(head->spinlock)); - plist_check_list(&head->prio_list); + if (!plist_head_empty(head)) + plist_check_list(&plist_first(head)->prio_list); plist_check_list(&head->node_list); } @@ -75,25 +78,33 @@ static void plist_check_head(struct plist_head *head) */ void plist_add(struct plist_node *node, struct plist_head *head) { - struct plist_node *iter; + struct plist_node *first, *iter, *prev = NULL; + struct list_head *node_next = &head->node_list; plist_check_head(head); WARN_ON(!plist_node_empty(node)); + WARN_ON(!list_empty(&node->prio_list)); + + if (plist_head_empty(head)) + goto ins_node; - list_for_each_entry(iter, &head->prio_list, plist.prio_list) { - if (node->prio < iter->prio) - goto lt_prio; - else if (node->prio == iter->prio) { - iter = list_entry(iter->plist.prio_list.next, - struct plist_node, plist.prio_list); - goto eq_prio; + first = iter = plist_first(head); + + do { + if (node->prio < iter->prio) { + node_next = &iter->node_list; + break; } - } -lt_prio: - list_add_tail(&node->plist.prio_list, &iter->plist.prio_list); -eq_prio: - list_add_tail(&node->plist.node_list, &iter->plist.node_list); + prev = iter; + iter = list_entry(iter->prio_list.next, + struct plist_node, prio_list); + } while (iter != first); + + if (!prev || prev->prio != node->prio) + list_add_tail(&node->prio_list, &iter->prio_list); +ins_node: + list_add_tail(&node->node_list, node_next); plist_check_head(head); } @@ -108,14 +119,98 @@ void plist_del(struct plist_node *node, struct plist_head *head) { plist_check_head(head); - if (!list_empty(&node->plist.prio_list)) { - struct plist_node *next = plist_first(&node->plist); + if (!list_empty(&node->prio_list)) { + if (node->node_list.next != &head->node_list) { + struct plist_node *next; + + next = list_entry(node->node_list.next, + struct plist_node, node_list); - list_move_tail(&next->plist.prio_list, &node->plist.prio_list); - list_del_init(&node->plist.prio_list); + /* add the next plist_node into prio_list */ + if (list_empty(&next->prio_list)) + list_add(&next->prio_list, &node->prio_list); + } + list_del_init(&node->prio_list); } - list_del_init(&node->plist.node_list); + list_del_init(&node->node_list); plist_check_head(head); } + +#ifdef CONFIG_DEBUG_PI_LIST +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/init.h> + +static struct plist_node __initdata test_node[241]; + +static void __init plist_test_check(int nr_expect) +{ + struct plist_node *first, *prio_pos, *node_pos; + + if (plist_head_empty(&test_head)) { + BUG_ON(nr_expect != 0); + return; + } + + prio_pos = first = plist_first(&test_head); + plist_for_each(node_pos, &test_head) { + if (nr_expect-- < 0) + break; + if (node_pos == first) + continue; + if (node_pos->prio == prio_pos->prio) { + BUG_ON(!list_empty(&node_pos->prio_list)); + continue; + } + + BUG_ON(prio_pos->prio > node_pos->prio); + BUG_ON(prio_pos->prio_list.next != &node_pos->prio_list); + prio_pos = node_pos; + } + + BUG_ON(nr_expect != 0); + BUG_ON(prio_pos->prio_list.next != &first->prio_list); +} + +static int __init plist_test(void) +{ + int nr_expect = 0, i, loop; + unsigned int r = local_clock(); + + printk(KERN_INFO "start plist test\n"); + plist_head_init(&test_head, NULL); + for (i = 0; i < ARRAY_SIZE(test_node); i++) + plist_node_init(test_node + i, 0); + + for (loop = 0; loop < 1000; loop++) { + r = r * 193939 % 47629; + i = r % ARRAY_SIZE(test_node); + if (plist_node_empty(test_node + i)) { + r = r * 193939 % 47629; + test_node[i].prio = r % 99; + plist_add(test_node + i, &test_head); + nr_expect++; + } else { + plist_del(test_node + i, &test_head); + nr_expect--; + } + plist_test_check(nr_expect); + } + + for (i = 0; i < ARRAY_SIZE(test_node); i++) { + if (plist_node_empty(test_node + i)) + continue; + plist_del(test_node + i, &test_head); + nr_expect--; + plist_test_check(nr_expect); + } + + printk(KERN_INFO "end plist test\n"); + return 0; +} + +module_init(plist_test); + +#endif diff --git a/lib/rwsem.c b/lib/rwsem.c index f236d7cd5cf..aa7c3052261 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -222,8 +222,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, /* * wait for the read lock to be granted */ -asmregparm struct rw_semaphore __sched * -rwsem_down_read_failed(struct rw_semaphore *sem) +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, -RWSEM_ACTIVE_READ_BIAS); @@ -232,8 +231,7 @@ rwsem_down_read_failed(struct rw_semaphore *sem) /* * wait for the write lock to be granted */ -asmregparm struct rw_semaphore __sched * -rwsem_down_write_failed(struct rw_semaphore *sem) +struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, -RWSEM_ACTIVE_WRITE_BIAS); @@ -243,7 +241,7 @@ rwsem_down_write_failed(struct rw_semaphore *sem) * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; @@ -263,7 +261,7 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ -asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; diff --git a/mm/Makefile b/mm/Makefile index 2b1b575ae71..42a8326c3e3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o -obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ @@ -15,6 +15,12 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ $(mmu-y) obj-y += init-mm.o +ifdef CONFIG_NO_BOOTMEM + obj-y += nobootmem.o +else + obj-y += bootmem.o +endif + obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o diff --git a/mm/bootmem.c b/mm/bootmem.c index 13b0caa9793..07aeb89e396 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -23,6 +23,13 @@ #include "internal.h" +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data = { + .bdata = &bootmem_node_data[0] +}; +EXPORT_SYMBOL(contig_page_data); +#endif + unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; @@ -35,7 +42,6 @@ unsigned long max_pfn; unsigned long saved_max_pfn; #endif -#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -146,7 +152,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } -#endif + /* * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range @@ -171,53 +177,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) } } -#ifdef CONFIG_NO_BOOTMEM -static void __init __free_pages_memory(unsigned long start, unsigned long end) -{ - int i; - unsigned long start_aligned, end_aligned; - int order = ilog2(BITS_PER_LONG); - - start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); - end_aligned = end & ~(BITS_PER_LONG - 1); - - if (end_aligned <= start_aligned) { - for (i = start; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); - - return; - } - - for (i = start; i < start_aligned; i++) - __free_pages_bootmem(pfn_to_page(i), 0); - - for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) - __free_pages_bootmem(pfn_to_page(i), order); - - for (i = end_aligned; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); -} - -unsigned long __init free_all_memory_core_early(int nodeid) -{ - int i; - u64 start, end; - unsigned long count = 0; - struct range *range = NULL; - int nr_range; - - nr_range = get_free_all_memory_range(&range, nodeid); - - for (i = 0; i < nr_range; i++) { - start = range[i].start; - end = range[i].end; - count += end - start; - __free_pages_memory(start, end); - } - - return count; -} -#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -278,7 +237,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } -#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -289,12 +247,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); -#ifdef CONFIG_NO_BOOTMEM - /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ - return 0; -#else return free_all_bootmem_core(pgdat->bdata); -#endif } /** @@ -304,16 +257,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { -#ifdef CONFIG_NO_BOOTMEM - /* - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesnt have RAM installed - * low ram will be on Node1 - * Use MAX_NUMNODES will make sure all ranges in early_node_map[] - * will be used instead of only Node0 related - */ - return free_all_memory_core_early(MAX_NUMNODES); -#else unsigned long total_pages = 0; bootmem_data_t *bdata; @@ -321,10 +264,8 @@ unsigned long __init free_all_bootmem(void) total_pages += free_all_bootmem_core(bdata); return total_pages; -#endif } -#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -419,7 +360,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } -#endif /** * free_bootmem_node - mark a page range as usable @@ -434,10 +374,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { -#ifdef CONFIG_NO_BOOTMEM - kmemleak_free_part(__va(physaddr), size); - memblock_x86_free_range(physaddr, physaddr + size); -#else unsigned long start, end; kmemleak_free_part(__va(physaddr), size); @@ -446,7 +382,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, end = PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); -#endif } /** @@ -460,10 +395,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { -#ifdef CONFIG_NO_BOOTMEM - kmemleak_free_part(__va(addr), size); - memblock_x86_free_range(addr, addr + size); -#else unsigned long start, end; kmemleak_free_part(__va(addr), size); @@ -472,7 +403,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); -#endif } /** @@ -489,17 +419,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { -#ifdef CONFIG_NO_BOOTMEM - panic("no bootmem"); - return 0; -#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); -#endif } /** @@ -515,20 +440,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { -#ifdef CONFIG_NO_BOOTMEM - panic("no bootmem"); - return 0; -#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); -#endif } -#ifndef CONFIG_NO_BOOTMEM int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { @@ -685,33 +604,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } -#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { -#ifdef CONFIG_NO_BOOTMEM - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - -restart: - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); - - if (ptr) - return ptr; - - if (goal != 0) { - goal = 0; - goto restart; - } - - return NULL; -#else bootmem_data_t *bdata; void *region; @@ -737,7 +635,6 @@ restart: } return NULL; -#endif } /** @@ -758,10 +655,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, { unsigned long limit = 0; -#ifdef CONFIG_NO_BOOTMEM - limit = -1UL; -#endif - return ___alloc_bootmem_nopanic(size, align, goal, limit); } @@ -798,14 +691,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, { unsigned long limit = 0; -#ifdef CONFIG_NO_BOOTMEM - limit = -1UL; -#endif - return ___alloc_bootmem(size, align, goal, limit); } -#ifndef CONFIG_NO_BOOTMEM static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -822,7 +710,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } -#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -842,24 +729,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); - if (ptr) - return ptr; - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, -1ULL); -#else - ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); -#endif - - return ptr; + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -880,13 +753,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long new_goal; new_goal = MAX_DMA32_PFN << PAGE_SHIFT; -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - new_goal, -1ULL); -#else ptr = alloc_bootmem_core(pgdat->bdata, size, align, new_goal, 0); -#endif if (ptr) return ptr; } @@ -907,16 +775,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { -#ifdef CONFIG_NO_BOOTMEM - unsigned long pfn, goal, limit; - - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; - - return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, - SMP_CACHE_BYTES, goal, limit); -#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -926,7 +784,6 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); -#endif } #endif @@ -938,16 +795,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); -#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); -#endif if (ptr) return ptr; @@ -995,21 +847,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -#else - ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -#endif - return ptr; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dbe99a5f207..113e35c4750 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1762,6 +1762,10 @@ static void collapse_huge_page(struct mm_struct *mm, #ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); new_page = *hpage; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + return; + } #else VM_BUG_ON(*hpage); /* @@ -1781,12 +1785,12 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = ERR_PTR(-ENOMEM); return; } -#endif if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { up_read(&mm->mmap_sem); put_page(new_page); return; } +#endif /* after allocating the hugepage upgrade to mmap_sem write mode */ up_read(&mm->mmap_sem); diff --git a/mm/nobootmem.c b/mm/nobootmem.c new file mode 100644 index 00000000000..e2bdb07079c --- /dev/null +++ b/mm/nobootmem.c @@ -0,0 +1,435 @@ +/* + * bootmem - A boot-time physical memory allocator and configurator + * + * Copyright (C) 1999 Ingo Molnar + * 1999 Kanoj Sarcar, SGI + * 2008 Johannes Weiner + * + * Access to this subsystem has to be serialized externally (which is true + * for the boot process anyway). + */ +#include <linux/init.h> +#include <linux/pfn.h> +#include <linux/slab.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/kmemleak.h> +#include <linux/range.h> +#include <linux/memblock.h> + +#include <asm/bug.h> +#include <asm/io.h> +#include <asm/processor.h> + +#include "internal.h" + +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data; +EXPORT_SYMBOL(contig_page_data); +#endif + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +#ifdef CONFIG_CRASH_DUMP +/* + * If we have booted due to a crash, max_pfn will be a very low value. We need + * to know the amount of memory that the previous kernel used. + */ +unsigned long saved_max_pfn; +#endif + +static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + void *ptr; + u64 addr; + + if (limit > memblock.current_limit) + limit = memblock.current_limit; + + addr = find_memory_core_early(nid, size, align, goal, limit); + + if (addr == MEMBLOCK_ERROR) + return NULL; + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; +} + +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. + */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long cursor, end; + + kmemleak_free_part(__va(addr), size); + + cursor = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} + +/** + * free_all_bootmem_node - release a node's free pages to the buddy allocator + * @pgdat: node to be released + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +{ + register_page_bootmem_info_node(pgdat); + + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +} + +/** + * free_all_bootmem - release free pages to the buddy allocator + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem(void) +{ + /* + * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesnt have RAM installed + * low ram will be on Node1 + * Use MAX_NUMNODES will make sure all ranges in early_node_map[] + * will be used instead of only Node0 related + */ + return free_all_memory_core_early(MAX_NUMNODES); +} + +/** + * free_bootmem_node - mark a page range as usable + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must reside completely on the specified node. + */ +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) +{ + kmemleak_free_part(__va(physaddr), size); + memblock_x86_free_range(physaddr, physaddr + size); +} + +/** + * free_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must be contiguous but may span node boundaries. + */ +void __init free_bootmem(unsigned long addr, unsigned long size) +{ + kmemleak_free_part(__va(addr), size); + memblock_x86_free_range(addr, addr + size); +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +} + +/** + * __alloc_bootmem_nopanic - allocate boot memory without panicking + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * Returns NULL on failure. + */ +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem_nopanic(size, align, goal, limit); +} + +static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); + + if (mem) + return mem; + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +/** + * __alloc_bootmem - allocate boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem(size, align, goal, limit); +} + +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + return __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, -1ULL); +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + new_goal, -1ULL); + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + +} + +#ifdef CONFIG_SPARSEMEM +/** + * alloc_bootmem_section - allocate boot memory from a specific section + * @size: size of the request in bytes + * @section_nr: sparse map section to allocate from + * + * Return NULL on failure. + */ +void * __init alloc_bootmem_section(unsigned long size, + unsigned long section_nr) +{ + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +} +#endif + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + return __alloc_bootmem_nopanic(size, align, goal); +} + +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif + +/** + * __alloc_bootmem_low - allocate low boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +} + +/** + * __alloc_bootmem_low_node - allocate low boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); + if (ptr) + return ptr; + + return __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cdef1d4b4e4..bd7625676a6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3699,13 +3699,45 @@ void __init free_bootmem_with_active_regions(int nid, } #ifdef CONFIG_HAVE_MEMBLOCK +/* + * Basic iterator support. Return the last range of PFNs for a node + * Note: nid == MAX_NUMNODES returns last region regardless of node + */ +static int __meminit last_active_region_index_in_nid(int nid) +{ + int i; + + for (i = nr_nodemap_entries - 1; i >= 0; i--) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. Return the previous active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardless of node + */ +static int __meminit previous_active_region_index_in_nid(int index, int nid) +{ + for (index = index - 1; index >= 0; index--) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#define for_each_active_range_index_in_nid_reverse(i, nid) \ + for (i = last_active_region_index_in_nid(nid); i != -1; \ + i = previous_active_region_index_in_nid(i, nid)) + u64 __init find_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { int i; /* Need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { + for_each_active_range_index_in_nid_reverse(i, nid) { u64 addr; u64 ei_start, ei_last; u64 final_start, final_end; @@ -3748,34 +3780,6 @@ int __init add_from_early_node_map(struct range *range, int az, return nr_range; } -#ifdef CONFIG_NO_BOOTMEM -void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - void *ptr; - u64 addr; - - if (limit > memblock.current_limit) - limit = memblock.current_limit; - - addr = find_memory_core_early(nid, size, align, goal, limit); - - if (addr == MEMBLOCK_ERROR) - return NULL; - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; -} -#endif - - void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -4809,15 +4813,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { -#ifndef CONFIG_NO_BOOTMEM - .bdata = &bootmem_node_data[0] -#endif - }; -EXPORT_SYMBOL(contig_page_data); -#endif - void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, diff --git a/mm/rmap.c b/mm/rmap.c index f21f4a1d6a1..941bf82e896 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -497,41 +497,51 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int referenced = 0; - /* - * Don't want to elevate referenced for mlocked page that gets this far, - * in order that it progresses to try_to_unmap and is moved to the - * unevictable list. - */ - if (vma->vm_flags & VM_LOCKED) { - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; - } - - /* Pretend the page is referenced if the task has the - swap token and is in the middle of a page fault. */ - if (mm != current->mm && has_swap_token(mm) && - rwsem_is_locked(&mm->mmap_sem)) - referenced++; - if (unlikely(PageTransHuge(page))) { pmd_t *pmd; spin_lock(&mm->page_table_lock); + /* + * rmap might return false positives; we must filter + * these out using page_check_address_pmd(). + */ pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_FLAG); - if (pmd && !pmd_trans_splitting(*pmd) && - pmdp_clear_flush_young_notify(vma, address, pmd)) + if (!pmd) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + if (vma->vm_flags & VM_LOCKED) { + spin_unlock(&mm->page_table_lock); + *mapcount = 0; /* break early from loop */ + *vm_flags |= VM_LOCKED; + goto out; + } + + /* go ahead even if the pmd is pmd_trans_splitting() */ + if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; spin_unlock(&mm->page_table_lock); } else { pte_t *pte; spinlock_t *ptl; + /* + * rmap might return false positives; we must filter + * these out using page_check_address(). + */ pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; + if (vma->vm_flags & VM_LOCKED) { + pte_unmap_unlock(pte, ptl); + *mapcount = 0; /* break early from loop */ + *vm_flags |= VM_LOCKED; + goto out; + } + if (ptep_clear_flush_young_notify(vma, address, pte)) { /* * Don't treat a reference through a sequentially read @@ -546,6 +556,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } + /* Pretend the page is referenced if the task has the + swap token and is in the middle of a page fault. */ + if (mm != current->mm && has_swap_token(mm) && + rwsem_is_locked(&mm->mmap_sem)) + referenced++; + (*mapcount)--; if (referenced) diff --git a/mm/shmem.c b/mm/shmem.c index 5ee67c99060..3437b65d6d6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2144,8 +2144,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, { struct inode *inode = dentry->d_inode; - if (*len < 3) + if (*len < 3) { + *len = 3; return 255; + } if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, diff --git a/net/Makefile b/net/Makefile index a3330ebe2c5..a51d9465e62 100644 --- a/net/Makefile +++ b/net/Makefile @@ -19,9 +19,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ -ifneq ($(CONFIG_IPV6),) -obj-y += ipv6/ -endif +obj-$(CONFIG_NET) += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index 9190ae462cb..6dee7bf648a 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -6,6 +6,7 @@ config BRIDGE tristate "802.1d Ethernet Bridging" select LLC select STP + depends on IPV6 || IPV6=n ---help--- If you say Y here, then your Linux box will be able to act as an Ethernet bridge, which means that the different Ethernet segments it diff --git a/net/core/dev.c b/net/core/dev.c index 8ae6631abcc..6561021d22d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1114,13 +1114,21 @@ EXPORT_SYMBOL(netdev_bonding_change); void dev_load(struct net *net, const char *name) { struct net_device *dev; + int no_module; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); rcu_read_unlock(); - if (!dev && capable(CAP_NET_ADMIN)) - request_module("%s", name); + no_module = !dev; + if (no_module && capable(CAP_NET_ADMIN)) + no_module = request_module("netdev-%s", name); + if (no_module && capable(CAP_SYS_MODULE)) { + if (!request_module("%s", name)) + pr_err("Loading kernel module for a network device " +"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s " +"instead\n", name); + } } EXPORT_SYMBOL(dev_load); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a9e7fc4c461..b5bada92f63 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3321,7 +3321,7 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) pkt_dev->started_at); ktime_t idle = ns_to_ktime(pkt_dev->idle_acc); - p += sprintf(p, "OK: %llu(c%llu+d%llu) nsec, %llu (%dbyte,%dfrags)\n", + p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", (unsigned long long)ktime_to_us(elapsed), (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)), (unsigned long long)ktime_to_us(idle), diff --git a/net/core/scm.c b/net/core/scm.c index bbe45445080..4c1ef026d69 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -95,7 +95,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) int fd = fdp[i]; struct file *file; - if (fd < 0 || !(file = fget(fd))) + if (fd < 0 || !(file = fget_raw(fd))) return -EBADF; *fpp++ = file; fpl->count++; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index df4616fce92..036652c8166 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -670,7 +670,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) ifap = &ifa->ifa_next) { if (!strcmp(ifr.ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == - ifa->ifa_address) { + ifa->ifa_local) { break; /* found */ } } @@ -1040,8 +1040,8 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev, return; arp_send(ARPOP_REQUEST, ETH_P_ARP, - ifa->ifa_address, dev, - ifa->ifa_address, NULL, + ifa->ifa_local, dev, + ifa->ifa_local, NULL, dev->dev_addr, NULL); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6613edfac28..d1d0e2c256f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1765,4 +1765,4 @@ module_exit(ipgre_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); -MODULE_ALIAS("gre0"); +MODULE_ALIAS_NETDEV("gre0"); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 988f52fba54..a5f58e7cbb2 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -913,4 +913,4 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); -MODULE_ALIAS("tunl0"); +MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4f4483e697b..e528a42a52b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -57,6 +57,7 @@ MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETDEV("ip6tnl0"); #ifdef IP6_TNL_DEBUG #define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __func__) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 904312e25a3..e7db7014e89 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -739,8 +739,10 @@ restart: if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src); - else + else if (!(rt->dst.flags & DST_HOST)) nrt = rt6_alloc_clone(rt, &fl->fl6_dst); + else + goto out2; dst_release(&rt->dst); rt = nrt ? : net->ipv6.ip6_null_entry; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 8ce38f10a54..d2c16e10f65 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1290,4 +1290,4 @@ static int __init sit_init(void) module_init(sit_init); module_exit(sit_cleanup); MODULE_LICENSE("GPL"); -MODULE_ALIAS("sit0"); +MODULE_ALIAS_NETDEV("sit0"); diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 71f373c421b..c47a511f203 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -551,7 +551,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (conn->c_loopback && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { rds_cong_map_updated(conn->c_fcong, ~(u64) 0); - return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + scat = &rm->data.op_sg[sg]; + ret = sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + ret = min_t(int, ret, scat->length - conn->c_xmit_data_off); + return ret; } /* FIXME we may overallocate here */ diff --git a/net/rds/loop.c b/net/rds/loop.c index aeec1d483b1..bca6761a3ca 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -61,10 +61,15 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { + struct scatterlist *sgp = &rm->data.op_sg[sg]; + int ret = sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len); + /* Do not send cong updates to loopback */ if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { rds_cong_map_updated(conn->c_fcong, ~(u64) 0); - return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off); + goto out; } BUG_ON(hdr_off || sg || off); @@ -80,8 +85,8 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, NULL); rds_inc_put(&rm->m_inc); - - return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); +out: + return ret; } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 243fc09b164..59e599498e3 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -252,23 +252,37 @@ static void rpc_set_active(struct rpc_task *task) /* * Mark an RPC call as having completed by clearing the 'active' bit + * and then waking up all tasks that were sleeping. */ -static void rpc_mark_complete_task(struct rpc_task *task) +static int rpc_complete_task(struct rpc_task *task) { - smp_mb__before_clear_bit(); + void *m = &task->tk_runstate; + wait_queue_head_t *wq = bit_waitqueue(m, RPC_TASK_ACTIVE); + struct wait_bit_key k = __WAIT_BIT_KEY_INITIALIZER(m, RPC_TASK_ACTIVE); + unsigned long flags; + int ret; + + spin_lock_irqsave(&wq->lock, flags); clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); - smp_mb__after_clear_bit(); - wake_up_bit(&task->tk_runstate, RPC_TASK_ACTIVE); + ret = atomic_dec_and_test(&task->tk_count); + if (waitqueue_active(wq)) + __wake_up_locked_key(wq, TASK_NORMAL, &k); + spin_unlock_irqrestore(&wq->lock, flags); + return ret; } /* * Allow callers to wait for completion of an RPC call + * + * Note the use of out_of_line_wait_on_bit() rather than wait_on_bit() + * to enforce taking of the wq->lock and hence avoid races with + * rpc_complete_task(). */ int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *)) { if (action == NULL) action = rpc_wait_bit_killable; - return wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE, + return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE, action, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); @@ -857,34 +871,67 @@ static void rpc_async_release(struct work_struct *work) rpc_free_task(container_of(work, struct rpc_task, u.tk_work)); } -void rpc_put_task(struct rpc_task *task) +static void rpc_release_resources_task(struct rpc_task *task) { - if (!atomic_dec_and_test(&task->tk_count)) - return; - /* Release resources */ if (task->tk_rqstp) xprt_release(task); if (task->tk_msg.rpc_cred) put_rpccred(task->tk_msg.rpc_cred); rpc_task_release_client(task); - if (task->tk_workqueue != NULL) { +} + +static void rpc_final_put_task(struct rpc_task *task, + struct workqueue_struct *q) +{ + if (q != NULL) { INIT_WORK(&task->u.tk_work, rpc_async_release); - queue_work(task->tk_workqueue, &task->u.tk_work); + queue_work(q, &task->u.tk_work); } else rpc_free_task(task); } + +static void rpc_do_put_task(struct rpc_task *task, struct workqueue_struct *q) +{ + if (atomic_dec_and_test(&task->tk_count)) { + rpc_release_resources_task(task); + rpc_final_put_task(task, q); + } +} + +void rpc_put_task(struct rpc_task *task) +{ + rpc_do_put_task(task, NULL); +} EXPORT_SYMBOL_GPL(rpc_put_task); +void rpc_put_task_async(struct rpc_task *task) +{ + rpc_do_put_task(task, task->tk_workqueue); +} +EXPORT_SYMBOL_GPL(rpc_put_task_async); + static void rpc_release_task(struct rpc_task *task) { dprintk("RPC: %5u release task\n", task->tk_pid); BUG_ON (RPC_IS_QUEUED(task)); - /* Wake up anyone who is waiting for task completion */ - rpc_mark_complete_task(task); + rpc_release_resources_task(task); - rpc_put_task(task); + /* + * Note: at this point we have been removed from rpc_clnt->cl_tasks, + * so it should be safe to use task->tk_count as a test for whether + * or not any other processes still hold references to our rpc_task. + */ + if (atomic_read(&task->tk_count) != 1 + !RPC_IS_ASYNC(task)) { + /* Wake up anyone who may be waiting for task completion */ + if (!rpc_complete_task(task)) + return; + } else { + if (!atomic_dec_and_test(&task->tk_count)) + return; + } + rpc_final_put_task(task, task->tk_workqueue); } int rpciod_up(void) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 9df1eadc912..1a10dcd999e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1335,6 +1335,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, p, 0, length, DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { put_page(p); + svc_rdma_put_context(ctxt, 1); return; } atomic_inc(&xprt->sc_dma_used); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c431f5a5796..be96d429b47 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1631,7 +1631,8 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, } xs_reclassify_socket(family, sock); - if (xs_bind(transport, sock)) { + err = xs_bind(transport, sock); + if (err) { sock_release(sock); goto out; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dd419d28620..ba5b8c20849 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -850,7 +850,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * Get the parent directory, calculate the hash for last * component. */ - err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd); + err = kern_path_parent(sunaddr->sun_path, &nd); if (err) goto out_mknod_parent; @@ -1724,7 +1724,11 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_namelen = 0; - mutex_lock(&u->readlock); + err = mutex_lock_interruptible(&u->readlock); + if (err) { + err = sock_intr_errno(sock_rcvtimeo(sk, noblock)); + goto out; + } skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) { @@ -1864,7 +1868,11 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, memset(&tmp_scm, 0, sizeof(tmp_scm)); } - mutex_lock(&u->readlock); + err = mutex_lock_interruptible(&u->readlock); + if (err) { + err = sock_intr_errno(timeo); + goto out; + } do { int chunk; @@ -1895,11 +1903,12 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, timeo = unix_stream_data_wait(sk, timeo); - if (signal_pending(current)) { + if (signal_pending(current) + || mutex_lock_interruptible(&u->readlock)) { err = sock_intr_errno(timeo); goto out; } - mutex_lock(&u->readlock); + continue; unlock: unix_state_unlock(sk); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index f89f83bf828..b6f4b994eb3 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -104,7 +104,7 @@ struct sock *unix_get_socket(struct file *filp) /* * Socket ? */ - if (S_ISSOCK(inode->i_mode)) { + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); struct sock *s = sock->sk; diff --git a/scripts/basic/fixdep.c b/scripts/basic/fixdep.c index 6c94c6ce292..291228e2598 100644 --- a/scripts/basic/fixdep.c +++ b/scripts/basic/fixdep.c @@ -309,6 +309,11 @@ static void do_config_file(const char *filename) close(fd); } +/* + * Important: The below generated source_foo.o and deps_foo.o variable + * assignments are parsed not only by make, but also by the rather simple + * parser in scripts/mod/sumversion.c. + */ static void parse_dep_file(void *map, size_t len) { char *m = map; @@ -323,7 +328,6 @@ static void parse_dep_file(void *map, size_t len) exit(1); } memcpy(s, m, p-m); s[p-m] = 0; - printf("deps_%s := \\\n", target); m = p+1; clear_config(); @@ -343,12 +347,15 @@ static void parse_dep_file(void *map, size_t len) strrcmp(s, "arch/um/include/uml-config.h") && strrcmp(s, ".ver")) { /* - * Do not output the first dependency (the - * source file), so that kbuild is not confused - * if a .c file is rewritten into .S or vice - * versa. + * Do not list the source file as dependency, so that + * kbuild is not confused if a .c file is rewritten + * into .S or vice versa. Storing it in source_* is + * needed for modpost to compute srcversions. */ - if (!first) + if (first) { + printf("source_%s := %s\n\n", target, s); + printf("deps_%s := \\\n", target); + } else printf(" %s \\\n", s); do_config_file(s); } diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4c0383da1c9..58848e3e392 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2654,11 +2654,6 @@ sub process { WARN("Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr); } -# SPIN_LOCK_UNLOCKED & RW_LOCK_UNLOCKED are deprecated - if ($line =~ /\b(SPIN_LOCK_UNLOCKED|RW_LOCK_UNLOCKED)/) { - ERROR("Use of $1 is deprecated: see Documentation/spinlocks.txt\n" . $herecurr); - } - # warn about #if 0 if ($line =~ /^.\s*\#\s*if\s+0\b/) { CHK("if this code is redundant consider removing it\n" . diff --git a/scripts/mod/sumversion.c b/scripts/mod/sumversion.c index ecf9c7dc182..9dfcd6d988d 100644 --- a/scripts/mod/sumversion.c +++ b/scripts/mod/sumversion.c @@ -300,8 +300,8 @@ static int is_static_library(const char *objfile) return 0; } -/* We have dir/file.o. Open dir/.file.o.cmd, look for deps_ line to - * figure out source file. */ +/* We have dir/file.o. Open dir/.file.o.cmd, look for source_ and deps_ line + * to figure out source files. */ static int parse_source_files(const char *objfile, struct md4_ctx *md) { char *cmd, *file, *line, *dir; @@ -340,6 +340,21 @@ static int parse_source_files(const char *objfile, struct md4_ctx *md) */ while ((line = get_next_line(&pos, file, flen)) != NULL) { char* p = line; + + if (strncmp(line, "source_", sizeof("source_")-1) == 0) { + p = strrchr(line, ' '); + if (!p) { + warn("malformed line: %s\n", line); + goto out_file; + } + p++; + if (!parse_file(p, md)) { + warn("could not open %s: %s\n", + p, strerror(errno)); + goto out_file; + } + continue; + } if (strncmp(line, "deps_", sizeof("deps_")-1) == 0) { check_files = 1; continue; diff --git a/scripts/rt-tester/rt-tester.py b/scripts/rt-tester/rt-tester.py index 44423b4dcb8..8c81d76959e 100644 --- a/scripts/rt-tester/rt-tester.py +++ b/scripts/rt-tester/rt-tester.py @@ -33,8 +33,6 @@ cmd_opcodes = { "lockintnowait" : "6", "lockcont" : "7", "unlock" : "8", - "lockbkl" : "9", - "unlockbkl" : "10", "signal" : "11", "resetevent" : "98", "reset" : "99", diff --git a/scripts/rt-tester/t2-l1-2rt-sameprio.tst b/scripts/rt-tester/t2-l1-2rt-sameprio.tst index 8821f27cc8b..3710c8b2090 100644 --- a/scripts/rt-tester/t2-l1-2rt-sameprio.tst +++ b/scripts/rt-tester/t2-l1-2rt-sameprio.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal 0 # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t2-l1-pi.tst b/scripts/rt-tester/t2-l1-pi.tst index cde1f189a02..b4cc95975ad 100644 --- a/scripts/rt-tester/t2-l1-pi.tst +++ b/scripts/rt-tester/t2-l1-pi.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal 0 # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t2-l1-signal.tst b/scripts/rt-tester/t2-l1-signal.tst index 3ab0bfc4995..1b57376cc1f 100644 --- a/scripts/rt-tester/t2-l1-signal.tst +++ b/scripts/rt-tester/t2-l1-signal.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal 0 # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t2-l2-2rt-deadlock.tst b/scripts/rt-tester/t2-l2-2rt-deadlock.tst index f4b5d5d6215..68b10629b6f 100644 --- a/scripts/rt-tester/t2-l2-2rt-deadlock.tst +++ b/scripts/rt-tester/t2-l2-2rt-deadlock.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal 0 # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l1-pi-1rt.tst b/scripts/rt-tester/t3-l1-pi-1rt.tst index 63440ca2cce..8e6c8b11ae5 100644 --- a/scripts/rt-tester/t3-l1-pi-1rt.tst +++ b/scripts/rt-tester/t3-l1-pi-1rt.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l1-pi-2rt.tst b/scripts/rt-tester/t3-l1-pi-2rt.tst index e5816fe67df..69c2212fc52 100644 --- a/scripts/rt-tester/t3-l1-pi-2rt.tst +++ b/scripts/rt-tester/t3-l1-pi-2rt.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l1-pi-3rt.tst b/scripts/rt-tester/t3-l1-pi-3rt.tst index 718b82b5d3b..9b0f1eb26a8 100644 --- a/scripts/rt-tester/t3-l1-pi-3rt.tst +++ b/scripts/rt-tester/t3-l1-pi-3rt.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l1-pi-signal.tst b/scripts/rt-tester/t3-l1-pi-signal.tst index c6e21356349..39ec74ab06e 100644 --- a/scripts/rt-tester/t3-l1-pi-signal.tst +++ b/scripts/rt-tester/t3-l1-pi-signal.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l1-pi-steal.tst b/scripts/rt-tester/t3-l1-pi-steal.tst index f53749d59d7..e03db7e010f 100644 --- a/scripts/rt-tester/t3-l1-pi-steal.tst +++ b/scripts/rt-tester/t3-l1-pi-steal.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t3-l2-pi.tst b/scripts/rt-tester/t3-l2-pi.tst index cdc3e4fd7ba..7b59100d3e4 100644 --- a/scripts/rt-tester/t3-l2-pi.tst +++ b/scripts/rt-tester/t3-l2-pi.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t4-l2-pi-deboost.tst b/scripts/rt-tester/t4-l2-pi-deboost.tst index baa14137f47..2f0e049d644 100644 --- a/scripts/rt-tester/t4-l2-pi-deboost.tst +++ b/scripts/rt-tester/t4-l2-pi-deboost.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst index e6ec0c81b54..04f4034ff89 100644 --- a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst +++ b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst index ca64f8bbf4b..a48a6ee29dd 100644 --- a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst +++ b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst @@ -19,8 +19,6 @@ # lockintnowait lock nr (0-7) # lockcont lock nr (0-7) # unlock lock nr (0-7) -# lockbkl lock nr (0-7) -# unlockbkl lock nr (0-7) # signal thread to signal (0-7) # reset 0 # resetevent 0 @@ -39,9 +37,6 @@ # blocked lock nr (0-7) # blockedwake lock nr (0-7) # unlocked lock nr (0-7) -# lockedbkl dont care -# blockedbkl dont care -# unlockedbkl dont care # opcodeeq command opcode or number # opcodelt number # opcodegt number diff --git a/security/commoncap.c b/security/commoncap.c index 64c2ed9c901..dbfdaed4cc6 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -93,7 +93,7 @@ int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap, * Determine whether the current process may set the system clock and timezone * information, returning 0 if permission granted, -ve if denied. */ -int cap_settime(struct timespec *ts, struct timezone *tz) +int cap_settime(const struct timespec *ts, const struct timezone *tz) { if (!capable(CAP_SYS_TIME)) return -EPERM; diff --git a/security/security.c b/security/security.c index 7b7308ace8c..bb33ecadcf9 100644 --- a/security/security.c +++ b/security/security.c @@ -201,7 +201,7 @@ int security_syslog(int type) return security_ops->syslog(type); } -int security_settime(struct timespec *ts, struct timezone *tz) +int security_settime(const struct timespec *ts, const struct timezone *tz) { return security_ops->settime(ts, tz); } diff --git a/sound/soc/codecs/wm8978.c b/sound/soc/codecs/wm8978.c index 4bbc3442703..8dfb0a0da67 100644 --- a/sound/soc/codecs/wm8978.c +++ b/sound/soc/codecs/wm8978.c @@ -145,18 +145,18 @@ static const struct snd_kcontrol_new wm8978_snd_controls[] = { SOC_SINGLE("DAC Playback Limiter Threshold", WM8978_DAC_LIMITER_2, 4, 7, 0), SOC_SINGLE("DAC Playback Limiter Boost", - WM8978_DAC_LIMITER_2, 0, 15, 0), + WM8978_DAC_LIMITER_2, 0, 12, 0), SOC_ENUM("ALC Enable Switch", alc1), SOC_SINGLE("ALC Capture Min Gain", WM8978_ALC_CONTROL_1, 0, 7, 0), SOC_SINGLE("ALC Capture Max Gain", WM8978_ALC_CONTROL_1, 3, 7, 0), - SOC_SINGLE("ALC Capture Hold", WM8978_ALC_CONTROL_2, 4, 7, 0), + SOC_SINGLE("ALC Capture Hold", WM8978_ALC_CONTROL_2, 4, 10, 0), SOC_SINGLE("ALC Capture Target", WM8978_ALC_CONTROL_2, 0, 15, 0), SOC_ENUM("ALC Capture Mode", alc3), - SOC_SINGLE("ALC Capture Decay", WM8978_ALC_CONTROL_3, 4, 15, 0), - SOC_SINGLE("ALC Capture Attack", WM8978_ALC_CONTROL_3, 0, 15, 0), + SOC_SINGLE("ALC Capture Decay", WM8978_ALC_CONTROL_3, 4, 10, 0), + SOC_SINGLE("ALC Capture Attack", WM8978_ALC_CONTROL_3, 0, 10, 0), SOC_SINGLE("ALC Capture Noise Gate Switch", WM8978_NOISE_GATE, 3, 1, 0), SOC_SINGLE("ALC Capture Noise Gate Threshold", @@ -211,8 +211,10 @@ static const struct snd_kcontrol_new wm8978_snd_controls[] = { WM8978_LOUT2_SPK_CONTROL, WM8978_ROUT2_SPK_CONTROL, 6, 1, 1), /* DAC / ADC oversampling */ - SOC_SINGLE("DAC 128x Oversampling Switch", WM8978_DAC_CONTROL, 8, 1, 0), - SOC_SINGLE("ADC 128x Oversampling Switch", WM8978_ADC_CONTROL, 8, 1, 0), + SOC_SINGLE("DAC 128x Oversampling Switch", WM8978_DAC_CONTROL, + 5, 1, 0), + SOC_SINGLE("ADC 128x Oversampling Switch", WM8978_ADC_CONTROL, + 5, 1, 0), }; /* Mixer #1: Output (OUT1, OUT2) Mixer: mix AUX, Input mixer output and DAC */ diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index 4afbe3b2e44..c6c958ee5d5 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -1418,7 +1418,7 @@ SND_SOC_DAPM_DAC_E("DAC1R", NULL, SND_SOC_NOPM, 0, 0, static const struct snd_soc_dapm_widget wm8994_dac_widgets[] = { SND_SOC_DAPM_DAC("DAC2L", NULL, WM8994_POWER_MANAGEMENT_5, 3, 0), -SND_SOC_DAPM_DAC("DAC1R", NULL, WM8994_POWER_MANAGEMENT_5, 2, 0), +SND_SOC_DAPM_DAC("DAC2R", NULL, WM8994_POWER_MANAGEMENT_5, 2, 0), SND_SOC_DAPM_DAC("DAC1L", NULL, WM8994_POWER_MANAGEMENT_5, 1, 0), SND_SOC_DAPM_DAC("DAC1R", NULL, WM8994_POWER_MANAGEMENT_5, 0, 0), }; @@ -3325,6 +3325,12 @@ static int wm8994_codec_probe(struct snd_soc_codec *codec) case WM8958: snd_soc_add_controls(codec, wm8958_snd_controls, ARRAY_SIZE(wm8958_snd_controls)); + snd_soc_dapm_new_controls(dapm, wm8994_lateclk_widgets, + ARRAY_SIZE(wm8994_lateclk_widgets)); + snd_soc_dapm_new_controls(dapm, wm8994_adc_widgets, + ARRAY_SIZE(wm8994_adc_widgets)); + snd_soc_dapm_new_controls(dapm, wm8994_dac_widgets, + ARRAY_SIZE(wm8994_dac_widgets)); snd_soc_dapm_new_controls(dapm, wm8958_dapm_widgets, ARRAY_SIZE(wm8958_dapm_widgets)); break; @@ -3350,6 +3356,8 @@ static int wm8994_codec_probe(struct snd_soc_codec *codec) } break; case WM8958: + snd_soc_dapm_add_routes(dapm, wm8994_lateclk_intercon, + ARRAY_SIZE(wm8994_lateclk_intercon)); snd_soc_dapm_add_routes(dapm, wm8958_intercon, ARRAY_SIZE(wm8958_intercon)); break; diff --git a/sound/soc/omap/am3517evm.c b/sound/soc/omap/am3517evm.c index 161750443eb..73dde4a1adc 100644 --- a/sound/soc/omap/am3517evm.c +++ b/sound/soc/omap/am3517evm.c @@ -139,7 +139,7 @@ static struct snd_soc_dai_link am3517evm_dai = { .cpu_dai_name ="omap-mcbsp-dai.0", .codec_dai_name = "tlv320aic23-hifi", .platform_name = "omap-pcm-audio", - .codec_name = "tlv320aic23-codec", + .codec_name = "tlv320aic23-codec.2-001a", .init = am3517evm_aic23_init, .ops = &am3517evm_ops, }; diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 25e54230cc6..1790f83ee66 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -941,7 +941,7 @@ static void dapm_seq_run(struct snd_soc_dapm_context *dapm, } if (!list_empty(&pending)) - dapm_seq_run_coalesced(dapm, &pending); + dapm_seq_run_coalesced(cur_dapm, &pending); } static void dapm_widget_update(struct snd_soc_dapm_context *dapm) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 5a72d421e21..e5230c0ef95 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -189,11 +189,15 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir, const char *name, bool is_kallsyms) { const size_t size = PATH_MAX; - char *realname = realpath(name, NULL), - *filename = malloc(size), + char *realname, *filename = malloc(size), *linkname = malloc(size), *targetname; int len, err = -1; + if (is_kallsyms) + realname = (char *)name; + else + realname = realpath(name, NULL); + if (realname == NULL || filename == NULL || linkname == NULL) goto out_free; @@ -225,7 +229,8 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir, if (symlink(targetname, linkname) == 0) err = 0; out_free: - free(realname); + if (!is_kallsyms) + free(realname); free(filename); free(linkname); return err; diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 17f9c4a66dd..194f9e2a328 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -1328,7 +1328,7 @@ static int probe_point_lazy_walker(const char *fname, int lineno, * Continue if no error, because the lazy pattern will match * to other lines */ - return ret < 0 ?: 0; + return ret < 0 ? ret : 0; } /* Find probe points from lazy pattern */ |