diff options
313 files changed, 6646 insertions, 3529 deletions
diff --git a/Documentation/keys.txt b/Documentation/keys.txt index b56aacc1fff..e4dbbdb1bd9 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt @@ -26,7 +26,7 @@ This document has the following sections: - Notes on accessing payload contents - Defining a key type - Request-key callback service - - Key access filesystem + - Garbage collection ============ @@ -113,6 +113,9 @@ Each key has a number of attributes: (*) Dead. The key's type was unregistered, and so the key is now useless. +Keys in the last three states are subject to garbage collection. See the +section on "Garbage collection". + ==================== KEY SERVICE OVERVIEW @@ -754,6 +757,26 @@ The keyctl syscall functions are: successful. + (*) Install the calling process's session keyring on its parent. + + long keyctl(KEYCTL_SESSION_TO_PARENT); + + This functions attempts to install the calling process's session keyring + on to the calling process's parent, replacing the parent's current session + keyring. + + The calling process must have the same ownership as its parent, the + keyring must have the same ownership as the calling process, the calling + process must have LINK permission on the keyring and the active LSM module + mustn't deny permission, otherwise error EPERM will be returned. + + Error ENOMEM will be returned if there was insufficient memory to complete + the operation, otherwise 0 will be returned to indicate success. + + The keyring will be replaced next time the parent process leaves the + kernel and resumes executing userspace. + + =============== KERNEL SERVICES =============== @@ -1231,3 +1254,17 @@ by executing: In this case, the program isn't required to actually attach the key to a ring; the rings are provided for reference. + + +================== +GARBAGE COLLECTION +================== + +Dead keys (for which the type has been removed) will be automatically unlinked +from those keyrings that point to them and deleted as soon as possible by a +background garbage collector. + +Similarly, revoked and expired keys will be garbage collected, but only after a +certain amount of time has passed. This time is set as a number of seconds in: + + /proc/sys/kernel/keys/gc_delay diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt index 89068030b01..34f6638aa5a 100644 --- a/Documentation/kmemleak.txt +++ b/Documentation/kmemleak.txt @@ -27,6 +27,13 @@ To trigger an intermediate memory scan: # echo scan > /sys/kernel/debug/kmemleak +To clear the list of all current possible memory leaks: + + # echo clear > /sys/kernel/debug/kmemleak + +New leaks will then come up upon reading /sys/kernel/debug/kmemleak +again. + Note that the orphan objects are listed in the order they were allocated and one object at the beginning of the list may cause other subsequent objects to be reported as orphan. @@ -42,6 +49,9 @@ Memory scanning parameters can be modified at run-time by writing to the scan=<secs> - set the automatic memory scanning period in seconds (default 600, 0 to stop the automatic scanning) scan - trigger a memory scan + clear - clear list of current memory leak suspects, done by + marking all current reported unreferenced objects grey + dump=<addr> - dump information about the object found at <addr> Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on the kernel command line. @@ -86,6 +96,27 @@ avoid this, kmemleak can also store the number of values pointing to an address inside the block address range that need to be found so that the block is not considered a leak. One example is __vmalloc(). +Testing specific sections with kmemleak +--------------------------------------- + +Upon initial bootup your /sys/kernel/debug/kmemleak output page may be +quite extensive. This can also be the case if you have very buggy code +when doing development. To work around these situations you can use the +'clear' command to clear all reported unreferenced objects from the +/sys/kernel/debug/kmemleak output. By issuing a 'scan' after a 'clear' +you can find new unreferenced objects; this should help with testing +specific sections of code. + +To test a critical section on demand with a clean kmemleak do: + + # echo clear > /sys/kernel/debug/kmemleak + ... test your kernel or modules ... + # echo scan > /sys/kernel/debug/kmemleak + +Then as usual to get your report with: + + # cat /sys/kernel/debug/kmemleak + Kmemleak API ------------ diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt index 2d10053dd97..ae66f9b90a2 100644 --- a/Documentation/s390/s390dbf.txt +++ b/Documentation/s390/s390dbf.txt @@ -495,6 +495,13 @@ and for each vararg a long value. So e.g. for a debug entry with a format string plus two varargs one would need to allocate a (3 * sizeof(long)) byte data area in the debug_register() function. +IMPORTANT: Using "%s" in sprintf event functions is dangerous. You can only +use "%s" in the sprintf event functions, if the memory for the passed string is +available as long as the debug feature exists. The reason behind this is that +due to performance considerations only a pointer to the string is stored in +the debug feature. If you log a string that is freed afterwards, you will get +an OOPS when inspecting the debug feature, because then the debug feature will +access the already freed memory. NOTE: If using the sprintf view do NOT use other event/exception functions than the sprintf-event and -exception functions. diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 322a00bb99d..2dbff53369d 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -19,6 +19,7 @@ Currently, these files might (depending on your configuration) show up in /proc/sys/kernel: - acpi_video_flags - acct +- callhome [ S390 only ] - auto_msgmni - core_pattern - core_uses_pid @@ -91,6 +92,21 @@ valid for 30 seconds. ============================================================== +callhome: + +Controls the kernel's callhome behavior in case of a kernel panic. + +The s390 hardware allows an operating system to send a notification +to a service organization (callhome) in case of an operating system panic. + +When the value in this file is 0 (which is the default behavior) +nothing happens in case of a kernel panic. If this value is set to "1" +the complete kernel oops message is send to the IBM customer service +organization in case the mainframe the Linux operating system is running +on has a service contract with IBM. + +============================================================== + core_pattern: core_pattern is used to specify a core dumpfile pattern name. diff --git a/MAINTAINERS b/MAINTAINERS index 8dca9d89c6c..989ff114939 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -439,7 +439,7 @@ F: drivers/hwmon/ams/ AMSO1100 RNIC DRIVER M: Tom Tucker <tom@opengridcomputing.com> M: Steve Wise <swise@opengridcomputing.com> -L: general@lists.openfabrics.org +L: linux-rdma@vger.kernel.org S: Maintained F: drivers/infiniband/hw/amso1100/ @@ -1494,7 +1494,7 @@ F: drivers/net/cxgb3/ CXGB3 IWARP RNIC DRIVER (IW_CXGB3) M: Steve Wise <swise@chelsio.com> -L: general@lists.openfabrics.org +L: linux-rdma@vger.kernel.org W: http://www.openfabrics.org S: Supported F: drivers/infiniband/hw/cxgb3/ @@ -1868,7 +1868,7 @@ F: fs/efs/ EHCA (IBM GX bus InfiniBand adapter) DRIVER M: Hoang-Nam Nguyen <hnguyen@de.ibm.com> M: Christoph Raisch <raisch@de.ibm.com> -L: general@lists.openfabrics.org +L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/hw/ehca/ @@ -2552,7 +2552,7 @@ INFINIBAND SUBSYSTEM M: Roland Dreier <rolandd@cisco.com> M: Sean Hefty <sean.hefty@intel.com> M: Hal Rosenstock <hal.rosenstock@gmail.com> -L: general@lists.openfabrics.org (moderated for non-subscribers) +L: linux-rdma@vger.kernel.org W: http://www.openib.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband.git S: Supported @@ -2729,7 +2729,7 @@ F: drivers/net/ipg.c IPATH DRIVER M: Ralph Campbell <infinipath@qlogic.com> -L: general@lists.openfabrics.org +L: linux-rdma@vger.kernel.org T: git git://git.qlogic.com/ipath-linux-2.6 S: Supported F: drivers/infiniband/hw/ipath/ @@ -3485,7 +3485,7 @@ F: drivers/scsi/NCR_D700.* NETEFFECT IWARP RNIC DRIVER (IW_NES) M: Faisal Latif <faisal.latif@intel.com> M: Chien Tung <chien.tin.tung@intel.com> -L: general@lists.openfabrics.org +L: linux-rdma@vger.kernel.org W: http://www.neteffect.com S: Supported F: drivers/infiniband/hw/nes/ diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 60c83abfde7..5076a8860b1 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -75,6 +75,7 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TIF_UAC_SIGBUS 7 #define TIF_MEMDIE 8 #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ +#define TIF_NOTIFY_RESUME 10 /* callback before returning to user */ #define TIF_FREEZE 16 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) @@ -82,10 +83,12 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) +#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_FREEZE (1<<TIF_FREEZE) /* Work to do on interrupt/exception return. */ -#define _TIF_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED) +#define _TIF_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ + _TIF_NOTIFY_RESUME) /* Work to do on any return to userspace. */ #define _TIF_ALLWORK_MASK (_TIF_WORK_MASK \ diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index df65eaa84c4..0932dbb1ef8 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -20,6 +20,7 @@ #include <linux/binfmts.h> #include <linux/bitops.h> #include <linux/syscalls.h> +#include <linux/tracehook.h> #include <asm/uaccess.h> #include <asm/sigcontext.h> @@ -683,4 +684,11 @@ do_notify_resume(struct pt_regs *regs, struct switch_stack *sw, { if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs, sw, r0, r19); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); + } } diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 73394e50cbc..d3a39b1e6c0 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -130,11 +130,13 @@ extern void vfp_sync_state(struct thread_info *thread); * TIF_SYSCALL_TRACE - syscall trace active * TIF_SIGPENDING - signal pending * TIF_NEED_RESCHED - rescheduling necessary + * TIF_NOTIFY_RESUME - callback before returning to user * TIF_USEDFPU - FPU was used by this task this quantum (SMP) * TIF_POLLING_NRFLAG - true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_SIGPENDING 0 #define TIF_NEED_RESCHED 1 +#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_SYSCALL_TRACE 8 #define TIF_POLLING_NRFLAG 16 #define TIF_USING_IWMMXT 17 @@ -143,6 +145,7 @@ extern void vfp_sync_state(struct thread_info *thread); #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 8c3de1a350b..7813ab782fd 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -51,7 +51,7 @@ fast_work_pending: work_pending: tst r1, #_TIF_NEED_RESCHED bne work_resched - tst r1, #_TIF_SIGPENDING + tst r1, #_TIF_SIGPENDING|_TIF_NOTIFY_RESUME beq no_work_pending mov r0, sp @ 'regs' mov r2, why @ 'syscall' diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index f6bc5d44278..b76fe06d92e 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -12,6 +12,7 @@ #include <linux/personality.h> #include <linux/freezer.h> #include <linux/uaccess.h> +#include <linux/tracehook.h> #include <asm/elf.h> #include <asm/cacheflush.h> @@ -707,4 +708,11 @@ do_notify_resume(struct pt_regs *regs, unsigned int thread_flags, int syscall) { if (thread_flags & _TIF_SIGPENDING) do_signal(¤t->blocked, regs, syscall); + + if (thread_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); + } } diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index fc42de5ca20..fd0c5d7e933 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -84,6 +84,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_MEMDIE 6 #define TIF_RESTORE_SIGMASK 7 /* restore signal mask in do_signal */ #define TIF_CPU_GOING_TO_SLEEP 8 /* CPU is entering sleep 0 mode */ +#define TIF_NOTIFY_RESUME 9 /* callback before returning to user */ #define TIF_FREEZE 29 #define TIF_DEBUG 30 /* debugging enabled */ #define TIF_USERSPACE 31 /* true if FS sets userspace */ @@ -96,6 +97,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP) +#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FREEZE (1 << TIF_FREEZE) /* Note: The masks below must never span more than 16 bits! */ @@ -103,13 +105,15 @@ static inline struct thread_info *current_thread_info(void) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ ((1 << TIF_SIGPENDING) \ + | _TIF_NOTIFY_RESUME \ | (1 << TIF_NEED_RESCHED) \ | (1 << TIF_POLLING_NRFLAG) \ | (1 << TIF_BREAKPOINT) \ | (1 << TIF_RESTORE_SIGMASK)) /* work to do on any return to userspace */ -#define _TIF_ALLWORK_MASK (_TIF_WORK_MASK | (1 << TIF_SYSCALL_TRACE)) +#define _TIF_ALLWORK_MASK (_TIF_WORK_MASK | (1 << TIF_SYSCALL_TRACE) | \ + _TIF_NOTIFY_RESUME) /* work to do on return from debug mode */ #define _TIF_DBGWORK_MASK (_TIF_WORK_MASK & ~(1 << TIF_BREAKPOINT)) diff --git a/arch/avr32/kernel/entry-avr32b.S b/arch/avr32/kernel/entry-avr32b.S index 009a80155d6..169268c40ae 100644 --- a/arch/avr32/kernel/entry-avr32b.S +++ b/arch/avr32/kernel/entry-avr32b.S @@ -281,7 +281,7 @@ syscall_exit_work: ld.w r1, r0[TI_flags] rjmp 1b -2: mov r2, _TIF_SIGPENDING | _TIF_RESTORE_SIGMASK +2: mov r2, _TIF_SIGPENDING | _TIF_RESTORE_SIGMASK | _TIF_NOTIFY_RESUME tst r1, r2 breq 3f unmask_interrupts diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 27227561bad..64f886fac2e 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -16,6 +16,7 @@ #include <linux/ptrace.h> #include <linux/unistd.h> #include <linux/freezer.h> +#include <linux/tracehook.h> #include <asm/uaccess.h> #include <asm/ucontext.h> @@ -322,4 +323,11 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti) if (ti->flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs, ¤t->blocked, syscall); + + if (ti->flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); + } } diff --git a/arch/cris/kernel/ptrace.c b/arch/cris/kernel/ptrace.c index b326023baab..48b0f391263 100644 --- a/arch/cris/kernel/ptrace.c +++ b/arch/cris/kernel/ptrace.c @@ -16,6 +16,7 @@ #include <linux/errno.h> #include <linux/ptrace.h> #include <linux/user.h> +#include <linux/tracehook.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -36,4 +37,11 @@ void do_notify_resume(int canrestart, struct pt_regs *regs, /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(canrestart,regs); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); + } } diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 4a7a62c6e78..6b0a2b6fed6 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -572,6 +572,8 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(__frame); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } /* end do_notify_resume() */ diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h index 8bbc8b0ee45..70e67e47d02 100644 --- a/arch/h8300/include/asm/thread_info.h +++ b/arch/h8300/include/asm/thread_info.h @@ -89,6 +89,7 @@ static inline struct thread_info *current_thread_info(void) TIF_NEED_RESCHED */ #define TIF_MEMDIE 4 #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ +#define TIF_NOTIFY_RESUME 6 /* callback before returning to user */ #define TIF_FREEZE 16 /* is freezing for suspend */ /* as above, but as bit values */ @@ -97,6 +98,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) +#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FREEZE (1<<TIF_FREEZE) #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index cf3472f7389..af842c369d2 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -39,6 +39,7 @@ #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/freezer.h> +#include <linux/tracehook.h> #include <asm/setup.h> #include <asm/uaccess.h> @@ -552,4 +553,11 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags) { if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs, NULL); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); + } } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 5d7c0e5b9e7..89969e95004 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -192,6 +192,8 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) if (test_thread_flag(TIF_NOTIFY_RESUME)) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(&scr->pt); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } /* copy user rbs to kernel rbs */ diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h index 07bb5bd00e2..71578151a40 100644 --- a/arch/m32r/include/asm/thread_info.h +++ b/arch/m32r/include/asm/thread_info.h @@ -149,6 +149,7 @@ static inline unsigned int get_thread_fault_code(void) #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_SINGLESTEP 3 /* restore singlestep on return to user mode */ #define TIF_IRET 4 /* return with iret */ +#define TIF_NOTIFY_RESUME 5 /* callback before returning to user */ #define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -160,6 +161,7 @@ static inline unsigned int get_thread_fault_code(void) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) #define _TIF_IRET (1<<TIF_IRET) +#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) #define _TIF_USEDFPU (1<<TIF_USEDFPU) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 18124542a6e..144b0f124fc 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -21,6 +21,7 @@ #include <linux/stddef.h> #include <linux/personality.h> #include <linux/freezer.h> +#include <linux/tracehook.h> #include <asm/cacheflush.h> #include <asm/ucontext.h> #include <asm/uaccess.h> @@ -408,5 +409,12 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs,oldset); + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); + } + clear_thread_flag(TIF_IRET); } diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index f9df720d2e4..01cc1630b66 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -115,6 +115,7 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_SYSCALL_AUDIT 3 /* syscall auditing active */ #define TIF_SECCOMP 4 /* secure computing */ +#define TIF_NOTIFY_RESUME 5 /* callback before returning to user */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -139,6 +140,7 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1<<TIF_SECCOMP) +#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) #define _TIF_USEDFPU (1<<TIF_USEDFPU) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 830c5ef9932..6254041b942 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -21,6 +21,7 @@ #include <linux/compiler.h> #include <linux/syscalls.h> #include <linux/uaccess.h> +#include <linux/tracehook.h> #include <asm/abi.h> #include <asm/asm.h> @@ -700,4 +701,11 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); + } } diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index feb2f2e810d..a21f43bc68e 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -568,5 +568,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(__frame); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index 4ce0edfbe96..ac775a76bff 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -59,6 +59,7 @@ struct thread_info { #define TIF_MEMDIE 5 #define TIF_RESTORE_SIGMASK 6 /* restore saved signal mask */ #define TIF_FREEZE 7 /* is freezing for suspend */ +#define TIF_NOTIFY_RESUME 8 /* callback before returning to user */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) @@ -67,8 +68,9 @@ struct thread_info { #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_FREEZE (1 << TIF_FREEZE) +#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | \ +#define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK) #endif /* __KERNEL__ */ diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index e552e547cb9..8c4712b74dc 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -948,7 +948,7 @@ intr_check_sig: /* As above */ mfctl %cr30,%r1 LDREG TI_FLAGS(%r1),%r19 - ldi (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK), %r20 + ldi (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NOTIFY_RESUME), %r20 and,COND(<>) %r19, %r20, %r0 b,n intr_restore /* skip past if we've nothing to do */ diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index f82544225e8..8eb3c63c407 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -25,6 +25,7 @@ #include <linux/stddef.h> #include <linux/compat.h> #include <linux/elf.h> +#include <linux/tracehook.h> #include <asm/ucontext.h> #include <asm/rt_sigframe.h> #include <asm/uaccess.h> @@ -645,4 +646,11 @@ void do_notify_resume(struct pt_regs *regs, long in_syscall) if (test_thread_flag(TIF_SIGPENDING) || test_thread_flag(TIF_RESTORE_SIGMASK)) do_signal(regs, in_syscall); + + if (test_thread_flag(TIF_NOTIFY_RESUME)) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); + } } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2ae5d72f47e..e030e86ff6a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -95,7 +95,6 @@ config S390 select HAVE_ARCH_TRACEHOOK select INIT_ALL_POSSIBLE select HAVE_PERF_COUNTERS - select GENERIC_ATOMIC64 if !64BIT config SCHED_OMIT_FRAME_POINTER bool @@ -481,13 +480,6 @@ config CMM_IUCV Select this option to enable the special message interface to the cooperative memory management. -config PAGE_STATES - bool "Unused page notification" - help - This enables the notification of unused pages to the - hypervisor. The ESSA instruction is used to do the states - changes between a page that has content and the unused state. - config APPLDATA_BASE bool "Linux - VM Monitor Stream, base infrastructure" depends on PROC_FS diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 0ff387cebf8..fc8fb20e7fc 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -88,8 +88,7 @@ LDFLAGS_vmlinux := -e start head-y := arch/s390/kernel/head.o arch/s390/kernel/init_task.o core-y += arch/s390/mm/ arch/s390/kernel/ arch/s390/crypto/ \ - arch/s390/appldata/ arch/s390/hypfs/ arch/s390/kvm/ \ - arch/s390/power/ + arch/s390/appldata/ arch/s390/hypfs/ arch/s390/kvm/ libs-y += arch/s390/lib/ drivers-y += drivers/s390/ diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 5a805df216b..bd9914b8948 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -355,11 +355,7 @@ static struct dentry *hypfs_create_file(struct super_block *sb, { struct dentry *dentry; struct inode *inode; - struct qstr qname; - qname.name = name; - qname.len = strlen(name); - qname.hash = full_name_hash(name, qname.len); mutex_lock(&parent->d_inode->i_mutex); dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) { @@ -426,7 +422,7 @@ struct dentry *hypfs_create_u64(struct super_block *sb, struct dentry *dir, char tmp[TMP_SIZE]; struct dentry *dentry; - snprintf(tmp, TMP_SIZE, "%lld\n", (unsigned long long int)value); + snprintf(tmp, TMP_SIZE, "%llu\n", (unsigned long long int)value); buffer = kstrdup(tmp, GFP_KERNEL); if (!buffer) return ERR_PTR(-ENOMEM); diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index c7d0abfb0f0..ae7c8f9f94a 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -1,33 +1,23 @@ #ifndef __ARCH_S390_ATOMIC__ #define __ARCH_S390_ATOMIC__ -#include <linux/compiler.h> -#include <linux/types.h> - /* - * include/asm-s390/atomic.h + * Copyright 1999,2009 IBM Corp. + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>, + * Denis Joseph Barrow, + * Arnd Bergmann <arndb@de.ibm.com>, * - * S390 version - * Copyright (C) 1999-2005 IBM Deutschland Entwicklung GmbH, IBM Corporation - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), - * Denis Joseph Barrow, - * Arnd Bergmann (arndb@de.ibm.com) - * - * Derived from "include/asm-i386/bitops.h" - * Copyright (C) 1992, Linus Torvalds + * Atomic operations that C can't guarantee us. + * Useful for resource counting etc. + * s390 uses 'Compare And Swap' for atomicity in SMP enviroment. * */ -/* - * Atomic operations that C can't guarantee us. Useful for - * resource counting etc.. - * S390 uses 'Compare And Swap' for atomicity in SMP enviroment - */ +#include <linux/compiler.h> +#include <linux/types.h> #define ATOMIC_INIT(i) { (i) } -#ifdef __KERNEL__ - #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 2) #define __CS_LOOP(ptr, op_val, op_string) ({ \ @@ -77,7 +67,7 @@ static inline void atomic_set(atomic_t *v, int i) barrier(); } -static __inline__ int atomic_add_return(int i, atomic_t * v) +static inline int atomic_add_return(int i, atomic_t *v) { return __CS_LOOP(v, i, "ar"); } @@ -87,7 +77,7 @@ static __inline__ int atomic_add_return(int i, atomic_t * v) #define atomic_inc_return(_v) atomic_add_return(1, _v) #define atomic_inc_and_test(_v) (atomic_add_return(1, _v) == 0) -static __inline__ int atomic_sub_return(int i, atomic_t * v) +static inline int atomic_sub_return(int i, atomic_t *v) { return __CS_LOOP(v, i, "sr"); } @@ -97,19 +87,19 @@ static __inline__ int atomic_sub_return(int i, atomic_t * v) #define atomic_dec_return(_v) atomic_sub_return(1, _v) #define atomic_dec_and_test(_v) (atomic_sub_return(1, _v) == 0) -static __inline__ void atomic_clear_mask(unsigned long mask, atomic_t * v) +static inline void atomic_clear_mask(unsigned long mask, atomic_t *v) { - __CS_LOOP(v, ~mask, "nr"); + __CS_LOOP(v, ~mask, "nr"); } -static __inline__ void atomic_set_mask(unsigned long mask, atomic_t * v) +static inline void atomic_set_mask(unsigned long mask, atomic_t *v) { - __CS_LOOP(v, mask, "or"); + __CS_LOOP(v, mask, "or"); } #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) -static __inline__ int atomic_cmpxchg(atomic_t *v, int old, int new) +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) { #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 2) asm volatile( @@ -127,7 +117,7 @@ static __inline__ int atomic_cmpxchg(atomic_t *v, int old, int new) return old; } -static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); @@ -146,9 +136,10 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) #undef __CS_LOOP -#ifdef __s390x__ #define ATOMIC64_INIT(i) { (i) } +#ifdef CONFIG_64BIT + #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 2) #define __CSG_LOOP(ptr, op_val, op_string) ({ \ @@ -162,7 +153,7 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) : "=&d" (old_val), "=&d" (new_val), \ "=Q" (((atomic_t *)(ptr))->counter) \ : "d" (op_val), "Q" (((atomic_t *)(ptr))->counter) \ - : "cc", "memory" ); \ + : "cc", "memory"); \ new_val; \ }) @@ -180,7 +171,7 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) "=m" (((atomic_t *)(ptr))->counter) \ : "a" (ptr), "d" (op_val), \ "m" (((atomic_t *)(ptr))->counter) \ - : "cc", "memory" ); \ + : "cc", "memory"); \ new_val; \ }) @@ -198,39 +189,29 @@ static inline void atomic64_set(atomic64_t *v, long long i) barrier(); } -static __inline__ long long atomic64_add_return(long long i, atomic64_t * v) +static inline long long atomic64_add_return(long long i, atomic64_t *v) { return __CSG_LOOP(v, i, "agr"); } -#define atomic64_add(_i, _v) atomic64_add_return(_i, _v) -#define atomic64_add_negative(_i, _v) (atomic64_add_return(_i, _v) < 0) -#define atomic64_inc(_v) atomic64_add_return(1, _v) -#define atomic64_inc_return(_v) atomic64_add_return(1, _v) -#define atomic64_inc_and_test(_v) (atomic64_add_return(1, _v) == 0) -static __inline__ long long atomic64_sub_return(long long i, atomic64_t * v) +static inline long long atomic64_sub_return(long long i, atomic64_t *v) { return __CSG_LOOP(v, i, "sgr"); } -#define atomic64_sub(_i, _v) atomic64_sub_return(_i, _v) -#define atomic64_sub_and_test(_i, _v) (atomic64_sub_return(_i, _v) == 0) -#define atomic64_dec(_v) atomic64_sub_return(1, _v) -#define atomic64_dec_return(_v) atomic64_sub_return(1, _v) -#define atomic64_dec_and_test(_v) (atomic64_sub_return(1, _v) == 0) -static __inline__ void atomic64_clear_mask(unsigned long mask, atomic64_t * v) +static inline void atomic64_clear_mask(unsigned long mask, atomic64_t *v) { - __CSG_LOOP(v, ~mask, "ngr"); + __CSG_LOOP(v, ~mask, "ngr"); } -static __inline__ void atomic64_set_mask(unsigned long mask, atomic64_t * v) +static inline void atomic64_set_mask(unsigned long mask, atomic64_t *v) { - __CSG_LOOP(v, mask, "ogr"); + __CSG_LOOP(v, mask, "ogr"); } #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) -static __inline__ long long atomic64_cmpxchg(atomic64_t *v, +static inline long long atomic64_cmpxchg(atomic64_t *v, long long old, long long new) { #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 2) @@ -249,8 +230,112 @@ static __inline__ long long atomic64_cmpxchg(atomic64_t *v, return old; } -static __inline__ int atomic64_add_unless(atomic64_t *v, - long long a, long long u) +#undef __CSG_LOOP + +#else /* CONFIG_64BIT */ + +typedef struct { + long long counter; +} atomic64_t; + +static inline long long atomic64_read(const atomic64_t *v) +{ + register_pair rp; + + asm volatile( + " lm %0,%N0,0(%1)" + : "=&d" (rp) + : "a" (&v->counter), "m" (v->counter) + ); + return rp.pair; +} + +static inline void atomic64_set(atomic64_t *v, long long i) +{ + register_pair rp = {.pair = i}; + + asm volatile( + " stm %1,%N1,0(%2)" + : "=m" (v->counter) + : "d" (rp), "a" (&v->counter) + ); +} + +static inline long long atomic64_xchg(atomic64_t *v, long long new) +{ + register_pair rp_new = {.pair = new}; + register_pair rp_old; + + asm volatile( + " lm %0,%N0,0(%2)\n" + "0: cds %0,%3,0(%2)\n" + " jl 0b\n" + : "=&d" (rp_old), "+m" (v->counter) + : "a" (&v->counter), "d" (rp_new) + : "cc"); + return rp_old.pair; +} + +static inline long long atomic64_cmpxchg(atomic64_t *v, + long long old, long long new) +{ + register_pair rp_old = {.pair = old}; + register_pair rp_new = {.pair = new}; + + asm volatile( + " cds %0,%3,0(%2)" + : "+&d" (rp_old), "+m" (v->counter) + : "a" (&v->counter), "d" (rp_new) + : "cc"); + return rp_old.pair; +} + + +static inline long long atomic64_add_return(long long i, atomic64_t *v) +{ + long long old, new; + + do { + old = atomic64_read(v); + new = old + i; + } while (atomic64_cmpxchg(v, old, new) != old); + return new; +} + +static inline long long atomic64_sub_return(long long i, atomic64_t *v) +{ + long long old, new; + + do { + old = atomic64_read(v); + new = old - i; + } while (atomic64_cmpxchg(v, old, new) != old); + return new; +} + +static inline void atomic64_set_mask(unsigned long long mask, atomic64_t *v) +{ + long long old, new; + + do { + old = atomic64_read(v); + new = old | mask; + } while (atomic64_cmpxchg(v, old, new) != old); +} + +static inline void atomic64_clear_mask(unsigned long long mask, atomic64_t *v) +{ + long long old, new; + + do { + old = atomic64_read(v); + new = old & mask; + } while (atomic64_cmpxchg(v, old, new) != old); +} + +#endif /* CONFIG_64BIT */ + +static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) { long long c, old; c = atomic64_read(v); @@ -265,15 +350,17 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, return c != u; } -#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) - -#undef __CSG_LOOP - -#else /* __s390x__ */ - -#include <asm-generic/atomic64.h> - -#endif /* __s390x__ */ +#define atomic64_add(_i, _v) atomic64_add_return(_i, _v) +#define atomic64_add_negative(_i, _v) (atomic64_add_return(_i, _v) < 0) +#define atomic64_inc(_v) atomic64_add_return(1, _v) +#define atomic64_inc_return(_v) atomic64_add_return(1, _v) +#define atomic64_inc_and_test(_v) (atomic64_add_return(1, _v) == 0) +#define atomic64_sub(_i, _v) atomic64_sub_return(_i, _v) +#define atomic64_sub_and_test(_i, _v) (atomic64_sub_return(_i, _v) == 0) +#define atomic64_dec(_v) atomic64_sub_return(1, _v) +#define atomic64_dec_return(_v) atomic64_sub_return(1, _v) +#define atomic64_dec_and_test(_v) (atomic64_sub_return(1, _v) == 0) +#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) #define smp_mb__before_atomic_dec() smp_mb() #define smp_mb__after_atomic_dec() smp_mb() @@ -281,5 +368,5 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, #define smp_mb__after_atomic_inc() smp_mb() #include <asm-generic/atomic-long.h> -#endif /* __KERNEL__ */ + #endif /* __ARCH_S390_ATOMIC__ */ diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h index d5a8e7c1477..6c00f6800a3 100644 --- a/arch/s390/include/asm/checksum.h +++ b/arch/s390/include/asm/checksum.h @@ -78,28 +78,11 @@ csum_partial_copy_nocheck (const void *src, void *dst, int len, __wsum sum) */ static inline __sum16 csum_fold(__wsum sum) { -#ifndef __s390x__ - register_pair rp; + u32 csum = (__force u32) sum; - asm volatile( - " slr %N1,%N1\n" /* %0 = H L */ - " lr %1,%0\n" /* %0 = H L, %1 = H L 0 0 */ - " srdl %1,16\n" /* %0 = H L, %1 = 0 H L 0 */ - " alr %1,%N1\n" /* %0 = H L, %1 = L H L 0 */ - " alr %0,%1\n" /* %0 = H+L+C L+H */ - " srl %0,16\n" /* %0 = H+L+C */ - : "+&d" (sum), "=d" (rp) : : "cc"); -#else /* __s390x__ */ - asm volatile( - " sr 3,3\n" /* %0 = H*65536 + L */ - " lr 2,%0\n" /* %0 = H L, 2/3 = H L / 0 0 */ - " srdl 2,16\n" /* %0 = H L, 2/3 = 0 H / L 0 */ - " alr 2,3\n" /* %0 = H L, 2/3 = L H / L 0 */ - " alr %0,2\n" /* %0 = H+L+C L+H */ - " srl %0,16\n" /* %0 = H+L+C */ - : "+&d" (sum) : : "cc", "2", "3"); -#endif /* __s390x__ */ - return (__force __sum16) ~sum; + csum += (csum >> 16) + (csum << 16); + csum >>= 16; + return (__force __sum16) ~csum; } /* diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h index 807997f7414..4943654ed7f 100644 --- a/arch/s390/include/asm/chsc.h +++ b/arch/s390/include/asm/chsc.h @@ -125,4 +125,32 @@ struct chsc_cpd_info { #define CHSC_INFO_CPD _IOWR(CHSC_IOCTL_MAGIC, 0x87, struct chsc_cpd_info) #define CHSC_INFO_DCAL _IOWR(CHSC_IOCTL_MAGIC, 0x88, struct chsc_dcal) +#ifdef __KERNEL__ + +struct css_general_char { + u64 : 12; + u32 dynio : 1; /* bit 12 */ + u32 : 28; + u32 aif : 1; /* bit 41 */ + u32 : 3; + u32 mcss : 1; /* bit 45 */ + u32 fcs : 1; /* bit 46 */ + u32 : 1; + u32 ext_mb : 1; /* bit 48 */ + u32 : 7; + u32 aif_tdd : 1; /* bit 56 */ + u32 : 1; + u32 qebsm : 1; /* bit 58 */ + u32 : 8; + u32 aif_osa : 1; /* bit 67 */ + u32 : 14; + u32 cib : 1; /* bit 82 */ + u32 : 5; + u32 fcx : 1; /* bit 88 */ + u32 : 7; +}__attribute__((packed)); + +extern struct css_general_char css_general_characteristics; + +#endif /* __KERNEL__ */ #endif diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 619bf94b11f..e85679af54d 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -15,228 +15,7 @@ #define LPM_ANYPATH 0xff #define __MAX_CSSID 0 -/** - * struct cmd_scsw - command-mode subchannel status word - * @key: subchannel key - * @sctl: suspend control - * @eswf: esw format - * @cc: deferred condition code - * @fmt: format - * @pfch: prefetch - * @isic: initial-status interruption control - * @alcc: address-limit checking control - * @ssi: suppress-suspended interruption - * @zcc: zero condition code - * @ectl: extended control - * @pno: path not operational - * @res: reserved - * @fctl: function control - * @actl: activity control - * @stctl: status control - * @cpa: channel program address - * @dstat: device status - * @cstat: subchannel status - * @count: residual count - */ -struct cmd_scsw { - __u32 key : 4; - __u32 sctl : 1; - __u32 eswf : 1; - __u32 cc : 2; - __u32 fmt : 1; - __u32 pfch : 1; - __u32 isic : 1; - __u32 alcc : 1; - __u32 ssi : 1; - __u32 zcc : 1; - __u32 ectl : 1; - __u32 pno : 1; - __u32 res : 1; - __u32 fctl : 3; - __u32 actl : 7; - __u32 stctl : 5; - __u32 cpa; - __u32 dstat : 8; - __u32 cstat : 8; - __u32 count : 16; -} __attribute__ ((packed)); - -/** - * struct tm_scsw - transport-mode subchannel status word - * @key: subchannel key - * @eswf: esw format - * @cc: deferred condition code - * @fmt: format - * @x: IRB-format control - * @q: interrogate-complete - * @ectl: extended control - * @pno: path not operational - * @fctl: function control - * @actl: activity control - * @stctl: status control - * @tcw: TCW address - * @dstat: device status - * @cstat: subchannel status - * @fcxs: FCX status - * @schxs: subchannel-extended status - */ -struct tm_scsw { - u32 key:4; - u32 :1; - u32 eswf:1; - u32 cc:2; - u32 fmt:3; - u32 x:1; - u32 q:1; - u32 :1; - u32 ectl:1; - u32 pno:1; - u32 :1; - u32 fctl:3; - u32 actl:7; - u32 stctl:5; - u32 tcw; - u32 dstat:8; - u32 cstat:8; - u32 fcxs:8; - u32 schxs:8; -} __attribute__ ((packed)); - -/** - * union scsw - subchannel status word - * @cmd: command-mode SCSW - * @tm: transport-mode SCSW - */ -union scsw { - struct cmd_scsw cmd; - struct tm_scsw tm; -} __attribute__ ((packed)); - -int scsw_is_tm(union scsw *scsw); -u32 scsw_key(union scsw *scsw); -u32 scsw_eswf(union scsw *scsw); -u32 scsw_cc(union scsw *scsw); -u32 scsw_ectl(union scsw *scsw); -u32 scsw_pno(union scsw *scsw); -u32 scsw_fctl(union scsw *scsw); -u32 scsw_actl(union scsw *scsw); -u32 scsw_stctl(union scsw *scsw); -u32 scsw_dstat(union scsw *scsw); -u32 scsw_cstat(union scsw *scsw); -int scsw_is_solicited(union scsw *scsw); -int scsw_is_valid_key(union scsw *scsw); -int scsw_is_valid_eswf(union scsw *scsw); -int scsw_is_valid_cc(union scsw *scsw); -int scsw_is_valid_ectl(union scsw *scsw); -int scsw_is_valid_pno(union scsw *scsw); -int scsw_is_valid_fctl(union scsw *scsw); -int scsw_is_valid_actl(union scsw *scsw); -int scsw_is_valid_stctl(union scsw *scsw); -int scsw_is_valid_dstat(union scsw *scsw); -int scsw_is_valid_cstat(union scsw *scsw); -int scsw_cmd_is_valid_key(union scsw *scsw); -int scsw_cmd_is_valid_sctl(union scsw *scsw); -int scsw_cmd_is_valid_eswf(union scsw *scsw); -int scsw_cmd_is_valid_cc(union scsw *scsw); -int scsw_cmd_is_valid_fmt(union scsw *scsw); -int scsw_cmd_is_valid_pfch(union scsw *scsw); -int scsw_cmd_is_valid_isic(union scsw *scsw); -int scsw_cmd_is_valid_alcc(union scsw *scsw); -int scsw_cmd_is_valid_ssi(union scsw *scsw); -int scsw_cmd_is_valid_zcc(union scsw *scsw); -int scsw_cmd_is_valid_ectl(union scsw *scsw); -int scsw_cmd_is_valid_pno(union scsw *scsw); -int scsw_cmd_is_valid_fctl(union scsw *scsw); -int scsw_cmd_is_valid_actl(union scsw *scsw); -int scsw_cmd_is_valid_stctl(union scsw *scsw); -int scsw_cmd_is_valid_dstat(union scsw *scsw); -int scsw_cmd_is_valid_cstat(union scsw *scsw); -int scsw_cmd_is_solicited(union scsw *scsw); -int scsw_tm_is_valid_key(union scsw *scsw); -int scsw_tm_is_valid_eswf(union scsw *scsw); -int scsw_tm_is_valid_cc(union scsw *scsw); -int scsw_tm_is_valid_fmt(union scsw *scsw); -int scsw_tm_is_valid_x(union scsw *scsw); -int scsw_tm_is_valid_q(union scsw *scsw); -int scsw_tm_is_valid_ectl(union scsw *scsw); -int scsw_tm_is_valid_pno(union scsw *scsw); -int scsw_tm_is_valid_fctl(union scsw *scsw); -int scsw_tm_is_valid_actl(union scsw *scsw); -int scsw_tm_is_valid_stctl(union scsw *scsw); -int scsw_tm_is_valid_dstat(union scsw *scsw); -int scsw_tm_is_valid_cstat(union scsw *scsw); -int scsw_tm_is_valid_fcxs(union scsw *scsw); -int scsw_tm_is_valid_schxs(union scsw *scsw); -int scsw_tm_is_solicited(union scsw *scsw); - -#define SCSW_FCTL_CLEAR_FUNC 0x1 -#define SCSW_FCTL_HALT_FUNC 0x2 -#define SCSW_FCTL_START_FUNC 0x4 - -#define SCSW_ACTL_SUSPENDED 0x1 -#define SCSW_ACTL_DEVACT 0x2 -#define SCSW_ACTL_SCHACT 0x4 -#define SCSW_ACTL_CLEAR_PEND 0x8 -#define SCSW_ACTL_HALT_PEND 0x10 -#define SCSW_ACTL_START_PEND 0x20 -#define SCSW_ACTL_RESUME_PEND 0x40 - -#define SCSW_STCTL_STATUS_PEND 0x1 -#define SCSW_STCTL_SEC_STATUS 0x2 -#define SCSW_STCTL_PRIM_STATUS 0x4 -#define SCSW_STCTL_INTER_STATUS 0x8 -#define SCSW_STCTL_ALERT_STATUS 0x10 - -#define DEV_STAT_ATTENTION 0x80 -#define DEV_STAT_STAT_MOD 0x40 -#define DEV_STAT_CU_END 0x20 -#define DEV_STAT_BUSY 0x10 -#define DEV_STAT_CHN_END 0x08 -#define DEV_STAT_DEV_END 0x04 -#define DEV_STAT_UNIT_CHECK 0x02 -#define DEV_STAT_UNIT_EXCEP 0x01 - -#define SCHN_STAT_PCI 0x80 -#define SCHN_STAT_INCORR_LEN 0x40 -#define SCHN_STAT_PROG_CHECK 0x20 -#define SCHN_STAT_PROT_CHECK 0x10 -#define SCHN_STAT_CHN_DATA_CHK 0x08 -#define SCHN_STAT_CHN_CTRL_CHK 0x04 -#define SCHN_STAT_INTF_CTRL_CHK 0x02 -#define SCHN_STAT_CHAIN_CHECK 0x01 - -/* - * architectured values for first sense byte - */ -#define SNS0_CMD_REJECT 0x80 -#define SNS_CMD_REJECT SNS0_CMD_REJEC -#define SNS0_INTERVENTION_REQ 0x40 -#define SNS0_BUS_OUT_CHECK 0x20 -#define SNS0_EQUIPMENT_CHECK 0x10 -#define SNS0_DATA_CHECK 0x08 -#define SNS0_OVERRUN 0x04 -#define SNS0_INCOMPL_DOMAIN 0x01 - -/* - * architectured values for second sense byte - */ -#define SNS1_PERM_ERR 0x80 -#define SNS1_INV_TRACK_FORMAT 0x40 -#define SNS1_EOC 0x20 -#define SNS1_MESSAGE_TO_OPER 0x10 -#define SNS1_NO_REC_FOUND 0x08 -#define SNS1_FILE_PROTECTED 0x04 -#define SNS1_WRITE_INHIBITED 0x02 -#define SNS1_INPRECISE_END 0x01 - -/* - * architectured values for third sense byte - */ -#define SNS2_REQ_INH_WRITE 0x80 -#define SNS2_CORRECTABLE 0x40 -#define SNS2_FIRST_LOG_ERR 0x20 -#define SNS2_ENV_DATA_PRESENT 0x10 -#define SNS2_INPRECISE_END 0x04 +#include <asm/scsw.h> /** * struct ccw1 - channel command word diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h new file mode 100644 index 00000000000..471234b9057 --- /dev/null +++ b/arch/s390/include/asm/cpu.h @@ -0,0 +1,26 @@ +/* + * Copyright IBM Corp. 2000,2009 + * Author(s): Hartmut Penner <hp@de.ibm.com>, + * Martin Schwidefsky <schwidefsky@de.ibm.com>, + * Christian Ehrhardt <ehrhardt@de.ibm.com>, + */ + +#ifndef _ASM_S390_CPU_H +#define _ASM_S390_CPU_H + +#define MAX_CPU_ADDRESS 255 + +#ifndef __ASSEMBLY__ + +#include <linux/types.h> + +struct cpuid +{ + unsigned int version : 8; + unsigned int ident : 24; + unsigned int machine : 16; + unsigned int unused : 16; +} __packed; + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_S390_CPU_H */ diff --git a/arch/s390/include/asm/cpuid.h b/arch/s390/include/asm/cpuid.h deleted file mode 100644 index 07836a2e522..00000000000 --- a/arch/s390/include/asm/cpuid.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright IBM Corp. 2000,2009 - * Author(s): Hartmut Penner <hp@de.ibm.com>, - * Martin Schwidefsky <schwidefsky@de.ibm.com> - * Christian Ehrhardt <ehrhardt@de.ibm.com> - */ - -#ifndef _ASM_S390_CPUID_H_ -#define _ASM_S390_CPUID_H_ - -/* - * CPU type and hardware bug flags. Kept separately for each CPU. - * Members of this structure are referenced in head.S, so think twice - * before touching them. [mj] - */ - -typedef struct -{ - unsigned int version : 8; - unsigned int ident : 24; - unsigned int machine : 16; - unsigned int unused : 16; -} __attribute__ ((packed)) cpuid_t; - -#endif /* _ASM_S390_CPUID_H_ */ diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 31ed5686a96..18124b75a7a 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -167,6 +167,10 @@ debug_text_event(debug_info_t* id, int level, const char* txt) return debug_event_common(id,level,txt,strlen(txt)); } +/* + * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are + * stored in the s390dbf. See Documentation/s390/s390dbf.txt for more details! + */ extern debug_entry_t * debug_sprintf_event(debug_info_t* id,int level,char *string,...) __attribute__ ((format(printf, 3, 4))); @@ -206,7 +210,10 @@ debug_text_exception(debug_info_t* id, int level, const char* txt) return debug_exception_common(id,level,txt,strlen(txt)); } - +/* + * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are + * stored in the s390dbf. See Documentation/s390/s390dbf.txt for more details! + */ extern debug_entry_t * debug_sprintf_exception(debug_info_t* id,int level,char *string,...) __attribute__ ((format(printf, 3, 4))); diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h index 89ec7056da2..498bc389238 100644 --- a/arch/s390/include/asm/hardirq.h +++ b/arch/s390/include/asm/hardirq.h @@ -18,13 +18,6 @@ #include <linux/interrupt.h> #include <asm/lowcore.h> -/* irq_cpustat_t is unused currently, but could be converted - * into a percpu variable instead of storing softirq_pending - * on the lowcore */ -typedef struct { - unsigned int __softirq_pending; -} irq_cpustat_t; - #define local_softirq_pending() (S390_lowcore.softirq_pending) #define __ARCH_IRQ_STAT diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 1171e6d144a..5e95d95450b 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -57,6 +57,8 @@ struct ipl_block_fcp { } __attribute__((packed)); #define DIAG308_VMPARM_SIZE 64 +#define DIAG308_SCPDATA_SIZE (PAGE_SIZE - (sizeof(struct ipl_list_hdr) + \ + offsetof(struct ipl_block_fcp, scp_data))) struct ipl_block_ccw { u8 load_parm[8]; @@ -91,7 +93,8 @@ extern void do_halt(void); extern void do_poff(void); extern void ipl_save_parameters(void); extern void ipl_update_parameters(void); -extern void get_ipl_vmparm(char *); +extern size_t append_ipl_vmparm(char *, size_t); +extern size_t append_ipl_scpdata(char *, size_t); enum { IPL_DEVNO_VALID = 1, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 1cd02f6073a..698988f6940 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -17,7 +17,7 @@ #include <linux/interrupt.h> #include <linux/kvm_host.h> #include <asm/debug.h> -#include <asm/cpuid.h> +#include <asm/cpu.h> #define KVM_MAX_VCPUS 64 #define KVM_MEMORY_SLOTS 32 @@ -217,8 +217,8 @@ struct kvm_vcpu_arch { struct hrtimer ckc_timer; struct tasklet_struct tasklet; union { - cpuid_t cpu_id; - u64 stidp_data; + struct cpuid cpu_id; + u64 stidp_data; }; }; diff --git a/arch/s390/include/asm/kvm_virtio.h b/arch/s390/include/asm/kvm_virtio.h index 0503936f101..acdfdff2661 100644 --- a/arch/s390/include/asm/kvm_virtio.h +++ b/arch/s390/include/asm/kvm_virtio.h @@ -54,14 +54,4 @@ struct kvm_vqconfig { * This is pagesize for historical reasons. */ #define KVM_S390_VIRTIO_RING_ALIGN 4096 -#ifdef __KERNEL__ -/* early virtio console setup */ -#ifdef CONFIG_S390_GUEST -extern void s390_virtio_console_init(void); -#else -static inline void s390_virtio_console_init(void) -{ -} -#endif /* CONFIG_VIRTIO_CONSOLE */ -#endif /* __KERNEL__ */ #endif diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 5046ad6b7a6..6bc9426a6fb 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -132,7 +132,7 @@ #ifndef __ASSEMBLY__ -#include <asm/cpuid.h> +#include <asm/cpu.h> #include <asm/ptrace.h> #include <linux/types.h> @@ -275,7 +275,7 @@ struct _lowcore __u32 user_exec_asce; /* 0x02ac */ /* SMP info area */ - cpuid_t cpu_id; /* 0x02b0 */ + struct cpuid cpu_id; /* 0x02b0 */ __u32 cpu_nr; /* 0x02b8 */ __u32 softirq_pending; /* 0x02bc */ __u32 percpu_offset; /* 0x02c0 */ @@ -380,7 +380,7 @@ struct _lowcore __u64 user_exec_asce; /* 0x0318 */ /* SMP info area */ - cpuid_t cpu_id; /* 0x0320 */ + struct cpuid cpu_id; /* 0x0320 */ __u32 cpu_nr; /* 0x0328 */ __u32 softirq_pending; /* 0x032c */ __u64 percpu_offset; /* 0x0330 */ diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 3b59216e628..03be99919d6 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -2,6 +2,7 @@ #define __MMU_H typedef struct { + spinlock_t list_lock; struct list_head crst_list; struct list_head pgtable_list; unsigned long asce_bits; diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 3e3594d01f8..5e9daf5d7f2 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -125,8 +125,6 @@ page_get_storage_key(unsigned long addr) return skey; } -#ifdef CONFIG_PAGE_STATES - struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); @@ -134,8 +132,6 @@ void arch_alloc_page(struct page *page, int order); #define HAVE_ARCH_FREE_PAGE #define HAVE_ARCH_ALLOC_PAGE -#endif - #endif /* !__ASSEMBLY__ */ #define __PAGE_OFFSET 0x0UL diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index b2658b9220f..ddad5903341 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -140,6 +140,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) static inline pgd_t *pgd_alloc(struct mm_struct *mm) { + spin_lock_init(&mm->context.list_lock); INIT_LIST_HEAD(&mm->context.crst_list); INIT_LIST_HEAD(&mm->context.pgtable_list); return (pgd_t *) crst_table_alloc(mm, s390_noexec); diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index c139fa7b8e8..cf8eed3fa77 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -14,7 +14,7 @@ #define __ASM_S390_PROCESSOR_H #include <linux/linkage.h> -#include <asm/cpuid.h> +#include <asm/cpu.h> #include <asm/page.h> #include <asm/ptrace.h> #include <asm/setup.h> @@ -26,7 +26,7 @@ */ #define current_text_addr() ({ void *pc; asm("basr %0,0" : "=a" (pc)); pc; }) -static inline void get_cpu_id(cpuid_t *ptr) +static inline void get_cpu_id(struct cpuid *ptr) { asm volatile("stidp 0(%1)" : "=m" (*ptr) : "a" (ptr)); } diff --git a/arch/s390/include/asm/scatterlist.h b/arch/s390/include/asm/scatterlist.h index 29ec8e28c8d..35d786fe93a 100644 --- a/arch/s390/include/asm/scatterlist.h +++ b/arch/s390/include/asm/scatterlist.h @@ -1,19 +1 @@ -#ifndef _ASMS390_SCATTERLIST_H -#define _ASMS390_SCATTERLIST_H - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - unsigned int length; -}; - -#ifdef __s390x__ -#define ISA_DMA_THRESHOLD (0xffffffffffffffffUL) -#else -#define ISA_DMA_THRESHOLD (0xffffffffUL) -#endif - -#endif /* _ASMS390X_SCATTERLIST_H */ +#include <asm-generic/scatterlist.h> diff --git a/drivers/s390/cio/scsw.c b/arch/s390/include/asm/scsw.h index f8da25ab576..de389cb54d2 100644 --- a/drivers/s390/cio/scsw.c +++ b/arch/s390/include/asm/scsw.h @@ -1,15 +1,182 @@ /* * Helper functions for scsw access. * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008,2009 * Author(s): Peter Oberparleiter <peter.oberparleiter@de.ibm.com> */ +#ifndef _ASM_S390_SCSW_H_ +#define _ASM_S390_SCSW_H_ + #include <linux/types.h> -#include <linux/module.h> +#include <asm/chsc.h> #include <asm/cio.h> -#include "css.h" -#include "chsc.h" + +/** + * struct cmd_scsw - command-mode subchannel status word + * @key: subchannel key + * @sctl: suspend control + * @eswf: esw format + * @cc: deferred condition code + * @fmt: format + * @pfch: prefetch + * @isic: initial-status interruption control + * @alcc: address-limit checking control + * @ssi: suppress-suspended interruption + * @zcc: zero condition code + * @ectl: extended control + * @pno: path not operational + * @res: reserved + * @fctl: function control + * @actl: activity control + * @stctl: status control + * @cpa: channel program address + * @dstat: device status + * @cstat: subchannel status + * @count: residual count + */ +struct cmd_scsw { + __u32 key : 4; + __u32 sctl : 1; + __u32 eswf : 1; + __u32 cc : 2; + __u32 fmt : 1; + __u32 pfch : 1; + __u32 isic : 1; + __u32 alcc : 1; + __u32 ssi : 1; + __u32 zcc : 1; + __u32 ectl : 1; + __u32 pno : 1; + __u32 res : 1; + __u32 fctl : 3; + __u32 actl : 7; + __u32 stctl : 5; + __u32 cpa; + __u32 dstat : 8; + __u32 cstat : 8; + __u32 count : 16; +} __attribute__ ((packed)); + +/** + * struct tm_scsw - transport-mode subchannel status word + * @key: subchannel key + * @eswf: esw format + * @cc: deferred condition code + * @fmt: format + * @x: IRB-format control + * @q: interrogate-complete + * @ectl: extended control + * @pno: path not operational + * @fctl: function control + * @actl: activity control + * @stctl: status control + * @tcw: TCW address + * @dstat: device status + * @cstat: subchannel status + * @fcxs: FCX status + * @schxs: subchannel-extended status + */ +struct tm_scsw { + u32 key:4; + u32 :1; + u32 eswf:1; + u32 cc:2; + u32 fmt:3; + u32 x:1; + u32 q:1; + u32 :1; + u32 ectl:1; + u32 pno:1; + u32 :1; + u32 fctl:3; + u32 actl:7; + u32 stctl:5; + u32 tcw; + u32 dstat:8; + u32 cstat:8; + u32 fcxs:8; + u32 schxs:8; +} __attribute__ ((packed)); + +/** + * union scsw - subchannel status word + * @cmd: command-mode SCSW + * @tm: transport-mode SCSW + */ +union scsw { + struct cmd_scsw cmd; + struct tm_scsw tm; +} __attribute__ ((packed)); + +#define SCSW_FCTL_CLEAR_FUNC 0x1 +#define SCSW_FCTL_HALT_FUNC 0x2 +#define SCSW_FCTL_START_FUNC 0x4 + +#define SCSW_ACTL_SUSPENDED 0x1 +#define SCSW_ACTL_DEVACT 0x2 +#define SCSW_ACTL_SCHACT 0x4 +#define SCSW_ACTL_CLEAR_PEND 0x8 +#define SCSW_ACTL_HALT_PEND 0x10 +#define SCSW_ACTL_START_PEND 0x20 +#define SCSW_ACTL_RESUME_PEND 0x40 + +#define SCSW_STCTL_STATUS_PEND 0x1 +#define SCSW_STCTL_SEC_STATUS 0x2 +#define SCSW_STCTL_PRIM_STATUS 0x4 +#define SCSW_STCTL_INTER_STATUS 0x8 +#define SCSW_STCTL_ALERT_STATUS 0x10 + +#define DEV_STAT_ATTENTION 0x80 +#define DEV_STAT_STAT_MOD 0x40 +#define DEV_STAT_CU_END 0x20 +#define DEV_STAT_BUSY 0x10 +#define DEV_STAT_CHN_END 0x08 +#define DEV_STAT_DEV_END 0x04 +#define DEV_STAT_UNIT_CHECK 0x02 +#define DEV_STAT_UNIT_EXCEP 0x01 + +#define SCHN_STAT_PCI 0x80 +#define SCHN_STAT_INCORR_LEN 0x40 +#define SCHN_STAT_PROG_CHECK 0x20 +#define SCHN_STAT_PROT_CHECK 0x10 +#define SCHN_STAT_CHN_DATA_CHK 0x08 +#define SCHN_STAT_CHN_CTRL_CHK 0x04 +#define SCHN_STAT_INTF_CTRL_CHK 0x02 +#define SCHN_STAT_CHAIN_CHECK 0x01 + +/* + * architectured values for first sense byte + */ +#define SNS0_CMD_REJECT 0x80 +#define SNS_CMD_REJECT SNS0_CMD_REJEC +#define SNS0_INTERVENTION_REQ 0x40 +#define SNS0_BUS_OUT_CHECK 0x20 +#define SNS0_EQUIPMENT_CHECK 0x10 +#define SNS0_DATA_CHECK 0x08 +#define SNS0_OVERRUN 0x04 +#define SNS0_INCOMPL_DOMAIN 0x01 + +/* + * architectured values for second sense byte + */ +#define SNS1_PERM_ERR 0x80 +#define SNS1_INV_TRACK_FORMAT 0x40 +#define SNS1_EOC 0x20 +#define SNS1_MESSAGE_TO_OPER 0x10 +#define SNS1_NO_REC_FOUND 0x08 +#define SNS1_FILE_PROTECTED 0x04 +#define SNS1_WRITE_INHIBITED 0x02 +#define SNS1_INPRECISE_END 0x01 + +/* + * architectured values for third sense byte + */ +#define SNS2_REQ_INH_WRITE 0x80 +#define SNS2_CORRECTABLE 0x40 +#define SNS2_FIRST_LOG_ERR 0x20 +#define SNS2_ENV_DATA_PRESENT 0x10 +#define SNS2_INPRECISE_END 0x04 /** * scsw_is_tm - check for transport mode scsw @@ -18,11 +185,10 @@ * Return non-zero if the specified scsw is a transport mode scsw, zero * otherwise. */ -int scsw_is_tm(union scsw *scsw) +static inline int scsw_is_tm(union scsw *scsw) { return css_general_characteristics.fcx && (scsw->tm.x == 1); } -EXPORT_SYMBOL(scsw_is_tm); /** * scsw_key - return scsw key field @@ -31,14 +197,13 @@ EXPORT_SYMBOL(scsw_is_tm); * Return the value of the key field of the specified scsw, regardless of * whether it is a transport mode or command mode scsw. */ -u32 scsw_key(union scsw *scsw) +static inline u32 scsw_key(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw->tm.key; else return scsw->cmd.key; } -EXPORT_SYMBOL(scsw_key); /** * scsw_eswf - return scsw eswf field @@ -47,14 +212,13 @@ EXPORT_SYMBOL(scsw_key); * Return the value of the eswf field of the specified scsw, regardless of * whether it is a transport mode or command mode scsw. */ -u32 scsw_eswf(union scsw *scsw) +static inline u32 scsw_eswf(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw->tm.eswf; else return scsw->cmd.eswf; } -EXPORT_SYMBOL(scsw_eswf); /** * scsw_cc - return scsw cc field @@ -63,14 +227,13 @@ EXPORT_SYMBOL(scsw_eswf); * Return the value of the cc field of the specified scsw, regardless of * whether it is a transport mode or command mode scsw. */ -u32 scsw_cc(union scsw *scsw) +static inline u32 scsw_cc(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw->tm.cc; else return scsw->cmd.cc; } -EXPORT_SYMBOL(scsw_cc); /** * scsw_ectl - return scsw ectl field @@ -79,14 +242,13 @@ EXPORT_SYMBOL(scsw_cc); * Return the value of the ectl field of the specified scsw, regardless of * whether it is a transport mode or command mode scsw. */ -u32 scsw_ectl(union scsw *scsw) +static inline u32 scsw_ectl(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw->tm.ectl; else return scsw->cmd.ectl; } -EXPORT_SYMBOL(scsw_ectl); /** * scsw_pno - return scsw pno field @@ -95,14 +257,13 @@ EXPORT_SYMBOL(scsw_ectl); * Return the value of the pno field of the specified scsw, regardless of * whether it is a transport mode or command mode scsw. */ -u32 scsw_pno(union scsw *scsw) +static inline u32 scsw_pno(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw->tm.pno; else return scsw->cmd.pno; } -EXPORT_SYMBOL(scsw_pno); /** * scsw_fctl - return scsw fctl field @@ -111,14 +272,13 @@ EXPORT_SYMBOL(scsw_pno); * Return the value of the fctl field of the specified scsw, regardless of * whether it is a transport mode or command mode scsw. */ -u32 scsw_fctl(union scsw *scsw) +static inline u32 scsw_fctl(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw->tm.fctl; else return scsw->cmd.fctl; } -EXPORT_SYMBOL(scsw_fctl); /** * scsw_actl - return scsw actl field @@ -127,14 +287,13 @@ EXPORT_SYMBOL(scsw_fctl); * Return the value of the actl field of the specified scsw, regardless of * whether it is a transport mode or command mode scsw. */ -u32 scsw_actl(union scsw *scsw) +static inline u32 scsw_actl(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw->tm.actl; else return scsw->cmd.actl; } -EXPORT_SYMBOL(scsw_actl); /** * scsw_stctl - return scsw stctl field @@ -143,14 +302,13 @@ EXPORT_SYMBOL(scsw_actl); * Return the value of the stctl field of the specified scsw, regardless of * whether it is a transport mode or command mode scsw. */ -u32 scsw_stctl(union scsw *scsw) +static inline u32 scsw_stctl(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw->tm.stctl; else return scsw->cmd.stctl; } -EXPORT_SYMBOL(scsw_stctl); /** * scsw_dstat - return scsw dstat field @@ -159,14 +317,13 @@ EXPORT_SYMBOL(scsw_stctl); * Return the value of the dstat field of the specified scsw, regardless of * whether it is a transport mode or command mode scsw. */ -u32 scsw_dstat(union scsw *scsw) +static inline u32 scsw_dstat(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw->tm.dstat; else return scsw->cmd.dstat; } -EXPORT_SYMBOL(scsw_dstat); /** * scsw_cstat - return scsw cstat field @@ -175,14 +332,13 @@ EXPORT_SYMBOL(scsw_dstat); * Return the value of the cstat field of the specified scsw, regardless of * whether it is a transport mode or command mode scsw. */ -u32 scsw_cstat(union scsw *scsw) +static inline u32 scsw_cstat(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw->tm.cstat; else return scsw->cmd.cstat; } -EXPORT_SYMBOL(scsw_cstat); /** * scsw_cmd_is_valid_key - check key field validity @@ -191,11 +347,10 @@ EXPORT_SYMBOL(scsw_cstat); * Return non-zero if the key field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_key(union scsw *scsw) +static inline int scsw_cmd_is_valid_key(union scsw *scsw) { return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); } -EXPORT_SYMBOL(scsw_cmd_is_valid_key); /** * scsw_cmd_is_valid_sctl - check fctl field validity @@ -204,11 +359,10 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_key); * Return non-zero if the fctl field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_sctl(union scsw *scsw) +static inline int scsw_cmd_is_valid_sctl(union scsw *scsw) { return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); } -EXPORT_SYMBOL(scsw_cmd_is_valid_sctl); /** * scsw_cmd_is_valid_eswf - check eswf field validity @@ -217,11 +371,10 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_sctl); * Return non-zero if the eswf field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_eswf(union scsw *scsw) +static inline int scsw_cmd_is_valid_eswf(union scsw *scsw) { return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND); } -EXPORT_SYMBOL(scsw_cmd_is_valid_eswf); /** * scsw_cmd_is_valid_cc - check cc field validity @@ -230,12 +383,11 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_eswf); * Return non-zero if the cc field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_cc(union scsw *scsw) +static inline int scsw_cmd_is_valid_cc(union scsw *scsw) { return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) && (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND); } -EXPORT_SYMBOL(scsw_cmd_is_valid_cc); /** * scsw_cmd_is_valid_fmt - check fmt field validity @@ -244,11 +396,10 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_cc); * Return non-zero if the fmt field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_fmt(union scsw *scsw) +static inline int scsw_cmd_is_valid_fmt(union scsw *scsw) { return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); } -EXPORT_SYMBOL(scsw_cmd_is_valid_fmt); /** * scsw_cmd_is_valid_pfch - check pfch field validity @@ -257,11 +408,10 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_fmt); * Return non-zero if the pfch field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_pfch(union scsw *scsw) +static inline int scsw_cmd_is_valid_pfch(union scsw *scsw) { return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); } -EXPORT_SYMBOL(scsw_cmd_is_valid_pfch); /** * scsw_cmd_is_valid_isic - check isic field validity @@ -270,11 +420,10 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_pfch); * Return non-zero if the isic field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_isic(union scsw *scsw) +static inline int scsw_cmd_is_valid_isic(union scsw *scsw) { return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); } -EXPORT_SYMBOL(scsw_cmd_is_valid_isic); /** * scsw_cmd_is_valid_alcc - check alcc field validity @@ -283,11 +432,10 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_isic); * Return non-zero if the alcc field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_alcc(union scsw *scsw) +static inline int scsw_cmd_is_valid_alcc(union scsw *scsw) { return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); } -EXPORT_SYMBOL(scsw_cmd_is_valid_alcc); /** * scsw_cmd_is_valid_ssi - check ssi field validity @@ -296,11 +444,10 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_alcc); * Return non-zero if the ssi field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_ssi(union scsw *scsw) +static inline int scsw_cmd_is_valid_ssi(union scsw *scsw) { return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC); } -EXPORT_SYMBOL(scsw_cmd_is_valid_ssi); /** * scsw_cmd_is_valid_zcc - check zcc field validity @@ -309,12 +456,11 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_ssi); * Return non-zero if the zcc field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_zcc(union scsw *scsw) +static inline int scsw_cmd_is_valid_zcc(union scsw *scsw) { return (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) && (scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS); } -EXPORT_SYMBOL(scsw_cmd_is_valid_zcc); /** * scsw_cmd_is_valid_ectl - check ectl field validity @@ -323,13 +469,12 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_zcc); * Return non-zero if the ectl field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_ectl(union scsw *scsw) +static inline int scsw_cmd_is_valid_ectl(union scsw *scsw) { return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && !(scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) && (scsw->cmd.stctl & SCSW_STCTL_ALERT_STATUS); } -EXPORT_SYMBOL(scsw_cmd_is_valid_ectl); /** * scsw_cmd_is_valid_pno - check pno field validity @@ -338,7 +483,7 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_ectl); * Return non-zero if the pno field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_pno(union scsw *scsw) +static inline int scsw_cmd_is_valid_pno(union scsw *scsw) { return (scsw->cmd.fctl != 0) && (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && @@ -346,7 +491,6 @@ int scsw_cmd_is_valid_pno(union scsw *scsw) ((scsw->cmd.stctl & SCSW_STCTL_INTER_STATUS) && (scsw->cmd.actl & SCSW_ACTL_SUSPENDED))); } -EXPORT_SYMBOL(scsw_cmd_is_valid_pno); /** * scsw_cmd_is_valid_fctl - check fctl field validity @@ -355,12 +499,11 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_pno); * Return non-zero if the fctl field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_fctl(union scsw *scsw) +static inline int scsw_cmd_is_valid_fctl(union scsw *scsw) { /* Only valid if pmcw.dnv == 1*/ return 1; } -EXPORT_SYMBOL(scsw_cmd_is_valid_fctl); /** * scsw_cmd_is_valid_actl - check actl field validity @@ -369,12 +512,11 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_fctl); * Return non-zero if the actl field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_actl(union scsw *scsw) +static inline int scsw_cmd_is_valid_actl(union scsw *scsw) { /* Only valid if pmcw.dnv == 1*/ return 1; } -EXPORT_SYMBOL(scsw_cmd_is_valid_actl); /** * scsw_cmd_is_valid_stctl - check stctl field validity @@ -383,12 +525,11 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_actl); * Return non-zero if the stctl field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_stctl(union scsw *scsw) +static inline int scsw_cmd_is_valid_stctl(union scsw *scsw) { /* Only valid if pmcw.dnv == 1*/ return 1; } -EXPORT_SYMBOL(scsw_cmd_is_valid_stctl); /** * scsw_cmd_is_valid_dstat - check dstat field validity @@ -397,12 +538,11 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_stctl); * Return non-zero if the dstat field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_dstat(union scsw *scsw) +static inline int scsw_cmd_is_valid_dstat(union scsw *scsw) { return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && (scsw->cmd.cc != 3); } -EXPORT_SYMBOL(scsw_cmd_is_valid_dstat); /** * scsw_cmd_is_valid_cstat - check cstat field validity @@ -411,12 +551,11 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_dstat); * Return non-zero if the cstat field of the specified command mode scsw is * valid, zero otherwise. */ -int scsw_cmd_is_valid_cstat(union scsw *scsw) +static inline int scsw_cmd_is_valid_cstat(union scsw *scsw) { return (scsw->cmd.stctl & SCSW_STCTL_STATUS_PEND) && (scsw->cmd.cc != 3); } -EXPORT_SYMBOL(scsw_cmd_is_valid_cstat); /** * scsw_tm_is_valid_key - check key field validity @@ -425,11 +564,10 @@ EXPORT_SYMBOL(scsw_cmd_is_valid_cstat); * Return non-zero if the key field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_key(union scsw *scsw) +static inline int scsw_tm_is_valid_key(union scsw *scsw) { return (scsw->tm.fctl & SCSW_FCTL_START_FUNC); } -EXPORT_SYMBOL(scsw_tm_is_valid_key); /** * scsw_tm_is_valid_eswf - check eswf field validity @@ -438,11 +576,10 @@ EXPORT_SYMBOL(scsw_tm_is_valid_key); * Return non-zero if the eswf field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_eswf(union scsw *scsw) +static inline int scsw_tm_is_valid_eswf(union scsw *scsw) { return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND); } -EXPORT_SYMBOL(scsw_tm_is_valid_eswf); /** * scsw_tm_is_valid_cc - check cc field validity @@ -451,12 +588,11 @@ EXPORT_SYMBOL(scsw_tm_is_valid_eswf); * Return non-zero if the cc field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_cc(union scsw *scsw) +static inline int scsw_tm_is_valid_cc(union scsw *scsw) { return (scsw->tm.fctl & SCSW_FCTL_START_FUNC) && (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND); } -EXPORT_SYMBOL(scsw_tm_is_valid_cc); /** * scsw_tm_is_valid_fmt - check fmt field validity @@ -465,11 +601,10 @@ EXPORT_SYMBOL(scsw_tm_is_valid_cc); * Return non-zero if the fmt field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_fmt(union scsw *scsw) +static inline int scsw_tm_is_valid_fmt(union scsw *scsw) { return 1; } -EXPORT_SYMBOL(scsw_tm_is_valid_fmt); /** * scsw_tm_is_valid_x - check x field validity @@ -478,11 +613,10 @@ EXPORT_SYMBOL(scsw_tm_is_valid_fmt); * Return non-zero if the x field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_x(union scsw *scsw) +static inline int scsw_tm_is_valid_x(union scsw *scsw) { return 1; } -EXPORT_SYMBOL(scsw_tm_is_valid_x); /** * scsw_tm_is_valid_q - check q field validity @@ -491,11 +625,10 @@ EXPORT_SYMBOL(scsw_tm_is_valid_x); * Return non-zero if the q field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_q(union scsw *scsw) +static inline int scsw_tm_is_valid_q(union scsw *scsw) { return 1; } -EXPORT_SYMBOL(scsw_tm_is_valid_q); /** * scsw_tm_is_valid_ectl - check ectl field validity @@ -504,13 +637,12 @@ EXPORT_SYMBOL(scsw_tm_is_valid_q); * Return non-zero if the ectl field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_ectl(union scsw *scsw) +static inline int scsw_tm_is_valid_ectl(union scsw *scsw) { return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && !(scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) && (scsw->tm.stctl & SCSW_STCTL_ALERT_STATUS); } -EXPORT_SYMBOL(scsw_tm_is_valid_ectl); /** * scsw_tm_is_valid_pno - check pno field validity @@ -519,7 +651,7 @@ EXPORT_SYMBOL(scsw_tm_is_valid_ectl); * Return non-zero if the pno field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_pno(union scsw *scsw) +static inline int scsw_tm_is_valid_pno(union scsw *scsw) { return (scsw->tm.fctl != 0) && (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && @@ -527,7 +659,6 @@ int scsw_tm_is_valid_pno(union scsw *scsw) ((scsw->tm.stctl & SCSW_STCTL_INTER_STATUS) && (scsw->tm.actl & SCSW_ACTL_SUSPENDED))); } -EXPORT_SYMBOL(scsw_tm_is_valid_pno); /** * scsw_tm_is_valid_fctl - check fctl field validity @@ -536,12 +667,11 @@ EXPORT_SYMBOL(scsw_tm_is_valid_pno); * Return non-zero if the fctl field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_fctl(union scsw *scsw) +static inline int scsw_tm_is_valid_fctl(union scsw *scsw) { /* Only valid if pmcw.dnv == 1*/ return 1; } -EXPORT_SYMBOL(scsw_tm_is_valid_fctl); /** * scsw_tm_is_valid_actl - check actl field validity @@ -550,12 +680,11 @@ EXPORT_SYMBOL(scsw_tm_is_valid_fctl); * Return non-zero if the actl field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_actl(union scsw *scsw) +static inline int scsw_tm_is_valid_actl(union scsw *scsw) { /* Only valid if pmcw.dnv == 1*/ return 1; } -EXPORT_SYMBOL(scsw_tm_is_valid_actl); /** * scsw_tm_is_valid_stctl - check stctl field validity @@ -564,12 +693,11 @@ EXPORT_SYMBOL(scsw_tm_is_valid_actl); * Return non-zero if the stctl field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_stctl(union scsw *scsw) +static inline int scsw_tm_is_valid_stctl(union scsw *scsw) { /* Only valid if pmcw.dnv == 1*/ return 1; } -EXPORT_SYMBOL(scsw_tm_is_valid_stctl); /** * scsw_tm_is_valid_dstat - check dstat field validity @@ -578,12 +706,11 @@ EXPORT_SYMBOL(scsw_tm_is_valid_stctl); * Return non-zero if the dstat field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_dstat(union scsw *scsw) +static inline int scsw_tm_is_valid_dstat(union scsw *scsw) { return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && (scsw->tm.cc != 3); } -EXPORT_SYMBOL(scsw_tm_is_valid_dstat); /** * scsw_tm_is_valid_cstat - check cstat field validity @@ -592,12 +719,11 @@ EXPORT_SYMBOL(scsw_tm_is_valid_dstat); * Return non-zero if the cstat field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_cstat(union scsw *scsw) +static inline int scsw_tm_is_valid_cstat(union scsw *scsw) { return (scsw->tm.stctl & SCSW_STCTL_STATUS_PEND) && (scsw->tm.cc != 3); } -EXPORT_SYMBOL(scsw_tm_is_valid_cstat); /** * scsw_tm_is_valid_fcxs - check fcxs field validity @@ -606,11 +732,10 @@ EXPORT_SYMBOL(scsw_tm_is_valid_cstat); * Return non-zero if the fcxs field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_fcxs(union scsw *scsw) +static inline int scsw_tm_is_valid_fcxs(union scsw *scsw) { return 1; } -EXPORT_SYMBOL(scsw_tm_is_valid_fcxs); /** * scsw_tm_is_valid_schxs - check schxs field validity @@ -619,14 +744,13 @@ EXPORT_SYMBOL(scsw_tm_is_valid_fcxs); * Return non-zero if the schxs field of the specified transport mode scsw is * valid, zero otherwise. */ -int scsw_tm_is_valid_schxs(union scsw *scsw) +static inline int scsw_tm_is_valid_schxs(union scsw *scsw) { return (scsw->tm.cstat & (SCHN_STAT_PROG_CHECK | SCHN_STAT_INTF_CTRL_CHK | SCHN_STAT_PROT_CHECK | SCHN_STAT_CHN_DATA_CHK)); } -EXPORT_SYMBOL(scsw_tm_is_valid_schxs); /** * scsw_is_valid_actl - check actl field validity @@ -636,14 +760,13 @@ EXPORT_SYMBOL(scsw_tm_is_valid_schxs); * regardless of whether it is a transport mode or command mode scsw. * Return zero if the field does not contain a valid value. */ -int scsw_is_valid_actl(union scsw *scsw) +static inline int scsw_is_valid_actl(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_valid_actl(scsw); else return scsw_cmd_is_valid_actl(scsw); } -EXPORT_SYMBOL(scsw_is_valid_actl); /** * scsw_is_valid_cc - check cc field validity @@ -653,14 +776,13 @@ EXPORT_SYMBOL(scsw_is_valid_actl); * regardless of whether it is a transport mode or command mode scsw. * Return zero if the field does not contain a valid value. */ -int scsw_is_valid_cc(union scsw *scsw) +static inline int scsw_is_valid_cc(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_valid_cc(scsw); else return scsw_cmd_is_valid_cc(scsw); } -EXPORT_SYMBOL(scsw_is_valid_cc); /** * scsw_is_valid_cstat - check cstat field validity @@ -670,14 +792,13 @@ EXPORT_SYMBOL(scsw_is_valid_cc); * regardless of whether it is a transport mode or command mode scsw. * Return zero if the field does not contain a valid value. */ -int scsw_is_valid_cstat(union scsw *scsw) +static inline int scsw_is_valid_cstat(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_valid_cstat(scsw); else return scsw_cmd_is_valid_cstat(scsw); } -EXPORT_SYMBOL(scsw_is_valid_cstat); /** * scsw_is_valid_dstat - check dstat field validity @@ -687,14 +808,13 @@ EXPORT_SYMBOL(scsw_is_valid_cstat); * regardless of whether it is a transport mode or command mode scsw. * Return zero if the field does not contain a valid value. */ -int scsw_is_valid_dstat(union scsw *scsw) +static inline int scsw_is_valid_dstat(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_valid_dstat(scsw); else return scsw_cmd_is_valid_dstat(scsw); } -EXPORT_SYMBOL(scsw_is_valid_dstat); /** * scsw_is_valid_ectl - check ectl field validity @@ -704,14 +824,13 @@ EXPORT_SYMBOL(scsw_is_valid_dstat); * regardless of whether it is a transport mode or command mode scsw. * Return zero if the field does not contain a valid value. */ -int scsw_is_valid_ectl(union scsw *scsw) +static inline int scsw_is_valid_ectl(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_valid_ectl(scsw); else return scsw_cmd_is_valid_ectl(scsw); } -EXPORT_SYMBOL(scsw_is_valid_ectl); /** * scsw_is_valid_eswf - check eswf field validity @@ -721,14 +840,13 @@ EXPORT_SYMBOL(scsw_is_valid_ectl); * regardless of whether it is a transport mode or command mode scsw. * Return zero if the field does not contain a valid value. */ -int scsw_is_valid_eswf(union scsw *scsw) +static inline int scsw_is_valid_eswf(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_valid_eswf(scsw); else return scsw_cmd_is_valid_eswf(scsw); } -EXPORT_SYMBOL(scsw_is_valid_eswf); /** * scsw_is_valid_fctl - check fctl field validity @@ -738,14 +856,13 @@ EXPORT_SYMBOL(scsw_is_valid_eswf); * regardless of whether it is a transport mode or command mode scsw. * Return zero if the field does not contain a valid value. */ -int scsw_is_valid_fctl(union scsw *scsw) +static inline int scsw_is_valid_fctl(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_valid_fctl(scsw); else return scsw_cmd_is_valid_fctl(scsw); } -EXPORT_SYMBOL(scsw_is_valid_fctl); /** * scsw_is_valid_key - check key field validity @@ -755,14 +872,13 @@ EXPORT_SYMBOL(scsw_is_valid_fctl); * regardless of whether it is a transport mode or command mode scsw. * Return zero if the field does not contain a valid value. */ -int scsw_is_valid_key(union scsw *scsw) +static inline int scsw_is_valid_key(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_valid_key(scsw); else return scsw_cmd_is_valid_key(scsw); } -EXPORT_SYMBOL(scsw_is_valid_key); /** * scsw_is_valid_pno - check pno field validity @@ -772,14 +888,13 @@ EXPORT_SYMBOL(scsw_is_valid_key); * regardless of whether it is a transport mode or command mode scsw. * Return zero if the field does not contain a valid value. */ -int scsw_is_valid_pno(union scsw *scsw) +static inline int scsw_is_valid_pno(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_valid_pno(scsw); else return scsw_cmd_is_valid_pno(scsw); } -EXPORT_SYMBOL(scsw_is_valid_pno); /** * scsw_is_valid_stctl - check stctl field validity @@ -789,14 +904,13 @@ EXPORT_SYMBOL(scsw_is_valid_pno); * regardless of whether it is a transport mode or command mode scsw. * Return zero if the field does not contain a valid value. */ -int scsw_is_valid_stctl(union scsw *scsw) +static inline int scsw_is_valid_stctl(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_valid_stctl(scsw); else return scsw_cmd_is_valid_stctl(scsw); } -EXPORT_SYMBOL(scsw_is_valid_stctl); /** * scsw_cmd_is_solicited - check for solicited scsw @@ -805,12 +919,11 @@ EXPORT_SYMBOL(scsw_is_valid_stctl); * Return non-zero if the command mode scsw indicates that the associated * status condition is solicited, zero if it is unsolicited. */ -int scsw_cmd_is_solicited(union scsw *scsw) +static inline int scsw_cmd_is_solicited(union scsw *scsw) { return (scsw->cmd.cc != 0) || (scsw->cmd.stctl != (SCSW_STCTL_STATUS_PEND | SCSW_STCTL_ALERT_STATUS)); } -EXPORT_SYMBOL(scsw_cmd_is_solicited); /** * scsw_tm_is_solicited - check for solicited scsw @@ -819,12 +932,11 @@ EXPORT_SYMBOL(scsw_cmd_is_solicited); * Return non-zero if the transport mode scsw indicates that the associated * status condition is solicited, zero if it is unsolicited. */ -int scsw_tm_is_solicited(union scsw *scsw) +static inline int scsw_tm_is_solicited(union scsw *scsw) { return (scsw->tm.cc != 0) || (scsw->tm.stctl != (SCSW_STCTL_STATUS_PEND | SCSW_STCTL_ALERT_STATUS)); } -EXPORT_SYMBOL(scsw_tm_is_solicited); /** * scsw_is_solicited - check for solicited scsw @@ -833,11 +945,12 @@ EXPORT_SYMBOL(scsw_tm_is_solicited); * Return non-zero if the transport or command mode scsw indicates that the * associated status condition is solicited, zero if it is unsolicited. */ -int scsw_is_solicited(union scsw *scsw) +static inline int scsw_is_solicited(union scsw *scsw) { if (scsw_is_tm(scsw)) return scsw_tm_is_solicited(scsw); else return scsw_cmd_is_solicited(scsw); } -EXPORT_SYMBOL(scsw_is_solicited); + +#endif /* _ASM_S390_SCSW_H_ */ diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 38b0fc221ed..e37478e8728 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -8,7 +8,7 @@ #ifndef _ASM_S390_SETUP_H #define _ASM_S390_SETUP_H -#define COMMAND_LINE_SIZE 1024 +#define COMMAND_LINE_SIZE 4096 #define ARCH_COMMAND_LINE_SIZE 896 diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 72137bc907a..c991fe6473c 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -51,32 +51,7 @@ extern void machine_power_off_smp(void); #define PROC_CHANGE_PENALTY 20 /* Schedule penalty */ #define raw_smp_processor_id() (S390_lowcore.cpu_nr) - -/* - * returns 1 if cpu is in stopped/check stopped state or not operational - * returns 0 otherwise - */ -static inline int -smp_cpu_not_running(int cpu) -{ - __u32 status; - - switch (signal_processor_ps(&status, 0, cpu, sigp_sense)) { - case sigp_order_code_accepted: - case sigp_status_stored: - /* Check for stopped and check stop state */ - if (status & 0x50) - return 1; - break; - case sigp_not_operational: - return 1; - default: - break; - } - return 0; -} - -#define cpu_logical_map(cpu) (cpu) +#define cpu_logical_map(cpu) (cpu) extern int __cpu_disable (void); extern void __cpu_die (unsigned int cpu); @@ -91,11 +66,6 @@ extern void arch_send_call_function_ipi(cpumask_t mask); #endif -#ifndef CONFIG_SMP -#define hard_smp_processor_id() 0 -#define smp_cpu_not_running(cpu) 1 -#endif - #ifdef CONFIG_HOTPLUG_CPU extern int smp_rescan_cpus(void); #else diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h index 4fb83c1cdb7..379661d2f81 100644 --- a/arch/s390/include/asm/system.h +++ b/arch/s390/include/asm/system.h @@ -109,11 +109,7 @@ extern void pfault_fini(void); #define pfault_fini() do { } while (0) #endif /* CONFIG_PFAULT */ -#ifdef CONFIG_PAGE_STATES extern void cmma_init(void); -#else -static inline void cmma_init(void) { } -#endif #define finish_arch_switch(prev) do { \ set_fs(current->thread.mm_segment); \ diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index cc21e3e20fd..24aa1cda20a 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -90,4 +90,18 @@ unsigned long long monotonic_clock(void); extern u64 sched_clock_base_cc; +/** + * get_clock_monotonic - returns current time in clock rate units + * + * The caller must ensure that preemption is disabled. + * The clock and sched_clock_base get changed via stop_machine. + * Therefore preemption must be disabled when calling this + * function, otherwise the returned value is not guaranteed to + * be monotonic. + */ +static inline unsigned long long get_clock_monotonic(void) +{ + return get_clock_xt() - sched_clock_base_cc; +} + #endif diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index c75ed43b1a1..c7be8e10b87 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -32,7 +32,7 @@ extra-y += head.o init_task.o vmlinux.lds obj-$(CONFIG_MODULES) += s390_ksyms.o module.o obj-$(CONFIG_SMP) += smp.o topology.o - +obj-$(CONFIG_HIBERNATION) += suspend.o swsusp_asm64.o obj-$(CONFIG_AUDIT) += audit.o compat-obj-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o \ @@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o \ obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_FUNCTION_TRACER) += mcount.o +obj-$(CONFIG_FUNCTION_TRACER) += $(if $(CONFIG_64BIT),mcount64.o,mcount.o) obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index cae14c49951..bf8b4ae7ff2 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -6,6 +6,9 @@ * Heiko Carstens <heiko.carstens@de.ibm.com> */ +#define KMSG_COMPONENT "setup" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/compiler.h> #include <linux/init.h> #include <linux/errno.h> @@ -16,6 +19,7 @@ #include <linux/module.h> #include <linux/pfn.h> #include <linux/uaccess.h> +#include <linux/kernel.h> #include <asm/ebcdic.h> #include <asm/ipl.h> #include <asm/lowcore.h> @@ -35,8 +39,6 @@ char kernel_nss_name[NSS_NAME_SIZE + 1]; -static unsigned long machine_flags; - static void __init setup_boot_command_line(void); /* @@ -81,6 +83,8 @@ asm( " br 14\n" " .size savesys_ipl_nss, .-savesys_ipl_nss\n"); +static __initdata char upper_command_line[COMMAND_LINE_SIZE]; + static noinline __init void create_kernel_nss(void) { unsigned int i, stext_pfn, eshared_pfn, end_pfn, min_size; @@ -90,7 +94,6 @@ static noinline __init void create_kernel_nss(void) int response; size_t len; char *savesys_ptr; - char upper_command_line[COMMAND_LINE_SIZE]; char defsys_cmd[DEFSYS_CMD_SIZE]; char savesys_cmd[SAVESYS_CMD_SIZE]; @@ -141,6 +144,8 @@ static noinline __init void create_kernel_nss(void) __cpcmd(defsys_cmd, NULL, 0, &response); if (response != 0) { + pr_err("Defining the Linux kernel NSS failed with rc=%d\n", + response); kernel_nss_name[0] = '\0'; return; } @@ -153,8 +158,11 @@ static noinline __init void create_kernel_nss(void) * max SAVESYS_CMD_SIZE * On error: response contains the numeric portion of cp error message. * for SAVESYS it will be >= 263 + * for missing privilege class, it will be 1 */ - if (response > SAVESYS_CMD_SIZE) { + if (response > SAVESYS_CMD_SIZE || response == 1) { + pr_err("Saving the Linux kernel NSS failed with rc=%d\n", + response); kernel_nss_name[0] = '\0'; return; } @@ -205,12 +213,9 @@ static noinline __init void detect_machine_type(void) /* Running under KVM? If not we assume z/VM */ if (!memcmp(vmms.vm[0].cpi, "\xd2\xe5\xd4", 3)) - machine_flags |= MACHINE_FLAG_KVM; + S390_lowcore.machine_flags |= MACHINE_FLAG_KVM; else - machine_flags |= MACHINE_FLAG_VM; - - /* Store machine flags for setting up lowcore early */ - S390_lowcore.machine_flags = machine_flags; + S390_lowcore.machine_flags |= MACHINE_FLAG_VM; } static __init void early_pgm_check_handler(void) @@ -245,7 +250,7 @@ static noinline __init void setup_hpage(void) facilities = stfl(); if (!(facilities & (1UL << 23)) || !(facilities & (1UL << 29))) return; - machine_flags |= MACHINE_FLAG_HPAGE; + S390_lowcore.machine_flags |= MACHINE_FLAG_HPAGE; __ctl_set_bit(0, 23); #endif } @@ -263,7 +268,7 @@ static __init void detect_mvpg(void) EX_TABLE(0b,1b) : "=d" (rc) : "0" (-EOPNOTSUPP), "a" (0) : "memory", "cc", "0"); if (!rc) - machine_flags |= MACHINE_FLAG_MVPG; + S390_lowcore.machine_flags |= MACHINE_FLAG_MVPG; #endif } @@ -279,7 +284,7 @@ static __init void detect_ieee(void) EX_TABLE(0b,1b) : "=d" (rc), "=d" (tmp): "0" (-EOPNOTSUPP) : "cc"); if (!rc) - machine_flags |= MACHINE_FLAG_IEEE; + S390_lowcore.machine_flags |= MACHINE_FLAG_IEEE; #endif } @@ -298,7 +303,7 @@ static __init void detect_csp(void) EX_TABLE(0b,1b) : "=d" (rc) : "0" (-EOPNOTSUPP) : "cc", "0", "1", "2"); if (!rc) - machine_flags |= MACHINE_FLAG_CSP; + S390_lowcore.machine_flags |= MACHINE_FLAG_CSP; #endif } @@ -315,7 +320,7 @@ static __init void detect_diag9c(void) EX_TABLE(0b,1b) : "=d" (rc) : "0" (-EOPNOTSUPP), "d" (cpu_address) : "cc"); if (!rc) - machine_flags |= MACHINE_FLAG_DIAG9C; + S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C; } static __init void detect_diag44(void) @@ -330,7 +335,7 @@ static __init void detect_diag44(void) EX_TABLE(0b,1b) : "=d" (rc) : "0" (-EOPNOTSUPP) : "cc"); if (!rc) - machine_flags |= MACHINE_FLAG_DIAG44; + S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG44; #endif } @@ -341,11 +346,11 @@ static __init void detect_machine_facilities(void) facilities = stfl(); if (facilities & (1 << 28)) - machine_flags |= MACHINE_FLAG_IDTE; + S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; if (facilities & (1 << 23)) - machine_flags |= MACHINE_FLAG_PFMF; + S390_lowcore.machine_flags |= MACHINE_FLAG_PFMF; if (facilities & (1 << 4)) - machine_flags |= MACHINE_FLAG_MVCOS; + S390_lowcore.machine_flags |= MACHINE_FLAG_MVCOS; #endif } @@ -367,21 +372,35 @@ static __init void rescue_initrd(void) } /* Set up boot command line */ -static void __init setup_boot_command_line(void) +static void __init append_to_cmdline(size_t (*ipl_data)(char *, size_t)) { - char *parm = NULL; + char *parm, *delim; + size_t rc, len; + + len = strlen(boot_command_line); + + delim = boot_command_line + len; /* '\0' character position */ + parm = boot_command_line + len + 1; /* append right after '\0' */ + rc = ipl_data(parm, COMMAND_LINE_SIZE - len - 1); + if (rc) { + if (*parm == '=') + memmove(boot_command_line, parm + 1, rc); + else + *delim = ' '; /* replace '\0' with space */ + } +} + +static void __init setup_boot_command_line(void) +{ /* copy arch command line */ strlcpy(boot_command_line, COMMAND_LINE, ARCH_COMMAND_LINE_SIZE); /* append IPL PARM data to the boot command line */ - if (MACHINE_IS_VM) { - parm = boot_command_line + strlen(boot_command_line); - *parm++ = ' '; - get_ipl_vmparm(parm); - if (parm[0] == '=') - memmove(boot_command_line, parm + 1, strlen(parm)); - } + if (MACHINE_IS_VM) + append_to_cmdline(append_ipl_vmparm); + + append_to_cmdline(append_ipl_scpdata); } @@ -413,7 +432,6 @@ void __init startup_init(void) setup_hpage(); sclp_facilities_detect(); detect_memory_layout(memory_chunk); - S390_lowcore.machine_flags = machine_flags; #ifdef CONFIG_DYNAMIC_FTRACE S390_lowcore.ftrace_func = (unsigned long)ftrace_caller; #endif diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index c4c80a22bc1..f78580a7403 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -278,7 +278,8 @@ sysc_return: bnz BASED(sysc_work) # there is work to do (signals etc.) sysc_restore: #ifdef CONFIG_TRACE_IRQFLAGS - la %r1,BASED(sysc_restore_trace_psw) + la %r1,BASED(sysc_restore_trace_psw_addr) + l %r1,0(%r1) lpsw 0(%r1) sysc_restore_trace: TRACE_IRQS_CHECK @@ -289,10 +290,15 @@ sysc_leave: sysc_done: #ifdef CONFIG_TRACE_IRQFLAGS +sysc_restore_trace_psw_addr: + .long sysc_restore_trace_psw + + .section .data,"aw",@progbits .align 8 .globl sysc_restore_trace_psw sysc_restore_trace_psw: .long 0, sysc_restore_trace + 0x80000000 + .previous #endif # @@ -606,7 +612,8 @@ io_return: bnz BASED(io_work) # there is work to do (signals etc.) io_restore: #ifdef CONFIG_TRACE_IRQFLAGS - la %r1,BASED(io_restore_trace_psw) + la %r1,BASED(io_restore_trace_psw_addr) + l %r1,0(%r1) lpsw 0(%r1) io_restore_trace: TRACE_IRQS_CHECK @@ -617,10 +624,15 @@ io_leave: io_done: #ifdef CONFIG_TRACE_IRQFLAGS +io_restore_trace_psw_addr: + .long io_restore_trace_psw + + .section .data,"aw",@progbits .align 8 .globl io_restore_trace_psw io_restore_trace_psw: .long 0, io_restore_trace + 0x80000000 + .previous #endif # diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index f6618e9e15e..009ca6175db 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -284,10 +284,12 @@ sysc_leave: sysc_done: #ifdef CONFIG_TRACE_IRQFLAGS + .section .data,"aw",@progbits .align 8 .globl sysc_restore_trace_psw sysc_restore_trace_psw: .quad 0, sysc_restore_trace + .previous #endif # @@ -595,10 +597,12 @@ io_leave: io_done: #ifdef CONFIG_TRACE_IRQFLAGS + .section .data,"aw",@progbits .align 8 .globl io_restore_trace_psw io_restore_trace_psw: .quad 0, io_restore_trace + .previous #endif # diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index ec688234852..c52b4f7742f 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -27,6 +27,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/page.h> +#include <asm/cpu.h> #ifdef CONFIG_64BIT #define ARCH_OFFSET 4 diff --git a/arch/s390/kernel/head31.S b/arch/s390/kernel/head31.S index 2ced846065b..602b508cd4c 100644 --- a/arch/s390/kernel/head31.S +++ b/arch/s390/kernel/head31.S @@ -24,6 +24,7 @@ startup_continue: # Setup stack # l %r15,.Linittu-.LPG1(%r13) + st %r15,__LC_THREAD_INFO # cache thread info in lowcore mvc __LC_CURRENT(4),__TI_task(%r15) ahi %r15,1<<(PAGE_SHIFT+THREAD_ORDER) # init_task_union+THREAD_SIZE st %r15,__LC_KERNEL_STACK # set end of kernel stack diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 65667b2e65c..6a250808092 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -62,9 +62,9 @@ startup_continue: clr %r11,%r12 je 5f # no more space in prefix array 4: - ahi %r8,1 # next cpu (r8 += 1) - cl %r8,.Llast_cpu-.LPG1(%r13) # is last possible cpu ? - jl 1b # jump if not last cpu + ahi %r8,1 # next cpu (r8 += 1) + chi %r8,MAX_CPU_ADDRESS # is last possible cpu ? + jle 1b # jump if not last cpu 5: lhi %r1,2 # mode 2 = esame (dump) j 6f @@ -92,6 +92,7 @@ startup_continue: # Setup stack # larl %r15,init_thread_union + stg %r15,__LC_THREAD_INFO # cache thread info in lowcore lg %r14,__TI_task(%r15) # cache current in lowcore stg %r14,__LC_CURRENT aghi %r15,1<<(PAGE_SHIFT+THREAD_ORDER) # init_task_union + THREAD_SIZE @@ -129,8 +130,6 @@ startup_continue: #ifdef CONFIG_ZFCPDUMP .Lcurrent_cpu: .long 0x0 -.Llast_cpu: - .long 0x0000ffff .Lpref_arr_ptr: .long zfcpdump_prefix_array #endif /* CONFIG_ZFCPDUMP */ diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 371a2d88f4a..ee57a42e6e9 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -272,17 +272,18 @@ static ssize_t ipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); /* VM IPL PARM routines */ -static void reipl_get_ascii_vmparm(char *dest, +size_t reipl_get_ascii_vmparm(char *dest, size_t size, const struct ipl_parameter_block *ipb) { int i; - int len = 0; + size_t len; char has_lowercase = 0; + len = 0; if ((ipb->ipl_info.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID) && (ipb->ipl_info.ccw.vm_parm_len > 0)) { - len = ipb->ipl_info.ccw.vm_parm_len; + len = min_t(size_t, size - 1, ipb->ipl_info.ccw.vm_parm_len); memcpy(dest, ipb->ipl_info.ccw.vm_parm, len); /* If at least one character is lowercase, we assume mixed * case; otherwise we convert everything to lowercase. @@ -299,14 +300,20 @@ static void reipl_get_ascii_vmparm(char *dest, EBCASC(dest, len); } dest[len] = 0; + + return len; } -void get_ipl_vmparm(char *dest) +size_t append_ipl_vmparm(char *dest, size_t size) { + size_t rc; + + rc = 0; if (diag308_set_works && (ipl_block.hdr.pbt == DIAG308_IPL_TYPE_CCW)) - reipl_get_ascii_vmparm(dest, &ipl_block); + rc = reipl_get_ascii_vmparm(dest, size, &ipl_block); else dest[0] = 0; + return rc; } static ssize_t ipl_vm_parm_show(struct kobject *kobj, @@ -314,10 +321,65 @@ static ssize_t ipl_vm_parm_show(struct kobject *kobj, { char parm[DIAG308_VMPARM_SIZE + 1] = {}; - get_ipl_vmparm(parm); + append_ipl_vmparm(parm, sizeof(parm)); return sprintf(page, "%s\n", parm); } +static size_t scpdata_length(const char* buf, size_t count) +{ + while (count) { + if (buf[count - 1] != '\0' && buf[count - 1] != ' ') + break; + count--; + } + return count; +} + +size_t reipl_append_ascii_scpdata(char *dest, size_t size, + const struct ipl_parameter_block *ipb) +{ + size_t count; + size_t i; + int has_lowercase; + + count = min(size - 1, scpdata_length(ipb->ipl_info.fcp.scp_data, + ipb->ipl_info.fcp.scp_data_len)); + if (!count) + goto out; + + has_lowercase = 0; + for (i = 0; i < count; i++) { + if (!isascii(ipb->ipl_info.fcp.scp_data[i])) { + count = 0; + goto out; + } + if (!has_lowercase && islower(ipb->ipl_info.fcp.scp_data[i])) + has_lowercase = 1; + } + + if (has_lowercase) + memcpy(dest, ipb->ipl_info.fcp.scp_data, count); + else + for (i = 0; i < count; i++) + dest[i] = tolower(ipb->ipl_info.fcp.scp_data[i]); +out: + dest[count] = '\0'; + return count; +} + +size_t append_ipl_scpdata(char *dest, size_t len) +{ + size_t rc; + + rc = 0; + if (ipl_block.hdr.pbt == DIAG308_IPL_TYPE_FCP) + rc = reipl_append_ascii_scpdata(dest, len, &ipl_block); + else + dest[0] = 0; + return rc; +} + + static struct kobj_attribute sys_ipl_vm_parm_attr = __ATTR(parm, S_IRUGO, ipl_vm_parm_show, NULL); @@ -553,7 +615,7 @@ static ssize_t reipl_generic_vmparm_show(struct ipl_parameter_block *ipb, { char vmparm[DIAG308_VMPARM_SIZE + 1] = {}; - reipl_get_ascii_vmparm(vmparm, ipb); + reipl_get_ascii_vmparm(vmparm, sizeof(vmparm), ipb); return sprintf(page, "%s\n", vmparm); } @@ -626,6 +688,59 @@ static struct kobj_attribute sys_reipl_ccw_vmparm_attr = /* FCP reipl device attributes */ +static ssize_t reipl_fcp_scpdata_read(struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t size = reipl_block_fcp->ipl_info.fcp.scp_data_len; + void *scp_data = reipl_block_fcp->ipl_info.fcp.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t reipl_fcp_scpdata_write(struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t padding; + size_t scpdata_len; + + if (off < 0) + return -EINVAL; + + if (off >= DIAG308_SCPDATA_SIZE) + return -ENOSPC; + + if (count > DIAG308_SCPDATA_SIZE - off) + count = DIAG308_SCPDATA_SIZE - off; + + memcpy(reipl_block_fcp->ipl_info.fcp.scp_data, buf + off, count); + scpdata_len = off + count; + + if (scpdata_len % 8) { + padding = 8 - (scpdata_len % 8); + memset(reipl_block_fcp->ipl_info.fcp.scp_data + scpdata_len, + 0, padding); + scpdata_len += padding; + } + + reipl_block_fcp->ipl_info.fcp.scp_data_len = scpdata_len; + reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN + scpdata_len; + reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN + scpdata_len; + + return count; +} + +static struct bin_attribute sys_reipl_fcp_scp_data_attr = { + .attr = { + .name = "scp_data", + .mode = S_IRUGO | S_IWUSR, + }, + .size = PAGE_SIZE, + .read = reipl_fcp_scpdata_read, + .write = reipl_fcp_scpdata_write, +}; + DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%016llx\n", reipl_block_fcp->ipl_info.fcp.wwpn); DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%016llx\n", @@ -647,7 +762,6 @@ static struct attribute *reipl_fcp_attrs[] = { }; static struct attribute_group reipl_fcp_attr_group = { - .name = IPL_FCP_STR, .attrs = reipl_fcp_attrs, }; @@ -895,6 +1009,7 @@ static struct kobj_attribute reipl_type_attr = __ATTR(reipl_type, 0644, reipl_type_show, reipl_type_store); static struct kset *reipl_kset; +static struct kset *reipl_fcp_kset; static void get_ipl_string(char *dst, struct ipl_parameter_block *ipb, const enum ipl_method m) @@ -906,7 +1021,7 @@ static void get_ipl_string(char *dst, struct ipl_parameter_block *ipb, reipl_get_ascii_loadparm(loadparm, ipb); reipl_get_ascii_nss_name(nss_name, ipb); - reipl_get_ascii_vmparm(vmparm, ipb); + reipl_get_ascii_vmparm(vmparm, sizeof(vmparm), ipb); switch (m) { case REIPL_METHOD_CCW_VM: @@ -1076,23 +1191,44 @@ static int __init reipl_fcp_init(void) int rc; if (!diag308_set_works) { - if (ipl_info.type == IPL_TYPE_FCP) + if (ipl_info.type == IPL_TYPE_FCP) { make_attrs_ro(reipl_fcp_attrs); - else + sys_reipl_fcp_scp_data_attr.attr.mode = S_IRUGO; + } else return 0; } reipl_block_fcp = (void *) get_zeroed_page(GFP_KERNEL); if (!reipl_block_fcp) return -ENOMEM; - rc = sysfs_create_group(&reipl_kset->kobj, &reipl_fcp_attr_group); + + /* sysfs: create fcp kset for mixing attr group and bin attrs */ + reipl_fcp_kset = kset_create_and_add(IPL_FCP_STR, NULL, + &reipl_kset->kobj); + if (!reipl_kset) { + free_page((unsigned long) reipl_block_fcp); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); + if (rc) { + kset_unregister(reipl_fcp_kset); + free_page((unsigned long) reipl_block_fcp); + return rc; + } + + rc = sysfs_create_bin_file(&reipl_fcp_kset->kobj, + &sys_reipl_fcp_scp_data_attr); if (rc) { - free_page((unsigned long)reipl_block_fcp); + sysfs_remove_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group); + kset_unregister(reipl_fcp_kset); + free_page((unsigned long) reipl_block_fcp); return rc; } - if (ipl_info.type == IPL_TYPE_FCP) { + + if (ipl_info.type == IPL_TYPE_FCP) memcpy(reipl_block_fcp, IPL_PARMBLOCK_START, PAGE_SIZE); - } else { + else { reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN; reipl_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 2a0a5e97ba8..dfe015d7398 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -11,111 +11,27 @@ ftrace_stub: br %r14 -#ifdef CONFIG_64BIT - -#ifdef CONFIG_DYNAMIC_FTRACE - .globl _mcount _mcount: - br %r14 - - .globl ftrace_caller -ftrace_caller: - larl %r1,function_trace_stop - icm %r1,0xf,0(%r1) - bnzr %r14 - stmg %r2,%r5,32(%r15) - stg %r14,112(%r15) - lgr %r1,%r15 - aghi %r15,-160 - stg %r1,__SF_BACKCHAIN(%r15) - lgr %r2,%r14 - lg %r3,168(%r15) - larl %r14,ftrace_dyn_func - lg %r14,0(%r14) - basr %r14,%r14 -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - .globl ftrace_graph_caller -ftrace_graph_caller: - # This unconditional branch gets runtime patched. Change only if - # you know what you are doing. See ftrace_enable_graph_caller(). - j 0f - lg %r2,272(%r15) - lg %r3,168(%r15) - brasl %r14,prepare_ftrace_return - stg %r2,168(%r15) -0: -#endif - aghi %r15,160 - lmg %r2,%r5,32(%r15) - lg %r14,112(%r15) +#ifdef CONFIG_DYNAMIC_FTRACE br %r14 .data .globl ftrace_dyn_func ftrace_dyn_func: - .quad ftrace_stub + .long ftrace_stub .previous -#else /* CONFIG_DYNAMIC_FTRACE */ - - .globl _mcount -_mcount: - larl %r1,function_trace_stop - icm %r1,0xf,0(%r1) - bnzr %r14 - stmg %r2,%r5,32(%r15) - stg %r14,112(%r15) - lgr %r1,%r15 - aghi %r15,-160 - stg %r1,__SF_BACKCHAIN(%r15) - lgr %r2,%r14 - lg %r3,168(%r15) - larl %r14,ftrace_trace_function - lg %r14,0(%r14) - basr %r14,%r14 -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - lg %r2,272(%r15) - lg %r3,168(%r15) - brasl %r14,prepare_ftrace_return - stg %r2,168(%r15) -#endif - aghi %r15,160 - lmg %r2,%r5,32(%r15) - lg %r14,112(%r15) - br %r14 - -#endif /* CONFIG_DYNAMIC_FTRACE */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - - .globl return_to_handler -return_to_handler: - stmg %r2,%r5,32(%r15) - lgr %r1,%r15 - aghi %r15,-160 - stg %r1,__SF_BACKCHAIN(%r15) - brasl %r14,ftrace_return_to_handler - aghi %r15,160 - lgr %r14,%r2 - lmg %r2,%r5,32(%r15) - br %r14 - -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#else /* CONFIG_64BIT */ - -#ifdef CONFIG_DYNAMIC_FTRACE - - .globl _mcount -_mcount: - br %r14 - .globl ftrace_caller ftrace_caller: +#endif stm %r2,%r5,16(%r15) bras %r1,2f +#ifdef CONFIG_DYNAMIC_FTRACE +0: .long ftrace_dyn_func +#else 0: .long ftrace_trace_function +#endif 1: .long function_trace_stop 2: l %r2,1b-0b(%r1) icm %r2,0xf,0(%r2) @@ -131,53 +47,13 @@ ftrace_caller: l %r14,0(%r14) basr %r14,%r14 #ifdef CONFIG_FUNCTION_GRAPH_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE .globl ftrace_graph_caller ftrace_graph_caller: # This unconditional branch gets runtime patched. Change only if # you know what you are doing. See ftrace_enable_graph_caller(). j 1f - bras %r1,0f - .long prepare_ftrace_return -0: l %r2,152(%r15) - l %r4,0(%r1) - l %r3,100(%r15) - basr %r14,%r4 - st %r2,100(%r15) -1: #endif - ahi %r15,96 - l %r14,56(%r15) -3: lm %r2,%r5,16(%r15) - br %r14 - - .data - .globl ftrace_dyn_func -ftrace_dyn_func: - .long ftrace_stub - .previous - -#else /* CONFIG_DYNAMIC_FTRACE */ - - .globl _mcount -_mcount: - stm %r2,%r5,16(%r15) - bras %r1,2f -0: .long ftrace_trace_function -1: .long function_trace_stop -2: l %r2,1b-0b(%r1) - icm %r2,0xf,0(%r2) - jnz 3f - st %r14,56(%r15) - lr %r0,%r15 - ahi %r15,-96 - l %r3,100(%r15) - la %r2,0(%r14) - st %r0,__SF_BACKCHAIN(%r15) - la %r3,0(%r3) - l %r14,0b-0b(%r1) - l %r14,0(%r14) - basr %r14,%r14 -#ifdef CONFIG_FUNCTION_GRAPH_TRACER bras %r1,0f .long prepare_ftrace_return 0: l %r2,152(%r15) @@ -185,14 +61,13 @@ _mcount: l %r3,100(%r15) basr %r14,%r4 st %r2,100(%r15) +1: #endif ahi %r15,96 l %r14,56(%r15) 3: lm %r2,%r5,16(%r15) br %r14 -#endif /* CONFIG_DYNAMIC_FTRACE */ - #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl return_to_handler @@ -211,6 +86,4 @@ return_to_handler: lm %r2,%r5,16(%r15) br %r14 -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#endif /* CONFIG_64BIT */ +#endif diff --git a/arch/s390/kernel/mcount64.S b/arch/s390/kernel/mcount64.S new file mode 100644 index 00000000000..c37211c6092 --- /dev/null +++ b/arch/s390/kernel/mcount64.S @@ -0,0 +1,78 @@ +/* + * Copyright IBM Corp. 2008,2009 + * + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, + * + */ + +#include <asm/asm-offsets.h> + + .globl ftrace_stub +ftrace_stub: + br %r14 + + .globl _mcount +_mcount: +#ifdef CONFIG_DYNAMIC_FTRACE + br %r14 + + .data + .globl ftrace_dyn_func +ftrace_dyn_func: + .quad ftrace_stub + .previous + + .globl ftrace_caller +ftrace_caller: +#endif + larl %r1,function_trace_stop + icm %r1,0xf,0(%r1) + bnzr %r14 + stmg %r2,%r5,32(%r15) + stg %r14,112(%r15) + lgr %r1,%r15 + aghi %r15,-160 + stg %r1,__SF_BACKCHAIN(%r15) + lgr %r2,%r14 + lg %r3,168(%r15) +#ifdef CONFIG_DYNAMIC_FTRACE + larl %r14,ftrace_dyn_func +#else + larl %r14,ftrace_trace_function +#endif + lg %r14,0(%r14) + basr %r14,%r14 +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + .globl ftrace_graph_caller +ftrace_graph_caller: + # This unconditional branch gets runtime patched. Change only if + # you know what you are doing. See ftrace_enable_graph_caller(). + j 0f +#endif + lg %r2,272(%r15) + lg %r3,168(%r15) + brasl %r14,prepare_ftrace_return + stg %r2,168(%r15) +0: +#endif + aghi %r15,160 + lmg %r2,%r5,32(%r15) + lg %r14,112(%r15) + br %r14 + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + + .globl return_to_handler +return_to_handler: + stmg %r2,%r5,32(%r15) + lgr %r1,%r15 + aghi %r15,-160 + stg %r1,__SF_BACKCHAIN(%r15) + brasl %r14,ftrace_return_to_handler + aghi %r15,160 + lgr %r14,%r2 + lmg %r2,%r5,32(%r15) + br %r14 + +#endif diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index cbb897bc50b..9ed13a1ed37 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -156,15 +156,11 @@ __setup("condev=", condev_setup); static void __init set_preferred_console(void) { - if (MACHINE_IS_KVM) { + if (MACHINE_IS_KVM) add_preferred_console("hvc", 0, NULL); - s390_virtio_console_init(); - return; - } - - if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) + else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP) add_preferred_console("ttyS", 0, NULL); - if (CONSOLE_IS_3270) + else if (CONSOLE_IS_3270) add_preferred_console("tty3270", 0, NULL); } diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 062bd64e65f..6b4fef877f9 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -536,4 +536,6 @@ void do_notify_resume(struct pt_regs *regs) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index be2cae08340..56c16876b91 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -49,6 +49,7 @@ #include <asm/sclp.h> #include <asm/cputime.h> #include <asm/vdso.h> +#include <asm/cpu.h> #include "entry.h" static struct task_struct *current_set[NR_CPUS]; @@ -70,6 +71,23 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices); static void smp_ext_bitcall(int, ec_bit_sig); +static int cpu_stopped(int cpu) +{ + __u32 status; + + switch (signal_processor_ps(&status, 0, cpu, sigp_sense)) { + case sigp_order_code_accepted: + case sigp_status_stored: + /* Check for stopped and check stop state */ + if (status & 0x50) + return 1; + break; + default: + break; + } + return 0; +} + void smp_send_stop(void) { int cpu, rc; @@ -86,7 +104,7 @@ void smp_send_stop(void) rc = signal_processor(cpu, sigp_stop); } while (rc == sigp_busy); - while (!smp_cpu_not_running(cpu)) + while (!cpu_stopped(cpu)) cpu_relax(); } } @@ -269,19 +287,6 @@ static inline void smp_get_save_area(unsigned int cpu, unsigned int phy_cpu) { } #endif /* CONFIG_ZFCPDUMP */ -static int cpu_stopped(int cpu) -{ - __u32 status; - - /* Check for stopped state */ - if (signal_processor_ps(&status, 0, cpu, sigp_sense) == - sigp_status_stored) { - if (status & 0x40) - return 1; - } - return 0; -} - static int cpu_known(int cpu_id) { int cpu; @@ -300,7 +305,7 @@ static int smp_rescan_cpus_sigp(cpumask_t avail) logical_cpu = cpumask_first(&avail); if (logical_cpu >= nr_cpu_ids) return 0; - for (cpu_id = 0; cpu_id <= 65535; cpu_id++) { + for (cpu_id = 0; cpu_id <= MAX_CPU_ADDRESS; cpu_id++) { if (cpu_known(cpu_id)) continue; __cpu_logical_map[logical_cpu] = cpu_id; @@ -379,7 +384,7 @@ static void __init smp_detect_cpus(void) /* Use sigp detection algorithm if sclp doesn't work. */ if (sclp_get_cpu_info(info)) { smp_use_sigp_detection = 1; - for (cpu = 0; cpu <= 65535; cpu++) { + for (cpu = 0; cpu <= MAX_CPU_ADDRESS; cpu++) { if (cpu == boot_cpu_addr) continue; __cpu_logical_map[CPU_INIT_NO] = cpu; @@ -635,7 +640,7 @@ int __cpu_disable(void) void __cpu_die(unsigned int cpu) { /* Wait until target cpu is down */ - while (!smp_cpu_not_running(cpu)) + while (!cpu_stopped(cpu)) cpu_relax(); smp_free_lowcore(cpu); pr_info("Processor %d stopped\n", cpu); diff --git a/arch/s390/power/swsusp.c b/arch/s390/kernel/suspend.c index bd1f5c6b0b8..086bee970ca 100644 --- a/arch/s390/power/swsusp.c +++ b/arch/s390/kernel/suspend.c @@ -1,13 +1,44 @@ /* - * Support for suspend and resume on s390 + * Suspend support specific for s390. * * Copyright IBM Corp. 2009 * * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> - * */ +#include <linux/suspend.h> +#include <linux/reboot.h> +#include <linux/pfn.h> +#include <linux/mm.h> +#include <asm/sections.h> #include <asm/system.h> +#include <asm/ipl.h> + +/* + * References to section boundaries + */ +extern const void __nosave_begin, __nosave_end; + +/* + * check if given pfn is in the 'nosave' or in the read only NSS section + */ +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) + >> PAGE_SHIFT; + unsigned long eshared_pfn = PFN_DOWN(__pa(&_eshared)) - 1; + unsigned long stext_pfn = PFN_DOWN(__pa(&_stext)); + + if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn) + return 1; + if (pfn >= stext_pfn && pfn <= eshared_pfn) { + if (ipl_info.type == IPL_TYPE_NSS) + return 1; + } else if ((tprot(pfn * PAGE_SIZE) && pfn > 0)) + return 1; + return 0; +} void save_processor_state(void) { diff --git a/arch/s390/power/swsusp_asm64.S b/arch/s390/kernel/swsusp_asm64.S index b26df5c5933..7cd6b096f0d 100644 --- a/arch/s390/power/swsusp_asm64.S +++ b/arch/s390/kernel/swsusp_asm64.S @@ -21,7 +21,7 @@ * This function runs with disabled interrupts. */ .section .text - .align 2 + .align 4 .globl swsusp_arch_suspend swsusp_arch_suspend: stmg %r6,%r15,__SF_GPRS(%r15) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index d4c8e9c47c8..54e327e9af0 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -60,6 +60,7 @@ #define TICK_SIZE tick u64 sched_clock_base_cc = -1; /* Force to data section. */ +EXPORT_SYMBOL_GPL(sched_clock_base_cc); static DEFINE_PER_CPU(struct clock_event_device, comparators); @@ -68,7 +69,7 @@ static DEFINE_PER_CPU(struct clock_event_device, comparators); */ unsigned long long notrace sched_clock(void) { - return ((get_clock_xt() - sched_clock_base_cc) * 125) >> 9; + return (get_clock_monotonic() * 125) >> 9; } /* diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index a53db23ee09..7315f9e67e1 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -52,55 +52,18 @@ SECTIONS . = ALIGN(PAGE_SIZE); _eshared = .; /* End of shareable data */ - . = ALIGN(16); /* Exception table */ - __ex_table : { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :data - - .data : { /* Data */ - DATA_DATA - CONSTRUCTORS - } - - . = ALIGN(PAGE_SIZE); - .data_nosave : { - __nosave_begin = .; - *(.data.nosave) - } - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : { - *(.data.idt) - } + EXCEPTION_TABLE(16) :data - . = ALIGN(0x100); - .data.cacheline_aligned : { - *(.data.cacheline_aligned) - } + RW_DATA_SECTION(0x100, PAGE_SIZE, THREAD_SIZE) - . = ALIGN(0x100); - .data.read_mostly : { - *(.data.read_mostly) - } _edata = .; /* End of data section */ - . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : { - *(.data.init_task) - } - /* will be freed after init */ . = ALIGN(PAGE_SIZE); /* Init code and data */ __init_begin = .; - .init.text : { - _sinittext = .; - INIT_TEXT - _einittext = .; - } + + INIT_TEXT_SECTION(PAGE_SIZE) + /* * .exit.text is discarded at runtime, not link time, * to deal with references from __bug_table @@ -111,49 +74,13 @@ SECTIONS /* early.c uses stsi, which requires page aligned data. */ . = ALIGN(PAGE_SIZE); - .init.data : { - INIT_DATA - } - . = ALIGN(0x100); - .init.setup : { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - SECURITY_INIT - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(0x100); - .init.ramfs : { - __initramfs_start = .; - *(.init.ramfs) - . = ALIGN(2); - __initramfs_end = .; - } -#endif + INIT_DATA_SECTION(0x100) PERCPU(PAGE_SIZE) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ - /* BSS */ - .bss : { - __bss_start = .; - *(.bss) - . = ALIGN(2); - __bss_stop = .; - } + BSS_SECTION(0, 2, 0) _end = . ; diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index db05661ac89..eec05448441 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux s390-specific parts of the memory manager. # -obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o +obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o \ + page-states.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PAGE_STATES) += page-states.o diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e5e119fe03b..1abbadd497e 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -10,6 +10,7 @@ * Copyright (C) 1995 Linus Torvalds */ +#include <linux/perf_counter.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -305,7 +306,7 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write) * interrupts again and then search the VMAs */ local_irq_enable(); - + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); down_read(&mm->mmap_sem); si_code = SEGV_MAPERR; @@ -363,11 +364,15 @@ good_area: } BUG(); } - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { tsk->min_flt++; - + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } up_read(&mm->mmap_sem); /* * The instruction that caused the program check will diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index fc0ad73ffd9..f92ec203ad9 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -1,6 +1,4 @@ /* - * arch/s390/mm/page-states.c - * * Copyright IBM Corp. 2008 * * Guest page hinting for unused pages. @@ -17,11 +15,12 @@ #define ESSA_SET_STABLE 1 #define ESSA_SET_UNUSED 2 -static int cmma_flag; +static int cmma_flag = 1; static int __init cmma(char *str) { char *parm; + parm = strstrip(str); if (strcmp(parm, "yes") == 0 || strcmp(parm, "on") == 0) { cmma_flag = 1; @@ -32,7 +31,6 @@ static int __init cmma(char *str) return 1; return 0; } - __setup("cmma=", cmma); void __init cmma_init(void) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 56566720798..c7021524707 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -78,9 +78,9 @@ unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) } page->index = page_to_phys(shadow); } - spin_lock(&mm->page_table_lock); + spin_lock(&mm->context.list_lock); list_add(&page->lru, &mm->context.crst_list); - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->context.list_lock); return (unsigned long *) page_to_phys(page); } @@ -89,9 +89,9 @@ void crst_table_free(struct mm_struct *mm, unsigned long *table) unsigned long *shadow = get_shadow_table(table); struct page *page = virt_to_page(table); - spin_lock(&mm->page_table_lock); + spin_lock(&mm->context.list_lock); list_del(&page->lru); - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->context.list_lock); if (shadow) free_pages((unsigned long) shadow, ALLOC_ORDER); free_pages((unsigned long) table, ALLOC_ORDER); @@ -182,7 +182,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) unsigned long bits; bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL; - spin_lock(&mm->page_table_lock); + spin_lock(&mm->context.list_lock); page = NULL; if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, @@ -191,7 +191,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) page = NULL; } if (!page) { - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->context.list_lock); page = alloc_page(GFP_KERNEL|__GFP_REPEAT); if (!page) return NULL; @@ -202,7 +202,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) clear_table_pgstes(table); else clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); - spin_lock(&mm->page_table_lock); + spin_lock(&mm->context.list_lock); list_add(&page->lru, &mm->context.pgtable_list); } table = (unsigned long *) page_to_phys(page); @@ -213,7 +213,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) page->flags |= bits; if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1)) list_move_tail(&page->lru, &mm->context.pgtable_list); - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->context.list_lock); return table; } @@ -225,7 +225,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL; bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - spin_lock(&mm->page_table_lock); + spin_lock(&mm->context.list_lock); page->flags ^= bits; if (page->flags & FRAG_MASK) { /* Page now has some free pgtable fragments. */ @@ -234,7 +234,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) } else /* All fragments of the 4K page have been freed. */ list_del(&page->lru); - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->context.list_lock); if (page) { pgtable_page_dtor(page); __free_page(page); @@ -245,7 +245,7 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk) { struct page *page; - spin_lock(&mm->page_table_lock); + spin_lock(&mm->context.list_lock); /* Free shadow region and segment tables. */ list_for_each_entry(page, &mm->context.crst_list, lru) if (page->index) { @@ -255,7 +255,7 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk) /* "Free" second halves of page tables. */ list_for_each_entry(page, &mm->context.pgtable_list, lru) page->flags &= ~SECOND_HALVES; - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->context.list_lock); mm->context.noexec = 0; update_mm(mm, tsk); } diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index e4868bfc672..5f91a38d759 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -331,6 +331,7 @@ void __init vmem_map_init(void) unsigned long start, end; int i; + spin_lock_init(&init_mm.context.list_lock); INIT_LIST_HEAD(&init_mm.context.crst_list); INIT_LIST_HEAD(&init_mm.context.pgtable_list); init_mm.context.noexec = 0; diff --git a/arch/s390/power/Makefile b/arch/s390/power/Makefile deleted file mode 100644 index 973bb45a8fe..00000000000 --- a/arch/s390/power/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# -# Makefile for s390 PM support -# - -obj-$(CONFIG_HIBERNATION) += suspend.o -obj-$(CONFIG_HIBERNATION) += swsusp.o -obj-$(CONFIG_HIBERNATION) += swsusp_64.o -obj-$(CONFIG_HIBERNATION) += swsusp_asm64.o diff --git a/arch/s390/power/suspend.c b/arch/s390/power/suspend.c deleted file mode 100644 index b3351eceebb..00000000000 --- a/arch/s390/power/suspend.c +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Suspend support specific for s390. - * - * Copyright IBM Corp. 2009 - * - * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> - */ - -#include <linux/mm.h> -#include <linux/suspend.h> -#include <linux/reboot.h> -#include <linux/pfn.h> -#include <asm/sections.h> -#include <asm/ipl.h> - -/* - * References to section boundaries - */ -extern const void __nosave_begin, __nosave_end; - -/* - * check if given pfn is in the 'nosave' or in the read only NSS section - */ -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) - >> PAGE_SHIFT; - unsigned long eshared_pfn = PFN_DOWN(__pa(&_eshared)) - 1; - unsigned long stext_pfn = PFN_DOWN(__pa(&_stext)); - - if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn) - return 1; - if (pfn >= stext_pfn && pfn <= eshared_pfn) { - if (ipl_info.type == IPL_TYPE_NSS) - return 1; - } else if ((tprot(pfn * PAGE_SIZE) && pfn > 0)) - return 1; - return 0; -} diff --git a/arch/s390/power/swsusp_64.c b/arch/s390/power/swsusp_64.c deleted file mode 100644 index 9516a517d72..00000000000 --- a/arch/s390/power/swsusp_64.c +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Support for suspend and resume on s390 - * - * Copyright IBM Corp. 2009 - * - * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> - * - */ - -#include <asm/system.h> -#include <linux/interrupt.h> - -void do_after_copyback(void) -{ - mb(); -} - diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index b5afbec1db5..04a21883f32 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -640,5 +640,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 0663a0ee602..9e5c9b1d7e9 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -772,5 +772,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_info if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 181d069a2d4..7ce1a1005b1 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -590,6 +590,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index ec82d76dc6f..647afbda7ae 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -613,5 +613,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } + diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 676debfc170..128111d8ffe 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,6 +20,7 @@ #include <linux/bitops.h> #include <linux/ioport.h> #include <linux/suspend.h> +#include <linux/kmemleak.h> #include <asm/e820.h> #include <asm/io.h> #include <asm/iommu.h> @@ -94,6 +95,11 @@ static u32 __init allocate_aperture(void) * code for safe */ p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(p); if (!p || __pa(p)+aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%p,%uK)\n", diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a041bcf506..fa80f60e960 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -3,6 +3,7 @@ #include <linux/dmar.h> #include <linux/bootmem.h> #include <linux/pci.h> +#include <linux/kmemleak.h> #include <asm/proto.h> #include <asm/dma.h> @@ -88,6 +89,11 @@ void __init dma32_reserve_bootmem(void) size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(dma32_bootmem_ptr); if (dma32_bootmem_ptr) dma32_bootmem_size = size; else diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94..81e58238c4c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 2c55ed09865..528bf954eb7 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -331,6 +331,20 @@ static void kmemcheck_read_strict(struct pt_regs *regs, kmemcheck_shadow_set(shadow, size); } +bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) +{ + enum kmemcheck_shadow status; + void *shadow; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return true; + + status = kmemcheck_shadow_test(shadow, size); + + return status == KMEMCHECK_SHADOW_INITIALIZED; +} + /* Access may cross page boundary */ static void kmemcheck_read(struct pt_regs *regs, unsigned long addr, unsigned int size) diff --git a/block/blk-core.c b/block/blk-core.c index e3299a77a0d..e695634882a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -501,6 +501,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.state = 0; q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; + q->backing_dev_info.name = "block"; err = bdi_init(&q->backing_dev_info); if (err) { diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 1e15889c4b9..95d344971ed 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -268,6 +268,7 @@ aoeblk_gdalloc(void *vp) if (!d->blkq) goto err_mempool; blk_queue_make_request(d->blkq, aoeblk_make_request); + d->blkq->backing_dev_info.name = "aoe"; if (bdi_init(&d->blkq->backing_dev_info)) goto err_blkq; spin_lock_irqsave(&d->lock, flags); diff --git a/drivers/char/hvc_iucv.c b/drivers/char/hvc_iucv.c index 86105efb4eb..0ecac7e532f 100644 --- a/drivers/char/hvc_iucv.c +++ b/drivers/char/hvc_iucv.c @@ -1006,7 +1006,7 @@ static int __init hvc_iucv_alloc(int id, unsigned int is_console) priv->dev->release = (void (*)(struct device *)) kfree; rc = device_register(priv->dev); if (rc) { - kfree(priv->dev); + put_device(priv->dev); goto out_error_dev; } diff --git a/drivers/char/mem.c b/drivers/char/mem.c index afa8813e737..645237bda68 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -822,6 +822,7 @@ static const struct file_operations zero_fops = { * - permits private mappings, "copies" are taken of the source of zeros */ static struct backing_dev_info zero_bdi = { + .name = "char/mem", .capabilities = BDI_CAP_MAP_COPY, }; diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index aec1931608a..0b73e4ec1ad 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -450,6 +450,12 @@ static int tpm_tis_init(struct device *dev, resource_size_t start, goto out_err; } + /* Default timeouts */ + chip->vendor.timeout_a = msecs_to_jiffies(TIS_SHORT_TIMEOUT); + chip->vendor.timeout_b = msecs_to_jiffies(TIS_LONG_TIMEOUT); + chip->vendor.timeout_c = msecs_to_jiffies(TIS_SHORT_TIMEOUT); + chip->vendor.timeout_d = msecs_to_jiffies(TIS_SHORT_TIMEOUT); + if (request_locality(chip, 0) != 0) { rc = -ENODEV; goto out_err; @@ -457,12 +463,6 @@ static int tpm_tis_init(struct device *dev, resource_size_t start, vendor = ioread32(chip->vendor.iobase + TPM_DID_VID(0)); - /* Default timeouts */ - chip->vendor.timeout_a = msecs_to_jiffies(TIS_SHORT_TIMEOUT); - chip->vendor.timeout_b = msecs_to_jiffies(TIS_LONG_TIMEOUT); - chip->vendor.timeout_c = msecs_to_jiffies(TIS_SHORT_TIMEOUT); - chip->vendor.timeout_d = msecs_to_jiffies(TIS_SHORT_TIMEOUT); - dev_info(dev, "1.2 TPM (device-id 0x%X, rev-id %d)\n", vendor >> 16, ioread8(chip->vendor.iobase + TPM_RID(0))); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 8f9509e1ebf..55d093a36ae 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -362,6 +362,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) * In either case, must tell the provider to reject. */ cm_id_priv->state = IW_CM_STATE_DESTROYING; + cm_id->device->iwcm->reject(cm_id, NULL, 0); break; case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_DESTROYING: diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index de922a04ca2..7522008fda8 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2,6 +2,7 @@ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -45,14 +46,21 @@ MODULE_DESCRIPTION("kernel IB MAD API"); MODULE_AUTHOR("Hal Rosenstock"); MODULE_AUTHOR("Sean Hefty"); +int mad_sendq_size = IB_MAD_QP_SEND_SIZE; +int mad_recvq_size = IB_MAD_QP_RECV_SIZE; + +module_param_named(send_queue_size, mad_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests"); +module_param_named(recv_queue_size, mad_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); + static struct kmem_cache *ib_mad_cache; static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; /* Port list lock */ -static spinlock_t ib_mad_port_list_lock; - +static DEFINE_SPINLOCK(ib_mad_port_list_lock); /* Forward declarations */ static int method_in_use(struct ib_mad_mgmt_method_table **method, @@ -1974,7 +1982,7 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) unsigned long delay; if (list_empty(&mad_agent_priv->wait_list)) { - cancel_delayed_work(&mad_agent_priv->timed_work); + __cancel_delayed_work(&mad_agent_priv->timed_work); } else { mad_send_wr = list_entry(mad_agent_priv->wait_list.next, struct ib_mad_send_wr_private, @@ -1983,7 +1991,7 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) if (time_after(mad_agent_priv->timeout, mad_send_wr->timeout)) { mad_agent_priv->timeout = mad_send_wr->timeout; - cancel_delayed_work(&mad_agent_priv->timed_work); + __cancel_delayed_work(&mad_agent_priv->timed_work); delay = mad_send_wr->timeout - jiffies; if ((long)delay <= 0) delay = 1; @@ -2023,7 +2031,7 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) /* Reschedule a work item if we have a shorter timeout */ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) { - cancel_delayed_work(&mad_agent_priv->timed_work); + __cancel_delayed_work(&mad_agent_priv->timed_work); queue_delayed_work(mad_agent_priv->qp_info->port_priv->wq, &mad_agent_priv->timed_work, delay); } @@ -2736,8 +2744,8 @@ static int create_mad_qp(struct ib_mad_qp_info *qp_info, qp_init_attr.send_cq = qp_info->port_priv->cq; qp_init_attr.recv_cq = qp_info->port_priv->cq; qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; - qp_init_attr.cap.max_send_wr = IB_MAD_QP_SEND_SIZE; - qp_init_attr.cap.max_recv_wr = IB_MAD_QP_RECV_SIZE; + qp_init_attr.cap.max_send_wr = mad_sendq_size; + qp_init_attr.cap.max_recv_wr = mad_recvq_size; qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG; qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG; qp_init_attr.qp_type = qp_type; @@ -2752,8 +2760,8 @@ static int create_mad_qp(struct ib_mad_qp_info *qp_info, goto error; } /* Use minimum queue sizes unless the CQ is resized */ - qp_info->send_queue.max_active = IB_MAD_QP_SEND_SIZE; - qp_info->recv_queue.max_active = IB_MAD_QP_RECV_SIZE; + qp_info->send_queue.max_active = mad_sendq_size; + qp_info->recv_queue.max_active = mad_recvq_size; return 0; error: @@ -2792,7 +2800,7 @@ static int ib_mad_port_open(struct ib_device *device, init_mad_qp(port_priv, &port_priv->qp_info[0]); init_mad_qp(port_priv, &port_priv->qp_info[1]); - cq_size = (IB_MAD_QP_SEND_SIZE + IB_MAD_QP_RECV_SIZE) * 2; + cq_size = (mad_sendq_size + mad_recvq_size) * 2; port_priv->cq = ib_create_cq(port_priv->device, ib_mad_thread_completion_handler, NULL, port_priv, cq_size, 0); @@ -2984,7 +2992,11 @@ static int __init ib_mad_init_module(void) { int ret; - spin_lock_init(&ib_mad_port_list_lock); + mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); + mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); + + mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); + mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); ib_mad_cache = kmem_cache_create("ib_mad", sizeof(struct ib_mad_private), @@ -3021,4 +3033,3 @@ static void __exit ib_mad_cleanup_module(void) module_init(ib_mad_init_module); module_exit(ib_mad_cleanup_module); - diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 05ce331733b..9430ab4969c 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -2,6 +2,7 @@ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -49,6 +50,8 @@ /* QP and CQ parameters */ #define IB_MAD_QP_SEND_SIZE 128 #define IB_MAD_QP_RECV_SIZE 512 +#define IB_MAD_QP_MIN_SIZE 64 +#define IB_MAD_QP_MAX_SIZE 8192 #define IB_MAD_SEND_REQ_MAX_SG 2 #define IB_MAD_RECV_REQ_MAX_SG 1 diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 107f170c57c..8d82ba17135 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -106,6 +106,8 @@ struct mcast_group { struct ib_sa_query *query; int query_id; u16 pkey_index; + u8 leave_state; + int retries; }; struct mcast_member { @@ -350,6 +352,7 @@ static int send_leave(struct mcast_group *group, u8 leave_state) rec = group->rec; rec.join_state = leave_state; + group->leave_state = leave_state; ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, port->port_num, IB_SA_METHOD_DELETE, &rec, @@ -542,7 +545,11 @@ static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, { struct mcast_group *group = context; - mcast_work_handler(&group->work); + if (status && group->retries > 0 && + !send_leave(group, group->leave_state)) + group->retries--; + else + mcast_work_handler(&group->work); } static struct mcast_group *acquire_group(struct mcast_port *port, @@ -565,6 +572,7 @@ static struct mcast_group *acquire_group(struct mcast_port *port, if (!group) return NULL; + group->retries = 3; group->port = port; group->rec.mgid = *mgid; group->pkey_index = MCAST_INVALID_PKEY_INDEX; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 1865049e80f..82543716d59 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -109,10 +109,10 @@ static struct ib_client sa_client = { .remove = ib_sa_remove_one }; -static spinlock_t idr_lock; +static DEFINE_SPINLOCK(idr_lock); static DEFINE_IDR(query_idr); -static spinlock_t tid_lock; +static DEFINE_SPINLOCK(tid_lock); static u32 tid; #define PATH_REC_FIELD(field) \ @@ -1077,9 +1077,6 @@ static int __init ib_sa_init(void) { int ret; - spin_lock_init(&idr_lock); - spin_lock_init(&tid_lock); - get_random_bytes(&tid, sizeof tid); ret = ib_register_client(&sa_client); diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c index 87236753bce..5855e4405d9 100644 --- a/drivers/infiniband/core/smi.c +++ b/drivers/infiniband/core/smi.c @@ -52,6 +52,10 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, hop_cnt = smp->hop_cnt; /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + if (!ib_get_smp_direction(smp)) { /* C14-9:1 */ if (hop_cnt && hop_ptr == 0) { @@ -133,6 +137,10 @@ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, hop_cnt = smp->hop_cnt; /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + if (!ib_get_smp_direction(smp)) { /* C14-9:1 -- sender should have incremented hop_ptr */ if (hop_cnt && hop_ptr == 0) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index eb36a81dd09..d3fff9e008a 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -73,7 +73,7 @@ DEFINE_IDR(ib_uverbs_cq_idr); DEFINE_IDR(ib_uverbs_qp_idr); DEFINE_IDR(ib_uverbs_srq_idr); -static spinlock_t map_lock; +static DEFINE_SPINLOCK(map_lock); static struct ib_uverbs_device *dev_table[IB_UVERBS_MAX_DEVICES]; static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); @@ -584,14 +584,16 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (hdr.command < 0 || hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[hdr.command] || - !(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) + !uverbs_cmd_table[hdr.command]) return -EINVAL; if (!file->ucontext && hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT) return -EINVAL; + if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) + return -ENOSYS; + return uverbs_cmd_table[hdr.command](file, buf + sizeof hdr, hdr.in_words * 4, hdr.out_words * 4); } @@ -836,8 +838,6 @@ static int __init ib_uverbs_init(void) { int ret; - spin_lock_init(&map_lock); - ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c index 0cfbb6d2f76..8250740c94b 100644 --- a/drivers/infiniband/hw/amso1100/c2.c +++ b/drivers/infiniband/hw/amso1100/c2.c @@ -86,11 +86,7 @@ MODULE_DEVICE_TABLE(pci, c2_pci_table); static void c2_print_macaddr(struct net_device *netdev) { - pr_debug("%s: MAC %02X:%02X:%02X:%02X:%02X:%02X, " - "IRQ %u\n", netdev->name, - netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2], - netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5], - netdev->irq); + pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq); } static void c2_set_rxbufsize(struct c2_port *c2_port) diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index f1948fad85d..ad723bd8bf4 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -780,11 +780,11 @@ int c2_register_device(struct c2_dev *dev) /* Register pseudo network device */ dev->pseudo_netdev = c2_pseudo_netdev_init(dev); if (!dev->pseudo_netdev) - goto out3; + goto out; ret = register_netdev(dev->pseudo_netdev); if (ret) - goto out2; + goto out_free_netdev; pr_debug("%s:%u\n", __func__, __LINE__); strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX); @@ -851,6 +851,10 @@ int c2_register_device(struct c2_dev *dev) dev->ibdev.post_recv = c2_post_receive; dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL); + if (dev->ibdev.iwcm == NULL) { + ret = -ENOMEM; + goto out_unregister_netdev; + } dev->ibdev.iwcm->add_ref = c2_add_ref; dev->ibdev.iwcm->rem_ref = c2_rem_ref; dev->ibdev.iwcm->get_qp = c2_get_qp; @@ -862,23 +866,25 @@ int c2_register_device(struct c2_dev *dev) ret = ib_register_device(&dev->ibdev); if (ret) - goto out1; + goto out_free_iwcm; for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) { ret = device_create_file(&dev->ibdev.dev, c2_dev_attributes[i]); if (ret) - goto out0; + goto out_unregister_ibdev; } - goto out3; + goto out; -out0: +out_unregister_ibdev: ib_unregister_device(&dev->ibdev); -out1: +out_free_iwcm: + kfree(dev->ibdev.iwcm); +out_unregister_netdev: unregister_netdev(dev->pseudo_netdev); -out2: +out_free_netdev: free_netdev(dev->pseudo_netdev); -out3: +out: pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret); return ret; } diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index 62f9cf2f94e..72ed3396b72 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -852,7 +852,9 @@ int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) wqe->qpcaps = attr->qpcaps; wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss); wqe->rqe_count = cpu_to_be16(attr->rqe_count); - wqe->flags_rtr_type = cpu_to_be16(attr->flags|V_RTR_TYPE(attr->rtr_type)); + wqe->flags_rtr_type = cpu_to_be16(attr->flags | + V_RTR_TYPE(attr->rtr_type) | + V_CHAN(attr->chan)); wqe->ord = cpu_to_be32(attr->ord); wqe->ird = cpu_to_be32(attr->ird); wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr); @@ -1032,6 +1034,7 @@ err3: err2: cxio_hal_destroy_ctrl_qp(rdev_p); err1: + rdev_p->t3cdev_p->ulp = NULL; list_del(&rdev_p->entry); return err; } diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h index 32e3b1461d8..a197a5b7ac7 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_wr.h +++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h @@ -327,6 +327,11 @@ enum rdma_init_rtr_types { #define V_RTR_TYPE(x) ((x) << S_RTR_TYPE) #define G_RTR_TYPE(x) ((((x) >> S_RTR_TYPE)) & M_RTR_TYPE) +#define S_CHAN 4 +#define M_CHAN 0x3 +#define V_CHAN(x) ((x) << S_CHAN) +#define G_CHAN(x) ((((x) >> S_CHAN)) & M_CHAN) + struct t3_rdma_init_attr { u32 tid; u32 qpid; @@ -346,6 +351,7 @@ struct t3_rdma_init_attr { u16 flags; u16 rqe_count; u32 irs; + u32 chan; }; struct t3_rdma_init_wr { diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c index 26fc0a4eaa7..b0ea0105ddf 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.c +++ b/drivers/infiniband/hw/cxgb3/iwch.c @@ -51,7 +51,7 @@ cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; static void open_rnic_dev(struct t3cdev *); static void close_rnic_dev(struct t3cdev *); -static void iwch_err_handler(struct t3cdev *, u32, u32); +static void iwch_event_handler(struct t3cdev *, u32, u32); struct cxgb3_client t3c_client = { .name = "iw_cxgb3", @@ -59,7 +59,7 @@ struct cxgb3_client t3c_client = { .remove = close_rnic_dev, .handlers = t3c_handlers, .redirect = iwch_ep_redirect, - .err_handler = iwch_err_handler + .event_handler = iwch_event_handler }; static LIST_HEAD(dev_list); @@ -105,11 +105,9 @@ static void rnic_init(struct iwch_dev *rnicp) static void open_rnic_dev(struct t3cdev *tdev) { struct iwch_dev *rnicp; - static int vers_printed; PDBG("%s t3cdev %p\n", __func__, tdev); - if (!vers_printed++) - printk(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n", + printk_once(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n", DRV_VERSION); rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); if (!rnicp) { @@ -162,21 +160,36 @@ static void close_rnic_dev(struct t3cdev *tdev) mutex_unlock(&dev_mutex); } -static void iwch_err_handler(struct t3cdev *tdev, u32 status, u32 error) +static void iwch_event_handler(struct t3cdev *tdev, u32 evt, u32 port_id) { struct cxio_rdev *rdev = tdev->ulp; - struct iwch_dev *rnicp = rdev_to_iwch_dev(rdev); + struct iwch_dev *rnicp; struct ib_event event; + u32 portnum = port_id + 1; - if (status == OFFLOAD_STATUS_DOWN) { + if (!rdev) + return; + rnicp = rdev_to_iwch_dev(rdev); + switch (evt) { + case OFFLOAD_STATUS_DOWN: { rdev->flags = CXIO_ERROR_FATAL; - - event.device = &rnicp->ibdev; event.event = IB_EVENT_DEVICE_FATAL; - event.element.port_num = 0; - ib_dispatch_event(&event); + break; + } + case OFFLOAD_PORT_DOWN: { + event.event = IB_EVENT_PORT_ERR; + break; + } + case OFFLOAD_PORT_UP: { + event.event = IB_EVENT_PORT_ACTIVE; + break; + } } + event.device = &rnicp->ibdev; + event.element.port_num = portnum; + ib_dispatch_event(&event); + return; } diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 52d7bb0c2a1..66b41351910 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -286,7 +286,7 @@ void __free_ep(struct kref *kref) ep = container_of(container_of(kref, struct iwch_ep_common, kref), struct iwch_ep, com); PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]); - if (ep->com.flags & RELEASE_RESOURCES) { + if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) { cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid); dst_release(ep->dst); l2t_release(L2DATA(ep->com.tdev), ep->l2t); @@ -297,7 +297,7 @@ void __free_ep(struct kref *kref) static void release_ep_resources(struct iwch_ep *ep) { PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid); - ep->com.flags |= RELEASE_RESOURCES; + set_bit(RELEASE_RESOURCES, &ep->com.flags); put_ep(&ep->com); } @@ -786,10 +786,12 @@ static void connect_request_upcall(struct iwch_ep *ep) event.private_data_len = ep->plen; event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); event.provider_data = ep; - if (state_read(&ep->parent_ep->com) != DEAD) + if (state_read(&ep->parent_ep->com) != DEAD) { + get_ep(&ep->com); ep->parent_ep->com.cm_id->event_handler( ep->parent_ep->com.cm_id, &event); + } put_ep(&ep->parent_ep->com); ep->parent_ep = NULL; } @@ -1156,8 +1158,7 @@ static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) * We get 2 abort replies from the HW. The first one must * be ignored except for scribbling that we need one more. */ - if (!(ep->com.flags & ABORT_REQ_IN_PROGRESS)) { - ep->com.flags |= ABORT_REQ_IN_PROGRESS; + if (!test_and_set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags)) { return CPL_RET_BUF_DONE; } @@ -1477,10 +1478,14 @@ static int peer_close(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) /* * We're gonna mark this puppy DEAD, but keep * the reference on it until the ULP accepts or - * rejects the CR. + * rejects the CR. Also wake up anyone waiting + * in rdma connection migration (see iwch_accept_cr()). */ __state_set(&ep->com, CLOSING); - get_ep(&ep->com); + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); break; case MPA_REP_SENT: __state_set(&ep->com, CLOSING); @@ -1561,8 +1566,7 @@ static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) * We get 2 peer aborts from the HW. The first one must * be ignored except for scribbling that we need one more. */ - if (!(ep->com.flags & PEER_ABORT_IN_PROGRESS)) { - ep->com.flags |= PEER_ABORT_IN_PROGRESS; + if (!test_and_set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags)) { return CPL_RET_BUF_DONE; } @@ -1589,9 +1593,13 @@ static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) /* * We're gonna mark this puppy DEAD, but keep * the reference on it until the ULP accepts or - * rejects the CR. + * rejects the CR. Also wake up anyone waiting + * in rdma connection migration (see iwch_accept_cr()). */ - get_ep(&ep->com); + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); break; case MORIBUND: case CLOSING: @@ -1797,6 +1805,7 @@ int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) err = send_mpa_reject(ep, pdata, pdata_len); err = iwch_ep_disconnect(ep, 0, GFP_KERNEL); } + put_ep(&ep->com); return 0; } @@ -1810,8 +1819,10 @@ int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct iwch_qp *qp = get_qhp(h, conn_param->qpn); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - if (state_read(&ep->com) == DEAD) - return -ECONNRESET; + if (state_read(&ep->com) == DEAD) { + err = -ECONNRESET; + goto err; + } BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); BUG_ON(!qp); @@ -1819,15 +1830,14 @@ int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { abort_connection(ep, NULL, GFP_KERNEL); - return -EINVAL; + err = -EINVAL; + goto err; } cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; ep->com.qp = qp; - ep->com.rpl_done = 0; - ep->com.rpl_err = 0; ep->ird = conn_param->ird; ep->ord = conn_param->ord; @@ -1836,8 +1846,6 @@ int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord); - get_ep(&ep->com); - /* bind QP to EP and move to RTS */ attrs.mpa_attr = ep->mpa_attr; attrs.max_ird = ep->ird; @@ -1855,30 +1863,31 @@ int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) err = iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (err) - goto err; + goto err1; /* if needed, wait for wr_ack */ if (iwch_rqes_posted(qp)) { wait_event(ep->com.waitq, ep->com.rpl_done); err = ep->com.rpl_err; if (err) - goto err; + goto err1; } err = send_mpa_reply(ep, conn_param->private_data, conn_param->private_data_len); if (err) - goto err; + goto err1; state_set(&ep->com, FPDU_MODE); established_upcall(ep); put_ep(&ep->com); return 0; -err: +err1: ep->com.cm_id = NULL; ep->com.qp = NULL; cm_id->rem_ref(cm_id); +err: put_ep(&ep->com); return err; } @@ -2097,14 +2106,17 @@ int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp) ep->com.state = CLOSING; start_ep_timer(ep); } + set_bit(CLOSE_SENT, &ep->com.flags); break; case CLOSING: - close = 1; - if (abrupt) { - stop_ep_timer(ep); - ep->com.state = ABORTING; - } else - ep->com.state = MORIBUND; + if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { + close = 1; + if (abrupt) { + stop_ep_timer(ep); + ep->com.state = ABORTING; + } else + ep->com.state = MORIBUND; + } break; case MORIBUND: case ABORTING: diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h index 43c0aea7ead..b9efadfffb4 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.h +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h @@ -145,9 +145,10 @@ enum iwch_ep_state { }; enum iwch_ep_flags { - PEER_ABORT_IN_PROGRESS = (1 << 0), - ABORT_REQ_IN_PROGRESS = (1 << 1), - RELEASE_RESOURCES = (1 << 2), + PEER_ABORT_IN_PROGRESS = 0, + ABORT_REQ_IN_PROGRESS = 1, + RELEASE_RESOURCES = 2, + CLOSE_SENT = 3, }; struct iwch_ep_common { @@ -162,7 +163,7 @@ struct iwch_ep_common { wait_queue_head_t waitq; int rpl_done; int rpl_err; - u32 flags; + unsigned long flags; }; struct iwch_listen_ep { diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c index ec49a5cbdeb..e1ec65ebb01 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_mem.c +++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -39,7 +39,7 @@ #include "iwch.h" #include "iwch_provider.h" -static void iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag) +static int iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag) { u32 mmid; @@ -47,14 +47,15 @@ static void iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag) mhp->attr.stag = stag; mmid = stag >> 8; mhp->ibmr.rkey = mhp->ibmr.lkey = stag; - insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); } int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift) { u32 stag; + int ret; if (cxio_register_phys_mem(&rhp->rdev, &stag, mhp->attr.pdid, @@ -66,9 +67,11 @@ int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, mhp->attr.pbl_size, mhp->attr.pbl_addr)) return -ENOMEM; - iwch_finish_mem_reg(mhp, stag); - - return 0; + ret = iwch_finish_mem_reg(mhp, stag); + if (ret) + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + return ret; } int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, @@ -77,6 +80,7 @@ int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, int npages) { u32 stag; + int ret; /* We could support this... */ if (npages > mhp->attr.pbl_size) @@ -93,9 +97,12 @@ int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, mhp->attr.pbl_size, mhp->attr.pbl_addr)) return -ENOMEM; - iwch_finish_mem_reg(mhp, stag); + ret = iwch_finish_mem_reg(mhp, stag); + if (ret) + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); - return 0; + return ret; } int iwch_alloc_pbl(struct iwch_mr *mhp, int npages) diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index e2a63214008..6895523779d 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -195,7 +195,11 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, int ve spin_lock_init(&chp->lock); atomic_set(&chp->refcnt, 1); init_waitqueue_head(&chp->wait); - insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); + if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) { + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + kfree(chp); + return ERR_PTR(-ENOMEM); + } if (ucontext) { struct iwch_mm_entry *mm; @@ -750,7 +754,11 @@ static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd) mhp->attr.stag = stag; mmid = (stag) >> 8; mhp->ibmw.rkey = stag; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + kfree(mhp); + return ERR_PTR(-ENOMEM); + } PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); return &(mhp->ibmw); } @@ -778,37 +786,43 @@ static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth) struct iwch_mr *mhp; u32 mmid; u32 stag = 0; - int ret; + int ret = 0; php = to_iwch_pd(pd); rhp = php->rhp; mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); if (!mhp) - return ERR_PTR(-ENOMEM); + goto err; mhp->rhp = rhp; ret = iwch_alloc_pbl(mhp, pbl_depth); - if (ret) { - kfree(mhp); - return ERR_PTR(ret); - } + if (ret) + goto err1; mhp->attr.pbl_size = pbl_depth; ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid, mhp->attr.pbl_size, mhp->attr.pbl_addr); - if (ret) { - iwch_free_pbl(mhp); - kfree(mhp); - return ERR_PTR(ret); - } + if (ret) + goto err2; mhp->attr.pdid = php->pdid; mhp->attr.type = TPT_NON_SHARED_MR; mhp->attr.stag = stag; mhp->attr.state = 1; mmid = (stag) >> 8; mhp->ibmr.rkey = mhp->ibmr.lkey = stag; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) + goto err3; + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); return &(mhp->ibmr); +err3: + cxio_dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); +err2: + iwch_free_pbl(mhp); +err1: + kfree(mhp); +err: + return ERR_PTR(ret); } static struct ib_fast_reg_page_list *iwch_alloc_fastreg_pbl( @@ -961,7 +975,13 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd, spin_lock_init(&qhp->lock); init_waitqueue_head(&qhp->wait); atomic_set(&qhp->refcnt, 1); - insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid); + + if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) { + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + kfree(qhp); + return ERR_PTR(-ENOMEM); + } if (udata) { @@ -1418,6 +1438,7 @@ int iwch_register_device(struct iwch_dev *dev) bail2: ib_unregister_device(&dev->ibdev); bail1: + kfree(dev->ibdev.iwcm); return ret; } @@ -1430,5 +1451,6 @@ void iwch_unregister_device(struct iwch_dev *dev) device_remove_file(&dev->ibdev.dev, iwch_class_attributes[i]); ib_unregister_device(&dev->ibdev); + kfree(dev->ibdev.iwcm); return; } diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index 27bbdc8e773..6e865347194 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -889,6 +889,7 @@ static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); init_attr.rqe_count = iwch_rqes_posted(qhp); init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0; + init_attr.chan = qhp->ep->l2t->smt_idx; if (peer2peer) { init_attr.rtr_type = RTR_READ; if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator) diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index fab18a2c74a..5b635aa5947 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -52,7 +52,7 @@ #include "ehca_tools.h" #include "hcp_if.h" -#define HCAD_VERSION "0028" +#define HCAD_VERSION "0029" MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>"); @@ -64,7 +64,7 @@ static int ehca_hw_level = 0; static int ehca_poll_all_eqs = 1; int ehca_debug_level = 0; -int ehca_nr_ports = 2; +int ehca_nr_ports = -1; int ehca_use_hp_mr = 0; int ehca_port_act_time = 30; int ehca_static_rate = -1; @@ -95,8 +95,8 @@ MODULE_PARM_DESC(hw_level, "Hardware level (0: autosensing (default), " "0x10..0x14: eHCA, 0x20..0x23: eHCA2)"); MODULE_PARM_DESC(nr_ports, - "number of connected ports (-1: autodetect, 1: port one only, " - "2: two ports (default)"); + "number of connected ports (-1: autodetect (default), " + "1: port one only, 2: two ports)"); MODULE_PARM_DESC(use_hp_mr, "Use high performance MRs (default: no)"); MODULE_PARM_DESC(port_act_time, diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c index 5a3d96f84c7..8fd88cd828f 100644 --- a/drivers/infiniband/hw/ehca/ehca_reqs.c +++ b/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -786,7 +786,11 @@ repoll: wc->slid = cqe->rlid; wc->dlid_path_bits = cqe->dlid; wc->src_qp = cqe->remote_qp_number; - wc->wc_flags = cqe->w_completion_flags; + /* + * HW has "Immed data present" and "GRH present" in bits 6 and 5. + * SW defines those in bits 1 and 0, so we can just shift and mask. + */ + wc->wc_flags = (cqe->w_completion_flags >> 5) & 3; wc->ex.imm_data = cpu_to_be32(cqe->immediate_data); wc->sl = cqe->service_level; diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c index c568b28f4e2..8c1213f8916 100644 --- a/drivers/infiniband/hw/ehca/ehca_sqp.c +++ b/drivers/infiniband/hw/ehca/ehca_sqp.c @@ -125,14 +125,30 @@ struct ib_perf { u8 data[192]; } __attribute__ ((packed)); +/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */ +struct tcslfl { + u32 tc:8; + u32 sl:4; + u32 fl:20; +} __attribute__ ((packed)); + +/* IP Version/TC/FL packed into 32 bits, as in GRH */ +struct vertcfl { + u32 ver:4; + u32 tc:8; + u32 fl:20; +} __attribute__ ((packed)); static int ehca_process_perf(struct ib_device *ibdev, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) { struct ib_perf *in_perf = (struct ib_perf *)in_mad; struct ib_perf *out_perf = (struct ib_perf *)out_mad; struct ib_class_port_info *poi = (struct ib_class_port_info *)out_perf->data; + struct tcslfl *tcslfl = + (struct tcslfl *)&poi->redirect_tcslfl; struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, ib_device); struct ehca_sport *sport = &shca->sport[port_num - 1]; @@ -158,10 +174,29 @@ static int ehca_process_perf(struct ib_device *ibdev, u8 port_num, poi->base_version = 1; poi->class_version = 1; poi->resp_time_value = 18; - poi->redirect_lid = sport->saved_attr.lid; - poi->redirect_qp = sport->pma_qp_nr; + + /* copy local routing information from WC where applicable */ + tcslfl->sl = in_wc->sl; + poi->redirect_lid = + sport->saved_attr.lid | in_wc->dlid_path_bits; + poi->redirect_qp = sport->pma_qp_nr; poi->redirect_qkey = IB_QP1_QKEY; - poi->redirect_pkey = IB_DEFAULT_PKEY_FULL; + + ehca_query_pkey(ibdev, port_num, in_wc->pkey_index, + &poi->redirect_pkey); + + /* if request was globally routed, copy route info */ + if (in_grh) { + struct vertcfl *vertcfl = + (struct vertcfl *)&in_grh->version_tclass_flow; + memcpy(poi->redirect_gid, in_grh->dgid.raw, + sizeof(poi->redirect_gid)); + tcslfl->tc = vertcfl->tc; + tcslfl->fl = vertcfl->fl; + } else + /* else only fill in default GID */ + ehca_query_gid(ibdev, port_num, 0, + (union ib_gid *)&poi->redirect_gid); ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x", sport->saved_attr.lid, sport->pma_qp_nr); @@ -183,8 +218,7 @@ perf_reply: int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, - struct ib_mad *in_mad, - struct ib_mad *out_mad) + struct ib_mad *in_mad, struct ib_mad *out_mad) { int ret; @@ -196,7 +230,8 @@ int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_SUCCESS; ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp); - ret = ehca_process_perf(ibdev, port_num, in_mad, out_mad); + ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh, + in_mad, out_mad); return ret; } diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 23173982b32..38a28700661 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -1616,7 +1616,7 @@ static int try_alloc_port(struct ipath_devdata *dd, int port, pd->port_cnt = 1; port_fp(fp) = pd; pd->port_pid = get_pid(task_pid(current)); - strncpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); + strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); ipath_stats.sps_ports++; ret = 0; } else diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index 16a702d4601..ceb98ee7866 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -60,7 +60,7 @@ static int recv_subn_get_nodedescription(struct ib_smp *smp, if (smp->attr_mod) smp->status |= IB_SMP_INVALID_FIELD; - strncpy(smp->data, ibdev->node_desc, sizeof(smp->data)); + memcpy(smp->data, ibdev->node_desc, sizeof(smp->data)); return reply(smp); } diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ae3d7590346..3cb3f47a10b 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -342,6 +342,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, struct mlx4_ib_alloc_ucontext_resp resp; int err; + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + resp.qp_tab_size = dev->dev->caps.num_qps; resp.bf_reg_size = dev->dev->caps.bf_reg_size; resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; @@ -540,15 +543,11 @@ static struct device_attribute *mlx4_class_attributes[] = { static void *mlx4_ib_add(struct mlx4_dev *dev) { - static int mlx4_ib_version_printed; struct mlx4_ib_dev *ibdev; int num_ports = 0; int i; - if (!mlx4_ib_version_printed) { - printk(KERN_INFO "%s", mlx4_ib_version); - ++mlx4_ib_version_printed; - } + printk_once(KERN_INFO "%s", mlx4_ib_version); mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) num_ports++; @@ -673,6 +672,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_reg; } + ibdev->ib_active = true; + return ibdev; err_reg: @@ -729,6 +730,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: + ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; break; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 8a7dd6795fa..3486d7675e5 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -175,6 +175,7 @@ struct mlx4_ib_dev { spinlock_t sm_lock; struct mutex cap_mask_mutex; + bool ib_active; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index c4a02648c8a..219b10397b4 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -615,10 +615,12 @@ static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) } static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { - if (send_cq == recv_cq) + if (send_cq == recv_cq) { spin_lock_irq(&send_cq->lock); - else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + __acquire(&recv_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_lock_irq(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { @@ -628,10 +630,12 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv } static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) { - if (send_cq == recv_cq) + if (send_cq == recv_cq) { + __release(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); - else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); } else { diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c index 65ad359fdf1..056b2a4c697 100644 --- a/drivers/infiniband/hw/mthca/mthca_catas.c +++ b/drivers/infiniband/hw/mthca/mthca_catas.c @@ -88,6 +88,7 @@ static void handle_catas(struct mthca_dev *dev) event.device = &dev->ib_dev; event.event = IB_EVENT_DEVICE_FATAL; event.element.port_num = 0; + dev->active = false; ib_dispatch_event(&event); diff --git a/drivers/infiniband/hw/mthca/mthca_config_reg.h b/drivers/infiniband/hw/mthca/mthca_config_reg.h index 75671f75cac..155bc66395b 100644 --- a/drivers/infiniband/hw/mthca/mthca_config_reg.h +++ b/drivers/infiniband/hw/mthca/mthca_config_reg.h @@ -34,8 +34,6 @@ #ifndef MTHCA_CONFIG_REG_H #define MTHCA_CONFIG_REG_H -#include <asm/page.h> - #define MTHCA_HCR_BASE 0x80680 #define MTHCA_HCR_SIZE 0x0001c #define MTHCA_ECR_BASE 0x80700 diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 9ef611f6dd3..7e6a6d64ad4 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -357,6 +357,7 @@ struct mthca_dev { struct ib_ah *sm_ah[MTHCA_MAX_PORTS]; spinlock_t sm_lock; u8 rate[MTHCA_MAX_PORTS]; + bool active; }; #ifdef CONFIG_INFINIBAND_MTHCA_DEBUG diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c index 90e4e450a12..8c31fa36e95 100644 --- a/drivers/infiniband/hw/mthca/mthca_eq.c +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -829,27 +829,34 @@ int mthca_init_eq_table(struct mthca_dev *dev) if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { static const char *eq_name[] = { - [MTHCA_EQ_COMP] = DRV_NAME " (comp)", - [MTHCA_EQ_ASYNC] = DRV_NAME " (async)", - [MTHCA_EQ_CMD] = DRV_NAME " (cmd)" + [MTHCA_EQ_COMP] = DRV_NAME "-comp", + [MTHCA_EQ_ASYNC] = DRV_NAME "-async", + [MTHCA_EQ_CMD] = DRV_NAME "-cmd" }; for (i = 0; i < MTHCA_NUM_EQ; ++i) { + snprintf(dev->eq_table.eq[i].irq_name, + IB_DEVICE_NAME_MAX, + "%s@pci:%s", eq_name[i], + pci_name(dev->pdev)); err = request_irq(dev->eq_table.eq[i].msi_x_vector, mthca_is_memfree(dev) ? mthca_arbel_msi_x_interrupt : mthca_tavor_msi_x_interrupt, - 0, eq_name[i], dev->eq_table.eq + i); + 0, dev->eq_table.eq[i].irq_name, + dev->eq_table.eq + i); if (err) goto err_out_cmd; dev->eq_table.eq[i].have_irq = 1; } } else { + snprintf(dev->eq_table.eq[0].irq_name, IB_DEVICE_NAME_MAX, + DRV_NAME "@pci:%s", pci_name(dev->pdev)); err = request_irq(dev->pdev->irq, mthca_is_memfree(dev) ? mthca_arbel_interrupt : mthca_tavor_interrupt, - IRQF_SHARED, DRV_NAME, dev); + IRQF_SHARED, dev->eq_table.eq[0].irq_name, dev); if (err) goto err_out_cmd; dev->eq_table.have_irq = 1; diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 13da9f1d24c..b01b2898787 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -1116,6 +1116,8 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) pci_set_drvdata(pdev, mdev); mdev->hca_type = hca_type; + mdev->active = true; + return 0; err_unregister: @@ -1215,15 +1217,11 @@ int __mthca_restart_one(struct pci_dev *pdev) static int __devinit mthca_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { - static int mthca_version_printed = 0; int ret; mutex_lock(&mthca_device_mutex); - if (!mthca_version_printed) { - printk(KERN_INFO "%s", mthca_version); - ++mthca_version_printed; - } + printk_once(KERN_INFO "%s", mthca_version); if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { printk(KERN_ERR PFX "%s has invalid driver data %lx\n", diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 87ad889e367..bcf7a401482 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -334,6 +334,9 @@ static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev, struct mthca_ucontext *context; int err; + if (!(to_mdev(ibdev)->active)) + return ERR_PTR(-EAGAIN); + memset(&uresp, 0, sizeof uresp); uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index c621f8794b8..90f4c4d2e98 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -113,6 +113,7 @@ struct mthca_eq { int nent; struct mthca_buf_list *page_list; struct mthca_mr mr; + char irq_name[IB_DEVICE_NAME_MAX]; }; struct mthca_av; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index f5081bfde6d..c10576fa60c 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1319,10 +1319,12 @@ int mthca_alloc_qp(struct mthca_dev *dev, } static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { - if (send_cq == recv_cq) + if (send_cq == recv_cq) { spin_lock_irq(&send_cq->lock); - else if (send_cq->cqn < recv_cq->cqn) { + __acquire(&recv_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { spin_lock_irq(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { @@ -1332,10 +1334,12 @@ static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) } static void mthca_unlock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) { - if (send_cq == recv_cq) + if (send_cq == recv_cq) { + __release(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); - else if (send_cq->cqn < recv_cq->cqn) { + } else if (send_cq->cqn < recv_cq->cqn) { spin_unlock(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); } else { diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c index acb6817f606..2a13a163d33 100644 --- a/drivers/infiniband/hw/mthca/mthca_reset.c +++ b/drivers/infiniband/hw/mthca/mthca_reset.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include <linux/init.h> #include <linux/errno.h> #include <linux/pci.h> #include <linux/delay.h> diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index bf1720f7f35..bcc6abc4faf 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -523,7 +523,7 @@ int nes_cm_disconn(struct nes_qp *); void nes_cm_disconn_worker(void *); /* nes_verbs.c */ -int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32); +int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32, u32); int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *); struct nes_ib_device *nes_init_ofa_device(struct net_device *); void nes_destroy_ofa_device(struct nes_ib_device *); diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 114b802771a..73473db1986 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -2450,19 +2450,16 @@ static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_nod */ int nes_cm_disconn(struct nes_qp *nesqp) { - unsigned long flags; - - spin_lock_irqsave(&nesqp->lock, flags); - if (nesqp->disconn_pending == 0) { - nesqp->disconn_pending++; - spin_unlock_irqrestore(&nesqp->lock, flags); - /* init our disconnect work element, to */ - INIT_WORK(&nesqp->disconn_work, nes_disconnect_worker); + struct disconn_work *work; - queue_work(g_cm_core->disconn_wq, &nesqp->disconn_work); - } else - spin_unlock_irqrestore(&nesqp->lock, flags); + work = kzalloc(sizeof *work, GFP_ATOMIC); + if (!work) + return -ENOMEM; /* Timer will clean up */ + nes_add_ref(&nesqp->ibqp); + work->nesqp = nesqp; + INIT_WORK(&work->work, nes_disconnect_worker); + queue_work(g_cm_core->disconn_wq, &work->work); return 0; } @@ -2472,11 +2469,14 @@ int nes_cm_disconn(struct nes_qp *nesqp) */ static void nes_disconnect_worker(struct work_struct *work) { - struct nes_qp *nesqp = container_of(work, struct nes_qp, disconn_work); + struct disconn_work *dwork = container_of(work, struct disconn_work, work); + struct nes_qp *nesqp = dwork->nesqp; + kfree(dwork); nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n", nesqp->last_aeq, nesqp->hwqp.qp_id); nes_cm_disconn_true(nesqp); + nes_rem_ref(&nesqp->ibqp); } @@ -2493,7 +2493,12 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) u16 last_ae; u8 original_hw_tcp_state; u8 original_ibqp_state; - u8 issued_disconnect_reset = 0; + enum iw_cm_event_type disconn_status = IW_CM_EVENT_STATUS_OK; + int issue_disconn = 0; + int issue_close = 0; + int issue_flush = 0; + u32 flush_q = NES_CQP_FLUSH_RQ; + struct ib_event ibevent; if (!nesqp) { nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n"); @@ -2517,24 +2522,55 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) original_ibqp_state = nesqp->ibqp_state; last_ae = nesqp->last_aeq; + if (nesqp->term_flags) { + issue_disconn = 1; + issue_close = 1; + nesqp->cm_id = NULL; + if (nesqp->flush_issued == 0) { + nesqp->flush_issued = 1; + issue_flush = 1; + } + } else if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || + ((original_ibqp_state == IB_QPS_RTS) && + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + issue_disconn = 1; + if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) + disconn_status = IW_CM_EVENT_STATUS_RESET; + } + + if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) || + (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) || + (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) || + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + issue_close = 1; + nesqp->cm_id = NULL; + if (nesqp->flush_issued == 0) { + nesqp->flush_issued = 1; + issue_flush = 1; + } + } + + spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_CM, "set ibqp_state=%u\n", nesqp->ibqp_state); + if ((issue_flush) && (nesqp->destroyed == 0)) { + /* Flush the queue(s) */ + if (nesqp->hw_iwarp_state >= NES_AEQE_IWARP_STATE_TERMINATE) + flush_q |= NES_CQP_FLUSH_SQ; + flush_wqes(nesvnic->nesdev, nesqp, flush_q, 1); - if ((nesqp->cm_id) && (cm_id->event_handler)) { - if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || - ((original_ibqp_state == IB_QPS_RTS) && - (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + if (nesqp->term_flags) { + ibevent.device = nesqp->ibqp.device; + ibevent.event = nesqp->terminate_eventtype; + ibevent.element.qp = &nesqp->ibqp; + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + } + + if ((cm_id) && (cm_id->event_handler)) { + if (issue_disconn) { atomic_inc(&cm_disconnects); cm_event.event = IW_CM_EVENT_DISCONNECT; - if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) { - cm_event.status = IW_CM_EVENT_STATUS_RESET; - nes_debug(NES_DBG_CM, "Generating a CM " - "Disconnect Event (status reset) for " - "QP%u, cm_id = %p. \n", - nesqp->hwqp.qp_id, cm_id); - } else - cm_event.status = IW_CM_EVENT_STATUS_OK; - + cm_event.status = disconn_status; cm_event.local_addr = cm_id->local_addr; cm_event.remote_addr = cm_id->remote_addr; cm_event.private_data = NULL; @@ -2547,29 +2583,14 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) nesqp->hwqp.sq_tail, cm_id, atomic_read(&nesqp->refcount)); - spin_unlock_irqrestore(&nesqp->lock, flags); ret = cm_id->event_handler(cm_id, &cm_event); if (ret) nes_debug(NES_DBG_CM, "OFA CM event_handler " "returned, ret=%d\n", ret); - spin_lock_irqsave(&nesqp->lock, flags); } - nesqp->disconn_pending = 0; - /* There might have been another AE while the lock was released */ - original_hw_tcp_state = nesqp->hw_tcp_state; - original_ibqp_state = nesqp->ibqp_state; - last_ae = nesqp->last_aeq; - - if ((issued_disconnect_reset == 0) && (nesqp->cm_id) && - ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) || - (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) || - (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) || - (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + if (issue_close) { atomic_inc(&cm_closes); - nesqp->cm_id = NULL; - nesqp->in_disconnect = 0; - spin_unlock_irqrestore(&nesqp->lock, flags); nes_disconnect(nesqp, 1); cm_id->provider_data = nesqp; @@ -2588,28 +2609,7 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) } cm_id->rem_ref(cm_id); - - spin_lock_irqsave(&nesqp->lock, flags); - if (nesqp->flush_issued == 0) { - nesqp->flush_issued = 1; - spin_unlock_irqrestore(&nesqp->lock, flags); - flush_wqes(nesvnic->nesdev, nesqp, - NES_CQP_FLUSH_RQ, 1); - } else - spin_unlock_irqrestore(&nesqp->lock, flags); - } else { - cm_id = nesqp->cm_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - /* check to see if the inbound reset beat the outbound reset */ - if ((!cm_id) && (last_ae==NES_AEQE_AEID_RESET_SENT)) { - nes_debug(NES_DBG_CM, "QP%u: Decing refcount " - "due to inbound reset beating the " - "outbound reset.\n", nesqp->hwqp.qp_id); - } } - } else { - nesqp->disconn_pending = 0; - spin_unlock_irqrestore(&nesqp->lock, flags); } return 0; diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 8b7e7c0e496..90e8e4d8a5c 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -410,8 +410,6 @@ struct nes_cm_ops { int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *, enum nes_timer_type, int, int); -int nes_cm_disconn(struct nes_qp *); - int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *); int nes_reject(struct iw_cm_id *, const void *, u8); int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *); diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 4a84d02ece0..63a1a8e1e8a 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -74,6 +74,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, static void process_critical_error(struct nes_device *nesdev); static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number); static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode); +static void nes_terminate_timeout(unsigned long context); +static void nes_terminate_start_timer(struct nes_qp *nesqp); #ifdef CONFIG_INFINIBAND_NES_DEBUG static unsigned char *nes_iwarp_state_str[] = { @@ -2903,6 +2905,417 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) } +static u8 *locate_mpa(u8 *pkt, u32 aeq_info) +{ + u16 pkt_len; + + if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) { + /* skip over ethernet header */ + pkt_len = be16_to_cpu(*(u16 *)(pkt + ETH_HLEN - 2)); + pkt += ETH_HLEN; + + /* Skip over IP and TCP headers */ + pkt += 4 * (pkt[0] & 0x0f); + pkt += 4 * ((pkt[12] >> 4) & 0x0f); + } + return pkt; +} + +/* Determine if incoming error pkt is rdma layer */ +static u32 iwarp_opcode(struct nes_qp *nesqp, u32 aeq_info) +{ + u8 *pkt; + u16 *mpa; + u32 opcode = 0xffffffff; + + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + mpa = (u16 *)locate_mpa(pkt, aeq_info); + opcode = be16_to_cpu(mpa[1]) & 0xf; + } + + return opcode; +} + +/* Build iWARP terminate header */ +static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 aeq_info) +{ + u8 *pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + u16 ddp_seg_len; + int copy_len = 0; + u8 is_tagged = 0; + u8 flush_code = 0; + struct nes_terminate_hdr *termhdr; + + termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase; + memset(termhdr, 0, 64); + + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + + /* Use data from offending packet to fill in ddp & rdma hdrs */ + pkt = locate_mpa(pkt, aeq_info); + ddp_seg_len = be16_to_cpu(*(u16 *)pkt); + if (ddp_seg_len) { + copy_len = 2; + termhdr->hdrct = DDP_LEN_FLAG; + if (pkt[2] & 0x80) { + is_tagged = 1; + if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) { + copy_len += TERM_DDP_LEN_TAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + } else { + if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) { + copy_len += TERM_DDP_LEN_UNTAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + + if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) { + if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) { + copy_len += TERM_RDMA_LEN; + termhdr->hdrct |= RDMA_HDR_FLAG; + } + } + } + } + } + + switch (async_event_id) { + case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_WRITE: + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + } + break; + case NES_AEQE_AEID_AMP_INVALID_STAG: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + break; + case NES_AEQE_AEID_AMP_BAD_QP: + flush_code = IB_WC_LOC_QP_OP_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_QN; + break; + case NES_AEQE_AEID_AMP_BAD_STAG_KEY: + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_SEND_INV: + case IWARP_OPCODE_SEND_SE_INV: + flush_code = IB_WC_REM_OP_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_CANT_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + } + break; + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) { + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_BOUNDS; + } else { + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_BOUNDS; + } + break; + case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: + case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case NES_AEQE_AEID_PRIV_OPERATION_DENIED: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_ACCESS; + break; + case NES_AEQE_AEID_AMP_TO_WRAP: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_TO_WRAP; + break; + case NES_AEQE_AEID_AMP_BAD_PD: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_WRITE: + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_UNASSOC_STAG; + break; + case IWARP_OPCODE_SEND_INV: + case IWARP_OPCODE_SEND_SE_INV: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_CANT_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_UNASSOC_STAG; + } + break; + case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; + termhdr->error_code = MPA_MARKER; + break; + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; + termhdr->error_code = MPA_CRC; + break; + case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; + termhdr->error_code = DDP_CATASTROPHIC_LOCAL; + break; + case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: + case NES_AEQE_AEID_DDP_NO_L_BIT: + flush_code = IB_WC_FATAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; + termhdr->error_code = DDP_CATASTROPHIC_LOCAL; + break; + case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: + case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE; + break; + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + flush_code = IB_WC_GENERAL_ERR; + if (is_tagged) { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_INV_DDP_VER; + } else { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_DDP_VER; + } + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MO; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + flush_code = IB_WC_REM_OP_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_QN; + break; + case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_INV_RDMAP_VER; + break; + case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + flush_code = IB_WC_LOC_QP_OP_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_UNEXPECTED_OP; + break; + default: + flush_code = IB_WC_FATAL_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_UNSPECIFIED; + break; + } + + if (copy_len) + memcpy(termhdr + 1, pkt, copy_len); + + if ((flush_code) && ((NES_AEQE_INBOUND_RDMA & aeq_info) == 0)) { + if (aeq_info & NES_AEQE_SQ) + nesqp->term_sq_flush_code = flush_code; + else + nesqp->term_rq_flush_code = flush_code; + } + + return sizeof(struct nes_terminate_hdr) + copy_len; +} + +static void nes_terminate_connection(struct nes_device *nesdev, struct nes_qp *nesqp, + struct nes_hw_aeqe *aeqe, enum ib_event_type eventtype) +{ + u64 context; + unsigned long flags; + u32 aeq_info; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + u32 termlen = 0; + u32 mod_qp_flags = NES_CQP_QP_IWARP_STATE_TERMINATE | + NES_CQP_QP_TERM_DONT_SEND_FIN; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if (nesqp->term_flags & NES_TERM_SENT) + return; /* Sanity check */ + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + async_event_id = (u16)aeq_info; + + context = (unsigned long)nesadapter->qp_table[le32_to_cpu( + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; + if (!context) { + WARN_ON(!context); + return; + } + + nesqp = (struct nes_qp *)(unsigned long)context; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + nesqp->terminate_eventtype = eventtype; + spin_unlock_irqrestore(&nesqp->lock, flags); + + if (nesadapter->send_term_ok) + termlen = nes_bld_terminate_hdr(nesqp, async_event_id, aeq_info); + else + mod_qp_flags |= NES_CQP_QP_TERM_DONT_SEND_TERM_MSG; + + nes_terminate_start_timer(nesqp); + nesqp->term_flags |= NES_TERM_SENT; + nes_hw_modify_qp(nesdev, nesqp, mod_qp_flags, termlen, 0); +} + +static void nes_terminate_send_fin(struct nes_device *nesdev, + struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) +{ + u32 aeq_info; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + unsigned long flags; + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + async_event_id = (u16)aeq_info; + + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + + /* Send the fin only */ + nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_TERMINATE | + NES_CQP_QP_TERM_DONT_SEND_TERM_MSG, 0, 0); +} + +/* Cleanup after a terminate sent or received */ +static void nes_terminate_done(struct nes_qp *nesqp, int timeout_occurred) +{ + u32 next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + unsigned long flags; + struct nes_vnic *nesvnic = to_nesvnic(nesqp->ibqp.device); + struct nes_device *nesdev = nesvnic->nesdev; + u8 first_time = 0; + + spin_lock_irqsave(&nesqp->lock, flags); + if (nesqp->hte_added) { + nesqp->hte_added = 0; + next_iwarp_state |= NES_CQP_QP_DEL_HTE; + } + + first_time = (nesqp->term_flags & NES_TERM_DONE) == 0; + nesqp->term_flags |= NES_TERM_DONE; + spin_unlock_irqrestore(&nesqp->lock, flags); + + /* Make sure we go through this only once */ + if (first_time) { + if (timeout_occurred == 0) + del_timer(&nesqp->terminate_timer); + else + next_iwarp_state |= NES_CQP_QP_RESET; + + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + } +} + +static void nes_terminate_received(struct nes_device *nesdev, + struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) +{ + u32 aeq_info; + u8 *pkt; + u32 *mpa; + u8 ddp_ctl; + u8 rdma_ctl; + u16 aeq_id = 0; + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + /* Terminate is not a performance path so the silicon */ + /* did not validate the frame - do it now */ + pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + mpa = (u32 *)locate_mpa(pkt, aeq_info); + ddp_ctl = (be32_to_cpu(mpa[0]) >> 8) & 0xff; + rdma_ctl = be32_to_cpu(mpa[0]) & 0xff; + if ((ddp_ctl & 0xc0) != 0x40) + aeq_id = NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC; + else if ((ddp_ctl & 0x03) != 1) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION; + else if (be32_to_cpu(mpa[2]) != 2) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_QN; + else if (be32_to_cpu(mpa[3]) != 1) + aeq_id = NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN; + else if (be32_to_cpu(mpa[4]) != 0) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_MO; + else if ((rdma_ctl & 0xc0) != 0x40) + aeq_id = NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION; + + if (aeq_id) { + /* Bad terminate recvd - send back a terminate */ + aeq_info = (aeq_info & 0xffff0000) | aeq_id; + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + return; + } + } + + nesqp->term_flags |= NES_TERM_RCVD; + nesqp->terminate_eventtype = IB_EVENT_QP_FATAL; + nes_terminate_start_timer(nesqp); + nes_terminate_send_fin(nesdev, nesqp, aeqe); +} + +/* Timeout routine in case terminate fails to complete */ +static void nes_terminate_timeout(unsigned long context) +{ + struct nes_qp *nesqp = (struct nes_qp *)(unsigned long)context; + + nes_terminate_done(nesqp, 1); +} + +/* Set a timer in case hw cannot complete the terminate sequence */ +static void nes_terminate_start_timer(struct nes_qp *nesqp) +{ + init_timer(&nesqp->terminate_timer); + nesqp->terminate_timer.function = nes_terminate_timeout; + nesqp->terminate_timer.expires = jiffies + HZ; + nesqp->terminate_timer.data = (unsigned long)nesqp; + add_timer(&nesqp->terminate_timer); +} + /** * nes_process_iwarp_aeqe */ @@ -2910,28 +3323,27 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, struct nes_hw_aeqe *aeqe) { u64 context; - u64 aeqe_context = 0; unsigned long flags; struct nes_qp *nesqp; + struct nes_hw_cq *hw_cq; + struct nes_cq *nescq; int resource_allocated; - /* struct iw_cm_id *cm_id; */ struct nes_adapter *nesadapter = nesdev->nesadapter; - struct ib_event ibevent; - /* struct iw_cm_event cm_event; */ u32 aeq_info; u32 next_iwarp_state = 0; u16 async_event_id; u8 tcp_state; u8 iwarp_state; + int must_disconn = 1; + int must_terminate = 0; + struct ib_event ibevent; nes_debug(NES_DBG_AEQ, "\n"); aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); - if ((NES_AEQE_INBOUND_RDMA&aeq_info) || (!(NES_AEQE_QP&aeq_info))) { + if ((NES_AEQE_INBOUND_RDMA & aeq_info) || (!(NES_AEQE_QP & aeq_info))) { context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; } else { - aeqe_context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); - aeqe_context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; context = (unsigned long)nesadapter->qp_table[le32_to_cpu( aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; BUG_ON(!context); @@ -2948,7 +3360,11 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, switch (async_event_id) { case NES_AEQE_AEID_LLP_FIN_RECEIVED: - nesqp = *((struct nes_qp **)&context); + nesqp = (struct nes_qp *)(unsigned long)context; + + if (nesqp->term_flags) + return; /* Ignore it, wait for close complete */ + if (atomic_inc_return(&nesqp->close_timer_started) == 1) { nesqp->cm_id->add_ref(nesqp->cm_id); schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp, @@ -2959,18 +3375,24 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), async_event_id, nesqp->last_aeq, tcp_state); } + if ((tcp_state != NES_AEQE_TCP_STATE_CLOSE_WAIT) || (nesqp->ibqp_state != IB_QPS_RTS)) { /* FIN Received but tcp state or IB state moved on, should expect a close complete */ return; } + case NES_AEQE_AEID_LLP_CLOSE_COMPLETE: + nesqp = (struct nes_qp *)(unsigned long)context; + if (nesqp->term_flags) { + nes_terminate_done(nesqp, 0); + return; + } + case NES_AEQE_AEID_LLP_CONNECTION_RESET: - case NES_AEQE_AEID_TERMINATE_SENT: - case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE: case NES_AEQE_AEID_RESET_SENT: - nesqp = *((struct nes_qp **)&context); + nesqp = (struct nes_qp *)(unsigned long)context; if (async_event_id == NES_AEQE_AEID_RESET_SENT) { tcp_state = NES_AEQE_TCP_STATE_CLOSED; } @@ -2982,12 +3404,7 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, if ((tcp_state == NES_AEQE_TCP_STATE_CLOSED) || (tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT)) { nesqp->hte_added = 0; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u to remove hte\n", - nesqp->hwqp.qp_id); - nes_hw_modify_qp(nesdev, nesqp, - NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE, 0); - spin_lock_irqsave(&nesqp->lock, flags); + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE; } if ((nesqp->ibqp_state == IB_QPS_RTS) && @@ -2999,151 +3416,106 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; break; case NES_AEQE_IWARP_STATE_TERMINATE: - next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; - nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE; - if (async_event_id == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { - next_iwarp_state |= 0x02000000; - nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; - } + must_disconn = 0; /* terminate path takes care of disconn */ + if (nesqp->term_flags == 0) + must_terminate = 1; break; - default: - next_iwarp_state = 0; - } - spin_unlock_irqrestore(&nesqp->lock, flags); - if (next_iwarp_state) { - nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X," - " also added another reference\n", - nesqp->hwqp.qp_id, next_iwarp_state); - nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0); } - nes_cm_disconn(nesqp); } else { if (async_event_id == NES_AEQE_AEID_LLP_FIN_RECEIVED) { /* FIN Received but ib state not RTS, close complete will be on its way */ - spin_unlock_irqrestore(&nesqp->lock, flags); - return; - } - spin_unlock_irqrestore(&nesqp->lock, flags); - if (async_event_id == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { - next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000; - nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; - nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X," - " also added another reference\n", - nesqp->hwqp.qp_id, next_iwarp_state); - nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0); + must_disconn = 0; } - nes_cm_disconn(nesqp); } - break; - case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TERMINATE_RECEIVED" - " event on QP%u \n Q2 Data:\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_FATAL; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || - ((nesqp->ibqp_state == IB_QPS_RTS)&& - (async_event_id == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + + if (must_terminate) + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + else if (must_disconn) { + if (next_iwarp_state) { + nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X\n", + nesqp->hwqp.qp_id, next_iwarp_state); + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + } nes_cm_disconn(nesqp); - } else { - nesqp->in_disconnect = 0; - wake_up(&nesqp->kick_waitq); } break; - case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR; - nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; - nesqp->last_aeq = async_event_id; - if (nesqp->cm_id) { - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TOO_MANY_RETRIES" - " event on QP%u, remote IP = 0x%08X \n", - nesqp->hwqp.qp_id, - ntohl(nesqp->cm_id->remote_addr.sin_addr.s_addr)); - } else { - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_TOO_MANY_RETRIES" - " event on QP%u \n", - nesqp->hwqp.qp_id); - } - spin_unlock_irqrestore(&nesqp->lock, flags); - next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_RESET; - nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_FATAL; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } + + case NES_AEQE_AEID_TERMINATE_SENT: + nesqp = (struct nes_qp *)(unsigned long)context; + nes_terminate_send_fin(nesdev, nesqp, aeqe); break; - case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: - if (NES_AEQE_INBOUND_RDMA&aeq_info) { - nesqp = nesadapter->qp_table[le32_to_cpu( - aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; - } else { - /* TODO: get the actual WQE and mask off wqe index */ - context &= ~((u64)511); - nesqp = *((struct nes_qp **)&context); - } - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_AMP_BAD_STAG_INDEX event on QP%u\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_ACCESS_ERR; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } + + case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED: + nesqp = (struct nes_qp *)(unsigned long)context; + nes_terminate_received(nesdev, nesqp, aeqe); break; + + case NES_AEQE_AEID_AMP_BAD_STAG_KEY: + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_AMP_UNALLOCATED_STAG event on QP%u\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_ACCESS_ERR; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - break; + case NES_AEQE_AEID_AMP_INVALID_STAG: + case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: + case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: case NES_AEQE_AEID_PRIV_OPERATION_DENIED: - nesqp = nesadapter->qp_table[le32_to_cpu(aeqe->aeqe_words - [NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_PRIV_OPERATION_DENIED event on QP%u," - " nesqp = %p, AE reported %p\n", - nesqp->hwqp.qp_id, nesqp, *((struct nes_qp **)&context)); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_ACCESS_ERR; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + case NES_AEQE_AEID_AMP_TO_WRAP: + nesqp = (struct nes_qp *)(unsigned long)context; + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR); + break; + + case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + nesqp = (struct nes_qp *)(unsigned long)context; + if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) { + aeq_info &= 0xffff0000; + aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE; + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); } + + case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE: + case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES: + case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + case NES_AEQE_AEID_AMP_BAD_QP: + case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: + case NES_AEQE_AEID_DDP_NO_L_BIT: + case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: + case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + case NES_AEQE_AEID_AMP_BAD_PD: + case NES_AEQE_AEID_AMP_FASTREG_SHARED: + case NES_AEQE_AEID_AMP_FASTREG_VALID_STAG: + case NES_AEQE_AEID_AMP_FASTREG_MW_STAG: + case NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS: + case NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW: + case NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH: + case NES_AEQE_AEID_AMP_INVALIDATE_SHARED: + case NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS: + case NES_AEQE_AEID_AMP_MWBIND_VALID_STAG: + case NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG: + case NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG: + case NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG: + case NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS: + case NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS: + case NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT: + case NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED: + case NES_AEQE_AEID_BAD_CLOSE: + case NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO: + case NES_AEQE_AEID_STAG_ZERO_INVALID: + case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST: + case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: + nesqp = (struct nes_qp *)(unsigned long)context; + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); break; + case NES_AEQE_AEID_CQ_OPERATION_ERROR: context <<= 1; nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n", @@ -3153,83 +3525,19 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, if (resource_allocated) { printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n", __func__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); + hw_cq = (struct nes_hw_cq *)(unsigned long)context; + if (hw_cq) { + nescq = container_of(hw_cq, struct nes_cq, hw_cq); + if (nescq->ibcq.event_handler) { + ibevent.device = nescq->ibcq.device; + ibevent.event = IB_EVENT_CQ_ERR; + ibevent.element.cq = &nescq->ibcq; + nescq->ibcq.event_handler(&ibevent, nescq->ibcq.cq_context); + } + } } break; - case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: - nesqp = nesadapter->qp_table[le32_to_cpu( - aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG" - "_FOR_AVAILABLE_BUFFER event on QP%u\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_ACCESS_ERR; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - /* tell cm to disconnect, cm will queue work to thread */ - nes_cm_disconn(nesqp); - break; - case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_DDP_UBE_INVALID_MSN" - "_NO_BUFFER_AVAILABLE event on QP%u\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_FATAL; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - /* tell cm to disconnect, cm will queue work to thread */ - nes_cm_disconn(nesqp); - break; - case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR" - " event on QP%u \n Q2 Data:\n", - nesqp->hwqp.qp_id); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_FATAL; - nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); - } - /* tell cm to disconnect, cm will queue work to thread */ - nes_cm_disconn(nesqp); - break; - /* TODO: additional AEs need to be here */ - case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: - nesqp = *((struct nes_qp **)&context); - spin_lock_irqsave(&nesqp->lock, flags); - nesqp->hw_iwarp_state = iwarp_state; - nesqp->hw_tcp_state = tcp_state; - nesqp->last_aeq = async_event_id; - spin_unlock_irqrestore(&nesqp->lock, flags); - if (nesqp->ibqp.event_handler) { - ibevent.device = nesqp->ibqp.device; - ibevent.element.qp = &nesqp->ibqp; - ibevent.event = IB_EVENT_QP_ACCESS_ERR; - nesqp->ibqp.event_handler(&ibevent, - nesqp->ibqp.qp_context); - } - nes_cm_disconn(nesqp); - break; + default: nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n", async_event_id); @@ -3238,7 +3546,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, } - /** * nes_iwarp_ce_handler */ @@ -3373,6 +3680,8 @@ void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp, { struct nes_cqp_request *cqp_request; struct nes_hw_cqp_wqe *cqp_wqe; + u32 sq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; + u32 rq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; int ret; cqp_request = nes_get_cqp_request(nesdev); @@ -3389,6 +3698,24 @@ void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp, cqp_wqe = &cqp_request->cqp_wqe; nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + /* If wqe in error was identified, set code to be put into cqe */ + if ((nesqp->term_sq_flush_code) && (which_wq & NES_CQP_FLUSH_SQ)) { + which_wq |= NES_CQP_FLUSH_MAJ_MIN; + sq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_sq_flush_code; + nesqp->term_sq_flush_code = 0; + } + + if ((nesqp->term_rq_flush_code) && (which_wq & NES_CQP_FLUSH_RQ)) { + which_wq |= NES_CQP_FLUSH_MAJ_MIN; + rq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_rq_flush_code; + nesqp->term_rq_flush_code = 0; + } + + if (which_wq & NES_CQP_FLUSH_MAJ_MIN) { + cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_SQ_CODE] = cpu_to_le32(sq_code); + cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_RQ_CODE] = cpu_to_le32(rq_code); + } + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq); cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id); diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index c3654c6383f..f28a41ba9fa 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -241,6 +241,7 @@ enum nes_cqp_stag_wqeword_idx { }; #define NES_CQP_OP_IWARP_STATE_SHIFT 28 +#define NES_CQP_OP_TERMLEN_SHIFT 28 enum nes_cqp_qp_bits { NES_CQP_QP_ARP_VALID = (1<<8), @@ -265,12 +266,16 @@ enum nes_cqp_qp_bits { NES_CQP_QP_IWARP_STATE_TERMINATE = (5<<NES_CQP_OP_IWARP_STATE_SHIFT), NES_CQP_QP_IWARP_STATE_ERROR = (6<<NES_CQP_OP_IWARP_STATE_SHIFT), NES_CQP_QP_IWARP_STATE_MASK = (7<<NES_CQP_OP_IWARP_STATE_SHIFT), + NES_CQP_QP_TERM_DONT_SEND_FIN = (1<<24), + NES_CQP_QP_TERM_DONT_SEND_TERM_MSG = (1<<25), NES_CQP_QP_RESET = (1<<31), }; enum nes_cqp_qp_wqe_word_idx { NES_CQP_QP_WQE_CONTEXT_LOW_IDX = 6, NES_CQP_QP_WQE_CONTEXT_HIGH_IDX = 7, + NES_CQP_QP_WQE_FLUSH_SQ_CODE = 8, + NES_CQP_QP_WQE_FLUSH_RQ_CODE = 9, NES_CQP_QP_WQE_NEW_MSS_IDX = 15, }; @@ -361,6 +366,7 @@ enum nes_cqp_arp_bits { enum nes_cqp_flush_bits { NES_CQP_FLUSH_SQ = (1<<30), NES_CQP_FLUSH_RQ = (1<<31), + NES_CQP_FLUSH_MAJ_MIN = (1<<28), }; enum nes_cqe_opcode_bits { @@ -633,11 +639,14 @@ enum nes_aeqe_bits { NES_AEQE_INBOUND_RDMA = (1<<19), NES_AEQE_IWARP_STATE_MASK = (7<<20), NES_AEQE_TCP_STATE_MASK = (0xf<<24), + NES_AEQE_Q2_DATA_WRITTEN = (0x3<<28), NES_AEQE_VALID = (1<<31), }; #define NES_AEQE_IWARP_STATE_SHIFT 20 #define NES_AEQE_TCP_STATE_SHIFT 24 +#define NES_AEQE_Q2_DATA_ETHERNET (1<<28) +#define NES_AEQE_Q2_DATA_MPA (1<<29) enum nes_aeqe_iwarp_state { NES_AEQE_IWARP_STATE_NON_EXISTANT = 0, @@ -751,6 +760,15 @@ enum nes_iwarp_sq_wqe_bits { NES_IWARP_SQ_OP_NOP = 12, }; +enum nes_iwarp_cqe_major_code { + NES_IWARP_CQE_MAJOR_FLUSH = 1, + NES_IWARP_CQE_MAJOR_DRV = 0x8000 +}; + +enum nes_iwarp_cqe_minor_code { + NES_IWARP_CQE_MINOR_FLUSH = 1 +}; + #define NES_EEPROM_READ_REQUEST (1<<16) #define NES_MAC_ADDR_VALID (1<<20) @@ -1119,6 +1137,7 @@ struct nes_adapter { u8 netdev_max; /* from host nic address count in EEPROM */ u8 port_count; u8 virtwq; + u8 send_term_ok; u8 et_use_adaptive_rx_coalesce; u8 adapter_fcn_count; u8 pft_mcast_map[NES_PFT_SIZE]; @@ -1217,6 +1236,90 @@ struct nes_ib_device { u32 num_pd; }; +enum nes_hdrct_flags { + DDP_LEN_FLAG = 0x80, + DDP_HDR_FLAG = 0x40, + RDMA_HDR_FLAG = 0x20 +}; + +enum nes_term_layers { + LAYER_RDMA = 0, + LAYER_DDP = 1, + LAYER_MPA = 2 +}; + +enum nes_term_error_types { + RDMAP_CATASTROPHIC = 0, + RDMAP_REMOTE_PROT = 1, + RDMAP_REMOTE_OP = 2, + DDP_CATASTROPHIC = 0, + DDP_TAGGED_BUFFER = 1, + DDP_UNTAGGED_BUFFER = 2, + DDP_LLP = 3 +}; + +enum nes_term_rdma_errors { + RDMAP_INV_STAG = 0x00, + RDMAP_INV_BOUNDS = 0x01, + RDMAP_ACCESS = 0x02, + RDMAP_UNASSOC_STAG = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_RDMAP_VER = 0x05, + RDMAP_UNEXPECTED_OP = 0x06, + RDMAP_CATASTROPHIC_LOCAL = 0x07, + RDMAP_CATASTROPHIC_GLOBAL = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum nes_term_ddp_errors { + DDP_CATASTROPHIC_LOCAL = 0x00, + DDP_TAGGED_INV_STAG = 0x00, + DDP_TAGGED_BOUNDS = 0x01, + DDP_TAGGED_UNASSOC_STAG = 0x02, + DDP_TAGGED_TO_WRAP = 0x03, + DDP_TAGGED_INV_DDP_VER = 0x04, + DDP_UNTAGGED_INV_QN = 0x01, + DDP_UNTAGGED_INV_MSN_NO_BUF = 0x02, + DDP_UNTAGGED_INV_MSN_RANGE = 0x03, + DDP_UNTAGGED_INV_MO = 0x04, + DDP_UNTAGGED_INV_TOO_LONG = 0x05, + DDP_UNTAGGED_INV_DDP_VER = 0x06 +}; + +enum nes_term_mpa_errors { + MPA_CLOSED = 0x01, + MPA_CRC = 0x02, + MPA_MARKER = 0x03, + MPA_REQ_RSP = 0x04, +}; + +struct nes_terminate_hdr { + u8 layer_etype; + u8 error_code; + u8 hdrct; + u8 rsvd; +}; + +/* Used to determine how to fill in terminate error codes */ +#define IWARP_OPCODE_WRITE 0 +#define IWARP_OPCODE_READREQ 1 +#define IWARP_OPCODE_READRSP 2 +#define IWARP_OPCODE_SEND 3 +#define IWARP_OPCODE_SEND_INV 4 +#define IWARP_OPCODE_SEND_SE 5 +#define IWARP_OPCODE_SEND_SE_INV 6 +#define IWARP_OPCODE_TERM 7 + +/* These values are used only during terminate processing */ +#define TERM_DDP_LEN_TAGGED 14 +#define TERM_DDP_LEN_UNTAGGED 18 +#define TERM_RDMA_LEN 28 +#define RDMA_OPCODE_MASK 0x0f +#define RDMA_READ_REQ_OPCODE 1 +#define BAD_FRAME_OFFSET 64 +#define CQE_MAJOR_DRV 0x8000 + #define nes_vlan_rx vlan_hwaccel_receive_skb #define nes_netif_rx netif_receive_skb diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index a282031d15c..9687c397ce1 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -183,6 +183,9 @@ int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesada } else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) { nesadapter->virtwq = 1; } + if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3)) + nesadapter->send_term_ok = 1; + nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8)) << 16) + (u32)((u8)eeprom_data); @@ -548,7 +551,7 @@ struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) spin_unlock_irqrestore(&nesdev->cqp.lock, flags); } if (cqp_request == NULL) { - cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_KERNEL); + cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_ATOMIC); if (cqp_request) { cqp_request->dynamic = 1; INIT_LIST_HEAD(&cqp_request->list); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 21e0fd336cf..a680c42d6e8 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -667,15 +667,32 @@ static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *prop */ static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct net_device *netdev = nesvnic->netdev; + memset(props, 0, sizeof(*props)); - props->max_mtu = IB_MTU_2048; - props->active_mtu = IB_MTU_2048; + props->max_mtu = IB_MTU_4096; + + if (netdev->mtu >= 4096) + props->active_mtu = IB_MTU_4096; + else if (netdev->mtu >= 2048) + props->active_mtu = IB_MTU_2048; + else if (netdev->mtu >= 1024) + props->active_mtu = IB_MTU_1024; + else if (netdev->mtu >= 512) + props->active_mtu = IB_MTU_512; + else + props->active_mtu = IB_MTU_256; + props->lid = 1; props->lmc = 0; props->sm_lid = 0; props->sm_sl = 0; - props->state = IB_PORT_ACTIVE; + if (nesvnic->linkup) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_DOWN; props->phys_state = 0; props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; @@ -1506,12 +1523,45 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, /** + * nes_clean_cq + */ +static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq) +{ + u32 cq_head; + u32 lo; + u32 hi; + u64 u64temp; + unsigned long flags = 0; + + spin_lock_irqsave(&nescq->lock, flags); + + cq_head = nescq->hw_cq.cq_head; + while (le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) { + rmb(); + lo = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + hi = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]); + u64temp = (((u64)hi) << 32) | ((u64)lo); + u64temp &= ~(NES_SW_CONTEXT_ALIGN-1); + if (u64temp == (u64)(unsigned long)nesqp) { + /* Zero the context value so cqe will be ignored */ + nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = 0; + nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX] = 0; + } + + if (++cq_head >= nescq->hw_cq.cq_size) + cq_head = 0; + } + + spin_unlock_irqrestore(&nescq->lock, flags); +} + + +/** * nes_destroy_qp */ static int nes_destroy_qp(struct ib_qp *ibqp) { struct nes_qp *nesqp = to_nesqp(ibqp); - /* struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); */ struct nes_ucontext *nes_ucontext; struct ib_qp_attr attr; struct iw_cm_id *cm_id; @@ -1548,7 +1598,6 @@ static int nes_destroy_qp(struct ib_qp *ibqp) nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret); } - if (nesqp->user_mode) { if ((ibqp->uobject)&&(ibqp->uobject->context)) { nes_ucontext = to_nesucontext(ibqp->uobject->context); @@ -1560,6 +1609,13 @@ static int nes_destroy_qp(struct ib_qp *ibqp) } if (nesqp->pbl_pbase) kunmap(nesqp->page); + } else { + /* Clean any pending completions from the cq(s) */ + if (nesqp->nesscq) + nes_clean_cq(nesqp, nesqp->nesscq); + + if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq)) + nes_clean_cq(nesqp, nesqp->nesrcq); } nes_rem_ref(&nesqp->ibqp); @@ -2884,7 +2940,7 @@ static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, * nes_hw_modify_qp */ int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp, - u32 next_iwarp_state, u32 wait_completion) + u32 next_iwarp_state, u32 termlen, u32 wait_completion) { struct nes_hw_cqp_wqe *cqp_wqe; /* struct iw_cm_id *cm_id = nesqp->cm_id; */ @@ -2916,6 +2972,13 @@ int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp, set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase); + /* If sending a terminate message, fill in the length (in words) */ + if (((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == NES_CQP_QP_IWARP_STATE_TERMINATE) && + !(next_iwarp_state & NES_CQP_QP_TERM_DONT_SEND_TERM_MSG)) { + termlen = ((termlen + 3) >> 2) << NES_CQP_OP_TERMLEN_SHIFT; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_NEW_MSS_IDX, termlen); + } + atomic_set(&cqp_request->refcount, 2); nes_post_cqp_request(nesdev, cqp_request); @@ -3086,6 +3149,9 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n", nesqp->hwqp.qp_id); + if (nesqp->term_flags) + del_timer(&nesqp->terminate_timer); + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ if (nesqp->hte_added) { @@ -3163,7 +3229,7 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (issue_modify_qp) { nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n"); - ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 1); + ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 1); if (ret) nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)" " failed for QP%u.\n", @@ -3328,6 +3394,12 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, head = nesqp->hwqp.sq_head; while (ib_wr) { + /* Check for QP error */ + if (nesqp->term_flags) { + err = -EINVAL; + break; + } + /* Check for SQ overflow */ if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { err = -EINVAL; @@ -3484,6 +3556,12 @@ static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, head = nesqp->hwqp.rq_head; while (ib_wr) { + /* Check for QP error */ + if (nesqp->term_flags) { + err = -EINVAL; + break; + } + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { err = -EINVAL; break; @@ -3547,7 +3625,6 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) { u64 u64temp; u64 wrid; - /* u64 u64temp; */ unsigned long flags = 0; struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); struct nes_device *nesdev = nesvnic->nesdev; @@ -3555,12 +3632,13 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) struct nes_qp *nesqp; struct nes_hw_cqe cqe; u32 head; - u32 wq_tail; + u32 wq_tail = 0; u32 cq_size; u32 cqe_count = 0; u32 wqe_index; u32 u32temp; - /* u32 counter; */ + u32 move_cq_head = 1; + u32 err_code; nes_debug(NES_DBG_CQ, "\n"); @@ -3570,29 +3648,40 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) cq_size = nescq->hw_cq.cq_size; while (cqe_count < num_entries) { - if (le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & - NES_CQE_VALID) { - /* - * Make sure we read CQ entry contents *after* - * we've checked the valid bit. - */ - rmb(); - - cqe = nescq->hw_cq.cq_vbase[head]; - nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; - u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); - wqe_index = u32temp & - (nesdev->nesadapter->max_qp_wr - 1); - u32temp &= ~(NES_SW_CONTEXT_ALIGN-1); - /* parse CQE, get completion context from WQE (either rq or sq */ - u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | - ((u64)u32temp); - nesqp = *((struct nes_qp **)&u64temp); + if ((le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & + NES_CQE_VALID) == 0) + break; + + /* + * Make sure we read CQ entry contents *after* + * we've checked the valid bit. + */ + rmb(); + + cqe = nescq->hw_cq.cq_vbase[head]; + u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1); + u32temp &= ~(NES_SW_CONTEXT_ALIGN-1); + /* parse CQE, get completion context from WQE (either rq or sq) */ + u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | + ((u64)u32temp); + + if (u64temp) { + nesqp = (struct nes_qp *)(unsigned long)u64temp; memset(entry, 0, sizeof *entry); if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) { entry->status = IB_WC_SUCCESS; } else { - entry->status = IB_WC_WR_FLUSH_ERR; + err_code = le32_to_cpu(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX]); + if (NES_IWARP_CQE_MAJOR_DRV == (err_code >> 16)) { + entry->status = err_code & 0x0000ffff; + + /* The rest of the cqe's will be marked as flushed */ + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX] = + cpu_to_le32((NES_IWARP_CQE_MAJOR_FLUSH << 16) | + NES_IWARP_CQE_MINOR_FLUSH); + } else + entry->status = IB_WC_WR_FLUSH_ERR; } entry->qp = &nesqp->ibqp; @@ -3601,20 +3690,18 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) { if (nesqp->skip_lsmm) { nesqp->skip_lsmm = 0; - wq_tail = nesqp->hwqp.sq_tail++; + nesqp->hwqp.sq_tail++; } /* Working on a SQ Completion*/ - wq_tail = wqe_index; - nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); - wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wq_tail]. + wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) | - ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wq_tail]. + ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX]))); - entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail]. + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]); - switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail]. + switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) { case NES_IWARP_SQ_OP_RDMAW: nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n"); @@ -3623,7 +3710,7 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) case NES_IWARP_SQ_OP_RDMAR: nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n"); entry->opcode = IB_WC_RDMA_READ; - entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wq_tail]. + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]); break; case NES_IWARP_SQ_OP_SENDINV: @@ -3634,33 +3721,54 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) entry->opcode = IB_WC_SEND; break; } + + nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); + if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)) { + move_cq_head = 0; + wq_tail = nesqp->hwqp.sq_tail; + } } else { /* Working on a RQ Completion*/ - wq_tail = wqe_index; - nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1); entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]); - wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wq_tail].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) | - ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wq_tail].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32); + wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) | + ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32); entry->opcode = IB_WC_RECV; + + nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1); + if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)) { + move_cq_head = 0; + wq_tail = nesqp->hwqp.rq_tail; + } } + entry->wr_id = wrid; + entry++; + cqe_count++; + } + if (move_cq_head) { + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; if (++head >= cq_size) head = 0; - cqe_count++; nescq->polled_completions++; + if ((nescq->polled_completions > (cq_size / 2)) || (nescq->polled_completions == 255)) { nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes" - " are pending %u of %u.\n", - nescq->hw_cq.cq_number, nescq->polled_completions, cq_size); + " are pending %u of %u.\n", + nescq->hw_cq.cq_number, nescq->polled_completions, cq_size); nes_write32(nesdev->regs+NES_CQE_ALLOC, - nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); + nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); nescq->polled_completions = 0; } - entry++; - } else - break; + } else { + /* Update the wqe index and set status to flush */ + wqe_index = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = (wqe_index & (~(nesdev->nesadapter->max_qp_wr - 1))) | wq_tail; + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = + cpu_to_le32(wqe_index); + move_cq_head = 1; /* ready for next pass */ + } } if (nescq->polled_completions) { diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 41c07f29f7c..89822d75f82 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -40,6 +40,10 @@ struct nes_device; #define NES_MAX_USER_DB_REGIONS 4096 #define NES_MAX_USER_WQ_REGIONS 4096 +#define NES_TERM_SENT 0x01 +#define NES_TERM_RCVD 0x02 +#define NES_TERM_DONE 0x04 + struct nes_ucontext { struct ib_ucontext ibucontext; struct nes_device *nesdev; @@ -119,6 +123,11 @@ struct nes_wq { spinlock_t lock; }; +struct disconn_work { + struct work_struct work; + struct nes_qp *nesqp; +}; + struct iw_cm_id; struct ietf_mpa_frame; @@ -127,7 +136,6 @@ struct nes_qp { void *allocated_buffer; struct iw_cm_id *cm_id; struct workqueue_struct *wq; - struct work_struct disconn_work; struct nes_cq *nesscq; struct nes_cq *nesrcq; struct nes_pd *nespd; @@ -155,9 +163,13 @@ struct nes_qp { void *pbl_vbase; dma_addr_t pbl_pbase; struct page *page; + struct timer_list terminate_timer; + enum ib_event_type terminate_eventtype; wait_queue_head_t kick_waitq; u16 in_disconnect; u16 private_data_len; + u16 term_sq_flush_code; + u16 term_rq_flush_code; u8 active_conn; u8 skip_lsmm; u8 user_mode; @@ -165,7 +177,7 @@ struct nes_qp { u8 hw_iwarp_state; u8 flush_issued; u8 hw_tcp_state; - u8 disconn_pending; + u8 term_flags; u8 destroyed; }; #endif /* NES_VERBS_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 181b1f32325..8f4b4fca2a1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -31,7 +31,6 @@ */ #include <rdma/ib_cm.h> -#include <rdma/ib_cache.h> #include <net/dst.h> #include <net/icmp.h> #include <linux/icmpv6.h> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index e7e5adf84e8..e35f4a0ea9d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -36,7 +36,6 @@ #include <linux/delay.h> #include <linux/dma-mapping.h> -#include <rdma/ib_cache.h> #include <linux/ip.h> #include <linux/tcp.h> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index e319d91f60a..2bf5116deec 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -604,8 +604,11 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) skb_queue_len(&neigh->queue)); goto err_drop; } - } else + } else { + spin_unlock_irqrestore(&priv->lock, flags); ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb_dst(skb)->neighbour->ha)); + return; + } } else { neigh->ah = NULL; @@ -688,7 +691,9 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, ipoib_dbg(priv, "Send unicast ARP to %04x\n", be16_to_cpu(path->pathrec.dlid)); + spin_unlock_irqrestore(&priv->lock, flags); ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr)); + return; } else if ((path->query || !path_rec_start(dev, path)) && skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { /* put pseudoheader back on for next time */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index a0e97532e71..25874fc680c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -720,7 +720,9 @@ out: } } + spin_unlock_irqrestore(&priv->lock, flags); ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); + return; } unlock: @@ -758,6 +760,20 @@ void ipoib_mcast_dev_flush(struct net_device *dev) } } +static int ipoib_mcast_addr_is_valid(const u8 *addr, unsigned int addrlen, + const u8 *broadcast) +{ + if (addrlen != INFINIBAND_ALEN) + return 0; + /* reserved QPN, prefix, scope */ + if (memcmp(addr, broadcast, 6)) + return 0; + /* signature lower, pkey */ + if (memcmp(addr + 7, broadcast + 7, 3)) + return 0; + return 1; +} + void ipoib_mcast_restart_task(struct work_struct *work) { struct ipoib_dev_priv *priv = @@ -791,6 +807,11 @@ void ipoib_mcast_restart_task(struct work_struct *work) for (mclist = dev->mc_list; mclist; mclist = mclist->next) { union ib_gid mgid; + if (!ipoib_mcast_addr_is_valid(mclist->dmi_addr, + mclist->dmi_addrlen, + dev->broadcast)) + continue; + memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid); mcast = __ipoib_mcast_find(dev, &mgid); diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 6e186b1a062..652bd33109e 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -582,7 +582,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, break; case STATUSTYPE_TABLE: sz = 0; - table_args = strstr(lc->usr_argv_str, " "); + table_args = strchr(lc->usr_argv_str, ' '); BUG_ON(!table_args); /* There will always be a ' ' */ table_args++; diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c index fb5df5c6203..c97ab82ec74 100644 --- a/drivers/net/cxgb3/cxgb3_main.c +++ b/drivers/net/cxgb3/cxgb3_main.c @@ -1286,6 +1286,7 @@ static int cxgb_open(struct net_device *dev) if (!other_ports) schedule_chk_task(adapter); + cxgb3_event_notify(&adapter->tdev, OFFLOAD_PORT_UP, pi->port_id); return 0; } @@ -1318,6 +1319,7 @@ static int cxgb_close(struct net_device *dev) if (!adapter->open_device_map) cxgb_down(adapter); + cxgb3_event_notify(&adapter->tdev, OFFLOAD_PORT_DOWN, pi->port_id); return 0; } @@ -2717,7 +2719,7 @@ static int t3_adapter_error(struct adapter *adapter, int reset) if (is_offload(adapter) && test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) { - cxgb3_err_notify(&adapter->tdev, OFFLOAD_STATUS_DOWN, 0); + cxgb3_event_notify(&adapter->tdev, OFFLOAD_STATUS_DOWN, 0); offload_close(&adapter->tdev); } @@ -2782,7 +2784,7 @@ static void t3_resume_ports(struct adapter *adapter) } if (is_offload(adapter) && !ofld_disable) - cxgb3_err_notify(&adapter->tdev, OFFLOAD_STATUS_UP, 0); + cxgb3_event_notify(&adapter->tdev, OFFLOAD_STATUS_UP, 0); } /* diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c index f9f54b57b28..75064eea1d8 100644 --- a/drivers/net/cxgb3/cxgb3_offload.c +++ b/drivers/net/cxgb3/cxgb3_offload.c @@ -153,14 +153,14 @@ void cxgb3_remove_clients(struct t3cdev *tdev) mutex_unlock(&cxgb3_db_lock); } -void cxgb3_err_notify(struct t3cdev *tdev, u32 status, u32 error) +void cxgb3_event_notify(struct t3cdev *tdev, u32 event, u32 port) { struct cxgb3_client *client; mutex_lock(&cxgb3_db_lock); list_for_each_entry(client, &client_list, client_list) { - if (client->err_handler) - client->err_handler(tdev, status, error); + if (client->event_handler) + client->event_handler(tdev, event, port); } mutex_unlock(&cxgb3_db_lock); } diff --git a/drivers/net/cxgb3/cxgb3_offload.h b/drivers/net/cxgb3/cxgb3_offload.h index 55945f422ae..670aa62042d 100644 --- a/drivers/net/cxgb3/cxgb3_offload.h +++ b/drivers/net/cxgb3/cxgb3_offload.h @@ -64,14 +64,16 @@ void cxgb3_register_client(struct cxgb3_client *client); void cxgb3_unregister_client(struct cxgb3_client *client); void cxgb3_add_clients(struct t3cdev *tdev); void cxgb3_remove_clients(struct t3cdev *tdev); -void cxgb3_err_notify(struct t3cdev *tdev, u32 status, u32 error); +void cxgb3_event_notify(struct t3cdev *tdev, u32 event, u32 port); typedef int (*cxgb3_cpl_handler_func)(struct t3cdev *dev, struct sk_buff *skb, void *ctx); enum { OFFLOAD_STATUS_UP, - OFFLOAD_STATUS_DOWN + OFFLOAD_STATUS_DOWN, + OFFLOAD_PORT_DOWN, + OFFLOAD_PORT_UP }; struct cxgb3_client { @@ -82,7 +84,7 @@ struct cxgb3_client { int (*redirect)(void *ctx, struct dst_entry *old, struct dst_entry *new, struct l2t_entry *l2t); struct list_head client_list; - void (*err_handler)(struct t3cdev *tdev, u32 status, u32 error); + void (*event_handler)(struct t3cdev *tdev, u32 event, u32 port); }; /* diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c index ac57b6a42c6..ccfe276943f 100644 --- a/drivers/net/mlx4/cq.c +++ b/drivers/net/mlx4/cq.c @@ -34,7 +34,6 @@ * SOFTWARE. */ -#include <linux/init.h> #include <linux/hardirq.h> #include <linux/mlx4/cmd.h> diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index b9ceddde46c..bffb7995cb7 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include <linux/init.h> #include <linux/interrupt.h> #include <linux/mm.h> #include <linux/dma-mapping.h> @@ -42,6 +41,10 @@ #include "fw.h" enum { + MLX4_IRQNAME_SIZE = 64 +}; + +enum { MLX4_NUM_ASYNC_EQE = 0x100, MLX4_NUM_SPARE_EQE = 0x80, MLX4_EQ_ENTRY_SIZE = 0x20 @@ -526,48 +529,6 @@ static void mlx4_unmap_clr_int(struct mlx4_dev *dev) iounmap(priv->clr_base); } -int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - int ret; - - /* - * We assume that mapping one page is enough for the whole EQ - * context table. This is fine with all current HCAs, because - * we only use 32 EQs and each EQ uses 64 bytes of context - * memory, or 1 KB total. - */ - priv->eq_table.icm_virt = icm_virt; - priv->eq_table.icm_page = alloc_page(GFP_HIGHUSER); - if (!priv->eq_table.icm_page) - return -ENOMEM; - priv->eq_table.icm_dma = pci_map_page(dev->pdev, priv->eq_table.icm_page, 0, - PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); - if (pci_dma_mapping_error(dev->pdev, priv->eq_table.icm_dma)) { - __free_page(priv->eq_table.icm_page); - return -ENOMEM; - } - - ret = mlx4_MAP_ICM_page(dev, priv->eq_table.icm_dma, icm_virt); - if (ret) { - pci_unmap_page(dev->pdev, priv->eq_table.icm_dma, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL); - __free_page(priv->eq_table.icm_page); - } - - return ret; -} - -void mlx4_unmap_eq_icm(struct mlx4_dev *dev) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - - mlx4_UNMAP_ICM(dev, priv->eq_table.icm_virt, 1); - pci_unmap_page(dev->pdev, priv->eq_table.icm_dma, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL); - __free_page(priv->eq_table.icm_page); -} - int mlx4_alloc_eq_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -615,7 +576,9 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) priv->eq_table.clr_int = priv->clr_base + (priv->eq_table.inta_pin < 32 ? 4 : 0); - priv->eq_table.irq_names = kmalloc(16 * dev->caps.num_comp_vectors, GFP_KERNEL); + priv->eq_table.irq_names = + kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1), + GFP_KERNEL); if (!priv->eq_table.irq_names) { err = -ENOMEM; goto err_out_bitmap; @@ -638,17 +601,25 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) goto err_out_comp; if (dev->flags & MLX4_FLAG_MSI_X) { - static const char async_eq_name[] = "mlx4-async"; const char *eq_name; for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) { if (i < dev->caps.num_comp_vectors) { - snprintf(priv->eq_table.irq_names + i * 16, 16, - "mlx4-comp-%d", i); - eq_name = priv->eq_table.irq_names + i * 16; - } else - eq_name = async_eq_name; + snprintf(priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, + "mlx4-comp-%d@pci:%s", i, + pci_name(dev->pdev)); + } else { + snprintf(priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, + "mlx4-async@pci:%s", + pci_name(dev->pdev)); + } + eq_name = priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE; err = request_irq(priv->eq_table.eq[i].irq, mlx4_msi_x_interrupt, 0, eq_name, priv->eq_table.eq + i); @@ -658,8 +629,12 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) priv->eq_table.eq[i].have_irq = 1; } } else { + snprintf(priv->eq_table.irq_names, + MLX4_IRQNAME_SIZE, + DRV_NAME "@pci:%s", + pci_name(dev->pdev)); err = request_irq(dev->pdev->irq, mlx4_interrupt, - IRQF_SHARED, DRV_NAME, dev); + IRQF_SHARED, priv->eq_table.irq_names, dev); if (err) goto err_out_async; diff --git a/drivers/net/mlx4/icm.c b/drivers/net/mlx4/icm.c index baf4bf66062..04b382fcb8c 100644 --- a/drivers/net/mlx4/icm.c +++ b/drivers/net/mlx4/icm.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include <linux/init.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/scatterlist.h> diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index dac621b1e9f..3dd481e77f9 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -525,7 +525,10 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, goto err_unmap_aux; } - err = mlx4_map_eq_icm(dev, init_hca->eqc_base); + err = mlx4_init_icm_table(dev, &priv->eq_table.table, + init_hca->eqc_base, dev_cap->eqc_entry_sz, + dev->caps.num_eqs, dev->caps.num_eqs, + 0, 0); if (err) { mlx4_err(dev, "Failed to map EQ context memory, aborting.\n"); goto err_unmap_cmpt; @@ -668,7 +671,7 @@ err_unmap_mtt: mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); err_unmap_eq: - mlx4_unmap_eq_icm(dev); + mlx4_cleanup_icm_table(dev, &priv->eq_table.table); err_unmap_cmpt: mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); @@ -698,11 +701,11 @@ static void mlx4_free_icms(struct mlx4_dev *dev) mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); + mlx4_cleanup_icm_table(dev, &priv->eq_table.table); mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); - mlx4_unmap_eq_icm(dev); mlx4_UNMAP_ICM_AUX(dev); mlx4_free_icm(dev, priv->fw.aux_icm, 0); @@ -786,7 +789,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev) return 0; err_close: - mlx4_close_hca(dev); + mlx4_CLOSE_HCA(dev, 0); err_free_icm: mlx4_free_icms(dev); @@ -1070,18 +1073,12 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) goto err_disable_pdev; } - err = pci_request_region(pdev, 0, DRV_NAME); + err = pci_request_regions(pdev, DRV_NAME); if (err) { - dev_err(&pdev->dev, "Cannot request control region, aborting.\n"); + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); goto err_disable_pdev; } - err = pci_request_region(pdev, 2, DRV_NAME); - if (err) { - dev_err(&pdev->dev, "Cannot request UAR region, aborting.\n"); - goto err_release_bar0; - } - pci_set_master(pdev); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); @@ -1090,7 +1087,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); - goto err_release_bar2; + goto err_release_regions; } } err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); @@ -1101,7 +1098,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) { dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " "aborting.\n"); - goto err_release_bar2; + goto err_release_regions; } } @@ -1110,7 +1107,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) dev_err(&pdev->dev, "Device struct alloc failed, " "aborting.\n"); err = -ENOMEM; - goto err_release_bar2; + goto err_release_regions; } dev = &priv->dev; @@ -1205,11 +1202,8 @@ err_cmd: err_free_dev: kfree(priv); -err_release_bar2: - pci_release_region(pdev, 2); - -err_release_bar0: - pci_release_region(pdev, 0); +err_release_regions: + pci_release_regions(pdev); err_disable_pdev: pci_disable_device(pdev); @@ -1265,8 +1259,7 @@ static void mlx4_remove_one(struct pci_dev *pdev) pci_disable_msix(pdev); kfree(priv); - pci_release_region(pdev, 2); - pci_release_region(pdev, 0); + pci_release_regions(pdev); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); } diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c index 6053c357a47..5ccbce9866f 100644 --- a/drivers/net/mlx4/mcg.c +++ b/drivers/net/mlx4/mcg.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include <linux/init.h> #include <linux/string.h> #include <linux/slab.h> diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 5bd79c2b184..bc72d6e4919 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -205,9 +205,7 @@ struct mlx4_eq_table { void __iomem **uar_map; u32 clr_mask; struct mlx4_eq *eq; - u64 icm_virt; - struct page *icm_page; - dma_addr_t icm_dma; + struct mlx4_icm_table table; struct mlx4_icm_table cmpt_table; int have_irq; u8 inta_pin; @@ -373,9 +371,6 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, struct mlx4_init_hca_param *init_hca); -int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt); -void mlx4_unmap_eq_icm(struct mlx4_dev *dev); - int mlx4_cmd_init(struct mlx4_dev *dev); void mlx4_cmd_cleanup(struct mlx4_dev *dev); void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param); diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index f96948be0a4..ca7ab8e7b4c 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -32,7 +32,6 @@ * SOFTWARE. */ -#include <linux/init.h> #include <linux/errno.h> #include <linux/mlx4/cmd.h> diff --git a/drivers/net/mlx4/pd.c b/drivers/net/mlx4/pd.c index 26d1a7a9e37..c4988d6bd5b 100644 --- a/drivers/net/mlx4/pd.c +++ b/drivers/net/mlx4/pd.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include <linux/init.h> #include <linux/errno.h> #include <asm/page.h> diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c index bd22df95adf..ca25b9dc837 100644 --- a/drivers/net/mlx4/profile.c +++ b/drivers/net/mlx4/profile.c @@ -32,8 +32,6 @@ * SOFTWARE. */ -#include <linux/init.h> - #include "mlx4.h" #include "fw.h" diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c index 1c565ef8d17..42ab9fc01d3 100644 --- a/drivers/net/mlx4/qp.c +++ b/drivers/net/mlx4/qp.c @@ -33,8 +33,6 @@ * SOFTWARE. */ -#include <linux/init.h> - #include <linux/mlx4/cmd.h> #include <linux/mlx4/qp.h> diff --git a/drivers/net/mlx4/reset.c b/drivers/net/mlx4/reset.c index 3951b884c0f..e5741dab382 100644 --- a/drivers/net/mlx4/reset.c +++ b/drivers/net/mlx4/reset.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include <linux/init.h> #include <linux/errno.h> #include <linux/pci.h> #include <linux/delay.h> diff --git a/drivers/net/mlx4/srq.c b/drivers/net/mlx4/srq.c index fe9f218691f..1377d0dc8f1 100644 --- a/drivers/net/mlx4/srq.c +++ b/drivers/net/mlx4/srq.c @@ -31,8 +31,6 @@ * SOFTWARE. */ -#include <linux/init.h> - #include <linux/mlx4/cmd.h> #include "mlx4.h" diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 42b6c6319bc..87214a257d2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -130,17 +130,10 @@ static inline struct tun_sock *tun_sk(struct sock *sk) static int tun_attach(struct tun_struct *tun, struct file *file) { struct tun_file *tfile = file->private_data; - const struct cred *cred = current_cred(); int err; ASSERT_RTNL(); - /* Check permissions */ - if (((tun->owner != -1 && cred->euid != tun->owner) || - (tun->group != -1 && !in_egroup_p(tun->group))) && - !capable(CAP_NET_ADMIN)) - return -EPERM; - netif_tx_lock_bh(tun->dev); err = -EINVAL; @@ -926,6 +919,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { + const struct cred *cred = current_cred(); + if (ifr->ifr_flags & IFF_TUN_EXCL) return -EBUSY; if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) @@ -935,6 +930,14 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) else return -EINVAL; + if (((tun->owner != -1 && cred->euid != tun->owner) || + (tun->group != -1 && !in_egroup_p(tun->group))) && + !capable(CAP_NET_ADMIN)) + return -EPERM; + err = security_tun_dev_attach(tun->sk); + if (err < 0) + return err; + err = tun_attach(tun, file); if (err < 0) return err; @@ -947,6 +950,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (!capable(CAP_NET_ADMIN)) return -EPERM; + err = security_tun_dev_create(); + if (err < 0) + return err; /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { @@ -989,6 +995,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->sk = sk; container_of(sk, struct tun_sock, sk)->tun = tun; + security_tun_dev_post_create(sk); + tun_net_init(dev); if (strchr(dev->name, '%')) { diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 3f62dd50bbb..e109da4583a 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -669,14 +669,14 @@ static void dasd_profile_end(struct dasd_block *block, * memory and 2) dasd_smalloc_request uses the static ccw memory * that gets allocated for each device. */ -struct dasd_ccw_req *dasd_kmalloc_request(char *magic, int cplength, +struct dasd_ccw_req *dasd_kmalloc_request(int magic, int cplength, int datasize, struct dasd_device *device) { struct dasd_ccw_req *cqr; /* Sanity checks */ - BUG_ON( magic == NULL || datasize > PAGE_SIZE || + BUG_ON(datasize > PAGE_SIZE || (cplength*sizeof(struct ccw1)) > PAGE_SIZE); cqr = kzalloc(sizeof(struct dasd_ccw_req), GFP_ATOMIC); @@ -700,14 +700,13 @@ struct dasd_ccw_req *dasd_kmalloc_request(char *magic, int cplength, return ERR_PTR(-ENOMEM); } } - strncpy((char *) &cqr->magic, magic, 4); - ASCEBC((char *) &cqr->magic, 4); + cqr->magic = magic; set_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); dasd_get_device(device); return cqr; } -struct dasd_ccw_req *dasd_smalloc_request(char *magic, int cplength, +struct dasd_ccw_req *dasd_smalloc_request(int magic, int cplength, int datasize, struct dasd_device *device) { @@ -717,7 +716,7 @@ struct dasd_ccw_req *dasd_smalloc_request(char *magic, int cplength, int size; /* Sanity checks */ - BUG_ON( magic == NULL || datasize > PAGE_SIZE || + BUG_ON(datasize > PAGE_SIZE || (cplength*sizeof(struct ccw1)) > PAGE_SIZE); size = (sizeof(struct dasd_ccw_req) + 7L) & -8L; @@ -744,8 +743,7 @@ struct dasd_ccw_req *dasd_smalloc_request(char *magic, int cplength, cqr->data = data; memset(cqr->data, 0, datasize); } - strncpy((char *) &cqr->magic, magic, 4); - ASCEBC((char *) &cqr->magic, 4); + cqr->magic = magic; set_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); dasd_get_device(device); return cqr; @@ -899,9 +897,6 @@ int dasd_start_IO(struct dasd_ccw_req *cqr) switch (rc) { case 0: cqr->status = DASD_CQR_IN_IO; - DBF_DEV_EVENT(DBF_DEBUG, device, - "start_IO: request %p started successful", - cqr); break; case -EBUSY: DBF_DEV_EVENT(DBF_DEBUG, device, "%s", @@ -1699,8 +1694,11 @@ static void __dasd_process_request_queue(struct dasd_block *block) * for that. State DASD_STATE_ONLINE is normal block device * operation. */ - if (basedev->state < DASD_STATE_READY) + if (basedev->state < DASD_STATE_READY) { + while ((req = blk_fetch_request(block->request_queue))) + __blk_end_request_all(req, -EIO); return; + } /* Now we try to fetch requests from the request queue */ while (!blk_queue_plugged(queue) && (req = blk_peek_request(queue))) { if (basedev->features & DASD_FEATURE_READONLY && @@ -2530,7 +2528,7 @@ EXPORT_SYMBOL_GPL(dasd_generic_restore_device); static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device, void *rdc_buffer, int rdc_buffer_size, - char *magic) + int magic) { struct dasd_ccw_req *cqr; struct ccw1 *ccw; @@ -2561,7 +2559,7 @@ static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device, } -int dasd_generic_read_dev_chars(struct dasd_device *device, char *magic, +int dasd_generic_read_dev_chars(struct dasd_device *device, int magic, void *rdc_buffer, int rdc_buffer_size) { int ret; diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c index 27991b69205..e8ff7b0c961 100644 --- a/drivers/s390/block/dasd_3990_erp.c +++ b/drivers/s390/block/dasd_3990_erp.c @@ -7,7 +7,7 @@ * */ -#define KMSG_COMPONENT "dasd" +#define KMSG_COMPONENT "dasd-eckd" #include <linux/timer.h> #include <linux/slab.h> diff --git a/drivers/s390/block/dasd_alias.c b/drivers/s390/block/dasd_alias.c index 5b7bbc87593..70a008c0052 100644 --- a/drivers/s390/block/dasd_alias.c +++ b/drivers/s390/block/dasd_alias.c @@ -5,7 +5,7 @@ * Author(s): Stefan Weinhuber <wein@de.ibm.com> */ -#define KMSG_COMPONENT "dasd" +#define KMSG_COMPONENT "dasd-eckd" #include <linux/list.h> #include <asm/ebcdic.h> @@ -379,8 +379,7 @@ static int read_unit_address_configuration(struct dasd_device *device, int rc; unsigned long flags; - cqr = dasd_kmalloc_request("ECKD", - 1 /* PSF */ + 1 /* RSSD */ , + cqr = dasd_kmalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, (sizeof(struct dasd_psf_prssd_data)), device); if (IS_ERR(cqr)) diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 644086ba2ed..4e49b4a6c88 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -8,7 +8,7 @@ * */ -#define KMSG_COMPONENT "dasd" +#define KMSG_COMPONENT "dasd-diag" #include <linux/stddef.h> #include <linux/kernel.h> @@ -523,8 +523,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, /* Build the request */ datasize = sizeof(struct dasd_diag_req) + count*sizeof(struct dasd_diag_bio); - cqr = dasd_smalloc_request(dasd_diag_discipline.name, 0, - datasize, memdev); + cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, datasize, memdev); if (IS_ERR(cqr)) return cqr; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index c11770f5b36..a1ce573648a 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -10,7 +10,7 @@ * Author.........: Nigel Hislop <hislop_nigel@emc.com> */ -#define KMSG_COMPONENT "dasd" +#define KMSG_COMPONENT "dasd-eckd" #include <linux/stddef.h> #include <linux/kernel.h> @@ -730,7 +730,8 @@ static struct dasd_ccw_req *dasd_eckd_build_rcd_lpm(struct dasd_device *device, struct dasd_ccw_req *cqr; struct ccw1 *ccw; - cqr = dasd_smalloc_request("ECKD", 1 /* RCD */, ciw->count, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* RCD */, ciw->count, + device); if (IS_ERR(cqr)) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", @@ -934,8 +935,7 @@ static int dasd_eckd_read_features(struct dasd_device *device) struct dasd_eckd_private *private; private = (struct dasd_eckd_private *) device->private; - cqr = dasd_smalloc_request(dasd_eckd_discipline.name, - 1 /* PSF */ + 1 /* RSSD */ , + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, (sizeof(struct dasd_psf_prssd_data) + sizeof(struct dasd_rssd_features)), device); @@ -998,7 +998,7 @@ static struct dasd_ccw_req *dasd_eckd_build_psf_ssc(struct dasd_device *device, struct dasd_psf_ssc_data *psf_ssc_data; struct ccw1 *ccw; - cqr = dasd_smalloc_request("ECKD", 1 /* PSF */ , + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ , sizeof(struct dasd_psf_ssc_data), device); @@ -1149,8 +1149,8 @@ dasd_eckd_check_characteristics(struct dasd_device *device) goto out_err3; /* Read Device Characteristics */ - rc = dasd_generic_read_dev_chars(device, "ECKD", &private->rdc_data, - 64); + rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC, + &private->rdc_data, 64); if (rc) { DBF_EVENT(DBF_WARNING, "Read device characteristics failed, rc=%d for " @@ -1217,8 +1217,7 @@ dasd_eckd_analysis_ccw(struct dasd_device *device) cplength = 8; datasize = sizeof(struct DE_eckd_data) + 2*sizeof(struct LO_eckd_data); - cqr = dasd_smalloc_request(dasd_eckd_discipline.name, - cplength, datasize, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, datasize, device); if (IS_ERR(cqr)) return cqr; ccw = cqr->cpaddr; @@ -1499,8 +1498,7 @@ dasd_eckd_format_device(struct dasd_device * device, return ERR_PTR(-EINVAL); } /* Allocate the format ccw request. */ - fcp = dasd_smalloc_request(dasd_eckd_discipline.name, - cplength, datasize, device); + fcp = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, datasize, device); if (IS_ERR(fcp)) return fcp; @@ -1783,8 +1781,8 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( datasize += count*sizeof(struct LO_eckd_data); } /* Allocate the ccw request. */ - cqr = dasd_smalloc_request(dasd_eckd_discipline.name, - cplength, datasize, startdev); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, datasize, + startdev); if (IS_ERR(cqr)) return cqr; ccw = cqr->cpaddr; @@ -1948,8 +1946,8 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( cidaw * sizeof(unsigned long long); /* Allocate the ccw request. */ - cqr = dasd_smalloc_request(dasd_eckd_discipline.name, - cplength, datasize, startdev); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, datasize, + startdev); if (IS_ERR(cqr)) return cqr; ccw = cqr->cpaddr; @@ -2249,8 +2247,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( /* Allocate the ccw request. */ itcw_size = itcw_calc_size(0, ctidaw, 0); - cqr = dasd_smalloc_request(dasd_eckd_discipline.name, - 0, itcw_size, startdev); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 0, itcw_size, startdev); if (IS_ERR(cqr)) return cqr; @@ -2557,8 +2554,7 @@ dasd_eckd_release(struct dasd_device *device) if (!capable(CAP_SYS_ADMIN)) return -EACCES; - cqr = dasd_smalloc_request(dasd_eckd_discipline.name, - 1, 32, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 32, device); if (IS_ERR(cqr)) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", "Could not allocate initialization request"); @@ -2600,8 +2596,7 @@ dasd_eckd_reserve(struct dasd_device *device) if (!capable(CAP_SYS_ADMIN)) return -EACCES; - cqr = dasd_smalloc_request(dasd_eckd_discipline.name, - 1, 32, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 32, device); if (IS_ERR(cqr)) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", "Could not allocate initialization request"); @@ -2642,8 +2637,7 @@ dasd_eckd_steal_lock(struct dasd_device *device) if (!capable(CAP_SYS_ADMIN)) return -EACCES; - cqr = dasd_smalloc_request(dasd_eckd_discipline.name, - 1, 32, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 32, device); if (IS_ERR(cqr)) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", "Could not allocate initialization request"); @@ -2681,8 +2675,7 @@ dasd_eckd_performance(struct dasd_device *device, void __user *argp) struct ccw1 *ccw; int rc; - cqr = dasd_smalloc_request(dasd_eckd_discipline.name, - 1 /* PSF */ + 1 /* RSSD */ , + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, (sizeof(struct dasd_psf_prssd_data) + sizeof(struct dasd_rssd_perf_stats_t)), device); @@ -2828,7 +2821,7 @@ static int dasd_symm_io(struct dasd_device *device, void __user *argp) } /* setup CCWs for PSF + RSSD */ - cqr = dasd_smalloc_request("ECKD", 2 , 0, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 2 , 0, device); if (IS_ERR(cqr)) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", "Could not allocate initialization request"); @@ -3254,7 +3247,7 @@ int dasd_eckd_restore_device(struct dasd_device *device) /* Read Device Characteristics */ memset(&private->rdc_data, 0, sizeof(private->rdc_data)); - rc = dasd_generic_read_dev_chars(device, "ECKD", + rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC, &private->rdc_data, 64); if (rc) { DBF_EVENT(DBF_WARNING, diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index c24c8c30380..d96039eae59 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -6,7 +6,7 @@ * Author(s): Stefan Weinhuber <wein@de.ibm.com> */ -#define KMSG_COMPONENT "dasd" +#define KMSG_COMPONENT "dasd-eckd" #include <linux/init.h> #include <linux/fs.h> @@ -464,7 +464,7 @@ int dasd_eer_enable(struct dasd_device *device) if (!device->discipline || strcmp(device->discipline->name, "ECKD")) return -EPERM; /* FIXME: -EMEDIUMTYPE ? */ - cqr = dasd_kmalloc_request("ECKD", 1 /* SNSS */, + cqr = dasd_kmalloc_request(DASD_ECKD_MAGIC, 1 /* SNSS */, SNSS_DATA_SIZE, device); if (IS_ERR(cqr)) return -ENOMEM; diff --git a/drivers/s390/block/dasd_erp.c b/drivers/s390/block/dasd_erp.c index cb8f9cef742..7656384a811 100644 --- a/drivers/s390/block/dasd_erp.c +++ b/drivers/s390/block/dasd_erp.c @@ -99,8 +99,8 @@ dasd_default_erp_action(struct dasd_ccw_req *cqr) cqr->lpm = LPM_ANYPATH; cqr->status = DASD_CQR_FILLED; } else { - dev_err(&device->cdev->dev, - "default ERP has run out of retries and failed\n"); + pr_err("%s: default ERP has run out of retries and failed\n", + dev_name(&device->cdev->dev)); cqr->status = DASD_CQR_FAILED; cqr->stopclk = get_clock(); } diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 31849ad5e59..f245377e8e2 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -5,7 +5,7 @@ * Copyright IBM Corp. 1999, 2009 */ -#define KMSG_COMPONENT "dasd" +#define KMSG_COMPONENT "dasd-fba" #include <linux/stddef.h> #include <linux/kernel.h> @@ -152,8 +152,8 @@ dasd_fba_check_characteristics(struct dasd_device *device) block->base = device; /* Read Device Characteristics */ - rc = dasd_generic_read_dev_chars(device, "FBA ", &private->rdc_data, - 32); + rc = dasd_generic_read_dev_chars(device, DASD_FBA_MAGIC, + &private->rdc_data, 32); if (rc) { DBF_EVENT(DBF_WARNING, "Read device characteristics returned " "error %d for device: %s", @@ -305,8 +305,7 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev, datasize += (count - 1)*sizeof(struct LO_fba_data); } /* Allocate the ccw request. */ - cqr = dasd_smalloc_request(dasd_fba_discipline.name, - cplength, datasize, memdev); + cqr = dasd_smalloc_request(DASD_FBA_MAGIC, cplength, datasize, memdev); if (IS_ERR(cqr)) return cqr; ccw = cqr->cpaddr; diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index b699ca356ac..5e47a1ee52b 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -59,6 +59,11 @@ #include <asm/dasd.h> #include <asm/idals.h> +/* DASD discipline magic */ +#define DASD_ECKD_MAGIC 0xC5C3D2C4 +#define DASD_DIAG_MAGIC 0xC4C9C1C7 +#define DASD_FBA_MAGIC 0xC6C2C140 + /* * SECTION: Type definitions */ @@ -540,9 +545,9 @@ extern struct block_device_operations dasd_device_operations; extern struct kmem_cache *dasd_page_cache; struct dasd_ccw_req * -dasd_kmalloc_request(char *, int, int, struct dasd_device *); +dasd_kmalloc_request(int , int, int, struct dasd_device *); struct dasd_ccw_req * -dasd_smalloc_request(char *, int, int, struct dasd_device *); +dasd_smalloc_request(int , int, int, struct dasd_device *); void dasd_kfree_request(struct dasd_ccw_req *, struct dasd_device *); void dasd_sfree_request(struct dasd_ccw_req *, struct dasd_device *); @@ -587,7 +592,7 @@ void dasd_generic_handle_state_change(struct dasd_device *); int dasd_generic_pm_freeze(struct ccw_device *); int dasd_generic_restore_device(struct ccw_device *); -int dasd_generic_read_dev_chars(struct dasd_device *, char *, void *, int); +int dasd_generic_read_dev_chars(struct dasd_device *, int, void *, int); char *dasd_get_sense(struct irb *); /* externals in dasd_devmap.c */ diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index df918ef2796..f756a1b0c57 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -98,8 +98,8 @@ static int dasd_ioctl_quiesce(struct dasd_block *block) if (!capable (CAP_SYS_ADMIN)) return -EACCES; - dev_info(&base->cdev->dev, "The DASD has been put in the quiesce " - "state\n"); + pr_info("%s: The DASD has been put in the quiesce " + "state\n", dev_name(&base->cdev->dev)); spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags); base->stopped |= DASD_STOPPED_QUIESCE; spin_unlock_irqrestore(get_ccwdev_lock(base->cdev), flags); @@ -119,8 +119,8 @@ static int dasd_ioctl_resume(struct dasd_block *block) if (!capable (CAP_SYS_ADMIN)) return -EACCES; - dev_info(&base->cdev->dev, "I/O operations have been resumed " - "on the DASD\n"); + pr_info("%s: I/O operations have been resumed " + "on the DASD\n", dev_name(&base->cdev->dev)); spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags); base->stopped &= ~DASD_STOPPED_QUIESCE; spin_unlock_irqrestore(get_ccwdev_lock(base->cdev), flags); @@ -146,8 +146,8 @@ static int dasd_format(struct dasd_block *block, struct format_data_t *fdata) return -EPERM; if (base->state != DASD_STATE_BASIC) { - dev_warn(&base->cdev->dev, - "The DASD cannot be formatted while it is enabled\n"); + pr_warning("%s: The DASD cannot be formatted while it is " + "enabled\n", dev_name(&base->cdev->dev)); return -EBUSY; } @@ -175,9 +175,9 @@ static int dasd_format(struct dasd_block *block, struct format_data_t *fdata) dasd_sfree_request(cqr, cqr->memdev); if (rc) { if (rc != -ERESTARTSYS) - dev_err(&base->cdev->dev, - "Formatting unit %d failed with " - "rc=%d\n", fdata->start_unit, rc); + pr_err("%s: Formatting unit %d failed with " + "rc=%d\n", dev_name(&base->cdev->dev), + fdata->start_unit, rc); return rc; } fdata->start_unit++; @@ -204,9 +204,9 @@ dasd_ioctl_format(struct block_device *bdev, void __user *argp) if (copy_from_user(&fdata, argp, sizeof(struct format_data_t))) return -EFAULT; if (bdev != bdev->bd_contains) { - dev_warn(&block->base->cdev->dev, - "The specified DASD is a partition and cannot be " - "formatted\n"); + pr_warning("%s: The specified DASD is a partition and cannot " + "be formatted\n", + dev_name(&block->base->cdev->dev)); return -EINVAL; } return dasd_format(block, &fdata); diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index db442cd6621..ee604e92a5f 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -42,7 +42,6 @@ #include <linux/suspend.h> #include <linux/platform_device.h> #include <asm/uaccess.h> -#include <asm/checksum.h> #define XPRAM_NAME "xpram" #define XPRAM_DEVS 1 /* one partition */ @@ -51,7 +50,6 @@ typedef struct { unsigned int size; /* size of xpram segment in pages */ unsigned int offset; /* start page of xpram segment */ - unsigned int csum; /* partition checksum for suspend */ } xpram_device_t; static xpram_device_t xpram_devices[XPRAM_MAX_DEVS]; @@ -387,58 +385,6 @@ out: } /* - * Save checksums for all partitions. - */ -static int xpram_save_checksums(void) -{ - unsigned long mem_page; - int rc, i; - - rc = 0; - mem_page = (unsigned long) __get_free_page(GFP_KERNEL); - if (!mem_page) - return -ENOMEM; - for (i = 0; i < xpram_devs; i++) { - rc = xpram_page_in(mem_page, xpram_devices[i].offset); - if (rc) - goto fail; - xpram_devices[i].csum = csum_partial((const void *) mem_page, - PAGE_SIZE, 0); - } -fail: - free_page(mem_page); - return rc ? -ENXIO : 0; -} - -/* - * Verify checksums for all partitions. - */ -static int xpram_validate_checksums(void) -{ - unsigned long mem_page; - unsigned int csum; - int rc, i; - - rc = 0; - mem_page = (unsigned long) __get_free_page(GFP_KERNEL); - if (!mem_page) - return -ENOMEM; - for (i = 0; i < xpram_devs; i++) { - rc = xpram_page_in(mem_page, xpram_devices[i].offset); - if (rc) - goto fail; - csum = csum_partial((const void *) mem_page, PAGE_SIZE, 0); - if (xpram_devices[i].csum != csum) { - rc = -EINVAL; - goto fail; - } - } -fail: - free_page(mem_page); - return rc ? -ENXIO : 0; -} - -/* * Resume failed: Print error message and call panic. */ static void xpram_resume_error(const char *message) @@ -458,21 +404,10 @@ static int xpram_restore(struct device *dev) xpram_resume_error("xpram disappeared"); if (xpram_pages != xpram_highest_page_index() + 1) xpram_resume_error("Size of xpram changed"); - if (xpram_validate_checksums()) - xpram_resume_error("Data of xpram changed"); return 0; } -/* - * Save necessary state in suspend. - */ -static int xpram_freeze(struct device *dev) -{ - return xpram_save_checksums(); -} - static struct dev_pm_ops xpram_pm_ops = { - .freeze = xpram_freeze, .restore = xpram_restore, }; diff --git a/drivers/s390/char/Kconfig b/drivers/s390/char/Kconfig index 0769ced52db..4e34d3686c2 100644 --- a/drivers/s390/char/Kconfig +++ b/drivers/s390/char/Kconfig @@ -82,6 +82,16 @@ config SCLP_CPI You should only select this option if you know what you are doing, need this feature and intend to run your kernel in LPAR. +config SCLP_ASYNC + tristate "Support for Call Home via Asynchronous SCLP Records" + depends on S390 + help + This option enables the call home function, which is able to inform + the service element and connected organisations about a kernel panic. + You should only select this option if you know what you are doing, + want for inform other people about your kernel panics, + need this feature and intend to run your kernel in LPAR. + config S390_TAPE tristate "S/390 tape device support" depends on CCW diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile index 7e73e39a174..efb500ab66c 100644 --- a/drivers/s390/char/Makefile +++ b/drivers/s390/char/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_SCLP_TTY) += sclp_tty.o obj-$(CONFIG_SCLP_CONSOLE) += sclp_con.o obj-$(CONFIG_SCLP_VT220_TTY) += sclp_vt220.o obj-$(CONFIG_SCLP_CPI) += sclp_cpi.o +obj-$(CONFIG_SCLP_ASYNC) += sclp_async.o obj-$(CONFIG_ZVM_WATCHDOG) += vmwatchdog.o obj-$(CONFIG_VMLOGRDR) += vmlogrdr.o diff --git a/drivers/s390/char/monreader.c b/drivers/s390/char/monreader.c index 3234e90bd7f..89ece1c235a 100644 --- a/drivers/s390/char/monreader.c +++ b/drivers/s390/char/monreader.c @@ -581,7 +581,7 @@ static int __init mon_init(void) monreader_device->release = (void (*)(struct device *))kfree; rc = device_register(monreader_device); if (rc) { - kfree(monreader_device); + put_device(monreader_device); goto out_driver; } diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h index 60e7cb07095..6bb5a6bdfab 100644 --- a/drivers/s390/char/sclp.h +++ b/drivers/s390/char/sclp.h @@ -27,6 +27,7 @@ #define EVTYP_VT220MSG 0x1A #define EVTYP_CONFMGMDATA 0x04 #define EVTYP_SDIAS 0x1C +#define EVTYP_ASYNC 0x0A #define EVTYP_OPCMD_MASK 0x80000000 #define EVTYP_MSG_MASK 0x40000000 @@ -38,6 +39,7 @@ #define EVTYP_VT220MSG_MASK 0x00000040 #define EVTYP_CONFMGMDATA_MASK 0x10000000 #define EVTYP_SDIAS_MASK 0x00000010 +#define EVTYP_ASYNC_MASK 0x00400000 #define GNRLMSGFLGS_DOM 0x8000 #define GNRLMSGFLGS_SNDALRM 0x4000 @@ -85,12 +87,12 @@ struct sccb_header { } __attribute__((packed)); extern u64 sclp_facilities; - #define SCLP_HAS_CHP_INFO (sclp_facilities & 0x8000000000000000ULL) #define SCLP_HAS_CHP_RECONFIG (sclp_facilities & 0x2000000000000000ULL) #define SCLP_HAS_CPU_INFO (sclp_facilities & 0x0800000000000000ULL) #define SCLP_HAS_CPU_RECONFIG (sclp_facilities & 0x0400000000000000ULL) + struct gds_subvector { u8 length; u8 key; diff --git a/drivers/s390/char/sclp_async.c b/drivers/s390/char/sclp_async.c new file mode 100644 index 00000000000..daaec185ed3 --- /dev/null +++ b/drivers/s390/char/sclp_async.c @@ -0,0 +1,224 @@ +/* + * Enable Asynchronous Notification via SCLP. + * + * Copyright IBM Corp. 2009 + * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com> + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/kmod.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/sysctl.h> +#include <linux/utsname.h> +#include "sclp.h" + +static int callhome_enabled; +static struct sclp_req *request; +static struct sclp_async_sccb *sccb; +static int sclp_async_send_wait(char *message); +static struct ctl_table_header *callhome_sysctl_header; +static DEFINE_SPINLOCK(sclp_async_lock); +static char nodename[64]; +#define SCLP_NORMAL_WRITE 0x00 + +struct async_evbuf { + struct evbuf_header header; + u64 reserved; + u8 rflags; + u8 empty; + u8 rtype; + u8 otype; + char comp_id[12]; + char data[3000]; /* there is still some space left */ +} __attribute__((packed)); + +struct sclp_async_sccb { + struct sccb_header header; + struct async_evbuf evbuf; +} __attribute__((packed)); + +static struct sclp_register sclp_async_register = { + .send_mask = EVTYP_ASYNC_MASK, +}; + +static int call_home_on_panic(struct notifier_block *self, + unsigned long event, void *data) +{ + strncat(data, nodename, strlen(nodename)); + sclp_async_send_wait(data); + return NOTIFY_DONE; +} + +static struct notifier_block call_home_panic_nb = { + .notifier_call = call_home_on_panic, + .priority = INT_MAX, +}; + +static int proc_handler_callhome(ctl_table *ctl, int write, struct file *filp, + void __user *buffer, size_t *count, + loff_t *ppos) +{ + unsigned long val; + int len, rc; + char buf[2]; + + if (!*count | (*ppos && !write)) { + *count = 0; + return 0; + } + if (!write) { + len = sprintf(buf, "%d\n", callhome_enabled); + buf[len] = '\0'; + rc = copy_to_user(buffer, buf, sizeof(buf)); + if (rc != 0) + return -EFAULT; + } else { + len = *count; + rc = copy_from_user(buf, buffer, sizeof(buf)); + if (rc != 0) + return -EFAULT; + if (strict_strtoul(buf, 0, &val) != 0) + return -EINVAL; + if (val != 0 && val != 1) + return -EINVAL; + callhome_enabled = val; + } + *count = len; + *ppos += len; + return 0; +} + +static struct ctl_table callhome_table[] = { + { + .procname = "callhome", + .mode = 0644, + .proc_handler = &proc_handler_callhome, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table kern_dir_table[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .maxlen = 0, + .mode = 0555, + .child = callhome_table, + }, + { .ctl_name = 0 } +}; + +/* + * Function used to transfer asynchronous notification + * records which waits for send completion + */ +static int sclp_async_send_wait(char *message) +{ + struct async_evbuf *evb; + int rc; + unsigned long flags; + + if (!callhome_enabled) + return 0; + sccb->evbuf.header.type = EVTYP_ASYNC; + sccb->evbuf.rtype = 0xA5; + sccb->evbuf.otype = 0x00; + evb = &sccb->evbuf; + request->command = SCLP_CMDW_WRITE_EVENT_DATA; + request->sccb = sccb; + request->status = SCLP_REQ_FILLED; + strncpy(sccb->evbuf.data, message, sizeof(sccb->evbuf.data)); + /* + * Retain Queue + * e.g. 5639CC140 500 Red Hat RHEL5 Linux for zSeries (RHEL AS) + */ + strncpy(sccb->evbuf.comp_id, "000000000", sizeof(sccb->evbuf.comp_id)); + sccb->evbuf.header.length = sizeof(sccb->evbuf); + sccb->header.length = sizeof(sccb->evbuf) + sizeof(sccb->header); + sccb->header.function_code = SCLP_NORMAL_WRITE; + rc = sclp_add_request(request); + if (rc) + return rc; + spin_lock_irqsave(&sclp_async_lock, flags); + while (request->status != SCLP_REQ_DONE && + request->status != SCLP_REQ_FAILED) { + sclp_sync_wait(); + } + spin_unlock_irqrestore(&sclp_async_lock, flags); + if (request->status != SCLP_REQ_DONE) + return -EIO; + rc = ((struct sclp_async_sccb *) + request->sccb)->header.response_code; + if (rc != 0x0020) + return -EIO; + if (evb->header.flags != 0x80) + return -EIO; + return rc; +} + +static int __init sclp_async_init(void) +{ + int rc; + + rc = sclp_register(&sclp_async_register); + if (rc) + return rc; + callhome_sysctl_header = register_sysctl_table(kern_dir_table); + if (!callhome_sysctl_header) { + rc = -ENOMEM; + goto out_sclp; + } + if (!(sclp_async_register.sclp_receive_mask & EVTYP_ASYNC_MASK)) { + rc = -EOPNOTSUPP; + goto out_sclp; + } + rc = -ENOMEM; + request = kzalloc(sizeof(struct sclp_req), GFP_KERNEL); + if (!request) + goto out_sys; + sccb = (struct sclp_async_sccb *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!sccb) + goto out_mem; + rc = atomic_notifier_chain_register(&panic_notifier_list, + &call_home_panic_nb); + if (rc) + goto out_mem; + + strncpy(nodename, init_utsname()->nodename, 64); + return 0; + +out_mem: + kfree(request); + free_page((unsigned long) sccb); +out_sys: + unregister_sysctl_table(callhome_sysctl_header); +out_sclp: + sclp_unregister(&sclp_async_register); + return rc; + +} +module_init(sclp_async_init); + +static void __exit sclp_async_exit(void) +{ + atomic_notifier_chain_unregister(&panic_notifier_list, + &call_home_panic_nb); + unregister_sysctl_table(callhome_sysctl_header); + sclp_unregister(&sclp_async_register); + free_page((unsigned long) sccb); + kfree(request); +} +module_exit(sclp_async_exit); + +MODULE_AUTHOR("Copyright IBM Corp. 2009"); +MODULE_AUTHOR("Hans-Joachim Picht <hans@linux.vnet.ibm.com>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SCLP Asynchronous Notification Records"); diff --git a/drivers/s390/char/tape_34xx.c b/drivers/s390/char/tape_34xx.c index 5a519fac37b..2fe45ff77b7 100644 --- a/drivers/s390/char/tape_34xx.c +++ b/drivers/s390/char/tape_34xx.c @@ -8,7 +8,7 @@ * Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#define KMSG_COMPONENT "tape" +#define KMSG_COMPONENT "tape_34xx" #include <linux/module.h> #include <linux/init.h> diff --git a/drivers/s390/char/tape_3590.c b/drivers/s390/char/tape_3590.c index 418f72dd39b..e4cc3aae916 100644 --- a/drivers/s390/char/tape_3590.c +++ b/drivers/s390/char/tape_3590.c @@ -8,7 +8,7 @@ * Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#define KMSG_COMPONENT "tape" +#define KMSG_COMPONENT "tape_3590" #include <linux/module.h> #include <linux/init.h> @@ -39,8 +39,6 @@ EXPORT_SYMBOL(TAPE_DBF_AREA); * - Read Alternate: implemented *******************************************************************/ -#define KMSG_COMPONENT "tape" - static const char *tape_3590_msg[TAPE_3590_MAX_MSG] = { [0x00] = "", [0x10] = "Lost Sense", diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index 47ff695255e..4cb9e70507a 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -302,8 +302,6 @@ tapeblock_revalidate_disk(struct gendisk *disk) if (!device->blk_data.medium_changed) return 0; - dev_info(&device->cdev->dev, "Determining the size of the recorded " - "area...\n"); rc = tape_mtop(device, MTFSFM, 1); if (rc) return rc; @@ -312,6 +310,8 @@ tapeblock_revalidate_disk(struct gendisk *disk) if (rc < 0) return rc; + pr_info("%s: Determining the size of the recorded area...\n", + dev_name(&device->cdev->dev)); DBF_LH(3, "Image file ends at %d\n", rc); nr_of_blks = rc; @@ -330,8 +330,8 @@ tapeblock_revalidate_disk(struct gendisk *disk) device->bof = rc; nr_of_blks -= rc; - dev_info(&device->cdev->dev, "The size of the recorded area is %i " - "blocks\n", nr_of_blks); + pr_info("%s: The size of the recorded area is %i blocks\n", + dev_name(&device->cdev->dev), nr_of_blks); set_capacity(device->blk_data.disk, nr_of_blks*(TAPEBLOCK_HSEC_SIZE/512)); @@ -366,8 +366,8 @@ tapeblock_open(struct block_device *bdev, fmode_t mode) if (device->required_tapemarks) { DBF_EVENT(2, "TBLOCK: missing tapemarks\n"); - dev_warn(&device->cdev->dev, "Opening the tape failed because" - " of missing end-of-file marks\n"); + pr_warning("%s: Opening the tape failed because of missing " + "end-of-file marks\n", dev_name(&device->cdev->dev)); rc = -EPERM; goto put_device; } diff --git a/drivers/s390/char/tape_core.c b/drivers/s390/char/tape_core.c index 1d420d94759..5cd31e07164 100644 --- a/drivers/s390/char/tape_core.c +++ b/drivers/s390/char/tape_core.c @@ -214,13 +214,15 @@ tape_med_state_set(struct tape_device *device, enum tape_medium_state newstate) switch(newstate){ case MS_UNLOADED: device->tape_generic_status |= GMT_DR_OPEN(~0); - dev_info(&device->cdev->dev, "The tape cartridge has been " - "successfully unloaded\n"); + if (device->medium_state == MS_LOADED) + pr_info("%s: The tape cartridge has been successfully " + "unloaded\n", dev_name(&device->cdev->dev)); break; case MS_LOADED: device->tape_generic_status &= ~GMT_DR_OPEN(~0); - dev_info(&device->cdev->dev, "A tape cartridge has been " - "mounted\n"); + if (device->medium_state == MS_UNLOADED) + pr_info("%s: A tape cartridge has been mounted\n", + dev_name(&device->cdev->dev)); break; default: // print nothing @@ -358,11 +360,11 @@ tape_generic_online(struct tape_device *device, out_char: tapechar_cleanup_device(device); +out_minor: + tape_remove_minor(device); out_discipline: device->discipline->cleanup_device(device); device->discipline = NULL; -out_minor: - tape_remove_minor(device); out: module_put(discipline->owner); return rc; @@ -654,8 +656,8 @@ tape_generic_remove(struct ccw_device *cdev) */ DBF_EVENT(3, "(%08x): Drive in use vanished!\n", device->cdev_id); - dev_warn(&device->cdev->dev, "A tape unit was detached" - " while in use\n"); + pr_warning("%s: A tape unit was detached while in " + "use\n", dev_name(&device->cdev->dev)); tape_state_set(device, TS_NOT_OPER); __tape_discard_requests(device); spin_unlock_irq(get_ccwdev_lock(device->cdev)); diff --git a/drivers/s390/char/tape_std.c b/drivers/s390/char/tape_std.c index 1a9420ba518..750354ad16e 100644 --- a/drivers/s390/char/tape_std.c +++ b/drivers/s390/char/tape_std.c @@ -68,7 +68,7 @@ tape_std_assign(struct tape_device *device) * to another host (actually this shouldn't happen but it does). * So we set up a timeout for this call. */ - init_timer(&timeout); + init_timer_on_stack(&timeout); timeout.function = tape_std_assign_timeout; timeout.data = (unsigned long) request; timeout.expires = jiffies + 2 * HZ; diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c index c20a4fe6da5..d1a142fa3eb 100644 --- a/drivers/s390/char/vmlogrdr.c +++ b/drivers/s390/char/vmlogrdr.c @@ -765,8 +765,10 @@ static int vmlogrdr_register_device(struct vmlogrdr_priv_t *priv) } else return -ENOMEM; ret = device_register(dev); - if (ret) + if (ret) { + put_device(dev); return ret; + } ret = sysfs_create_group(&dev->kobj, &vmlogrdr_attr_group); if (ret) { diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c index 31b902e94f7..77571b68539 100644 --- a/drivers/s390/char/vmur.c +++ b/drivers/s390/char/vmur.c @@ -1026,9 +1026,15 @@ static int __init ur_init(void) debug_set_level(vmur_dbf, 6); + vmur_class = class_create(THIS_MODULE, "vmur"); + if (IS_ERR(vmur_class)) { + rc = PTR_ERR(vmur_class); + goto fail_free_dbf; + } + rc = ccw_driver_register(&ur_driver); if (rc) - goto fail_free_dbf; + goto fail_class_destroy; rc = alloc_chrdev_region(&dev, 0, NUM_MINORS, "vmur"); if (rc) { @@ -1038,18 +1044,13 @@ static int __init ur_init(void) } ur_first_dev_maj_min = MKDEV(MAJOR(dev), 0); - vmur_class = class_create(THIS_MODULE, "vmur"); - if (IS_ERR(vmur_class)) { - rc = PTR_ERR(vmur_class); - goto fail_unregister_region; - } pr_info("%s loaded.\n", ur_banner); return 0; -fail_unregister_region: - unregister_chrdev_region(ur_first_dev_maj_min, NUM_MINORS); fail_unregister_driver: ccw_driver_unregister(&ur_driver); +fail_class_destroy: + class_destroy(vmur_class); fail_free_dbf: debug_unregister(vmur_dbf); return rc; @@ -1057,9 +1058,9 @@ fail_free_dbf: static void __exit ur_exit(void) { - class_destroy(vmur_class); unregister_chrdev_region(ur_first_dev_maj_min, NUM_MINORS); ccw_driver_unregister(&ur_driver); + class_destroy(vmur_class); debug_unregister(vmur_dbf); pr_info("%s unloaded.\n", ur_banner); } diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index 1bbae433fbd..c431198bdbc 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -275,7 +275,7 @@ struct zcore_header { u32 num_pages; u32 pad1; u64 tod; - cpuid_t cpu_id; + struct cpuid cpu_id; u32 arch_id; u32 volnr; u32 build_arch; diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile index adb3dd30152..fa4c9662f65 100644 --- a/drivers/s390/cio/Makefile +++ b/drivers/s390/cio/Makefile @@ -2,7 +2,7 @@ # Makefile for the S/390 common i/o drivers # -obj-y += airq.o blacklist.o chsc.o cio.o css.o chp.o idset.o isc.o scsw.o \ +obj-y += airq.o blacklist.o chsc.o cio.o css.o chp.o idset.o isc.o \ fcx.o itcw.o crw.o ccw_device-objs += device.o device_fsm.o device_ops.o ccw_device-objs += device_id.o device_pgid.o device_status.o diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 3e5f304ad88..40002830d48 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -417,7 +417,8 @@ int chp_new(struct chp_id chpid) if (ret) { CIO_MSG_EVENT(0, "Could not register chp%x.%02x: %d\n", chpid.cssid, chpid.id, ret); - goto out_free; + put_device(&chp->dev); + goto out; } ret = sysfs_create_group(&chp->dev.kobj, &chp_attr_group); if (ret) { diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h index 425e8f89a6c..37aa611d4ac 100644 --- a/drivers/s390/cio/chsc.h +++ b/drivers/s390/cio/chsc.h @@ -37,29 +37,6 @@ struct channel_path_desc { struct channel_path; -struct css_general_char { - u64 : 12; - u32 dynio : 1; /* bit 12 */ - u32 : 28; - u32 aif : 1; /* bit 41 */ - u32 : 3; - u32 mcss : 1; /* bit 45 */ - u32 fcs : 1; /* bit 46 */ - u32 : 1; - u32 ext_mb : 1; /* bit 48 */ - u32 : 7; - u32 aif_tdd : 1; /* bit 56 */ - u32 : 1; - u32 qebsm : 1; /* bit 58 */ - u32 : 8; - u32 aif_osa : 1; /* bit 67 */ - u32 : 14; - u32 cib : 1; /* bit 82 */ - u32 : 5; - u32 fcx : 1; /* bit 88 */ - u32 : 7; -}__attribute__((packed)); - struct css_chsc_char { u64 res; u64 : 20; @@ -72,7 +49,6 @@ struct css_chsc_char { u32 : 19; }__attribute__((packed)); -extern struct css_general_char css_general_characteristics; extern struct css_chsc_char css_chsc_characteristics; struct chsc_ssd_info { diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 5ec7789bd9d..138124fcfca 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -139,12 +139,11 @@ cio_start_key (struct subchannel *sch, /* subchannel structure */ __u8 lpm, /* logical path mask */ __u8 key) /* storage key */ { - char dbf_txt[15]; int ccode; union orb *orb; - CIO_TRACE_EVENT(4, "stIO"); - CIO_TRACE_EVENT(4, dev_name(&sch->dev)); + CIO_TRACE_EVENT(5, "stIO"); + CIO_TRACE_EVENT(5, dev_name(&sch->dev)); orb = &to_io_private(sch)->orb; memset(orb, 0, sizeof(union orb)); @@ -169,8 +168,7 @@ cio_start_key (struct subchannel *sch, /* subchannel structure */ ccode = ssch(sch->schid, orb); /* process condition code */ - sprintf(dbf_txt, "ccode:%d", ccode); - CIO_TRACE_EVENT(4, dbf_txt); + CIO_HEX_EVENT(5, &ccode, sizeof(ccode)); switch (ccode) { case 0: @@ -201,16 +199,14 @@ cio_start (struct subchannel *sch, struct ccw1 *cpa, __u8 lpm) int cio_resume (struct subchannel *sch) { - char dbf_txt[15]; int ccode; - CIO_TRACE_EVENT (4, "resIO"); + CIO_TRACE_EVENT(4, "resIO"); CIO_TRACE_EVENT(4, dev_name(&sch->dev)); ccode = rsch (sch->schid); - sprintf (dbf_txt, "ccode:%d", ccode); - CIO_TRACE_EVENT (4, dbf_txt); + CIO_HEX_EVENT(4, &ccode, sizeof(ccode)); switch (ccode) { case 0: @@ -235,13 +231,12 @@ cio_resume (struct subchannel *sch) int cio_halt(struct subchannel *sch) { - char dbf_txt[15]; int ccode; if (!sch) return -ENODEV; - CIO_TRACE_EVENT (2, "haltIO"); + CIO_TRACE_EVENT(2, "haltIO"); CIO_TRACE_EVENT(2, dev_name(&sch->dev)); /* @@ -249,8 +244,7 @@ cio_halt(struct subchannel *sch) */ ccode = hsch (sch->schid); - sprintf (dbf_txt, "ccode:%d", ccode); - CIO_TRACE_EVENT (2, dbf_txt); + CIO_HEX_EVENT(2, &ccode, sizeof(ccode)); switch (ccode) { case 0: @@ -270,13 +264,12 @@ cio_halt(struct subchannel *sch) int cio_clear(struct subchannel *sch) { - char dbf_txt[15]; int ccode; if (!sch) return -ENODEV; - CIO_TRACE_EVENT (2, "clearIO"); + CIO_TRACE_EVENT(2, "clearIO"); CIO_TRACE_EVENT(2, dev_name(&sch->dev)); /* @@ -284,8 +277,7 @@ cio_clear(struct subchannel *sch) */ ccode = csch (sch->schid); - sprintf (dbf_txt, "ccode:%d", ccode); - CIO_TRACE_EVENT (2, dbf_txt); + CIO_HEX_EVENT(2, &ccode, sizeof(ccode)); switch (ccode) { case 0: @@ -306,19 +298,17 @@ cio_clear(struct subchannel *sch) int cio_cancel (struct subchannel *sch) { - char dbf_txt[15]; int ccode; if (!sch) return -ENODEV; - CIO_TRACE_EVENT (2, "cancelIO"); + CIO_TRACE_EVENT(2, "cancelIO"); CIO_TRACE_EVENT(2, dev_name(&sch->dev)); ccode = xsch (sch->schid); - sprintf (dbf_txt, "ccode:%d", ccode); - CIO_TRACE_EVENT (2, dbf_txt); + CIO_HEX_EVENT(2, &ccode, sizeof(ccode)); switch (ccode) { case 0: /* success */ @@ -429,11 +419,10 @@ EXPORT_SYMBOL_GPL(cio_update_schib); */ int cio_enable_subchannel(struct subchannel *sch, u32 intparm) { - char dbf_txt[15]; int retry; int ret; - CIO_TRACE_EVENT (2, "ensch"); + CIO_TRACE_EVENT(2, "ensch"); CIO_TRACE_EVENT(2, dev_name(&sch->dev)); if (sch_is_pseudo_sch(sch)) @@ -460,8 +449,7 @@ int cio_enable_subchannel(struct subchannel *sch, u32 intparm) } else break; } - sprintf (dbf_txt, "ret:%d", ret); - CIO_TRACE_EVENT (2, dbf_txt); + CIO_HEX_EVENT(2, &ret, sizeof(ret)); return ret; } EXPORT_SYMBOL_GPL(cio_enable_subchannel); @@ -472,11 +460,10 @@ EXPORT_SYMBOL_GPL(cio_enable_subchannel); */ int cio_disable_subchannel(struct subchannel *sch) { - char dbf_txt[15]; int retry; int ret; - CIO_TRACE_EVENT (2, "dissch"); + CIO_TRACE_EVENT(2, "dissch"); CIO_TRACE_EVENT(2, dev_name(&sch->dev)); if (sch_is_pseudo_sch(sch)) @@ -495,8 +482,7 @@ int cio_disable_subchannel(struct subchannel *sch) } else break; } - sprintf (dbf_txt, "ret:%d", ret); - CIO_TRACE_EVENT (2, dbf_txt); + CIO_HEX_EVENT(2, &ret, sizeof(ret)); return ret; } EXPORT_SYMBOL_GPL(cio_disable_subchannel); @@ -578,11 +564,6 @@ int cio_validate_subchannel(struct subchannel *sch, struct subchannel_id schid) goto out; } mutex_init(&sch->reg_mutex); - /* Set a name for the subchannel */ - if (cio_is_console(schid)) - sch->dev.init_name = cio_get_console_sch_name(schid); - else - dev_set_name(&sch->dev, "0.%x.%04x", schid.ssid, schid.sch_no); /* * The first subchannel that is not-operational (ccode==3) @@ -686,7 +667,6 @@ void __irq_entry do_IRQ(struct pt_regs *regs) #ifdef CONFIG_CCW_CONSOLE static struct subchannel console_subchannel; -static char console_sch_name[10] = "0.x.xxxx"; static struct io_subchannel_private console_priv; static int console_subchannel_in_use; @@ -873,12 +853,6 @@ cio_get_console_subchannel(void) return &console_subchannel; } -const char *cio_get_console_sch_name(struct subchannel_id schid) -{ - snprintf(console_sch_name, 10, "0.%x.%04x", schid.ssid, schid.sch_no); - return (const char *)console_sch_name; -} - #endif static int __disable_subchannel_easy(struct subchannel_id schid, struct schib *schib) diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h index 5150fba742a..2e43558c704 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -133,15 +133,11 @@ extern int cio_is_console(struct subchannel_id); extern struct subchannel *cio_get_console_subchannel(void); extern spinlock_t * cio_get_console_lock(void); extern void *cio_get_console_priv(void); -extern const char *cio_get_console_sch_name(struct subchannel_id schid); -extern const char *cio_get_console_cdev_name(struct subchannel *sch); #else #define cio_is_console(schid) 0 #define cio_get_console_subchannel() NULL #define cio_get_console_lock() NULL #define cio_get_console_priv() NULL -#define cio_get_console_sch_name(schid) NULL -#define cio_get_console_cdev_name(sch) NULL #endif #endif diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 85d43c6bcb6..e995123fd80 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -152,24 +152,15 @@ css_alloc_subchannel(struct subchannel_id schid) } static void -css_free_subchannel(struct subchannel *sch) -{ - if (sch) { - /* Reset intparm to zeroes. */ - sch->config.intparm = 0; - cio_commit_config(sch); - kfree(sch->lock); - kfree(sch); - } -} - -static void css_subchannel_release(struct device *dev) { struct subchannel *sch; sch = to_subchannel(dev); if (!cio_is_console(sch->schid)) { + /* Reset intparm to zeroes. */ + sch->config.intparm = 0; + cio_commit_config(sch); kfree(sch->lock); kfree(sch); } @@ -180,6 +171,8 @@ static int css_sch_device_register(struct subchannel *sch) int ret; mutex_lock(&sch->reg_mutex); + dev_set_name(&sch->dev, "0.%x.%04x", sch->schid.ssid, + sch->schid.sch_no); ret = device_register(&sch->dev); mutex_unlock(&sch->reg_mutex); return ret; @@ -327,7 +320,7 @@ int css_probe_device(struct subchannel_id schid) return PTR_ERR(sch); ret = css_register_subchannel(sch); if (ret) - css_free_subchannel(sch); + put_device(&sch->dev); return ret; } @@ -644,7 +637,10 @@ __init_channel_subsystem(struct subchannel_id schid, void *data) * not working) so we do it now. This is true e.g. for the * console subchannel. */ - css_register_subchannel(sch); + if (css_register_subchannel(sch)) { + if (!cio_is_console(schid)) + put_device(&sch->dev); + } return 0; } @@ -661,8 +657,8 @@ css_generate_pgid(struct channel_subsystem *css, u32 tod_high) css->global_pgid.pgid_high.cpu_addr = 0; #endif } - css->global_pgid.cpu_id = ((cpuid_t *) __LC_CPUID)->ident; - css->global_pgid.cpu_model = ((cpuid_t *) __LC_CPUID)->machine; + css->global_pgid.cpu_id = S390_lowcore.cpu_id.ident; + css->global_pgid.cpu_model = S390_lowcore.cpu_id.machine; css->global_pgid.tod_high = tod_high; } @@ -920,8 +916,10 @@ init_channel_subsystem (void) goto out_device; } ret = device_register(&css->pseudo_subchannel->dev); - if (ret) + if (ret) { + put_device(&css->pseudo_subchannel->dev); goto out_file; + } } ret = register_reboot_notifier(&css_reboot_notifier); if (ret) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index d593bc76afe..0f95405c2c5 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -307,8 +307,11 @@ int ccw_device_is_orphan(struct ccw_device *cdev) static void ccw_device_unregister(struct ccw_device *cdev) { - if (test_and_clear_bit(1, &cdev->private->registered)) + if (test_and_clear_bit(1, &cdev->private->registered)) { device_del(&cdev->dev); + /* Release reference from device_initialize(). */ + put_device(&cdev->dev); + } } static void ccw_device_remove_orphan_cb(struct work_struct *work) @@ -319,7 +322,6 @@ static void ccw_device_remove_orphan_cb(struct work_struct *work) priv = container_of(work, struct ccw_device_private, kick_work); cdev = priv->cdev; ccw_device_unregister(cdev); - put_device(&cdev->dev); /* Release cdev reference for workqueue processing. */ put_device(&cdev->dev); } @@ -333,15 +335,15 @@ ccw_device_remove_disconnected(struct ccw_device *cdev) * Forced offline in disconnected state means * 'throw away device'. */ - /* Get cdev reference for workqueue processing. */ - if (!get_device(&cdev->dev)) - return; if (ccw_device_is_orphan(cdev)) { /* * Deregister ccw device. * Unfortunately, we cannot do this directly from the * attribute method. */ + /* Get cdev reference for workqueue processing. */ + if (!get_device(&cdev->dev)) + return; spin_lock_irqsave(cdev->ccwlock, flags); cdev->private->state = DEV_STATE_NOT_OPER; spin_unlock_irqrestore(cdev->ccwlock, flags); @@ -380,30 +382,34 @@ int ccw_device_set_offline(struct ccw_device *cdev) } cdev->online = 0; spin_lock_irq(cdev->ccwlock); - ret = ccw_device_offline(cdev); - if (ret == -ENODEV) { - if (cdev->private->state != DEV_STATE_NOT_OPER) { - cdev->private->state = DEV_STATE_OFFLINE; - dev_fsm_event(cdev, DEV_EVENT_NOTOPER); - } + /* Wait until a final state or DISCONNECTED is reached */ + while (!dev_fsm_final_state(cdev) && + cdev->private->state != DEV_STATE_DISCONNECTED) { spin_unlock_irq(cdev->ccwlock); - /* Give up reference from ccw_device_set_online(). */ - put_device(&cdev->dev); - return ret; + wait_event(cdev->private->wait_q, (dev_fsm_final_state(cdev) || + cdev->private->state == DEV_STATE_DISCONNECTED)); + spin_lock_irq(cdev->ccwlock); } + ret = ccw_device_offline(cdev); + if (ret) + goto error; spin_unlock_irq(cdev->ccwlock); - if (ret == 0) { - wait_event(cdev->private->wait_q, dev_fsm_final_state(cdev)); - /* Give up reference from ccw_device_set_online(). */ - put_device(&cdev->dev); - } else { - CIO_MSG_EVENT(0, "ccw_device_offline returned %d, " - "device 0.%x.%04x\n", - ret, cdev->private->dev_id.ssid, - cdev->private->dev_id.devno); - cdev->online = 1; - } - return ret; + wait_event(cdev->private->wait_q, (dev_fsm_final_state(cdev) || + cdev->private->state == DEV_STATE_DISCONNECTED)); + /* Give up reference from ccw_device_set_online(). */ + put_device(&cdev->dev); + return 0; + +error: + CIO_MSG_EVENT(0, "ccw_device_offline returned %d, device 0.%x.%04x\n", + ret, cdev->private->dev_id.ssid, + cdev->private->dev_id.devno); + cdev->private->state = DEV_STATE_OFFLINE; + dev_fsm_event(cdev, DEV_EVENT_NOTOPER); + spin_unlock_irq(cdev->ccwlock); + /* Give up reference from ccw_device_set_online(). */ + put_device(&cdev->dev); + return -ENODEV; } /** @@ -421,6 +427,7 @@ int ccw_device_set_offline(struct ccw_device *cdev) int ccw_device_set_online(struct ccw_device *cdev) { int ret; + int ret2; if (!cdev) return -ENODEV; @@ -444,28 +451,53 @@ int ccw_device_set_online(struct ccw_device *cdev) put_device(&cdev->dev); return ret; } - if (cdev->private->state != DEV_STATE_ONLINE) { + spin_lock_irq(cdev->ccwlock); + /* Check if online processing was successful */ + if ((cdev->private->state != DEV_STATE_ONLINE) && + (cdev->private->state != DEV_STATE_W4SENSE)) { + spin_unlock_irq(cdev->ccwlock); /* Give up online reference since onlining failed. */ put_device(&cdev->dev); return -ENODEV; } - if (!cdev->drv->set_online || cdev->drv->set_online(cdev) == 0) { - cdev->online = 1; - return 0; - } + spin_unlock_irq(cdev->ccwlock); + if (cdev->drv->set_online) + ret = cdev->drv->set_online(cdev); + if (ret) + goto rollback; + cdev->online = 1; + return 0; + +rollback: spin_lock_irq(cdev->ccwlock); - ret = ccw_device_offline(cdev); + /* Wait until a final state or DISCONNECTED is reached */ + while (!dev_fsm_final_state(cdev) && + cdev->private->state != DEV_STATE_DISCONNECTED) { + spin_unlock_irq(cdev->ccwlock); + wait_event(cdev->private->wait_q, (dev_fsm_final_state(cdev) || + cdev->private->state == DEV_STATE_DISCONNECTED)); + spin_lock_irq(cdev->ccwlock); + } + ret2 = ccw_device_offline(cdev); + if (ret2) + goto error; spin_unlock_irq(cdev->ccwlock); - if (ret == 0) - wait_event(cdev->private->wait_q, dev_fsm_final_state(cdev)); - else - CIO_MSG_EVENT(0, "ccw_device_offline returned %d, " - "device 0.%x.%04x\n", - ret, cdev->private->dev_id.ssid, - cdev->private->dev_id.devno); + wait_event(cdev->private->wait_q, (dev_fsm_final_state(cdev) || + cdev->private->state == DEV_STATE_DISCONNECTED)); /* Give up online reference since onlining failed. */ put_device(&cdev->dev); - return (ret == 0) ? -ENODEV : ret; + return ret; + +error: + CIO_MSG_EVENT(0, "rollback ccw_device_offline returned %d, " + "device 0.%x.%04x\n", + ret2, cdev->private->dev_id.ssid, + cdev->private->dev_id.devno); + cdev->private->state = DEV_STATE_OFFLINE; + spin_unlock_irq(cdev->ccwlock); + /* Give up online reference since onlining failed. */ + put_device(&cdev->dev); + return ret; } static int online_store_handle_offline(struct ccw_device *cdev) @@ -637,8 +669,12 @@ static int ccw_device_register(struct ccw_device *cdev) int ret; dev->bus = &ccw_bus_type; - - if ((ret = device_add(dev))) + ret = dev_set_name(&cdev->dev, "0.%x.%04x", cdev->private->dev_id.ssid, + cdev->private->dev_id.devno); + if (ret) + return ret; + ret = device_add(dev); + if (ret) return ret; set_bit(1, &cdev->private->registered); @@ -1024,9 +1060,6 @@ static void ccw_device_call_sch_unregister(struct work_struct *work) return; sch = to_subchannel(cdev->dev.parent); css_sch_device_unregister(sch); - /* Reset intparm to zeroes. */ - sch->config.intparm = 0; - cio_commit_config(sch); /* Release cdev reference for workqueue processing.*/ put_device(&cdev->dev); /* Release subchannel reference for local processing. */ @@ -1035,6 +1068,9 @@ static void ccw_device_call_sch_unregister(struct work_struct *work) void ccw_device_schedule_sch_unregister(struct ccw_device *cdev) { + /* Get cdev reference for workqueue processing. */ + if (!get_device(&cdev->dev)) + return; PREPARE_WORK(&cdev->private->kick_work, ccw_device_call_sch_unregister); queue_work(slow_path_wq, &cdev->private->kick_work); @@ -1055,9 +1091,6 @@ io_subchannel_recog_done(struct ccw_device *cdev) /* Device did not respond in time. */ case DEV_STATE_NOT_OPER: cdev->private->flags.recog_done = 1; - /* Remove device found not operational. */ - if (!get_device(&cdev->dev)) - break; ccw_device_schedule_sch_unregister(cdev); if (atomic_dec_and_test(&ccw_device_init_count)) wake_up(&ccw_device_init_wq); @@ -1095,13 +1128,6 @@ io_subchannel_recog(struct ccw_device *cdev, struct subchannel *sch) init_waitqueue_head(&priv->wait_q); init_timer(&priv->timer); - /* Set an initial name for the device. */ - if (cio_is_console(sch->schid)) - cdev->dev.init_name = cio_get_console_cdev_name(sch); - else - dev_set_name(&cdev->dev, "0.%x.%04x", - sch->schid.ssid, sch->schib.pmcw.dev); - /* Increase counter of devices currently in recognition. */ atomic_inc(&ccw_device_init_count); @@ -1171,8 +1197,8 @@ static void io_subchannel_irq(struct subchannel *sch) cdev = sch_get_cdev(sch); - CIO_TRACE_EVENT(3, "IRQ"); - CIO_TRACE_EVENT(3, dev_name(&sch->dev)); + CIO_TRACE_EVENT(6, "IRQ"); + CIO_TRACE_EVENT(6, dev_name(&sch->dev)); if (cdev) dev_fsm_event(cdev, DEV_EVENT_INTERRUPT); } @@ -1210,9 +1236,6 @@ static void io_subchannel_do_unreg(struct work_struct *work) sch = container_of(work, struct subchannel, work); css_sch_device_unregister(sch); - /* Reset intparm to zeroes. */ - sch->config.intparm = 0; - cio_commit_config(sch); put_device(&sch->dev); } @@ -1334,7 +1357,6 @@ io_subchannel_remove (struct subchannel *sch) cdev->private->state = DEV_STATE_NOT_OPER; spin_unlock_irqrestore(cdev->ccwlock, flags); ccw_device_unregister(cdev); - put_device(&cdev->dev); kfree(sch->private); sysfs_remove_group(&sch->dev.kobj, &io_subchannel_attr_group); return 0; @@ -1571,8 +1593,6 @@ static int purge_fn(struct device *dev, void *data) spin_unlock_irq(cdev->ccwlock); if (!unreg) goto out; - if (!get_device(&cdev->dev)) - goto out; CIO_MSG_EVENT(3, "ccw: purging 0.%x.%04x\n", priv->dev_id.ssid, priv->dev_id.devno); ccw_device_schedule_sch_unregister(cdev); @@ -1688,10 +1708,6 @@ static int io_subchannel_sch_event(struct subchannel *sch, int slow) spin_unlock_irqrestore(sch->lock, flags); css_sch_device_unregister(sch); spin_lock_irqsave(sch->lock, flags); - - /* Reset intparm to zeroes. */ - sch->config.intparm = 0; - cio_commit_config(sch); break; case REPROBE: ccw_device_trigger_reprobe(cdev); @@ -1712,7 +1728,6 @@ static int io_subchannel_sch_event(struct subchannel *sch, int slow) #ifdef CONFIG_CCW_CONSOLE static struct ccw_device console_cdev; -static char console_cdev_name[10] = "0.x.xxxx"; static struct ccw_device_private console_private; static int console_cdev_in_use; @@ -1796,13 +1811,6 @@ int ccw_device_force_console(void) return ccw_device_pm_restore(&console_cdev.dev); } EXPORT_SYMBOL_GPL(ccw_device_force_console); - -const char *cio_get_console_cdev_name(struct subchannel *sch) -{ - snprintf(console_cdev_name, 10, "0.%x.%04x", - sch->schid.ssid, sch->schib.pmcw.dev); - return (const char *)console_cdev_name; -} #endif /* @@ -2020,7 +2028,9 @@ static void __ccw_device_pm_restore(struct ccw_device *cdev) spin_unlock_irq(sch->lock); if (ret) { CIO_MSG_EVENT(0, "Couldn't start recognition for device " - "%s (ret=%d)\n", dev_name(&cdev->dev), ret); + "0.%x.%04x (ret=%d)\n", + cdev->private->dev_id.ssid, + cdev->private->dev_id.devno, ret); spin_lock_irq(sch->lock); cdev->private->state = DEV_STATE_DISCONNECTED; spin_unlock_irq(sch->lock); @@ -2083,8 +2093,9 @@ static int ccw_device_pm_restore(struct device *dev) } /* check if the device id has changed */ if (sch->schib.pmcw.dev != cdev->private->dev_id.devno) { - CIO_MSG_EVENT(0, "resume: sch %s: failed (devno changed from " - "%04x to %04x)\n", dev_name(&sch->dev), + CIO_MSG_EVENT(0, "resume: sch 0.%x.%04x: failed (devno " + "changed from %04x to %04x)\n", + sch->schid.ssid, sch->schid.sch_no, cdev->private->dev_id.devno, sch->schib.pmcw.dev); goto out_unreg_unlock; @@ -2117,8 +2128,9 @@ static int ccw_device_pm_restore(struct device *dev) if (cm_enabled) { ret = ccw_set_cmf(cdev, 1); if (ret) { - CIO_MSG_EVENT(2, "resume: cdev %s: cmf failed " - "(rc=%d)\n", dev_name(&cdev->dev), ret); + CIO_MSG_EVENT(2, "resume: cdev 0.%x.%04x: cmf failed " + "(rc=%d)\n", cdev->private->dev_id.ssid, + cdev->private->dev_id.devno, ret); ret = 0; } } diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 3db88c52d28..e728ce447f6 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -394,6 +394,13 @@ ccw_device_done(struct ccw_device *cdev, int state) ccw_device_schedule_sch_unregister(cdev); cdev->private->flags.donotify = 0; } + if (state == DEV_STATE_NOT_OPER) { + CIO_MSG_EVENT(0, "Device %04x gone on subchannel %04x\n", + cdev->private->dev_id.devno, sch->schid.sch_no); + if (!ccw_device_notify(cdev, CIO_GONE)) + ccw_device_schedule_sch_unregister(cdev); + cdev->private->flags.donotify = 0; + } if (cdev->private->flags.donotify) { cdev->private->flags.donotify = 0; @@ -731,6 +738,17 @@ static void ccw_device_generic_notoper(struct ccw_device *cdev, } /* + * Handle path verification event in offline state. + */ +static void ccw_device_offline_verify(struct ccw_device *cdev, + enum dev_event dev_event) +{ + struct subchannel *sch = to_subchannel(cdev->dev.parent); + + css_schedule_eval(sch->schid); +} + +/* * Handle path verification event. */ static void @@ -887,6 +905,8 @@ ccw_device_w4sense(struct ccw_device *cdev, enum dev_event dev_event) } call_handler: cdev->private->state = DEV_STATE_ONLINE; + /* In case sensing interfered with setting the device online */ + wake_up(&cdev->private->wait_q); /* Call the handler. */ if (ccw_device_call_handler(cdev) && cdev->private->flags.doverify) /* Start delayed path verification. */ @@ -1149,7 +1169,7 @@ fsm_func_t *dev_jumptable[NR_DEV_STATES][NR_DEV_EVENTS] = { [DEV_EVENT_NOTOPER] = ccw_device_generic_notoper, [DEV_EVENT_INTERRUPT] = ccw_device_offline_irq, [DEV_EVENT_TIMEOUT] = ccw_device_nop, - [DEV_EVENT_VERIFY] = ccw_device_nop, + [DEV_EVENT_VERIFY] = ccw_device_offline_verify, }, [DEV_STATE_VERIFY] = { [DEV_EVENT_NOTOPER] = ccw_device_generic_notoper, diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index b1241f8fae8..ff7748a9199 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -1,7 +1,7 @@ /* * linux/drivers/s390/cio/qdio.h * - * Copyright 2000,2008 IBM Corp. + * Copyright 2000,2009 IBM Corp. * Author(s): Utz Bacher <utz.bacher@de.ibm.com> * Jan Glauber <jang@linux.vnet.ibm.com> */ @@ -246,6 +246,7 @@ struct qdio_q { atomic_t nr_buf_used; struct qdio_irq *irq_ptr; + struct dentry *debugfs_q; struct tasklet_struct tasklet; /* error condition during a data transfer */ @@ -267,6 +268,7 @@ struct qdio_irq { struct qib qib; u32 *dsci; /* address of device state change indicator */ struct ccw_device *cdev; + struct dentry *debugfs_dev; unsigned long int_parm; struct subchannel_id schid; diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c index b8626d4df11..1b78f639ead 100644 --- a/drivers/s390/cio/qdio_debug.c +++ b/drivers/s390/cio/qdio_debug.c @@ -1,14 +1,12 @@ /* * drivers/s390/cio/qdio_debug.c * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008,2009 * * Author: Jan Glauber (jang@linux.vnet.ibm.com) */ -#include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/debugfs.h> -#include <asm/qdio.h> #include <asm/debug.h> #include "qdio_debug.h" #include "qdio.h" @@ -17,10 +15,7 @@ debug_info_t *qdio_dbf_setup; debug_info_t *qdio_dbf_error; static struct dentry *debugfs_root; -#define MAX_DEBUGFS_QUEUES 32 -static struct dentry *debugfs_queues[MAX_DEBUGFS_QUEUES] = { NULL }; -static DEFINE_MUTEX(debugfs_mutex); -#define QDIO_DEBUGFS_NAME_LEN 40 +#define QDIO_DEBUGFS_NAME_LEN 10 void qdio_allocate_dbf(struct qdio_initialize *init_data, struct qdio_irq *irq_ptr) @@ -130,20 +125,6 @@ static int qstat_seq_open(struct inode *inode, struct file *filp) filp->f_path.dentry->d_inode->i_private); } -static void remove_debugfs_entry(struct qdio_q *q) -{ - int i; - - for (i = 0; i < MAX_DEBUGFS_QUEUES; i++) { - if (!debugfs_queues[i]) - continue; - if (debugfs_queues[i]->d_inode->i_private == q) { - debugfs_remove(debugfs_queues[i]); - debugfs_queues[i] = NULL; - } - } -} - static struct file_operations debugfs_fops = { .owner = THIS_MODULE, .open = qstat_seq_open, @@ -155,22 +136,15 @@ static struct file_operations debugfs_fops = { static void setup_debugfs_entry(struct qdio_q *q, struct ccw_device *cdev) { - int i = 0; char name[QDIO_DEBUGFS_NAME_LEN]; - while (debugfs_queues[i] != NULL) { - i++; - if (i >= MAX_DEBUGFS_QUEUES) - return; - } - snprintf(name, QDIO_DEBUGFS_NAME_LEN, "%s_%s_%d", - dev_name(&cdev->dev), + snprintf(name, QDIO_DEBUGFS_NAME_LEN, "%s_%d", q->is_input_q ? "input" : "output", q->nr); - debugfs_queues[i] = debugfs_create_file(name, S_IFREG | S_IRUGO | S_IWUSR, - debugfs_root, q, &debugfs_fops); - if (IS_ERR(debugfs_queues[i])) - debugfs_queues[i] = NULL; + q->debugfs_q = debugfs_create_file(name, S_IFREG | S_IRUGO | S_IWUSR, + q->irq_ptr->debugfs_dev, q, &debugfs_fops); + if (IS_ERR(q->debugfs_q)) + q->debugfs_q = NULL; } void qdio_setup_debug_entries(struct qdio_irq *irq_ptr, struct ccw_device *cdev) @@ -178,12 +152,14 @@ void qdio_setup_debug_entries(struct qdio_irq *irq_ptr, struct ccw_device *cdev) struct qdio_q *q; int i; - mutex_lock(&debugfs_mutex); + irq_ptr->debugfs_dev = debugfs_create_dir(dev_name(&cdev->dev), + debugfs_root); + if (IS_ERR(irq_ptr->debugfs_dev)) + irq_ptr->debugfs_dev = NULL; for_each_input_queue(irq_ptr, q, i) setup_debugfs_entry(q, cdev); for_each_output_queue(irq_ptr, q, i) setup_debugfs_entry(q, cdev); - mutex_unlock(&debugfs_mutex); } void qdio_shutdown_debug_entries(struct qdio_irq *irq_ptr, struct ccw_device *cdev) @@ -191,17 +167,16 @@ void qdio_shutdown_debug_entries(struct qdio_irq *irq_ptr, struct ccw_device *cd struct qdio_q *q; int i; - mutex_lock(&debugfs_mutex); for_each_input_queue(irq_ptr, q, i) - remove_debugfs_entry(q); + debugfs_remove(q->debugfs_q); for_each_output_queue(irq_ptr, q, i) - remove_debugfs_entry(q); - mutex_unlock(&debugfs_mutex); + debugfs_remove(q->debugfs_q); + debugfs_remove(irq_ptr->debugfs_dev); } int __init qdio_debug_init(void) { - debugfs_root = debugfs_create_dir("qdio_queues", NULL); + debugfs_root = debugfs_create_dir("qdio", NULL); qdio_dbf_setup = debug_register("qdio_setup", 16, 1, 16); debug_register_view(qdio_dbf_setup, &debug_hex_ascii_view); diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 0038750ad94..9aef402a5f1 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -798,8 +798,10 @@ static void __tiqdio_inbound_processing(struct qdio_q *q) if (!qdio_inbound_q_done(q)) { qdio_perf_stat_inc(&perf_stats.thinint_inbound_loop); - if (likely(q->irq_ptr->state != QDIO_IRQ_STATE_STOPPED)) + if (likely(q->irq_ptr->state != QDIO_IRQ_STATE_STOPPED)) { tasklet_schedule(&q->tasklet); + return; + } } qdio_stop_polling(q); diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index ed3dcdea7fe..090b32a339c 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -648,7 +648,9 @@ static int ap_bus_suspend(struct device *dev, pm_message_t state) /* Poll on the device until all requests are finished. */ do { flags = 0; + spin_lock_bh(&ap_dev->lock); __ap_poll_device(ap_dev, &flags); + spin_unlock_bh(&ap_dev->lock); } while ((flags & 1) || (flags & 2)); ap_device_remove(dev); @@ -1109,12 +1111,15 @@ static void ap_scan_bus(struct work_struct *unused) ap_dev->device.bus = &ap_bus_type; ap_dev->device.parent = ap_root_device; - dev_set_name(&ap_dev->device, "card%02x", - AP_QID_DEVICE(ap_dev->qid)); + if (dev_set_name(&ap_dev->device, "card%02x", + AP_QID_DEVICE(ap_dev->qid))) { + kfree(ap_dev); + continue; + } ap_dev->device.release = ap_device_release; rc = device_register(&ap_dev->device); if (rc) { - kfree(ap_dev); + put_device(&ap_dev->device); continue; } /* Add device attributes. */ @@ -1407,14 +1412,12 @@ static void ap_reset(struct ap_device *ap_dev) static int __ap_poll_device(struct ap_device *ap_dev, unsigned long *flags) { - spin_lock(&ap_dev->lock); if (!ap_dev->unregistered) { if (ap_poll_queue(ap_dev, flags)) ap_dev->unregistered = 1; if (ap_dev->reset == AP_RESET_DO) ap_reset(ap_dev); } - spin_unlock(&ap_dev->lock); return 0; } @@ -1441,7 +1444,9 @@ static void ap_poll_all(unsigned long dummy) flags = 0; spin_lock(&ap_device_list_lock); list_for_each_entry(ap_dev, &ap_device_list, list) { + spin_lock(&ap_dev->lock); __ap_poll_device(ap_dev, &flags); + spin_unlock(&ap_dev->lock); } spin_unlock(&ap_device_list_lock); } while (flags & 1); @@ -1487,7 +1492,9 @@ static int ap_poll_thread(void *data) flags = 0; spin_lock_bh(&ap_device_list_lock); list_for_each_entry(ap_dev, &ap_device_list, list) { + spin_lock(&ap_dev->lock); __ap_poll_device(ap_dev, &flags); + spin_unlock(&ap_dev->lock); } spin_unlock_bh(&ap_device_list_lock); } diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index e38e5d306fa..2930fc763ac 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -403,10 +403,14 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) return len; } -void __init s390_virtio_console_init(void) +static int __init s390_virtio_console_init(void) { - virtio_cons_early_init(early_put_chars); + if (!MACHINE_IS_KVM) + return -ENODEV; + return virtio_cons_early_init(early_put_chars); } +console_initcall(s390_virtio_console_init); + /* * We do this after core stuff, but before the drivers. diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c index 8c36eafcfbf..87dff11061b 100644 --- a/drivers/s390/net/netiucv.c +++ b/drivers/s390/net/netiucv.c @@ -1839,9 +1839,10 @@ static int netiucv_register_device(struct net_device *ndev) return -ENOMEM; ret = device_register(dev); - - if (ret) + if (ret) { + put_device(dev); return ret; + } ret = netiucv_add_files(dev); if (ret) goto out_unreg; @@ -2226,8 +2227,10 @@ static int __init netiucv_init(void) netiucv_dev->release = (void (*)(struct device *))kfree; netiucv_dev->driver = &netiucv_driver; rc = device_register(netiucv_dev); - if (rc) + if (rc) { + put_device(netiucv_dev); goto out_driver; + } netiucv_banner(); return rc; diff --git a/drivers/s390/net/smsgiucv.c b/drivers/s390/net/smsgiucv.c index e76a320d373..102000d1af6 100644 --- a/drivers/s390/net/smsgiucv.c +++ b/drivers/s390/net/smsgiucv.c @@ -219,13 +219,13 @@ static int __init smsg_init(void) smsg_dev->driver = &smsg_driver; rc = device_register(smsg_dev); if (rc) - goto out_free_dev; + goto out_put; cpcmd("SET SMSG IUCV", NULL, 0, NULL); return 0; -out_free_dev: - kfree(smsg_dev); +out_put: + put_device(smsg_dev); out_free_path: iucv_path_free(smsg_path); smsg_path = NULL; diff --git a/drivers/scsi/cxgb3i/cxgb3i_init.c b/drivers/scsi/cxgb3i/cxgb3i_init.c index 042d9bce991..d0ab23a5835 100644 --- a/drivers/scsi/cxgb3i/cxgb3i_init.c +++ b/drivers/scsi/cxgb3i/cxgb3i_init.c @@ -26,7 +26,7 @@ MODULE_VERSION(DRV_MODULE_VERSION); static void open_s3_dev(struct t3cdev *); static void close_s3_dev(struct t3cdev *); -static void s3_err_handler(struct t3cdev *tdev, u32 status, u32 error); +static void s3_event_handler(struct t3cdev *tdev, u32 event, u32 port); static cxgb3_cpl_handler_func cxgb3i_cpl_handlers[NUM_CPL_CMDS]; static struct cxgb3_client t3c_client = { @@ -34,7 +34,7 @@ static struct cxgb3_client t3c_client = { .handlers = cxgb3i_cpl_handlers, .add = open_s3_dev, .remove = close_s3_dev, - .err_handler = s3_err_handler, + .event_handler = s3_event_handler, }; /** @@ -66,16 +66,16 @@ static void close_s3_dev(struct t3cdev *t3dev) cxgb3i_ddp_cleanup(t3dev); } -static void s3_err_handler(struct t3cdev *tdev, u32 status, u32 error) +static void s3_event_handler(struct t3cdev *tdev, u32 event, u32 port) { struct cxgb3i_adapter *snic = cxgb3i_adapter_find_by_tdev(tdev); - cxgb3i_log_info("snic 0x%p, tdev 0x%p, status 0x%x, err 0x%x.\n", - snic, tdev, status, error); + cxgb3i_log_info("snic 0x%p, tdev 0x%p, event 0x%x, port 0x%x.\n", + snic, tdev, event, port); if (!snic) return; - switch (status) { + switch (event) { case OFFLOAD_STATUS_DOWN: snic->flags |= CXGB3I_ADAPTER_FLAG_RESET; break; diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c index 9d7c99394ec..640f65c6ef8 100644 --- a/drivers/staging/comedi/comedi_fops.c +++ b/drivers/staging/comedi/comedi_fops.c @@ -1752,12 +1752,12 @@ static int comedi_open(struct inode *inode, struct file *file) mutex_lock(&dev->mutex); if (dev->attached) goto ok; - if (!capable(CAP_SYS_MODULE) && dev->in_request_module) { + if (!capable(CAP_NET_ADMIN) && dev->in_request_module) { DPRINTK("in request module\n"); mutex_unlock(&dev->mutex); return -ENODEV; } - if (capable(CAP_SYS_MODULE) && dev->in_request_module) + if (capable(CAP_NET_ADMIN) && dev->in_request_module) goto ok; dev->in_request_module = 1; @@ -1770,8 +1770,8 @@ static int comedi_open(struct inode *inode, struct file *file) dev->in_request_module = 0; - if (!dev->attached && !capable(CAP_SYS_MODULE)) { - DPRINTK("not attached and not CAP_SYS_MODULE\n"); + if (!dev->attached && !capable(CAP_NET_ADMIN)) { + DPRINTK("not attached and not CAP_NET_ADMIN\n"); mutex_unlock(&dev->mutex); return -ENODEV; } diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index 7b605795b77..e63c9bea6c5 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -1950,14 +1950,7 @@ static int pohmelfs_get_sb(struct file_system_type *fs_type, */ static void pohmelfs_kill_super(struct super_block *sb) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .range_start = 0, - .range_end = LLONG_MAX, - .nr_to_write = LONG_MAX, - }; - generic_sync_sb_inodes(sb, &wbc); - + sync_inodes_sb(sb); kill_anon_super(sb); } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b7c1603cd4b..7c1e65d5487 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, } } - /* - * Now fill out the bss section. First pad the last page up - * to the page boundary, and then perform a mmap to make sure - * that there are zero-mapped pages up to and including the - * last bss page. - */ - if (padzero(elf_bss)) { - error = -EFAULT; - goto out_close; - } + if (last_bss > elf_bss) { + /* + * Now fill out the bss section. First pad the last page up + * to the page boundary, and then perform a mmap to make sure + * that there are zero-mapped pages up to and including the + * last bss page. + */ + if (padzero(elf_bss)) { + error = -EFAULT; + goto out_close; + } - /* What we have mapped so far */ - elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); + /* What we have mapped so far */ + elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); - /* Map the last of the bss segment */ - if (last_bss > elf_bss) { + /* Map the last of the bss segment */ down_write(¤t->mm->mmap_sem); error = do_brk(elf_bss, last_bss - elf_bss); up_write(¤t->mm->mmap_sem); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e83be2e4602..15831d5c736 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; + bdi->name = "btrfs"; bdi->capabilities = BDI_CAP_MAP_COPY; err = bdi_init(bdi); if (err) diff --git a/fs/buffer.c b/fs/buffer.c index 28f320fac4d..90a98865b0c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -281,7 +281,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_pdflush(1024); + wakeup_flusher_threads(1024); yield(); for_each_online_node(nid) { diff --git a/fs/char_dev.c b/fs/char_dev.c index 2f18c1e4e30..3cbc57f932d 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -31,6 +31,7 @@ * - no readahead or I/O queue unplugging required */ struct backing_dev_info directly_mappable_cdev_bdi = { + .name = "char", .capabilities = ( #ifdef CONFIG_MMU /* permit private copies of the data to be taken */ diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 4921e7426d9..a2f746066c5 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = { }; static struct backing_dev_info configfs_backing_dev_info = { + .name = "configfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index d636e1297ca..a63d44256a7 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -static int +int ext2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); @@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext2_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext2_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index ecefe478898..3ff6cbb9ac4 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_permission (struct inode *, int); +extern int ext2_check_acl (struct inode *, int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); #else #include <linux/sched.h> -#define ext2_permission NULL +#define ext2_check_acl NULL #define ext2_get_acl NULL #define ext2_set_acl NULL diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 2b9e47dc922..a2f3afd1a1c 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 78d9b925fc9..23701f289e9 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -400,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, }; const struct inode_operations ext2_special_inode_operations = { @@ -411,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, }; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index e167bae37ef..c9b0df376b5 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -static int +int ext3_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); @@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext3_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext3_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. * diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 07d15a3a596..597334626de 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_permission (struct inode *, int); +extern int ext3_check_acl (struct inode *, int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext3_permission NULL +#define ext3_check_acl NULL static inline int ext3_acl_chmod(struct inode *inode) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 5b49704b231..29925321478 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -137,7 +137,7 @@ const struct inode_operations ext3_file_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 6ff7b973023..aad6400c9b7 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, }; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index f6d8967149c..0df88b2a69b 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -static int +int ext4_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); @@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext4_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext4_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 949789d2bba..9d843d5deac 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_permission(struct inode *, int); +extern int ext4_check_acl(struct inode *, int); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext4_permission NULL +#define ext4_check_acl NULL static inline int ext4_acl_chmod(struct inode *inode) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3f1873fef1c..27f3c5354c0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -207,7 +207,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index de04013d16f..114abe5d2c1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2536,7 +2536,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, .fiemap = ext4_fiemap, }; @@ -2548,5 +2548,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, }; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c54226be529..da86ef58e42 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -19,171 +19,223 @@ #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> #include "internal.h" +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) -/** - * writeback_acquire - attempt to get exclusive writeback access to a device - * @bdi: the device's backing_dev_info structure - * - * It is a waste of resources to have more than one pdflush thread blocked on - * a single request queue. Exclusion at the request_queue level is obtained - * via a flag in the request_queue's backing_dev_info.state. - * - * Non-request_queue-backed address_spaces will share default_backing_dev_info, - * unless they implement their own. Which is somewhat inefficient, as this - * may prevent concurrent writeback against multiple devices. +/* + * We don't actually have pdflush, but this one is exported though /proc... */ -static int writeback_acquire(struct backing_dev_info *bdi) +int nr_pdflush_threads; + +/* + * Work items for the bdi_writeback threads + */ +struct bdi_work { + struct list_head list; + struct list_head wait_list; + struct rcu_head rcu_head; + + unsigned long seen; + atomic_t pending; + + struct super_block *sb; + unsigned long nr_pages; + enum writeback_sync_modes sync_mode; + + unsigned long state; +}; + +enum { + WS_USED_B = 0, + WS_ONSTACK_B, +}; + +#define WS_USED (1 << WS_USED_B) +#define WS_ONSTACK (1 << WS_ONSTACK_B) + +static inline bool bdi_work_on_stack(struct bdi_work *work) { - return !test_and_set_bit(BDI_pdflush, &bdi->state); + return test_bit(WS_ONSTACK_B, &work->state); +} + +static inline void bdi_work_init(struct bdi_work *work, + struct writeback_control *wbc) +{ + INIT_RCU_HEAD(&work->rcu_head); + work->sb = wbc->sb; + work->nr_pages = wbc->nr_to_write; + work->sync_mode = wbc->sync_mode; + work->state = WS_USED; +} + +static inline void bdi_work_init_on_stack(struct bdi_work *work, + struct writeback_control *wbc) +{ + bdi_work_init(work, wbc); + work->state |= WS_ONSTACK; } /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. * - * Determine whether there is writeback in progress against a backing device. + * Determine whether there is writeback waiting to be handled against a + * backing device. */ int writeback_in_progress(struct backing_dev_info *bdi) { - return test_bit(BDI_pdflush, &bdi->state); + return !list_empty(&bdi->work_list); } -/** - * writeback_release - relinquish exclusive writeback access against a device. - * @bdi: the device's backing_dev_info structure - */ -static void writeback_release(struct backing_dev_info *bdi) +static void bdi_work_clear(struct bdi_work *work) { - BUG_ON(!writeback_in_progress(bdi)); - clear_bit(BDI_pdflush, &bdi->state); + clear_bit(WS_USED_B, &work->state); + smp_mb__after_clear_bit(); + wake_up_bit(&work->state, WS_USED_B); } -static noinline void block_dump___mark_inode_dirty(struct inode *inode) +static void bdi_work_free(struct rcu_head *head) { - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { - struct dentry *dentry; - const char *name = "?"; + struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); - dentry = d_find_alias(inode); - if (dentry) { - spin_lock(&dentry->d_lock); - name = (const char *) dentry->d_name.name; - } - printk(KERN_DEBUG - "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, task_pid_nr(current), inode->i_ino, - name, inode->i_sb->s_id); - if (dentry) { - spin_unlock(&dentry->d_lock); - dput(dentry); - } - } + if (!bdi_work_on_stack(work)) + kfree(work); + else + bdi_work_clear(work); } -/** - * __mark_inode_dirty - internal function - * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. - * - * Put the inode on the super block's dirty list. - * - * CAREFUL! We mark it dirty unconditionally, but move it onto the - * dirty list only if it is hashed or if it refers to a blockdev. - * If it was not hashed, it will never be added to the dirty list - * even if it is later hashed, as it will have been marked dirty already. - * - * In short, make sure you hash any inodes _before_ you start marking - * them dirty. - * - * This function *must* be atomic for the I_DIRTY_PAGES case - - * set_page_dirty() is called under spinlock in several places. - * - * Note that for blockdevs, inode->dirtied_when represents the dirtying time of - * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of - * the kernel-internal blockdev inode represents the dirtying time of the - * blockdev's pages. This is why for I_DIRTY_PAGES we always use - * page->mapping->host, so the page-dirtying time is recorded in the internal - * blockdev inode. - */ -void __mark_inode_dirty(struct inode *inode, int flags) +static void wb_work_complete(struct bdi_work *work) { - struct super_block *sb = inode->i_sb; + const enum writeback_sync_modes sync_mode = work->sync_mode; /* - * Don't do this for I_DIRTY_PAGES - that doesn't actually - * dirty the inode itself + * For allocated work, we can clear the done/seen bit right here. + * For on-stack work, we need to postpone both the clear and free + * to after the RCU grace period, since the stack could be invalidated + * as soon as bdi_work_clear() has done the wakeup. */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode); - } + if (!bdi_work_on_stack(work)) + bdi_work_clear(work); + if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work)) + call_rcu(&work->rcu_head, bdi_work_free); +} +static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) +{ /* - * make sure that changes are seen by all cpus before we test i_state - * -- mikulas + * The caller has retrieved the work arguments from this work, + * drop our reference. If this is the last ref, delete and free it */ - smp_mb(); + if (atomic_dec_and_test(&work->pending)) { + struct backing_dev_info *bdi = wb->bdi; - /* avoid the locking if we can */ - if ((inode->i_state & flags) == flags) - return; + spin_lock(&bdi->wb_lock); + list_del_rcu(&work->list); + spin_unlock(&bdi->wb_lock); - if (unlikely(block_dump)) - block_dump___mark_inode_dirty(inode); - - spin_lock(&inode_lock); - if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; + wb_work_complete(work); + } +} - inode->i_state |= flags; +static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) +{ + if (work) { + work->seen = bdi->wb_mask; + BUG_ON(!work->seen); + atomic_set(&work->pending, bdi->wb_cnt); + BUG_ON(!bdi->wb_cnt); /* - * If the inode is being synced, just update its dirty state. - * The unlocker will place the inode on the appropriate - * superblock list, based upon its state. + * Make sure stores are seen before it appears on the list */ - if (inode->i_state & I_SYNC) - goto out; + smp_mb(); - /* - * Only add valid (hashed) inodes to the superblock's - * dirty list. Add blockdev inodes as well. - */ - if (!S_ISBLK(inode->i_mode)) { - if (hlist_unhashed(&inode->i_hash)) - goto out; - } - if (inode->i_state & (I_FREEING|I_CLEAR)) - goto out; + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&work->list, &bdi->work_list); + spin_unlock(&bdi->wb_lock); + } + + /* + * If the default thread isn't there, make sure we add it. When + * it gets created and wakes up, we'll run this work. + */ + if (unlikely(list_empty_careful(&bdi->wb_list))) + wake_up_process(default_backing_dev_info.wb.task); + else { + struct bdi_writeback *wb = &bdi->wb; /* - * If the inode was already on s_dirty/s_io/s_more_io, don't - * reposition it (that would break s_dirty time-ordering). + * If we failed allocating the bdi work item, wake up the wb + * thread always. As a safety precaution, it'll flush out + * everything */ - if (!was_dirty) { - inode->dirtied_when = jiffies; - list_move(&inode->i_list, &sb->s_dirty); - } + if (!wb_has_dirty_io(wb)) { + if (work) + wb_clear_pending(wb, work); + } else if (wb->task) + wake_up_process(wb->task); } -out: - spin_unlock(&inode_lock); } -EXPORT_SYMBOL(__mark_inode_dirty); +/* + * Used for on-stack allocated work items. The caller needs to wait until + * the wb threads have acked the work before it's safe to continue. + */ +static void bdi_wait_on_work_clear(struct bdi_work *work) +{ + wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); +} -static int write_inode(struct inode *inode, int sync) +static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc) { - if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, sync); - return 0; + struct bdi_work *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) + bdi_work_init(work, wbc); + + return work; +} + +void bdi_start_writeback(struct writeback_control *wbc) +{ + const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; + struct bdi_work work_stack, *work = NULL; + + if (!must_wait) + work = bdi_alloc_work(wbc); + + if (!work) { + work = &work_stack; + bdi_work_init_on_stack(work, wbc); + } + + bdi_queue_work(wbc->bdi, work); + + /* + * If the sync mode is WB_SYNC_ALL, block waiting for the work to + * complete. If not, we only need to wait for the work to be started, + * if we allocated it on-stack. We use the same mechanism, if the + * wait bit is set in the bdi_work struct, then threads will not + * clear pending until after they are done. + * + * Note that work == &work_stack if must_wait is true, so we don't + * need to do call_rcu() here ever, since the completion path will + * have done that for us. + */ + if (must_wait || work == &work_stack) { + bdi_wait_on_work_clear(work); + if (work != &work_stack) + call_rcu(&work->rcu_head, bdi_work_free); + } } /* @@ -191,31 +243,32 @@ static int write_inode(struct inode *inode, int sync) * furthest end of its superblock's dirty-inode list. * * Before stamping the inode's ->dirtied_when, we check to see whether it is - * already the most-recently-dirtied inode on the s_dirty list. If that is + * already the most-recently-dirtied inode on the b_dirty list. If that is * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ static void redirty_tail(struct inode *inode) { - struct super_block *sb = inode->i_sb; + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - if (!list_empty(&sb->s_dirty)) { - struct inode *tail_inode; + if (!list_empty(&wb->b_dirty)) { + struct inode *tail; - tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); - if (time_before(inode->dirtied_when, - tail_inode->dirtied_when)) + tail = list_entry(wb->b_dirty.next, struct inode, i_list); + if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_list, &sb->s_dirty); + list_move(&inode->i_list, &wb->b_dirty); } /* - * requeue inode for re-scanning after sb->s_io list is exhausted. + * requeue inode for re-scanning after bdi->b_io list is exhausted. */ static void requeue_io(struct inode *inode) { - list_move(&inode->i_list, &inode->i_sb->s_more_io); + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + + list_move(&inode->i_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -262,20 +315,18 @@ static void move_expired_inodes(struct list_head *delaying_queue, /* * Queue all expired dirty inodes for io, eldest first. */ -static void queue_io(struct super_block *sb, - unsigned long *older_than_this) +static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - list_splice_init(&sb->s_more_io, sb->s_io.prev); - move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); + list_splice_init(&wb->b_more_io, wb->b_io.prev); + move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } -int sb_has_dirty_inodes(struct super_block *sb) +static int write_inode(struct inode *inode, int sync) { - return !list_empty(&sb->s_dirty) || - !list_empty(&sb->s_io) || - !list_empty(&sb->s_more_io); + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) + return inode->i_sb->s_op->write_inode(inode, sync); + return 0; } -EXPORT_SYMBOL(sb_has_dirty_inodes); /* * Wait for writeback on an inode to complete. @@ -322,11 +373,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (inode->i_state & I_SYNC) { /* * If this inode is locked for writeback and we are not doing - * writeback-for-data-integrity, move it to s_more_io so that + * writeback-for-data-integrity, move it to b_more_io so that * writeback can proceed with the other inodes on s_io. * * We'll have another go at writing back this inode when we - * completed a full scan of s_io. + * completed a full scan of b_io. */ if (!wait) { requeue_io(inode); @@ -371,11 +422,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. Redirty - * the inode; Move it from s_io onto s_more_io/s_dirty. + * the inode; Move it from b_io onto b_more_io/b_dirty. */ /* * akpm: if the caller was the kupdate function we put - * this inode at the head of s_dirty so it gets first + * this inode at the head of b_dirty so it gets first * consideration. Otherwise, move it to the tail, for * the reasons described there. I'm not really sure * how much sense this makes. Presumably I had a good @@ -385,7 +436,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->for_kupdate) { /* * For the kupdate function we move the inode - * to s_more_io so it will get more writeout as + * to b_more_io so it will get more writeout as * soon as the queue becomes uncongested. */ inode->i_state |= I_DIRTY_PAGES; @@ -434,50 +485,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Write out a superblock's list of dirty inodes. A wait will be performed - * upon no inodes, all inodes or the final one, depending upon sync_mode. - * - * If older_than_this is non-NULL, then only write out inodes which - * had their first dirtying at a time earlier than *older_than_this. - * - * If we're a pdflush thread, then implement pdflush collision avoidance - * against the entire list. + * For WB_SYNC_NONE writeback, the caller does not have the sb pinned + * before calling writeback. So make sure that we do pin it, so it doesn't + * go away while we are writing inodes from it. * - * If `bdi' is non-zero then we're being asked to writeback a specific queue. - * This function assumes that the blockdev superblock's inodes are backed by - * a variety of queues, so all inodes are searched. For other superblocks, - * assume that all inodes are backed by the same queue. - * - * FIXME: this linear search could get expensive with many fileystems. But - * how to fix? We need to go from an address_space to all inodes which share - * a queue with that address_space. (Easy: have a global "dirty superblocks" - * list). - * - * The inodes to be written are parked on sb->s_io. They are moved back onto - * sb->s_dirty as they are selected for writing. This way, none can be missed - * on the writer throttling path, and we get decent balancing between many - * throttled threads: we don't want them all piling up on inode_sync_wait. + * Returns 0 if the super was successfully pinned (or pinning wasn't needed), + * 1 if we failed. */ -void generic_sync_sb_inodes(struct super_block *sb, +static int pin_sb_for_writeback(struct writeback_control *wbc, + struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + /* + * Caller must already hold the ref for this + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + return 0; + } + + spin_lock(&sb_lock); + sb->s_count++; + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root) { + spin_unlock(&sb_lock); + return 0; + } + /* + * umounted, drop rwsem again and fall through to failure + */ + up_read(&sb->s_umount); + } + + sb->s_count--; + spin_unlock(&sb_lock); + return 1; +} + +static void unpin_sb_for_writeback(struct writeback_control *wbc, + struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (wbc->sync_mode == WB_SYNC_ALL) + return; + + up_read(&sb->s_umount); + put_super(sb); +} + +static void writeback_inodes_wb(struct bdi_writeback *wb, struct writeback_control *wbc) { + struct super_block *sb = wbc->sb; + const int is_blkdev_sb = sb_is_blkdev_sb(sb); const unsigned long start = jiffies; /* livelock avoidance */ - int sync = wbc->sync_mode == WB_SYNC_ALL; spin_lock(&inode_lock); - if (!wbc->for_kupdate || list_empty(&sb->s_io)) - queue_io(sb, wbc->older_than_this); - while (!list_empty(&sb->s_io)) { - struct inode *inode = list_entry(sb->s_io.prev, + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + + while (!list_empty(&wb->b_io)) { + struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); - struct address_space *mapping = inode->i_mapping; - struct backing_dev_info *bdi = mapping->backing_dev_info; long pages_skipped; - if (!bdi_cap_writeback_dirty(bdi)) { + /* + * super block given and doesn't match, skip this inode + */ + if (sb && sb != inode->i_sb) { + redirty_tail(inode); + continue; + } + + if (!bdi_cap_writeback_dirty(wb->bdi)) { redirty_tail(inode); - if (sb_is_blkdev_sb(sb)) { + if (is_blkdev_sb) { /* * Dirty memory-backed blockdev: the ramdisk * driver does this. Skip just this inode @@ -497,21 +582,14 @@ void generic_sync_sb_inodes(struct super_block *sb, continue; } - if (wbc->nonblocking && bdi_write_congested(bdi)) { + if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { wbc->encountered_congestion = 1; - if (!sb_is_blkdev_sb(sb)) + if (!is_blkdev_sb) break; /* Skip a congested fs */ requeue_io(inode); continue; /* Skip a congested blockdev */ } - if (wbc->bdi && bdi != wbc->bdi) { - if (!sb_is_blkdev_sb(sb)) - break; /* fs has the wrong queue */ - requeue_io(inode); - continue; /* blockdev has wrong queue */ - } - /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. @@ -519,16 +597,16 @@ void generic_sync_sb_inodes(struct super_block *sb, if (inode_dirtied_after(inode, start)) break; - /* Is another pdflush already flushing this queue? */ - if (current_is_pdflush() && !writeback_acquire(bdi)) - break; + if (pin_sb_for_writeback(wbc, inode)) { + requeue_io(inode); + continue; + } BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); __iget(inode); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); - if (current_is_pdflush()) - writeback_release(bdi); + unpin_sb_for_writeback(wbc, inode); if (wbc->pages_skipped != pages_skipped) { /* * writeback is not making progress due to locked @@ -544,144 +622,571 @@ void generic_sync_sb_inodes(struct super_block *sb, wbc->more_io = 1; break; } - if (!list_empty(&sb->s_more_io)) + if (!list_empty(&wb->b_more_io)) wbc->more_io = 1; } - if (sync) { - struct inode *inode, *old_inode = NULL; + spin_unlock(&inode_lock); + /* Leave any unwritten inodes on b_io */ +} + +void writeback_inodes_wbc(struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = wbc->bdi; + writeback_inodes_wb(&bdi->wb, wbc); +} + +/* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024 + +static inline bool over_bground_thresh(void) +{ + unsigned long background_thresh, dirty_thresh; + + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + + return (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) >= background_thresh); +} + +/* + * Explicit flushing or periodic writeback of "old" data. + * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ +static long wb_writeback(struct bdi_writeback *wb, long nr_pages, + struct super_block *sb, + enum writeback_sync_modes sync_mode, int for_kupdate) +{ + struct writeback_control wbc = { + .bdi = wb->bdi, + .sb = sb, + .sync_mode = sync_mode, + .older_than_this = NULL, + .for_kupdate = for_kupdate, + .range_cyclic = 1, + }; + unsigned long oldest_jif; + long wrote = 0; + + if (wbc.for_kupdate) { + wbc.older_than_this = &oldest_jif; + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + } + + for (;;) { /* - * Data integrity sync. Must wait for all pages under writeback, - * because there may have been pages dirtied before our sync - * call, but which had writeout started before we write it out. - * In which case, the inode may not be on the dirty list, but - * we still have to wait for that writeout. + * Don't flush anything for non-integrity writeback where + * no nr_pages was given */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; + if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE) + break; - if (inode->i_state & - (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) + /* + * If no specific pages were given and this is just a + * periodic background writeout and we are below the + * background dirty threshold, don't do anything + */ + if (for_kupdate && nr_pages <= 0 && !over_bground_thresh()) + break; + + wbc.more_io = 0; + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.pages_skipped = 0; + writeback_inodes_wb(wb, &wbc); + nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; + + /* + * If we ran out of stuff to write, bail unless more_io got set + */ + if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { + if (wbc.more_io && !wbc.for_kupdate) continue; - __iget(inode); - spin_unlock(&inode_lock); + break; + } + } + + return wrote; +} + +/* + * Return the next bdi_work struct that hasn't been processed by this + * wb thread yet + */ +static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, + struct bdi_writeback *wb) +{ + struct bdi_work *work, *ret = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(work, &bdi->work_list, list) { + if (!test_and_clear_bit(wb->nr, &work->seen)) + continue; + + ret = work; + break; + } + + rcu_read_unlock(); + return ret; +} + +static long wb_check_old_data_flush(struct bdi_writeback *wb) +{ + unsigned long expired; + long nr_pages; + + expired = wb->last_old_flush + + msecs_to_jiffies(dirty_writeback_interval * 10); + if (time_before(jiffies, expired)) + return 0; + + wb->last_old_flush = jiffies; + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + if (nr_pages) + return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1); + + return 0; +} + +/* + * Retrieve work items and do the writeback they describe + */ +long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +{ + struct backing_dev_info *bdi = wb->bdi; + struct bdi_work *work; + long nr_pages, wrote = 0; + + while ((work = get_next_work_item(bdi, wb)) != NULL) { + enum writeback_sync_modes sync_mode; + + nr_pages = work->nr_pages; + + /* + * Override sync mode, in case we must wait for completion + */ + if (force_wait) + work->sync_mode = sync_mode = WB_SYNC_ALL; + else + sync_mode = work->sync_mode; + + /* + * If this isn't a data integrity operation, just notify + * that we have seen this work and we are now starting it. + */ + if (sync_mode == WB_SYNC_NONE) + wb_clear_pending(wb, work); + + wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0); + + /* + * This is a data integrity writeback, so only do the + * notification when we have completed the work. + */ + if (sync_mode == WB_SYNC_ALL) + wb_clear_pending(wb, work); + } + + /* + * Check for periodic writeback, kupdated() style + */ + wrote += wb_check_old_data_flush(wb); + + return wrote; +} + +/* + * Handle writeback of dirty data for the device backed by this bdi. Also + * wakes up periodically and does kupdated style flushing. + */ +int bdi_writeback_task(struct bdi_writeback *wb) +{ + unsigned long last_active = jiffies; + unsigned long wait_jiffies = -1UL; + long pages_written; + + while (!kthread_should_stop()) { + pages_written = wb_do_writeback(wb, 0); + + if (pages_written) + last_active = jiffies; + else if (wait_jiffies != -1UL) { + unsigned long max_idle; + /* - * We hold a reference to 'inode' so it couldn't have - * been removed from s_inodes list while we dropped the - * inode_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it - * under inode_lock. So we keep the reference and iput - * it later. + * Longest period of inactivity that we tolerate. If we + * see dirty data again later, the task will get + * recreated automatically. */ - iput(old_inode); - old_inode = inode; + max_idle = max(5UL * 60 * HZ, wait_jiffies); + if (time_after(jiffies, max_idle + last_active)) + break; + } - filemap_fdatawait(mapping); + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(wait_jiffies); + try_to_freeze(); + } - cond_resched(); + return 0; +} + +/* + * Schedule writeback for all backing devices. Expensive! If this is a data + * integrity operation, writeback will be complete when this returns. If + * we are simply called for WB_SYNC_NONE, then writeback will merely be + * scheduled to run. + */ +static void bdi_writeback_all(struct writeback_control *wbc) +{ + const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; + struct backing_dev_info *bdi; + struct bdi_work *work; + LIST_HEAD(list); + +restart: + spin_lock(&bdi_lock); + + list_for_each_entry(bdi, &bdi_list, bdi_list) { + struct bdi_work *work; - spin_lock(&inode_lock); + if (!bdi_has_dirty_io(bdi)) + continue; + + /* + * If work allocation fails, do the writes inline. We drop + * the lock and restart the list writeout. This should be OK, + * since this happens rarely and because the writeout should + * eventually make more free memory available. + */ + work = bdi_alloc_work(wbc); + if (!work) { + struct writeback_control __wbc; + + /* + * Not a data integrity writeout, just continue + */ + if (!must_wait) + continue; + + spin_unlock(&bdi_lock); + __wbc = *wbc; + __wbc.bdi = bdi; + writeback_inodes_wbc(&__wbc); + goto restart; } - spin_unlock(&inode_lock); - iput(old_inode); - } else - spin_unlock(&inode_lock); + if (must_wait) + list_add_tail(&work->wait_list, &list); + + bdi_queue_work(bdi, work); + } + + spin_unlock(&bdi_lock); - return; /* Leave any unwritten inodes on s_io */ + /* + * If this is for WB_SYNC_ALL, wait for pending work to complete + * before returning. + */ + while (!list_empty(&list)) { + work = list_entry(list.next, struct bdi_work, wait_list); + list_del(&work->wait_list); + bdi_wait_on_work_clear(work); + call_rcu(&work->rcu_head, bdi_work_free); + } } -EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); -static void sync_sb_inodes(struct super_block *sb, - struct writeback_control *wbc) +/* + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. + */ +void wakeup_flusher_threads(long nr_pages) { - generic_sync_sb_inodes(sb, wbc); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .range_cyclic = 1, + }; + + if (nr_pages == 0) + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + wbc.nr_to_write = nr_pages; + bdi_writeback_all(&wbc); } -/* - * Start writeback of dirty pagecache data against all unlocked inodes. +static noinline void block_dump___mark_inode_dirty(struct inode *inode) +{ + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + printk(KERN_DEBUG + "%s(%d): dirtied inode %lu (%s) on %s\n", + current->comm, task_pid_nr(current), inode->i_ino, + name, inode->i_sb->s_id); + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } +} + +/** + * __mark_inode_dirty - internal function + * @inode: inode to mark + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. * - * Note: - * We don't need to grab a reference to superblock here. If it has non-empty - * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed - * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all - * empty. Since __sync_single_inode() regains inode_lock before it finally moves - * inode from superblock lists we are OK. + * Put the inode on the super block's dirty list. + * + * CAREFUL! We mark it dirty unconditionally, but move it onto the + * dirty list only if it is hashed or if it refers to a blockdev. + * If it was not hashed, it will never be added to the dirty list + * even if it is later hashed, as it will have been marked dirty already. * - * If `older_than_this' is non-zero then only flush inodes which have a - * flushtime older than *older_than_this. + * In short, make sure you hash any inodes _before_ you start marking + * them dirty. * - * If `bdi' is non-zero then we will scan the first inode against each - * superblock until we find the matching ones. One group will be the dirty - * inodes against a filesystem. Then when we hit the dummy blockdev superblock, - * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not - * super-efficient but we're about to do a ton of I/O... + * This function *must* be atomic for the I_DIRTY_PAGES case - + * set_page_dirty() is called under spinlock in several places. + * + * Note that for blockdevs, inode->dirtied_when represents the dirtying time of + * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of + * the kernel-internal blockdev inode represents the dirtying time of the + * blockdev's pages. This is why for I_DIRTY_PAGES we always use + * page->mapping->host, so the page-dirtying time is recorded in the internal + * blockdev inode. */ -void -writeback_inodes(struct writeback_control *wbc) +void __mark_inode_dirty(struct inode *inode, int flags) { - struct super_block *sb; + struct super_block *sb = inode->i_sb; - might_sleep(); - spin_lock(&sb_lock); -restart: - list_for_each_entry_reverse(sb, &super_blocks, s_list) { - if (sb_has_dirty_inodes(sb)) { - /* we're making our own get_super here */ - sb->s_count++; - spin_unlock(&sb_lock); - /* - * If we can't get the readlock, there's no sense in - * waiting around, most of the time the FS is going to - * be unmounted by the time it is released. - */ - if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) - sync_sb_inodes(sb, wbc); - up_read(&sb->s_umount); + /* + * Don't do this for I_DIRTY_PAGES - that doesn't actually + * dirty the inode itself + */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode); + } + + /* + * make sure that changes are seen by all cpus before we test i_state + * -- mikulas + */ + smp_mb(); + + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + + if (unlikely(block_dump)) + block_dump___mark_inode_dirty(inode); + + spin_lock(&inode_lock); + if ((inode->i_state & flags) != flags) { + const int was_dirty = inode->i_state & I_DIRTY; + + inode->i_state |= flags; + + /* + * If the inode is being synced, just update its dirty state. + * The unlocker will place the inode on the appropriate + * superblock list, based upon its state. + */ + if (inode->i_state & I_SYNC) + goto out; + + /* + * Only add valid (hashed) inodes to the superblock's + * dirty list. Add blockdev inodes as well. + */ + if (!S_ISBLK(inode->i_mode)) { + if (hlist_unhashed(&inode->i_hash)) + goto out; + } + if (inode->i_state & (I_FREEING|I_CLEAR)) + goto out; + + /* + * If the inode was already on b_dirty/b_io/b_more_io, don't + * reposition it (that would break b_dirty time-ordering). + */ + if (!was_dirty) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + struct backing_dev_info *bdi = wb->bdi; + + if (bdi_cap_writeback_dirty(bdi) && + !test_bit(BDI_registered, &bdi->state)) { + WARN_ON(1); + printk(KERN_ERR "bdi-%s not registered\n", + bdi->name); } - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; + + inode->dirtied_when = jiffies; + list_move(&inode->i_list, &wb->b_dirty); } - if (wbc->nr_to_write <= 0) - break; } - spin_unlock(&sb_lock); +out: + spin_unlock(&inode_lock); } +EXPORT_SYMBOL(__mark_inode_dirty); /* - * writeback and wait upon the filesystem's dirty inodes. The caller will - * do this in two passes - one to write, and one to wait. + * Write out a superblock's list of dirty inodes. A wait will be performed + * upon no inodes, all inodes or the final one, depending upon sync_mode. + * + * If older_than_this is non-NULL, then only write out inodes which + * had their first dirtying at a time earlier than *older_than_this. * - * A finite limit is set on the number of pages which will be written. - * To prevent infinite livelock of sys_sync(). + * If we're a pdlfush thread, then implement pdflush collision avoidance + * against the entire list. * - * We add in the number of potentially dirty inodes, because each inode write - * can dirty pagecache in the underlying blockdev. + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, + * assume that all inodes are backed by the same queue. + * + * The inodes to be written are parked on bdi->b_io. They are moved back onto + * bdi->b_dirty as they are selected for writing. This way, none can be missed + * on the writer throttling path, and we get decent balancing between many + * throttled threads: we don't want them all piling up on inode_sync_wait. */ -void sync_inodes_sb(struct super_block *sb, int wait) +static void wait_sb_inodes(struct writeback_control *wbc) +{ + struct inode *inode, *old_inode = NULL; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount)); + + spin_lock(&inode_lock); + + /* + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. + * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. + */ + list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) { + struct address_space *mapping; + + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) + continue; + mapping = inode->i_mapping; + if (mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); + /* + * We hold a reference to 'inode' so it couldn't have + * been removed from s_inodes list while we dropped the + * inode_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it + * under inode_lock. So we keep the reference and iput + * it later. + */ + iput(old_inode); + old_inode = inode; + + filemap_fdatawait(mapping); + + cond_resched(); + + spin_lock(&inode_lock); + } + spin_unlock(&inode_lock); + iput(old_inode); +} + +/** + * writeback_inodes_sb - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. The number of pages submitted is + * returned. + */ +long writeback_inodes_sb(struct super_block *sb) { struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .sb = sb, + .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; - if (!wait) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - - wbc.nr_to_write = nr_dirty + nr_unstable + + nr_to_write = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); - } else - wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ - sync_sb_inodes(sb, &wbc); + wbc.nr_to_write = nr_to_write; + bdi_writeback_all(&wbc); + return nr_to_write - wbc.nr_to_write; +} +EXPORT_SYMBOL(writeback_inodes_sb); + +/** + * sync_inodes_sb - sync sb inode pages + * @sb: the superblock + * + * This function writes and waits on any dirty inode belonging to this + * super_block. The number of pages synced is returned. + */ +long sync_inodes_sb(struct super_block *sb) +{ + struct writeback_control wbc = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .range_start = 0, + .range_end = LLONG_MAX, + }; + long nr_to_write = LONG_MAX; /* doesn't actually matter */ + + wbc.nr_to_write = nr_to_write; + bdi_writeback_all(&wbc); + wait_sb_inodes(&wbc); + return nr_to_write - wbc.nr_to_write; } +EXPORT_SYMBOL(sync_inodes_sb); /** * write_inode_now - write an inode to disk diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f91ccc4a189..4567db6f943 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -801,6 +801,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { int err; + fc->bdi.name = "fuse"; fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; /* fuse does it's own writeback accounting */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index cb88dac8cca..a93b885311d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; static struct backing_dev_info hugetlbfs_backing_dev_info = { + .name = "hugetlbfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 8fcb6239218..7edb62e9741 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -static int jffs2_check_acl(struct inode *inode, int mask) +int jffs2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; int rc; @@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jffs2_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, jffs2_check_acl); -} - int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) { struct posix_acl *acl, *clone; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index fc929f2a14f..f0ba63e3c36 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,7 +26,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_permission(struct inode *, int); +extern int jffs2_check_acl(struct inode *, int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); @@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler; #else -#define jffs2_permission (NULL) +#define jffs2_check_acl (NULL) #define jffs2_acl_chmod(inode) (0) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 6f60cc910f4..7aa4417e085 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 23c94753986..b7b74e29914 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index b7339c3b6ad..4ec11e8bda8 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a29c7c3e3fb..d66477c3430 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -114,7 +114,7 @@ out: return rc; } -static int jfs_check_acl(struct inode *inode, int mask) +int jfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); @@ -129,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jfs_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, jfs_check_acl); -} - int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 7f6063acaa3..2b70fa78e4a 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = { .removexattr = jfs_removexattr, #ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, - .permission = jfs_permission, + .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 88475f10a38..b07bd417ef8 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_permission(struct inode *, int); +int jfs_check_acl(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_setattr(struct dentry *, struct iattr *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 514ee2edb92..c79a4270f08 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = { .removexattr = jfs_removexattr, #ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, - .permission = jfs_permission, + .check_acl = jfs_check_acl, #endif }; diff --git a/fs/locks.c b/fs/locks.c index b6440f52178..52366e877d7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (can_sleep) lock->fl_flags |= FL_SLEEP; - error = security_file_lock(filp, cmd); + error = security_file_lock(filp, lock->fl_type); if (error) goto out_free; diff --git a/fs/namei.c b/fs/namei.c index 1f13751693a..d11f404667e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -169,19 +169,10 @@ void putname(const char *name) EXPORT_SYMBOL(putname); #endif - -/** - * generic_permission - check for access rights on a Posix-like filesystem - * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * @check_acl: optional callback to check for Posix ACLs - * - * Used to check for read/write/execute permissions on a file. - * We use "fsuid" for this, letting us set arbitrary permissions - * for filesystem access without changing the "normal" uids which - * are used for other things.. +/* + * This does basic POSIX ACL permission checking */ -int generic_permission(struct inode *inode, int mask, +static int acl_permission_check(struct inode *inode, int mask, int (*check_acl)(struct inode *inode, int mask)) { umode_t mode = inode->i_mode; @@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask, else { if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { int error = check_acl(inode, mask); - if (error == -EACCES) - goto check_capabilities; - else if (error != -EAGAIN) + if (error != -EAGAIN) return error; } @@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask, */ if ((mask & ~mode) == 0) return 0; + return -EACCES; +} + +/** + * generic_permission - check for access rights on a Posix-like filesystem + * @inode: inode to check access rights for + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @check_acl: optional callback to check for Posix ACLs + * + * Used to check for read/write/execute permissions on a file. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things.. + */ +int generic_permission(struct inode *inode, int mask, + int (*check_acl)(struct inode *inode, int mask)) +{ + int ret; + + /* + * Do the basic POSIX ACL permission checks. + */ + ret = acl_permission_check(inode, mask, check_acl); + if (ret != -EACCES) + return ret; - check_capabilities: /* * Read/write DACs are always overridable. * Executable DACs are overridable if at least one exec bit is set. @@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask) if (inode->i_op->permission) retval = inode->i_op->permission(inode, mask); else - retval = generic_permission(inode, mask, NULL); + retval = generic_permission(inode, mask, inode->i_op->check_acl); if (retval) return retval; @@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, */ static int exec_permission_lite(struct inode *inode) { - umode_t mode = inode->i_mode; - - if (inode->i_op->permission) - return -EAGAIN; + int ret; - if (current_fsuid() == inode->i_uid) - mode >>= 6; - else if (in_group_p(inode->i_gid)) - mode >>= 3; - - if (mode & MAY_EXEC) - goto ok; - - if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE)) - goto ok; - - if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE)) + if (inode->i_op->permission) { + ret = inode->i_op->permission(inode, MAY_EXEC); + if (!ret) + goto ok; + return ret; + } + ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); + if (!ret) goto ok; - if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) + if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) goto ok; - return -EACCES; + return ret; ok: return security_inode_permission(inode, MAY_EXEC); } @@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode); - if (err == -EAGAIN) - err = inode_permission(nd->path.dentry->d_inode, - MAY_EXEC); - if (!err) - err = ima_path_check(&nd->path, MAY_EXEC, - IMA_COUNT_UPDATE); if (err) break; @@ -1533,9 +1533,11 @@ int may_open(struct path *path, int acc_mode, int flag) if (error) return error; - error = ima_path_check(path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), + error = ima_path_check(path, acc_mode ? + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : + ACC_MODE(flag) & (MAY_READ | MAY_WRITE), IMA_COUNT_UPDATE); + if (error) return error; /* diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8d25ccb2d51..c6be84a161f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -879,6 +879,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; if (server->wsize > max_rpc_payload) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 5573508f707..36fcabbf518 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) int flags = nfsexp_flags(rqstp, exp); int ret; + validate_process_creds(); + /* discard any old override before preparing the new set */ revert_creds(get_cred(current->real_cred)); new = prepare_creds(); @@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); + validate_process_creds(); put_cred(override_creds(new)); put_cred(new); + validate_process_creds(); return 0; oom: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 492c79b7800..24d58adfe5f 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -496,7 +496,9 @@ nfsd(void *vrqstp) /* Lock the export hash tables for reading. */ exp_readlock(); + validate_process_creds(); svc_process(rqstp); + validate_process_creds(); /* Unlock export hash tables */ exp_readunlock(); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 23341c1063b..8fa09bfbcba 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -684,6 +684,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, __be32 err; int host_err; + validate_process_creds(); + /* * If we get here, then the client has already done an "open", * and (hopefully) checked permission - so allow OWNER_OVERRIDE @@ -740,6 +742,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, out_nfserr: err = nfserrno(host_err); out: + validate_process_creds(); return err; } diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 1c9efb406a9..02bf17808bd 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -325,6 +325,7 @@ clear_fields: } static struct backing_dev_info dlmfs_backing_dev_info = { + .name = "ocfs2-dlmfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/open.c b/fs/open.c index dd98e807602..31191bf513e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -199,7 +199,7 @@ out: int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { - int err; + int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ @@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, } /* Remove suid/sgid on truncate too */ - newattrs.ia_valid |= should_remove_suid(dentry); + ret = should_remove_suid(dentry); + if (ret) + newattrs.ia_valid |= ret | ATTR_FORCE; mutex_lock(&dentry->d_inode->i_mutex); - err = notify_change(dentry, &newattrs); + ret = notify_change(dentry, &newattrs); mutex_unlock(&dentry->d_inode->i_mutex); - return err; + return ret; } static long do_sys_truncate(const char __user *pathname, loff_t length) @@ -957,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, int error; struct file *f; + validate_creds(cred); + /* * We must always pass in a valid mount pointer. Historically * callers got away with not passing it, but we must enforce this at diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 0ff7566c767..a7f0110fca4 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { + .name = "ramfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | diff --git a/fs/super.c b/fs/super.c index 2761d3e22ed..9cda337ddae 100644 --- a/fs/super.c +++ b/fs/super.c @@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type) s = NULL; goto out; } - INIT_LIST_HEAD(&s->s_dirty); - INIT_LIST_HEAD(&s->s_io); - INIT_LIST_HEAD(&s->s_more_io); INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); @@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb) * Drops a temporary reference, frees superblock if there's no * references left. */ -static void put_super(struct super_block *sb) +void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); diff --git a/fs/sync.c b/fs/sync.c index 3422ba61d86..103cc7fdd3d 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -19,20 +19,22 @@ SYNC_FILE_RANGE_WAIT_AFTER) /* - * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) - * just dirties buffers with inodes so we have to submit IO for these buffers - * via __sync_blockdev(). This also speeds up the wait == 1 case since in that - * case write_inode() functions do sync_dirty_buffer() and thus effectively - * write one block at a time. + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to + * submit IO for these buffers via __sync_blockdev(). This also speeds up the + * wait == 1 case since in that case write_inode() functions do + * sync_dirty_buffer() and thus effectively write one block at a time. */ static int __sync_filesystem(struct super_block *sb, int wait) { /* Avoid doing twice syncing and cache pruning for quota sync */ - if (!wait) + if (!wait) { writeout_quota_sb(sb, -1); - else + writeback_inodes_sb(sb); + } else { sync_quota_sb(sb, -1); - sync_inodes_sb(sb, wait); + sync_inodes_sb(sb); + } if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); @@ -118,7 +120,7 @@ restart: */ SYSCALL_DEFINE0(sync) { - wakeup_pdflush(0); + wakeup_flusher_threads(0); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 14f2d71ea3c..0050fc40e8c 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, const struct inode_operations sysfs_dir_inode_operations = { .lookup = sysfs_lookup, .setattr = sysfs_setattr, + .setxattr = sysfs_setxattr, }; static void remove_dir(struct sysfs_dirent *sd) diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 555f0ff988d..e28cecf179f 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -18,6 +18,8 @@ #include <linux/capability.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/xattr.h> +#include <linux/security.h> #include "sysfs.h" extern struct super_block * sysfs_sb; @@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = { }; static struct backing_dev_info sysfs_backing_dev_info = { + .name = "sysfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static const struct inode_operations sysfs_inode_operations ={ .setattr = sysfs_setattr, + .setxattr = sysfs_setxattr, }; int __init sysfs_inode_init(void) @@ -42,18 +46,37 @@ int __init sysfs_inode_init(void) return bdi_init(&sysfs_backing_dev_info); } +struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *attrs; + struct iattr *iattrs; + + attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!attrs) + return NULL; + iattrs = &attrs->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = sd->s_mode; + iattrs->ia_uid = 0; + iattrs->ia_gid = 0; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + return attrs; +} int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) { struct inode * inode = dentry->d_inode; struct sysfs_dirent * sd = dentry->d_fsdata; - struct iattr * sd_iattr; + struct sysfs_inode_attrs *sd_attrs; + struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; int error; if (!sd) return -EINVAL; - sd_iattr = sd->s_iattr; + sd_attrs = sd->s_iattr; error = inode_change_ok(inode, iattr); if (error) @@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) if (error) return error; - if (!sd_iattr) { + if (!sd_attrs) { /* setting attributes for the first time, allocate now */ - sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); - if (!sd_iattr) + sd_attrs = sysfs_init_inode_attrs(sd); + if (!sd_attrs) return -ENOMEM; - /* assign default attributes */ - sd_iattr->ia_mode = sd->s_mode; - sd_iattr->ia_uid = 0; - sd_iattr->ia_gid = 0; - sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; - sd->s_iattr = sd_iattr; + sd->s_iattr = sd_attrs; + } else { + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + iattrs->ia_mode = sd->s_mode = mode; + } } + return error; +} - /* attributes were changed atleast once in past */ - - if (ia_valid & ATTR_UID) - sd_iattr->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - sd_iattr->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MTIME) - sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_CTIME) - sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - sd_iattr->ia_mode = sd->s_mode = mode; - } +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_inode_attrs *iattrs; + void *secdata; + int error; + u32 secdata_len = 0; + + if (!sd) + return -EINVAL; + if (!sd->s_iattr) + sd->s_iattr = sysfs_init_inode_attrs(sd); + if (!sd->s_iattr) + return -ENOMEM; + + iattrs = sd->s_iattr; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + goto out; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + goto out; + if (iattrs->ia_secdata) + security_release_secctx(iattrs->ia_secdata, + iattrs->ia_secdata_len); + iattrs->ia_secdata = secdata; + iattrs->ia_secdata_len = secdata_len; + } else + return -EINVAL; +out: return error; } @@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd) static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { struct bin_attribute *bin_attr; + struct sysfs_inode_attrs *iattrs; inode->i_private = sysfs_get(sd); inode->i_mapping->a_ops = &sysfs_aops; @@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) inode->i_ino = sd->s_ino; lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); - if (sd->s_iattr) { + iattrs = sd->s_iattr; + if (iattrs) { /* sysfs_dirent has non-default attributes * get them for the new inode from persistent copy * in sysfs_dirent */ - set_inode_attr(inode, sd->s_iattr); + set_inode_attr(inode, &iattrs->ia_iattr); + if (iattrs->ia_secdata) + security_inode_notifysecctx(inode, + iattrs->ia_secdata, + iattrs->ia_secdata_len); } else set_default_inode_attr(inode, sd->s_mode); - /* initialize inode according to type */ switch (sysfs_type(sd)) { case SYSFS_DIR: diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 1d897ad808e..c5081ad7702 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -16,6 +16,7 @@ #include <linux/kobject.h> #include <linux/namei.h> #include <linux/mutex.h> +#include <linux/security.h> #include "sysfs.h" @@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co } const struct inode_operations sysfs_symlink_inode_operations = { + .setxattr = sysfs_setxattr, .readlink = generic_readlink, .follow_link = sysfs_follow_link, .put_link = sysfs_put_link, diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3fa0d98481e..af4c4e7482a 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,6 +8,8 @@ * This file is released under the GPLv2. */ +#include <linux/fs.h> + struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ @@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr { struct hlist_head buffers; }; +struct sysfs_inode_attrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; +}; + /* * sysfs_dirent - the building block of sysfs hierarchy. Each and * every sysfs node is represented by single sysfs_dirent. @@ -56,7 +64,7 @@ struct sysfs_dirent { unsigned int s_flags; ino_t s_ino; umode_t s_mode; - struct iattr *s_iattr; + struct sysfs_inode_attrs *s_iattr; }; #define SD_DEACTIVATED_BIAS INT_MIN @@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) struct inode *sysfs_get_inode(struct sysfs_dirent *sd); void sysfs_delete_inode(struct inode *inode); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); int sysfs_inode_init(void); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index eaf6d891d46..1c8991b0db1 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -65,26 +65,14 @@ static int shrink_liability(struct ubifs_info *c, int nr_to_write) { int nr_written; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .range_end = LLONG_MAX, - .nr_to_write = nr_to_write, - }; - - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; + nr_written = writeback_inodes_sb(c->vfs_sb); if (!nr_written) { /* * Re-try again but wait on pages/inodes which are being * written-back concurrently (e.g., by pdflush). */ - memset(&wbc, 0, sizeof(struct writeback_control)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.range_end = LLONG_MAX; - wbc.nr_to_write = nr_to_write; - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; + nr_written = sync_inodes_sb(c->vfs_sb); } dbg_budg("%d pages were written back", nr_written); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 26d2e0d8046..51763aa8f4d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -438,12 +438,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) { int i, err; struct ubifs_info *c = sb->s_fs_info; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .range_start = 0, - .range_end = LLONG_MAX, - .nr_to_write = LONG_MAX, - }; /* * Zero @wait is just an advisory thing to help the file system shove @@ -462,7 +456,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) * the user be able to get more accurate results of 'statfs()' after * they synchronize the file system. */ - generic_sync_sb_inodes(sb, &wbc); + sync_inodes_sb(sb); /* * Synchronize write buffers, because 'ubifs_run_commit()' does not @@ -1971,6 +1965,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) * * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ + c->bdi.name = "ubifs", c->bdi.capabilities = BDI_CAP_MAP_COPY; c->bdi.unplug_io_fn = default_unplug_io_fn; err = bdi_init(&c->bdi); diff --git a/fs/xattr.c b/fs/xattr.c index 1c3d0af59dd..6d4f6d3449f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask) return inode_permission(inode, mask); } -int -vfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +/** + * __vfs_setxattr_noperm - perform setxattr operation without performing + * permission checks. + * + * @dentry - object to perform setxattr on + * @name - xattr name to set + * @value - value to set @name to + * @size - size of @value + * @flags - flags to pass into filesystem operations + * + * returns the result of the internal setxattr or setsecurity operations. + * + * This function requires the caller to lock the inode's i_mutex before it + * is executed. It also assumes that the caller will make the appropriate + * permission checks. + */ +int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; - int error; - - error = xattr_permission(inode, name, MAY_WRITE); - if (error) - return error; + int error = -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); - error = security_inode_setxattr(dentry, name, value, size, flags); - if (error) - goto out; - error = -EOPNOTSUPP; if (inode->i_op->setxattr) { error = inode->i_op->setxattr(dentry, name, value, size, flags); if (!error) { @@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!error) fsnotify_xattr(dentry); } + + return error; +} + + +int +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = security_inode_setxattr(dentry, name, value, size, flags); + if (error) + goto out; + + error = __vfs_setxattr_noperm(dentry, name, value, size, flags); + out: mutex_unlock(&inode->i_mutex); return error; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 8070b34cc28..6c32f1d63d8 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -485,14 +485,6 @@ xfs_vn_put_link( } STATIC int -xfs_vn_permission( - struct inode *inode, - int mask) -{ - return generic_permission(inode, mask, xfs_check_acl); -} - -STATIC int xfs_vn_getattr( struct vfsmount *mnt, struct dentry *dentry, @@ -696,7 +688,7 @@ xfs_vn_fiemap( } static const struct inode_operations xfs_inode_operations = { - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .truncate = xfs_vn_truncate, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -724,7 +716,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -749,7 +741,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -762,7 +754,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1d52425a611..f169bcb90b5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -13,6 +13,8 @@ #include <linux/proportions.h> #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/sched.h> +#include <linux/writeback.h> #include <asm/atomic.h> struct page; @@ -23,9 +25,11 @@ struct dentry; * Bits in backing_dev_info.state */ enum bdi_state { - BDI_pdflush, /* A pdflush thread is working this device */ + BDI_pending, /* On its way to being activated */ + BDI_wb_alloc, /* Default embedded wb allocated */ BDI_async_congested, /* The async (write) queue is getting full */ BDI_sync_congested, /* The sync queue is getting full */ + BDI_registered, /* bdi_register() was done */ BDI_unused, /* Available bits start here */ }; @@ -39,7 +43,22 @@ enum bdi_stat_item { #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +struct bdi_writeback { + struct list_head list; /* hangs off the bdi */ + + struct backing_dev_info *bdi; /* our parent bdi */ + unsigned int nr; + + unsigned long last_old_flush; /* last old data flush */ + + struct task_struct *task; /* writeback task */ + struct list_head b_dirty; /* dirty inodes */ + struct list_head b_io; /* parked for writeback */ + struct list_head b_more_io; /* parked for more writeback */ +}; + struct backing_dev_info { + struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ unsigned long state; /* Always use atomic bitops on this */ unsigned int capabilities; /* Device capabilities */ @@ -48,6 +67,8 @@ struct backing_dev_info { void (*unplug_io_fn)(struct backing_dev_info *, struct page *); void *unplug_io_data; + char *name; + struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; struct prop_local_percpu completions; @@ -56,6 +77,14 @@ struct backing_dev_info { unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; + struct bdi_writeback wb; /* default writeback info for this bdi */ + spinlock_t wb_lock; /* protects update side of wb_list */ + struct list_head wb_list; /* the flusher threads hanging off this bdi */ + unsigned long wb_mask; /* bitmask of registered tasks */ + unsigned int wb_cnt; /* number of registered tasks */ + + struct list_head work_list; + struct device *dev; #ifdef CONFIG_DEBUG_FS @@ -71,6 +100,19 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); +void bdi_start_writeback(struct writeback_control *wbc); +int bdi_writeback_task(struct bdi_writeback *wb); +int bdi_has_dirty_io(struct backing_dev_info *bdi); + +extern spinlock_t bdi_lock; +extern struct list_head bdi_list; + +static inline int wb_has_dirty_io(struct bdi_writeback *wb) +{ + return !list_empty(&wb->b_dirty) || + !list_empty(&wb->b_io) || + !list_empty(&wb->b_more_io); +} static inline void __add_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item, s64 amount) @@ -261,6 +303,11 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) return bdi->capabilities & BDI_CAP_SWAP_BACKED; } +static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) +{ + return bdi == &default_backing_dev_info; +} + static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); @@ -276,4 +323,10 @@ static inline bool mapping_cap_swap_backed(struct address_space *mapping) return bdi_cap_swap_backed(mapping->backing_dev_info); } +static inline int bdi_sched_wait(void *word) +{ + schedule(); + return 0; +} + #endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/cred.h b/include/linux/cred.h index 4fa99969631..24520a539c6 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -114,6 +114,13 @@ struct thread_group_cred { */ struct cred { atomic_t usage; +#ifdef CONFIG_DEBUG_CREDENTIALS + atomic_t subscribers; /* number of processes subscribed */ + void *put_addr; + unsigned magic; +#define CRED_MAGIC 0x43736564 +#define CRED_MAGIC_DEAD 0x44656144 +#endif uid_t uid; /* real UID of the task */ gid_t gid; /* real GID of the task */ uid_t suid; /* saved UID of the task */ @@ -143,7 +150,9 @@ struct cred { }; extern void __put_cred(struct cred *); +extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); +extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern struct cred *prepare_usermodehelper_creds(void); @@ -158,6 +167,60 @@ extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); extern void __init cred_init(void); +/* + * check for validity of credentials + */ +#ifdef CONFIG_DEBUG_CREDENTIALS +extern void __invalid_creds(const struct cred *, const char *, unsigned); +extern void __validate_process_creds(struct task_struct *, + const char *, unsigned); + +static inline bool creds_are_invalid(const struct cred *cred) +{ + if (cred->magic != CRED_MAGIC) + return true; + if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) + return true; +#ifdef CONFIG_SECURITY_SELINUX + if ((unsigned long) cred->security < PAGE_SIZE) + return true; + if ((*(u32*)cred->security & 0xffffff00) == + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) + return true; +#endif + return false; +} + +static inline void __validate_creds(const struct cred *cred, + const char *file, unsigned line) +{ + if (unlikely(creds_are_invalid(cred))) + __invalid_creds(cred, file, line); +} + +#define validate_creds(cred) \ +do { \ + __validate_creds((cred), __FILE__, __LINE__); \ +} while(0) + +#define validate_process_creds() \ +do { \ + __validate_process_creds(current, __FILE__, __LINE__); \ +} while(0) + +extern void validate_creds_for_do_exit(struct task_struct *); +#else +static inline void validate_creds(const struct cred *cred) +{ +} +static inline void validate_creds_for_do_exit(struct task_struct *tsk) +{ +} +static inline void validate_process_creds(void) +{ +} +#endif + /** * get_new_cred - Get a reference on a new set of credentials * @cred: The new credentials to reference @@ -186,7 +249,9 @@ static inline struct cred *get_new_cred(struct cred *cred) */ static inline const struct cred *get_cred(const struct cred *cred) { - return get_new_cred((struct cred *) cred); + struct cred *nonconst_cred = (struct cred *) cred; + validate_creds(cred); + return get_new_cred(nonconst_cred); } /** @@ -204,7 +269,7 @@ static inline void put_cred(const struct cred *_cred) { struct cred *cred = (struct cred *) _cred; - BUG_ON(atomic_read(&(cred)->usage) <= 0); + validate_creds(cred); if (atomic_dec_and_test(&(cred)->usage)) __put_cred(cred); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 3972ffb597c..a79f48373e7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -715,7 +715,7 @@ struct posix_acl; struct inode { struct hlist_node i_hash; - struct list_head i_list; + struct list_head i_list; /* backing dev IO list */ struct list_head i_sb_list; struct list_head i_dentry; unsigned long i_ino; @@ -1336,9 +1336,6 @@ struct super_block { struct xattr_handler **s_xattr; struct list_head s_inodes; /* all inodes */ - struct list_head s_dirty; /* dirty inodes */ - struct list_head s_io; /* parked for writeback */ - struct list_head s_more_io; /* parked for more writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_files; /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ @@ -1528,6 +1525,7 @@ struct inode_operations { void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int); + int (*check_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); @@ -1788,6 +1786,7 @@ extern int get_sb_pseudo(struct file_system_type *, char *, struct vfsmount *mnt); extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); int __put_super_and_need_restart(struct super_block *sb); +void put_super(struct super_block *sb); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ #define fops_get(fops) \ @@ -2083,8 +2082,6 @@ static inline void invalidate_remote_inode(struct inode *inode) extern int invalidate_inode_pages2(struct address_space *mapping); extern int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); -extern void generic_sync_sb_inodes(struct super_block *sb, - struct writeback_control *wbc); extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); @@ -2199,7 +2196,6 @@ extern int bdev_read_only(struct block_device *); extern int set_blocksize(struct block_device *, int); extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); -extern int sb_has_dirty_inodes(struct super_block *); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); diff --git a/include/linux/key.h b/include/linux/key.h index e544f466d69..cd50dfa1d4c 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -129,7 +129,10 @@ struct key { struct rw_semaphore sem; /* change vs change sem */ struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ - time_t expiry; /* time at which key expires (or 0) */ + union { + time_t expiry; /* time at which key expires (or 0) */ + time_t revoked_at; /* time at which key was revoked */ + }; uid_t uid; gid_t gid; key_perm_t perm; /* access permissions */ @@ -275,6 +278,8 @@ static inline key_serial_t key_serial(struct key *key) extern ctl_table key_sysctls[]; #endif +extern void key_replace_session_keyring(void); + /* * the userspace interface */ @@ -297,6 +302,7 @@ extern void key_init(void); #define key_fsuid_changed(t) do { } while(0) #define key_fsgid_changed(t) do { } while(0) #define key_init() do { } while(0) +#define key_replace_session_keyring() do { } while(0) #endif /* CONFIG_KEYS */ #endif /* __KERNEL__ */ diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h index c0688eb7209..bd383f1944f 100644 --- a/include/linux/keyctl.h +++ b/include/linux/keyctl.h @@ -52,5 +52,6 @@ #define KEYCTL_SET_TIMEOUT 15 /* set key timeout */ #define KEYCTL_ASSUME_AUTHORITY 16 /* assume request_key() authorisation */ #define KEYCTL_GET_SECURITY 17 /* get key security label */ +#define KEYCTL_SESSION_TO_PARENT 18 /* apply session keyring to parent process */ #endif /* _LINUX_KEYCTL_H */ diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 47b39b7c7e8..dc2fd545db0 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -34,6 +34,8 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n); int kmemcheck_show_addr(unsigned long address); int kmemcheck_hide_addr(unsigned long address); +bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size); + #else #define kmemcheck_enabled 0 @@ -99,6 +101,11 @@ static inline void kmemcheck_mark_initialized_pages(struct page *p, { } +static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) +{ + return true; +} + #endif /* CONFIG_KMEMCHECK */ /* diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 6a63807f714..3c7497d46ee 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -23,18 +23,18 @@ #ifdef CONFIG_DEBUG_KMEMLEAK -extern void kmemleak_init(void); +extern void kmemleak_init(void) __ref; extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, - gfp_t gfp); -extern void kmemleak_free(const void *ptr); -extern void kmemleak_free_part(const void *ptr, size_t size); + gfp_t gfp) __ref; +extern void kmemleak_free(const void *ptr) __ref; +extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_padding(const void *ptr, unsigned long offset, - size_t size); -extern void kmemleak_not_leak(const void *ptr); -extern void kmemleak_ignore(const void *ptr); + size_t size) __ref; +extern void kmemleak_not_leak(const void *ptr) __ref; +extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, unsigned long offset, - size_t length, gfp_t gfp); -extern void kmemleak_no_scan(const void *ptr); + size_t length, gfp_t gfp) __ref; +extern void kmemleak_no_scan(const void *ptr) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, int min_count, unsigned long flags, diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index e461b2c3d71..190c3785487 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -33,6 +33,7 @@ struct common_audit_data { #define LSM_AUDIT_DATA_IPC 4 #define LSM_AUDIT_DATA_TASK 5 #define LSM_AUDIT_DATA_KEY 6 +#define LSM_AUDIT_NO_AUDIT 7 struct task_struct *tsk; union { struct { @@ -66,16 +67,19 @@ struct common_audit_data { } key_struct; #endif } u; - const char *function; /* this union contains LSM specific data */ union { +#ifdef CONFIG_SECURITY_SMACK /* SMACK data */ struct smack_audit_data { + const char *function; char *subject; char *object; char *request; int result; } smack_audit_data; +#endif +#ifdef CONFIG_SECURITY_SELINUX /* SELinux data */ struct { u32 ssid; @@ -83,10 +87,12 @@ struct common_audit_data { u16 tclass; u32 requested; u32 audited; + u32 denied; struct av_decision *avd; int result; } selinux_audit_data; - } lsm_priv; +#endif + }; /* these callback will be implemented by a specific LSM */ void (*lsm_pre_audit)(struct audit_buffer *, void *); void (*lsm_post_audit)(struct audit_buffer *, void *); @@ -104,7 +110,7 @@ int ipv6_skb_to_auditdata(struct sk_buff *skb, /* Initialize an LSM audit data structure. */ #define COMMON_AUDIT_DATA_INIT(_d, _t) \ { memset((_d), 0, sizeof(struct common_audit_data)); \ - (_d)->type = LSM_AUDIT_DATA_##_t; (_d)->function = __func__; } + (_d)->type = LSM_AUDIT_DATA_##_t; } void common_lsm_audit(struct common_audit_data *a); diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f1ea4a6695..9304027673b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1292,6 +1292,7 @@ struct task_struct { struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) */ + struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock @@ -2077,7 +2078,7 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) -extern bool is_single_threaded(struct task_struct *); +extern bool current_is_single_threaded(void); /* * Careful: do_each_thread/while_each_thread is a double loop so diff --git a/include/linux/security.h b/include/linux/security.h index 1f16eea2017..d050b66ab9e 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -53,7 +53,7 @@ struct audit_krule; extern int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap, int audit); extern int cap_settime(struct timespec *ts, struct timezone *tz); -extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode); +extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern int cap_capset(struct cred *new, const struct cred *old, @@ -653,6 +653,11 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. + * @cred_alloc_blank: + * @cred points to the credentials. + * @gfp indicates the atomicity of any memory allocations. + * Only allocate sufficient memory and attach to @cred such that + * cred_transfer() will not get ENOMEM. * @cred_free: * @cred points to the credentials. * Deallocate and clear the cred->security field in a set of credentials. @@ -665,6 +670,10 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @new points to the new credentials. * @old points to the original credentials. * Install a new set of credentials. + * @cred_transfer: + * @new points to the new credentials. + * @old points to the original credentials. + * Transfer data from original creds to new creds * @kernel_act_as: * Set the credentials for a kernel service to act as (subjective context). * @new points to the credentials to be modified. @@ -678,6 +687,10 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @inode points to the inode to use as a reference. * The current task must be the one that nominated @inode. * Return 0 if successful. + * @kernel_module_request: + * Ability to trigger the kernel to automatically upcall to userspace for + * userspace to load a kernel module with the given name. + * Return 0 if successful. * @task_setuid: * Check permission before setting one or more of the user identity * attributes of the current process. The @flags parameter indicates @@ -994,6 +1007,17 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Sets the connection's peersid to the secmark on skb. * @req_classify_flow: * Sets the flow's sid to the openreq sid. + * @tun_dev_create: + * Check permissions prior to creating a new TUN device. + * @tun_dev_post_create: + * This hook allows a module to update or allocate a per-socket security + * structure. + * @sk contains the newly created sock structure. + * @tun_dev_attach: + * Check permissions prior to attaching to a persistent TUN device. This + * hook can also be used by the module to update any security state + * associated with the TUN device's sock structure. + * @sk contains the existing sock structure. * * Security hooks for XFRM operations. * @@ -1088,6 +1112,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Return the length of the string (including terminating NUL) or -ve if * an error. * May also return 0 (and a NULL buffer pointer) if there is no label. + * @key_session_to_parent: + * Forcibly assign the session keyring from a process to its parent + * process. + * @cred: Pointer to process's credentials + * @parent_cred: Pointer to parent process's credentials + * @keyring: Proposed new session keyring + * Return 0 if permission is granted, -ve error otherwise. * * Security hooks affecting all System V IPC operations. * @@ -1229,7 +1260,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @alter contains the flag indicating whether changes are to be made. * Return 0 if permission is granted. * - * @ptrace_may_access: + * @ptrace_access_check: * Check permission before allowing the current process to trace the * @child process. * Security modules may also want to perform a process tracing check @@ -1244,7 +1275,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Check that the @parent process has sufficient permission to trace the * current process before allowing the current process to present itself * to the @parent process for tracing. - * The parent process will still have to undergo the ptrace_may_access + * The parent process will still have to undergo the ptrace_access_check * checks before it is allowed to trace this one. * @parent contains the task_struct structure for debugger process. * Return 0 if permission is granted. @@ -1351,12 +1382,47 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * audit_rule_init. * @rule contains the allocated rule * + * @inode_notifysecctx: + * Notify the security module of what the security context of an inode + * should be. Initializes the incore security context managed by the + * security module for this inode. Example usage: NFS client invokes + * this hook to initialize the security context in its incore inode to the + * value provided by the server for the file when the server returned the + * file's attributes to the client. + * + * Must be called with inode->i_mutex locked. + * + * @inode we wish to set the security context of. + * @ctx contains the string which we wish to set in the inode. + * @ctxlen contains the length of @ctx. + * + * @inode_setsecctx: + * Change the security context of an inode. Updates the + * incore security context managed by the security module and invokes the + * fs code as needed (via __vfs_setxattr_noperm) to update any backing + * xattrs that represent the context. Example usage: NFS server invokes + * this hook to change the security context in its incore inode and on the + * backing filesystem to a value provided by the client on a SETATTR + * operation. + * + * Must be called with inode->i_mutex locked. + * + * @dentry contains the inode we wish to set the security context of. + * @ctx contains the string which we wish to set in the inode. + * @ctxlen contains the length of @ctx. + * + * @inode_getsecctx: + * Returns a string containing all relavent security context information + * + * @inode we wish to set the security context of. + * @ctx is a pointer in which to place the allocated security context. + * @ctxlen points to the place to put the length of @ctx. * This is the main security structure. */ struct security_operations { char name[SECURITY_NAME_MAX + 1]; - int (*ptrace_may_access) (struct task_struct *child, unsigned int mode); + int (*ptrace_access_check) (struct task_struct *child, unsigned int mode); int (*ptrace_traceme) (struct task_struct *parent); int (*capget) (struct task_struct *target, kernel_cap_t *effective, @@ -1483,12 +1549,15 @@ struct security_operations { int (*dentry_open) (struct file *file, const struct cred *cred); int (*task_create) (unsigned long clone_flags); + int (*cred_alloc_blank) (struct cred *cred, gfp_t gfp); void (*cred_free) (struct cred *cred); int (*cred_prepare)(struct cred *new, const struct cred *old, gfp_t gfp); void (*cred_commit)(struct cred *new, const struct cred *old); + void (*cred_transfer)(struct cred *new, const struct cred *old); int (*kernel_act_as)(struct cred *new, u32 secid); int (*kernel_create_files_as)(struct cred *new, struct inode *inode); + int (*kernel_module_request)(void); int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); int (*task_fix_setuid) (struct cred *new, const struct cred *old, int flags); @@ -1556,6 +1625,10 @@ struct security_operations { int (*secctx_to_secid) (const char *secdata, u32 seclen, u32 *secid); void (*release_secctx) (char *secdata, u32 seclen); + int (*inode_notifysecctx)(struct inode *inode, void *ctx, u32 ctxlen); + int (*inode_setsecctx)(struct dentry *dentry, void *ctx, u32 ctxlen); + int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen); + #ifdef CONFIG_SECURITY_NETWORK int (*unix_stream_connect) (struct socket *sock, struct socket *other, struct sock *newsk); @@ -1592,6 +1665,9 @@ struct security_operations { void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req); void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb); void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl); + int (*tun_dev_create)(void); + void (*tun_dev_post_create)(struct sock *sk); + int (*tun_dev_attach)(struct sock *sk); #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_NETWORK_XFRM @@ -1620,6 +1696,9 @@ struct security_operations { const struct cred *cred, key_perm_t perm); int (*key_getsecurity)(struct key *key, char **_buffer); + int (*key_session_to_parent)(const struct cred *cred, + const struct cred *parent_cred, + struct key *key); #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT @@ -1637,7 +1716,7 @@ extern int security_module_enable(struct security_operations *ops); extern int register_security(struct security_operations *ops); /* Security operations */ -int security_ptrace_may_access(struct task_struct *child, unsigned int mode); +int security_ptrace_access_check(struct task_struct *child, unsigned int mode); int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, kernel_cap_t *effective, @@ -1736,11 +1815,14 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_dentry_open(struct file *file, const struct cred *cred); int security_task_create(unsigned long clone_flags); +int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); void security_commit_creds(struct cred *new, const struct cred *old); +void security_transfer_creds(struct cred *new, const struct cred *old); int security_kernel_act_as(struct cred *new, u32 secid); int security_kernel_create_files_as(struct cred *new, struct inode *inode); +int security_kernel_module_request(void); int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags); @@ -1796,6 +1878,9 @@ int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(char *secdata, u32 seclen); +int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); +int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); +int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); #else /* CONFIG_SECURITY */ struct security_mnt_opts { }; @@ -1818,10 +1903,10 @@ static inline int security_init(void) return 0; } -static inline int security_ptrace_may_access(struct task_struct *child, +static inline int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { - return cap_ptrace_may_access(child, mode); + return cap_ptrace_access_check(child, mode); } static inline int security_ptrace_traceme(struct task_struct *parent) @@ -2266,6 +2351,11 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } +static inline int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) +{ + return 0; +} + static inline void security_cred_free(struct cred *cred) { } @@ -2281,6 +2371,11 @@ static inline void security_commit_creds(struct cred *new, { } +static inline void security_transfer_creds(struct cred *new, + const struct cred *old) +{ +} + static inline int security_kernel_act_as(struct cred *cred, u32 secid) { return 0; @@ -2292,6 +2387,11 @@ static inline int security_kernel_create_files_as(struct cred *cred, return 0; } +static inline int security_kernel_module_request(void) +{ + return 0; +} + static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { @@ -2537,6 +2637,19 @@ static inline int security_secctx_to_secid(const char *secdata, static inline void security_release_secctx(char *secdata, u32 seclen) { } + +static inline int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) +{ + return -EOPNOTSUPP; +} +static inline int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) +{ + return -EOPNOTSUPP; +} +static inline int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_SECURITY */ #ifdef CONFIG_SECURITY_NETWORK @@ -2575,6 +2688,9 @@ void security_inet_csk_clone(struct sock *newsk, const struct request_sock *req); void security_inet_conn_established(struct sock *sk, struct sk_buff *skb); +int security_tun_dev_create(void); +void security_tun_dev_post_create(struct sock *sk); +int security_tun_dev_attach(struct sock *sk); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct socket *sock, @@ -2725,6 +2841,20 @@ static inline void security_inet_conn_established(struct sock *sk, struct sk_buff *skb) { } + +static inline int security_tun_dev_create(void) +{ + return 0; +} + +static inline void security_tun_dev_post_create(struct sock *sk) +{ +} + +static inline int security_tun_dev_attach(struct sock *sk) +{ + return 0; +} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_NETWORK_XFRM @@ -2881,6 +3011,9 @@ void security_key_free(struct key *key); int security_key_permission(key_ref_t key_ref, const struct cred *cred, key_perm_t perm); int security_key_getsecurity(struct key *key, char **_buffer); +int security_key_session_to_parent(const struct cred *cred, + const struct cred *parent_cred, + struct key *key); #else @@ -2908,6 +3041,13 @@ static inline int security_key_getsecurity(struct key *key, char **_buffer) return 0; } +static inline int security_key_session_to_parent(const struct cred *cred, + const struct cred *parent_cred, + struct key *key) +{ + return 0; +} + #endif #endif /* CONFIG_KEYS */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index abff6c9b413..6d3f2f449ea 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -39,7 +39,7 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) } #ifdef CONFIG_TMPFS_POSIX_ACL -int shmem_permission(struct inode *, int); +int shmem_check_acl(struct inode *, int); int shmem_acl_init(struct inode *, struct inode *); extern struct xattr_handler shmem_xattr_acl_access_handler; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3224820c851..78b1e4684cc 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -14,17 +14,6 @@ extern struct list_head inode_in_use; extern struct list_head inode_unused; /* - * Yes, writeback.h requires sched.h - * No, sched.h is not included from here. - */ -static inline int task_is_pdflush(struct task_struct *task) -{ - return task->flags & PF_FLUSHER; -} - -#define current_is_pdflush() task_is_pdflush(current) - -/* * fs/fs-writeback.c */ enum writeback_sync_modes { @@ -40,6 +29,8 @@ enum writeback_sync_modes { struct writeback_control { struct backing_dev_info *bdi; /* If !NULL, only write back this queue */ + struct super_block *sb; /* if !NULL, only write inodes from + this super_block */ enum writeback_sync_modes sync_mode; unsigned long *older_than_this; /* If !NULL, only write back inodes older than this */ @@ -76,9 +67,13 @@ struct writeback_control { /* * fs/fs-writeback.c */ -void writeback_inodes(struct writeback_control *wbc); +struct bdi_writeback; int inode_wait(void *); -void sync_inodes_sb(struct super_block *, int wait); +long writeback_inodes_sb(struct super_block *); +long sync_inodes_sb(struct super_block *); +void writeback_inodes_wbc(struct writeback_control *wbc); +long wb_do_writeback(struct bdi_writeback *wb, int force_wait); +void wakeup_flusher_threads(long nr_pages); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) @@ -98,7 +93,6 @@ static inline void inode_sync_wait(struct inode *inode) /* * mm/page-writeback.c */ -int wakeup_pdflush(long nr_pages); void laptop_io_completion(void); void laptop_sync_completion(void); void throttle_vm_writeout(gfp_t gfp_mask); @@ -150,7 +144,6 @@ balance_dirty_pages_ratelimited(struct address_space *mapping) typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, void *data); -int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); int generic_writepages(struct address_space *mapping, struct writeback_control *wbc); int write_cache_pages(struct address_space *mapping, diff --git a/include/linux/xattr.h b/include/linux/xattr.h index d131e352cfe..5c84af8c5f6 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -49,6 +49,7 @@ struct xattr_handler { ssize_t xattr_getsecurity(struct inode *, const char *, void *, size_t); ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); +int __vfs_setxattr_noperm(struct dentry *, const char *, const void *, size_t, int); int vfs_setxattr(struct dentry *, const char *, const void *, size_t, int); int vfs_removexattr(struct dentry *, const char *); diff --git a/kernel/acct.c b/kernel/acct.c index 9f3391090b3..9a4715a2f6b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -491,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct, u64 run_time; struct timespec uptime; struct tty_struct *tty; + const struct cred *orig_cred; + + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); /* * First check to see if there is enough free_space to continue * the process accounting system. */ if (!check_free_space(acct, file)) - return; + goto out; /* * Fill the accounting struct with the needed info as recorded @@ -578,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, sizeof(acct_t), &file->f_pos); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; set_fs(fs); +out: + revert_creds(orig_cred); } /** diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b6eadfe30e7..c7ece8f027f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -600,6 +600,7 @@ static struct inode_operations cgroup_dir_inode_operations; static struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { + .name = "cgroup", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/kernel/cred.c b/kernel/cred.c index 1bb4d7e5d61..006fcab009d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -18,6 +18,18 @@ #include <linux/cn_proc.h> #include "cred-internals.h" +#if 0 +#define kdebug(FMT, ...) \ + printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#else +static inline __attribute__((format(printf, 1, 2))) +void no_printk(const char *fmt, ...) +{ +} +#define kdebug(FMT, ...) \ + no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#endif + static struct kmem_cache *cred_jar; /* @@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = { */ struct cred init_cred = { .usage = ATOMIC_INIT(4), +#ifdef CONFIG_DEBUG_CREDENTIALS + .subscribers = ATOMIC_INIT(2), + .magic = CRED_MAGIC, +#endif .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_INIT_INH_SET, .cap_permitted = CAP_FULL_SET, @@ -48,6 +64,31 @@ struct cred init_cred = { #endif }; +static inline void set_cred_subscribers(struct cred *cred, int n) +{ +#ifdef CONFIG_DEBUG_CREDENTIALS + atomic_set(&cred->subscribers, n); +#endif +} + +static inline int read_cred_subscribers(const struct cred *cred) +{ +#ifdef CONFIG_DEBUG_CREDENTIALS + return atomic_read(&cred->subscribers); +#else + return 0; +#endif +} + +static inline void alter_cred_subscribers(const struct cred *_cred, int n) +{ +#ifdef CONFIG_DEBUG_CREDENTIALS + struct cred *cred = (struct cred *) _cred; + + atomic_add(n, &cred->subscribers); +#endif +} + /* * Dispose of the shared task group credentials */ @@ -85,9 +126,22 @@ static void put_cred_rcu(struct rcu_head *rcu) { struct cred *cred = container_of(rcu, struct cred, rcu); + kdebug("put_cred_rcu(%p)", cred); + +#ifdef CONFIG_DEBUG_CREDENTIALS + if (cred->magic != CRED_MAGIC_DEAD || + atomic_read(&cred->usage) != 0 || + read_cred_subscribers(cred) != 0) + panic("CRED: put_cred_rcu() sees %p with" + " mag %x, put %p, usage %d, subscr %d\n", + cred, cred->magic, cred->put_addr, + atomic_read(&cred->usage), + read_cred_subscribers(cred)); +#else if (atomic_read(&cred->usage) != 0) panic("CRED: put_cred_rcu() sees %p with usage %d\n", cred, atomic_read(&cred->usage)); +#endif security_cred_free(cred); key_put(cred->thread_keyring); @@ -106,12 +160,90 @@ static void put_cred_rcu(struct rcu_head *rcu) */ void __put_cred(struct cred *cred) { + kdebug("__put_cred(%p{%d,%d})", cred, + atomic_read(&cred->usage), + read_cred_subscribers(cred)); + BUG_ON(atomic_read(&cred->usage) != 0); +#ifdef CONFIG_DEBUG_CREDENTIALS + BUG_ON(read_cred_subscribers(cred) != 0); + cred->magic = CRED_MAGIC_DEAD; + cred->put_addr = __builtin_return_address(0); +#endif + BUG_ON(cred == current->cred); + BUG_ON(cred == current->real_cred); call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); +/* + * Clean up a task's credentials when it exits + */ +void exit_creds(struct task_struct *tsk) +{ + struct cred *cred; + + kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred, + atomic_read(&tsk->cred->usage), + read_cred_subscribers(tsk->cred)); + + cred = (struct cred *) tsk->real_cred; + tsk->real_cred = NULL; + validate_creds(cred); + alter_cred_subscribers(cred, -1); + put_cred(cred); + + cred = (struct cred *) tsk->cred; + tsk->cred = NULL; + validate_creds(cred); + alter_cred_subscribers(cred, -1); + put_cred(cred); + + cred = (struct cred *) tsk->replacement_session_keyring; + if (cred) { + tsk->replacement_session_keyring = NULL; + validate_creds(cred); + put_cred(cred); + } +} + +/* + * Allocate blank credentials, such that the credentials can be filled in at a + * later date without risk of ENOMEM. + */ +struct cred *cred_alloc_blank(void) +{ + struct cred *new; + + new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + +#ifdef CONFIG_KEYS + new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); + if (!new->tgcred) { + kfree(new); + return NULL; + } + atomic_set(&new->tgcred->usage, 1); +#endif + + atomic_set(&new->usage, 1); + + if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) + goto error; + +#ifdef CONFIG_DEBUG_CREDENTIALS + new->magic = CRED_MAGIC; +#endif + return new; + +error: + abort_creds(new); + return NULL; +} + /** * prepare_creds - Prepare a new set of credentials for modification * @@ -132,16 +264,19 @@ struct cred *prepare_creds(void) const struct cred *old; struct cred *new; - BUG_ON(atomic_read(&task->real_cred->usage) < 1); + validate_process_creds(); new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; + kdebug("prepare_creds() alloc %p", new); + old = task->cred; memcpy(new, old, sizeof(struct cred)); atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); get_group_info(new->group_info); get_uid(new->user); @@ -157,6 +292,7 @@ struct cred *prepare_creds(void) if (security_prepare_creds(new, old, GFP_KERNEL) < 0) goto error; + validate_creds(new); return new; error: @@ -229,9 +365,12 @@ struct cred *prepare_usermodehelper_creds(void) if (!new) return NULL; + kdebug("prepare_usermodehelper_creds() alloc %p", new); + memcpy(new, &init_cred, sizeof(struct cred)); atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); get_group_info(new->group_info); get_uid(new->user); @@ -250,6 +389,7 @@ struct cred *prepare_usermodehelper_creds(void) #endif if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) goto error; + validate_creds(new); BUG_ON(atomic_read(&new->usage) != 1); return new; @@ -286,6 +426,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ) { p->real_cred = get_cred(p->cred); get_cred(p->cred); + alter_cred_subscribers(p->cred, 2); + kdebug("share_creds(%p{%d,%d})", + p->cred, atomic_read(&p->cred->usage), + read_cred_subscribers(p->cred)); atomic_inc(&p->cred->user->processes); return 0; } @@ -331,6 +475,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) atomic_inc(&new->user->processes); p->cred = p->real_cred = get_cred(new); + alter_cred_subscribers(new, 2); + validate_creds(new); return 0; error_put: @@ -355,13 +501,20 @@ error_put: int commit_creds(struct cred *new) { struct task_struct *task = current; - const struct cred *old; + const struct cred *old = task->real_cred; - BUG_ON(task->cred != task->real_cred); - BUG_ON(atomic_read(&task->real_cred->usage) < 2); + kdebug("commit_creds(%p{%d,%d})", new, + atomic_read(&new->usage), + read_cred_subscribers(new)); + + BUG_ON(task->cred != old); +#ifdef CONFIG_DEBUG_CREDENTIALS + BUG_ON(read_cred_subscribers(old) < 2); + validate_creds(old); + validate_creds(new); +#endif BUG_ON(atomic_read(&new->usage) < 1); - old = task->real_cred; security_commit_creds(new, old); get_cred(new); /* we will require a ref for the subj creds too */ @@ -390,12 +543,14 @@ int commit_creds(struct cred *new) * cheaply with the new uid cache, so if it matters * we should be checking for it. -DaveM */ + alter_cred_subscribers(new, 2); if (new->user != old->user) atomic_inc(&new->user->processes); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user) atomic_dec(&old->user->processes); + alter_cred_subscribers(old, -2); sched_switch_user(task); @@ -428,6 +583,13 @@ EXPORT_SYMBOL(commit_creds); */ void abort_creds(struct cred *new) { + kdebug("abort_creds(%p{%d,%d})", new, + atomic_read(&new->usage), + read_cred_subscribers(new)); + +#ifdef CONFIG_DEBUG_CREDENTIALS + BUG_ON(read_cred_subscribers(new) != 0); +#endif BUG_ON(atomic_read(&new->usage) < 1); put_cred(new); } @@ -444,7 +606,20 @@ const struct cred *override_creds(const struct cred *new) { const struct cred *old = current->cred; - rcu_assign_pointer(current->cred, get_cred(new)); + kdebug("override_creds(%p{%d,%d})", new, + atomic_read(&new->usage), + read_cred_subscribers(new)); + + validate_creds(old); + validate_creds(new); + get_cred(new); + alter_cred_subscribers(new, 1); + rcu_assign_pointer(current->cred, new); + alter_cred_subscribers(old, -1); + + kdebug("override_creds() = %p{%d,%d}", old, + atomic_read(&old->usage), + read_cred_subscribers(old)); return old; } EXPORT_SYMBOL(override_creds); @@ -460,7 +635,15 @@ void revert_creds(const struct cred *old) { const struct cred *override = current->cred; + kdebug("revert_creds(%p{%d,%d})", old, + atomic_read(&old->usage), + read_cred_subscribers(old)); + + validate_creds(old); + validate_creds(override); + alter_cred_subscribers(old, 1); rcu_assign_pointer(current->cred, old); + alter_cred_subscribers(override, -1); put_cred(override); } EXPORT_SYMBOL(revert_creds); @@ -502,11 +685,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; + kdebug("prepare_kernel_cred() alloc %p", new); + if (daemon) old = get_task_cred(daemon); else old = get_cred(&init_cred); + validate_creds(old); + *new = *old; get_uid(new->user); get_group_info(new->group_info); @@ -526,7 +713,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) goto error; atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); put_cred(old); + validate_creds(new); return new; error: @@ -589,3 +778,95 @@ int set_create_files_as(struct cred *new, struct inode *inode) return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as); + +#ifdef CONFIG_DEBUG_CREDENTIALS + +/* + * dump invalid credentials + */ +static void dump_invalid_creds(const struct cred *cred, const char *label, + const struct task_struct *tsk) +{ + printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n", + label, cred, + cred == &init_cred ? "[init]" : "", + cred == tsk->real_cred ? "[real]" : "", + cred == tsk->cred ? "[eff]" : ""); + printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n", + cred->magic, cred->put_addr); + printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n", + atomic_read(&cred->usage), + read_cred_subscribers(cred)); + printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", + cred->uid, cred->euid, cred->suid, cred->fsuid); + printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", + cred->gid, cred->egid, cred->sgid, cred->fsgid); +#ifdef CONFIG_SECURITY + printk(KERN_ERR "CRED: ->security is %p\n", cred->security); + if ((unsigned long) cred->security >= PAGE_SIZE && + (((unsigned long) cred->security & 0xffffff00) != + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))) + printk(KERN_ERR "CRED: ->security {%x, %x}\n", + ((u32*)cred->security)[0], + ((u32*)cred->security)[1]); +#endif +} + +/* + * report use of invalid credentials + */ +void __invalid_creds(const struct cred *cred, const char *file, unsigned line) +{ + printk(KERN_ERR "CRED: Invalid credentials\n"); + printk(KERN_ERR "CRED: At %s:%u\n", file, line); + dump_invalid_creds(cred, "Specified", current); + BUG(); +} +EXPORT_SYMBOL(__invalid_creds); + +/* + * check the credentials on a process + */ +void __validate_process_creds(struct task_struct *tsk, + const char *file, unsigned line) +{ + if (tsk->cred == tsk->real_cred) { + if (unlikely(read_cred_subscribers(tsk->cred) < 2 || + creds_are_invalid(tsk->cred))) + goto invalid_creds; + } else { + if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 || + read_cred_subscribers(tsk->cred) < 1 || + creds_are_invalid(tsk->real_cred) || + creds_are_invalid(tsk->cred))) + goto invalid_creds; + } + return; + +invalid_creds: + printk(KERN_ERR "CRED: Invalid process credentials\n"); + printk(KERN_ERR "CRED: At %s:%u\n", file, line); + + dump_invalid_creds(tsk->real_cred, "Real", tsk); + if (tsk->cred != tsk->real_cred) + dump_invalid_creds(tsk->cred, "Effective", tsk); + else + printk(KERN_ERR "CRED: Effective creds == Real creds\n"); + BUG(); +} +EXPORT_SYMBOL(__validate_process_creds); + +/* + * check creds for do_exit() + */ +void validate_creds_for_do_exit(struct task_struct *tsk) +{ + kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})", + tsk->real_cred, tsk->cred, + atomic_read(&tsk->cred->usage), + read_cred_subscribers(tsk->cred)); + + __validate_process_creds(tsk, __FILE__, __LINE__); +} + +#endif /* CONFIG_DEBUG_CREDENTIALS */ diff --git a/kernel/exit.c b/kernel/exit.c index 869dc221733..c98ff7a8025 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -901,6 +901,8 @@ NORET_TYPE void do_exit(long code) tracehook_report_exit(&code); + validate_creds_for_do_exit(tsk); + /* * We're taking recursive faults here in do_exit. Safest is to just * leave this task alone and wait for reboot. @@ -1009,6 +1011,8 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + validate_creds_for_do_exit(tsk); + preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; diff --git a/kernel/fork.c b/kernel/fork.c index e6c04d462ab..aab8579c609 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -152,8 +152,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); - put_cred(tsk->real_cred); - put_cred(tsk->cred); + exit_creds(tsk); delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) @@ -1297,8 +1296,7 @@ bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); - put_cred(p->real_cred); - put_cred(p->cred); + exit_creds(p); bad_fork_free: free_task(p); fork_out: diff --git a/kernel/kmod.c b/kernel/kmod.c index 385c31a1bdb..4e8cae2e914 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -78,6 +78,10 @@ int __request_module(bool wait, const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + ret = security_kernel_module_request(); + if (ret) + return ret; + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); @@ -462,6 +466,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int retval = 0; BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + validate_creds(sub_info->cred); helper_lock(); if (sub_info->path[0] == '\0') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 082c320e4db..307c285af59 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace_may_access(task, mode); + return security_ptrace_access_check(task, mode); } bool ptrace_may_access(struct task_struct *task, unsigned int mode) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 58be76017fd..71d8dc7f992 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,7 +49,6 @@ #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/security.h> #include <linux/slow-work.h> #include <linux/perf_counter.h> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 12327b2bb78..fbb87cf138c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -653,6 +653,21 @@ config DEBUG_NOTIFIERS This is a relatively cheap check but if you care about maximum performance, say N. +config DEBUG_CREDENTIALS + bool "Debug credential management" + depends on DEBUG_KERNEL + help + Enable this to turn on some debug checking for credential + management. The additional code keeps track of the number of + pointers from task_structs to any given cred struct, and checks to + see that this number never exceeds the usage count of the cred + struct. + + Furthermore, if SELinux is enabled, this also checks that the + security pointer in the cred struct is never seen to be invalid. + + If unsure, say N. + # # Select this config option from the architecture Kconfig, if it # it is preferred to always offer frame pointers as a config diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c index f1ed2fe76c6..bd2bea96336 100644 --- a/lib/is_single_threaded.c +++ b/lib/is_single_threaded.c @@ -12,34 +12,47 @@ #include <linux/sched.h> -/** - * is_single_threaded - Determine if a thread group is single-threaded or not - * @p: A task in the thread group in question - * - * This returns true if the thread group to which a task belongs is single - * threaded, false if it is not. +/* + * Returns true if the task does not share ->mm with another thread/process. */ -bool is_single_threaded(struct task_struct *p) +bool current_is_single_threaded(void) { - struct task_struct *g, *t; - struct mm_struct *mm = p->mm; + struct task_struct *task = current; + struct mm_struct *mm = task->mm; + struct task_struct *p, *t; + bool ret; - if (atomic_read(&p->signal->count) != 1) - goto no; + if (atomic_read(&task->signal->live) != 1) + return false; - if (atomic_read(&p->mm->mm_users) != 1) { - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (t->mm == mm && t != p) - goto no_unlock; - } while_each_thread(g, t); - read_unlock(&tasklist_lock); - } + if (atomic_read(&mm->mm_users) == 1) + return true; - return true; + ret = false; + rcu_read_lock(); + for_each_process(p) { + if (unlikely(p->flags & PF_KTHREAD)) + continue; + if (unlikely(p == task->group_leader)) + continue; + + t = p; + do { + if (unlikely(t->mm == mm)) + goto found; + if (likely(t->mm)) + break; + /* + * t->mm == NULL. Make sure next_thread/next_task + * will see other CLONE_VM tasks which might be + * forked before exiting. + */ + smp_rmb(); + } while_each_thread(p, t); + } + ret = true; +found: + rcu_read_unlock(); -no_unlock: - read_unlock(&tasklist_lock); -no: - return false; + return ret; } diff --git a/mm/Makefile b/mm/Makefile index 5e0bd642669..147a7a7873c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ vmalloc.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page_alloc.o page-writeback.o pdflush.o \ + maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c86edd24429..d3ca0dac111 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,8 +1,11 @@ #include <linux/wait.h> #include <linux/backing-dev.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> @@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) EXPORT_SYMBOL(default_unplug_io_fn); struct backing_dev_info default_backing_dev_info = { + .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, @@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = { EXPORT_SYMBOL_GPL(default_backing_dev_info); static struct class *bdi_class; +DEFINE_SPINLOCK(bdi_lock); +LIST_HEAD(bdi_list); +LIST_HEAD(bdi_pending_list); + +static struct task_struct *sync_supers_tsk; +static struct timer_list sync_supers_timer; + +static int bdi_sync_supers(void *); +static void sync_supers_timer_fn(unsigned long); +static void arm_supers_timer(void); + +static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> @@ -37,9 +53,29 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; + struct bdi_writeback *wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; + struct inode *inode; + + /* + * inode lock is enough here, the bdi->wb_list is protected by + * RCU on the reader side + */ + nr_wb = nr_dirty = nr_io = nr_more_io = 0; + spin_lock(&inode_lock); + list_for_each_entry(wb, &bdi->wb_list, list) { + nr_wb++; + list_for_each_entry(inode, &wb->b_dirty, i_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_list) + nr_more_io++; + } + spin_unlock(&inode_lock); get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiReclaimable: %8lu kB\n" "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n", + "BackgroundThresh: %8lu kB\n" + "WriteBack threads:%8lu\n" + "b_dirty: %8lu\n" + "b_io: %8lu\n" + "b_more_io: %8lu\n" + "bdi_list: %8u\n" + "state: %8lx\n" + "wb_mask: %8lx\n" + "wb_list: %8u\n" + "wb_cnt: %8u\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), - K(dirty_thresh), - K(background_thresh)); + K(bdi_thresh), K(dirty_thresh), + K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, + !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, + !list_empty(&bdi->wb_list), bdi->wb_cnt); #undef K return 0; @@ -185,6 +231,13 @@ static int __init default_bdi_init(void) { int err; + sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); + BUG_ON(IS_ERR(sync_supers_tsk)); + + init_timer(&sync_supers_timer); + setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); + arm_supers_timer(); + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -193,6 +246,248 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); +} + +static void bdi_task_init(struct backing_dev_info *bdi, + struct bdi_writeback *wb) +{ + struct task_struct *tsk = current; + + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&wb->list, &bdi->wb_list); + spin_unlock(&bdi->wb_lock); + + tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(tsk, 0); +} + +static int bdi_start_fn(void *ptr) +{ + struct bdi_writeback *wb = ptr; + struct backing_dev_info *bdi = wb->bdi; + int ret; + + /* + * Add us to the active bdi_list + */ + spin_lock(&bdi_lock); + list_add(&bdi->bdi_list, &bdi_list); + spin_unlock(&bdi_lock); + + bdi_task_init(bdi, wb); + + /* + * Clear pending bit and wakeup anybody waiting to tear us down + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); + + ret = bdi_writeback_task(wb); + + /* + * Remove us from the list + */ + spin_lock(&bdi->wb_lock); + list_del_rcu(&wb->list); + spin_unlock(&bdi->wb_lock); + + /* + * Flush any work that raced with us exiting. No new work + * will be added, since this bdi isn't discoverable anymore. + */ + if (!list_empty(&bdi->work_list)) + wb_do_writeback(wb, 1); + + wb->task = NULL; + return ret; +} + +int bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + return wb_has_dirty_io(&bdi->wb); +} + +static void bdi_flush_io(struct backing_dev_info *bdi) +{ + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .range_cyclic = 1, + .nr_to_write = 1024, + }; + + writeback_inodes_wbc(&wbc); +} + +/* + * kupdated() used to do this. We cannot do it from the bdi_forker_task() + * or we risk deadlocking on ->s_umount. The longer term solution would be + * to implement sync_supers_bdi() or similar and simply do it from the + * bdi writeback tasks individually. + */ +static int bdi_sync_supers(void *unused) +{ + set_user_nice(current, 0); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* + * Do this periodically, like kupdated() did before. + */ + sync_supers(); + } + + return 0; +} + +static void arm_supers_timer(void) +{ + unsigned long next; + + next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; + mod_timer(&sync_supers_timer, round_jiffies_up(next)); +} + +static void sync_supers_timer_fn(unsigned long unused) +{ + wake_up_process(sync_supers_tsk); + arm_supers_timer(); +} + +static int bdi_forker_task(void *ptr) +{ + struct bdi_writeback *me = ptr; + + bdi_task_init(me->bdi, me); + + for (;;) { + struct backing_dev_info *bdi, *tmp; + struct bdi_writeback *wb; + + /* + * Temporary measure, we want to make sure we don't see + * dirty data on the default backing_dev_info + */ + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) + wb_do_writeback(me, 0); + + spin_lock(&bdi_lock); + + /* + * Check if any existing bdi's have dirty data without + * a thread registered. If so, set that up. + */ + list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { + if (bdi->wb.task) + continue; + if (list_empty(&bdi->work_list) && + !bdi_has_dirty_io(bdi)) + continue; + + bdi_add_default_flusher_task(bdi); + } + + set_current_state(TASK_INTERRUPTIBLE); + + if (list_empty(&bdi_pending_list)) { + unsigned long wait; + + spin_unlock(&bdi_lock); + wait = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout(wait); + try_to_freeze(); + continue; + } + + __set_current_state(TASK_RUNNING); + + /* + * This is our real job - check for pending entries in + * bdi_pending_list, and create the tasks that got added + */ + bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, + bdi_list); + list_del_init(&bdi->bdi_list); + spin_unlock(&bdi_lock); + + wb = &bdi->wb; + wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", + dev_name(bdi->dev)); + /* + * If task creation fails, then readd the bdi to + * the pending list and force writeout of the bdi + * from this forker thread. That will free some memory + * and we can try again. + */ + if (IS_ERR(wb->task)) { + wb->task = NULL; + + /* + * Add this 'bdi' to the back, so we get + * a chance to flush other bdi's to free + * memory. + */ + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_pending_list); + spin_unlock(&bdi_lock); + + bdi_flush_io(bdi); + } + } + + return 0; +} + +/* + * Add the default flusher task that gets created for any bdi + * that has dirty data pending writeout + */ +void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) +{ + if (!bdi_cap_writeback_dirty(bdi)) + return; + + if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) { + printk(KERN_ERR "bdi %p/%s is not registered!\n", + bdi, bdi->name); + return; + } + + /* + * Check with the helper whether to proceed adding a task. Will only + * abort if we two or more simultanous calls to + * bdi_add_default_flusher_task() occured, further additions will block + * waiting for previous additions to finish. + */ + if (!test_and_set_bit(BDI_pending, &bdi->state)) { + list_move_tail(&bdi->bdi_list, &bdi_pending_list); + + /* + * We are now on the pending list, wake up bdi_forker_task() + * to finish the job and add us back to the active bdi_list + */ + wake_up_process(default_backing_dev_info.wb.task); + } +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { @@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, goto exit; } + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_list); + spin_unlock(&bdi_lock); + bdi->dev = dev; - bdi_debug_register(bdi, dev_name(dev)); + /* + * Just start the forker thread for our default backing_dev_info, + * and add other bdi's to the list. They will get a thread created + * on-demand when they need it. + */ + if (bdi_cap_flush_forker(bdi)) { + struct bdi_writeback *wb = &bdi->wb; + + wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", + dev_name(dev)); + if (IS_ERR(wb->task)) { + wb->task = NULL; + ret = -ENOMEM; + + spin_lock(&bdi_lock); + list_del(&bdi->bdi_list); + spin_unlock(&bdi_lock); + goto exit; + } + } + + bdi_debug_register(bdi, dev_name(dev)); + set_bit(BDI_registered, &bdi->state); exit: return ret; } @@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) } EXPORT_SYMBOL(bdi_register_dev); +/* + * Remove bdi from the global list and shutdown any threads we have running + */ +static void bdi_wb_shutdown(struct backing_dev_info *bdi) +{ + struct bdi_writeback *wb; + + if (!bdi_cap_writeback_dirty(bdi)) + return; + + /* + * If setup is pending, wait for that to complete first + */ + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); + + /* + * Make sure nobody finds us on the bdi_list anymore + */ + spin_lock(&bdi_lock); + list_del(&bdi->bdi_list); + spin_unlock(&bdi_lock); + + /* + * Finally, kill the kernel threads. We don't need to be RCU + * safe anymore, since the bdi is gone from visibility. + */ + list_for_each_entry(wb, &bdi->wb_list, list) + kthread_stop(wb->task); +} + void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + if (!bdi_cap_flush_forker(bdi)) + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; @@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister); int bdi_init(struct backing_dev_info *bdi) { - int i; - int err; + int i, err; bdi->dev = NULL; bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; + spin_lock_init(&bdi->wb_lock); + INIT_LIST_HEAD(&bdi->bdi_list); + INIT_LIST_HEAD(&bdi->wb_list); + INIT_LIST_HEAD(&bdi->work_list); + + bdi_wb_init(&bdi->wb, bdi); + + /* + * Just one thread support for now, hard code mask and count + */ + bdi->wb_mask = 1; + bdi->wb_cnt = 1; for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); @@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; + WARN_ON(bdi_has_dirty_io(bdi)); + bdi_unregister(bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) diff --git a/mm/bootmem.c b/mm/bootmem.c index 701740c9e81..555d5d2731c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -521,7 +521,11 @@ find_block: region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + start_off); memset(region, 0, size); - kmemleak_alloc(region, size, 1, 0); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(region, size, 0, 0); return region; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 487267310a8..4ea4510e299 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -92,11 +92,13 @@ #include <linux/string.h> #include <linux/nodemask.h> #include <linux/mm.h> +#include <linux/workqueue.h> #include <asm/sections.h> #include <asm/processor.h> #include <asm/atomic.h> +#include <linux/kmemcheck.h> #include <linux/kmemleak.h> /* @@ -107,6 +109,7 @@ #define SECS_FIRST_SCAN 60 /* delay before the first scan */ #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ #define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ +#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ #define BYTES_PER_POINTER sizeof(void *) @@ -120,6 +123,9 @@ struct kmemleak_scan_area { size_t length; }; +#define KMEMLEAK_GREY 0 +#define KMEMLEAK_BLACK -1 + /* * Structure holding the metadata for each allocated memory block. * Modifications to such objects should be made while holding the @@ -161,6 +167,15 @@ struct kmemleak_object { /* flag set on newly allocated objects */ #define OBJECT_NEW (1 << 3) +/* number of bytes to print per line; must be 16 or 32 */ +#define HEX_ROW_SIZE 16 +/* number of bytes to print at a time (1, 2, 4, 8) */ +#define HEX_GROUP_SIZE 1 +/* include ASCII after the hex output */ +#define HEX_ASCII 1 +/* max number of lines to be printed */ +#define HEX_MAX_LINES 2 + /* the list of all allocated objects */ static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ @@ -228,11 +243,14 @@ struct early_log { int min_count; /* minimum reference count */ unsigned long offset; /* scan area offset */ size_t length; /* scan area length */ + unsigned long trace[MAX_TRACE]; /* stack trace */ + unsigned int trace_len; /* stack trace length */ }; /* early logging buffer and current position */ -static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE]; -static int crt_early_log; +static struct early_log + early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; +static int crt_early_log __initdata; static void kmemleak_disable(void); @@ -255,6 +273,35 @@ static void kmemleak_disable(void); } while (0) /* + * Printing of the objects hex dump to the seq file. The number of lines to be + * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The + * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called + * with the object->lock held. + */ +static void hex_dump_object(struct seq_file *seq, + struct kmemleak_object *object) +{ + const u8 *ptr = (const u8 *)object->pointer; + int i, len, remaining; + unsigned char linebuf[HEX_ROW_SIZE * 5]; + + /* limit the number of lines to HEX_MAX_LINES */ + remaining = len = + min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE)); + + seq_printf(seq, " hex dump (first %d bytes):\n", len); + for (i = 0; i < len; i += HEX_ROW_SIZE) { + int linelen = min(remaining, HEX_ROW_SIZE); + + remaining -= HEX_ROW_SIZE; + hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE, + HEX_GROUP_SIZE, linebuf, sizeof(linebuf), + HEX_ASCII); + seq_printf(seq, " %s\n", linebuf); + } +} + +/* * Object colors, encoded with count and min_count: * - white - orphan object, not enough references to it (count < min_count) * - gray - not orphan, not marked as false positive (min_count == 0) or @@ -264,19 +311,21 @@ static void kmemleak_disable(void); * Newly created objects don't have any color assigned (object->count == -1) * before the next memory scan when they become white. */ -static int color_white(const struct kmemleak_object *object) +static bool color_white(const struct kmemleak_object *object) { - return object->count != -1 && object->count < object->min_count; + return object->count != KMEMLEAK_BLACK && + object->count < object->min_count; } -static int color_gray(const struct kmemleak_object *object) +static bool color_gray(const struct kmemleak_object *object) { - return object->min_count != -1 && object->count >= object->min_count; + return object->min_count != KMEMLEAK_BLACK && + object->count >= object->min_count; } -static int color_black(const struct kmemleak_object *object) +static bool color_black(const struct kmemleak_object *object) { - return object->min_count == -1; + return object->min_count == KMEMLEAK_BLACK; } /* @@ -284,7 +333,7 @@ static int color_black(const struct kmemleak_object *object) * not be deleted and have a minimum age to avoid false positives caused by * pointers temporarily stored in CPU registers. */ -static int unreferenced_object(struct kmemleak_object *object) +static bool unreferenced_object(struct kmemleak_object *object) { return (object->flags & OBJECT_ALLOCATED) && color_white(object) && time_before_eq(object->jiffies + jiffies_min_age, @@ -304,6 +353,7 @@ static void print_unreferenced(struct seq_file *seq, object->pointer, object->size); seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); + hex_dump_object(seq, object); seq_printf(seq, " backtrace:\n"); for (i = 0; i < object->trace_len; i++) { @@ -330,6 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); + pr_notice(" flags = 0x%lx\n", object->flags); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); } @@ -434,21 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) } /* + * Save stack trace to the given array of MAX_TRACE size. + */ +static int __save_stack_trace(unsigned long *trace) +{ + struct stack_trace stack_trace; + + stack_trace.max_entries = MAX_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = trace; + stack_trace.skip = 2; + save_stack_trace(&stack_trace); + + return stack_trace.nr_entries; +} + +/* * Create the metadata (struct kmemleak_object) corresponding to an allocated * memory block and add it to the object_list and object_tree_root. */ -static void create_object(unsigned long ptr, size_t size, int min_count, - gfp_t gfp) +static struct kmemleak_object *create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; struct prio_tree_node *node; - struct stack_trace trace; object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); if (!object) { kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); - return; + return NULL; } INIT_LIST_HEAD(&object->object_list); @@ -482,18 +548,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count, } /* kernel backtrace */ - trace.max_entries = MAX_TRACE; - trace.nr_entries = 0; - trace.entries = object->trace; - trace.skip = 1; - save_stack_trace(&trace); - object->trace_len = trace.nr_entries; + object->trace_len = __save_stack_trace(object->trace); INIT_PRIO_TREE_NODE(&object->tree_node); object->tree_node.start = ptr; object->tree_node.last = ptr + size - 1; write_lock_irqsave(&kmemleak_lock, flags); + min_addr = min(min_addr, ptr); max_addr = max(max_addr, ptr + size); node = prio_tree_insert(&object_tree_root, &object->tree_node); @@ -504,20 +566,19 @@ static void create_object(unsigned long ptr, size_t size, int min_count, * random memory blocks. */ if (node != &object->tree_node) { - unsigned long flags; - kmemleak_stop("Cannot insert 0x%lx into the object search tree " "(already existing)\n", ptr); object = lookup_object(ptr, 1); - spin_lock_irqsave(&object->lock, flags); + spin_lock(&object->lock); dump_object_info(object); - spin_unlock_irqrestore(&object->lock, flags); + spin_unlock(&object->lock); goto out; } list_add_tail_rcu(&object->object_list, &object_list); out: write_unlock_irqrestore(&kmemleak_lock, flags); + return object; } /* @@ -604,46 +665,55 @@ static void delete_object_part(unsigned long ptr, size_t size) put_object(object); } -/* - * Make a object permanently as gray-colored so that it can no longer be - * reported as a leak. This is used in general to mark a false positive. - */ -static void make_gray_object(unsigned long ptr) + +static void __paint_it(struct kmemleak_object *object, int color) +{ + object->min_count = color; + if (color == KMEMLEAK_BLACK) + object->flags |= OBJECT_NO_SCAN; +} + +static void paint_it(struct kmemleak_object *object, int color) { unsigned long flags; + + spin_lock_irqsave(&object->lock, flags); + __paint_it(object, color); + spin_unlock_irqrestore(&object->lock, flags); +} + +static void paint_ptr(unsigned long ptr, int color) +{ struct kmemleak_object *object; object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); + kmemleak_warn("Trying to color unknown object " + "at 0x%08lx as %s\n", ptr, + (color == KMEMLEAK_GREY) ? "Grey" : + (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); return; } - - spin_lock_irqsave(&object->lock, flags); - object->min_count = 0; - spin_unlock_irqrestore(&object->lock, flags); + paint_it(object, color); put_object(object); } /* + * Make a object permanently as gray-colored so that it can no longer be + * reported as a leak. This is used in general to mark a false positive. + */ +static void make_gray_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_GREY); +} + +/* * Mark the object as black-colored so that it is ignored from scans and * reporting. */ static void make_black_object(unsigned long ptr) { - unsigned long flags; - struct kmemleak_object *object; - - object = find_and_get_object(ptr, 0); - if (!object) { - kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr); - return; - } - - spin_lock_irqsave(&object->lock, flags); - object->min_count = -1; - spin_unlock_irqrestore(&object->lock, flags); - put_object(object); + paint_ptr(ptr, KMEMLEAK_BLACK); } /* @@ -715,14 +785,15 @@ static void object_no_scan(unsigned long ptr) * Log an early kmemleak_* call to the early_log buffer. These calls will be * processed later once kmemleak is fully initialized. */ -static void log_early(int op_type, const void *ptr, size_t size, - int min_count, unsigned long offset, size_t length) +static void __init log_early(int op_type, const void *ptr, size_t size, + int min_count, unsigned long offset, size_t length) { unsigned long flags; struct early_log *log; if (crt_early_log >= ARRAY_SIZE(early_log)) { - pr_warning("Early log buffer exceeded\n"); + pr_warning("Early log buffer exceeded, " + "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n"); kmemleak_disable(); return; } @@ -739,16 +810,45 @@ static void log_early(int op_type, const void *ptr, size_t size, log->min_count = min_count; log->offset = offset; log->length = length; + if (op_type == KMEMLEAK_ALLOC) + log->trace_len = __save_stack_trace(log->trace); crt_early_log++; local_irq_restore(flags); } /* + * Log an early allocated block and populate the stack trace. + */ +static void early_alloc(struct early_log *log) +{ + struct kmemleak_object *object; + unsigned long flags; + int i; + + if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) + return; + + /* + * RCU locking needed to ensure object is not freed via put_object(). + */ + rcu_read_lock(); + object = create_object((unsigned long)log->ptr, log->size, + log->min_count, GFP_KERNEL); + spin_lock_irqsave(&object->lock, flags); + for (i = 0; i < log->trace_len; i++) + object->trace[i] = log->trace[i]; + object->trace_len = log->trace_len; + spin_unlock_irqrestore(&object->lock, flags); + rcu_read_unlock(); +} + +/* * Memory allocation function callback. This function is called from the * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, * vmalloc etc.). */ -void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) +void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp) { pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); @@ -763,7 +863,7 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc); * Memory freeing function callback. This function is called from the kernel * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). */ -void kmemleak_free(const void *ptr) +void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -778,7 +878,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free); * Partial memory freeing function callback. This function is usually called * from bootmem allocator when (part of) a memory block is freed. */ -void kmemleak_free_part(const void *ptr, size_t size) +void __ref kmemleak_free_part(const void *ptr, size_t size) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -793,7 +893,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free_part); * Mark an already allocated memory block as a false positive. This will cause * the block to no longer be reported as leak and always be scanned. */ -void kmemleak_not_leak(const void *ptr) +void __ref kmemleak_not_leak(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -809,7 +909,7 @@ EXPORT_SYMBOL(kmemleak_not_leak); * corresponding block is not a leak and does not contain any references to * other allocated memory blocks. */ -void kmemleak_ignore(const void *ptr) +void __ref kmemleak_ignore(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -823,8 +923,8 @@ EXPORT_SYMBOL(kmemleak_ignore); /* * Limit the range to be scanned in an allocated memory block. */ -void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, - gfp_t gfp) +void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, + size_t length, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -838,7 +938,7 @@ EXPORT_SYMBOL(kmemleak_scan_area); /* * Inform kmemleak not to scan the given memory block. */ -void kmemleak_no_scan(const void *ptr) +void __ref kmemleak_no_scan(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -882,15 +982,22 @@ static void scan_block(void *_start, void *_end, unsigned long *end = _end - (BYTES_PER_POINTER - 1); for (ptr = start; ptr < end; ptr++) { - unsigned long flags; - unsigned long pointer = *ptr; struct kmemleak_object *object; + unsigned long flags; + unsigned long pointer; if (allow_resched) cond_resched(); if (scan_should_stop()) break; + /* don't scan uninitialized memory */ + if (!kmemcheck_is_obj_initialized((unsigned long)ptr, + BYTES_PER_POINTER)) + continue; + + pointer = *ptr; + object = find_and_get_object(pointer, 1); if (!object) continue; @@ -949,10 +1056,21 @@ static void scan_object(struct kmemleak_object *object) if (!(object->flags & OBJECT_ALLOCATED)) /* already freed object */ goto out; - if (hlist_empty(&object->area_list)) - scan_block((void *)object->pointer, - (void *)(object->pointer + object->size), object, 0); - else + if (hlist_empty(&object->area_list)) { + void *start = (void *)object->pointer; + void *end = (void *)(object->pointer + object->size); + + while (start < end && (object->flags & OBJECT_ALLOCATED) && + !(object->flags & OBJECT_NO_SCAN)) { + scan_block(start, min(start + MAX_SCAN_SIZE, end), + object, 0); + start += MAX_SCAN_SIZE; + + spin_unlock_irqrestore(&object->lock, flags); + cond_resched(); + spin_lock_irqsave(&object->lock, flags); + } + } else hlist_for_each_entry(area, elem, &object->area_list, node) scan_block((void *)(object->pointer + area->offset), (void *)(object->pointer + area->offset @@ -970,7 +1088,6 @@ static void kmemleak_scan(void) { unsigned long flags; struct kmemleak_object *object, *tmp; - struct task_struct *task; int i; int new_leaks = 0; int gray_list_pass = 0; @@ -1037,15 +1154,16 @@ static void kmemleak_scan(void) } /* - * Scanning the task stacks may introduce false negatives and it is - * not enabled by default. + * Scanning the task stacks (may introduce false negatives). */ if (kmemleak_stack_scan) { + struct task_struct *p, *g; + read_lock(&tasklist_lock); - for_each_process(task) - scan_block(task_stack_page(task), - task_stack_page(task) + THREAD_SIZE, - NULL, 0); + do_each_thread(g, p) { + scan_block(task_stack_page(p), task_stack_page(p) + + THREAD_SIZE, NULL, 0); + } while_each_thread(g, p); read_unlock(&tasklist_lock); } @@ -1170,7 +1288,7 @@ static int kmemleak_scan_thread(void *arg) * Start the automatic memory scanning thread. This function must be called * with the scan_mutex held. */ -void start_scan_thread(void) +static void start_scan_thread(void) { if (scan_thread) return; @@ -1185,7 +1303,7 @@ void start_scan_thread(void) * Stop the automatic memory scanning thread. This function must be called * with the scan_mutex held. */ -void stop_scan_thread(void) +static void stop_scan_thread(void) { if (scan_thread) { kthread_stop(scan_thread); @@ -1294,6 +1412,49 @@ static int kmemleak_release(struct inode *inode, struct file *file) return seq_release(inode, file); } +static int dump_str_object_info(const char *str) +{ + unsigned long flags; + struct kmemleak_object *object; + unsigned long addr; + + addr= simple_strtoul(str, NULL, 0); + object = find_and_get_object(addr, 0); + if (!object) { + pr_info("Unknown object at 0x%08lx\n", addr); + return -EINVAL; + } + + spin_lock_irqsave(&object->lock, flags); + dump_object_info(object); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); + return 0; +} + +/* + * We use grey instead of black to ensure we can do future scans on the same + * objects. If we did not do future scans these black objects could + * potentially contain references to newly allocated objects in the future and + * we'd end up with false positives. + */ +static void kmemleak_clear(void) +{ + struct kmemleak_object *object; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_REPORTED) && + unreferenced_object(object)) + __paint_it(object, KMEMLEAK_GREY); + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); +} + /* * File write operation to configure kmemleak at run-time. The following * commands can be written to the /sys/kernel/debug/kmemleak file: @@ -1305,6 +1466,9 @@ static int kmemleak_release(struct inode *inode, struct file *file) * scan=... - set the automatic memory scanning period in seconds (0 to * disable it) * scan - trigger a memory scan + * clear - mark all current reported unreferenced kmemleak objects as + * grey to ignore printing them + * dump=... - dump information about the object found at the given address */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, size_t size, loff_t *ppos) @@ -1345,6 +1509,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, } } else if (strncmp(buf, "scan", 4) == 0) kmemleak_scan(); + else if (strncmp(buf, "clear", 5) == 0) + kmemleak_clear(); + else if (strncmp(buf, "dump=", 5) == 0) + ret = dump_str_object_info(buf + 5); else ret = -EINVAL; @@ -1371,7 +1539,7 @@ static const struct file_operations kmemleak_fops = { * Perform the freeing of the kmemleak internal objects after waiting for any * current memory scan to complete. */ -static int kmemleak_cleanup_thread(void *arg) +static void kmemleak_do_cleanup(struct work_struct *work) { struct kmemleak_object *object; @@ -1383,22 +1551,9 @@ static int kmemleak_cleanup_thread(void *arg) delete_object_full(object->pointer); rcu_read_unlock(); mutex_unlock(&scan_mutex); - - return 0; } -/* - * Start the clean-up thread. - */ -static void kmemleak_cleanup(void) -{ - struct task_struct *cleanup_thread; - - cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL, - "kmemleak-clean"); - if (IS_ERR(cleanup_thread)) - pr_warning("Failed to create the clean-up thread\n"); -} +static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); /* * Disable kmemleak. No memory allocation/freeing will be traced once this @@ -1416,7 +1571,7 @@ static void kmemleak_disable(void) /* check whether it is too early for a kernel thread */ if (atomic_read(&kmemleak_initialized)) - kmemleak_cleanup(); + schedule_work(&cleanup_work); pr_info("Kernel memory leak detector disabled\n"); } @@ -1469,8 +1624,7 @@ void __init kmemleak_init(void) switch (log->op_type) { case KMEMLEAK_ALLOC: - kmemleak_alloc(log->ptr, log->size, log->min_count, - GFP_KERNEL); + early_alloc(log); break; case KMEMLEAK_FREE: kmemleak_free(log->ptr); @@ -1513,7 +1667,7 @@ static int __init kmemleak_late_init(void) * after setting kmemleak_initialized and we may end up with * two clean-up threads but serialized by scan_mutex. */ - kmemleak_cleanup(); + schedule_work(&cleanup_work); return -ENOMEM; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 81627ebcd31..25e7770309b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,15 +36,6 @@ #include <linux/pagevec.h> /* - * The maximum number of pages to writeout in a single bdflush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 - -/* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. */ @@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -static void background_writeout(unsigned long _min_pages); - /* * Scale the writeback cache size proportional to the relative writeout speeds. * @@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) /* * */ -static DEFINE_SPINLOCK(bdi_lock); static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; - unsigned long flags; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { @@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) ret = -EINVAL; } } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock(&bdi_lock); return ret; } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) { - unsigned long flags; int ret = 0; if (max_ratio > 100) return -EINVAL; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock(&bdi_lock); return ret; } @@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping) * up. */ if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes(&wbc); + writeback_inodes_wbc(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - congestion_wait(BLK_RW_ASYNC, HZ/10); + schedule_timeout(1); } if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && @@ -594,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && (global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS) - > background_thresh))) - pdflush_operation(background_writeout, 0); + (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) + > background_thresh))) { + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .nr_to_write = nr_writeback, + }; + + + bdi_start_writeback(&wbc); + } } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -681,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask) } } -/* - * writeback at least _min_pages, and keep writing until the amount of dirty - * memory is less than the background threshold, or until we're all clean. - */ -static void background_writeout(unsigned long _min_pages) -{ - long min_pages = _min_pages; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = 0, - .nonblocking = 1, - .range_cyclic = 1, - }; - - for ( ; ; ) { - unsigned long background_thresh; - unsigned long dirty_thresh; - - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) < background_thresh - && min_pages <= 0) - break; - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - wbc.pages_skipped = 0; - writeback_inodes(&wbc); - min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { - /* Wrote less than expected */ - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(BLK_RW_ASYNC, HZ/10); - else - break; - } - } -} - -/* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. Returns 0 if a pdflush thread was dispatched. Returns - * -1 if all pdflush threads were busy. - */ -int wakeup_pdflush(long nr_pages) -{ - if (nr_pages == 0) - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - return pdflush_operation(background_writeout, nr_pages); -} - -static void wb_timer_fn(unsigned long unused); static void laptop_timer_fn(unsigned long unused); -static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* - * Periodic writeback of "old" data. - * - * Define "old": the first time one of an inode's pages is dirtied, we mark the - * dirtying-time in the inode's address_space. So this periodic writeback code - * just walks the superblock inode list, writing back any inodes which are - * older than a specific point in time. - * - * Try to run once per dirty_writeback_interval. But if a writeback event - * takes longer than a dirty_writeback_interval interval, then leave a - * one-second gap. - * - * older_than_this takes precedence over nr_to_write. So we'll only write back - * all dirty pages if they are all attached to "old" mappings. - */ -static void wb_kupdate(unsigned long arg) -{ - unsigned long oldest_jif; - unsigned long start_jif; - unsigned long next_jif; - long nr_to_write; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = &oldest_jif, - .nr_to_write = 0, - .nonblocking = 1, - .for_kupdate = 1, - .range_cyclic = 1, - }; - - sync_supers(); - - oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - start_jif = jiffies; - next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); - nr_to_write = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - while (nr_to_write > 0) { - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - writeback_inodes(&wbc); - if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(BLK_RW_ASYNC, HZ/10); - else - break; /* All the old data is written */ - } - nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - } - if (time_before(next_jif, jiffies + HZ)) - next_jif = jiffies + HZ; - if (dirty_writeback_interval) - mod_timer(&wb_timer, next_jif); -} - -/* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + - msecs_to_jiffies(dirty_writeback_interval * 10)); - else - del_timer(&wb_timer); return 0; } -static void wb_timer_fn(unsigned long unused) -{ - if (pdflush_operation(wb_kupdate, 0) < 0) - mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ -} - -static void laptop_flush(unsigned long unused) +static void do_laptop_sync(struct work_struct *work) { - sys_sync(); + wakeup_flusher_threads(0); + kfree(work); } static void laptop_timer_fn(unsigned long unused) { - pdflush_operation(laptop_flush, 0); + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_laptop_sync); + schedule_work(work); + } } /* @@ -910,8 +786,6 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, - jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); diff --git a/mm/pdflush.c b/mm/pdflush.c deleted file mode 100644 index 235ac440c44..00000000000 --- a/mm/pdflush.c +++ /dev/null @@ -1,269 +0,0 @@ -/* - * mm/pdflush.c - worker threads for writing back filesystem data - * - * Copyright (C) 2002, Linus Torvalds. - * - * 09Apr2002 Andrew Morton - * Initial version - * 29Feb2004 kaos@sgi.com - * Move worker thread creation to kthread to avoid chewing - * up stack space with nested calls to kernel_thread. - */ - -#include <linux/sched.h> -#include <linux/list.h> -#include <linux/signal.h> -#include <linux/spinlock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/fs.h> /* Needed by writeback.h */ -#include <linux/writeback.h> /* Prototypes pdflush_operation() */ -#include <linux/kthread.h> -#include <linux/cpuset.h> -#include <linux/freezer.h> - - -/* - * Minimum and maximum number of pdflush instances - */ -#define MIN_PDFLUSH_THREADS 2 -#define MAX_PDFLUSH_THREADS 8 - -static void start_one_pdflush_thread(void); - - -/* - * The pdflush threads are worker threads for writing back dirty data. - * Ideally, we'd like one thread per active disk spindle. But the disk - * topology is very hard to divine at this level. Instead, we take - * care in various places to prevent more than one pdflush thread from - * performing writeback against a single filesystem. pdflush threads - * have the PF_FLUSHER flag set in current->flags to aid in this. - */ - -/* - * All the pdflush threads. Protected by pdflush_lock - */ -static LIST_HEAD(pdflush_list); -static DEFINE_SPINLOCK(pdflush_lock); - -/* - * The count of currently-running pdflush threads. Protected - * by pdflush_lock. - * - * Readable by sysctl, but not writable. Published to userspace at - * /proc/sys/vm/nr_pdflush_threads. - */ -int nr_pdflush_threads = 0; - -/* - * The time at which the pdflush thread pool last went empty - */ -static unsigned long last_empty_jifs; - -/* - * The pdflush thread. - * - * Thread pool management algorithm: - * - * - The minimum and maximum number of pdflush instances are bound - * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. - * - * - If there have been no idle pdflush instances for 1 second, create - * a new one. - * - * - If the least-recently-went-to-sleep pdflush thread has been asleep - * for more than one second, terminate a thread. - */ - -/* - * A structure for passing work to a pdflush thread. Also for passing - * state information between pdflush threads. Protected by pdflush_lock. - */ -struct pdflush_work { - struct task_struct *who; /* The thread */ - void (*fn)(unsigned long); /* A callback function */ - unsigned long arg0; /* An argument to the callback */ - struct list_head list; /* On pdflush_list, when idle */ - unsigned long when_i_went_to_sleep; -}; - -static int __pdflush(struct pdflush_work *my_work) -{ - current->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - my_work->fn = NULL; - my_work->who = current; - INIT_LIST_HEAD(&my_work->list); - - spin_lock_irq(&pdflush_lock); - for ( ; ; ) { - struct pdflush_work *pdf; - - set_current_state(TASK_INTERRUPTIBLE); - list_move(&my_work->list, &pdflush_list); - my_work->when_i_went_to_sleep = jiffies; - spin_unlock_irq(&pdflush_lock); - schedule(); - try_to_freeze(); - spin_lock_irq(&pdflush_lock); - if (!list_empty(&my_work->list)) { - /* - * Someone woke us up, but without removing our control - * structure from the global list. swsusp will do this - * in try_to_freeze()->refrigerator(). Handle it. - */ - my_work->fn = NULL; - continue; - } - if (my_work->fn == NULL) { - printk("pdflush: bogus wakeup\n"); - continue; - } - spin_unlock_irq(&pdflush_lock); - - (*my_work->fn)(my_work->arg0); - - spin_lock_irq(&pdflush_lock); - - /* - * Thread creation: For how long have there been zero - * available threads? - * - * To throttle creation, we reset last_empty_jifs. - */ - if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list)) { - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); - } - } - } - - my_work->fn = NULL; - - /* - * Thread destruction: For how long has the sleepiest - * thread slept? - */ - if (list_empty(&pdflush_list)) - continue; - if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) - continue; - pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); - if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { - /* Limit exit rate */ - pdf->when_i_went_to_sleep = jiffies; - break; /* exeunt */ - } - } - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - return 0; -} - -/* - * Of course, my_work wants to be just a local in __pdflush(). It is - * separated out in this manner to hopefully prevent the compiler from - * performing unfortunate optimisations against the auto variables. Because - * these are visible to other tasks and CPUs. (No problem has actually - * been observed. This is just paranoia). - */ -static int pdflush(void *dummy) -{ - struct pdflush_work my_work; - cpumask_var_t cpus_allowed; - - /* - * Since the caller doesn't even check kthread_run() worked, let's not - * freak out too much if this fails. - */ - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - printk(KERN_WARNING "pdflush failed to allocate cpumask\n"); - return 0; - } - - /* - * pdflush can spend a lot of time doing encryption via dm-crypt. We - * don't want to do that at keventd's priority. - */ - set_user_nice(current, 0); - - /* - * Some configs put our parent kthread in a limited cpuset, - * which kthread() overrides, forcing cpus_allowed == cpu_all_mask. - * Our needs are more modest - cut back to our cpusets cpus_allowed. - * This is needed as pdflush's are dynamically created and destroyed. - * The boottime pdflush's are easily placed w/o these 2 lines. - */ - cpuset_cpus_allowed(current, cpus_allowed); - set_cpus_allowed_ptr(current, cpus_allowed); - free_cpumask_var(cpus_allowed); - - return __pdflush(&my_work); -} - -/* - * Attempt to wake up a pdflush thread, and get it to do some work for you. - * Returns zero if it indeed managed to find a worker thread, and passed your - * payload to it. - */ -int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) -{ - unsigned long flags; - int ret = 0; - - BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ - - spin_lock_irqsave(&pdflush_lock, flags); - if (list_empty(&pdflush_list)) { - ret = -1; - } else { - struct pdflush_work *pdf; - - pdf = list_entry(pdflush_list.next, struct pdflush_work, list); - list_del_init(&pdf->list); - if (list_empty(&pdflush_list)) - last_empty_jifs = jiffies; - pdf->fn = fn; - pdf->arg0 = arg0; - wake_up_process(pdf->who); - } - spin_unlock_irqrestore(&pdflush_lock, flags); - - return ret; -} - -static void start_one_pdflush_thread(void) -{ - struct task_struct *k; - - k = kthread_run(pdflush, NULL, "pdflush"); - if (unlikely(IS_ERR(k))) { - spin_lock_irq(&pdflush_lock); - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - } -} - -static int __init pdflush_init(void) -{ - int i; - - /* - * Pre-set nr_pdflush_threads... If we fail to create, - * the count will be decremented. - */ - nr_pdflush_threads = MIN_PDFLUSH_THREADS; - - for (i = 0; i < MIN_PDFLUSH_THREADS; i++) - start_one_pdflush_thread(); - return 0; -} - -module_init(pdflush_init); diff --git a/mm/shmem.c b/mm/shmem.c index d713239ce2c..5a0b3d4055f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2446,7 +2446,7 @@ static const struct inode_operations shmem_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; @@ -2469,7 +2469,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; @@ -2480,7 +2480,7 @@ static const struct inode_operations shmem_special_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index 606a8e757a4..df2c87fdae5 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -157,7 +157,7 @@ shmem_acl_init(struct inode *inode, struct inode *dir) /** * shmem_check_acl - check_acl() callback for generic_permission() */ -static int +int shmem_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); @@ -169,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask) } return -EAGAIN; } - -/** - * shmem_permission - permission() inode operation - */ -int -shmem_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, shmem_check_acl); -} diff --git a/mm/swap_state.c b/mm/swap_state.c index 42cd38eba79..5ae6b8b78c8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { + .name = "swap", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = swap_unplug_io_fn, }; diff --git a/mm/vmscan.c b/mm/vmscan.c index 94e86dd6954..ba8228e0a80 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ if (total_scanned > sc->swap_cluster_max + sc->swap_cluster_max / 2) { - wakeup_pdflush(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } diff --git a/net/core/dev.c b/net/core/dev.c index 6a94475aee8..278d489aad3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1031,7 +1031,7 @@ void dev_load(struct net *net, const char *name) dev = __dev_get_by_name(net, name); read_unlock(&dev_base_lock); - if (!dev && capable(CAP_SYS_MODULE)) + if (!dev && capable(CAP_NET_ADMIN)) request_module("%s", name); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index e92beb9e55e..6428b342b16 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -116,7 +116,7 @@ int tcp_set_default_congestion_control(const char *name) spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find(name); #ifdef CONFIG_MODULES - if (!ca && capable(CAP_SYS_MODULE)) { + if (!ca && capable(CAP_NET_ADMIN)) { spin_unlock(&tcp_cong_list_lock); request_module("tcp_%s", name); @@ -246,7 +246,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) #ifdef CONFIG_MODULES /* not found attempt to autoload module */ - if (!ca && capable(CAP_SYS_MODULE)) { + if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); diff --git a/security/Makefile b/security/Makefile index b56e7f9ecbc..95ecc06392d 100644 --- a/security/Makefile +++ b/security/Makefile @@ -16,9 +16,7 @@ obj-$(CONFIG_SECURITYFS) += inode.o # Must precede capability.o in order to stack properly. obj-$(CONFIG_SECURITY_SELINUX) += selinux/built-in.o obj-$(CONFIG_SECURITY_SMACK) += smack/built-in.o -ifeq ($(CONFIG_AUDIT),y) -obj-$(CONFIG_SECURITY_SMACK) += lsm_audit.o -endif +obj-$(CONFIG_AUDIT) += lsm_audit.o obj-$(CONFIG_SECURITY_TOMOYO) += tomoyo/built-in.o obj-$(CONFIG_SECURITY_ROOTPLUG) += root_plug.o obj-$(CONFIG_CGROUP_DEVICE) += device_cgroup.o diff --git a/security/capability.c b/security/capability.c index 88f752e8152..fce07a7bc82 100644 --- a/security/capability.c +++ b/security/capability.c @@ -373,6 +373,11 @@ static int cap_task_create(unsigned long clone_flags) return 0; } +static int cap_cred_alloc_blank(struct cred *cred, gfp_t gfp) +{ + return 0; +} + static void cap_cred_free(struct cred *cred) { } @@ -386,6 +391,10 @@ static void cap_cred_commit(struct cred *new, const struct cred *old) { } +static void cap_cred_transfer(struct cred *new, const struct cred *old) +{ +} + static int cap_kernel_act_as(struct cred *new, u32 secid) { return 0; @@ -396,6 +405,11 @@ static int cap_kernel_create_files_as(struct cred *new, struct inode *inode) return 0; } +static int cap_kernel_module_request(void) +{ + return 0; +} + static int cap_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { return 0; @@ -701,10 +715,26 @@ static void cap_inet_conn_established(struct sock *sk, struct sk_buff *skb) { } + + static void cap_req_classify_flow(const struct request_sock *req, struct flowi *fl) { } + +static int cap_tun_dev_create(void) +{ + return 0; +} + +static void cap_tun_dev_post_create(struct sock *sk) +{ +} + +static int cap_tun_dev_attach(struct sock *sk) +{ + return 0; +} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_NETWORK_XFRM @@ -792,6 +822,20 @@ static void cap_release_secctx(char *secdata, u32 seclen) { } +static int cap_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) +{ + return 0; +} + +static int cap_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) +{ + return 0; +} + +static int cap_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +{ + return 0; +} #ifdef CONFIG_KEYS static int cap_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) @@ -815,6 +859,13 @@ static int cap_key_getsecurity(struct key *key, char **_buffer) return 0; } +static int cap_key_session_to_parent(const struct cred *cred, + const struct cred *parent_cred, + struct key *key) +{ + return 0; +} + #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT @@ -854,7 +905,7 @@ struct security_operations default_security_ops = { void security_fixup_ops(struct security_operations *ops) { - set_to_cap_if_null(ops, ptrace_may_access); + set_to_cap_if_null(ops, ptrace_access_check); set_to_cap_if_null(ops, ptrace_traceme); set_to_cap_if_null(ops, capget); set_to_cap_if_null(ops, capset); @@ -940,11 +991,14 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, file_receive); set_to_cap_if_null(ops, dentry_open); set_to_cap_if_null(ops, task_create); + set_to_cap_if_null(ops, cred_alloc_blank); set_to_cap_if_null(ops, cred_free); set_to_cap_if_null(ops, cred_prepare); set_to_cap_if_null(ops, cred_commit); + set_to_cap_if_null(ops, cred_transfer); set_to_cap_if_null(ops, kernel_act_as); set_to_cap_if_null(ops, kernel_create_files_as); + set_to_cap_if_null(ops, kernel_module_request); set_to_cap_if_null(ops, task_setuid); set_to_cap_if_null(ops, task_fix_setuid); set_to_cap_if_null(ops, task_setgid); @@ -992,6 +1046,9 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, secid_to_secctx); set_to_cap_if_null(ops, secctx_to_secid); set_to_cap_if_null(ops, release_secctx); + set_to_cap_if_null(ops, inode_notifysecctx); + set_to_cap_if_null(ops, inode_setsecctx); + set_to_cap_if_null(ops, inode_getsecctx); #ifdef CONFIG_SECURITY_NETWORK set_to_cap_if_null(ops, unix_stream_connect); set_to_cap_if_null(ops, unix_may_send); @@ -1020,6 +1077,9 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, inet_csk_clone); set_to_cap_if_null(ops, inet_conn_established); set_to_cap_if_null(ops, req_classify_flow); + set_to_cap_if_null(ops, tun_dev_create); + set_to_cap_if_null(ops, tun_dev_post_create); + set_to_cap_if_null(ops, tun_dev_attach); #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_NETWORK_XFRM set_to_cap_if_null(ops, xfrm_policy_alloc_security); @@ -1038,6 +1098,7 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, key_free); set_to_cap_if_null(ops, key_permission); set_to_cap_if_null(ops, key_getsecurity); + set_to_cap_if_null(ops, key_session_to_parent); #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT set_to_cap_if_null(ops, audit_rule_init); diff --git a/security/commoncap.c b/security/commoncap.c index e3097c0a131..fe30751a6cd 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -101,7 +101,7 @@ int cap_settime(struct timespec *ts, struct timezone *tz) } /** - * cap_ptrace_may_access - Determine whether the current process may access + * cap_ptrace_access_check - Determine whether the current process may access * another * @child: The process to be accessed * @mode: The mode of attachment. @@ -109,7 +109,7 @@ int cap_settime(struct timespec *ts, struct timezone *tz) * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied. */ -int cap_ptrace_may_access(struct task_struct *child, unsigned int mode) +int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; diff --git a/security/keys/Makefile b/security/keys/Makefile index 747a464943a..74d5447d7df 100644 --- a/security/keys/Makefile +++ b/security/keys/Makefile @@ -3,6 +3,7 @@ # obj-y := \ + gc.o \ key.o \ keyring.o \ keyctl.o \ diff --git a/security/keys/compat.c b/security/keys/compat.c index c766c68a63b..792c0a611a6 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -82,6 +82,9 @@ asmlinkage long compat_sys_keyctl(u32 option, case KEYCTL_GET_SECURITY: return keyctl_get_security(arg2, compat_ptr(arg3), arg4); + case KEYCTL_SESSION_TO_PARENT: + return keyctl_session_to_parent(); + default: return -EOPNOTSUPP; } diff --git a/security/keys/gc.c b/security/keys/gc.c new file mode 100644 index 00000000000..1e616aef55f --- /dev/null +++ b/security/keys/gc.c @@ -0,0 +1,194 @@ +/* Key garbage collector + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <keys/keyring-type.h> +#include "internal.h" + +/* + * Delay between key revocation/expiry in seconds + */ +unsigned key_gc_delay = 5 * 60; + +/* + * Reaper + */ +static void key_gc_timer_func(unsigned long); +static void key_garbage_collector(struct work_struct *); +static DEFINE_TIMER(key_gc_timer, key_gc_timer_func, 0, 0); +static DECLARE_WORK(key_gc_work, key_garbage_collector); +static key_serial_t key_gc_cursor; /* the last key the gc considered */ +static unsigned long key_gc_executing; +static time_t key_gc_next_run = LONG_MAX; + +/* + * Schedule a garbage collection run + * - precision isn't particularly important + */ +void key_schedule_gc(time_t gc_at) +{ + unsigned long expires; + time_t now = current_kernel_time().tv_sec; + + kenter("%ld", gc_at - now); + + gc_at += key_gc_delay; + + if (now >= gc_at) { + schedule_work(&key_gc_work); + } else if (gc_at < key_gc_next_run) { + expires = jiffies + (gc_at - now) * HZ; + mod_timer(&key_gc_timer, expires); + } +} + +/* + * The garbage collector timer kicked off + */ +static void key_gc_timer_func(unsigned long data) +{ + kenter(""); + key_gc_next_run = LONG_MAX; + schedule_work(&key_gc_work); +} + +/* + * Garbage collect pointers from a keyring + * - return true if we altered the keyring + */ +static bool key_gc_keyring(struct key *keyring, time_t limit) + __releases(key_serial_lock) +{ + struct keyring_list *klist; + struct key *key; + int loop; + + kenter("%x", key_serial(keyring)); + + if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) + goto dont_gc; + + /* scan the keyring looking for dead keys */ + klist = rcu_dereference(keyring->payload.subscriptions); + if (!klist) + goto dont_gc; + + for (loop = klist->nkeys - 1; loop >= 0; loop--) { + key = klist->keys[loop]; + if (test_bit(KEY_FLAG_DEAD, &key->flags) || + (key->expiry > 0 && key->expiry <= limit)) + goto do_gc; + } + +dont_gc: + kleave(" = false"); + return false; + +do_gc: + key_gc_cursor = keyring->serial; + key_get(keyring); + spin_unlock(&key_serial_lock); + keyring_gc(keyring, limit); + key_put(keyring); + kleave(" = true"); + return true; +} + +/* + * Garbage collector for keys + * - this involves scanning the keyrings for dead, expired and revoked keys + * that have overstayed their welcome + */ +static void key_garbage_collector(struct work_struct *work) +{ + struct rb_node *rb; + key_serial_t cursor; + struct key *key, *xkey; + time_t new_timer = LONG_MAX, limit; + + kenter(""); + + if (test_and_set_bit(0, &key_gc_executing)) { + key_schedule_gc(current_kernel_time().tv_sec); + return; + } + + limit = current_kernel_time().tv_sec; + if (limit > key_gc_delay) + limit -= key_gc_delay; + else + limit = key_gc_delay; + + spin_lock(&key_serial_lock); + + if (RB_EMPTY_ROOT(&key_serial_tree)) + goto reached_the_end; + + cursor = key_gc_cursor; + if (cursor < 0) + cursor = 0; + + /* find the first key above the cursor */ + key = NULL; + rb = key_serial_tree.rb_node; + while (rb) { + xkey = rb_entry(rb, struct key, serial_node); + if (cursor < xkey->serial) { + key = xkey; + rb = rb->rb_left; + } else if (cursor > xkey->serial) { + rb = rb->rb_right; + } else { + rb = rb_next(rb); + if (!rb) + goto reached_the_end; + key = rb_entry(rb, struct key, serial_node); + break; + } + } + + if (!key) + goto reached_the_end; + + /* trawl through the keys looking for keyrings */ + for (;;) { + if (key->expiry > 0 && key->expiry < new_timer) + new_timer = key->expiry; + + if (key->type == &key_type_keyring && + key_gc_keyring(key, limit)) { + /* the gc ate our lock */ + schedule_work(&key_gc_work); + goto no_unlock; + } + + rb = rb_next(&key->serial_node); + if (!rb) { + key_gc_cursor = 0; + break; + } + key = rb_entry(rb, struct key, serial_node); + } + +out: + spin_unlock(&key_serial_lock); +no_unlock: + clear_bit(0, &key_gc_executing); + if (new_timer < LONG_MAX) + key_schedule_gc(new_timer); + + kleave(""); + return; + +reached_the_end: + key_gc_cursor = 0; + goto out; +} diff --git a/security/keys/internal.h b/security/keys/internal.h index 9fb679c66b8..24ba0307b7a 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -124,11 +124,18 @@ extern struct key *request_key_and_link(struct key_type *type, struct key *dest_keyring, unsigned long flags); -extern key_ref_t lookup_user_key(key_serial_t id, int create, int partial, +extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, key_perm_t perm); +#define KEY_LOOKUP_CREATE 0x01 +#define KEY_LOOKUP_PARTIAL 0x02 +#define KEY_LOOKUP_FOR_UNLINK 0x04 extern long join_session_keyring(const char *name); +extern unsigned key_gc_delay; +extern void keyring_gc(struct key *keyring, time_t limit); +extern void key_schedule_gc(time_t expiry_at); + /* * check to see whether permission is granted to use a key in the desired way */ @@ -194,6 +201,7 @@ extern long keyctl_set_timeout(key_serial_t, unsigned); extern long keyctl_assume_authority(key_serial_t); extern long keyctl_get_security(key_serial_t keyid, char __user *buffer, size_t buflen); +extern long keyctl_session_to_parent(void); /* * debugging key validation diff --git a/security/keys/key.c b/security/keys/key.c index 4a1297d1ada..08531ad0f25 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -500,6 +500,7 @@ int key_negate_and_link(struct key *key, set_bit(KEY_FLAG_INSTANTIATED, &key->flags); now = current_kernel_time(); key->expiry = now.tv_sec + timeout; + key_schedule_gc(key->expiry); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; @@ -642,10 +643,8 @@ struct key *key_lookup(key_serial_t id) goto error; found: - /* pretend it doesn't exist if it's dead */ - if (atomic_read(&key->usage) == 0 || - test_bit(KEY_FLAG_DEAD, &key->flags) || - key->type == &key_type_dead) + /* pretend it doesn't exist if it is awaiting deletion */ + if (atomic_read(&key->usage) == 0) goto not_found; /* this races with key_put(), but that doesn't matter since key_put() @@ -890,6 +889,9 @@ EXPORT_SYMBOL(key_update); */ void key_revoke(struct key *key) { + struct timespec now; + time_t time; + key_check(key); /* make sure no one's trying to change or use the key when we mark it @@ -902,6 +904,14 @@ void key_revoke(struct key *key) key->type->revoke) key->type->revoke(key); + /* set the death time to no more than the expiry time */ + now = current_kernel_time(); + time = now.tv_sec; + if (key->revoked_at == 0 || key->revoked_at > time) { + key->revoked_at = time; + key_schedule_gc(key->revoked_at); + } + up_write(&key->sem); } /* end key_revoke() */ @@ -958,8 +968,10 @@ void unregister_key_type(struct key_type *ktype) for (_n = rb_first(&key_serial_tree); _n; _n = rb_next(_n)) { key = rb_entry(_n, struct key, serial_node); - if (key->type == ktype) + if (key->type == ktype) { key->type = &key_type_dead; + set_bit(KEY_FLAG_DEAD, &key->flags); + } } spin_unlock(&key_serial_lock); @@ -984,6 +996,8 @@ void unregister_key_type(struct key_type *ktype) spin_unlock(&key_serial_lock); up_write(&key_types_sem); + key_schedule_gc(0); + } /* end unregister_key_type() */ EXPORT_SYMBOL(unregister_key_type); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 7f09fb897d2..74c96852459 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -103,7 +103,7 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type, } /* find the target keyring (which must be writable) */ - keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error3; @@ -185,7 +185,8 @@ SYSCALL_DEFINE4(request_key, const char __user *, _type, /* get the destination keyring if specified */ dest_ref = NULL; if (destringid) { - dest_ref = lookup_user_key(destringid, 1, 0, KEY_WRITE); + dest_ref = lookup_user_key(destringid, KEY_LOOKUP_CREATE, + KEY_WRITE); if (IS_ERR(dest_ref)) { ret = PTR_ERR(dest_ref); goto error3; @@ -233,9 +234,11 @@ SYSCALL_DEFINE4(request_key, const char __user *, _type, long keyctl_get_keyring_ID(key_serial_t id, int create) { key_ref_t key_ref; + unsigned long lflags; long ret; - key_ref = lookup_user_key(id, create, 0, KEY_SEARCH); + lflags = create ? KEY_LOOKUP_CREATE : 0; + key_ref = lookup_user_key(id, lflags, KEY_SEARCH); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -309,7 +312,7 @@ long keyctl_update_key(key_serial_t id, } /* find the target key (which must be writable) */ - key_ref = lookup_user_key(id, 0, 0, KEY_WRITE); + key_ref = lookup_user_key(id, 0, KEY_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -337,10 +340,16 @@ long keyctl_revoke_key(key_serial_t id) key_ref_t key_ref; long ret; - key_ref = lookup_user_key(id, 0, 0, KEY_WRITE); + key_ref = lookup_user_key(id, 0, KEY_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); - goto error; + if (ret != -EACCES) + goto error; + key_ref = lookup_user_key(id, 0, KEY_SETATTR); + if (IS_ERR(key_ref)) { + ret = PTR_ERR(key_ref); + goto error; + } } key_revoke(key_ref_to_ptr(key_ref)); @@ -363,7 +372,7 @@ long keyctl_keyring_clear(key_serial_t ringid) key_ref_t keyring_ref; long ret; - keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; @@ -389,13 +398,13 @@ long keyctl_keyring_link(key_serial_t id, key_serial_t ringid) key_ref_t keyring_ref, key_ref; long ret; - keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; } - key_ref = lookup_user_key(id, 1, 0, KEY_LINK); + key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_LINK); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -423,13 +432,13 @@ long keyctl_keyring_unlink(key_serial_t id, key_serial_t ringid) key_ref_t keyring_ref, key_ref; long ret; - keyring_ref = lookup_user_key(ringid, 0, 0, KEY_WRITE); + keyring_ref = lookup_user_key(ringid, 0, KEY_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; } - key_ref = lookup_user_key(id, 0, 0, 0); + key_ref = lookup_user_key(id, KEY_LOOKUP_FOR_UNLINK, 0); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; @@ -465,7 +474,7 @@ long keyctl_describe_key(key_serial_t keyid, char *tmpbuf; long ret; - key_ref = lookup_user_key(keyid, 0, 1, KEY_VIEW); + key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_VIEW); if (IS_ERR(key_ref)) { /* viewing a key under construction is permitted if we have the * authorisation token handy */ @@ -474,7 +483,8 @@ long keyctl_describe_key(key_serial_t keyid, if (!IS_ERR(instkey)) { key_put(instkey); key_ref = lookup_user_key(keyid, - 0, 1, 0); + KEY_LOOKUP_PARTIAL, + 0); if (!IS_ERR(key_ref)) goto okay; } @@ -558,7 +568,7 @@ long keyctl_keyring_search(key_serial_t ringid, } /* get the keyring at which to begin the search */ - keyring_ref = lookup_user_key(ringid, 0, 0, KEY_SEARCH); + keyring_ref = lookup_user_key(ringid, 0, KEY_SEARCH); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error2; @@ -567,7 +577,8 @@ long keyctl_keyring_search(key_serial_t ringid, /* get the destination keyring if specified */ dest_ref = NULL; if (destringid) { - dest_ref = lookup_user_key(destringid, 1, 0, KEY_WRITE); + dest_ref = lookup_user_key(destringid, KEY_LOOKUP_CREATE, + KEY_WRITE); if (IS_ERR(dest_ref)) { ret = PTR_ERR(dest_ref); goto error3; @@ -637,7 +648,7 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) long ret; /* find the key first */ - key_ref = lookup_user_key(keyid, 0, 0, 0); + key_ref = lookup_user_key(keyid, 0, 0); if (IS_ERR(key_ref)) { ret = -ENOKEY; goto error; @@ -700,7 +711,8 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid) if (uid == (uid_t) -1 && gid == (gid_t) -1) goto error; - key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR); + key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, + KEY_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -805,7 +817,8 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm) if (perm & ~(KEY_POS_ALL | KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL)) goto error; - key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR); + key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, + KEY_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -847,7 +860,7 @@ static long get_instantiation_keyring(key_serial_t ringid, /* if a specific keyring is nominated by ID, then use that */ if (ringid > 0) { - dkref = lookup_user_key(ringid, 1, 0, KEY_WRITE); + dkref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_WRITE); if (IS_ERR(dkref)) return PTR_ERR(dkref); *_dest_keyring = key_ref_to_ptr(dkref); @@ -1083,7 +1096,8 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) time_t expiry; long ret; - key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR); + key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, + KEY_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; @@ -1101,6 +1115,7 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout) } key->expiry = expiry; + key_schedule_gc(key->expiry); up_write(&key->sem); key_put(key); @@ -1170,7 +1185,7 @@ long keyctl_get_security(key_serial_t keyid, char *context; long ret; - key_ref = lookup_user_key(keyid, 0, 1, KEY_VIEW); + key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_VIEW); if (IS_ERR(key_ref)) { if (PTR_ERR(key_ref) != -EACCES) return PTR_ERR(key_ref); @@ -1182,7 +1197,7 @@ long keyctl_get_security(key_serial_t keyid, return PTR_ERR(key_ref); key_put(instkey); - key_ref = lookup_user_key(keyid, 0, 1, 0); + key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, 0); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); } @@ -1213,6 +1228,105 @@ long keyctl_get_security(key_serial_t keyid, return ret; } +/* + * attempt to install the calling process's session keyring on the process's + * parent process + * - the keyring must exist and must grant us LINK permission + * - implements keyctl(KEYCTL_SESSION_TO_PARENT) + */ +long keyctl_session_to_parent(void) +{ + struct task_struct *me, *parent; + const struct cred *mycred, *pcred; + struct cred *cred, *oldcred; + key_ref_t keyring_r; + int ret; + + keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_LINK); + if (IS_ERR(keyring_r)) + return PTR_ERR(keyring_r); + + /* our parent is going to need a new cred struct, a new tgcred struct + * and new security data, so we allocate them here to prevent ENOMEM in + * our parent */ + ret = -ENOMEM; + cred = cred_alloc_blank(); + if (!cred) + goto error_keyring; + + cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r); + keyring_r = NULL; + + me = current; + write_lock_irq(&tasklist_lock); + + parent = me->real_parent; + ret = -EPERM; + + /* the parent mustn't be init and mustn't be a kernel thread */ + if (parent->pid <= 1 || !parent->mm) + goto not_permitted; + + /* the parent must be single threaded */ + if (atomic_read(&parent->signal->count) != 1) + goto not_permitted; + + /* the parent and the child must have different session keyrings or + * there's no point */ + mycred = current_cred(); + pcred = __task_cred(parent); + if (mycred == pcred || + mycred->tgcred->session_keyring == pcred->tgcred->session_keyring) + goto already_same; + + /* the parent must have the same effective ownership and mustn't be + * SUID/SGID */ + if (pcred-> uid != mycred->euid || + pcred->euid != mycred->euid || + pcred->suid != mycred->euid || + pcred-> gid != mycred->egid || + pcred->egid != mycred->egid || + pcred->sgid != mycred->egid) + goto not_permitted; + + /* the keyrings must have the same UID */ + if (pcred ->tgcred->session_keyring->uid != mycred->euid || + mycred->tgcred->session_keyring->uid != mycred->euid) + goto not_permitted; + + /* the LSM must permit the replacement of the parent's keyring with the + * keyring from this process */ + ret = security_key_session_to_parent(mycred, pcred, + key_ref_to_ptr(keyring_r)); + if (ret < 0) + goto not_permitted; + + /* if there's an already pending keyring replacement, then we replace + * that */ + oldcred = parent->replacement_session_keyring; + + /* the replacement session keyring is applied just prior to userspace + * restarting */ + parent->replacement_session_keyring = cred; + cred = NULL; + set_ti_thread_flag(task_thread_info(parent), TIF_NOTIFY_RESUME); + + write_unlock_irq(&tasklist_lock); + if (oldcred) + put_cred(oldcred); + return 0; + +already_same: + ret = 0; +not_permitted: + put_cred(cred); + return ret; + +error_keyring: + key_ref_put(keyring_r); + return ret; +} + /*****************************************************************************/ /* * the key control system call @@ -1298,6 +1412,9 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, (char __user *) arg3, (size_t) arg4); + case KEYCTL_SESSION_TO_PARENT: + return keyctl_session_to_parent(); + default: return -EOPNOTSUPP; } diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 3dba81c2eba..ac977f661a7 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1000,3 +1000,88 @@ static void keyring_revoke(struct key *keyring) } } /* end keyring_revoke() */ + +/* + * Determine whether a key is dead + */ +static bool key_is_dead(struct key *key, time_t limit) +{ + return test_bit(KEY_FLAG_DEAD, &key->flags) || + (key->expiry > 0 && key->expiry <= limit); +} + +/* + * Collect garbage from the contents of a keyring + */ +void keyring_gc(struct key *keyring, time_t limit) +{ + struct keyring_list *klist, *new; + struct key *key; + int loop, keep, max; + + kenter("%x", key_serial(keyring)); + + down_write(&keyring->sem); + + klist = keyring->payload.subscriptions; + if (!klist) + goto just_return; + + /* work out how many subscriptions we're keeping */ + keep = 0; + for (loop = klist->nkeys - 1; loop >= 0; loop--) + if (!key_is_dead(klist->keys[loop], limit)); + keep++; + + if (keep == klist->nkeys) + goto just_return; + + /* allocate a new keyring payload */ + max = roundup(keep, 4); + new = kmalloc(sizeof(struct keyring_list) + max * sizeof(struct key *), + GFP_KERNEL); + if (!new) + goto just_return; + new->maxkeys = max; + new->nkeys = 0; + new->delkey = 0; + + /* install the live keys + * - must take care as expired keys may be updated back to life + */ + keep = 0; + for (loop = klist->nkeys - 1; loop >= 0; loop--) { + key = klist->keys[loop]; + if (!key_is_dead(key, limit)) { + if (keep >= max) + goto discard_new; + new->keys[keep++] = key_get(key); + } + } + new->nkeys = keep; + + /* adjust the quota */ + key_payload_reserve(keyring, + sizeof(struct keyring_list) + + KEYQUOTA_LINK_BYTES * keep); + + if (keep == 0) { + rcu_assign_pointer(keyring->payload.subscriptions, NULL); + kfree(new); + } else { + rcu_assign_pointer(keyring->payload.subscriptions, new); + } + + up_write(&keyring->sem); + + call_rcu(&klist->rcu, keyring_clear_rcu_disposal); + kleave(" [yes]"); + return; + +discard_new: + new->nkeys = keep; + keyring_clear_rcu_disposal(&new->rcu); +just_return: + up_write(&keyring->sem); + kleave(" [no]"); +} diff --git a/security/keys/proc.c b/security/keys/proc.c index 769f9bdfd2b..9d01021ca0c 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -91,59 +91,94 @@ __initcall(key_proc_init); */ #ifdef CONFIG_KEYS_DEBUG_PROC_KEYS -static struct rb_node *__key_serial_next(struct rb_node *n) +static struct rb_node *key_serial_next(struct rb_node *n) { + struct user_namespace *user_ns = current_user_ns(); + + n = rb_next(n); while (n) { struct key *key = rb_entry(n, struct key, serial_node); - if (key->user->user_ns == current_user_ns()) + if (key->user->user_ns == user_ns) break; n = rb_next(n); } return n; } -static struct rb_node *key_serial_next(struct rb_node *n) +static int proc_keys_open(struct inode *inode, struct file *file) { - return __key_serial_next(rb_next(n)); + return seq_open(file, &proc_keys_ops); } -static struct rb_node *key_serial_first(struct rb_root *r) +static struct key *find_ge_key(key_serial_t id) { - struct rb_node *n = rb_first(r); - return __key_serial_next(n); -} + struct user_namespace *user_ns = current_user_ns(); + struct rb_node *n = key_serial_tree.rb_node; + struct key *minkey = NULL; -static int proc_keys_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &proc_keys_ops); + while (n) { + struct key *key = rb_entry(n, struct key, serial_node); + if (id < key->serial) { + if (!minkey || minkey->serial > key->serial) + minkey = key; + n = n->rb_left; + } else if (id > key->serial) { + n = n->rb_right; + } else { + minkey = key; + break; + } + key = NULL; + } + if (!minkey) + return NULL; + + for (;;) { + if (minkey->user->user_ns == user_ns) + return minkey; + n = rb_next(&minkey->serial_node); + if (!n) + return NULL; + minkey = rb_entry(n, struct key, serial_node); + } } static void *proc_keys_start(struct seq_file *p, loff_t *_pos) + __acquires(key_serial_lock) { - struct rb_node *_p; - loff_t pos = *_pos; + key_serial_t pos = *_pos; + struct key *key; spin_lock(&key_serial_lock); - _p = key_serial_first(&key_serial_tree); - while (pos > 0 && _p) { - pos--; - _p = key_serial_next(_p); - } - - return _p; + if (*_pos > INT_MAX) + return NULL; + key = find_ge_key(pos); + if (!key) + return NULL; + *_pos = key->serial; + return &key->serial_node; +} +static inline key_serial_t key_node_serial(struct rb_node *n) +{ + struct key *key = rb_entry(n, struct key, serial_node); + return key->serial; } static void *proc_keys_next(struct seq_file *p, void *v, loff_t *_pos) { - (*_pos)++; - return key_serial_next((struct rb_node *) v); + struct rb_node *n; + n = key_serial_next(v); + if (n) + *_pos = key_node_serial(n); + return n; } static void proc_keys_stop(struct seq_file *p, void *v) + __releases(key_serial_lock) { spin_unlock(&key_serial_lock); } @@ -174,11 +209,9 @@ static int proc_keys_show(struct seq_file *m, void *v) /* come up with a suitable timeout value */ if (key->expiry == 0) { memcpy(xbuf, "perm", 5); - } - else if (now.tv_sec >= key->expiry) { + } else if (now.tv_sec >= key->expiry) { memcpy(xbuf, "expd", 5); - } - else { + } else { timo = key->expiry - now.tv_sec; if (timo < 60) @@ -218,9 +251,7 @@ static int proc_keys_show(struct seq_file *m, void *v) seq_putc(m, '\n'); rcu_read_unlock(); - return 0; - } #endif /* CONFIG_KEYS_DEBUG_PROC_KEYS */ @@ -246,6 +277,7 @@ static struct rb_node *key_user_first(struct rb_root *r) struct rb_node *n = rb_first(r); return __key_user_next(n); } + /*****************************************************************************/ /* * implement "/proc/key-users" to provides a list of the key users @@ -253,10 +285,10 @@ static struct rb_node *key_user_first(struct rb_root *r) static int proc_key_users_open(struct inode *inode, struct file *file) { return seq_open(file, &proc_key_users_ops); - } static void *proc_key_users_start(struct seq_file *p, loff_t *_pos) + __acquires(key_user_lock) { struct rb_node *_p; loff_t pos = *_pos; @@ -270,17 +302,16 @@ static void *proc_key_users_start(struct seq_file *p, loff_t *_pos) } return _p; - } static void *proc_key_users_next(struct seq_file *p, void *v, loff_t *_pos) { (*_pos)++; return key_user_next((struct rb_node *) v); - } static void proc_key_users_stop(struct seq_file *p, void *v) + __releases(key_user_lock) { spin_unlock(&key_user_lock); } diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 276d27882ce..5c23afb31ec 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/err.h> #include <linux/mutex.h> +#include <linux/security.h> #include <linux/user_namespace.h> #include <asm/uaccess.h> #include "internal.h" @@ -487,7 +488,7 @@ static int lookup_user_key_possessed(const struct key *key, const void *target) * - don't create special keyrings unless so requested * - partially constructed keys aren't found unless requested */ -key_ref_t lookup_user_key(key_serial_t id, int create, int partial, +key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, key_perm_t perm) { struct request_key_auth *rka; @@ -503,7 +504,7 @@ try_again: switch (id) { case KEY_SPEC_THREAD_KEYRING: if (!cred->thread_keyring) { - if (!create) + if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_thread_keyring(); @@ -521,7 +522,7 @@ try_again: case KEY_SPEC_PROCESS_KEYRING: if (!cred->tgcred->process_keyring) { - if (!create) + if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_process_keyring(); @@ -642,7 +643,14 @@ try_again: break; } - if (!partial) { + /* unlink does not use the nominated key in any way, so can skip all + * the permission checks as it is only concerned with the keyring */ + if (lflags & KEY_LOOKUP_FOR_UNLINK) { + ret = 0; + goto error; + } + + if (!(lflags & KEY_LOOKUP_PARTIAL)) { ret = wait_for_key_construction(key, true); switch (ret) { case -ERESTARTSYS: @@ -660,7 +668,8 @@ try_again: } ret = -EIO; - if (!partial && !test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) + if (!(lflags & KEY_LOOKUP_PARTIAL) && + !test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) goto invalid_key; /* check the permissions */ @@ -702,7 +711,7 @@ long join_session_keyring(const char *name) /* only permit this if there's a single thread in the thread group - * this avoids us having to adjust the creds on all threads and risking * ENOMEM */ - if (!is_single_threaded(current)) + if (!current_is_single_threaded()) return -EMLINK; new = prepare_creds(); @@ -760,3 +769,51 @@ error: abort_creds(new); return ret; } + +/* + * Replace a process's session keyring when that process resumes userspace on + * behalf of one of its children + */ +void key_replace_session_keyring(void) +{ + const struct cred *old; + struct cred *new; + + if (!current->replacement_session_keyring) + return; + + write_lock_irq(&tasklist_lock); + new = current->replacement_session_keyring; + current->replacement_session_keyring = NULL; + write_unlock_irq(&tasklist_lock); + + if (!new) + return; + + old = current_cred(); + new-> uid = old-> uid; + new-> euid = old-> euid; + new-> suid = old-> suid; + new->fsuid = old->fsuid; + new-> gid = old-> gid; + new-> egid = old-> egid; + new-> sgid = old-> sgid; + new->fsgid = old->fsgid; + new->user = get_uid(old->user); + new->group_info = get_group_info(old->group_info); + + new->securebits = old->securebits; + new->cap_inheritable = old->cap_inheritable; + new->cap_permitted = old->cap_permitted; + new->cap_effective = old->cap_effective; + new->cap_bset = old->cap_bset; + + new->jit_keyring = old->jit_keyring; + new->thread_keyring = key_get(old->thread_keyring); + new->tgcred->tgid = old->tgcred->tgid; + new->tgcred->process_keyring = key_get(old->tgcred->process_keyring); + + security_transfer_creds(new, old); + + commit_creds(new); +} diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c index b611d493c2d..5e05dc09e2d 100644 --- a/security/keys/sysctl.c +++ b/security/keys/sysctl.c @@ -13,6 +13,8 @@ #include <linux/sysctl.h> #include "internal.h" +static const int zero, one = 1, max = INT_MAX; + ctl_table key_sysctls[] = { { .ctl_name = CTL_UNNUMBERED, @@ -20,7 +22,9 @@ ctl_table key_sysctls[] = { .data = &key_quota_maxkeys, .maxlen = sizeof(unsigned), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *) &one, + .extra2 = (void *) &max, }, { .ctl_name = CTL_UNNUMBERED, @@ -28,7 +32,9 @@ ctl_table key_sysctls[] = { .data = &key_quota_maxbytes, .maxlen = sizeof(unsigned), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *) &one, + .extra2 = (void *) &max, }, { .ctl_name = CTL_UNNUMBERED, @@ -36,7 +42,9 @@ ctl_table key_sysctls[] = { .data = &key_quota_root_maxkeys, .maxlen = sizeof(unsigned), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *) &one, + .extra2 = (void *) &max, }, { .ctl_name = CTL_UNNUMBERED, @@ -44,7 +52,19 @@ ctl_table key_sysctls[] = { .data = &key_quota_root_maxbytes, .maxlen = sizeof(unsigned), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *) &one, + .extra2 = (void *) &max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "gc_delay", + .data = &key_gc_delay, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *) &zero, + .extra2 = (void *) &max, }, { .ctl_name = 0 } }; diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 94b868494b3..500aad0ebd6 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -220,6 +220,8 @@ static void dump_common_audit_data(struct audit_buffer *ab, } switch (a->type) { + case LSM_AUDIT_NO_AUDIT: + return; case LSM_AUDIT_DATA_IPC: audit_log_format(ab, " key=%d ", a->u.ipc_id); break; diff --git a/security/security.c b/security/security.c index dc7674fbfc7..c4c673240c1 100644 --- a/security/security.c +++ b/security/security.c @@ -124,9 +124,9 @@ int register_security(struct security_operations *ops) /* Security operations */ -int security_ptrace_may_access(struct task_struct *child, unsigned int mode) +int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { - return security_ops->ptrace_may_access(child, mode); + return security_ops->ptrace_access_check(child, mode); } int security_ptrace_traceme(struct task_struct *parent) @@ -684,6 +684,11 @@ int security_task_create(unsigned long clone_flags) return security_ops->task_create(clone_flags); } +int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) +{ + return security_ops->cred_alloc_blank(cred, gfp); +} + void security_cred_free(struct cred *cred) { security_ops->cred_free(cred); @@ -699,6 +704,11 @@ void security_commit_creds(struct cred *new, const struct cred *old) security_ops->cred_commit(new, old); } +void security_transfer_creds(struct cred *new, const struct cred *old) +{ + security_ops->cred_transfer(new, old); +} + int security_kernel_act_as(struct cred *new, u32 secid) { return security_ops->kernel_act_as(new, secid); @@ -709,6 +719,11 @@ int security_kernel_create_files_as(struct cred *new, struct inode *inode) return security_ops->kernel_create_files_as(new, inode); } +int security_kernel_module_request(void) +{ + return security_ops->kernel_module_request(); +} + int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) { return security_ops->task_setuid(id0, id1, id2, flags); @@ -959,6 +974,24 @@ void security_release_secctx(char *secdata, u32 seclen) } EXPORT_SYMBOL(security_release_secctx); +int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) +{ + return security_ops->inode_notifysecctx(inode, ctx, ctxlen); +} +EXPORT_SYMBOL(security_inode_notifysecctx); + +int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) +{ + return security_ops->inode_setsecctx(dentry, ctx, ctxlen); +} +EXPORT_SYMBOL(security_inode_setsecctx); + +int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +{ + return security_ops->inode_getsecctx(inode, ctx, ctxlen); +} +EXPORT_SYMBOL(security_inode_getsecctx); + #ifdef CONFIG_SECURITY_NETWORK int security_unix_stream_connect(struct socket *sock, struct socket *other, @@ -1112,6 +1145,24 @@ void security_inet_conn_established(struct sock *sk, security_ops->inet_conn_established(sk, skb); } +int security_tun_dev_create(void) +{ + return security_ops->tun_dev_create(); +} +EXPORT_SYMBOL(security_tun_dev_create); + +void security_tun_dev_post_create(struct sock *sk) +{ + return security_ops->tun_dev_post_create(sk); +} +EXPORT_SYMBOL(security_tun_dev_post_create); + +int security_tun_dev_attach(struct sock *sk) +{ + return security_ops->tun_dev_attach(sk); +} +EXPORT_SYMBOL(security_tun_dev_attach); + #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_NETWORK_XFRM @@ -1218,6 +1269,13 @@ int security_key_getsecurity(struct key *key, char **_buffer) return security_ops->key_getsecurity(key, _buffer); } +int security_key_session_to_parent(const struct cred *cred, + const struct cred *parent_cred, + struct key *key) +{ + return security_ops->key_session_to_parent(cred, parent_cred, key); +} + #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT diff --git a/security/selinux/avc.c b/security/selinux/avc.c index b2ab6085983..e3d19014259 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -137,7 +137,7 @@ static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass) * @tclass: target security class * @av: access vector */ -void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av) +static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av) { const char **common_pts = NULL; u32 common_base = 0; @@ -492,23 +492,35 @@ out: return node; } -static inline void avc_print_ipv6_addr(struct audit_buffer *ab, - struct in6_addr *addr, __be16 port, - char *name1, char *name2) +/** + * avc_audit_pre_callback - SELinux specific information + * will be called by generic audit code + * @ab: the audit buffer + * @a: audit_data + */ +static void avc_audit_pre_callback(struct audit_buffer *ab, void *a) { - if (!ipv6_addr_any(addr)) - audit_log_format(ab, " %s=%pI6", name1, addr); - if (port) - audit_log_format(ab, " %s=%d", name2, ntohs(port)); + struct common_audit_data *ad = a; + audit_log_format(ab, "avc: %s ", + ad->selinux_audit_data.denied ? "denied" : "granted"); + avc_dump_av(ab, ad->selinux_audit_data.tclass, + ad->selinux_audit_data.audited); + audit_log_format(ab, " for "); } -static inline void avc_print_ipv4_addr(struct audit_buffer *ab, __be32 addr, - __be16 port, char *name1, char *name2) +/** + * avc_audit_post_callback - SELinux specific information + * will be called by generic audit code + * @ab: the audit buffer + * @a: audit_data + */ +static void avc_audit_post_callback(struct audit_buffer *ab, void *a) { - if (addr) - audit_log_format(ab, " %s=%pI4", name1, &addr); - if (port) - audit_log_format(ab, " %s=%d", name2, ntohs(port)); + struct common_audit_data *ad = a; + audit_log_format(ab, " "); + avc_dump_query(ab, ad->selinux_audit_data.ssid, + ad->selinux_audit_data.tsid, + ad->selinux_audit_data.tclass); } /** @@ -532,13 +544,10 @@ static inline void avc_print_ipv4_addr(struct audit_buffer *ab, __be32 addr, */ void avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, - struct av_decision *avd, int result, struct avc_audit_data *a) + struct av_decision *avd, int result, struct common_audit_data *a) { - struct task_struct *tsk = current; - struct inode *inode = NULL; + struct common_audit_data stack_data; u32 denied, audited; - struct audit_buffer *ab; - denied = requested & ~avd->allowed; if (denied) { audited = denied; @@ -551,144 +560,20 @@ void avc_audit(u32 ssid, u32 tsid, if (!(audited & avd->auditallow)) return; } - - ab = audit_log_start(current->audit_context, GFP_ATOMIC, AUDIT_AVC); - if (!ab) - return; /* audit_panic has been called */ - audit_log_format(ab, "avc: %s ", denied ? "denied" : "granted"); - avc_dump_av(ab, tclass, audited); - audit_log_format(ab, " for "); - if (a && a->tsk) - tsk = a->tsk; - if (tsk && tsk->pid) { - audit_log_format(ab, " pid=%d comm=", tsk->pid); - audit_log_untrustedstring(ab, tsk->comm); + if (!a) { + a = &stack_data; + memset(a, 0, sizeof(*a)); + a->type = LSM_AUDIT_NO_AUDIT; } - if (a) { - switch (a->type) { - case AVC_AUDIT_DATA_IPC: - audit_log_format(ab, " key=%d", a->u.ipc_id); - break; - case AVC_AUDIT_DATA_CAP: - audit_log_format(ab, " capability=%d", a->u.cap); - break; - case AVC_AUDIT_DATA_FS: - if (a->u.fs.path.dentry) { - struct dentry *dentry = a->u.fs.path.dentry; - if (a->u.fs.path.mnt) { - audit_log_d_path(ab, "path=", - &a->u.fs.path); - } else { - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, dentry->d_name.name); - } - inode = dentry->d_inode; - } else if (a->u.fs.inode) { - struct dentry *dentry; - inode = a->u.fs.inode; - dentry = d_find_alias(inode); - if (dentry) { - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, dentry->d_name.name); - dput(dentry); - } - } - if (inode) - audit_log_format(ab, " dev=%s ino=%lu", - inode->i_sb->s_id, - inode->i_ino); - break; - case AVC_AUDIT_DATA_NET: - if (a->u.net.sk) { - struct sock *sk = a->u.net.sk; - struct unix_sock *u; - int len = 0; - char *p = NULL; - - switch (sk->sk_family) { - case AF_INET: { - struct inet_sock *inet = inet_sk(sk); - - avc_print_ipv4_addr(ab, inet->rcv_saddr, - inet->sport, - "laddr", "lport"); - avc_print_ipv4_addr(ab, inet->daddr, - inet->dport, - "faddr", "fport"); - break; - } - case AF_INET6: { - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *inet6 = inet6_sk(sk); - - avc_print_ipv6_addr(ab, &inet6->rcv_saddr, - inet->sport, - "laddr", "lport"); - avc_print_ipv6_addr(ab, &inet6->daddr, - inet->dport, - "faddr", "fport"); - break; - } - case AF_UNIX: - u = unix_sk(sk); - if (u->dentry) { - struct path path = { - .dentry = u->dentry, - .mnt = u->mnt - }; - audit_log_d_path(ab, "path=", - &path); - break; - } - if (!u->addr) - break; - len = u->addr->len-sizeof(short); - p = &u->addr->name->sun_path[0]; - audit_log_format(ab, " path="); - if (*p) - audit_log_untrustedstring(ab, p); - else - audit_log_n_hex(ab, p, len); - break; - } - } - - switch (a->u.net.family) { - case AF_INET: - avc_print_ipv4_addr(ab, a->u.net.v4info.saddr, - a->u.net.sport, - "saddr", "src"); - avc_print_ipv4_addr(ab, a->u.net.v4info.daddr, - a->u.net.dport, - "daddr", "dest"); - break; - case AF_INET6: - avc_print_ipv6_addr(ab, &a->u.net.v6info.saddr, - a->u.net.sport, - "saddr", "src"); - avc_print_ipv6_addr(ab, &a->u.net.v6info.daddr, - a->u.net.dport, - "daddr", "dest"); - break; - } - if (a->u.net.netif > 0) { - struct net_device *dev; - - /* NOTE: we always use init's namespace */ - dev = dev_get_by_index(&init_net, - a->u.net.netif); - if (dev) { - audit_log_format(ab, " netif=%s", - dev->name); - dev_put(dev); - } - } - break; - } - } - audit_log_format(ab, " "); - avc_dump_query(ab, ssid, tsid, tclass); - audit_log_end(ab); + a->selinux_audit_data.tclass = tclass; + a->selinux_audit_data.requested = requested; + a->selinux_audit_data.ssid = ssid; + a->selinux_audit_data.tsid = tsid; + a->selinux_audit_data.audited = audited; + a->selinux_audit_data.denied = denied; + a->lsm_pre_audit = avc_audit_pre_callback; + a->lsm_post_audit = avc_audit_post_callback; + common_lsm_audit(a); } /** @@ -956,7 +841,7 @@ out: * another -errno upon other errors. */ int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, - u32 requested, struct avc_audit_data *auditdata) + u32 requested, struct common_audit_data *auditdata) { struct av_decision avd; int rc; @@ -970,3 +855,9 @@ u32 avc_policy_seqno(void) { return avc_cache.latest_notif; } + +void avc_disable(void) +{ + if (avc_node_cachep) + kmem_cache_destroy(avc_node_cachep); +} diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 8d8b69c5664..417f7c99452 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -13,8 +13,8 @@ * Eric Paris <eparis@redhat.com> * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * <dgoeddel@trustedcs.com> - * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. - * Paul Moore <paul.moore@hp.com> + * Copyright (C) 2006, 2007, 2009 Hewlett-Packard Development Company, L.P. + * Paul Moore <paul.moore@hp.com> * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. * Yuichi Nakamura <ynakam@hitachisoft.jp> * @@ -448,6 +448,10 @@ static int sb_finish_set_opts(struct super_block *sb) sbsec->behavior > ARRAY_SIZE(labeling_behaviors)) sbsec->flags &= ~SE_SBLABELSUPP; + /* Special handling for sysfs. Is genfs but also has setxattr handler*/ + if (strncmp(sb->s_type->name, "sysfs", sizeof("sysfs")) == 0) + sbsec->flags |= SE_SBLABELSUPP; + /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); @@ -1479,14 +1483,14 @@ static int task_has_capability(struct task_struct *tsk, const struct cred *cred, int cap, int audit) { - struct avc_audit_data ad; + struct common_audit_data ad; struct av_decision avd; u16 sclass; u32 sid = cred_sid(cred); u32 av = CAP_TO_MASK(cap); int rc; - AVC_AUDIT_DATA_INIT(&ad, CAP); + COMMON_AUDIT_DATA_INIT(&ad, CAP); ad.tsk = tsk; ad.u.cap = cap; @@ -1525,12 +1529,14 @@ static int task_has_system(struct task_struct *tsk, static int inode_has_perm(const struct cred *cred, struct inode *inode, u32 perms, - struct avc_audit_data *adp) + struct common_audit_data *adp) { struct inode_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid; + validate_creds(cred); + if (unlikely(IS_PRIVATE(inode))) return 0; @@ -1539,7 +1545,7 @@ static int inode_has_perm(const struct cred *cred, if (!adp) { adp = &ad; - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); ad.u.fs.inode = inode; } @@ -1555,9 +1561,9 @@ static inline int dentry_has_perm(const struct cred *cred, u32 av) { struct inode *inode = dentry->d_inode; - struct avc_audit_data ad; + struct common_audit_data ad; - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); ad.u.fs.path.mnt = mnt; ad.u.fs.path.dentry = dentry; return inode_has_perm(cred, inode, av, &ad); @@ -1577,11 +1583,11 @@ static int file_has_perm(const struct cred *cred, { struct file_security_struct *fsec = file->f_security; struct inode *inode = file->f_path.dentry->d_inode; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = cred_sid(cred); int rc; - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); ad.u.fs.path = file->f_path; if (sid != fsec->sid) { @@ -1612,7 +1618,7 @@ static int may_create(struct inode *dir, struct inode_security_struct *dsec; struct superblock_security_struct *sbsec; u32 sid, newsid; - struct avc_audit_data ad; + struct common_audit_data ad; int rc; dsec = dir->i_security; @@ -1621,7 +1627,7 @@ static int may_create(struct inode *dir, sid = tsec->sid; newsid = tsec->create_sid; - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); ad.u.fs.path.dentry = dentry; rc = avc_has_perm(sid, dsec->sid, SECCLASS_DIR, @@ -1665,7 +1671,7 @@ static int may_link(struct inode *dir, { struct inode_security_struct *dsec, *isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = current_sid(); u32 av; int rc; @@ -1673,7 +1679,7 @@ static int may_link(struct inode *dir, dsec = dir->i_security; isec = dentry->d_inode->i_security; - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); ad.u.fs.path.dentry = dentry; av = DIR__SEARCH; @@ -1708,7 +1714,7 @@ static inline int may_rename(struct inode *old_dir, struct dentry *new_dentry) { struct inode_security_struct *old_dsec, *new_dsec, *old_isec, *new_isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = current_sid(); u32 av; int old_is_dir, new_is_dir; @@ -1719,7 +1725,7 @@ static inline int may_rename(struct inode *old_dir, old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode); new_dsec = new_dir->i_security; - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); ad.u.fs.path.dentry = old_dentry; rc = avc_has_perm(sid, old_dsec->sid, SECCLASS_DIR, @@ -1761,7 +1767,7 @@ static inline int may_rename(struct inode *old_dir, static int superblock_has_perm(const struct cred *cred, struct super_block *sb, u32 perms, - struct avc_audit_data *ad) + struct common_audit_data *ad) { struct superblock_security_struct *sbsec; u32 sid = cred_sid(cred); @@ -1855,12 +1861,12 @@ static inline u32 open_file_to_av(struct file *file) /* Hook functions begin here. */ -static int selinux_ptrace_may_access(struct task_struct *child, +static int selinux_ptrace_access_check(struct task_struct *child, unsigned int mode) { int rc; - rc = cap_ptrace_may_access(child, mode); + rc = cap_ptrace_access_check(child, mode); if (rc) return rc; @@ -2101,7 +2107,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm) const struct task_security_struct *old_tsec; struct task_security_struct *new_tsec; struct inode_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; struct inode *inode = bprm->file->f_path.dentry->d_inode; int rc; @@ -2139,7 +2145,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm) return rc; } - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); ad.u.fs.path = bprm->file->f_path; if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) @@ -2232,7 +2238,7 @@ extern struct dentry *selinux_null; static inline void flush_unauthorized_files(const struct cred *cred, struct files_struct *files) { - struct avc_audit_data ad; + struct common_audit_data ad; struct file *file, *devnull = NULL; struct tty_struct *tty; struct fdtable *fdt; @@ -2266,7 +2272,7 @@ static inline void flush_unauthorized_files(const struct cred *cred, /* Revalidate access to inherited open files. */ - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); spin_lock(&files->file_lock); for (;;) { @@ -2515,7 +2521,7 @@ out: static int selinux_sb_kern_mount(struct super_block *sb, int flags, void *data) { const struct cred *cred = current_cred(); - struct avc_audit_data ad; + struct common_audit_data ad; int rc; rc = superblock_doinit(sb, data); @@ -2526,7 +2532,7 @@ static int selinux_sb_kern_mount(struct super_block *sb, int flags, void *data) if (flags & MS_KERNMOUNT) return 0; - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); ad.u.fs.path.dentry = sb->s_root; return superblock_has_perm(cred, sb, FILESYSTEM__MOUNT, &ad); } @@ -2534,9 +2540,9 @@ static int selinux_sb_kern_mount(struct super_block *sb, int flags, void *data) static int selinux_sb_statfs(struct dentry *dentry) { const struct cred *cred = current_cred(); - struct avc_audit_data ad; + struct common_audit_data ad; - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); ad.u.fs.path.dentry = dentry->d_sb->s_root; return superblock_has_perm(cred, dentry->d_sb, FILESYSTEM__GETATTR, &ad); } @@ -2711,12 +2717,18 @@ static int selinux_inode_permission(struct inode *inode, int mask) static int selinux_inode_setattr(struct dentry *dentry, struct iattr *iattr) { const struct cred *cred = current_cred(); + unsigned int ia_valid = iattr->ia_valid; + + /* ATTR_FORCE is just used for ATTR_KILL_S[UG]ID. */ + if (ia_valid & ATTR_FORCE) { + ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE | + ATTR_FORCE); + if (!ia_valid) + return 0; + } - if (iattr->ia_valid & ATTR_FORCE) - return 0; - - if (iattr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | - ATTR_ATIME_SET | ATTR_MTIME_SET)) + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | + ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET)) return dentry_has_perm(cred, NULL, dentry, FILE__SETATTR); return dentry_has_perm(cred, NULL, dentry, FILE__WRITE); @@ -2756,7 +2768,7 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name, struct inode *inode = dentry->d_inode; struct inode_security_struct *isec = inode->i_security; struct superblock_security_struct *sbsec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 newsid, sid = current_sid(); int rc = 0; @@ -2770,7 +2782,7 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name, if (!is_owner_or_cap(inode)) return -EPERM; - AVC_AUDIT_DATA_INIT(&ad, FS); + COMMON_AUDIT_DATA_INIT(&ad, FS); ad.u.fs.path.dentry = dentry; rc = avc_has_perm(sid, isec->sid, isec->sclass, @@ -2915,6 +2927,7 @@ static int selinux_inode_setsecurity(struct inode *inode, const char *name, return rc; isec->sid = newsid; + isec->initialized = 1; return 0; } @@ -2939,11 +2952,6 @@ static int selinux_revalidate_file_permission(struct file *file, int mask) const struct cred *cred = current_cred(); struct inode *inode = file->f_path.dentry->d_inode; - if (!mask) { - /* No permission to check. Existence test. */ - return 0; - } - /* file_mask_to_av won't add FILE__WRITE if MAY_APPEND is set */ if ((file->f_flags & O_APPEND) && (mask & MAY_WRITE)) mask |= MAY_APPEND; @@ -2954,10 +2962,20 @@ static int selinux_revalidate_file_permission(struct file *file, int mask) static int selinux_file_permission(struct file *file, int mask) { + struct inode *inode = file->f_path.dentry->d_inode; + struct file_security_struct *fsec = file->f_security; + struct inode_security_struct *isec = inode->i_security; + u32 sid = current_sid(); + if (!mask) /* No permission to check. Existence test. */ return 0; + if (sid == fsec->sid && fsec->isid == isec->sid && + fsec->pseqno == avc_policy_seqno()) + /* No change since dentry_open check. */ + return 0; + return selinux_revalidate_file_permission(file, mask); } @@ -3220,12 +3238,29 @@ static int selinux_task_create(unsigned long clone_flags) } /* + * allocate the SELinux part of blank credentials + */ +static int selinux_cred_alloc_blank(struct cred *cred, gfp_t gfp) +{ + struct task_security_struct *tsec; + + tsec = kzalloc(sizeof(struct task_security_struct), gfp); + if (!tsec) + return -ENOMEM; + + cred->security = tsec; + return 0; +} + +/* * detach and free the LSM part of a set of credentials */ static void selinux_cred_free(struct cred *cred) { struct task_security_struct *tsec = cred->security; - cred->security = NULL; + + BUG_ON((unsigned long) cred->security < PAGE_SIZE); + cred->security = (void *) 0x7UL; kfree(tsec); } @@ -3249,6 +3284,17 @@ static int selinux_cred_prepare(struct cred *new, const struct cred *old, } /* + * transfer the SELinux data to a blank set of creds + */ +static void selinux_cred_transfer(struct cred *new, const struct cred *old) +{ + const struct task_security_struct *old_tsec = old->security; + struct task_security_struct *tsec = new->security; + + *tsec = *old_tsec; +} + +/* * set the security data for a kernel service * - all the creation contexts are set to unlabelled */ @@ -3292,6 +3338,11 @@ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) return 0; } +static int selinux_kernel_module_request(void) +{ + return task_has_system(current, SYSTEM__MODULE_REQUEST); +} + static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { return current_has_perm(p, PROCESS__SETPGID); @@ -3409,7 +3460,7 @@ static void selinux_task_to_inode(struct task_struct *p, /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv4(struct sk_buff *skb, - struct avc_audit_data *ad, u8 *proto) + struct common_audit_data *ad, u8 *proto) { int offset, ihlen, ret = -EINVAL; struct iphdr _iph, *ih; @@ -3490,7 +3541,7 @@ out: /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv6(struct sk_buff *skb, - struct avc_audit_data *ad, u8 *proto) + struct common_audit_data *ad, u8 *proto) { u8 nexthdr; int ret = -EINVAL, offset; @@ -3561,7 +3612,7 @@ out: #endif /* IPV6 */ -static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, +static int selinux_parse_skb(struct sk_buff *skb, struct common_audit_data *ad, char **_addrp, int src, u8 *proto) { char *addrp; @@ -3643,7 +3694,7 @@ static int socket_has_perm(struct task_struct *task, struct socket *sock, u32 perms) { struct inode_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid; int err = 0; @@ -3653,7 +3704,7 @@ static int socket_has_perm(struct task_struct *task, struct socket *sock, goto out; sid = task_sid(task); - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.sk = sock->sk; err = avc_has_perm(sid, isec->sid, isec->sclass, perms, &ad); @@ -3740,7 +3791,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in if (family == PF_INET || family == PF_INET6) { char *addrp; struct inode_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; unsigned short snum; @@ -3769,7 +3820,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in snum, &sid); if (err) goto out; - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.sport = htons(snum); ad.u.net.family = family; err = avc_has_perm(isec->sid, sid, @@ -3802,7 +3853,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in if (err) goto out; - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.sport = htons(snum); ad.u.net.family = family; @@ -3836,7 +3887,7 @@ static int selinux_socket_connect(struct socket *sock, struct sockaddr *address, isec = SOCK_INODE(sock)->i_security; if (isec->sclass == SECCLASS_TCP_SOCKET || isec->sclass == SECCLASS_DCCP_SOCKET) { - struct avc_audit_data ad; + struct common_audit_data ad; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; unsigned short snum; @@ -3861,7 +3912,7 @@ static int selinux_socket_connect(struct socket *sock, struct sockaddr *address, perm = (isec->sclass == SECCLASS_TCP_SOCKET) ? TCP_SOCKET__NAME_CONNECT : DCCP_SOCKET__NAME_CONNECT; - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.dport = htons(snum); ad.u.net.family = sk->sk_family; err = avc_has_perm(isec->sid, sid, isec->sclass, perm, &ad); @@ -3951,13 +4002,13 @@ static int selinux_socket_unix_stream_connect(struct socket *sock, struct sk_security_struct *ssec; struct inode_security_struct *isec; struct inode_security_struct *other_isec; - struct avc_audit_data ad; + struct common_audit_data ad; int err; isec = SOCK_INODE(sock)->i_security; other_isec = SOCK_INODE(other)->i_security; - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.sk = other->sk; err = avc_has_perm(isec->sid, other_isec->sid, @@ -3983,13 +4034,13 @@ static int selinux_socket_unix_may_send(struct socket *sock, { struct inode_security_struct *isec; struct inode_security_struct *other_isec; - struct avc_audit_data ad; + struct common_audit_data ad; int err; isec = SOCK_INODE(sock)->i_security; other_isec = SOCK_INODE(other)->i_security; - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.sk = other->sk; err = avc_has_perm(isec->sid, other_isec->sid, @@ -4002,7 +4053,7 @@ static int selinux_socket_unix_may_send(struct socket *sock, static int selinux_inet_sys_rcv_skb(int ifindex, char *addrp, u16 family, u32 peer_sid, - struct avc_audit_data *ad) + struct common_audit_data *ad) { int err; u32 if_sid; @@ -4030,10 +4081,10 @@ static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, struct sk_security_struct *sksec = sk->sk_security; u32 peer_sid; u32 sk_sid = sksec->sid; - struct avc_audit_data ad; + struct common_audit_data ad; char *addrp; - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.netif = skb->iif; ad.u.net.family = family; err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); @@ -4071,7 +4122,7 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) struct sk_security_struct *sksec = sk->sk_security; u16 family = sk->sk_family; u32 sk_sid = sksec->sid; - struct avc_audit_data ad; + struct common_audit_data ad; char *addrp; u8 secmark_active; u8 peerlbl_active; @@ -4095,7 +4146,7 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) if (!secmark_active && !peerlbl_active) return 0; - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.netif = skb->iif; ad.u.net.family = family; err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); @@ -4309,6 +4360,59 @@ static void selinux_req_classify_flow(const struct request_sock *req, fl->secid = req->secid; } +static int selinux_tun_dev_create(void) +{ + u32 sid = current_sid(); + + /* we aren't taking into account the "sockcreate" SID since the socket + * that is being created here is not a socket in the traditional sense, + * instead it is a private sock, accessible only to the kernel, and + * representing a wide range of network traffic spanning multiple + * connections unlike traditional sockets - check the TUN driver to + * get a better understanding of why this socket is special */ + + return avc_has_perm(sid, sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__CREATE, + NULL); +} + +static void selinux_tun_dev_post_create(struct sock *sk) +{ + struct sk_security_struct *sksec = sk->sk_security; + + /* we don't currently perform any NetLabel based labeling here and it + * isn't clear that we would want to do so anyway; while we could apply + * labeling without the support of the TUN user the resulting labeled + * traffic from the other end of the connection would almost certainly + * cause confusion to the TUN user that had no idea network labeling + * protocols were being used */ + + /* see the comments in selinux_tun_dev_create() about why we don't use + * the sockcreate SID here */ + + sksec->sid = current_sid(); + sksec->sclass = SECCLASS_TUN_SOCKET; +} + +static int selinux_tun_dev_attach(struct sock *sk) +{ + struct sk_security_struct *sksec = sk->sk_security; + u32 sid = current_sid(); + int err; + + err = avc_has_perm(sid, sksec->sid, SECCLASS_TUN_SOCKET, + TUN_SOCKET__RELABELFROM, NULL); + if (err) + return err; + err = avc_has_perm(sid, sid, SECCLASS_TUN_SOCKET, + TUN_SOCKET__RELABELTO, NULL); + if (err) + return err; + + sksec->sid = sid; + + return 0; +} + static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff *skb) { int err = 0; @@ -4353,7 +4457,7 @@ static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex, int err; char *addrp; u32 peer_sid; - struct avc_audit_data ad; + struct common_audit_data ad; u8 secmark_active; u8 netlbl_active; u8 peerlbl_active; @@ -4370,7 +4474,7 @@ static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex, if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) return NF_DROP; - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.netif = ifindex; ad.u.net.family = family; if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0) @@ -4458,7 +4562,7 @@ static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, { struct sock *sk = skb->sk; struct sk_security_struct *sksec; - struct avc_audit_data ad; + struct common_audit_data ad; char *addrp; u8 proto; @@ -4466,7 +4570,7 @@ static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, return NF_ACCEPT; sksec = sk->sk_security; - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.netif = ifindex; ad.u.net.family = family; if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto)) @@ -4490,7 +4594,7 @@ static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex, u32 secmark_perm; u32 peer_sid; struct sock *sk; - struct avc_audit_data ad; + struct common_audit_data ad; char *addrp; u8 secmark_active; u8 peerlbl_active; @@ -4549,7 +4653,7 @@ static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex, secmark_perm = PACKET__SEND; } - AVC_AUDIT_DATA_INIT(&ad, NET); + COMMON_AUDIT_DATA_INIT(&ad, NET); ad.u.net.netif = ifindex; ad.u.net.family = family; if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL)) @@ -4619,13 +4723,13 @@ static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb) static int selinux_netlink_recv(struct sk_buff *skb, int capability) { int err; - struct avc_audit_data ad; + struct common_audit_data ad; err = cap_netlink_recv(skb, capability); if (err) return err; - AVC_AUDIT_DATA_INIT(&ad, CAP); + COMMON_AUDIT_DATA_INIT(&ad, CAP); ad.u.cap = capability; return avc_has_perm(NETLINK_CB(skb).sid, NETLINK_CB(skb).sid, @@ -4684,12 +4788,12 @@ static int ipc_has_perm(struct kern_ipc_perm *ipc_perms, u32 perms) { struct ipc_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = current_sid(); isec = ipc_perms->security; - AVC_AUDIT_DATA_INIT(&ad, IPC); + COMMON_AUDIT_DATA_INIT(&ad, IPC); ad.u.ipc_id = ipc_perms->key; return avc_has_perm(sid, isec->sid, isec->sclass, perms, &ad); @@ -4709,7 +4813,7 @@ static void selinux_msg_msg_free_security(struct msg_msg *msg) static int selinux_msg_queue_alloc_security(struct msg_queue *msq) { struct ipc_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = current_sid(); int rc; @@ -4719,7 +4823,7 @@ static int selinux_msg_queue_alloc_security(struct msg_queue *msq) isec = msq->q_perm.security; - AVC_AUDIT_DATA_INIT(&ad, IPC); + COMMON_AUDIT_DATA_INIT(&ad, IPC); ad.u.ipc_id = msq->q_perm.key; rc = avc_has_perm(sid, isec->sid, SECCLASS_MSGQ, @@ -4739,12 +4843,12 @@ static void selinux_msg_queue_free_security(struct msg_queue *msq) static int selinux_msg_queue_associate(struct msg_queue *msq, int msqflg) { struct ipc_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = current_sid(); isec = msq->q_perm.security; - AVC_AUDIT_DATA_INIT(&ad, IPC); + COMMON_AUDIT_DATA_INIT(&ad, IPC); ad.u.ipc_id = msq->q_perm.key; return avc_has_perm(sid, isec->sid, SECCLASS_MSGQ, @@ -4783,7 +4887,7 @@ static int selinux_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, { struct ipc_security_struct *isec; struct msg_security_struct *msec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = current_sid(); int rc; @@ -4804,7 +4908,7 @@ static int selinux_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, return rc; } - AVC_AUDIT_DATA_INIT(&ad, IPC); + COMMON_AUDIT_DATA_INIT(&ad, IPC); ad.u.ipc_id = msq->q_perm.key; /* Can this process write to the queue? */ @@ -4828,14 +4932,14 @@ static int selinux_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg, { struct ipc_security_struct *isec; struct msg_security_struct *msec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = task_sid(target); int rc; isec = msq->q_perm.security; msec = msg->security; - AVC_AUDIT_DATA_INIT(&ad, IPC); + COMMON_AUDIT_DATA_INIT(&ad, IPC); ad.u.ipc_id = msq->q_perm.key; rc = avc_has_perm(sid, isec->sid, @@ -4850,7 +4954,7 @@ static int selinux_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg, static int selinux_shm_alloc_security(struct shmid_kernel *shp) { struct ipc_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = current_sid(); int rc; @@ -4860,7 +4964,7 @@ static int selinux_shm_alloc_security(struct shmid_kernel *shp) isec = shp->shm_perm.security; - AVC_AUDIT_DATA_INIT(&ad, IPC); + COMMON_AUDIT_DATA_INIT(&ad, IPC); ad.u.ipc_id = shp->shm_perm.key; rc = avc_has_perm(sid, isec->sid, SECCLASS_SHM, @@ -4880,12 +4984,12 @@ static void selinux_shm_free_security(struct shmid_kernel *shp) static int selinux_shm_associate(struct shmid_kernel *shp, int shmflg) { struct ipc_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = current_sid(); isec = shp->shm_perm.security; - AVC_AUDIT_DATA_INIT(&ad, IPC); + COMMON_AUDIT_DATA_INIT(&ad, IPC); ad.u.ipc_id = shp->shm_perm.key; return avc_has_perm(sid, isec->sid, SECCLASS_SHM, @@ -4942,7 +5046,7 @@ static int selinux_shm_shmat(struct shmid_kernel *shp, static int selinux_sem_alloc_security(struct sem_array *sma) { struct ipc_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = current_sid(); int rc; @@ -4952,7 +5056,7 @@ static int selinux_sem_alloc_security(struct sem_array *sma) isec = sma->sem_perm.security; - AVC_AUDIT_DATA_INIT(&ad, IPC); + COMMON_AUDIT_DATA_INIT(&ad, IPC); ad.u.ipc_id = sma->sem_perm.key; rc = avc_has_perm(sid, isec->sid, SECCLASS_SEM, @@ -4972,12 +5076,12 @@ static void selinux_sem_free_security(struct sem_array *sma) static int selinux_sem_associate(struct sem_array *sma, int semflg) { struct ipc_security_struct *isec; - struct avc_audit_data ad; + struct common_audit_data ad; u32 sid = current_sid(); isec = sma->sem_perm.security; - AVC_AUDIT_DATA_INIT(&ad, IPC); + COMMON_AUDIT_DATA_INIT(&ad, IPC); ad.u.ipc_id = sma->sem_perm.key; return avc_has_perm(sid, isec->sid, SECCLASS_SEM, @@ -5195,7 +5299,7 @@ static int selinux_setprocattr(struct task_struct *p, /* Only allow single threaded processes to change context */ error = -EPERM; - if (!is_single_threaded(p)) { + if (!current_is_single_threaded()) { error = security_bounded_transition(tsec->sid, sid); if (error) goto abort_change; @@ -5252,6 +5356,32 @@ static void selinux_release_secctx(char *secdata, u32 seclen) kfree(secdata); } +/* + * called with inode->i_mutex locked + */ +static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) +{ + return selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX, ctx, ctxlen, 0); +} + +/* + * called with inode->i_mutex locked + */ +static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) +{ + return __vfs_setxattr_noperm(dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0); +} + +static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +{ + int len = 0; + len = selinux_inode_getsecurity(inode, XATTR_SELINUX_SUFFIX, + ctx, true); + if (len < 0) + return len; + *ctxlen = len; + return 0; +} #ifdef CONFIG_KEYS static int selinux_key_alloc(struct key *k, const struct cred *cred, @@ -5323,7 +5453,7 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer) static struct security_operations selinux_ops = { .name = "selinux", - .ptrace_may_access = selinux_ptrace_may_access, + .ptrace_access_check = selinux_ptrace_access_check, .ptrace_traceme = selinux_ptrace_traceme, .capget = selinux_capget, .capset = selinux_capset, @@ -5396,10 +5526,13 @@ static struct security_operations selinux_ops = { .dentry_open = selinux_dentry_open, .task_create = selinux_task_create, + .cred_alloc_blank = selinux_cred_alloc_blank, .cred_free = selinux_cred_free, .cred_prepare = selinux_cred_prepare, + .cred_transfer = selinux_cred_transfer, .kernel_act_as = selinux_kernel_act_as, .kernel_create_files_as = selinux_kernel_create_files_as, + .kernel_module_request = selinux_kernel_module_request, .task_setpgid = selinux_task_setpgid, .task_getpgid = selinux_task_getpgid, .task_getsid = selinux_task_getsid, @@ -5448,6 +5581,9 @@ static struct security_operations selinux_ops = { .secid_to_secctx = selinux_secid_to_secctx, .secctx_to_secid = selinux_secctx_to_secid, .release_secctx = selinux_release_secctx, + .inode_notifysecctx = selinux_inode_notifysecctx, + .inode_setsecctx = selinux_inode_setsecctx, + .inode_getsecctx = selinux_inode_getsecctx, .unix_stream_connect = selinux_socket_unix_stream_connect, .unix_may_send = selinux_socket_unix_may_send, @@ -5477,6 +5613,9 @@ static struct security_operations selinux_ops = { .inet_csk_clone = selinux_inet_csk_clone, .inet_conn_established = selinux_inet_conn_established, .req_classify_flow = selinux_req_classify_flow, + .tun_dev_create = selinux_tun_dev_create, + .tun_dev_post_create = selinux_tun_dev_post_create, + .tun_dev_attach = selinux_tun_dev_attach, #ifdef CONFIG_SECURITY_NETWORK_XFRM .xfrm_policy_alloc_security = selinux_xfrm_policy_alloc, @@ -5691,6 +5830,9 @@ int selinux_disable(void) selinux_disabled = 1; selinux_enabled = 0; + /* Try to destroy the avc node cache */ + avc_disable(); + /* Reset security_ops to the secondary module, dummy or capability. */ security_ops = secondary_ops; diff --git a/security/selinux/include/av_inherit.h b/security/selinux/include/av_inherit.h index 8377a4ba3b9..abedcd704da 100644 --- a/security/selinux/include/av_inherit.h +++ b/security/selinux/include/av_inherit.h @@ -15,6 +15,7 @@ S_(SECCLASS_KEY_SOCKET, socket, 0x00400000UL) S_(SECCLASS_UNIX_STREAM_SOCKET, socket, 0x00400000UL) S_(SECCLASS_UNIX_DGRAM_SOCKET, socket, 0x00400000UL) + S_(SECCLASS_TUN_SOCKET, socket, 0x00400000UL) S_(SECCLASS_IPC, ipc, 0x00000200UL) S_(SECCLASS_SEM, ipc, 0x00000200UL) S_(SECCLASS_MSGQ, ipc, 0x00000200UL) diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h index 31df1d7c1ae..2b683ad83d2 100644 --- a/security/selinux/include/av_perm_to_string.h +++ b/security/selinux/include/av_perm_to_string.h @@ -107,6 +107,7 @@ S_(SECCLASS_SYSTEM, SYSTEM__SYSLOG_READ, "syslog_read") S_(SECCLASS_SYSTEM, SYSTEM__SYSLOG_MOD, "syslog_mod") S_(SECCLASS_SYSTEM, SYSTEM__SYSLOG_CONSOLE, "syslog_console") + S_(SECCLASS_SYSTEM, SYSTEM__MODULE_REQUEST, "module_request") S_(SECCLASS_CAPABILITY, CAPABILITY__CHOWN, "chown") S_(SECCLASS_CAPABILITY, CAPABILITY__DAC_OVERRIDE, "dac_override") S_(SECCLASS_CAPABILITY, CAPABILITY__DAC_READ_SEARCH, "dac_read_search") diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h index d645192ee95..0546d616cca 100644 --- a/security/selinux/include/av_permissions.h +++ b/security/selinux/include/av_permissions.h @@ -423,6 +423,28 @@ #define UNIX_DGRAM_SOCKET__RECV_MSG 0x00080000UL #define UNIX_DGRAM_SOCKET__SEND_MSG 0x00100000UL #define UNIX_DGRAM_SOCKET__NAME_BIND 0x00200000UL +#define TUN_SOCKET__IOCTL 0x00000001UL +#define TUN_SOCKET__READ 0x00000002UL +#define TUN_SOCKET__WRITE 0x00000004UL +#define TUN_SOCKET__CREATE 0x00000008UL +#define TUN_SOCKET__GETATTR 0x00000010UL +#define TUN_SOCKET__SETATTR 0x00000020UL +#define TUN_SOCKET__LOCK 0x00000040UL +#define TUN_SOCKET__RELABELFROM 0x00000080UL +#define TUN_SOCKET__RELABELTO 0x00000100UL +#define TUN_SOCKET__APPEND 0x00000200UL +#define TUN_SOCKET__BIND 0x00000400UL +#define TUN_SOCKET__CONNECT 0x00000800UL +#define TUN_SOCKET__LISTEN 0x00001000UL +#define TUN_SOCKET__ACCEPT 0x00002000UL +#define TUN_SOCKET__GETOPT 0x00004000UL +#define TUN_SOCKET__SETOPT 0x00008000UL +#define TUN_SOCKET__SHUTDOWN 0x00010000UL +#define TUN_SOCKET__RECVFROM 0x00020000UL +#define TUN_SOCKET__SENDTO 0x00040000UL +#define TUN_SOCKET__RECV_MSG 0x00080000UL +#define TUN_SOCKET__SEND_MSG 0x00100000UL +#define TUN_SOCKET__NAME_BIND 0x00200000UL #define PROCESS__FORK 0x00000001UL #define PROCESS__TRANSITION 0x00000002UL #define PROCESS__SIGCHLD 0x00000004UL @@ -508,6 +530,7 @@ #define SYSTEM__SYSLOG_READ 0x00000002UL #define SYSTEM__SYSLOG_MOD 0x00000004UL #define SYSTEM__SYSLOG_CONSOLE 0x00000008UL +#define SYSTEM__MODULE_REQUEST 0x00000010UL #define CAPABILITY__CHOWN 0x00000001UL #define CAPABILITY__DAC_OVERRIDE 0x00000002UL #define CAPABILITY__DAC_READ_SEARCH 0x00000004UL diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index d12ff1a9c0a..e94e82f7381 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -13,6 +13,7 @@ #include <linux/spinlock.h> #include <linux/init.h> #include <linux/audit.h> +#include <linux/lsm_audit.h> #include <linux/in6.h> #include <linux/path.h> #include <asm/system.h> @@ -36,48 +37,6 @@ struct inode; struct sock; struct sk_buff; -/* Auxiliary data to use in generating the audit record. */ -struct avc_audit_data { - char type; -#define AVC_AUDIT_DATA_FS 1 -#define AVC_AUDIT_DATA_NET 2 -#define AVC_AUDIT_DATA_CAP 3 -#define AVC_AUDIT_DATA_IPC 4 - struct task_struct *tsk; - union { - struct { - struct path path; - struct inode *inode; - } fs; - struct { - int netif; - struct sock *sk; - u16 family; - __be16 dport; - __be16 sport; - union { - struct { - __be32 daddr; - __be32 saddr; - } v4; - struct { - struct in6_addr daddr; - struct in6_addr saddr; - } v6; - } fam; - } net; - int cap; - int ipc_id; - } u; -}; - -#define v4info fam.v4 -#define v6info fam.v6 - -/* Initialize an AVC audit data structure. */ -#define AVC_AUDIT_DATA_INIT(_d,_t) \ - { memset((_d), 0, sizeof(struct avc_audit_data)); (_d)->type = AVC_AUDIT_DATA_##_t; } - /* * AVC statistics */ @@ -98,7 +57,9 @@ void __init avc_init(void); void avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, - struct av_decision *avd, int result, struct avc_audit_data *auditdata); + struct av_decision *avd, + int result, + struct common_audit_data *a); #define AVC_STRICT 1 /* Ignore permissive mode. */ int avc_has_perm_noaudit(u32 ssid, u32 tsid, @@ -108,7 +69,7 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid, int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, - struct avc_audit_data *auditdata); + struct common_audit_data *auditdata); u32 avc_policy_seqno(void); @@ -127,13 +88,13 @@ int avc_add_callback(int (*callback)(u32 event, u32 ssid, u32 tsid, u32 events, u32 ssid, u32 tsid, u16 tclass, u32 perms); -/* Shows permission in human readable form */ -void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av); - /* Exported to selinuxfs */ int avc_get_hash_stats(char *page); extern unsigned int avc_cache_threshold; +/* Attempt to free avc node cache */ +void avc_disable(void); + #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS DECLARE_PER_CPU(struct avc_cache_stats, avc_cache_stats); #endif diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h index 21ec786611d..7ab9299bfb6 100644 --- a/security/selinux/include/class_to_string.h +++ b/security/selinux/include/class_to_string.h @@ -77,3 +77,4 @@ S_(NULL) S_(NULL) S_("kernel_service") + S_("tun_socket") diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h index 882f27d66fa..f248500a1e3 100644 --- a/security/selinux/include/flask.h +++ b/security/selinux/include/flask.h @@ -53,6 +53,7 @@ #define SECCLASS_PEER 68 #define SECCLASS_CAPABILITY2 69 #define SECCLASS_KERNEL_SERVICE 74 +#define SECCLASS_TUN_SOCKET 75 /* * Security identifier indices for initial entities diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h index b4b5b9b2f0b..8d7384280a7 100644 --- a/security/selinux/include/netlabel.h +++ b/security/selinux/include/netlabel.h @@ -59,7 +59,7 @@ int selinux_netlbl_socket_post_create(struct sock *sk, u16 family); int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, u16 family, - struct avc_audit_data *ad); + struct common_audit_data *ad); int selinux_netlbl_socket_setsockopt(struct socket *sock, int level, int optname); @@ -129,7 +129,7 @@ static inline int selinux_netlbl_socket_post_create(struct sock *sk, static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, u16 family, - struct avc_audit_data *ad) + struct common_audit_data *ad) { return 0; } diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 289e24b39e3..13128f9a3e5 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -41,9 +41,9 @@ static inline int selinux_xfrm_enabled(void) } int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb, - struct avc_audit_data *ad); + struct common_audit_data *ad); int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, - struct avc_audit_data *ad, u8 proto); + struct common_audit_data *ad, u8 proto); int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall); static inline void selinux_xfrm_notify_policyload(void) @@ -57,13 +57,13 @@ static inline int selinux_xfrm_enabled(void) } static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, - struct avc_audit_data *ad) + struct common_audit_data *ad) { return 0; } static inline int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, - struct avc_audit_data *ad, u8 proto) + struct common_audit_data *ad, u8 proto) { return 0; } diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index 2e984413c7b..e68823741ad 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -342,7 +342,7 @@ int selinux_netlbl_socket_post_create(struct sock *sk, u16 family) int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, u16 family, - struct avc_audit_data *ad) + struct common_audit_data *ad) { int rc; u32 nlbl_sid; diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 500e6f78e11..ff17820d35e 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -22,6 +22,11 @@ * * Added validation of kernel classes and permissions * + * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com> + * + * Added support for bounds domain and audit messaged on masked permissions + * + * Copyright (C) 2008, 2009 NEC Corporation * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC @@ -279,6 +284,95 @@ mls_ops: } /* + * security_dump_masked_av - dumps masked permissions during + * security_compute_av due to RBAC, MLS/Constraint and Type bounds. + */ +static int dump_masked_av_helper(void *k, void *d, void *args) +{ + struct perm_datum *pdatum = d; + char **permission_names = args; + + BUG_ON(pdatum->value < 1 || pdatum->value > 32); + + permission_names[pdatum->value - 1] = (char *)k; + + return 0; +} + +static void security_dump_masked_av(struct context *scontext, + struct context *tcontext, + u16 tclass, + u32 permissions, + const char *reason) +{ + struct common_datum *common_dat; + struct class_datum *tclass_dat; + struct audit_buffer *ab; + char *tclass_name; + char *scontext_name = NULL; + char *tcontext_name = NULL; + char *permission_names[32]; + int index, length; + bool need_comma = false; + + if (!permissions) + return; + + tclass_name = policydb.p_class_val_to_name[tclass - 1]; + tclass_dat = policydb.class_val_to_struct[tclass - 1]; + common_dat = tclass_dat->comdatum; + + /* init permission_names */ + if (common_dat && + hashtab_map(common_dat->permissions.table, + dump_masked_av_helper, permission_names) < 0) + goto out; + + if (hashtab_map(tclass_dat->permissions.table, + dump_masked_av_helper, permission_names) < 0) + goto out; + + /* get scontext/tcontext in text form */ + if (context_struct_to_string(scontext, + &scontext_name, &length) < 0) + goto out; + + if (context_struct_to_string(tcontext, + &tcontext_name, &length) < 0) + goto out; + + /* audit a message */ + ab = audit_log_start(current->audit_context, + GFP_ATOMIC, AUDIT_SELINUX_ERR); + if (!ab) + goto out; + + audit_log_format(ab, "op=security_compute_av reason=%s " + "scontext=%s tcontext=%s tclass=%s perms=", + reason, scontext_name, tcontext_name, tclass_name); + + for (index = 0; index < 32; index++) { + u32 mask = (1 << index); + + if ((mask & permissions) == 0) + continue; + + audit_log_format(ab, "%s%s", + need_comma ? "," : "", + permission_names[index] + ? permission_names[index] : "????"); + need_comma = true; + } + audit_log_end(ab); +out: + /* release scontext/tcontext */ + kfree(tcontext_name); + kfree(scontext_name); + + return; +} + +/* * security_boundary_permission - drops violated permissions * on boundary constraint. */ @@ -347,28 +441,12 @@ static void type_attribute_bounds_av(struct context *scontext, } if (masked) { - struct audit_buffer *ab; - char *stype_name - = policydb.p_type_val_to_name[source->value - 1]; - char *ttype_name - = policydb.p_type_val_to_name[target->value - 1]; - char *tclass_name - = policydb.p_class_val_to_name[tclass - 1]; - /* mask violated permissions */ avd->allowed &= ~masked; - /* notice to userspace via audit message */ - ab = audit_log_start(current->audit_context, - GFP_ATOMIC, AUDIT_SELINUX_ERR); - if (!ab) - return; - - audit_log_format(ab, "av boundary violation: " - "source=%s target=%s tclass=%s", - stype_name, ttype_name, tclass_name); - avc_dump_av(ab, tclass, masked); - audit_log_end(ab); + /* audit masked permissions */ + security_dump_masked_av(scontext, tcontext, + tclass, masked, "bounds"); } } @@ -480,7 +558,7 @@ static int context_struct_compute_av(struct context *scontext, if ((constraint->permissions & (avd->allowed)) && !constraint_expr_eval(scontext, tcontext, NULL, constraint->expr)) { - avd->allowed = (avd->allowed) & ~(constraint->permissions); + avd->allowed &= ~(constraint->permissions); } constraint = constraint->next; } @@ -499,8 +577,8 @@ static int context_struct_compute_av(struct context *scontext, break; } if (!ra) - avd->allowed = (avd->allowed) & ~(PROCESS__TRANSITION | - PROCESS__DYNTRANSITION); + avd->allowed &= ~(PROCESS__TRANSITION | + PROCESS__DYNTRANSITION); } /* @@ -687,6 +765,26 @@ int security_bounded_transition(u32 old_sid, u32 new_sid) } index = type->bounds; } + + if (rc) { + char *old_name = NULL; + char *new_name = NULL; + int length; + + if (!context_struct_to_string(old_context, + &old_name, &length) && + !context_struct_to_string(new_context, + &new_name, &length)) { + audit_log(current->audit_context, + GFP_ATOMIC, AUDIT_SELINUX_ERR, + "op=security_bounded_transition " + "result=denied " + "oldcontext=%s newcontext=%s", + old_name, new_name); + } + kfree(new_name); + kfree(old_name); + } out: read_unlock(&policy_rwlock); diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 72b18452e1a..f3cb9ed731a 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -401,7 +401,7 @@ int selinux_xfrm_state_delete(struct xfrm_state *x) * gone thru the IPSec process. */ int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, - struct avc_audit_data *ad) + struct common_audit_data *ad) { int i, rc = 0; struct sec_path *sp; @@ -442,7 +442,7 @@ int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, * checked in the selinux_xfrm_state_pol_flow_match hook above. */ int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, - struct avc_audit_data *ad, u8 proto) + struct common_audit_data *ad, u8 proto) { struct dst_entry *dst; int rc = 0; diff --git a/security/smack/smack.h b/security/smack/smack.h index 243bec175be..c6e9acae72e 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -275,7 +275,7 @@ static inline void smk_ad_init(struct smk_audit_info *a, const char *func, { memset(a, 0, sizeof(*a)); a->a.type = type; - a->a.function = func; + a->a.smack_audit_data.function = func; } static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a, diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index 513dc1aa16d..0f9ac814690 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -240,8 +240,9 @@ static inline void smack_str_from_perm(char *string, int access) static void smack_log_callback(struct audit_buffer *ab, void *a) { struct common_audit_data *ad = a; - struct smack_audit_data *sad = &ad->lsm_priv.smack_audit_data; - audit_log_format(ab, "lsm=SMACK fn=%s action=%s", ad->function, + struct smack_audit_data *sad = &ad->smack_audit_data; + audit_log_format(ab, "lsm=SMACK fn=%s action=%s", + ad->smack_audit_data.function, sad->result ? "denied" : "granted"); audit_log_format(ab, " subject="); audit_log_untrustedstring(ab, sad->subject); @@ -274,11 +275,11 @@ void smack_log(char *subject_label, char *object_label, int request, if (result == 0 && (log_policy & SMACK_AUDIT_ACCEPT) == 0) return; - if (a->function == NULL) - a->function = "unknown"; + if (a->smack_audit_data.function == NULL) + a->smack_audit_data.function = "unknown"; /* end preparing the audit data */ - sad = &a->lsm_priv.smack_audit_data; + sad = &a->smack_audit_data; smack_str_from_perm(request_buffer, request); sad->subject = subject_label; sad->object = object_label; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 0023182078c..acae7ef4092 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -91,7 +91,7 @@ struct inode_smack *new_inode_smack(char *smack) */ /** - * smack_ptrace_may_access - Smack approval on PTRACE_ATTACH + * smack_ptrace_access_check - Smack approval on PTRACE_ATTACH * @ctp: child task pointer * @mode: ptrace attachment mode * @@ -99,13 +99,13 @@ struct inode_smack *new_inode_smack(char *smack) * * Do the capability checks, and require read and write. */ -static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode) +static int smack_ptrace_access_check(struct task_struct *ctp, unsigned int mode) { int rc; struct smk_audit_info ad; char *sp, *tsp; - rc = cap_ptrace_may_access(ctp, mode); + rc = cap_ptrace_access_check(ctp, mode); if (rc != 0) return rc; @@ -1080,6 +1080,22 @@ static int smack_file_receive(struct file *file) */ /** + * smack_cred_alloc_blank - "allocate" blank task-level security credentials + * @new: the new credentials + * @gfp: the atomicity of any memory allocations + * + * Prepare a blank set of credentials for modification. This must allocate all + * the memory the LSM module might require such that cred_transfer() can + * complete without error. + */ +static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp) +{ + cred->security = NULL; + return 0; +} + + +/** * smack_cred_free - "free" task-level security credentials * @cred: the credentials in question * @@ -1117,6 +1133,18 @@ static void smack_cred_commit(struct cred *new, const struct cred *old) } /** + * smack_cred_transfer - Transfer the old credentials to the new credentials + * @new: the new credentials + * @old: the original credentials + * + * Fill in a set of blank credentials from another set of credentials. + */ +static void smack_cred_transfer(struct cred *new, const struct cred *old) +{ + new->security = old->security; +} + +/** * smack_kernel_act_as - Set the subjective context in a set of credentials * @new: points to the set of credentials to be modified. * @secid: specifies the security ID to be set @@ -1638,6 +1666,7 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name, if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) { nsp->smk_inode = sp; + nsp->smk_flags |= SMK_INODE_INSTANT; return 0; } /* @@ -2464,7 +2493,7 @@ static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg, /* * Perfectly reasonable for this to be NULL */ - if (sip == NULL || sip->sin_family != PF_INET) + if (sip == NULL || sip->sin_family != AF_INET) return 0; return smack_netlabel_send(sock->sk, sip); @@ -3029,10 +3058,31 @@ static void smack_release_secctx(char *secdata, u32 seclen) { } +static int smack_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) +{ + return smack_inode_setsecurity(inode, XATTR_SMACK_SUFFIX, ctx, ctxlen, 0); +} + +static int smack_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) +{ + return __vfs_setxattr_noperm(dentry, XATTR_NAME_SMACK, ctx, ctxlen, 0); +} + +static int smack_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +{ + int len = 0; + len = smack_inode_getsecurity(inode, XATTR_SMACK_SUFFIX, ctx, true); + + if (len < 0) + return len; + *ctxlen = len; + return 0; +} + struct security_operations smack_ops = { .name = "smack", - .ptrace_may_access = smack_ptrace_may_access, + .ptrace_access_check = smack_ptrace_access_check, .ptrace_traceme = smack_ptrace_traceme, .syslog = smack_syslog, @@ -3073,9 +3123,11 @@ struct security_operations smack_ops = { .file_send_sigiotask = smack_file_send_sigiotask, .file_receive = smack_file_receive, + .cred_alloc_blank = smack_cred_alloc_blank, .cred_free = smack_cred_free, .cred_prepare = smack_cred_prepare, .cred_commit = smack_cred_commit, + .cred_transfer = smack_cred_transfer, .kernel_act_as = smack_kernel_act_as, .kernel_create_files_as = smack_kernel_create_files_as, .task_setpgid = smack_task_setpgid, @@ -3155,6 +3207,9 @@ struct security_operations smack_ops = { .secid_to_secctx = smack_secid_to_secctx, .secctx_to_secid = smack_secctx_to_secid, .release_secctx = smack_release_secctx, + .inode_notifysecctx = smack_inode_notifysecctx, + .inode_setsecctx = smack_inode_setsecctx, + .inode_getsecctx = smack_inode_getsecctx, }; diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index fdd1f4b8c44..3c8bd8ee0b9 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -1285,6 +1285,36 @@ static bool tomoyo_is_select_one(struct tomoyo_io_buffer *head, } /** + * tomoyo_delete_domain - Delete a domain. + * + * @domainname: The name of domain. + * + * Returns 0. + */ +static int tomoyo_delete_domain(char *domainname) +{ + struct tomoyo_domain_info *domain; + struct tomoyo_path_info name; + + name.name = domainname; + tomoyo_fill_path_info(&name); + down_write(&tomoyo_domain_list_lock); + /* Is there an active domain? */ + list_for_each_entry(domain, &tomoyo_domain_list, list) { + /* Never delete tomoyo_kernel_domain */ + if (domain == &tomoyo_kernel_domain) + continue; + if (domain->is_deleted || + tomoyo_pathcmp(domain->domainname, &name)) + continue; + domain->is_deleted = true; + break; + } + up_write(&tomoyo_domain_list_lock); + return 0; +} + +/** * tomoyo_write_domain_policy - Write domain policy. * * @head: Pointer to "struct tomoyo_io_buffer". diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h index 6d6ba09af45..31df541911f 100644 --- a/security/tomoyo/common.h +++ b/security/tomoyo/common.h @@ -339,8 +339,6 @@ const char *tomoyo_get_last_name(const struct tomoyo_domain_info *domain); const char *tomoyo_get_msg(const bool is_enforce); /* Convert single path operation to operation name. */ const char *tomoyo_sp2keyword(const u8 operation); -/* Delete a domain. */ -int tomoyo_delete_domain(char *data); /* Create "alias" entry in exception policy. */ int tomoyo_write_alias_policy(char *data, const bool is_delete); /* diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 1d8b1696057..fcf52accce2 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -717,38 +717,6 @@ int tomoyo_write_alias_policy(char *data, const bool is_delete) return tomoyo_update_alias_entry(data, cp, is_delete); } -/* Domain create/delete handler. */ - -/** - * tomoyo_delete_domain - Delete a domain. - * - * @domainname: The name of domain. - * - * Returns 0. - */ -int tomoyo_delete_domain(char *domainname) -{ - struct tomoyo_domain_info *domain; - struct tomoyo_path_info name; - - name.name = domainname; - tomoyo_fill_path_info(&name); - down_write(&tomoyo_domain_list_lock); - /* Is there an active domain? */ - list_for_each_entry(domain, &tomoyo_domain_list, list) { - /* Never delete tomoyo_kernel_domain */ - if (domain == &tomoyo_kernel_domain) - continue; - if (domain->is_deleted || - tomoyo_pathcmp(domain->domainname, &name)) - continue; - domain->is_deleted = true; - break; - } - up_write(&tomoyo_domain_list_lock); - return 0; -} - /** * tomoyo_find_or_assign_new_domain - Create a domain. * @@ -818,13 +786,11 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char * /** * tomoyo_find_next_domain - Find a domain. * - * @bprm: Pointer to "struct linux_binprm". - * @next_domain: Pointer to pointer to "struct tomoyo_domain_info". + * @bprm: Pointer to "struct linux_binprm". * * Returns 0 on success, negative value otherwise. */ -int tomoyo_find_next_domain(struct linux_binprm *bprm, - struct tomoyo_domain_info **next_domain) +int tomoyo_find_next_domain(struct linux_binprm *bprm) { /* * This function assumes that the size of buffer returned by @@ -946,9 +912,11 @@ int tomoyo_find_next_domain(struct linux_binprm *bprm, tomoyo_set_domain_flag(old_domain, false, TOMOYO_DOMAIN_FLAGS_TRANSITION_FAILED); out: + if (!domain) + domain = old_domain; + bprm->cred->security = domain; tomoyo_free(real_program_name); tomoyo_free(symlink_program_name); - *next_domain = domain ? domain : old_domain; tomoyo_free(tmp); return retval; } diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 3194d09fe0f..9548a0984cc 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -14,6 +14,12 @@ #include "tomoyo.h" #include "realpath.h" +static int tomoyo_cred_alloc_blank(struct cred *new, gfp_t gfp) +{ + new->security = NULL; + return 0; +} + static int tomoyo_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { @@ -25,6 +31,15 @@ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old, return 0; } +static void tomoyo_cred_transfer(struct cred *new, const struct cred *old) +{ + /* + * Since "struct tomoyo_domain_info *" is a sharable pointer, + * we don't need to duplicate. + */ + new->security = old->security; +} + static int tomoyo_bprm_set_creds(struct linux_binprm *bprm) { int rc; @@ -61,14 +76,8 @@ static int tomoyo_bprm_check_security(struct linux_binprm *bprm) * Execute permission is checked against pathname passed to do_execve() * using current domain. */ - if (!domain) { - struct tomoyo_domain_info *next_domain = NULL; - int retval = tomoyo_find_next_domain(bprm, &next_domain); - - if (!retval) - bprm->cred->security = next_domain; - return retval; - } + if (!domain) + return tomoyo_find_next_domain(bprm); /* * Read permission is checked against interpreters using next domain. * '1' is the result of open_to_namei_flags(O_RDONLY). @@ -268,7 +277,9 @@ static int tomoyo_dentry_open(struct file *f, const struct cred *cred) */ static struct security_operations tomoyo_security_ops = { .name = "tomoyo", + .cred_alloc_blank = tomoyo_cred_alloc_blank, .cred_prepare = tomoyo_cred_prepare, + .cred_transfer = tomoyo_cred_transfer, .bprm_set_creds = tomoyo_bprm_set_creds, .bprm_check_security = tomoyo_bprm_check_security, #ifdef CONFIG_SYSCTL diff --git a/security/tomoyo/tomoyo.h b/security/tomoyo/tomoyo.h index 0fd588a629c..cd6ba0bf706 100644 --- a/security/tomoyo/tomoyo.h +++ b/security/tomoyo/tomoyo.h @@ -31,8 +31,7 @@ int tomoyo_check_2path_perm(struct tomoyo_domain_info *domain, struct path *path2); int tomoyo_check_rewrite_permission(struct tomoyo_domain_info *domain, struct file *filp); -int tomoyo_find_next_domain(struct linux_binprm *bprm, - struct tomoyo_domain_info **next_domain); +int tomoyo_find_next_domain(struct linux_binprm *bprm); /* Index numbers for Access Controls. */ |