diff options
258 files changed, 9472 insertions, 5203 deletions
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 03497909539..31575e220f3 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -556,3 +556,35 @@ Why: udev fully replaces this special file system that only contains CAPI NCCI TTY device nodes. User space (pppdcapiplugin) works without noticing the difference. Who: Jan Kiszka <jan.kiszka@web.de> + +---------------------------- + +What: KVM memory aliases support +When: July 2010 +Why: Memory aliasing support is used for speeding up guest vga access + through the vga windows. + + Modern userspace no longer uses this feature, so it's just bitrotted + code and can be removed with no impact. +Who: Avi Kivity <avi@redhat.com> + +---------------------------- + +What: KVM kernel-allocated memory slots +When: July 2010 +Why: Since 2.6.25, kvm supports user-allocated memory slots, which are + much more flexible than kernel-allocated slots. All current userspace + supports the newer interface and this code can be removed with no + impact. +Who: Avi Kivity <avi@redhat.com> + +---------------------------- + +What: KVM paravirt mmu host support +When: January 2011 +Why: The paravirt mmu host support is slower than non-paravirt mmu, both + on newer and older hardware. It is already not exposed to the guest, + and kept only for live migration purposes. +Who: Avi Kivity <avi@redhat.com> + +---------------------------- diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 18b9d0ca063..06bbbed7120 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -460,13 +460,6 @@ in sys_read() and friends. --------------------------- dquot_operations ------------------------------- prototypes: - int (*initialize) (struct inode *, int); - int (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); - int (*alloc_inode) (const struct inode *, unsigned long); - int (*free_space) (struct inode *, qsize_t); - int (*free_inode) (const struct inode *, unsigned long); - int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations. What filesystem should expect from the generic quota functions: FS recursion Held locks when called -initialize: yes maybe dqonoff_sem -drop: yes - -alloc_space: ->mark_dirty() - -alloc_inode: ->mark_dirty() - -free_space: ->mark_dirty() - -free_inode: ->mark_dirty() - -transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem @@ -495,10 +481,6 @@ write_info: yes dqonoff_sem FS recursion means calling ->quota_read() and ->quota_write() from superblock operations. -->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called -only directly by the filesystem and do not call any fs functions only -the ->mark_dirty() operation. - More details about quota locking can be found in fs/dquot.c. --------------------------- vm_operations_struct ----------------------------- diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 053037a1fe6..2f9115c0ae6 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -1,6 +1,7 @@ Title : Kernel Probes (Kprobes) Authors : Jim Keniston <jkenisto@us.ibm.com> - : Prasanna S Panchamukhi <prasanna@in.ibm.com> + : Prasanna S Panchamukhi <prasanna.panchamukhi@gmail.com> + : Masami Hiramatsu <mhiramat@redhat.com> CONTENTS @@ -15,6 +16,7 @@ CONTENTS 9. Jprobes Example 10. Kretprobes Example Appendix A: The kprobes debugfs interface +Appendix B: The kprobes sysctl interface 1. Concepts: Kprobes, Jprobes, Return Probes @@ -42,13 +44,13 @@ registration/unregistration of a group of *probes. These functions can speed up unregistration process when you have to unregister a lot of probes at once. -The next three subsections explain how the different types of -probes work. They explain certain things that you'll need to -know in order to make the best use of Kprobes -- e.g., the -difference between a pre_handler and a post_handler, and how -to use the maxactive and nmissed fields of a kretprobe. But -if you're in a hurry to start using Kprobes, you can skip ahead -to section 2. +The next four subsections explain how the different types of +probes work and how jump optimization works. They explain certain +things that you'll need to know in order to make the best use of +Kprobes -- e.g., the difference between a pre_handler and +a post_handler, and how to use the maxactive and nmissed fields of +a kretprobe. But if you're in a hurry to start using Kprobes, you +can skip ahead to section 2. 1.1 How Does a Kprobe Work? @@ -161,13 +163,125 @@ In case probed function is entered but there is no kretprobe_instance object available, then in addition to incrementing the nmissed count, the user entry_handler invocation is also skipped. +1.4 How Does Jump Optimization Work? + +If you configured your kernel with CONFIG_OPTPROBES=y (currently +this option is supported on x86/x86-64, non-preemptive kernel) and +the "debug.kprobes_optimization" kernel parameter is set to 1 (see +sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump +instruction instead of a breakpoint instruction at each probepoint. + +1.4.1 Init a Kprobe + +When a probe is registered, before attempting this optimization, +Kprobes inserts an ordinary, breakpoint-based kprobe at the specified +address. So, even if it's not possible to optimize this particular +probepoint, there'll be a probe there. + +1.4.2 Safety Check + +Before optimizing a probe, Kprobes performs the following safety checks: + +- Kprobes verifies that the region that will be replaced by the jump +instruction (the "optimized region") lies entirely within one function. +(A jump instruction is multiple bytes, and so may overlay multiple +instructions.) + +- Kprobes analyzes the entire function and verifies that there is no +jump into the optimized region. Specifically: + - the function contains no indirect jump; + - the function contains no instruction that causes an exception (since + the fixup code triggered by the exception could jump back into the + optimized region -- Kprobes checks the exception tables to verify this); + and + - there is no near jump to the optimized region (other than to the first + byte). + +- For each instruction in the optimized region, Kprobes verifies that +the instruction can be executed out of line. + +1.4.3 Preparing Detour Buffer + +Next, Kprobes prepares a "detour" buffer, which contains the following +instruction sequence: +- code to push the CPU's registers (emulating a breakpoint trap) +- a call to the trampoline code which calls user's probe handlers. +- code to restore registers +- the instructions from the optimized region +- a jump back to the original execution path. + +1.4.4 Pre-optimization + +After preparing the detour buffer, Kprobes verifies that none of the +following situations exist: +- The probe has either a break_handler (i.e., it's a jprobe) or a +post_handler. +- Other instructions in the optimized region are probed. +- The probe is disabled. +In any of the above cases, Kprobes won't start optimizing the probe. +Since these are temporary situations, Kprobes tries to start +optimizing it again if the situation is changed. + +If the kprobe can be optimized, Kprobes enqueues the kprobe to an +optimizing list, and kicks the kprobe-optimizer workqueue to optimize +it. If the to-be-optimized probepoint is hit before being optimized, +Kprobes returns control to the original instruction path by setting +the CPU's instruction pointer to the copied code in the detour buffer +-- thus at least avoiding the single-step. + +1.4.5 Optimization + +The Kprobe-optimizer doesn't insert the jump instruction immediately; +rather, it calls synchronize_sched() for safety first, because it's +possible for a CPU to be interrupted in the middle of executing the +optimized region(*). As you know, synchronize_sched() can ensure +that all interruptions that were active when synchronize_sched() +was called are done, but only if CONFIG_PREEMPT=n. So, this version +of kprobe optimization supports only kernels with CONFIG_PREEMPT=n.(**) + +After that, the Kprobe-optimizer calls stop_machine() to replace +the optimized region with a jump instruction to the detour buffer, +using text_poke_smp(). + +1.4.6 Unoptimization + +When an optimized kprobe is unregistered, disabled, or blocked by +another kprobe, it will be unoptimized. If this happens before +the optimization is complete, the kprobe is just dequeued from the +optimized list. If the optimization has been done, the jump is +replaced with the original code (except for an int3 breakpoint in +the first byte) by using text_poke_smp(). + +(*)Please imagine that the 2nd instruction is interrupted and then +the optimizer replaces the 2nd instruction with the jump *address* +while the interrupt handler is running. When the interrupt +returns to original address, there is no valid instruction, +and it causes an unexpected result. + +(**)This optimization-safety checking may be replaced with the +stop-machine method that ksplice uses for supporting a CONFIG_PREEMPT=y +kernel. + +NOTE for geeks: +The jump optimization changes the kprobe's pre_handler behavior. +Without optimization, the pre_handler can change the kernel's execution +path by changing regs->ip and returning 1. However, when the probe +is optimized, that modification is ignored. Thus, if you want to +tweak the kernel's execution path, you need to suppress optimization, +using one of the following techniques: +- Specify an empty function for the kprobe's post_handler or break_handler. + or +- Config CONFIG_OPTPROBES=n. + or +- Execute 'sysctl -w debug.kprobes_optimization=n' + 2. Architectures Supported Kprobes, jprobes, and return probes are implemented on the following architectures: -- i386 -- x86_64 (AMD-64, EM64T) +- i386 (Supports jump optimization) +- x86_64 (AMD-64, EM64T) (Supports jump optimization) - ppc64 - ia64 (Does not support probes on instruction slot1.) - sparc64 (Return probes not yet implemented.) @@ -193,6 +307,10 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO), so you can use "objdump -d -l vmlinux" to see the source-to-object code mapping. +If you want to reduce probing overhead, set "Kprobes jump optimization +support" (CONFIG_OPTPROBES) to "y". You can find this option under the +"Kprobes" line. + 4. API Reference The Kprobes API includes a "register" function and an "unregister" @@ -389,7 +507,10 @@ the probe which has been registered. Kprobes allows multiple probes at the same address. Currently, however, there cannot be multiple jprobes on the same function at -the same time. +the same time. Also, a probepoint for which there is a jprobe or +a post_handler cannot be optimized. So if you install a jprobe, +or a kprobe with a post_handler, at an optimized probepoint, the +probepoint will be unoptimized automatically. In general, you can install a probe anywhere in the kernel. In particular, you can probe interrupt handlers. Known exceptions @@ -453,6 +574,38 @@ reason, Kprobes doesn't support return probes (or kprobes or jprobes) on the x86_64 version of __switch_to(); the registration functions return -EINVAL. +On x86/x86-64, since the Jump Optimization of Kprobes modifies +instructions widely, there are some limitations to optimization. To +explain it, we introduce some terminology. Imagine a 3-instruction +sequence consisting of a two 2-byte instructions and one 3-byte +instruction. + + IA + | +[-2][-1][0][1][2][3][4][5][6][7] + [ins1][ins2][ ins3 ] + [<- DCR ->] + [<- JTPR ->] + +ins1: 1st Instruction +ins2: 2nd Instruction +ins3: 3rd Instruction +IA: Insertion Address +JTPR: Jump Target Prohibition Region +DCR: Detoured Code Region + +The instructions in DCR are copied to the out-of-line buffer +of the kprobe, because the bytes in DCR are replaced by +a 5-byte jump instruction. So there are several limitations. + +a) The instructions in DCR must be relocatable. +b) The instructions in DCR must not include a call instruction. +c) JTPR must not be targeted by any jump or call instruction. +d) DCR must not straddle the border betweeen functions. + +Anyway, these limitations are checked by the in-kernel instruction +decoder, so you don't need to worry about that. + 6. Probe Overhead On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0 @@ -476,6 +629,19 @@ k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07 ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU) k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99 +6.1 Optimized Probe Overhead + +Typically, an optimized kprobe hit takes 0.07 to 0.1 microseconds to +process. Here are sample overhead figures (in usec) for x86 architectures. +k = unoptimized kprobe, b = boosted (single-step skipped), o = optimized kprobe, +r = unoptimized kretprobe, rb = boosted kretprobe, ro = optimized kretprobe. + +i386: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips +k = 0.80 usec; b = 0.33; o = 0.05; r = 1.10; rb = 0.61; ro = 0.33 + +x86-64: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips +k = 0.99 usec; b = 0.43; o = 0.06; r = 1.24; rb = 0.68; ro = 0.30 + 7. TODO a. SystemTap (http://sourceware.org/systemtap): Provides a simplified @@ -523,7 +689,8 @@ is also specified. Following columns show probe status. If the probe is on a virtual address that is no longer valid (module init sections, module virtual addresses that correspond to modules that've been unloaded), such probes are marked with [GONE]. If the probe is temporarily disabled, -such probes are marked with [DISABLED]. +such probes are marked with [DISABLED]. If the probe is optimized, it is +marked with [OPTIMIZED]. /sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. @@ -533,3 +700,19 @@ registered probes will be disarmed, till such time a "1" is echoed to this file. Note that this knob just disarms and arms all kprobes and doesn't change each probe's disabling state. This means that disabled kprobes (marked [DISABLED]) will be not enabled if you turn ON all kprobes by this knob. + + +Appendix B: The kprobes sysctl interface + +/proc/sys/debug/kprobes-optimization: Turn kprobes optimization ON/OFF. + +When CONFIG_OPTPROBES=y, this sysctl interface appears and it provides +a knob to globally and forcibly turn jump optimization (see section +1.4) ON or OFF. By default, jump optimization is allowed (ON). +If you echo "0" to this file or set "debug.kprobes_optimization" to +0 via sysctl, all optimized probes will be unoptimized, and any new +probes registered after that will not be optimized. Note that this +knob *changes* the optimized state. This means that optimized probes +(marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be +removed). If the knob is turned on, they will be optimized again. + diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index 2811e452f75..c6416a39816 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt @@ -23,12 +23,12 @@ of a virtual machine. The ioctls belong to three classes Only run vcpu ioctls from the same thread that was used to create the vcpu. -2. File descritpors +2. File descriptors The kvm API is centered around file descriptors. An initial open("/dev/kvm") obtains a handle to the kvm subsystem; this handle can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this -handle will create a VM file descripror which can be used to issue VM +handle will create a VM file descriptor which can be used to issue VM ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu and return a file descriptor pointing to it. Finally, ioctls on a vcpu fd can be used to control the vcpu, including the important task of @@ -643,7 +643,7 @@ Type: vm ioctl Parameters: struct kvm_clock_data (in) Returns: 0 on success, -1 on error -Sets the current timestamp of kvmclock to the valued specific in its parameter. +Sets the current timestamp of kvmclock to the value specified in its parameter. In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios such as migration. @@ -795,11 +795,11 @@ Unused. __u64 data_offset; /* relative to kvm_run start */ } io; -If exit_reason is KVM_EXIT_IO_IN or KVM_EXIT_IO_OUT, then the vcpu has +If exit_reason is KVM_EXIT_IO, then the vcpu has executed a port I/O instruction which could not be satisfied by kvm. data_offset describes where the data is located (KVM_EXIT_IO_OUT) or where kvm expects application code to place the data for the next -KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a patcked array. +KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. struct { struct kvm_debug_exit_arch arch; @@ -815,7 +815,7 @@ Unused. __u8 is_write; } mmio; -If exit_reason is KVM_EXIT_MMIO or KVM_EXIT_IO_OUT, then the vcpu has +If exit_reason is KVM_EXIT_MMIO, then the vcpu has executed a memory-mapped I/O instruction which could not be satisfied by kvm. The 'data' member contains the written data if 'is_write' is true, and should be filled by application code otherwise. diff --git a/MAINTAINERS b/MAINTAINERS index c6591bca646..51d8b5221dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3173,7 +3173,7 @@ F: arch/x86/include/asm/svm.h F: arch/x86/kvm/svm.c KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC -M: Hollis Blanchard <hollisb@us.ibm.com> +M: Alexander Graf <agraf@suse.de> L: kvm-ppc@vger.kernel.org W: http://kvm.qumranet.com S: Supported diff --git a/arch/Kconfig b/arch/Kconfig index 215e46073c4..e5eb1337a53 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -41,6 +41,17 @@ config KPROBES for kernel debugging, non-intrusive instrumentation and testing. If in doubt, say "N". +config OPTPROBES + bool "Kprobes jump optimization support (EXPERIMENTAL)" + default y + depends on KPROBES + depends on !PREEMPT + depends on HAVE_OPTPROBES + select KALLSYMS_ALL + help + This option will allow kprobes to optimize breakpoint to + a jump for reducing its overhead. + config HAVE_EFFICIENT_UNALIGNED_ACCESS bool help @@ -83,6 +94,8 @@ config HAVE_KPROBES config HAVE_KRETPROBES bool +config HAVE_OPTPROBES + bool # # An arch should select this if it provides all these things: # diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 01c75797119..fa4d1e59deb 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -26,6 +26,7 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select KVM_APIC_ARCHITECTURE + select KVM_MMIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 5fdeec5fddc..26e0e089bfe 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -241,10 +241,10 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; mmio: if (p->dir) - r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr, + r = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, p->addr, p->size, &p->data); else - r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr, + r = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, p->addr, p->size, &p->data); if (r) printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr); @@ -636,12 +636,9 @@ static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu) static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { union context *host_ctx, *guest_ctx; - int r; + int r, idx; - /* - * down_read() may sleep and return with interrupts enabled - */ - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); again: if (signal_pending(current)) { @@ -663,7 +660,7 @@ again: if (r < 0) goto vcpu_run_fail; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); kvm_guest_enter(); /* @@ -687,7 +684,7 @@ again: kvm_guest_exit(); preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_handle_exit(kvm_run, vcpu); @@ -697,10 +694,10 @@ again: } out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); if (r > 0) { kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); goto again; } @@ -971,7 +968,7 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = kvm_setup_default_irq_routing(kvm); if (r) { - kfree(kvm->arch.vioapic); + kvm_ioapic_destroy(kvm); goto out; } break; @@ -1377,12 +1374,14 @@ static void free_kvm(struct kvm *kvm) static void kvm_release_vm_pages(struct kvm *kvm) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i, j; unsigned long base_gfn; - for (i = 0; i < kvm->nmemslots; i++) { - memslot = &kvm->memslots[i]; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) { + memslot = &slots->memslots[i]; base_gfn = memslot->base_gfn; for (j = 0; j < memslot->npages; j++) { @@ -1405,6 +1404,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vioapic); kvm_release_vm_pages(kvm); kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); free_kvm(kvm); } @@ -1576,15 +1576,15 @@ out: return r; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { unsigned long i; unsigned long pfn; - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; unsigned long base_gfn = memslot->base_gfn; if (base_gfn + npages > (KVM_MAX_MEM_SIZE >> PAGE_SHIFT)) @@ -1608,6 +1608,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm, return 0; } +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return; +} + void kvm_arch_flush_shadow(struct kvm *kvm) { kvm_flush_remote_tlbs(kvm); @@ -1802,7 +1810,7 @@ static int kvm_ia64_sync_dirty_log(struct kvm *kvm, if (log->slot >= KVM_MEMORY_SLOTS) goto out; - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; r = -ENOENT; if (!memslot->dirty_bitmap) goto out; @@ -1827,6 +1835,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; int is_dirty = 0; + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->arch.dirty_log_lock); r = kvm_ia64_sync_dirty_log(kvm, log); @@ -1840,12 +1849,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { kvm_flush_remote_tlbs(kvm); - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; memset(memslot->dirty_bitmap, 0, n); } r = 0; out: + mutex_unlock(&kvm->slots_lock); spin_unlock(&kvm->arch.dirty_log_lock); return r; } diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c index e4b82319881..cb548ee9fca 100644 --- a/arch/ia64/kvm/kvm_fw.c +++ b/arch/ia64/kvm/kvm_fw.c @@ -75,7 +75,7 @@ static void set_pal_result(struct kvm_vcpu *vcpu, struct exit_ctl_data *p; p = kvm_get_exit_data(vcpu); - if (p && p->exit_reason == EXIT_REASON_PAL_CALL) { + if (p->exit_reason == EXIT_REASON_PAL_CALL) { p->u.pal_data.ret = result; return ; } @@ -87,7 +87,7 @@ static void set_sal_result(struct kvm_vcpu *vcpu, struct exit_ctl_data *p; p = kvm_get_exit_data(vcpu); - if (p && p->exit_reason == EXIT_REASON_SAL_CALL) { + if (p->exit_reason == EXIT_REASON_SAL_CALL) { p->u.sal_data.ret = result; return ; } @@ -322,7 +322,7 @@ static u64 kvm_get_pal_call_index(struct kvm_vcpu *vcpu) struct exit_ctl_data *p; p = kvm_get_exit_data(vcpu); - if (p && (p->exit_reason == EXIT_REASON_PAL_CALL)) + if (p->exit_reason == EXIT_REASON_PAL_CALL) index = p->u.pal_data.gr28; return index; @@ -646,18 +646,16 @@ static void kvm_get_sal_call_data(struct kvm_vcpu *vcpu, u64 *in0, u64 *in1, p = kvm_get_exit_data(vcpu); - if (p) { - if (p->exit_reason == EXIT_REASON_SAL_CALL) { - *in0 = p->u.sal_data.in0; - *in1 = p->u.sal_data.in1; - *in2 = p->u.sal_data.in2; - *in3 = p->u.sal_data.in3; - *in4 = p->u.sal_data.in4; - *in5 = p->u.sal_data.in5; - *in6 = p->u.sal_data.in6; - *in7 = p->u.sal_data.in7; - return ; - } + if (p->exit_reason == EXIT_REASON_SAL_CALL) { + *in0 = p->u.sal_data.in0; + *in1 = p->u.sal_data.in1; + *in2 = p->u.sal_data.in2; + *in3 = p->u.sal_data.in3; + *in4 = p->u.sal_data.in4; + *in5 = p->u.sal_data.in5; + *in6 = p->u.sal_data.in6; + *in7 = p->u.sal_data.in7; + return ; } *in0 = 0; } diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c index 9bf55afd08d..fb8f9f59a1e 100644 --- a/arch/ia64/kvm/mmio.c +++ b/arch/ia64/kvm/mmio.c @@ -316,8 +316,8 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma) return; } else { inst_type = -1; - panic_vm(vcpu, "Unsupported MMIO access instruction! \ - Bunld[0]=0x%lx, Bundle[1]=0x%lx\n", + panic_vm(vcpu, "Unsupported MMIO access instruction! " + "Bunld[0]=0x%lx, Bundle[1]=0x%lx\n", bundle.i64[0], bundle.i64[1]); } diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c index dce75b70cdd..958815c9787 100644 --- a/arch/ia64/kvm/vcpu.c +++ b/arch/ia64/kvm/vcpu.c @@ -1639,8 +1639,8 @@ void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val) * Otherwise panic */ if (val & (IA64_PSR_PK | IA64_PSR_IS | IA64_PSR_VM)) - panic_vm(vcpu, "Only support guests with vpsr.pk =0 \ - & vpsr.is=0\n"); + panic_vm(vcpu, "Only support guests with vpsr.pk =0 " + "& vpsr.is=0\n"); /* * For those IA64_PSR bits: id/da/dd/ss/ed/ia diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index af2abe74f54..aadf2dd6f84 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -97,4 +97,10 @@ #define RESUME_HOST RESUME_FLAG_HOST #define RESUME_HOST_NV (RESUME_FLAG_HOST|RESUME_FLAG_NV) +#define KVM_GUEST_MODE_NONE 0 +#define KVM_GUEST_MODE_GUEST 1 +#define KVM_GUEST_MODE_SKIP 2 + +#define KVM_INST_FETCH_FAILED -1 + #endif /* __POWERPC_KVM_ASM_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 74b7369770d..db7db0a9696 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -22,7 +22,7 @@ #include <linux/types.h> #include <linux/kvm_host.h> -#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s_64_asm.h> struct kvmppc_slb { u64 esid; @@ -33,7 +33,8 @@ struct kvmppc_slb { bool Ks; bool Kp; bool nx; - bool large; + bool large; /* PTEs are 16MB */ + bool tb; /* 1TB segment */ bool class; }; @@ -69,6 +70,7 @@ struct kvmppc_sid_map { struct kvmppc_vcpu_book3s { struct kvm_vcpu vcpu; + struct kvmppc_book3s_shadow_vcpu shadow_vcpu; struct kvmppc_sid_map sid_map[SID_MAP_NUM]; struct kvmppc_slb slb[64]; struct { @@ -89,6 +91,7 @@ struct kvmppc_vcpu_book3s { u64 vsid_next; u64 vsid_max; int context_id; + ulong prog_flags; /* flags to inject when giving a 700 trap */ }; #define CONTEXT_HOST 0 @@ -119,6 +122,10 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, extern u32 kvmppc_trampoline_lowmem; extern u32 kvmppc_trampoline_enter; +extern void kvmppc_rmcall(ulong srr0, ulong srr1); +extern void kvmppc_load_up_fpu(void); +extern void kvmppc_load_up_altivec(void); +extern void kvmppc_load_up_vsx(void); static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/include/asm/kvm_book3s_64_asm.h b/arch/powerpc/include/asm/kvm_book3s_64_asm.h index 2e06ee8184e..183461b4840 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_64_asm.h @@ -20,6 +20,8 @@ #ifndef __ASM_KVM_BOOK3S_ASM_H__ #define __ASM_KVM_BOOK3S_ASM_H__ +#ifdef __ASSEMBLY__ + #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #include <asm/kvm_asm.h> @@ -55,4 +57,20 @@ kvmppc_resume_\intno: #endif /* CONFIG_KVM_BOOK3S_64_HANDLER */ +#else /*__ASSEMBLY__ */ + +struct kvmppc_book3s_shadow_vcpu { + ulong gpr[14]; + u32 cr; + u32 xer; + ulong host_r1; + ulong host_r2; + ulong handler; + ulong scratch0; + ulong scratch1; + ulong vmhandler; +}; + +#endif /*__ASSEMBLY__ */ + #endif /* __ASM_KVM_BOOK3S_ASM_H__ */ diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index 9d497ce4972..7fea26fffb2 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -52,9 +52,12 @@ struct kvmppc_vcpu_e500 { u32 mas5; u32 mas6; u32 mas7; + u32 l1csr0; u32 l1csr1; u32 hid0; u32 hid1; + u32 tlb0cfg; + u32 tlb1cfg; struct kvm_vcpu vcpu; }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1201f62d0d7..5e5bae7e152 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -167,23 +167,40 @@ struct kvm_vcpu_arch { ulong trampoline_lowmem; ulong trampoline_enter; ulong highmem_handler; + ulong rmcall; ulong host_paca_phys; struct kvmppc_mmu mmu; #endif - u64 fpr[32]; ulong gpr[32]; + u64 fpr[32]; + u32 fpscr; + +#ifdef CONFIG_ALTIVEC + vector128 vr[32]; + vector128 vscr; +#endif + +#ifdef CONFIG_VSX + u64 vsr[32]; +#endif + ulong pc; - u32 cr; ulong ctr; ulong lr; + +#ifdef CONFIG_BOOKE ulong xer; + u32 cr; +#endif ulong msr; #ifdef CONFIG_PPC64 ulong shadow_msr; + ulong shadow_srr1; ulong hflags; + ulong guest_owned_ext; #endif u32 mmucr; ulong sprg0; @@ -242,6 +259,8 @@ struct kvm_vcpu_arch { #endif ulong fault_dear; ulong fault_esr; + ulong queued_dear; + ulong queued_esr; gpa_t paddr_accessed; u8 io_gpr; /* GPR used as IO source/target */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 269ee46ab02..e2642829e43 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -28,6 +28,9 @@ #include <linux/types.h> #include <linux/kvm_types.h> #include <linux/kvm_host.h> +#ifdef CONFIG_PPC_BOOK3S +#include <asm/kvm_book3s.h> +#endif enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -80,8 +83,9 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); -extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); +extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); @@ -95,4 +99,81 @@ extern void kvmppc_booke_exit(void); extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); +#ifdef CONFIG_PPC_BOOK3S + +/* We assume we're always acting on the current vcpu */ + +static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) +{ + if ( num < 14 ) { + get_paca()->shadow_vcpu.gpr[num] = val; + to_book3s(vcpu)->shadow_vcpu.gpr[num] = val; + } else + vcpu->arch.gpr[num] = val; +} + +static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) +{ + if ( num < 14 ) + return get_paca()->shadow_vcpu.gpr[num]; + else + return vcpu->arch.gpr[num]; +} + +static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) +{ + get_paca()->shadow_vcpu.cr = val; + to_book3s(vcpu)->shadow_vcpu.cr = val; +} + +static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) +{ + return get_paca()->shadow_vcpu.cr; +} + +static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) +{ + get_paca()->shadow_vcpu.xer = val; + to_book3s(vcpu)->shadow_vcpu.xer = val; +} + +static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) +{ + return get_paca()->shadow_vcpu.xer; +} + +#else + +static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) +{ + vcpu->arch.gpr[num] = val; +} + +static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) +{ + return vcpu->arch.gpr[num]; +} + +static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) +{ + vcpu->arch.cr = val; +} + +static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr; +} + +static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) +{ + vcpu->arch.xer = val; +} + +static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.xer; +} + +#endif + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 5e9b4ef7141..d8a693109c8 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -19,6 +19,9 @@ #include <asm/mmu.h> #include <asm/page.h> #include <asm/exception-64e.h> +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +#include <asm/kvm_book3s_64_asm.h> +#endif register struct paca_struct *local_paca asm("r13"); @@ -135,6 +138,8 @@ struct paca_struct { u64 esid; u64 vsid; } kvm_slb[64]; /* guest SLB */ + /* We use this to store guest state in */ + struct kvmppc_book3s_shadow_vcpu shadow_vcpu; u8 kvm_slb_max; /* highest used guest slb entry */ u8 kvm_in_guest; /* are we inside the guest? */ #endif diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index bc8dd53f718..5572e86223f 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -426,6 +426,10 @@ #define SRR1_WAKEMT 0x00280000 /* mtctrl */ #define SRR1_WAKEDEC 0x00180000 /* Decrementer interrupt */ #define SRR1_WAKETHERM 0x00100000 /* Thermal management interrupt */ +#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */ +#define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */ +#define SRR1_PROGTRAP 0x00020000 /* Trap */ +#define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */ #define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */ #define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index a6c2b63227b..957ceb7059c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -194,6 +194,30 @@ int main(void) DEFINE(PACA_KVM_IN_GUEST, offsetof(struct paca_struct, kvm_in_guest)); DEFINE(PACA_KVM_SLB, offsetof(struct paca_struct, kvm_slb)); DEFINE(PACA_KVM_SLB_MAX, offsetof(struct paca_struct, kvm_slb_max)); + DEFINE(PACA_KVM_CR, offsetof(struct paca_struct, shadow_vcpu.cr)); + DEFINE(PACA_KVM_XER, offsetof(struct paca_struct, shadow_vcpu.xer)); + DEFINE(PACA_KVM_R0, offsetof(struct paca_struct, shadow_vcpu.gpr[0])); + DEFINE(PACA_KVM_R1, offsetof(struct paca_struct, shadow_vcpu.gpr[1])); + DEFINE(PACA_KVM_R2, offsetof(struct paca_struct, shadow_vcpu.gpr[2])); + DEFINE(PACA_KVM_R3, offsetof(struct paca_struct, shadow_vcpu.gpr[3])); + DEFINE(PACA_KVM_R4, offsetof(struct paca_struct, shadow_vcpu.gpr[4])); + DEFINE(PACA_KVM_R5, offsetof(struct paca_struct, shadow_vcpu.gpr[5])); + DEFINE(PACA_KVM_R6, offsetof(struct paca_struct, shadow_vcpu.gpr[6])); + DEFINE(PACA_KVM_R7, offsetof(struct paca_struct, shadow_vcpu.gpr[7])); + DEFINE(PACA_KVM_R8, offsetof(struct paca_struct, shadow_vcpu.gpr[8])); + DEFINE(PACA_KVM_R9, offsetof(struct paca_struct, shadow_vcpu.gpr[9])); + DEFINE(PACA_KVM_R10, offsetof(struct paca_struct, shadow_vcpu.gpr[10])); + DEFINE(PACA_KVM_R11, offsetof(struct paca_struct, shadow_vcpu.gpr[11])); + DEFINE(PACA_KVM_R12, offsetof(struct paca_struct, shadow_vcpu.gpr[12])); + DEFINE(PACA_KVM_R13, offsetof(struct paca_struct, shadow_vcpu.gpr[13])); + DEFINE(PACA_KVM_HOST_R1, offsetof(struct paca_struct, shadow_vcpu.host_r1)); + DEFINE(PACA_KVM_HOST_R2, offsetof(struct paca_struct, shadow_vcpu.host_r2)); + DEFINE(PACA_KVM_VMHANDLER, offsetof(struct paca_struct, + shadow_vcpu.vmhandler)); + DEFINE(PACA_KVM_SCRATCH0, offsetof(struct paca_struct, + shadow_vcpu.scratch0)); + DEFINE(PACA_KVM_SCRATCH1, offsetof(struct paca_struct, + shadow_vcpu.scratch1)); #endif #endif /* CONFIG_PPC64 */ @@ -389,8 +413,6 @@ int main(void) DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); - DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); - DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr)); DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc)); DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr)); @@ -411,11 +433,16 @@ int main(void) DEFINE(VCPU_HOST_R2, offsetof(struct kvm_vcpu, arch.host_r2)); DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr)); DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); + DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem)); DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter)); DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler)); + DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall)); DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); -#endif +#else + DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); + DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); +#endif /* CONFIG_PPC64 */ #endif #ifdef CONFIG_44x DEFINE(PGD_T_LOG2, PGD_T_LOG2); diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 425451453e9..ab3e392ac63 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -107,6 +107,7 @@ EXPORT_SYMBOL(giveup_altivec); #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX EXPORT_SYMBOL(giveup_vsx); +EXPORT_SYMBOL_GPL(__giveup_vsx); #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE EXPORT_SYMBOL(giveup_spe); diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 61af58fcece..65ea083a5b2 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -65,13 +65,14 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, */ switch (dcrn) { case DCRN_CPR0_CONFIG_ADDR: - vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); break; case DCRN_CPR0_CONFIG_DATA: local_irq_disable(); mtdcr(DCRN_CPR0_CONFIG_ADDR, vcpu->arch.cpr0_cfgaddr); - vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA); + kvmppc_set_gpr(vcpu, rt, + mfdcr(DCRN_CPR0_CONFIG_DATA)); local_irq_enable(); break; default: @@ -93,11 +94,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, /* emulate some access in kernel */ switch (dcrn) { case DCRN_CPR0_CONFIG_ADDR: - vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs]; + vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); break; default: run->dcr.dcrn = dcrn; - run->dcr.data = vcpu->arch.gpr[rs]; + run->dcr.data = kvmppc_get_gpr(vcpu, rs); run->dcr.is_write = 1; vcpu->arch.dcr_needed = 1; kvmppc_account_exit(vcpu, DCR_EXITS); @@ -146,13 +147,13 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) switch (sprn) { case SPRN_PID: - kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break; + kvmppc_set_pid(vcpu, kvmppc_get_gpr(vcpu, rs)); break; case SPRN_MMUCR: - vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break; + vcpu->arch.mmucr = kvmppc_get_gpr(vcpu, rs); break; case SPRN_CCR0: - vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.ccr0 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_CCR1: - vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.ccr1 = kvmppc_get_gpr(vcpu, rs); break; default: emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); } @@ -167,13 +168,13 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_PID: - vcpu->arch.gpr[rt] = vcpu->arch.pid; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.pid); break; case SPRN_MMUCR: - vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucr); break; case SPRN_CCR0: - vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr0); break; case SPRN_CCR1: - vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr1); break; default: emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); } diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index ff3cb63b811..2570fcc7665 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -439,7 +439,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) struct kvmppc_44x_tlbe *tlbe; unsigned int gtlb_index; - gtlb_index = vcpu->arch.gpr[ra]; + gtlb_index = kvmppc_get_gpr(vcpu, ra); if (gtlb_index > KVM44x_GUEST_TLB_SIZE) { printk("%s: index %d\n", __func__, gtlb_index); kvmppc_dump_vcpu(vcpu); @@ -455,15 +455,15 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws) switch (ws) { case PPC44x_TLB_PAGEID: tlbe->tid = get_mmucr_stid(vcpu); - tlbe->word0 = vcpu->arch.gpr[rs]; + tlbe->word0 = kvmppc_get_gpr(vcpu, rs); break; case PPC44x_TLB_XLAT: - tlbe->word1 = vcpu->arch.gpr[rs]; + tlbe->word1 = kvmppc_get_gpr(vcpu, rs); break; case PPC44x_TLB_ATTRIB: - tlbe->word2 = vcpu->arch.gpr[rs]; + tlbe->word2 = kvmppc_get_gpr(vcpu, rs); break; default: @@ -500,18 +500,20 @@ int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc) unsigned int as = get_mmucr_sts(vcpu); unsigned int pid = get_mmucr_stid(vcpu); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); gtlb_index = kvmppc_44x_tlb_index(vcpu, ea, pid, as); if (rc) { + u32 cr = kvmppc_get_cr(vcpu); + if (gtlb_index < 0) - vcpu->arch.cr &= ~0x20000000; + kvmppc_set_cr(vcpu, cr & ~0x20000000); else - vcpu->arch.cr |= 0x20000000; + kvmppc_set_cr(vcpu, cr | 0x20000000); } - vcpu->arch.gpr[rt] = gtlb_index; + kvmppc_set_gpr(vcpu, rt, gtlb_index); kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); return EMULATE_DONE; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index fe037fdaf1b..60624cc9f4d 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM bool select PREEMPT_NOTIFIERS select ANON_INODES + select KVM_MMIO config KVM_BOOK3S_64_HANDLER bool diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 3e294bd9b8c..9a271f0929c 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -33,12 +33,9 @@ /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ +/* #define DEBUG_EXT */ -/* Without AGGRESSIVE_DEC we only fire off a DEC interrupt when DEC turns 0. - * When set, we retrigger a DEC interrupt after that if DEC <= 0. - * PPC32 Linux runs faster without AGGRESSIVE_DEC, PPC64 Linux requires it. */ - -/* #define AGGRESSIVE_DEC */ +static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); struct kvm_stats_debugfs_item debugfs_entries[] = { { "exits", VCPU_STAT(sum_exits) }, @@ -72,16 +69,24 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { memcpy(get_paca()->kvm_slb, to_book3s(vcpu)->slb_shadow, sizeof(get_paca()->kvm_slb)); + memcpy(&get_paca()->shadow_vcpu, &to_book3s(vcpu)->shadow_vcpu, + sizeof(get_paca()->shadow_vcpu)); get_paca()->kvm_slb_max = to_book3s(vcpu)->slb_shadow_max; } void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { memcpy(to_book3s(vcpu)->slb_shadow, get_paca()->kvm_slb, sizeof(get_paca()->kvm_slb)); + memcpy(&to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, + sizeof(get_paca()->shadow_vcpu)); to_book3s(vcpu)->slb_shadow_max = get_paca()->kvm_slb_max; + + kvmppc_giveup_ext(vcpu, MSR_FP); + kvmppc_giveup_ext(vcpu, MSR_VEC); + kvmppc_giveup_ext(vcpu, MSR_VSX); } -#if defined(AGGRESSIVE_DEC) || defined(EXIT_DEBUG) +#if defined(EXIT_DEBUG) static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu) { u64 jd = mftb() - vcpu->arch.dec_jiffies; @@ -89,6 +94,23 @@ static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu) } #endif +static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.shadow_msr = vcpu->arch.msr; + /* Guest MSR values */ + vcpu->arch.shadow_msr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | + MSR_BE | MSR_DE; + /* Process MSR values */ + vcpu->arch.shadow_msr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | + MSR_EE; + /* External providers the guest reserved */ + vcpu->arch.shadow_msr |= (vcpu->arch.msr & vcpu->arch.guest_owned_ext); + /* 64-bit Process MSR values */ +#ifdef CONFIG_PPC_BOOK3S_64 + vcpu->arch.shadow_msr |= MSR_ISF | MSR_HV; +#endif +} + void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) { ulong old_msr = vcpu->arch.msr; @@ -96,12 +118,10 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) #ifdef EXIT_DEBUG printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); #endif + msr &= to_book3s(vcpu)->msr_mask; vcpu->arch.msr = msr; - vcpu->arch.shadow_msr = msr | MSR_USER32; - vcpu->arch.shadow_msr &= ( MSR_VEC | MSR_VSX | MSR_FP | MSR_FE0 | - MSR_USER64 | MSR_SE | MSR_BE | MSR_DE | - MSR_FE1); + kvmppc_recalc_shadow_msr(vcpu); if (msr & (MSR_WE|MSR_POW)) { if (!vcpu->arch.pending_exceptions) { @@ -125,11 +145,10 @@ void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) vcpu->arch.mmu.reset_msr(vcpu); } -void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) +static int kvmppc_book3s_vec2irqprio(unsigned int vec) { unsigned int prio; - vcpu->stat.queue_intr++; switch (vec) { case 0x100: prio = BOOK3S_IRQPRIO_SYSTEM_RESET; break; case 0x200: prio = BOOK3S_IRQPRIO_MACHINE_CHECK; break; @@ -149,15 +168,31 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) default: prio = BOOK3S_IRQPRIO_MAX; break; } - set_bit(prio, &vcpu->arch.pending_exceptions); + return prio; +} + +static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, + unsigned int vec) +{ + clear_bit(kvmppc_book3s_vec2irqprio(vec), + &vcpu->arch.pending_exceptions); +} + +void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) +{ + vcpu->stat.queue_intr++; + + set_bit(kvmppc_book3s_vec2irqprio(vec), + &vcpu->arch.pending_exceptions); #ifdef EXIT_DEBUG printk(KERN_INFO "Queueing interrupt %x\n", vec); #endif } -void kvmppc_core_queue_program(struct kvm_vcpu *vcpu) +void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { + to_book3s(vcpu)->prog_flags = flags; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM); } @@ -171,6 +206,11 @@ int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) return test_bit(BOOK3S_INTERRUPT_DECREMENTER >> 7, &vcpu->arch.pending_exceptions); } +void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) +{ + kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); +} + void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { @@ -181,6 +221,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) { int deliver = 1; int vec = 0; + ulong flags = 0ULL; switch (priority) { case BOOK3S_IRQPRIO_DECREMENTER: @@ -214,6 +255,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) break; case BOOK3S_IRQPRIO_PROGRAM: vec = BOOK3S_INTERRUPT_PROGRAM; + flags = to_book3s(vcpu)->prog_flags; break; case BOOK3S_IRQPRIO_VSX: vec = BOOK3S_INTERRUPT_VSX; @@ -244,7 +286,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) #endif if (deliver) - kvmppc_inject_interrupt(vcpu, vec, 0ULL); + kvmppc_inject_interrupt(vcpu, vec, flags); return deliver; } @@ -254,21 +296,15 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned int priority; - /* XXX be more clever here - no need to mftb() on every entry */ - /* Issue DEC again if it's still active */ -#ifdef AGGRESSIVE_DEC - if (vcpu->arch.msr & MSR_EE) - if (kvmppc_get_dec(vcpu) & 0x80000000) - kvmppc_core_queue_dec(vcpu); -#endif - #ifdef EXIT_DEBUG if (vcpu->arch.pending_exceptions) printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions); #endif priority = __ffs(*pending); while (priority <= (sizeof(unsigned int) * 8)) { - if (kvmppc_book3s_irqprio_deliver(vcpu, priority)) { + if (kvmppc_book3s_irqprio_deliver(vcpu, priority) && + (priority != BOOK3S_IRQPRIO_DECREMENTER)) { + /* DEC interrupts get cleared by mtdec */ clear_bit(priority, &vcpu->arch.pending_exceptions); break; } @@ -503,14 +539,14 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Page not found in guest PTE entries */ vcpu->arch.dear = vcpu->arch.fault_dear; to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr; - vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL); + vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ vcpu->arch.dear = vcpu->arch.fault_dear; to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE; to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT; - vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL); + vcpu->arch.msr |= (vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ @@ -532,13 +568,122 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, r = kvmppc_emulate_mmio(run, vcpu); if ( r == RESUME_HOST_NV ) r = RESUME_HOST; - if ( r == RESUME_GUEST_NV ) - r = RESUME_GUEST; } return r; } +static inline int get_fpr_index(int i) +{ +#ifdef CONFIG_VSX + i *= 2; +#endif + return i; +} + +/* Give up external provider (FPU, Altivec, VSX) */ +static void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) +{ + struct thread_struct *t = ¤t->thread; + u64 *vcpu_fpr = vcpu->arch.fpr; + u64 *vcpu_vsx = vcpu->arch.vsr; + u64 *thread_fpr = (u64*)t->fpr; + int i; + + if (!(vcpu->arch.guest_owned_ext & msr)) + return; + +#ifdef DEBUG_EXT + printk(KERN_INFO "Giving up ext 0x%lx\n", msr); +#endif + + switch (msr) { + case MSR_FP: + giveup_fpu(current); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) + vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; + + vcpu->arch.fpscr = t->fpscr.val; + break; + case MSR_VEC: +#ifdef CONFIG_ALTIVEC + giveup_altivec(current); + memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); + vcpu->arch.vscr = t->vscr; +#endif + break; + case MSR_VSX: +#ifdef CONFIG_VSX + __giveup_vsx(current); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) + vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; +#endif + break; + default: + BUG(); + } + + vcpu->arch.guest_owned_ext &= ~msr; + current->thread.regs->msr &= ~msr; + kvmppc_recalc_shadow_msr(vcpu); +} + +/* Handle external providers (FPU, Altivec, VSX) */ +static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, + ulong msr) +{ + struct thread_struct *t = ¤t->thread; + u64 *vcpu_fpr = vcpu->arch.fpr; + u64 *vcpu_vsx = vcpu->arch.vsr; + u64 *thread_fpr = (u64*)t->fpr; + int i; + + if (!(vcpu->arch.msr & msr)) { + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + return RESUME_GUEST; + } + +#ifdef DEBUG_EXT + printk(KERN_INFO "Loading up ext 0x%lx\n", msr); +#endif + + current->thread.regs->msr |= msr; + + switch (msr) { + case MSR_FP: + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) + thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; + + t->fpscr.val = vcpu->arch.fpscr; + t->fpexc_mode = 0; + kvmppc_load_up_fpu(); + break; + case MSR_VEC: +#ifdef CONFIG_ALTIVEC + memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); + t->vscr = vcpu->arch.vscr; + t->vrsave = -1; + kvmppc_load_up_altivec(); +#endif + break; + case MSR_VSX: +#ifdef CONFIG_VSX + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) + thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; + kvmppc_load_up_vsx(); +#endif + break; + default: + BUG(); + } + + vcpu->arch.guest_owned_ext |= msr; + + kvmppc_recalc_shadow_msr(vcpu); + + return RESUME_GUEST; +} + int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { @@ -563,7 +708,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_INST_STORAGE: vcpu->stat.pf_instruc++; /* only care about PTEG not found errors, but leave NX alone */ - if (vcpu->arch.shadow_msr & 0x40000000) { + if (vcpu->arch.shadow_srr1 & 0x40000000) { r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.pc, exit_nr); vcpu->stat.sp_instruc++; } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && @@ -575,7 +720,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, */ kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL); } else { - vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x58000000); + vcpu->arch.msr |= vcpu->arch.shadow_srr1 & 0x58000000; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL); r = RESUME_GUEST; @@ -621,6 +766,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_PROGRAM: { enum emulation_result er; + ulong flags; + + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; if (vcpu->arch.msr & MSR_PR) { #ifdef EXIT_DEBUG @@ -628,7 +776,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, #endif if ((vcpu->arch.last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) { - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + kvmppc_core_queue_program(vcpu, flags); r = RESUME_GUEST; break; } @@ -638,12 +786,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, er = kvmppc_emulate_instruction(run, vcpu); switch (er) { case EMULATE_DONE: - r = RESUME_GUEST; + r = RESUME_GUEST_NV; break; case EMULATE_FAIL: printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", __func__, vcpu->arch.pc, vcpu->arch.last_inst); - kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + kvmppc_core_queue_program(vcpu, flags); r = RESUME_GUEST; break; default: @@ -653,23 +801,30 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } case BOOK3S_INTERRUPT_SYSCALL: #ifdef EXIT_DEBUG - printk(KERN_INFO "Syscall Nr %d\n", (int)vcpu->arch.gpr[0]); + printk(KERN_INFO "Syscall Nr %d\n", (int)kvmppc_get_gpr(vcpu, 0)); #endif vcpu->stat.syscall_exits++; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; break; - case BOOK3S_INTERRUPT_MACHINE_CHECK: case BOOK3S_INTERRUPT_FP_UNAVAIL: - case BOOK3S_INTERRUPT_TRACE: + r = kvmppc_handle_ext(vcpu, exit_nr, MSR_FP); + break; case BOOK3S_INTERRUPT_ALTIVEC: + r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VEC); + break; case BOOK3S_INTERRUPT_VSX: + r = kvmppc_handle_ext(vcpu, exit_nr, MSR_VSX); + break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + case BOOK3S_INTERRUPT_TRACE: kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; break; default: /* Ugh - bork here! What did we get? */ - printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", exit_nr, vcpu->arch.pc, vcpu->arch.shadow_msr); + printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", + exit_nr, vcpu->arch.pc, vcpu->arch.shadow_srr1); r = RESUME_HOST; BUG(); break; @@ -712,10 +867,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; regs->pc = vcpu->arch.pc; - regs->cr = vcpu->arch.cr; + regs->cr = kvmppc_get_cr(vcpu); regs->ctr = vcpu->arch.ctr; regs->lr = vcpu->arch.lr; - regs->xer = vcpu->arch.xer; + regs->xer = kvmppc_get_xer(vcpu); regs->msr = vcpu->arch.msr; regs->srr0 = vcpu->arch.srr0; regs->srr1 = vcpu->arch.srr1; @@ -729,7 +884,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg7 = vcpu->arch.sprg6; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) - regs->gpr[i] = vcpu->arch.gpr[i]; + regs->gpr[i] = kvmppc_get_gpr(vcpu, i); return 0; } @@ -739,10 +894,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; vcpu->arch.pc = regs->pc; - vcpu->arch.cr = regs->cr; + kvmppc_set_cr(vcpu, regs->cr); vcpu->arch.ctr = regs->ctr; vcpu->arch.lr = regs->lr; - vcpu->arch.xer = regs->xer; + kvmppc_set_xer(vcpu, regs->xer); kvmppc_set_msr(vcpu, regs->msr); vcpu->arch.srr0 = regs->srr0; vcpu->arch.srr1 = regs->srr1; @@ -754,8 +909,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.sprg6 = regs->sprg5; vcpu->arch.sprg7 = regs->sprg6; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++) - vcpu->arch.gpr[i] = regs->gpr[i]; + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + kvmppc_set_gpr(vcpu, i, regs->gpr[i]); return 0; } @@ -850,7 +1005,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, int is_dirty = 0; int r, n; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = kvm_get_dirty_log(kvm, log, &is_dirty); if (r) @@ -858,7 +1013,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; ga = memslot->base_gfn << PAGE_SHIFT; ga_end = ga + (memslot->npages << PAGE_SHIFT); @@ -872,7 +1027,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, r = 0; out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -910,6 +1065,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem; vcpu->arch.trampoline_enter = kvmppc_trampoline_enter; vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem; + vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall; vcpu->arch.shadow_msr = MSR_USER64; @@ -943,6 +1099,10 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; + struct thread_struct ext_bkp; + bool save_vec = current->thread.used_vr; + bool save_vsx = current->thread.used_vsr; + ulong ext_msr; /* No need to go into the guest when all we do is going out */ if (signal_pending(current)) { @@ -950,6 +1110,35 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return -EINTR; } + /* Save FPU state in stack */ + if (current->thread.regs->msr & MSR_FP) + giveup_fpu(current); + memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr)); + ext_bkp.fpscr = current->thread.fpscr; + ext_bkp.fpexc_mode = current->thread.fpexc_mode; + +#ifdef CONFIG_ALTIVEC + /* Save Altivec state in stack */ + if (save_vec) { + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); + memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr)); + ext_bkp.vscr = current->thread.vscr; + ext_bkp.vrsave = current->thread.vrsave; + } + ext_bkp.used_vr = current->thread.used_vr; +#endif + +#ifdef CONFIG_VSX + /* Save VSX state in stack */ + if (save_vsx && (current->thread.regs->msr & MSR_VSX)) + __giveup_vsx(current); + ext_bkp.used_vsr = current->thread.used_vsr; +#endif + + /* Remember the MSR with disabled extensions */ + ext_msr = current->thread.regs->msr; + /* XXX we get called with irq disabled - change that! */ local_irq_enable(); @@ -957,6 +1146,32 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) local_irq_disable(); + current->thread.regs->msr = ext_msr; + + /* Make sure we save the guest FPU/Altivec/VSX state */ + kvmppc_giveup_ext(vcpu, MSR_FP); + kvmppc_giveup_ext(vcpu, MSR_VEC); + kvmppc_giveup_ext(vcpu, MSR_VSX); + + /* Restore FPU state from stack */ + memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr)); + current->thread.fpscr = ext_bkp.fpscr; + current->thread.fpexc_mode = ext_bkp.fpexc_mode; + +#ifdef CONFIG_ALTIVEC + /* Restore Altivec state from stack */ + if (save_vec && current->thread.used_vr) { + memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr)); + current->thread.vscr = ext_bkp.vscr; + current->thread.vrsave= ext_bkp.vrsave; + } + current->thread.used_vr = ext_bkp.used_vr; +#endif + +#ifdef CONFIG_VSX + current->thread.used_vsr = ext_bkp.used_vsr; +#endif + return ret; } diff --git a/arch/powerpc/kvm/book3s_64_emulate.c b/arch/powerpc/kvm/book3s_64_emulate.c index 1027eac6d47..2b0ee7e040c 100644 --- a/arch/powerpc/kvm/book3s_64_emulate.c +++ b/arch/powerpc/kvm/book3s_64_emulate.c @@ -65,11 +65,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case 31: switch (get_xop(inst)) { case OP_31_XOP_MFMSR: - vcpu->arch.gpr[get_rt(inst)] = vcpu->arch.msr; + kvmppc_set_gpr(vcpu, get_rt(inst), vcpu->arch.msr); break; case OP_31_XOP_MTMSRD: { - ulong rs = vcpu->arch.gpr[get_rs(inst)]; + ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst)); if (inst & 0x10000) { vcpu->arch.msr &= ~(MSR_RI | MSR_EE); vcpu->arch.msr |= rs & (MSR_RI | MSR_EE); @@ -78,30 +78,30 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } case OP_31_XOP_MTMSR: - kvmppc_set_msr(vcpu, vcpu->arch.gpr[get_rs(inst)]); + kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, get_rs(inst))); break; case OP_31_XOP_MFSRIN: { int srnum; - srnum = (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf; + srnum = (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf; if (vcpu->arch.mmu.mfsrin) { u32 sr; sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); - vcpu->arch.gpr[get_rt(inst)] = sr; + kvmppc_set_gpr(vcpu, get_rt(inst), sr); } break; } case OP_31_XOP_MTSRIN: vcpu->arch.mmu.mtsrin(vcpu, - (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf, - vcpu->arch.gpr[get_rs(inst)]); + (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf, + kvmppc_get_gpr(vcpu, get_rs(inst))); break; case OP_31_XOP_TLBIE: case OP_31_XOP_TLBIEL: { bool large = (inst & 0x00200000) ? true : false; - ulong addr = vcpu->arch.gpr[get_rb(inst)]; + ulong addr = kvmppc_get_gpr(vcpu, get_rb(inst)); vcpu->arch.mmu.tlbie(vcpu, addr, large); break; } @@ -111,14 +111,16 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!vcpu->arch.mmu.slbmte) return EMULATE_FAIL; - vcpu->arch.mmu.slbmte(vcpu, vcpu->arch.gpr[get_rs(inst)], - vcpu->arch.gpr[get_rb(inst)]); + vcpu->arch.mmu.slbmte(vcpu, + kvmppc_get_gpr(vcpu, get_rs(inst)), + kvmppc_get_gpr(vcpu, get_rb(inst))); break; case OP_31_XOP_SLBIE: if (!vcpu->arch.mmu.slbie) return EMULATE_FAIL; - vcpu->arch.mmu.slbie(vcpu, vcpu->arch.gpr[get_rb(inst)]); + vcpu->arch.mmu.slbie(vcpu, + kvmppc_get_gpr(vcpu, get_rb(inst))); break; case OP_31_XOP_SLBIA: if (!vcpu->arch.mmu.slbia) @@ -132,9 +134,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, } else { ulong t, rb; - rb = vcpu->arch.gpr[get_rb(inst)]; + rb = kvmppc_get_gpr(vcpu, get_rb(inst)); t = vcpu->arch.mmu.slbmfee(vcpu, rb); - vcpu->arch.gpr[get_rt(inst)] = t; + kvmppc_set_gpr(vcpu, get_rt(inst), t); } break; case OP_31_XOP_SLBMFEV: @@ -143,20 +145,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, } else { ulong t, rb; - rb = vcpu->arch.gpr[get_rb(inst)]; + rb = kvmppc_get_gpr(vcpu, get_rb(inst)); t = vcpu->arch.mmu.slbmfev(vcpu, rb); - vcpu->arch.gpr[get_rt(inst)] = t; + kvmppc_set_gpr(vcpu, get_rt(inst), t); } break; case OP_31_XOP_DCBZ: { - ulong rb = vcpu->arch.gpr[get_rb(inst)]; + ulong rb = kvmppc_get_gpr(vcpu, get_rb(inst)); ulong ra = 0; ulong addr; u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; if (get_ra(inst)) - ra = vcpu->arch.gpr[get_ra(inst)]; + ra = kvmppc_get_gpr(vcpu, get_ra(inst)); addr = (ra + rb) & ~31ULL; if (!(vcpu->arch.msr & MSR_SF)) @@ -233,43 +235,44 @@ static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val) int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { int emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_SDR1: - to_book3s(vcpu)->sdr1 = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->sdr1 = spr_val; break; case SPRN_DSISR: - to_book3s(vcpu)->dsisr = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->dsisr = spr_val; break; case SPRN_DAR: - vcpu->arch.dear = vcpu->arch.gpr[rs]; + vcpu->arch.dear = spr_val; break; case SPRN_HIOR: - to_book3s(vcpu)->hior = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hior = spr_val; break; case SPRN_IBAT0U ... SPRN_IBAT3L: case SPRN_IBAT4U ... SPRN_IBAT7L: case SPRN_DBAT0U ... SPRN_DBAT3L: case SPRN_DBAT4U ... SPRN_DBAT7L: - kvmppc_write_bat(vcpu, sprn, (u32)vcpu->arch.gpr[rs]); + kvmppc_write_bat(vcpu, sprn, (u32)spr_val); /* BAT writes happen so rarely that we're ok to flush * everything here */ kvmppc_mmu_pte_flush(vcpu, 0, 0); break; case SPRN_HID0: - to_book3s(vcpu)->hid[0] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[0] = spr_val; break; case SPRN_HID1: - to_book3s(vcpu)->hid[1] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[1] = spr_val; break; case SPRN_HID2: - to_book3s(vcpu)->hid[2] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[2] = spr_val; break; case SPRN_HID4: - to_book3s(vcpu)->hid[4] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[4] = spr_val; break; case SPRN_HID5: - to_book3s(vcpu)->hid[5] = vcpu->arch.gpr[rs]; + to_book3s(vcpu)->hid[5] = spr_val; /* guest HID5 set can change is_dcbz32 */ if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV)) @@ -299,38 +302,38 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_SDR1: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->sdr1; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); break; case SPRN_DSISR: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->dsisr; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->dsisr); break; case SPRN_DAR: - vcpu->arch.gpr[rt] = vcpu->arch.dear; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break; case SPRN_HIOR: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hior; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior); break; case SPRN_HID0: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[0]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[0]); break; case SPRN_HID1: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[1]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[1]); break; case SPRN_HID2: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[2]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[2]); break; case SPRN_HID4: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[4]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[4]); break; case SPRN_HID5: - vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[5]; + kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]); break; case SPRN_THRM1: case SPRN_THRM2: case SPRN_THRM3: case SPRN_CTRLF: case SPRN_CTRLT: - vcpu->arch.gpr[rt] = 0; + kvmppc_set_gpr(vcpu, rt, 0); break; default: printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); diff --git a/arch/powerpc/kvm/book3s_64_exports.c b/arch/powerpc/kvm/book3s_64_exports.c index 5b2db38ed86..1dd5a1ddfd0 100644 --- a/arch/powerpc/kvm/book3s_64_exports.c +++ b/arch/powerpc/kvm/book3s_64_exports.c @@ -22,3 +22,11 @@ EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter); EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem); +EXPORT_SYMBOL_GPL(kvmppc_rmcall); +EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); +#ifdef CONFIG_ALTIVEC +EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); +#endif +#ifdef CONFIG_VSX +EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); +#endif diff --git a/arch/powerpc/kvm/book3s_64_interrupts.S b/arch/powerpc/kvm/book3s_64_interrupts.S index 7b55d8094c8..c1584d0cbce 100644 --- a/arch/powerpc/kvm/book3s_64_interrupts.S +++ b/arch/powerpc/kvm/book3s_64_interrupts.S @@ -28,11 +28,6 @@ #define ULONG_SIZE 8 #define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE)) -.macro mfpaca tmp_reg, src_reg, offset, vcpu_reg - ld \tmp_reg, (PACA_EXMC+\offset)(r13) - std \tmp_reg, VCPU_GPR(\src_reg)(\vcpu_reg) -.endm - .macro DISABLE_INTERRUPTS mfmsr r0 rldicl r0,r0,48,1 @@ -40,6 +35,26 @@ mtmsrd r0,1 .endm +#define VCPU_LOAD_NVGPRS(vcpu) \ + ld r14, VCPU_GPR(r14)(vcpu); \ + ld r15, VCPU_GPR(r15)(vcpu); \ + ld r16, VCPU_GPR(r16)(vcpu); \ + ld r17, VCPU_GPR(r17)(vcpu); \ + ld r18, VCPU_GPR(r18)(vcpu); \ + ld r19, VCPU_GPR(r19)(vcpu); \ + ld r20, VCPU_GPR(r20)(vcpu); \ + ld r21, VCPU_GPR(r21)(vcpu); \ + ld r22, VCPU_GPR(r22)(vcpu); \ + ld r23, VCPU_GPR(r23)(vcpu); \ + ld r24, VCPU_GPR(r24)(vcpu); \ + ld r25, VCPU_GPR(r25)(vcpu); \ + ld r26, VCPU_GPR(r26)(vcpu); \ + ld r27, VCPU_GPR(r27)(vcpu); \ + ld r28, VCPU_GPR(r28)(vcpu); \ + ld r29, VCPU_GPR(r29)(vcpu); \ + ld r30, VCPU_GPR(r30)(vcpu); \ + ld r31, VCPU_GPR(r31)(vcpu); \ + /***************************************************************************** * * * Guest entry / exit code that is in kernel module memory (highmem) * @@ -67,61 +82,32 @@ kvm_start_entry: SAVE_NVGPRS(r1) /* Save LR */ - mflr r14 - std r14, _LINK(r1) - -/* XXX optimize non-volatile loading away */ -kvm_start_lightweight: + std r0, _LINK(r1) - DISABLE_INTERRUPTS + /* Load non-volatile guest state from the vcpu */ + VCPU_LOAD_NVGPRS(r4) /* Save R1/R2 in the PACA */ - std r1, PACAR1(r13) - std r2, (PACA_EXMC+EX_SRR0)(r13) + std r1, PACA_KVM_HOST_R1(r13) + std r2, PACA_KVM_HOST_R2(r13) + + /* XXX swap in/out on load? */ ld r3, VCPU_HIGHMEM_HANDLER(r4) - std r3, PACASAVEDMSR(r13) + std r3, PACA_KVM_VMHANDLER(r13) - /* Load non-volatile guest state from the vcpu */ - ld r14, VCPU_GPR(r14)(r4) - ld r15, VCPU_GPR(r15)(r4) - ld r16, VCPU_GPR(r16)(r4) - ld r17, VCPU_GPR(r17)(r4) - ld r18, VCPU_GPR(r18)(r4) - ld r19, VCPU_GPR(r19)(r4) - ld r20, VCPU_GPR(r20)(r4) - ld r21, VCPU_GPR(r21)(r4) - ld r22, VCPU_GPR(r22)(r4) - ld r23, VCPU_GPR(r23)(r4) - ld r24, VCPU_GPR(r24)(r4) - ld r25, VCPU_GPR(r25)(r4) - ld r26, VCPU_GPR(r26)(r4) - ld r27, VCPU_GPR(r27)(r4) - ld r28, VCPU_GPR(r28)(r4) - ld r29, VCPU_GPR(r29)(r4) - ld r30, VCPU_GPR(r30)(r4) - ld r31, VCPU_GPR(r31)(r4) +kvm_start_lightweight: ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */ ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ - ld r3, VCPU_TRAMPOLINE_ENTER(r4) - mtsrr0 r3 - - LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) - mtsrr1 r3 - - /* Load guest state in the respective registers */ - lwz r3, VCPU_CR(r4) /* r3 = vcpu->arch.cr */ - stw r3, (PACA_EXMC + EX_CCR)(r13) - - ld r3, VCPU_CTR(r4) /* r3 = vcpu->arch.ctr */ - mtctr r3 /* CTR = r3 */ + /* Load some guest state in the respective registers */ + ld r5, VCPU_CTR(r4) /* r5 = vcpu->arch.ctr */ + /* will be swapped in by rmcall */ ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */ mtlr r3 /* LR = r3 */ - ld r3, VCPU_XER(r4) /* r3 = vcpu->arch.xer */ - std r3, (PACA_EXMC + EX_R3)(r13) + DISABLE_INTERRUPTS /* Some guests may need to have dcbz set to 32 byte length. * @@ -141,36 +127,15 @@ kvm_start_lightweight: mtspr SPRN_HID5,r3 no_dcbz32_on: - /* Load guest GPRs */ - - ld r3, VCPU_GPR(r9)(r4) - std r3, (PACA_EXMC + EX_R9)(r13) - ld r3, VCPU_GPR(r10)(r4) - std r3, (PACA_EXMC + EX_R10)(r13) - ld r3, VCPU_GPR(r11)(r4) - std r3, (PACA_EXMC + EX_R11)(r13) - ld r3, VCPU_GPR(r12)(r4) - std r3, (PACA_EXMC + EX_R12)(r13) - ld r3, VCPU_GPR(r13)(r4) - std r3, (PACA_EXMC + EX_R13)(r13) - - ld r0, VCPU_GPR(r0)(r4) - ld r1, VCPU_GPR(r1)(r4) - ld r2, VCPU_GPR(r2)(r4) - ld r3, VCPU_GPR(r3)(r4) - ld r5, VCPU_GPR(r5)(r4) - ld r6, VCPU_GPR(r6)(r4) - ld r7, VCPU_GPR(r7)(r4) - ld r8, VCPU_GPR(r8)(r4) - ld r4, VCPU_GPR(r4)(r4) - - /* This sets the Magic value for the trampoline */ - - li r11, 1 - stb r11, PACA_KVM_IN_GUEST(r13) + + ld r6, VCPU_RMCALL(r4) + mtctr r6 + + ld r3, VCPU_TRAMPOLINE_ENTER(r4) + LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR)) /* Jump to SLB patching handlder and into our guest */ - RFI + bctr /* * This is the handler in module memory. It gets jumped at from the @@ -184,125 +149,70 @@ kvmppc_handler_highmem: /* * Register usage at this point: * - * R00 = guest R13 - * R01 = host R1 - * R02 = host R2 - * R10 = guest PC - * R11 = guest MSR - * R12 = exit handler id - * R13 = PACA - * PACA.exmc.R9 = guest R1 - * PACA.exmc.R10 = guest R10 - * PACA.exmc.R11 = guest R11 - * PACA.exmc.R12 = guest R12 - * PACA.exmc.R13 = guest R2 - * PACA.exmc.DAR = guest DAR - * PACA.exmc.DSISR = guest DSISR - * PACA.exmc.LR = guest instruction - * PACA.exmc.CCR = guest CR - * PACA.exmc.SRR0 = guest R0 + * R0 = guest last inst + * R1 = host R1 + * R2 = host R2 + * R3 = guest PC + * R4 = guest MSR + * R5 = guest DAR + * R6 = guest DSISR + * R13 = PACA + * PACA.KVM.* = guest * * */ - std r3, (PACA_EXMC+EX_R3)(r13) + /* R7 = vcpu */ + ld r7, GPR4(r1) - /* save the exit id in R3 */ - mr r3, r12 + /* Now save the guest state */ - /* R12 = vcpu */ - ld r12, GPR4(r1) + stw r0, VCPU_LAST_INST(r7) - /* Now save the guest state */ + std r3, VCPU_PC(r7) + std r4, VCPU_SHADOW_SRR1(r7) + std r5, VCPU_FAULT_DEAR(r7) + std r6, VCPU_FAULT_DSISR(r7) - std r0, VCPU_GPR(r13)(r12) - std r4, VCPU_GPR(r4)(r12) - std r5, VCPU_GPR(r5)(r12) - std r6, VCPU_GPR(r6)(r12) - std r7, VCPU_GPR(r7)(r12) - std r8, VCPU_GPR(r8)(r12) - std r9, VCPU_GPR(r9)(r12) - - /* get registers from PACA */ - mfpaca r5, r0, EX_SRR0, r12 - mfpaca r5, r3, EX_R3, r12 - mfpaca r5, r1, EX_R9, r12 - mfpaca r5, r10, EX_R10, r12 - mfpaca r5, r11, EX_R11, r12 - mfpaca r5, r12, EX_R12, r12 - mfpaca r5, r2, EX_R13, r12 - - lwz r5, (PACA_EXMC+EX_LR)(r13) - stw r5, VCPU_LAST_INST(r12) - - lwz r5, (PACA_EXMC+EX_CCR)(r13) - stw r5, VCPU_CR(r12) - - ld r5, VCPU_HFLAGS(r12) + ld r5, VCPU_HFLAGS(r7) rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */ beq no_dcbz32_off + li r4, 0 mfspr r5,SPRN_HID5 - rldimi r5,r5,6,56 + rldimi r5,r4,6,56 mtspr SPRN_HID5,r5 no_dcbz32_off: - /* XXX maybe skip on lightweight? */ - std r14, VCPU_GPR(r14)(r12) - std r15, VCPU_GPR(r15)(r12) - std r16, VCPU_GPR(r16)(r12) - std r17, VCPU_GPR(r17)(r12) - std r18, VCPU_GPR(r18)(r12) - std r19, VCPU_GPR(r19)(r12) - std r20, VCPU_GPR(r20)(r12) - std r21, VCPU_GPR(r21)(r12) - std r22, VCPU_GPR(r22)(r12) - std r23, VCPU_GPR(r23)(r12) - std r24, VCPU_GPR(r24)(r12) - std r25, VCPU_GPR(r25)(r12) - std r26, VCPU_GPR(r26)(r12) - std r27, VCPU_GPR(r27)(r12) - std r28, VCPU_GPR(r28)(r12) - std r29, VCPU_GPR(r29)(r12) - std r30, VCPU_GPR(r30)(r12) - std r31, VCPU_GPR(r31)(r12) - - /* Restore non-volatile host registers (r14 - r31) */ - REST_NVGPRS(r1) - - /* Save guest PC (R10) */ - std r10, VCPU_PC(r12) - - /* Save guest msr (R11) */ - std r11, VCPU_SHADOW_MSR(r12) - - /* Save guest CTR (in R12) */ + std r14, VCPU_GPR(r14)(r7) + std r15, VCPU_GPR(r15)(r7) + std r16, VCPU_GPR(r16)(r7) + std r17, VCPU_GPR(r17)(r7) + std r18, VCPU_GPR(r18)(r7) + std r19, VCPU_GPR(r19)(r7) + std r20, VCPU_GPR(r20)(r7) + std r21, VCPU_GPR(r21)(r7) + std r22, VCPU_GPR(r22)(r7) + std r23, VCPU_GPR(r23)(r7) + std r24, VCPU_GPR(r24)(r7) + std r25, VCPU_GPR(r25)(r7) + std r26, VCPU_GPR(r26)(r7) + std r27, VCPU_GPR(r27)(r7) + std r28, VCPU_GPR(r28)(r7) + std r29, VCPU_GPR(r29)(r7) + std r30, VCPU_GPR(r30)(r7) + std r31, VCPU_GPR(r31)(r7) + + /* Save guest CTR */ mfctr r5 - std r5, VCPU_CTR(r12) + std r5, VCPU_CTR(r7) /* Save guest LR */ mflr r5 - std r5, VCPU_LR(r12) - - /* Save guest XER */ - mfxer r5 - std r5, VCPU_XER(r12) - - /* Save guest DAR */ - ld r5, (PACA_EXMC+EX_DAR)(r13) - std r5, VCPU_FAULT_DEAR(r12) - - /* Save guest DSISR */ - lwz r5, (PACA_EXMC+EX_DSISR)(r13) - std r5, VCPU_FAULT_DSISR(r12) + std r5, VCPU_LR(r7) /* Restore host msr -> SRR1 */ - ld r7, VCPU_HOST_MSR(r12) - mtsrr1 r7 - - /* Restore host IP -> SRR0 */ - ld r6, VCPU_HOST_RETIP(r12) - mtsrr0 r6 + ld r6, VCPU_HOST_MSR(r7) /* * For some interrupts, we need to call the real Linux @@ -314,13 +224,14 @@ no_dcbz32_off: * r3 = address of interrupt handler (exit reason) */ - cmpwi r3, BOOK3S_INTERRUPT_EXTERNAL + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq call_linux_handler - cmpwi r3, BOOK3S_INTERRUPT_DECREMENTER + cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER beq call_linux_handler - /* Back to Interruptable Mode! (goto kvm_return_point) */ - RFI + /* Back to EE=1 */ + mtmsr r6 + b kvm_return_point call_linux_handler: @@ -333,16 +244,22 @@ call_linux_handler: * interrupt handler! * * R3 still contains the exit code, - * R6 VCPU_HOST_RETIP and - * R7 VCPU_HOST_MSR + * R5 VCPU_HOST_RETIP and + * R6 VCPU_HOST_MSR */ - mtlr r3 + /* Restore host IP -> SRR0 */ + ld r5, VCPU_HOST_RETIP(r7) + + /* XXX Better move to a safe function? + * What if we get an HTAB flush in between mtsrr0 and mtsrr1? */ - ld r5, VCPU_TRAMPOLINE_LOWMEM(r12) - mtsrr0 r5 - LOAD_REG_IMMEDIATE(r5, MSR_KERNEL & ~(MSR_IR | MSR_DR)) - mtsrr1 r5 + mtlr r12 + + ld r4, VCPU_TRAMPOLINE_LOWMEM(r7) + mtsrr0 r4 + LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) + mtsrr1 r3 RFI @@ -351,42 +268,51 @@ kvm_return_point: /* Jump back to lightweight entry if we're supposed to */ /* go back into the guest */ - mr r5, r3 + + /* Pass the exit number as 3rd argument to kvmppc_handle_exit */ + mr r5, r12 + /* Restore r3 (kvm_run) and r4 (vcpu) */ REST_2GPRS(3, r1) bl KVMPPC_HANDLE_EXIT -#if 0 /* XXX get lightweight exits back */ + /* If RESUME_GUEST, get back in the loop */ cmpwi r3, RESUME_GUEST - bne kvm_exit_heavyweight + beq kvm_loop_lightweight - /* put VCPU and KVM_RUN back into place and roll again! */ - REST_2GPRS(3, r1) - b kvm_start_lightweight + cmpwi r3, RESUME_GUEST_NV + beq kvm_loop_heavyweight -kvm_exit_heavyweight: - /* Restore non-volatile host registers */ - ld r14, _LINK(r1) - mtlr r14 - REST_NVGPRS(r1) +kvm_exit_loop: - addi r1, r1, SWITCH_FRAME_SIZE -#else ld r4, _LINK(r1) mtlr r4 - cmpwi r3, RESUME_GUEST - bne kvm_exit_heavyweight + /* Restore non-volatile host registers (r14 - r31) */ + REST_NVGPRS(r1) + + addi r1, r1, SWITCH_FRAME_SIZE + blr + +kvm_loop_heavyweight: + + ld r4, _LINK(r1) + std r4, (16 + SWITCH_FRAME_SIZE)(r1) + /* Load vcpu and cpu_run */ REST_2GPRS(3, r1) - addi r1, r1, SWITCH_FRAME_SIZE + /* Load non-volatile guest state from the vcpu */ + VCPU_LOAD_NVGPRS(r4) - b kvm_start_entry + /* Jump back into the beginning of this function */ + b kvm_start_lightweight -kvm_exit_heavyweight: +kvm_loop_lightweight: - addi r1, r1, SWITCH_FRAME_SIZE -#endif + /* We'll need the vcpu pointer */ + REST_GPR(4, r1) + + /* Jump back into the beginning of this function */ + b kvm_start_lightweight - blr diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index e4beeb371a7..512dcff7755 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -54,7 +54,7 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( if (!vcpu_book3s->slb[i].valid) continue; - if (vcpu_book3s->slb[i].large) + if (vcpu_book3s->slb[i].tb) cmp_esid = esid_1t; if (vcpu_book3s->slb[i].esid == cmp_esid) @@ -65,9 +65,10 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( eaddr, esid, esid_1t); for (i = 0; i < vcpu_book3s->slb_nr; i++) { if (vcpu_book3s->slb[i].vsid) - dprintk(" %d: %c%c %llx %llx\n", i, + dprintk(" %d: %c%c%c %llx %llx\n", i, vcpu_book3s->slb[i].valid ? 'v' : ' ', vcpu_book3s->slb[i].large ? 'l' : ' ', + vcpu_book3s->slb[i].tb ? 't' : ' ', vcpu_book3s->slb[i].esid, vcpu_book3s->slb[i].vsid); } @@ -84,7 +85,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, if (!slb) return 0; - if (slb->large) + if (slb->tb) return (((u64)eaddr >> 12) & 0xfffffff) | (((u64)slb->vsid) << 28); @@ -309,7 +310,8 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) slbe = &vcpu_book3s->slb[slb_nr]; slbe->large = (rs & SLB_VSID_L) ? 1 : 0; - slbe->esid = slbe->large ? esid_1t : esid; + slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; + slbe->esid = slbe->tb ? esid_1t : esid; slbe->vsid = rs >> 12; slbe->valid = (rb & SLB_ESID_V) ? 1 : 0; slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0; diff --git a/arch/powerpc/kvm/book3s_64_rmhandlers.S b/arch/powerpc/kvm/book3s_64_rmhandlers.S index fb7dd2e9ac8..c83c60ad96c 100644 --- a/arch/powerpc/kvm/book3s_64_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_64_rmhandlers.S @@ -45,36 +45,25 @@ kvmppc_trampoline_\intno: * To distinguish, we check a magic byte in the PACA */ mfspr r13, SPRN_SPRG_PACA /* r13 = PACA */ - std r12, (PACA_EXMC + EX_R12)(r13) + std r12, PACA_KVM_SCRATCH0(r13) mfcr r12 - stw r12, (PACA_EXMC + EX_CCR)(r13) + stw r12, PACA_KVM_SCRATCH1(r13) lbz r12, PACA_KVM_IN_GUEST(r13) - cmpwi r12, 0 + cmpwi r12, KVM_GUEST_MODE_NONE bne ..kvmppc_handler_hasmagic_\intno /* No KVM guest? Then jump back to the Linux handler! */ - lwz r12, (PACA_EXMC + EX_CCR)(r13) + lwz r12, PACA_KVM_SCRATCH1(r13) mtcr r12 - ld r12, (PACA_EXMC + EX_R12)(r13) + ld r12, PACA_KVM_SCRATCH0(r13) mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */ b kvmppc_resume_\intno /* Get back original handler */ /* Now we know we're handling a KVM guest */ ..kvmppc_handler_hasmagic_\intno: - /* Unset guest state */ - li r12, 0 - stb r12, PACA_KVM_IN_GUEST(r13) - std r1, (PACA_EXMC+EX_R9)(r13) - std r10, (PACA_EXMC+EX_R10)(r13) - std r11, (PACA_EXMC+EX_R11)(r13) - std r2, (PACA_EXMC+EX_R13)(r13) - - mfsrr0 r10 - mfsrr1 r11 - - /* Restore R1/R2 so we can handle faults */ - ld r1, PACAR1(r13) - ld r2, (PACA_EXMC+EX_SRR0)(r13) + /* Should we just skip the faulting instruction? */ + cmpwi r12, KVM_GUEST_MODE_SKIP + beq kvmppc_handler_skip_ins /* Let's store which interrupt we're handling */ li r12, \intno @@ -102,23 +91,107 @@ INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX /* + * Bring us back to the faulting code, but skip the + * faulting instruction. + * + * This is a generic exit path from the interrupt + * trampolines above. + * + * Input Registers: + * + * R12 = free + * R13 = PACA + * PACA.KVM.SCRATCH0 = guest R12 + * PACA.KVM.SCRATCH1 = guest CR + * SPRG_SCRATCH0 = guest R13 + * + */ +kvmppc_handler_skip_ins: + + /* Patch the IP to the next instruction */ + mfsrr0 r12 + addi r12, r12, 4 + mtsrr0 r12 + + /* Clean up all state */ + lwz r12, PACA_KVM_SCRATCH1(r13) + mtcr r12 + ld r12, PACA_KVM_SCRATCH0(r13) + mfspr r13, SPRN_SPRG_SCRATCH0 + + /* And get back into the code */ + RFI + +/* * This trampoline brings us back to a real mode handler * * Input Registers: * - * R6 = SRR0 - * R7 = SRR1 + * R5 = SRR0 + * R6 = SRR1 * LR = real-mode IP * */ .global kvmppc_handler_lowmem_trampoline kvmppc_handler_lowmem_trampoline: - mtsrr0 r6 - mtsrr1 r7 + mtsrr0 r5 + mtsrr1 r6 blr kvmppc_handler_lowmem_trampoline_end: +/* + * Call a function in real mode + * + * Input Registers: + * + * R3 = function + * R4 = MSR + * R5 = CTR + * + */ +_GLOBAL(kvmppc_rmcall) + mtmsr r4 /* Disable relocation, so mtsrr + doesn't get interrupted */ + mtctr r5 + mtsrr0 r3 + mtsrr1 r4 + RFI + +/* + * Activate current's external feature (FPU/Altivec/VSX) + */ +#define define_load_up(what) \ + \ +_GLOBAL(kvmppc_load_up_ ## what); \ + subi r1, r1, INT_FRAME_SIZE; \ + mflr r3; \ + std r3, _LINK(r1); \ + mfmsr r4; \ + std r31, GPR3(r1); \ + mr r31, r4; \ + li r5, MSR_DR; \ + oris r5, r5, MSR_EE@h; \ + andc r4, r4, r5; \ + mtmsr r4; \ + \ + bl .load_up_ ## what; \ + \ + mtmsr r31; \ + ld r3, _LINK(r1); \ + ld r31, GPR3(r1); \ + addi r1, r1, INT_FRAME_SIZE; \ + mtlr r3; \ + blr + +define_load_up(fpu) +#ifdef CONFIG_ALTIVEC +define_load_up(altivec) +#endif +#ifdef CONFIG_VSX +define_load_up(vsx) +#endif + .global kvmppc_trampoline_lowmem kvmppc_trampoline_lowmem: .long kvmppc_handler_lowmem_trampoline - _stext diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index ecd237a03fd..35b76272218 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -31,7 +31,7 @@ #define REBOLT_SLB_ENTRY(num) \ ld r10, SHADOW_SLB_ESID(num)(r11); \ cmpdi r10, 0; \ - beq slb_exit_skip_1; \ + beq slb_exit_skip_ ## num; \ oris r10, r10, SLB_ESID_V@h; \ ld r9, SHADOW_SLB_VSID(num)(r11); \ slbmte r9, r10; \ @@ -51,23 +51,21 @@ kvmppc_handler_trampoline_enter: * * MSR = ~IR|DR * R13 = PACA + * R1 = host R1 + * R2 = host R2 * R9 = guest IP * R10 = guest MSR - * R11 = free - * R12 = free - * PACA[PACA_EXMC + EX_R9] = guest R9 - * PACA[PACA_EXMC + EX_R10] = guest R10 - * PACA[PACA_EXMC + EX_R11] = guest R11 - * PACA[PACA_EXMC + EX_R12] = guest R12 - * PACA[PACA_EXMC + EX_R13] = guest R13 - * PACA[PACA_EXMC + EX_CCR] = guest CR - * PACA[PACA_EXMC + EX_R3] = guest XER + * all other GPRS = free + * PACA[KVM_CR] = guest CR + * PACA[KVM_XER] = guest XER */ mtsrr0 r9 mtsrr1 r10 - mtspr SPRN_SPRG_SCRATCH0, r0 + /* Activate guest mode, so faults get handled by KVM */ + li r11, KVM_GUEST_MODE_GUEST + stb r11, PACA_KVM_IN_GUEST(r13) /* Remove LPAR shadow entries */ @@ -131,20 +129,27 @@ slb_do_enter: /* Enter guest */ - mfspr r0, SPRN_SPRG_SCRATCH0 - - ld r9, (PACA_EXMC+EX_R9)(r13) - ld r10, (PACA_EXMC+EX_R10)(r13) - ld r12, (PACA_EXMC+EX_R12)(r13) - - lwz r11, (PACA_EXMC+EX_CCR)(r13) + ld r0, (PACA_KVM_R0)(r13) + ld r1, (PACA_KVM_R1)(r13) + ld r2, (PACA_KVM_R2)(r13) + ld r3, (PACA_KVM_R3)(r13) + ld r4, (PACA_KVM_R4)(r13) + ld r5, (PACA_KVM_R5)(r13) + ld r6, (PACA_KVM_R6)(r13) + ld r7, (PACA_KVM_R7)(r13) + ld r8, (PACA_KVM_R8)(r13) + ld r9, (PACA_KVM_R9)(r13) + ld r10, (PACA_KVM_R10)(r13) + ld r12, (PACA_KVM_R12)(r13) + + lwz r11, (PACA_KVM_CR)(r13) mtcr r11 - ld r11, (PACA_EXMC+EX_R3)(r13) + ld r11, (PACA_KVM_XER)(r13) mtxer r11 - ld r11, (PACA_EXMC+EX_R11)(r13) - ld r13, (PACA_EXMC+EX_R13)(r13) + ld r11, (PACA_KVM_R11)(r13) + ld r13, (PACA_KVM_R13)(r13) RFI kvmppc_handler_trampoline_enter_end: @@ -162,28 +167,54 @@ kvmppc_handler_trampoline_exit: /* Register usage at this point: * - * SPRG_SCRATCH0 = guest R13 - * R01 = host R1 - * R02 = host R2 - * R10 = guest PC - * R11 = guest MSR - * R12 = exit handler id - * R13 = PACA - * PACA.exmc.CCR = guest CR - * PACA.exmc.R9 = guest R1 - * PACA.exmc.R10 = guest R10 - * PACA.exmc.R11 = guest R11 - * PACA.exmc.R12 = guest R12 - * PACA.exmc.R13 = guest R2 + * SPRG_SCRATCH0 = guest R13 + * R12 = exit handler id + * R13 = PACA + * PACA.KVM.SCRATCH0 = guest R12 + * PACA.KVM.SCRATCH1 = guest CR * */ /* Save registers */ - std r0, (PACA_EXMC+EX_SRR0)(r13) - std r9, (PACA_EXMC+EX_R3)(r13) - std r10, (PACA_EXMC+EX_LR)(r13) - std r11, (PACA_EXMC+EX_DAR)(r13) + std r0, PACA_KVM_R0(r13) + std r1, PACA_KVM_R1(r13) + std r2, PACA_KVM_R2(r13) + std r3, PACA_KVM_R3(r13) + std r4, PACA_KVM_R4(r13) + std r5, PACA_KVM_R5(r13) + std r6, PACA_KVM_R6(r13) + std r7, PACA_KVM_R7(r13) + std r8, PACA_KVM_R8(r13) + std r9, PACA_KVM_R9(r13) + std r10, PACA_KVM_R10(r13) + std r11, PACA_KVM_R11(r13) + + /* Restore R1/R2 so we can handle faults */ + ld r1, PACA_KVM_HOST_R1(r13) + ld r2, PACA_KVM_HOST_R2(r13) + + /* Save guest PC and MSR in GPRs */ + mfsrr0 r3 + mfsrr1 r4 + + /* Get scratch'ed off registers */ + mfspr r9, SPRN_SPRG_SCRATCH0 + std r9, PACA_KVM_R13(r13) + + ld r8, PACA_KVM_SCRATCH0(r13) + std r8, PACA_KVM_R12(r13) + + lwz r7, PACA_KVM_SCRATCH1(r13) + stw r7, PACA_KVM_CR(r13) + + /* Save more register state */ + + mfxer r6 + stw r6, PACA_KVM_XER(r13) + + mfdar r5 + mfdsisr r6 /* * In order for us to easily get the last instruction, @@ -202,17 +233,28 @@ kvmppc_handler_trampoline_exit: ld_last_inst: /* Save off the guest instruction we're at */ + + /* Set guest mode to 'jump over instruction' so if lwz faults + * we'll just continue at the next IP. */ + li r9, KVM_GUEST_MODE_SKIP + stb r9, PACA_KVM_IN_GUEST(r13) + /* 1) enable paging for data */ mfmsr r9 ori r11, r9, MSR_DR /* Enable paging for data */ mtmsr r11 /* 2) fetch the instruction */ - lwz r0, 0(r10) + li r0, KVM_INST_FETCH_FAILED /* In case lwz faults */ + lwz r0, 0(r3) /* 3) disable paging again */ mtmsr r9 no_ld_last_inst: + /* Unset guest mode */ + li r9, KVM_GUEST_MODE_NONE + stb r9, PACA_KVM_IN_GUEST(r13) + /* Restore bolted entries from the shadow and fix it along the way */ /* We don't store anything in entry 0, so we don't need to take care of it */ @@ -233,29 +275,27 @@ no_ld_last_inst: slb_do_exit: - /* Restore registers */ - - ld r11, (PACA_EXMC+EX_DAR)(r13) - ld r10, (PACA_EXMC+EX_LR)(r13) - ld r9, (PACA_EXMC+EX_R3)(r13) - - /* Save last inst */ - stw r0, (PACA_EXMC+EX_LR)(r13) - - /* Save DAR and DSISR before going to paged mode */ - mfdar r0 - std r0, (PACA_EXMC+EX_DAR)(r13) - mfdsisr r0 - stw r0, (PACA_EXMC+EX_DSISR)(r13) + /* Register usage at this point: + * + * R0 = guest last inst + * R1 = host R1 + * R2 = host R2 + * R3 = guest PC + * R4 = guest MSR + * R5 = guest DAR + * R6 = guest DSISR + * R12 = exit handler id + * R13 = PACA + * PACA.KVM.* = guest * + * + */ /* RFI into the highmem handler */ - mfmsr r0 - ori r0, r0, MSR_IR|MSR_DR|MSR_RI /* Enable paging */ - mtsrr1 r0 - ld r0, PACASAVEDMSR(r13) /* Highmem handler address */ - mtsrr0 r0 - - mfspr r0, SPRN_SPRG_SCRATCH0 + mfmsr r7 + ori r7, r7, MSR_IR|MSR_DR|MSR_RI /* Enable paging */ + mtsrr1 r7 + ld r8, PACA_KVM_VMHANDLER(r13) /* Highmem handler address */ + mtsrr0 r8 RFI kvmppc_handler_trampoline_exit_end: diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 06f5a9ecc42..4d686cc6b26 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -69,10 +69,10 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < 32; i += 4) { printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i, - vcpu->arch.gpr[i], - vcpu->arch.gpr[i+1], - vcpu->arch.gpr[i+2], - vcpu->arch.gpr[i+3]); + kvmppc_get_gpr(vcpu, i), + kvmppc_get_gpr(vcpu, i+1), + kvmppc_get_gpr(vcpu, i+2), + kvmppc_get_gpr(vcpu, i+3)); } } @@ -82,8 +82,32 @@ static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, set_bit(priority, &vcpu->arch.pending_exceptions); } -void kvmppc_core_queue_program(struct kvm_vcpu *vcpu) +static void kvmppc_core_queue_dtlb_miss(struct kvm_vcpu *vcpu, + ulong dear_flags, ulong esr_flags) { + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS); +} + +static void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, + ulong dear_flags, ulong esr_flags) +{ + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE); +} + +static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, + ulong esr_flags) +{ + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); +} + +void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) +{ + vcpu->arch.queued_esr = esr_flags; kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM); } @@ -97,6 +121,11 @@ int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); } +void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); +} + void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { @@ -109,14 +138,19 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, { int allowed = 0; ulong msr_mask; + bool update_esr = false, update_dear = false; switch (priority) { - case BOOKE_IRQPRIO_PROGRAM: case BOOKE_IRQPRIO_DTLB_MISS: - case BOOKE_IRQPRIO_ITLB_MISS: - case BOOKE_IRQPRIO_SYSCALL: case BOOKE_IRQPRIO_DATA_STORAGE: + update_dear = true; + /* fall through */ case BOOKE_IRQPRIO_INST_STORAGE: + case BOOKE_IRQPRIO_PROGRAM: + update_esr = true; + /* fall through */ + case BOOKE_IRQPRIO_ITLB_MISS: + case BOOKE_IRQPRIO_SYSCALL: case BOOKE_IRQPRIO_FP_UNAVAIL: case BOOKE_IRQPRIO_SPE_UNAVAIL: case BOOKE_IRQPRIO_SPE_FP_DATA: @@ -151,6 +185,10 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, vcpu->arch.srr0 = vcpu->arch.pc; vcpu->arch.srr1 = vcpu->arch.msr; vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; + if (update_esr == true) + vcpu->arch.esr = vcpu->arch.queued_esr; + if (update_dear == true) + vcpu->arch.dear = vcpu->arch.queued_dear; kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask); clear_bit(priority, &vcpu->arch.pending_exceptions); @@ -223,8 +261,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, if (vcpu->arch.msr & MSR_PR) { /* Program traps generated by user-level software must be handled * by the guest kernel. */ - vcpu->arch.esr = vcpu->arch.fault_esr; - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM); + kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr); r = RESUME_GUEST; kvmppc_account_exit(vcpu, USR_PR_INST); break; @@ -280,16 +317,14 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOKE_INTERRUPT_DATA_STORAGE: - vcpu->arch.dear = vcpu->arch.fault_dear; - vcpu->arch.esr = vcpu->arch.fault_esr; - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE); + kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear, + vcpu->arch.fault_esr); kvmppc_account_exit(vcpu, DSI_EXITS); r = RESUME_GUEST; break; case BOOKE_INTERRUPT_INST_STORAGE: - vcpu->arch.esr = vcpu->arch.fault_esr; - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); + kvmppc_core_queue_inst_storage(vcpu, vcpu->arch.fault_esr); kvmppc_account_exit(vcpu, ISI_EXITS); r = RESUME_GUEST; break; @@ -310,9 +345,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr); if (gtlb_index < 0) { /* The guest didn't have a mapping for it. */ - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS); - vcpu->arch.dear = vcpu->arch.fault_dear; - vcpu->arch.esr = vcpu->arch.fault_esr; + kvmppc_core_queue_dtlb_miss(vcpu, + vcpu->arch.fault_dear, + vcpu->arch.fault_esr); kvmppc_mmu_dtlb_miss(vcpu); kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS); r = RESUME_GUEST; @@ -426,7 +461,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { vcpu->arch.pc = 0; vcpu->arch.msr = 0; - vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */ + kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ vcpu->arch.shadow_pid = 1; @@ -444,10 +479,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; regs->pc = vcpu->arch.pc; - regs->cr = vcpu->arch.cr; + regs->cr = kvmppc_get_cr(vcpu); regs->ctr = vcpu->arch.ctr; regs->lr = vcpu->arch.lr; - regs->xer = vcpu->arch.xer; + regs->xer = kvmppc_get_xer(vcpu); regs->msr = vcpu->arch.msr; regs->srr0 = vcpu->arch.srr0; regs->srr1 = vcpu->arch.srr1; @@ -461,7 +496,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg7 = vcpu->arch.sprg6; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) - regs->gpr[i] = vcpu->arch.gpr[i]; + regs->gpr[i] = kvmppc_get_gpr(vcpu, i); return 0; } @@ -471,10 +506,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int i; vcpu->arch.pc = regs->pc; - vcpu->arch.cr = regs->cr; + kvmppc_set_cr(vcpu, regs->cr); vcpu->arch.ctr = regs->ctr; vcpu->arch.lr = regs->lr; - vcpu->arch.xer = regs->xer; + kvmppc_set_xer(vcpu, regs->xer); kvmppc_set_msr(vcpu, regs->msr); vcpu->arch.srr0 = regs->srr0; vcpu->arch.srr1 = regs->srr1; @@ -486,8 +521,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.sprg6 = regs->sprg5; vcpu->arch.sprg7 = regs->sprg6; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++) - vcpu->arch.gpr[i] = regs->gpr[i]; + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + kvmppc_set_gpr(vcpu, i, regs->gpr[i]); return 0; } diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index aebc65e93f4..cbc790ee192 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -62,20 +62,20 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case OP_31_XOP_MFMSR: rt = get_rt(inst); - vcpu->arch.gpr[rt] = vcpu->arch.msr; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.msr); kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS); break; case OP_31_XOP_MTMSR: rs = get_rs(inst); kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS); - kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]); + kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_WRTEE: rs = get_rs(inst); vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) - | (vcpu->arch.gpr[rs] & MSR_EE); + | (kvmppc_get_gpr(vcpu, rs) & MSR_EE); kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); break; @@ -101,22 +101,23 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { int emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_DEAR: - vcpu->arch.dear = vcpu->arch.gpr[rs]; break; + vcpu->arch.dear = spr_val; break; case SPRN_ESR: - vcpu->arch.esr = vcpu->arch.gpr[rs]; break; + vcpu->arch.esr = spr_val; break; case SPRN_DBCR0: - vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.dbcr0 = spr_val; break; case SPRN_DBCR1: - vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.dbcr1 = spr_val; break; case SPRN_DBSR: - vcpu->arch.dbsr &= ~vcpu->arch.gpr[rs]; break; + vcpu->arch.dbsr &= ~spr_val; break; case SPRN_TSR: - vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break; + vcpu->arch.tsr &= ~spr_val; break; case SPRN_TCR: - vcpu->arch.tcr = vcpu->arch.gpr[rs]; + vcpu->arch.tcr = spr_val; kvmppc_emulate_dec(vcpu); break; @@ -124,64 +125,64 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) * loaded into the real SPRGs when resuming the * guest. */ case SPRN_SPRG4: - vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg4 = spr_val; break; case SPRN_SPRG5: - vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg5 = spr_val; break; case SPRN_SPRG6: - vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg6 = spr_val; break; case SPRN_SPRG7: - vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg7 = spr_val; break; case SPRN_IVPR: - vcpu->arch.ivpr = vcpu->arch.gpr[rs]; + vcpu->arch.ivpr = spr_val; break; case SPRN_IVOR0: - vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val; break; case SPRN_IVOR1: - vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = spr_val; break; case SPRN_IVOR2: - vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val; break; case SPRN_IVOR3: - vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val; break; case SPRN_IVOR4: - vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = spr_val; break; case SPRN_IVOR5: - vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = spr_val; break; case SPRN_IVOR6: - vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = spr_val; break; case SPRN_IVOR7: - vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = spr_val; break; case SPRN_IVOR8: - vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val; break; case SPRN_IVOR9: - vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val; break; case SPRN_IVOR10: - vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = spr_val; break; case SPRN_IVOR11: - vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = spr_val; break; case SPRN_IVOR12: - vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = spr_val; break; case SPRN_IVOR13: - vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = spr_val; break; case SPRN_IVOR14: - vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = spr_val; break; case SPRN_IVOR15: - vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val; break; default: @@ -197,65 +198,65 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_IVPR: - vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break; case SPRN_DEAR: - vcpu->arch.gpr[rt] = vcpu->arch.dear; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break; case SPRN_ESR: - vcpu->arch.gpr[rt] = vcpu->arch.esr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; case SPRN_DBCR0: - vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break; case SPRN_DBCR1: - vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break; case SPRN_DBSR: - vcpu->arch.gpr[rt] = vcpu->arch.dbsr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break; case SPRN_IVOR0: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]); break; case SPRN_IVOR1: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]); break; case SPRN_IVOR2: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]); break; case SPRN_IVOR3: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]); break; case SPRN_IVOR4: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]); break; case SPRN_IVOR5: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]); break; case SPRN_IVOR6: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]); break; case SPRN_IVOR7: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]); break; case SPRN_IVOR8: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]); break; case SPRN_IVOR9: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]); break; case SPRN_IVOR10: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]); break; case SPRN_IVOR11: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]); break; case SPRN_IVOR12: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]); break; case SPRN_IVOR13: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]); break; case SPRN_IVOR14: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]); break; case SPRN_IVOR15: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]); break; default: diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 64949eef43f..efa1198940a 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -60,6 +60,12 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_e500_tlb_setup(vcpu_e500); + /* Registers init */ + vcpu->arch.pvr = mfspr(SPRN_PVR); + + /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ + vcpu->vcpu_id = 0; + return 0; } diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index be95b8d8e3b..8e3edfbc963 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -74,54 +74,59 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_PID: vcpu_e500->pid[0] = vcpu->arch.shadow_pid = - vcpu->arch.pid = vcpu->arch.gpr[rs]; + vcpu->arch.pid = spr_val; break; case SPRN_PID1: - vcpu_e500->pid[1] = vcpu->arch.gpr[rs]; break; + vcpu_e500->pid[1] = spr_val; break; case SPRN_PID2: - vcpu_e500->pid[2] = vcpu->arch.gpr[rs]; break; + vcpu_e500->pid[2] = spr_val; break; case SPRN_MAS0: - vcpu_e500->mas0 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas0 = spr_val; break; case SPRN_MAS1: - vcpu_e500->mas1 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas1 = spr_val; break; case SPRN_MAS2: - vcpu_e500->mas2 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas2 = spr_val; break; case SPRN_MAS3: - vcpu_e500->mas3 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas3 = spr_val; break; case SPRN_MAS4: - vcpu_e500->mas4 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas4 = spr_val; break; case SPRN_MAS6: - vcpu_e500->mas6 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas6 = spr_val; break; case SPRN_MAS7: - vcpu_e500->mas7 = vcpu->arch.gpr[rs]; break; + vcpu_e500->mas7 = spr_val; break; + case SPRN_L1CSR0: + vcpu_e500->l1csr0 = spr_val; + vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC); + break; case SPRN_L1CSR1: - vcpu_e500->l1csr1 = vcpu->arch.gpr[rs]; break; + vcpu_e500->l1csr1 = spr_val; break; case SPRN_HID0: - vcpu_e500->hid0 = vcpu->arch.gpr[rs]; break; + vcpu_e500->hid0 = spr_val; break; case SPRN_HID1: - vcpu_e500->hid1 = vcpu->arch.gpr[rs]; break; + vcpu_e500->hid1 = spr_val; break; case SPRN_MMUCSR0: emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500, - vcpu->arch.gpr[rs]); + spr_val); break; /* extra exceptions */ case SPRN_IVOR32: - vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = spr_val; break; case SPRN_IVOR33: - vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = spr_val; break; case SPRN_IVOR34: - vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = spr_val; break; case SPRN_IVOR35: - vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = vcpu->arch.gpr[rs]; + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val; break; default: @@ -138,63 +143,57 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) switch (sprn) { case SPRN_PID: - vcpu->arch.gpr[rt] = vcpu_e500->pid[0]; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break; case SPRN_PID1: - vcpu->arch.gpr[rt] = vcpu_e500->pid[1]; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[1]); break; case SPRN_PID2: - vcpu->arch.gpr[rt] = vcpu_e500->pid[2]; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break; case SPRN_MAS0: - vcpu->arch.gpr[rt] = vcpu_e500->mas0; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break; case SPRN_MAS1: - vcpu->arch.gpr[rt] = vcpu_e500->mas1; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break; case SPRN_MAS2: - vcpu->arch.gpr[rt] = vcpu_e500->mas2; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break; case SPRN_MAS3: - vcpu->arch.gpr[rt] = vcpu_e500->mas3; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break; case SPRN_MAS4: - vcpu->arch.gpr[rt] = vcpu_e500->mas4; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break; case SPRN_MAS6: - vcpu->arch.gpr[rt] = vcpu_e500->mas6; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break; case SPRN_MAS7: - vcpu->arch.gpr[rt] = vcpu_e500->mas7; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break; case SPRN_TLB0CFG: - vcpu->arch.gpr[rt] = mfspr(SPRN_TLB0CFG); - vcpu->arch.gpr[rt] &= ~0xfffUL; - vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[0]; - break; - + kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; case SPRN_TLB1CFG: - vcpu->arch.gpr[rt] = mfspr(SPRN_TLB1CFG); - vcpu->arch.gpr[rt] &= ~0xfffUL; - vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[1]; - break; - + kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb1cfg); break; + case SPRN_L1CSR0: + kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break; case SPRN_L1CSR1: - vcpu->arch.gpr[rt] = vcpu_e500->l1csr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr1); break; case SPRN_HID0: - vcpu->arch.gpr[rt] = vcpu_e500->hid0; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break; case SPRN_HID1: - vcpu->arch.gpr[rt] = vcpu_e500->hid1; break; + kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break; case SPRN_MMUCSR0: - vcpu->arch.gpr[rt] = 0; break; + kvmppc_set_gpr(vcpu, rt, 0); break; case SPRN_MMUCFG: - vcpu->arch.gpr[rt] = mfspr(SPRN_MMUCFG); break; + kvmppc_set_gpr(vcpu, rt, mfspr(SPRN_MMUCFG)); break; /* extra exceptions */ case SPRN_IVOR32: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]); break; case SPRN_IVOR33: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]); break; case SPRN_IVOR34: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]); break; case SPRN_IVOR35: - vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]); break; default: emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index fb1e1dc11ba..0d772e6b631 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -417,7 +417,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) int esel, tlbsel; gva_t ea; - ea = ((ra) ? vcpu->arch.gpr[ra] : 0) + vcpu->arch.gpr[rb]; + ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb); ia = (ea >> 2) & 0x1; @@ -470,7 +470,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) struct tlbe *gtlbe = NULL; gva_t ea; - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); @@ -728,6 +728,12 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (vcpu_e500->shadow_pages[1] == NULL) goto err_out_page0; + /* Init TLB configuration register */ + vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; + vcpu_e500->tlb0cfg |= vcpu_e500->guest_tlb_size[0]; + vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; + vcpu_e500->tlb1cfg |= vcpu_e500->guest_tlb_size[1]; + return 0; err_out_page0: diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 4a9ac6640fa..cb72a65f4ec 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -83,6 +83,9 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) pr_debug("mtDEC: %x\n", vcpu->arch.dec); #ifdef CONFIG_PPC64 + /* mtdec lowers the interrupt line when positive. */ + kvmppc_core_dequeue_dec(vcpu); + /* POWER4+ triggers a dec interrupt if the value is < 0 */ if (vcpu->arch.dec & 0x80000000) { hrtimer_try_to_cancel(&vcpu->arch.dec_timer); @@ -140,14 +143,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) pr_debug(KERN_INFO "Emulating opcode %d / %d\n", get_op(inst), get_xop(inst)); + /* Try again next time */ + if (inst == KVM_INST_FETCH_FAILED) + return EMULATE_DONE; + switch (get_op(inst)) { case OP_TRAP: #ifdef CONFIG_PPC64 case OP_TRAP_64: + kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); #else - vcpu->arch.esr |= ESR_PTR; + kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR); #endif - kvmppc_core_queue_program(vcpu); advance = 0; break; @@ -167,14 +174,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_31_XOP_STWX: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 4, 1); break; case OP_31_XOP_STBX: rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 1, 1); break; @@ -183,14 +190,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rb = get_rb(inst); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 1, 1); - vcpu->arch.gpr[rs] = ea; + kvmppc_set_gpr(vcpu, rs, ea); break; case OP_31_XOP_LHZX: @@ -203,12 +210,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rb = get_rb(inst); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); - vcpu->arch.gpr[ra] = ea; + kvmppc_set_gpr(vcpu, ra, ea); break; case OP_31_XOP_MFSPR: @@ -217,47 +224,49 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) switch (sprn) { case SPRN_SRR0: - vcpu->arch.gpr[rt] = vcpu->arch.srr0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr0); break; case SPRN_SRR1: - vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr1); break; case SPRN_PVR: - vcpu->arch.gpr[rt] = vcpu->arch.pvr; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break; case SPRN_PIR: - vcpu->arch.gpr[rt] = vcpu->vcpu_id; break; + kvmppc_set_gpr(vcpu, rt, vcpu->vcpu_id); break; case SPRN_MSSSR0: - vcpu->arch.gpr[rt] = 0; break; + kvmppc_set_gpr(vcpu, rt, 0); break; /* Note: mftb and TBRL/TBWL are user-accessible, so * the guest can always access the real TB anyways. * In fact, we probably will never see these traps. */ case SPRN_TBWL: - vcpu->arch.gpr[rt] = get_tb() >> 32; break; + kvmppc_set_gpr(vcpu, rt, get_tb() >> 32); break; case SPRN_TBWU: - vcpu->arch.gpr[rt] = get_tb(); break; + kvmppc_set_gpr(vcpu, rt, get_tb()); break; case SPRN_SPRG0: - vcpu->arch.gpr[rt] = vcpu->arch.sprg0; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg0); break; case SPRN_SPRG1: - vcpu->arch.gpr[rt] = vcpu->arch.sprg1; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg1); break; case SPRN_SPRG2: - vcpu->arch.gpr[rt] = vcpu->arch.sprg2; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg2); break; case SPRN_SPRG3: - vcpu->arch.gpr[rt] = vcpu->arch.sprg3; break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg3); break; /* Note: SPRG4-7 are user-readable, so we don't get * a trap. */ case SPRN_DEC: { u64 jd = get_tb() - vcpu->arch.dec_jiffies; - vcpu->arch.gpr[rt] = vcpu->arch.dec - jd; - pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n", vcpu->arch.dec, jd, vcpu->arch.gpr[rt]); + kvmppc_set_gpr(vcpu, rt, vcpu->arch.dec - jd); + pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n", + vcpu->arch.dec, jd, + kvmppc_get_gpr(vcpu, rt)); break; } default: emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt); if (emulated == EMULATE_FAIL) { printk("mfspr: unknown spr %x\n", sprn); - vcpu->arch.gpr[rt] = 0; + kvmppc_set_gpr(vcpu, rt, 0); } break; } @@ -269,7 +278,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rb = get_rb(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 2, 1); break; @@ -278,14 +287,14 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rb = get_rb(inst); - ea = vcpu->arch.gpr[rb]; + ea = kvmppc_get_gpr(vcpu, rb); if (ra) - ea += vcpu->arch.gpr[ra]; + ea += kvmppc_get_gpr(vcpu, ra); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 2, 1); - vcpu->arch.gpr[ra] = ea; + kvmppc_set_gpr(vcpu, ra, ea); break; case OP_31_XOP_MTSPR: @@ -293,9 +302,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rs = get_rs(inst); switch (sprn) { case SPRN_SRR0: - vcpu->arch.srr0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.srr0 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SRR1: - vcpu->arch.srr1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.srr1 = kvmppc_get_gpr(vcpu, rs); break; /* XXX We need to context-switch the timebase for * watchdog and FIT. */ @@ -305,18 +314,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_MSSSR0: break; case SPRN_DEC: - vcpu->arch.dec = vcpu->arch.gpr[rs]; + vcpu->arch.dec = kvmppc_get_gpr(vcpu, rs); kvmppc_emulate_dec(vcpu); break; case SPRN_SPRG0: - vcpu->arch.sprg0 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg0 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SPRG1: - vcpu->arch.sprg1 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg1 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SPRG2: - vcpu->arch.sprg2 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg2 = kvmppc_get_gpr(vcpu, rs); break; case SPRN_SPRG3: - vcpu->arch.sprg3 = vcpu->arch.gpr[rs]; break; + vcpu->arch.sprg3 = kvmppc_get_gpr(vcpu, rs); break; default: emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs); @@ -348,7 +357,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rb = get_rb(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 4, 0); break; @@ -363,7 +372,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rb = get_rb(inst); emulated = kvmppc_handle_store(run, vcpu, - vcpu->arch.gpr[rs], + kvmppc_get_gpr(vcpu, rs), 2, 0); break; @@ -382,7 +391,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_LBZ: @@ -394,35 +403,39 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_STW: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 4, 1); break; case OP_STWU: ra = get_ra(inst); rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 4, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_STB: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_STBU: ra = get_ra(inst); rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 1, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_LHZ: @@ -434,21 +447,23 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) ra = get_ra(inst); rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; case OP_STH: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_STHU: ra = get_ra(inst); rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs], + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 2, 1); - vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed; + kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); break; default: @@ -461,6 +476,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) advance = 0; printk(KERN_ERR "Couldn't emulate instruction 0x%08x " "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst)); + kvmppc_core_queue_program(vcpu, 0); } } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index f06cf93b178..51aedd7f16b 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -137,6 +137,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) { kvmppc_free_vcpus(kvm); kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm); } @@ -165,14 +166,24 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) { return 0; } +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return; +} + + void kvm_arch_flush_shadow(struct kvm *kvm) { } @@ -260,34 +271,35 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { - ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr]; - *gpr = run->dcr.data; + kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, run->dcr.data); } static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { - ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr]; + ulong gpr; - if (run->mmio.len > sizeof(*gpr)) { + if (run->mmio.len > sizeof(gpr)) { printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len); return; } if (vcpu->arch.mmio_is_bigendian) { switch (run->mmio.len) { - case 4: *gpr = *(u32 *)run->mmio.data; break; - case 2: *gpr = *(u16 *)run->mmio.data; break; - case 1: *gpr = *(u8 *)run->mmio.data; break; + case 4: gpr = *(u32 *)run->mmio.data; break; + case 2: gpr = *(u16 *)run->mmio.data; break; + case 1: gpr = *(u8 *)run->mmio.data; break; } } else { /* Convert BE data from userland back to LE. */ switch (run->mmio.len) { - case 4: *gpr = ld_le32((u32 *)run->mmio.data); break; - case 2: *gpr = ld_le16((u16 *)run->mmio.data); break; - case 1: *gpr = *(u8 *)run->mmio.data; break; + case 4: gpr = ld_le32((u32 *)run->mmio.data); break; + case 2: gpr = ld_le16((u16 *)run->mmio.data); break; + case 1: gpr = *(u8 *)run->mmio.data; break; } } + + kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); } int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3fa0a10e466..49292869a5c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -242,6 +242,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_free_physmem(kvm); free_page((unsigned long)(kvm->arch.sca)); debug_unregister(kvm->arch.dbf); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm); } @@ -690,14 +691,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } /* Section: memory related */ -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) { - int i; - struct kvm_vcpu *vcpu; - /* A few sanity checks. We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a page boundary in userland and which has to end at a page boundary. @@ -720,14 +719,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (!user_alloc) return -EINVAL; + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + int i; + struct kvm_vcpu *vcpu; + /* request update of sie control block for all available vcpus */ kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) continue; kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP); } - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 06cce8285ba..60f09ab3672 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -67,10 +67,14 @@ static inline long kvm_s390_vcpu_get_memsize(struct kvm_vcpu *vcpu) static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu) { + int idx; struct kvm_memory_slot *mem; + struct kvm_memslots *memslots; - down_read(&vcpu->kvm->slots_lock); - mem = &vcpu->kvm->memslots[0]; + idx = srcu_read_lock(&vcpu->kvm->srcu); + memslots = rcu_dereference(vcpu->kvm->memslots); + + mem = &memslots->memslots[0]; vcpu->arch.sie_block->gmsor = mem->userspace_addr; vcpu->arch.sie_block->gmslm = @@ -78,7 +82,7 @@ static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu) (mem->npages << PAGE_SHIFT) + VIRTIODESCSPACE - 1ul; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } /* implemented in priv.c */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 57ccdcec146..f15f37bfbd6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86 select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS select HAVE_KRETPROBES + select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 9f828f87ca3..493092efaa3 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,6 +11,7 @@ header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h header-y += hw_breakpoint.h +header-y += hyperv.h unifdef-y += e820.h unifdef-y += ist.h diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index f1e253ceba4..b09ec55650b 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -165,10 +165,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. * More care must be taken when modifying code in the SMP case because of - * Intel's errata. + * Intel's errata. text_poke_smp() takes care that errata, but still + * doesn't support NMI/MCE handler code modifying. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_smp(void *addr, const void *opcode, size_t len); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h new file mode 100644 index 00000000000..e153a2b3889 --- /dev/null +++ b/arch/x86/include/asm/hyperv.h @@ -0,0 +1,186 @@ +#ifndef _ASM_X86_KVM_HYPERV_H +#define _ASM_X86_KVM_HYPERV_H + +#include <linux/types.h> + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). + */ +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 + +/* + * Feature identification. EAX indicates which features are available + * to the partition based upon the current partition privileges. + */ + +/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) +/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ +#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* + * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM + * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available + */ +#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) +/* + * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through + * HV_X64_MSR_STIMER3_COUNT) available + */ +#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3) +/* + * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) + * are available + */ +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) +/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ +#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) +/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ +#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) +/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ +#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) + /* + * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, + * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available + */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) + +/* + * Feature identification: EBX indicates which flags were specified at + * partition creation. The format is the same as the partition creation + * flag structure defined in section Partition Creation Flags. + */ +#define HV_X64_CREATE_PARTITIONS (1 << 0) +#define HV_X64_ACCESS_PARTITION_ID (1 << 1) +#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) +#define HV_X64_POST_MESSAGES (1 << 4) +#define HV_X64_SIGNAL_EVENTS (1 << 5) +#define HV_X64_CREATE_PORT (1 << 6) +#define HV_X64_CONNECT_PORT (1 << 7) +#define HV_X64_ACCESS_STATS (1 << 8) +#define HV_X64_DEBUGGING (1 << 11) +#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) +#define HV_X64_CONFIGURE_PROFILER (1 << 13) + +/* + * Feature identification. EDX indicates which miscellaneous features + * are available to the partition. + */ +/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ +#define HV_X64_MWAIT_AVAILABLE (1 << 0) +/* Guest debugging support is available */ +#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) +/* Performance Monitor support is available*/ +#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) +/* Support for physical CPU dynamic partitioning events is available*/ +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) +/* + * Support for passing hypercall input parameter block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) +/* Support for a virtual guest idle state is available */ +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) + +/* + * Implementation recommendations. Indicates which behaviors the hypervisor + * recommends the OS implement for optimal performance. + */ + /* + * Recommend using hypercall for address space switches rather + * than MOV to CR3 instruction + */ +#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +/* Recommend using hypercall for local TLB flushes rather + * than INVLPG or MOV to CR3 instructions */ +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) +/* + * Recommend using hypercall for remote TLB flushes rather + * than inter-processor interrupts + */ +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) +/* + * Recommend using MSRs for accessing APIC registers + * EOI, ICR and TPR rather than their memory-mapped counterparts + */ +#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) +/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ +#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) +/* + * Recommend using relaxed timing for this partition. If used, + * the VM should disable any watchdog timeouts that rely on the + * timely delivery of external interrupts + */ +#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) + +/* MSR used to identify the guest OS. */ +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 + +/* MSR used to setup pages used to communicate with the hypervisor. */ +#define HV_X64_MSR_HYPERCALL 0x40000001 + +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX 0x40000002 + +/* Define the virtual APIC registers */ +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073 + +/* Define synthetic interrupt controller model specific registers. */ +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F + + +#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) + +/* Declare the various hypercall operations. */ +#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008 + +#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +#define HV_PROCESSOR_POWER_STATE_C0 0 +#define HV_PROCESSOR_POWER_STATE_C1 1 +#define HV_PROCESSOR_POWER_STATE_C2 2 +#define HV_PROCESSOR_POWER_STATE_C3 3 + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 + +#endif diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 4fe681de1e7..4ffa345a8cc 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -32,7 +32,10 @@ struct kprobe; typedef u8 kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xcc -#define RELATIVEJUMP_INSTRUCTION 0xe9 +#define RELATIVEJUMP_OPCODE 0xe9 +#define RELATIVEJUMP_SIZE 5 +#define RELATIVECALL_OPCODE 0xe8 +#define RELATIVE_ADDR_SIZE 4 #define MAX_INSN_SIZE 16 #define MAX_STACK_SIZE 64 #define MIN_STACK_SIZE(ADDR) \ @@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) +/* optinsn template addresses */ +extern kprobe_opcode_t optprobe_template_entry; +extern kprobe_opcode_t optprobe_template_val; +extern kprobe_opcode_t optprobe_template_call; +extern kprobe_opcode_t optprobe_template_end; +#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) +#define MAX_OPTINSN_SIZE \ + (((unsigned long)&optprobe_template_end - \ + (unsigned long)&optprobe_template_entry) + \ + MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) + extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); @@ -64,6 +78,21 @@ struct arch_specific_insn { int boostable; }; +struct arch_optimized_insn { + /* copy of the original instructions */ + kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE]; + /* detour code buffer */ + kprobe_opcode_t *insn; + /* the size of instructions copied to detour code buffer */ + size_t size; +}; + +/* Return true (!0) if optinsn is prepared for optimization. */ +static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn) +{ + return optinsn->size; +} + struct prev_kprobe { struct kprobe *kp; unsigned long status; diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7c18e1230f5..7a6f54fa13b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -54,13 +54,23 @@ struct x86_emulate_ctxt; struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. + * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); + + /* + * fetch: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*fetch)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* * read_emulated: Read bytes from emulated/special memory area. @@ -74,7 +84,7 @@ struct x86_emulate_ops { struct kvm_vcpu *vcpu); /* - * write_emulated: Read bytes from emulated/special memory area. + * write_emulated: Write bytes to emulated/special memory area. * @addr: [IN ] Linear address to which to write. * @val: [IN ] Value to write to memory (low-order bytes used as * required). @@ -168,6 +178,7 @@ struct x86_emulate_ctxt { /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f865e8b854..06d9e79ca37 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,7 +25,7 @@ #include <asm/mtrr.h> #include <asm/msr-index.h> -#define KVM_MAX_VCPUS 16 +#define KVM_MAX_VCPUS 64 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 @@ -38,19 +38,6 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) @@ -256,7 +243,8 @@ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, + u32 *error); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -282,13 +270,15 @@ struct kvm_vcpu_arch { u32 regs_dirty; unsigned long cr0; + unsigned long cr0_guest_owned_bits; unsigned long cr2; unsigned long cr3; unsigned long cr4; + unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; u64 pdptrs[4]; /* pae */ - u64 shadow_efer; + u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ int32_t apic_arb_prio; @@ -374,17 +364,27 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ u16 singlestep_cs; unsigned long singlestep_rip; + /* fields used by HYPER-V emulation */ + u64 hv_vapic; }; struct kvm_mem_alias { gfn_t base_gfn; unsigned long npages; gfn_t target_gfn; +#define KVM_ALIAS_INVALID 1UL + unsigned long flags; }; -struct kvm_arch{ - int naliases; +#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION + +struct kvm_mem_aliases { struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; + int naliases; +}; + +struct kvm_arch { + struct kvm_mem_aliases *aliases; unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; @@ -416,6 +416,10 @@ struct kvm_arch{ s64 kvmclock_offset; struct kvm_xen_hvm_config xen_hvm_config; + + /* fields used by HYPER-V emulation */ + u64 hv_guest_os_id; + u64 hv_hypercall; }; struct kvm_vm_stat { @@ -471,6 +475,7 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); + void (*cpuid_update)(struct kvm_vcpu *vcpu); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); @@ -492,6 +497,7 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -501,12 +507,13 @@ struct kvm_x86_ops { void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); - void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception); + int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); + int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_activate)(struct kvm_vcpu *vcpu); + void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -531,7 +538,8 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - bool (*gb_page_enable)(void); + int (*get_lpage_level)(void); + bool (*rdtscp_supported)(void); const struct trace_print_flags *exit_reasons_str; }; @@ -606,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg); +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); @@ -653,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -666,6 +677,7 @@ void kvm_disable_tdp(void); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +bool kvm_check_iopl(struct kvm_vcpu *vcpu); struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c584076a47f..ffae1420e7d 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -2,6 +2,7 @@ #define _ASM_X86_KVM_PARA_H #include <linux/types.h> +#include <asm/hyperv.h> /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1fecb7e6113..38638cd2fa4 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXIT_ERR -1 -#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b4945419a8..fb9a080740e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -53,6 +53,7 @@ */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_RDTSCP 0x00000008 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 @@ -251,6 +252,7 @@ enum vmcs_field { #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 @@ -362,6 +364,7 @@ enum vmcs_field { #define VMX_EPTP_UC_BIT (1ull << 8) #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) +#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -374,7 +377,7 @@ enum vmcs_field { #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull -#define VMX_EPT_IGMT_BIT (1ull << 6) +#define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e6ea0342c8f..3a4bf35c179 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -7,6 +7,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> +#include <linux/stop_machine.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -572,3 +573,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) local_irq_restore(flags); return addr; } + +/* + * Cross-modifying kernel text with stop_machine(). + * This code originally comes from immediate value. + */ +static atomic_t stop_machine_first; +static int wrote_text; + +struct text_poke_params { + void *addr; + const void *opcode; + size_t len; +}; + +static int __kprobes stop_machine_text_poke(void *data) +{ + struct text_poke_params *tpp = data; + + if (atomic_dec_and_test(&stop_machine_first)) { + text_poke(tpp->addr, tpp->opcode, tpp->len); + smp_wmb(); /* Make sure other cpus see that this has run */ + wrote_text = 1; + } else { + while (!wrote_text) + cpu_relax(); + smp_mb(); /* Load wrote_text before following execution */ + } + + flush_icache_range((unsigned long)tpp->addr, + (unsigned long)tpp->addr + tpp->len); + return 0; +} + +/** + * text_poke_smp - Update instructions on a live kernel on SMP + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Modify multi-byte instruction by using stop_machine() on SMP. This allows + * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying + * should be allowed, since stop_machine() does _not_ protect code against + * NMI and MCE. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) +{ + struct text_poke_params tpp; + + tpp.addr = addr; + tpp.opcode = opcode; + tpp.len = len; + atomic_set(&stop_machine_first, 1); + wrote_text = 0; + stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + return addr; +} + diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index fe4622e8c83..79556bd9b60 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -145,6 +145,7 @@ struct set_mtrr_data { /** * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * @info: pointer to mtrr configuration data * * Returns nothing. */ diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 5de9f4a9c3f..b43bbaebe2c 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -49,6 +49,7 @@ #include <linux/module.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> +#include <linux/ftrace.h> #include <asm/cacheflush.h> #include <asm/desc.h> @@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { }; const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes set_jmp_op(void *from, void *to) +static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) { - struct __arch_jmp_op { - char op; + struct __arch_relative_insn { + u8 op; s32 raddr; - } __attribute__((packed)) * jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_INSTRUCTION; + } __attribute__((packed)) *insn; + + insn = (struct __arch_relative_insn *)from; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes synthesize_reljump(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } /* @@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) /* * Basically, kp->ainsn.insn has an original instruction. * However, RIP-relative instruction can not do single-stepping - * at different place, fix_riprel() tweaks the displacement of + * at different place, __copy_instruction() tweaks the displacement of * that instruction. In that case, we can't recover the instruction * from the kp->ainsn.insn. * @@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) } /* - * Adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. * If it does, Return the address of the 32-bit displacement word. * If not, return null. * Only applicable to 64-bit x86. */ -static void __kprobes fix_riprel(struct kprobe *p) +static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) { -#ifdef CONFIG_X86_64 struct insn insn; - kernel_insn_init(&insn, p->ainsn.insn); + int ret; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + kernel_insn_init(&insn, src); + if (recover) { + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, + (unsigned long)src); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + } + insn_get_length(&insn); + memcpy(dest, insn.kaddr, insn.length); + +#ifdef CONFIG_X86_64 if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; + kernel_insn_init(&insn, dest); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing @@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) p->addr + (s64) insn.displacement.value - - (u8 *) p->ainsn.insn; + newdisp = (u8 *) src + (s64) insn.displacement.value - + (u8 *) dest; BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ - disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(&insn); *(s32 *) disp = (s32) newdisp; } #endif + return insn.length; } static void __kprobes arch_copy_kprobe(struct kprobe *p) { - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - - fix_riprel(p); + /* + * Copy an instruction without recovering int3, because it will be + * put by another subsystem. + */ + __copy_instruction(p->ainsn.insn, p->addr, 0); if (can_boost(p->addr)) p->ainsn.boostable = 0; @@ -406,18 +432,6 @@ static void __kprobes restore_btf(void) update_debugctlmsr(current->thread.debugctlmsr); } -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; -} - void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -429,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, *sara = (unsigned long) &kretprobe_trampoline; } +#ifdef CONFIG_OPTPROBES +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter); +#else +#define setup_detour_execution(p, regs, reenter) (0) +#endif + static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, int reenter) { + if (setup_detour_execution(p, regs, reenter)) + return; + #if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ regs->ip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return; } #endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + /* Prepare real single stepping */ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; } /* @@ -456,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: case KPROBE_HIT_ACTIVE: - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; + setup_singlestep(p, regs, kcb, 1); break; case KPROBE_HIT_SS: /* A probe has been hit in the codepath leading up to, or just @@ -535,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * more here. */ if (!p->pre_handler || !p->pre_handler(p, regs)) - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } else if (kprobe_running()) { p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } /* else: not a kprobe fault; let the kernel handle it */ @@ -550,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 0; } +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %ds\n" \ + " pushl %es\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. @@ -563,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* We don't bother saving the ss register */ " pushq %rsp\n" " pushfq\n" - /* - * Skip cs, ip, orig_ax. - * trampoline_handler() will plug in these values - */ - " subq $24, %rsp\n" - " pushq %rdi\n" - " pushq %rsi\n" - " pushq %rdx\n" - " pushq %rcx\n" - " pushq %rax\n" - " pushq %r8\n" - " pushq %r9\n" - " pushq %r10\n" - " pushq %r11\n" - " pushq %rbx\n" - " pushq %rbp\n" - " pushq %r12\n" - " pushq %r13\n" - " pushq %r14\n" - " pushq %r15\n" + SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ " movq %rax, 152(%rsp)\n" - " popq %r15\n" - " popq %r14\n" - " popq %r13\n" - " popq %r12\n" - " popq %rbp\n" - " popq %rbx\n" - " popq %r11\n" - " popq %r10\n" - " popq %r9\n" - " popq %r8\n" - " popq %rax\n" - " popq %rcx\n" - " popq %rdx\n" - " popq %rsi\n" - " popq %rdi\n" - /* Skip orig_ax, ip, cs */ - " addq $24, %rsp\n" + RESTORE_REGS_STRING " popfq\n" #else " pushf\n" - /* - * Skip cs, ip, orig_ax and gs. - * trampoline_handler() will plug in these values - */ - " subl $16, %esp\n" - " pushl %fs\n" - " pushl %es\n" - " pushl %ds\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" + SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ @@ -629,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ " movl %eax, 56(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* Skip ds, es, fs, gs, orig_ax and ip */ - " addl $24, %esp\n" + RESTORE_REGS_STRING " popf\n" #endif " ret\n"); @@ -805,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p, * These instructions can be executed directly if it * jumps back to correct address. */ - set_jmp_op((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); + synthesize_reljump((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); p->ainsn.boostable = 1; } else { p->ainsn.boostable = -1; @@ -1033,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } + +#ifdef CONFIG_OPTPROBES + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +static void __kprobes synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, + unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +void __kprobes kprobes_optinsn_template_holder(void) +{ + asm volatile ( + ".global optprobe_template_entry\n" + "optprobe_template_entry: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end: \n"); +} + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, + struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + preempt_disable(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __get_cpu_var(current_kprobe) = &op->kp; + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __get_cpu_var(current_kprobe) = NULL; + } + preempt_enable_no_resched(); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len, 1); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ + int ret; + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + /* Dummy buffers for lookup_symbol_attrs */ + static char __dummy_buf[KSYM_NAME_LEN]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. + */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a breakpoint (int3) with a relative jump. */ +int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +{ + unsigned char jmp_code[RELATIVEJUMP_SIZE]; + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + jmp_code[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&jmp_code[1]) = rel; + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} +#endif + int __init arch_init_kprobes(void) { return 0; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9055e5872ff..1c0c6ab9c60 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,8 @@ static int __init vsyscall_init(void) register_sysctl_table(kernel_root_table2); #endif on_each_cpu(cpu_vsyscall_init, NULL, 1); - hotcpu_notifier(cpu_vsyscall_notifier, 0); + /* notifier priority > KVM */ + hotcpu_notifier(cpu_vsyscall_notifier, 30); return 0; } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3c4d0109ad2..970bbd47951 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE select USER_RETURN_NOTIFIER + select KVM_MMIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e8faea4651..4dade6ac082 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -32,7 +32,7 @@ #include <linux/module.h> #include <asm/kvm_emulate.h> -#include "mmu.h" /* for is_long_mode() */ +#include "x86.h" /* * Opcode effective-address decode tables. @@ -76,6 +76,8 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define Lock (1<<26) /* lock prefix is allowed for the instruction */ +#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ #define Src2None (0<<29) @@ -88,39 +90,40 @@ enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group8, Group9, }; static u32 opcode_table[256] = { /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, 0, /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x38 - 0x3F */ @@ -156,7 +159,7 @@ static u32 opcode_table[256] = { Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -210,7 +213,7 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -218,16 +221,20 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, Group | GroupDual | Group7, 0, 0, + 0, ImplicitOps, ImplicitOps | Priv, 0, + ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, + 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + ModRM | ImplicitOps | Priv, ModRM | Priv, + ModRM | ImplicitOps | Priv, ModRM | Priv, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, + ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, + ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -257,21 +264,23 @@ static u32 twobyte_table[256] = { DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, ModRM, 0, /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, + Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, + 0, 0, 0, Group | GroupDual | Group9, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -283,25 +292,41 @@ static u32 twobyte_table[256] = { static u32 group_table[] = { [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, [Group1_81*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM, [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64, [Group1_83*8] = - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = @@ -320,24 +345,39 @@ static u32 group_table[] = { SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, + 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, + [Group8*8] = + 0, 0, 0, 0, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, + [Group9*8] = + 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, + [Group9*8] = + 0, 0, 0, 0, 0, 0, 0, 0, }; /* EFLAGS bit definitions. */ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18) #define EFLG_VM (1<<17) #define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) #define EFLG_IF (1<<9) +#define EFLG_TF (1<<8) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -606,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, if (linear < fc->start || linear >= fc->end) { size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); if (rc) return rc; fc->start = linear; @@ -661,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, op_bytes = 3; *address = 0; rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); + ctxt->vcpu, NULL); if (rc) return rc; rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); + ctxt->vcpu, NULL); return rc; } @@ -889,6 +929,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) switch (mode) { case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; @@ -975,7 +1016,7 @@ done_prefixes: } if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; + kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction"); return -1; } @@ -1196,13 +1237,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), dest, len, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); return rc; } +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + int rc; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); + + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; + } + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; +} + static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; @@ -1225,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, if (rc != 0) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); return rc; } @@ -1370,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, int rc; rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || @@ -1385,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, (u32) c->regs[VCPU_REGS_RBX]; rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags |= EFLG_ZF; } @@ -1407,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); if (rc) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); return rc; } @@ -1451,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &c->dst.val, c->dst.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; break; case OP_NONE: @@ -1514,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) u64 msr_data; /* syscall is not available in real mode */ - if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) - return -1; + if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1553,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } - return 0; + return X86EMUL_CONTINUE; } static int @@ -1563,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) struct kvm_segment cs, ss; u64 msr_data; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + /* inject #GP if in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ if (ctxt->mode == X86EMUL_MODE_PROT64) - return -1; + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1587,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; } @@ -1618,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; - return 0; + return X86EMUL_CONTINUE; } static int @@ -1629,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) u64 msr_data; int usermode; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + /* inject #GP if in real mode or Virtual 8086 mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1661,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = (u16)(msr_data + 24); break; @@ -1669,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = cs.selector + 8; cs.db = 0; @@ -1685,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; - return 0; + return X86EMUL_CONTINUE; +} + +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) +{ + int iopl; + if (ctxt->mode == X86EMUL_MODE_REAL) + return false; + if (ctxt->mode == X86EMUL_MODE_VM86) + return true; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + struct kvm_segment tr_seg; + int r; + u16 io_bitmap_ptr; + u8 perm, bit_idx = port & 0x7; + unsigned mask = (1 << len) - 1; + + kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); + if (tr_seg.unusable) + return false; + if (tr_seg.limit < 103) + return false; + r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, + NULL); + if (r != X86EMUL_CONTINUE) + return false; + if (io_bitmap_ptr + port/8 > tr_seg.limit) + return false; + r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, + ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if ((perm >> bit_idx) & mask) + return false; + return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + if (emulator_bad_iopl(ctxt)) + if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + return false; + return true; } int @@ -1709,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); saved_eip = c->eip; + /* LOCK prefix is allowed only with some instructions */ + if (c->lock_prefix && !(c->d & Lock)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + + /* Privileged instruction can be executed only in CPL=0 */ + if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) memop = c->modrm_ea; @@ -1749,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) &c->src.val, c->src.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } @@ -1768,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) c->dst.ptr = (void *)c->dst.ptr + (c->src.val & mask) / 8; } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; + if (!(c->d & Mov)) { + /* optimisation - avoid slow emulated read */ + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + } } c->dst.orig_val = c->dst.val; @@ -1876,7 +2010,12 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, 1, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -1892,6 +2031,11 @@ special_insn: return 0; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } if (kvm_emulate_pio_string(ctxt->vcpu, 0, (c->d & ByteOp) ? 1 : c->op_bytes, @@ -1978,25 +2122,19 @@ special_insn: break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; - int type_bits; - int err; sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_CS || + c->modrm_reg > VCPU_SREG_GS) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; } - if (err < 0) - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + + rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -2025,7 +2163,10 @@ special_insn: c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; c->dst.val = c->src.val; @@ -2039,11 +2180,12 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) + c->dst.bytes, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2058,10 +2200,11 @@ special_insn: c->src.ptr = (unsigned long *)register_address(c, seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; c->dst.type = OP_NONE; /* Disable writeback. */ @@ -2069,10 +2212,11 @@ special_insn: c->dst.ptr = (unsigned long *)register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); @@ -2102,12 +2246,13 @@ special_insn: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) + rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes @@ -2163,11 +2308,9 @@ special_insn: case 0xe9: /* jmp rel */ goto jmp; case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, + VCPU_SREG_CS)) + goto done; c->eip = c->src.val; break; @@ -2185,7 +2328,13 @@ special_insn: case 0xef: /* out (e/r)ax,dx */ port = c->regs[VCPU_REGS_RDX]; io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, + do_io: + if (!emulator_io_permited(ctxt, ops, port, + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, (c->d & ByteOp) ? 1 : c->op_bytes, port) != 0) { c->eip = saved_eip; @@ -2210,13 +2359,21 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; @@ -2319,8 +2476,9 @@ twobyte_insn: } break; case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) - goto cannot_emulate; + rc = emulate_syscall(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; @@ -2391,14 +2549,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - if (emulate_sysenter(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysenter(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; case 0x35: /* sysexit */ - if (emulate_sysexit(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysexit(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 15578f180e5..294698b6daf 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -605,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .write = speaker_ioport_write, }; -/* Caller must have writers lock on slots_lock */ +/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -624,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); + raw_spin_lock_init(&pit->pit_state.inject_lock); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -645,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); if (ret < 0) goto fail_unregister; @@ -660,11 +660,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) return pit; fail_unregister: - __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: - if (pit->irq_source_id >= 0) - kvm_free_irq_source_id(kvm, pit->irq_source_id); + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; @@ -723,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) /* Try to inject pending interrupts when * last one has been acked. */ - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { ps->irq_ack = 0; inject = 1; } - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); if (inject) __inject_pit_timer_intr(kvm); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index d4c1c7ffdc0..900d6b0ba7c 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; + raw_spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d057c0cbd24..07771da85de 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -44,18 +44,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. */ - spin_unlock(&s->pics_state->lock); + raw_spin_unlock(&s->pics_state->lock); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - spin_lock(&s->pics_state->lock); + raw_spin_lock(&s->pics_state->lock); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + + raw_spin_lock(&s->lock); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } /* @@ -156,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - spin_lock(&s->lock); + raw_spin_lock(&s->lock); pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -166,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - spin_lock(&s->lock); + raw_spin_lock(&s->lock); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return ret; } @@ -203,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + raw_spin_lock(&s->lock); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -228,7 +229,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return intno; } @@ -442,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -455,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -472,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return 0; } - spin_lock(&s->lock); + raw_spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -486,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - spin_unlock(&s->lock); + raw_spin_unlock(&s->lock); return 0; } @@ -520,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - spin_lock_init(&s->lock); + raw_spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; @@ -533,7 +534,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + mutex_unlock(&kvm->slots_lock); if (ret < 0) { kfree(s); return NULL; @@ -541,3 +544,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) return s; } + +void kvm_destroy_pic(struct kvm *kvm) +{ + struct kvm_pic *vpic = kvm->arch.vpic; + + if (vpic) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm->arch.vpic = NULL; + kfree(vpic); + } +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index be399e207d5..34b15915754 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -62,7 +62,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - spinlock_t lock; + raw_spinlock_t lock; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ @@ -75,6 +75,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); void kvm_pic_clear_isr_ack(struct kvm *kvm); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7bcc5b6a440..cff851cf532 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,6 +1,11 @@ #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; + if (tmask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); + return vcpu->arch.cr0 & mask; +} + +static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; + if (tmask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba8c045da78..4b224f90087 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1246,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 0; } + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + /* if this is ICR write vector before command */ + if (reg == APIC_ICR) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (reg == APIC_ICR) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 40010b09c4a..f5fe32c5eda 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; +} #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89a49fb46a2..741373e8ca7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "x86.h" #include "kvm_cache_regs.h" #include <linux/kvm_host.h> @@ -29,6 +30,7 @@ #include <linux/swap.h> #include <linux/hugetlb.h> #include <linux/compiler.h> +#include <linux/srcu.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -136,16 +138,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 @@ -153,6 +145,9 @@ module_param(oos_shadow, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#include <trace/events/kvm.h> + +#undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS #include "mmutrace.h" @@ -229,7 +224,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static int is_write_protection(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_WP; + return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } static int is_cpuid_PSE36(void) @@ -239,7 +234,7 @@ static int is_cpuid_PSE36(void) static int is_nx(struct kvm_vcpu *vcpu) { - return vcpu->arch.shadow_efer & EFER_NX; + return vcpu->arch.efer & EFER_NX; } static int is_shadow_present_pte(u64 pte) @@ -253,7 +248,7 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_writeble_pte(unsigned long pte) +static int is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } @@ -470,24 +465,10 @@ static int has_wrprotected_page(struct kvm *kvm, static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { - unsigned long page_size = PAGE_SIZE; - struct vm_area_struct *vma; - unsigned long addr; + unsigned long page_size; int i, ret = 0; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return PT_PAGE_TABLE_LEVEL; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) - goto out; - - page_size = vma_kernel_pagesize(vma); - -out: - up_read(¤t->mm->mmap_sem); + page_size = kvm_host_page_size(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { @@ -503,8 +484,7 @@ out: static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level; - int level = PT_PAGE_TABLE_LEVEL; + int host_level, level, max_level; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) @@ -515,7 +495,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) + max_level = kvm_x86_ops->get_lpage_level() < host_level ? + kvm_x86_ops->get_lpage_level() : host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; @@ -633,7 +616,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pfn = spte_to_pfn(*spte); if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { @@ -662,6 +645,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } + pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); BUG(); } } @@ -708,7 +692,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } @@ -732,7 +716,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { rmap_remove(kvm, spte); --kvm->stat.lpages; __set_spte(spte, shadow_trap_nonpresent_pte); @@ -787,7 +771,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); __set_spte(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); @@ -805,35 +789,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data)) { int i, j; + int ret; int retval = 0; + struct kvm_memslots *slots; - /* - * If mmap_sem isn't taken, we can look the memslots with only - * the mmu_lock by skipping over the slots with userspace_addr == 0. - */ - for (i = 0; i < kvm->nmemslots; i++) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + slots = rcu_dereference(kvm->memslots); + + for (i = 0; i < slots->nmemslots; i++) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; unsigned long start = memslot->userspace_addr; unsigned long end; - /* mmu_lock protects userspace_addr */ - if (!start) - continue; - end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - retval |= handler(kvm, &memslot->rmap[gfn_offset], - data); + ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { int idx = gfn_offset; idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); - retval |= handler(kvm, + ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); } + trace_kvm_age_page(hva, memslot, ret); + retval |= ret; } } @@ -856,9 +837,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 *spte; int young = 0; - /* always return old for EPT */ + /* + * Emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ if (!shadow_accessed_mask) - return 0; + return kvm_unmap_rmapp(kvm, rmapp, data); spte = rmap_next(kvm, rmapp, NULL); while (spte) { @@ -1615,7 +1602,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + int slot = memslot_id(kvm, gfn); struct kvm_mmu_page *sp = page_header(__pa(pte)); __set_bit(slot, sp->slot_bitmap); @@ -1639,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { struct page *page; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); if (gpa == UNMAPPED_GVA) return NULL; @@ -1852,7 +1839,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writeble_pte(*sptep)) + if (!can_unsync && is_writable_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1860,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) + if (is_writable_pte(spte)) spte &= ~PT_WRITABLE_MASK; } } @@ -1881,7 +1868,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writeble = is_writeble_pte(*sptep); + int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -1932,7 +1919,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { - if (was_writeble) + if (was_writable) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); @@ -2162,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) { + if (error) + *error = 0; return vaddr; } @@ -2747,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) if (tdp_enabled) return 0; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -2847,16 +2837,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) */ page = alloc_page(GFP_KERNEL | __GFP_DMA32); if (!page) - goto error_1; + return -ENOMEM; + vcpu->arch.mmu.pae_root = page_address(page); for (i = 0; i < 4; ++i) vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; } int kvm_mmu_create(struct kvm_vcpu *vcpu) @@ -2936,10 +2923,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages; + int npages, idx; - if (!down_read_trylock(&kvm->slots_lock)) - continue; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; @@ -2952,7 +2938,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) nr_to_scan--; spin_unlock(&kvm->mmu_lock); - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); @@ -3019,9 +3005,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; + struct kvm_memslots *slots; - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) + nr_pages += slots->memslots[i].npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3246,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) audit_mappings_page(vcpu, ent, va, level - 1); else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; @@ -3291,10 +3279,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu) static int count_rmaps(struct kvm_vcpu *vcpu) { int nmaps = 0; - int i, j, k; + int i, j, k, idx; + idx = srcu_read_lock(&kvm->srcu); + slots = rcu_dereference(kvm->memslots); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_memory_slot *m = &slots->memslots[i]; struct kvm_rmap_desc *d; for (j = 0; j < m->npages; ++j) { @@ -3317,6 +3307,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) } } } + srcu_read_unlock(&kvm->srcu, idx); return nmaps; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61a1b3884b4..be66759321a 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -2,6 +2,7 @@ #define __KVM_X86_MMU_H #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -37,6 +38,16 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) @@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LMA; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PAE; -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PSE; -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_PG; -} - static inline int is_present_gpte(unsigned long pte) { return pte & PT_PRESENT_MASK; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ede2131a922..81eab9a50e6 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -162,7 +162,7 @@ walk: if (rsvd_fault) goto access_error; - if (write_fault && !is_writeble_pte(pte)) + if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto access_error; @@ -490,18 +490,23 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, + u32 *error) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, + !!(access & PFERR_WRITE_MASK), + !!(access & PFERR_USER_MASK), + !!(access & PFERR_FETCH_MASK)); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; - } + } else if (error) + *error = walker.error_code; return gpa; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1d9b33843c8..52f78dd0301 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -231,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; } static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, @@ -540,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + svm->vcpu.fpu_active = 1; + control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK; @@ -552,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | - INTERCEPT_DR3_MASK; + INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | + INTERCEPT_DR7_MASK; control->intercept_dr_write = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | INTERCEPT_DR7_MASK; control->intercept_exceptions = (1 << PF_VECTOR) | @@ -569,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | + (1ULL << INTERCEPT_SELECTIVE_CR0) | (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | @@ -641,10 +650,8 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); - control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); - control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); + control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; + control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; save->g_pat = 0x0007040600070406ULL; save->cr3 = 0; save->cr4 = 0; @@ -730,7 +737,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) init_vmcb(svm); fx_init(&svm->vcpu); - svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; @@ -765,14 +771,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(cpu != vcpu->cpu)) { u64 delta; - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; + if (check_tsc_unstable()) { + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + delta = vcpu->arch.host_tsc - native_read_tsc(); + svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; + } vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; @@ -954,42 +962,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) svm->vmcb->save.gdtr.base = dt->base ; } +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ +} + static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } +static void update_cr0_intercept(struct vcpu_svm *svm) +{ + ulong gcr0 = svm->vcpu.arch.cr0; + u64 *hcr0 = &svm->vmcb->save.cr0; + + if (!svm->vcpu.fpu_active) + *hcr0 |= SVM_CR0_SELECTIVE_MASK; + else + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); + + + if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + } else { + svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + } +} + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; + vcpu->arch.efer |= EFER_LMA; svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } #endif - if (npt_enabled) - goto set; + vcpu->arch.cr0 = cr0; - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - vcpu->fpu_active = 1; - } + if (!npt_enabled) + cr0 |= X86_CR0_PG | X86_CR0_WP; - vcpu->arch.cr0 = cr0; - cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - } -set: /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -997,6 +1022,7 @@ set: */ cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; + update_cr0_intercept(svm); } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1102,76 +1128,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->vmcb->control.asid = sd->next_asid++; } -static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) { struct vcpu_svm *svm = to_svm(vcpu); - unsigned long val; switch (dr) { case 0 ... 3: - val = vcpu->arch.db[dr]; + *dest = vcpu->arch.db[dr]; break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr6; + *dest = vcpu->arch.dr6; else - val = svm->vmcb->save.dr6; + *dest = svm->vmcb->save.dr6; break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr7; + *dest = vcpu->arch.dr7; else - val = svm->vmcb->save.dr7; + *dest = svm->vmcb->save.dr7; break; - default: - val = 0; } - return val; + return EMULATE_DONE; } -static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception) +static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; - switch (dr) { case 0 ... 3: vcpu->arch.db[dr] = value; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = value; - return; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - *exception = UD_VECTOR; - return; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 6: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; - return; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return EMULATE_FAIL; /* will re-inject UD */ + /* fall through */ case 7: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { svm->vmcb->save.dr7 = vcpu->arch.dr7; vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); } - return; - default: - /* FIXME: Possible case? */ - printk(KERN_DEBUG "%s: unexpected dr %u\n", - __func__, dr); - *exception = UD_VECTOR; - return; + break; } + + return EMULATE_DONE; } static int pf_interception(struct vcpu_svm *svm) @@ -1239,13 +1259,17 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } -static int nm_interception(struct vcpu_svm *svm) +static void svm_fpu_activate(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) - svm->vmcb->save.cr0 &= ~X86_CR0_TS; svm->vcpu.fpu_active = 1; + update_cr0_intercept(svm); +} +static int nm_interception(struct vcpu_svm *svm) +{ + svm_fpu_activate(&svm->vcpu); return 1; } @@ -1337,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + if (!(svm->vcpu.arch.efer & EFER_SVME) || !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -1740,8 +1764,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.ds = vmcb->save.ds; hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.shadow_efer; - hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; hsave->save.rip = svm->next_rip; @@ -2153,9 +2177,10 @@ static int rdmsr_interception(struct vcpu_svm *svm) u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; - if (svm_get_msr(&svm->vcpu, ecx, &data)) + if (svm_get_msr(&svm->vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); - else { + } else { trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; @@ -2247,13 +2272,15 @@ static int wrmsr_interception(struct vcpu_svm *svm) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) + if (svm_set_msr(&svm->vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); - else + } else { + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(&svm->vcpu); + } return 1; } @@ -2297,7 +2324,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, - /* for now: */ + [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, [SVM_EXIT_WRITE_CR0] = emulate_on_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, @@ -2306,11 +2333,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_READ_DR4] = emulate_on_interception, + [SVM_EXIT_READ_DR5] = emulate_on_interception, + [SVM_EXIT_READ_DR6] = emulate_on_interception, + [SVM_EXIT_READ_DR7] = emulate_on_interception, [SVM_EXIT_WRITE_DR0] = emulate_on_interception, [SVM_EXIT_WRITE_DR1] = emulate_on_interception, [SVM_EXIT_WRITE_DR2] = emulate_on_interception, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR4] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR6] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, @@ -2383,20 +2416,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); - if (npt_enabled) { - int mmu_reload = 0; - if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { - svm_set_cr0(vcpu, svm->vmcb->save.cr0); - mmu_reload = 1; - } + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (mmu_reload) { - kvm_mmu_reset_context(vcpu); - kvm_mmu_load(vcpu); - } - } - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2798,12 +2821,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = root; force_new_asid(vcpu); - - if (vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - svm->vmcb->save.cr0 |= X86_CR0_TS; - vcpu->fpu_active = 0; - } } static int is_disabled(void) @@ -2852,6 +2869,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + static const struct trace_print_flags svm_exit_reasons_str[] = { { SVM_EXIT_READ_CR0, "read_cr0" }, { SVM_EXIT_READ_CR3, "read_cr3" }, @@ -2905,9 +2926,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { -1, NULL } }; -static bool svm_gb_page_enable(void) +static int svm_get_lpage_level(void) { - return true; + return PT_PDPE_LEVEL; +} + +static bool svm_rdtscp_supported(void) +{ + return false; +} + +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + update_cr0_intercept(svm); + svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; } static struct kvm_x86_ops svm_x86_ops = { @@ -2936,6 +2970,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, @@ -2950,6 +2985,8 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, + .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, @@ -2975,7 +3012,11 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .exit_reasons_str = svm_exit_reasons_str, - .gb_page_enable = svm_gb_page_enable, + .get_lpage_level = svm_get_lpage_level, + + .cpuid_update = svm_cpuid_update, + + .rdtscp_supported = svm_rdtscp_supported, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 816e0449db0..6ad30a29f04 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall, ); /* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hv_hypercall, + TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, + __u64 ingpa, __u64 outgpa), + TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + + TP_STRUCT__entry( + __field( __u16, code ) + __field( bool, fast ) + __field( __u16, rep_cnt ) + __field( __u16, rep_idx ) + __field( __u64, ingpa ) + __field( __u64, outgpa ) + ), + + TP_fast_assign( + __entry->code = code; + __entry->fast = fast; + __entry->rep_cnt = rep_cnt; + __entry->rep_idx = rep_idx; + __entry->ingpa = ingpa; + __entry->outgpa = outgpa; + ), + + TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", + __entry->code, __entry->fast ? "fast" : "slow", + __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, + __entry->outgpa) +); + +/* * Tracepoint for PIO. */ TRACE_EVENT(kvm_pio, @@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault, * Tracepoint for guest MSR access. */ TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), - TP_ARGS(rw, ecx, data), + TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), + TP_ARGS(write, ecx, data, exception), TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, ecx ) - __field( unsigned long, data ) + __field( unsigned, write ) + __field( u32, ecx ) + __field( u64, data ) + __field( u8, exception ) ), TP_fast_assign( - __entry->rw = rw; + __entry->write = write; __entry->ecx = ecx; __entry->data = data; + __entry->exception = exception; ), - TP_printk("msr_%s %x = 0x%lx", - __entry->rw ? "write" : "read", - __entry->ecx, __entry->data) + TP_printk("msr_%s %x = 0x%llx%s", + __entry->write ? "write" : "read", + __entry->ecx, __entry->data, + __entry->exception ? " (#GP)" : "") ); -#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) -#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) +#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) +#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) /* * Tracepoint for guest CR access. diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4918d6fc92..14873b9f843 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -61,6 +61,21 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE) +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT) + +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -136,6 +151,8 @@ struct vcpu_vmx { ktime_t entry_time; s64 vnmi_blocked_time; u32 exit_reason; + + bool rdtscp_enabled; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -210,7 +227,7 @@ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -301,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void) return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); } +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); @@ -336,9 +358,7 @@ static inline int cpu_has_vmx_ple(void) static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { - return flexpriority_enabled && - (cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm)); + return flexpriority_enabled && irqchip_in_kernel(kvm); } static inline int cpu_has_vmx_vpid(void) @@ -347,6 +367,12 @@ static inline int cpu_has_vmx_vpid(void) SECONDARY_EXEC_ENABLE_VPID; } +static inline int cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + static inline int cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -551,22 +577,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); - if (!vcpu->fpu_active) - eb |= 1u << NM_VECTOR; - /* - * Unconditionally intercept #DB so we can maintain dr6 without - * reading it every exit. - */ - eb |= 1u << DB_VECTOR; - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - eb |= 1u << BP_VECTOR; - } + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + (1u << NM_VECTOR) | (1u << DB_VECTOR); + if ((vcpu->guest_debug & + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + if (vcpu->fpu_active) + eb &= ~(1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -589,7 +611,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer; u64 ignore_bits; - guest_efer = vmx->vcpu.arch.shadow_efer; + guest_efer = vmx->vcpu.arch.efer; /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -767,22 +789,30 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static void vmx_fpu_activate(struct kvm_vcpu *vcpu) { + ulong cr0; + if (vcpu->fpu_active) return; vcpu->fpu_active = 1; - vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + cr0 = vmcs_readl(GUEST_CR0); + cr0 &= ~(X86_CR0_TS | X86_CR0_MP); + cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); + vmcs_writel(GUEST_CR0, cr0); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); + static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active) - return; - vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + vmx_decache_cr0_guest_bits(vcpu); + vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = 0; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -878,6 +908,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } +static bool vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + /* * Swap MSR entry in host/guest MSR entry array. */ @@ -913,12 +948,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && vmx->rdtscp_enabled) + move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } #endif @@ -1002,6 +1040,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_TSC_AUX: + if (!to_vmx(vcpu)->rdtscp_enabled) + return 1; + /* Otherwise falls through */ default: vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); @@ -1065,7 +1107,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - /* Otherwise falls through to kvm_set_msr_common */ + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; + case MSR_TSC_AUX: + if (!vmx->rdtscp_enabled) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ default: msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -1224,6 +1274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | @@ -1243,7 +1295,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING; + SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_RDTSCP; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1457,8 +1510,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu) static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; + struct kvm_memslots *slots; + gfn_t base_gfn; + + slots = rcu_dereference(kvm->memslots); + base_gfn = kvm->memslots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } return kvm->arch.tss_addr; @@ -1544,9 +1601,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) * of this msr depends on is_long_mode(). */ vmx_load_host_state(to_vmx(vcpu)); - vcpu->arch.shadow_efer = efer; - if (!msr) - return; + vcpu->arch.efer = efer; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1576,13 +1631,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; - vmx_set_efer(vcpu, vcpu->arch.shadow_efer); + vcpu->arch.efer |= EFER_LMA; + vmx_set_efer(vcpu, vcpu->arch.efer); } static void exit_lmode(struct kvm_vcpu *vcpu) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) @@ -1598,10 +1653,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1646,7 +1711,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1654,23 +1719,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } if (!(cr0 & X86_CR0_WP)) *hw_cr0 &= ~X86_CR0_WP; } -static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, - struct kvm_vcpu *vcpu) -{ - if (!is_paging(vcpu)) { - *hw_cr4 &= ~X86_CR4_PAE; - *hw_cr4 |= X86_CR4_PSE; - } else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) - *hw_cr4 &= ~X86_CR4_PAE; -} - static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1682,8 +1737,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) else hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - vmx_fpu_deactivate(vcpu); - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -1691,7 +1744,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) @@ -1702,12 +1755,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (!vcpu->fpu_active) + hw_cr0 |= X86_CR0_TS | X86_CR0_MP; + vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - - if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) - vmx_fpu_activate(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -1738,8 +1791,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, guest_cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) - vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1748,8 +1799,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; - if (enable_ept) - ept_update_paging_mode_cr4(&hw_cr4, vcpu); + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } + } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); @@ -1787,7 +1844,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ + if (!is_protmode(vcpu)) return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -2042,7 +2099,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) static bool guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ - if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!is_protmode(vcpu)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) @@ -2175,7 +2232,7 @@ static int alloc_apic_access_page(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -2188,7 +2245,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2197,7 +2254,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.ept_identity_pagetable) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; @@ -2212,7 +2269,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2384,14 +2441,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; - u64 data; int j = vmx->nmsrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; - data = data_low | ((u64)data_high << 32); vmx->guest_msrs[j].index = i; vmx->guest_msrs[j].data = 0; vmx->guest_msrs[j].mask = -1ull; @@ -2404,7 +2459,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); @@ -2429,10 +2487,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret; + int ret, idx; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2526,7 +2584,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ + vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); vmx_fpu_activate(&vmx->vcpu); @@ -2540,7 +2598,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } @@ -2717,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, kvm_queue_exception(vcpu, vec); return 1; case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return 0; /* fall through */ @@ -2839,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu) kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; @@ -2940,11 +3011,10 @@ static int handle_cr(struct kvm_vcpu *vcpu) }; break; case 2: /* clts */ - vmx_fpu_deactivate(vcpu); - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); - vmx_fpu_activate(vcpu); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); + vmx_fpu_activate(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { @@ -2962,7 +3032,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 3: /* lmsw */ - kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); return 1; @@ -2975,12 +3047,22 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 0; } +static int check_dr_alias(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return -1; + } + return 0; +} + static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; unsigned long val; int dr, reg; + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; dr = vmcs_readl(GUEST_DR7); @@ -3016,14 +3098,20 @@ static int handle_dr(struct kvm_vcpu *vcpu) case 0 ... 3: val = vcpu->arch.db[dr]; break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ case 6: val = vcpu->arch.dr6; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ val = vcpu->arch.dr7; break; - default: - val = 0; } kvm_register_write(vcpu, reg, val); } else { @@ -3034,21 +3122,25 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - kvm_queue_exception(vcpu, UD_VECTOR); - break; + case 4: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ case 6: if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; - case 7: + case 5: + if (check_dr_alias(vcpu) < 0) + return 1; + /* fall through */ + default: /* 7 */ if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; + kvm_inject_gp(vcpu, 0); + return 1; } vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { @@ -3075,6 +3167,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) u64 data; if (vmx_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } @@ -3094,13 +3187,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; } + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(vcpu); return 1; } @@ -3385,7 +3478,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } if (err != EMULATE_DONE) { - kvm_report_emulation_failure(vcpu, "emulation failure"); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -3416,6 +3508,12 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -3453,6 +3551,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, }; static const int kvm_vmx_max_exit_handlers = @@ -3686,9 +3786,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ vmcs_writel(HOST_CR0, read_cr0()); - if (vcpu->arch.switch_db_regs) - set_debugreg(vcpu->arch.dr6, 6); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3789,9 +3886,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - if (vcpu->arch.switch_db_regs) - get_debugreg(vcpu->arch.dr6, 6); - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); @@ -3920,7 +4014,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * b. VT-d with snooping control feature: snooping control feature of * VT-d engine can guarantee the cache correctness. Just set it * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep + * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ if (is_mmio) @@ -3931,37 +4025,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) VMX_EPT_MT_EPTE_SHIFT; else ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IGMT_BIT; + | VMX_EPT_IPAT_BIT; return ret; } +#define _ER(x) { EXIT_REASON_##x, #x } + static const struct trace_print_flags vmx_exit_reasons_str[] = { - { EXIT_REASON_EXCEPTION_NMI, "exception" }, - { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, - { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, - { EXIT_REASON_NMI_WINDOW, "nmi_window" }, - { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, - { EXIT_REASON_CR_ACCESS, "cr_access" }, - { EXIT_REASON_DR_ACCESS, "dr_access" }, - { EXIT_REASON_CPUID, "cpuid" }, - { EXIT_REASON_MSR_READ, "rdmsr" }, - { EXIT_REASON_MSR_WRITE, "wrmsr" }, - { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, - { EXIT_REASON_HLT, "halt" }, - { EXIT_REASON_INVLPG, "invlpg" }, - { EXIT_REASON_VMCALL, "hypercall" }, - { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, - { EXIT_REASON_APIC_ACCESS, "apic_access" }, - { EXIT_REASON_WBINVD, "wbinvd" }, - { EXIT_REASON_TASK_SWITCH, "task_switch" }, - { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + _ER(EXCEPTION_NMI), + _ER(EXTERNAL_INTERRUPT), + _ER(TRIPLE_FAULT), + _ER(PENDING_INTERRUPT), + _ER(NMI_WINDOW), + _ER(TASK_SWITCH), + _ER(CPUID), + _ER(HLT), + _ER(INVLPG), + _ER(RDPMC), + _ER(RDTSC), + _ER(VMCALL), + _ER(VMCLEAR), + _ER(VMLAUNCH), + _ER(VMPTRLD), + _ER(VMPTRST), + _ER(VMREAD), + _ER(VMRESUME), + _ER(VMWRITE), + _ER(VMOFF), + _ER(VMON), + _ER(CR_ACCESS), + _ER(DR_ACCESS), + _ER(IO_INSTRUCTION), + _ER(MSR_READ), + _ER(MSR_WRITE), + _ER(MWAIT_INSTRUCTION), + _ER(MONITOR_INSTRUCTION), + _ER(PAUSE_INSTRUCTION), + _ER(MCE_DURING_VMENTRY), + _ER(TPR_BELOW_THRESHOLD), + _ER(APIC_ACCESS), + _ER(EPT_VIOLATION), + _ER(EPT_MISCONFIG), + _ER(WBINVD), { -1, NULL } }; -static bool vmx_gb_page_enable(void) +#undef _ER + +static int vmx_get_lpage_level(void) +{ + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { - return false; + struct kvm_cpuid_entry2 *best; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control; + + vmx->rdtscp_enabled = false; + if (vmx_rdtscp_supported()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + if (exec_control & SECONDARY_EXEC_RDTSCP) { + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } + } + } } static struct kvm_x86_ops vmx_x86_ops = { @@ -3990,6 +4135,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, @@ -4002,6 +4148,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, + .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, @@ -4027,7 +4175,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .exit_reasons_str = vmx_exit_reasons_str, - .gb_page_enable = vmx_gb_page_enable, + .get_lpage_level = vmx_get_lpage_level, + + .cpuid_update = vmx_cpuid_update, + + .rdtscp_supported = vmx_rdtscp_supported, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1e1bc9d412..e46282a5656 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -38,6 +38,7 @@ #include <linux/intel-iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> +#include <linux/srcu.h> #include <trace/events/kvm.h> #undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS @@ -93,16 +94,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); struct kvm_shared_msrs_global { int nr; - struct kvm_shared_msr { - u32 msr; - u64 value; - } msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_NR_SHARED_MSRS]; }; struct kvm_shared_msrs { struct user_return_notifier urn; bool registered; - u64 current_value[KVM_NR_SHARED_MSRS]; + struct kvm_shared_msr_values { + u64 host; + u64 curr; + } values[KVM_NR_SHARED_MSRS]; }; static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; @@ -147,53 +148,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msr *global; struct kvm_shared_msrs *locals = container_of(urn, struct kvm_shared_msrs, urn); + struct kvm_shared_msr_values *values; for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - global = &shared_msrs_global.msrs[slot]; - if (global->value != locals->current_value[slot]) { - wrmsrl(global->msr, global->value); - locals->current_value[slot] = global->value; + values = &locals->values[slot]; + if (values->host != values->curr) { + wrmsrl(shared_msrs_global.msrs[slot], values->host); + values->curr = values->host; } } locals->registered = false; user_return_notifier_unregister(urn); } -void kvm_define_shared_msr(unsigned slot, u32 msr) +static void shared_msr_update(unsigned slot, u32 msr) { - int cpu; + struct kvm_shared_msrs *smsr; u64 value; + smsr = &__get_cpu_var(shared_msrs); + /* only read, and nobody should modify it at this time, + * so don't need lock */ + if (slot >= shared_msrs_global.nr) { + printk(KERN_ERR "kvm: invalid MSR slot!"); + return; + } + rdmsrl_safe(msr, &value); + smsr->values[slot].host = value; + smsr->values[slot].curr = value; +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{ if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot].msr = msr; - rdmsrl_safe(msr, &value); - shared_msrs_global.msrs[slot].value = value; - for_each_online_cpu(cpu) - per_cpu(shared_msrs, cpu).current_value[slot] = value; + shared_msrs_global.msrs[slot] = msr; + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); static void kvm_shared_msr_cpu_online(void) { unsigned i; - struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs); for (i = 0; i < shared_msrs_global.nr; ++i) - locals->current_value[i] = shared_msrs_global.msrs[i].value; + shared_msr_update(i, shared_msrs_global.msrs[i]); } void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); - if (((value ^ smsr->current_value[slot]) & mask) == 0) + if (((value ^ smsr->values[slot].curr) & mask) == 0) return; - smsr->current_value[slot] = value; - wrmsrl(shared_msrs_global.msrs[slot].msr, value); + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -257,12 +269,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, bool has_error, u32 error_code) +{ + u32 prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; + kvm_multiple_exception(vcpu, nr, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); @@ -270,25 +338,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; - - if (vcpu->arch.exception.pending) { - switch(vcpu->arch.exception.nr) { - case DF_VECTOR: - /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - return; - case PF_VECTOR: - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; - default: - /* replace previous exception with a new one in a hope - that instruction re-execution will regenerate lost - exception */ - vcpu->arch.exception.pending = false; - break; - } - } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } @@ -301,11 +350,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; + kvm_multiple_exception(vcpu, nr, true, error_code); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); @@ -383,12 +428,18 @@ out: void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - if (cr0 & CR0_RESERVED_BITS) { + cr0 |= X86_CR0_ET; + +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); + cr0, kvm_read_cr0(vcpu)); kvm_inject_gp(vcpu, 0); return; } +#endif + + cr0 &= ~CR0_RESERVED_BITS; if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); @@ -405,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.shadow_efer & EFER_LME)) { + if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { @@ -443,13 +494,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { @@ -575,9 +626,11 @@ static inline u32 bit(int bitno) * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 2 +#define KVM_SAVE_MSRS_BEGIN 5 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 @@ -602,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) } if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); kvm_inject_gp(vcpu, 0); return; @@ -633,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; - efer |= vcpu->arch.shadow_efer & EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); @@ -957,6 +1010,100 @@ out: return r; } +static bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = true; + break; + } + + return r; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copy_to_user((void __user *)addr, instructions, 4)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + addr = gfn_to_hva(vcpu->kvm, data >> + HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + if (kvm_is_error_hva(addr)) + return 1; + if (clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + vcpu->arch.hv_vapic = data; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + + return 0; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1071,6 +1218,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return set_msr_hyperv(vcpu, msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1170,6 +1327,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -1221,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.shadow_efer; + data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: data = vcpu->kvm->arch.wall_clock; @@ -1236,6 +1441,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -1261,15 +1476,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { - int i; + int i, idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu_put(vcpu); @@ -1351,6 +1566,11 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XEN_HVM: case KVM_CAP_ADJUST_CLOCK: case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_HYPERV: + case KVM_CAP_HYPERV_VAPIC: + case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_PCI_SEGMENT: + case KVM_CAP_X86_ROBUST_SINGLESTEP: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1464,8 +1684,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + kvm_x86_ops->vcpu_put(vcpu); } static int is_efer_nx(void) @@ -1530,6 +1750,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid_fix_nx_cap(vcpu); r = 0; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); out_free: vfree(cpuid_entries); @@ -1552,6 +1773,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, goto out; vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); return 0; out: @@ -1594,12 +1816,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; - unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; unsigned f_lm = F(LM); #else + unsigned f_gbpages = 0; unsigned f_lm = 0; #endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -1619,7 +1844,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = @@ -1866,7 +2091,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !(vcpu->arch.cr4 & X86_CR4_MCE)) { + !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); @@ -2160,14 +2385,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; spin_unlock(&kvm->mmu_lock); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } @@ -2176,13 +2401,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; if (gfn >= alias->base_gfn && gfn < alias->base_gfn + alias->npages) return alias->target_gfn + gfn - alias->base_gfn; @@ -2200,6 +2447,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, { int r, n; struct kvm_mem_alias *p; + struct kvm_mem_aliases *aliases, *old_aliases; r = -EINVAL; /* General sanity checks */ @@ -2216,26 +2464,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; - down_write(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out; + + mutex_lock(&kvm->slots_lock); - p = &kvm->arch.aliases[alias->slot]; + /* invalidate any gfn reference in case of deletion/shrinking */ + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kvm_mmu_zap_all(kvm); + kfree(old_aliases); + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out_unlock; + + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + + p = &aliases->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + p->flags &= ~(KVM_ALIAS_INVALID); for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) + if (aliases->aliases[n - 1].npages) break; - kvm->arch.naliases = n; + aliases->naliases = n; - spin_unlock(&kvm->mmu_lock); - kvm_mmu_zap_all(kvm); - - up_write(&kvm->slots_lock); - - return 0; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kfree(old_aliases); + r = 0; +out_unlock: + mutex_unlock(&kvm->slots_lock); out: return r; } @@ -2273,18 +2543,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -2364,29 +2634,62 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - int n; + int r, n, i; struct kvm_memory_slot *memslot; - int is_dirty = 0; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) goto out; + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n/sizeof(long); i++) + is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); + + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + + old_slots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kfree(old_slots); } + r = 0; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + r = -EFAULT; +out_free: + vfree(dirty_bitmap); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2469,6 +2772,8 @@ long kvm_arch_vm_ioctl(struct file *filp, if (vpic) { r = kvm_ioapic_init(kvm); if (r) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev); kfree(vpic); goto create_irqchip_unlock; } @@ -2480,10 +2785,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->irq_lock); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm->arch.vpic = NULL; - kvm->arch.vioapic = NULL; + kvm_ioapic_destroy(kvm); + kvm_destroy_pic(kvm); mutex_unlock(&kvm->irq_lock); } create_irqchip_unlock: @@ -2499,7 +2802,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -2508,7 +2811,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { @@ -2725,7 +3028,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) @@ -2734,17 +3037,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_FETCH_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2767,14 +3097,37 @@ out: return r; } +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2804,6 +3157,7 @@ static int emulator_read_emulated(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -2813,17 +3167,20 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; mmio: /* @@ -2862,11 +3219,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; } @@ -2930,7 +3288,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, char *kaddr; u64 val; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); if (gpa == UNMAPPED_GVA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -2967,35 +3325,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + kvm_x86_ops->fpu_activate(vcpu); return X86EMUL_CONTINUE; } int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch (dr) { - case 0 ... 3: - *dest = kvm_x86_ops->get_dr(vcpu, dr); - return X86EMUL_CONTINUE; - default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); - return X86EMUL_UNHANDLEABLE; - } + return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); } int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - int exception; - kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); - if (exception) { - /* FIXME: better handling */ - return X86EMUL_UNHANDLEABLE; - } - return X86EMUL_CONTINUE; + return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) @@ -3009,7 +3353,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -3017,7 +3361,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt, + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, @@ -3060,8 +3405,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.vcpu = vcpu; vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; @@ -3153,12 +3499,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu) gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; int ret; + u32 error_code; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu); + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); else - ret = kvm_read_guest_virt(q, p, bytes, vcpu); + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + return ret; } @@ -3179,7 +3530,7 @@ int complete_pio(struct kvm_vcpu *vcpu) if (io->in) { r = pio_copy_data(vcpu); if (r) - return r; + goto out; } delta = 1; @@ -3206,7 +3557,7 @@ int complete_pio(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RSI, val); } } - +out: io->count -= io->cur_count; io->cur_count = 0; @@ -3219,11 +3570,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); return r; } @@ -3234,7 +3586,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu) int i, r = 0; for (i = 0; i < io->cur_count; i++) { - if (kvm_io_bus_write(&vcpu->kvm->pio_bus, + if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, io->port, io->size, pd)) { r = -EOPNOTSUPP; break; @@ -3248,6 +3600,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) { unsigned long val; + trace_kvm_pio(!in, port, size, 1); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3259,11 +3613,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, 1); - - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(vcpu->arch.pio_data, &val, 4); + if (!vcpu->arch.pio.in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); + } if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { complete_pio(vcpu); @@ -3280,6 +3633,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, unsigned now, in_page; int ret = 0; + trace_kvm_pio(!in, port, size, count); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3291,9 +3646,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, count); - if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); return 1; @@ -3325,10 +3677,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_gp(vcpu, 0); + if (ret == X86EMUL_PROPAGATE_FAULT) return 1; - } if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) @@ -3487,11 +3837,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, return a0 | ((gpa_t)a1 << 32); } +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; int r = 1; + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -3534,10 +3949,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; - int ret = 0; unsigned long rip = kvm_rip_read(vcpu); - /* * Blow out the MMU to ensure that no other VCPU has an active mapping * to ensure that the updated hypercall appears atomically across all @@ -3546,11 +3959,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_mmu_zap_all(vcpu->kvm); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(rip, instruction, 3, vcpu) - != X86EMUL_CONTINUE) - ret = -EFAULT; - return ret; + return emulator_write_emulated(rip, instruction, 3, vcpu); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -3583,10 +3993,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { unsigned long value; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: - value = vcpu->arch.cr0; + value = kvm_read_cr0(vcpu); break; case 2: value = vcpu->arch.cr2; @@ -3595,7 +4004,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) value = vcpu->arch.cr3; break; case 4: - value = vcpu->arch.cr4; + value = kvm_read_cr4(vcpu); break; case 8: value = kvm_get_cr8(vcpu); @@ -3613,7 +4022,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, { switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); + kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); *rflags = kvm_get_rflags(vcpu); break; case 2: @@ -3623,7 +4032,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); + kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: kvm_set_cr8(vcpu, val & 0xfUL); @@ -3690,6 +4099,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, } return best; } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { @@ -3773,14 +4183,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu) static void vapic_exit(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + int idx; if (!apic || !apic->vapic_addr) return; - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -3876,12 +4287,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } } preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); local_irq_disable(); @@ -3909,7 +4325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_to_vapic(vcpu); } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -3951,7 +4367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); /* * Profile KVM exit RIPs: @@ -3973,6 +4389,7 @@ out: static int __vcpu_run(struct kvm_vcpu *vcpu) { int r; + struct kvm *kvm = vcpu->kvm; if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { pr_debug("vcpu %d received sipi with vector # %x\n", @@ -3984,7 +4401,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vapic_enter(vcpu); r = 1; @@ -3992,9 +4409,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu); else { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) { switch(vcpu->arch.mp_state) { @@ -4029,13 +4446,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) ++vcpu->stat.signal_exits; } if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); post_kvm_run_save(vcpu); vapic_exit(vcpu); @@ -4074,10 +4491,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, EMULTYPE_NO_DECODE); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. @@ -4204,13 +4621,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - sregs->cr0 = vcpu->arch.cr0; + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; + sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.shadow_efer; + sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); @@ -4298,14 +4714,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, { struct descriptor_table dtable; u16 index = selector >> 3; + int ret; + u32 err; + gva_t addr; get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) { kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); - return 1; + return X86EMUL_PROPAGATE_FAULT; } - return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); + + return ret; } /* allowed just for 8 bytes segments */ @@ -4319,15 +4744,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); +} + +static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc) +{ + u32 base_addr = get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL); } -static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, +static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { u32 base_addr = get_desc_base(seg_desc); - return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); + return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL); } static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) @@ -4338,18 +4771,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) return kvm_seg.selector; } -static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, - u16 selector, - struct kvm_segment *kvm_seg) -{ - struct desc_struct seg_desc; - - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; - seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); - return 0; -} - static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { @@ -4367,7 +4788,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se .unusable = 0, }; kvm_x86_ops->set_segment(vcpu, &segvar, seg); - return 0; + return X86EMUL_CONTINUE; } static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) @@ -4377,24 +4798,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); } -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; - if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) return kvm_load_realmode_segment(vcpu, selector, seg); - if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) - return 1; - kvm_seg.type |= type_bits; - if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && - seg != VCPU_SREG_LDTR) - if (!kvm_seg.s) - kvm_seg.unusable = 1; + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret) + return ret; + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: kvm_set_segment(vcpu, &kvm_seg, seg); - return 0; + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; } static void save_state_to_tss32(struct kvm_vcpu *vcpu, @@ -4420,6 +4929,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + static int load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { @@ -4437,25 +4954,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) return 1; return 0; } @@ -4495,19 +5028,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; return 0; } @@ -4529,7 +5076,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_16)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_16, sizeof tss_segment_16)) goto out; @@ -4537,7 +5084,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_16.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_16.prev_task_link, sizeof tss_segment_16.prev_task_link)) goto out; @@ -4568,7 +5115,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, sizeof tss_segment_32)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_32, sizeof tss_segment_32)) goto out; @@ -4576,7 +5123,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, tss_segment_32.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_32.prev_task_link, sizeof tss_segment_32.prev_task_link)) goto out; @@ -4599,7 +5146,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); /* FIXME: Handle errors. Failure to read either TSS or their * descriptors should generate a pagefault. @@ -4658,7 +5205,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) &nseg_desc); } - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); tr_seg.type = 11; kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); @@ -4689,17 +5236,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.cr3); @@ -4734,7 +5279,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !(vcpu->arch.cr0 & X86_CR0_PE)) + !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; vcpu_put(vcpu); @@ -4832,11 +5377,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { unsigned long vaddr = tr->linear_address; gpa_t gpa; + int idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; @@ -4917,14 +5463,14 @@ EXPORT_SYMBOL_GPL(fx_init); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + if (vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 1; kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_restore(&vcpu->arch.guest_fx_image); + trace_kvm_fpu(1); } -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { @@ -4935,8 +5481,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_save(&vcpu->arch.guest_fx_image); kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; + set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + trace_kvm_fpu(0); } -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { @@ -5088,11 +5635,13 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + int idx; + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); } @@ -5103,6 +5652,12 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); + kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!kvm->arch.aliases) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5159,16 +5714,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + cleanup_srcu_struct(&kvm->srcu); + kfree(kvm->arch.aliases); kfree(kvm); } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5188,26 +5745,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (IS_ERR((void *)userspace_addr)) return PTR_ERR((void *)userspace_addr); - /* set userspace_addr atomically for kvm_hva_to_rmapp */ - spin_lock(&kvm->mmu_lock); memslot->userspace_addr = userspace_addr; - spin_unlock(&kvm->mmu_lock); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } } } + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + + int npages = mem->memory_size >> PAGE_SHIFT; + + if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + up_write(¤t->mm->mmap_sem); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); @@ -5216,8 +5782,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5eadea585d2..2d101639bd8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { @@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr) struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); +static inline bool is_protmode(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->arch.efer & EFER_LMA; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); +} + #endif diff --git a/crypto/ahash.c b/crypto/ahash.c index 33a4ff45f84..b8c59b889c6 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -78,7 +78,6 @@ int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err) walk->data -= walk->offset; if (nbytes && walk->offset & alignmask && !err) { - walk->offset += alignmask - 1; walk->offset = ALIGN(walk->offset, alignmask + 1); walk->data += walk->offset; diff --git a/crypto/authenc.c b/crypto/authenc.c index 18870906ea0..2bb7348d8d5 100644 --- a/crypto/authenc.c +++ b/crypto/authenc.c @@ -386,11 +386,13 @@ static int crypto_authenc_encrypt(struct aead_request *req) { struct crypto_aead *authenc = crypto_aead_reqtfm(req); struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc); - struct ablkcipher_request *abreq = aead_request_ctx(req); + struct authenc_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_ablkcipher *enc = ctx->enc; struct scatterlist *dst = req->dst; unsigned int cryptlen = req->cryptlen; - u8 *iv = (u8 *)(abreq + 1) + crypto_ablkcipher_reqsize(enc); + struct ablkcipher_request *abreq = (void *)(areq_ctx->tail + + ctx->reqoff); + u8 *iv = (u8 *)abreq - crypto_ablkcipher_ivsize(enc); int err; ablkcipher_request_set_tfm(abreq, enc); @@ -454,7 +456,7 @@ static int crypto_authenc_verify(struct aead_request *req, unsigned int authsize; areq_ctx->complete = authenc_verify_ahash_done; - areq_ctx->complete = authenc_verify_ahash_update_done; + areq_ctx->update_complete = authenc_verify_ahash_update_done; ohash = authenc_ahash_fn(req, CRYPTO_TFM_REQ_MAY_SLEEP); if (IS_ERR(ohash)) @@ -546,10 +548,6 @@ static int crypto_authenc_init_tfm(struct crypto_tfm *tfm) if (IS_ERR(auth)) return PTR_ERR(auth); - ctx->reqoff = ALIGN(2 * crypto_ahash_digestsize(auth) + - crypto_ahash_alignmask(auth), - crypto_ahash_alignmask(auth) + 1); - enc = crypto_spawn_skcipher(&ictx->enc); err = PTR_ERR(enc); if (IS_ERR(enc)) @@ -558,13 +556,18 @@ static int crypto_authenc_init_tfm(struct crypto_tfm *tfm) ctx->auth = auth; ctx->enc = enc; - tfm->crt_aead.reqsize = max_t(unsigned int, - crypto_ahash_reqsize(auth) + ctx->reqoff + - sizeof(struct authenc_request_ctx) + + ctx->reqoff = ALIGN(2 * crypto_ahash_digestsize(auth) + + crypto_ahash_alignmask(auth), + crypto_ahash_alignmask(auth) + 1) + + crypto_ablkcipher_ivsize(enc); + + tfm->crt_aead.reqsize = sizeof(struct authenc_request_ctx) + + ctx->reqoff + + max_t(unsigned int, + crypto_ahash_reqsize(auth) + sizeof(struct ahash_request), sizeof(struct skcipher_givcrypt_request) + - crypto_ablkcipher_reqsize(enc) + - crypto_ablkcipher_ivsize(enc)); + crypto_ablkcipher_reqsize(enc)); return 0; diff --git a/crypto/md5.c b/crypto/md5.c index 9fda213a592..30efc7dad89 100644 --- a/crypto/md5.c +++ b/crypto/md5.c @@ -234,6 +234,7 @@ static struct shash_alg alg = { .export = md5_export, .import = md5_import, .descsize = sizeof(struct md5_state), + .statesize = sizeof(struct md5_state), .base = { .cra_name = "md5", .cra_flags = CRYPTO_ALG_TYPE_SHASH, diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index f69b7783027..11fc4d5c43e 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -969,7 +969,7 @@ int pohmelfs_setattr_raw(struct inode *inode, struct iattr *attr) if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - err = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + err = dquot_transfer(inode, attr); if (err) goto err_out_exit; } diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 9cc18775b83..2ff622f6f54 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -121,7 +121,7 @@ struct adfs_discmap { /* Inode stuff */ struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); -int adfs_write_inode(struct inode *inode,int unused); +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); int adfs_notify_change(struct dentry *dentry, struct iattr *attr); /* map.c */ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 3f57ce4bee5..0f5e3097813 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -9,6 +9,7 @@ */ #include <linux/smp_lock.h> #include <linux/buffer_head.h> +#include <linux/writeback.h> #include "adfs.h" /* @@ -360,7 +361,7 @@ out: * The adfs-specific inode data has already been updated by * adfs_notify_change() */ -int adfs_write_inode(struct inode *inode, int wait) +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct object_info obj; @@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj, wait); + ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; } diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 0e40caaba45..861dae68ac1 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -175,7 +175,8 @@ extern void affs_delete_inode(struct inode *inode); extern void affs_clear_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, unsigned long ino); -extern int affs_write_inode(struct inode *inode, int); +extern int affs_write_inode(struct inode *inode, + struct writeback_control *wbc); extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); /* file.c */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 3c4ec7d864c..c9744d771d9 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -166,7 +166,7 @@ bad_inode: } int -affs_write_inode(struct inode *inode, int unused) +affs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct buffer_head *bh; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 6ece2a13bf7..c54dad4e606 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -733,7 +733,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata); extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); -extern int afs_write_inode(struct inode *, int); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); diff --git a/fs/afs/super.c b/fs/afs/super.c index e1ea1c240b6..14f6431598a 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = { static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, - .write_inode = afs_write_inode, .destroy_inode = afs_destroy_inode, .clear_inode = afs_clear_inode, .put_super = afs_put_super, diff --git a/fs/afs/write.c b/fs/afs/write.c index 5e15a21dbf9..3bed54a294d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -585,27 +585,6 @@ int afs_writepages(struct address_space *mapping, } /* - * write an inode back - */ -int afs_write_inode(struct inode *inode, int sync) -{ - struct afs_vnode *vnode = AFS_FS_I(inode); - int ret; - - _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); - - ret = 0; - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret < 0) - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - } - - _leave(" = %d", ret); - return ret; -} - -/* * completion of write to server */ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) diff --git a/fs/attr.c b/fs/attr.c index 96d394bdadd..0a6ea54cde7 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -12,7 +12,6 @@ #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> -#include <linux/quotaops.h> #include <linux/security.h> /* Taken over from the old code... */ @@ -212,14 +211,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr) error = inode->i_op->setattr(dentry, attr); } else { error = inode_change_ok(inode, attr); - if (!error) { - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) - error = vfs_dq_transfer(inode, attr) ? - -EDQUOT : 0; - if (!error) - error = inode_setattr(inode, attr); - } + if (!error) + error = inode_setattr(inode, attr); } if (ia_valid & ATTR_SIZE) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 8f3d9fd8960..f22a7d3dc36 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -15,6 +15,7 @@ #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/vfs.h> +#include <linux/writeback.h> #include <asm/uaccess.h> #include "bfs.h" @@ -98,7 +99,7 @@ error: return ERR_PTR(-EIO); } -static int bfs_write_inode(struct inode *inode, int wait) +static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; @@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait) di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); mark_buffer_dirty(bh); - if (wait) { + if (wbc->sync_mode == WB_SYNC_ALL) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) err = -EIO; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2aa8ec6a098..8b5cfdd4bfc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2326,7 +2326,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); -int btrfs_write_inode(struct inode *inode, int wait); +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); void btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4deb280f896..c41db6d45ab 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3968,7 +3968,7 @@ err: return ret; } -int btrfs_write_inode(struct inode *inode, int wait) +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -3977,7 +3977,7 @@ int btrfs_write_inode(struct inode *inode, int wait) if (root->fs_info->btree_inode == inode) return 0; - if (wait) { + if (wbc->sync_mode == WB_SYNC_ALL) { trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); ret = btrfs_commit_transaction(trans, root); diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 59b8bf2825c..8442e353309 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -261,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata); extern struct inode *exofs_iget(struct super_block *, unsigned long); struct inode *exofs_new_inode(struct inode *, int); -extern int exofs_write_inode(struct inode *, int); +extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); extern void exofs_delete_inode(struct inode *); /* dir.c: */ diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 5514f3c2c2f..a17e4b733e3 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1280,9 +1280,9 @@ out: return ret; } -int exofs_write_inode(struct inode *inode, int wait) +int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) { - return exofs_update_inode(inode, wait); + return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } /* diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 7f8d2e5a7ea..1d081f0cfec 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -570,7 +570,7 @@ do_more: error_return: brelse(bitmap_bh); release_blocks(sb, freed); - vfs_dq_free_block(inode, freed); + dquot_free_block(inode, freed); } /** @@ -1236,6 +1236,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, unsigned short windowsz = 0; unsigned long ngroups; unsigned long num = *count; + int ret; *errp = -ENOSPC; sb = inode->i_sb; @@ -1247,8 +1248,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, /* * Check quota for allocation of this block. */ - if (vfs_dq_alloc_block(inode, num)) { - *errp = -EDQUOT; + ret = dquot_alloc_block(inode, num); + if (ret) { + *errp = ret; return 0; } @@ -1409,7 +1411,7 @@ allocated: *errp = 0; brelse(bitmap_bh); - vfs_dq_free_block(inode, *count-num); + dquot_free_block(inode, *count-num); *count = num; return ret_block; @@ -1420,7 +1422,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - vfs_dq_free_block(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 061914add3c..0b038e47ad2 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); /* inode.c */ extern struct inode *ext2_iget (struct super_block *, unsigned long); -extern int ext2_write_inode (struct inode *, int); +extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_delete_inode (struct inode *); extern int ext2_sync_inode (struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 586e3589d4c..5d198d0697f 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -20,6 +20,7 @@ #include <linux/time.h> #include <linux/pagemap.h> +#include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -70,7 +71,7 @@ const struct file_operations ext2_file_operations = { .compat_ioctl = ext2_compat_ioctl, #endif .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, .splice_read = generic_file_splice_read, @@ -87,7 +88,7 @@ const struct file_operations ext2_xip_file_operations = { .compat_ioctl = ext2_compat_ioctl, #endif .mmap = xip_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, }; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 15387c9c17d..ad7d572ee8d 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode) if (!is_bad_inode(inode)) { /* Quota is already initialized in iput() */ ext2_xattr_delete_inode(inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); } es = EXT2_SB(sb)->s_es; @@ -586,10 +586,10 @@ got: goto fail_drop; } - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext2_init_acl(inode, dir); if (err) @@ -605,10 +605,10 @@ got: return inode; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 71b032c65a0..fc13cc119aa 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others"); MODULE_DESCRIPTION("Second Extended Filesystem"); MODULE_LICENSE("GPL"); +static int __ext2_write_inode(struct inode *inode, int do_sync); + /* * Test whether an inode is a fast symlink. */ @@ -58,13 +60,15 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) */ void ext2_delete_inode (struct inode * inode) { + if (!is_bad_inode(inode)) + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) goto no_delete; EXT2_I(inode)->i_dtime = get_seconds(); mark_inode_dirty(inode); - ext2_write_inode(inode, inode_needs_sync(inode)); + __ext2_write_inode(inode, inode_needs_sync(inode)); inode->i_size = 0; if (inode->i_blocks) @@ -1335,7 +1339,7 @@ bad_inode: return ERR_PTR(ret); } -int ext2_write_inode(struct inode *inode, int do_sync) +static int __ext2_write_inode(struct inode *inode, int do_sync) { struct ext2_inode_info *ei = EXT2_I(inode); struct super_block *sb = inode->i_sb; @@ -1440,6 +1444,11 @@ int ext2_write_inode(struct inode *inode, int do_sync) return err; } +int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int ext2_sync_inode(struct inode *inode) { struct writeback_control wbc = { @@ -1457,9 +1466,12 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) error = inode_change_ok(inode, iattr); if (error) return error; + + if (iattr->ia_valid & ATTR_SIZE) + dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; + error = dquot_transfer(inode, iattr); if (error) return error; } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index dd7175ce560..71efb0e9a3f 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -31,6 +31,7 @@ */ #include <linux/pagemap.h> +#include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child) */ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) { - struct inode * inode = ext2_new_inode (dir, mode); - int err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; - } else if (test_opt(inode->i_sb, NOBH)) { - inode->i_mapping->a_ops = &ext2_nobh_aops; - inode->i_fop = &ext2_file_operations; - } else { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_file_operations; - } - mark_inode_dirty(inode); - err = ext2_add_nondir(dentry, inode); + struct inode *inode; + + dquot_initialize(dir); + + inode = ext2_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &ext2_file_inode_operations; + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; } - return err; + mark_inode_dirty(inode); + return ext2_add_nondir(dentry, inode); } static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) @@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_ if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + inode = ext2_new_inode (dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out; + dquot_initialize(dir); + inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) @@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, if (inode->i_nlink >= EXT2_LINK_MAX) return -EMLINK; + dquot_initialize(dir); + inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT2_LINK_MAX) goto out; + dquot_initialize(dir); + inode_inc_link_count(dir); inode = ext2_new_inode (dir, S_IFDIR | mode); @@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct page * page; int err = -ENOENT; + dquot_initialize(dir); + de = ext2_find_entry (dir, &dentry->d_name, &page); if (!de) goto out; @@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * old_de; int err = -ENOENT; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f9cb54a585c..42e4a303b67 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -194,6 +194,8 @@ static void destroy_inodecache(void) static void ext2_clear_inode(struct inode *inode) { struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info; + + dquot_drop(inode); ext2_discard_reservation(inode); EXT2_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 904f00642f8..e44dc92609b 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -644,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, the inode. */ ea_bdebug(new_bh, "reusing block"); - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) { + error = dquot_alloc_block(inode, 1); + if (error) { unlock_buffer(new_bh); goto cleanup; } @@ -702,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * as if nothing happened and cleanup the unused block */ if (error && error != -ENOSPC) { if (new_bh && new_bh != old_bh) - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; } } else @@ -734,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, le32_add_cpu(&HDR(old_bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); mark_buffer_dirty(old_bh); ea_bdebug(old_bh, "refcount now=%d", le32_to_cpu(HDR(old_bh)->h_refcount)); @@ -797,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode) mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); } EXT2_I(inode)->i_file_acl = 0; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 27967f92e82..161da2d3f89 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode, } ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) - vfs_dq_free_block(inode, dquot_freed_blocks); + dquot_free_block(inode, dquot_freed_blocks); return; } @@ -1502,8 +1502,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, /* * Check quota for allocation of this block. */ - if (vfs_dq_alloc_block(inode, num)) { - *errp = -EDQUOT; + err = dquot_alloc_block(inode, num); + if (err) { + *errp = err; return 0; } @@ -1713,7 +1714,7 @@ allocated: *errp = 0; brelse(bitmap_bh); - vfs_dq_free_block(inode, *count-num); + dquot_free_block(inode, *count-num); *count = num; return ret_block; @@ -1728,7 +1729,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - vfs_dq_free_block(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 388bbdfa0b4..f55df0e61cb 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -21,6 +21,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd.h> +#include <linux/quotaops.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> #include "xattr.h" @@ -33,9 +34,9 @@ */ static int ext3_release_file (struct inode * inode, struct file * filp) { - if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { + if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) { filemap_flush(inode->i_mapping); - EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; + ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && @@ -62,7 +63,7 @@ const struct file_operations ext3_file_operations = { .compat_ioctl = ext3_compat_ioctl, #endif .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext3_release_file, .fsync = ext3_sync_file, .splice_read = generic_file_splice_read, diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index b3999128513..ef9008b885b 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_init(inode); + dquot_initialize(inode); ext3_xattr_delete_inode(handle, inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -588,10 +588,10 @@ got: sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; ret = inode; - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext3_init_acl(handle, inode, dir); if (err) @@ -619,10 +619,10 @@ really_out: return ret; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 455e6e6e5cb..7f920b7263a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode) { handle_t *handle; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -1378,7 +1381,7 @@ static int ext3_journalled_write_end(struct file *file, */ if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); if (inode->i_size > EXT3_I(inode)->i_disksize) { EXT3_I(inode)->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); @@ -1417,7 +1420,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; - if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { + if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -1436,7 +1439,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) * everything they get. */ - EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; + ext3_clear_inode_state(inode, EXT3_STATE_JDATA); journal = EXT3_JOURNAL(inode); journal_lock_updates(journal); err = journal_flush(journal); @@ -1528,6 +1531,7 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); /* * We give up here if we're reentered, because it might be for a @@ -1600,6 +1604,9 @@ static int ext3_writeback_writepage(struct page *page, int ret = 0; int err; + J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); + if (ext3_journal_current_handle()) goto out_fail; @@ -1642,6 +1649,9 @@ static int ext3_journalled_writepage(struct page *page, int ret = 0; int err; + J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); + if (ext3_journal_current_handle()) goto no_write; @@ -1670,7 +1680,7 @@ static int ext3_journalled_writepage(struct page *page, PAGE_CACHE_SIZE, NULL, write_end_fn); if (ret == 0) ret = err; - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); unlock_page(page); } else { /* @@ -1785,8 +1795,9 @@ retry: handle = ext3_journal_start(inode, 2); if (IS_ERR(handle)) { /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ + * but cannot extend i_size. Truncate allocated blocks + * and pretend the write failed... */ + ext3_truncate(inode); ret = PTR_ERR(handle); goto out; } @@ -2402,7 +2413,7 @@ void ext3_truncate(struct inode *inode) goto out_notrans; if (inode->i_size == 0 && ext3_should_writeback_data(inode)) - ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; + ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); /* * We have to lock the EOF page here, because lock_page() nests @@ -2721,7 +2732,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) { /* We have all inode data except xattrs in memory here. */ return __ext3_get_inode_loc(inode, iloc, - !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); + !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); } void ext3_set_inode_flags(struct inode *inode) @@ -2893,7 +2904,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) - ei->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } } else ei->i_extra_isize = 0; @@ -2955,7 +2966,7 @@ again: /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ - if (ei->i_state & EXT3_STATE_NEW) + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); ext3_get_inode_flags(ei); @@ -3052,7 +3063,7 @@ again: rc = ext3_journal_dirty_metadata(handle, bh); if (!err) err = rc; - ei->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); out_brelse: @@ -3096,7 +3107,7 @@ out_brelse: * `stuff()' is running, and the new i_size will be lost. Plus the inode * will no longer be on the superblock's dirty inode list. */ -int ext3_write_inode(struct inode *inode, int wait) +int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) { if (current->flags & PF_MEMALLOC) return 0; @@ -3107,7 +3118,7 @@ int ext3_write_inode(struct inode *inode, int wait) return -EIO; } - if (!wait) + if (wbc->sync_mode != WB_SYNC_ALL) return 0; return ext3_force_commit(inode->i_sb); @@ -3140,6 +3151,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if (ia_valid & ATTR_SIZE) + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; @@ -3152,7 +3165,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext3_journal_stop(handle); return error; @@ -3237,7 +3250,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) ret = 2 * (bpp + indirects) + 2; #ifdef CONFIG_QUOTA - /* We know that structure was already allocated during vfs_dq_init so + /* We know that structure was already allocated during dquot_initialize so * we will be updating only the data blocks + inodes */ ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif @@ -3328,7 +3341,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_space() will always dirty the inode when blocks + * Also, dquot_alloc_space() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 7b0e44f7d66..ee184084ca4 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1696,6 +1696,8 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, struct inode * inode; int err, retries = 0; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1730,6 +1732,8 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1766,6 +1770,8 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -2060,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2119,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2174,6 +2184,8 @@ static int ext3_symlink (struct inode * dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + @@ -2228,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry, if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + + dquot_initialize(dir); + /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing * otherwise has the potential to corrupt the orphan inode list. @@ -2278,12 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, struct ext3_dir_entry_2 * old_de, * new_de; int retval, flush_file = 0; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - vfs_dq_init(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index afa2b569da1..e844accbf55 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -181,7 +181,7 @@ static void ext3_handle_error(struct super_block *sb) if (!test_opt (sb, ERRORS_CONT)) { journal_t *journal = EXT3_SB(sb)->s_journal; - EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); if (journal) journal_abort(journal, -EIO); } @@ -296,7 +296,7 @@ void ext3_abort (struct super_block * sb, const char * function, "error: remounting filesystem read-only"); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; sb->s_flags |= MS_RDONLY; - EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); if (EXT3_SB(sb)->s_journal) journal_abort(EXT3_SB(sb)->s_journal, -EIO); } @@ -528,6 +528,8 @@ static void destroy_inodecache(void) static void ext3_clear_inode(struct inode *inode) { struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; + + dquot_drop(inode); ext3_discard_reservation(inode); EXT3_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) @@ -562,10 +564,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl if (sbi->s_qf_names[GRPQUOTA]) seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) + if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); - if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) + if (test_opt(sb, GRPQUOTA)) seq_puts(seq, ",grpquota"); #endif } @@ -656,8 +658,7 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & - EXT3_MOUNT_DATA_FLAGS)); + seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -751,13 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext3_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, @@ -896,6 +890,63 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) return sb_block; } +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext3_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext3_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) { + + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + static int parse_options (char *options, struct super_block *sb, unsigned int *inum, unsigned long *journal_devnum, ext3_fsblk_t *n_blocks_count, int is_remount) @@ -906,8 +957,7 @@ static int parse_options (char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype, qfmt; - char *qname; + int qfmt; #endif if (!options) @@ -1065,20 +1115,19 @@ static int parse_options (char *options, struct super_block *sb, data_opt = EXT3_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) - == data_opt) + if (test_opt(sb, DATA_FLAGS) == data_opt) break; ext3_msg(sb, KERN_ERR, "error: cannot change " "data mode on remount. The filesystem " "is mounted in data=%s mode and you " "try to remount it in data=%s mode.", - data_mode_string(sbi->s_mount_opt & - EXT3_MOUNT_DATA_FLAGS), + data_mode_string(test_opt(sb, + DATA_FLAGS)), data_mode_string(data_opt)); return 0; } else { - sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; @@ -1090,62 +1139,20 @@ static int parse_options (char *options, struct super_block *sb, break; #ifdef CONFIG_QUOTA case Opt_usrjquota: - qtype = USRQUOTA; - goto set_qf_name; - case Opt_grpjquota: - qtype = GRPQUOTA; -set_qf_name: - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, - "error: cannot change journaled " - "quota options when quota turned on."); - return 0; - } - qname = match_strdup(&args[0]); - if (!qname) { - ext3_msg(sb, KERN_ERR, - "error: not enough memory for " - "storing quotafile name."); + if (!set_qf_name(sb, USRQUOTA, &args[0])) return 0; - } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext3_msg(sb, KERN_ERR, - "error: %s quota file already " - "specified.", QTYPE2NAME(qtype)); - kfree(qname); - return 0; - } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { - ext3_msg(sb, KERN_ERR, - "error: quotafile must be on " - "filesystem root."); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) return 0; - } - set_opt(sbi->s_mount_opt, QUOTA); break; case Opt_offusrjquota: - qtype = USRQUOTA; - goto clear_qf_name; + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; case Opt_offgrpjquota: - qtype = GRPQUOTA; -clear_qf_name: - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, "error: cannot change " - "journaled quota options when " - "quota turned on."); + if (!clear_qf_name(sb, GRPQUOTA)) return 0; - } - /* - * The space will be released later when all options - * are confirmed to be correct - */ - sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: qfmt = QFMT_VFS_OLD; @@ -1244,18 +1251,12 @@ set_qf_format: } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) && - sbi->s_qf_names[USRQUOTA]) + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sbi->s_mount_opt, USRQUOTA); - - if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) && - sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) clear_opt(sbi->s_mount_opt, GRPQUOTA); - if ((sbi->s_qf_names[USRQUOTA] && - (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && - (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) { + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext3_msg(sb, KERN_ERR, "error: old and new quota " "format mixing."); return 0; @@ -1478,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, } list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - vfs_dq_init(inode); + dquot_initialize(inode); if (inode->i_nlink) { printk(KERN_DEBUG "%s: truncating inode %lu to %Ld bytes\n", @@ -1671,11 +1672,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, POSIX_ACL); #endif if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) - sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; + set_opt(sbi->s_mount_opt, JOURNAL_DATA); else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) - sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA; + set_opt(sbi->s_mount_opt, ORDERED_DATA); else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) - sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA; + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) set_opt(sbi->s_mount_opt, ERRORS_PANIC); @@ -1694,7 +1695,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || @@ -2561,11 +2562,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) goto restore_opts; } - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + if (test_opt(sb, ABORT)) ext3_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); es = sbi->s_es; @@ -2573,7 +2574,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > le32_to_cpu(es->s_blocks_count)) { - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) { + if (test_opt(sb, ABORT)) { err = -EROFS; goto restore_opts; } @@ -2734,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * Process 1 Process 2 * ext3_create() quota_sync() * journal_start() write_dquot() - * vfs_dq_init() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) journal_start() * */ @@ -2942,9 +2943,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int tocopy; int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -2955,53 +2954,54 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } + + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; - bh = ext3_bread(handle, inode, blk, 1, &err); - if (!bh) + bh = ext3_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext3_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); goto out; - if (journal_quota) { - err = ext3_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } - } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext3_journal_dirty_metadata(handle, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext3_journal_dirty_data(handle, bh); - mark_buffer_dirty(bh); } - brelse(bh); - if (err) - goto out; - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext3_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + brelse(bh); out: - if (len == towrite) { + if (err) { mutex_unlock(&inode->i_mutex); return err; } - if (inode->i_size < off+len-towrite) { - i_size_write(inode, off+len-towrite); + if (inode->i_size < off + len) { + i_size_write(inode, off + len); EXT3_I(inode)->i_disksize = inode->i_size; } inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); mutex_unlock(&inode->i_mutex); - return len - towrite; + return len; } #endif diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 66895ccf76c..534a94c3a93 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *end; int error; - if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) return -ENODATA; error = ext3_get_inode_loc(inode, &iloc); if (error) @@ -403,7 +403,7 @@ ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) void *end; int error; - if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) return 0; error = ext3_get_inode_loc(inode, &iloc); if (error) @@ -500,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode, error = ext3_journal_dirty_metadata(handle, bh); if (IS_SYNC(inode)) handle->h_sync = 1; - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -775,8 +775,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext3_journal_get_write_access(handle, new_bh); @@ -850,7 +850,7 @@ cleanup: return error; cleanup_dquot: - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: @@ -882,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) { + if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) { error = ext3_xattr_check_names(IFIRST(header), is->s.end); if (error) return error; @@ -914,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, header = IHDR(inode, ext3_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); - EXT3_I(inode)->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); - EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR; + ext3_clear_inode_state(inode, EXT3_STATE_XATTR); } return 0; } @@ -967,10 +967,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; - if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) { struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); - EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); } error = ext3_xattr_ibody_find(inode, &i, &is); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 22bc7435d91..d2f37a5516c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, - "Checksum bad for group %u", block_group); + ext4_error(sb, "Checksum bad for group %u", + block_group); ext4_free_blks_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); @@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * to make sure we calculate the right free blocks */ group_blocks = ext4_blocks_count(sbi->s_es) - - le32_to_cpu(sbi->s_es->s_first_data_block) - - (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1)); + ext4_group_first_block_no(sb, ngroups - 1); } else { group_blocks = EXT4_BLOCKS_PER_GROUP(sb); } @@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * when a file system is mounted (see ext4_fill_super). */ - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block @@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); if (block_group >= ngroups) { - ext4_error(sb, "ext4_get_group_desc", - "block_group >= groups_count - " - "block_group = %u, groups_count = %u", - block_group, ngroups); + ext4_error(sb, "block_group >= groups_count - block_group = %u," + " groups_count = %u", block_group, ngroups); return NULL; } @@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); if (!sbi->s_group_desc[group_desc]) { - ext4_error(sb, "ext4_get_group_desc", - "Group descriptor not loaded - " + ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; @@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb, return 1; err_out: - ext4_error(sb, __func__, - "Invalid block bitmap - " - "block_group = %d, block = %llu", + ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu", block_group, bitmap_blk); return 0; } @@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, __func__, - "Cannot read block bitmap - " + ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); - ext4_error(sb, __func__, - "Cannot read block bitmap - " + ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, desc), sbi->s_itb_per_group)) { - ext4_error(sb, __func__, - "Adding blocks in system zones - " + ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); goto error_return; @@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit + i, bitmap_bh->b_data)) { - ext4_error(sb, __func__, - "bit already cleared for block %llu", + ext4_error(sb, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index a60ab9aad57..983f0e12749 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -205,14 +205,14 @@ void ext4_release_system_zone(struct super_block *sb) entry = rb_entry(n, struct ext4_system_zone, node); kmem_cache_free(ext4_system_zone_cachep, entry); if (!parent) - EXT4_SB(sb)->system_blks.rb_node = NULL; + EXT4_SB(sb)->system_blks = RB_ROOT; else if (parent->rb_left == n) parent->rb_left = NULL; else if (parent->rb_right == n) parent->rb_right = NULL; n = parent; } - EXT4_SB(sb)->system_blks.rb_node = NULL; + EXT4_SB(sb)->system_blks = RB_ROOT; } /* diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9dc93168e26..86cb6d86a04 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -83,10 +83,12 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, error_msg = "inode out of bounds"; if (error_msg != NULL) - ext4_error(dir->i_sb, function, - "bad entry in directory #%lu: %s - " - "offset=%u, inode=%u, rec_len=%d, name_len=%d", - dir->i_ino, error_msg, offset, + __ext4_error(dir->i_sb, function, + "bad entry in directory #%lu: %s - block=%llu" + "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", + dir->i_ino, error_msg, + (unsigned long long) bh->b_blocknr, + (unsigned) (offset%bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); return error_msg == NULL ? 1 : 0; @@ -150,7 +152,7 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - ext4_error(sb, __func__, "directory #%lu " + ext4_error(sb, "directory #%lu " "contains a hole at offset %Lu", inode->i_ino, (unsigned long long) filp->f_pos); @@ -303,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root) kfree(old); } if (!parent) - root->rb_node = NULL; + *root = RB_ROOT; else if (parent->rb_left == n) parent->rb_left = NULL; else if (parent->rb_right == n) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4cedc91ec59..bf938cf7c5f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -53,6 +53,12 @@ #define ext4_debug(f, a...) do {} while (0) #endif +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode(__func__, (inode), (fmt), ## a); + +#define EXT4_ERROR_FILE(file, fmt, a...) \ + ext4_error_file(__func__, (file), (fmt), ## a); + /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -133,14 +139,14 @@ struct mpage_da_data { int pages_written; int retval; }; -#define DIO_AIO_UNWRITTEN 0x1 +#define EXT4_IO_UNWRITTEN 0x1 typedef struct ext4_io_end { struct list_head list; /* per-file finished AIO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ - int error; /* I/O error code */ - ext4_lblk_t offset; /* offset in the file */ - size_t size; /* size of the extent */ + struct page *page; /* page struct for buffer write */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ } ext4_io_end_t; @@ -284,10 +290,12 @@ struct flex_groups { #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ @@ -313,17 +321,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) return flags & EXT4_OTHER_FLMASK; } -/* - * Inode dynamic state flags - */ -#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */ -#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ -#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ -#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ -#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ -#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ -#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ - /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { __u32 group; /* Group number for this data */ @@ -364,19 +361,20 @@ struct ext4_new_group_data { /* caller is from the direct IO path, request to creation of an unitialized extents if not allocated, split the uninitialized extent if blocks has been preallocated already*/ -#define EXT4_GET_BLOCKS_DIO 0x0008 +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 #define EXT4_GET_BLOCKS_CONVERT 0x0010 -#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Convert extent to initialized after direct IO complete */ -#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_DIO_CREATE_EXT) /* * Flags used by ext4_free_blocks */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 /* * ioctl commands @@ -630,7 +628,7 @@ struct ext4_inode_info { * near to their parent directory's inode. */ ext4_group_t i_block_group; - __u32 i_state; /* Dynamic state flags for ext4 */ + unsigned long i_state_flags; /* Dynamic state flags */ ext4_lblk_t i_dir_start_lookup; #ifdef CONFIG_EXT4_FS_XATTR @@ -708,8 +706,9 @@ struct ext4_inode_info { qsize_t i_reserved_quota; #endif - /* completed async DIOs that might need unwritten extents handling */ - struct list_head i_aio_dio_complete_list; + /* completed IOs that might need unwritten extents handling */ + struct list_head i_completed_io_list; + spinlock_t i_completed_io_lock; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; @@ -760,6 +759,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ @@ -1050,6 +1050,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ +}; + +static inline int ext4_test_inode_state(struct inode *inode, int bit) +{ + return test_bit(bit, &EXT4_I(inode)->i_state_flags); +} + +static inline void ext4_set_inode_state(struct inode *inode, int bit) +{ + set_bit(bit, &EXT4_I(inode)->i_state_flags); +} + +static inline void ext4_clear_inode_state(struct inode *inode, int bit) +{ + clear_bit(bit, &EXT4_I(inode)->i_state_flags); +} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -1126,6 +1154,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1416,7 +1446,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern int ext4_write_inode(struct inode *, int); +extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); @@ -1439,7 +1469,7 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); -extern int flush_aio_dio_completed_IO(struct inode *inode); +extern int flush_completed_IO(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); /* ioctl.c */ @@ -1465,13 +1495,20 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ -extern void ext4_error(struct super_block *, const char *, const char *, ...) +extern void __ext4_error(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) +extern void ext4_error_inode(const char *, struct inode *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext4_error_file(const char *, struct file *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void __ext4_std_error(struct super_block *, const char *, int); extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext4_warning(struct super_block *, const char *, const char *, ...) +extern void __ext4_warning(struct super_block *, const char *, + const char *, ...) __attribute__ ((format (printf, 3, 4))); +#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) extern void ext4_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, @@ -1744,7 +1781,7 @@ extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - loff_t len); + ssize_t len); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, int flags); @@ -1756,6 +1793,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 len, __u64 *moved_len); +/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ +enum ext4_state_bits { + BH_Uninit /* blocks are allocated but uninitialized on disk */ + = BH_JBDPrivateStart, +}; + +BUFFER_FNS(Uninit, uninit) +TAS_BUFFER_FNS(Uninit, uninit) + /* * Add new method to test wether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough @@ -1773,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b57e5c711b6..53d2764d71c 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -125,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, ext4_journal_abort_handle(where, __func__, bh, handle, err); } else { - if (inode && bh) + if (inode) mark_buffer_dirty_inode(bh, inode); else mark_buffer_dirty(bh); if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "IO error syncing inode, " "inode=%lu, block=%llu", inode->i_ino, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 05eca817d70..b79ad512646 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode) return 0; } +/* + * This function controls whether or not we should try to go down the + * dioread_nolock code paths, which makes it safe to avoid taking + * i_mutex for direct I/O reads. This only works for extent-based + * files, and it doesn't work for nobh or if data journaling is + * enabled, since the dioread_nolock code uses b_private to pass + * information back to the I/O completion handler, and this conflicts + * with the jbd's use of b_private. + */ +static inline int ext4_should_dioread_nolock(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) + return 0; + if (test_opt(inode->i_sb, NOBH)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + return 1; +} + #endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 765a4826b11..94c8ee81f5e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, if (S_ISREG(inode->i_mode)) block_group++; } - bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; /* @@ -440,7 +439,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode, return 0; corrupted: - ext4_error(inode->i_sb, function, + __ext4_error(inode->i_sb, function, "bad header/extent in inode #%lu: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), @@ -703,7 +702,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, } eh = ext_block_hdr(bh); ppos++; - BUG_ON(ppos > depth); + if (unlikely(ppos > depth)) { + put_bh(bh); + EXT4_ERROR_INODE(inode, + "ppos %d > depth %d", ppos, depth); + goto err; + } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; i--; @@ -749,7 +753,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (err) return err; - BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); + if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ @@ -779,9 +788,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); - BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) - > le16_to_cpu(curp->p_hdr->eh_max)); - BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + > le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } + if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); + return -EIO; + } err = ext4_ext_dirty(handle, inode, curp); ext4_std_error(inode->i_sb, err); @@ -819,7 +836,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* if current leaf will be split, then we should use * border from split point */ - BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); + if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); + return -EIO; + } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; ext_debug("leaf will be split." @@ -860,7 +880,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* initialize new leaf */ newblock = ablocks[--a]; - BUG_ON(newblock == 0); + if (unlikely(newblock == 0)) { + EXT4_ERROR_INODE(inode, "newblock == 0!"); + err = -EIO; + goto cleanup; + } bh = sb_getblk(inode->i_sb, newblock); if (!bh) { err = -EIO; @@ -880,7 +904,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ex = EXT_FIRST_EXTENT(neh); /* move remainder of path[depth] to the new leaf */ - BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); + if (unlikely(path[depth].p_hdr->eh_entries != + path[depth].p_hdr->eh_max)) { + EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", + path[depth].p_hdr->eh_entries, + path[depth].p_hdr->eh_max); + err = -EIO; + goto cleanup; + } /* start copy from next extent */ /* TODO: we could do it by single memmove */ m = 0; @@ -927,7 +958,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* create intermediate indexes */ k = depth - at - 1; - BUG_ON(k < 0); + if (unlikely(k < 0)) { + EXT4_ERROR_INODE(inode, "k %d < 0!", k); + err = -EIO; + goto cleanup; + } if (k) ext_debug("create %d intermediate indices\n", k); /* insert new index into current index block */ @@ -964,8 +999,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); - BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != - EXT_LAST_INDEX(path[i].p_hdr)); + if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != + EXT_LAST_INDEX(path[i].p_hdr))) { + EXT4_ERROR_INODE(inode, + "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", + le32_to_cpu(path[i].p_ext->ee_block)); + err = -EIO; + goto cleanup; + } while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ext_debug("%d: move %d:%llu in new index %llu\n", i, le32_to_cpu(path[i].p_idx->ei_block), @@ -1203,7 +1244,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex; int depth, ee_len; - BUG_ON(path == NULL); + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } depth = path->p_depth; *phys = 0; @@ -1217,15 +1261,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { - BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", + *logical, le32_to_cpu(ex->ee_block)); + return -EIO; + } while (--depth >= 0) { ix = path[depth].p_idx; - BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", + ix != NULL ? ix->ei_block : 0, + EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? + EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, + depth); + return -EIO; + } } return 0; } - BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; *phys = ext_pblock(ex) + ee_len - 1; @@ -1251,7 +1313,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, int depth; /* Note, NOT eh_depth; depth from top of tree */ int ee_len; - BUG_ON(path == NULL); + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } depth = path->p_depth; *phys = 0; @@ -1265,17 +1330,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { - BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "first_extent(path[%d].p_hdr) != ex", + depth); + return -EIO; + } while (--depth >= 0) { ix = path[depth].p_idx; - BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix != EXT_FIRST_INDEX *logical %d!", + *logical); + return -EIO; + } } *logical = le32_to_cpu(ex->ee_block); *phys = ext_pblock(ex); return 0; } - BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ @@ -1414,8 +1494,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, eh = path[depth].p_hdr; ex = path[depth].p_ext; - BUG_ON(ex == NULL); - BUG_ON(eh == NULL); + + if (unlikely(ex == NULL || eh == NULL)) { + EXT4_ERROR_INODE(inode, + "ex %p == NULL or eh %p == NULL", ex, eh); + return -EIO; + } if (depth == 0) { /* there is no tree at all */ @@ -1538,8 +1622,9 @@ int ext4_ext_try_to_merge(struct inode *inode, merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) - ext4_error(inode->i_sb, "ext4_ext_try_to_merge", - "inode#%lu, eh->eh_entries = 0!", inode->i_ino); + ext4_error(inode->i_sb, + "inode#%lu, eh->eh_entries = 0!", + inode->i_ino); } return merge_done; @@ -1612,13 +1697,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, ext4_lblk_t next; unsigned uninitialized = 0; - BUG_ON(ext4_ext_get_actual_len(newext) == 0); + if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { + EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); + return -EIO; + } depth = ext_depth(inode); ex = path[depth].p_ext; - BUG_ON(path[depth].p_hdr == NULL); + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } /* try to insert block into found extent and return */ - if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", ext4_ext_is_uninitialized(newext), @@ -1739,7 +1830,7 @@ has_space: merge: /* try to merge extents to the right */ - if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(inode, path, nearex); /* try to merge extents to the left */ @@ -1787,7 +1878,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, } depth = ext_depth(inode); - BUG_ON(path[depth].p_hdr == NULL); + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + err = -EIO; + break; + } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); @@ -1838,7 +1933,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } - BUG_ON(cbex.ec_len == 0); + if (unlikely(cbex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); + err = -EIO; + break; + } err = func(inode, path, &cbex, ex, cbdata); ext4_ext_drop_refs(path); @@ -1952,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && cex->ec_type != EXT4_EXT_CACHE_EXTENT); - if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { + if (in_range(block, cex->ec_block, cex->ec_len)) { ex->ee_block = cpu_to_le32(cex->ec_block); ext4_ext_store_pblock(ex, cex->ec_start); ex->ee_len = cpu_to_le16(cex->ec_len); @@ -1981,7 +2080,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, /* free index block */ path--; leaf = idx_pblock(path->p_idx); - BUG_ON(path->p_hdr->eh_entries == 0); + if (unlikely(path->p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + return -EIO; + } err = ext4_ext_get_access(handle, inode, path); if (err) return err; @@ -2119,8 +2221,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; - BUG_ON(eh == NULL); - + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } /* find where to start removing */ ex = EXT_LAST_EXTENT(eh); @@ -2983,7 +3087,7 @@ fix_extent_len: ext4_ext_dirty(handle, inode, path + depth); return err; } -static int ext4_convert_unwritten_extents_dio(handle_t *handle, +static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { @@ -3063,8 +3167,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); - /* DIO get_block() before submit the IO, split the extent */ - if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + /* get_block() before submit the IO, split the extent */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ret = ext4_split_unwritten_extents(handle, inode, path, iblock, max_blocks, flags); @@ -3074,14 +3178,16 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * completed */ if (io) - io->flag = DIO_AIO_UNWRITTEN; + io->flag = EXT4_IO_UNWRITTEN; else - EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + if (ext4_should_dioread_nolock(inode)) + set_buffer_uninit(bh_result); goto out; } - /* async DIO end_io complete, convert the filled extent to written */ - if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { - ret = ext4_convert_unwritten_extents_dio(handle, inode, + /* IO end_io complete, convert the filled extent to written */ + if ((flags & EXT4_GET_BLOCKS_CONVERT)) { + ret = ext4_convert_unwritten_extents_endio(handle, inode, path); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); @@ -3185,7 +3291,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; - struct ext4_extent newex, *ex; + struct ext4_extent newex, *ex, *last_ex; ext4_fsblk_t newblock; int err = 0, depth, ret, cache_type; unsigned int allocated = 0; @@ -3237,10 +3343,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_ext_find_extent() */ - if (path[depth].p_ext == NULL && depth != 0) { - ext4_error(inode->i_sb, __func__, "bad extent address " - "inode: %lu, iblock: %d, depth: %d", - inode->i_ino, iblock, depth); + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, "bad extent address " + "iblock: %d, depth: %d pblock %lld", + iblock, depth, path[depth].p_block); err = -EIO; goto out2; } @@ -3258,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ - if (iblock >= ee_block && iblock < ee_block + ee_len) { + if (in_range(iblock, ee_block, ee_len)) { newblock = iblock - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (iblock - ee_block); @@ -3350,21 +3456,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ ext4_ext_mark_uninitialized(&newex); /* - * io_end structure was created for every async - * direct IO write to the middle of the file. - * To avoid unecessary convertion for every aio dio rewrite - * to the mid of file, here we flag the IO that is really - * need the convertion. + * io_end structure was created for every IO write to an + * uninitialized extent. To avoid unecessary conversion, + * here we flag the IO that really needs the conversion. * For non asycn direct IO case, flag the inode state * that we need to perform convertion when IO is done. */ - if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { if (io) - io->flag = DIO_AIO_UNWRITTEN; + io->flag = EXT4_IO_UNWRITTEN; else - EXT4_I(inode)->i_state |= - EXT4_STATE_DIO_UNWRITTEN;; + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + if (ext4_should_dioread_nolock(inode)) + set_buffer_uninit(bh_result); + } + + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { + if (unlikely(!eh->eh_entries)) { + EXT4_ERROR_INODE(inode, + "eh->eh_entries == 0 ee_block %d", + ex->ee_block); + err = -EIO; + goto out2; } + last_ex = EXT_LAST_EXTENT(eh); + if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { @@ -3499,6 +3619,13 @@ static void ext4_falloc_update_inode(struct inode *inode, i_size_write(inode, new_size); if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if (new_size > i_size_read(inode)) + EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; } } @@ -3603,7 +3730,7 @@ retry: * Returns 0 on success. */ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - loff_t len) + ssize_t len) { handle_t *handle; ext4_lblk_t block; @@ -3635,7 +3762,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, map_bh.b_state = 0; ret = ext4_get_blocks(handle, inode, block, max_blocks, &map_bh, - EXT4_GET_BLOCKS_DIO_CONVERT_EXT); + EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) { WARN_ON(ret <= 0); printk(KERN_ERR "%s: ext4_ext_get_blocks " @@ -3739,7 +3866,7 @@ static int ext4_xattr_fiemap(struct inode *inode, int error = 0; /* in-inode? */ - if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct ext4_iloc iloc; int offset; /* offset of xattr in inode */ @@ -3767,7 +3894,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk; - ext4_lblk_t len_blks; int error = 0; /* fallback to generic here if not in extents fmt */ @@ -3781,8 +3907,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { error = ext4_xattr_fiemap(inode, fieinfo); } else { + ext4_lblk_t len_blks; + __u64 last_blk; + start_blk = start >> inode->i_sb->s_blocksize_bits; - len_blks = len >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCK) + last_blk = EXT_MAX_BLOCK-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* * Walk the extent tree gathering extent information. diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 56eee3d796c..d0776e410f3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/quotaops.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -35,9 +36,9 @@ */ static int ext4_release_file(struct inode *inode, struct file *filp) { - if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); - EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; + ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && @@ -125,7 +126,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) sb->s_dirt = 1; } } - return generic_file_open(inode, filp); + return dquot_file_open(inode, filp); } const struct file_operations ext4_file_operations = { diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 98bd140aad0..0d0c3239c1c 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -63,7 +63,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) return 0; - ret = flush_aio_dio_completed_IO(inode); + ret = flush_completed_IO(inode); if (ret < 0) return ret; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f3624ead4f6..361c0b9962a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, "Checksum bad for group %u", - block_group); + ext4_error(sb, "Checksum bad for group %u", block_group); ext4_free_blks_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); @@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_inode_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, __func__, - "Cannot read inode bitmap - " + ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); - ext4_error(sb, __func__, - "Cannot read inode bitmap - " + ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -217,10 +214,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_init(inode); + dquot_initialize(inode); ext4_xattr_delete_inode(handle, inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) es = EXT4_SB(sb)->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext4_error(sb, "ext4_free_inode", - "reserved or nonexistent inode %lu", ino); + ext4_error(sb, "reserved or nonexistent inode %lu", ino); goto error_return; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); @@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit, bitmap_bh->b_data); if (!cleared) - ext4_error(sb, "ext4_free_inode", - "bit already cleared for inode %lu", ino); + ext4_error(sb, "bit already cleared for inode %lu", ino); else { gdp = ext4_get_group_desc(sb, block_group, &bh2); @@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb, if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { ext4_unlock_group(sb, group); - ext4_error(sb, __func__, - "reserved inode or inode > inodes count - " + ext4_error(sb, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, ino + group * EXT4_INODES_PER_GROUP(sb)); return 1; @@ -904,7 +898,7 @@ repeat_in_this_group: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, - inode, + NULL, inode_bitmap_bh); if (err) goto fail; @@ -1029,15 +1023,16 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - ei->i_state = EXT4_STATE_NEW; + ei->i_state_flags = 0; + ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext4_init_acl(handle, inode, dir); if (err) @@ -1074,10 +1069,10 @@ really_out: return ret; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); @@ -1098,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { - ext4_warning(sb, __func__, - "bad orphan ino %lu! e2fsck was run?", ino); + ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); goto error; } @@ -1107,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (!bitmap_bh) { - ext4_warning(sb, __func__, - "inode bitmap error for orphan %lu", ino); + ext4_warning(sb, "inode bitmap error for orphan %lu", ino); goto error; } @@ -1140,8 +1133,7 @@ iget_failed: err = PTR_ERR(inode); inode = NULL; bad_orphan: - ext4_warning(sb, __func__, - "bad orphan inode %lu! e2fsck was run?", ino); + ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e11952404e0..986120f3006 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -38,6 +38,7 @@ #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> +#include <linux/kernel.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -170,6 +171,9 @@ void ext4_delete_inode(struct inode *inode) handle_t *handle; int err; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); @@ -194,7 +198,7 @@ void ext4_delete_inode(struct inode *inode) inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } @@ -212,7 +216,7 @@ void ext4_delete_inode(struct inode *inode) if (err > 0) err = ext4_journal_restart(handle, 3); if (err != 0) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "couldn't extend journal (err %d)", err); stop_handle: ext4_journal_stop(handle); @@ -323,8 +327,7 @@ static int ext4_block_to_path(struct inode *inode, offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { - ext4_warning(inode->i_sb, "ext4_block_to_path", - "block %lu > max in inode %lu", + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } @@ -344,7 +347,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - ext4_error(inode->i_sb, function, + __ext4_error(inode->i_sb, function, "invalid block reference %u " "in inode #%lu", blk, inode->i_ino); return -EIO; @@ -607,7 +610,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, if (*err) goto failed_out; - BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } target -= count; /* allocate blocks for indirect blocks */ @@ -643,7 +653,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; current_block = ext4_mb_new_blocks(handle, &ar, err); - BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } if (*err && (target == blks)) { /* @@ -1061,6 +1078,7 @@ void ext4_da_update_reserve_space(struct inode *inode, int mdb_free = 0, allocated_meta_blocks = 0; spin_lock(&ei->i_block_reservation_lock); + trace_ext4_da_update_reserve_space(inode, used); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " "with only %d reserved data blocks\n", @@ -1093,9 +1111,9 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update quota subsystem */ if (quota_claim) { - vfs_dq_claim_block(inode, used); + dquot_claim_block(inode, used); if (mdb_free) - vfs_dq_release_reservation_block(inode, mdb_free); + dquot_release_reservation_block(inode, mdb_free); } else { /* * We did fallocate with an offset that is already delayed @@ -1106,8 +1124,8 @@ void ext4_da_update_reserve_space(struct inode *inode, * that */ if (allocated_meta_blocks) - vfs_dq_claim_block(inode, allocated_meta_blocks); - vfs_dq_release_reservation_block(inode, mdb_free + used); + dquot_claim_block(inode, allocated_meta_blocks); + dquot_release_reservation_block(inode, mdb_free + used); } /* @@ -1124,7 +1142,7 @@ static int check_block_validity(struct inode *inode, const char *msg, sector_t logical, sector_t phys, int len) { if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { - ext4_error(inode->i_sb, msg, + __ext4_error(inode->i_sb, msg, "inode #%lu logical block %llu mapped to %llu " "(size %d)", inode->i_ino, (unsigned long long) logical, @@ -1306,7 +1324,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * i_data's format changing. Force the migrate * to fail by clearing migrate flags */ - EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } /* @@ -1534,6 +1552,8 @@ static void ext4_truncate_failed_write(struct inode *inode) ext4_truncate(inode); } +static int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1575,8 +1595,12 @@ retry: } *pagep = page; - ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_get_block); + if (ext4_should_dioread_nolock(inode)) + ret = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, ext4_get_block_write); + else + ret = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = walk_page_buffers(handle, page_buffers(page), @@ -1793,7 +1817,7 @@ static int ext4_journalled_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + ext4_set_inode_state(inode, EXT4_STATE_JDATA); if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1836,6 +1860,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned long md_needed, md_reserved; + int ret; /* * recalculate the amount of metadata blocks to reserve @@ -1846,6 +1871,7 @@ repeat: spin_lock(&ei->i_block_reservation_lock); md_reserved = ei->i_reserved_meta_blocks; md_needed = ext4_calc_metadata_amount(inode, lblock); + trace_ext4_da_reserve_space(inode, md_needed); spin_unlock(&ei->i_block_reservation_lock); /* @@ -1853,11 +1879,12 @@ repeat: * later. Real quota accounting is done at pages writeout * time. */ - if (vfs_dq_reserve_block(inode, md_needed + 1)) - return -EDQUOT; + ret = dquot_reserve_block(inode, md_needed + 1); + if (ret) + return ret; if (ext4_claim_free_blocks(sbi, md_needed + 1)) { - vfs_dq_release_reservation_block(inode, md_needed + 1); + dquot_release_reservation_block(inode, md_needed + 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1914,7 +1941,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, to_free); + dquot_release_reservation_block(inode, to_free); } static void ext4_da_page_release_reservation(struct page *page, @@ -2091,6 +2118,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); + if (buffer_uninit(exbh)) + set_buffer_uninit(bh); cur_logical++; pblock++; } while ((bh = bh->b_this_page) != head); @@ -2133,17 +2162,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) + if (page->index > end) break; - index++; - BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); block_invalidatepage(page, 0); ClearPageUptodate(page); unlock_page(page); } + index = pvec.pages[nr_pages - 1]->index + 1; + pagevec_release(&pvec); } return; } @@ -2220,6 +2248,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) */ new.b_state = 0; get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + if (ext4_should_dioread_nolock(mpd->inode)) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (mpd->b_state & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; @@ -2630,11 +2660,14 @@ static int __ext4_journalled_writepage(struct page *page, ret = err; walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: return ret; } +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); + /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). We even don't @@ -2682,7 +2715,7 @@ static int ext4_writepage(struct page *page, int ret = 0; loff_t size; unsigned int len; - struct buffer_head *page_bufs; + struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; trace_ext4_writepage(inode, page); @@ -2758,7 +2791,11 @@ static int ext4_writepage(struct page *page, if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ret = nobh_writepage(page, noalloc_get_block_write, wbc); - else + else if (page_bufs && buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + ret = block_write_full_page_endio(page, noalloc_get_block_write, + wbc, ext4_end_io_buffer_write); + } else ret = block_write_full_page(page, noalloc_get_block_write, wbc); @@ -3301,7 +3338,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); } - if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { + if (EXT4_JOURNAL(inode) && + ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -3320,7 +3358,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) * everything they get. */ - EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; + ext4_clear_inode_state(inode, EXT4_STATE_JDATA); journal = EXT4_JOURNAL(inode); jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal); @@ -3345,11 +3383,45 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } +static void ext4_free_io_end(ext4_io_end_t *io) +{ + BUG_ON(!io); + if (io->page) + put_page(io->page); + iput(io->inode); + kfree(io); +} + +static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + if (!page_has_buffers(page)) + return; + head = bh = page_buffers(page); + do { + if (offset <= curr_off && test_clear_buffer_uninit(bh) + && bh->b_private) { + ext4_free_io_end(bh->b_private); + bh->b_private = NULL; + bh->b_end_io = NULL; + } + curr_off = curr_off + bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +} + static void ext4_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); /* + * free any io_end structure allocated for buffers to be discarded + */ + if (ext4_should_dioread_nolock(page->mapping->host)) + ext4_invalidatepage_free_endio(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0) @@ -3420,7 +3492,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, } retry: - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + if (rw == READ && ext4_should_dioread_nolock(inode)) + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + else + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3436,6 +3515,9 @@ retry: * but cannot extend i_size. Bail out and pretend * the write failed... */ ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + goto out; } if (inode->i_nlink) @@ -3463,75 +3545,63 @@ out: return ret; } -static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, +static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle = NULL; + handle_t *handle = ext4_journal_current_handle(); int ret = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int dio_credits; + int started = 0; - ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", + ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", inode->i_ino, create); /* - * DIO VFS code passes create = 0 flag for write to - * the middle of file. It does this to avoid block - * allocation for holes, to prevent expose stale data - * out when there is parallel buffered read (which does - * not hold the i_mutex lock) while direct IO write has - * not completed. DIO request on holes finally falls back - * to buffered IO for this reason. - * - * For ext4 extent based file, since we support fallocate, - * new allocated extent as uninitialized, for holes, we - * could fallocate blocks for holes, thus parallel - * buffered IO read will zero out the page when read on - * a hole while parallel DIO write to the hole has not completed. - * - * when we come here, we know it's a direct IO write to - * to the middle of file (<i_size) - * so it's safe to override the create flag from VFS. + * ext4_get_block in prepare for a DIO write or buffer write. + * We allocate an uinitialized extent if blocks haven't been allocated. + * The extent will be converted to initialized after IO complete. */ - create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; + create = EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); - handle = ext4_journal_start(inode, dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + if (!handle) { + if (max_blocks > DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + started = 1; } + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; } - ext4_journal_stop(handle); + if (started) + ext4_journal_stop(handle); out: return ret; } -static void ext4_free_io_end(ext4_io_end_t *io) -{ - BUG_ON(!io); - iput(io->inode); - kfree(io); -} -static void dump_aio_dio_list(struct inode * inode) +static void dump_completed_IO(struct inode * inode) { #ifdef EXT4_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; + unsigned long flags; - if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ - ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); + if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ + ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); return; } - ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); - list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ + ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ cur = &io->list; before = cur->prev; io0 = container_of(before, ext4_io_end_t, list); @@ -3541,32 +3611,31 @@ static void dump_aio_dio_list(struct inode * inode) ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", io, inode->i_ino, io0, io1); } + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); #endif } /* * check a range of space and convert unwritten extents to written. */ -static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) +static int ext4_end_io_nolock(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; - size_t size = io->size; + ssize_t size = io->size; int ret = 0; - ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); if (list_empty(&io->list)) return ret; - if (io->flag != DIO_AIO_UNWRITTEN) + if (io->flag != EXT4_IO_UNWRITTEN) return ret; - if (offset + size <= i_size_read(inode)) - ret = ext4_convert_unwritten_extents(inode, offset, size); - + ret = ext4_convert_unwritten_extents(inode, offset, size); if (ret < 0) { printk(KERN_EMERG "%s: failed to convert unwritten" "extents to written extents, error is %d" @@ -3579,50 +3648,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) io->flag = 0; return ret; } + /* * work on completed aio dio IO, to convert unwritten extents to extents */ -static void ext4_end_aio_dio_work(struct work_struct *work) +static void ext4_end_io_work(struct work_struct *work) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - int ret = 0; + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret; mutex_lock(&inode->i_mutex); - ret = ext4_end_aio_dio_nolock(io); - if (ret >= 0) { - if (!list_empty(&io->list)) - list_del_init(&io->list); - ext4_free_io_end(io); + ret = ext4_end_io_nolock(io); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + return; } + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (!list_empty(&io->list)) + list_del_init(&io->list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); mutex_unlock(&inode->i_mutex); + ext4_free_io_end(io); } + /* * This function is called from ext4_sync_file(). * - * When AIO DIO IO is completed, the work to convert unwritten - * extents to written is queued on workqueue but may not get immediately + * When IO is completed, the work to convert unwritten extents to + * written is queued on workqueue but may not get immediately * scheduled. When fsync is called, we need to ensure the * conversion is complete before fsync returns. - * The inode keeps track of a list of completed AIO from DIO path - * that might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents to written. + * The inode keeps track of a list of pending/completed IO that + * might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents for completed IO + * to written. + * The function return the number of pending IOs on success. */ -int flush_aio_dio_completed_IO(struct inode *inode) +int flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; int ret = 0; int ret2 = 0; - if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) + if (list_empty(&ei->i_completed_io_list)) return ret; - dump_aio_dio_list(inode); - while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ - io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, + dump_completed_IO(inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&ei->i_completed_io_list)){ + io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); /* - * Calling ext4_end_aio_dio_nolock() to convert completed + * Calling ext4_end_io_nolock() to convert completed * IO to written. * * When ext4_sync_file() is called, run_queue() may already @@ -3635,20 +3718,23 @@ int flush_aio_dio_completed_IO(struct inode *inode) * avoid double converting from both fsync and background work * queue work. */ - ret = ext4_end_aio_dio_nolock(io); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + ret = ext4_end_io_nolock(io); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (ret < 0) ret2 = ret; else list_del_init(&io->list); } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? ret2 : 0; } -static ext4_io_end_t *ext4_init_io_end (struct inode *inode) +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) { ext4_io_end_t *io = NULL; - io = kmalloc(sizeof(*io), GFP_NOFS); + io = kmalloc(sizeof(*io), flags); if (io) { igrab(inode); @@ -3656,8 +3742,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode) io->flag = 0; io->offset = 0; io->size = 0; - io->error = 0; - INIT_WORK(&io->work, ext4_end_aio_dio_work); + io->page = NULL; + INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -3669,6 +3755,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; + unsigned long flags; + struct ext4_inode_info *ei; /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) @@ -3680,7 +3768,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); /* if not aio dio with unwritten extents, just free io and return */ - if (io_end->flag != DIO_AIO_UNWRITTEN){ + if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; return; @@ -3688,16 +3776,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, io_end->offset = offset; io_end->size = size; + io_end->flag = EXT4_IO_UNWRITTEN; wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); /* Add the io_end to per-inode completed aio dio list*/ - list_add_tail(&io_end->list, - &EXT4_I(io_end->inode)->i_aio_dio_complete_list); + ei = EXT4_I(io_end->inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &ei->i_completed_io_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); iocb->private = NULL; } + +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) +{ + ext4_io_end_t *io_end = bh->b_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + + if (!test_clear_buffer_uninit(bh) || !io_end) + goto out; + + if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { + printk("sb umounted, discard end_io request for inode %lu\n", + io_end->inode->i_ino); + ext4_free_io_end(io_end); + goto out; + } + + io_end->flag = EXT4_IO_UNWRITTEN; + inode = io_end->inode; + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +out: + bh->b_private = NULL; + bh->b_end_io = NULL; + clear_buffer_uninit(bh); + end_buffer_async_write(bh, uptodate); +} + +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; + size_t size = bh->b_size; + +retry: + io_end = ext4_init_io_end(inode, GFP_ATOMIC); + if (!io_end) { + if (printk_ratelimit()) + printk(KERN_WARNING "%s: allocation fail\n", __func__); + schedule(); + goto retry; + } + io_end->offset = offset; + io_end->size = size; + /* + * We need to hold a reference to the page to make sure it + * doesn't get evicted before ext4_end_io_work() has a chance + * to convert the extent from written to unwritten. + */ + io_end->page = page; + get_page(io_end->page); + + bh->b_private = io_end; + bh->b_end_io = ext4_end_io_buffer_write; + return 0; +} + /* * For ext4 extent files, ext4 will do direct-io write to holes, * preallocated extents, and those write extend the file, no need to @@ -3751,7 +3908,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; EXT4_I(inode)->cur_aio_dio = NULL; if (!is_sync_kiocb(iocb)) { - iocb->private = ext4_init_io_end(inode); + iocb->private = ext4_init_io_end(inode, GFP_NOFS); if (!iocb->private) return -ENOMEM; /* @@ -3767,7 +3924,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, - ext4_get_block_dio_write, + ext4_get_block_write, ext4_end_io_dio); if (iocb->private) EXT4_I(inode)->cur_aio_dio = NULL; @@ -3788,8 +3945,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0 && (EXT4_I(inode)->i_state & - EXT4_STATE_DIO_UNWRITTEN)) { + } else if (ret > 0 && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { int err; /* * for non AIO case, since the IO is already @@ -3799,7 +3956,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, offset, ret); if (err < 0) ret = err; - EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } return ret; } @@ -4130,18 +4287,27 @@ no_top: * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. */ -static void ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, - ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, - __le32 *last) +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) { __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + ext4_error(inode->i_sb, "inode #%lu: " + "attempt to clear blocks %llu len %lu, invalid", + inode->i_ino, (unsigned long long) block_to_free, + count); + return 1; + } + if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); @@ -4160,6 +4326,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, *p = 0; ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); + return 0; } /** @@ -4215,9 +4382,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, } else if (nr == block_to_free + count) { count++; } else { - ext4_clear_blocks(handle, inode, this_bh, - block_to_free, - count, block_to_free_p, p); + if (ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p)) + break; block_to_free = nr; block_to_free_p = p; count = 1; @@ -4241,7 +4409,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "circular indirect block detected, " "inode=%lu, block=%llu", inode->i_ino, @@ -4281,6 +4449,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!nr) continue; /* A hole */ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + ext4_error(inode->i_sb, + "indirect mapped block in inode " + "#%lu invalid (level %d, blk #%lu)", + inode->i_ino, depth, + (unsigned long) nr); + break; + } + /* Go read the buffer for the next level down */ bh = sb_bread(inode->i_sb, nr); @@ -4289,7 +4467,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). */ if (!bh) { - ext4_error(inode->i_sb, "ext4_free_branches", + ext4_error(inode->i_sb, "Read failure, inode=%lu, block=%llu", inode->i_ino, nr); continue; @@ -4433,8 +4611,10 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; + EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) - ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ext4_ext_truncate(inode); @@ -4604,9 +4784,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (!bh) { - ext4_error(sb, "ext4_get_inode_loc", "unable to read " - "inode block - inode=%lu, block=%llu", - inode->i_ino, block); + ext4_error(sb, "unable to read inode block - " + "inode=%lu, block=%llu", inode->i_ino, block); return -EIO; } if (!buffer_uptodate(bh)) { @@ -4704,9 +4883,8 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_error(sb, __func__, - "unable to read inode block - inode=%lu, " - "block=%llu", inode->i_ino, block); + ext4_error(sb, "unable to read inode block - inode=%lu," + " block=%llu", inode->i_ino, block); brelse(bh); return -EIO; } @@ -4720,7 +4898,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { /* We have all inode data except xattrs in memory here. */ return __ext4_get_inode_loc(inode, iloc, - !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); + !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } void ext4_set_inode_flags(struct inode *inode) @@ -4814,7 +4992,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - ei->i_state = 0; + ei->i_state_flags = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -4897,7 +5075,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ei->i_state |= EXT4_STATE_XATTR; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); } } else ei->i_extra_isize = 0; @@ -4917,8 +5095,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - ext4_error(sb, __func__, - "bad extended attribute block %llu in inode #%lu", + ext4_error(sb, "bad extended attribute block %llu inode #%lu", ei->i_file_acl, inode->i_ino); ret = -EIO; goto bad_inode; @@ -4964,8 +5141,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { ret = -EIO; - ext4_error(inode->i_sb, __func__, - "bogus i_mode (%o) for inode=%lu", + ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", inode->i_mode, inode->i_ino); goto bad_inode; } @@ -5037,7 +5213,7 @@ static int ext4_do_update_inode(handle_t *handle, /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ - if (ei->i_state & EXT4_STATE_NEW) + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_get_inode_flags(ei); @@ -5101,7 +5277,7 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); sb->s_dirt = 1; ext4_handle_sync(handle); - err = ext4_handle_dirty_metadata(handle, inode, + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } @@ -5130,10 +5306,10 @@ static int ext4_do_update_inode(handle_t *handle, } BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - rc = ext4_handle_dirty_metadata(handle, inode, bh); + rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) err = rc; - ei->i_state &= ~EXT4_STATE_NEW; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); ext4_update_inode_fsync_trans(handle, inode, 0); out_brelse: @@ -5177,7 +5353,7 @@ out_brelse: * `stuff()' is running, and the new i_size will be lost. Plus the inode * will no longer be on the superblock's dirty inode list. */ -int ext4_write_inode(struct inode *inode, int wait) +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; @@ -5191,7 +5367,7 @@ int ext4_write_inode(struct inode *inode, int wait) return -EIO; } - if (!wait) + if (wbc->sync_mode != WB_SYNC_ALL) return 0; err = ext4_force_commit(inode->i_sb); @@ -5201,13 +5377,11 @@ int ext4_write_inode(struct inode *inode, int wait) err = ext4_get_inode_loc(inode, &iloc); if (err) return err; - if (wait) + if (wbc->sync_mode == WB_SYNC_ALL) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - ext4_error(inode->i_sb, __func__, - "IO error syncing inode, " - "inode=%lu, block=%llu", - inode->i_ino, + ext4_error(inode->i_sb, "IO error syncing inode, " + "inode=%lu, block=%llu", inode->i_ino, (unsigned long long)iloc.bh->b_blocknr); err = -EIO; } @@ -5249,6 +5423,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if (ia_valid & ATTR_SIZE) + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; @@ -5261,7 +5437,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext4_journal_stop(handle); return error; @@ -5288,7 +5464,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + attr->ia_valid & ATTR_SIZE && + (attr->ia_size < inode->i_size || + (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -5319,6 +5497,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } } + /* ext4_truncate will clear the flag */ + if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) + ext4_truncate(inode); } rc = inode_setattr(inode, attr); @@ -5557,8 +5738,8 @@ static int ext4_expand_extra_isize(struct inode *inode, entry = IFIRST(header); /* No extended attributes present */ - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || - header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, new_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; @@ -5602,7 +5783,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && - !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { + !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { /* * We need extra buffer credits since we may write into EA block * with this same handle. If journal_extend fails, then it will @@ -5616,10 +5797,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) sbi->s_want_extra_isize, iloc, handle); if (ret) { - EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + ext4_set_inode_state(inode, + EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete" " some EAs or run e2fsck.", inode->i_ino); @@ -5641,7 +5823,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_block() will always dirty the inode when blocks + * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing @@ -5683,7 +5865,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode) err = jbd2_journal_get_write_access(handle, iloc.bh); if (!err) err = ext4_handle_dirty_metadata(handle, - inode, + NULL, iloc.bh); brelse(iloc.bh); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index b63d193126d..016d0249294 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags &= ~EXT4_EXTENTS_FL; } + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -249,7 +258,8 @@ setversion_out: if (me.moved_len > 0) file_remove_suid(donor_filp); - if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) err = -EFAULT; mext_out: fput(donor_filp); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d34afad3e13..506713a2ebd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -441,10 +441,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += first + i; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_grp_locked_error(sb, e4b->bd_group, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %u)", @@ -1255,10 +1254,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += block; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_grp_locked_error(sb, e4b->bd_group, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %u)", @@ -1631,7 +1629,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_super_block *es = sbi->s_es; struct ext4_free_extent ex; if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) @@ -1648,8 +1645,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; - start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + - ex.fe_start + le32_to_cpu(es->s_first_data_block); + start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + + ex.fe_start; /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; @@ -1803,8 +1800,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ - first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) - + le32_to_cpu(sbi->s_es->s_first_data_block); + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; @@ -2256,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); - meta_group_info[i]->bb_free_root.rb_node = NULL; + meta_group_info[i]->bb_free_root = RB_ROOT; #ifdef DOUBLE_CHECK { @@ -2560,12 +2557,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ext4_unlock_group(sb, entry->group); if (test_opt(sb, DISCARD)) { ext4_fsblk_t discard_block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - discard_block = (ext4_fsblk_t)entry->group * - EXT4_BLOCKS_PER_GROUP(sb) - + entry->start_blk - + le32_to_cpu(es->s_first_data_block); + discard_block = entry->start_blk + + ext4_group_first_block_no(sb, entry->group); trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, entry->count); @@ -2703,14 +2697,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (err) goto out_err; - block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + ac->ac_b_ex.fe_start - + le32_to_cpu(es->s_first_data_block); + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = ac->ac_b_ex.fe_len; if (!ext4_data_block_valid(sbi, block, len)) { - ext4_error(sb, __func__, - "Allocating blocks %llu-%llu which overlap " + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata\n", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation @@ -3161,9 +3152,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; - goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + - ac->ac_g_ex.fe_start + - le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. @@ -3526,8 +3515,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + - le32_to_cpu(sbi->s_es->s_first_data_block); + start = ext4_group_first_block_no(sb, group) + bit; mb_debug(1, " free preallocated %u/%u in group %u\n", (unsigned) start, (unsigned) next - bit, (unsigned) group); @@ -3623,15 +3611,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - ext4_error(sb, __func__, "Error in reading block " - "bitmap for %u", group); + ext4_error(sb, "Error reading block bitmap for %u", group); return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", group); put_bh(bitmap_bh); return 0; } @@ -3804,15 +3790,15 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", + group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - ext4_error(sb, __func__, "Error in reading block " - "bitmap for %u", group); + ext4_error(sb, "Error reading block bitmap for %u", + group); ext4_mb_release_desc(&e4b); continue; } @@ -3938,7 +3924,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) /* don't use group allocation for large files */ size = max(size, isize); - if (size >= sbi->s_mb_stream_request) { + if (size > sbi->s_mb_stream_request) { ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; } @@ -4077,8 +4063,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); if (ext4_mb_load_buddy(sb, group, &e4b)) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", + group); continue; } ext4_lock_group(sb, group); @@ -4254,7 +4240,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, return 0; } reserv_blks = ar->len; - while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { + while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } @@ -4331,7 +4317,7 @@ out2: kmem_cache_free(ext4_ac_cachep, ac); out1: if (inquota && ar->len < inquota) - vfs_dq_free_block(ar->inode, inquota - ar->len); + dquot_free_block(ar->inode, inquota - ar->len); out3: if (!ar->len) { if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) @@ -4476,10 +4462,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; - if (!ext4_data_block_valid(sbi, block, count)) { - ext4_error(sb, __func__, - "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_data_block_valid(sbi, block, count)) { + ext4_error(sb, "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); goto error_return; } @@ -4547,8 +4533,7 @@ do_more: in_range(block + count - 1, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, __func__, - "Freeing blocks in system zone - " + ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_return; @@ -4646,7 +4631,7 @@ do_more: sb->s_dirt = 1; error_return: if (freed) - vfs_dq_free_block(inode, freed); + dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); if (ac) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 436521cae45..b619322c76f 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -220,16 +220,9 @@ struct ext4_buddy { #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { - ext4_fsblk_t block; - - block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + fex->fe_start - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - return block; + return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; } #endif diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 81415814b00..8b87bd0eac9 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -365,12 +365,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * happened after we started the migrate. We need to * fail the migrate */ - if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { retval = -EAGAIN; up_write(&EXT4_I(inode)->i_data_sem); goto err_out; } else - EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); /* * We have the extent map build with the tmp inode. * Now copy the i_data across @@ -503,14 +503,10 @@ int ext4_ext_migrate(struct inode *inode) } i_size_write(tmp_inode, i_size_read(inode)); /* - * We don't want the inode to be reclaimed - * if we got interrupted in between. We have - * this tmp inode carrying reference to the - * data blocks of the original file. We set - * the i_nlink to zero at the last stage after - * switching the original file to extent format + * Set the i_nlink to zero so it will be deleted later + * when we drop inode reference. */ - tmp_inode->i_nlink = 1; + tmp_inode->i_nlink = 0; ext4_ext_tree_init(handle, tmp_inode); ext4_orphan_add(handle, tmp_inode); @@ -533,10 +529,20 @@ int ext4_ext_migrate(struct inode *inode) * allocation. */ down_read((&EXT4_I(inode)->i_data_sem)); - EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; + ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + /* + * It is impossible to update on-disk structures without + * a handle, so just rollback in-core changes and live other + * work to orphan_list_cleanup() + */ + ext4_orphan_del(NULL, tmp_inode); + retval = PTR_ERR(handle); + goto out; + } ei = EXT4_I(inode); i_data = ei->i_data; @@ -618,15 +624,8 @@ err_out: /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); - - /* - * Set the i_nlink to zero so that - * generic_drop_inode really deletes the - * inode - */ - tmp_inode->i_nlink = 0; - ext4_journal_stop(handle); +out: unlock_new_inode(tmp_inode); iput(tmp_inode); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 82c415be87a..aa5fe28d180 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -152,12 +152,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2, int ret = 0; if (inode1 == NULL) { - ext4_error(inode2->i_sb, function, + __ext4_error(inode2->i_sb, function, "Both inodes should not be NULL: " "inode1 NULL inode2 %lu", inode2->i_ino); ret = -EIO; } else if (inode2 == NULL) { - ext4_error(inode1->i_sb, function, + __ext4_error(inode1->i_sb, function, "Both inodes should not be NULL: " "inode1 %lu inode2 NULL", inode1->i_ino); ret = -EIO; @@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, } o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); new_flag = 1; } else if (start_ext->ee_len && new_ext->ee_len && @@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, * orig |------------------------------| */ o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); new_flag = 1; } else if (!start_ext->ee_len && new_ext->ee_len && @@ -475,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, struct ext4_extent *oext, *o_start, *o_end, *prev_ext; struct ext4_extent new_ext, start_ext, end_ext; ext4_lblk_t new_ext_end; - ext4_fsblk_t new_phys_end; int oext_alen, new_ext_alen, end_ext_alen; int depth = ext_depth(orig_inode); int ret; @@ -489,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, new_ext.ee_len = dext->ee_len; new_ext_alen = ext4_ext_get_actual_len(&new_ext); new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; - new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1; /* * Case: original extent is first @@ -502,6 +502,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, le32_to_cpu(oext->ee_block) + oext_alen) { start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - le32_to_cpu(oext->ee_block)); + start_ext.ee_block = oext->ee_block; copy_extent_status(oext, &start_ext); } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { prev_ext = oext - 1; @@ -515,6 +516,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, start_ext.ee_len = cpu_to_le16( ext4_ext_get_actual_len(prev_ext) + new_ext_alen); + start_ext.ee_block = oext->ee_block; copy_extent_status(prev_ext, &start_ext); new_ext.ee_len = 0; } @@ -526,7 +528,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * new_ext |-------| */ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { - ext4_error(orig_inode->i_sb, __func__, + ext4_error(orig_inode->i_sb, "new_ext_end(%u) should be less than or equal to " "oext->ee_block(%u) + oext_alen(%d) - 1", new_ext_end, le32_to_cpu(oext->ee_block), @@ -689,12 +691,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, while (1) { /* The extent for donor must be found. */ if (!dext) { - ext4_error(donor_inode->i_sb, __func__, + ext4_error(donor_inode->i_sb, "The extent for donor must be found"); *err = -EIO; goto out; } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { - ext4_error(donor_inode->i_sb, __func__, + ext4_error(donor_inode->i_sb, "Donor offset(%u) and the first block of donor " "extent(%u) should be equal", donor_off, @@ -928,7 +930,7 @@ out2: } /** - * mext_check_argumants - Check whether move extent can be done + * mext_check_arguments - Check whether move extent can be done * * @orig_inode: original inode * @donor_inode: donor inode @@ -949,14 +951,6 @@ mext_check_arguments(struct inode *orig_inode, unsigned int blkbits = orig_inode->i_blkbits; unsigned int blocksize = 1 << blkbits; - /* Regular file check */ - if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { - ext4_debug("ext4 move extent: The argument files should be " - "regular file [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" " to donor file [ino:orig %lu, donor %lu]\n", @@ -1204,6 +1198,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " + "regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* Protect orig and donor inodes against a truncate */ ret1 = mext_inode_double_lock(orig_inode, donor_inode); if (ret1 < 0) @@ -1351,7 +1353,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (ret1 < 0) break; if (*moved_len > len) { - ext4_error(orig_inode->i_sb, __func__, + ext4_error(orig_inode->i_sb, "We replaced blocks too much! " "sum of replaced: %llu requested: %llu", *moved_len, len); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 17a17e10dd6..0c070fabd10 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, __func__, - "Unrecognised inode hash code %d", + ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", root->info.hash_version); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, __func__, - "Unimplemented inode hash flags: %#06x", + ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, } if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, __func__, - "Unimplemented inode hash depth: %#06x", + ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext4_warning(dir->i_sb, __func__, - "dx entry: limit != root limit"); + ext4_warning(dir->i_sb, "dx entry: limit != root limit"); brelse(bh); *err = ERR_BAD_DX_DIR; goto fail; @@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, __func__, + ext4_warning(dir->i_sb, "dx entry: no count or count > limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, goto fail2; at = entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, __func__, + ext4_warning(dir->i_sb, "dx entry: limit != node limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -494,7 +490,7 @@ fail2: } fail: if (*err == ERR_BAD_DX_DIR) - ext4_warning(dir->i_sb, __func__, + ext4_warning(dir->i_sb, "Corrupt dir inode %ld, running e2fsck is " "recommended.", dir->i_ino); return NULL; @@ -947,9 +943,8 @@ restart: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ - ext4_error(sb, __func__, "reading directory #%lu " - "offset %lu", dir->i_ino, - (unsigned long)block); + ext4_error(sb, "reading directory #%lu offset %lu", + dir->i_ino, (unsigned long)block); brelse(bh); goto next; } @@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q retval = ext4_htree_next_block(dir, hash, frame, frames, NULL); if (retval < 0) { - ext4_warning(sb, __func__, + ext4_warning(sb, "error reading index page in directory #%lu", dir->i_ino); *err = retval; @@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { - ext4_error(dir->i_sb, "ext4_lookup", - "bad inode number: %u", ino); + ext4_error(dir->i_sb, "bad inode number: %u", ino); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); if (unlikely(IS_ERR(inode))) { if (PTR_ERR(inode) == -ESTALE) { - ext4_error(dir->i_sb, __func__, + ext4_error(dir->i_sb, "deleted inode referenced: %u", ino); return ERR_PTR(-EIO); @@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child) brelse(bh); if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { - ext4_error(child->d_inode->i_sb, "ext4_get_parent", + ext4_error(child->d_inode->i_sb, "bad inode number: %u", ino); return ERR_PTR(-EIO); } @@ -1410,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); if ((char *) de >= (((char *) root) + blocksize)) { - ext4_error(dir->i_sb, __func__, + ext4_error(dir->i_sb, "invalid rec_len for '..' in inode %lu", dir->i_ino); brelse(bh); @@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext4_warning(sb, __func__, - "Directory index full!"); + ext4_warning(sb, "Directory index full!"); err = -ENOSPC; goto cleanup; } @@ -1766,6 +1759,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, struct inode *inode; int err, retries = 0; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1800,6 +1795,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1837,6 +1834,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1916,11 +1915,11 @@ static int empty_dir(struct inode *inode) if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { if (err) - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "error %d reading directory #%lu offset 0", err, inode->i_ino); else - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "bad directory (dir #%lu) - no data block", inode->i_ino); return 1; @@ -1931,7 +1930,7 @@ static int empty_dir(struct inode *inode) !le32_to_cpu(de1->inode) || strcmp(".", de->name) || strcmp("..", de1->name)) { - ext4_warning(inode->i_sb, "empty_dir", + ext4_warning(inode->i_sb, "bad directory (dir #%lu) - no `.' or `..'", inode->i_ino); brelse(bh); @@ -1949,7 +1948,7 @@ static int empty_dir(struct inode *inode) offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); if (!bh) { if (err) - ext4_error(sb, __func__, + ext4_error(sb, "error %d reading directory" " #%lu offset %u", err, inode->i_ino, offset); @@ -2020,11 +2019,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out_unlock; + /* + * Due to previous errors inode may be already a part of on-disk + * orphan list. If so skip on-disk list modification. + */ + if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) + goto mem_insert; /* Insert this inode at the head of the on-disk orphan list... */ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; @@ -2037,6 +2043,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * * This is safe: on error we're going to ignore the orphan list * anyway on the next recovery. */ +mem_insert: if (!err) list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); @@ -2096,7 +2103,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (err) goto out_brelse; sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = @@ -2136,7 +2143,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2163,7 +2172,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning(inode->i_sb, "ext4_rmdir", + ext4_warning(inode->i_sb, "empty directory has too many links (%d)", inode->i_nlink); inode->i_version++; @@ -2195,7 +2204,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2215,7 +2226,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) goto end_unlink; if (!inode->i_nlink) { - ext4_warning(inode->i_sb, "ext4_unlink", + ext4_warning(inode->i_sb, "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); inode->i_nlink = 1; @@ -2250,6 +2261,8 @@ static int ext4_symlink(struct inode *dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + @@ -2308,6 +2321,8 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; + dquot_initialize(dir); + /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing * otherwise has the potential to corrupt the orphan inode list. @@ -2358,12 +2373,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - vfs_dq_init(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext4_journal_start(old_dir, 2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); @@ -2462,7 +2480,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, } } if (retval) { - ext4_warning(old_dir->i_sb, "ext4_rename", + ext4_warning(old_dir->i_sb, "Deleting old file (%lu), %d, error=%d", old_dir->i_ino, old_dir->i_nlink, retval); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3b2c5541d8a..5692c48754a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb, ext4_get_group_no_and_offset(sb, start, NULL, &offset); if (group != sbi->s_groups_count) - ext4_warning(sb, __func__, - "Cannot add at group %u (only %u groups)", + ext4_warning(sb, "Cannot add at group %u (only %u groups)", input->group, sbi->s_groups_count); else if (offset != 0) - ext4_warning(sb, __func__, "Last group not full"); + ext4_warning(sb, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) - ext4_warning(sb, __func__, "Reserved blocks too high (%u)", + ext4_warning(sb, "Reserved blocks too high (%u)", input->reserved_blocks); else if (free_blocks_count < 0) - ext4_warning(sb, __func__, "Bad blocks count %u", + ext4_warning(sb, "Bad blocks count %u", input->blocks_count); else if (!(bh = sb_bread(sb, end - 1))) - ext4_warning(sb, __func__, - "Cannot read last block (%llu)", + ext4_warning(sb, "Cannot read last block (%llu)", end - 1); else if (outside(input->block_bitmap, start, end)) - ext4_warning(sb, __func__, - "Block bitmap not in group (block %llu)", + ext4_warning(sb, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) - ext4_warning(sb, __func__, - "Inode bitmap not in group (block %llu)", + ext4_warning(sb, "Inode bitmap not in group (block %llu)", (unsigned long long)input->inode_bitmap); else if (outside(input->inode_table, start, end) || outside(itend - 1, start, end)) - ext4_warning(sb, __func__, - "Inode table not in group (blocks %llu-%llu)", + ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", (unsigned long long)input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) - ext4_warning(sb, __func__, - "Block bitmap same as inode bitmap (%llu)", + ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", (unsigned long long)input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) - ext4_warning(sb, __func__, - "Block bitmap (%llu) in inode table (%llu-%llu)", + ext4_warning(sb, "Block bitmap (%llu) in inode table " + "(%llu-%llu)", (unsigned long long)input->block_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext4_warning(sb, __func__, - "Inode bitmap (%llu) in inode table (%llu-%llu)", + ext4_warning(sb, "Inode bitmap (%llu) in inode table " + "(%llu-%llu)", (unsigned long long)input->inode_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->block_bitmap, start, metaend)) - ext4_warning(sb, __func__, - "Block bitmap (%llu) in GDT table" - " (%llu-%llu)", + ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) - ext4_warning(sb, __func__, - "Inode bitmap (%llu) in GDT table" - " (%llu-%llu)", + ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || inside(itend - 1, start, metaend)) - ext4_warning(sb, __func__, - "Inode table (%llu-%llu) overlaps" - "GDT table (%llu-%llu)", + ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " + "(%llu-%llu)", (unsigned long long)input->inode_table, itend - 1, start, metaend - 1); else @@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ - ext4_warning(sb, __func__, - "reserved GDT %llu" + ext4_warning(sb, "reserved GDT %llu" " missing grp %d (%llu)", blk, grp, grp * @@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, */ if (EXT4_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { - ext4_warning(sb, __func__, - "won't resize using backup superblock at %llu", + ext4_warning(sb, "won't resize using backup superblock at %llu", (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); return -EPERM; } @@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext4_warning(sb, __func__, - "new group %u GDT block %llu not reserved", + ext4_warning(sb, "new group %u GDT block %llu not reserved", input->group, gdblock); err = -EINVAL; goto exit_dind; @@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext4_warning(sb, __func__, + ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); goto exit_inode; } @@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { - ext4_warning(sb, __func__, - "reserved block %llu" + ext4_warning(sb, "reserved block %llu" " not at offset %ld", blk, (long)(data - (__le32 *)dind->b_data)); @@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb, */ exit_err: if (err) { - ext4_warning(sb, __func__, - "can't update backup for group %u (err %d), " + ext4_warning(sb, "can't update backup for group %u (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); @@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext4_warning(sb, __func__, - "Can't resize non-sparse filesystem further"); + ext4_warning(sb, "Can't resize non-sparse filesystem further"); return -EPERM; } if (ext4_blocks_count(es) + input->blocks_count < ext4_blocks_count(es)) { - ext4_warning(sb, __func__, "blocks_count overflow"); + ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { - ext4_warning(sb, __func__, "inodes_count overflow"); + ext4_warning(sb, "inodes_count overflow"); return -EINVAL; } @@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || !le16_to_cpu(es->s_reserved_gdt_blocks)) { - ext4_warning(sb, __func__, + ext4_warning(sb, "No reserved GDT blocks, can't resize"); return -EPERM; } inode = ext4_iget(sb, EXT4_RESIZE_INO); if (IS_ERR(inode)) { - ext4_warning(sb, __func__, - "Error opening resize inode"); + ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(inode); } } @@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { - ext4_warning(sb, __func__, - "multiple resizers run on filesystem!"); + ext4_warning(sb, "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_journal; } @@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, " too large to resize to %llu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); + ext4_warning(sb, "CONFIG_LBDAF not enabled"); return -EINVAL; } if (n_blocks_count < o_blocks_count) { - ext4_warning(sb, __func__, - "can't shrink FS - resize aborted"); + ext4_warning(sb, "can't shrink FS - resize aborted"); return -EBUSY; } @@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { - ext4_warning(sb, __func__, - "need to use ext2online to resize further"); + ext4_warning(sb, "need to use ext2online to resize further"); return -EPERM; } add = EXT4_BLOCKS_PER_GROUP(sb) - last; if (o_blocks_count + add < o_blocks_count) { - ext4_warning(sb, __func__, "blocks_count overflow"); + ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } @@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, add = n_blocks_count - o_blocks_count; if (o_blocks_count + add < n_blocks_count) - ext4_warning(sb, __func__, - "will only finish group (%llu" - " blocks, %u new)", + ext4_warning(sb, "will only finish group (%llu blocks, %u new)", o_blocks_count + add, add); /* See if the device is actually as big as what was requested */ bh = sb_bread(sb, o_blocks_count + add - 1); if (!bh) { - ext4_warning(sb, __func__, - "can't read last block, resize aborted"); + ext4_warning(sb, "can't read last block, resize aborted"); return -ENOSPC; } brelse(bh); @@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, handle = ext4_journal_start_sb(sb, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); - ext4_warning(sb, __func__, "error %d on journal start", err); + ext4_warning(sb, "error %d on journal start", err); goto exit_put; } mutex_lock(&EXT4_SB(sb)->s_resize_lock); if (o_blocks_count != ext4_blocks_count(es)) { - ext4_warning(sb, __func__, - "multiple resizers run on filesystem!"); + ext4_warning(sb, "multiple resizers run on filesystem!"); mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); err = -EBUSY; @@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) { - ext4_warning(sb, __func__, - "error %d on journal write access", err); + ext4_warning(sb, "error %d on journal write access", err); mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 735c20d5fd5..2b83b96cb2e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -333,7 +333,7 @@ static void ext4_handle_error(struct super_block *sb) sb->s_id); } -void ext4_error(struct super_block *sb, const char *function, +void __ext4_error(struct super_block *sb, const char *function, const char *fmt, ...) { va_list args; @@ -347,6 +347,42 @@ void ext4_error(struct super_block *sb, const char *function, ext4_handle_error(sb); } +void ext4_error_inode(const char *function, struct inode *inode, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ", + inode->i_sb->s_id, function, inode->i_ino, current->comm); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + ext4_handle_error(inode->i_sb); +} + +void ext4_error_file(const char *function, struct file *file, + const char *fmt, ...) +{ + va_list args; + struct inode *inode = file->f_dentry->d_inode; + char pathname[80], *path; + + va_start(args, fmt); + path = d_path(&(file->f_path), pathname, sizeof(pathname)); + if (!path) + path = "(unknown)"; + printk(KERN_CRIT + "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ", + inode->i_sb->s_id, function, inode->i_ino, current->comm, path); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + ext4_handle_error(inode->i_sb); +} + static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { @@ -450,7 +486,7 @@ void ext4_msg (struct super_block * sb, const char *prefix, va_end(args); } -void ext4_warning(struct super_block *sb, const char *function, +void __ext4_warning(struct super_block *sb, const char *function, const char *fmt, ...) { va_list args; @@ -507,7 +543,7 @@ void ext4_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; - ext4_warning(sb, __func__, + ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); @@ -708,7 +744,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif - INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); + INIT_LIST_HEAD(&ei->i_completed_io_list); + spin_lock_init(&ei->i_completed_io_lock); ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; @@ -761,6 +798,7 @@ static void destroy_inodecache(void) static void ext4_clear_inode(struct inode *inode) { + dquot_drop(inode); ext4_discard_preallocations(inode); if (EXT4_JOURNAL(inode)) jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, @@ -796,10 +834,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq, if (sbi->s_qf_names[GRPQUOTA]) seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) + if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); - if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) + if (test_opt(sb, GRPQUOTA)) seq_puts(seq, ",grpquota"); #endif } @@ -926,6 +964,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOLOAD)) seq_puts(seq, ",norecovery"); + if (test_opt(sb, DIOREAD_NOLOCK)) + seq_puts(seq, ",dioread_nolock"); + ext4_show_quota_options(seq, sb); return 0; @@ -1012,19 +1053,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext4_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .reserve_space = dquot_reserve_space, - .claim_space = dquot_claim_space, - .release_rsv = dquot_release_reserved_space, #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, @@ -1109,6 +1140,7 @@ enum { Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, }; @@ -1176,6 +1208,8 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc=%u"}, {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL}, @@ -1205,6 +1239,66 @@ static ext4_fsblk_t get_sb_block(void **data) } #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" + "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; + +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext4_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext4_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) +{ + + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif static int parse_options(char *options, struct super_block *sb, unsigned long *journal_devnum, @@ -1217,8 +1311,7 @@ static int parse_options(char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype, qfmt; - char *qname; + int qfmt; #endif if (!options) @@ -1229,19 +1322,31 @@ static int parse_options(char *options, struct super_block *sb, if (!*p) continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = 0; token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); clear_opt(sbi->s_mount_opt, MINIX_DF); break; case Opt_minix_df: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); set_opt(sbi->s_mount_opt, MINIX_DF); + break; case Opt_grpid: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); set_opt(sbi->s_mount_opt, GRPID); + break; case Opt_nogrpid: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); clear_opt(sbi->s_mount_opt, GRPID); + break; case Opt_resuid: if (match_int(&args[0], &option)) @@ -1378,14 +1483,13 @@ static int parse_options(char *options, struct super_block *sb, data_opt = EXT4_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) - != data_opt) { + if (test_opt(sb, DATA_FLAGS) != data_opt) { ext4_msg(sb, KERN_ERR, "Cannot change data mode on remount"); return 0; } } else { - sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; @@ -1397,63 +1501,22 @@ static int parse_options(char *options, struct super_block *sb, break; #ifdef CONFIG_QUOTA case Opt_usrjquota: - qtype = USRQUOTA; - goto set_qf_name; - case Opt_grpjquota: - qtype = GRPQUOTA; -set_qf_name: - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { - ext4_msg(sb, KERN_ERR, - "Cannot change journaled " - "quota options when quota turned on"); + if (!set_qf_name(sb, USRQUOTA, &args[0])) return 0; - } - qname = match_strdup(&args[0]); - if (!qname) { - ext4_msg(sb, KERN_ERR, - "Not enough memory for " - "storing quotafile name"); - return 0; - } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext4_msg(sb, KERN_ERR, - "%s quota file already " - "specified", QTYPE2NAME(qtype)); - kfree(qname); - return 0; - } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { - ext4_msg(sb, KERN_ERR, - "quotafile must be on " - "filesystem root"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) return 0; - } - set_opt(sbi->s_mount_opt, QUOTA); break; case Opt_offusrjquota: - qtype = USRQUOTA; - goto clear_qf_name; + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; case Opt_offgrpjquota: - qtype = GRPQUOTA; -clear_qf_name: - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { - ext4_msg(sb, KERN_ERR, "Cannot change " - "journaled quota options when " - "quota turned on"); + if (!clear_qf_name(sb, GRPQUOTA)) return 0; - } - /* - * The space will be released later when all options - * are confirmed to be correct - */ - sbi->s_qf_names[qtype] = NULL; break; + case Opt_jqfmt_vfsold: qfmt = QFMT_VFS_OLD; goto set_qf_format; @@ -1518,10 +1581,11 @@ set_qf_format: clear_opt(sbi->s_mount_opt, BARRIER); break; case Opt_barrier: - if (match_int(&args[0], &option)) { - set_opt(sbi->s_mount_opt, BARRIER); - break; - } + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -1594,10 +1658,11 @@ set_qf_format: set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; case Opt_auto_da_alloc: - if (match_int(&args[0], &option)) { - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); - break; - } + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ if (option) clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); else @@ -1609,6 +1674,12 @@ set_qf_format: case Opt_nodiscard: clear_opt(sbi->s_mount_opt, DISCARD); break; + case Opt_dioread_nolock: + set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + break; + case Opt_dioread_lock: + clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + break; default: ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " @@ -1618,18 +1689,13 @@ set_qf_format: } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) && - sbi->s_qf_names[USRQUOTA]) + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sbi->s_mount_opt, USRQUOTA); - if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) && - sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) clear_opt(sbi->s_mount_opt, GRPQUOTA); - if ((sbi->s_qf_names[USRQUOTA] && - (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && - (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) { + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext4_msg(sb, KERN_ERR, "old and new quota " "format mixing"); return 0; @@ -1939,7 +2005,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - vfs_dq_init(inode); + dquot_initialize(inode); if (inode->i_nlink) { ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", @@ -2432,8 +2498,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) def_mount_opts = le32_to_cpu(es->s_default_mount_opts); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sbi->s_mount_opt, DEBUG); - if (def_mount_opts & EXT4_DEFM_BSDGROUPS) + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { + ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", + "2.6.38"); set_opt(sbi->s_mount_opt, GRPID); + } if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sbi->s_mount_opt, NO_UID32); #ifdef CONFIG_EXT4_FS_XATTR @@ -2445,11 +2514,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, POSIX_ACL); #endif if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) - sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; + set_opt(sbi->s_mount_opt, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) - sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA; + set_opt(sbi->s_mount_opt, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) - sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA; + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sbi->s_mount_opt, ERRORS_PANIC); @@ -2477,7 +2546,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || @@ -2766,7 +2835,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); - goto failed_mount4; + goto failed_mount_wq; } else { clear_opt(sbi->s_mount_opt, DATA_FLAGS); set_opt(sbi->s_mount_opt, WRITEBACK_DATA); @@ -2779,7 +2848,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); - goto failed_mount4; + goto failed_mount_wq; } if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { @@ -2818,7 +2887,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); - goto failed_mount4; + goto failed_mount_wq; } default: break; @@ -2826,13 +2895,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); no_journal: - if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " "its supported only with writeback mode"); clear_opt(sbi->s_mount_opt, NOBH); } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_WARNING, "dioread_nolock option is " + "not supported with nobh mode"); + goto failed_mount_wq; + } } EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { @@ -2897,6 +2970,18 @@ no_journal: "requested data journaling mode"); clear_opt(sbi->s_mount_opt, DELALLOC); } + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " + "option - requested data journaling mode"); + clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + } + if (sb->s_blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " + "option - block size is too small"); + clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + } + } err = ext4_setup_system_zone(sb); if (err) { @@ -3360,10 +3445,9 @@ static void ext4_clear_journal_err(struct super_block *sb, char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); - ext4_warning(sb, __func__, "Filesystem error recorded " + ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); - ext4_warning(sb, __func__, "Marking fs in need of " - "filesystem check."); + ext4_warning(sb, "Marking fs in need of filesystem check."); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); @@ -3514,7 +3598,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); es = sbi->s_es; @@ -3708,7 +3792,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) * Process 1 Process 2 * ext4_create() quota_sync() * jbd2_journal_start() write_dquot() - * vfs_dq_init() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) jbd2_journal_start() * */ @@ -3917,9 +4001,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int tocopy; int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -3929,52 +4011,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; - bh = ext4_bread(handle, inode, blk, 1, &err); - if (!bh) + bh = ext4_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); goto out; - if (journal_quota) { - err = ext4_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext4_handle_dirty_metadata(handle, NULL, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext4_jbd2_file_inode(handle, inode); - mark_buffer_dirty(bh); - } - brelse(bh); - if (err) - goto out; - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext4_handle_dirty_metadata(handle, NULL, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext4_jbd2_file_inode(handle, inode); + mark_buffer_dirty(bh); + } + brelse(bh); out: - if (len == towrite) { + if (err) { mutex_unlock(&inode->i_mutex); return err; } - if (inode->i_size < off+len-towrite) { - i_size_write(inode, off+len-towrite); + if (inode->i_size < off + len) { + i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; } inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext4_mark_inode_dirty(handle, inode); mutex_unlock(&inode->i_mutex); - return len - towrite; + return len; } #endif diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f3a2f7ed45a..b4c5aa8489d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { -bad_block: ext4_error(inode->i_sb, __func__, +bad_block: + ext4_error(inode->i_sb, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -267,7 +268,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *end; int error; - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return -ENODATA; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -371,7 +372,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -396,7 +397,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) void *end; int error; - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -494,7 +495,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -665,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext4_xattr_check_block(bs->bh)) { - ext4_error(sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -787,8 +787,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext4_journal_get_write_access(handle, new_bh); @@ -876,13 +876,12 @@ cleanup: return error; cleanup_dquot: - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: - ext4_error(inode->i_sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; #undef header @@ -908,7 +907,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = ext4_xattr_check_names(IFIRST(header), is->s.end); if (error) return error; @@ -940,10 +939,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); - EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); - EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } return 0; } @@ -986,8 +985,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (strlen(name) > 255) return -ERANGE; down_write(&EXT4_I(inode)->xattr_sem); - no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; - EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_get_inode_loc(inode, &is.iloc); if (error) @@ -997,10 +996,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; - if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); } error = ext4_xattr_ibody_find(inode, &i, &is); @@ -1052,7 +1051,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = ext4_current_time(inode); if (!value) - EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with @@ -1067,7 +1066,7 @@ cleanup: brelse(is.iloc.bh); brelse(bs.bh); if (no_expand == 0) - EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); return error; } @@ -1195,9 +1194,8 @@ retry: if (!bh) goto cleanup; if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -1302,6 +1300,8 @@ retry: /* Remove the chosen entry from the inode */ error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto cleanup; entry = IFIRST(header); if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) @@ -1372,16 +1372,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) goto cleanup; bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { - ext4_error(inode->i_sb, __func__, - "inode %lu: block %llu read error", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: block %llu read error", + inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext4_error(inode->i_sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; } ext4_xattr_release_block(handle, inode, bh); @@ -1506,7 +1504,7 @@ again: } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "inode %lu: block %lu read error", inode->i_ino, (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 14da530b05c..fbeecdc194d 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, return i_pos; } -static int fat_write_inode(struct inode *inode, int wait) +static int __fat_write_inode(struct inode *inode, int wait) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -634,9 +634,14 @@ retry: return err; } +static int fat_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int fat_sync_inode(struct inode *inode) { - return fat_write_inode(inode, 1); + return __fat_write_inode(inode, 1); } EXPORT_SYMBOL_GPL(fat_sync_inode); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1a7c42c64ff..76fc4d594ac 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -381,10 +381,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } -static int write_inode(struct inode *inode, int sync) +static int write_inode(struct inode *inode, struct writeback_control *wbc) { if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, sync); + return inode->i_sb->s_op->write_inode(inode, wbc); return 0; } @@ -421,7 +421,6 @@ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; - int wait = wbc->sync_mode == WB_SYNC_ALL; unsigned dirty; int ret; @@ -439,7 +438,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * We'll have another go at writing back this inode when we * completed a full scan of b_io. */ - if (!wait) { + if (wbc->sync_mode != WB_SYNC_ALL) { requeue_io(inode); return 0; } @@ -461,15 +460,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = do_writepages(mapping, wbc); - /* Don't write the inode if only I_DIRTY_PAGES was set */ - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - int err = write_inode(inode, wait); + /* + * Make sure to wait on the data before writing out the metadata. + * This is important for filesystems that modify metadata on data + * I/O completion. + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; } - if (wait) { - int err = filemap_fdatawait(mapping); + /* Don't write the inode if only I_DIRTY_PAGES was set */ + if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + int err = write_inode(inode, wbc); if (ret == 0) ret = err; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e3bf6eab875..6dbcbad6ab1 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1083,7 +1083,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, } } -int gfs2_quota_sync(struct super_block *sb, int type) +int gfs2_quota_sync(struct super_block *sb, int type, int wait) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; @@ -1127,6 +1127,11 @@ int gfs2_quota_sync(struct super_block *sb, int type) return error; } +static int gfs2_quota_sync_timeo(struct super_block *sb, int type) +{ + return gfs2_quota_sync(sb, type, 0); +} + int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) { struct gfs2_quota_data *qd; @@ -1382,7 +1387,7 @@ int gfs2_quotad(void *data) &tune->gt_statfs_quantum); /* Update quota file */ - quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, + quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t, "ad_timeo, &tune->gt_quota_quantum); /* Check for & recover partially truncated inodes */ diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index e271fa07ad0..195f60c8bd1 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -25,7 +25,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 uid, u32 gid); -extern int gfs2_quota_sync(struct super_block *sb, int type); +extern int gfs2_quota_sync(struct super_block *sb, int type, int wait); extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); extern int gfs2_quota_init(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e5e22629da6..50aac606b99 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -22,6 +22,7 @@ #include <linux/crc32.h> #include <linux/time.h> #include <linux/wait.h> +#include <linux/writeback.h> #include "gfs2.h" #include "incore.h" @@ -711,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) * Returns: errno */ -static int gfs2_write_inode(struct inode *inode, int sync) +static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -745,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync) do_unlock: gfs2_glock_dq_uninit(&gh); do_flush: - if (sync != 0) + if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); return ret; } @@ -763,7 +764,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) int error; flush_workqueue(gfs2_delete_workqueue); - gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_quota_sync(sdp->sd_vfs, 0, 1); gfs2_statfs_sync(sdp->sd_vfs, 0); error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index a0db1c94317..b5f1a46133c 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -167,7 +167,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_quota_sync(sdp->sd_vfs, 0, 1); return len; } diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 052387e1167..fe35e3b626c 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops; extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); -extern int hfs_write_inode(struct inode *, int); +extern int hfs_write_inode(struct inode *, struct writeback_control *); extern int hfs_inode_setattr(struct dentry *, struct iattr *); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 log_size, __be32 phys_size, u32 clump_size); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a1cbff2b4d9..14f5cb1b9fd 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, HFS_SB(inode->i_sb)->alloc_blksz); } -int hfs_write_inode(struct inode *inode, int unused) +int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct inode *main_inode = inode; struct hfs_find_data fd; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 43022f3d514..74b473a8ef9 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -87,7 +87,8 @@ bad_inode: return ERR_PTR(err); } -static int hfsplus_write_inode(struct inode *inode, int unused) +static int hfsplus_write_inode(struct inode *inode, + struct writeback_control *wbc) { struct hfsplus_vh *vhdr; int ret = 0; diff --git a/fs/inode.c b/fs/inode.c index 03dfeb2e392..407bf392e20 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/dcache.h> #include <linux/init.h> -#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/writeback.h> #include <linux/module.h> @@ -314,7 +313,6 @@ void clear_inode(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); inode_sync_wait(inode); - vfs_dq_drop(inode); if (inode->i_sb->s_op->clear_inode) inode->i_sb->s_op->clear_inode(inode); if (S_ISBLK(inode->i_mode) && inode->i_bdev) @@ -1211,8 +1209,6 @@ void generic_delete_inode(struct inode *inode) if (op->delete_inode) { void (*delete)(struct inode *) = op->delete_inode; - if (!is_bad_inode(inode)) - vfs_dq_init(inode); /* Filesystems implementing their own * s_op->delete_inode are required to call * truncate_inode_pages and clear_inode() diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 4bd882548c4..2c90e3ef625 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -862,12 +862,12 @@ restart_loop: /* A buffer which has been freed while still being * journaled by a previous transaction may end up still * being dirty here, but we want to avoid writing back - * that buffer in the future now that the last use has - * been committed. That's not only a performance gain, - * it also stops aliasing problems if the buffer is left - * behind for writeback and gets reallocated for another + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another * use in a different page. */ - if (buffer_freed(bh)) { + if (buffer_freed(bh) && !jh->b_next_transaction) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 006f9ad838a..99e9fea1107 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!jh) goto zap_buffer_no_jh; + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it @@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer; } /* - * If it is committing, we simply cannot touch it. We - * can remove it's next_transaction pointer from the - * running transaction if that is set, but nothing - * else. */ + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. + */ set_buffer_freed(bh); - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == - journal->j_running_transaction); - jh->b_next_transaction = NULL; - } + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh, */ void __journal_refile_buffer(struct journal_head *jh) { - int was_dirty; + int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); @@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh) __journal_temp_unlink_buffer(jh); jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; - __journal_file_buffer(jh, jh->b_transaction, - jh->b_modified ? BJ_Metadata : BJ_Reserved); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 88684937095..30beb11ef92 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -507,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) if (blocknr < journal->j_tail) freed = freed + journal->j_last - journal->j_first; + trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); jbd_debug(1, "Cleaning journal tail from %d to %d (offset %lu), " "freeing %lu\n", diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 1bc74b6f26d..671da7fb7ff 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -883,8 +883,7 @@ restart_loop: spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); jbd_lock_bh_state(bh); - J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || - jh->b_transaction == journal->j_running_transaction); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); /* * If there is undo-protected committed data against @@ -930,12 +929,12 @@ restart_loop: /* A buffer which has been freed while still being * journaled by a previous transaction may end up still * being dirty here, but we want to avoid writing back - * that buffer in the future now that the last use has - * been committed. That's not only a performance gain, - * it also stops aliasing problems if the buffer is left - * behind for writeback and gets reallocated for another + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another * use in a different page. */ - if (buffer_freed(bh)) { + if (buffer_freed(bh) && !jh->b_next_transaction) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ac0d027595d..c03d4dce4d7 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -39,6 +39,8 @@ #include <linux/seq_file.h> #include <linux/math64.h> #include <linux/hash.h> +#include <linux/log2.h> +#include <linux/vmalloc.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); +static int jbd2_journal_create_slab(size_t slab_size); /* * Helper function used to manage commit timeouts @@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal) } } + /* + * Create a slab for this blocksize + */ + err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize)); + if (err) + return err; + /* Let the recovery code check whether it needs to recover any * data from the journal. */ if (jbd2_journal_recover(journal)) @@ -1807,6 +1817,127 @@ size_t journal_tag_bytes(journal_t *journal) } /* + * JBD memory management + * + * These functions are used to allocate block-sized chunks of memory + * used for making copies of buffer_head data. Very often it will be + * page-sized chunks of data, but sometimes it will be in + * sub-page-size chunks. (For example, 16k pages on Power systems + * with a 4k block file system.) For blocks smaller than a page, we + * use a SLAB allocator. There are slab caches for each block size, + * which are allocated at mount time, if necessary, and we only free + * (all of) the slab caches when/if the jbd2 module is unloaded. For + * this reason we don't need to a mutex to protect access to + * jbd2_slab[] allocating or releasing memory; only in + * jbd2_journal_create_slab(). + */ +#define JBD2_MAX_SLABS 8 +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; +static DECLARE_MUTEX(jbd2_slab_create_sem); + +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" +}; + + +static void jbd2_journal_destroy_slabs(void) +{ + int i; + + for (i = 0; i < JBD2_MAX_SLABS; i++) { + if (jbd2_slab[i]) + kmem_cache_destroy(jbd2_slab[i]); + jbd2_slab[i] = NULL; + } +} + +static int jbd2_journal_create_slab(size_t size) +{ + int i = order_base_2(size) - 10; + size_t slab_size; + + if (size == PAGE_SIZE) + return 0; + + if (i >= JBD2_MAX_SLABS) + return -EINVAL; + + if (unlikely(i < 0)) + i = 0; + down(&jbd2_slab_create_sem); + if (jbd2_slab[i]) { + up(&jbd2_slab_create_sem); + return 0; /* Already created */ + } + + slab_size = 1 << (i+10); + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, + slab_size, 0, NULL); + up(&jbd2_slab_create_sem); + if (!jbd2_slab[i]) { + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +static struct kmem_cache *get_slab(size_t size) +{ + int i = order_base_2(size) - 10; + + BUG_ON(i >= JBD2_MAX_SLABS); + if (unlikely(i < 0)) + i = 0; + BUG_ON(jbd2_slab[i] == 0); + return jbd2_slab[i]; +} + +void *jbd2_alloc(size_t size, gfp_t flags) +{ + void *ptr; + + BUG_ON(size & (size-1)); /* Must be a power of 2 */ + + flags |= __GFP_REPEAT; + if (size == PAGE_SIZE) + ptr = (void *)__get_free_pages(flags, 0); + else if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + ptr = (void *)__get_free_pages(flags, order); + else + ptr = vmalloc(size); + } else + ptr = kmem_cache_alloc(get_slab(size), flags); + + /* Check alignment; SLUB has gotten this wrong in the past, + * and this can lead to user data corruption! */ + BUG_ON(((unsigned long) ptr) & (size-1)); + + return ptr; +} + +void jbd2_free(void *ptr, size_t size) +{ + if (size == PAGE_SIZE) { + free_pages((unsigned long)ptr, 0); + return; + } + if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + free_pages((unsigned long)ptr, order); + else + vfree(ptr); + return; + } + kmem_cache_free(get_slab(size), ptr); +}; + +/* * Journal_head storage management */ static struct kmem_cache *jbd2_journal_head_cache; @@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void) jbd2_journal_destroy_revoke_caches(); jbd2_journal_destroy_jbd2_journal_head_cache(); jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_slabs(); } static int __init journal_init(void) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a0512700542..bfc70f57900 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!jh) goto zap_buffer_no_jh; + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it @@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); /* - * If it is committing, we simply cannot touch it. We - * can remove it's next_transaction pointer from the - * running transaction if that is set, but nothing - * else. */ + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. + */ set_buffer_freed(bh); - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == - journal->j_running_transaction); - jh->b_next_transaction = NULL; - } + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh, */ void __jbd2_journal_refile_buffer(struct journal_head *jh) { - int was_dirty; + int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); @@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; - __jbd2_journal_file_buffer(jh, jh->b_transaction, - jh->b_modified ? BJ_Metadata : BJ_Reserved); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index d66477c3430..213169780b6 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -20,7 +20,6 @@ #include <linux/sched.h> #include <linux/fs.h> -#include <linux/quotaops.h> #include <linux/posix_acl_xattr.h> #include "jfs_incore.h" #include "jfs_txnmgr.h" @@ -174,7 +173,7 @@ cleanup: return rc; } -static int jfs_acl_chmod(struct inode *inode) +int jfs_acl_chmod(struct inode *inode) { struct posix_acl *acl, *clone; int rc; @@ -205,26 +204,3 @@ static int jfs_acl_chmod(struct inode *inode) posix_acl_release(clone); return rc; } - -int jfs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - int rc; - - rc = inode_change_ok(inode, iattr); - if (rc) - return rc; - - if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || - (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - if (vfs_dq_transfer(inode, iattr)) - return -EDQUOT; - } - - rc = inode_setattr(inode, iattr); - - if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = jfs_acl_chmod(inode); - - return rc; -} diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 2b70fa78e4a..14ba982b3f2 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -18,6 +18,7 @@ */ #include <linux/fs.h> +#include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_dmap.h" @@ -47,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file) { int rc; - if ((rc = generic_file_open(inode, file))) + if ((rc = dquot_file_open(inode, file))) return rc; /* @@ -88,14 +89,40 @@ static int jfs_release(struct inode *inode, struct file *file) return 0; } +int jfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int rc; + + rc = inode_change_ok(inode, iattr); + if (rc) + return rc; + + if (iattr->ia_valid & ATTR_SIZE) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; + } + + rc = inode_setattr(inode, iattr); + + if (!rc && (iattr->ia_valid & ATTR_MODE)) + rc = jfs_acl_chmod(inode); + + return rc; +} + const struct inode_operations jfs_file_inode_operations = { .truncate = jfs_truncate, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, .removexattr = jfs_removexattr, -#ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index b2ae190a77b..9dd126276c9 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -22,6 +22,7 @@ #include <linux/buffer_head.h> #include <linux/pagemap.h> #include <linux/quotaops.h> +#include <linux/writeback.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" @@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait) return rc; } -int jfs_write_inode(struct inode *inode, int wait) +int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) { + int wait = wbc->sync_mode == WB_SYNC_ALL; + if (test_cflag(COMMIT_Nolink, inode)) return 0; /* @@ -146,6 +149,9 @@ void jfs_delete_inode(struct inode *inode) { jfs_info("In jfs_delete_inode, inode = 0x%p", inode); + if (!is_bad_inode(inode)) + dquot_initialize(inode); + if (!is_bad_inode(inode) && (JFS_IP(inode)->fileset == FILESYSTEM_I)) { truncate_inode_pages(&inode->i_data, 0); @@ -158,9 +164,9 @@ void jfs_delete_inode(struct inode *inode) /* * Free the inode from the quota allocation. */ - vfs_dq_init(inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_initialize(inode); + dquot_free_inode(inode); + dquot_drop(inode); } clear_inode(inode); diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index b07bd417ef8..54e07559878 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -22,7 +22,7 @@ int jfs_check_acl(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); -int jfs_setattr(struct dentry *, struct iattr *); +int jfs_acl_chmod(struct inode *inode); #else @@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode, return 0; } +static inline int jfs_acl_chmod(struct inode *inode) +{ + return 0; +} + #endif #endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 925871e9887..0e4623be70c 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) * It's time to move the inline table to an external * page and begin to build the xtree */ - if (vfs_dq_alloc_block(ip, sbi->nbperpage)) + if (dquot_alloc_block(ip, sbi->nbperpage)) goto clean_up; if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { - vfs_dq_free_block(ip, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); goto clean_up; } @@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); dbFree(ip, xaddr, sbi->nbperpage); - vfs_dq_free_block(ip, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; @@ -1027,10 +1027,9 @@ static int dtSplitUp(tid_t tid, n = xlen; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, n)) { - rc = -EDQUOT; + rc = dquot_alloc_block(ip, n); + if (rc) goto extendOut; - } quota_allocation += n; if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, @@ -1308,7 +1307,7 @@ static int dtSplitUp(tid_t tid, /* Rollback quota allocation */ if (rc && quota_allocation) - vfs_dq_free_block(ip, quota_allocation); + dquot_free_block(ip, quota_allocation); dtSplitUp_Exit: @@ -1369,9 +1368,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, return -EIO; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); @@ -1892,6 +1892,7 @@ static int dtSplitRoot(tid_t tid, struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; + int rc; /* get split root page */ smp = split->mp; @@ -1916,9 +1917,10 @@ static int dtSplitRoot(tid_t tid, rp = rmp->data; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } BT_MARK_DIRTY(rmp, ip); @@ -2287,7 +2289,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, xlen = lengthPXD(&fp->header.self); /* Free quota allocation. */ - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(fmp); @@ -2363,7 +2365,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, xlen = lengthPXD(&p->header.self); /* Free quota allocation */ - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(mp); diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 41d6045dbeb..5d3bbd10f8d 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) } /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, nxlen)) { + rc = dquot_alloc_block(ip, nxlen); + if (rc) { dbFree(ip, nxaddr, (s64) nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); - return -EDQUOT; + return rc; } /* determine the value of the extent flag */ @@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) */ if (rc) { dbFree(ip, nxaddr, nxlen); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); return (rc); } @@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) goto exit; /* Allocat blocks to quota. */ - if (vfs_dq_alloc_block(ip, nxlen)) { + rc = dquot_alloc_block(ip, nxlen); + if (rc) { dbFree(ip, nxaddr, (s64) nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); - return -EDQUOT; + return rc; } delta = nxlen - xlen; @@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) /* extend the extent */ if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { dbFree(ip, xaddr + xlen, delta); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); goto exit; } } else { @@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) */ if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { dbFree(ip, nxaddr, nxlen); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); goto exit; } } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index dc0e02159ac..829921b6776 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode) /* * Allocate inode to quota. */ - if (vfs_dq_alloc_inode(inode)) { - rc = -EDQUOT; + dquot_initialize(inode); + rc = dquot_alloc_inode(inode); + if (rc) goto fail_drop; - } inode->i_mode = mode; /* inherit flags from parent */ @@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) return inode; fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; fail_unlock: inode->i_nlink = 0; diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 1eff7db34d6..79e2c79661d 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); extern int jfs_commit_inode(struct inode *, int); -extern int jfs_write_inode(struct inode*, int); +extern int jfs_write_inode(struct inode *, struct writeback_control *); extern void jfs_delete_inode(struct inode *); extern void jfs_dirty_inode(struct inode *); extern void jfs_truncate(struct inode *); @@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int jfs_setattr(struct dentry *, struct iattr *); extern const struct address_space_operations jfs_aops; extern const struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index d654a645864..6c50871e622 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */ hint = addressXAD(xad) + lengthXAD(xad) - 1; } else hint = 0; - if ((rc = vfs_dq_alloc_block(ip, xlen))) + if ((rc = dquot_alloc_block(ip, xlen))) goto out; if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); goto out; } } @@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */ /* undo data extent allocation */ if (*xaddrp == 0) { dbFree(ip, xaddr, (s64) xlen); - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); } return rc; } @@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip, rbn = addressPXD(pxd); /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { - rc = -EDQUOT; + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) goto clean_up; - } quota_allocation += lengthPXD(pxd); @@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip, /* Rollback quota allocation. */ if (quota_allocation) - vfs_dq_free_block(ip, quota_allocation); + dquot_free_block(ip, quota_allocation); return (rc); } @@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid, struct pxdlist *pxdlist; struct tlock *tlck; struct xtlock *xtlck; + int rc; sp = &JFS_IP(ip)->i_xtroot; @@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid, return -EIO; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); @@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) ip->i_size = newsize; /* update quota allocation to reflect freed blocks */ - vfs_dq_free_block(ip, nfreed); + dquot_free_block(ip, nfreed); /* * free tlock of invalidated pages diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index c79a4270f08..4a3e9f39c21 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); + dquot_initialize(dip); + /* * search parent directory for entry/freespace * (dtSearch() returns parent directory page pinned) @@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); + dquot_initialize(dip); + /* link count overflow on parent directory ? */ if (dip->i_nlink == JFS_LINK_MAX) { rc = -EMLINK; @@ -356,7 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ - vfs_dq_init(ip); + dquot_initialize(dip); + dquot_initialize(ip); /* directory must be empty to be removed */ if (!dtEmpty(ip)) { @@ -483,7 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ - vfs_dq_init(ip); + dquot_initialize(dip); + dquot_initialize(ip); if ((rc = get_UCSname(&dname, dentry))) goto out; @@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == 0) return -ENOENT; + dquot_initialize(dir); + tid = txBegin(ip->i_sb, 0); mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); @@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); + dquot_initialize(dip); + ssize = strlen(name) + 1; /* @@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, new_dentry->d_name.name); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_ip = old_dentry->d_inode; new_ip = new_dentry->d_inode; @@ -1136,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (new_ip) { IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. */ - vfs_dq_init(new_ip); + dquot_initialize(new_ip); } /* @@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, jfs_info("jfs_mknod: %s", dentry->d_name.name); + dquot_initialize(dir); + if ((rc = get_UCSname(&dname, dentry))) goto out; @@ -1541,8 +1556,8 @@ const struct inode_operations jfs_dir_inode_operations = { .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, .removexattr = jfs_removexattr, -#ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index d929a822a74..266699deb1c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -131,6 +131,11 @@ static void jfs_destroy_inode(struct inode *inode) kmem_cache_free(jfs_inode_cachep, ji); } +static void jfs_clear_inode(struct inode *inode) +{ + dquot_drop(inode); +} + static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); @@ -745,6 +750,7 @@ static const struct super_operations jfs_super_operations = { .dirty_inode = jfs_dirty_inode, .write_inode = jfs_write_inode, .delete_inode = jfs_delete_inode, + .clear_inode = jfs_clear_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, .freeze_fs = jfs_freeze, diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index fad364548bc..1f594ab2189 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; /* Allocate new blocks to quota. */ - if (vfs_dq_alloc_block(ip, nblocks)) { - return -EDQUOT; - } + rc = dquot_alloc_block(ip, nblocks); + if (rc) + return rc; rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); if (rc) { /*Rollback quota allocation. */ - vfs_dq_free_block(ip, nblocks); + dquot_free_block(ip, nblocks); return rc; } @@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, failed: /* Rollback quota allocation. */ - vfs_dq_free_block(ip, nblocks); + dquot_free_block(ip, nblocks); dbFree(ip, blkno, nblocks); return rc; @@ -538,7 +538,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) if (blocks_needed > current_blocks) { /* Allocate new blocks to quota. */ - if (vfs_dq_alloc_block(inode, blocks_needed)) + rc = dquot_alloc_block(inode, blocks_needed); + if (rc) return -EDQUOT; quota_allocation = blocks_needed; @@ -602,7 +603,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) clean_up: /* Rollback quota allocation */ if (quota_allocation) - vfs_dq_free_block(inode, quota_allocation); + dquot_free_block(inode, quota_allocation); return (rc); } @@ -677,7 +678,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, /* If old blocks exist, they must be removed from quota allocation. */ if (old_blocks) - vfs_dq_free_block(inode, old_blocks); + dquot_free_block(inode, old_blocks); inode->i_ctime = CURRENT_TIME; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 74ea82d7216..756f8c93780 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -17,8 +17,10 @@ #include <linux/init.h> #include <linux/highuid.h> #include <linux/vfs.h> +#include <linux/writeback.h> -static int minix_write_inode(struct inode * inode, int wait); +static int minix_write_inode(struct inode *inode, + struct writeback_control *wbc); static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); static int minix_remount (struct super_block * sb, int * flags, char * data); @@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) return bh; } -static int minix_write_inode(struct inode *inode, int wait) +static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct buffer_head *bh; @@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait) bh = V2_minix_update_inode(inode); if (!bh) return -EIO; - if (wait && buffer_dirty(bh)) { + if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { printk("IO error syncing minix inode [%s:%08lx]\n", diff --git a/fs/namei.c b/fs/namei.c index 0741c69b331..3d9d2f965f8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -19,7 +19,6 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/namei.h> -#include <linux/quotaops.h> #include <linux/pagemap.h> #include <linux/fsnotify.h> #include <linux/personality.h> @@ -498,8 +497,6 @@ static int link_path_walk(const char *, struct nameidata *); static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) { - int res = 0; - char *name; if (IS_ERR(link)) goto fail; @@ -510,22 +507,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l path_get(&nd->root); } - res = link_path_walk(link, nd); - if (nd->depth || res || nd->last_type!=LAST_NORM) - return res; - /* - * If it is an iterative symlinks resolution in open_namei() we - * have to copy the last component. And all that crap because of - * bloody create() on broken symlinks. Furrfu... - */ - name = __getname(); - if (unlikely(!name)) { - path_put(&nd->path); - return -ENOMEM; - } - strcpy(name, nd->last.name); - nd->last.name = name; - return 0; + return link_path_walk(link, nd); fail: path_put(&nd->path); return PTR_ERR(link); @@ -547,10 +529,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd) nd->path.dentry = path->dentry; } -static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd) +static __always_inline int +__do_follow_link(struct path *path, struct nameidata *nd, void **p) { int error; - void *cookie; struct dentry *dentry = path->dentry; touch_atime(path->mnt, dentry); @@ -562,9 +544,9 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata } mntget(path->mnt); nd->last_type = LAST_BIND; - cookie = dentry->d_inode->i_op->follow_link(dentry, nd); - error = PTR_ERR(cookie); - if (!IS_ERR(cookie)) { + *p = dentry->d_inode->i_op->follow_link(dentry, nd); + error = PTR_ERR(*p); + if (!IS_ERR(*p)) { char *s = nd_get_link(nd); error = 0; if (s) @@ -574,8 +556,6 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata if (error) path_put(&nd->path); } - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, nd, cookie); } return error; } @@ -589,6 +569,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata */ static inline int do_follow_link(struct path *path, struct nameidata *nd) { + void *cookie; int err = -ELOOP; if (current->link_count >= MAX_NESTED_LINKS) goto loop; @@ -602,7 +583,9 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd) current->link_count++; current->total_link_count++; nd->depth++; - err = __do_follow_link(path, nd); + err = __do_follow_link(path, nd, &cookie); + if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link) + path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie); path_put(path); current->link_count--; nd->depth--; @@ -1375,22 +1358,6 @@ static inline int may_create(struct inode *dir, struct dentry *child) return inode_permission(dir, MAY_WRITE | MAY_EXEC); } -/* - * O_DIRECTORY translates into forcing a directory lookup. - */ -static inline int lookup_flags(unsigned int f) -{ - unsigned long retval = LOOKUP_FOLLOW; - - if (f & O_NOFOLLOW) - retval &= ~LOOKUP_FOLLOW; - - if (f & O_DIRECTORY) - retval |= LOOKUP_DIRECTORY; - - return retval; -} - /* * p1 and p2 should be directories on the same fs. */ @@ -1448,7 +1415,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, error = security_inode_create(dir, dentry, mode); if (error) return error; - vfs_dq_init(dir); error = dir->i_op->create(dir, dentry, mode, nd); if (!error) fsnotify_create(dir, dentry); @@ -1590,129 +1556,132 @@ static int open_will_truncate(int flag, struct inode *inode) return (flag & O_TRUNC); } -/* - * Note that the low bits of the passed in "open_flag" - * are not the same as in the local variable "flag". See - * open_to_namei_flags() for more details. - */ -struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode, int acc_mode) +static struct file *finish_open(struct nameidata *nd, + int open_flag, int acc_mode) { struct file *filp; - struct nameidata nd; - int error; - struct path path; - struct dentry *dir; - int count = 0; int will_truncate; - int flag = open_to_namei_flags(open_flag); - int force_reval = 0; + int error; + will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode); + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit; + } + error = may_open(&nd->path, acc_mode, open_flag); + if (error) { + if (will_truncate) + mnt_drop_write(nd->path.mnt); + goto exit; + } + filp = nameidata_to_filp(nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (will_truncate) { + error = handle_truncate(&nd->path); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. + * It is now safe to drop the mnt write + * because the filp has had a write taken + * on its behalf. */ - if (open_flag & __O_SYNC) - open_flag |= O_DSYNC; - - if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(open_flag); + if (will_truncate) + mnt_drop_write(nd->path.mnt); + return filp; - /* O_TRUNC implies we need access checks for write permissions */ - if (flag & O_TRUNC) - acc_mode |= MAY_WRITE; +exit: + if (!IS_ERR(nd->intent.open.file)) + release_open_intent(nd); + path_put(&nd->path); + return ERR_PTR(error); +} - /* Allow the LSM permission hook to distinguish append - access from general write access. */ - if (flag & O_APPEND) - acc_mode |= MAY_APPEND; +static struct file *do_last(struct nameidata *nd, struct path *path, + int open_flag, int acc_mode, + int mode, const char *pathname, + int *want_dir) +{ + struct dentry *dir = nd->path.dentry; + struct file *filp; + int error = -EISDIR; - /* - * The simplest case - just a plain lookup. - */ - if (!(flag & O_CREAT)) { - filp = get_empty_filp(); - - if (filp == NULL) - return ERR_PTR(-ENFILE); - nd.intent.open.file = filp; - filp->f_flags = open_flag; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = 0; - error = do_path_lookup(dfd, pathname, - lookup_flags(flag)|LOOKUP_OPEN, &nd); - if (IS_ERR(nd.intent.open.file)) { - if (error == 0) { - error = PTR_ERR(nd.intent.open.file); - path_put(&nd.path); + switch (nd->last_type) { + case LAST_DOTDOT: + follow_dotdot(nd); + dir = nd->path.dentry; + if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { + if (!dir->d_op->d_revalidate(dir, nd)) { + error = -ESTALE; + goto exit; } - } else if (error) - release_open_intent(&nd); - if (error) - return ERR_PTR(error); + } + /* fallthrough */ + case LAST_DOT: + case LAST_ROOT: + if (open_flag & O_CREAT) + goto exit; + /* fallthrough */ + case LAST_BIND: + audit_inode(pathname, dir); goto ok; } - /* - * Create - we need to know the parent. - */ -reval: - error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); - if (error) - return ERR_PTR(error); - if (force_reval) - nd.flags |= LOOKUP_REVAL; - error = path_walk(pathname, &nd); - if (error) { - if (nd.root.mnt) - path_put(&nd.root); - return ERR_PTR(error); + /* trailing slashes? */ + if (nd->last.name[nd->last.len]) { + if (open_flag & O_CREAT) + goto exit; + *want_dir = 1; } - if (unlikely(!audit_dummy_context())) - audit_inode(pathname, nd.path.dentry); - /* - * We have the parent and last component. First of all, check - * that we are not asked to creat(2) an obvious directory - that - * will not do. - */ - error = -EISDIR; - if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) - goto exit_parent; + /* just plain open? */ + if (!(open_flag & O_CREAT)) { + error = do_lookup(nd, &nd->last, path); + if (error) + goto exit; + error = -ENOENT; + if (!path->dentry->d_inode) + goto exit_dput; + if (path->dentry->d_inode->i_op->follow_link) + return NULL; + error = -ENOTDIR; + if (*want_dir & !path->dentry->d_inode->i_op->lookup) + goto exit_dput; + path_to_nameidata(path, nd); + audit_inode(pathname, nd->path.dentry); + goto ok; + } - error = -ENFILE; - filp = get_empty_filp(); - if (filp == NULL) - goto exit_parent; - nd.intent.open.file = filp; - filp->f_flags = open_flag; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = mode; - dir = nd.path.dentry; - nd.flags &= ~LOOKUP_PARENT; - nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN; - if (flag & O_EXCL) - nd.flags |= LOOKUP_EXCL; + /* OK, it's O_CREAT */ mutex_lock(&dir->d_inode->i_mutex); - path.dentry = lookup_hash(&nd); - path.mnt = nd.path.mnt; -do_last: - error = PTR_ERR(path.dentry); - if (IS_ERR(path.dentry)) { + path->dentry = lookup_hash(nd); + path->mnt = nd->path.mnt; + + error = PTR_ERR(path->dentry); + if (IS_ERR(path->dentry)) { mutex_unlock(&dir->d_inode->i_mutex); goto exit; } - if (IS_ERR(nd.intent.open.file)) { - error = PTR_ERR(nd.intent.open.file); + if (IS_ERR(nd->intent.open.file)) { + error = PTR_ERR(nd->intent.open.file); goto exit_mutex_unlock; } /* Negative dentry, just create the file */ - if (!path.dentry->d_inode) { + if (!path->dentry->d_inode) { /* * This write is needed to ensure that a * ro->rw transition does not occur between @@ -1720,18 +1689,16 @@ do_last: * a permanent write count is taken through * the 'struct file' in nameidata_to_filp(). */ - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(nd->path.mnt); if (error) goto exit_mutex_unlock; - error = __open_namei_create(&nd, &path, open_flag, mode); + error = __open_namei_create(nd, path, open_flag, mode); if (error) { - mnt_drop_write(nd.path.mnt); + mnt_drop_write(nd->path.mnt); goto exit; } - filp = nameidata_to_filp(&nd); - mnt_drop_write(nd.path.mnt); - if (nd.root.mnt) - path_put(&nd.root); + filp = nameidata_to_filp(nd); + mnt_drop_write(nd->path.mnt); if (!IS_ERR(filp)) { error = ima_file_check(filp, acc_mode); if (error) { @@ -1746,150 +1713,181 @@ do_last: * It already exists. */ mutex_unlock(&dir->d_inode->i_mutex); - audit_inode(pathname, path.dentry); + audit_inode(pathname, path->dentry); error = -EEXIST; - if (flag & O_EXCL) + if (open_flag & O_EXCL) goto exit_dput; - if (__follow_mount(&path)) { + if (__follow_mount(path)) { error = -ELOOP; - if (flag & O_NOFOLLOW) + if (open_flag & O_NOFOLLOW) goto exit_dput; } error = -ENOENT; - if (!path.dentry->d_inode) + if (!path->dentry->d_inode) goto exit_dput; - if (path.dentry->d_inode->i_op->follow_link) - goto do_link; - path_to_nameidata(&path, &nd); + if (path->dentry->d_inode->i_op->follow_link) + return NULL; + + path_to_nameidata(path, nd); error = -EISDIR; - if (S_ISDIR(path.dentry->d_inode->i_mode)) + if (S_ISDIR(path->dentry->d_inode->i_mode)) goto exit; ok: + filp = finish_open(nd, open_flag, acc_mode); + return filp; + +exit_mutex_unlock: + mutex_unlock(&dir->d_inode->i_mutex); +exit_dput: + path_put_conditional(path, nd); +exit: + if (!IS_ERR(nd->intent.open.file)) + release_open_intent(nd); + path_put(&nd->path); + return ERR_PTR(error); +} + +/* + * Note that the low bits of the passed in "open_flag" + * are not the same as in the local variable "flag". See + * open_to_namei_flags() for more details. + */ +struct file *do_filp_open(int dfd, const char *pathname, + int open_flag, int mode, int acc_mode) +{ + struct file *filp; + struct nameidata nd; + int error; + struct path path; + int count = 0; + int flag = open_to_namei_flags(open_flag); + int force_reval = 0; + int want_dir = open_flag & O_DIRECTORY; + + if (!(open_flag & O_CREAT)) + mode = 0; + /* - * Consider: - * 1. may_open() truncates a file - * 2. a rw->ro mount transition occurs - * 3. nameidata_to_filp() fails due to - * the ro mount. - * That would be inconsistent, and should - * be avoided. Taking this mnt write here - * ensures that (2) can not occur. + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. */ - will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode); - if (will_truncate) { - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit; - } - error = may_open(&nd.path, acc_mode, open_flag); + if (open_flag & __O_SYNC) + open_flag |= O_DSYNC; + + if (!acc_mode) + acc_mode = MAY_OPEN | ACC_MODE(open_flag); + + /* O_TRUNC implies we need access checks for write permissions */ + if (open_flag & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. */ + if (open_flag & O_APPEND) + acc_mode |= MAY_APPEND; + + /* find the parent */ +reval: + error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); + if (error) + return ERR_PTR(error); + if (force_reval) + nd.flags |= LOOKUP_REVAL; + + current->total_link_count = 0; + error = link_path_walk(pathname, &nd); if (error) { - if (will_truncate) - mnt_drop_write(nd.path.mnt); - goto exit; - } - filp = nameidata_to_filp(&nd); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } + filp = ERR_PTR(error); + goto out; } - if (!IS_ERR(filp)) { - if (acc_mode & MAY_WRITE) - vfs_dq_init(nd.path.dentry->d_inode); + if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) + audit_inode(pathname, nd.path.dentry); - if (will_truncate) { - error = handle_truncate(&nd.path); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - } /* - * It is now safe to drop the mnt write - * because the filp has had a write taken - * on its behalf. + * We have the parent and last component. */ - if (will_truncate) - mnt_drop_write(nd.path.mnt); + + error = -ENFILE; + filp = get_empty_filp(); + if (filp == NULL) + goto exit_parent; + nd.intent.open.file = filp; + filp->f_flags = open_flag; + nd.intent.open.flags = flag; + nd.intent.open.create_mode = mode; + nd.flags &= ~LOOKUP_PARENT; + nd.flags |= LOOKUP_OPEN; + if (open_flag & O_CREAT) { + nd.flags |= LOOKUP_CREATE; + if (open_flag & O_EXCL) + nd.flags |= LOOKUP_EXCL; + } + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); + while (unlikely(!filp)) { /* trailing symlink */ + struct path holder; + struct inode *inode = path.dentry->d_inode; + void *cookie; + error = -ELOOP; + /* S_ISDIR part is a temporary automount kludge */ + if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode)) + goto exit_dput; + if (count++ == 32) + goto exit_dput; + /* + * This is subtle. Instead of calling do_follow_link() we do + * the thing by hands. The reason is that this way we have zero + * link_count and path_walk() (called from ->follow_link) + * honoring LOOKUP_PARENT. After that we have the parent and + * last component, i.e. we are in the same situation as after + * the first path_walk(). Well, almost - if the last component + * is normal we get its copy stored in nd->last.name and we will + * have to putname() it when we are done. Procfs-like symlinks + * just set LAST_BIND. + */ + nd.flags |= LOOKUP_PARENT; + error = security_inode_follow_link(path.dentry, &nd); + if (error) + goto exit_dput; + error = __do_follow_link(&path, &nd, &cookie); + if (unlikely(error)) { + /* nd.path had been dropped */ + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(path.dentry, &nd, cookie); + path_put(&path); + release_open_intent(&nd); + filp = ERR_PTR(error); + goto out; + } + holder = path; + nd.flags &= ~LOOKUP_PARENT; + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); + if (inode->i_op->put_link) + inode->i_op->put_link(holder.dentry, &nd, cookie); + path_put(&holder); + } +out: if (nd.root.mnt) path_put(&nd.root); + if (filp == ERR_PTR(-ESTALE) && !force_reval) { + force_reval = 1; + goto reval; + } return filp; -exit_mutex_unlock: - mutex_unlock(&dir->d_inode->i_mutex); exit_dput: path_put_conditional(&path, &nd); -exit: if (!IS_ERR(nd.intent.open.file)) release_open_intent(&nd); exit_parent: - if (nd.root.mnt) - path_put(&nd.root); path_put(&nd.path); - return ERR_PTR(error); - -do_link: - error = -ELOOP; - if (flag & O_NOFOLLOW) - goto exit_dput; - /* - * This is subtle. Instead of calling do_follow_link() we do the - * thing by hands. The reason is that this way we have zero link_count - * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT. - * After that we have the parent and last component, i.e. - * we are in the same situation as after the first path_walk(). - * Well, almost - if the last component is normal we get its copy - * stored in nd->last.name and we will have to putname() it when we - * are done. Procfs-like symlinks just set LAST_BIND. - */ - nd.flags |= LOOKUP_PARENT; - error = security_inode_follow_link(path.dentry, &nd); - if (error) - goto exit_dput; - error = __do_follow_link(&path, &nd); - path_put(&path); - if (error) { - /* Does someone understand code flow here? Or it is only - * me so stupid? Anathema to whoever designed this non-sense - * with "intent.open". - */ - release_open_intent(&nd); - if (nd.root.mnt) - path_put(&nd.root); - if (error == -ESTALE && !force_reval) { - force_reval = 1; - goto reval; - } - return ERR_PTR(error); - } - nd.flags &= ~LOOKUP_PARENT; - if (nd.last_type == LAST_BIND) - goto ok; - error = -EISDIR; - if (nd.last_type != LAST_NORM) - goto exit; - if (nd.last.name[nd.last.len]) { - __putname(nd.last.name); - goto exit; - } - error = -ELOOP; - if (count++==32) { - __putname(nd.last.name); - goto exit; - } - dir = nd.path.dentry; - mutex_lock(&dir->d_inode->i_mutex); - path.dentry = lookup_hash(&nd); - path.mnt = nd.path.mnt; - __putname(nd.last.name); - goto do_last; + filp = ERR_PTR(error); + goto out; } /** @@ -1983,7 +1981,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->mknod(dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); @@ -2082,7 +2079,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->mkdir(dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); @@ -2168,8 +2164,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (!dir->i_op->rmdir) return -EPERM; - vfs_dq_init(dir); - mutex_lock(&dentry->d_inode->i_mutex); dentry_unhash(dentry); if (d_mountpoint(dentry)) @@ -2255,8 +2249,6 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!dir->i_op->unlink) return -EPERM; - vfs_dq_init(dir); - mutex_lock(&dentry->d_inode->i_mutex); if (d_mountpoint(dentry)) error = -EBUSY; @@ -2369,7 +2361,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->symlink(dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); @@ -2453,7 +2444,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return error; mutex_lock(&inode->i_mutex); - vfs_dq_init(dir); error = dir->i_op->link(old_dentry, dir, new_dentry); mutex_unlock(&inode->i_mutex); if (!error) @@ -2654,9 +2644,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old_dir->i_op->rename) return -EPERM; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); - old_name = fsnotify_oldname_init(old_dentry->d_name.name); if (is_dir) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7570573bdb3..7f9ecc46f3f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -97,16 +97,12 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -int nfs_write_inode(struct inode *inode, int sync) +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret == 0) - ret = nfs_commit_inode(inode, FLUSH_SYNC); - } else - ret = nfs_commit_inode(inode, 0); + ret = nfs_commit_inode(inode, + wbc->sync_mode == WB_SYNC_ALL ? FLUSH_SYNC : 0); if (ret >= 0) return 0; __mark_inode_dirty(inode, I_DIRTY_DATASYNC); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 29e464d23b3..11f82f03c5d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); -extern int nfs_write_inode(struct inode *,int); +extern int nfs_write_inode(struct inode *, struct writeback_control *); extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 15dc2deaac5..8eca17df4f6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -20,7 +20,6 @@ #include <linux/fcntl.h> #include <linux/namei.h> #include <linux/delay.h> -#include <linux/quotaops.h> #include <linux/fsnotify.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> @@ -377,7 +376,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, put_write_access(inode); goto out_nfserr; } - vfs_dq_init(inode); } /* sanitize the mode change */ @@ -745,8 +743,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags = O_RDWR|O_LARGEFILE; else flags = O_WRONLY|O_LARGEFILE; - - vfs_dq_init(inode); } *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), flags, current_cred()); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 5a9e34475e3..9173e82a45d 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1545,7 +1545,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, write_inode_now(bmp_vi, !datasync); iput(bmp_vi); } - ret = ntfs_write_inode(vi, 1); + ret = __ntfs_write_inode(vi, 1); write_inode_now(vi, !datasync); err = sync_blockdev(vi->i_sb->s_bdev); if (unlikely(err && !ret)) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 43179ddd336..b681c71d706 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2182,7 +2182,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) - ret = ntfs_write_inode(vi, 1); + ret = __ntfs_write_inode(vi, 1); write_inode_now(vi, !datasync); /* * NOTE: If we were to use mapping->private_list (see ext2 and diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index dc2505abb6d..4b57fb1eac2 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2957,7 +2957,7 @@ out: * * Return 0 on success and -errno on error. */ -int ntfs_write_inode(struct inode *vi, int sync) +int __ntfs_write_inode(struct inode *vi, int sync) { sle64 nt; ntfs_inode *ni = NTFS_I(vi); diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 117eaf8032a..9a113544605 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi); extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); -extern int ntfs_write_inode(struct inode *vi, int sync); +extern int __ntfs_write_inode(struct inode *vi, int sync); static inline void ntfs_commit_inode(struct inode *vi) { if (!is_bad_inode(vi)) - ntfs_write_inode(vi, 1); + __ntfs_write_inode(vi, 1); return; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 80b04770e8e..1cf39dfaee7 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -39,6 +39,7 @@ #include "dir.h" #include "debug.h" #include "index.h" +#include "inode.h" #include "aops.h" #include "layout.h" #include "malloc.h" @@ -2662,6 +2663,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) return 0; } +#ifdef NTFS_RW +static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) +{ + return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); +} +#endif + /** * The complete super operations. */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 2bbe1ecc08c..9f8bd913c51 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5713,7 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto out; } - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(inode->i_sb, len)); ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); @@ -6936,7 +6936,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, goto bail; } - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - @@ -7301,11 +7301,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, unsigned int page_end; u64 phys; - if (vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, @@ -7381,7 +7380,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4c2a6d282c4..21441ddb550 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1764,10 +1764,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, wc->w_handle = handle; - if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { - ret = -EDQUOT; - goto out_commit; + if (clusters_to_alloc) { + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); + if (ret) + goto out_commit; } /* * We don't want this to fail in ocfs2_write_end(), so do it @@ -1810,7 +1811,7 @@ success: return 0; out_quota: if (clusters_to_alloc) - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); out_commit: ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 765d66c7098..efd77d071c8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, - alloc + dx_alloc))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc)); + if (ret) goto out_commit; - } did_quota = 1; if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, bytes_allocated); + dquot_free_space_nodirty(dir, bytes_allocated); ocfs2_commit_trans(osb, handle); @@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(sb, 1))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(sb, 1)); + if (status) goto bail; - } did_quota = 1; status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, @@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb, status = 0; bail: if (did_quota && status < 0) - vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); mlog_exit(status); return status; } @@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(dir->i_sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, @@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_commit_trans(osb, handle); @@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; /* @@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5b52547d629..17947dc8341 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); + if (file->f_mode & FMODE_WRITE) + dquot_initialize(inode); + spin_lock(&oi->ip_lock); /* Check that the inode hasn't been wiped from disk by another @@ -629,11 +632,10 @@ restart_all: } restarted_transaction: - if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, - clusters_to_add))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (status) goto leave; - } did_quota = 1; /* reserve a write to the file entry early on - that we if we @@ -674,7 +676,7 @@ restarted_transaction: clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); spin_unlock(&OCFS2_I(inode)->ip_lock); /* Release unused quota reservation */ - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); did_quota = 0; @@ -710,7 +712,7 @@ restarted_transaction: leave: if (status < 0 && did_quota) - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); if (handle) { ocfs2_commit_trans(osb, handle); @@ -978,6 +980,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { + dquot_initialize(inode); + status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); @@ -1020,7 +1024,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) /* * Gather pointers to quota structures so that allocation / * freeing of quota structures happens here and not inside - * vfs_dq_transfer() where we have problems with lock ordering + * dquot_transfer() where we have problems with lock ordering */ if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, @@ -1053,7 +1057,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); goto bail_unlock; } - status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + status = dquot_transfer(inode, attr); if (status < 0) goto bail_commit; } else { diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 88459bdd1ff..278a223aae1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -665,7 +665,7 @@ static int ocfs2_remove_inode(struct inode *inode, } ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); status = ocfs2_free_dinode(handle, inode_alloc_inode, inode_alloc_bh, di); @@ -971,6 +971,8 @@ void ocfs2_delete_inode(struct inode *inode) goto bail; } + dquot_initialize(inode); + if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most @@ -1087,6 +1089,8 @@ void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); + dquot_drop(inode); + /* To preven remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 50fb26a6a5f..d9cd4e373a5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) } else inode->i_gid = current_fsgid(); inode->i_mode = mode; - vfs_dq_init(inode); + dquot_initialize(inode); return inode; } @@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir, (unsigned long)dev, dentry->d_name.len, dentry->d_name.name); + dquot_initialize(dir); + /* get our super block */ osb = OCFS2_SB(dir->i_sb); @@ -348,13 +350,9 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto leave; - } did_quota_inode = 1; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, @@ -431,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir, status = 0; leave: if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -636,6 +634,8 @@ static int ocfs2_link(struct dentry *old_dentry, if (S_ISDIR(inode->i_mode)) return -EPERM; + dquot_initialize(dir); + err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); if (err < 0) { if (err != -ENOENT) @@ -791,6 +791,8 @@ static int ocfs2_unlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, dentry->d_name.len, dentry->d_name.name); + dquot_initialize(dir); + BUG_ON(dentry->d_parent->d_inode != dir); mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1051,6 +1053,9 @@ static int ocfs2_rename(struct inode *old_dir, old_dentry->d_name.len, old_dentry->d_name.name, new_dentry->d_name.len, new_dentry->d_name.name); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + osb = OCFS2_SB(old_dir->i_sb); if (new_inode) { @@ -1599,6 +1604,8 @@ static int ocfs2_symlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); + dquot_initialize(dir); + sb = dir->i_sb; osb = OCFS2_SB(sb); @@ -1688,13 +1695,9 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto bail; - } did_quota_inode = 1; mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, @@ -1716,11 +1719,10 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; - if (vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status) goto bail; - } did_quota = 1; status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, new_fe_bh, @@ -1788,10 +1790,10 @@ static int ocfs2_symlink(struct inode *dir, d_instantiate(dentry, inode); bail: if (status < 0 && did_quota) - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -2099,13 +2101,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto leave; - } did_quota_inode = 1; inode->i_nlink = 0; @@ -2140,7 +2138,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, insert_inode_hash(inode); leave: if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index b437dc0c4ca..355f41d1d52 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -851,13 +851,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) } const struct dquot_operations ocfs2_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, .release_dquot = ocfs2_release_dquot, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index fb6aa7acf54..9e96921dffd 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4390,7 +4390,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, } mutex_lock(&inode->i_mutex); - vfs_dq_init(dir); + dquot_initialize(dir); error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); mutex_unlock(&inode->i_mutex); if (!error) diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index f3b7c1541f3..75d9b5ba1d4 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -11,6 +11,7 @@ #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/vmalloc.h> +#include <linux/writeback.h> #include <linux/crc-itu-t.h> #include "omfs.h" @@ -89,7 +90,7 @@ static void omfs_update_checksums(struct omfs_inode *oi) oi->i_head.h_check_xor = xor; } -static int omfs_write_inode(struct inode *inode, int wait) +static int __omfs_write_inode(struct inode *inode, int wait) { struct omfs_inode *oi; struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); @@ -162,9 +163,14 @@ out: return ret; } +static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int omfs_sync_inode(struct inode *inode) { - return omfs_write_inode(inode, 1); + return __omfs_write_inode(inode, 1); } /* diff --git a/fs/open.c b/fs/open.c index e0b2d88b038..e17f54454b5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> -#include <linux/quotaops.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/slab.h> @@ -278,10 +277,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) error = security_path_truncate(&path, length, 0); - if (!error) { - vfs_dq_init(inode); + if (!error) error = do_truncate(path.dentry, length, 0, NULL); - } put_write_and_out: put_write_access(inode); diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index efc02ebb8c7..dad7fb247dd 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -59,3 +59,8 @@ config QUOTACTL bool depends on XFS_QUOTA || QUOTA default y + +config QUOTACTL_COMPAT + bool + depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT + default y diff --git a/fs/quota/Makefile b/fs/quota/Makefile index 68d4f6dc057..5f9e9e276af 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o +obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/compat.c b/fs/quota/compat.c new file mode 100644 index 00000000000..fb1892fe3e5 --- /dev/null +++ b/fs/quota/compat.c @@ -0,0 +1,118 @@ + +#include <linux/syscalls.h> +#include <linux/compat.h> +#include <linux/quotaops.h> + +/* + * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) + * and is necessary due to alignment problems. + */ +struct compat_if_dqblk { + compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +/* XFS structures */ +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; + +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) +{ + unsigned int cmds; + struct if_dqblk __user *dqblk; + struct compat_if_dqblk __user *compat_dqblk; + struct fs_quota_stat __user *fsqstat; + struct compat_fs_quota_stat __user *compat_fsqstat; + compat_uint_t data; + u16 xdata; + long ret; + + cmds = cmd >> SUBCMDSHIFT; + + switch (cmds) { + case Q_GETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = sys_quotactl(cmd, special, id, dqblk); + if (ret) + break; + if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || + get_user(data, &dqblk->dqb_valid) || + put_user(data, &compat_dqblk->dqb_valid)) + ret = -EFAULT; + break; + case Q_SETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = -EFAULT; + if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(data, &compat_dqblk->dqb_valid) || + put_user(data, &dqblk->dqb_valid)) + break; + ret = sys_quotactl(cmd, special, id, dqblk); + break; + case Q_XGETQSTAT: + fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); + compat_fsqstat = addr; + ret = sys_quotactl(cmd, special, id, fsqstat); + if (ret) + break; + ret = -EFAULT; + /* Copying qs_version, qs_flags, qs_pad */ + if (copy_in_user(compat_fsqstat, fsqstat, + offsetof(struct compat_fs_quota_stat, qs_uquota))) + break; + /* Copying qs_uquota */ + if (copy_in_user(&compat_fsqstat->qs_uquota, + &fsqstat->qs_uquota, + sizeof(compat_fsqstat->qs_uquota)) || + get_user(data, &fsqstat->qs_uquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) + break; + /* Copying qs_gquota */ + if (copy_in_user(&compat_fsqstat->qs_gquota, + &fsqstat->qs_gquota, + sizeof(compat_fsqstat->qs_gquota)) || + get_user(data, &fsqstat->qs_gquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) + break; + /* Copying the rest */ + if (copy_in_user(&compat_fsqstat->qs_incoredqs, + &fsqstat->qs_incoredqs, + sizeof(struct compat_fs_quota_stat) - + offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || + get_user(xdata, &fsqstat->qs_iwarnlimit) || + put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) + break; + ret = 0; + break; + default: + ret = sys_quotactl(cmd, special, id, addr); + } + return ret; +} diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3fc62b097be..e0b870f4749 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -100,9 +100,13 @@ * * Any operation working on dquots via inode pointers must hold dqptr_sem. If * operation is just reading pointers from inode (or not using them at all) the - * read lock is enough. If pointers are altered function must hold write lock - * (these locking rules also apply for S_NOQUOTA flag in the inode - note that - * for altering the flag i_mutex is also needed). + * read lock is enough. If pointers are altered function must hold write lock. + * Special care needs to be taken about S_NOQUOTA inode flag (marking that + * inode is a quota file). Functions adding pointers from inode to dquots have + * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping dqptr_sem. This makes + * sure they cannot race with quotaon which first sets S_NOQUOTA flag and + * then drops all pointers to dquots from an inode. * * Each dquot has its dq_lock mutex. Locked dquots might not be referenced * from inodes (dquot_alloc_space() and such don't check the dq_lock). @@ -225,6 +229,9 @@ static struct hlist_head *dquot_hash; struct dqstats dqstats; EXPORT_SYMBOL(dqstats); +static qsize_t inode_get_rsv_space(struct inode *inode); +static void __dquot_initialize(struct inode *inode, int type); + static inline unsigned int hashfn(const struct super_block *sb, unsigned int id, int type) { @@ -564,7 +571,7 @@ out: } EXPORT_SYMBOL(dquot_scan_active); -int vfs_quota_sync(struct super_block *sb, int type) +int vfs_quota_sync(struct super_block *sb, int type, int wait) { struct list_head *dirty; struct dquot *dquot; @@ -609,6 +616,33 @@ int vfs_quota_sync(struct super_block *sb, int type) spin_unlock(&dq_list_lock); mutex_unlock(&dqopt->dqonoff_mutex); + if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) + return 0; + + /* This is not very clever (and fast) but currently I don't know about + * any other simple way of getting quota data to disk and we must get + * them there for userspace to be visible... */ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, + I_MUTEX_QUOTA); + truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); + mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); + } + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return 0; } EXPORT_SYMBOL(vfs_quota_sync); @@ -840,11 +874,14 @@ static int dqinit_needed(struct inode *inode, int type) static void add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; + int reserved = 0; spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; if (!atomic_read(&inode->i_writecount)) continue; if (!dqinit_needed(inode, type)) @@ -854,7 +891,7 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode_lock); iput(old_inode); - sb->dq_op->initialize(inode, type); + __dquot_initialize(inode, type); /* We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the inode_lock. * We cannot iput the inode now as we can be holding the last @@ -865,6 +902,12 @@ static void add_dquot_ref(struct super_block *sb, int type) } spin_unlock(&inode_lock); iput(old_inode); + + if (reserved) { + printk(KERN_WARNING "VFS (%s): Writes happened before quota" + " was turned on thus quota information is probably " + "inconsistent. Please run quotacheck(8).\n", sb->s_id); + } } /* @@ -978,10 +1021,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) /* * Claim reserved quota space */ -static void dquot_claim_reserved_space(struct dquot *dquot, - qsize_t number) +static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) { - WARN_ON(dquot->dq_dqb.dqb_rsvspace < number); + if (dquot->dq_dqb.dqb_rsvspace < number) { + WARN_ON_ONCE(1); + number = dquot->dq_dqb.dqb_rsvspace; + } dquot->dq_dqb.dqb_curspace += number; dquot->dq_dqb.dqb_rsvspace -= number; } @@ -989,7 +1034,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot, static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { - dquot->dq_dqb.dqb_rsvspace -= number; + if (dquot->dq_dqb.dqb_rsvspace >= number) + dquot->dq_dqb.dqb_rsvspace -= number; + else { + WARN_ON_ONCE(1); + dquot->dq_dqb.dqb_rsvspace = 0; + } } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) @@ -1131,13 +1181,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return QUOTA_OK; + return 0; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { *warntype = QUOTA_NL_IHARDWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1146,7 +1196,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) get_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { *warntype = QUOTA_NL_ISOFTLONGWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1157,7 +1207,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; } - return QUOTA_OK; + return 0; } /* needs dq_data_lock */ @@ -1169,7 +1219,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return QUOTA_OK; + return 0; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space; @@ -1179,7 +1229,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BHARDWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1189,7 +1239,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BSOFTLONGWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1205,10 +1255,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ - return NO_QUOTA; + return -EDQUOT; } - return QUOTA_OK; + return 0; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) @@ -1242,25 +1292,32 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } + /* - * Initialize quota pointers in inode - * We do things in a bit complicated way but by that we avoid calling - * dqget() and thus filesystem callbacks under dqptr_sem. + * Initialize quota pointers in inode + * + * We do things in a bit complicated way but by that we avoid calling + * dqget() and thus filesystem callbacks under dqptr_sem. + * + * It is better to call this function outside of any transaction as it + * might need a lot of space in journal for dquot structure allocation. */ -int dquot_initialize(struct inode *inode, int type) +static void __dquot_initialize(struct inode *inode, int type) { unsigned int id = 0; - int cnt, ret = 0; - struct dquot *got[MAXQUOTAS] = { NULL, NULL }; + int cnt; + struct dquot *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; + qsize_t rsv; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return 0; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return; /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + got[cnt] = NULL; if (type != -1 && cnt != type) continue; switch (cnt) { @@ -1275,7 +1332,6 @@ int dquot_initialize(struct inode *inode, int type) } down_write(&sb_dqopt(sb)->dqptr_sem); - /* Having dqptr_sem we know NOQUOTA flags can't be altered... */ if (IS_NOQUOTA(inode)) goto out_err; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1287,20 +1343,31 @@ int dquot_initialize(struct inode *inode, int type) if (!inode->i_dquot[cnt]) { inode->i_dquot[cnt] = got[cnt]; got[cnt] = NULL; + /* + * Make quota reservation system happy if someone + * did a write before quota was turned on + */ + rsv = inode_get_rsv_space(inode); + if (unlikely(rsv)) + dquot_resv_space(inode->i_dquot[cnt], rsv); } } out_err: up_write(&sb_dqopt(sb)->dqptr_sem); /* Drop unused references */ dqput_all(got); - return ret; +} + +void dquot_initialize(struct inode *inode) +{ + __dquot_initialize(inode, -1); } EXPORT_SYMBOL(dquot_initialize); /* * Release all quotas referenced by inode */ -int dquot_drop(struct inode *inode) +static void __dquot_drop(struct inode *inode) { int cnt; struct dquot *put[MAXQUOTAS]; @@ -1312,32 +1379,31 @@ int dquot_drop(struct inode *inode) } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); dqput_all(put); - return 0; } -EXPORT_SYMBOL(dquot_drop); -/* Wrapper to remove references to quota structures from inode */ -void vfs_dq_drop(struct inode *inode) -{ - /* Here we can get arbitrary inode from clear_inode() so we have - * to be careful. OTOH we don't need locking as quota operations - * are allowed to change only at mount time */ - if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op - && inode->i_sb->dq_op->drop) { - int cnt; - /* Test before calling to rule out calls from proc and such - * where we are not allowed to block. Note that this is - * actually reliable test even without the lock - the caller - * must assure that nobody can come after the DQUOT_DROP and - * add quota pointers back anyway */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - break; - if (cnt < MAXQUOTAS) - inode->i_sb->dq_op->drop(inode); - } -} -EXPORT_SYMBOL(vfs_dq_drop); +void dquot_drop(struct inode *inode) +{ + int cnt; + + if (IS_NOQUOTA(inode)) + return; + + /* + * Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + break; + } + + if (cnt < MAXQUOTAS) + __dquot_drop(inode); +} +EXPORT_SYMBOL(dquot_drop); /* * inode_reserved_space is managed internally by quota, and protected by @@ -1351,28 +1417,30 @@ static qsize_t *inode_reserved_space(struct inode * inode) return inode->i_sb->dq_op->get_reserved_space(inode); } -static void inode_add_rsv_space(struct inode *inode, qsize_t number) +void inode_add_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_add_rsv_space); - -static void inode_claim_rsv_space(struct inode *inode, qsize_t number) +void inode_claim_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_claim_rsv_space); -static void inode_sub_rsv_space(struct inode *inode, qsize_t number) +void inode_sub_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_sub_rsv_space); static qsize_t inode_get_rsv_space(struct inode *inode) { @@ -1404,38 +1472,34 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) } /* - * Following four functions update i_blocks+i_bytes fields and - * quota information (together with appropriate checks) - * NOTE: We absolutely rely on the fact that caller dirties - * the inode (usually macros in quotaops.h care about this) and - * holds a handle for the current transaction so that dquot write and - * inode write go into the same transaction. + * This functions updates i_blocks+i_bytes fields and quota information + * (together with appropriate checks). + * + * NOTE: We absolutely rely on the fact that caller dirties the inode + * (usually helpers in quotaops.h care about this) and holds a handle for + * the current transaction so that dquot write and inode write go into the + * same transaction. */ /* * This operation can block, but only after everything is updated */ int __dquot_alloc_space(struct inode *inode, qsize_t number, - int warn, int reserve) + int warn, int reserve) { - int cnt, ret = QUOTA_OK; + int cnt, ret = 0; char warntype[MAXQUOTAS]; /* * First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_incr_space(inode, number, reserve); goto out; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - inode_incr_space(inode, number, reserve); - goto out_unlock; - } - for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1443,9 +1507,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) - == NO_QUOTA) { - ret = NO_QUOTA; + ret = check_bdq(inode->i_dquot[cnt], number, !warn, + warntype+cnt); + if (ret) { spin_unlock(&dq_data_lock); goto out_flush_warn; } @@ -1466,61 +1530,45 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, mark_all_dquot_dirty(inode->i_dquot); out_flush_warn: flush_warnings(inode->i_dquot, warntype); -out_unlock: up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; } - -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) -{ - return __dquot_alloc_space(inode, number, warn, 0); -} -EXPORT_SYMBOL(dquot_alloc_space); - -int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) -{ - return __dquot_alloc_space(inode, number, warn, 1); -} -EXPORT_SYMBOL(dquot_reserve_space); +EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ -int dquot_alloc_inode(const struct inode *inode, qsize_t number) +int dquot_alloc_inode(const struct inode *inode) { - int cnt, ret = NO_QUOTA; + int cnt, ret = 0; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return QUOTA_OK; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) - == NO_QUOTA) + ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); + if (ret) goto warn_put_all; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - dquot_incr_inodes(inode->i_dquot[cnt], number); + dquot_incr_inodes(inode->i_dquot[cnt], 1); } - ret = QUOTA_OK; + warn_put_all: spin_unlock(&dq_data_lock); - if (ret == QUOTA_OK) + if (ret == 0) mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1528,23 +1576,19 @@ warn_put_all: } EXPORT_SYMBOL(dquot_alloc_inode); -int dquot_claim_space(struct inode *inode, qsize_t number) +/* + * Convert in-memory reserved quotas to real consumed quotas + */ +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { int cnt; - int ret = QUOTA_OK; - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_claim_rsv_space(inode, number); - goto out; + return 0; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - inode_claim_rsv_space(inode, number); - goto out; - } - spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1557,33 +1601,26 @@ int dquot_claim_space(struct inode *inode, qsize_t number) spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return 0; } -EXPORT_SYMBOL(dquot_claim_space); +EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * This operation can block, but only after everything is updated */ -int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) +void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { -out_sub: + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_decr_space(inode, number, reserve); - return QUOTA_OK; + return; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - goto out_sub; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) @@ -1603,56 +1640,34 @@ out_sub: out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; -} - -int dquot_free_space(struct inode *inode, qsize_t number) -{ - return __dquot_free_space(inode, number, 0); } -EXPORT_SYMBOL(dquot_free_space); - -/* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - __dquot_free_space(inode, number, 1); - -} -EXPORT_SYMBOL(dquot_release_reserved_space); +EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ -int dquot_free_inode(const struct inode *inode, qsize_t number) +void dquot_free_inode(const struct inode *inode) { unsigned int cnt; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return QUOTA_OK; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); - dquot_decr_inodes(inode->i_dquot[cnt], number); + warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); + dquot_decr_inodes(inode->i_dquot[cnt], 1); } spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; } EXPORT_SYMBOL(dquot_free_inode); @@ -1662,37 +1677,31 @@ EXPORT_SYMBOL(dquot_free_inode); * This operation can block, but only after everything is updated * A transaction must be started when entering this function. */ -int dquot_transfer(struct inode *inode, struct iattr *iattr) +static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) { qsize_t space, cur_space; qsize_t rsv_space = 0; struct dquot *transfer_from[MAXQUOTAS]; struct dquot *transfer_to[MAXQUOTAS]; - int cnt, ret = QUOTA_OK; - int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid, - chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid; + int cnt, ret = 0; char warntype_to[MAXQUOTAS]; char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) - return QUOTA_OK; + return 0; /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { transfer_from[cnt] = NULL; transfer_to[cnt] = NULL; warntype_to[cnt] = QUOTA_NL_NOWARN; } - if (chuid) - transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid, - USRQUOTA); - if (chgid) - transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid, - GRPQUOTA); - + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (mask & (1 << cnt)) + transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt); + } down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); goto put_all; @@ -1706,9 +1715,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (!transfer_to[cnt]) continue; transfer_from[cnt] = inode->i_dquot[cnt]; - if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == - NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, - warntype_to + cnt) == NO_QUOTA) + ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); + if (ret) + goto over_quota; + ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); + if (ret) goto over_quota; } @@ -1762,22 +1773,32 @@ over_quota: /* Clear dquot pointers we don't want to dqput() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) transfer_from[cnt] = NULL; - ret = NO_QUOTA; goto warn_put_all; } -EXPORT_SYMBOL(dquot_transfer); -/* Wrapper for transferring ownership of an inode */ -int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +/* Wrapper for transferring ownership of an inode for uid/gid only + * Called from FSXXX_setattr() + */ +int dquot_transfer(struct inode *inode, struct iattr *iattr) { + qid_t chid[MAXQUOTAS]; + unsigned long mask = 0; + + if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { + mask |= 1 << USRQUOTA; + chid[USRQUOTA] = iattr->ia_uid; + } + if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { + mask |= 1 << GRPQUOTA; + chid[GRPQUOTA] = iattr->ia_gid; + } if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { - vfs_dq_init(inode); - if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) - return 1; + dquot_initialize(inode); + return __dquot_transfer(inode, chid, mask); } return 0; } -EXPORT_SYMBOL(vfs_dq_transfer); +EXPORT_SYMBOL(dquot_transfer); /* * Write info of quota file to disk @@ -1798,13 +1819,6 @@ EXPORT_SYMBOL(dquot_commit_info); * Definitions of diskquota operations. */ const struct dquot_operations dquot_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, @@ -1815,6 +1829,20 @@ const struct dquot_operations dquot_operations = { }; /* + * Generic helper for ->open on filesystems supporting disk quotas. + */ +int dquot_file_open(struct inode *inode, struct file *file) +{ + int error; + + error = generic_file_open(inode, file); + if (!error && (file->f_mode & FMODE_WRITE)) + dquot_initialize(inode); + return error; +} +EXPORT_SYMBOL(dquot_file_open); + +/* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) @@ -1993,11 +2021,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { - /* As we bypass the pagecache we must now flush the inode so - * that we see all the changes from userspace... */ - write_inode_now(inode, 1); - /* And now flush the block cache so that kernel sees the - * changes */ + /* As we bypass the pagecache we must now flush all the + * dirty data and invalidate caches so that kernel sees + * changes from userspace. It is not enough to just flush + * the quota file since if blocksize < pagesize, invalidation + * of the cache could fail because of other unrelated dirty + * data */ + sync_filesystem(sb); invalidate_bdev(sb->s_bdev); } mutex_lock(&dqopt->dqonoff_mutex); @@ -2010,14 +2040,16 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ - down_write(&dqopt->dqptr_sem); mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; mutex_unlock(&inode->i_mutex); - up_write(&dqopt->dqptr_sem); - sb->dq_op->drop(inode); + /* + * When S_NOQUOTA is set, remove dquot references as no more + * references can be added + */ + __dquot_drop(inode); } error = -EIO; @@ -2053,14 +2085,12 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - down_write(&dqopt->dqptr_sem); mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; mutex_unlock(&inode->i_mutex); - up_write(&dqopt->dqptr_sem); } mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c new file mode 100644 index 00000000000..2663ed90fb0 --- /dev/null +++ b/fs/quota/netlink.c @@ -0,0 +1,95 @@ + +#include <linux/cred.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/quotaops.h> +#include <linux/sched.h> +#include <net/netlink.h> +#include <net/genetlink.h> + +/* Netlink family structure for quota */ +static struct genl_family quota_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "VFS_DQUOT", + .version = 1, + .maxattr = QUOTA_NL_A_MAX, +}; + +/** + * quota_send_warning - Send warning to userspace about exceeded quota + * @type: The quota type: USRQQUOTA, GRPQUOTA,... + * @id: The user or group id of the quota that was exceeded + * @dev: The device on which the fs is mounted (sb->s_dev) + * @warntype: The type of the warning: QUOTA_NL_... + * + * This can be used by filesystems (including those which don't use + * dquot) to send a message to userspace relating to quota limits. + * + */ + +void quota_send_warning(short type, unsigned int id, dev_t dev, + const char warntype) +{ + static atomic_t seq; + struct sk_buff *skb; + void *msg_head; + int ret; + int msg_size = 4 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u64)); + + /* We have to allocate using GFP_NOFS as we are called from a + * filesystem performing write and thus further recursion into + * the fs to free some data could cause deadlocks. */ + skb = genlmsg_new(msg_size, GFP_NOFS); + if (!skb) { + printk(KERN_ERR + "VFS: Not enough memory to send quota warning.\n"); + return; + } + msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), + "a_genl_family, 0, QUOTA_NL_C_WARNING); + if (!msg_head) { + printk(KERN_ERR + "VFS: Cannot store netlink header in quota warning.\n"); + goto err_out; + } + ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); + if (ret) + goto attr_err_out; + genlmsg_end(skb, msg_head); + + genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); + return; +attr_err_out: + printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); +err_out: + kfree_skb(skb); +} +EXPORT_SYMBOL(quota_send_warning); + +static int __init quota_init(void) +{ + if (genl_register_family("a_genl_family) != 0) + printk(KERN_ERR + "VFS: Failed to create quota netlink interface.\n"); + return 0; +}; + +module_init(quota_init); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ee91e275695..95388f9b735 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -10,7 +10,6 @@ #include <linux/slab.h> #include <asm/current.h> #include <asm/uaccess.h> -#include <linux/compat.h> #include <linux/kernel.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -18,220 +17,205 @@ #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/types.h> -#include <net/netlink.h> -#include <net/genetlink.h> +#include <linux/writeback.h> -/* Check validity of generic quotactl commands */ -static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int check_quotactl_permission(struct super_block *sb, int type, int cmd, + qid_t id) { - if (type >= MAXQUOTAS) - return -EINVAL; - if (!sb && cmd != Q_SYNC) - return -ENODEV; - /* Is operation supported? */ - if (sb && !sb->s_qcop) - return -ENOSYS; - switch (cmd) { - case Q_GETFMT: - break; - case Q_QUOTAON: - if (!sb->s_qcop->quota_on) - return -ENOSYS; - break; - case Q_QUOTAOFF: - if (!sb->s_qcop->quota_off) - return -ENOSYS; - break; - case Q_SETINFO: - if (!sb->s_qcop->set_info) - return -ENOSYS; - break; - case Q_GETINFO: - if (!sb->s_qcop->get_info) - return -ENOSYS; - break; - case Q_SETQUOTA: - if (!sb->s_qcop->set_dqblk) - return -ENOSYS; - break; - case Q_GETQUOTA: - if (!sb->s_qcop->get_dqblk) - return -ENOSYS; - break; - case Q_SYNC: - if (sb && !sb->s_qcop->quota_sync) - return -ENOSYS; + /* these commands do not require any special privilegues */ + case Q_GETFMT: + case Q_SYNC: + case Q_GETINFO: + case Q_XGETQSTAT: + case Q_XQUOTASYNC: + break; + /* allow to query information for dquots we "own" */ + case Q_GETQUOTA: + case Q_XGETQUOTA: + if ((type == USRQUOTA && current_euid() == id) || + (type == GRPQUOTA && in_egroup_p(id))) break; - default: - return -EINVAL; + /*FALLTHROUGH*/ + default: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; } - /* Is quota turned on for commands which need it? */ - switch (cmd) { - case Q_GETFMT: - case Q_GETINFO: - case Q_SETINFO: - case Q_SETQUOTA: - case Q_GETQUOTA: - /* This is just an informative test so we are satisfied - * without the lock */ - if (!sb_has_quota_active(sb, type)) - return -ESRCH; - } + return security_quotactl(cmd, type, id, sb); +} - /* Check privileges */ - if (cmd == Q_GETQUOTA) { - if (((type == USRQUOTA && current_euid() != id) || - (type == GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; +static int quota_sync_all(int type) +{ + struct super_block *sb; + int ret; + + if (type >= MAXQUOTAS) + return -EINVAL; + ret = security_quotactl(Q_SYNC, type, 0, NULL); + if (ret) + return ret; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_qcop || !sb->s_qcop->quota_sync) + continue; + + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) + sb->s_qcop->quota_sync(sb, type, 1); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; } - else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + spin_unlock(&sb_lock); return 0; } -/* Check validity of XFS Quota Manager commands */ -static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, + void __user *addr) { - if (type >= XQM_MAXQUOTAS) - return -EINVAL; - if (!sb) - return -ENODEV; - if (!sb->s_qcop) - return -ENOSYS; + char *pathname; + int ret = -ENOSYS; + + pathname = getname(addr); + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + if (sb->s_qcop->quota_on) + ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); + putname(pathname); + return ret; +} - switch (cmd) { - case Q_XQUOTAON: - case Q_XQUOTAOFF: - case Q_XQUOTARM: - if (!sb->s_qcop->set_xstate) - return -ENOSYS; - break; - case Q_XGETQSTAT: - if (!sb->s_qcop->get_xstate) - return -ENOSYS; - break; - case Q_XSETQLIM: - if (!sb->s_qcop->set_xquota) - return -ENOSYS; - break; - case Q_XGETQUOTA: - if (!sb->s_qcop->get_xquota) - return -ENOSYS; - break; - case Q_XQUOTASYNC: - if (!sb->s_qcop->quota_sync) - return -ENOSYS; - break; - default: - return -EINVAL; - } +static int quota_getfmt(struct super_block *sb, int type, void __user *addr) +{ + __u32 fmt; - /* Check privileges */ - if (cmd == Q_XGETQUOTA) { - if (((type == XQM_USRQUOTA && current_euid() != id) || - (type == XQM_GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + down_read(&sb_dqopt(sb)->dqptr_sem); + if (!sb_has_quota_active(sb, type)) { + up_read(&sb_dqopt(sb)->dqptr_sem); + return -ESRCH; } + fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; + up_read(&sb_dqopt(sb)->dqptr_sem); + if (copy_to_user(addr, &fmt, sizeof(fmt))) + return -EFAULT; + return 0; +} +static int quota_getinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + int ret; + + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->get_info) + return -ENOSYS; + ret = sb->s_qcop->get_info(sb, type, &info); + if (!ret && copy_to_user(addr, &info, sizeof(info))) + return -EFAULT; + return ret; +} + +static int quota_setinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + + if (copy_from_user(&info, addr, sizeof(info))) + return -EFAULT; + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->set_info) + return -ENOSYS; + return sb->s_qcop->set_info(sb, type, &info); +} + +static int quota_getquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct if_dqblk idq; + int ret; + + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); + if (ret) + return ret; + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; return 0; } -static int check_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int quota_setquota(struct super_block *sb, int type, qid_t id, + void __user *addr) { - int error; - - if (XQM_COMMAND(cmd)) - error = xqm_quotactl_valid(sb, type, cmd, id); - else - error = generic_quotactl_valid(sb, type, cmd, id); - if (!error) - error = security_quotactl(cmd, type, id, sb); - return error; + struct if_dqblk idq; + + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + return sb->s_qcop->set_dqblk(sb, type, id, &idq); } -#ifdef CONFIG_QUOTA -void sync_quota_sb(struct super_block *sb, int type) +static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) { - int cnt; + __u32 flags; - if (!sb->s_qcop->quota_sync) - return; + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->set_xstate) + return -ENOSYS; + return sb->s_qcop->set_xstate(sb, flags, cmd); +} - sb->s_qcop->quota_sync(sb, type); +static int quota_getxstate(struct super_block *sb, void __user *addr) +{ + struct fs_quota_stat fqs; + int ret; - if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) - return; - /* This is not very clever (and fast) but currently I don't know about - * any other simple way of getting quota data to disk and we must get - * them there for userspace to be visible... */ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + if (!sb->s_qcop->get_xstate) + return -ENOSYS; + ret = sb->s_qcop->get_xstate(sb, &fqs); + if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return ret; +} - /* - * Now when everything is written we can discard the pagecache so - * that userspace sees the changes. - */ - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (type != -1 && cnt != type) - continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, - I_MUTEX_QUOTA); - truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); - mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); - } - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); +static int quota_setxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + + if (copy_from_user(&fdq, addr, sizeof(fdq))) + return -EFAULT; + if (!sb->s_qcop->set_xquota) + return -ENOSYS; + return sb->s_qcop->set_xquota(sb, type, id, &fdq); } -#endif -static void sync_dquots(int type) +static int quota_getxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) { - struct super_block *sb; - int cnt; + struct fs_disk_quota fdq; + int ret; - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - /* This test just improves performance so it needn't be - * reliable... */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (type != -1 && type != cnt) - continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && - list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) - continue; - break; - } - if (cnt == MAXQUOTAS) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - sync_quota_sb(sb, type); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); + if (!sb->s_qcop->get_xquota) + return -ENOSYS; + ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); + if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) + return -EFAULT; + return ret; } /* Copy parameters and call proper function */ @@ -240,117 +224,55 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, { int ret; + if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS)) + return -EINVAL; + if (!sb->s_qcop) + return -ENOSYS; + + ret = check_quotactl_permission(sb, type, cmd, id); + if (ret < 0) + return ret; + switch (cmd) { - case Q_QUOTAON: { - char *pathname; - - pathname = getname(addr); - if (IS_ERR(pathname)) - return PTR_ERR(pathname); - ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); - putname(pathname); - return ret; - } - case Q_QUOTAOFF: - return sb->s_qcop->quota_off(sb, type, 0); - - case Q_GETFMT: { - __u32 fmt; - - down_read(&sb_dqopt(sb)->dqptr_sem); - if (!sb_has_quota_active(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); - return -ESRCH; - } - fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; - up_read(&sb_dqopt(sb)->dqptr_sem); - if (copy_to_user(addr, &fmt, sizeof(fmt))) - return -EFAULT; - return 0; - } - case Q_GETINFO: { - struct if_dqinfo info; - - ret = sb->s_qcop->get_info(sb, type, &info); - if (ret) - return ret; - if (copy_to_user(addr, &info, sizeof(info))) - return -EFAULT; - return 0; - } - case Q_SETINFO: { - struct if_dqinfo info; - - if (copy_from_user(&info, addr, sizeof(info))) - return -EFAULT; - return sb->s_qcop->set_info(sb, type, &info); - } - case Q_GETQUOTA: { - struct if_dqblk idq; - - ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); - if (ret) - return ret; - if (copy_to_user(addr, &idq, sizeof(idq))) - return -EFAULT; - return 0; - } - case Q_SETQUOTA: { - struct if_dqblk idq; - - if (copy_from_user(&idq, addr, sizeof(idq))) - return -EFAULT; - return sb->s_qcop->set_dqblk(sb, type, id, &idq); - } - case Q_SYNC: - if (sb) - sync_quota_sb(sb, type); - else - sync_dquots(type); - return 0; - - case Q_XQUOTAON: - case Q_XQUOTAOFF: - case Q_XQUOTARM: { - __u32 flags; - - if (copy_from_user(&flags, addr, sizeof(flags))) - return -EFAULT; - return sb->s_qcop->set_xstate(sb, flags, cmd); - } - case Q_XGETQSTAT: { - struct fs_quota_stat fqs; - - if ((ret = sb->s_qcop->get_xstate(sb, &fqs))) - return ret; - if (copy_to_user(addr, &fqs, sizeof(fqs))) - return -EFAULT; - return 0; - } - case Q_XSETQLIM: { - struct fs_disk_quota fdq; - - if (copy_from_user(&fdq, addr, sizeof(fdq))) - return -EFAULT; - return sb->s_qcop->set_xquota(sb, type, id, &fdq); - } - case Q_XGETQUOTA: { - struct fs_disk_quota fdq; - - ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); - if (ret) - return ret; - if (copy_to_user(addr, &fdq, sizeof(fdq))) - return -EFAULT; - return 0; - } - case Q_XQUOTASYNC: - return sb->s_qcop->quota_sync(sb, type); - /* We never reach here unless validity check is broken */ - default: - BUG(); + case Q_QUOTAON: + return quota_quotaon(sb, type, cmd, id, addr); + case Q_QUOTAOFF: + if (!sb->s_qcop->quota_off) + return -ENOSYS; + return sb->s_qcop->quota_off(sb, type, 0); + case Q_GETFMT: + return quota_getfmt(sb, type, addr); + case Q_GETINFO: + return quota_getinfo(sb, type, addr); + case Q_SETINFO: + return quota_setinfo(sb, type, addr); + case Q_GETQUOTA: + return quota_getquota(sb, type, id, addr); + case Q_SETQUOTA: + return quota_setquota(sb, type, id, addr); + case Q_SYNC: + if (!sb->s_qcop->quota_sync) + return -ENOSYS; + return sb->s_qcop->quota_sync(sb, type, 1); + case Q_XQUOTAON: + case Q_XQUOTAOFF: + case Q_XQUOTARM: + return quota_setxstate(sb, cmd, addr); + case Q_XGETQSTAT: + return quota_getxstate(sb, addr); + case Q_XSETQLIM: + return quota_setxquota(sb, type, id, addr); + case Q_XGETQUOTA: + return quota_getxquota(sb, type, id, addr); + case Q_XQUOTASYNC: + /* caller already holds s_umount */ + if (sb->s_flags & MS_RDONLY) + return -EROFS; + writeback_inodes_sb(sb); + return 0; + default: + return -EINVAL; } - return 0; } /* @@ -397,224 +319,23 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; - if (cmds != Q_SYNC || special) { - sb = quotactl_block(special); - if (IS_ERR(sb)) - return PTR_ERR(sb); + /* + * As a special case Q_SYNC can be called without a specific device. + * It will iterate all superblocks that have quota enabled and call + * the sync action on each of them. + */ + if (!special) { + if (cmds == Q_SYNC) + return quota_sync_all(type); + return -ENODEV; } - ret = check_quotactl_valid(sb, type, cmds, id); - if (ret >= 0) - ret = do_quotactl(sb, type, cmds, id, addr); - if (sb) - drop_super(sb); + sb = quotactl_block(special); + if (IS_ERR(sb)) + return PTR_ERR(sb); - return ret; -} - -#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT) -/* - * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) - * and is necessary due to alignment problems. - */ -struct compat_if_dqblk { - compat_u64 dqb_bhardlimit; - compat_u64 dqb_bsoftlimit; - compat_u64 dqb_curspace; - compat_u64 dqb_ihardlimit; - compat_u64 dqb_isoftlimit; - compat_u64 dqb_curinodes; - compat_u64 dqb_btime; - compat_u64 dqb_itime; - compat_uint_t dqb_valid; -}; - -/* XFS structures */ -struct compat_fs_qfilestat { - compat_u64 dqb_bhardlimit; - compat_u64 qfs_nblks; - compat_uint_t qfs_nextents; -}; - -struct compat_fs_quota_stat { - __s8 qs_version; - __u16 qs_flags; - __s8 qs_pad; - struct compat_fs_qfilestat qs_uquota; - struct compat_fs_qfilestat qs_gquota; - compat_uint_t qs_incoredqs; - compat_int_t qs_btimelimit; - compat_int_t qs_itimelimit; - compat_int_t qs_rtbtimelimit; - __u16 qs_bwarnlimit; - __u16 qs_iwarnlimit; -}; - -asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr) -{ - unsigned int cmds; - struct if_dqblk __user *dqblk; - struct compat_if_dqblk __user *compat_dqblk; - struct fs_quota_stat __user *fsqstat; - struct compat_fs_quota_stat __user *compat_fsqstat; - compat_uint_t data; - u16 xdata; - long ret; + ret = do_quotactl(sb, type, cmds, id, addr); - cmds = cmd >> SUBCMDSHIFT; - - switch (cmds) { - case Q_GETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = sys_quotactl(cmd, special, id, dqblk); - if (ret) - break; - if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || - get_user(data, &dqblk->dqb_valid) || - put_user(data, &compat_dqblk->dqb_valid)) - ret = -EFAULT; - break; - case Q_SETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = -EFAULT; - if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || - get_user(data, &compat_dqblk->dqb_valid) || - put_user(data, &dqblk->dqb_valid)) - break; - ret = sys_quotactl(cmd, special, id, dqblk); - break; - case Q_XGETQSTAT: - fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); - compat_fsqstat = addr; - ret = sys_quotactl(cmd, special, id, fsqstat); - if (ret) - break; - ret = -EFAULT; - /* Copying qs_version, qs_flags, qs_pad */ - if (copy_in_user(compat_fsqstat, fsqstat, - offsetof(struct compat_fs_quota_stat, qs_uquota))) - break; - /* Copying qs_uquota */ - if (copy_in_user(&compat_fsqstat->qs_uquota, - &fsqstat->qs_uquota, - sizeof(compat_fsqstat->qs_uquota)) || - get_user(data, &fsqstat->qs_uquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) - break; - /* Copying qs_gquota */ - if (copy_in_user(&compat_fsqstat->qs_gquota, - &fsqstat->qs_gquota, - sizeof(compat_fsqstat->qs_gquota)) || - get_user(data, &fsqstat->qs_gquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) - break; - /* Copying the rest */ - if (copy_in_user(&compat_fsqstat->qs_incoredqs, - &fsqstat->qs_incoredqs, - sizeof(struct compat_fs_quota_stat) - - offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || - get_user(xdata, &fsqstat->qs_iwarnlimit) || - put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) - break; - ret = 0; - break; - default: - ret = sys_quotactl(cmd, special, id, addr); - } + drop_super(sb); return ret; } -#endif - - -#ifdef CONFIG_QUOTA_NETLINK_INTERFACE - -/* Netlink family structure for quota */ -static struct genl_family quota_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = "VFS_DQUOT", - .version = 1, - .maxattr = QUOTA_NL_A_MAX, -}; - -/** - * quota_send_warning - Send warning to userspace about exceeded quota - * @type: The quota type: USRQQUOTA, GRPQUOTA,... - * @id: The user or group id of the quota that was exceeded - * @dev: The device on which the fs is mounted (sb->s_dev) - * @warntype: The type of the warning: QUOTA_NL_... - * - * This can be used by filesystems (including those which don't use - * dquot) to send a message to userspace relating to quota limits. - * - */ - -void quota_send_warning(short type, unsigned int id, dev_t dev, - const char warntype) -{ - static atomic_t seq; - struct sk_buff *skb; - void *msg_head; - int ret; - int msg_size = 4 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u64)); - - /* We have to allocate using GFP_NOFS as we are called from a - * filesystem performing write and thus further recursion into - * the fs to free some data could cause deadlocks. */ - skb = genlmsg_new(msg_size, GFP_NOFS); - if (!skb) { - printk(KERN_ERR - "VFS: Not enough memory to send quota warning.\n"); - return; - } - msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), - "a_genl_family, 0, QUOTA_NL_C_WARNING); - if (!msg_head) { - printk(KERN_ERR - "VFS: Cannot store netlink header in quota warning.\n"); - goto err_out; - } - ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); - if (ret) - goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); - if (ret) - goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); - if (ret) - goto attr_err_out; - genlmsg_end(skb, msg_head); - - genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); - return; -attr_err_out: - printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); -err_out: - kfree_skb(skb); -} -EXPORT_SYMBOL(quota_send_warning); - -static int __init quota_init(void) -{ - if (genl_register_family("a_genl_family) != 0) - printk(KERN_ERR - "VFS: Failed to create quota netlink interface.\n"); - return 0; -}; - -module_init(quota_init); -#endif - diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 65c87276117..dc014f7def0 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, journal_mark_dirty(th, s, sbh); if (for_unformatted) - vfs_dq_free_block_nodirty(inode, 1); + dquot_free_block_nodirty(inode, 1); } void reiserfs_free_block(struct reiserfs_transaction_handle *th, @@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start amount_needed, hint->inode->i_uid); #endif quota_ret = - vfs_dq_alloc_block_nodirty(hint->inode, amount_needed); + dquot_alloc_block_nodirty(hint->inode, amount_needed); if (quota_ret) /* Quota exceeded? */ return QUOTA_EXCEEDED; if (hint->preallocate && hint->prealloc_size) { @@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid); #endif - quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode, + quota_ret = dquot_prealloc_block_nodirty(hint->inode, hint->prealloc_size); if (quota_ret) hint->preallocate = hint->prealloc_size = 0; @@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start hint->inode->i_uid); #endif /* Free not allocated blocks */ - vfs_dq_free_block_nodirty(hint->inode, + dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); } @@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); #endif - vfs_dq_free_block_nodirty(hint->inode, amount_needed + + dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)-> i_prealloc_count); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index da2dba082e2..1d9c12714c5 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -289,7 +289,7 @@ const struct file_operations reiserfs_file_operations = { .compat_ioctl = reiserfs_compat_ioctl, #endif .mmap = reiserfs_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = reiserfs_file_release, .fsync = reiserfs_sync_file, .aio_read = generic_file_aio_read, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2df0f5c7c60..d1da94b82d8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -34,6 +34,9 @@ void reiserfs_delete_inode(struct inode *inode) int depth; int err; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); depth = reiserfs_write_lock_once(inode->i_sb); @@ -54,7 +57,7 @@ void reiserfs_delete_inode(struct inode *inode) * after delete_object so that quota updates go into the same transaction as * stat data deletion */ if (!err) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (journal_end(&th, inode->i_sb, jbegin_count)) goto out; @@ -1615,7 +1618,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, ** to properly mark inodes for datasync and such, but only actually ** does something when called for a synchronous update. */ -int reiserfs_write_inode(struct inode *inode, int do_sync) +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct reiserfs_transaction_handle th; int jbegin_count = 1; @@ -1627,7 +1630,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync) ** inode needs to reach disk for safety, and they can safely be ** ignored because the altered inode has already been logged. */ - if (do_sync && !(current->flags & PF_MEMALLOC)) { + if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { reiserfs_write_lock(inode->i_sb); if (!journal_begin(&th, inode->i_sb, jbegin_count)) { reiserfs_update_sd(&th, inode); @@ -1765,10 +1768,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto out_end_trans; - } if (!dir->i_nlink) { err = -EPERM; goto out_bad_inode; @@ -1959,12 +1962,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, INODE_PKEY(inode)->k_objectid = 0; /* Quota change must be inside a transaction for journaling */ - vfs_dq_free_inode(inode); + dquot_free_inode(inode); out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); /* Drop can be outside and it needs more credits so it's better to have it outside */ - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); @@ -3073,6 +3076,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { + dquot_initialize(inode); + /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate */ @@ -3134,8 +3139,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) jbegin_count); if (error) goto out; - error = - vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { journal_end(&th, inode->i_sb, jbegin_count); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 9d4dcf0b07c..96e4cbbfaa1 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, */ static int drop_new_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); make_bad_inode(inode); inode->i_flags |= S_NOQUOTA; iput(inode); @@ -554,7 +554,7 @@ static int drop_new_inode(struct inode *inode) } /* utility function that does setup for reiserfs_new_inode. -** vfs_dq_init needs lots of credits so it's better to have it +** dquot_initialize needs lots of credits so it's better to have it ** outside of a transaction, so we had to pull some bits of ** reiserfs_new_inode out into this func. */ @@ -577,7 +577,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) } else { inode->i_gid = current_fsgid(); } - vfs_dq_init(inode); + dquot_initialize(inode); return 0; } @@ -594,6 +594,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; + dquot_initialize(dir); + if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } @@ -666,6 +668,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } @@ -739,6 +743,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + dquot_initialize(dir); + #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ REISERFS_I(dir)->new_packing_locality = 1; @@ -842,6 +848,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + dquot_initialize(dir); + reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) @@ -923,6 +931,8 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) unsigned long savelink; int depth; + dquot_initialize(dir); + inode = dentry->d_inode; /* in this transaction we can be doing at max two balancings and update @@ -1024,6 +1034,8 @@ static int reiserfs_symlink(struct inode *parent_dir, 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); + dquot_initialize(parent_dir); + if (!(inode = new_inode(parent_dir->i_sb))) { return -ENOMEM; } @@ -1111,6 +1123,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + dquot_initialize(dir); + reiserfs_write_lock(dir->i_sb); if (inode->i_nlink >= REISERFS_LINK_MAX) { //FIXME: sd_nlink is 32 bit for new files @@ -1235,6 +1249,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 5fa7118f04e..313d39d639e 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1299,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, inode->i_uid, head2type(&s_ih)); #endif - vfs_dq_free_space_nodirty(inode, quota_cut_bytes); + dquot_free_space_nodirty(inode, quota_cut_bytes); /* Return deleted body length */ return ret_value; @@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, quota_cut_bytes, inode->i_uid, key2type(key)); #endif - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, quota_cut_bytes); } break; @@ -1733,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, '?'); #endif - vfs_dq_free_space_nodirty(inode, quota_cut_bytes); + dquot_free_space_nodirty(inode, quota_cut_bytes); return ret_value; } @@ -1968,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree key2type(&(key->on_disk_key))); #endif - if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) { + retval = dquot_alloc_space_nodirty(inode, pasted_size); + if (retval) { pathrelse(search_path); - return -EDQUOT; + return retval; } init_tb_struct(th, &s_paste_balance, th->t_super, search_path, pasted_size); @@ -2024,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree pasted_size, inode->i_uid, key2type(&(key->on_disk_key))); #endif - vfs_dq_free_space_nodirty(inode, pasted_size); + dquot_free_space_nodirty(inode, pasted_size); return retval; } @@ -2062,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, #endif /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... */ - if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) { + retval = dquot_alloc_space_nodirty(inode, quota_bytes); + if (retval) { pathrelse(path); - return -EDQUOT; + return retval; } } init_tb_struct(th, &s_ins_balance, th->t_super, path, @@ -2113,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, quota_bytes, inode->i_uid, head2type(ih)); #endif if (inode) - vfs_dq_free_space_nodirty(inode, quota_bytes); + dquot_free_space_nodirty(inode, quota_bytes); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b4a7dd03bdb..04bf5d791bd 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -246,7 +246,7 @@ static int finish_unfinished(struct super_block *s) retval = remove_save_link_only(s, &save_link_key, 0); continue; } - vfs_dq_init(inode); + dquot_initialize(inode); if (truncate && S_ISDIR(inode->i_mode)) { /* We got a truncate request for a dir which is impossible. @@ -578,6 +578,11 @@ out: reiserfs_write_unlock_once(inode->i_sb, lock_depth); } +static void reiserfs_clear_inode(struct inode *inode) +{ + dquot_drop(inode); +} + #ifdef CONFIG_QUOTA static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -590,6 +595,7 @@ static const struct super_operations reiserfs_sops = { .destroy_inode = reiserfs_destroy_inode, .write_inode = reiserfs_write_inode, .dirty_inode = reiserfs_dirty_inode, + .clear_inode = reiserfs_clear_inode, .delete_inode = reiserfs_delete_inode, .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, @@ -616,13 +622,6 @@ static int reiserfs_write_info(struct super_block *, int); static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, .release_dquot = reiserfs_release_dquot, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 81f09fab8ae..37d034ca7d9 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -61,7 +61,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); return dir->i_op->create(dir, dentry, mode, NULL); } #endif @@ -69,7 +68,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); return dir->i_op->mkdir(dir, dentry, mode); } @@ -81,7 +79,6 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, I_MUTEX_CHILD, dir->i_sb); @@ -97,7 +94,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, I_MUTEX_CHILD, dir->i_sb); diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 70e3244fa30..df8a19ef870 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o +squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2a796031034..1cb0d81b164 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -29,15 +29,14 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/mutex.h> #include <linux/string.h> #include <linux/buffer_head.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "decompressor.h" /* * Read the metadata block length, this is stored in the first two @@ -153,72 +152,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, } if (compressed) { - int zlib_err = 0, zlib_init = 0; - - /* - * Uncompress block. - */ - - mutex_lock(&msblk->read_data_mutex); - - msblk->stream.avail_out = 0; - msblk->stream.avail_in = 0; - - bytes = length; - do { - if (msblk->stream.avail_in == 0 && k < b) { - avail = min(bytes, msblk->devblksize - offset); - bytes -= avail; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto release_mutex; - - if (avail == 0) { - offset = 0; - put_bh(bh[k++]); - continue; - } - - msblk->stream.next_in = bh[k]->b_data + offset; - msblk->stream.avail_in = avail; - offset = 0; - } - - if (msblk->stream.avail_out == 0 && page < pages) { - msblk->stream.next_out = buffer[page++]; - msblk->stream.avail_out = PAGE_CACHE_SIZE; - } - - if (!zlib_init) { - zlib_err = zlib_inflateInit(&msblk->stream); - if (zlib_err != Z_OK) { - ERROR("zlib_inflateInit returned" - " unexpected result 0x%x," - " srclength %d\n", zlib_err, - srclength); - goto release_mutex; - } - zlib_init = 1; - } - - zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH); - - if (msblk->stream.avail_in == 0 && k < b) - put_bh(bh[k++]); - } while (zlib_err == Z_OK); - - if (zlib_err != Z_STREAM_END) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } - - zlib_err = zlib_inflateEnd(&msblk->stream); - if (zlib_err != Z_OK) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } - length = msblk->stream.total_out; - mutex_unlock(&msblk->read_data_mutex); + length = squashfs_decompress(msblk, buffer, bh, b, offset, + length, srclength, pages); + if (length < 0) + goto read_failure; } else { /* * Block is uncompressed. @@ -255,9 +192,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, kfree(bh); return length; -release_mutex: - mutex_unlock(&msblk->read_data_mutex); - block_release: for (; k < b; k++) put_bh(bh[k]); diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 40c98fa6b5d..57314bee905 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -51,7 +51,6 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/wait.h> -#include <linux/zlib.h> #include <linux/pagemap.h> #include "squashfs_fs.h" diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c new file mode 100644 index 00000000000..157478da6ac --- /dev/null +++ b/fs/squashfs/decompressor.c @@ -0,0 +1,68 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * decompressor.c + */ + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/buffer_head.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file (and decompressor.h) implements a decompressor framework for + * Squashfs, allowing multiple decompressors to be easily supported + */ + +static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { + NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 +}; + +static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { + NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 +}; + +static const struct squashfs_decompressor squashfs_unknown_comp_ops = { + NULL, NULL, NULL, 0, "unknown", 0 +}; + +static const struct squashfs_decompressor *decompressor[] = { + &squashfs_zlib_comp_ops, + &squashfs_lzma_unsupported_comp_ops, + &squashfs_lzo_unsupported_comp_ops, + &squashfs_unknown_comp_ops +}; + + +const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) +{ + int i; + + for (i = 0; decompressor[i]->id; i++) + if (id == decompressor[i]->id) + break; + + return decompressor[i]; +} diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h new file mode 100644 index 00000000000..7425f80783f --- /dev/null +++ b/fs/squashfs/decompressor.h @@ -0,0 +1,55 @@ +#ifndef DECOMPRESSOR_H +#define DECOMPRESSOR_H +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * decompressor.h + */ + +struct squashfs_decompressor { + void *(*init)(struct squashfs_sb_info *); + void (*free)(void *); + int (*decompress)(struct squashfs_sb_info *, void **, + struct buffer_head **, int, int, int, int, int); + int id; + char *name; + int supported; +}; + +static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk) +{ + return msblk->decompressor->init(msblk); +} + +static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, + void *s) +{ + if (msblk->decompressor) + msblk->decompressor->free(s); +} + +static inline int squashfs_decompress(struct squashfs_sb_info *msblk, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) +{ + return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, + length, srclength, pages); +} +#endif diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index 566b0eaed86..12b933ac658 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -30,7 +30,6 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 2b1b8fe5e03..7f93d5a9ee0 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -39,7 +39,6 @@ #include <linux/vfs.h> #include <linux/dcache.h> #include <linux/exportfs.h> -#include <linux/zlib.h> #include <linux/slab.h> #include "squashfs_fs.h" diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 717767d831d..a25c5060bdc 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -47,7 +47,6 @@ #include <linux/string.h> #include <linux/pagemap.h> #include <linux/mutex.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c index b5a2c15bbbc..7c90bbd6879 100644 --- a/fs/squashfs/fragment.c +++ b/fs/squashfs/fragment.c @@ -36,7 +36,6 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index 3795b837ba2..b7f64bcd2b7 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -34,7 +34,6 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 9101dbde39e..49daaf669e4 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -40,7 +40,6 @@ #include <linux/fs.h> #include <linux/vfs.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 9e398653b22..5266bd8ad93 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -57,7 +57,6 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/dcache.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 0e9feb6adf7..fe2587af551 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *, u64, int); extern int squashfs_read_table(struct super_block *, void *, u64, int); +/* decompressor.c */ +extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); + /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, unsigned int); @@ -71,7 +74,7 @@ extern struct inode *squashfs_iget(struct super_block *, long long, extern int squashfs_read_inode(struct inode *, long long); /* - * Inodes and files operations + * Inodes, files and decompressor operations */ /* dir.c */ @@ -88,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops; /* symlink.c */ extern const struct address_space_operations squashfs_symlink_aops; + +/* zlib_wrapper.c */ +extern const struct squashfs_decompressor squashfs_zlib_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 283daafc568..79024245ea0 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -183,8 +183,6 @@ #define SQUASHFS_MAX_FILE_SIZE (1LL << \ (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) -#define SQUASHFS_MARKER_BYTE 0xff - /* meta index cache */ #define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) #define SQUASHFS_META_ENTRIES 127 @@ -211,7 +209,9 @@ struct meta_index { /* * definitions for structures on disk */ -#define ZLIB_COMPRESSION 1 +#define ZLIB_COMPRESSION 1 +#define LZMA_COMPRESSION 2 +#define LZO_COMPRESSION 3 struct squashfs_super_block { __le32 s_magic; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index c8c65614dd1..2e77dc547e2 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -52,25 +52,25 @@ struct squashfs_cache_entry { }; struct squashfs_sb_info { - int devblksize; - int devblksize_log2; - struct squashfs_cache *block_cache; - struct squashfs_cache *fragment_cache; - struct squashfs_cache *read_page; - int next_meta_index; - __le64 *id_table; - __le64 *fragment_index; - unsigned int *fragment_index_2; - struct mutex read_data_mutex; - struct mutex meta_index_mutex; - struct meta_index *meta_index; - z_stream stream; - __le64 *inode_lookup_table; - u64 inode_table; - u64 directory_table; - unsigned int block_size; - unsigned short block_log; - long long bytes_used; - unsigned int inodes; + const struct squashfs_decompressor *decompressor; + int devblksize; + int devblksize_log2; + struct squashfs_cache *block_cache; + struct squashfs_cache *fragment_cache; + struct squashfs_cache *read_page; + int next_meta_index; + __le64 *id_table; + __le64 *fragment_index; + struct mutex read_data_mutex; + struct mutex meta_index_mutex; + struct meta_index *meta_index; + void *stream; + __le64 *inode_lookup_table; + u64 inode_table; + u64 directory_table; + unsigned int block_size; + unsigned short block_log; + long long bytes_used; + unsigned int inodes; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 6c197ef53ad..3550aec2f65 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -35,34 +35,41 @@ #include <linux/pagemap.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/zlib.h> #include <linux/magic.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "decompressor.h" static struct file_system_type squashfs_fs_type; static const struct super_operations squashfs_super_ops; -static int supported_squashfs_filesystem(short major, short minor, short comp) +static const struct squashfs_decompressor *supported_squashfs_filesystem(short + major, short minor, short id) { + const struct squashfs_decompressor *decompressor; + if (major < SQUASHFS_MAJOR) { ERROR("Major/Minor mismatch, older Squashfs %d.%d " "filesystems are unsupported\n", major, minor); - return -EINVAL; + return NULL; } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { ERROR("Major/Minor mismatch, trying to mount newer " "%d.%d filesystem\n", major, minor); ERROR("Please update your kernel\n"); - return -EINVAL; + return NULL; } - if (comp != ZLIB_COMPRESSION) - return -EINVAL; + decompressor = squashfs_lookup_decompressor(id); + if (!decompressor->supported) { + ERROR("Filesystem uses \"%s\" compression. This is not " + "supported\n", decompressor->name); + return NULL; + } - return 0; + return decompressor; } @@ -87,13 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) } msblk = sb->s_fs_info; - msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(), - GFP_KERNEL); - if (msblk->stream.workspace == NULL) { - ERROR("Failed to allocate zlib workspace\n"); - goto failure; - } - sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); if (sblk == NULL) { ERROR("Failed to allocate squashfs_super_block\n"); @@ -120,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + err = -EINVAL; + /* Check it is a SQUASHFS superblock */ sb->s_magic = le32_to_cpu(sblk->s_magic); if (sb->s_magic != SQUASHFS_MAGIC) { if (!silent) ERROR("Can't find a SQUASHFS superblock on %s\n", bdevname(sb->s_bdev, b)); - err = -EINVAL; goto failed_mount; } - /* Check the MAJOR & MINOR versions and compression type */ - err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major), + /* Check the MAJOR & MINOR versions and lookup compression type */ + msblk->decompressor = supported_squashfs_filesystem( + le16_to_cpu(sblk->s_major), le16_to_cpu(sblk->s_minor), le16_to_cpu(sblk->compression)); - if (err < 0) + if (msblk->decompressor == NULL) goto failed_mount; - err = -EINVAL; - /* * Check if there's xattrs in the filesystem. These are not * supported in this version, so warn that they will be ignored. @@ -205,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) err = -ENOMEM; + msblk->stream = squashfs_decompressor_init(msblk); + if (msblk->stream == NULL) + goto failed_mount; + msblk->block_cache = squashfs_cache_init("metadata", SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); if (msblk->block_cache == NULL) @@ -292,17 +296,16 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); + squashfs_decompressor_free(msblk, msblk->stream); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); - kfree(msblk->stream.workspace); kfree(sb->s_fs_info); sb->s_fs_info = NULL; kfree(sblk); return err; failure: - kfree(msblk->stream.workspace); kfree(sb->s_fs_info); sb->s_fs_info = NULL; return -ENOMEM; @@ -346,10 +349,10 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); + squashfs_decompressor_free(sbi, sbi->stream); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); - kfree(sbi->stream.workspace); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 83d87880aac..e80be2022a7 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -36,7 +36,6 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/pagemap.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c new file mode 100644 index 00000000000..4dd70e04333 --- /dev/null +++ b/fs/squashfs/zlib_wrapper.c @@ -0,0 +1,150 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * zlib_wrapper.c + */ + + +#include <linux/mutex.h> +#include <linux/buffer_head.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "decompressor.h" + +static void *zlib_init(struct squashfs_sb_info *dummy) +{ + z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->workspace = kmalloc(zlib_inflate_workspacesize(), + GFP_KERNEL); + if (stream->workspace == NULL) + goto failed; + + return stream; + +failed: + ERROR("Failed to allocate zlib workspace\n"); + kfree(stream); + return NULL; +} + + +static void zlib_free(void *strm) +{ + z_stream *stream = strm; + + if (stream) + kfree(stream->workspace); + kfree(stream); +} + + +static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + int zlib_err = 0, zlib_init = 0; + int avail, bytes, k = 0, page = 0; + z_stream *stream = msblk->stream; + + mutex_lock(&msblk->read_data_mutex); + + stream->avail_out = 0; + stream->avail_in = 0; + + bytes = length; + do { + if (stream->avail_in == 0 && k < b) { + avail = min(bytes, msblk->devblksize - offset); + bytes -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + if (avail == 0) { + offset = 0; + put_bh(bh[k++]); + continue; + } + + stream->next_in = bh[k]->b_data + offset; + stream->avail_in = avail; + offset = 0; + } + + if (stream->avail_out == 0 && page < pages) { + stream->next_out = buffer[page++]; + stream->avail_out = PAGE_CACHE_SIZE; + } + + if (!zlib_init) { + zlib_err = zlib_inflateInit(stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflateInit returned unexpected " + "result 0x%x, srclength %d\n", + zlib_err, srclength); + goto release_mutex; + } + zlib_init = 1; + } + + zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); + + if (stream->avail_in == 0 && k < b) + put_bh(bh[k++]); + } while (zlib_err == Z_OK); + + if (zlib_err != Z_STREAM_END) { + ERROR("zlib_inflate error, data probably corrupt\n"); + goto release_mutex; + } + + zlib_err = zlib_inflateEnd(stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflate error, data probably corrupt\n"); + goto release_mutex; + } + + mutex_unlock(&msblk->read_data_mutex); + return stream->total_out; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_zlib_comp_ops = { + .init = zlib_init, + .free = zlib_free, + .decompress = zlib_uncompress, + .id = ZLIB_COMPRESSION, + .name = "zlib", + .supported = 1 +}; + diff --git a/fs/sync.c b/fs/sync.c index 418727a2a23..f557d71cb09 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -34,14 +34,14 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (!sb->s_bdi) return 0; - /* Avoid doing twice syncing and cache pruning for quota sync */ - if (!wait) { - writeout_quota_sb(sb, -1); - writeback_inodes_sb(sb); - } else { - sync_quota_sb(sb, -1); + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, -1, wait); + + if (wait) sync_inodes_sb(sb); - } + else + writeback_inodes_sb(sb); + if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 9824743832a..4573734d723 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/buffer_head.h> #include <linux/vfs.h> +#include <linux/writeback.h> #include <linux/namei.h> #include <asm/byteorder.h> #include "sysv.h" @@ -246,7 +247,7 @@ bad_inode: return ERR_PTR(-EIO); } -int sysv_write_inode(struct inode *inode, int wait) +static int __sysv_write_inode(struct inode *inode, int wait) { struct super_block * sb = inode->i_sb; struct sysv_sb_info * sbi = SYSV_SB(sb); @@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait) return 0; } +int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int sysv_sync_inode(struct inode *inode) { - return sysv_write_inode(inode, 1); + return __sysv_write_inode(inode, 1); } static void sysv_delete_inode(struct inode *inode) diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 53786eb5cf6..94cb9b4d76c 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping, /* inode.c */ extern struct inode *sysv_iget(struct super_block *, unsigned int); -extern int sysv_write_inode(struct inode *, int); +extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); extern int sysv_sync_inode(struct inode *); extern void sysv_set_inode(struct inode *, dev_t); extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 552fb0111ff..401e503d44a 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, 1); + err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); return err; out_cancel: diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 16a6444330e..e26c02ab6cd 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1011,7 +1011,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) /* Is the page fully inside @i_size? */ if (page->index < end_index) { if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; /* @@ -1039,7 +1039,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) kunmap_atomic(kaddr, KM_USER0); if (i_size > synced_i_size) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; } @@ -1242,7 +1242,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, if (release) ubifs_release_budget(c, &req); if (IS_SYNC(inode)) - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); return err; out: @@ -1316,7 +1316,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) * the inode unless this is a 'datasync()' call. */ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 43f9d19a6f3..4d2f2157dd3 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode) /* * Note, Linux write-back code calls this without 'i_mutex'. */ -static int ubifs_write_inode(struct inode *inode, int wait) +static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct ubifs_info *c = inode->i_sb->s_fs_info; diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index b2d96f45c12..ccc3ad7242d 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -208,7 +208,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, ((char *)bh->b_data)[(bit + i) >> 3]); } else { if (inode) - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); udf_add_free_space(sb, sbi->s_partition, 1); } } @@ -260,11 +260,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, while (bit < (sb->s_blocksize << 3) && block_count > 0) { if (!udf_test_bit(bit, bh->b_data)) goto out; - else if (vfs_dq_prealloc_block(inode, 1)) + else if (dquot_prealloc_block(inode, 1)) goto out; else if (!udf_clear_bit(bit, bh->b_data)) { udf_debug("bit already cleared for block %d\n", bit); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto out; } block_count--; @@ -390,10 +390,14 @@ got_block: /* * Check quota for allocation of this block. */ - if (inode && vfs_dq_alloc_block(inode, 1)) { - mutex_unlock(&sbi->s_alloc_mutex); - *err = -EDQUOT; - return 0; + if (inode) { + int ret = dquot_alloc_block(inode, 1); + + if (ret) { + mutex_unlock(&sbi->s_alloc_mutex); + *err = ret; + return 0; + } } newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - @@ -449,7 +453,7 @@ static void udf_table_free_blocks(struct super_block *sb, /* We do this up front - There are some error conditions that could occure, but.. oh well */ if (inode) - vfs_dq_free_block(inode, count); + dquot_free_block(inode, count); udf_add_free_space(sb, sbi->s_partition, count); start = bloc->logicalBlockNum + offset; @@ -694,7 +698,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, epos.offset -= adsize; alloc_count = (elen >> sb->s_blocksize_bits); - if (inode && vfs_dq_prealloc_block(inode, + if (inode && dquot_prealloc_block(inode, alloc_count > block_count ? block_count : alloc_count)) alloc_count = 0; else if (alloc_count > block_count) { @@ -797,12 +801,13 @@ static int udf_table_new_block(struct super_block *sb, newblock = goal_eloc.logicalBlockNum; goal_eloc.logicalBlockNum++; goal_elen -= sb->s_blocksize; - - if (inode && vfs_dq_alloc_block(inode, 1)) { - brelse(goal_epos.bh); - mutex_unlock(&sbi->s_alloc_mutex); - *err = -EDQUOT; - return 0; + if (inode) { + *err = dquot_alloc_block(inode, 1); + if (*err) { + brelse(goal_epos.bh); + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } } if (goal_elen) diff --git a/fs/udf/file.c b/fs/udf/file.c index f311d509b6a..1eb06774ed9 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -34,6 +34,7 @@ #include <linux/errno.h> #include <linux/smp_lock.h> #include <linux/pagemap.h> +#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/aio.h> @@ -207,7 +208,7 @@ const struct file_operations udf_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .ioctl = udf_ioctl, - .open = generic_file_open, + .open = dquot_file_open, .mmap = generic_file_mmap, .write = do_sync_write, .aio_write = udf_file_aio_write, @@ -217,6 +218,29 @@ const struct file_operations udf_file_operations = { .llseek = generic_file_llseek, }; +static int udf_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) + dquot_initialize(inode); + + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + error = dquot_transfer(inode, iattr); + if (error) + return error; + } + + return inode_setattr(inode, iattr); +} + const struct inode_operations udf_file_inode_operations = { - .truncate = udf_truncate, + .truncate = udf_truncate, + .setattr = udf_setattr, }; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index c10fa39f97e..fb68c9cd0c3 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); clear_inode(inode); @@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct inode *inode; - int block; + int block, ret; uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); @@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) insert_inode_hash(inode); mark_inode_dirty(inode); - if (vfs_dq_alloc_inode(inode)) { - vfs_dq_drop(inode); + dquot_initialize(inode); + ret = dquot_alloc_inode(inode); + if (ret) { + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); - *err = -EDQUOT; + *err = ret; return NULL; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 378a7592257..b57ab0402d8 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -36,6 +36,7 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> @@ -70,6 +71,9 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); void udf_delete_inode(struct inode *inode) { + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -108,6 +112,8 @@ void udf_clear_inode(struct inode *inode) (unsigned long long)inode->i_size, (unsigned long long)iinfo->i_lenExtents); } + + dquot_drop(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; } @@ -1373,12 +1379,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe) return mode; } -int udf_write_inode(struct inode *inode, int sync) +int udf_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; lock_kernel(); - ret = udf_update_inode(inode, sync); + ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 7c56ff00cd5..db423ab078b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -563,6 +563,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, int err; struct udf_inode_info *iinfo; + dquot_initialize(dir); + lock_kernel(); inode = udf_new_inode(dir, mode, &err); if (!inode) { @@ -616,6 +618,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!old_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + lock_kernel(); err = -EIO; inode = udf_new_inode(dir, mode, &err); @@ -662,6 +666,8 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; + dquot_initialize(dir); + lock_kernel(); err = -EMLINK; if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) @@ -799,6 +805,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; + dquot_initialize(dir); + retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -845,6 +853,8 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct fileIdentDesc cfi; struct kernel_lb_addr tloc; + dquot_initialize(dir); + retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -899,6 +909,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct buffer_head *bh; struct udf_inode_info *iinfo; + dquot_initialize(dir); + lock_kernel(); inode = udf_new_inode(dir, S_IFLNK, &err); if (!inode) @@ -1069,6 +1081,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, int err; struct buffer_head *bh; + dquot_initialize(dir); + lock_kernel(); if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { unlock_kernel(); @@ -1131,6 +1145,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + lock_kernel(); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 8d46f4294ee..4223ac855da 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -142,7 +142,7 @@ extern void udf_truncate(struct inode *); extern void udf_read_inode(struct inode *); extern void udf_delete_inode(struct inode *); extern void udf_clear_inode(struct inode *); -extern int udf_write_inode(struct inode *, int); +extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); extern int udf_extend_file(struct inode *, struct extent_position *, struct kernel_long_ad *, sector_t); diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 54c16ec95df..5cfa4d85ccf 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) "bit already cleared for fragment %u", i); } - vfs_dq_free_block(inode, count); + dquot_free_block(inode, count); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); @@ -195,7 +195,7 @@ do_more: ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); - vfs_dq_free_block(inode, uspi->s_fpb); + dquot_free_block(inode, uspi->s_fpb); fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree++; @@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; + int ret; UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", (unsigned long long)fragment, oldcount, newcount); @@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); for (i = oldcount; i < newcount; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); - if (vfs_dq_alloc_block(inode, count)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, count); + if (ret) { + *err = ret; return 0; } @@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; u64 result; + int ret; UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", inode->i_ino, cgno, (unsigned long long)goal, count); @@ -664,7 +667,7 @@ cg_found: for (i = count; i < uspi->s_fpb; i++) ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; - vfs_dq_free_block(inode, i); + dquot_free_block(inode, i); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; @@ -676,8 +679,9 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; - if (vfs_dq_alloc_block(inode, count)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, count); + if (ret) { + *err = ret; return 0; } for (i = 0; i < count; i++) @@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode, struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; + int ret; UFSD("ENTER, goal %llu\n", (unsigned long long)goal); @@ -747,8 +752,9 @@ gotit: ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, -1); - if (vfs_dq_alloc_block(inode, uspi->s_fpb)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, uspi->s_fpb); + if (ret) { + *err = ret; return INVBLOCK; } diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 73655c61240..a8962cecde5 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -24,6 +24,7 @@ */ #include <linux/fs.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -40,7 +41,7 @@ const struct file_operations ufs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .fsync = simple_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 3527c00fef0..230ecf60802 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode) is_directory = S_ISDIR(inode->i_mode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); clear_inode (inode); @@ -355,9 +355,10 @@ cg_found: unlock_super (sb); - if (vfs_dq_alloc_inode(inode)) { - vfs_dq_drop(inode); - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) { + dquot_drop(inode); goto fail_without_unlock; } diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7cf33379fd4..80b68c3702d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -36,6 +36,8 @@ #include <linux/mm.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -890,11 +892,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync) return 0; } -int ufs_write_inode (struct inode * inode, int wait) +int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; lock_kernel(); - ret = ufs_update_inode (inode, wait); + ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; } @@ -908,6 +910,9 @@ void ufs_delete_inode (struct inode * inode) { loff_t old_i_size; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) goto no_delete; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 4c26d9e8bc9..118556243e7 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -30,6 +30,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/smp_lock.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -84,6 +85,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, int err; UFSD("BEGIN\n"); + + dquot_initialize(dir); + inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); @@ -107,6 +111,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t if (!old_valid_dev(rdev)) return -EINVAL; + + dquot_initialize(dir); + inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -131,6 +138,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; + dquot_initialize(dir); + lock_kernel(); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); @@ -176,6 +185,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return -EMLINK; } + dquot_initialize(dir); + inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -193,6 +204,8 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= UFS_LINK_MAX) goto out; + dquot_initialize(dir); + lock_kernel(); inode_inc_link_count(dir); @@ -237,6 +250,8 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; + dquot_initialize(dir); + de = ufs_find_entry(dir, &dentry->d_name, &page); if (!de) goto out; @@ -281,6 +296,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 143c20bfb04..66b63a75161 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1432,6 +1432,11 @@ static void destroy_inodecache(void) kmem_cache_destroy(ufs_inode_cachep); } +static void ufs_clear_inode(struct inode *inode) +{ + dquot_drop(inode); +} + #ifdef CONFIG_QUOTA static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -1442,6 +1447,7 @@ static const struct super_operations ufs_super_ops = { .destroy_inode = ufs_destroy_inode, .write_inode = ufs_write_inode, .delete_inode = ufs_delete_inode, + .clear_inode = ufs_clear_inode, .put_super = ufs_put_super, .write_super = ufs_write_super, .sync_fs = ufs_sync_fs, diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 41dd431ce22..d3b6270cb37 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -44,6 +44,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/sched.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -517,9 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + error = dquot_transfer(inode, attr); + if (error) + return error; + } if (ia_valid & ATTR_SIZE && attr->ia_size != i_size_read(inode)) { loff_t old_i_size = inode->i_size; + + dquot_initialize(inode); + error = vmtruncate(inode, attr->ia_size); if (error) return error; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 01d0e2a3b23..43f9f5d5670 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int); /* inode.c */ extern struct inode *ufs_iget(struct super_block *, unsigned long); -extern int ufs_write_inode (struct inode *, int); +extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_delete_inode (struct inode *); extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 3d4a0c84d63..1947514ce1a 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -44,20 +44,6 @@ xfs_quota_type(int type) } STATIC int -xfs_fs_quota_sync( - struct super_block *sb, - int type) -{ - struct xfs_mount *mp = XFS_M(sb); - - if (sb->s_flags & MS_RDONLY) - return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; - return -xfs_sync_data(mp, 0); -} - -STATIC int xfs_fs_get_xstate( struct super_block *sb, struct fs_quota_stat *fqs) @@ -82,8 +68,6 @@ xfs_fs_set_xstate( return -EROFS; if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (uflags & XFS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; @@ -144,14 +128,11 @@ xfs_fs_set_xquota( return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); } const struct quotactl_ops xfs_quotactl_operations = { - .quota_sync = xfs_fs_quota_sync, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, .get_xquota = xfs_fs_get_xquota, diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 25ea2408118..71345a370d9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1063,7 +1063,7 @@ xfs_log_inode( STATIC int xfs_fs_write_inode( struct inode *inode, - int sync) + struct writeback_control *wbc) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1074,11 +1074,7 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (sync) { - error = xfs_wait_on_pages(ip, 0, -1); - if (error) - goto out; - + if (wbc->sync_mode == WB_SYNC_ALL) { /* * Make sure the inode has hit stable storage. By using the * log and the fsync transactions we reduce the IOs we have diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 6b049030fbe..cac84b00666 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -202,14 +202,6 @@ static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) return flags & EXT3_OTHER_FLMASK; } -/* - * Inode dynamic state flags - */ -#define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ -#define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ -#define EXT3_STATE_XATTR 0x00000004 /* has in-inode xattrs */ -#define EXT3_STATE_FLUSH_ON_CLOSE 0x00000008 - /* Used to pass group descriptor data when online resize is done */ struct ext3_new_group_input { __u32 group; /* Group number for this data */ @@ -560,6 +552,31 @@ static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT3_FIRST_INO(sb) && ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)); } + +/* + * Inode dynamic state flags + */ +enum { + EXT3_STATE_JDATA, /* journaled data exists */ + EXT3_STATE_NEW, /* inode is newly created */ + EXT3_STATE_XATTR, /* has in-inode xattrs */ + EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */ +}; + +static inline int ext3_test_inode_state(struct inode *inode, int bit) +{ + return test_bit(bit, &EXT3_I(inode)->i_state); +} + +static inline void ext3_set_inode_state(struct inode *inode, int bit) +{ + set_bit(bit, &EXT3_I(inode)->i_state); +} + +static inline void ext3_clear_inode_state(struct inode *inode, int bit) +{ + clear_bit(bit, &EXT3_I(inode)->i_state); +} #else /* Assume that user mode programs are passing in an ext3fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -877,7 +894,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, int create); extern struct inode *ext3_iget(struct super_block *, unsigned long); -extern int ext3_write_inode (struct inode *, int); +extern int ext3_write_inode (struct inode *, struct writeback_control *); extern int ext3_setattr (struct dentry *, struct iattr *); extern void ext3_delete_inode (struct inode *); extern int ext3_sync_inode (handle_t *, struct inode *); diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h index 93e7428156b..7679acdb519 100644 --- a/include/linux/ext3_fs_i.h +++ b/include/linux/ext3_fs_i.h @@ -87,7 +87,7 @@ struct ext3_inode_info { * near to their parent directory's inode. */ __u32 i_block_group; - __u32 i_state; /* Dynamic state flags for ext3 */ + unsigned long i_state; /* Dynamic state flags for ext3 */ /* block reservation info */ struct ext3_block_alloc_info *i_block_alloc_info; diff --git a/include/linux/fs.h b/include/linux/fs.h index 5b3182c7eb5..45689621a85 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1557,7 +1557,7 @@ struct super_operations { void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); - int (*write_inode) (struct inode *, int); + int (*write_inode) (struct inode *, struct writeback_control *wbc); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 331530cd3cc..f3aa59cb675 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -246,19 +246,8 @@ typedef struct journal_superblock_s #define J_ASSERT(assert) BUG_ON(!(assert)) -#if defined(CONFIG_BUFFER_DEBUG) -void buffer_assertion_failure(struct buffer_head *bh); -#define J_ASSERT_BH(bh, expr) \ - do { \ - if (!(expr)) \ - buffer_assertion_failure(bh); \ - J_ASSERT(expr); \ - } while (0) -#define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr) -#else #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) -#endif #if defined(JBD_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 638ce4554c7..1ec87635818 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug; #define jbd_debug(f, a...) /**/ #endif -static inline void *jbd2_alloc(size_t size, gfp_t flags) -{ - return (void *)__get_free_pages(flags, get_order(size)); -} - -static inline void jbd2_free(void *ptr, size_t size) -{ - free_pages((unsigned long)ptr, get_order(size)); -}; +extern void *jbd2_alloc(size_t size, gfp_t flags); +extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 @@ -284,19 +277,8 @@ typedef struct journal_superblock_s #define J_ASSERT(assert) BUG_ON(!(assert)) -#if defined(CONFIG_BUFFER_DEBUG) -void buffer_assertion_failure(struct buffer_head *bh); -#define J_ASSERT_BH(bh, expr) \ - do { \ - if (!(expr)) \ - buffer_assertion_failure(bh); \ - J_ASSERT(expr); \ - } while (0) -#define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr) -#else #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) -#endif #if defined(JBD2_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 1b672f74a32..e7d1b2e0070 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -122,6 +122,11 @@ struct kprobe { /* Kprobe status flags */ #define KPROBE_FLAG_GONE 1 /* breakpoint has already gone */ #define KPROBE_FLAG_DISABLED 2 /* probe is temporarily disabled */ +#define KPROBE_FLAG_OPTIMIZED 4 /* + * probe is really optimized. + * NOTE: + * this flag is only for optimized_kprobe. + */ /* Has this kprobe gone ? */ static inline int kprobe_gone(struct kprobe *p) @@ -134,6 +139,12 @@ static inline int kprobe_disabled(struct kprobe *p) { return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE); } + +/* Is this kprobe really running optimized path ? */ +static inline int kprobe_optimized(struct kprobe *p) +{ + return p->flags & KPROBE_FLAG_OPTIMIZED; +} /* * Special probe type that uses setjmp-longjmp type tricks to resume * execution at a specified entry with a matching prototype corresponding @@ -249,6 +260,39 @@ extern kprobe_opcode_t *get_insn_slot(void); extern void free_insn_slot(kprobe_opcode_t *slot, int dirty); extern void kprobes_inc_nmissed_count(struct kprobe *p); +#ifdef CONFIG_OPTPROBES +/* + * Internal structure for direct jump optimized probe + */ +struct optimized_kprobe { + struct kprobe kp; + struct list_head list; /* list for optimizing queue */ + struct arch_optimized_insn optinsn; +}; + +/* Architecture dependent functions for direct jump optimization */ +extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn); +extern int arch_check_optimized_kprobe(struct optimized_kprobe *op); +extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op); +extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op); +extern int arch_optimize_kprobe(struct optimized_kprobe *op); +extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); +extern kprobe_opcode_t *get_optinsn_slot(void); +extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty); +extern int arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr); + +extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); + +#ifdef CONFIG_SYSCTL +extern int sysctl_kprobes_optimization; +extern int proc_kprobes_optimization_handler(struct ctl_table *table, + int write, void __user *buffer, + size_t *length, loff_t *ppos); +#endif + +#endif /* CONFIG_OPTPROBES */ + /* Get the kprobe at this addr (if any) - called with preemption disabled */ struct kprobe *get_kprobe(void *addr); void kretprobe_hash_lock(struct task_struct *tsk, diff --git a/include/linux/kvm.h b/include/linux/kvm.h index a24de0b1858..60df9c84eca 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -103,7 +103,7 @@ struct kvm_userspace_memory_region { /* for kvm_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES 1UL - +#define KVM_MEMSLOT_INVALID (1UL << 1) /* for KVM_IRQ_LINE */ struct kvm_irq_level { @@ -497,6 +497,11 @@ struct kvm_ioeventfd { #endif #define KVM_CAP_S390_PSW 42 #define KVM_CAP_PPC_SEGSTATE 43 +#define KVM_CAP_HYPERV 44 +#define KVM_CAP_HYPERV_VAPIC 45 +#define KVM_CAP_HYPERV_SPIN 46 +#define KVM_CAP_PCI_SEGMENT 47 +#define KVM_CAP_X86_ROBUST_SINGLESTEP 51 #ifdef KVM_CAP_IRQ_ROUTING @@ -691,8 +696,9 @@ struct kvm_assigned_pci_dev { __u32 busnr; __u32 devfn; __u32 flags; + __u32 segnr; union { - __u32 reserved[12]; + __u32 reserved[11]; }; }; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bd5a616d937..a3fd0f91d94 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -38,6 +38,7 @@ #define KVM_REQ_MMU_SYNC 7 #define KVM_REQ_KVMCLOCK_UPDATE 8 #define KVM_REQ_KICK 9 +#define KVM_REQ_DEACTIVATE_FPU 10 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 @@ -57,20 +58,20 @@ struct kvm_io_bus { struct kvm_io_device *devs[NR_IOBUS_DEVS]; }; -void kvm_io_bus_init(struct kvm_io_bus *bus); -void kvm_io_bus_destroy(struct kvm_io_bus *bus); -int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, int len, - const void *val); -int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, +enum kvm_bus { + KVM_MMIO_BUS, + KVM_PIO_BUS, + KVM_NR_BUSES +}; + +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, const void *val); +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); -int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev); -int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); -void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev); -void kvm_io_bus_unregister_dev(struct kvm *kvm, struct kvm_io_bus *bus, - struct kvm_io_device *dev); +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); struct kvm_vcpu { struct kvm *kvm; @@ -83,6 +84,8 @@ struct kvm_vcpu { struct kvm_run *run; unsigned long requests; unsigned long guest_debug; + int srcu_idx; + int fpu_active; int guest_fpu_loaded; wait_queue_head_t wq; @@ -150,14 +153,19 @@ struct kvm_irq_routing_table {}; #endif -struct kvm { - spinlock_t mmu_lock; - spinlock_t requests_lock; - struct rw_semaphore slots_lock; - struct mm_struct *mm; /* userspace tied to this vm */ +struct kvm_memslots { int nmemslots; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS]; +}; + +struct kvm { + spinlock_t mmu_lock; + raw_spinlock_t requests_lock; + struct mutex slots_lock; + struct mm_struct *mm; /* userspace tied to this vm */ + struct kvm_memslots *memslots; + struct srcu_struct srcu; #ifdef CONFIG_KVM_APIC_ARCHITECTURE u32 bsp_vcpu_id; struct kvm_vcpu *bsp_vcpu; @@ -166,8 +174,7 @@ struct kvm { atomic_t online_vcpus; struct list_head vm_list; struct mutex lock; - struct kvm_io_bus mmio_bus; - struct kvm_io_bus pio_bus; + struct kvm_io_bus *buses[KVM_NR_BUSES]; #ifdef CONFIG_HAVE_KVM_EVENTFD struct { spinlock_t lock; @@ -249,13 +256,20 @@ int kvm_set_memory_region(struct kvm *kvm, int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); -int kvm_arch_set_memory_region(struct kvm *kvm, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc); +void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, int user_alloc); void kvm_disable_largepages(void); void kvm_arch_flush_shadow(struct kvm *kvm); gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn); + struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); void kvm_release_page_clean(struct page *page); @@ -264,6 +278,9 @@ void kvm_set_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); +pfn_t gfn_to_pfn_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); +int memslot_id(struct kvm *kvm, gfn_t gfn); void kvm_release_pfn_dirty(pfn_t); void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); @@ -283,6 +300,7 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); @@ -383,6 +401,7 @@ struct kvm_assigned_dev_kernel { struct work_struct interrupt_work; struct list_head list; int assigned_dev_id; + int host_segnr; int host_busnr; int host_devfn; unsigned int entries_nr; @@ -429,8 +448,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); #define KVM_IOMMU_CACHE_COHERENCY 0x1 #ifdef CONFIG_IOMMU_API -int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, - unsigned long npages); +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); int kvm_iommu_map_guest(struct kvm *kvm); int kvm_iommu_unmap_guest(struct kvm *kvm); int kvm_assign_device(struct kvm *kvm, @@ -480,11 +498,6 @@ static inline void kvm_guest_exit(void) current->flags &= ~PF_VCPU; } -static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - return slot - kvm->memslots; -} - static inline gpa_t gfn_to_gpa(gfn_t gfn) { return (gpa_t)gfn << PAGE_SHIFT; @@ -532,6 +545,10 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se } #endif +#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION +#define unalias_gfn_instantiation unalias_gfn +#endif + #ifdef CONFIG_HAVE_KVM_IRQCHIP #define KVM_MAX_IRQ_ROUTES 1024 diff --git a/include/linux/quota.h b/include/linux/quota.h index a6861f11748..b462916b2a0 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -279,9 +279,6 @@ struct dquot { struct mem_dqblk dq_dqb; /* Diskquota usage */ }; -#define QUOTA_OK 0 -#define NO_QUOTA 1 - /* Operations which must be implemented by each quota format */ struct quota_format_ops { int (*check_quota_file)(struct super_block *sb, int type); /* Detect whether file is in our format */ @@ -295,13 +292,6 @@ struct quota_format_ops { /* Operations working with dquots */ struct dquot_operations { - int (*initialize) (struct inode *, int); - int (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); - int (*alloc_inode) (const struct inode *, qsize_t); - int (*free_space) (struct inode *, qsize_t); - int (*free_inode) (const struct inode *, qsize_t); - int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ @@ -309,12 +299,6 @@ struct dquot_operations { int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ - /* reserve quota for delayed block allocation */ - int (*reserve_space) (struct inode *, qsize_t, int); - /* claim reserved quota for delayed alloc */ - int (*claim_space) (struct inode *, qsize_t); - /* release rsved quota for delayed alloc */ - void (*release_rsv) (struct inode *, qsize_t); /* get reserved quota for delayed alloc, value returned is managed by * quota code only */ qsize_t *(*get_reserved_space) (struct inode *); @@ -324,7 +308,7 @@ struct dquot_operations { struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, char *, int); int (*quota_off)(struct super_block *, int, int); - int (*quota_sync)(struct super_block *, int); + int (*quota_sync)(struct super_block *, int, int); int (*get_info)(struct super_block *, int, struct if_dqinfo *); int (*set_info)(struct super_block *, int, struct if_dqinfo *); int (*get_dqblk)(struct super_block *, int, qid_t, struct if_dqblk *); @@ -357,26 +341,25 @@ enum { #define DQUOT_STATE_FLAGS (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED | \ DQUOT_SUSPENDED) /* Other quota flags */ -#define DQUOT_QUOTA_SYS_FILE (1 << 6) /* Quota file is a special +#define DQUOT_STATE_LAST (_DQUOT_STATE_FLAGS * MAXQUOTAS) +#define DQUOT_QUOTA_SYS_FILE (1 << DQUOT_STATE_LAST) + /* Quota file is a special * system file and user cannot * touch it. Filesystem is * responsible for setting * S_NOQUOTA, S_NOATIME flags */ -#define DQUOT_NEGATIVE_USAGE (1 << 7) /* Allow negative quota usage */ +#define DQUOT_NEGATIVE_USAGE (1 << (DQUOT_STATE_LAST + 1)) + /* Allow negative quota usage */ static inline unsigned int dquot_state_flag(unsigned int flags, int type) { - if (type == USRQUOTA) - return flags; - return flags << _DQUOT_STATE_FLAGS; + return flags << _DQUOT_STATE_FLAGS * type; } static inline unsigned int dquot_generic_flag(unsigned int flags, int type) { - if (type == USRQUOTA) - return flags; - return flags >> _DQUOT_STATE_FLAGS; + return (flags >> _DQUOT_STATE_FLAGS * type) & DQUOT_STATE_FLAGS; } #ifdef CONFIG_QUOTA_NETLINK_INTERFACE diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 3ebb2315364..e6fa7acce29 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -19,15 +19,12 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) /* * declaration of quota_function calls in kernel. */ -void sync_quota_sb(struct super_block *sb, int type); -static inline void writeout_quota_sb(struct super_block *sb, int type) -{ - if (sb->s_qcop->quota_sync) - sb->s_qcop->quota_sync(sb, type); -} +void inode_add_rsv_space(struct inode *inode, qsize_t number); +void inode_claim_rsv_space(struct inode *inode, qsize_t number); +void inode_sub_rsv_space(struct inode *inode, qsize_t number); -int dquot_initialize(struct inode *inode, int type); -int dquot_drop(struct inode *inode); +void dquot_initialize(struct inode *inode); +void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, unsigned int id, int type); void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, @@ -36,24 +33,23 @@ int dquot_scan_active(struct super_block *sb, struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); -int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc); -int dquot_alloc_inode(const struct inode *inode, qsize_t number); +int __dquot_alloc_space(struct inode *inode, qsize_t number, + int warn, int reserve); +void __dquot_free_space(struct inode *inode, qsize_t number, int reserve); -int dquot_reserve_space(struct inode *inode, qsize_t number, int prealloc); -int dquot_claim_space(struct inode *inode, qsize_t number); -void dquot_release_reserved_space(struct inode *inode, qsize_t number); -qsize_t dquot_get_reserved_space(struct inode *inode); +int dquot_alloc_inode(const struct inode *inode); -int dquot_free_space(struct inode *inode, qsize_t number); -int dquot_free_inode(const struct inode *inode, qsize_t number); +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); +void dquot_free_inode(const struct inode *inode); -int dquot_transfer(struct inode *inode, struct iattr *iattr); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); int dquot_commit_info(struct super_block *sb, int type); int dquot_mark_dquot_dirty(struct dquot *dquot); +int dquot_file_open(struct inode *inode, struct file *file); + int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path, int remount); int vfs_quota_enable(struct inode *inode, int type, int format_id, @@ -64,14 +60,13 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int vfs_quota_off(struct super_block *sb, int type, int remount); int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags); -int vfs_quota_sync(struct super_block *sb, int type); +int vfs_quota_sync(struct super_block *sb, int type, int wait); int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); -void vfs_dq_drop(struct inode *inode); -int vfs_dq_transfer(struct inode *inode, struct iattr *iattr); +int dquot_transfer(struct inode *inode, struct iattr *iattr); int vfs_dq_quota_on_remount(struct super_block *sb); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) @@ -83,53 +78,56 @@ static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) * Functions for checking status of quota */ -static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) +static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_USAGE_ENABLED, type); } -static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type) +static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_LIMITS_ENABLED, type); } -static inline int sb_has_quota_suspended(struct super_block *sb, int type) +static inline bool sb_has_quota_suspended(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_SUSPENDED, type); } -static inline int sb_any_quota_suspended(struct super_block *sb) +static inline unsigned sb_any_quota_suspended(struct super_block *sb) { - return sb_has_quota_suspended(sb, USRQUOTA) || - sb_has_quota_suspended(sb, GRPQUOTA); + unsigned type, tmsk = 0; + for (type = 0; type < MAXQUOTAS; type++) + tmsk |= sb_has_quota_suspended(sb, type) << type; + return tmsk; } /* Does kernel know about any quota information for given sb + type? */ -static inline int sb_has_quota_loaded(struct super_block *sb, int type) +static inline bool sb_has_quota_loaded(struct super_block *sb, int type) { /* Currently if anything is on, then quota usage is on as well */ return sb_has_quota_usage_enabled(sb, type); } -static inline int sb_any_quota_loaded(struct super_block *sb) +static inline unsigned sb_any_quota_loaded(struct super_block *sb) { - return sb_has_quota_loaded(sb, USRQUOTA) || - sb_has_quota_loaded(sb, GRPQUOTA); + unsigned type, tmsk = 0; + for (type = 0; type < MAXQUOTAS; type++) + tmsk |= sb_has_quota_loaded(sb, type) << type; + return tmsk; } -static inline int sb_has_quota_active(struct super_block *sb, int type) +static inline bool sb_has_quota_active(struct super_block *sb, int type) { return sb_has_quota_loaded(sb, type) && !sb_has_quota_suspended(sb, type); } -static inline int sb_any_quota_active(struct super_block *sb) +static inline unsigned sb_any_quota_active(struct super_block *sb) { - return sb_has_quota_active(sb, USRQUOTA) || - sb_has_quota_active(sb, GRPQUOTA); + return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); } /* @@ -141,122 +139,6 @@ extern const struct quotactl_ops vfs_quotactl_ops; #define sb_dquot_ops (&dquot_operations) #define sb_quotactl_ops (&vfs_quotactl_ops) -/* It is better to call this function outside of any transaction as it might - * need a lot of space in journal for dquot structure allocation. */ -static inline void vfs_dq_init(struct inode *inode) -{ - BUG_ON(!inode->i_sb); - if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) - inode->i_sb->dq_op->initialize(inode, -1); -} - -/* The following allocation/freeing/transfer functions *must* be called inside - * a transaction (deadlocks possible otherwise) */ -static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA) - return 1; - } - else - inode_add_bytes(inode, nr); - return 0; -} - -static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr) -{ - int ret; - if (!(ret = vfs_dq_prealloc_space_nodirty(inode, nr))) - mark_inode_dirty(inode); - return ret; -} - -static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA) - return 1; - } - else - inode_add_bytes(inode, nr); - return 0; -} - -static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) -{ - int ret; - if (!(ret = vfs_dq_alloc_space_nodirty(inode, nr))) - mark_inode_dirty(inode); - return ret; -} - -static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - /* Used space is updated in alloc_space() */ - if (inode->i_sb->dq_op->reserve_space(inode, nr, 0) == NO_QUOTA) - return 1; - } - return 0; -} - -static inline int vfs_dq_alloc_inode(struct inode *inode) -{ - if (sb_any_quota_active(inode->i_sb)) { - vfs_dq_init(inode); - if (inode->i_sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) - return 1; - } - return 0; -} - -/* - * Convert in-memory reserved quotas to real consumed quotas - */ -static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) { - if (inode->i_sb->dq_op->claim_space(inode, nr) == NO_QUOTA) - return 1; - } else - inode_add_bytes(inode, nr); - - mark_inode_dirty(inode); - return 0; -} - -/* - * Release reserved (in-memory) quotas - */ -static inline -void vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->release_rsv(inode, nr); -} - -static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->free_space(inode, nr); - else - inode_sub_bytes(inode, nr); -} - -static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) -{ - vfs_dq_free_space_nodirty(inode, nr); - mark_inode_dirty(inode); -} - -static inline void vfs_dq_free_inode(struct inode *inode) -{ - if (sb_any_quota_active(inode->i_sb)) - inode->i_sb->dq_op->free_inode(inode, 1); -} - /* Cannot be called inside a transaction */ static inline int vfs_dq_off(struct super_block *sb, int remount) { @@ -316,28 +198,20 @@ static inline int sb_any_quota_active(struct super_block *sb) #define sb_dquot_ops (NULL) #define sb_quotactl_ops (NULL) -static inline void vfs_dq_init(struct inode *inode) +static inline void dquot_initialize(struct inode *inode) { } -static inline void vfs_dq_drop(struct inode *inode) +static inline void dquot_drop(struct inode *inode) { } -static inline int vfs_dq_alloc_inode(struct inode *inode) +static inline int dquot_alloc_inode(const struct inode *inode) { return 0; } -static inline void vfs_dq_free_inode(struct inode *inode) -{ -} - -static inline void sync_quota_sb(struct super_block *sb, int type) -{ -} - -static inline void writeout_quota_sb(struct super_block *sb, int type) +static inline void dquot_free_inode(const struct inode *inode) { } @@ -351,110 +225,116 @@ static inline int vfs_dq_quota_on_remount(struct super_block *sb) return 0; } -static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +static inline int dquot_transfer(struct inode *inode, struct iattr *iattr) { return 0; } -static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) +static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, + int warn, int reserve) { - inode_add_bytes(inode, nr); + if (!reserve) + inode_add_bytes(inode, number); return 0; } -static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr) +static inline void __dquot_free_space(struct inode *inode, qsize_t number, + int reserve) { - vfs_dq_prealloc_space_nodirty(inode, nr); - mark_inode_dirty(inode); - return 0; + if (!reserve) + inode_sub_bytes(inode, number); } -static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { - inode_add_bytes(inode, nr); + inode_add_bytes(inode, number); return 0; } -static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) +#define dquot_file_open generic_file_open + +#endif /* CONFIG_QUOTA */ + +static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { - vfs_dq_alloc_space_nodirty(inode, nr); - mark_inode_dirty(inode); - return 0; + return __dquot_alloc_space(inode, nr, 1, 0); } -static inline int vfs_dq_reserve_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { - return 0; + int ret; + + ret = dquot_alloc_space_nodirty(inode, nr); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_claim_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { - return vfs_dq_alloc_space(inode, nr); + return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } -static inline -int vfs_dq_release_reservation_space(struct inode *inode, qsize_t nr) +static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { - return 0; + return dquot_alloc_space(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { - inode_sub_bytes(inode, nr); + return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0, 0); } -static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) +static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { - vfs_dq_free_space_nodirty(inode, nr); - mark_inode_dirty(inode); -} - -#endif /* CONFIG_QUOTA */ + int ret; -static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr) -{ - return vfs_dq_prealloc_space_nodirty(inode, nr << inode->i_blkbits); + ret = dquot_prealloc_block_nodirty(inode, nr); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_prealloc_block(struct inode *inode, qsize_t nr) +static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { - return vfs_dq_prealloc_space(inode, nr << inode->i_blkbits); + return __dquot_alloc_space(inode, nr << inode->i_blkbits, 1, 1); } -static inline int vfs_dq_alloc_block_nodirty(struct inode *inode, qsize_t nr) +static inline int dquot_claim_block(struct inode *inode, qsize_t nr) { - return vfs_dq_alloc_space_nodirty(inode, nr << inode->i_blkbits); -} + int ret; -static inline int vfs_dq_alloc_block(struct inode *inode, qsize_t nr) -{ - return vfs_dq_alloc_space(inode, nr << inode->i_blkbits); + ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); + if (!ret) + mark_inode_dirty(inode); + return ret; } -static inline int vfs_dq_reserve_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { - return vfs_dq_reserve_space(inode, nr << inode->i_blkbits); + __dquot_free_space(inode, nr, 0); } -static inline int vfs_dq_claim_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_space(struct inode *inode, qsize_t nr) { - return vfs_dq_claim_space(inode, nr << inode->i_blkbits); + dquot_free_space_nodirty(inode, nr); + mark_inode_dirty(inode); } -static inline -void vfs_dq_release_reservation_block(struct inode *inode, qsize_t nr) +static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { - vfs_dq_release_reservation_space(inode, nr << inode->i_blkbits); + dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_block_nodirty(struct inode *inode, qsize_t nr) +static inline void dquot_free_block(struct inode *inode, qsize_t nr) { - vfs_dq_free_space_nodirty(inode, nr << inode->i_blkbits); + dquot_free_space(inode, nr << inode->i_blkbits); } -static inline void vfs_dq_free_block(struct inode *inode, qsize_t nr) +static inline void dquot_release_reservation_block(struct inode *inode, + qsize_t nr) { - vfs_dq_free_space(inode, nr << inode->i_blkbits); + __dquot_free_space(inode, nr << inode->i_blkbits, 1); } #endif /* _LINUX_QUOTAOPS_ */ diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 1ba3cf6edfb..3b603f47418 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -2034,7 +2034,7 @@ void reiserfs_read_locked_inode(struct inode *inode, int reiserfs_find_actor(struct inode *inode, void *p); int reiserfs_init_locked_inode(struct inode *inode, void *p); void reiserfs_delete_inode(struct inode *inode); -int reiserfs_write_inode(struct inode *inode, int); +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc); int reiserfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create); struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d0b6cd3afb2..2aa6aa3e8f6 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -874,6 +874,107 @@ TRACE_EVENT(ext4_forget, __entry->mode, __entry->is_metadata, __entry->block) ); +TRACE_EVENT(ext4_da_update_reserve_space, + TP_PROTO(struct inode *inode, int used_blocks), + + TP_ARGS(inode, used_blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( __u64, i_blocks ) + __field( int, used_blocks ) + __field( int, reserved_data_blocks ) + __field( int, reserved_meta_blocks ) + __field( int, allocated_meta_blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->i_blocks = inode->i_blocks; + __entry->used_blocks = used_blocks; + __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; + __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; + __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; + ), + + TP_printk("dev %s ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, (unsigned long long) __entry->i_blocks, + __entry->used_blocks, __entry->reserved_data_blocks, + __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) +); + +TRACE_EVENT(ext4_da_reserve_space, + TP_PROTO(struct inode *inode, int md_needed), + + TP_ARGS(inode, md_needed), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( __u64, i_blocks ) + __field( int, md_needed ) + __field( int, reserved_data_blocks ) + __field( int, reserved_meta_blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->i_blocks = inode->i_blocks; + __entry->md_needed = md_needed; + __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; + __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; + ), + + TP_printk("dev %s ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, (unsigned long long) __entry->i_blocks, + __entry->md_needed, __entry->reserved_data_blocks, + __entry->reserved_meta_blocks) +); + +TRACE_EVENT(ext4_da_release_space, + TP_PROTO(struct inode *inode, int freed_blocks), + + TP_ARGS(inode, freed_blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( __u64, i_blocks ) + __field( int, freed_blocks ) + __field( int, reserved_data_blocks ) + __field( int, reserved_meta_blocks ) + __field( int, allocated_meta_blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->i_blocks = inode->i_blocks; + __entry->freed_blocks = freed_blocks; + __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; + __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; + __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; + ), + + TP_printk("dev %s ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, (unsigned long long) __entry->i_blocks, + __entry->freed_blocks, __entry->reserved_data_blocks, + __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) +); + + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 96b370a050d..bf16545cc97 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -199,6 +199,34 @@ TRACE_EVENT(jbd2_checkpoint_stats, __entry->forced_to_close, __entry->written, __entry->dropped) ); +TRACE_EVENT(jbd2_cleanup_journal_tail, + + TP_PROTO(journal_t *journal, tid_t first_tid, + unsigned long block_nr, unsigned long freed), + + TP_ARGS(journal, first_tid, block_nr, freed), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( tid_t, tail_sequence ) + __field( tid_t, first_tid ) + __field(unsigned long, block_nr ) + __field(unsigned long, freed ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->tail_sequence = journal->j_tail_sequence; + __entry->first_tid = first_tid; + __entry->block_nr = block_nr; + __entry->freed = freed; + ), + + TP_printk("dev %s from %u to %u offset %lu freed %lu", + jbd2_dev_to_name(__entry->dev), __entry->tail_sequence, + __entry->first_tid, __entry->block_nr, __entry->freed) +); + #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index dbe10845527..b17d49dfc3e 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -145,6 +145,47 @@ TRACE_EVENT(kvm_mmio, __entry->len, __entry->gpa, __entry->val) ); +#define kvm_fpu_load_symbol \ + {0, "unload"}, \ + {1, "load"} + +TRACE_EVENT(kvm_fpu, + TP_PROTO(int load), + TP_ARGS(load), + + TP_STRUCT__entry( + __field( u32, load ) + ), + + TP_fast_assign( + __entry->load = load; + ), + + TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol)) +); + +TRACE_EVENT(kvm_age_page, + TP_PROTO(ulong hva, struct kvm_memory_slot *slot, int ref), + TP_ARGS(hva, slot, ref), + + TP_STRUCT__entry( + __field( u64, hva ) + __field( u64, gfn ) + __field( u8, referenced ) + ), + + TP_fast_assign( + __entry->hva = hva; + __entry->gfn = + slot->base_gfn + ((hva - slot->userspace_addr) >> PAGE_SHIFT); + __entry->referenced = ref; + ), + + TP_printk("hva %llx gfn %llx %s", + __entry->hva, __entry->gfn, + __entry->referenced ? "YOUNG" : "OLD") +); + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ccec774c716..fa034d29cf7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -42,9 +42,11 @@ #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/sysctl.h> #include <linux/kdebug.h> #include <linux/memory.h> #include <linux/ftrace.h> +#include <linux/cpu.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> @@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { * stepping on the instruction on a vmalloced/kmalloced/data page * is a recipe for disaster */ -#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) - struct kprobe_insn_page { struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ - char slot_used[INSNS_PER_PAGE]; int nused; int ngarbage; + char slot_used[]; +}; + +#define KPROBE_INSN_PAGE_SIZE(slots) \ + (offsetof(struct kprobe_insn_page, slot_used) + \ + (sizeof(char) * (slots))) + +struct kprobe_insn_cache { + struct list_head pages; /* list of kprobe_insn_page */ + size_t insn_size; /* size of instruction slot */ + int nr_garbage; }; +static int slots_per_page(struct kprobe_insn_cache *c) +{ + return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); +} + enum kprobe_slot_state { SLOT_CLEAN = 0, SLOT_DIRTY = 1, SLOT_USED = 2, }; -static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ -static LIST_HEAD(kprobe_insn_pages); -static int kprobe_garbage_slots; -static int collect_garbage_slots(void); +static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ +static struct kprobe_insn_cache kprobe_insn_slots = { + .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), + .insn_size = MAX_INSN_SIZE, + .nr_garbage = 0, +}; +static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); /** * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -static kprobe_opcode_t __kprobes *__get_insn_slot(void) +static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; retry: - list_for_each_entry(kip, &kprobe_insn_pages, list) { - if (kip->nused < INSNS_PER_PAGE) { + list_for_each_entry(kip, &c->pages, list) { + if (kip->nused < slots_per_page(c)) { int i; - for (i = 0; i < INSNS_PER_PAGE; i++) { + for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; - return kip->insns + (i * MAX_INSN_SIZE); + return kip->insns + (i * c->insn_size); } } - /* Surprise! No unused slots. Fix kip->nused. */ - kip->nused = INSNS_PER_PAGE; + /* kip->nused is broken. Fix it. */ + kip->nused = slots_per_page(c); + WARN_ON(1); } } /* If there are any garbage slots, collect it and try again. */ - if (kprobe_garbage_slots && collect_garbage_slots() == 0) { + if (c->nr_garbage && collect_garbage_slots(c) == 0) goto retry; - } - /* All out of space. Need to allocate a new page. Use slot 0. */ - kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); + + /* All out of space. Need to allocate a new page. */ + kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); if (!kip) return NULL; @@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) return NULL; } INIT_LIST_HEAD(&kip->list); - list_add(&kip->list, &kprobe_insn_pages); - memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); + memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; + list_add(&kip->list, &c->pages); return kip->insns; } + kprobe_opcode_t __kprobes *get_insn_slot(void) { - kprobe_opcode_t *ret; + kprobe_opcode_t *ret = NULL; + mutex_lock(&kprobe_insn_mutex); - ret = __get_insn_slot(); + ret = __get_insn_slot(&kprobe_insn_slots); mutex_unlock(&kprobe_insn_mutex); + return ret; } @@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) * so as not to have to set it up again the * next time somebody inserts a probe. */ - if (!list_is_singular(&kprobe_insn_pages)) { + if (!list_is_singular(&kip->list)) { list_del(&kip->list); module_free(NULL, kip->insns); kfree(kip); @@ -209,51 +231,84 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) return 0; } -static int __kprobes collect_garbage_slots(void) +static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip, *next; /* Ensure no-one is interrupted on the garbages */ synchronize_sched(); - list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { + list_for_each_entry_safe(kip, next, &c->pages, list) { int i; if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ - for (i = 0; i < INSNS_PER_PAGE; i++) { + for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i)) break; } } - kprobe_garbage_slots = 0; + c->nr_garbage = 0; return 0; } -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) +static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; - mutex_lock(&kprobe_insn_mutex); - list_for_each_entry(kip, &kprobe_insn_pages, list) { - if (kip->insns <= slot && - slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { - int i = (slot - kip->insns) / MAX_INSN_SIZE; + list_for_each_entry(kip, &c->pages, list) { + long idx = ((long)slot - (long)kip->insns) / c->insn_size; + if (idx >= 0 && idx < slots_per_page(c)) { + WARN_ON(kip->slot_used[idx] != SLOT_USED); if (dirty) { - kip->slot_used[i] = SLOT_DIRTY; + kip->slot_used[idx] = SLOT_DIRTY; kip->ngarbage++; + if (++c->nr_garbage > slots_per_page(c)) + collect_garbage_slots(c); } else - collect_one_slot(kip, i); - break; + collect_one_slot(kip, idx); + return; } } + /* Could not free this slot. */ + WARN_ON(1); +} - if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) - collect_garbage_slots(); - +void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) +{ + mutex_lock(&kprobe_insn_mutex); + __free_insn_slot(&kprobe_insn_slots, slot, dirty); mutex_unlock(&kprobe_insn_mutex); } +#ifdef CONFIG_OPTPROBES +/* For optimized_kprobe buffer */ +static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ +static struct kprobe_insn_cache kprobe_optinsn_slots = { + .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), + /* .insn_size is initialized later */ + .nr_garbage = 0, +}; +/* Get a slot for optimized_kprobe buffer */ +kprobe_opcode_t __kprobes *get_optinsn_slot(void) +{ + kprobe_opcode_t *ret = NULL; + + mutex_lock(&kprobe_optinsn_mutex); + ret = __get_insn_slot(&kprobe_optinsn_slots); + mutex_unlock(&kprobe_optinsn_mutex); + + return ret; +} + +void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) +{ + mutex_lock(&kprobe_optinsn_mutex); + __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); + mutex_unlock(&kprobe_optinsn_mutex); +} +#endif #endif /* We have preemption disabled.. so it is safe to use __ versions */ @@ -284,23 +339,401 @@ struct kprobe __kprobes *get_kprobe(void *addr) if (p->addr == addr) return p; } + + return NULL; +} + +static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); + +/* Return true if the kprobe is an aggregator */ +static inline int kprobe_aggrprobe(struct kprobe *p) +{ + return p->pre_handler == aggr_pre_handler; +} + +/* + * Keep all fields in the kprobe consistent + */ +static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); + memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); +} + +#ifdef CONFIG_OPTPROBES +/* NOTE: change this value only with kprobe_mutex held */ +static bool kprobes_allow_optimization; + +/* + * Call all pre_handler on the list, but ignores its return value. + * This must be called from arch-dep optimized caller. + */ +void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe *kp; + + list_for_each_entry_rcu(kp, &p->list, list) { + if (kp->pre_handler && likely(!kprobe_disabled(kp))) { + set_kprobe_instance(kp); + kp->pre_handler(kp, regs); + } + reset_kprobe_instance(); + } +} + +/* Return true(!0) if the kprobe is ready for optimization. */ +static inline int kprobe_optready(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if (kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + return arch_prepared_optinsn(&op->optinsn); + } + + return 0; +} + +/* + * Return an optimized kprobe whose optimizing code replaces + * instructions including addr (exclude breakpoint). + */ +struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +{ + int i; + struct kprobe *p = NULL; + struct optimized_kprobe *op; + + /* Don't check i == 0, since that is a breakpoint case. */ + for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) + p = get_kprobe((void *)(addr - i)); + + if (p && kprobe_optready(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (arch_within_optimized_kprobe(op, addr)) + return p; + } + return NULL; } +/* Optimization staging list, protected by kprobe_mutex */ +static LIST_HEAD(optimizing_list); + +static void kprobe_optimizer(struct work_struct *work); +static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); +#define OPTIMIZE_DELAY 5 + +/* Kprobe jump optimizer */ +static __kprobes void kprobe_optimizer(struct work_struct *work) +{ + struct optimized_kprobe *op, *tmp; + + /* Lock modules while optimizing kprobes */ + mutex_lock(&module_mutex); + mutex_lock(&kprobe_mutex); + if (kprobes_all_disarmed || !kprobes_allow_optimization) + goto end; + + /* + * Wait for quiesence period to ensure all running interrupts + * are done. Because optprobe may modify multiple instructions + * there is a chance that Nth instruction is interrupted. In that + * case, running interrupt can return to 2nd-Nth byte of jump + * instruction. This wait is for avoiding it. + */ + synchronize_sched(); + + /* + * The optimization/unoptimization refers online_cpus via + * stop_machine() and cpu-hotplug modifies online_cpus. + * And same time, text_mutex will be held in cpu-hotplug and here. + * This combination can cause a deadlock (cpu-hotplug try to lock + * text_mutex but stop_machine can not be done because online_cpus + * has been changed) + * To avoid this deadlock, we need to call get_online_cpus() + * for preventing cpu-hotplug outside of text_mutex locking. + */ + get_online_cpus(); + mutex_lock(&text_mutex); + list_for_each_entry_safe(op, tmp, &optimizing_list, list) { + WARN_ON(kprobe_disabled(&op->kp)); + if (arch_optimize_kprobe(op) < 0) + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + list_del_init(&op->list); + } + mutex_unlock(&text_mutex); + put_online_cpus(); +end: + mutex_unlock(&kprobe_mutex); + mutex_unlock(&module_mutex); +} + +/* Optimize kprobe if p is ready to be optimized */ +static __kprobes void optimize_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + /* Check if the kprobe is disabled or not ready for optimization. */ + if (!kprobe_optready(p) || !kprobes_allow_optimization || + (kprobe_disabled(p) || kprobes_all_disarmed)) + return; + + /* Both of break_handler and post_handler are not supported. */ + if (p->break_handler || p->post_handler) + return; + + op = container_of(p, struct optimized_kprobe, kp); + + /* Check there is no other kprobes at the optimized instructions */ + if (arch_check_optimized_kprobe(op) < 0) + return; + + /* Check if it is already optimized. */ + if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) + return; + + op->kp.flags |= KPROBE_FLAG_OPTIMIZED; + list_add(&op->list, &optimizing_list); + if (!delayed_work_pending(&optimizing_work)) + schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); +} + +/* Unoptimize a kprobe if p is optimized */ +static __kprobes void unoptimize_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + else + /* Replace jump with break */ + arch_unoptimize_kprobe(op); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + } +} + +/* Remove optimized instructions */ +static void __kprobes kill_optimized_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) { + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + } + /* Don't unoptimize, because the target code will be freed. */ + arch_remove_optimized_kprobe(op); +} + +/* Try to prepare optimized instructions */ +static __kprobes void prepare_optimized_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_prepare_optimized_kprobe(op); +} + +/* Free optimized instructions and optimized_kprobe */ +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_remove_optimized_kprobe(op); + kfree(op); +} + +/* Allocate new optimized_kprobe and try to prepare optimized instructions */ +static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); + if (!op) + return NULL; + + INIT_LIST_HEAD(&op->list); + op->kp.addr = p->addr; + arch_prepare_optimized_kprobe(op); + + return &op->kp; +} + +static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); + +/* + * Prepare an optimized_kprobe and optimize it + * NOTE: p must be a normal registered kprobe + */ +static __kprobes void try_to_optimize_kprobe(struct kprobe *p) +{ + struct kprobe *ap; + struct optimized_kprobe *op; + + ap = alloc_aggr_kprobe(p); + if (!ap) + return; + + op = container_of(ap, struct optimized_kprobe, kp); + if (!arch_prepared_optinsn(&op->optinsn)) { + /* If failed to setup optimizing, fallback to kprobe */ + free_aggr_kprobe(ap); + return; + } + + init_aggr_kprobe(ap, p); + optimize_kprobe(ap); +} + +#ifdef CONFIG_SYSCTL +static void __kprobes optimize_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + /* If optimization is already allowed, just return */ + if (kprobes_allow_optimization) + return; + + kprobes_allow_optimization = true; + mutex_lock(&text_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (!kprobe_disabled(p)) + optimize_kprobe(p); + } + mutex_unlock(&text_mutex); + printk(KERN_INFO "Kprobes globally optimized\n"); +} + +static void __kprobes unoptimize_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + /* If optimization is already prohibited, just return */ + if (!kprobes_allow_optimization) + return; + + kprobes_allow_optimization = false; + printk(KERN_INFO "Kprobes globally unoptimized\n"); + get_online_cpus(); /* For avoiding text_mutex deadlock */ + mutex_lock(&text_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) { + if (!kprobe_disabled(p)) + unoptimize_kprobe(p); + } + } + + mutex_unlock(&text_mutex); + put_online_cpus(); + /* Allow all currently running kprobes to complete */ + synchronize_sched(); +} + +int sysctl_kprobes_optimization; +int proc_kprobes_optimization_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + + mutex_lock(&kprobe_mutex); + sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + + if (sysctl_kprobes_optimization) + optimize_all_kprobes(); + else + unoptimize_all_kprobes(); + mutex_unlock(&kprobe_mutex); + + return ret; +} +#endif /* CONFIG_SYSCTL */ + +static void __kprobes __arm_kprobe(struct kprobe *p) +{ + struct kprobe *old_p; + + /* Check collision with other optimized kprobes */ + old_p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(old_p)) + unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ + + arch_arm_kprobe(p); + optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ +} + +static void __kprobes __disarm_kprobe(struct kprobe *p) +{ + struct kprobe *old_p; + + unoptimize_kprobe(p); /* Try to unoptimize */ + arch_disarm_kprobe(p); + + /* If another kprobe was blocked, optimize it. */ + old_p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(old_p)) + optimize_kprobe(old_p); +} + +#else /* !CONFIG_OPTPROBES */ + +#define optimize_kprobe(p) do {} while (0) +#define unoptimize_kprobe(p) do {} while (0) +#define kill_optimized_kprobe(p) do {} while (0) +#define prepare_optimized_kprobe(p) do {} while (0) +#define try_to_optimize_kprobe(p) do {} while (0) +#define __arm_kprobe(p) arch_arm_kprobe(p) +#define __disarm_kprobe(p) arch_disarm_kprobe(p) + +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + kfree(p); +} + +static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +{ + return kzalloc(sizeof(struct kprobe), GFP_KERNEL); +} +#endif /* CONFIG_OPTPROBES */ + /* Arm a kprobe with text_mutex */ static void __kprobes arm_kprobe(struct kprobe *kp) { + /* + * Here, since __arm_kprobe() doesn't use stop_machine(), + * this doesn't cause deadlock on text_mutex. So, we don't + * need get_online_cpus(). + */ mutex_lock(&text_mutex); - arch_arm_kprobe(kp); + __arm_kprobe(kp); mutex_unlock(&text_mutex); } /* Disarm a kprobe with text_mutex */ static void __kprobes disarm_kprobe(struct kprobe *kp) { + get_online_cpus(); /* For avoiding text_mutex deadlock */ mutex_lock(&text_mutex); - arch_disarm_kprobe(kp); + __disarm_kprobe(kp); mutex_unlock(&text_mutex); + put_online_cpus(); } /* @@ -369,7 +802,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) { struct kprobe *kp; - if (p->pre_handler != aggr_pre_handler) { + if (!kprobe_aggrprobe(p)) { p->nmissed++; } else { list_for_each_entry_rcu(kp, &p->list, list) @@ -493,21 +926,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) } /* - * Keep all fields in the kprobe consistent - */ -static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) -{ - memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); - memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); -} - -/* * Add the new probe to ap->list. Fail if this is the * second jprobe at the address - two jprobes can't coexist */ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) { BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); + + if (p->break_handler || p->post_handler) + unoptimize_kprobe(ap); /* Fall back to normal kprobe */ + if (p->break_handler) { if (ap->break_handler) return -EEXIST; @@ -522,7 +950,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) ap->flags &= ~KPROBE_FLAG_DISABLED; if (!kprobes_all_disarmed) /* Arm the breakpoint again. */ - arm_kprobe(ap); + __arm_kprobe(ap); } return 0; } @@ -531,12 +959,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) * Fill in the required fields of the "manager kprobe". Replace the * earlier kprobe in the hlist with the manager kprobe */ -static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) +static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { + /* Copy p's insn slot to ap */ copy_kprobe(p, ap); flush_insn_slot(ap); ap->addr = p->addr; - ap->flags = p->flags; + ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; /* We don't care the kprobe which has gone. */ @@ -546,8 +975,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); - list_add_rcu(&p->list, &ap->list); + INIT_HLIST_NODE(&ap->hlist); + list_add_rcu(&p->list, &ap->list); hlist_replace_rcu(&p->hlist, &ap->hlist); } @@ -561,12 +991,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, int ret = 0; struct kprobe *ap = old_p; - if (old_p->pre_handler != aggr_pre_handler) { - /* If old_p is not an aggr_probe, create new aggr_kprobe. */ - ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); + if (!kprobe_aggrprobe(old_p)) { + /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ + ap = alloc_aggr_kprobe(old_p); if (!ap) return -ENOMEM; - add_aggr_kprobe(ap, old_p); + init_aggr_kprobe(ap, old_p); } if (kprobe_gone(ap)) { @@ -585,6 +1015,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, */ return ret; + /* Prepare optimized instructions if possible. */ + prepare_optimized_kprobe(ap); + /* * Clear gone flag to prevent allocating new slot again, and * set disabled flag because it is not armed yet. @@ -593,6 +1026,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | KPROBE_FLAG_DISABLED; } + /* Copy ap's insn slot to p */ copy_kprobe(ap, p); return add_new_kprobe(ap, p); } @@ -743,27 +1177,34 @@ int __kprobes register_kprobe(struct kprobe *p) p->nmissed = 0; INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); + + get_online_cpus(); /* For avoiding text_mutex deadlock. */ + mutex_lock(&text_mutex); + old_p = get_kprobe(p->addr); if (old_p) { + /* Since this may unoptimize old_p, locking text_mutex. */ ret = register_aggr_kprobe(old_p, p); goto out; } - mutex_lock(&text_mutex); ret = arch_prepare_kprobe(p); if (ret) - goto out_unlock_text; + goto out; INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); if (!kprobes_all_disarmed && !kprobe_disabled(p)) - arch_arm_kprobe(p); + __arm_kprobe(p); + + /* Try to optimize kprobe */ + try_to_optimize_kprobe(p); -out_unlock_text: - mutex_unlock(&text_mutex); out: + mutex_unlock(&text_mutex); + put_online_cpus(); mutex_unlock(&kprobe_mutex); if (probed_mod) @@ -785,7 +1226,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) return -EINVAL; if (old_p == p || - (old_p->pre_handler == aggr_pre_handler && + (kprobe_aggrprobe(old_p) && list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are @@ -793,7 +1234,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) * already have been removed. We save on flushing icache. */ if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) - disarm_kprobe(p); + disarm_kprobe(old_p); hlist_del_rcu(&old_p->hlist); } else { if (p->break_handler && !kprobe_gone(p)) @@ -809,8 +1250,13 @@ noclean: list_del_rcu(&p->list); if (!kprobe_disabled(old_p)) { try_to_disable_aggr_kprobe(old_p); - if (!kprobes_all_disarmed && kprobe_disabled(old_p)) - disarm_kprobe(old_p); + if (!kprobes_all_disarmed) { + if (kprobe_disabled(old_p)) + disarm_kprobe(old_p); + else + /* Try to optimize this probe again */ + optimize_kprobe(old_p); + } } } return 0; @@ -827,7 +1273,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) old_p = list_entry(p->list.next, struct kprobe, list); list_del(&p->list); arch_remove_kprobe(old_p); - kfree(old_p); + free_aggr_kprobe(old_p); } } @@ -1123,7 +1569,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) struct kprobe *kp; p->flags |= KPROBE_FLAG_GONE; - if (p->pre_handler == aggr_pre_handler) { + if (kprobe_aggrprobe(p)) { /* * If this is an aggr_kprobe, we have to list all the * chained probes and mark them GONE. @@ -1132,6 +1578,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) kp->flags |= KPROBE_FLAG_GONE; p->post_handler = NULL; p->break_handler = NULL; + kill_optimized_kprobe(p); } /* * Here, we can remove insn_slot safely, because no thread calls @@ -1241,6 +1688,15 @@ static int __init init_kprobes(void) } } +#if defined(CONFIG_OPTPROBES) +#if defined(__ARCH_WANT_KPROBES_INSN_SLOT) + /* Init kprobe_optinsn_slots */ + kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; +#endif + /* By default, kprobes can be optimized */ + kprobes_allow_optimization = true; +#endif + /* By default, kprobes are armed */ kprobes_all_disarmed = false; @@ -1259,7 +1715,7 @@ static int __init init_kprobes(void) #ifdef CONFIG_DEBUG_FS static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, - const char *sym, int offset,char *modname) + const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; @@ -1269,19 +1725,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, kprobe_type = "j"; else kprobe_type = "k"; + if (sym) - seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", + seq_printf(pi, "%p %s %s+0x%x %s ", p->addr, kprobe_type, sym, offset, - (modname ? modname : " "), - (kprobe_gone(p) ? "[GONE]" : ""), - ((kprobe_disabled(p) && !kprobe_gone(p)) ? - "[DISABLED]" : "")); + (modname ? modname : " ")); else - seq_printf(pi, "%p %s %p %s%s\n", - p->addr, kprobe_type, p->addr, - (kprobe_gone(p) ? "[GONE]" : ""), - ((kprobe_disabled(p) && !kprobe_gone(p)) ? - "[DISABLED]" : "")); + seq_printf(pi, "%p %s %p ", + p->addr, kprobe_type, p->addr); + + if (!pp) + pp = p; + seq_printf(pi, "%s%s%s\n", + (kprobe_gone(p) ? "[GONE]" : ""), + ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), + (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1317,11 +1775,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) hlist_for_each_entry_rcu(p, node, head, hlist) { sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); - if (p->pre_handler == aggr_pre_handler) { + if (kprobe_aggrprobe(p)) { list_for_each_entry_rcu(kp, &p->list, list) - report_probe(pi, kp, sym, offset, modname); + report_probe(pi, kp, sym, offset, modname, p); } else - report_probe(pi, p, sym, offset, modname); + report_probe(pi, p, sym, offset, modname, NULL); } preempt_enable(); return 0; @@ -1399,12 +1857,13 @@ int __kprobes enable_kprobe(struct kprobe *kp) goto out; } - if (!kprobes_all_disarmed && kprobe_disabled(p)) - arm_kprobe(p); - - p->flags &= ~KPROBE_FLAG_DISABLED; if (p != kp) kp->flags &= ~KPROBE_FLAG_DISABLED; + + if (!kprobes_all_disarmed && kprobe_disabled(p)) { + p->flags &= ~KPROBE_FLAG_DISABLED; + arm_kprobe(p); + } out: mutex_unlock(&kprobe_mutex); return ret; @@ -1424,12 +1883,13 @@ static void __kprobes arm_all_kprobes(void) if (!kprobes_all_disarmed) goto already_enabled; + /* Arming kprobes doesn't optimize kprobe itself */ mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) - arch_arm_kprobe(p); + __arm_kprobe(p); } mutex_unlock(&text_mutex); @@ -1456,16 +1916,23 @@ static void __kprobes disarm_all_kprobes(void) kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); + + /* + * Here we call get_online_cpus() for avoiding text_mutex deadlock, + * because disarming may also unoptimize kprobes. + */ + get_online_cpus(); mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - arch_disarm_kprobe(p); + __disarm_kprobe(p); } } mutex_unlock(&text_mutex); + put_online_cpus(); mutex_unlock(&kprobe_mutex); /* Allow all currently running kprobes to complete */ synchronize_sched(); diff --git a/kernel/padata.c b/kernel/padata.c index 6f9bcb8313d..93caf65ff57 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -642,6 +642,9 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, if (!pd) goto err_free_inst; + if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) + goto err_free_pd; + rcu_assign_pointer(pinst->pd, pd); pinst->wq = wq; @@ -654,12 +657,14 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, pinst->cpu_notifier.priority = 0; err = register_hotcpu_notifier(&pinst->cpu_notifier); if (err) - goto err_free_pd; + goto err_free_cpumask; mutex_init(&pinst->lock); return pinst; +err_free_cpumask: + free_cpumask_var(pinst->cpumask); err_free_pd: padata_free_pd(pd); err_free_inst: @@ -685,6 +690,7 @@ void padata_free(struct padata_instance *pinst) unregister_hotcpu_notifier(&pinst->cpu_notifier); padata_free_pd(pinst->pd); + free_cpumask_var(pinst->cpumask); kfree(pinst); } EXPORT_SYMBOL(padata_free); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 33e7a38b6eb..0ef19c614f6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,6 +50,7 @@ #include <linux/ftrace.h> #include <linux/slow-work.h> #include <linux/perf_event.h> +#include <linux/kprobes.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -1450,6 +1451,17 @@ static struct ctl_table debug_table[] = { .proc_handler = proc_dointvec }, #endif +#if defined(CONFIG_OPTPROBES) + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index 2de34075f6a..34202b1be0b 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -41,7 +41,8 @@ OPTIONS -d:: --del=:: - Delete a probe event. + Delete probe events. This accepts glob wildcards('*', '?') and character + classes(e.g. [a-z], [!A-Z]). -l:: --list:: @@ -50,17 +51,29 @@ OPTIONS -L:: --line=:: Show source code lines which can be probed. This needs an argument - which specifies a range of the source code. + which specifies a range of the source code. (see LINE SYNTAX for detail) + +-f:: +--force:: + Forcibly add events with existing name. PROBE SYNTAX ------------ Probe points are defined by following syntax. - "[EVENT=]FUNC[+OFFS|:RLN|%return][@SRC]|SRC:ALN [ARG ...]" + 1) Define event based on function name + [EVENT=]FUNC[@SRC][:RLN|+OFFS|%return|;PTN] [ARG ...] + + 2) Define event based on source file with line number + [EVENT=]SRC:ALN [ARG ...] + + 3) Define event based on source file with lazy pattern + [EVENT=]SRC;PTN [ARG ...] + 'EVENT' specifies the name of new event, if omitted, it will be set the name of the probed function. Currently, event group name is set as 'probe'. -'FUNC' specifies a probed function name, and it may have one of the following options; '+OFFS' is the offset from function entry address in bytes, 'RLN' is the relative-line number from function entry line, and '%return' means that it probes function return. In addition, 'SRC' specifies a source file which has that function. -It is also possible to specify a probe point by the source line number by using 'SRC:ALN' syntax, where 'SRC' is the source file path and 'ALN' is the line number. +'FUNC' specifies a probed function name, and it may have one of the following options; '+OFFS' is the offset from function entry address in bytes, ':RLN' is the relative-line number from function entry line, and '%return' means that it probes function return. And ';PTN' means lazy matching pattern (see LAZY MATCHING). Note that ';PTN' must be the end of the probe point definition. In addition, '@SRC' specifies a source file which has that function. +It is also possible to specify a probe point by the source line number or lazy matching by using 'SRC:ALN' or 'SRC;PTN' syntax, where 'SRC' is the source file path, ':ALN' is the line number and ';PTN' is the lazy matching pattern. 'ARG' specifies the arguments of this probe point. You can use the name of local variable, or kprobe-tracer argument format (e.g. $retval, %ax, etc). LINE SYNTAX @@ -76,6 +89,41 @@ and 'ALN2' is end line number in the file. It is also possible to specify how many lines to show by using 'NUM'. So, "source.c:100-120" shows lines between 100th to l20th in source.c file. And "func:10+20" shows 20 lines from 10th line of func function. +LAZY MATCHING +------------- + The lazy line matching is similar to glob matching but ignoring spaces in both of pattern and target. So this accepts wildcards('*', '?') and character classes(e.g. [a-z], [!A-Z]). + +e.g. + 'a=*' can matches 'a=b', 'a = b', 'a == b' and so on. + +This provides some sort of flexibility and robustness to probe point definitions against minor code changes. For example, actual 10th line of schedule() can be moved easily by modifying schedule(), but the same line matching 'rq=cpu_rq*' may still exist in the function.) + + +EXAMPLES +-------- +Display which lines in schedule() can be probed: + + ./perf probe --line schedule + +Add a probe on schedule() function 12th line with recording cpu local variable: + + ./perf probe schedule:12 cpu + or + ./perf probe --add='schedule:12 cpu' + + this will add one or more probes which has the name start with "schedule". + + Add probes on lines in schedule() function which calls update_rq_clock(). + + ./perf probe 'schedule;update_rq_clock*' + or + ./perf probe --add='schedule;update_rq_clock*' + +Delete all probes on schedule(). + + ./perf probe --del='schedule*' + + SEE ALSO -------- linkperf:perf-trace[1], linkperf:perf-record[1] diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 54a5b50ff31..2d537382c68 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -500,12 +500,12 @@ else msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]); endif -ifneq ($(shell sh -c "(echo '\#ifndef _MIPS_SZLONG'; echo '\#define _MIPS_SZLONG 0'; echo '\#endif'; echo '\#include <dwarf.h>'; echo '\#include <libdwarf.h>'; echo 'int main(void) { Dwarf_Debug dbg; Dwarf_Error err; Dwarf_Ranges *rng; dwarf_init(0, DW_DLC_READ, 0, 0, &dbg, &err); dwarf_get_ranges(dbg, 0, &rng, 0, 0, &err); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -I/usr/include/libdwarf -ldwarf -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y) - msg := $(warning No libdwarf.h found or old libdwarf.h found, disables dwarf support. Please install libdwarf-dev/libdwarf-devel >= 20081231); - BASIC_CFLAGS += -DNO_LIBDWARF +ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y) + msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/elfutils-dev); + BASIC_CFLAGS += -DNO_DWARF_SUPPORT else - BASIC_CFLAGS += -I/usr/include/libdwarf - EXTLIBS += -lelf -ldwarf + BASIC_CFLAGS += -I/usr/include/elfutils + EXTLIBS += -lelf -ldw LIB_OBJS += util/probe-finder.o endif diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index ad47bd4c50e..c30a3359234 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -128,7 +128,7 @@ static void evaluate_probe_point(struct probe_point *pp) pp->function); } -#ifndef NO_LIBDWARF +#ifndef NO_DWARF_SUPPORT static int open_vmlinux(void) { if (map__load(session.kmaps[MAP__FUNCTION], NULL) < 0) { @@ -156,14 +156,16 @@ static const char * const probe_usage[] = { "perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]", "perf probe [<options>] --del '[GROUP:]EVENT' ...", "perf probe --list", +#ifndef NO_DWARF_SUPPORT "perf probe --line 'LINEDESC'", +#endif NULL }; static const struct option options[] = { OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show parsed arguments, etc)"), -#ifndef NO_LIBDWARF +#ifndef NO_DWARF_SUPPORT OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, "file", "vmlinux pathname"), #endif @@ -172,30 +174,32 @@ static const struct option options[] = { OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.", opt_del_probe_event), OPT_CALLBACK('a', "add", NULL, -#ifdef NO_LIBDWARF - "[EVENT=]FUNC[+OFFS|%return] [ARG ...]", +#ifdef NO_DWARF_SUPPORT + "[EVENT=]FUNC[+OFF|%return] [ARG ...]", #else - "[EVENT=]FUNC[+OFFS|%return|:RLN][@SRC]|SRC:ALN [ARG ...]", + "[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT" + " [ARG ...]", #endif "probe point definition, where\n" "\t\tGROUP:\tGroup name (optional)\n" "\t\tEVENT:\tEvent name\n" "\t\tFUNC:\tFunction name\n" - "\t\tOFFS:\tOffset from function entry (in byte)\n" + "\t\tOFF:\tOffset from function entry (in byte)\n" "\t\t%return:\tPut the probe at function return\n" -#ifdef NO_LIBDWARF +#ifdef NO_DWARF_SUPPORT "\t\tARG:\tProbe argument (only \n" #else "\t\tSRC:\tSource code path\n" - "\t\tRLN:\tRelative line number from function entry.\n" - "\t\tALN:\tAbsolute line number in file.\n" + "\t\tRL:\tRelative line number from function entry.\n" + "\t\tAL:\tAbsolute line number in file.\n" + "\t\tPT:\tLazy expression of line code.\n" "\t\tARG:\tProbe argument (local variable name or\n" #endif "\t\t\tkprobe-tracer argument format.)\n", opt_add_probe_event), OPT_BOOLEAN('f', "force", &session.force_add, "forcibly add events" " with existing name"), -#ifndef NO_LIBDWARF +#ifndef NO_DWARF_SUPPORT OPT_CALLBACK('L', "line", NULL, "FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]", "Show source code lines.", opt_show_lines), @@ -223,7 +227,7 @@ static void init_vmlinux(void) int cmd_probe(int argc, const char **argv, const char *prefix __used) { int i, ret; -#ifndef NO_LIBDWARF +#ifndef NO_DWARF_SUPPORT int fd; #endif struct probe_point *pp; @@ -259,7 +263,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) return 0; } -#ifndef NO_LIBDWARF +#ifndef NO_DWARF_SUPPORT if (session.show_lines) { if (session.nr_probe != 0 || session.dellist) { pr_warning(" Error: Don't use --line with" @@ -290,9 +294,9 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) init_vmlinux(); if (session.need_dwarf) -#ifdef NO_LIBDWARF +#ifdef NO_DWARF_SUPPORT die("Debuginfo-analysis is not supported"); -#else /* !NO_LIBDWARF */ +#else /* !NO_DWARF_SUPPORT */ pr_debug("Some probes require debuginfo.\n"); fd = open_vmlinux(); @@ -312,7 +316,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) continue; lseek(fd, SEEK_SET, 0); - ret = find_probepoint(fd, pp); + ret = find_probe_point(fd, pp); if (ret > 0) continue; if (ret == 0) { /* No error but failed to find probe point. */ @@ -333,7 +337,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) close(fd); end_dwarf: -#endif /* !NO_LIBDWARF */ +#endif /* !NO_DWARF_SUPPORT */ /* Synthesize probes without dwarf */ for (i = 0; i < session.nr_probe; i++) { diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 8f056884969..c971e81e9cb 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -119,14 +119,14 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp) char c, nc = 0; /* * <Syntax> - * perf probe [EVENT=]SRC:LN - * perf probe [EVENT=]FUNC[+OFFS|%return][@SRC] + * perf probe [EVENT=]SRC[:LN|;PTN] + * perf probe [EVENT=]FUNC[@SRC][+OFFS|%return|:LN|;PAT] * * TODO:Group name support */ - ptr = strchr(arg, '='); - if (ptr) { /* Event name */ + ptr = strpbrk(arg, ";=@+%"); + if (ptr && *ptr == '=') { /* Event name */ *ptr = '\0'; tmp = ptr + 1; ptr = strchr(arg, ':'); @@ -139,7 +139,7 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp) arg = tmp; } - ptr = strpbrk(arg, ":+@%"); + ptr = strpbrk(arg, ";:+@%"); if (ptr) { nc = *ptr; *ptr++ = '\0'; @@ -156,7 +156,11 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp) while (ptr) { arg = ptr; c = nc; - ptr = strpbrk(arg, ":+@%"); + if (c == ';') { /* Lazy pattern must be the last part */ + pp->lazy_line = strdup(arg); + break; + } + ptr = strpbrk(arg, ";:+@%"); if (ptr) { nc = *ptr; *ptr++ = '\0'; @@ -165,13 +169,13 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp) case ':': /* Line number */ pp->line = strtoul(arg, &tmp, 0); if (*tmp != '\0') - semantic_error("There is non-digit charactor" - " in line number."); + semantic_error("There is non-digit char" + " in line number."); break; case '+': /* Byte offset from a symbol */ pp->offset = strtoul(arg, &tmp, 0); if (*tmp != '\0') - semantic_error("There is non-digit charactor" + semantic_error("There is non-digit character" " in offset."); break; case '@': /* File name */ @@ -179,9 +183,6 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp) semantic_error("SRC@SRC is not allowed."); pp->file = strdup(arg); DIE_IF(pp->file == NULL); - if (ptr) - semantic_error("@SRC must be the last " - "option."); break; case '%': /* Probe places */ if (strcmp(arg, "return") == 0) { @@ -196,11 +197,18 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp) } /* Exclusion check */ + if (pp->lazy_line && pp->line) + semantic_error("Lazy pattern can't be used with line number."); + + if (pp->lazy_line && pp->offset) + semantic_error("Lazy pattern can't be used with offset."); + if (pp->line && pp->offset) semantic_error("Offset can't be used with line number."); - if (!pp->line && pp->file && !pp->function) - semantic_error("File always requires line number."); + if (!pp->line && !pp->lazy_line && pp->file && !pp->function) + semantic_error("File always requires line number or " + "lazy pattern."); if (pp->offset && !pp->function) semantic_error("Offset requires an entry function."); @@ -208,11 +216,13 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp) if (pp->retprobe && !pp->function) semantic_error("Return probe requires an entry function."); - if ((pp->offset || pp->line) && pp->retprobe) - semantic_error("Offset/Line can't be used with return probe."); + if ((pp->offset || pp->line || pp->lazy_line) && pp->retprobe) + semantic_error("Offset/Line/Lazy pattern can't be used with " + "return probe."); - pr_debug("symbol:%s file:%s line:%d offset:%d, return:%d\n", - pp->function, pp->file, pp->line, pp->offset, pp->retprobe); + pr_debug("symbol:%s file:%s line:%d offset:%d return:%d lazy:%s\n", + pp->function, pp->file, pp->line, pp->offset, pp->retprobe, + pp->lazy_line); } /* Parse perf-probe event definition */ @@ -458,6 +468,8 @@ static void clear_probe_point(struct probe_point *pp) free(pp->function); if (pp->file) free(pp->file); + if (pp->lazy_line) + free(pp->lazy_line); for (i = 0; i < pp->nr_args; i++) free(pp->args[i]); if (pp->args) @@ -719,6 +731,7 @@ void del_trace_kprobe_events(struct strlist *dellist) } #define LINEBUF_SIZE 256 +#define NR_ADDITIONAL_LINES 2 static void show_one_line(FILE *fp, unsigned int l, bool skip, bool show_num) { @@ -779,5 +792,11 @@ void show_line_range(struct line_range *lr) show_one_line(fp, (l++) - lr->offset, false, false); show_one_line(fp, (l++) - lr->offset, false, true); } + + if (lr->end == INT_MAX) + lr->end = l + NR_ADDITIONAL_LINES; + while (l < lr->end && !feof(fp)) + show_one_line(fp, (l++) - lr->offset, false, false); + fclose(fp); } diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 1b2124d12f6..e77dc886760 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -32,21 +32,13 @@ #include <stdarg.h> #include <ctype.h> +#include "string.h" #include "event.h" #include "debug.h" #include "util.h" #include "probe-finder.h" -/* Dwarf_Die Linkage to parent Die */ -struct die_link { - struct die_link *parent; /* Parent die */ - Dwarf_Die die; /* Current die */ -}; - -static Dwarf_Debug __dw_debug; -static Dwarf_Error __dw_error; - /* * Generic dwarf analysis helpers */ @@ -113,281 +105,190 @@ static int strtailcmp(const char *s1, const char *s2) return 0; } -/* Find the fileno of the target file. */ -static Dwarf_Unsigned cu_find_fileno(Dwarf_Die cu_die, const char *fname) -{ - Dwarf_Signed cnt, i; - Dwarf_Unsigned found = 0; - char **srcs; - int ret; +/* Line number list operations */ - if (!fname) - return 0; +/* Add a line to line number list */ +static void line_list__add_line(struct list_head *head, unsigned int line) +{ + struct line_node *ln; + struct list_head *p; - ret = dwarf_srcfiles(cu_die, &srcs, &cnt, &__dw_error); - if (ret == DW_DLV_OK) { - for (i = 0; i < cnt && !found; i++) { - if (strtailcmp(srcs[i], fname) == 0) - found = i + 1; - dwarf_dealloc(__dw_debug, srcs[i], DW_DLA_STRING); - } - for (; i < cnt; i++) - dwarf_dealloc(__dw_debug, srcs[i], DW_DLA_STRING); - dwarf_dealloc(__dw_debug, srcs, DW_DLA_LIST); + /* Reverse search, because new line will be the last one */ + list_for_each_entry_reverse(ln, head, list) { + if (ln->line < line) { + p = &ln->list; + goto found; + } else if (ln->line == line) /* Already exist */ + return ; } - if (found) - pr_debug("found fno: %d\n", (int)found); - return found; + /* List is empty, or the smallest entry */ + p = head; +found: + pr_debug("line list: add a line %u\n", line); + ln = zalloc(sizeof(struct line_node)); + DIE_IF(ln == NULL); + ln->line = line; + INIT_LIST_HEAD(&ln->list); + list_add(&ln->list, p); } -static int cu_get_filename(Dwarf_Die cu_die, Dwarf_Unsigned fno, char **buf) +/* Check if the line in line number list */ +static int line_list__has_line(struct list_head *head, unsigned int line) { - Dwarf_Signed cnt, i; - char **srcs; - int ret = 0; - - if (!buf || !fno) - return -EINVAL; - - ret = dwarf_srcfiles(cu_die, &srcs, &cnt, &__dw_error); - if (ret == DW_DLV_OK) { - if ((Dwarf_Unsigned)cnt > fno - 1) { - *buf = strdup(srcs[fno - 1]); - ret = 0; - pr_debug("found filename: %s\n", *buf); - } else - ret = -ENOENT; - for (i = 0; i < cnt; i++) - dwarf_dealloc(__dw_debug, srcs[i], DW_DLA_STRING); - dwarf_dealloc(__dw_debug, srcs, DW_DLA_LIST); - } else - ret = -EINVAL; - return ret; + struct line_node *ln; + + /* Reverse search, because new line will be the last one */ + list_for_each_entry(ln, head, list) + if (ln->line == line) + return 1; + + return 0; } -/* Compare diename and tname */ -static int die_compare_name(Dwarf_Die dw_die, const char *tname) +/* Init line number list */ +static void line_list__init(struct list_head *head) { - char *name; - int ret; - ret = dwarf_diename(dw_die, &name, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_OK) { - ret = strcmp(tname, name); - dwarf_dealloc(__dw_debug, name, DW_DLA_STRING); - } else - ret = -1; - return ret; + INIT_LIST_HEAD(head); } -/* Check the address is in the subprogram(function). */ -static int die_within_subprogram(Dwarf_Die sp_die, Dwarf_Addr addr, - Dwarf_Signed *offs) +/* Free line number list */ +static void line_list__free(struct list_head *head) { - Dwarf_Addr lopc, hipc; - int ret; - - /* TODO: check ranges */ - ret = dwarf_lowpc(sp_die, &lopc, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_NO_ENTRY) - return 0; - ret = dwarf_highpc(sp_die, &hipc, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - if (lopc <= addr && addr < hipc) { - *offs = addr - lopc; - return 1; - } else - return 0; + struct line_node *ln; + while (!list_empty(head)) { + ln = list_first_entry(head, struct line_node, list); + list_del(&ln->list); + free(ln); + } } -/* Check the die is inlined function */ -static Dwarf_Bool die_inlined_subprogram(Dwarf_Die dw_die) +/* Dwarf wrappers */ + +/* Find the realpath of the target file. */ +static const char *cu_find_realpath(Dwarf_Die *cu_die, const char *fname) { - /* TODO: check strictly */ - Dwarf_Bool inl; + Dwarf_Files *files; + size_t nfiles, i; + const char *src; int ret; - ret = dwarf_hasattr(dw_die, DW_AT_inline, &inl, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - return inl; -} + if (!fname) + return NULL; -/* Get the offset of abstruct_origin */ -static Dwarf_Off die_get_abstract_origin(Dwarf_Die dw_die) -{ - Dwarf_Attribute attr; - Dwarf_Off cu_offs; - int ret; + ret = dwarf_getsrcfiles(cu_die, &files, &nfiles); + if (ret != 0) + return NULL; - ret = dwarf_attr(dw_die, DW_AT_abstract_origin, &attr, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - ret = dwarf_formref(attr, &cu_offs, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR); - return cu_offs; + for (i = 0; i < nfiles; i++) { + src = dwarf_filesrc(files, i, NULL, NULL); + if (strtailcmp(src, fname) == 0) + break; + } + return src; } -/* Get entry pc(or low pc, 1st entry of ranges) of the die */ -static Dwarf_Addr die_get_entrypc(Dwarf_Die dw_die) +struct __addr_die_search_param { + Dwarf_Addr addr; + Dwarf_Die *die_mem; +}; + +static int __die_search_func_cb(Dwarf_Die *fn_die, void *data) { - Dwarf_Attribute attr; - Dwarf_Addr addr; - Dwarf_Off offs; - Dwarf_Ranges *ranges; - Dwarf_Signed cnt; - int ret; + struct __addr_die_search_param *ad = data; - /* Try to get entry pc */ - ret = dwarf_attr(dw_die, DW_AT_entry_pc, &attr, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_OK) { - ret = dwarf_formaddr(attr, &addr, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR); - return addr; + if (dwarf_tag(fn_die) == DW_TAG_subprogram && + dwarf_haspc(fn_die, ad->addr)) { + memcpy(ad->die_mem, fn_die, sizeof(Dwarf_Die)); + return DWARF_CB_ABORT; } + return DWARF_CB_OK; +} - /* Try to get low pc */ - ret = dwarf_lowpc(dw_die, &addr, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_OK) - return addr; - - /* Try to get ranges */ - ret = dwarf_attr(dw_die, DW_AT_ranges, &attr, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - ret = dwarf_formref(attr, &offs, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - ret = dwarf_get_ranges(__dw_debug, offs, &ranges, &cnt, NULL, - &__dw_error); - DIE_IF(ret != DW_DLV_OK); - addr = ranges[0].dwr_addr1; - dwarf_ranges_dealloc(__dw_debug, ranges, cnt); - return addr; +/* Search a real subprogram including this line, */ +static Dwarf_Die *die_get_real_subprogram(Dwarf_Die *cu_die, Dwarf_Addr addr, + Dwarf_Die *die_mem) +{ + struct __addr_die_search_param ad; + ad.addr = addr; + ad.die_mem = die_mem; + /* dwarf_getscopes can't find subprogram. */ + if (!dwarf_getfuncs(cu_die, __die_search_func_cb, &ad, 0)) + return NULL; + else + return die_mem; } -/* - * Search a Die from Die tree. - * Note: cur_link->die should be deallocated in this function. - */ -static int __search_die_tree(struct die_link *cur_link, - int (*die_cb)(struct die_link *, void *), - void *data) +/* Similar to dwarf_getfuncs, but returns inlined_subroutine if exists. */ +static Dwarf_Die *die_get_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr, + Dwarf_Die *die_mem) { - Dwarf_Die new_die; - struct die_link new_link; + Dwarf_Die child_die; int ret; - if (!die_cb) - return 0; - - /* Check current die */ - while (!(ret = die_cb(cur_link, data))) { - /* Check child die */ - ret = dwarf_child(cur_link->die, &new_die, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_OK) { - new_link.parent = cur_link; - new_link.die = new_die; - ret = __search_die_tree(&new_link, die_cb, data); - if (ret) - break; - } + ret = dwarf_child(sp_die, die_mem); + if (ret != 0) + return NULL; - /* Move to next sibling */ - ret = dwarf_siblingof(__dw_debug, cur_link->die, &new_die, - &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - dwarf_dealloc(__dw_debug, cur_link->die, DW_DLA_DIE); - cur_link->die = new_die; - if (ret == DW_DLV_NO_ENTRY) - return 0; - } - dwarf_dealloc(__dw_debug, cur_link->die, DW_DLA_DIE); - return ret; -} + do { + if (dwarf_tag(die_mem) == DW_TAG_inlined_subroutine && + dwarf_haspc(die_mem, addr)) + return die_mem; -/* Search a die in its children's die tree */ -static int search_die_from_children(Dwarf_Die parent_die, - int (*die_cb)(struct die_link *, void *), - void *data) -{ - struct die_link new_link; - int ret; + if (die_get_inlinefunc(die_mem, addr, &child_die)) { + memcpy(die_mem, &child_die, sizeof(Dwarf_Die)); + return die_mem; + } + } while (dwarf_siblingof(die_mem, die_mem) == 0); - new_link.parent = NULL; - ret = dwarf_child(parent_die, &new_link.die, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_OK) - return __search_die_tree(&new_link, die_cb, data); - else - return 0; + return NULL; } -/* Find a locdesc corresponding to the address */ -static int attr_get_locdesc(Dwarf_Attribute attr, Dwarf_Locdesc *desc, - Dwarf_Addr addr) +/* Compare diename and tname */ +static bool die_compare_name(Dwarf_Die *dw_die, const char *tname) { - Dwarf_Signed lcnt; - Dwarf_Locdesc **llbuf; - int ret, i; - - ret = dwarf_loclist_n(attr, &llbuf, &lcnt, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - ret = DW_DLV_NO_ENTRY; - for (i = 0; i < lcnt; ++i) { - if (llbuf[i]->ld_lopc <= addr && - llbuf[i]->ld_hipc > addr) { - memcpy(desc, llbuf[i], sizeof(Dwarf_Locdesc)); - desc->ld_s = - malloc(sizeof(Dwarf_Loc) * llbuf[i]->ld_cents); - DIE_IF(desc->ld_s == NULL); - memcpy(desc->ld_s, llbuf[i]->ld_s, - sizeof(Dwarf_Loc) * llbuf[i]->ld_cents); - ret = DW_DLV_OK; - break; - } - dwarf_dealloc(__dw_debug, llbuf[i]->ld_s, DW_DLA_LOC_BLOCK); - dwarf_dealloc(__dw_debug, llbuf[i], DW_DLA_LOCDESC); - } - /* Releasing loop */ - for (; i < lcnt; ++i) { - dwarf_dealloc(__dw_debug, llbuf[i]->ld_s, DW_DLA_LOC_BLOCK); - dwarf_dealloc(__dw_debug, llbuf[i], DW_DLA_LOCDESC); - } - dwarf_dealloc(__dw_debug, llbuf, DW_DLA_LIST); - return ret; + const char *name; + name = dwarf_diename(dw_die); + DIE_IF(name == NULL); + return strcmp(tname, name); } -/* Get decl_file attribute value (file number) */ -static Dwarf_Unsigned die_get_decl_file(Dwarf_Die sp_die) +/* Get entry pc(or low pc, 1st entry of ranges) of the die */ +static Dwarf_Addr die_get_entrypc(Dwarf_Die *dw_die) { - Dwarf_Attribute attr; - Dwarf_Unsigned fno; + Dwarf_Addr epc; int ret; - ret = dwarf_attr(sp_die, DW_AT_decl_file, &attr, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - dwarf_formudata(attr, &fno, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR); - return fno; + ret = dwarf_entrypc(dw_die, &epc); + DIE_IF(ret == -1); + return epc; } -/* Get decl_line attribute value (line number) */ -static Dwarf_Unsigned die_get_decl_line(Dwarf_Die sp_die) +/* Get a variable die */ +static Dwarf_Die *die_find_variable(Dwarf_Die *sp_die, const char *name, + Dwarf_Die *die_mem) { - Dwarf_Attribute attr; - Dwarf_Unsigned lno; + Dwarf_Die child_die; + int tag; int ret; - ret = dwarf_attr(sp_die, DW_AT_decl_line, &attr, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - dwarf_formudata(attr, &lno, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR); - return lno; + ret = dwarf_child(sp_die, die_mem); + if (ret != 0) + return NULL; + + do { + tag = dwarf_tag(die_mem); + if ((tag == DW_TAG_formal_parameter || + tag == DW_TAG_variable) && + (die_compare_name(die_mem, name) == 0)) + return die_mem; + + if (die_find_variable(die_mem, name, &child_die)) { + memcpy(die_mem, &child_die, sizeof(Dwarf_Die)); + return die_mem; + } + } while (dwarf_siblingof(die_mem, die_mem) == 0); + + return NULL; } /* @@ -395,47 +296,45 @@ static Dwarf_Unsigned die_get_decl_line(Dwarf_Die sp_die) */ /* Show a location */ -static void show_location(Dwarf_Loc *loc, struct probe_finder *pf) +static void show_location(Dwarf_Op *op, struct probe_finder *pf) { - Dwarf_Small op; - Dwarf_Unsigned regn; - Dwarf_Signed offs; + unsigned int regn; + Dwarf_Word offs = 0; int deref = 0, ret; const char *regs; - op = loc->lr_atom; - + /* TODO: support CFA */ /* If this is based on frame buffer, set the offset */ - if (op == DW_OP_fbreg) { + if (op->atom == DW_OP_fbreg) { + if (pf->fb_ops == NULL) + die("The attribute of frame base is not supported.\n"); deref = 1; - offs = (Dwarf_Signed)loc->lr_number; - op = pf->fbloc.ld_s[0].lr_atom; - loc = &pf->fbloc.ld_s[0]; - } else - offs = 0; + offs = op->number; + op = &pf->fb_ops[0]; + } - if (op >= DW_OP_breg0 && op <= DW_OP_breg31) { - regn = op - DW_OP_breg0; - offs += (Dwarf_Signed)loc->lr_number; + if (op->atom >= DW_OP_breg0 && op->atom <= DW_OP_breg31) { + regn = op->atom - DW_OP_breg0; + offs += op->number; deref = 1; - } else if (op >= DW_OP_reg0 && op <= DW_OP_reg31) { - regn = op - DW_OP_reg0; - } else if (op == DW_OP_bregx) { - regn = loc->lr_number; - offs += (Dwarf_Signed)loc->lr_number2; + } else if (op->atom >= DW_OP_reg0 && op->atom <= DW_OP_reg31) { + regn = op->atom - DW_OP_reg0; + } else if (op->atom == DW_OP_bregx) { + regn = op->number; + offs += op->number2; deref = 1; - } else if (op == DW_OP_regx) { - regn = loc->lr_number; + } else if (op->atom == DW_OP_regx) { + regn = op->number; } else - die("Dwarf_OP %d is not supported.", op); + die("DW_OP %d is not supported.", op->atom); regs = get_arch_regstr(regn); if (!regs) - die("%lld exceeds max register number.", regn); + die("%u exceeds max register number.", regn); if (deref) - ret = snprintf(pf->buf, pf->len, - " %s=%+lld(%s)", pf->var, offs, regs); + ret = snprintf(pf->buf, pf->len, " %s=+%ju(%s)", + pf->var, (uintmax_t)offs, regs); else ret = snprintf(pf->buf, pf->len, " %s=%s", pf->var, regs); DIE_IF(ret < 0); @@ -443,52 +342,37 @@ static void show_location(Dwarf_Loc *loc, struct probe_finder *pf) } /* Show a variables in kprobe event format */ -static void show_variable(Dwarf_Die vr_die, struct probe_finder *pf) +static void show_variable(Dwarf_Die *vr_die, struct probe_finder *pf) { Dwarf_Attribute attr; - Dwarf_Locdesc ld; + Dwarf_Op *expr; + size_t nexpr; int ret; - ret = dwarf_attr(vr_die, DW_AT_location, &attr, &__dw_error); - if (ret != DW_DLV_OK) + if (dwarf_attr(vr_die, DW_AT_location, &attr) == NULL) goto error; - ret = attr_get_locdesc(attr, &ld, (pf->addr - pf->cu_base)); - if (ret != DW_DLV_OK) + /* TODO: handle more than 1 exprs */ + ret = dwarf_getlocation_addr(&attr, (pf->addr - pf->cu_base), + &expr, &nexpr, 1); + if (ret <= 0 || nexpr == 0) goto error; - /* TODO? */ - DIE_IF(ld.ld_cents != 1); - show_location(&ld.ld_s[0], pf); - free(ld.ld_s); - dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR); + + show_location(expr, pf); + /* *expr will be cached in libdw. Don't free it. */ return ; error: + /* TODO: Support const_value */ die("Failed to find the location of %s at this address.\n" " Perhaps, it has been optimized out.", pf->var); } -static int variable_callback(struct die_link *dlink, void *data) -{ - struct probe_finder *pf = (struct probe_finder *)data; - Dwarf_Half tag; - int ret; - - ret = dwarf_tag(dlink->die, &tag, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if ((tag == DW_TAG_formal_parameter || - tag == DW_TAG_variable) && - (die_compare_name(dlink->die, pf->var) == 0)) { - show_variable(dlink->die, pf); - return 1; - } - /* TODO: Support struct members and arrays */ - return 0; -} - /* Find a variable in a subprogram die */ -static void find_variable(Dwarf_Die sp_die, struct probe_finder *pf) +static void find_variable(Dwarf_Die *sp_die, struct probe_finder *pf) { int ret; + Dwarf_Die vr_die; + /* TODO: Support struct members and arrays */ if (!is_c_varname(pf->var)) { /* Output raw parameters */ ret = snprintf(pf->buf, pf->len, " %s", pf->var); @@ -499,58 +383,51 @@ static void find_variable(Dwarf_Die sp_die, struct probe_finder *pf) pr_debug("Searching '%s' variable in context.\n", pf->var); /* Search child die for local variables and parameters. */ - ret = search_die_from_children(sp_die, variable_callback, pf); - if (!ret) + if (!die_find_variable(sp_die, pf->var, &vr_die)) die("Failed to find '%s' in this function.", pf->var); -} - -/* Get a frame base on the address */ -static void get_current_frame_base(Dwarf_Die sp_die, struct probe_finder *pf) -{ - Dwarf_Attribute attr; - int ret; - ret = dwarf_attr(sp_die, DW_AT_frame_base, &attr, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - ret = attr_get_locdesc(attr, &pf->fbloc, (pf->addr - pf->cu_base)); - DIE_IF(ret != DW_DLV_OK); - dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR); -} - -static void free_current_frame_base(struct probe_finder *pf) -{ - free(pf->fbloc.ld_s); - memset(&pf->fbloc, 0, sizeof(Dwarf_Locdesc)); + show_variable(&vr_die, pf); } /* Show a probe point to output buffer */ -static void show_probepoint(Dwarf_Die sp_die, Dwarf_Signed offs, - struct probe_finder *pf) +static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf) { struct probe_point *pp = pf->pp; - char *name; + Dwarf_Addr eaddr; + Dwarf_Die die_mem; + const char *name; char tmp[MAX_PROBE_BUFFER]; int ret, i, len; + Dwarf_Attribute fb_attr; + size_t nops; + + /* If no real subprogram, find a real one */ + if (!sp_die || dwarf_tag(sp_die) != DW_TAG_subprogram) { + sp_die = die_get_real_subprogram(&pf->cu_die, + pf->addr, &die_mem); + if (!sp_die) + die("Probe point is not found in subprograms."); + } /* Output name of probe point */ - ret = dwarf_diename(sp_die, &name, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_OK) { - ret = snprintf(tmp, MAX_PROBE_BUFFER, "%s+%u", name, - (unsigned int)offs); + name = dwarf_diename(sp_die); + if (name) { + dwarf_entrypc(sp_die, &eaddr); + ret = snprintf(tmp, MAX_PROBE_BUFFER, "%s+%lu", name, + (unsigned long)(pf->addr - eaddr)); /* Copy the function name if possible */ if (!pp->function) { pp->function = strdup(name); - pp->offset = offs; + pp->offset = (size_t)(pf->addr - eaddr); } - dwarf_dealloc(__dw_debug, name, DW_DLA_STRING); } else { /* This function has no name. */ - ret = snprintf(tmp, MAX_PROBE_BUFFER, "0x%llx", pf->addr); + ret = snprintf(tmp, MAX_PROBE_BUFFER, "0x%jx", + (uintmax_t)pf->addr); if (!pp->function) { /* TODO: Use _stext */ pp->function = strdup(""); - pp->offset = (int)pf->addr; + pp->offset = (size_t)pf->addr; } } DIE_IF(ret < 0); @@ -558,8 +435,15 @@ static void show_probepoint(Dwarf_Die sp_die, Dwarf_Signed offs, len = ret; pr_debug("Probe point found: %s\n", tmp); + /* Get the frame base attribute/ops */ + dwarf_attr(sp_die, DW_AT_frame_base, &fb_attr); + ret = dwarf_getlocation_addr(&fb_attr, (pf->addr - pf->cu_base), + &pf->fb_ops, &nops, 1); + if (ret <= 0 || nops == 0) + pf->fb_ops = NULL; + /* Find each argument */ - get_current_frame_base(sp_die, pf); + /* TODO: use dwarf_cfi_addrframe */ for (i = 0; i < pp->nr_args; i++) { pf->var = pp->args[i]; pf->buf = &tmp[len]; @@ -567,289 +451,327 @@ static void show_probepoint(Dwarf_Die sp_die, Dwarf_Signed offs, find_variable(sp_die, pf); len += strlen(pf->buf); } - free_current_frame_base(pf); + + /* *pf->fb_ops will be cached in libdw. Don't free it. */ + pf->fb_ops = NULL; pp->probes[pp->found] = strdup(tmp); pp->found++; } -static int probeaddr_callback(struct die_link *dlink, void *data) +/* Find probe point from its line number */ +static void find_probe_point_by_line(struct probe_finder *pf) { - struct probe_finder *pf = (struct probe_finder *)data; - Dwarf_Half tag; - Dwarf_Signed offs; + Dwarf_Lines *lines; + Dwarf_Line *line; + size_t nlines, i; + Dwarf_Addr addr; + int lineno; int ret; - ret = dwarf_tag(dlink->die, &tag, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - /* Check the address is in this subprogram */ - if (tag == DW_TAG_subprogram && - die_within_subprogram(dlink->die, pf->addr, &offs)) { - show_probepoint(dlink->die, offs, pf); - return 1; + ret = dwarf_getsrclines(&pf->cu_die, &lines, &nlines); + DIE_IF(ret != 0); + + for (i = 0; i < nlines; i++) { + line = dwarf_onesrcline(lines, i); + dwarf_lineno(line, &lineno); + if (lineno != pf->lno) + continue; + + /* TODO: Get fileno from line, but how? */ + if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0) + continue; + + ret = dwarf_lineaddr(line, &addr); + DIE_IF(ret != 0); + pr_debug("Probe line found: line[%d]:%d addr:0x%jx\n", + (int)i, lineno, (uintmax_t)addr); + pf->addr = addr; + + show_probe_point(NULL, pf); + /* Continuing, because target line might be inlined. */ } - return 0; } -/* Find probe point from its line number */ -static void find_probe_point_by_line(struct probe_finder *pf) +/* Find lines which match lazy pattern */ +static int find_lazy_match_lines(struct list_head *head, + const char *fname, const char *pat) { - Dwarf_Signed cnt, i, clm; - Dwarf_Line *lines; - Dwarf_Unsigned lineno = 0; + char *fbuf, *p1, *p2; + int fd, line, nlines = 0; + struct stat st; + + fd = open(fname, O_RDONLY); + if (fd < 0) + die("failed to open %s", fname); + DIE_IF(fstat(fd, &st) < 0); + fbuf = malloc(st.st_size + 2); + DIE_IF(fbuf == NULL); + DIE_IF(read(fd, fbuf, st.st_size) < 0); + close(fd); + fbuf[st.st_size] = '\n'; /* Dummy line */ + fbuf[st.st_size + 1] = '\0'; + p1 = fbuf; + line = 1; + while ((p2 = strchr(p1, '\n')) != NULL) { + *p2 = '\0'; + if (strlazymatch(p1, pat)) { + line_list__add_line(head, line); + nlines++; + } + line++; + p1 = p2 + 1; + } + free(fbuf); + return nlines; +} + +/* Find probe points from lazy pattern */ +static void find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf) +{ + Dwarf_Lines *lines; + Dwarf_Line *line; + size_t nlines, i; Dwarf_Addr addr; - Dwarf_Unsigned fno; + Dwarf_Die die_mem; + int lineno; int ret; - ret = dwarf_srclines(pf->cu_die, &lines, &cnt, &__dw_error); - DIE_IF(ret != DW_DLV_OK); + if (list_empty(&pf->lcache)) { + /* Matching lazy line pattern */ + ret = find_lazy_match_lines(&pf->lcache, pf->fname, + pf->pp->lazy_line); + if (ret <= 0) + die("No matched lines found in %s.", pf->fname); + } + + ret = dwarf_getsrclines(&pf->cu_die, &lines, &nlines); + DIE_IF(ret != 0); + for (i = 0; i < nlines; i++) { + line = dwarf_onesrcline(lines, i); - for (i = 0; i < cnt; i++) { - ret = dwarf_line_srcfileno(lines[i], &fno, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - if (fno != pf->fno) + dwarf_lineno(line, &lineno); + if (!line_list__has_line(&pf->lcache, lineno)) continue; - ret = dwarf_lineno(lines[i], &lineno, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - if (lineno != pf->lno) + /* TODO: Get fileno from line, but how? */ + if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0) continue; - ret = dwarf_lineoff(lines[i], &clm, &__dw_error); - DIE_IF(ret != DW_DLV_OK); + ret = dwarf_lineaddr(line, &addr); + DIE_IF(ret != 0); + if (sp_die) { + /* Address filtering 1: does sp_die include addr? */ + if (!dwarf_haspc(sp_die, addr)) + continue; + /* Address filtering 2: No child include addr? */ + if (die_get_inlinefunc(sp_die, addr, &die_mem)) + continue; + } - ret = dwarf_lineaddr(lines[i], &addr, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - pr_debug("Probe line found: line[%d]:%u,%d addr:0x%llx\n", - (int)i, (unsigned)lineno, (int)clm, addr); + pr_debug("Probe line found: line[%d]:%d addr:0x%llx\n", + (int)i, lineno, (unsigned long long)addr); pf->addr = addr; - /* Search a real subprogram including this line, */ - ret = search_die_from_children(pf->cu_die, - probeaddr_callback, pf); - if (ret == 0) - die("Probe point is not found in subprograms."); + + show_probe_point(sp_die, pf); /* Continuing, because target line might be inlined. */ } - dwarf_srclines_dealloc(__dw_debug, lines, cnt); + /* TODO: deallocate lines, but how? */ +} + +static int probe_point_inline_cb(Dwarf_Die *in_die, void *data) +{ + struct probe_finder *pf = (struct probe_finder *)data; + struct probe_point *pp = pf->pp; + + if (pp->lazy_line) + find_probe_point_lazy(in_die, pf); + else { + /* Get probe address */ + pf->addr = die_get_entrypc(in_die); + pf->addr += pp->offset; + pr_debug("found inline addr: 0x%jx\n", + (uintmax_t)pf->addr); + + show_probe_point(in_die, pf); + } + + return DWARF_CB_OK; } /* Search function from function name */ -static int probefunc_callback(struct die_link *dlink, void *data) +static int probe_point_search_cb(Dwarf_Die *sp_die, void *data) { struct probe_finder *pf = (struct probe_finder *)data; struct probe_point *pp = pf->pp; - struct die_link *lk; - Dwarf_Signed offs; - Dwarf_Half tag; - int ret; - ret = dwarf_tag(dlink->die, &tag, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (tag == DW_TAG_subprogram) { - if (die_compare_name(dlink->die, pp->function) == 0) { - if (pp->line) { /* Function relative line */ - pf->fno = die_get_decl_file(dlink->die); - pf->lno = die_get_decl_line(dlink->die) - + pp->line; - find_probe_point_by_line(pf); - return 1; - } - if (die_inlined_subprogram(dlink->die)) { - /* Inlined function, save it. */ - ret = dwarf_die_CU_offset(dlink->die, - &pf->inl_offs, - &__dw_error); - DIE_IF(ret != DW_DLV_OK); - pr_debug("inline definition offset %lld\n", - pf->inl_offs); - return 0; /* Continue to search */ - } - /* Get probe address */ - pf->addr = die_get_entrypc(dlink->die); + /* Check tag and diename */ + if (dwarf_tag(sp_die) != DW_TAG_subprogram || + die_compare_name(sp_die, pp->function) != 0) + return 0; + + pf->fname = dwarf_decl_file(sp_die); + if (pp->line) { /* Function relative line */ + dwarf_decl_line(sp_die, &pf->lno); + pf->lno += pp->line; + find_probe_point_by_line(pf); + } else if (!dwarf_func_inline(sp_die)) { + /* Real function */ + if (pp->lazy_line) + find_probe_point_lazy(sp_die, pf); + else { + pf->addr = die_get_entrypc(sp_die); pf->addr += pp->offset; /* TODO: Check the address in this function */ - show_probepoint(dlink->die, pp->offset, pf); - return 1; /* Exit; no same symbol in this CU. */ - } - } else if (tag == DW_TAG_inlined_subroutine && pf->inl_offs) { - if (die_get_abstract_origin(dlink->die) == pf->inl_offs) { - /* Get probe address */ - pf->addr = die_get_entrypc(dlink->die); - pf->addr += pp->offset; - pr_debug("found inline addr: 0x%llx\n", pf->addr); - /* Inlined function. Get a real subprogram */ - for (lk = dlink->parent; lk != NULL; lk = lk->parent) { - tag = 0; - dwarf_tag(lk->die, &tag, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (tag == DW_TAG_subprogram && - !die_inlined_subprogram(lk->die)) - goto found; - } - die("Failed to find real subprogram."); -found: - /* Get offset from subprogram */ - ret = die_within_subprogram(lk->die, pf->addr, &offs); - DIE_IF(!ret); - show_probepoint(lk->die, offs, pf); - /* Continue to search */ + show_probe_point(sp_die, pf); } - } - return 0; + } else + /* Inlined function: search instances */ + dwarf_func_inline_instances(sp_die, probe_point_inline_cb, pf); + + return 1; /* Exit; no same symbol in this CU. */ } static void find_probe_point_by_func(struct probe_finder *pf) { - search_die_from_children(pf->cu_die, probefunc_callback, pf); + dwarf_getfuncs(&pf->cu_die, probe_point_search_cb, pf, 0); } /* Find a probe point */ -int find_probepoint(int fd, struct probe_point *pp) +int find_probe_point(int fd, struct probe_point *pp) { - Dwarf_Half addr_size = 0; - Dwarf_Unsigned next_cuh = 0; - int cu_number = 0, ret; struct probe_finder pf = {.pp = pp}; + int ret; + Dwarf_Off off, noff; + size_t cuhl; + Dwarf_Die *diep; + Dwarf *dbg; - ret = dwarf_init(fd, DW_DLC_READ, 0, 0, &__dw_debug, &__dw_error); - if (ret != DW_DLV_OK) + dbg = dwarf_begin(fd, DWARF_C_READ); + if (!dbg) return -ENOENT; pp->found = 0; - while (++cu_number) { - /* Search CU (Compilation Unit) */ - ret = dwarf_next_cu_header(__dw_debug, NULL, NULL, NULL, - &addr_size, &next_cuh, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_NO_ENTRY) - break; - + off = 0; + line_list__init(&pf.lcache); + /* Loop on CUs (Compilation Unit) */ + while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL)) { /* Get the DIE(Debugging Information Entry) of this CU */ - ret = dwarf_siblingof(__dw_debug, 0, &pf.cu_die, &__dw_error); - DIE_IF(ret != DW_DLV_OK); + diep = dwarf_offdie(dbg, off + cuhl, &pf.cu_die); + if (!diep) + continue; /* Check if target file is included. */ if (pp->file) - pf.fno = cu_find_fileno(pf.cu_die, pp->file); + pf.fname = cu_find_realpath(&pf.cu_die, pp->file); + else + pf.fname = NULL; - if (!pp->file || pf.fno) { + if (!pp->file || pf.fname) { /* Save CU base address (for frame_base) */ - ret = dwarf_lowpc(pf.cu_die, &pf.cu_base, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_NO_ENTRY) + ret = dwarf_lowpc(&pf.cu_die, &pf.cu_base); + if (ret != 0) pf.cu_base = 0; if (pp->function) find_probe_point_by_func(&pf); + else if (pp->lazy_line) + find_probe_point_lazy(NULL, &pf); else { pf.lno = pp->line; find_probe_point_by_line(&pf); } } - dwarf_dealloc(__dw_debug, pf.cu_die, DW_DLA_DIE); + off = noff; } - ret = dwarf_finish(__dw_debug, &__dw_error); - DIE_IF(ret != DW_DLV_OK); + line_list__free(&pf.lcache); + dwarf_end(dbg); return pp->found; } - -static void line_range_add_line(struct line_range *lr, unsigned int line) -{ - struct line_node *ln; - struct list_head *p; - - /* Reverse search, because new line will be the last one */ - list_for_each_entry_reverse(ln, &lr->line_list, list) { - if (ln->line < line) { - p = &ln->list; - goto found; - } else if (ln->line == line) /* Already exist */ - return ; - } - /* List is empty, or the smallest entry */ - p = &lr->line_list; -found: - pr_debug("Debug: add a line %u\n", line); - ln = zalloc(sizeof(struct line_node)); - DIE_IF(ln == NULL); - ln->line = line; - INIT_LIST_HEAD(&ln->list); - list_add(&ln->list, p); -} - /* Find line range from its line number */ -static void find_line_range_by_line(struct line_finder *lf) +static void find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf) { - Dwarf_Signed cnt, i; - Dwarf_Line *lines; - Dwarf_Unsigned lineno = 0; - Dwarf_Unsigned fno; + Dwarf_Lines *lines; + Dwarf_Line *line; + size_t nlines, i; Dwarf_Addr addr; + int lineno; int ret; + const char *src; + Dwarf_Die die_mem; - ret = dwarf_srclines(lf->cu_die, &lines, &cnt, &__dw_error); - DIE_IF(ret != DW_DLV_OK); + line_list__init(&lf->lr->line_list); + ret = dwarf_getsrclines(&lf->cu_die, &lines, &nlines); + DIE_IF(ret != 0); - for (i = 0; i < cnt; i++) { - ret = dwarf_line_srcfileno(lines[i], &fno, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - if (fno != lf->fno) - continue; - - ret = dwarf_lineno(lines[i], &lineno, &__dw_error); - DIE_IF(ret != DW_DLV_OK); + for (i = 0; i < nlines; i++) { + line = dwarf_onesrcline(lines, i); + ret = dwarf_lineno(line, &lineno); + DIE_IF(ret != 0); if (lf->lno_s > lineno || lf->lno_e < lineno) continue; - /* Filter line in the function address range */ - if (lf->addr_s && lf->addr_e) { - ret = dwarf_lineaddr(lines[i], &addr, &__dw_error); - DIE_IF(ret != DW_DLV_OK); - if (lf->addr_s > addr || lf->addr_e <= addr) + if (sp_die) { + /* Address filtering 1: does sp_die include addr? */ + ret = dwarf_lineaddr(line, &addr); + DIE_IF(ret != 0); + if (!dwarf_haspc(sp_die, addr)) + continue; + + /* Address filtering 2: No child include addr? */ + if (die_get_inlinefunc(sp_die, addr, &die_mem)) continue; } - line_range_add_line(lf->lr, (unsigned int)lineno); + + /* TODO: Get fileno from line, but how? */ + src = dwarf_linesrc(line, NULL, NULL); + if (strtailcmp(src, lf->fname) != 0) + continue; + + /* Copy real path */ + if (!lf->lr->path) + lf->lr->path = strdup(src); + line_list__add_line(&lf->lr->line_list, (unsigned int)lineno); } - dwarf_srclines_dealloc(__dw_debug, lines, cnt); + /* Update status */ if (!list_empty(&lf->lr->line_list)) lf->found = 1; + else { + free(lf->lr->path); + lf->lr->path = NULL; + } +} + +static int line_range_inline_cb(Dwarf_Die *in_die, void *data) +{ + find_line_range_by_line(in_die, (struct line_finder *)data); + return DWARF_CB_ABORT; /* No need to find other instances */ } /* Search function from function name */ -static int linefunc_callback(struct die_link *dlink, void *data) +static int line_range_search_cb(Dwarf_Die *sp_die, void *data) { struct line_finder *lf = (struct line_finder *)data; struct line_range *lr = lf->lr; - Dwarf_Half tag; - int ret; - ret = dwarf_tag(dlink->die, &tag, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (tag == DW_TAG_subprogram && - die_compare_name(dlink->die, lr->function) == 0) { - /* Get the address range of this function */ - ret = dwarf_highpc(dlink->die, &lf->addr_e, &__dw_error); - if (ret == DW_DLV_OK) - ret = dwarf_lowpc(dlink->die, &lf->addr_s, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_NO_ENTRY) { - lf->addr_s = 0; - lf->addr_e = 0; - } - - lf->fno = die_get_decl_file(dlink->die); - lr->offset = die_get_decl_line(dlink->die);; + if (dwarf_tag(sp_die) == DW_TAG_subprogram && + die_compare_name(sp_die, lr->function) == 0) { + lf->fname = dwarf_decl_file(sp_die); + dwarf_decl_line(sp_die, &lr->offset); + pr_debug("fname: %s, lineno:%d\n", lf->fname, lr->offset); lf->lno_s = lr->offset + lr->start; if (!lr->end) - lf->lno_e = (Dwarf_Unsigned)-1; + lf->lno_e = INT_MAX; else lf->lno_e = lr->offset + lr->end; lr->start = lf->lno_s; lr->end = lf->lno_e; - find_line_range_by_line(lf); - /* If we find a target function, this should be end. */ - lf->found = 1; + if (dwarf_func_inline(sp_die)) + dwarf_func_inline_instances(sp_die, + line_range_inline_cb, lf); + else + find_line_range_by_line(sp_die, lf); return 1; } return 0; @@ -857,55 +779,55 @@ static int linefunc_callback(struct die_link *dlink, void *data) static void find_line_range_by_func(struct line_finder *lf) { - search_die_from_children(lf->cu_die, linefunc_callback, lf); + dwarf_getfuncs(&lf->cu_die, line_range_search_cb, lf, 0); } int find_line_range(int fd, struct line_range *lr) { - Dwarf_Half addr_size = 0; - Dwarf_Unsigned next_cuh = 0; + struct line_finder lf = {.lr = lr, .found = 0}; int ret; - struct line_finder lf = {.lr = lr}; + Dwarf_Off off = 0, noff; + size_t cuhl; + Dwarf_Die *diep; + Dwarf *dbg; - ret = dwarf_init(fd, DW_DLC_READ, 0, 0, &__dw_debug, &__dw_error); - if (ret != DW_DLV_OK) + dbg = dwarf_begin(fd, DWARF_C_READ); + if (!dbg) return -ENOENT; + /* Loop on CUs (Compilation Unit) */ while (!lf.found) { - /* Search CU (Compilation Unit) */ - ret = dwarf_next_cu_header(__dw_debug, NULL, NULL, NULL, - &addr_size, &next_cuh, &__dw_error); - DIE_IF(ret == DW_DLV_ERROR); - if (ret == DW_DLV_NO_ENTRY) + ret = dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL); + if (ret != 0) break; /* Get the DIE(Debugging Information Entry) of this CU */ - ret = dwarf_siblingof(__dw_debug, 0, &lf.cu_die, &__dw_error); - DIE_IF(ret != DW_DLV_OK); + diep = dwarf_offdie(dbg, off + cuhl, &lf.cu_die); + if (!diep) + continue; /* Check if target file is included. */ if (lr->file) - lf.fno = cu_find_fileno(lf.cu_die, lr->file); + lf.fname = cu_find_realpath(&lf.cu_die, lr->file); + else + lf.fname = 0; - if (!lr->file || lf.fno) { + if (!lr->file || lf.fname) { if (lr->function) find_line_range_by_func(&lf); else { lf.lno_s = lr->start; if (!lr->end) - lf.lno_e = (Dwarf_Unsigned)-1; + lf.lno_e = INT_MAX; else lf.lno_e = lr->end; - find_line_range_by_line(&lf); + find_line_range_by_line(NULL, &lf); } - /* Get the real file path */ - if (lf.found) - cu_get_filename(lf.cu_die, lf.fno, &lr->path); } - dwarf_dealloc(__dw_debug, lf.cu_die, DW_DLA_DIE); + off = noff; } - ret = dwarf_finish(__dw_debug, &__dw_error); - DIE_IF(ret != DW_DLV_OK); + pr_debug("path: %lx\n", (unsigned long)lr->path); + dwarf_end(dbg); return lf.found; } diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h index 972b386116f..d1a651793ba 100644 --- a/tools/perf/util/probe-finder.h +++ b/tools/perf/util/probe-finder.h @@ -1,6 +1,7 @@ #ifndef _PROBE_FINDER_H #define _PROBE_FINDER_H +#include <stdbool.h> #include "util.h" #define MAX_PATH_LEN 256 @@ -20,6 +21,7 @@ struct probe_point { /* Inputs */ char *file; /* File name */ int line; /* Line number */ + char *lazy_line; /* Lazy line pattern */ char *function; /* Function name */ int offset; /* Offset bytes */ @@ -46,53 +48,46 @@ struct line_range { char *function; /* Function name */ unsigned int start; /* Start line number */ unsigned int end; /* End line number */ - unsigned int offset; /* Start line offset */ + int offset; /* Start line offset */ char *path; /* Real path name */ struct list_head line_list; /* Visible lines */ }; -#ifndef NO_LIBDWARF -extern int find_probepoint(int fd, struct probe_point *pp); +#ifndef NO_DWARF_SUPPORT +extern int find_probe_point(int fd, struct probe_point *pp); extern int find_line_range(int fd, struct line_range *lr); -/* Workaround for undefined _MIPS_SZLONG bug in libdwarf.h: */ -#ifndef _MIPS_SZLONG -# define _MIPS_SZLONG 0 -#endif - #include <dwarf.h> -#include <libdwarf.h> +#include <libdw.h> struct probe_finder { - struct probe_point *pp; /* Target probe point */ + struct probe_point *pp; /* Target probe point */ /* For function searching */ - Dwarf_Addr addr; /* Address */ - Dwarf_Unsigned fno; /* File number */ - Dwarf_Unsigned lno; /* Line number */ - Dwarf_Off inl_offs; /* Inline offset */ - Dwarf_Die cu_die; /* Current CU */ + Dwarf_Addr addr; /* Address */ + const char *fname; /* File name */ + int lno; /* Line number */ + Dwarf_Die cu_die; /* Current CU */ /* For variable searching */ - Dwarf_Addr cu_base; /* Current CU base address */ - Dwarf_Locdesc fbloc; /* Location of Current Frame Base */ - const char *var; /* Current variable name */ - char *buf; /* Current output buffer */ - int len; /* Length of output buffer */ + Dwarf_Op *fb_ops; /* Frame base attribute */ + Dwarf_Addr cu_base; /* Current CU base address */ + const char *var; /* Current variable name */ + char *buf; /* Current output buffer */ + int len; /* Length of output buffer */ + struct list_head lcache; /* Line cache for lazy match */ }; struct line_finder { - struct line_range *lr; /* Target line range */ - - Dwarf_Unsigned fno; /* File number */ - Dwarf_Unsigned lno_s; /* Start line number */ - Dwarf_Unsigned lno_e; /* End line number */ - Dwarf_Addr addr_s; /* Start address */ - Dwarf_Addr addr_e; /* End address */ - Dwarf_Die cu_die; /* Current CU */ + struct line_range *lr; /* Target line range */ + + const char *fname; /* File name */ + int lno_s; /* Start line number */ + int lno_e; /* End line number */ + Dwarf_Die cu_die; /* Current CU */ int found; }; -#endif /* NO_LIBDWARF */ +#endif /* NO_DWARF_SUPPORT */ #endif /*_PROBE_FINDER_H */ diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c index c397d4f6f74..a175949ed21 100644 --- a/tools/perf/util/string.c +++ b/tools/perf/util/string.c @@ -265,21 +265,21 @@ error: return false; } -/** - * strglobmatch - glob expression pattern matching - * @str: the target string to match - * @pat: the pattern string to match - * - * This returns true if the @str matches @pat. @pat can includes wildcards - * ('*','?') and character classes ([CHARS], complementation and ranges are - * also supported). Also, this supports escape character ('\') to use special - * characters as normal character. - * - * Note: if @pat syntax is broken, this always returns false. - */ -bool strglobmatch(const char *str, const char *pat) +/* Glob/lazy pattern matching */ +static bool __match_glob(const char *str, const char *pat, bool ignore_space) { while (*str && *pat && *pat != '*') { + if (ignore_space) { + /* Ignore spaces for lazy matching */ + if (isspace(*str)) { + str++; + continue; + } + if (isspace(*pat)) { + pat++; + continue; + } + } if (*pat == '?') { /* Matches any single character */ str++; pat++; @@ -308,3 +308,32 @@ bool strglobmatch(const char *str, const char *pat) return !*str && !*pat; } +/** + * strglobmatch - glob expression pattern matching + * @str: the target string to match + * @pat: the pattern string to match + * + * This returns true if the @str matches @pat. @pat can includes wildcards + * ('*','?') and character classes ([CHARS], complementation and ranges are + * also supported). Also, this supports escape character ('\') to use special + * characters as normal character. + * + * Note: if @pat syntax is broken, this always returns false. + */ +bool strglobmatch(const char *str, const char *pat) +{ + return __match_glob(str, pat, false); +} + +/** + * strlazymatch - matching pattern strings lazily with glob pattern + * @str: the target string to match + * @pat: the pattern string to match + * + * This is similar to strglobmatch, except this ignores spaces in + * the target string. + */ +bool strlazymatch(const char *str, const char *pat) +{ + return __match_glob(str, pat, true); +} diff --git a/tools/perf/util/string.h b/tools/perf/util/string.h index 02ede58c54b..542e44de371 100644 --- a/tools/perf/util/string.h +++ b/tools/perf/util/string.h @@ -10,6 +10,7 @@ s64 perf_atoll(const char *str); char **argv_split(const char *str, int *argcp); void argv_free(char **argv); bool strglobmatch(const char *str, const char *pat); +bool strlazymatch(const char *str, const char *pat); #define _STR(x) #x #define STR(x) _STR(x) diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index daece36c0a5..7f1178f6b83 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -12,3 +12,6 @@ config HAVE_KVM_EVENTFD config KVM_APIC_ARCHITECTURE bool + +config KVM_MMIO + bool diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index f73de631e3e..057e2cca6af 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -504,12 +504,12 @@ out: static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) { - int r = 0; + int r = 0, idx; struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; mutex_lock(&kvm->lock); - down_read(&kvm->slots_lock); + idx = srcu_read_lock(&kvm->srcu); match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, assigned_dev->assigned_dev_id); @@ -526,7 +526,8 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, r = -ENOMEM; goto out; } - dev = pci_get_bus_and_slot(assigned_dev->busnr, + dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, + assigned_dev->busnr, assigned_dev->devfn); if (!dev) { printk(KERN_INFO "%s: host device not found\n", __func__); @@ -548,6 +549,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, pci_reset_function(dev); match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_segnr = assigned_dev->segnr; match->host_busnr = assigned_dev->busnr; match->host_devfn = assigned_dev->devfn; match->flags = assigned_dev->flags; @@ -573,7 +575,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, } out: - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); mutex_unlock(&kvm->lock); return r; out_list_del: @@ -585,7 +587,7 @@ out_put: pci_dev_put(dev); out_free: kfree(match); - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); mutex_unlock(&kvm->lock); return r; } diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 04d69cd7049..5169736377a 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -92,41 +92,64 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = { int kvm_coalesced_mmio_init(struct kvm *kvm) { struct kvm_coalesced_mmio_dev *dev; + struct page *page; int ret; + ret = -ENOMEM; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto out_err; + kvm->coalesced_mmio_ring = page_address(page); + + ret = -ENOMEM; dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); if (!dev) - return -ENOMEM; + goto out_free_page; spin_lock_init(&dev->lock); kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); dev->kvm = kvm; kvm->coalesced_mmio_dev = dev; - ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev); + mutex_unlock(&kvm->slots_lock); if (ret < 0) - kfree(dev); + goto out_free_dev; + + return ret; +out_free_dev: + kfree(dev); +out_free_page: + __free_page(page); +out_err: return ret; } +void kvm_coalesced_mmio_free(struct kvm *kvm) +{ + if (kvm->coalesced_mmio_ring) + free_page((unsigned long)kvm->coalesced_mmio_ring); +} + int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, - struct kvm_coalesced_mmio_zone *zone) + struct kvm_coalesced_mmio_zone *zone) { struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; if (dev == NULL) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return -ENOBUFS; } dev->zone[dev->nb_zones] = *zone; dev->nb_zones++; - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } @@ -140,10 +163,10 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, if (dev == NULL) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); i = dev->nb_zones; - while(i) { + while (i) { z = &dev->zone[i - 1]; /* unregister all zones @@ -158,7 +181,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, i--; } - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h index 4b49f27fa31..8a5959e3535 100644 --- a/virt/kvm/coalesced_mmio.h +++ b/virt/kvm/coalesced_mmio.h @@ -1,3 +1,6 @@ +#ifndef __KVM_COALESCED_MMIO_H__ +#define __KVM_COALESCED_MMIO_H__ + /* * KVM coalesced MMIO * @@ -7,6 +10,8 @@ * */ +#ifdef CONFIG_KVM_MMIO + #define KVM_COALESCED_MMIO_ZONE_MAX 100 struct kvm_coalesced_mmio_dev { @@ -18,7 +23,17 @@ struct kvm_coalesced_mmio_dev { }; int kvm_coalesced_mmio_init(struct kvm *kvm); +void kvm_coalesced_mmio_free(struct kvm *kvm); int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone); int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone); + +#else + +static inline int kvm_coalesced_mmio_init(struct kvm *kvm) { return 0; } +static inline void kvm_coalesced_mmio_free(struct kvm *kvm) { } + +#endif + +#endif diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a9d3fc6c681..7016319b1ec 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -47,7 +47,6 @@ struct _irqfd { int gsi; struct list_head list; poll_table pt; - wait_queue_head_t *wqh; wait_queue_t wait; struct work_struct inject; struct work_struct shutdown; @@ -159,8 +158,6 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); - - irqfd->wqh = wqh; add_wait_queue(wqh, &irqfd->wait); } @@ -463,7 +460,7 @@ static int kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; - struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; + enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; struct _ioeventfd *p; struct eventfd_ctx *eventfd; int ret; @@ -508,7 +505,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) else p->wildcard = true; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); /* Verify that there isnt a match already */ if (ioeventfd_check_collision(kvm, p)) { @@ -518,18 +515,18 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) kvm_iodevice_init(&p->dev, &ioeventfd_ops); - ret = __kvm_io_bus_register_dev(bus, &p->dev); + ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); if (ret < 0) goto unlock_fail; list_add_tail(&p->list, &kvm->ioeventfds); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; unlock_fail: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); fail: kfree(p); @@ -542,7 +539,7 @@ static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; - struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; + enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; struct _ioeventfd *p, *tmp; struct eventfd_ctx *eventfd; int ret = -ENOENT; @@ -551,7 +548,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) if (IS_ERR(eventfd)) return PTR_ERR(eventfd); - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); @@ -565,13 +562,13 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) if (!p->wildcard && p->datamatch != args->datamatch) continue; - __kvm_io_bus_unregister_dev(bus, &p->dev); + kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); ioeventfd_release(p); ret = 0; break; } - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); eventfd_ctx_put(eventfd); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 38a2d20b89d..3db15a807f8 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -100,6 +100,19 @@ static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) return injected; } +static void update_handled_vectors(struct kvm_ioapic *ioapic) +{ + DECLARE_BITMAP(handled_vectors, 256); + int i; + + memset(handled_vectors, 0, sizeof(handled_vectors)); + for (i = 0; i < IOAPIC_NUM_PINS; ++i) + __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); + memcpy(ioapic->handled_vectors, handled_vectors, + sizeof(handled_vectors)); + smp_wmb(); +} + static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; @@ -134,6 +147,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits |= (u32) val; e->fields.remote_irr = 0; } + update_handled_vectors(ioapic); mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); @@ -241,6 +255,9 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; + smp_rmb(); + if (!test_bit(vector, ioapic->handled_vectors)) + return; mutex_lock(&ioapic->lock); __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); mutex_unlock(&ioapic->lock); @@ -352,6 +369,7 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->ioregsel = 0; ioapic->irr = 0; ioapic->id = 0; + update_handled_vectors(ioapic); } static const struct kvm_io_device_ops ioapic_mmio_ops = { @@ -372,13 +390,28 @@ int kvm_ioapic_init(struct kvm *kvm) kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; - ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev); - if (ret < 0) + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) { + kvm->arch.vioapic = NULL; kfree(ioapic); + } return ret; } +void kvm_ioapic_destroy(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + + if (ioapic) { + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm->arch.vioapic = NULL; + kfree(ioapic); + } +} + int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); @@ -399,6 +432,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) mutex_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); + update_handled_vectors(ioapic); mutex_unlock(&ioapic->lock); return 0; } diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 419c43b667a..8a751b78a43 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -46,6 +46,7 @@ struct kvm_ioapic { struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); struct mutex lock; + DECLARE_BITMAP(handled_vectors, 256); }; #ifdef DEBUG @@ -71,6 +72,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 15147583abd..80fd3ad3b2d 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -32,10 +32,10 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm); static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); -int kvm_iommu_map_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) { - gfn_t gfn = base_gfn; + gfn_t gfn = slot->base_gfn; + unsigned long npages = slot->npages; pfn_t pfn; int i, r = 0; struct iommu_domain *domain = kvm->arch.iommu_domain; @@ -54,7 +54,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) continue; - pfn = gfn_to_pfn(kvm, gfn); + pfn = gfn_to_pfn_memslot(kvm, slot, gfn); r = iommu_map_range(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), @@ -69,17 +69,19 @@ int kvm_iommu_map_pages(struct kvm *kvm, return 0; unmap_pages: - kvm_iommu_put_pages(kvm, base_gfn, i); + kvm_iommu_put_pages(kvm, slot->base_gfn, i); return r; } static int kvm_iommu_map_memslots(struct kvm *kvm) { int i, r = 0; + struct kvm_memslots *slots; + + slots = rcu_dereference(kvm->memslots); - for (i = 0; i < kvm->nmemslots; i++) { - r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, - kvm->memslots[i].npages); + for (i = 0; i < slots->nmemslots; i++) { + r = kvm_iommu_map_pages(kvm, &slots->memslots[i]); if (r) break; } @@ -104,7 +106,8 @@ int kvm_assign_device(struct kvm *kvm, r = iommu_attach_device(domain, &pdev->dev); if (r) { - printk(KERN_ERR "assign device %x:%x.%x failed", + printk(KERN_ERR "assign device %x:%x:%x.%x failed", + pci_domain_nr(pdev->bus), pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); @@ -125,7 +128,8 @@ int kvm_assign_device(struct kvm *kvm, goto out_unmap; } - printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n", + printk(KERN_DEBUG "assign device %x:%x:%x.%x\n", + assigned_dev->host_segnr, assigned_dev->host_busnr, PCI_SLOT(assigned_dev->host_devfn), PCI_FUNC(assigned_dev->host_devfn)); @@ -152,7 +156,8 @@ int kvm_deassign_device(struct kvm *kvm, iommu_detach_device(domain, &pdev->dev); - printk(KERN_DEBUG "deassign device: host bdf = %x:%x:%x\n", + printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n", + assigned_dev->host_segnr, assigned_dev->host_busnr, PCI_SLOT(assigned_dev->host_devfn), PCI_FUNC(assigned_dev->host_devfn)); @@ -210,10 +215,13 @@ static void kvm_iommu_put_pages(struct kvm *kvm, static int kvm_iommu_unmap_memslots(struct kvm *kvm) { int i; + struct kvm_memslots *slots; + + slots = rcu_dereference(kvm->memslots); - for (i = 0; i < kvm->nmemslots; i++) { - kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, - kvm->memslots[i].npages); + for (i = 0; i < slots->nmemslots; i++) { + kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, + slots->memslots[i].npages); } return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a944be392d6..548f9253c19 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -44,6 +44,8 @@ #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/compat.h> +#include <linux/srcu.h> +#include <linux/hugetlb.h> #include <asm/processor.h> #include <asm/io.h> @@ -51,9 +53,7 @@ #include <asm/pgtable.h> #include <asm-generic/bitops/le.h> -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET #include "coalesced_mmio.h" -#endif #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> @@ -86,6 +86,8 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, static int hardware_enable_all(void); static void hardware_disable_all(void); +static void kvm_io_bus_destroy(struct kvm_io_bus *bus); + static bool kvm_rebooting; static bool largepages_enabled = true; @@ -136,7 +138,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) zalloc_cpumask_var(&cpus, GFP_ATOMIC); - spin_lock(&kvm->requests_lock); + raw_spin_lock(&kvm->requests_lock); me = smp_processor_id(); kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(req, &vcpu->requests)) @@ -151,7 +153,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) smp_call_function_many(cpus, ack_flush, NULL, 1); else called = false; - spin_unlock(&kvm->requests_lock); + raw_spin_unlock(&kvm->requests_lock); free_cpumask_var(cpus); return called; } @@ -215,7 +217,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, unsigned long address) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int need_tlb_flush; + int need_tlb_flush, idx; /* * When ->invalidate_page runs, the linux pte has been zapped @@ -235,10 +237,12 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, * pte after kvm_unmap_hva returned, without noticing the page * is going to be freed. */ + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; need_tlb_flush = kvm_unmap_hva(kvm, address); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) @@ -252,11 +256,14 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, pte_t pte) { struct kvm *kvm = mmu_notifier_to_kvm(mn); + int idx; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; kvm_set_spte_hva(kvm, address, pte); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); } static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, @@ -265,8 +272,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, unsigned long end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int need_tlb_flush = 0; + int need_tlb_flush = 0, idx; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); /* * The count increase must become visible at unlock time as no @@ -277,6 +285,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, for (; start < end; start += PAGE_SIZE) need_tlb_flush |= kvm_unmap_hva(kvm, start); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) @@ -314,11 +323,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, unsigned long address) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int young; + int young, idx; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); young = kvm_age_hva(kvm, address); spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); if (young) kvm_flush_remote_tlbs(kvm); @@ -341,15 +352,26 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; + +static int kvm_init_mmu_notifier(struct kvm *kvm) +{ + kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; + return mmu_notifier_register(&kvm->mmu_notifier, current->mm); +} + +#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ + +static int kvm_init_mmu_notifier(struct kvm *kvm) +{ + return 0; +} + #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ static struct kvm *kvm_create_vm(void) { - int r = 0; + int r = 0, i; struct kvm *kvm = kvm_arch_create_vm(); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - struct page *page; -#endif if (IS_ERR(kvm)) goto out; @@ -363,39 +385,35 @@ static struct kvm *kvm_create_vm(void) INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; + r = -ENOMEM; + kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!kvm->memslots) goto out_err; - } - kvm->coalesced_mmio_ring = - (struct kvm_coalesced_mmio_ring *)page_address(page); -#endif - -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) - { - kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; - r = mmu_notifier_register(&kvm->mmu_notifier, current->mm); - if (r) { -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - put_page(page); -#endif + if (init_srcu_struct(&kvm->srcu)) + goto out_err; + for (i = 0; i < KVM_NR_BUSES; i++) { + kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), + GFP_KERNEL); + if (!kvm->buses[i]) { + cleanup_srcu_struct(&kvm->srcu); goto out_err; } } -#endif + + r = kvm_init_mmu_notifier(kvm); + if (r) { + cleanup_srcu_struct(&kvm->srcu); + goto out_err; + } kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); spin_lock_init(&kvm->mmu_lock); - spin_lock_init(&kvm->requests_lock); - kvm_io_bus_init(&kvm->pio_bus); + raw_spin_lock_init(&kvm->requests_lock); kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); - kvm_io_bus_init(&kvm->mmio_bus); - init_rwsem(&kvm->slots_lock); + mutex_init(&kvm->slots_lock); atomic_set(&kvm->users_count, 1); spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); @@ -406,12 +424,12 @@ static struct kvm *kvm_create_vm(void) out: return kvm; -#if defined(KVM_COALESCED_MMIO_PAGE_OFFSET) || \ - (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) out_err: hardware_disable_all(); -#endif out_err_nodisable: + for (i = 0; i < KVM_NR_BUSES; i++) + kfree(kvm->buses[i]); + kfree(kvm->memslots); kfree(kvm); return ERR_PTR(r); } @@ -446,13 +464,17 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, void kvm_free_physmem(struct kvm *kvm) { int i; + struct kvm_memslots *slots = kvm->memslots; + + for (i = 0; i < slots->nmemslots; ++i) + kvm_free_physmem_slot(&slots->memslots[i], NULL); - for (i = 0; i < kvm->nmemslots; ++i) - kvm_free_physmem_slot(&kvm->memslots[i], NULL); + kfree(kvm->memslots); } static void kvm_destroy_vm(struct kvm *kvm) { + int i; struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); @@ -460,12 +482,9 @@ static void kvm_destroy_vm(struct kvm *kvm) list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); - kvm_io_bus_destroy(&kvm->pio_bus); - kvm_io_bus_destroy(&kvm->mmio_bus); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - if (kvm->coalesced_mmio_ring != NULL) - free_page((unsigned long)kvm->coalesced_mmio_ring); -#endif + for (i = 0; i < KVM_NR_BUSES; i++) + kvm_io_bus_destroy(kvm->buses[i]); + kvm_coalesced_mmio_free(kvm); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); #else @@ -512,12 +531,13 @@ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc) { - int r; + int r, flush_shadow = 0; gfn_t base_gfn; unsigned long npages; unsigned long i; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; + struct kvm_memslots *slots, *old_memslots; r = -EINVAL; /* General sanity checks */ @@ -532,7 +552,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) goto out; - memslot = &kvm->memslots[mem->slot]; + memslot = &kvm->memslots->memslots[mem->slot]; base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; npages = mem->memory_size >> PAGE_SHIFT; @@ -553,7 +573,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Check for overlaps */ r = -EEXIST; for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *s = &kvm->memslots[i]; + struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; if (s == memslot || !s->npages) continue; @@ -579,15 +599,7 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(new.rmap, 0, npages * sizeof(*new.rmap)); new.user_alloc = user_alloc; - /* - * hva_to_rmmap() serialzies with the mmu_lock and to be - * safe it has to ignore memslots with !user_alloc && - * !userspace_addr. - */ - if (user_alloc) - new.userspace_addr = mem->userspace_addr; - else - new.userspace_addr = 0; + new.userspace_addr = mem->userspace_addr; } if (!npages) goto skip_lpage; @@ -642,8 +654,9 @@ skip_lpage: if (!new.dirty_bitmap) goto out_free; memset(new.dirty_bitmap, 0, dirty_bytes); + /* destroy any largepage mappings for dirty tracking */ if (old.npages) - kvm_arch_flush_shadow(kvm); + flush_shadow = 1; } #else /* not defined CONFIG_S390 */ new.user_alloc = user_alloc; @@ -651,36 +664,72 @@ skip_lpage: new.userspace_addr = mem->userspace_addr; #endif /* not defined CONFIG_S390 */ - if (!npages) + if (!npages) { + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + if (mem->slot >= slots->nmemslots) + slots->nmemslots = mem->slot + 1; + slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; + + old_memslots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + /* From this point no new shadow pages pointing to a deleted + * memslot will be created. + * + * validation of sp->gfn happens in: + * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) + * - kvm_is_visible_gfn (mmu_check_roots) + */ kvm_arch_flush_shadow(kvm); + kfree(old_memslots); + } - spin_lock(&kvm->mmu_lock); - if (mem->slot >= kvm->nmemslots) - kvm->nmemslots = mem->slot + 1; - - *memslot = new; - spin_unlock(&kvm->mmu_lock); - - r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); - if (r) { - spin_lock(&kvm->mmu_lock); - *memslot = old; - spin_unlock(&kvm->mmu_lock); + r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); + if (r) goto out_free; - } - kvm_free_physmem_slot(&old, npages ? &new : NULL); - /* Slot deletion case: we have to update the current slot */ - spin_lock(&kvm->mmu_lock); - if (!npages) - *memslot = old; - spin_unlock(&kvm->mmu_lock); #ifdef CONFIG_DMAR /* map the pages in iommu page table */ - r = kvm_iommu_map_pages(kvm, base_gfn, npages); - if (r) - goto out; + if (npages) { + r = kvm_iommu_map_pages(kvm, &new); + if (r) + goto out_free; + } #endif + + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + if (mem->slot >= slots->nmemslots) + slots->nmemslots = mem->slot + 1; + + /* actual memory is freed via old in kvm_free_physmem_slot below */ + if (!npages) { + new.rmap = NULL; + new.dirty_bitmap = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) + new.lpage_info[i] = NULL; + } + + slots->memslots[mem->slot] = new; + old_memslots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + + kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); + + kvm_free_physmem_slot(&old, &new); + kfree(old_memslots); + + if (flush_shadow) + kvm_arch_flush_shadow(kvm); + return 0; out_free: @@ -697,9 +746,9 @@ int kvm_set_memory_region(struct kvm *kvm, { int r; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = __kvm_set_memory_region(kvm, mem, user_alloc); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } EXPORT_SYMBOL_GPL(kvm_set_memory_region); @@ -726,7 +775,7 @@ int kvm_get_dirty_log(struct kvm *kvm, if (log->slot >= KVM_MEMORY_SLOTS) goto out; - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; r = -ENOENT; if (!memslot->dirty_bitmap) goto out; @@ -780,9 +829,10 @@ EXPORT_SYMBOL_GPL(kvm_is_error_hva); struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) { int i; + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); - for (i = 0; i < kvm->nmemslots; ++i) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + for (i = 0; i < slots->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; if (gfn >= memslot->base_gfn && gfn < memslot->base_gfn + memslot->npages) @@ -801,10 +851,14 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { int i; + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); - gfn = unalias_gfn(kvm, gfn); + gfn = unalias_gfn_instantiation(kvm, gfn); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + struct kvm_memory_slot *memslot = &slots->memslots[i]; + + if (memslot->flags & KVM_MEMSLOT_INVALID) + continue; if (gfn >= memslot->base_gfn && gfn < memslot->base_gfn + memslot->npages) @@ -814,33 +868,68 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +{ + struct vm_area_struct *vma; + unsigned long addr, size; + + size = PAGE_SIZE; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return PAGE_SIZE; + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); + if (!vma) + goto out; + + size = vma_kernel_pagesize(vma); + +out: + up_read(¤t->mm->mmap_sem); + + return size; +} + +int memslot_id(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_memslots *slots = rcu_dereference(kvm->memslots); + struct kvm_memory_slot *memslot = NULL; + + gfn = unalias_gfn(kvm, gfn); + for (i = 0; i < slots->nmemslots; ++i) { + memslot = &slots->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + break; + } + + return memslot - slots->memslots; +} + unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - gfn = unalias_gfn(kvm, gfn); + gfn = unalias_gfn_instantiation(kvm, gfn); slot = gfn_to_memslot_unaliased(kvm, gfn); - if (!slot) + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return bad_hva(); return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); } EXPORT_SYMBOL_GPL(gfn_to_hva); -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr) { struct page *page[1]; - unsigned long addr; int npages; pfn_t pfn; might_sleep(); - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) { - get_page(bad_page); - return page_to_pfn(bad_page); - } - npages = get_user_pages_fast(addr, 1, 1, page); if (unlikely(npages != 1)) { @@ -865,8 +954,32 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) return pfn; } +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +{ + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + return hva_to_pfn(kvm, addr); +} EXPORT_SYMBOL_GPL(gfn_to_pfn); +static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); +} + +pfn_t gfn_to_pfn_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) +{ + unsigned long addr = gfn_to_hva_memslot(slot, gfn); + return hva_to_pfn(kvm, addr); +} + struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { pfn_t pfn; @@ -1854,12 +1967,7 @@ static struct notifier_block kvm_reboot_notifier = { .priority = 0, }; -void kvm_io_bus_init(struct kvm_io_bus *bus) -{ - memset(bus, 0, sizeof(*bus)); -} - -void kvm_io_bus_destroy(struct kvm_io_bus *bus) +static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { int i; @@ -1868,13 +1976,15 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus) kvm_iodevice_destructor(pos); } + kfree(bus); } /* kvm_io_bus_write - called under kvm->slots_lock */ -int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { int i; + struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]); for (i = 0; i < bus->dev_count; i++) if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) return 0; @@ -1882,59 +1992,71 @@ int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, } /* kvm_io_bus_read - called under kvm->slots_lock */ -int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val) +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, void *val) { int i; + struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]); + for (i = 0; i < bus->dev_count; i++) if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) return 0; return -EOPNOTSUPP; } -int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, - struct kvm_io_device *dev) +/* Caller must hold slots_lock. */ +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { - int ret; - - down_write(&kvm->slots_lock); - ret = __kvm_io_bus_register_dev(bus, dev); - up_write(&kvm->slots_lock); + struct kvm_io_bus *new_bus, *bus; - return ret; -} - -/* An unlocked version. Caller must have write lock on slots_lock. */ -int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev) -{ + bus = kvm->buses[bus_idx]; if (bus->dev_count > NR_IOBUS_DEVS-1) return -ENOSPC; - bus->devs[bus->dev_count++] = dev; + new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + new_bus->devs[new_bus->dev_count++] = dev; + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); return 0; } -void kvm_io_bus_unregister_dev(struct kvm *kvm, - struct kvm_io_bus *bus, - struct kvm_io_device *dev) +/* Caller must hold slots_lock. */ +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { - down_write(&kvm->slots_lock); - __kvm_io_bus_unregister_dev(bus, dev); - up_write(&kvm->slots_lock); -} + int i, r; + struct kvm_io_bus *new_bus, *bus; -/* An unlocked version. Caller must have write lock on slots_lock. */ -void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev) -{ - int i; + new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; - for (i = 0; i < bus->dev_count; i++) - if (bus->devs[i] == dev) { - bus->devs[i] = bus->devs[--bus->dev_count]; + bus = kvm->buses[bus_idx]; + memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); + + r = -ENOENT; + for (i = 0; i < new_bus->dev_count; i++) + if (new_bus->devs[i] == dev) { + r = 0; + new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; break; } + + if (r) { + kfree(new_bus); + return r; + } + + rcu_assign_pointer(kvm->buses[bus_idx], new_bus); + synchronize_srcu_expedited(&kvm->srcu); + kfree(bus); + return r; } static struct notifier_block kvm_cpu_notifier = { |