diff options
202 files changed, 10357 insertions, 2865 deletions
diff --git a/Documentation/devicetree/bindings/dma/atmel-dma.txt b/Documentation/devicetree/bindings/dma/atmel-dma.txt new file mode 100644 index 00000000000..3c046ee6e8b --- /dev/null +++ b/Documentation/devicetree/bindings/dma/atmel-dma.txt @@ -0,0 +1,14 @@ +* Atmel Direct Memory Access Controller (DMA) + +Required properties: +- compatible: Should be "atmel,<chip>-dma" +- reg: Should contain DMA registers location and length +- interrupts: Should contain DMA interrupt + +Examples: + +dma@ffffec00 { + compatible = "atmel,at91sam9g45-dma"; + reg = <0xffffec00 0x200>; + interrupts = <21>; +}; diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt index 94b7e0f96b3..bbe6cb3d185 100644 --- a/Documentation/dmaengine.txt +++ b/Documentation/dmaengine.txt @@ -75,6 +75,10 @@ The slave DMA usage consists of following steps: slave_sg - DMA a list of scatter gather buffers from/to a peripheral dma_cyclic - Perform a cyclic DMA operation from/to a peripheral till the operation is explicitly stopped. + interleaved_dma - This is common to Slave as well as M2M clients. For slave + address of devices' fifo could be already known to the driver. + Various types of operations could be expressed by setting + appropriate values to the 'dma_interleaved_template' members. A non-NULL return of this transfer API represents a "descriptor" for the given transaction. @@ -89,6 +93,10 @@ The slave DMA usage consists of following steps: struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, size_t period_len, enum dma_data_direction direction); + struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)( + struct dma_chan *chan, struct dma_interleaved_template *xt, + unsigned long flags); + The peripheral driver is expected to have mapped the scatterlist for the DMA operation prior to calling device_prep_slave_sg, and must keep the scatterlist mapped until the DMA operation has completed. diff --git a/MAINTAINERS b/MAINTAINERS index 2a90101309d..341dee3b02c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -745,6 +745,7 @@ M: Barry Song <baohua.song@csr.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-prima2/ +F: drivers/dma/sirf-dma* ARM/EBSA110 MACHINE SUPPORT M: Russell King <linux@arm.linux.org.uk> @@ -5846,7 +5847,7 @@ F: drivers/mmc/host/sdhci-spear.c SECURITY SUBSYSTEM M: James Morris <jmorris@namei.org> L: linux-security-module@vger.kernel.org (suggested Cc:) -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git W: http://security.wiki.kernel.org/ S: Supported F: security/ diff --git a/arch/arm/include/asm/kprobes.h b/arch/arm/include/asm/kprobes.h index feec86768f9..f82ec22eeb1 100644 --- a/arch/arm/include/asm/kprobes.h +++ b/arch/arm/include/asm/kprobes.h @@ -24,7 +24,6 @@ #define MAX_INSN_SIZE 2 #define MAX_STACK_SIZE 64 /* 32 would probably be OK */ -#define regs_return_value(regs) ((regs)->ARM_r0) #define flush_insn_slot(p) do { } while (0) #define kretprobe_blacklist_size 0 diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h index 96187ff58c2..451808ba121 100644 --- a/arch/arm/include/asm/ptrace.h +++ b/arch/arm/include/asm/ptrace.h @@ -189,6 +189,11 @@ static inline int valid_user_regs(struct pt_regs *regs) return 0; } +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->ARM_r0; +} + #define instruction_pointer(regs) (regs)->ARM_pc #ifdef CONFIG_SMP diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 0f30c3a78fc..d4c24d412a8 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -129,6 +129,7 @@ extern void vfp_flush_hwstate(struct thread_info *); /* * thread information flags: * TIF_SYSCALL_TRACE - syscall trace active + * TIF_SYSCAL_AUDIT - syscall auditing active * TIF_SIGPENDING - signal pending * TIF_NEED_RESCHED - rescheduling necessary * TIF_NOTIFY_RESUME - callback before returning to user @@ -139,6 +140,7 @@ extern void vfp_flush_hwstate(struct thread_info *); #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_SYSCALL_TRACE 8 +#define TIF_SYSCALL_AUDIT 9 #define TIF_POLLING_NRFLAG 16 #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ @@ -149,11 +151,15 @@ extern void vfp_flush_hwstate(struct thread_info *); #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) +#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +/* Checks for any syscall work in entry-common.S */ +#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT) + /* * Change these and you break ASM code in entry-common.S */ diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index b2a27b6b004..520889cf1b5 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -87,7 +87,7 @@ ENTRY(ret_from_fork) get_thread_info tsk ldr r1, [tsk, #TI_FLAGS] @ check for syscall tracing mov why, #1 - tst r1, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? + tst r1, #_TIF_SYSCALL_WORK @ are we tracing syscalls? beq ret_slow_syscall mov r1, sp mov r0, #1 @ trace exit [IP = 1] @@ -443,7 +443,7 @@ ENTRY(vector_swi) 1: #endif - tst r10, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? + tst r10, #_TIF_SYSCALL_WORK @ are we tracing syscalls? bne __sys_trace cmp scno, #NR_syscalls @ check upper syscall limit diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 483727ad689..e1d5e1929fb 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -906,11 +906,6 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) { unsigned long ip; - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return scno; - if (!(current->ptrace & PT_PTRACED)) - return scno; - /* * Save IP. IP is used to denote syscall entry/exit: * IP = 0 -> entry, = 1 -> exit @@ -918,6 +913,17 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) ip = regs->ARM_ip; regs->ARM_ip = why; + if (!ip) + audit_syscall_exit(regs); + else + audit_syscall_entry(AUDIT_ARCH_ARMEB, scno, regs->ARM_r0, + regs->ARM_r1, regs->ARM_r2, regs->ARM_r3); + + if (!test_thread_flag(TIF_SYSCALL_TRACE)) + return scno; + if (!(current->ptrace & PT_PTRACED)) + return scno; + current_thread_info()->syscall = scno; /* the 0x80 provides a way for the tracing parent to distinguish diff --git a/arch/arm/mach-ep93xx/include/mach/dma.h b/arch/arm/mach-ep93xx/include/mach/dma.h index 46d4d876e6f..e82c642fa53 100644 --- a/arch/arm/mach-ep93xx/include/mach/dma.h +++ b/arch/arm/mach-ep93xx/include/mach/dma.h @@ -37,7 +37,7 @@ */ struct ep93xx_dma_data { int port; - enum dma_data_direction direction; + enum dma_transfer_direction direction; const char *name; }; @@ -80,14 +80,14 @@ static inline bool ep93xx_dma_chan_is_m2p(struct dma_chan *chan) * channel supports given DMA direction. Only M2P channels have such * limitation, for M2M channels the direction is configurable. */ -static inline enum dma_data_direction +static inline enum dma_transfer_direction ep93xx_dma_chan_direction(struct dma_chan *chan) { if (!ep93xx_dma_chan_is_m2p(chan)) return DMA_NONE; /* even channels are for TX, odd for RX */ - return (chan->chan_id % 2 == 0) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + return (chan->chan_id % 2 == 0) ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM; } #endif /* __ASM_ARCH_DMA_H */ diff --git a/arch/arm/mach-shmobile/setup-sh7372.c b/arch/arm/mach-shmobile/setup-sh7372.c index 1ea89be63e2..6fcf304d3cd 100644 --- a/arch/arm/mach-shmobile/setup-sh7372.c +++ b/arch/arm/mach-shmobile/setup-sh7372.c @@ -445,31 +445,39 @@ static const struct sh_dmae_slave_config sh7372_dmae_slaves[] = { }, }; +#define SH7372_CHCLR 0x220 + static const struct sh_dmae_channel sh7372_dmae_channels[] = { { .offset = 0, .dmars = 0, .dmars_bit = 0, + .chclr_offset = SH7372_CHCLR + 0, }, { .offset = 0x10, .dmars = 0, .dmars_bit = 8, + .chclr_offset = SH7372_CHCLR + 0x10, }, { .offset = 0x20, .dmars = 4, .dmars_bit = 0, + .chclr_offset = SH7372_CHCLR + 0x20, }, { .offset = 0x30, .dmars = 4, .dmars_bit = 8, + .chclr_offset = SH7372_CHCLR + 0x30, }, { .offset = 0x50, .dmars = 8, .dmars_bit = 0, + .chclr_offset = SH7372_CHCLR + 0x50, }, { .offset = 0x60, .dmars = 8, .dmars_bit = 8, + .chclr_offset = SH7372_CHCLR + 0x60, } }; @@ -487,6 +495,7 @@ static struct sh_dmae_pdata dma_platform_data = { .ts_shift = ts_shift, .ts_shift_num = ARRAY_SIZE(ts_shift), .dmaor_init = DMAOR_DME, + .chclr_present = 1, }; /* Resource order important! */ @@ -494,7 +503,7 @@ static struct resource sh7372_dmae0_resources[] = { { /* Channel registers and DMAOR */ .start = 0xfe008020, - .end = 0xfe00808f, + .end = 0xfe00828f, .flags = IORESOURCE_MEM, }, { @@ -522,7 +531,7 @@ static struct resource sh7372_dmae1_resources[] = { { /* Channel registers and DMAOR */ .start = 0xfe018020, - .end = 0xfe01808f, + .end = 0xfe01828f, .flags = IORESOURCE_MEM, }, { @@ -550,7 +559,7 @@ static struct resource sh7372_dmae2_resources[] = { { /* Channel registers and DMAOR */ .start = 0xfe028020, - .end = 0xfe02808f, + .end = 0xfe02828f, .flags = IORESOURCE_MEM, }, { diff --git a/arch/arm/plat-mxc/include/mach/mx3fb.h b/arch/arm/plat-mxc/include/mach/mx3fb.h index ac24c5c4bc8..fdbe6000154 100644 --- a/arch/arm/plat-mxc/include/mach/mx3fb.h +++ b/arch/arm/plat-mxc/include/mach/mx3fb.h @@ -22,6 +22,20 @@ #define FB_SYNC_SWAP_RGB 0x04000000 #define FB_SYNC_CLK_SEL_EN 0x02000000 +/* + * Specify the way your display is connected. The IPU can arbitrarily + * map the internal colors to the external data lines. We only support + * the following mappings at the moment. + */ +enum disp_data_mapping { + /* blue -> d[0..5], green -> d[6..11], red -> d[12..17] */ + IPU_DISP_DATA_MAPPING_RGB666, + /* blue -> d[0..4], green -> d[5..10], red -> d[11..15] */ + IPU_DISP_DATA_MAPPING_RGB565, + /* blue -> d[0..7], green -> d[8..15], red -> d[16..23] */ + IPU_DISP_DATA_MAPPING_RGB888, +}; + /** * struct mx3fb_platform_data - mx3fb platform data * @@ -33,6 +47,7 @@ struct mx3fb_platform_data { const char *name; const struct fb_videomode *mode; int num_modes; + enum disp_data_mapping disp_data_fmt; }; #endif diff --git a/arch/arm/plat-nomadik/include/plat/ste_dma40.h b/arch/arm/plat-nomadik/include/plat/ste_dma40.h index 685c78716d9..fd0ee84c45d 100644 --- a/arch/arm/plat-nomadik/include/plat/ste_dma40.h +++ b/arch/arm/plat-nomadik/include/plat/ste_dma40.h @@ -113,7 +113,8 @@ struct stedma40_half_channel_info { * @dst_dev_type: Dst device type * @src_info: Parameters for dst half channel * @dst_info: Parameters for dst half channel - * + * @use_fixed_channel: if true, use physical channel specified by phy_channel + * @phy_channel: physical channel to use, only if use_fixed_channel is true * * This structure has to be filled by the client drivers. * It is recommended to do all dma configurations for clients in the machine. @@ -129,6 +130,9 @@ struct stedma40_chan_cfg { int dst_dev_type; struct stedma40_half_channel_info src_info; struct stedma40_half_channel_info dst_info; + + bool use_fixed_channel; + int phy_channel; }; /** @@ -153,6 +157,7 @@ struct stedma40_platform_data { struct stedma40_chan_cfg *memcpy_conf_phy; struct stedma40_chan_cfg *memcpy_conf_log; int disabled_channels[STEDMA40_MAX_PHYS]; + bool use_esram_lcla; }; #ifdef CONFIG_STE_DMA40 @@ -187,7 +192,7 @@ static inline struct dma_async_tx_descriptor *stedma40_slave_mem(struct dma_chan *chan, dma_addr_t addr, unsigned int size, - enum dma_data_direction direction, + enum dma_transfer_direction direction, unsigned long flags) { struct scatterlist sg; @@ -209,7 +214,7 @@ static inline struct dma_async_tx_descriptor *stedma40_slave_mem(struct dma_chan *chan, dma_addr_t addr, unsigned int size, - enum dma_data_direction direction, + enum dma_transfer_direction direction, unsigned long flags) { return NULL; diff --git a/arch/arm/plat-samsung/dma-ops.c b/arch/arm/plat-samsung/dma-ops.c index 2cded872f22..0747c77a2fd 100644 --- a/arch/arm/plat-samsung/dma-ops.c +++ b/arch/arm/plat-samsung/dma-ops.c @@ -37,14 +37,14 @@ static unsigned samsung_dmadev_request(enum dma_ch dma_ch, (void *)dma_ch; chan = dma_request_channel(mask, pl330_filter, filter_param); - if (info->direction == DMA_FROM_DEVICE) { + if (info->direction == DMA_DEV_TO_MEM) { memset(&slave_config, 0, sizeof(struct dma_slave_config)); slave_config.direction = info->direction; slave_config.src_addr = info->fifo; slave_config.src_addr_width = info->width; slave_config.src_maxburst = 1; dmaengine_slave_config(chan, &slave_config); - } else if (info->direction == DMA_TO_DEVICE) { + } else if (info->direction == DMA_MEM_TO_DEV) { memset(&slave_config, 0, sizeof(struct dma_slave_config)); slave_config.direction = info->direction; slave_config.dst_addr = info->fifo; diff --git a/arch/arm/plat-samsung/include/plat/dma-ops.h b/arch/arm/plat-samsung/include/plat/dma-ops.h index 7c6c2a8dd55..71a6827c770 100644 --- a/arch/arm/plat-samsung/include/plat/dma-ops.h +++ b/arch/arm/plat-samsung/include/plat/dma-ops.h @@ -18,7 +18,7 @@ struct samsung_dma_prep_info { enum dma_transaction_type cap; - enum dma_data_direction direction; + enum dma_transfer_direction direction; dma_addr_t buf; unsigned long period; unsigned long len; @@ -28,7 +28,7 @@ struct samsung_dma_prep_info { struct samsung_dma_info { enum dma_transaction_type cap; - enum dma_data_direction direction; + enum dma_transfer_direction direction; enum dma_slave_buswidth width; dma_addr_t fifo; struct s3c2410_dma_client *client; diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index f5cb27614e3..68c98f5b3ca 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -246,7 +246,18 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->ar_bspstore; } -#define regs_return_value(regs) ((regs)->r8) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return regs->r10 != -1; +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->r8; + else + return -regs->r8; +} /* Conserve space in histogram by encoding slot bits in address * bits 2 and 3 rather than bits 0 and 1. diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 8848f43d819..dad91661ddf 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1246,15 +1246,8 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, if (test_thread_flag(TIF_RESTORE_RSE)) ia64_sync_krbs(); - if (unlikely(current->audit_context)) { - long syscall; - int arch; - syscall = regs.r15; - arch = AUDIT_ARCH_IA64; - - audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3); - } + audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3); return 0; } @@ -1268,14 +1261,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, { int step; - if (unlikely(current->audit_context)) { - int success = AUDITSC_RESULT(regs.r10); - long result = regs.r8; - - if (success != AUDITSC_SUCCESS) - result = -result; - audit_syscall_exit(success, result); - } + audit_syscall_exit(®s); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h index 816bee64b19..94e92c80585 100644 --- a/arch/microblaze/include/asm/ptrace.h +++ b/arch/microblaze/include/asm/ptrace.h @@ -61,6 +61,11 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->pc) #define profile_pc(regs) instruction_pointer(regs) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->r3; +} + #else /* __KERNEL__ */ /* pt_regs offsets used by gdbserver etc in ptrace syscalls */ diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index 043cb58f9c4..6eb2aa927d8 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -147,10 +147,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) */ ret = -1L; - if (unlikely(current->audit_context)) - audit_syscall_entry(EM_MICROBLAZE, regs->r12, - regs->r5, regs->r6, - regs->r7, regs->r8); + audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6, + regs->r7, regs->r8); return ret ?: regs->r12; } @@ -159,8 +157,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3); + audit_syscall_exit(regs); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c index 604cd9dd133..d4fc1a97177 100644 --- a/arch/microblaze/kernel/setup.c +++ b/arch/microblaze/kernel/setup.c @@ -26,6 +26,7 @@ #include <linux/cache.h> #include <linux/of_platform.h> #include <linux/dma-mapping.h> +#include <linux/cpu.h> #include <asm/cacheflush.h> #include <asm/entry.h> #include <asm/cpuinfo.h> @@ -226,5 +227,23 @@ static int __init setup_bus_notifier(void) return 0; } - arch_initcall(setup_bus_notifier); + +static DEFINE_PER_CPU(struct cpu, cpu_devices); + +static int __init topology_init(void) +{ + int i, ret; + + for_each_present_cpu(i) { + struct cpu *c = &per_cpu(cpu_devices, i); + + ret = register_cpu(c, i); + if (ret) + printk(KERN_WARNING "topology_init: register_cpu %d " + "failed (%d)\n", i, ret); + } + + return 0; +} +subsys_initcall(topology_init); diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 7b99c670e47..4b7f5252d2f 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -137,7 +137,19 @@ extern int ptrace_set_watch_regs(struct task_struct *child, */ #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER) -#define regs_return_value(_regs) ((_regs)->regs[2]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !regs->regs[7]; +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->regs[2]; + else + return -regs->regs[2]; +} + #define instruction_pointer(regs) ((regs)->cp0_epc) #define profile_pc(regs) instruction_pointer(regs) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 4e6ea1ffad4..7786b608d93 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -560,10 +560,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) } out: - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[2], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + audit_syscall_entry(audit_arch(), regs->regs[2], + regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); } /* @@ -572,9 +571,7 @@ out: */ asmlinkage void syscall_trace_leave(struct pt_regs *regs) { - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]), - -regs->regs[2]); + audit_syscall_exit(regs); if (!(current->ptrace & PT_PTRACED)) return; diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 48223f9b872..78a205162fd 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -86,7 +86,18 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->nip) #define user_stack_pointer(regs) ((regs)->gpr[1]) #define kernel_stack_pointer(regs) ((regs)->gpr[1]) -#define regs_return_value(regs) ((regs)->gpr[3]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !(regs->ccr & 0x10000000); +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->gpr[3]; + else + return -regs->gpr[3]; +} #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 5de73dbd15c..5b43325402b 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1724,22 +1724,20 @@ long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gpr[0]); - if (unlikely(current->audit_context)) { #ifdef CONFIG_PPC64 - if (!is_32bit_task()) - audit_syscall_entry(AUDIT_ARCH_PPC64, - regs->gpr[0], - regs->gpr[3], regs->gpr[4], - regs->gpr[5], regs->gpr[6]); - else + if (!is_32bit_task()) + audit_syscall_entry(AUDIT_ARCH_PPC64, + regs->gpr[0], + regs->gpr[3], regs->gpr[4], + regs->gpr[5], regs->gpr[6]); + else #endif - audit_syscall_entry(AUDIT_ARCH_PPC, - regs->gpr[0], - regs->gpr[3] & 0xffffffff, - regs->gpr[4] & 0xffffffff, - regs->gpr[5] & 0xffffffff, - regs->gpr[6] & 0xffffffff); - } + audit_syscall_entry(AUDIT_ARCH_PPC, + regs->gpr[0], + regs->gpr[3] & 0xffffffff, + regs->gpr[4] & 0xffffffff, + regs->gpr[5] & 0xffffffff, + regs->gpr[6] & 0xffffffff); return ret ?: regs->gpr[0]; } @@ -1748,9 +1746,7 @@ void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, - regs->result); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->result); diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 56da355678f..aeb77f01798 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -541,9 +541,13 @@ struct user_regs_struct #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) -#define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->gprs[2]; +} + int regs_query_register_offset(const char *name); const char *regs_query_register_name(unsigned int offset); unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 573bc29551e..9d82ed4bcb2 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -740,20 +740,17 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gprs[2]); - if (unlikely(current->audit_context)) - audit_syscall_entry(is_compat_task() ? - AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, - regs->gprs[2], regs->orig_gpr2, - regs->gprs[3], regs->gprs[4], - regs->gprs[5]); + audit_syscall_entry(is_compat_task() ? + AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, + regs->gprs[2], regs->orig_gpr2, + regs->gprs[3], regs->gprs[4], + regs->gprs[5]); return ret ?: regs->gprs[2]; } asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) { - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), - regs->gprs[2]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->gprs[2]); diff --git a/arch/sh/include/asm/ptrace_32.h b/arch/sh/include/asm/ptrace_32.h index 6c2239cca1a..2d3e906aa72 100644 --- a/arch/sh/include/asm/ptrace_32.h +++ b/arch/sh/include/asm/ptrace_32.h @@ -76,7 +76,10 @@ struct pt_dspregs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tra) -#define regs_return_value(_regs) ((_regs)->regs[0]) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->regs[0]; +} #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/ptrace_64.h b/arch/sh/include/asm/ptrace_64.h index bf9be7764d6..eb3fcceaf64 100644 --- a/arch/sh/include/asm/ptrace_64.h +++ b/arch/sh/include/asm/ptrace_64.h @@ -13,7 +13,10 @@ struct pt_regs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tregs[7]) -#define regs_return_value(_regs) ((_regs)->regs[3]) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->regs[3]; +} #endif /* __KERNEL__ */ diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 92b3c276339..a3e65156376 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -518,10 +518,9 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[0]); - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[3], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + audit_syscall_entry(audit_arch(), regs->regs[3], + regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); return ret ?: regs->regs[0]; } @@ -530,9 +529,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]), - regs->regs[0]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[0]); diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index c8f97649f35..3d0080b5c97 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -536,10 +536,9 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[9]); - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[1], - regs->regs[2], regs->regs[3], - regs->regs[4], regs->regs[5]); + audit_syscall_entry(audit_arch(), regs->regs[1], + regs->regs[2], regs->regs[3], + regs->regs[4], regs->regs[5]); return ret ?: regs->regs[9]; } @@ -548,9 +547,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]), - regs->regs[9]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[9]); diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h index a0e1bcf843a..c00c3b5c280 100644 --- a/arch/sparc/include/asm/ptrace.h +++ b/arch/sparc/include/asm/ptrace.h @@ -207,7 +207,15 @@ do { current_thread_info()->syscall_noerror = 1; \ #define instruction_pointer(regs) ((regs)->tpc) #define instruction_pointer_set(regs, val) ((regs)->tpc = (val)) #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP]) -#define regs_return_value(regs) ((regs)->u_regs[UREG_I0]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY)); +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->u_regs[UREG_I0]; +} #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *); #else diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 96ee50a8066..9388844cd88 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -1071,32 +1071,22 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->u_regs[UREG_G1]); - if (unlikely(current->audit_context) && !ret) - audit_syscall_entry((test_thread_flag(TIF_32BIT) ? - AUDIT_ARCH_SPARC : - AUDIT_ARCH_SPARC64), - regs->u_regs[UREG_G1], - regs->u_regs[UREG_I0], - regs->u_regs[UREG_I1], - regs->u_regs[UREG_I2], - regs->u_regs[UREG_I3]); + audit_syscall_entry((test_thread_flag(TIF_32BIT) ? + AUDIT_ARCH_SPARC : + AUDIT_ARCH_SPARC64), + regs->u_regs[UREG_G1], + regs->u_regs[UREG_I0], + regs->u_regs[UREG_I1], + regs->u_regs[UREG_I2], + regs->u_regs[UREG_I3]); return ret; } asmlinkage void syscall_trace_leave(struct pt_regs *regs) { -#ifdef CONFIG_AUDITSYSCALL - if (unlikely(current->audit_context)) { - unsigned long tstate = regs->tstate; - int result = AUDITSC_SUCCESS; + audit_syscall_exit(regs); - if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY))) - result = AUDITSC_FAILURE; - - audit_syscall_exit(result, regs->u_regs[UREG_I0]); - } -#endif if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->u_regs[UREG_G1]); diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index c9da32b0c70..06b19039050 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -167,17 +167,15 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit) int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit; int tracesysgood; - if (unlikely(current->audit_context)) { - if (!entryexit) - audit_syscall_entry(HOST_AUDIT_ARCH, - UPT_SYSCALL_NR(regs), - UPT_SYSCALL_ARG1(regs), - UPT_SYSCALL_ARG2(regs), - UPT_SYSCALL_ARG3(regs), - UPT_SYSCALL_ARG4(regs)); - else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)), - UPT_SYSCALL_RET(regs)); - } + if (!entryexit) + audit_syscall_entry(HOST_AUDIT_ARCH, + UPT_SYSCALL_NR(regs), + UPT_SYSCALL_ARG1(regs), + UPT_SYSCALL_ARG2(regs), + UPT_SYSCALL_ARG3(regs), + UPT_SYSCALL_ARG4(regs)); + else + audit_syscall_exit(regs); /* Fake a debug trap */ if (is_singlestep) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 1106261856c..e3e734005e1 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -14,6 +14,7 @@ #include <asm/segment.h> #include <asm/irqflags.h> #include <linux/linkage.h> +#include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -189,7 +190,7 @@ sysexit_from_sys_call: movl %ebx,%edx /* 3rd arg: 1st syscall arg */ movl %eax,%esi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys @@ -206,12 +207,13 @@ sysexit_from_sys_call: TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ + call __audit_syscall_exit + movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 4af9fd2450a..79d97e68f04 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -42,6 +42,7 @@ */ #include <linux/linkage.h> +#include <linux/err.h> #include <asm/thread_info.h> #include <asm/irqflags.h> #include <asm/errno.h> @@ -453,7 +454,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -464,11 +465,10 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 940ba711fc2..3fe8239fd8f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include <asm/paravirt.h> #include <asm/ftrace.h> #include <asm/percpu.h> +#include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -548,7 +549,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call audit_syscall_entry() directly, and then + * We just call __audit_syscall_entry() directly, and then * jump back to the normal fast path. */ auditsys: @@ -558,22 +559,21 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath /* - * Return fast path for syscall audit. Call audit_syscall_exit() + * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $0,%rsi /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 89a04c7b5bb..50267386b76 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (unlikely(current->audit_context)) { - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); #endif - } return ret ?: regs->orig_ax; } @@ -1414,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f8753ab0..b466cab5ba1 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -335,9 +335,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call audit_syscall_exit since we do not exit via the normal paths */ + /*call __audit_syscall_exit since we do not exit via the normal paths */ +#ifdef CONFIG_AUDITSYSCALL if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(0), 0); + __audit_syscall_exit(1, 0); +#endif __asm__ __volatile__( "movl %0,%%esp\n\t" diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 711b1621747..5ef9344a8b2 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -3,3 +3,8 @@ #else #include "ptrace_64.h" #endif + +static inline long regs_return_value(struct uml_pt_regs *regs) +{ + return UPT_SYSCALL_RET(regs); +} diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index a0d042aa296..2dff698ab02 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -334,8 +334,7 @@ void do_syscall_trace_enter(struct pt_regs *regs) do_syscall_trace(); #if 0 - if (unlikely(current->audit_context)) - audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); + audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); #endif } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 163263ddd38..ee55019066a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3117,18 +3117,17 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_queue *old_cfqq = cfqd->active_queue; - cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); /* * workload type is changed, don't save slice, otherwise preempt * doesn't happen */ - if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) + if (cfqq_type(cfqd->active_queue) != cfqq_type(cfqq)) cfqq->cfqg->saved_workload_slice = 0; + cfq_slice_expired(cfqd, 1); + /* * Put the new queue at the front of the of the current list, * so we know that it will be selected next. diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c index 69ac373c72a..fdf27b9fce4 100644 --- a/drivers/ata/ata_piix.c +++ b/drivers/ata/ata_piix.c @@ -1117,6 +1117,13 @@ static int piix_broken_suspend(void) }, }, { + .ident = "Satellite Pro A120", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), + DMI_MATCH(DMI_PRODUCT_NAME, "Satellite Pro A120"), + }, + }, + { .ident = "Portege M500", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 11c9aea4f4f..c06e0ec1155 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4125,6 +4125,8 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { * device and controller are SATA. */ { "PIONEER DVD-RW DVRTD08", NULL, ATA_HORKAGE_NOSETXFER }, + { "PIONEER DVD-RW DVRTD08A", NULL, ATA_HORKAGE_NOSETXFER }, + { "PIONEER DVD-RW DVR-215", NULL, ATA_HORKAGE_NOSETXFER }, { "PIONEER DVD-RW DVR-212D", NULL, ATA_HORKAGE_NOSETXFER }, { "PIONEER DVD-RW DVR-216D", NULL, ATA_HORKAGE_NOSETXFER }, diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c index 9a7f0ea565d..74aaee30e26 100644 --- a/drivers/ata/libata-transport.c +++ b/drivers/ata/libata-transport.c @@ -291,6 +291,7 @@ int ata_tport_add(struct device *parent, goto tport_err; } + device_enable_async_suspend(dev); pm_runtime_set_active(dev); pm_runtime_enable(dev); diff --git a/drivers/ata/pata_bf54x.c b/drivers/ata/pata_bf54x.c index d6a4677fdf7..1e65842e2ca 100644 --- a/drivers/ata/pata_bf54x.c +++ b/drivers/ata/pata_bf54x.c @@ -251,6 +251,8 @@ static const u32 udma_tenvmin = 20; static const u32 udma_tackmin = 20; static const u32 udma_tssmin = 50; +#define BFIN_MAX_SG_SEGMENTS 4 + /** * * Function: num_clocks_min @@ -829,79 +831,61 @@ static void bfin_set_devctl(struct ata_port *ap, u8 ctl) static void bfin_bmdma_setup(struct ata_queued_cmd *qc) { - unsigned short config = WDSIZE_16; + struct ata_port *ap = qc->ap; + struct dma_desc_array *dma_desc_cpu = (struct dma_desc_array *)ap->bmdma_prd; + void __iomem *base = (void __iomem *)ap->ioaddr.ctl_addr; + unsigned short config = DMAFLOW_ARRAY | NDSIZE_5 | RESTART | WDSIZE_16 | DMAEN; struct scatterlist *sg; unsigned int si; + unsigned int channel; + unsigned int dir; + unsigned int size = 0; dev_dbg(qc->ap->dev, "in atapi dma setup\n"); /* Program the ATA_CTRL register with dir */ if (qc->tf.flags & ATA_TFLAG_WRITE) { - /* fill the ATAPI DMA controller */ - set_dma_config(CH_ATAPI_TX, config); - set_dma_x_modify(CH_ATAPI_TX, 2); - for_each_sg(qc->sg, sg, qc->n_elem, si) { - set_dma_start_addr(CH_ATAPI_TX, sg_dma_address(sg)); - set_dma_x_count(CH_ATAPI_TX, sg_dma_len(sg) >> 1); - } + channel = CH_ATAPI_TX; + dir = DMA_TO_DEVICE; } else { + channel = CH_ATAPI_RX; + dir = DMA_FROM_DEVICE; config |= WNR; - /* fill the ATAPI DMA controller */ - set_dma_config(CH_ATAPI_RX, config); - set_dma_x_modify(CH_ATAPI_RX, 2); - for_each_sg(qc->sg, sg, qc->n_elem, si) { - set_dma_start_addr(CH_ATAPI_RX, sg_dma_address(sg)); - set_dma_x_count(CH_ATAPI_RX, sg_dma_len(sg) >> 1); - } } -} -/** - * bfin_bmdma_start - Start an IDE DMA transaction - * @qc: Info associated with this ATA transaction. - * - * Note: Original code is ata_bmdma_start(). - */ + dma_map_sg(ap->dev, qc->sg, qc->n_elem, dir); -static void bfin_bmdma_start(struct ata_queued_cmd *qc) -{ - struct ata_port *ap = qc->ap; - void __iomem *base = (void __iomem *)ap->ioaddr.ctl_addr; - struct scatterlist *sg; - unsigned int si; + /* fill the ATAPI DMA controller */ + for_each_sg(qc->sg, sg, qc->n_elem, si) { + dma_desc_cpu[si].start_addr = sg_dma_address(sg); + dma_desc_cpu[si].cfg = config; + dma_desc_cpu[si].x_count = sg_dma_len(sg) >> 1; + dma_desc_cpu[si].x_modify = 2; + size += sg_dma_len(sg); + } - dev_dbg(qc->ap->dev, "in atapi dma start\n"); - if (!(ap->udma_mask || ap->mwdma_mask)) - return; + /* Set the last descriptor to stop mode */ + dma_desc_cpu[qc->n_elem - 1].cfg &= ~(DMAFLOW | NDSIZE); - /* start ATAPI DMA controller*/ - if (qc->tf.flags & ATA_TFLAG_WRITE) { - /* - * On blackfin arch, uncacheable memory is not - * allocated with flag GFP_DMA. DMA buffer from - * common kenel code should be flushed if WB - * data cache is enabled. Otherwise, this loop - * is an empty loop and optimized out. - */ - for_each_sg(qc->sg, sg, qc->n_elem, si) { - flush_dcache_range(sg_dma_address(sg), - sg_dma_address(sg) + sg_dma_len(sg)); - } - enable_dma(CH_ATAPI_TX); - dev_dbg(qc->ap->dev, "enable udma write\n"); + flush_dcache_range((unsigned int)dma_desc_cpu, + (unsigned int)dma_desc_cpu + + qc->n_elem * sizeof(struct dma_desc_array)); - /* Send ATA DMA write command */ - bfin_exec_command(ap, &qc->tf); + /* Enable ATA DMA operation*/ + set_dma_curr_desc_addr(channel, (unsigned long *)ap->bmdma_prd_dma); + set_dma_x_count(channel, 0); + set_dma_x_modify(channel, 0); + set_dma_config(channel, config); + + SSYNC(); + + /* Send ATA DMA command */ + bfin_exec_command(ap, &qc->tf); + if (qc->tf.flags & ATA_TFLAG_WRITE) { /* set ATA DMA write direction */ ATAPI_SET_CONTROL(base, (ATAPI_GET_CONTROL(base) | XFER_DIR)); } else { - enable_dma(CH_ATAPI_RX); - dev_dbg(qc->ap->dev, "enable udma read\n"); - - /* Send ATA DMA read command */ - bfin_exec_command(ap, &qc->tf); - /* set ATA DMA read direction */ ATAPI_SET_CONTROL(base, (ATAPI_GET_CONTROL(base) & ~XFER_DIR)); @@ -913,12 +897,28 @@ static void bfin_bmdma_start(struct ata_queued_cmd *qc) /* Set ATAPI state machine contorl in terminate sequence */ ATAPI_SET_CONTROL(base, ATAPI_GET_CONTROL(base) | END_ON_TERM); - /* Set transfer length to buffer len */ - for_each_sg(qc->sg, sg, qc->n_elem, si) { - ATAPI_SET_XFER_LEN(base, (sg_dma_len(sg) >> 1)); - } + /* Set transfer length to the total size of sg buffers */ + ATAPI_SET_XFER_LEN(base, size >> 1); +} - /* Enable ATA DMA operation*/ +/** + * bfin_bmdma_start - Start an IDE DMA transaction + * @qc: Info associated with this ATA transaction. + * + * Note: Original code is ata_bmdma_start(). + */ + +static void bfin_bmdma_start(struct ata_queued_cmd *qc) +{ + struct ata_port *ap = qc->ap; + void __iomem *base = (void __iomem *)ap->ioaddr.ctl_addr; + + dev_dbg(qc->ap->dev, "in atapi dma start\n"); + + if (!(ap->udma_mask || ap->mwdma_mask)) + return; + + /* start ATAPI transfer*/ if (ap->udma_mask) ATAPI_SET_CONTROL(base, ATAPI_GET_CONTROL(base) | ULTRA_START); @@ -935,34 +935,23 @@ static void bfin_bmdma_start(struct ata_queued_cmd *qc) static void bfin_bmdma_stop(struct ata_queued_cmd *qc) { struct ata_port *ap = qc->ap; - struct scatterlist *sg; - unsigned int si; + unsigned int dir; dev_dbg(qc->ap->dev, "in atapi dma stop\n"); + if (!(ap->udma_mask || ap->mwdma_mask)) return; /* stop ATAPI DMA controller*/ - if (qc->tf.flags & ATA_TFLAG_WRITE) + if (qc->tf.flags & ATA_TFLAG_WRITE) { + dir = DMA_TO_DEVICE; disable_dma(CH_ATAPI_TX); - else { + } else { + dir = DMA_FROM_DEVICE; disable_dma(CH_ATAPI_RX); - if (ap->hsm_task_state & HSM_ST_LAST) { - /* - * On blackfin arch, uncacheable memory is not - * allocated with flag GFP_DMA. DMA buffer from - * common kenel code should be invalidated if - * data cache is enabled. Otherwise, this loop - * is an empty loop and optimized out. - */ - for_each_sg(qc->sg, sg, qc->n_elem, si) { - invalidate_dcache_range( - sg_dma_address(sg), - sg_dma_address(sg) - + sg_dma_len(sg)); - } - } } + + dma_unmap_sg(ap->dev, qc->sg, qc->n_elem, dir); } /** @@ -1260,6 +1249,11 @@ static void bfin_port_stop(struct ata_port *ap) { dev_dbg(ap->dev, "in atapi port stop\n"); if (ap->udma_mask != 0 || ap->mwdma_mask != 0) { + dma_free_coherent(ap->dev, + BFIN_MAX_SG_SEGMENTS * sizeof(struct dma_desc_array), + ap->bmdma_prd, + ap->bmdma_prd_dma); + free_dma(CH_ATAPI_RX); free_dma(CH_ATAPI_TX); } @@ -1271,14 +1265,29 @@ static int bfin_port_start(struct ata_port *ap) if (!(ap->udma_mask || ap->mwdma_mask)) return 0; + ap->bmdma_prd = dma_alloc_coherent(ap->dev, + BFIN_MAX_SG_SEGMENTS * sizeof(struct dma_desc_array), + &ap->bmdma_prd_dma, + GFP_KERNEL); + + if (ap->bmdma_prd == NULL) { + dev_info(ap->dev, "Unable to allocate DMA descriptor array.\n"); + goto out; + } + if (request_dma(CH_ATAPI_RX, "BFIN ATAPI RX DMA") >= 0) { if (request_dma(CH_ATAPI_TX, "BFIN ATAPI TX DMA") >= 0) return 0; free_dma(CH_ATAPI_RX); + dma_free_coherent(ap->dev, + BFIN_MAX_SG_SEGMENTS * sizeof(struct dma_desc_array), + ap->bmdma_prd, + ap->bmdma_prd_dma); } +out: ap->udma_mask = 0; ap->mwdma_mask = 0; dev_err(ap->dev, "Unable to request ATAPI DMA!" @@ -1400,7 +1409,7 @@ static irqreturn_t bfin_ata_interrupt(int irq, void *dev_instance) static struct scsi_host_template bfin_sht = { ATA_BASE_SHT(DRV_NAME), - .sg_tablesize = SG_NONE, + .sg_tablesize = BFIN_MAX_SG_SEGMENTS, .dma_boundary = ATA_DMA_BOUNDARY, }; diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c index 5a2c95ba050..0120b0d1e9a 100644 --- a/drivers/ata/sata_fsl.c +++ b/drivers/ata/sata_fsl.c @@ -140,6 +140,7 @@ enum { */ HCONTROL_ONLINE_PHY_RST = (1 << 31), HCONTROL_FORCE_OFFLINE = (1 << 30), + HCONTROL_LEGACY = (1 << 28), HCONTROL_PARITY_PROT_MOD = (1 << 14), HCONTROL_DPATH_PARITY = (1 << 12), HCONTROL_SNOOP_ENABLE = (1 << 10), @@ -1223,6 +1224,10 @@ static int sata_fsl_init_controller(struct ata_host *host) * part of the port_start() callback */ + /* sata controller to operate in enterprise mode */ + temp = ioread32(hcr_base + HCONTROL); + iowrite32(temp & ~HCONTROL_LEGACY, hcr_base + HCONTROL); + /* ack. any pending IRQs for this controller/port */ temp = ioread32(hcr_base + HSTATUS); if (temp & 0x3F) @@ -1421,6 +1426,12 @@ static int sata_fsl_resume(struct platform_device *op) /* Recovery the CHBA register in host controller cmd register set */ iowrite32(pp->cmdslot_paddr & 0xffffffff, hcr_base + CHBA); + iowrite32((ioread32(hcr_base + HCONTROL) + | HCONTROL_ONLINE_PHY_RST + | HCONTROL_SNOOP_ENABLE + | HCONTROL_PMP_ATTACHED), + hcr_base + HCONTROL); + ata_host_resume(host); return 0; } diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 5a99bb3f255..f1a274994bb 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -124,7 +124,7 @@ config MV_XOR config MX3_IPU bool "MX3x Image Processing Unit support" - depends on SOC_IMX31 ||Â SOC_IMX35 + depends on ARCH_MXC select DMA_ENGINE default y help @@ -187,6 +187,13 @@ config TIMB_DMA help Enable support for the Timberdale FPGA DMA engine. +config SIRF_DMA + tristate "CSR SiRFprimaII DMA support" + depends on ARCH_PRIMA2 + select DMA_ENGINE + help + Enable support for the CSR SiRFprimaII DMA engine. + config ARCH_HAS_ASYNC_TX_FIND_CHANNEL bool @@ -201,26 +208,26 @@ config PL330_DMA platform_data for a dma-pl330 device. config PCH_DMA - tristate "Intel EG20T PCH / OKI Semi IOH(ML7213/ML7223) DMA support" + tristate "Intel EG20T PCH / LAPIS Semicon IOH(ML7213/ML7223/ML7831) DMA" depends on PCI && X86 select DMA_ENGINE help Enable support for Intel EG20T PCH DMA engine. - This driver also can be used for OKI SEMICONDUCTOR IOH(Input/ - Output Hub), ML7213 and ML7223. - ML7213 IOH is for IVI(In-Vehicle Infotainment) use and ML7223 IOH is - for MP(Media Phone) use. - ML7213/ML7223 is companion chip for Intel Atom E6xx series. - ML7213/ML7223 is completely compatible for Intel EG20T PCH. + This driver also can be used for LAPIS Semiconductor IOH(Input/ + Output Hub), ML7213, ML7223 and ML7831. + ML7213 IOH is for IVI(In-Vehicle Infotainment) use, ML7223 IOH is + for MP(Media Phone) use and ML7831 IOH is for general purpose use. + ML7213/ML7223/ML7831 is companion chip for Intel Atom E6xx series. + ML7213/ML7223/ML7831 is completely compatible for Intel EG20T PCH. config IMX_SDMA tristate "i.MX SDMA support" - depends on ARCH_MX25 || SOC_IMX31 ||Â SOC_IMX35 || ARCH_MX5 + depends on ARCH_MXC select DMA_ENGINE help Support the i.MX SDMA engine. This engine is integrated into - Freescale i.MX25/31/35/51 chips. + Freescale i.MX25/31/35/51/53 chips. config IMX_DMA tristate "i.MX DMA support" diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 30cf3b1f0c5..009a222e828 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_IMX_SDMA) += imx-sdma.o obj-$(CONFIG_IMX_DMA) += imx-dma.o obj-$(CONFIG_MXS_DMA) += mxs-dma.o obj-$(CONFIG_TIMB_DMA) += timb_dma.o +obj-$(CONFIG_SIRF_DMA) += sirf-dma.o obj-$(CONFIG_STE_DMA40) += ste_dma40.o ste_dma40_ll.o obj-$(CONFIG_PL330_DMA) += pl330.o obj-$(CONFIG_PCH_DMA) += pch_dma.o diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 0698695e8bf..8a281584458 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -854,8 +854,10 @@ static int prep_phy_channel(struct pl08x_dma_chan *plchan, int ret; /* Check if we already have a channel */ - if (plchan->phychan) - return 0; + if (plchan->phychan) { + ch = plchan->phychan; + goto got_channel; + } ch = pl08x_get_phy_channel(pl08x, plchan); if (!ch) { @@ -880,21 +882,22 @@ static int prep_phy_channel(struct pl08x_dma_chan *plchan, return -EBUSY; } ch->signal = ret; - - /* Assign the flow control signal to this channel */ - if (txd->direction == DMA_TO_DEVICE) - txd->ccfg |= ch->signal << PL080_CONFIG_DST_SEL_SHIFT; - else if (txd->direction == DMA_FROM_DEVICE) - txd->ccfg |= ch->signal << PL080_CONFIG_SRC_SEL_SHIFT; } + plchan->phychan = ch; dev_dbg(&pl08x->adev->dev, "allocated physical channel %d and signal %d for xfer on %s\n", ch->id, ch->signal, plchan->name); +got_channel: + /* Assign the flow control signal to this channel */ + if (txd->direction == DMA_MEM_TO_DEV) + txd->ccfg |= ch->signal << PL080_CONFIG_DST_SEL_SHIFT; + else if (txd->direction == DMA_DEV_TO_MEM) + txd->ccfg |= ch->signal << PL080_CONFIG_SRC_SEL_SHIFT; + plchan->phychan_hold++; - plchan->phychan = ch; return 0; } @@ -1102,10 +1105,10 @@ static int dma_set_runtime_config(struct dma_chan *chan, /* Transfer direction */ plchan->runtime_direction = config->direction; - if (config->direction == DMA_TO_DEVICE) { + if (config->direction == DMA_MEM_TO_DEV) { addr_width = config->dst_addr_width; maxburst = config->dst_maxburst; - } else if (config->direction == DMA_FROM_DEVICE) { + } else if (config->direction == DMA_DEV_TO_MEM) { addr_width = config->src_addr_width; maxburst = config->src_maxburst; } else { @@ -1136,7 +1139,7 @@ static int dma_set_runtime_config(struct dma_chan *chan, cctl |= burst << PL080_CONTROL_SB_SIZE_SHIFT; cctl |= burst << PL080_CONTROL_DB_SIZE_SHIFT; - if (plchan->runtime_direction == DMA_FROM_DEVICE) { + if (plchan->runtime_direction == DMA_DEV_TO_MEM) { plchan->src_addr = config->src_addr; plchan->src_cctl = pl08x_cctl(cctl) | PL080_CONTROL_DST_INCR | pl08x_select_bus(plchan->cd->periph_buses, @@ -1152,7 +1155,7 @@ static int dma_set_runtime_config(struct dma_chan *chan, "configured channel %s (%s) for %s, data width %d, " "maxburst %d words, LE, CCTL=0x%08x\n", dma_chan_name(chan), plchan->name, - (config->direction == DMA_FROM_DEVICE) ? "RX" : "TX", + (config->direction == DMA_DEV_TO_MEM) ? "RX" : "TX", addr_width, maxburst, cctl); @@ -1322,7 +1325,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags) { struct pl08x_dma_chan *plchan = to_pl08x_chan(chan); @@ -1354,10 +1357,10 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( */ txd->direction = direction; - if (direction == DMA_TO_DEVICE) { + if (direction == DMA_MEM_TO_DEV) { txd->cctl = plchan->dst_cctl; slave_addr = plchan->dst_addr; - } else if (direction == DMA_FROM_DEVICE) { + } else if (direction == DMA_DEV_TO_MEM) { txd->cctl = plchan->src_cctl; slave_addr = plchan->src_addr; } else { @@ -1368,10 +1371,10 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( } if (plchan->cd->device_fc) - tmp = (direction == DMA_TO_DEVICE) ? PL080_FLOW_MEM2PER_PER : + tmp = (direction == DMA_MEM_TO_DEV) ? PL080_FLOW_MEM2PER_PER : PL080_FLOW_PER2MEM_PER; else - tmp = (direction == DMA_TO_DEVICE) ? PL080_FLOW_MEM2PER : + tmp = (direction == DMA_MEM_TO_DEV) ? PL080_FLOW_MEM2PER : PL080_FLOW_PER2MEM; txd->ccfg |= tmp << PL080_CONFIG_FLOW_CONTROL_SHIFT; @@ -1387,7 +1390,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( list_add_tail(&dsg->node, &txd->dsg_list); dsg->len = sg_dma_len(sg); - if (direction == DMA_TO_DEVICE) { + if (direction == DMA_MEM_TO_DEV) { dsg->src_addr = sg_phys(sg); dsg->dst_addr = slave_addr; } else { diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index fcfa0a8b5c5..97f87b29b9f 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -23,6 +23,8 @@ #include <linux/module.h> #include <linux/platform_device.h> #include <linux/slab.h> +#include <linux/of.h> +#include <linux/of_device.h> #include "at_hdmac_regs.h" @@ -660,7 +662,7 @@ err_desc_get: */ static struct dma_async_tx_descriptor * atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags) { struct at_dma_chan *atchan = to_at_dma_chan(chan); @@ -678,7 +680,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, dev_vdbg(chan2dev(chan), "prep_slave_sg (%d): %s f0x%lx\n", sg_len, - direction == DMA_TO_DEVICE ? "TO DEVICE" : "FROM DEVICE", + direction == DMA_MEM_TO_DEV ? "TO DEVICE" : "FROM DEVICE", flags); if (unlikely(!atslave || !sg_len)) { @@ -692,7 +694,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, ctrlb = ATC_IEN; switch (direction) { - case DMA_TO_DEVICE: + case DMA_MEM_TO_DEV: ctrla |= ATC_DST_WIDTH(reg_width); ctrlb |= ATC_DST_ADDR_MODE_FIXED | ATC_SRC_ADDR_MODE_INCR @@ -725,7 +727,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, total_len += len; } break; - case DMA_FROM_DEVICE: + case DMA_DEV_TO_MEM: ctrla |= ATC_SRC_WIDTH(reg_width); ctrlb |= ATC_DST_ADDR_MODE_INCR | ATC_SRC_ADDR_MODE_FIXED @@ -787,7 +789,7 @@ err_desc_get: */ static int atc_dma_cyclic_check_values(unsigned int reg_width, dma_addr_t buf_addr, - size_t period_len, enum dma_data_direction direction) + size_t period_len, enum dma_transfer_direction direction) { if (period_len > (ATC_BTSIZE_MAX << reg_width)) goto err_out; @@ -795,7 +797,7 @@ atc_dma_cyclic_check_values(unsigned int reg_width, dma_addr_t buf_addr, goto err_out; if (unlikely(buf_addr & ((1 << reg_width) - 1))) goto err_out; - if (unlikely(!(direction & (DMA_TO_DEVICE | DMA_FROM_DEVICE)))) + if (unlikely(!(direction & (DMA_DEV_TO_MEM | DMA_MEM_TO_DEV)))) goto err_out; return 0; @@ -810,7 +812,7 @@ err_out: static int atc_dma_cyclic_fill_desc(struct at_dma_slave *atslave, struct at_desc *desc, unsigned int period_index, dma_addr_t buf_addr, - size_t period_len, enum dma_data_direction direction) + size_t period_len, enum dma_transfer_direction direction) { u32 ctrla; unsigned int reg_width = atslave->reg_width; @@ -822,7 +824,7 @@ atc_dma_cyclic_fill_desc(struct at_dma_slave *atslave, struct at_desc *desc, | period_len >> reg_width; switch (direction) { - case DMA_TO_DEVICE: + case DMA_MEM_TO_DEV: desc->lli.saddr = buf_addr + (period_len * period_index); desc->lli.daddr = atslave->tx_reg; desc->lli.ctrla = ctrla; @@ -833,7 +835,7 @@ atc_dma_cyclic_fill_desc(struct at_dma_slave *atslave, struct at_desc *desc, | ATC_DIF(AT_DMA_PER_IF); break; - case DMA_FROM_DEVICE: + case DMA_DEV_TO_MEM: desc->lli.saddr = atslave->rx_reg; desc->lli.daddr = buf_addr + (period_len * period_index); desc->lli.ctrla = ctrla; @@ -861,7 +863,7 @@ atc_dma_cyclic_fill_desc(struct at_dma_slave *atslave, struct at_desc *desc, */ static struct dma_async_tx_descriptor * atc_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, - size_t period_len, enum dma_data_direction direction) + size_t period_len, enum dma_transfer_direction direction) { struct at_dma_chan *atchan = to_at_dma_chan(chan); struct at_dma_slave *atslave = chan->private; @@ -872,7 +874,7 @@ atc_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, unsigned int i; dev_vdbg(chan2dev(chan), "prep_dma_cyclic: %s buf@0x%08x - %d (%d/%d)\n", - direction == DMA_TO_DEVICE ? "TO DEVICE" : "FROM DEVICE", + direction == DMA_MEM_TO_DEV ? "TO DEVICE" : "FROM DEVICE", buf_addr, periods, buf_len, period_len); @@ -1175,6 +1177,56 @@ static void atc_free_chan_resources(struct dma_chan *chan) /*-- Module Management -----------------------------------------------*/ +/* cap_mask is a multi-u32 bitfield, fill it with proper C code. */ +static struct at_dma_platform_data at91sam9rl_config = { + .nr_channels = 2, +}; +static struct at_dma_platform_data at91sam9g45_config = { + .nr_channels = 8, +}; + +#if defined(CONFIG_OF) +static const struct of_device_id atmel_dma_dt_ids[] = { + { + .compatible = "atmel,at91sam9rl-dma", + .data = &at91sam9rl_config, + }, { + .compatible = "atmel,at91sam9g45-dma", + .data = &at91sam9g45_config, + }, { + /* sentinel */ + } +}; + +MODULE_DEVICE_TABLE(of, atmel_dma_dt_ids); +#endif + +static const struct platform_device_id atdma_devtypes[] = { + { + .name = "at91sam9rl_dma", + .driver_data = (unsigned long) &at91sam9rl_config, + }, { + .name = "at91sam9g45_dma", + .driver_data = (unsigned long) &at91sam9g45_config, + }, { + /* sentinel */ + } +}; + +static inline struct at_dma_platform_data * __init at_dma_get_driver_data( + struct platform_device *pdev) +{ + if (pdev->dev.of_node) { + const struct of_device_id *match; + match = of_match_node(atmel_dma_dt_ids, pdev->dev.of_node); + if (match == NULL) + return NULL; + return match->data; + } + return (struct at_dma_platform_data *) + platform_get_device_id(pdev)->driver_data; +} + /** * at_dma_off - disable DMA controller * @atdma: the Atmel HDAMC device @@ -1193,18 +1245,23 @@ static void at_dma_off(struct at_dma *atdma) static int __init at_dma_probe(struct platform_device *pdev) { - struct at_dma_platform_data *pdata; struct resource *io; struct at_dma *atdma; size_t size; int irq; int err; int i; + struct at_dma_platform_data *plat_dat; - /* get DMA Controller parameters from platform */ - pdata = pdev->dev.platform_data; - if (!pdata || pdata->nr_channels > AT_DMA_MAX_NR_CHANNELS) - return -EINVAL; + /* setup platform data for each SoC */ + dma_cap_set(DMA_MEMCPY, at91sam9rl_config.cap_mask); + dma_cap_set(DMA_MEMCPY, at91sam9g45_config.cap_mask); + dma_cap_set(DMA_SLAVE, at91sam9g45_config.cap_mask); + + /* get DMA parameters from controller type */ + plat_dat = at_dma_get_driver_data(pdev); + if (!plat_dat) + return -ENODEV; io = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!io) @@ -1215,14 +1272,14 @@ static int __init at_dma_probe(struct platform_device *pdev) return irq; size = sizeof(struct at_dma); - size += pdata->nr_channels * sizeof(struct at_dma_chan); + size += plat_dat->nr_channels * sizeof(struct at_dma_chan); atdma = kzalloc(size, GFP_KERNEL); if (!atdma) return -ENOMEM; - /* discover transaction capabilites from the platform data */ - atdma->dma_common.cap_mask = pdata->cap_mask; - atdma->all_chan_mask = (1 << pdata->nr_channels) - 1; + /* discover transaction capabilities */ + atdma->dma_common.cap_mask = plat_dat->cap_mask; + atdma->all_chan_mask = (1 << plat_dat->nr_channels) - 1; size = resource_size(io); if (!request_mem_region(io->start, size, pdev->dev.driver->name)) { @@ -1268,7 +1325,7 @@ static int __init at_dma_probe(struct platform_device *pdev) /* initialize channels related values */ INIT_LIST_HEAD(&atdma->dma_common.channels); - for (i = 0; i < pdata->nr_channels; i++) { + for (i = 0; i < plat_dat->nr_channels; i++) { struct at_dma_chan *atchan = &atdma->chan[i]; atchan->chan_common.device = &atdma->dma_common; @@ -1313,7 +1370,7 @@ static int __init at_dma_probe(struct platform_device *pdev) dev_info(&pdev->dev, "Atmel AHB DMA Controller ( %s%s), %d channels\n", dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask) ? "cpy " : "", dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask) ? "slave " : "", - pdata->nr_channels); + plat_dat->nr_channels); dma_async_device_register(&atdma->dma_common); @@ -1495,9 +1552,11 @@ static const struct dev_pm_ops at_dma_dev_pm_ops = { static struct platform_driver at_dma_driver = { .remove = __exit_p(at_dma_remove), .shutdown = at_dma_shutdown, + .id_table = atdma_devtypes, .driver = { .name = "at_hdmac", .pm = &at_dma_dev_pm_ops, + .of_match_table = of_match_ptr(atmel_dma_dt_ids), }, }; diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h index aa4c9aebab7..dcaedfc181c 100644 --- a/drivers/dma/at_hdmac_regs.h +++ b/drivers/dma/at_hdmac_regs.h @@ -251,6 +251,7 @@ static inline struct at_dma_chan *to_at_dma_chan(struct dma_chan *dchan) /** * struct at_dma - internal representation of an Atmel HDMA Controller * @chan_common: common dmaengine dma_device object members + * @atdma_devtype: identifier of DMA controller compatibility * @ch_regs: memory mapped register base * @clk: dma controller clock * @save_imr: interrupt mask register that is saved on suspend/resume cycle diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c index 4234f416ef1..d65a718c0f9 100644 --- a/drivers/dma/coh901318.c +++ b/drivers/dma/coh901318.c @@ -39,7 +39,7 @@ struct coh901318_desc { struct scatterlist *sg; unsigned int sg_len; struct coh901318_lli *lli; - enum dma_data_direction dir; + enum dma_transfer_direction dir; unsigned long flags; u32 head_config; u32 head_ctrl; @@ -1034,7 +1034,7 @@ coh901318_prep_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, static struct dma_async_tx_descriptor * coh901318_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags) { struct coh901318_chan *cohc = to_coh901318_chan(chan); @@ -1077,7 +1077,7 @@ coh901318_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, ctrl_last |= cohc->runtime_ctrl; ctrl |= cohc->runtime_ctrl; - if (direction == DMA_TO_DEVICE) { + if (direction == DMA_MEM_TO_DEV) { u32 tx_flags = COH901318_CX_CTRL_PRDD_SOURCE | COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE; @@ -1085,7 +1085,7 @@ coh901318_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, ctrl_chained |= tx_flags; ctrl_last |= tx_flags; ctrl |= tx_flags; - } else if (direction == DMA_FROM_DEVICE) { + } else if (direction == DMA_DEV_TO_MEM) { u32 rx_flags = COH901318_CX_CTRL_PRDD_DEST | COH901318_CX_CTRL_DST_ADDR_INC_ENABLE; @@ -1274,11 +1274,11 @@ static void coh901318_dma_set_runtimeconfig(struct dma_chan *chan, int i = 0; /* We only support mem to per or per to mem transfers */ - if (config->direction == DMA_FROM_DEVICE) { + if (config->direction == DMA_DEV_TO_MEM) { addr = config->src_addr; addr_width = config->src_addr_width; maxburst = config->src_maxburst; - } else if (config->direction == DMA_TO_DEVICE) { + } else if (config->direction == DMA_MEM_TO_DEV) { addr = config->dst_addr; addr_width = config->dst_addr_width; maxburst = config->dst_maxburst; diff --git a/drivers/dma/coh901318_lli.c b/drivers/dma/coh901318_lli.c index 9f7e0e6a7ee..6c0e2d4c668 100644 --- a/drivers/dma/coh901318_lli.c +++ b/drivers/dma/coh901318_lli.c @@ -7,11 +7,10 @@ * Author: Per Friden <per.friden@stericsson.com> */ -#include <linux/dma-mapping.h> #include <linux/spinlock.h> -#include <linux/dmapool.h> #include <linux/memory.h> #include <linux/gfp.h> +#include <linux/dmapool.h> #include <mach/coh901318.h> #include "coh901318_lli.h" @@ -177,18 +176,18 @@ coh901318_lli_fill_single(struct coh901318_pool *pool, struct coh901318_lli *lli, dma_addr_t buf, unsigned int size, dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl_eom, - enum dma_data_direction dir) + enum dma_transfer_direction dir) { int s = size; dma_addr_t src; dma_addr_t dst; - if (dir == DMA_TO_DEVICE) { + if (dir == DMA_MEM_TO_DEV) { src = buf; dst = dev_addr; - } else if (dir == DMA_FROM_DEVICE) { + } else if (dir == DMA_DEV_TO_MEM) { src = dev_addr; dst = buf; @@ -215,9 +214,9 @@ coh901318_lli_fill_single(struct coh901318_pool *pool, lli = coh901318_lli_next(lli); - if (dir == DMA_TO_DEVICE) + if (dir == DMA_MEM_TO_DEV) src += block_size; - else if (dir == DMA_FROM_DEVICE) + else if (dir == DMA_DEV_TO_MEM) dst += block_size; } @@ -234,7 +233,7 @@ coh901318_lli_fill_sg(struct coh901318_pool *pool, struct scatterlist *sgl, unsigned int nents, dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl, u32 ctrl_last, - enum dma_data_direction dir, u32 ctrl_irq_mask) + enum dma_transfer_direction dir, u32 ctrl_irq_mask) { int i; struct scatterlist *sg; @@ -249,9 +248,9 @@ coh901318_lli_fill_sg(struct coh901318_pool *pool, spin_lock(&pool->lock); - if (dir == DMA_TO_DEVICE) + if (dir == DMA_MEM_TO_DEV) dst = dev_addr; - else if (dir == DMA_FROM_DEVICE) + else if (dir == DMA_DEV_TO_MEM) src = dev_addr; else goto err; @@ -269,7 +268,7 @@ coh901318_lli_fill_sg(struct coh901318_pool *pool, ctrl_sg = ctrl ? ctrl : ctrl_last; - if (dir == DMA_TO_DEVICE) + if (dir == DMA_MEM_TO_DEV) /* increment source address */ src = sg_phys(sg); else @@ -293,7 +292,7 @@ coh901318_lli_fill_sg(struct coh901318_pool *pool, lli->src_addr = src; lli->dst_addr = dst; - if (dir == DMA_FROM_DEVICE) + if (dir == DMA_DEV_TO_MEM) dst += elem_size; else src += elem_size; diff --git a/drivers/dma/coh901318_lli.h b/drivers/dma/coh901318_lli.h index 7a5c80990e9..abff3714fdd 100644 --- a/drivers/dma/coh901318_lli.h +++ b/drivers/dma/coh901318_lli.h @@ -97,7 +97,7 @@ coh901318_lli_fill_single(struct coh901318_pool *pool, struct coh901318_lli *lli, dma_addr_t buf, unsigned int size, dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl_last, - enum dma_data_direction dir); + enum dma_transfer_direction dir); /** * coh901318_lli_fill_single() - Prepares the lli:s for dma scatter list transfer @@ -119,6 +119,6 @@ coh901318_lli_fill_sg(struct coh901318_pool *pool, struct scatterlist *sg, unsigned int nents, dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl, u32 ctrl_last, - enum dma_data_direction dir, u32 ctrl_irq_mask); + enum dma_transfer_direction dir, u32 ctrl_irq_mask); #endif /* COH901318_LLI_H */ diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index b48967b499d..a6c6051ec85 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -693,12 +693,12 @@ int dma_async_device_register(struct dma_device *device) !device->device_prep_dma_interrupt); BUG_ON(dma_has_cap(DMA_SG, device->cap_mask) && !device->device_prep_dma_sg); - BUG_ON(dma_has_cap(DMA_SLAVE, device->cap_mask) && - !device->device_prep_slave_sg); BUG_ON(dma_has_cap(DMA_CYCLIC, device->cap_mask) && !device->device_prep_dma_cyclic); BUG_ON(dma_has_cap(DMA_SLAVE, device->cap_mask) && !device->device_control); + BUG_ON(dma_has_cap(DMA_INTERLEAVE, device->cap_mask) && + !device->device_prep_interleaved_dma); BUG_ON(!device->device_alloc_chan_resources); BUG_ON(!device->device_free_chan_resources); diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index 9bfd6d36071..9b592b02b5f 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -166,6 +166,38 @@ dwc_assign_cookie(struct dw_dma_chan *dwc, struct dw_desc *desc) return cookie; } +static void dwc_initialize(struct dw_dma_chan *dwc) +{ + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + struct dw_dma_slave *dws = dwc->chan.private; + u32 cfghi = DWC_CFGH_FIFO_MODE; + u32 cfglo = DWC_CFGL_CH_PRIOR(dwc->priority); + + if (dwc->initialized == true) + return; + + if (dws) { + /* + * We need controller-specific data to set up slave + * transfers. + */ + BUG_ON(!dws->dma_dev || dws->dma_dev != dw->dma.dev); + + cfghi = dws->cfg_hi; + cfglo |= dws->cfg_lo & ~DWC_CFGL_CH_PRIOR_MASK; + } + + channel_writel(dwc, CFG_LO, cfglo); + channel_writel(dwc, CFG_HI, cfghi); + + /* Enable interrupts */ + channel_set_bit(dw, MASK.XFER, dwc->mask); + channel_set_bit(dw, MASK.BLOCK, dwc->mask); + channel_set_bit(dw, MASK.ERROR, dwc->mask); + + dwc->initialized = true; +} + /*----------------------------------------------------------------------*/ /* Called with dwc->lock held and bh disabled */ @@ -189,6 +221,8 @@ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first) return; } + dwc_initialize(dwc); + channel_writel(dwc, LLP, first->txd.phys); channel_writel(dwc, CTL_LO, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN); @@ -696,7 +730,7 @@ err_desc_get: static struct dma_async_tx_descriptor * dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); @@ -720,7 +754,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, prev = first = NULL; switch (direction) { - case DMA_TO_DEVICE: + case DMA_MEM_TO_DEV: ctllo = (DWC_DEFAULT_CTLLO(chan->private) | DWC_CTLL_DST_WIDTH(reg_width) | DWC_CTLL_DST_FIX @@ -777,7 +811,7 @@ slave_sg_todev_fill_desc: goto slave_sg_todev_fill_desc; } break; - case DMA_FROM_DEVICE: + case DMA_DEV_TO_MEM: ctllo = (DWC_DEFAULT_CTLLO(chan->private) | DWC_CTLL_SRC_WIDTH(reg_width) | DWC_CTLL_DST_INC @@ -959,10 +993,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) struct dw_dma_chan *dwc = to_dw_dma_chan(chan); struct dw_dma *dw = to_dw_dma(chan->device); struct dw_desc *desc; - struct dw_dma_slave *dws; int i; - u32 cfghi; - u32 cfglo; unsigned long flags; dev_vdbg(chan2dev(chan), "alloc_chan_resources\n"); @@ -975,26 +1006,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) dwc->completed = chan->cookie = 1; - cfghi = DWC_CFGH_FIFO_MODE; - cfglo = 0; - - dws = chan->private; - if (dws) { - /* - * We need controller-specific data to set up slave - * transfers. - */ - BUG_ON(!dws->dma_dev || dws->dma_dev != dw->dma.dev); - - cfghi = dws->cfg_hi; - cfglo = dws->cfg_lo & ~DWC_CFGL_CH_PRIOR_MASK; - } - - cfglo |= DWC_CFGL_CH_PRIOR(dwc->priority); - - channel_writel(dwc, CFG_LO, cfglo); - channel_writel(dwc, CFG_HI, cfghi); - /* * NOTE: some controllers may have additional features that we * need to initialize here, like "scatter-gather" (which @@ -1026,11 +1037,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) i = ++dwc->descs_allocated; } - /* Enable interrupts */ - channel_set_bit(dw, MASK.XFER, dwc->mask); - channel_set_bit(dw, MASK.BLOCK, dwc->mask); - channel_set_bit(dw, MASK.ERROR, dwc->mask); - spin_unlock_irqrestore(&dwc->lock, flags); dev_dbg(chan2dev(chan), @@ -1058,6 +1064,7 @@ static void dwc_free_chan_resources(struct dma_chan *chan) spin_lock_irqsave(&dwc->lock, flags); list_splice_init(&dwc->free_list, &list); dwc->descs_allocated = 0; + dwc->initialized = false; /* Disable interrupts */ channel_clear_bit(dw, MASK.XFER, dwc->mask); @@ -1165,7 +1172,7 @@ EXPORT_SYMBOL(dw_dma_cyclic_stop); */ struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, size_t period_len, - enum dma_data_direction direction) + enum dma_transfer_direction direction) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); struct dw_cyclic_desc *cdesc; @@ -1206,7 +1213,7 @@ struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, goto out_err; if (unlikely(buf_addr & ((1 << reg_width) - 1))) goto out_err; - if (unlikely(!(direction & (DMA_TO_DEVICE | DMA_FROM_DEVICE)))) + if (unlikely(!(direction & (DMA_MEM_TO_DEV | DMA_DEV_TO_MEM)))) goto out_err; retval = ERR_PTR(-ENOMEM); @@ -1228,7 +1235,7 @@ struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, goto out_err_desc_get; switch (direction) { - case DMA_TO_DEVICE: + case DMA_MEM_TO_DEV: desc->lli.dar = dws->tx_reg; desc->lli.sar = buf_addr + (period_len * i); desc->lli.ctllo = (DWC_DEFAULT_CTLLO(chan->private) @@ -1239,7 +1246,7 @@ struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, | DWC_CTLL_FC(dws->fc) | DWC_CTLL_INT_EN); break; - case DMA_FROM_DEVICE: + case DMA_DEV_TO_MEM: desc->lli.dar = buf_addr + (period_len * i); desc->lli.sar = dws->rx_reg; desc->lli.ctllo = (DWC_DEFAULT_CTLLO(chan->private) @@ -1335,6 +1342,8 @@ EXPORT_SYMBOL(dw_dma_cyclic_free); static void dw_dma_off(struct dw_dma *dw) { + int i; + dma_writel(dw, CFG, 0); channel_clear_bit(dw, MASK.XFER, dw->all_chan_mask); @@ -1345,6 +1354,9 @@ static void dw_dma_off(struct dw_dma *dw) while (dma_readl(dw, CFG) & DW_CFG_DMA_EN) cpu_relax(); + + for (i = 0; i < dw->dma.chancnt; i++) + dw->chan[i].initialized = false; } static int __init dw_probe(struct platform_device *pdev) @@ -1533,6 +1545,7 @@ static int dw_suspend_noirq(struct device *dev) dw_dma_off(platform_get_drvdata(pdev)); clk_disable(dw->clk); + return 0; } diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h index c3419518d70..5eef6946a36 100644 --- a/drivers/dma/dw_dmac_regs.h +++ b/drivers/dma/dw_dmac_regs.h @@ -140,6 +140,7 @@ struct dw_dma_chan { u8 mask; u8 priority; bool paused; + bool initialized; spinlock_t lock; diff --git a/drivers/dma/ep93xx_dma.c b/drivers/dma/ep93xx_dma.c index b47e2b803fa..59e7a965772 100644 --- a/drivers/dma/ep93xx_dma.c +++ b/drivers/dma/ep93xx_dma.c @@ -246,6 +246,9 @@ static void ep93xx_dma_set_active(struct ep93xx_dma_chan *edmac, static struct ep93xx_dma_desc * ep93xx_dma_get_active(struct ep93xx_dma_chan *edmac) { + if (list_empty(&edmac->active)) + return NULL; + return list_first_entry(&edmac->active, struct ep93xx_dma_desc, node); } @@ -263,16 +266,22 @@ ep93xx_dma_get_active(struct ep93xx_dma_chan *edmac) */ static bool ep93xx_dma_advance_active(struct ep93xx_dma_chan *edmac) { + struct ep93xx_dma_desc *desc; + list_rotate_left(&edmac->active); if (test_bit(EP93XX_DMA_IS_CYCLIC, &edmac->flags)) return true; + desc = ep93xx_dma_get_active(edmac); + if (!desc) + return false; + /* * If txd.cookie is set it means that we are back in the first * descriptor in the chain and hence done with it. */ - return !ep93xx_dma_get_active(edmac)->txd.cookie; + return !desc->txd.cookie; } /* @@ -327,10 +336,16 @@ static void m2p_hw_shutdown(struct ep93xx_dma_chan *edmac) static void m2p_fill_desc(struct ep93xx_dma_chan *edmac) { - struct ep93xx_dma_desc *desc = ep93xx_dma_get_active(edmac); + struct ep93xx_dma_desc *desc; u32 bus_addr; - if (ep93xx_dma_chan_direction(&edmac->chan) == DMA_TO_DEVICE) + desc = ep93xx_dma_get_active(edmac); + if (!desc) { + dev_warn(chan2dev(edmac), "M2P: empty descriptor list\n"); + return; + } + + if (ep93xx_dma_chan_direction(&edmac->chan) == DMA_MEM_TO_DEV) bus_addr = desc->src_addr; else bus_addr = desc->dst_addr; @@ -443,7 +458,7 @@ static int m2m_hw_setup(struct ep93xx_dma_chan *edmac) control = (5 << M2M_CONTROL_PWSC_SHIFT); control |= M2M_CONTROL_NO_HDSK; - if (data->direction == DMA_TO_DEVICE) { + if (data->direction == DMA_MEM_TO_DEV) { control |= M2M_CONTROL_DAH; control |= M2M_CONTROL_TM_TX; control |= M2M_CONTROL_RSS_SSPTX; @@ -459,11 +474,7 @@ static int m2m_hw_setup(struct ep93xx_dma_chan *edmac) * This IDE part is totally untested. Values below are taken * from the EP93xx Users's Guide and might not be correct. */ - control |= M2M_CONTROL_NO_HDSK; - control |= M2M_CONTROL_RSS_IDE; - control |= M2M_CONTROL_PW_16; - - if (data->direction == DMA_TO_DEVICE) { + if (data->direction == DMA_MEM_TO_DEV) { /* Worst case from the UG */ control = (3 << M2M_CONTROL_PWSC_SHIFT); control |= M2M_CONTROL_DAH; @@ -473,6 +484,10 @@ static int m2m_hw_setup(struct ep93xx_dma_chan *edmac) control |= M2M_CONTROL_SAH; control |= M2M_CONTROL_TM_RX; } + + control |= M2M_CONTROL_NO_HDSK; + control |= M2M_CONTROL_RSS_IDE; + control |= M2M_CONTROL_PW_16; break; default: @@ -491,7 +506,13 @@ static void m2m_hw_shutdown(struct ep93xx_dma_chan *edmac) static void m2m_fill_desc(struct ep93xx_dma_chan *edmac) { - struct ep93xx_dma_desc *desc = ep93xx_dma_get_active(edmac); + struct ep93xx_dma_desc *desc; + + desc = ep93xx_dma_get_active(edmac); + if (!desc) { + dev_warn(chan2dev(edmac), "M2M: empty descriptor list\n"); + return; + } if (edmac->buffer == 0) { writel(desc->src_addr, edmac->regs + M2M_SAR_BASE0); @@ -669,24 +690,30 @@ static void ep93xx_dma_tasklet(unsigned long data) { struct ep93xx_dma_chan *edmac = (struct ep93xx_dma_chan *)data; struct ep93xx_dma_desc *desc, *d; - dma_async_tx_callback callback; - void *callback_param; + dma_async_tx_callback callback = NULL; + void *callback_param = NULL; LIST_HEAD(list); spin_lock_irq(&edmac->lock); + /* + * If dma_terminate_all() was called before we get to run, the active + * list has become empty. If that happens we aren't supposed to do + * anything more than call ep93xx_dma_advance_work(). + */ desc = ep93xx_dma_get_active(edmac); - if (desc->complete) { - edmac->last_completed = desc->txd.cookie; - list_splice_init(&edmac->active, &list); + if (desc) { + if (desc->complete) { + edmac->last_completed = desc->txd.cookie; + list_splice_init(&edmac->active, &list); + } + callback = desc->txd.callback; + callback_param = desc->txd.callback_param; } spin_unlock_irq(&edmac->lock); /* Pick up the next descriptor from the queue */ ep93xx_dma_advance_work(edmac); - callback = desc->txd.callback; - callback_param = desc->txd.callback_param; - /* Now we can release all the chained descriptors */ list_for_each_entry_safe(desc, d, &list, node) { /* @@ -706,13 +733,22 @@ static void ep93xx_dma_tasklet(unsigned long data) static irqreturn_t ep93xx_dma_interrupt(int irq, void *dev_id) { struct ep93xx_dma_chan *edmac = dev_id; + struct ep93xx_dma_desc *desc; irqreturn_t ret = IRQ_HANDLED; spin_lock(&edmac->lock); + desc = ep93xx_dma_get_active(edmac); + if (!desc) { + dev_warn(chan2dev(edmac), + "got interrupt while active list is empty\n"); + spin_unlock(&edmac->lock); + return IRQ_NONE; + } + switch (edmac->edma->hw_interrupt(edmac)) { case INTERRUPT_DONE: - ep93xx_dma_get_active(edmac)->complete = true; + desc->complete = true; tasklet_schedule(&edmac->tasklet); break; @@ -803,8 +839,8 @@ static int ep93xx_dma_alloc_chan_resources(struct dma_chan *chan) switch (data->port) { case EP93XX_DMA_SSP: case EP93XX_DMA_IDE: - if (data->direction != DMA_TO_DEVICE && - data->direction != DMA_FROM_DEVICE) + if (data->direction != DMA_MEM_TO_DEV && + data->direction != DMA_DEV_TO_MEM) return -EINVAL; break; default: @@ -952,7 +988,7 @@ fail: */ static struct dma_async_tx_descriptor * ep93xx_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction dir, + unsigned int sg_len, enum dma_transfer_direction dir, unsigned long flags) { struct ep93xx_dma_chan *edmac = to_ep93xx_dma_chan(chan); @@ -988,7 +1024,7 @@ ep93xx_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, goto fail; } - if (dir == DMA_TO_DEVICE) { + if (dir == DMA_MEM_TO_DEV) { desc->src_addr = sg_dma_address(sg); desc->dst_addr = edmac->runtime_addr; } else { @@ -1032,7 +1068,7 @@ fail: static struct dma_async_tx_descriptor * ep93xx_dma_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len, size_t period_len, - enum dma_data_direction dir) + enum dma_transfer_direction dir) { struct ep93xx_dma_chan *edmac = to_ep93xx_dma_chan(chan); struct ep93xx_dma_desc *desc, *first; @@ -1065,7 +1101,7 @@ ep93xx_dma_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr, goto fail; } - if (dir == DMA_TO_DEVICE) { + if (dir == DMA_MEM_TO_DEV) { desc->src_addr = dma_addr + offset; desc->dst_addr = edmac->runtime_addr; } else { @@ -1133,12 +1169,12 @@ static int ep93xx_dma_slave_config(struct ep93xx_dma_chan *edmac, return -EINVAL; switch (config->direction) { - case DMA_FROM_DEVICE: + case DMA_DEV_TO_MEM: width = config->src_addr_width; addr = config->src_addr; break; - case DMA_TO_DEVICE: + case DMA_MEM_TO_DEV: width = config->dst_addr_width; addr = config->dst_addr; break; diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 8a781540590..b98070c33ca 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -772,7 +772,7 @@ fail: */ static struct dma_async_tx_descriptor *fsl_dma_prep_slave_sg( struct dma_chan *dchan, struct scatterlist *sgl, unsigned int sg_len, - enum dma_data_direction direction, unsigned long flags) + enum dma_transfer_direction direction, unsigned long flags) { /* * This operation is not supported on the Freescale DMA controller @@ -819,7 +819,7 @@ static int fsl_dma_device_control(struct dma_chan *dchan, return -ENXIO; /* we set the controller burst size depending on direction */ - if (config->direction == DMA_TO_DEVICE) + if (config->direction == DMA_MEM_TO_DEV) size = config->dst_addr_width * config->dst_maxburst; else size = config->src_addr_width * config->src_maxburst; diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c index 4be55f9bb6c..e4383ee2c9a 100644 --- a/drivers/dma/imx-dma.c +++ b/drivers/dma/imx-dma.c @@ -107,7 +107,7 @@ static int imxdma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, imx_dma_disable(imxdmac->imxdma_channel); return 0; case DMA_SLAVE_CONFIG: - if (dmaengine_cfg->direction == DMA_FROM_DEVICE) { + if (dmaengine_cfg->direction == DMA_DEV_TO_MEM) { imxdmac->per_address = dmaengine_cfg->src_addr; imxdmac->watermark_level = dmaengine_cfg->src_maxburst; imxdmac->word_size = dmaengine_cfg->src_addr_width; @@ -224,7 +224,7 @@ static void imxdma_free_chan_resources(struct dma_chan *chan) static struct dma_async_tx_descriptor *imxdma_prep_slave_sg( struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags) { struct imxdma_channel *imxdmac = to_imxdma_chan(chan); @@ -241,7 +241,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_slave_sg( dma_length += sg->length; } - if (direction == DMA_FROM_DEVICE) + if (direction == DMA_DEV_TO_MEM) dmamode = DMA_MODE_READ; else dmamode = DMA_MODE_WRITE; @@ -271,7 +271,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_slave_sg( static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic( struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len, - size_t period_len, enum dma_data_direction direction) + size_t period_len, enum dma_transfer_direction direction) { struct imxdma_channel *imxdmac = to_imxdma_chan(chan); struct imxdma_engine *imxdma = imxdmac->imxdma; @@ -317,7 +317,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic( imxdmac->sg_list[periods].page_link = ((unsigned long)imxdmac->sg_list | 0x01) & ~0x02; - if (direction == DMA_FROM_DEVICE) + if (direction == DMA_DEV_TO_MEM) dmamode = DMA_MODE_READ; else dmamode = DMA_MODE_WRITE; diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index f993955a640..a8af379680c 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -247,7 +247,7 @@ struct sdma_engine; struct sdma_channel { struct sdma_engine *sdma; unsigned int channel; - enum dma_data_direction direction; + enum dma_transfer_direction direction; enum sdma_peripheral_type peripheral_type; unsigned int event_id0; unsigned int event_id1; @@ -268,6 +268,8 @@ struct sdma_channel { struct dma_async_tx_descriptor desc; dma_cookie_t last_completed; enum dma_status status; + unsigned int chn_count; + unsigned int chn_real_count; }; #define IMX_DMA_SG_LOOP (1 << 0) @@ -503,6 +505,7 @@ static void mxc_sdma_handle_channel_normal(struct sdma_channel *sdmac) struct sdma_buffer_descriptor *bd; int i, error = 0; + sdmac->chn_real_count = 0; /* * non loop mode. Iterate over all descriptors, collect * errors and call callback function @@ -512,6 +515,7 @@ static void mxc_sdma_handle_channel_normal(struct sdma_channel *sdmac) if (bd->mode.status & (BD_DONE | BD_RROR)) error = -EIO; + sdmac->chn_real_count += bd->mode.count; } if (error) @@ -519,9 +523,9 @@ static void mxc_sdma_handle_channel_normal(struct sdma_channel *sdmac) else sdmac->status = DMA_SUCCESS; + sdmac->last_completed = sdmac->desc.cookie; if (sdmac->desc.callback) sdmac->desc.callback(sdmac->desc.callback_param); - sdmac->last_completed = sdmac->desc.cookie; } static void mxc_sdma_handle_channel(struct sdma_channel *sdmac) @@ -650,7 +654,7 @@ static int sdma_load_context(struct sdma_channel *sdmac) struct sdma_buffer_descriptor *bd0 = sdma->channel[0].bd; int ret; - if (sdmac->direction == DMA_FROM_DEVICE) { + if (sdmac->direction == DMA_DEV_TO_MEM) { load_address = sdmac->pc_from_device; } else { load_address = sdmac->pc_to_device; @@ -832,17 +836,18 @@ static struct sdma_channel *to_sdma_chan(struct dma_chan *chan) static dma_cookie_t sdma_tx_submit(struct dma_async_tx_descriptor *tx) { + unsigned long flags; struct sdma_channel *sdmac = to_sdma_chan(tx->chan); struct sdma_engine *sdma = sdmac->sdma; dma_cookie_t cookie; - spin_lock_irq(&sdmac->lock); + spin_lock_irqsave(&sdmac->lock, flags); cookie = sdma_assign_cookie(sdmac); sdma_enable_channel(sdma, sdmac->channel); - spin_unlock_irq(&sdmac->lock); + spin_unlock_irqrestore(&sdmac->lock, flags); return cookie; } @@ -911,7 +916,7 @@ static void sdma_free_chan_resources(struct dma_chan *chan) static struct dma_async_tx_descriptor *sdma_prep_slave_sg( struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags) { struct sdma_channel *sdmac = to_sdma_chan(chan); @@ -941,6 +946,7 @@ static struct dma_async_tx_descriptor *sdma_prep_slave_sg( goto err_out; } + sdmac->chn_count = 0; for_each_sg(sgl, sg, sg_len, i) { struct sdma_buffer_descriptor *bd = &sdmac->bd[i]; int param; @@ -957,6 +963,7 @@ static struct dma_async_tx_descriptor *sdma_prep_slave_sg( } bd->mode.count = count; + sdmac->chn_count += count; if (sdmac->word_size > DMA_SLAVE_BUSWIDTH_4_BYTES) { ret = -EINVAL; @@ -1008,7 +1015,7 @@ err_out: static struct dma_async_tx_descriptor *sdma_prep_dma_cyclic( struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len, - size_t period_len, enum dma_data_direction direction) + size_t period_len, enum dma_transfer_direction direction) { struct sdma_channel *sdmac = to_sdma_chan(chan); struct sdma_engine *sdma = sdmac->sdma; @@ -1093,7 +1100,7 @@ static int sdma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, sdma_disable_channel(sdmac); return 0; case DMA_SLAVE_CONFIG: - if (dmaengine_cfg->direction == DMA_FROM_DEVICE) { + if (dmaengine_cfg->direction == DMA_DEV_TO_MEM) { sdmac->per_address = dmaengine_cfg->src_addr; sdmac->watermark_level = dmaengine_cfg->src_maxburst; sdmac->word_size = dmaengine_cfg->src_addr_width; @@ -1102,6 +1109,7 @@ static int sdma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, sdmac->watermark_level = dmaengine_cfg->dst_maxburst; sdmac->word_size = dmaengine_cfg->dst_addr_width; } + sdmac->direction = dmaengine_cfg->direction; return sdma_config_channel(sdmac); default: return -ENOSYS; @@ -1119,7 +1127,8 @@ static enum dma_status sdma_tx_status(struct dma_chan *chan, last_used = chan->cookie; - dma_set_tx_state(txstate, sdmac->last_completed, last_used, 0); + dma_set_tx_state(txstate, sdmac->last_completed, last_used, + sdmac->chn_count - sdmac->chn_real_count); return sdmac->status; } diff --git a/drivers/dma/intel_mid_dma.c b/drivers/dma/intel_mid_dma.c index 19a0c64d45d..74f70aadf9e 100644 --- a/drivers/dma/intel_mid_dma.c +++ b/drivers/dma/intel_mid_dma.c @@ -280,7 +280,8 @@ static void midc_dostart(struct intel_mid_dma_chan *midc, * callbacks but must be called with the lock held. */ static void midc_descriptor_complete(struct intel_mid_dma_chan *midc, - struct intel_mid_dma_desc *desc) + struct intel_mid_dma_desc *desc) + __releases(&midc->lock) __acquires(&midc->lock) { struct dma_async_tx_descriptor *txd = &desc->txd; dma_async_tx_callback callback_txd = NULL; @@ -311,6 +312,7 @@ static void midc_descriptor_complete(struct intel_mid_dma_chan *midc, pci_pool_free(desc->lli_pool, desc->lli, desc->lli_phys); pci_pool_destroy(desc->lli_pool); + desc->lli = NULL; } list_move(&desc->desc_node, &midc->free_list); midc->busy = false; @@ -395,10 +397,10 @@ static int midc_lli_fill_sg(struct intel_mid_dma_chan *midc, midc->dma->block_size); /*Populate SAR and DAR values*/ sg_phy_addr = sg_phys(sg); - if (desc->dirn == DMA_TO_DEVICE) { + if (desc->dirn == DMA_MEM_TO_DEV) { lli_bloc_desc->sar = sg_phy_addr; lli_bloc_desc->dar = mids->dma_slave.dst_addr; - } else if (desc->dirn == DMA_FROM_DEVICE) { + } else if (desc->dirn == DMA_DEV_TO_MEM) { lli_bloc_desc->sar = mids->dma_slave.src_addr; lli_bloc_desc->dar = sg_phy_addr; } @@ -490,7 +492,9 @@ static enum dma_status intel_mid_dma_tx_status(struct dma_chan *chan, ret = dma_async_is_complete(cookie, last_complete, last_used); if (ret != DMA_SUCCESS) { + spin_lock_bh(&midc->lock); midc_scan_descriptors(to_middma_device(chan->device), midc); + spin_unlock_bh(&midc->lock); last_complete = midc->completed; last_used = chan->cookie; @@ -566,6 +570,7 @@ static int intel_mid_dma_device_control(struct dma_chan *chan, pci_pool_free(desc->lli_pool, desc->lli, desc->lli_phys); pci_pool_destroy(desc->lli_pool); + desc->lli = NULL; } list_move(&desc->desc_node, &midc->free_list); } @@ -632,13 +637,13 @@ static struct dma_async_tx_descriptor *intel_mid_dma_prep_memcpy( if (midc->dma->pimr_mask) { cfg_hi.cfgx.protctl = 0x0; /*default value*/ cfg_hi.cfgx.fifo_mode = 1; - if (mids->dma_slave.direction == DMA_TO_DEVICE) { + if (mids->dma_slave.direction == DMA_MEM_TO_DEV) { cfg_hi.cfgx.src_per = 0; if (mids->device_instance == 0) cfg_hi.cfgx.dst_per = 3; if (mids->device_instance == 1) cfg_hi.cfgx.dst_per = 1; - } else if (mids->dma_slave.direction == DMA_FROM_DEVICE) { + } else if (mids->dma_slave.direction == DMA_DEV_TO_MEM) { if (mids->device_instance == 0) cfg_hi.cfgx.src_per = 2; if (mids->device_instance == 1) @@ -682,11 +687,11 @@ static struct dma_async_tx_descriptor *intel_mid_dma_prep_memcpy( ctl_lo.ctlx.sinc = 0; ctl_lo.ctlx.dinc = 0; } else { - if (mids->dma_slave.direction == DMA_TO_DEVICE) { + if (mids->dma_slave.direction == DMA_MEM_TO_DEV) { ctl_lo.ctlx.sinc = 0; ctl_lo.ctlx.dinc = 2; ctl_lo.ctlx.tt_fc = 1; - } else if (mids->dma_slave.direction == DMA_FROM_DEVICE) { + } else if (mids->dma_slave.direction == DMA_DEV_TO_MEM) { ctl_lo.ctlx.sinc = 2; ctl_lo.ctlx.dinc = 0; ctl_lo.ctlx.tt_fc = 2; @@ -732,7 +737,7 @@ err_desc_get: */ static struct dma_async_tx_descriptor *intel_mid_dma_prep_slave_sg( struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags) { struct intel_mid_dma_chan *midc = NULL; @@ -868,7 +873,7 @@ static int intel_mid_dma_alloc_chan_resources(struct dma_chan *chan) pm_runtime_get_sync(&mid->pdev->dev); if (mid->state == SUSPENDED) { - if (dma_resume(mid->pdev)) { + if (dma_resume(&mid->pdev->dev)) { pr_err("ERR_MDMA: resume failed"); return -EFAULT; } @@ -1099,7 +1104,8 @@ static int mid_setup_dma(struct pci_dev *pdev) LNW_PERIPHRAL_MASK_SIZE); if (dma->mask_reg == NULL) { pr_err("ERR_MDMA:Can't map periphral intr space !!\n"); - return -ENOMEM; + err = -ENOMEM; + goto err_ioremap; } } else dma->mask_reg = NULL; @@ -1196,6 +1202,9 @@ static int mid_setup_dma(struct pci_dev *pdev) err_engine: free_irq(pdev->irq, dma); err_irq: + if (dma->mask_reg) + iounmap(dma->mask_reg); +err_ioremap: pci_pool_destroy(dma->dma_pool); err_dma_pool: pr_err("ERR_MDMA:setup_dma failed: %d\n", err); @@ -1337,8 +1346,9 @@ static void __devexit intel_mid_dma_remove(struct pci_dev *pdev) * * This function is called by OS when a power event occurs */ -int dma_suspend(struct pci_dev *pci, pm_message_t state) +static int dma_suspend(struct device *dev) { + struct pci_dev *pci = to_pci_dev(dev); int i; struct middma_device *device = pci_get_drvdata(pci); pr_debug("MDMA: dma_suspend called\n"); @@ -1362,8 +1372,9 @@ int dma_suspend(struct pci_dev *pci, pm_message_t state) * * This function is called by OS when a power event occurs */ -int dma_resume(struct pci_dev *pci) +int dma_resume(struct device *dev) { + struct pci_dev *pci = to_pci_dev(dev); int ret; struct middma_device *device = pci_get_drvdata(pci); @@ -1429,6 +1440,8 @@ static const struct dev_pm_ops intel_mid_dma_pm = { .runtime_suspend = dma_runtime_suspend, .runtime_resume = dma_runtime_resume, .runtime_idle = dma_runtime_idle, + .suspend = dma_suspend, + .resume = dma_resume, }; static struct pci_driver intel_mid_dma_pci_driver = { @@ -1437,8 +1450,6 @@ static struct pci_driver intel_mid_dma_pci_driver = { .probe = intel_mid_dma_probe, .remove = __devexit_p(intel_mid_dma_remove), #ifdef CONFIG_PM - .suspend = dma_suspend, - .resume = dma_resume, .driver = { .pm = &intel_mid_dma_pm, }, diff --git a/drivers/dma/intel_mid_dma_regs.h b/drivers/dma/intel_mid_dma_regs.h index aea5ee88ce0..c83d35b97bd 100644 --- a/drivers/dma/intel_mid_dma_regs.h +++ b/drivers/dma/intel_mid_dma_regs.h @@ -262,7 +262,7 @@ struct intel_mid_dma_desc { unsigned int lli_length; unsigned int current_lli; dma_addr_t next; - enum dma_data_direction dirn; + enum dma_transfer_direction dirn; enum dma_status status; enum dma_slave_buswidth width; /*width of DMA txn*/ enum intel_mid_dma_mode cfg_mode; /*mode configuration*/ @@ -296,6 +296,6 @@ static inline struct intel_mid_dma_slave *to_intel_mid_dma_slave } -int dma_resume(struct pci_dev *pci); +int dma_resume(struct device *dev); #endif /*__INTEL_MID_DMAC_REGS_H__*/ diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index e03f811a83d..04be90b645b 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -1735,8 +1735,6 @@ static void iop_chan_start_null_xor(struct iop_adma_chan *iop_chan) spin_unlock_bh(&iop_chan->lock); } -MODULE_ALIAS("platform:iop-adma"); - static struct platform_driver iop_adma_driver = { .probe = iop_adma_probe, .remove = __devexit_p(iop_adma_remove), @@ -1746,19 +1744,9 @@ static struct platform_driver iop_adma_driver = { }, }; -static int __init iop_adma_init (void) -{ - return platform_driver_register(&iop_adma_driver); -} - -static void __exit iop_adma_exit (void) -{ - platform_driver_unregister(&iop_adma_driver); - return; -} -module_exit(iop_adma_exit); -module_init(iop_adma_init); +module_platform_driver(iop_adma_driver); MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("IOP ADMA Engine Driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:iop-adma"); diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c index 0e5ef33f90a..6212b16e8cf 100644 --- a/drivers/dma/ipu/ipu_idmac.c +++ b/drivers/dma/ipu/ipu_idmac.c @@ -312,7 +312,7 @@ static void ipu_ch_param_set_size(union chan_param_mem *params, case IPU_PIX_FMT_RGB565: params->ip.bpp = 2; params->ip.pfs = 4; - params->ip.npb = 7; + params->ip.npb = 15; params->ip.sat = 2; /* SAT = 32-bit access */ params->ip.ofs0 = 0; /* Red bit offset */ params->ip.ofs1 = 5; /* Green bit offset */ @@ -422,12 +422,6 @@ static void ipu_ch_param_set_size(union chan_param_mem *params, params->pp.nsb = 1; } -static void ipu_ch_param_set_burst_size(union chan_param_mem *params, - uint16_t burst_pixels) -{ - params->pp.npb = burst_pixels - 1; -} - static void ipu_ch_param_set_buffer(union chan_param_mem *params, dma_addr_t buf0, dma_addr_t buf1) { @@ -690,23 +684,6 @@ static int ipu_init_channel_buffer(struct idmac_channel *ichan, ipu_ch_param_set_size(¶ms, pixel_fmt, width, height, stride_bytes); ipu_ch_param_set_buffer(¶ms, phyaddr_0, phyaddr_1); ipu_ch_param_set_rotation(¶ms, rot_mode); - /* Some channels (rotation) have restriction on burst length */ - switch (channel) { - case IDMAC_IC_7: /* Hangs with burst 8, 16, other values - invalid - Table 44-30 */ -/* - ipu_ch_param_set_burst_size(¶ms, 8); - */ - break; - case IDMAC_SDC_0: - case IDMAC_SDC_1: - /* In original code only IPU_PIX_FMT_RGB565 was setting burst */ - ipu_ch_param_set_burst_size(¶ms, 16); - break; - case IDMAC_IC_0: - default: - break; - } spin_lock_irqsave(&ipu->lock, flags); @@ -1364,7 +1341,7 @@ static void ipu_gc_tasklet(unsigned long arg) /* Allocate and initialise a transfer descriptor. */ static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, - enum dma_data_direction direction, unsigned long tx_flags) + enum dma_transfer_direction direction, unsigned long tx_flags) { struct idmac_channel *ichan = to_idmac_chan(chan); struct idmac_tx_desc *desc = NULL; @@ -1376,7 +1353,7 @@ static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan chan->chan_id != IDMAC_IC_7) return NULL; - if (direction != DMA_FROM_DEVICE && direction != DMA_TO_DEVICE) { + if (direction != DMA_DEV_TO_MEM && direction != DMA_MEM_TO_DEV) { dev_err(chan->device->dev, "Invalid DMA direction %d!\n", direction); return NULL; } diff --git a/drivers/dma/mpc512x_dma.c b/drivers/dma/mpc512x_dma.c index 8ba4edc6185..4d6d4cf6694 100644 --- a/drivers/dma/mpc512x_dma.c +++ b/drivers/dma/mpc512x_dma.c @@ -835,17 +835,7 @@ static struct platform_driver mpc_dma_driver = { }, }; -static int __init mpc_dma_init(void) -{ - return platform_driver_register(&mpc_dma_driver); -} -module_init(mpc_dma_init); - -static void __exit mpc_dma_exit(void) -{ - platform_driver_unregister(&mpc_dma_driver); -} -module_exit(mpc_dma_exit); +module_platform_driver(mpc_dma_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Piotr Ziecik <kosmo@semihalf.com>"); diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c index fc903c0ed23..b06cd4ca626 100644 --- a/drivers/dma/mxs-dma.c +++ b/drivers/dma/mxs-dma.c @@ -44,7 +44,6 @@ #define HW_APBHX_CTRL0 0x000 #define BM_APBH_CTRL0_APB_BURST8_EN (1 << 29) #define BM_APBH_CTRL0_APB_BURST_EN (1 << 28) -#define BP_APBH_CTRL0_CLKGATE_CHANNEL 8 #define BP_APBH_CTRL0_RESET_CHANNEL 16 #define HW_APBHX_CTRL1 0x010 #define HW_APBHX_CTRL2 0x020 @@ -111,6 +110,7 @@ struct mxs_dma_chan { int chan_irq; struct mxs_dma_ccw *ccw; dma_addr_t ccw_phys; + int desc_count; dma_cookie_t last_completed; enum dma_status status; unsigned int flags; @@ -130,23 +130,6 @@ struct mxs_dma_engine { struct mxs_dma_chan mxs_chans[MXS_DMA_CHANNELS]; }; -static inline void mxs_dma_clkgate(struct mxs_dma_chan *mxs_chan, int enable) -{ - struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma; - int chan_id = mxs_chan->chan.chan_id; - int set_clr = enable ? MXS_CLR_ADDR : MXS_SET_ADDR; - - /* enable apbh channel clock */ - if (dma_is_apbh()) { - if (apbh_is_old()) - writel(1 << (chan_id + BP_APBH_CTRL0_CLKGATE_CHANNEL), - mxs_dma->base + HW_APBHX_CTRL0 + set_clr); - else - writel(1 << chan_id, - mxs_dma->base + HW_APBHX_CTRL0 + set_clr); - } -} - static void mxs_dma_reset_chan(struct mxs_dma_chan *mxs_chan) { struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma; @@ -165,9 +148,6 @@ static void mxs_dma_enable_chan(struct mxs_dma_chan *mxs_chan) struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma; int chan_id = mxs_chan->chan.chan_id; - /* clkgate needs to be enabled before writing other registers */ - mxs_dma_clkgate(mxs_chan, 1); - /* set cmd_addr up */ writel(mxs_chan->ccw_phys, mxs_dma->base + HW_APBHX_CHn_NXTCMDAR(chan_id)); @@ -178,9 +158,6 @@ static void mxs_dma_enable_chan(struct mxs_dma_chan *mxs_chan) static void mxs_dma_disable_chan(struct mxs_dma_chan *mxs_chan) { - /* disable apbh channel clock */ - mxs_dma_clkgate(mxs_chan, 0); - mxs_chan->status = DMA_SUCCESS; } @@ -268,7 +245,7 @@ static irqreturn_t mxs_dma_int_handler(int irq, void *dev_id) /* * When both completion and error of termination bits set at the * same time, we do not take it as an error. IOW, it only becomes - * an error we need to handler here in case of ether it's (1) an bus + * an error we need to handle here in case of either it's (1) a bus * error or (2) a termination error with no completion. */ stat2 = ((stat2 >> MXS_DMA_CHANNELS) & stat2) | /* (1) */ @@ -338,10 +315,7 @@ static int mxs_dma_alloc_chan_resources(struct dma_chan *chan) if (ret) goto err_clk; - /* clkgate needs to be enabled for reset to finish */ - mxs_dma_clkgate(mxs_chan, 1); mxs_dma_reset_chan(mxs_chan); - mxs_dma_clkgate(mxs_chan, 0); dma_async_tx_descriptor_init(&mxs_chan->desc, chan); mxs_chan->desc.tx_submit = mxs_dma_tx_submit; @@ -377,7 +351,7 @@ static void mxs_dma_free_chan_resources(struct dma_chan *chan) static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg( struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long append) { struct mxs_dma_chan *mxs_chan = to_mxs_dma_chan(chan); @@ -386,7 +360,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg( struct scatterlist *sg; int i, j; u32 *pio; - static int idx; + int idx = append ? mxs_chan->desc_count : 0; if (mxs_chan->status == DMA_IN_PROGRESS && !append) return NULL; @@ -417,7 +391,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg( idx = 0; } - if (direction == DMA_NONE) { + if (direction == DMA_TRANS_NONE) { ccw = &mxs_chan->ccw[idx++]; pio = (u32 *) sgl; @@ -450,7 +424,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg( ccw->bits |= CCW_CHAIN; ccw->bits |= CCW_HALT_ON_TERM; ccw->bits |= CCW_TERM_FLUSH; - ccw->bits |= BF_CCW(direction == DMA_FROM_DEVICE ? + ccw->bits |= BF_CCW(direction == DMA_DEV_TO_MEM ? MXS_DMA_CMD_WRITE : MXS_DMA_CMD_READ, COMMAND); @@ -462,6 +436,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg( } } } + mxs_chan->desc_count = idx; return &mxs_chan->desc; @@ -472,7 +447,7 @@ err_out: static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic( struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len, - size_t period_len, enum dma_data_direction direction) + size_t period_len, enum dma_transfer_direction direction) { struct mxs_dma_chan *mxs_chan = to_mxs_dma_chan(chan); struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma; @@ -515,7 +490,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic( ccw->bits |= CCW_IRQ; ccw->bits |= CCW_HALT_ON_TERM; ccw->bits |= CCW_TERM_FLUSH; - ccw->bits |= BF_CCW(direction == DMA_FROM_DEVICE ? + ccw->bits |= BF_CCW(direction == DMA_DEV_TO_MEM ? MXS_DMA_CMD_WRITE : MXS_DMA_CMD_READ, COMMAND); dma_addr += period_len; @@ -523,6 +498,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic( i++; } + mxs_chan->desc_count = i; return &mxs_chan->desc; @@ -539,8 +515,8 @@ static int mxs_dma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, switch (cmd) { case DMA_TERMINATE_ALL: - mxs_dma_disable_chan(mxs_chan); mxs_dma_reset_chan(mxs_chan); + mxs_dma_disable_chan(mxs_chan); break; case DMA_PAUSE: mxs_dma_pause_chan(mxs_chan); @@ -580,7 +556,7 @@ static int __init mxs_dma_init(struct mxs_dma_engine *mxs_dma) ret = clk_prepare_enable(mxs_dma->clk); if (ret) - goto err_out; + return ret; ret = mxs_reset_block(mxs_dma->base); if (ret) @@ -604,11 +580,8 @@ static int __init mxs_dma_init(struct mxs_dma_engine *mxs_dma) writel(MXS_DMA_CHANNELS_MASK << MXS_DMA_CHANNELS, mxs_dma->base + HW_APBHX_CTRL1 + MXS_SET_ADDR); - clk_disable_unprepare(mxs_dma->clk); - - return 0; - err_out: + clk_disable_unprepare(mxs_dma->clk); return ret; } diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c index a6d0e3dbed0..823f58179f9 100644 --- a/drivers/dma/pch_dma.c +++ b/drivers/dma/pch_dma.c @@ -1,7 +1,7 @@ /* * Topcliff PCH DMA controller driver * Copyright (c) 2010 Intel Corporation - * Copyright (C) 2011 OKI SEMICONDUCTOR CO., LTD. + * Copyright (C) 2011 LAPIS Semiconductor Co., Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -99,7 +99,7 @@ struct pch_dma_desc { struct pch_dma_chan { struct dma_chan chan; void __iomem *membase; - enum dma_data_direction dir; + enum dma_transfer_direction dir; struct tasklet_struct tasklet; unsigned long err_status; @@ -224,7 +224,7 @@ static void pdc_set_dir(struct dma_chan *chan) mask_ctl = DMA_MASK_CTL0_MODE & ~(DMA_CTL0_MODE_MASK_BITS << (DMA_CTL0_BITS_PER_CH * chan->chan_id)); val &= mask_mode; - if (pd_chan->dir == DMA_TO_DEVICE) + if (pd_chan->dir == DMA_MEM_TO_DEV) val |= 0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id + DMA_CTL0_DIR_SHIFT_BITS); else @@ -242,7 +242,7 @@ static void pdc_set_dir(struct dma_chan *chan) mask_ctl = DMA_MASK_CTL2_MODE & ~(DMA_CTL0_MODE_MASK_BITS << (DMA_CTL0_BITS_PER_CH * ch)); val &= mask_mode; - if (pd_chan->dir == DMA_TO_DEVICE) + if (pd_chan->dir == DMA_MEM_TO_DEV) val |= 0x1 << (DMA_CTL0_BITS_PER_CH * ch + DMA_CTL0_DIR_SHIFT_BITS); else @@ -607,7 +607,7 @@ static void pd_issue_pending(struct dma_chan *chan) static struct dma_async_tx_descriptor *pd_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, - enum dma_data_direction direction, unsigned long flags) + enum dma_transfer_direction direction, unsigned long flags) { struct pch_dma_chan *pd_chan = to_pd_chan(chan); struct pch_dma_slave *pd_slave = chan->private; @@ -623,9 +623,9 @@ static struct dma_async_tx_descriptor *pd_prep_slave_sg(struct dma_chan *chan, return NULL; } - if (direction == DMA_FROM_DEVICE) + if (direction == DMA_DEV_TO_MEM) reg = pd_slave->rx_reg; - else if (direction == DMA_TO_DEVICE) + else if (direction == DMA_MEM_TO_DEV) reg = pd_slave->tx_reg; else return NULL; @@ -1018,6 +1018,8 @@ static void __devexit pch_dma_remove(struct pci_dev *pdev) #define PCI_DEVICE_ID_ML7223_DMA2_4CH 0x800E #define PCI_DEVICE_ID_ML7223_DMA3_4CH 0x8017 #define PCI_DEVICE_ID_ML7223_DMA4_4CH 0x803B +#define PCI_DEVICE_ID_ML7831_DMA1_8CH 0x8810 +#define PCI_DEVICE_ID_ML7831_DMA2_4CH 0x8815 DEFINE_PCI_DEVICE_TABLE(pch_dma_id_table) = { { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_EG20T_PCH_DMA_8CH), 8 }, @@ -1030,6 +1032,8 @@ DEFINE_PCI_DEVICE_TABLE(pch_dma_id_table) = { { PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7223_DMA2_4CH), 4}, /* Video SPI */ { PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7223_DMA3_4CH), 4}, /* Security */ { PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7223_DMA4_4CH), 4}, /* FPGA */ + { PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7831_DMA1_8CH), 8}, /* UART */ + { PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7831_DMA2_4CH), 4}, /* SPI */ { 0, }, }; @@ -1057,7 +1061,7 @@ static void __exit pch_dma_exit(void) module_init(pch_dma_init); module_exit(pch_dma_exit); -MODULE_DESCRIPTION("Intel EG20T PCH / OKI SEMICONDUCTOR ML7213 IOH " +MODULE_DESCRIPTION("Intel EG20T PCH / LAPIS Semicon ML7213/ML7223/ML7831 IOH " "DMA controller driver"); MODULE_AUTHOR("Yong Wang <yong.y.wang@intel.com>"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 09adcfcd953..b8ec03ee8e2 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -350,14 +350,14 @@ static int pl330_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned case DMA_SLAVE_CONFIG: slave_config = (struct dma_slave_config *)arg; - if (slave_config->direction == DMA_TO_DEVICE) { + if (slave_config->direction == DMA_MEM_TO_DEV) { if (slave_config->dst_addr) pch->fifo_addr = slave_config->dst_addr; if (slave_config->dst_addr_width) pch->burst_sz = __ffs(slave_config->dst_addr_width); if (slave_config->dst_maxburst) pch->burst_len = slave_config->dst_maxburst; - } else if (slave_config->direction == DMA_FROM_DEVICE) { + } else if (slave_config->direction == DMA_DEV_TO_MEM) { if (slave_config->src_addr) pch->fifo_addr = slave_config->src_addr; if (slave_config->src_addr_width) @@ -621,7 +621,7 @@ static inline int get_burst_len(struct dma_pl330_desc *desc, size_t len) static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic( struct dma_chan *chan, dma_addr_t dma_addr, size_t len, - size_t period_len, enum dma_data_direction direction) + size_t period_len, enum dma_transfer_direction direction) { struct dma_pl330_desc *desc; struct dma_pl330_chan *pch = to_pchan(chan); @@ -636,14 +636,14 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic( } switch (direction) { - case DMA_TO_DEVICE: + case DMA_MEM_TO_DEV: desc->rqcfg.src_inc = 1; desc->rqcfg.dst_inc = 0; desc->req.rqtype = MEMTODEV; src = dma_addr; dst = pch->fifo_addr; break; - case DMA_FROM_DEVICE: + case DMA_DEV_TO_MEM: desc->rqcfg.src_inc = 0; desc->rqcfg.dst_inc = 1; desc->req.rqtype = DEVTOMEM; @@ -710,7 +710,7 @@ pl330_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dst, static struct dma_async_tx_descriptor * pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flg) { struct dma_pl330_desc *first, *desc = NULL; @@ -759,7 +759,7 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, else list_add_tail(&desc->node, &first->node); - if (direction == DMA_TO_DEVICE) { + if (direction == DMA_MEM_TO_DEV) { desc->rqcfg.src_inc = 1; desc->rqcfg.dst_inc = 0; desc->req.rqtype = MEMTODEV; @@ -834,17 +834,7 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) amba_set_drvdata(adev, pdmac); -#ifdef CONFIG_PM_RUNTIME - /* to use the runtime PM helper functions */ - pm_runtime_enable(&adev->dev); - - /* enable the power domain */ - if (pm_runtime_get_sync(&adev->dev)) { - dev_err(&adev->dev, "failed to get runtime pm\n"); - ret = -ENODEV; - goto probe_err1; - } -#else +#ifndef CONFIG_PM_RUNTIME /* enable dma clk */ clk_enable(pdmac->clk); #endif @@ -977,10 +967,7 @@ static int __devexit pl330_remove(struct amba_device *adev) res = &adev->res; release_mem_region(res->start, resource_size(res)); -#ifdef CONFIG_PM_RUNTIME - pm_runtime_put(&adev->dev); - pm_runtime_disable(&adev->dev); -#else +#ifndef CONFIG_PM_RUNTIME clk_disable(pdmac->clk); #endif diff --git a/drivers/dma/shdma.c b/drivers/dma/shdma.c index 81809c2b46a..54043cd831c 100644 --- a/drivers/dma/shdma.c +++ b/drivers/dma/shdma.c @@ -23,7 +23,6 @@ #include <linux/interrupt.h> #include <linux/dmaengine.h> #include <linux/delay.h> -#include <linux/dma-mapping.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> #include <linux/sh_dma.h> @@ -57,6 +56,15 @@ static LIST_HEAD(sh_dmae_devices); static unsigned long sh_dmae_slave_used[BITS_TO_LONGS(SH_DMA_SLAVE_NUMBER)]; static void sh_dmae_chan_ld_cleanup(struct sh_dmae_chan *sh_chan, bool all); +static void sh_chan_xfer_ld_queue(struct sh_dmae_chan *sh_chan); + +static void chclr_write(struct sh_dmae_chan *sh_dc, u32 data) +{ + struct sh_dmae_device *shdev = to_sh_dev(sh_dc); + + __raw_writel(data, shdev->chan_reg + + shdev->pdata->channel[sh_dc->id].chclr_offset); +} static void sh_dmae_writel(struct sh_dmae_chan *sh_dc, u32 data, u32 reg) { @@ -129,6 +137,15 @@ static int sh_dmae_rst(struct sh_dmae_device *shdev) dmaor = dmaor_read(shdev) & ~(DMAOR_NMIF | DMAOR_AE | DMAOR_DME); + if (shdev->pdata->chclr_present) { + int i; + for (i = 0; i < shdev->pdata->channel_num; i++) { + struct sh_dmae_chan *sh_chan = shdev->chan[i]; + if (sh_chan) + chclr_write(sh_chan, 0); + } + } + dmaor_write(shdev, dmaor | shdev->pdata->dmaor_init); dmaor = dmaor_read(shdev); @@ -139,6 +156,10 @@ static int sh_dmae_rst(struct sh_dmae_device *shdev) dev_warn(shdev->common.dev, "Can't initialize DMAOR.\n"); return -EIO; } + if (shdev->pdata->dmaor_init & ~dmaor) + dev_warn(shdev->common.dev, + "DMAOR=0x%x hasn't latched the initial value 0x%x.\n", + dmaor, shdev->pdata->dmaor_init); return 0; } @@ -259,8 +280,6 @@ static int dmae_set_dmars(struct sh_dmae_chan *sh_chan, u16 val) return 0; } -static void sh_chan_xfer_ld_queue(struct sh_dmae_chan *sh_chan); - static dma_cookie_t sh_dmae_tx_submit(struct dma_async_tx_descriptor *tx) { struct sh_desc *desc = tx_to_sh_desc(tx), *chunk, *last = desc, *c; @@ -340,6 +359,8 @@ static dma_cookie_t sh_dmae_tx_submit(struct dma_async_tx_descriptor *tx) sh_chan_xfer_ld_queue(sh_chan); sh_chan->pm_state = DMAE_PM_ESTABLISHED; } + } else { + sh_chan->pm_state = DMAE_PM_PENDING; } spin_unlock_irq(&sh_chan->desc_lock); @@ -479,19 +500,19 @@ static void sh_dmae_free_chan_resources(struct dma_chan *chan) * @sh_chan: DMA channel * @flags: DMA transfer flags * @dest: destination DMA address, incremented when direction equals - * DMA_FROM_DEVICE or DMA_BIDIRECTIONAL + * DMA_DEV_TO_MEM * @src: source DMA address, incremented when direction equals - * DMA_TO_DEVICE or DMA_BIDIRECTIONAL + * DMA_MEM_TO_DEV * @len: DMA transfer length * @first: if NULL, set to the current descriptor and cookie set to -EBUSY * @direction: needed for slave DMA to decide which address to keep constant, - * equals DMA_BIDIRECTIONAL for MEMCPY + * equals DMA_MEM_TO_MEM for MEMCPY * Returns 0 or an error * Locks: called with desc_lock held */ static struct sh_desc *sh_dmae_add_desc(struct sh_dmae_chan *sh_chan, unsigned long flags, dma_addr_t *dest, dma_addr_t *src, size_t *len, - struct sh_desc **first, enum dma_data_direction direction) + struct sh_desc **first, enum dma_transfer_direction direction) { struct sh_desc *new; size_t copy_size; @@ -531,9 +552,9 @@ static struct sh_desc *sh_dmae_add_desc(struct sh_dmae_chan *sh_chan, new->direction = direction; *len -= copy_size; - if (direction == DMA_BIDIRECTIONAL || direction == DMA_TO_DEVICE) + if (direction == DMA_MEM_TO_MEM || direction == DMA_MEM_TO_DEV) *src += copy_size; - if (direction == DMA_BIDIRECTIONAL || direction == DMA_FROM_DEVICE) + if (direction == DMA_MEM_TO_MEM || direction == DMA_DEV_TO_MEM) *dest += copy_size; return new; @@ -546,12 +567,12 @@ static struct sh_desc *sh_dmae_add_desc(struct sh_dmae_chan *sh_chan, * converted to scatter-gather to guarantee consistent locking and a correct * list manipulation. For slave DMA direction carries the usual meaning, and, * logically, the SG list is RAM and the addr variable contains slave address, - * e.g., the FIFO I/O register. For MEMCPY direction equals DMA_BIDIRECTIONAL + * e.g., the FIFO I/O register. For MEMCPY direction equals DMA_MEM_TO_MEM * and the SG list contains only one element and points at the source buffer. */ static struct dma_async_tx_descriptor *sh_dmae_prep_sg(struct sh_dmae_chan *sh_chan, struct scatterlist *sgl, unsigned int sg_len, dma_addr_t *addr, - enum dma_data_direction direction, unsigned long flags) + enum dma_transfer_direction direction, unsigned long flags) { struct scatterlist *sg; struct sh_desc *first = NULL, *new = NULL /* compiler... */; @@ -592,7 +613,7 @@ static struct dma_async_tx_descriptor *sh_dmae_prep_sg(struct sh_dmae_chan *sh_c dev_dbg(sh_chan->dev, "Add SG #%d@%p[%d], dma %llx\n", i, sg, len, (unsigned long long)sg_addr); - if (direction == DMA_FROM_DEVICE) + if (direction == DMA_DEV_TO_MEM) new = sh_dmae_add_desc(sh_chan, flags, &sg_addr, addr, &len, &first, direction); @@ -646,13 +667,13 @@ static struct dma_async_tx_descriptor *sh_dmae_prep_memcpy( sg_dma_address(&sg) = dma_src; sg_dma_len(&sg) = len; - return sh_dmae_prep_sg(sh_chan, &sg, 1, &dma_dest, DMA_BIDIRECTIONAL, + return sh_dmae_prep_sg(sh_chan, &sg, 1, &dma_dest, DMA_MEM_TO_MEM, flags); } static struct dma_async_tx_descriptor *sh_dmae_prep_slave_sg( struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, - enum dma_data_direction direction, unsigned long flags) + enum dma_transfer_direction direction, unsigned long flags) { struct sh_dmae_slave *param; struct sh_dmae_chan *sh_chan; @@ -996,7 +1017,7 @@ static void dmae_do_tasklet(unsigned long data) spin_lock_irq(&sh_chan->desc_lock); list_for_each_entry(desc, &sh_chan->ld_queue, node) { if (desc->mark == DESC_SUBMITTED && - ((desc->direction == DMA_FROM_DEVICE && + ((desc->direction == DMA_DEV_TO_MEM && (desc->hw.dar + desc->hw.tcr) == dar_buf) || (desc->hw.sar + desc->hw.tcr) == sar_buf)) { dev_dbg(sh_chan->dev, "done #%d@%p dst %u\n", @@ -1225,6 +1246,8 @@ static int __init sh_dmae_probe(struct platform_device *pdev) platform_set_drvdata(pdev, shdev); + shdev->common.dev = &pdev->dev; + pm_runtime_enable(&pdev->dev); pm_runtime_get_sync(&pdev->dev); @@ -1254,7 +1277,6 @@ static int __init sh_dmae_probe(struct platform_device *pdev) shdev->common.device_prep_slave_sg = sh_dmae_prep_slave_sg; shdev->common.device_control = sh_dmae_control; - shdev->common.dev = &pdev->dev; /* Default transfer size of 32 bytes requires 32-byte alignment */ shdev->common.copy_align = LOG2_DEFAULT_XFER_SIZE; @@ -1435,22 +1457,17 @@ static int sh_dmae_runtime_resume(struct device *dev) #ifdef CONFIG_PM static int sh_dmae_suspend(struct device *dev) { - struct sh_dmae_device *shdev = dev_get_drvdata(dev); - int i; - - for (i = 0; i < shdev->pdata->channel_num; i++) { - struct sh_dmae_chan *sh_chan = shdev->chan[i]; - if (sh_chan->descs_allocated) - sh_chan->pm_error = pm_runtime_put_sync(dev); - } - return 0; } static int sh_dmae_resume(struct device *dev) { struct sh_dmae_device *shdev = dev_get_drvdata(dev); - int i; + int i, ret; + + ret = sh_dmae_rst(shdev); + if (ret < 0) + dev_err(dev, "Failed to reset!\n"); for (i = 0; i < shdev->pdata->channel_num; i++) { struct sh_dmae_chan *sh_chan = shdev->chan[i]; @@ -1459,9 +1476,6 @@ static int sh_dmae_resume(struct device *dev) if (!sh_chan->descs_allocated) continue; - if (!sh_chan->pm_error) - pm_runtime_get_sync(dev); - if (param) { const struct sh_dmae_slave_config *cfg = param->config; dmae_set_dmars(sh_chan, cfg->mid_rid); diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c new file mode 100644 index 00000000000..2333810d168 --- /dev/null +++ b/drivers/dma/sirf-dma.c @@ -0,0 +1,707 @@ +/* + * DMA controller driver for CSR SiRFprimaII + * + * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company. + * + * Licensed under GPLv2 or later. + */ + +#include <linux/module.h> +#include <linux/dmaengine.h> +#include <linux/dma-mapping.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/of_irq.h> +#include <linux/of_address.h> +#include <linux/of_device.h> +#include <linux/of_platform.h> +#include <linux/sirfsoc_dma.h> + +#define SIRFSOC_DMA_DESCRIPTORS 16 +#define SIRFSOC_DMA_CHANNELS 16 + +#define SIRFSOC_DMA_CH_ADDR 0x00 +#define SIRFSOC_DMA_CH_XLEN 0x04 +#define SIRFSOC_DMA_CH_YLEN 0x08 +#define SIRFSOC_DMA_CH_CTRL 0x0C + +#define SIRFSOC_DMA_WIDTH_0 0x100 +#define SIRFSOC_DMA_CH_VALID 0x140 +#define SIRFSOC_DMA_CH_INT 0x144 +#define SIRFSOC_DMA_INT_EN 0x148 +#define SIRFSOC_DMA_CH_LOOP_CTRL 0x150 + +#define SIRFSOC_DMA_MODE_CTRL_BIT 4 +#define SIRFSOC_DMA_DIR_CTRL_BIT 5 + +/* xlen and dma_width register is in 4 bytes boundary */ +#define SIRFSOC_DMA_WORD_LEN 4 + +struct sirfsoc_dma_desc { + struct dma_async_tx_descriptor desc; + struct list_head node; + + /* SiRFprimaII 2D-DMA parameters */ + + int xlen; /* DMA xlen */ + int ylen; /* DMA ylen */ + int width; /* DMA width */ + int dir; + bool cyclic; /* is loop DMA? */ + u32 addr; /* DMA buffer address */ +}; + +struct sirfsoc_dma_chan { + struct dma_chan chan; + struct list_head free; + struct list_head prepared; + struct list_head queued; + struct list_head active; + struct list_head completed; + dma_cookie_t completed_cookie; + unsigned long happened_cyclic; + unsigned long completed_cyclic; + + /* Lock for this structure */ + spinlock_t lock; + + int mode; +}; + +struct sirfsoc_dma { + struct dma_device dma; + struct tasklet_struct tasklet; + struct sirfsoc_dma_chan channels[SIRFSOC_DMA_CHANNELS]; + void __iomem *base; + int irq; +}; + +#define DRV_NAME "sirfsoc_dma" + +/* Convert struct dma_chan to struct sirfsoc_dma_chan */ +static inline +struct sirfsoc_dma_chan *dma_chan_to_sirfsoc_dma_chan(struct dma_chan *c) +{ + return container_of(c, struct sirfsoc_dma_chan, chan); +} + +/* Convert struct dma_chan to struct sirfsoc_dma */ +static inline struct sirfsoc_dma *dma_chan_to_sirfsoc_dma(struct dma_chan *c) +{ + struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(c); + return container_of(schan, struct sirfsoc_dma, channels[c->chan_id]); +} + +/* Execute all queued DMA descriptors */ +static void sirfsoc_dma_execute(struct sirfsoc_dma_chan *schan) +{ + struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan); + int cid = schan->chan.chan_id; + struct sirfsoc_dma_desc *sdesc = NULL; + + /* + * lock has been held by functions calling this, so we don't hold + * lock again + */ + + sdesc = list_first_entry(&schan->queued, struct sirfsoc_dma_desc, + node); + /* Move the first queued descriptor to active list */ + list_move_tail(&schan->queued, &schan->active); + + /* Start the DMA transfer */ + writel_relaxed(sdesc->width, sdma->base + SIRFSOC_DMA_WIDTH_0 + + cid * 4); + writel_relaxed(cid | (schan->mode << SIRFSOC_DMA_MODE_CTRL_BIT) | + (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT), + sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL); + writel_relaxed(sdesc->xlen, sdma->base + cid * 0x10 + + SIRFSOC_DMA_CH_XLEN); + writel_relaxed(sdesc->ylen, sdma->base + cid * 0x10 + + SIRFSOC_DMA_CH_YLEN); + writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) | + (1 << cid), sdma->base + SIRFSOC_DMA_INT_EN); + + /* + * writel has an implict memory write barrier to make sure data is + * flushed into memory before starting DMA + */ + writel(sdesc->addr >> 2, sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR); + + if (sdesc->cyclic) { + writel((1 << cid) | 1 << (cid + 16) | + readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL), + sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL); + schan->happened_cyclic = schan->completed_cyclic = 0; + } +} + +/* Interrupt handler */ +static irqreturn_t sirfsoc_dma_irq(int irq, void *data) +{ + struct sirfsoc_dma *sdma = data; + struct sirfsoc_dma_chan *schan; + struct sirfsoc_dma_desc *sdesc = NULL; + u32 is; + int ch; + + is = readl(sdma->base + SIRFSOC_DMA_CH_INT); + while ((ch = fls(is) - 1) >= 0) { + is &= ~(1 << ch); + writel_relaxed(1 << ch, sdma->base + SIRFSOC_DMA_CH_INT); + schan = &sdma->channels[ch]; + + spin_lock(&schan->lock); + + sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc, + node); + if (!sdesc->cyclic) { + /* Execute queued descriptors */ + list_splice_tail_init(&schan->active, &schan->completed); + if (!list_empty(&schan->queued)) + sirfsoc_dma_execute(schan); + } else + schan->happened_cyclic++; + + spin_unlock(&schan->lock); + } + + /* Schedule tasklet */ + tasklet_schedule(&sdma->tasklet); + + return IRQ_HANDLED; +} + +/* process completed descriptors */ +static void sirfsoc_dma_process_completed(struct sirfsoc_dma *sdma) +{ + dma_cookie_t last_cookie = 0; + struct sirfsoc_dma_chan *schan; + struct sirfsoc_dma_desc *sdesc; + struct dma_async_tx_descriptor *desc; + unsigned long flags; + unsigned long happened_cyclic; + LIST_HEAD(list); + int i; + + for (i = 0; i < sdma->dma.chancnt; i++) { + schan = &sdma->channels[i]; + + /* Get all completed descriptors */ + spin_lock_irqsave(&schan->lock, flags); + if (!list_empty(&schan->completed)) { + list_splice_tail_init(&schan->completed, &list); + spin_unlock_irqrestore(&schan->lock, flags); + + /* Execute callbacks and run dependencies */ + list_for_each_entry(sdesc, &list, node) { + desc = &sdesc->desc; + + if (desc->callback) + desc->callback(desc->callback_param); + + last_cookie = desc->cookie; + dma_run_dependencies(desc); + } + + /* Free descriptors */ + spin_lock_irqsave(&schan->lock, flags); + list_splice_tail_init(&list, &schan->free); + schan->completed_cookie = last_cookie; + spin_unlock_irqrestore(&schan->lock, flags); + } else { + /* for cyclic channel, desc is always in active list */ + sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc, + node); + + if (!sdesc || (sdesc && !sdesc->cyclic)) { + /* without active cyclic DMA */ + spin_unlock_irqrestore(&schan->lock, flags); + continue; + } + + /* cyclic DMA */ + happened_cyclic = schan->happened_cyclic; + spin_unlock_irqrestore(&schan->lock, flags); + + desc = &sdesc->desc; + while (happened_cyclic != schan->completed_cyclic) { + if (desc->callback) + desc->callback(desc->callback_param); + schan->completed_cyclic++; + } + } + } +} + +/* DMA Tasklet */ +static void sirfsoc_dma_tasklet(unsigned long data) +{ + struct sirfsoc_dma *sdma = (void *)data; + + sirfsoc_dma_process_completed(sdma); +} + +/* Submit descriptor to hardware */ +static dma_cookie_t sirfsoc_dma_tx_submit(struct dma_async_tx_descriptor *txd) +{ + struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(txd->chan); + struct sirfsoc_dma_desc *sdesc; + unsigned long flags; + dma_cookie_t cookie; + + sdesc = container_of(txd, struct sirfsoc_dma_desc, desc); + + spin_lock_irqsave(&schan->lock, flags); + + /* Move descriptor to queue */ + list_move_tail(&sdesc->node, &schan->queued); + + /* Update cookie */ + cookie = schan->chan.cookie + 1; + if (cookie <= 0) + cookie = 1; + + schan->chan.cookie = cookie; + sdesc->desc.cookie = cookie; + + spin_unlock_irqrestore(&schan->lock, flags); + + return cookie; +} + +static int sirfsoc_dma_slave_config(struct sirfsoc_dma_chan *schan, + struct dma_slave_config *config) +{ + unsigned long flags; + + if ((config->src_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES) || + (config->dst_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES)) + return -EINVAL; + + spin_lock_irqsave(&schan->lock, flags); + schan->mode = (config->src_maxburst == 4 ? 1 : 0); + spin_unlock_irqrestore(&schan->lock, flags); + + return 0; +} + +static int sirfsoc_dma_terminate_all(struct sirfsoc_dma_chan *schan) +{ + struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan); + int cid = schan->chan.chan_id; + unsigned long flags; + + writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) & + ~(1 << cid), sdma->base + SIRFSOC_DMA_INT_EN); + writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID); + + writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL) + & ~((1 << cid) | 1 << (cid + 16)), + sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL); + + spin_lock_irqsave(&schan->lock, flags); + list_splice_tail_init(&schan->active, &schan->free); + list_splice_tail_init(&schan->queued, &schan->free); + spin_unlock_irqrestore(&schan->lock, flags); + + return 0; +} + +static int sirfsoc_dma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, + unsigned long arg) +{ + struct dma_slave_config *config; + struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan); + + switch (cmd) { + case DMA_TERMINATE_ALL: + return sirfsoc_dma_terminate_all(schan); + case DMA_SLAVE_CONFIG: + config = (struct dma_slave_config *)arg; + return sirfsoc_dma_slave_config(schan, config); + + default: + break; + } + + return -ENOSYS; +} + +/* Alloc channel resources */ +static int sirfsoc_dma_alloc_chan_resources(struct dma_chan *chan) +{ + struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan); + struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan); + struct sirfsoc_dma_desc *sdesc; + unsigned long flags; + LIST_HEAD(descs); + int i; + + /* Alloc descriptors for this channel */ + for (i = 0; i < SIRFSOC_DMA_DESCRIPTORS; i++) { + sdesc = kzalloc(sizeof(*sdesc), GFP_KERNEL); + if (!sdesc) { + dev_notice(sdma->dma.dev, "Memory allocation error. " + "Allocated only %u descriptors\n", i); + break; + } + + dma_async_tx_descriptor_init(&sdesc->desc, chan); + sdesc->desc.flags = DMA_CTRL_ACK; + sdesc->desc.tx_submit = sirfsoc_dma_tx_submit; + + list_add_tail(&sdesc->node, &descs); + } + + /* Return error only if no descriptors were allocated */ + if (i == 0) + return -ENOMEM; + + spin_lock_irqsave(&schan->lock, flags); + + list_splice_tail_init(&descs, &schan->free); + spin_unlock_irqrestore(&schan->lock, flags); + + return i; +} + +/* Free channel resources */ +static void sirfsoc_dma_free_chan_resources(struct dma_chan *chan) +{ + struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan); + struct sirfsoc_dma_desc *sdesc, *tmp; + unsigned long flags; + LIST_HEAD(descs); + + spin_lock_irqsave(&schan->lock, flags); + + /* Channel must be idle */ + BUG_ON(!list_empty(&schan->prepared)); + BUG_ON(!list_empty(&schan->queued)); + BUG_ON(!list_empty(&schan->active)); + BUG_ON(!list_empty(&schan->completed)); + + /* Move data */ + list_splice_tail_init(&schan->free, &descs); + + spin_unlock_irqrestore(&schan->lock, flags); + + /* Free descriptors */ + list_for_each_entry_safe(sdesc, tmp, &descs, node) + kfree(sdesc); +} + +/* Send pending descriptor to hardware */ +static void sirfsoc_dma_issue_pending(struct dma_chan *chan) +{ + struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan); + unsigned long flags; + + spin_lock_irqsave(&schan->lock, flags); + + if (list_empty(&schan->active) && !list_empty(&schan->queued)) + sirfsoc_dma_execute(schan); + + spin_unlock_irqrestore(&schan->lock, flags); +} + +/* Check request completion status */ +static enum dma_status +sirfsoc_dma_tx_status(struct dma_chan *chan, dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan); + unsigned long flags; + dma_cookie_t last_used; + dma_cookie_t last_complete; + + spin_lock_irqsave(&schan->lock, flags); + last_used = schan->chan.cookie; + last_complete = schan->completed_cookie; + spin_unlock_irqrestore(&schan->lock, flags); + + dma_set_tx_state(txstate, last_complete, last_used, 0); + return dma_async_is_complete(cookie, last_complete, last_used); +} + +static struct dma_async_tx_descriptor *sirfsoc_dma_prep_interleaved( + struct dma_chan *chan, struct dma_interleaved_template *xt, + unsigned long flags) +{ + struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan); + struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan); + struct sirfsoc_dma_desc *sdesc = NULL; + unsigned long iflags; + int ret; + + if ((xt->dir != DMA_MEM_TO_DEV) || (xt->dir != DMA_DEV_TO_MEM)) { + ret = -EINVAL; + goto err_dir; + } + + /* Get free descriptor */ + spin_lock_irqsave(&schan->lock, iflags); + if (!list_empty(&schan->free)) { + sdesc = list_first_entry(&schan->free, struct sirfsoc_dma_desc, + node); + list_del(&sdesc->node); + } + spin_unlock_irqrestore(&schan->lock, iflags); + + if (!sdesc) { + /* try to free completed descriptors */ + sirfsoc_dma_process_completed(sdma); + ret = 0; + goto no_desc; + } + + /* Place descriptor in prepared list */ + spin_lock_irqsave(&schan->lock, iflags); + + /* + * Number of chunks in a frame can only be 1 for prima2 + * and ylen (number of frame - 1) must be at least 0 + */ + if ((xt->frame_size == 1) && (xt->numf > 0)) { + sdesc->cyclic = 0; + sdesc->xlen = xt->sgl[0].size / SIRFSOC_DMA_WORD_LEN; + sdesc->width = (xt->sgl[0].size + xt->sgl[0].icg) / + SIRFSOC_DMA_WORD_LEN; + sdesc->ylen = xt->numf - 1; + if (xt->dir == DMA_MEM_TO_DEV) { + sdesc->addr = xt->src_start; + sdesc->dir = 1; + } else { + sdesc->addr = xt->dst_start; + sdesc->dir = 0; + } + + list_add_tail(&sdesc->node, &schan->prepared); + } else { + pr_err("sirfsoc DMA Invalid xfer\n"); + ret = -EINVAL; + goto err_xfer; + } + spin_unlock_irqrestore(&schan->lock, iflags); + + return &sdesc->desc; +err_xfer: + spin_unlock_irqrestore(&schan->lock, iflags); +no_desc: +err_dir: + return ERR_PTR(ret); +} + +static struct dma_async_tx_descriptor * +sirfsoc_dma_prep_cyclic(struct dma_chan *chan, dma_addr_t addr, + size_t buf_len, size_t period_len, + enum dma_transfer_direction direction) +{ + struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan); + struct sirfsoc_dma_desc *sdesc = NULL; + unsigned long iflags; + + /* + * we only support cycle transfer with 2 period + * If the X-length is set to 0, it would be the loop mode. + * The DMA address keeps increasing until reaching the end of a loop + * area whose size is defined by (DMA_WIDTH x (Y_LENGTH + 1)). Then + * the DMA address goes back to the beginning of this area. + * In loop mode, the DMA data region is divided into two parts, BUFA + * and BUFB. DMA controller generates interrupts twice in each loop: + * when the DMA address reaches the end of BUFA or the end of the + * BUFB + */ + if (buf_len != 2 * period_len) + return ERR_PTR(-EINVAL); + + /* Get free descriptor */ + spin_lock_irqsave(&schan->lock, iflags); + if (!list_empty(&schan->free)) { + sdesc = list_first_entry(&schan->free, struct sirfsoc_dma_desc, + node); + list_del(&sdesc->node); + } + spin_unlock_irqrestore(&schan->lock, iflags); + + if (!sdesc) + return 0; + + /* Place descriptor in prepared list */ + spin_lock_irqsave(&schan->lock, iflags); + sdesc->addr = addr; + sdesc->cyclic = 1; + sdesc->xlen = 0; + sdesc->ylen = buf_len / SIRFSOC_DMA_WORD_LEN - 1; + sdesc->width = 1; + list_add_tail(&sdesc->node, &schan->prepared); + spin_unlock_irqrestore(&schan->lock, iflags); + + return &sdesc->desc; +} + +/* + * The DMA controller consists of 16 independent DMA channels. + * Each channel is allocated to a different function + */ +bool sirfsoc_dma_filter_id(struct dma_chan *chan, void *chan_id) +{ + unsigned int ch_nr = (unsigned int) chan_id; + + if (ch_nr == chan->chan_id + + chan->device->dev_id * SIRFSOC_DMA_CHANNELS) + return true; + + return false; +} +EXPORT_SYMBOL(sirfsoc_dma_filter_id); + +static int __devinit sirfsoc_dma_probe(struct platform_device *op) +{ + struct device_node *dn = op->dev.of_node; + struct device *dev = &op->dev; + struct dma_device *dma; + struct sirfsoc_dma *sdma; + struct sirfsoc_dma_chan *schan; + struct resource res; + ulong regs_start, regs_size; + u32 id; + int ret, i; + + sdma = devm_kzalloc(dev, sizeof(*sdma), GFP_KERNEL); + if (!sdma) { + dev_err(dev, "Memory exhausted!\n"); + return -ENOMEM; + } + + if (of_property_read_u32(dn, "cell-index", &id)) { + dev_err(dev, "Fail to get DMAC index\n"); + ret = -ENODEV; + goto free_mem; + } + + sdma->irq = irq_of_parse_and_map(dn, 0); + if (sdma->irq == NO_IRQ) { + dev_err(dev, "Error mapping IRQ!\n"); + ret = -EINVAL; + goto free_mem; + } + + ret = of_address_to_resource(dn, 0, &res); + if (ret) { + dev_err(dev, "Error parsing memory region!\n"); + goto free_mem; + } + + regs_start = res.start; + regs_size = resource_size(&res); + + sdma->base = devm_ioremap(dev, regs_start, regs_size); + if (!sdma->base) { + dev_err(dev, "Error mapping memory region!\n"); + ret = -ENOMEM; + goto irq_dispose; + } + + ret = devm_request_irq(dev, sdma->irq, &sirfsoc_dma_irq, 0, DRV_NAME, + sdma); + if (ret) { + dev_err(dev, "Error requesting IRQ!\n"); + ret = -EINVAL; + goto unmap_mem; + } + + dma = &sdma->dma; + dma->dev = dev; + dma->chancnt = SIRFSOC_DMA_CHANNELS; + + dma->device_alloc_chan_resources = sirfsoc_dma_alloc_chan_resources; + dma->device_free_chan_resources = sirfsoc_dma_free_chan_resources; + dma->device_issue_pending = sirfsoc_dma_issue_pending; + dma->device_control = sirfsoc_dma_control; + dma->device_tx_status = sirfsoc_dma_tx_status; + dma->device_prep_interleaved_dma = sirfsoc_dma_prep_interleaved; + dma->device_prep_dma_cyclic = sirfsoc_dma_prep_cyclic; + + INIT_LIST_HEAD(&dma->channels); + dma_cap_set(DMA_SLAVE, dma->cap_mask); + dma_cap_set(DMA_CYCLIC, dma->cap_mask); + dma_cap_set(DMA_INTERLEAVE, dma->cap_mask); + dma_cap_set(DMA_PRIVATE, dma->cap_mask); + + for (i = 0; i < dma->chancnt; i++) { + schan = &sdma->channels[i]; + + schan->chan.device = dma; + schan->chan.cookie = 1; + schan->completed_cookie = schan->chan.cookie; + + INIT_LIST_HEAD(&schan->free); + INIT_LIST_HEAD(&schan->prepared); + INIT_LIST_HEAD(&schan->queued); + INIT_LIST_HEAD(&schan->active); + INIT_LIST_HEAD(&schan->completed); + + spin_lock_init(&schan->lock); + list_add_tail(&schan->chan.device_node, &dma->channels); + } + + tasklet_init(&sdma->tasklet, sirfsoc_dma_tasklet, (unsigned long)sdma); + + /* Register DMA engine */ + dev_set_drvdata(dev, sdma); + ret = dma_async_device_register(dma); + if (ret) + goto free_irq; + + dev_info(dev, "initialized SIRFSOC DMAC driver\n"); + + return 0; + +free_irq: + devm_free_irq(dev, sdma->irq, sdma); +irq_dispose: + irq_dispose_mapping(sdma->irq); +unmap_mem: + iounmap(sdma->base); +free_mem: + devm_kfree(dev, sdma); + return ret; +} + +static int __devexit sirfsoc_dma_remove(struct platform_device *op) +{ + struct device *dev = &op->dev; + struct sirfsoc_dma *sdma = dev_get_drvdata(dev); + + dma_async_device_unregister(&sdma->dma); + devm_free_irq(dev, sdma->irq, sdma); + irq_dispose_mapping(sdma->irq); + iounmap(sdma->base); + devm_kfree(dev, sdma); + return 0; +} + +static struct of_device_id sirfsoc_dma_match[] = { + { .compatible = "sirf,prima2-dmac", }, + {}, +}; + +static struct platform_driver sirfsoc_dma_driver = { + .probe = sirfsoc_dma_probe, + .remove = __devexit_p(sirfsoc_dma_remove), + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + .of_match_table = sirfsoc_dma_match, + }, +}; + +module_platform_driver(sirfsoc_dma_driver); + +MODULE_AUTHOR("Rongjun Ying <rongjun.ying@csr.com>, " + "Barry Song <baohua.song@csr.com>"); +MODULE_DESCRIPTION("SIRFSOC DMA control driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c index 13259cad0ce..cc5ecbc067a 100644 --- a/drivers/dma/ste_dma40.c +++ b/drivers/dma/ste_dma40.c @@ -14,6 +14,8 @@ #include <linux/platform_device.h> #include <linux/clk.h> #include <linux/delay.h> +#include <linux/pm.h> +#include <linux/pm_runtime.h> #include <linux/err.h> #include <linux/amba/bus.h> @@ -32,6 +34,9 @@ /* Maximum iterations taken before giving up suspending a channel */ #define D40_SUSPEND_MAX_IT 500 +/* Milliseconds */ +#define DMA40_AUTOSUSPEND_DELAY 100 + /* Hardware requirement on LCLA alignment */ #define LCLA_ALIGNMENT 0x40000 @@ -62,6 +67,55 @@ enum d40_command { D40_DMA_SUSPENDED = 3 }; +/* + * These are the registers that has to be saved and later restored + * when the DMA hw is powered off. + * TODO: Add save/restore of D40_DREG_GCC on dma40 v3 or later, if that works. + */ +static u32 d40_backup_regs[] = { + D40_DREG_LCPA, + D40_DREG_LCLA, + D40_DREG_PRMSE, + D40_DREG_PRMSO, + D40_DREG_PRMOE, + D40_DREG_PRMOO, +}; + +#define BACKUP_REGS_SZ ARRAY_SIZE(d40_backup_regs) + +/* TODO: Check if all these registers have to be saved/restored on dma40 v3 */ +static u32 d40_backup_regs_v3[] = { + D40_DREG_PSEG1, + D40_DREG_PSEG2, + D40_DREG_PSEG3, + D40_DREG_PSEG4, + D40_DREG_PCEG1, + D40_DREG_PCEG2, + D40_DREG_PCEG3, + D40_DREG_PCEG4, + D40_DREG_RSEG1, + D40_DREG_RSEG2, + D40_DREG_RSEG3, + D40_DREG_RSEG4, + D40_DREG_RCEG1, + D40_DREG_RCEG2, + D40_DREG_RCEG3, + D40_DREG_RCEG4, +}; + +#define BACKUP_REGS_SZ_V3 ARRAY_SIZE(d40_backup_regs_v3) + +static u32 d40_backup_regs_chan[] = { + D40_CHAN_REG_SSCFG, + D40_CHAN_REG_SSELT, + D40_CHAN_REG_SSPTR, + D40_CHAN_REG_SSLNK, + D40_CHAN_REG_SDCFG, + D40_CHAN_REG_SDELT, + D40_CHAN_REG_SDPTR, + D40_CHAN_REG_SDLNK, +}; + /** * struct d40_lli_pool - Structure for keeping LLIs in memory * @@ -96,7 +150,7 @@ struct d40_lli_pool { * during a transfer. * @node: List entry. * @is_in_client_list: true if the client owns this descriptor. - * the previous one. + * @cyclic: true if this is a cyclic job * * This descriptor is used for both logical and physical transfers. */ @@ -143,6 +197,7 @@ struct d40_lcla_pool { * channels. * * @lock: A lock protection this entity. + * @reserved: True if used by secure world or otherwise. * @num: The physical channel number of this entity. * @allocated_src: Bit mapped to show which src event line's are mapped to * this physical channel. Can also be free or physically allocated. @@ -152,6 +207,7 @@ struct d40_lcla_pool { */ struct d40_phy_res { spinlock_t lock; + bool reserved; int num; u32 allocated_src; u32 allocated_dst; @@ -185,7 +241,6 @@ struct d40_base; * @src_def_cfg: Default cfg register setting for src. * @dst_def_cfg: Default cfg register setting for dst. * @log_def: Default logical channel settings. - * @lcla: Space for one dst src pair for logical channel transfers. * @lcpa: Pointer to dst and src lcpa settings. * @runtime_addr: runtime configured address. * @runtime_direction: runtime configured direction. @@ -217,7 +272,7 @@ struct d40_chan { struct d40_log_lli_full *lcpa; /* Runtime reconfiguration */ dma_addr_t runtime_addr; - enum dma_data_direction runtime_direction; + enum dma_transfer_direction runtime_direction; }; /** @@ -241,6 +296,7 @@ struct d40_chan { * @dma_both: dma_device channels that can do both memcpy and slave transfers. * @dma_slave: dma_device channels that can do only do slave transfers. * @dma_memcpy: dma_device channels that can do only do memcpy transfers. + * @phy_chans: Room for all possible physical channels in system. * @log_chans: Room for all possible logical channels in system. * @lookup_log_chans: Used to map interrupt number to logical channel. Points * to log_chans entries. @@ -248,12 +304,20 @@ struct d40_chan { * to phy_chans entries. * @plat_data: Pointer to provided platform_data which is the driver * configuration. + * @lcpa_regulator: Pointer to hold the regulator for the esram bank for lcla. * @phy_res: Vector containing all physical channels. * @lcla_pool: lcla pool settings and data. * @lcpa_base: The virtual mapped address of LCPA. * @phy_lcpa: The physical address of the LCPA. * @lcpa_size: The size of the LCPA area. * @desc_slab: cache for descriptors. + * @reg_val_backup: Here the values of some hardware registers are stored + * before the DMA is powered off. They are restored when the power is back on. + * @reg_val_backup_v3: Backup of registers that only exits on dma40 v3 and + * later. + * @reg_val_backup_chan: Backup data for standard channel parameter registers. + * @gcc_pwr_off_mask: Mask to maintain the channels that can be turned off. + * @initialized: true if the dma has been initialized */ struct d40_base { spinlock_t interrupt_lock; @@ -275,6 +339,7 @@ struct d40_base { struct d40_chan **lookup_log_chans; struct d40_chan **lookup_phy_chans; struct stedma40_platform_data *plat_data; + struct regulator *lcpa_regulator; /* Physical half channels */ struct d40_phy_res *phy_res; struct d40_lcla_pool lcla_pool; @@ -282,6 +347,11 @@ struct d40_base { dma_addr_t phy_lcpa; resource_size_t lcpa_size; struct kmem_cache *desc_slab; + u32 reg_val_backup[BACKUP_REGS_SZ]; + u32 reg_val_backup_v3[BACKUP_REGS_SZ_V3]; + u32 *reg_val_backup_chan; + u16 gcc_pwr_off_mask; + bool initialized; }; /** @@ -479,13 +549,14 @@ static struct d40_desc *d40_desc_get(struct d40_chan *d40c) struct d40_desc *d; struct d40_desc *_d; - list_for_each_entry_safe(d, _d, &d40c->client, node) + list_for_each_entry_safe(d, _d, &d40c->client, node) { if (async_tx_test_ack(&d->txd)) { d40_desc_remove(d); desc = d; memset(desc, 0, sizeof(*desc)); break; } + } } if (!desc) @@ -536,6 +607,7 @@ static void d40_log_lli_to_lcxa(struct d40_chan *chan, struct d40_desc *desc) bool cyclic = desc->cyclic; int curr_lcla = -EINVAL; int first_lcla = 0; + bool use_esram_lcla = chan->base->plat_data->use_esram_lcla; bool linkback; /* @@ -608,11 +680,16 @@ static void d40_log_lli_to_lcxa(struct d40_chan *chan, struct d40_desc *desc) &lli->src[lli_current], next_lcla, flags); - dma_sync_single_range_for_device(chan->base->dev, - pool->dma_addr, lcla_offset, - 2 * sizeof(struct d40_log_lli), - DMA_TO_DEVICE); - + /* + * Cache maintenance is not needed if lcla is + * mapped in esram + */ + if (!use_esram_lcla) { + dma_sync_single_range_for_device(chan->base->dev, + pool->dma_addr, lcla_offset, + 2 * sizeof(struct d40_log_lli), + DMA_TO_DEVICE); + } curr_lcla = next_lcla; if (curr_lcla == -EINVAL || curr_lcla == first_lcla) { @@ -740,7 +817,61 @@ static int d40_sg_2_dmalen(struct scatterlist *sgl, int sg_len, return len; } -/* Support functions for logical channels */ + +#ifdef CONFIG_PM +static void dma40_backup(void __iomem *baseaddr, u32 *backup, + u32 *regaddr, int num, bool save) +{ + int i; + + for (i = 0; i < num; i++) { + void __iomem *addr = baseaddr + regaddr[i]; + + if (save) + backup[i] = readl_relaxed(addr); + else + writel_relaxed(backup[i], addr); + } +} + +static void d40_save_restore_registers(struct d40_base *base, bool save) +{ + int i; + + /* Save/Restore channel specific registers */ + for (i = 0; i < base->num_phy_chans; i++) { + void __iomem *addr; + int idx; + + if (base->phy_res[i].reserved) + continue; + + addr = base->virtbase + D40_DREG_PCBASE + i * D40_DREG_PCDELTA; + idx = i * ARRAY_SIZE(d40_backup_regs_chan); + + dma40_backup(addr, &base->reg_val_backup_chan[idx], + d40_backup_regs_chan, + ARRAY_SIZE(d40_backup_regs_chan), + save); + } + + /* Save/Restore global registers */ + dma40_backup(base->virtbase, base->reg_val_backup, + d40_backup_regs, ARRAY_SIZE(d40_backup_regs), + save); + + /* Save/Restore registers only existing on dma40 v3 and later */ + if (base->rev >= 3) + dma40_backup(base->virtbase, base->reg_val_backup_v3, + d40_backup_regs_v3, + ARRAY_SIZE(d40_backup_regs_v3), + save); +} +#else +static void d40_save_restore_registers(struct d40_base *base, bool save) +{ +} +#endif static int d40_channel_execute_command(struct d40_chan *d40c, enum d40_command command) @@ -973,6 +1104,10 @@ static void d40_config_write(struct d40_chan *d40c) /* Set LIDX for lcla */ writel(lidx, chanbase + D40_CHAN_REG_SSELT); writel(lidx, chanbase + D40_CHAN_REG_SDELT); + + /* Clear LNK which will be used by d40_chan_has_events() */ + writel(0, chanbase + D40_CHAN_REG_SSLNK); + writel(0, chanbase + D40_CHAN_REG_SDLNK); } } @@ -1013,6 +1148,7 @@ static int d40_pause(struct d40_chan *d40c) if (!d40c->busy) return 0; + pm_runtime_get_sync(d40c->base->dev); spin_lock_irqsave(&d40c->lock, flags); res = d40_channel_execute_command(d40c, D40_DMA_SUSPEND_REQ); @@ -1025,7 +1161,8 @@ static int d40_pause(struct d40_chan *d40c) D40_DMA_RUN); } } - + pm_runtime_mark_last_busy(d40c->base->dev); + pm_runtime_put_autosuspend(d40c->base->dev); spin_unlock_irqrestore(&d40c->lock, flags); return res; } @@ -1039,7 +1176,7 @@ static int d40_resume(struct d40_chan *d40c) return 0; spin_lock_irqsave(&d40c->lock, flags); - + pm_runtime_get_sync(d40c->base->dev); if (d40c->base->rev == 0) if (chan_is_logical(d40c)) { res = d40_channel_execute_command(d40c, @@ -1057,6 +1194,8 @@ static int d40_resume(struct d40_chan *d40c) } no_suspend: + pm_runtime_mark_last_busy(d40c->base->dev); + pm_runtime_put_autosuspend(d40c->base->dev); spin_unlock_irqrestore(&d40c->lock, flags); return res; } @@ -1129,7 +1268,10 @@ static struct d40_desc *d40_queue_start(struct d40_chan *d40c) d40d = d40_first_queued(d40c); if (d40d != NULL) { - d40c->busy = true; + if (!d40c->busy) + d40c->busy = true; + + pm_runtime_get_sync(d40c->base->dev); /* Remove from queue */ d40_desc_remove(d40d); @@ -1190,6 +1332,8 @@ static void dma_tc_handle(struct d40_chan *d40c) if (d40_queue_start(d40c) == NULL) d40c->busy = false; + pm_runtime_mark_last_busy(d40c->base->dev); + pm_runtime_put_autosuspend(d40c->base->dev); } d40c->pending_tx++; @@ -1405,11 +1549,16 @@ static int d40_validate_conf(struct d40_chan *d40c, return res; } -static bool d40_alloc_mask_set(struct d40_phy_res *phy, bool is_src, - int log_event_line, bool is_log) +static bool d40_alloc_mask_set(struct d40_phy_res *phy, + bool is_src, int log_event_line, bool is_log, + bool *first_user) { unsigned long flags; spin_lock_irqsave(&phy->lock, flags); + + *first_user = ((phy->allocated_src | phy->allocated_dst) + == D40_ALLOC_FREE); + if (!is_log) { /* Physical interrupts are masked per physical full channel */ if (phy->allocated_src == D40_ALLOC_FREE && @@ -1490,7 +1639,7 @@ out: return is_free; } -static int d40_allocate_channel(struct d40_chan *d40c) +static int d40_allocate_channel(struct d40_chan *d40c, bool *first_phy_user) { int dev_type; int event_group; @@ -1526,7 +1675,8 @@ static int d40_allocate_channel(struct d40_chan *d40c) for (i = 0; i < d40c->base->num_phy_chans; i++) { if (d40_alloc_mask_set(&phys[i], is_src, - 0, is_log)) + 0, is_log, + first_phy_user)) goto found_phy; } } else @@ -1536,7 +1686,8 @@ static int d40_allocate_channel(struct d40_chan *d40c) if (d40_alloc_mask_set(&phys[i], is_src, 0, - is_log)) + is_log, + first_phy_user)) goto found_phy; } } @@ -1552,6 +1703,25 @@ found_phy: /* Find logical channel */ for (j = 0; j < d40c->base->num_phy_chans; j += 8) { int phy_num = j + event_group * 2; + + if (d40c->dma_cfg.use_fixed_channel) { + i = d40c->dma_cfg.phy_channel; + + if ((i != phy_num) && (i != phy_num + 1)) { + dev_err(chan2dev(d40c), + "invalid fixed phy channel %d\n", i); + return -EINVAL; + } + + if (d40_alloc_mask_set(&phys[i], is_src, event_line, + is_log, first_phy_user)) + goto found_log; + + dev_err(chan2dev(d40c), + "could not allocate fixed phy channel %d\n", i); + return -EINVAL; + } + /* * Spread logical channels across all available physical rather * than pack every logical channel at the first available phy @@ -1560,13 +1730,15 @@ found_phy: if (is_src) { for (i = phy_num; i < phy_num + 2; i++) { if (d40_alloc_mask_set(&phys[i], is_src, - event_line, is_log)) + event_line, is_log, + first_phy_user)) goto found_log; } } else { for (i = phy_num + 1; i >= phy_num; i--) { if (d40_alloc_mask_set(&phys[i], is_src, - event_line, is_log)) + event_line, is_log, + first_phy_user)) goto found_log; } } @@ -1643,10 +1815,11 @@ static int d40_free_dma(struct d40_chan *d40c) return -EINVAL; } + pm_runtime_get_sync(d40c->base->dev); res = d40_channel_execute_command(d40c, D40_DMA_SUSPEND_REQ); if (res) { chan_err(d40c, "suspend failed\n"); - return res; + goto out; } if (chan_is_logical(d40c)) { @@ -1664,13 +1837,11 @@ static int d40_free_dma(struct d40_chan *d40c) if (d40_chan_has_events(d40c)) { res = d40_channel_execute_command(d40c, D40_DMA_RUN); - if (res) { + if (res) chan_err(d40c, "Executing RUN command\n"); - return res; - } } - return 0; + goto out; } } else { (void) d40_alloc_mask_free(phy, is_src, 0); @@ -1680,13 +1851,23 @@ static int d40_free_dma(struct d40_chan *d40c) res = d40_channel_execute_command(d40c, D40_DMA_STOP); if (res) { chan_err(d40c, "Failed to stop channel\n"); - return res; + goto out; } + + if (d40c->busy) { + pm_runtime_mark_last_busy(d40c->base->dev); + pm_runtime_put_autosuspend(d40c->base->dev); + } + + d40c->busy = false; d40c->phy_chan = NULL; d40c->configured = false; d40c->base->lookup_phy_chans[phy->num] = NULL; +out: - return 0; + pm_runtime_mark_last_busy(d40c->base->dev); + pm_runtime_put_autosuspend(d40c->base->dev); + return res; } static bool d40_is_paused(struct d40_chan *d40c) @@ -1855,7 +2036,7 @@ err: } static dma_addr_t -d40_get_dev_addr(struct d40_chan *chan, enum dma_data_direction direction) +d40_get_dev_addr(struct d40_chan *chan, enum dma_transfer_direction direction) { struct stedma40_platform_data *plat = chan->base->plat_data; struct stedma40_chan_cfg *cfg = &chan->dma_cfg; @@ -1864,9 +2045,9 @@ d40_get_dev_addr(struct d40_chan *chan, enum dma_data_direction direction) if (chan->runtime_addr) return chan->runtime_addr; - if (direction == DMA_FROM_DEVICE) + if (direction == DMA_DEV_TO_MEM) addr = plat->dev_rx[cfg->src_dev_type]; - else if (direction == DMA_TO_DEVICE) + else if (direction == DMA_MEM_TO_DEV) addr = plat->dev_tx[cfg->dst_dev_type]; return addr; @@ -1875,7 +2056,7 @@ d40_get_dev_addr(struct d40_chan *chan, enum dma_data_direction direction) static struct dma_async_tx_descriptor * d40_prep_sg(struct dma_chan *dchan, struct scatterlist *sg_src, struct scatterlist *sg_dst, unsigned int sg_len, - enum dma_data_direction direction, unsigned long dma_flags) + enum dma_transfer_direction direction, unsigned long dma_flags) { struct d40_chan *chan = container_of(dchan, struct d40_chan, chan); dma_addr_t src_dev_addr = 0; @@ -1902,9 +2083,9 @@ d40_prep_sg(struct dma_chan *dchan, struct scatterlist *sg_src, if (direction != DMA_NONE) { dma_addr_t dev_addr = d40_get_dev_addr(chan, direction); - if (direction == DMA_FROM_DEVICE) + if (direction == DMA_DEV_TO_MEM) src_dev_addr = dev_addr; - else if (direction == DMA_TO_DEVICE) + else if (direction == DMA_MEM_TO_DEV) dst_dev_addr = dev_addr; } @@ -2011,14 +2192,15 @@ static int d40_alloc_chan_resources(struct dma_chan *chan) goto fail; } } - is_free_phy = (d40c->phy_chan == NULL); - err = d40_allocate_channel(d40c); + err = d40_allocate_channel(d40c, &is_free_phy); if (err) { chan_err(d40c, "Failed to allocate channel\n"); + d40c->configured = false; goto fail; } + pm_runtime_get_sync(d40c->base->dev); /* Fill in basic CFG register values */ d40_phy_cfg(&d40c->dma_cfg, &d40c->src_def_cfg, &d40c->dst_def_cfg, chan_is_logical(d40c)); @@ -2038,6 +2220,12 @@ static int d40_alloc_chan_resources(struct dma_chan *chan) D40_LCPA_CHAN_SIZE + D40_LCPA_CHAN_DST_DELTA; } + dev_dbg(chan2dev(d40c), "allocated %s channel (phy %d%s)\n", + chan_is_logical(d40c) ? "logical" : "physical", + d40c->phy_chan->num, + d40c->dma_cfg.use_fixed_channel ? ", fixed" : ""); + + /* * Only write channel configuration to the DMA if the physical * resource is free. In case of multiple logical channels @@ -2046,6 +2234,8 @@ static int d40_alloc_chan_resources(struct dma_chan *chan) if (is_free_phy) d40_config_write(d40c); fail: + pm_runtime_mark_last_busy(d40c->base->dev); + pm_runtime_put_autosuspend(d40c->base->dev); spin_unlock_irqrestore(&d40c->lock, flags); return err; } @@ -2108,10 +2298,10 @@ d40_prep_memcpy_sg(struct dma_chan *chan, static struct dma_async_tx_descriptor *d40_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, - enum dma_data_direction direction, + enum dma_transfer_direction direction, unsigned long dma_flags) { - if (direction != DMA_FROM_DEVICE && direction != DMA_TO_DEVICE) + if (direction != DMA_DEV_TO_MEM && direction != DMA_MEM_TO_DEV) return NULL; return d40_prep_sg(chan, sgl, sgl, sg_len, direction, dma_flags); @@ -2120,7 +2310,7 @@ static struct dma_async_tx_descriptor *d40_prep_slave_sg(struct dma_chan *chan, static struct dma_async_tx_descriptor * dma40_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len, size_t period_len, - enum dma_data_direction direction) + enum dma_transfer_direction direction) { unsigned int periods = buf_len / period_len; struct dma_async_tx_descriptor *txd; @@ -2269,7 +2459,7 @@ static int d40_set_runtime_config(struct dma_chan *chan, dst_addr_width = config->dst_addr_width; dst_maxburst = config->dst_maxburst; - if (config->direction == DMA_FROM_DEVICE) { + if (config->direction == DMA_DEV_TO_MEM) { dma_addr_t dev_addr_rx = d40c->base->plat_data->dev_rx[cfg->src_dev_type]; @@ -2292,7 +2482,7 @@ static int d40_set_runtime_config(struct dma_chan *chan, if (dst_maxburst == 0) dst_maxburst = src_maxburst; - } else if (config->direction == DMA_TO_DEVICE) { + } else if (config->direction == DMA_MEM_TO_DEV) { dma_addr_t dev_addr_tx = d40c->base->plat_data->dev_tx[cfg->dst_dev_type]; @@ -2357,7 +2547,7 @@ static int d40_set_runtime_config(struct dma_chan *chan, "configured channel %s for %s, data width %d/%d, " "maxburst %d/%d elements, LE, no flow control\n", dma_chan_name(chan), - (config->direction == DMA_FROM_DEVICE) ? "RX" : "TX", + (config->direction == DMA_DEV_TO_MEM) ? "RX" : "TX", src_addr_width, dst_addr_width, src_maxburst, dst_maxburst); @@ -2519,6 +2709,72 @@ failure1: return err; } +/* Suspend resume functionality */ +#ifdef CONFIG_PM +static int dma40_pm_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct d40_base *base = platform_get_drvdata(pdev); + int ret = 0; + if (!pm_runtime_suspended(dev)) + return -EBUSY; + + if (base->lcpa_regulator) + ret = regulator_disable(base->lcpa_regulator); + return ret; +} + +static int dma40_runtime_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct d40_base *base = platform_get_drvdata(pdev); + + d40_save_restore_registers(base, true); + + /* Don't disable/enable clocks for v1 due to HW bugs */ + if (base->rev != 1) + writel_relaxed(base->gcc_pwr_off_mask, + base->virtbase + D40_DREG_GCC); + + return 0; +} + +static int dma40_runtime_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct d40_base *base = platform_get_drvdata(pdev); + + if (base->initialized) + d40_save_restore_registers(base, false); + + writel_relaxed(D40_DREG_GCC_ENABLE_ALL, + base->virtbase + D40_DREG_GCC); + return 0; +} + +static int dma40_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct d40_base *base = platform_get_drvdata(pdev); + int ret = 0; + + if (base->lcpa_regulator) + ret = regulator_enable(base->lcpa_regulator); + + return ret; +} + +static const struct dev_pm_ops dma40_pm_ops = { + .suspend = dma40_pm_suspend, + .runtime_suspend = dma40_runtime_suspend, + .runtime_resume = dma40_runtime_resume, + .resume = dma40_resume, +}; +#define DMA40_PM_OPS (&dma40_pm_ops) +#else +#define DMA40_PM_OPS NULL +#endif + /* Initialization functions. */ static int __init d40_phy_res_init(struct d40_base *base) @@ -2527,6 +2783,7 @@ static int __init d40_phy_res_init(struct d40_base *base) int num_phy_chans_avail = 0; u32 val[2]; int odd_even_bit = -2; + int gcc = D40_DREG_GCC_ENA; val[0] = readl(base->virtbase + D40_DREG_PRSME); val[1] = readl(base->virtbase + D40_DREG_PRSMO); @@ -2538,9 +2795,17 @@ static int __init d40_phy_res_init(struct d40_base *base) /* Mark security only channels as occupied */ base->phy_res[i].allocated_src = D40_ALLOC_PHY; base->phy_res[i].allocated_dst = D40_ALLOC_PHY; + base->phy_res[i].reserved = true; + gcc |= D40_DREG_GCC_EVTGRP_ENA(D40_PHYS_TO_GROUP(i), + D40_DREG_GCC_SRC); + gcc |= D40_DREG_GCC_EVTGRP_ENA(D40_PHYS_TO_GROUP(i), + D40_DREG_GCC_DST); + + } else { base->phy_res[i].allocated_src = D40_ALLOC_FREE; base->phy_res[i].allocated_dst = D40_ALLOC_FREE; + base->phy_res[i].reserved = false; num_phy_chans_avail++; } spin_lock_init(&base->phy_res[i].lock); @@ -2552,6 +2817,11 @@ static int __init d40_phy_res_init(struct d40_base *base) base->phy_res[chan].allocated_src = D40_ALLOC_PHY; base->phy_res[chan].allocated_dst = D40_ALLOC_PHY; + base->phy_res[chan].reserved = true; + gcc |= D40_DREG_GCC_EVTGRP_ENA(D40_PHYS_TO_GROUP(chan), + D40_DREG_GCC_SRC); + gcc |= D40_DREG_GCC_EVTGRP_ENA(D40_PHYS_TO_GROUP(chan), + D40_DREG_GCC_DST); num_phy_chans_avail--; } @@ -2572,6 +2842,15 @@ static int __init d40_phy_res_init(struct d40_base *base) val[0] = val[0] >> 2; } + /* + * To keep things simple, Enable all clocks initially. + * The clocks will get managed later post channel allocation. + * The clocks for the event lines on which reserved channels exists + * are not managed here. + */ + writel(D40_DREG_GCC_ENABLE_ALL, base->virtbase + D40_DREG_GCC); + base->gcc_pwr_off_mask = gcc; + return num_phy_chans_avail; } @@ -2699,10 +2978,15 @@ static struct d40_base * __init d40_hw_detect_init(struct platform_device *pdev) goto failure; } - base->lcla_pool.alloc_map = kzalloc(num_phy_chans * - sizeof(struct d40_desc *) * - D40_LCLA_LINK_PER_EVENT_GRP, + base->reg_val_backup_chan = kmalloc(base->num_phy_chans * + sizeof(d40_backup_regs_chan), GFP_KERNEL); + if (!base->reg_val_backup_chan) + goto failure; + + base->lcla_pool.alloc_map = + kzalloc(num_phy_chans * sizeof(struct d40_desc *) + * D40_LCLA_LINK_PER_EVENT_GRP, GFP_KERNEL); if (!base->lcla_pool.alloc_map) goto failure; @@ -2741,9 +3025,9 @@ failure: static void __init d40_hw_init(struct d40_base *base) { - static const struct d40_reg_val dma_init_reg[] = { + static struct d40_reg_val dma_init_reg[] = { /* Clock every part of the DMA block from start */ - { .reg = D40_DREG_GCC, .val = 0x0000ff01}, + { .reg = D40_DREG_GCC, .val = D40_DREG_GCC_ENABLE_ALL}, /* Interrupts on all logical channels */ { .reg = D40_DREG_LCMIS0, .val = 0xFFFFFFFF}, @@ -2943,11 +3227,31 @@ static int __init d40_probe(struct platform_device *pdev) d40_err(&pdev->dev, "Failed to ioremap LCPA region\n"); goto failure; } + /* If lcla has to be located in ESRAM we don't need to allocate */ + if (base->plat_data->use_esram_lcla) { + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "lcla_esram"); + if (!res) { + ret = -ENOENT; + d40_err(&pdev->dev, + "No \"lcla_esram\" memory resource\n"); + goto failure; + } + base->lcla_pool.base = ioremap(res->start, + resource_size(res)); + if (!base->lcla_pool.base) { + ret = -ENOMEM; + d40_err(&pdev->dev, "Failed to ioremap LCLA region\n"); + goto failure; + } + writel(res->start, base->virtbase + D40_DREG_LCLA); - ret = d40_lcla_allocate(base); - if (ret) { - d40_err(&pdev->dev, "Failed to allocate LCLA area\n"); - goto failure; + } else { + ret = d40_lcla_allocate(base); + if (ret) { + d40_err(&pdev->dev, "Failed to allocate LCLA area\n"); + goto failure; + } } spin_lock_init(&base->lcla_pool.lock); @@ -2960,6 +3264,32 @@ static int __init d40_probe(struct platform_device *pdev) goto failure; } + pm_runtime_irq_safe(base->dev); + pm_runtime_set_autosuspend_delay(base->dev, DMA40_AUTOSUSPEND_DELAY); + pm_runtime_use_autosuspend(base->dev); + pm_runtime_enable(base->dev); + pm_runtime_resume(base->dev); + + if (base->plat_data->use_esram_lcla) { + + base->lcpa_regulator = regulator_get(base->dev, "lcla_esram"); + if (IS_ERR(base->lcpa_regulator)) { + d40_err(&pdev->dev, "Failed to get lcpa_regulator\n"); + base->lcpa_regulator = NULL; + goto failure; + } + + ret = regulator_enable(base->lcpa_regulator); + if (ret) { + d40_err(&pdev->dev, + "Failed to enable lcpa_regulator\n"); + regulator_put(base->lcpa_regulator); + base->lcpa_regulator = NULL; + goto failure; + } + } + + base->initialized = true; err = d40_dmaengine_init(base, num_reserved_chans); if (err) goto failure; @@ -2976,6 +3306,11 @@ failure: if (base->virtbase) iounmap(base->virtbase); + if (base->lcla_pool.base && base->plat_data->use_esram_lcla) { + iounmap(base->lcla_pool.base); + base->lcla_pool.base = NULL; + } + if (base->lcla_pool.dma_addr) dma_unmap_single(base->dev, base->lcla_pool.dma_addr, SZ_1K * base->num_phy_chans, @@ -2998,6 +3333,11 @@ failure: clk_put(base->clk); } + if (base->lcpa_regulator) { + regulator_disable(base->lcpa_regulator); + regulator_put(base->lcpa_regulator); + } + kfree(base->lcla_pool.alloc_map); kfree(base->lookup_log_chans); kfree(base->lookup_phy_chans); @@ -3013,6 +3353,7 @@ static struct platform_driver d40_driver = { .driver = { .owner = THIS_MODULE, .name = D40_NAME, + .pm = DMA40_PM_OPS, }, }; diff --git a/drivers/dma/ste_dma40_ll.h b/drivers/dma/ste_dma40_ll.h index b44c455158d..8d3d490968a 100644 --- a/drivers/dma/ste_dma40_ll.h +++ b/drivers/dma/ste_dma40_ll.h @@ -16,6 +16,8 @@ #define D40_TYPE_TO_GROUP(type) (type / 16) #define D40_TYPE_TO_EVENT(type) (type % 16) +#define D40_GROUP_SIZE 8 +#define D40_PHYS_TO_GROUP(phys) ((phys & (D40_GROUP_SIZE - 1)) / 2) /* Most bits of the CFG register are the same in log as in phy mode */ #define D40_SREG_CFG_MST_POS 15 @@ -123,6 +125,15 @@ /* DMA Register Offsets */ #define D40_DREG_GCC 0x000 +#define D40_DREG_GCC_ENA 0x1 +/* This assumes that there are only 4 event groups */ +#define D40_DREG_GCC_ENABLE_ALL 0xff01 +#define D40_DREG_GCC_EVTGRP_POS 8 +#define D40_DREG_GCC_SRC 0 +#define D40_DREG_GCC_DST 1 +#define D40_DREG_GCC_EVTGRP_ENA(x, y) \ + (1 << (D40_DREG_GCC_EVTGRP_POS + 2 * x + y)) + #define D40_DREG_PRTYP 0x004 #define D40_DREG_PRSME 0x008 #define D40_DREG_PRSMO 0x00C diff --git a/drivers/dma/timb_dma.c b/drivers/dma/timb_dma.c index a4a398f2ef6..a6f9c1684a0 100644 --- a/drivers/dma/timb_dma.c +++ b/drivers/dma/timb_dma.c @@ -90,7 +90,7 @@ struct timb_dma_chan { struct list_head queue; struct list_head free_list; unsigned int bytes_per_line; - enum dma_data_direction direction; + enum dma_transfer_direction direction; unsigned int descs; /* Descriptors to allocate */ unsigned int desc_elems; /* number of elems per descriptor */ }; @@ -166,10 +166,10 @@ static void __td_unmap_desc(struct timb_dma_chan *td_chan, const u8 *dma_desc, if (single) dma_unmap_single(chan2dev(&td_chan->chan), addr, len, - td_chan->direction); + DMA_TO_DEVICE); else dma_unmap_page(chan2dev(&td_chan->chan), addr, len, - td_chan->direction); + DMA_TO_DEVICE); } static void __td_unmap_descs(struct timb_dma_desc *td_desc, bool single) @@ -235,7 +235,7 @@ static void __td_start_dma(struct timb_dma_chan *td_chan) "td_chan: %p, chan: %d, membase: %p\n", td_chan, td_chan->chan.chan_id, td_chan->membase); - if (td_chan->direction == DMA_FROM_DEVICE) { + if (td_chan->direction == DMA_DEV_TO_MEM) { /* descriptor address */ iowrite32(0, td_chan->membase + TIMBDMA_OFFS_RX_DHAR); @@ -278,7 +278,7 @@ static void __td_finish(struct timb_dma_chan *td_chan) txd->cookie); /* make sure to stop the transfer */ - if (td_chan->direction == DMA_FROM_DEVICE) + if (td_chan->direction == DMA_DEV_TO_MEM) iowrite32(0, td_chan->membase + TIMBDMA_OFFS_RX_ER); /* Currently no support for stopping DMA transfers else @@ -558,7 +558,7 @@ static void td_issue_pending(struct dma_chan *chan) static struct dma_async_tx_descriptor *td_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, - enum dma_data_direction direction, unsigned long flags) + enum dma_transfer_direction direction, unsigned long flags) { struct timb_dma_chan *td_chan = container_of(chan, struct timb_dma_chan, chan); @@ -606,7 +606,7 @@ static struct dma_async_tx_descriptor *td_prep_slave_sg(struct dma_chan *chan, } dma_sync_single_for_device(chan2dmadev(chan), td_desc->txd.phys, - td_desc->desc_list_len, DMA_TO_DEVICE); + td_desc->desc_list_len, DMA_MEM_TO_DEV); return &td_desc->txd; } @@ -775,8 +775,8 @@ static int __devinit td_probe(struct platform_device *pdev) td_chan->descs = pchan->descriptors; td_chan->desc_elems = pchan->descriptor_elements; td_chan->bytes_per_line = pchan->bytes_per_line; - td_chan->direction = pchan->rx ? DMA_FROM_DEVICE : - DMA_TO_DEVICE; + td_chan->direction = pchan->rx ? DMA_DEV_TO_MEM : + DMA_MEM_TO_DEV; td_chan->membase = td->membase + (i / 2) * TIMBDMA_INSTANCE_OFFSET + @@ -841,17 +841,7 @@ static struct platform_driver td_driver = { .remove = __exit_p(td_remove), }; -static int __init td_init(void) -{ - return platform_driver_register(&td_driver); -} -module_init(td_init); - -static void __exit td_exit(void) -{ - platform_driver_unregister(&td_driver); -} -module_exit(td_exit); +module_platform_driver(td_driver); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("Timberdale DMA controller driver"); diff --git a/drivers/dma/txx9dmac.c b/drivers/dma/txx9dmac.c index cbd83e362b5..6122c364cf1 100644 --- a/drivers/dma/txx9dmac.c +++ b/drivers/dma/txx9dmac.c @@ -845,7 +845,7 @@ txx9dmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, static struct dma_async_tx_descriptor * txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags) { struct txx9dmac_chan *dc = to_txx9dmac_chan(chan); @@ -860,9 +860,9 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, BUG_ON(!ds || !ds->reg_width); if (ds->tx_reg) - BUG_ON(direction != DMA_TO_DEVICE); + BUG_ON(direction != DMA_MEM_TO_DEV); else - BUG_ON(direction != DMA_FROM_DEVICE); + BUG_ON(direction != DMA_DEV_TO_MEM); if (unlikely(!sg_len)) return NULL; @@ -882,7 +882,7 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, mem = sg_dma_address(sg); if (__is_dmac64(ddev)) { - if (direction == DMA_TO_DEVICE) { + if (direction == DMA_MEM_TO_DEV) { desc->hwdesc.SAR = mem; desc->hwdesc.DAR = ds->tx_reg; } else { @@ -891,7 +891,7 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, } desc->hwdesc.CNTR = sg_dma_len(sg); } else { - if (direction == DMA_TO_DEVICE) { + if (direction == DMA_MEM_TO_DEV) { desc->hwdesc32.SAR = mem; desc->hwdesc32.DAR = ds->tx_reg; } else { @@ -900,7 +900,7 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, } desc->hwdesc32.CNTR = sg_dma_len(sg); } - if (direction == DMA_TO_DEVICE) { + if (direction == DMA_MEM_TO_DEV) { sai = ds->reg_width; dai = 0; } else { diff --git a/drivers/media/video/mx3_camera.c b/drivers/media/video/mx3_camera.c index 0cb461dd396..74522773e93 100644 --- a/drivers/media/video/mx3_camera.c +++ b/drivers/media/video/mx3_camera.c @@ -287,7 +287,7 @@ static void mx3_videobuf_queue(struct vb2_buffer *vb) sg_dma_len(sg) = new_size; txd = ichan->dma_chan.device->device_prep_slave_sg( - &ichan->dma_chan, sg, 1, DMA_FROM_DEVICE, + &ichan->dma_chan, sg, 1, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT); if (!txd) goto error; diff --git a/drivers/media/video/timblogiw.c b/drivers/media/video/timblogiw.c index 0a2d75f0406..4ed1c7c28ae 100644 --- a/drivers/media/video/timblogiw.c +++ b/drivers/media/video/timblogiw.c @@ -565,7 +565,7 @@ static void buffer_queue(struct videobuf_queue *vq, struct videobuf_buffer *vb) spin_unlock_irq(&fh->queue_lock); desc = fh->chan->device->device_prep_slave_sg(fh->chan, - buf->sg, sg_elems, DMA_FROM_DEVICE, + buf->sg, sg_elems, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP); if (!desc) { spin_lock_irq(&fh->queue_lock); diff --git a/drivers/misc/carma/carma-fpga-program.c b/drivers/misc/carma/carma-fpga-program.c index eb5cd28bc6d..a2d25e4857e 100644 --- a/drivers/misc/carma/carma-fpga-program.c +++ b/drivers/misc/carma/carma-fpga-program.c @@ -513,7 +513,7 @@ static noinline int fpga_program_dma(struct fpga_dev *priv) * transaction, and then put it under external control */ memset(&config, 0, sizeof(config)); - config.direction = DMA_TO_DEVICE; + config.direction = DMA_MEM_TO_DEV; config.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; config.dst_maxburst = fpga_fifo_size(priv->regs) / 2 / 4; ret = chan->device->device_control(chan, DMA_SLAVE_CONFIG, diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index a7ee5027146..fcfe1eb5acc 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -823,6 +823,7 @@ atmci_prepare_data_dma(struct atmel_mci *host, struct mmc_data *data) struct scatterlist *sg; unsigned int i; enum dma_data_direction direction; + enum dma_transfer_direction slave_dirn; unsigned int sglen; u32 iflags; @@ -860,16 +861,19 @@ atmci_prepare_data_dma(struct atmel_mci *host, struct mmc_data *data) if (host->caps.has_dma) atmci_writel(host, ATMCI_DMA, ATMCI_DMA_CHKSIZE(3) | ATMCI_DMAEN); - if (data->flags & MMC_DATA_READ) + if (data->flags & MMC_DATA_READ) { direction = DMA_FROM_DEVICE; - else + slave_dirn = DMA_DEV_TO_MEM; + } else { direction = DMA_TO_DEVICE; + slave_dirn = DMA_MEM_TO_DEV; + } sglen = dma_map_sg(chan->device->dev, data->sg, data->sg_len, direction); desc = chan->device->device_prep_slave_sg(chan, - data->sg, sglen, direction, + data->sg, sglen, slave_dirn, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) goto unmap_exit; diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index ece03b491c7..0d955ffaf44 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -374,6 +374,7 @@ static int mmci_dma_prep_data(struct mmci_host *host, struct mmc_data *data, struct dma_chan *chan; struct dma_device *device; struct dma_async_tx_descriptor *desc; + enum dma_data_direction buffer_dirn; int nr_sg; /* Check if next job is already prepared */ @@ -387,10 +388,12 @@ static int mmci_dma_prep_data(struct mmci_host *host, struct mmc_data *data, } if (data->flags & MMC_DATA_READ) { - conf.direction = DMA_FROM_DEVICE; + conf.direction = DMA_DEV_TO_MEM; + buffer_dirn = DMA_FROM_DEVICE; chan = host->dma_rx_channel; } else { - conf.direction = DMA_TO_DEVICE; + conf.direction = DMA_MEM_TO_DEV; + buffer_dirn = DMA_TO_DEVICE; chan = host->dma_tx_channel; } @@ -403,7 +406,7 @@ static int mmci_dma_prep_data(struct mmci_host *host, struct mmc_data *data, return -EINVAL; device = chan->device; - nr_sg = dma_map_sg(device->dev, data->sg, data->sg_len, conf.direction); + nr_sg = dma_map_sg(device->dev, data->sg, data->sg_len, buffer_dirn); if (nr_sg == 0) return -EINVAL; @@ -426,7 +429,7 @@ static int mmci_dma_prep_data(struct mmci_host *host, struct mmc_data *data, unmap_exit: if (!next) dmaengine_terminate_all(chan); - dma_unmap_sg(device->dev, data->sg, data->sg_len, conf.direction); + dma_unmap_sg(device->dev, data->sg, data->sg_len, buffer_dirn); return -ENOMEM; } diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index 7088b40f957..4184b7946bb 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -218,6 +218,7 @@ static int mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data) unsigned int blksz = data->blksz; unsigned int datasize = nob * blksz; struct scatterlist *sg; + enum dma_transfer_direction slave_dirn; int i, nents; if (data->flags & MMC_DATA_STREAM) @@ -240,10 +241,13 @@ static int mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data) } } - if (data->flags & MMC_DATA_READ) + if (data->flags & MMC_DATA_READ) { host->dma_dir = DMA_FROM_DEVICE; - else + slave_dirn = DMA_DEV_TO_MEM; + } else { host->dma_dir = DMA_TO_DEVICE; + slave_dirn = DMA_MEM_TO_DEV; + } nents = dma_map_sg(host->dma->device->dev, data->sg, data->sg_len, host->dma_dir); @@ -251,7 +255,7 @@ static int mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data) return -EINVAL; host->desc = host->dma->device->device_prep_slave_sg(host->dma, - data->sg, data->sg_len, host->dma_dir, + data->sg, data->sg_len, slave_dirn, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!host->desc) { diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c index 4e2e019dd5c..382c835d217 100644 --- a/drivers/mmc/host/mxs-mmc.c +++ b/drivers/mmc/host/mxs-mmc.c @@ -154,6 +154,7 @@ struct mxs_mmc_host { struct dma_chan *dmach; struct mxs_dma_data dma_data; unsigned int dma_dir; + enum dma_transfer_direction slave_dirn; u32 ssp_pio_words[SSP_PIO_NUM]; unsigned int version; @@ -324,7 +325,7 @@ static struct dma_async_tx_descriptor *mxs_mmc_prep_dma( } desc = host->dmach->device->device_prep_slave_sg(host->dmach, - sgl, sg_len, host->dma_dir, append); + sgl, sg_len, host->slave_dirn, append); if (desc) { desc->callback = mxs_mmc_dma_irq_callback; desc->callback_param = host; @@ -356,6 +357,7 @@ static void mxs_mmc_bc(struct mxs_mmc_host *host) host->ssp_pio_words[1] = cmd0; host->ssp_pio_words[2] = cmd1; host->dma_dir = DMA_NONE; + host->slave_dirn = DMA_TRANS_NONE; desc = mxs_mmc_prep_dma(host, 0); if (!desc) goto out; @@ -395,6 +397,7 @@ static void mxs_mmc_ac(struct mxs_mmc_host *host) host->ssp_pio_words[1] = cmd0; host->ssp_pio_words[2] = cmd1; host->dma_dir = DMA_NONE; + host->slave_dirn = DMA_TRANS_NONE; desc = mxs_mmc_prep_dma(host, 0); if (!desc) goto out; @@ -433,6 +436,7 @@ static void mxs_mmc_adtc(struct mxs_mmc_host *host) int i; unsigned short dma_data_dir, timeout; + enum dma_transfer_direction slave_dirn; unsigned int data_size = 0, log2_blksz; unsigned int blocks = data->blocks; @@ -448,9 +452,11 @@ static void mxs_mmc_adtc(struct mxs_mmc_host *host) if (data->flags & MMC_DATA_WRITE) { dma_data_dir = DMA_TO_DEVICE; + slave_dirn = DMA_MEM_TO_DEV; read = 0; } else { dma_data_dir = DMA_FROM_DEVICE; + slave_dirn = DMA_DEV_TO_MEM; read = BM_SSP_CTRL0_READ; } @@ -510,6 +516,7 @@ static void mxs_mmc_adtc(struct mxs_mmc_host *host) host->ssp_pio_words[1] = cmd0; host->ssp_pio_words[2] = cmd1; host->dma_dir = DMA_NONE; + host->slave_dirn = DMA_TRANS_NONE; desc = mxs_mmc_prep_dma(host, 0); if (!desc) goto out; @@ -518,6 +525,7 @@ static void mxs_mmc_adtc(struct mxs_mmc_host *host) WARN_ON(host->data != NULL); host->data = data; host->dma_dir = dma_data_dir; + host->slave_dirn = slave_dirn; desc = mxs_mmc_prep_dma(host, 1); if (!desc) goto out; diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 4a2c5b2355f..f5d8b53be33 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -286,7 +286,7 @@ static void sh_mmcif_start_dma_rx(struct sh_mmcif_host *host) if (ret > 0) { host->dma_active = true; desc = chan->device->device_prep_slave_sg(chan, sg, ret, - DMA_FROM_DEVICE, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); } if (desc) { @@ -335,7 +335,7 @@ static void sh_mmcif_start_dma_tx(struct sh_mmcif_host *host) if (ret > 0) { host->dma_active = true; desc = chan->device->device_prep_slave_sg(chan, sg, ret, - DMA_TO_DEVICE, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); } if (desc) { diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index 86f259cdfcb..7a6e6cc8f8b 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -77,7 +77,7 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) ret = dma_map_sg(chan->device->dev, sg, host->sg_len, DMA_FROM_DEVICE); if (ret > 0) desc = chan->device->device_prep_slave_sg(chan, sg, ret, - DMA_FROM_DEVICE, DMA_CTRL_ACK); + DMA_DEV_TO_MEM, DMA_CTRL_ACK); if (desc) { cookie = dmaengine_submit(desc); @@ -158,7 +158,7 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) ret = dma_map_sg(chan->device->dev, sg, host->sg_len, DMA_TO_DEVICE); if (ret > 0) desc = chan->device->device_prep_slave_sg(chan, sg, ret, - DMA_TO_DEVICE, DMA_CTRL_ACK); + DMA_MEM_TO_DEV, DMA_CTRL_ACK); if (desc) { cookie = dmaengine_submit(desc); diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-lib.c b/drivers/mtd/nand/gpmi-nand/gpmi-lib.c index 2a56fc6f399..7f680420bfa 100644 --- a/drivers/mtd/nand/gpmi-nand/gpmi-lib.c +++ b/drivers/mtd/nand/gpmi-nand/gpmi-lib.c @@ -827,7 +827,7 @@ int gpmi_send_command(struct gpmi_nand_data *this) pio[1] = pio[2] = 0; desc = channel->device->device_prep_slave_sg(channel, (struct scatterlist *)pio, - ARRAY_SIZE(pio), DMA_NONE, 0); + ARRAY_SIZE(pio), DMA_TRANS_NONE, 0); if (!desc) { pr_err("step 1 error\n"); return -1; @@ -839,7 +839,7 @@ int gpmi_send_command(struct gpmi_nand_data *this) sg_init_one(sgl, this->cmd_buffer, this->command_length); dma_map_sg(this->dev, sgl, 1, DMA_TO_DEVICE); desc = channel->device->device_prep_slave_sg(channel, - sgl, 1, DMA_TO_DEVICE, 1); + sgl, 1, DMA_MEM_TO_DEV, 1); if (!desc) { pr_err("step 2 error\n"); return -1; @@ -872,7 +872,7 @@ int gpmi_send_data(struct gpmi_nand_data *this) pio[1] = 0; desc = channel->device->device_prep_slave_sg(channel, (struct scatterlist *)pio, - ARRAY_SIZE(pio), DMA_NONE, 0); + ARRAY_SIZE(pio), DMA_TRANS_NONE, 0); if (!desc) { pr_err("step 1 error\n"); return -1; @@ -881,7 +881,7 @@ int gpmi_send_data(struct gpmi_nand_data *this) /* [2] send DMA request */ prepare_data_dma(this, DMA_TO_DEVICE); desc = channel->device->device_prep_slave_sg(channel, &this->data_sgl, - 1, DMA_TO_DEVICE, 1); + 1, DMA_MEM_TO_DEV, 1); if (!desc) { pr_err("step 2 error\n"); return -1; @@ -908,7 +908,7 @@ int gpmi_read_data(struct gpmi_nand_data *this) pio[1] = 0; desc = channel->device->device_prep_slave_sg(channel, (struct scatterlist *)pio, - ARRAY_SIZE(pio), DMA_NONE, 0); + ARRAY_SIZE(pio), DMA_TRANS_NONE, 0); if (!desc) { pr_err("step 1 error\n"); return -1; @@ -917,7 +917,7 @@ int gpmi_read_data(struct gpmi_nand_data *this) /* [2] : send DMA request */ prepare_data_dma(this, DMA_FROM_DEVICE); desc = channel->device->device_prep_slave_sg(channel, &this->data_sgl, - 1, DMA_FROM_DEVICE, 1); + 1, DMA_DEV_TO_MEM, 1); if (!desc) { pr_err("step 2 error\n"); return -1; @@ -964,7 +964,7 @@ int gpmi_send_page(struct gpmi_nand_data *this, desc = channel->device->device_prep_slave_sg(channel, (struct scatterlist *)pio, - ARRAY_SIZE(pio), DMA_NONE, 0); + ARRAY_SIZE(pio), DMA_TRANS_NONE, 0); if (!desc) { pr_err("step 2 error\n"); return -1; @@ -998,7 +998,8 @@ int gpmi_read_page(struct gpmi_nand_data *this, | BF_GPMI_CTRL0_XFER_COUNT(0); pio[1] = 0; desc = channel->device->device_prep_slave_sg(channel, - (struct scatterlist *)pio, 2, DMA_NONE, 0); + (struct scatterlist *)pio, 2, + DMA_TRANS_NONE, 0); if (!desc) { pr_err("step 1 error\n"); return -1; @@ -1027,7 +1028,7 @@ int gpmi_read_page(struct gpmi_nand_data *this, pio[5] = auxiliary; desc = channel->device->device_prep_slave_sg(channel, (struct scatterlist *)pio, - ARRAY_SIZE(pio), DMA_NONE, 1); + ARRAY_SIZE(pio), DMA_TRANS_NONE, 1); if (!desc) { pr_err("step 2 error\n"); return -1; @@ -1045,7 +1046,8 @@ int gpmi_read_page(struct gpmi_nand_data *this, | BF_GPMI_CTRL0_XFER_COUNT(geo->page_size); pio[1] = 0; desc = channel->device->device_prep_slave_sg(channel, - (struct scatterlist *)pio, 2, DMA_NONE, 1); + (struct scatterlist *)pio, 2, + DMA_TRANS_NONE, 1); if (!desc) { pr_err("step 3 error\n"); return -1; diff --git a/drivers/net/ethernet/micrel/ks8842.c b/drivers/net/ethernet/micrel/ks8842.c index 75ec87a822b..0a85690a132 100644 --- a/drivers/net/ethernet/micrel/ks8842.c +++ b/drivers/net/ethernet/micrel/ks8842.c @@ -459,7 +459,7 @@ static int ks8842_tx_frame_dma(struct sk_buff *skb, struct net_device *netdev) sg_dma_len(&ctl->sg) += 4 - sg_dma_len(&ctl->sg) % 4; ctl->adesc = ctl->chan->device->device_prep_slave_sg(ctl->chan, - &ctl->sg, 1, DMA_TO_DEVICE, + &ctl->sg, 1, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP); if (!ctl->adesc) return NETDEV_TX_BUSY; @@ -571,7 +571,7 @@ static int __ks8842_start_new_rx_dma(struct net_device *netdev) sg_dma_len(sg) = DMA_BUFFER_SIZE; ctl->adesc = ctl->chan->device->device_prep_slave_sg(ctl->chan, - sg, 1, DMA_FROM_DEVICE, + sg, 1, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP); if (!ctl->adesc) diff --git a/drivers/spi/spi-dw-mid.c b/drivers/spi/spi-dw-mid.c index e743a45ee92..8418eb03665 100644 --- a/drivers/spi/spi-dw-mid.c +++ b/drivers/spi/spi-dw-mid.c @@ -131,7 +131,7 @@ static int mid_spi_dma_transfer(struct dw_spi *dws, int cs_change) rxchan = dws->rxchan; /* 2. Prepare the TX dma transfer */ - txconf.direction = DMA_TO_DEVICE; + txconf.direction = DMA_MEM_TO_DEV; txconf.dst_addr = dws->dma_addr; txconf.dst_maxburst = LNW_DMA_MSIZE_16; txconf.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; @@ -147,13 +147,13 @@ static int mid_spi_dma_transfer(struct dw_spi *dws, int cs_change) txdesc = txchan->device->device_prep_slave_sg(txchan, &dws->tx_sgl, 1, - DMA_TO_DEVICE, + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_DEST_UNMAP); txdesc->callback = dw_spi_dma_done; txdesc->callback_param = dws; /* 3. Prepare the RX dma transfer */ - rxconf.direction = DMA_FROM_DEVICE; + rxconf.direction = DMA_DEV_TO_MEM; rxconf.src_addr = dws->dma_addr; rxconf.src_maxburst = LNW_DMA_MSIZE_16; rxconf.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; @@ -169,7 +169,7 @@ static int mid_spi_dma_transfer(struct dw_spi *dws, int cs_change) rxdesc = rxchan->device->device_prep_slave_sg(rxchan, &dws->rx_sgl, 1, - DMA_FROM_DEVICE, + DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_DEST_UNMAP); rxdesc->callback = dw_spi_dma_done; rxdesc->callback_param = dws; diff --git a/drivers/spi/spi-ep93xx.c b/drivers/spi/spi-ep93xx.c index 0a282e5fcc9..d46e55c720b 100644 --- a/drivers/spi/spi-ep93xx.c +++ b/drivers/spi/spi-ep93xx.c @@ -551,6 +551,7 @@ ep93xx_spi_dma_prepare(struct ep93xx_spi *espi, enum dma_data_direction dir) struct dma_async_tx_descriptor *txd; enum dma_slave_buswidth buswidth; struct dma_slave_config conf; + enum dma_transfer_direction slave_dirn; struct scatterlist *sg; struct sg_table *sgt; struct dma_chan *chan; @@ -573,6 +574,7 @@ ep93xx_spi_dma_prepare(struct ep93xx_spi *espi, enum dma_data_direction dir) conf.src_addr = espi->sspdr_phys; conf.src_addr_width = buswidth; + slave_dirn = DMA_DEV_TO_MEM; } else { chan = espi->dma_tx; buf = t->tx_buf; @@ -580,6 +582,7 @@ ep93xx_spi_dma_prepare(struct ep93xx_spi *espi, enum dma_data_direction dir) conf.dst_addr = espi->sspdr_phys; conf.dst_addr_width = buswidth; + slave_dirn = DMA_MEM_TO_DEV; } ret = dmaengine_slave_config(chan, &conf); @@ -631,7 +634,7 @@ ep93xx_spi_dma_prepare(struct ep93xx_spi *espi, enum dma_data_direction dir) return ERR_PTR(-ENOMEM); txd = chan->device->device_prep_slave_sg(chan, sgt->sgl, nents, - dir, DMA_CTRL_ACK); + slave_dirn, DMA_CTRL_ACK); if (!txd) { dma_unmap_sg(chan->device->dev, sgt->sgl, sgt->nents, dir); return ERR_PTR(-ENOMEM); @@ -979,7 +982,7 @@ static int ep93xx_spi_setup_dma(struct ep93xx_spi *espi) dma_cap_set(DMA_SLAVE, mask); espi->dma_rx_data.port = EP93XX_DMA_SSP; - espi->dma_rx_data.direction = DMA_FROM_DEVICE; + espi->dma_rx_data.direction = DMA_DEV_TO_MEM; espi->dma_rx_data.name = "ep93xx-spi-rx"; espi->dma_rx = dma_request_channel(mask, ep93xx_spi_dma_filter, @@ -990,7 +993,7 @@ static int ep93xx_spi_setup_dma(struct ep93xx_spi *espi) } espi->dma_tx_data.port = EP93XX_DMA_SSP; - espi->dma_tx_data.direction = DMA_TO_DEVICE; + espi->dma_tx_data.direction = DMA_MEM_TO_DEV; espi->dma_tx_data.name = "ep93xx-spi-tx"; espi->dma_tx = dma_request_channel(mask, ep93xx_spi_dma_filter, diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c index f1f5efbc340..2f9cb43a239 100644 --- a/drivers/spi/spi-pl022.c +++ b/drivers/spi/spi-pl022.c @@ -900,11 +900,11 @@ static int configure_dma(struct pl022 *pl022) { struct dma_slave_config rx_conf = { .src_addr = SSP_DR(pl022->phybase), - .direction = DMA_FROM_DEVICE, + .direction = DMA_DEV_TO_MEM, }; struct dma_slave_config tx_conf = { .dst_addr = SSP_DR(pl022->phybase), - .direction = DMA_TO_DEVICE, + .direction = DMA_MEM_TO_DEV, }; unsigned int pages; int ret; @@ -1041,7 +1041,7 @@ static int configure_dma(struct pl022 *pl022) rxdesc = rxchan->device->device_prep_slave_sg(rxchan, pl022->sgt_rx.sgl, rx_sglen, - DMA_FROM_DEVICE, + DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!rxdesc) goto err_rxdesc; @@ -1049,7 +1049,7 @@ static int configure_dma(struct pl022 *pl022) txdesc = txchan->device->device_prep_slave_sg(txchan, pl022->sgt_tx.sgl, tx_sglen, - DMA_TO_DEVICE, + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!txdesc) goto err_txdesc; diff --git a/drivers/spi/spi-topcliff-pch.c b/drivers/spi/spi-topcliff-pch.c index 7086583b910..2a6429d8c36 100644 --- a/drivers/spi/spi-topcliff-pch.c +++ b/drivers/spi/spi-topcliff-pch.c @@ -1079,7 +1079,7 @@ static void pch_spi_handle_dma(struct pch_spi_data *data, int *bpw) } sg = dma->sg_rx_p; desc_rx = dma->chan_rx->device->device_prep_slave_sg(dma->chan_rx, sg, - num, DMA_FROM_DEVICE, + num, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc_rx) { dev_err(&data->master->dev, "%s:device_prep_slave_sg Failed\n", @@ -1124,7 +1124,7 @@ static void pch_spi_handle_dma(struct pch_spi_data *data, int *bpw) } sg = dma->sg_tx_p; desc_tx = dma->chan_tx->device->device_prep_slave_sg(dma->chan_tx, - sg, num, DMA_TO_DEVICE, + sg, num, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc_tx) { dev_err(&data->master->dev, "%s:device_prep_slave_sg Failed\n", diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 6958594f2fc..9ae024025ff 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -268,7 +268,7 @@ static void pl011_dma_probe_initcall(struct uart_amba_port *uap) struct dma_slave_config tx_conf = { .dst_addr = uap->port.mapbase + UART01x_DR, .dst_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE, - .direction = DMA_TO_DEVICE, + .direction = DMA_MEM_TO_DEV, .dst_maxburst = uap->fifosize >> 1, }; struct dma_chan *chan; @@ -301,7 +301,7 @@ static void pl011_dma_probe_initcall(struct uart_amba_port *uap) struct dma_slave_config rx_conf = { .src_addr = uap->port.mapbase + UART01x_DR, .src_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE, - .direction = DMA_FROM_DEVICE, + .direction = DMA_DEV_TO_MEM, .src_maxburst = uap->fifosize >> 1, }; @@ -480,7 +480,7 @@ static int pl011_dma_tx_refill(struct uart_amba_port *uap) return -EBUSY; } - desc = dma_dev->device_prep_slave_sg(chan, &dmatx->sg, 1, DMA_TO_DEVICE, + desc = dma_dev->device_prep_slave_sg(chan, &dmatx->sg, 1, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) { dma_unmap_sg(dma_dev->dev, &dmatx->sg, 1, DMA_TO_DEVICE); @@ -676,7 +676,7 @@ static int pl011_dma_rx_trigger_dma(struct uart_amba_port *uap) &uap->dmarx.sgbuf_b : &uap->dmarx.sgbuf_a; dma_dev = rxchan->device; desc = rxchan->device->device_prep_slave_sg(rxchan, &sgbuf->sg, 1, - DMA_FROM_DEVICE, + DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); /* * If the DMA engine is busy and cannot prepare a diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c index de0f613ed6f..17ae65762d1 100644 --- a/drivers/tty/serial/pch_uart.c +++ b/drivers/tty/serial/pch_uart.c @@ -764,7 +764,7 @@ static int dma_handle_rx(struct eg20t_port *priv) sg_dma_address(sg) = priv->rx_buf_dma; desc = priv->chan_rx->device->device_prep_slave_sg(priv->chan_rx, - sg, 1, DMA_FROM_DEVICE, + sg, 1, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) @@ -923,7 +923,7 @@ static unsigned int dma_handle_tx(struct eg20t_port *priv) } desc = priv->chan_tx->device->device_prep_slave_sg(priv->chan_tx, - priv->sg_tx_p, nent, DMA_TO_DEVICE, + priv->sg_tx_p, nent, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) { dev_err(priv->port.dev, "%s:device_prep_slave_sg Failed\n", diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index 9e62349b3d9..75085795528 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -1339,7 +1339,7 @@ static void sci_submit_rx(struct sci_port *s) struct dma_async_tx_descriptor *desc; desc = chan->device->device_prep_slave_sg(chan, - sg, 1, DMA_FROM_DEVICE, DMA_PREP_INTERRUPT); + sg, 1, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT); if (desc) { s->desc_rx[i] = desc; @@ -1454,7 +1454,7 @@ static void work_fn_tx(struct work_struct *work) BUG_ON(!sg_dma_len(sg)); desc = chan->device->device_prep_slave_sg(chan, - sg, s->sg_len_tx, DMA_TO_DEVICE, + sg, s->sg_len_tx, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) { /* switch to PIO */ diff --git a/drivers/usb/host/ehci-xilinx-of.c b/drivers/usb/host/ehci-xilinx-of.c index 32793ce3d9e..9c2cc463389 100644 --- a/drivers/usb/host/ehci-xilinx-of.c +++ b/drivers/usb/host/ehci-xilinx-of.c @@ -183,7 +183,7 @@ static int __devinit ehci_hcd_xilinx_of_probe(struct platform_device *op) } irq = irq_of_parse_and_map(dn, 0); - if (irq == NO_IRQ) { + if (!irq) { printk(KERN_ERR "%s: irq_of_parse_and_map failed\n", __FILE__); rv = -EBUSY; goto err_irq; diff --git a/drivers/usb/musb/ux500_dma.c b/drivers/usb/musb/ux500_dma.c index a163632877a..97cb45916c4 100644 --- a/drivers/usb/musb/ux500_dma.c +++ b/drivers/usb/musb/ux500_dma.c @@ -84,7 +84,7 @@ static bool ux500_configure_channel(struct dma_channel *channel, struct musb_hw_ep *hw_ep = ux500_channel->hw_ep; struct dma_chan *dma_chan = ux500_channel->dma_chan; struct dma_async_tx_descriptor *dma_desc; - enum dma_data_direction direction; + enum dma_transfer_direction direction; struct scatterlist sg; struct dma_slave_config slave_conf; enum dma_slave_buswidth addr_width; @@ -104,7 +104,7 @@ static bool ux500_configure_channel(struct dma_channel *channel, sg_dma_address(&sg) = dma_addr; sg_dma_len(&sg) = len; - direction = ux500_channel->is_tx ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + direction = ux500_channel->is_tx ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM; addr_width = (len & 0x3) ? DMA_SLAVE_BUSWIDTH_1_BYTE : DMA_SLAVE_BUSWIDTH_4_BYTES; diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index b51fcd80d24..72339bd6fca 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -772,10 +772,10 @@ static void usbhsf_dma_prepare_tasklet(unsigned long data) struct dma_async_tx_descriptor *desc; struct dma_chan *chan = usbhsf_dma_chan_get(fifo, pkt); struct device *dev = usbhs_priv_to_dev(priv); - enum dma_data_direction dir; + enum dma_transfer_direction dir; dma_cookie_t cookie; - dir = usbhs_pipe_is_dir_in(pipe) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + dir = usbhs_pipe_is_dir_in(pipe) ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV; sg_init_table(&sg, 1); sg_set_page(&sg, virt_to_page(pkt->dma), diff --git a/drivers/video/mx3fb.c b/drivers/video/mx3fb.c index e3406ab3130..727a5149d81 100644 --- a/drivers/video/mx3fb.c +++ b/drivers/video/mx3fb.c @@ -245,6 +245,7 @@ struct mx3fb_data { uint32_t h_start_width; uint32_t v_start_width; + enum disp_data_mapping disp_data_fmt; }; struct dma_chan_request { @@ -287,11 +288,14 @@ static void mx3fb_write_reg(struct mx3fb_data *mx3fb, u32 value, unsigned long r __raw_writel(value, mx3fb->reg_base + reg); } -static const uint32_t di_mappings[] = { - 0x1600AAAA, 0x00E05555, 0x00070000, 3, /* RGB888 */ - 0x0005000F, 0x000B000F, 0x0011000F, 1, /* RGB666 */ - 0x0011000F, 0x000B000F, 0x0005000F, 1, /* BGR666 */ - 0x0004003F, 0x000A000F, 0x000F003F, 1 /* RGB565 */ +struct di_mapping { + uint32_t b0, b1, b2; +}; + +static const struct di_mapping di_mappings[] = { + [IPU_DISP_DATA_MAPPING_RGB666] = { 0x0005000f, 0x000b000f, 0x0011000f }, + [IPU_DISP_DATA_MAPPING_RGB565] = { 0x0004003f, 0x000a000f, 0x000f003f }, + [IPU_DISP_DATA_MAPPING_RGB888] = { 0x00070000, 0x000f0000, 0x00170000 }, }; static void sdc_fb_init(struct mx3fb_info *fbi) @@ -334,7 +338,7 @@ static void sdc_enable_channel(struct mx3fb_info *mx3_fbi) /* This enables the channel */ if (mx3_fbi->cookie < 0) { mx3_fbi->txd = dma_chan->device->device_prep_slave_sg(dma_chan, - &mx3_fbi->sg[0], 1, DMA_TO_DEVICE, DMA_PREP_INTERRUPT); + &mx3_fbi->sg[0], 1, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT); if (!mx3_fbi->txd) { dev_err(mx3fb->dev, "Cannot allocate descriptor on %d\n", dma_chan->chan_id); @@ -425,7 +429,6 @@ static int sdc_set_window_pos(struct mx3fb_data *mx3fb, enum ipu_channel channel * @pixel_clk: desired pixel clock frequency in Hz. * @width: width of panel in pixels. * @height: height of panel in pixels. - * @pixel_fmt: pixel format of buffer as FOURCC ASCII code. * @h_start_width: number of pixel clocks between the HSYNC signal pulse * and the start of valid data. * @h_sync_width: width of the HSYNC signal in units of pixel clocks. @@ -442,7 +445,6 @@ static int sdc_set_window_pos(struct mx3fb_data *mx3fb, enum ipu_channel channel static int sdc_init_panel(struct mx3fb_data *mx3fb, enum ipu_panel panel, uint32_t pixel_clk, uint16_t width, uint16_t height, - enum pixel_fmt pixel_fmt, uint16_t h_start_width, uint16_t h_sync_width, uint16_t h_end_width, uint16_t v_start_width, uint16_t v_sync_width, uint16_t v_end_width, @@ -453,6 +455,7 @@ static int sdc_init_panel(struct mx3fb_data *mx3fb, enum ipu_panel panel, uint32_t old_conf; uint32_t div; struct clk *ipu_clk; + const struct di_mapping *map; dev_dbg(mx3fb->dev, "panel size = %d x %d", width, height); @@ -540,36 +543,10 @@ static int sdc_init_panel(struct mx3fb_data *mx3fb, enum ipu_panel panel, sig.Vsync_pol << DI_D3_VSYNC_POL_SHIFT; mx3fb_write_reg(mx3fb, old_conf, DI_DISP_SIG_POL); - switch (pixel_fmt) { - case IPU_PIX_FMT_RGB24: - mx3fb_write_reg(mx3fb, di_mappings[0], DI_DISP3_B0_MAP); - mx3fb_write_reg(mx3fb, di_mappings[1], DI_DISP3_B1_MAP); - mx3fb_write_reg(mx3fb, di_mappings[2], DI_DISP3_B2_MAP); - mx3fb_write_reg(mx3fb, mx3fb_read_reg(mx3fb, DI_DISP_ACC_CC) | - ((di_mappings[3] - 1) << 12), DI_DISP_ACC_CC); - break; - case IPU_PIX_FMT_RGB666: - mx3fb_write_reg(mx3fb, di_mappings[4], DI_DISP3_B0_MAP); - mx3fb_write_reg(mx3fb, di_mappings[5], DI_DISP3_B1_MAP); - mx3fb_write_reg(mx3fb, di_mappings[6], DI_DISP3_B2_MAP); - mx3fb_write_reg(mx3fb, mx3fb_read_reg(mx3fb, DI_DISP_ACC_CC) | - ((di_mappings[7] - 1) << 12), DI_DISP_ACC_CC); - break; - case IPU_PIX_FMT_BGR666: - mx3fb_write_reg(mx3fb, di_mappings[8], DI_DISP3_B0_MAP); - mx3fb_write_reg(mx3fb, di_mappings[9], DI_DISP3_B1_MAP); - mx3fb_write_reg(mx3fb, di_mappings[10], DI_DISP3_B2_MAP); - mx3fb_write_reg(mx3fb, mx3fb_read_reg(mx3fb, DI_DISP_ACC_CC) | - ((di_mappings[11] - 1) << 12), DI_DISP_ACC_CC); - break; - default: - mx3fb_write_reg(mx3fb, di_mappings[12], DI_DISP3_B0_MAP); - mx3fb_write_reg(mx3fb, di_mappings[13], DI_DISP3_B1_MAP); - mx3fb_write_reg(mx3fb, di_mappings[14], DI_DISP3_B2_MAP); - mx3fb_write_reg(mx3fb, mx3fb_read_reg(mx3fb, DI_DISP_ACC_CC) | - ((di_mappings[15] - 1) << 12), DI_DISP_ACC_CC); - break; - } + map = &di_mappings[mx3fb->disp_data_fmt]; + mx3fb_write_reg(mx3fb, map->b0, DI_DISP3_B0_MAP); + mx3fb_write_reg(mx3fb, map->b1, DI_DISP3_B1_MAP); + mx3fb_write_reg(mx3fb, map->b2, DI_DISP3_B2_MAP); spin_unlock_irqrestore(&mx3fb->lock, lock_flags); @@ -780,8 +757,6 @@ static int __set_par(struct fb_info *fbi, bool lock) if (sdc_init_panel(mx3fb, mode, (PICOS2KHZ(fbi->var.pixclock)) * 1000UL, fbi->var.xres, fbi->var.yres, - (fbi->var.sync & FB_SYNC_SWAP_RGB) ? - IPU_PIX_FMT_BGR666 : IPU_PIX_FMT_RGB666, fbi->var.left_margin, fbi->var.hsync_len, fbi->var.right_margin + @@ -1117,7 +1092,7 @@ static int mx3fb_pan_display(struct fb_var_screeninfo *var, async_tx_ack(mx3_fbi->txd); txd = dma_chan->device->device_prep_slave_sg(dma_chan, sg + - mx3_fbi->cur_ipu_buf, 1, DMA_TO_DEVICE, DMA_PREP_INTERRUPT); + mx3_fbi->cur_ipu_buf, 1, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT); if (!txd) { dev_err(fbi->device, "Error preparing a DMA transaction descriptor.\n"); @@ -1349,6 +1324,12 @@ static int init_fb_chan(struct mx3fb_data *mx3fb, struct idmac_channel *ichan) const struct fb_videomode *mode; int ret, num_modes; + if (mx3fb_pdata->disp_data_fmt >= ARRAY_SIZE(di_mappings)) { + dev_err(dev, "Illegal display data format %d\n", + mx3fb_pdata->disp_data_fmt); + return -EINVAL; + } + ichan->client = mx3fb; irq = ichan->eof_irq; @@ -1402,6 +1383,8 @@ static int init_fb_chan(struct mx3fb_data *mx3fb, struct idmac_channel *ichan) mx3fbi->mx3fb = mx3fb; mx3fbi->blank = FB_BLANK_NORMAL; + mx3fb->disp_data_fmt = mx3fb_pdata->disp_data_fmt; + init_completion(&mx3fbi->flip_cmpl); disable_irq(ichan->eof_irq); dev_dbg(mx3fb->dev, "disabling irq %d\n", ichan->eof_irq); diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index 3832e303c33..596e6a7b17d 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -221,7 +221,7 @@ static int register_balloon(struct device *dev) { int i, error; - error = bus_register(&balloon_subsys); + error = subsys_system_register(&balloon_subsys, NULL); if (error) return error; diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ecb9fd3be14..d33f01c08b6 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL Linux website <http://acl.bestbits.at/>. If you don't know what Access Control Lists are, say N + +config BTRFS_FS_CHECK_INTEGRITY + bool "Btrfs with integrity check tool compiled in (DANGEROUS)" + depends on BTRFS_FS + help + Adds code that examines all block write requests (including + writes of the super block). The goal is to verify that the + state of the filesystem on disk is always consistent, i.e., + after a power-loss or kernel panic event the filesystem is + in a consistent state. + + If the integrity check tool is included and activated in + the mount options, plenty of kernel memory is used, and + plenty of additional CPU cycles are spent. Enabling this + functionality is not intended for normal use. + + In most cases, unless you are a btrfs developer who needs + to verify the integrity of (super)-block write requests + during the run of a regression test, say N diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index c0ddfd29c5e..0c4fa2befae 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o + reada.o backref.o ulist.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o +btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 22c64fff1bd..b9a843226de 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -19,18 +19,789 @@ #include "ctree.h" #include "disk-io.h" #include "backref.h" +#include "ulist.h" +#include "transaction.h" +#include "delayed-ref.h" -struct __data_ref { +/* + * this structure records all encountered refs on the way up to the root + */ +struct __prelim_ref { struct list_head list; - u64 inum; - u64 root; - u64 extent_data_item_offset; + u64 root_id; + struct btrfs_key key; + int level; + int count; + u64 parent; + u64 wanted_disk_byte; }; -struct __shared_ref { - struct list_head list; +static int __add_prelim_ref(struct list_head *head, u64 root_id, + struct btrfs_key *key, int level, u64 parent, + u64 wanted_disk_byte, int count) +{ + struct __prelim_ref *ref; + + /* in case we're adding delayed refs, we're holding the refs spinlock */ + ref = kmalloc(sizeof(*ref), GFP_ATOMIC); + if (!ref) + return -ENOMEM; + + ref->root_id = root_id; + if (key) + ref->key = *key; + else + memset(&ref->key, 0, sizeof(ref->key)); + + ref->level = level; + ref->count = count; + ref->parent = parent; + ref->wanted_disk_byte = wanted_disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, + struct ulist *parents, + struct extent_buffer *eb, int level, + u64 wanted_objectid, u64 wanted_disk_byte) +{ + int ret; + int slot; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; u64 disk_byte; -}; + +add_parent: + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (ret < 0) + return ret; + + if (level != 0) + return 0; + + /* + * if the current leaf is full with EXTENT_DATA items, we must + * check the next one if that holds a reference as well. + * ref->count cannot be used to skip this check. + * repeat this until we don't find any additional EXTENT_DATA items. + */ + while (1) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + if (ret) + return 0; + + eb = path->nodes[0]; + for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.objectid != wanted_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + fi = btrfs_item_ptr(eb, slot, + struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == wanted_disk_byte) + goto add_parent; + } + } + + return 0; +} + +/* + * resolve an indirect backref in the form (root_id, key, level) + * to a logical address + */ +static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, + struct __prelim_ref *ref, + struct ulist *parents) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_key root_key; + struct btrfs_key key = {0}; + struct extent_buffer *eb; + int ret = 0; + int root_level; + int level = ref->level; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root_key.objectid = ref->root_id; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + + rcu_read_lock(); + root_level = btrfs_header_level(root->node); + rcu_read_unlock(); + + if (root_level + 1 == level) + goto out; + + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); + pr_debug("search slot in root %llu (level %d, ref count %d) returned " + "%d for key (%llu %u %llu)\n", + (unsigned long long)ref->root_id, level, ref->count, ret, + (unsigned long long)ref->key.objectid, ref->key.type, + (unsigned long long)ref->key.offset); + if (ret < 0) + goto out; + + eb = path->nodes[level]; + if (!eb) { + WARN_ON(1); + ret = 1; + goto out; + } + + if (level == 0) { + if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + } + + /* the last two parameters will only be used for level == 0 */ + ret = add_all_parents(root, path, parents, eb, level, key.objectid, + ref->wanted_disk_byte); +out: + btrfs_free_path(path); + return ret; +} + +/* + * resolve all indirect backrefs from the list + */ +static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, + struct list_head *head) +{ + int err; + int ret = 0; + struct __prelim_ref *ref; + struct __prelim_ref *ref_safe; + struct __prelim_ref *new_ref; + struct ulist *parents; + struct ulist_node *node; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + /* + * _safe allows us to insert directly after the current item without + * iterating over the newly inserted items. + * we're also allowed to re-assign ref during iteration. + */ + list_for_each_entry_safe(ref, ref_safe, head, list) { + if (ref->parent) /* already direct */ + continue; + if (ref->count == 0) + continue; + err = __resolve_indirect_ref(fs_info, ref, parents); + if (err) { + if (ret == 0) + ret = err; + continue; + } + + /* we put the first parent into the ref at hand */ + node = ulist_next(parents, NULL); + ref->parent = node ? node->val : 0; + + /* additional parents require new refs being added here */ + while ((node = ulist_next(parents, node))) { + new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); + if (!new_ref) { + ret = -ENOMEM; + break; + } + memcpy(new_ref, ref, sizeof(*ref)); + new_ref->parent = node->val; + list_add(&new_ref->list, &ref->list); + } + ulist_reinit(parents); + } + + ulist_free(parents); + return ret; +} + +/* + * merge two lists of backrefs and adjust counts accordingly + * + * mode = 1: merge identical keys, if key is set + * mode = 2: merge identical parents + */ +static int __merge_refs(struct list_head *head, int mode) +{ + struct list_head *pos1; + + list_for_each(pos1, head) { + struct list_head *n2; + struct list_head *pos2; + struct __prelim_ref *ref1; + + ref1 = list_entry(pos1, struct __prelim_ref, list); + + if (mode == 1 && ref1->key.type == 0) + continue; + for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; + pos2 = n2, n2 = pos2->next) { + struct __prelim_ref *ref2; + + ref2 = list_entry(pos2, struct __prelim_ref, list); + + if (mode == 1) { + if (memcmp(&ref1->key, &ref2->key, + sizeof(ref1->key)) || + ref1->level != ref2->level || + ref1->root_id != ref2->root_id) + continue; + ref1->count += ref2->count; + } else { + if (ref1->parent != ref2->parent) + continue; + ref1->count += ref2->count; + } + list_del(&ref2->list); + kfree(ref2); + } + + } + return 0; +} + +/* + * add all currently queued delayed refs from this head whose seq nr is + * smaller or equal that seq to the list + */ +static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, + struct btrfs_key *info_key, + struct list_head *prefs) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + struct rb_node *n = &head->node.rb_node; + int sgn; + int ret; + + if (extent_op && extent_op->update_key) + btrfs_disk_key_to_cpu(info_key, &extent_op->key); + + while ((n = rb_prev(n))) { + struct btrfs_delayed_ref_node *node; + node = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + if (node->bytenr != head->node.bytenr) + break; + WARN_ON(node->is_head); + + if (node->seq > seq) + continue; + + switch (node->action) { + case BTRFS_ADD_DELAYED_EXTENT: + case BTRFS_UPDATE_DELAYED_HEAD: + WARN_ON(1); + continue; + case BTRFS_ADD_DELAYED_REF: + sgn = 1; + break; + case BTRFS_DROP_DELAYED_REF: + sgn = -1; + break; + default: + BUG_ON(1); + } + switch (node->type) { + case BTRFS_TREE_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, info_key, + ref->level + 1, 0, node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_SHARED_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, info_key, + ref->level + 1, ref->parent, + node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + struct btrfs_key key; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, + node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + struct btrfs_key key; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, + ref->parent, node->bytenr, + node->ref_mod * sgn); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + } + + return 0; +} + +/* + * add all inline backrefs for bytenr to the list + */ +static int __add_inline_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + struct btrfs_key *info_key, int *info_level, + struct list_head *prefs) +{ + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + unsigned long end; + struct btrfs_extent_item *ei; + u64 flags; + u64 item_size; + + /* + * enumerate all inline refs + */ + leaf = path->nodes[0]; + slot = path->slots[0] - 1; + + item_size = btrfs_item_size_nr(leaf, slot); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + struct btrfs_disk_key disk_key; + + info = (struct btrfs_tree_block_info *)ptr; + *info_level = btrfs_tree_block_level(leaf, info); + btrfs_tree_block_key(leaf, info, &disk_key); + btrfs_disk_key_to_cpu(info_key, &disk_key); + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + u64 offset; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + + switch (type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, info_key, + *info_level + 1, offset, + bytenr, 1); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, + bytenr, count); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, offset, info_key, + *info_level + 1, 0, bytenr, 1); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, + count); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + ptr += btrfs_extent_inline_ref_size(type); + } + + return 0; +} + +/* + * add all non-inline backrefs for bytenr to the list + */ +static int __add_keyed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + struct btrfs_key *info_key, int info_level, + struct list_head *prefs) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + + while (1) { + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + break; + if (ret) { + ret = 0; + break; + } + + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid != bytenr) + break; + if (key.type < BTRFS_TREE_BLOCK_REF_KEY) + continue; + if (key.type > BTRFS_SHARED_DATA_REF_KEY) + break; + + switch (key.type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, info_key, + info_level + 1, key.offset, + bytenr, 1); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = btrfs_item_ptr(leaf, slot, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, + bytenr, count); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, key.offset, info_key, + info_level + 1, 0, bytenr, 1); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_data_ref); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + } + + return ret; +} + +/* + * this adds all existing backrefs (inline backrefs, backrefs and delayed + * refs) for the given bytenr to the refs list, merges duplicates and resolves + * indirect refs to their parent bytenr. + * When roots are found, they're added to the roots list + * + * FIXME some caching might speed things up + */ +static int find_parent_nodes(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 seq, struct ulist *refs, struct ulist *roots) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_key info_key = { 0 }; + struct btrfs_delayed_ref_root *delayed_refs = NULL; + struct btrfs_delayed_ref_head *head = NULL; + int info_level = 0; + int ret; + struct list_head prefs_delayed; + struct list_head prefs; + struct __prelim_ref *ref; + + INIT_LIST_HEAD(&prefs); + INIT_LIST_HEAD(&prefs_delayed); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * grab both a lock on the path and a lock on the delayed ref head. + * We need both to get a consistent picture of how the refs look + * at a specified point in time + */ +again: + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + /* + * look if there are updates for this ref queued and lock the head + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); + if (ret) + goto out; + } + spin_unlock(&delayed_refs->lock); + + if (path->slots[0]) { + struct extent_buffer *leaf; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = __add_inline_refs(fs_info, path, bytenr, + &info_key, &info_level, &prefs); + if (ret) + goto out; + ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, + info_level, &prefs); + if (ret) + goto out; + } + } + btrfs_release_path(path); + + /* + * when adding the delayed refs above, the info_key might not have + * been known yet. Go over the list and replace the missing keys + */ + list_for_each_entry(ref, &prefs_delayed, list) { + if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) + memcpy(&ref->key, &info_key, sizeof(ref->key)); + } + list_splice_init(&prefs_delayed, &prefs); + + ret = __merge_refs(&prefs, 1); + if (ret) + goto out; + + ret = __resolve_indirect_refs(fs_info, &prefs); + if (ret) + goto out; + + ret = __merge_refs(&prefs, 2); + if (ret) + goto out; + + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + if (ref->count < 0) + WARN_ON(1); + if (ref->count && ref->root_id && ref->parent == 0) { + /* no parent == root of tree */ + ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + BUG_ON(ret < 0); + } + if (ref->count && ref->parent) { + ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); + BUG_ON(ret < 0); + } + kfree(ref); + } + +out: + if (head) + mutex_unlock(&head->mutex); + btrfs_free_path(path); + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + kfree(ref); + } + while (!list_empty(&prefs_delayed)) { + ref = list_first_entry(&prefs_delayed, struct __prelim_ref, + list); + list_del(&ref->list); + kfree(ref); + } + + return ret; +} + +/* + * Finds all leafs with a reference to the specified combination of bytenr and + * offset. key_list_head will point to a list of corresponding keys (caller must + * free each list element). The leafs will be stored in the leafs ulist, which + * must be freed with ulist_free. + * + * returns 0 on success, <0 on error + */ +static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **leafs) +{ + struct ulist *tmp; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *leafs = ulist_alloc(GFP_NOFS); + if (!*leafs) { + ulist_free(tmp); + return -ENOMEM; + } + + ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); + ulist_free(tmp); + + if (ret < 0 && ret != -ENOENT) { + ulist_free(*leafs); + return ret; + } + + return 0; +} + +/* + * walk all backrefs for a given extent to find all roots that reference this + * extent. Walking a backref means finding all extents that reference this + * extent and in turn walk the backrefs of those, too. Naturally this is a + * recursive process, but here it is implemented in an iterative fashion: We + * find all referencing extents for the extent in question and put them on a + * list. In turn, we find all referencing extents for those, further appending + * to the list. The way we iterate the list allows adding more elements after + * the current while iterating. The process stops when we reach the end of the + * list. Found roots are added to the roots list. + * + * returns 0 on success, < 0 on error. + */ +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **roots) +{ + struct ulist *tmp; + struct ulist_node *node = NULL; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *roots = ulist_alloc(GFP_NOFS); + if (!*roots) { + ulist_free(tmp); + return -ENOMEM; + } + + while (1) { + ret = find_parent_nodes(trans, fs_info, bytenr, seq, + tmp, *roots); + if (ret < 0 && ret != -ENOENT) { + ulist_free(tmp); + ulist_free(*roots); + return ret; + } + node = ulist_next(tmp, node); + if (!node) + break; + bytenr = node->val; + } + + ulist_free(tmp); + return 0; +} + static int __inode_info(u64 inum, u64 ioff, u8 key_type, struct btrfs_root *fs_root, struct btrfs_path *path, @@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type != BTRFS_EXTENT_ITEM_KEY || found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) + found_key->objectid + found_key->offset <= logical) { + pr_debug("logical %llu is not within any extent\n", + (unsigned long long)logical); return -ENOENT; + } eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); @@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); + pr_debug("logical %llu is at position %llu within the extent (%llu " + "EXTENT_ITEM %llu) flags %#llx size %u\n", + (unsigned long long)logical, + (unsigned long long)(logical - found_key->objectid), + (unsigned long long)found_key->objectid, + (unsigned long long)found_key->offset, + (unsigned long long)flags, item_size); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return BTRFS_EXTENT_FLAG_TREE_BLOCK; if (flags & BTRFS_EXTENT_FLAG_DATA) @@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int __data_list_add(struct list_head *head, u64 inum, - u64 extent_data_item_offset, u64 root) -{ - struct __data_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - ref->inum = inum; - ref->extent_data_item_offset = extent_data_item_offset; - ref->root = root; - list_add_tail(&ref->list, head); - - return 0; -} - -static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, - struct btrfs_extent_data_ref *dref) -{ - return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), - btrfs_extent_data_ref_offset(eb, dref), - btrfs_extent_data_ref_root(eb, dref)); -} - -static int __shared_list_add(struct list_head *head, u64 disk_byte) -{ - struct __shared_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - ref->disk_byte = disk_byte; - list_add_tail(&ref->list, head); - - return 0; -} - -static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, - u64 logical, u64 inum, - u64 extent_data_item_offset, - u64 extent_offset, - struct btrfs_path *path, - struct list_head *data_refs, - iterate_extent_inodes_t *iterate, - void *ctx) -{ - u64 ref_root; - u32 item_size; - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *eiref; - struct __data_ref *ref; - int ret; - int type; - int last; - unsigned long ptr = 0; - - WARN_ON(!list_empty(data_refs)); - ret = extent_from_logical(fs_info, logical, path, &key); - if (ret & BTRFS_EXTENT_FLAG_DATA) - ret = -EIO; - if (ret < 0) - goto out; - - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - ret = 0; - ref_root = 0; - /* - * as done in iterate_extent_inodes, we first build a list of refs to - * iterate, then free the path and then iterate them to avoid deadlocks. - */ - do { - last = __get_extent_inline_ref(&ptr, eb, ei, item_size, - &eiref, &type); - if (last < 0) { - ret = last; - goto out; - } - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) { - ref_root = btrfs_extent_inline_ref_offset(eb, eiref); - ret = __data_list_add(data_refs, inum, - extent_data_item_offset, - ref_root); - } - } while (!ret && !last); - - btrfs_release_path(path); - - if (ref_root == 0) { - printk(KERN_ERR "btrfs: failed to find tree block ref " - "for shared data backref %llu\n", logical); - WARN_ON(1); - ret = -EIO; - } - -out: - while (!list_empty(data_refs)) { - ref = list_first_entry(data_refs, struct __data_ref, list); - list_del(&ref->list); - if (!ret) - ret = iterate(ref->inum, extent_offset + - ref->extent_data_item_offset, - ref->root, ctx); - kfree(ref); - } - - return ret; -} - -static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, - u64 logical, u64 orig_extent_item_objectid, - u64 extent_offset, struct btrfs_path *path, - struct list_head *data_refs, - iterate_extent_inodes_t *iterate, - void *ctx) +static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 logical, + u64 orig_extent_item_objectid, + u64 extent_item_pos, u64 root, + iterate_extent_inodes_t *iterate, void *ctx) { u64 disk_byte; struct btrfs_key key; @@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int slot; int nritems; - int ret; - int found = 0; + int ret = 0; + int extent_type; + u64 data_offset; + u64 data_len; eb = read_tree_block(fs_info->tree_root, logical, fs_info->tree_root->leafsize, 0); @@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - if (!fi) { - free_extent_buffer(eb); - return -EIO; - } + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != orig_extent_item_objectid) { - if (found) - break; - else - continue; - } - ++found; - ret = __iter_shared_inline_ref_inodes(fs_info, logical, - key.objectid, - key.offset, - extent_offset, path, - data_refs, - iterate, ctx); - if (ret) - break; - } + if (disk_byte != orig_extent_item_objectid) + continue; - if (!found) { - printk(KERN_ERR "btrfs: failed to follow shared data backref " - "to parent %llu\n", logical); - WARN_ON(1); - ret = -EIO; + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + continue; + + pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " + "root %llu\n", orig_extent_item_objectid, + key.objectid, key.offset, root); + ret = iterate(key.objectid, + key.offset + (extent_item_pos - data_offset), + root, ctx); + if (ret) { + pr_debug("stopping iteration because ret=%d\n", ret); + break; + } } free_extent_buffer(eb); + return ret; } /* * calls iterate() for every inode that references the extent identified by - * the given parameters. will use the path given as a parameter and return it - * released. + * the given parameters. * when the iterator function returns a non-zero value, iteration stops. + * path is guaranteed to be in released state when iterate() is called. */ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - u64 extent_item_objectid, - u64 extent_offset, + u64 extent_item_objectid, u64 extent_item_pos, iterate_extent_inodes_t *iterate, void *ctx) { - unsigned long ptr = 0; - int last; int ret; - int type; - u64 logical; - u32 item_size; - struct btrfs_extent_inline_ref *eiref; - struct btrfs_extent_data_ref *dref; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_key key; struct list_head data_refs = LIST_HEAD_INIT(data_refs); struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); - struct __data_ref *ref_d; - struct __shared_ref *ref_s; - - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - /* first we iterate the inline refs, ... */ - do { - last = __get_extent_inline_ref(&ptr, eb, ei, item_size, - &eiref, &type); - if (last == -ENOENT) { - ret = 0; - break; - } - if (last < 0) { - ret = last; - break; - } + struct btrfs_trans_handle *trans; + struct ulist *refs; + struct ulist *roots; + struct ulist_node *ref_node = NULL; + struct ulist_node *root_node = NULL; + struct seq_list seq_elem; + struct btrfs_delayed_ref_root *delayed_refs; + + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + pr_debug("resolving all inodes for extent %llu\n", + extent_item_objectid); + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + btrfs_get_delayed_seq(delayed_refs, &seq_elem); + spin_unlock(&delayed_refs->lock); + + ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, + extent_item_pos, seq_elem.seq, + &refs); - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&eiref->offset); - ret = __data_list_add_eb(&data_refs, eb, dref); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - logical = btrfs_extent_inline_ref_offset(eb, eiref); - ret = __shared_list_add(&shared_refs, logical); - } - } while (!ret && !last); + if (ret) + goto out; - /* ... then we proceed to in-tree references and ... */ - while (!ret) { - ++path->slots[0]; - if (path->slots[0] > btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_info->extent_root, path); - if (ret) { - if (ret == 1) - ret = 0; /* we're done */ - break; - } - eb = path->nodes[0]; - } - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_item_objectid) + while (!ret && (ref_node = ulist_next(refs, ref_node))) { + ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, + seq_elem.seq, &roots); + if (ret) break; - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = __data_list_add_eb(&data_refs, eb, dref); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ret = __shared_list_add(&shared_refs, key.offset); + while (!ret && (root_node = ulist_next(roots, root_node))) { + pr_debug("root %llu references leaf %llu\n", + root_node->val, ref_node->val); + ret = iterate_leaf_refs(fs_info, path, ref_node->val, + extent_item_objectid, + extent_item_pos, root_node->val, + iterate, ctx); } } - btrfs_release_path(path); - - /* - * ... only at the very end we can process the refs we found. this is - * because the iterator function we call is allowed to make tree lookups - * and we have to avoid deadlocks. additionally, we need more tree - * lookups ourselves for shared data refs. - */ - while (!list_empty(&data_refs)) { - ref_d = list_first_entry(&data_refs, struct __data_ref, list); - list_del(&ref_d->list); - if (!ret) - ret = iterate(ref_d->inum, extent_offset + - ref_d->extent_data_item_offset, - ref_d->root, ctx); - kfree(ref_d); - } - - while (!list_empty(&shared_refs)) { - ref_s = list_first_entry(&shared_refs, struct __shared_ref, - list); - list_del(&ref_s->list); - if (!ret) - ret = __iter_shared_inline_ref(fs_info, - ref_s->disk_byte, - extent_item_objectid, - extent_offset, path, - &data_refs, - iterate, ctx); - kfree(ref_s); - } - + ulist_free(refs); + ulist_free(roots); +out: + btrfs_put_delayed_seq(delayed_refs, &seq_elem); + btrfs_end_transaction(trans, fs_info->extent_root); return ret; } @@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - u64 offset; + u64 extent_item_pos; struct btrfs_key found_key; ret = extent_from_logical(fs_info, logical, path, &found_key); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -EINVAL; if (ret < 0) return ret; - offset = logical - found_key.objectid; + extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, path, found_key.objectid, - offset, iterate, ctx); + extent_item_pos, iterate, ctx); return ret; } @@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ + pr_debug("following ref at offset %u for inode %llu in " + "tree %llu\n", cur, + (unsigned long long)found_key.objectid, + (unsigned long long)fs_root->objectid); ret = iterate(parent, iref, eb, ctx); if (ret) { free_extent_buffer(eb); @@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { + pr_debug("path resolved: %s\n", fspath); ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { + pr_debug("missed path, not enough space. missing bytes: %lu, " + "constructed so far: %s\n", + (unsigned long)(fspath_min - fspath), fspath_min); ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 92618837cb8..d00dfa9ca93 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -20,6 +20,7 @@ #define __BTRFS_BACKREF__ #include "ioctl.h" +#include "ulist.h" struct inode_fs_paths { struct btrfs_path *btrfs_path; @@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **roots); + struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 634608d2a6d..9b9b15fd520 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -51,6 +51,9 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; + /* held while doing delalloc reservations */ + struct mutex delalloc_mutex; + /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c new file mode 100644 index 00000000000..ad0b3ba735b --- /dev/null +++ b/fs/btrfs/check-integrity.c @@ -0,0 +1,3068 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* + * This module can be used to catch cases when the btrfs kernel + * code executes write requests to the disk that bring the file + * system in an inconsistent state. In such a state, a power-loss + * or kernel panic event would cause that the data on disk is + * lost or at least damaged. + * + * Code is added that examines all block write requests during + * runtime (including writes of the super block). Three rules + * are verified and an error is printed on violation of the + * rules: + * 1. It is not allowed to write a disk block which is + * currently referenced by the super block (either directly + * or indirectly). + * 2. When a super block is written, it is verified that all + * referenced (directly or indirectly) blocks fulfill the + * following requirements: + * 2a. All referenced blocks have either been present when + * the file system was mounted, (i.e., they have been + * referenced by the super block) or they have been + * written since then and the write completion callback + * was called and a FLUSH request to the device where + * these blocks are located was received and completed. + * 2b. All referenced blocks need to have a generation + * number which is equal to the parent's number. + * + * One issue that was found using this module was that the log + * tree on disk became temporarily corrupted because disk blocks + * that had been in use for the log tree had been freed and + * reused too early, while being referenced by the written super + * block. + * + * The search term in the kernel log that can be used to filter + * on the existence of detected integrity issues is + * "btrfs: attempt". + * + * The integrity check is enabled via mount options. These + * mount options are only supported if the integrity check + * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. + * + * Example #1, apply integrity checks to all metadata: + * mount /dev/sdb1 /mnt -o check_int + * + * Example #2, apply integrity checks to all metadata and + * to data extents: + * mount /dev/sdb1 /mnt -o check_int_data + * + * Example #3, apply integrity checks to all metadata and dump + * the tree that the super block references to kernel messages + * each time after a super block was written: + * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 + * + * If the integrity check tool is included and activated in + * the mount options, plenty of kernel memory is used, and + * plenty of additional CPU cycles are spent. Enabling this + * functionality is not intended for normal use. In most + * cases, unless you are a btrfs developer who needs to verify + * the integrity of (super)-block write requests, do not + * enable the config option BTRFS_FS_CHECK_INTEGRITY to + * include and compile the integrity check tool. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/mutex.h> +#include <linux/crc32c.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "extent_io.h" +#include "disk-io.h" +#include "volumes.h" +#include "print-tree.h" +#include "locking.h" +#include "check-integrity.h" + +#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 +#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 +#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 +#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 +#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 +#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, + * excluding " [...]" */ +#define BTRFSIC_BLOCK_SIZE PAGE_SIZE + +#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) + +/* + * The definition of the bitmask fields for the print_mask. + * They are specified with the mount option check_integrity_print_mask. + */ +#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 +#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 +#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 +#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 +#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 +#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 +#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 +#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 +#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 +#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 +#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 +#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 + +struct btrfsic_dev_state; +struct btrfsic_state; + +struct btrfsic_block { + u32 magic_num; /* only used for debug purposes */ + unsigned int is_metadata:1; /* if it is meta-data, not data-data */ + unsigned int is_superblock:1; /* if it is one of the superblocks */ + unsigned int is_iodone:1; /* if is done by lower subsystem */ + unsigned int iodone_w_error:1; /* error was indicated to endio */ + unsigned int never_written:1; /* block was added because it was + * referenced, not because it was + * written */ + unsigned int mirror_num:2; /* large enough to hold + * BTRFS_SUPER_MIRROR_MAX */ + struct btrfsic_dev_state *dev_state; + u64 dev_bytenr; /* key, physical byte num on disk */ + u64 logical_bytenr; /* logical byte num on disk */ + u64 generation; + struct btrfs_disk_key disk_key; /* extra info to print in case of + * issues, will not always be correct */ + struct list_head collision_resolving_node; /* list node */ + struct list_head all_blocks_node; /* list node */ + + /* the following two lists contain block_link items */ + struct list_head ref_to_list; /* list */ + struct list_head ref_from_list; /* list */ + struct btrfsic_block *next_in_same_bio; + void *orig_bio_bh_private; + union { + bio_end_io_t *bio; + bh_end_io_t *bh; + } orig_bio_bh_end_io; + int submit_bio_bh_rw; + u64 flush_gen; /* only valid if !never_written */ +}; + +/* + * Elements of this type are allocated dynamically and required because + * each block object can refer to and can be ref from multiple blocks. + * The key to lookup them in the hashtable is the dev_bytenr of + * the block ref to plus the one from the block refered from. + * The fact that they are searchable via a hashtable and that a + * ref_cnt is maintained is not required for the btrfs integrity + * check algorithm itself, it is only used to make the output more + * beautiful in case that an error is detected (an error is defined + * as a write operation to a block while that block is still referenced). + */ +struct btrfsic_block_link { + u32 magic_num; /* only used for debug purposes */ + u32 ref_cnt; + struct list_head node_ref_to; /* list node */ + struct list_head node_ref_from; /* list node */ + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block *block_ref_to; + struct btrfsic_block *block_ref_from; + u64 parent_generation; +}; + +struct btrfsic_dev_state { + u32 magic_num; /* only used for debug purposes */ + struct block_device *bdev; + struct btrfsic_state *state; + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block dummy_block_for_bio_bh_flush; + u64 last_flush_gen; + char name[BDEVNAME_SIZE]; +}; + +struct btrfsic_block_hashtable { + struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_link_hashtable { + struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; +}; + +struct btrfsic_dev_state_hashtable { + struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_data_ctx { + u64 start; /* virtual bytenr */ + u64 dev_bytenr; /* physical bytenr on device */ + u32 len; + struct btrfsic_dev_state *dev; + char *data; + struct buffer_head *bh; /* do not use if set to NULL */ +}; + +/* This structure is used to implement recursion without occupying + * any stack space, refer to btrfsic_process_metablock() */ +struct btrfsic_stack_frame { + u32 magic; + u32 nr; + int error; + int i; + int limit_nesting; + int num_copies; + int mirror_num; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx *block_ctx; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfs_header *hdr; + struct btrfsic_stack_frame *prev; +}; + +/* Some state per mounted filesystem */ +struct btrfsic_state { + u32 print_mask; + int include_extent_data; + int csum_size; + struct list_head all_blocks_list; + struct btrfsic_block_hashtable block_hashtable; + struct btrfsic_block_link_hashtable block_link_hashtable; + struct btrfs_root *root; + u64 max_superblock_generation; + struct btrfsic_block *latest_superblock; +}; + +static void btrfsic_block_init(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_alloc(void); +static void btrfsic_block_free(struct btrfsic_block *b); +static void btrfsic_block_link_init(struct btrfsic_block_link *n); +static struct btrfsic_block_link *btrfsic_block_link_alloc(void); +static void btrfsic_block_link_free(struct btrfsic_block_link *n); +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h); +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices); +static int btrfsic_process_metablock(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + struct btrfs_header *hdr, + int limit_nesting, int force_iodone_flag); +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx + *block_ctx, u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation); +static int btrfsic_handle_extent_data(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag); +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num); +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out); +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx); +static void btrfsic_dump_database(struct btrfsic_state *state); +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + const u8 *data, unsigned int size); +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, u8 *mapped_data, + unsigned int len, struct bio *bio, + int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw); +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const block, + struct btrfs_super_block *const super_hdr); +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); +static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level); +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level); +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block); +static void btrfsic_dump_tree(const struct btrfsic_state *state); +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level); +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation); +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created); +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev); +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char *data); + +static struct mutex btrfsic_mutex; +static int btrfsic_is_initialized; +static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; + + +static void btrfsic_block_init(struct btrfsic_block *b) +{ + b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; + b->dev_state = NULL; + b->dev_bytenr = 0; + b->logical_bytenr = 0; + b->generation = BTRFSIC_GENERATION_UNKNOWN; + b->disk_key.objectid = 0; + b->disk_key.type = 0; + b->disk_key.offset = 0; + b->is_metadata = 0; + b->is_superblock = 0; + b->is_iodone = 0; + b->iodone_w_error = 0; + b->never_written = 0; + b->mirror_num = 0; + b->next_in_same_bio = NULL; + b->orig_bio_bh_private = NULL; + b->orig_bio_bh_end_io.bio = NULL; + INIT_LIST_HEAD(&b->collision_resolving_node); + INIT_LIST_HEAD(&b->all_blocks_node); + INIT_LIST_HEAD(&b->ref_to_list); + INIT_LIST_HEAD(&b->ref_from_list); + b->submit_bio_bh_rw = 0; + b->flush_gen = 0; +} + +static struct btrfsic_block *btrfsic_block_alloc(void) +{ + struct btrfsic_block *b; + + b = kzalloc(sizeof(*b), GFP_NOFS); + if (NULL != b) + btrfsic_block_init(b); + + return b; +} + +static void btrfsic_block_free(struct btrfsic_block *b) +{ + BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); + kfree(b); +} + +static void btrfsic_block_link_init(struct btrfsic_block_link *l) +{ + l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; + l->ref_cnt = 1; + INIT_LIST_HEAD(&l->node_ref_to); + INIT_LIST_HEAD(&l->node_ref_from); + INIT_LIST_HEAD(&l->collision_resolving_node); + l->block_ref_to = NULL; + l->block_ref_from = NULL; +} + +static struct btrfsic_block_link *btrfsic_block_link_alloc(void) +{ + struct btrfsic_block_link *l; + + l = kzalloc(sizeof(*l), GFP_NOFS); + if (NULL != l) + btrfsic_block_link_init(l); + + return l; +} + +static void btrfsic_block_link_free(struct btrfsic_block_link *l) +{ + BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); + kfree(l); +} + +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) +{ + ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; + ds->bdev = NULL; + ds->state = NULL; + ds->name[0] = '\0'; + INIT_LIST_HEAD(&ds->collision_resolving_node); + ds->last_flush_gen = 0; + btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); + ds->dummy_block_for_bio_bh_flush.is_iodone = 1; + ds->dummy_block_for_bio_bh_flush.dev_state = ds; +} + +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) +{ + struct btrfsic_dev_state *ds; + + ds = kzalloc(sizeof(*ds), GFP_NOFS); + if (NULL != ds) + btrfsic_dev_state_init(ds); + + return ds; +} + +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) +{ + BUG_ON(!(NULL == ds || + BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); + kfree(ds); +} + +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(b->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)b->dev_state->bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + + list_add(&b->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) +{ + list_del(&b->collision_resolving_node); +} + +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block *const b = + list_entry(elem, struct btrfsic_block, + collision_resolving_node); + + if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) + return b; + } + + return NULL; +} + +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ + ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ + ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) + & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + list_add(&l->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) +{ + list_del(&l->collision_resolving_node); +} + +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ + ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ + ((unsigned int)((uintptr_t)bdev_ref_to)) ^ + ((unsigned int)((uintptr_t)bdev_ref_from))) & + (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block_link *const l = + list_entry(elem, struct btrfsic_block_link, + collision_resolving_node); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + if (l->block_ref_to->dev_state->bdev == bdev_ref_to && + l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && + l->block_ref_from->dev_state->bdev == bdev_ref_from && + l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) + return l; + } + + return NULL; +} + +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)ds->bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + + list_add(&ds->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) +{ + list_del(&ds->collision_resolving_node); +} + +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_dev_state *const ds = + list_entry(elem, struct btrfsic_dev_state, + collision_resolving_node); + + if (ds->bdev == bdev) + return ds; + } + + return NULL; +} + +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_super_block *selected_super; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + struct btrfsic_dev_state *selected_dev_state = NULL; + int pass; + + BUG_ON(NULL == state); + selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); + if (NULL == selected_super) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return -1; + } + + list_for_each_entry(device, dev_head, dev_list) { + int i; + struct btrfsic_dev_state *dev_state; + + if (!device->bdev || !device->name) + continue; + + dev_state = btrfsic_dev_state_lookup(device->bdev); + BUG_ON(NULL == dev_state); + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + ret = btrfsic_process_superblock_dev_mirror( + state, dev_state, device, i, + &selected_dev_state, selected_super); + if (0 != ret && 0 == i) { + kfree(selected_super); + return ret; + } + } + } + + if (NULL == state->latest_superblock) { + printk(KERN_INFO "btrfsic: no superblock found!\n"); + kfree(selected_super); + return -1; + } + + state->csum_size = btrfs_super_csum_size(selected_super); + + for (pass = 0; pass < 3; pass++) { + int num_copies; + int mirror_num; + u64 next_bytenr; + + switch (pass) { + case 0: + next_bytenr = btrfs_super_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", + (unsigned long long)next_bytenr); + break; + case 1: + next_bytenr = btrfs_super_chunk_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", + (unsigned long long)next_bytenr); + break; + case 2: + next_bytenr = btrfs_super_log_root(selected_super); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", + (unsigned long long)next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + struct btrfs_header *hdr; + + ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(root @%llu," + " mirror %d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + kfree(selected_super); + return -1; + } + + next_block = btrfsic_block_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + &state->block_hashtable); + BUG_ON(NULL == next_block); + + l = btrfsic_block_link_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + state->latest_superblock->dev_state-> + bdev, + state->latest_superblock->dev_bytenr, + &state->block_link_hashtable); + BUG_ON(NULL == l); + + ret = btrfsic_read_block(state, &tmp_next_block_ctx); + if (ret < (int)BTRFSIC_BLOCK_SIZE) { + printk(KERN_INFO + "btrfsic: read @logical %llu failed!\n", + (unsigned long long) + tmp_next_block_ctx.start); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + kfree(selected_super); + return -1; + } + + hdr = (struct btrfs_header *)tmp_next_block_ctx.data; + ret = btrfsic_process_metablock(state, + next_block, + &tmp_next_block_ctx, + hdr, + BTRFS_MAX_LEVEL + 3, 1); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + } + } + + kfree(selected_super); + return ret; +} + +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super) +{ + struct btrfs_super_block *super_tmp; + u64 dev_bytenr; + struct buffer_head *bh; + struct btrfsic_block *superblock_tmp; + int pass; + struct block_device *const superblock_bdev = device->bdev; + + /* super block bytenr is always the unmapped device bytenr */ + dev_bytenr = btrfs_sb_offset(superblock_mirror_num); + bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); + if (NULL == bh) + return -1; + super_tmp = (struct btrfs_super_block *) + (bh->b_data + (dev_bytenr & 4095)); + + if (btrfs_super_bytenr(super_tmp) != dev_bytenr || + strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, + sizeof(super_tmp->magic)) || + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { + brelse(bh); + return 0; + } + + superblock_tmp = + btrfsic_block_hashtable_lookup(superblock_bdev, + dev_bytenr, + &state->block_hashtable); + if (NULL == superblock_tmp) { + superblock_tmp = btrfsic_block_alloc(); + if (NULL == superblock_tmp) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + brelse(bh); + return -1; + } + /* for superblock, only the dev_bytenr makes sense */ + superblock_tmp->dev_bytenr = dev_bytenr; + superblock_tmp->dev_state = dev_state; + superblock_tmp->logical_bytenr = dev_bytenr; + superblock_tmp->generation = btrfs_super_generation(super_tmp); + superblock_tmp->is_metadata = 1; + superblock_tmp->is_superblock = 1; + superblock_tmp->is_iodone = 1; + superblock_tmp->never_written = 0; + superblock_tmp->mirror_num = 1 + superblock_mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO "New initial S-block (bdev %p, %s)" + " @%llu (%s/%llu/%d)\n", + superblock_bdev, device->name, + (unsigned long long)dev_bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + superblock_mirror_num); + list_add(&superblock_tmp->all_blocks_node, + &state->all_blocks_list); + btrfsic_block_hashtable_add(superblock_tmp, + &state->block_hashtable); + } + + /* select the one with the highest generation field */ + if (btrfs_super_generation(super_tmp) > + state->max_superblock_generation || + 0 == state->max_superblock_generation) { + memcpy(selected_super, super_tmp, sizeof(*selected_super)); + *selected_dev_state = dev_state; + state->max_superblock_generation = + btrfs_super_generation(super_tmp); + state->latest_superblock = superblock_tmp; + } + + for (pass = 0; pass < 3; pass++) { + u64 next_bytenr; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + switch (pass) { + case 0: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + additional_string = "initial root "; + next_bytenr = btrfs_super_root(super_tmp); + break; + case 1: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "initial chunk "; + next_bytenr = btrfs_super_chunk_root(super_tmp); + break; + case 2: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + additional_string = "initial log "; + next_bytenr = btrfs_super_log_root(super_tmp); + if (0 == next_bytenr) + continue; + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num)) { + printk(KERN_INFO "btrfsic: btrfsic_map_block(" + "bytenr @%llu, mirror %d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + brelse(bh); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, &tmp_next_block_ctx, + additional_string, 1, 1, 0, + mirror_num, NULL); + if (NULL == next_block) { + btrfsic_release_block_ctx(&tmp_next_block_ctx); + brelse(bh); + return -1; + } + + next_block->disk_key = tmp_disk_key; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, &tmp_next_block_ctx, + next_block, superblock_tmp, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) { + brelse(bh); + return -1; + } + } + } + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) + btrfsic_dump_tree_sub(state, superblock_tmp, 0); + + brelse(bh); + return 0; +} + +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) +{ + struct btrfsic_stack_frame *sf; + + sf = kzalloc(sizeof(*sf), GFP_NOFS); + if (NULL == sf) + printk(KERN_INFO "btrfsic: alloc memory failed!\n"); + else + sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; + return sf; +} + +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) +{ + BUG_ON(!(NULL == sf || + BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); + kfree(sf); +} + +static int btrfsic_process_metablock( + struct btrfsic_state *state, + struct btrfsic_block *const first_block, + struct btrfsic_block_data_ctx *const first_block_ctx, + struct btrfs_header *const first_hdr, + int first_limit_nesting, int force_iodone_flag) +{ + struct btrfsic_stack_frame initial_stack_frame = { 0 }; + struct btrfsic_stack_frame *sf; + struct btrfsic_stack_frame *next_stack; + + sf = &initial_stack_frame; + sf->error = 0; + sf->i = -1; + sf->limit_nesting = first_limit_nesting; + sf->block = first_block; + sf->block_ctx = first_block_ctx; + sf->next_block = NULL; + sf->hdr = first_hdr; + sf->prev = NULL; + +continue_with_new_stack_frame: + sf->block->generation = le64_to_cpu(sf->hdr->generation); + if (0 == sf->hdr->level) { + struct btrfs_leaf *const leafhdr = + (struct btrfs_leaf *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = le32_to_cpu(leafhdr->header.nritems); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "leaf %llu items %d generation %llu" + " owner %llu\n", + (unsigned long long) + sf->block_ctx->start, + sf->nr, + (unsigned long long) + le64_to_cpu(leafhdr->header.generation), + (unsigned long long) + le64_to_cpu(leafhdr->header.owner)); + } + +continue_with_current_leaf_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_item *disk_item = leafhdr->items + sf->i; + struct btrfs_disk_key *disk_key = &disk_item->key; + u8 type; + const u32 item_offset = le32_to_cpu(disk_item->offset); + + type = disk_key->type; + + if (BTRFS_ROOT_ITEM_KEY == type) { + const struct btrfs_root_item *const root_item = + (struct btrfs_root_item *) + (sf->block_ctx->data + + offsetof(struct btrfs_leaf, items) + + item_offset); + const u64 next_bytenr = + le64_to_cpu(root_item->bytenr); + + sf->error = + btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + disk_key, + le64_to_cpu(root_item-> + generation)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.data; + + next_stack = + btrfsic_stack_frame_alloc(); + if (NULL == next_stack) { + btrfsic_release_block_ctx( + &sf-> + next_block_ctx); + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = + &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + } else if (BTRFS_EXTENT_DATA_KEY == type && + state->include_extent_data) { + sf->error = btrfsic_handle_extent_data( + state, + sf->block, + sf->block_ctx, + item_offset, + force_iodone_flag); + if (sf->error) + goto one_stack_frame_backwards; + } + + goto continue_with_current_leaf_stack_frame; + } + } else { + struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = le32_to_cpu(nodehdr->header.nritems); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "node %llu level %d items %d" + " generation %llu owner %llu\n", + (unsigned long long) + sf->block_ctx->start, + nodehdr->header.level, sf->nr, + (unsigned long long) + le64_to_cpu(nodehdr->header.generation), + (unsigned long long) + le64_to_cpu(nodehdr->header.owner)); + } + +continue_with_current_node_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_key_ptr *disk_key_ptr = + nodehdr->ptrs + sf->i; + const u64 next_bytenr = + le64_to_cpu(disk_key_ptr->blockptr); + + sf->error = btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + &disk_key_ptr->key, + le64_to_cpu(disk_key_ptr->generation)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.data; + + next_stack = btrfsic_stack_frame_alloc(); + if (NULL == next_stack) + goto one_stack_frame_backwards; + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + + goto continue_with_current_node_stack_frame; + } + } + +one_stack_frame_backwards: + if (NULL != sf->prev) { + struct btrfsic_stack_frame *const prev = sf->prev; + + /* the one for the initial block is freed in the caller */ + btrfsic_release_block_ctx(sf->block_ctx); + + if (sf->error) { + prev->error = sf->error; + btrfsic_stack_frame_free(sf); + sf = prev; + goto one_stack_frame_backwards; + } + + btrfsic_stack_frame_free(sf); + sf = prev; + goto continue_with_new_stack_frame; + } else { + BUG_ON(&initial_stack_frame != sf); + } + + return sf->error; +} + +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation) +{ + struct btrfsic_block *next_block = NULL; + int ret; + struct btrfsic_block_link *l; + int did_alloc_block_link; + int block_was_created; + + *next_blockp = NULL; + if (0 == *num_copiesp) { + *num_copiesp = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, *num_copiesp); + *mirror_nump = 1; + } + + if (*mirror_nump > *num_copiesp) + return 0; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_create_link_to_next_block(mirror_num=%d)\n", + *mirror_nump); + ret = btrfsic_map_block(state, next_bytenr, + BTRFSIC_BLOCK_SIZE, + next_block_ctx, *mirror_nump); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", + (unsigned long long)next_bytenr, *mirror_nump); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + next_block = btrfsic_block_lookup_or_add(state, + next_block_ctx, "referenced ", + 1, force_iodone_flag, + !force_iodone_flag, + *mirror_nump, + &block_was_created); + if (NULL == next_block) { + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + if (block_was_created) { + l = NULL; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + } else { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch (!= stored %llu).\n", + (unsigned long long)next_bytenr, + next_block_ctx->dev->name, + (unsigned long long)next_block_ctx->dev_bytenr, + *mirror_nump, + btrfsic_get_block_type(state, next_block), + (unsigned long long)next_block->logical_bytenr); + } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)next_bytenr, + next_block_ctx->dev->name, + (unsigned long long)next_block_ctx->dev_bytenr, + *mirror_nump, + btrfsic_get_block_type(state, next_block)); + next_block->logical_bytenr = next_bytenr; + + next_block->mirror_num = *mirror_nump; + l = btrfsic_block_link_hashtable_lookup( + next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_link_hashtable); + } + + next_block->disk_key = *disk_key; + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + did_alloc_block_link = 1; + l->block_ref_to = next_block; + l->block_ref_from = block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + did_alloc_block_link = 0; + if (0 == limit_nesting) { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + } + + if (limit_nesting > 0 && did_alloc_block_link) { + ret = btrfsic_read_block(state, next_block_ctx); + if (ret < (int)BTRFSIC_BLOCK_SIZE) { + printk(KERN_INFO + "btrfsic: read block @logical %llu failed!\n", + (unsigned long long)next_bytenr); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + *next_blockp = next_block; + } else { + *next_blockp = NULL; + } + (*mirror_nump)++; + + return 0; +} + +static int btrfsic_handle_extent_data( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag) +{ + int ret; + struct btrfs_file_extent_item *file_extent_item = + (struct btrfs_file_extent_item *)(block_ctx->data + + offsetof(struct btrfs_leaf, + items) + item_offset); + u64 next_bytenr = + le64_to_cpu(file_extent_item->disk_bytenr) + + le64_to_cpu(file_extent_item->offset); + u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); + u64 generation = le64_to_cpu(file_extent_item->generation); + struct btrfsic_block_link *l; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," + " offset = %llu, num_bytes = %llu\n", + file_extent_item->type, + (unsigned long long) + le64_to_cpu(file_extent_item->disk_bytenr), + (unsigned long long) + le64_to_cpu(file_extent_item->offset), + (unsigned long long) + le64_to_cpu(file_extent_item->num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || + ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) + return 0; + while (num_bytes > 0) { + u32 chunk_len; + int num_copies; + int mirror_num; + + if (num_bytes > BTRFSIC_BLOCK_SIZE) + chunk_len = BTRFSIC_BLOCK_SIZE; + else + chunk_len = num_bytes; + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfsic_block *next_block; + int block_was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "btrfsic_handle_extent_data(" + "mirror_num=%d)\n", mirror_num); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO + "\tdisk_bytenr = %llu, num_bytes %u\n", + (unsigned long long)next_bytenr, + chunk_len); + ret = btrfsic_map_block(state, next_bytenr, + chunk_len, &next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &next_block_ctx, + "referenced ", + 0, + force_iodone_flag, + !force_iodone_flag, + mirror_num, + &block_was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&next_block_ctx); + return -1; + } + if (!block_was_created) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block" + " @%llu (%s/%llu/%d)" + " found in hash table, D," + " bytenr mismatch" + " (!= stored %llu).\n", + (unsigned long long)next_bytenr, + next_block_ctx.dev->name, + (unsigned long long) + next_block_ctx.dev_bytenr, + mirror_num, + (unsigned long long) + next_block->logical_bytenr); + } + next_block->logical_bytenr = next_bytenr; + next_block->mirror_num = mirror_num; + } + + l = btrfsic_block_link_lookup_or_add(state, + &next_block_ctx, + next_block, block, + generation); + btrfsic_release_block_ctx(&next_block_ctx); + if (NULL == l) + return -1; + } + + next_bytenr += chunk_len; + num_bytes -= chunk_len; + } + + return 0; +} + +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num) +{ + int ret; + u64 length; + struct btrfs_bio *multi = NULL; + struct btrfs_device *device; + + length = len; + ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, + bytenr, &length, &multi, mirror_num); + + device = multi->stripes[0].dev; + block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); + block_ctx_out->dev_bytenr = multi->stripes[0].physical; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->data = NULL; + block_ctx_out->bh = NULL; + + if (0 == ret) + kfree(multi); + if (NULL == block_ctx_out->dev) { + ret = -ENXIO; + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); + } + + return ret; +} + +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out) +{ + block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); + block_ctx_out->dev_bytenr = bytenr; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->data = NULL; + block_ctx_out->bh = NULL; + if (NULL != block_ctx_out->dev) { + return 0; + } else { + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); + return -ENXIO; + } +} + +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) +{ + if (NULL != block_ctx->bh) { + brelse(block_ctx->bh); + block_ctx->bh = NULL; + } +} + +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx) +{ + block_ctx->bh = NULL; + if (block_ctx->dev_bytenr & 4095) { + printk(KERN_INFO + "btrfsic: read_block() with unaligned bytenr %llu\n", + (unsigned long long)block_ctx->dev_bytenr); + return -1; + } + if (block_ctx->len > 4096) { + printk(KERN_INFO + "btrfsic: read_block() with too huge size %d\n", + block_ctx->len); + return -1; + } + + block_ctx->bh = __bread(block_ctx->dev->bdev, + block_ctx->dev_bytenr >> 12, 4096); + if (NULL == block_ctx->bh) + return -1; + block_ctx->data = block_ctx->bh->b_data; + + return block_ctx->len; +} + +static void btrfsic_dump_database(struct btrfsic_state *state) +{ + struct list_head *elem_all; + + BUG_ON(NULL == state); + + printk(KERN_INFO "all_blocks_list:\n"); + list_for_each(elem_all, &state->all_blocks_list) { + const struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *elem_ref_from; + + printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num); + + list_for_each(elem_ref_to, &b_all->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " refers %u* to" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + } + + list_for_each(elem_ref_from, &b_all->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, + struct btrfsic_block_link, + node_ref_from); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " is ref %u* from" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long) + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long) + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + } + + printk(KERN_INFO "\n"); + } +} + +/* + * Test whether the disk block contains a tree block (leaf or node) + * (note that this test fails for the super block) + */ +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + const u8 *data, unsigned int size) +{ + struct btrfs_header *h; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + int fail = 0; + int crc_fail = 0; + + h = (struct btrfs_header *)data; + + if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) + fail++; + + crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, h->csum, state->csum_size)) + crc_fail++; + + return fail || crc_fail; +} + +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, + u8 *mapped_data, unsigned int len, + struct bio *bio, + int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw) +{ + int is_metadata; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx block_ctx; + int ret; + struct btrfsic_state *state = dev_state->state; + struct block_device *bdev = dev_state->bdev; + + WARN_ON(len > PAGE_SIZE); + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); + if (NULL != bio_is_patched) + *bio_is_patched = 0; + + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, + &state->block_hashtable); + if (NULL != block) { + u64 bytenr; + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + if (block->is_superblock) { + bytenr = le64_to_cpu(((struct btrfs_super_block *) + mapped_data)->bytenr); + is_metadata = 1; + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { + printk(KERN_INFO + "[before new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } + if (is_metadata) { + if (!block->is_superblock) { + bytenr = le64_to_cpu(((struct btrfs_header *) + mapped_data)->bytenr); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, + dev_state, + dev_bytenr, + mapped_data); + } + if (block->logical_bytenr != bytenr) { + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch" + " (!= stored %llu).\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block), + (unsigned long long) + block->logical_bytenr); + block->logical_bytenr = bytenr; + } else if (state->print_mask & + BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } else { + bytenr = block->logical_bytenr; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "ref_to_list: %cE, ref_from_list: %cE\n", + list_empty(&block->ref_to_list) ? ' ' : '!', + list_empty(&block->ref_from_list) ? ' ' : '!'); + if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), old(gen=%llu," + " objectid=%llu, type=%d, offset=%llu)," + " new(gen=%llu)," + " which is referenced by most recent superblock" + " (superblockgen=%llu)!\n", + btrfsic_get_block_type(state, block), + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + (unsigned long long)block->generation, + (unsigned long long) + le64_to_cpu(block->disk_key.objectid), + block->disk_key.type, + (unsigned long long) + le64_to_cpu(block->disk_key.offset), + (unsigned long long) + le64_to_cpu(((struct btrfs_header *) + mapped_data)->generation), + (unsigned long long) + state->max_superblock_generation); + btrfsic_dump_tree(state); + } + + if (!block->is_iodone && !block->never_written) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," + " which is not yet iodone!\n", + btrfsic_get_block_type(state, block), + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + (unsigned long long)block->generation, + (unsigned long long) + le64_to_cpu(((struct btrfs_header *) + mapped_data)->generation)); + /* it would not be safe to go on */ + btrfsic_dump_tree(state); + return; + } + + /* + * Clear all references of this block. Do not free + * the block itself even if is not referenced anymore + * because it still carries valueable information + * like whether it was ever written and IO completed. + */ + list_for_each_safe(elem_ref_to, tmp_ref_to, + &block->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + l->ref_cnt--; + if (0 == l->ref_cnt) { + list_del(&l->node_ref_to); + list_del(&l->node_ref_from); + btrfsic_block_link_hashtable_remove(l); + btrfsic_block_link_free(l); + } + } + + if (block->is_superblock) + ret = btrfsic_map_superblock(state, bytenr, len, + bdev, &block_ctx); + else + ret = btrfsic_map_block(state, bytenr, len, + &block_ctx, 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", (unsigned long long)bytenr); + return; + } + block_ctx.data = mapped_data; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + if (is_metadata || state->include_extent_data) { + block->never_written = 0; + block->iodone_w_error = 0; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = + bio->bi_private; + block->orig_bio_bh_end_io.bio = + bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io. + bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + } + + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (is_metadata) { + block->logical_bytenr = bytenr; + block->is_metadata = 1; + if (block->is_superblock) { + ret = btrfsic_process_written_superblock( + state, + block, + (struct btrfs_super_block *) + mapped_data); + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { + printk(KERN_INFO + "[after new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } else { + block->mirror_num = 0; /* unknown */ + ret = btrfsic_process_metablock( + state, + block, + &block_ctx, + (struct btrfs_header *) + block_ctx.data, + 0, 0); + } + if (ret) + printk(KERN_INFO + "btrfsic: btrfsic_process_metablock" + "(root @%llu) failed!\n", + (unsigned long long)dev_bytenr); + } else { + block->is_metadata = 0; + block->mirror_num = 0; /* unknown */ + block->generation = BTRFSIC_GENERATION_UNKNOWN; + if (!state->include_extent_data + && list_empty(&block->ref_from_list)) { + /* + * disk block is overwritten with extent + * data (not meta data) and we are configured + * to not include extent data: take the + * chance and free the block's memory + */ + btrfsic_block_hashtable_remove(block); + list_del(&block->all_blocks_node); + btrfsic_block_free(block); + } + } + btrfsic_release_block_ctx(&block_ctx); + } else { + /* block has not been found in hash table */ + u64 bytenr; + + if (!is_metadata) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "Written block (%s/%llu/?)" + " !found in hash table, D.\n", + dev_state->name, + (unsigned long long)dev_bytenr); + if (!state->include_extent_data) + return; /* ignore that written D block */ + + /* this is getting ugly for the + * include_extent_data case... */ + bytenr = 0; /* unknown */ + block_ctx.start = bytenr; + block_ctx.len = len; + block_ctx.bh = NULL; + } else { + bytenr = le64_to_cpu(((struct btrfs_header *) + mapped_data)->bytenr); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, + dev_bytenr, + mapped_data); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/?)" + " !found in hash table, M.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr); + + ret = btrfsic_map_block(state, bytenr, len, &block_ctx, + 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", + (unsigned long long)dev_bytenr); + return; + } + } + block_ctx.data = mapped_data; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&block_ctx); + return; + } + block->dev_state = dev_state; + block->dev_bytenr = dev_bytenr; + block->logical_bytenr = bytenr; + block->is_metadata = is_metadata; + block->never_written = 0; + block->iodone_w_error = 0; + block->mirror_num = 0; /* unknown */ + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io.bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New written %c-block @%llu (%s/%llu/%d)\n", + is_metadata ? 'M' : 'D', + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + + if (is_metadata) { + ret = btrfsic_process_metablock(state, block, + &block_ctx, + (struct btrfs_header *) + block_ctx.data, 0, 0); + if (ret) + printk(KERN_INFO + "btrfsic: process_metablock(root @%llu)" + " failed!\n", + (unsigned long long)dev_bytenr); + } + btrfsic_release_block_ctx(&block_ctx); + } +} + +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + int iodone_w_error; + + /* mutex is not held! This is not save if IO is not yet completed + * on umount */ + iodone_w_error = 0; + if (bio_error_status) + iodone_w_error = 1; + + BUG_ON(NULL == block); + bp->bi_private = block->orig_bio_bh_private; + bp->bi_end_io = block->orig_bio_bh_end_io.bio; + + do { + struct btrfsic_block *next_block; + struct btrfsic_dev_state *const dev_state = block->dev_state; + + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", + bio_error_status, + btrfsic_get_block_type(dev_state->state, block), + (unsigned long long)block->logical_bytenr, + dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + next_block = block->next_in_same_bio; + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io() new %s flush_gen=%llu\n", + dev_state->name, + (unsigned long long) + dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is + * on disk */ + block->is_iodone = 1; /* for FLUSH, this releases the block */ + block = next_block; + } while (NULL != block); + + bp->bi_end_io(bp, bio_error_status); +} + +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; + int iodone_w_error = !uptodate; + struct btrfsic_dev_state *dev_state; + + BUG_ON(NULL == block); + dev_state = block->dev_state; + if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", + iodone_w_error, + btrfsic_get_block_type(dev_state->state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io() new %s flush_gen=%llu\n", + dev_state->name, + (unsigned long long)dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is on disk */ + + bh->b_private = block->orig_bio_bh_private; + bh->b_end_io = block->orig_bio_bh_end_io.bh; + block->is_iodone = 1; /* for FLUSH, this releases the block */ + bh->b_end_io(bh, uptodate); +} + +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const superblock, + struct btrfs_super_block *const super_hdr) +{ + int pass; + + superblock->generation = btrfs_super_generation(super_hdr); + if (!(superblock->generation > state->max_superblock_generation || + 0 == state->max_superblock_generation)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: superblock @%llu (%s/%llu/%d)" + " with old gen %llu <= %llu\n", + (unsigned long long)superblock->logical_bytenr, + superblock->dev_state->name, + (unsigned long long)superblock->dev_bytenr, + superblock->mirror_num, + (unsigned long long) + btrfs_super_generation(super_hdr), + (unsigned long long) + state->max_superblock_generation); + } else { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: got new superblock @%llu (%s/%llu/%d)" + " with new gen %llu > %llu\n", + (unsigned long long)superblock->logical_bytenr, + superblock->dev_state->name, + (unsigned long long)superblock->dev_bytenr, + superblock->mirror_num, + (unsigned long long) + btrfs_super_generation(super_hdr), + (unsigned long long) + state->max_superblock_generation); + + state->max_superblock_generation = + btrfs_super_generation(super_hdr); + state->latest_superblock = superblock; + } + + for (pass = 0; pass < 3; pass++) { + int ret; + u64 next_bytenr; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + + switch (pass) { + case 0: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + additional_string = "root "; + next_bytenr = btrfs_super_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", + (unsigned long long)next_bytenr); + break; + case 1: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "chunk "; + next_bytenr = btrfs_super_chunk_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", + (unsigned long long)next_bytenr); + break; + case 2: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + additional_string = "log "; + next_bytenr = btrfs_super_log_root(super_hdr); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", + (unsigned long long)next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + int was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_process_written_superblock(" + "mirror_num=%d)\n", mirror_num); + ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &tmp_next_block_ctx, + additional_string, + 1, 0, 1, + mirror_num, + &was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + return -1; + } + + next_block->disk_key = tmp_disk_key; + if (was_created) + next_block->generation = + BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, + &tmp_next_block_ctx, + next_block, + superblock, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) + return -1; + } + } + + if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { + WARN_ON(1); + btrfsic_dump_tree(state); + } + + return 0; +} + +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level) +{ + struct list_head *elem_ref_to; + int ret = 0; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* + * Note that this situation can happen and does not + * indicate an error in regular cases. It happens + * when disk blocks are freed and later reused. + * The check-integrity module is not aware of any + * block free operations, it just recognizes block + * write operations. Therefore it keeps the linkage + * information for a block until a block is + * rewritten. This can temporarily cause incorrect + * and even circular linkage informations. This + * causes no harm unless such blocks are referenced + * by the most recent super block. + */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 1).\n"); + + return ret; + } + + /* + * This algorithm is recursive because the amount of used stack + * space is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " %u* refers to %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + if (l->block_ref_to->never_written) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is never written!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (!l->block_ref_to->is_iodone) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not yet iodone!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->parent_generation != + l->block_ref_to->generation && + BTRFSIC_GENERATION_UNKNOWN != + l->parent_generation && + BTRFSIC_GENERATION_UNKNOWN != + l->block_ref_to->generation) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " with generation %llu !=" + " parent generation %llu!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + (unsigned long long)l->block_ref_to->generation, + (unsigned long long)l->parent_generation); + ret = -1; + } else if (l->block_ref_to->flush_gen > + l->block_ref_to->dev_state->last_flush_gen) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not flushed out of disk's write cache" + " (block flush_gen=%llu," + " dev->flush_gen=%llu)!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + (unsigned long long)block->flush_gen, + (unsigned long long) + l->block_ref_to->dev_state->last_flush_gen); + ret = -1; + } else if (-1 == btrfsic_check_all_ref_blocks(state, + l->block_ref_to, + recursion_level + + 1)) { + ret = -1; + } + } + + return ret; +} + +static int btrfsic_is_block_ref_by_superblock( + const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level) +{ + struct list_head *elem_ref_from; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* refer to comment at "abort cyclic linkage (case 1)" */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 2).\n"); + + return 0; + } + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_from, &block->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, struct btrfsic_block_link, + node_ref_from); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " is ref %u* from %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long) + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long) + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + if (l->block_ref_from->is_superblock && + state->latest_superblock->dev_bytenr == + l->block_ref_from->dev_bytenr && + state->latest_superblock->dev_state->bdev == + l->block_ref_from->dev_state->bdev) + return 1; + else if (btrfsic_is_block_ref_by_superblock(state, + l->block_ref_from, + recursion_level + + 1)) + return 1; + } + + return 0; +} + +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Add %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long)l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long)l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Rem %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long)l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long)l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block) +{ + if (block->is_superblock && + state->latest_superblock->dev_bytenr == block->dev_bytenr && + state->latest_superblock->dev_state->bdev == block->dev_state->bdev) + return 'S'; + else if (block->is_superblock) + return 's'; + else if (block->is_metadata) + return 'M'; + else + return 'D'; +} + +static void btrfsic_dump_tree(const struct btrfsic_state *state) +{ + btrfsic_dump_tree_sub(state, state->latest_superblock, 0); +} + +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level) +{ + struct list_head *elem_ref_to; + int indent_add; + static char buf[80]; + int cursor_position; + + /* + * Should better fill an on-stack buffer with a complete line and + * dump it at once when it is time to print a newline character. + */ + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + return; + } + printk(buf); + indent_level += indent_add; + if (list_empty(&block->ref_to_list)) { + printk("\n"); + return; + } + if (block->mirror_num > 1 && + !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { + printk(" [...]\n"); + return; + } + + cursor_position = indent_level; + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + while (cursor_position < indent_level) { + printk(" "); + cursor_position++; + } + if (l->ref_cnt > 1) + indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); + else + indent_add = sprintf(buf, " --> "); + if (indent_level + indent_add > + BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + cursor_position = 0; + continue; + } + + printk(buf); + + btrfsic_dump_tree_sub(state, l->block_ref_to, + indent_level + indent_add); + cursor_position = 0; + } +} + +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation) +{ + struct btrfsic_block_link *l; + + l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + from_block->dev_state->bdev, + from_block->dev_bytenr, + &state->block_link_hashtable); + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO + "btrfsic: error, kmalloc" " failed!\n"); + return NULL; + } + + l->block_ref_to = next_block; + l->block_ref_from = from_block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &from_block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + + return l; +} + +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created) +{ + struct btrfsic_block *block; + + block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_hashtable); + if (NULL == block) { + struct btrfsic_dev_state *dev_state; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return NULL; + } + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); + if (NULL == dev_state) { + printk(KERN_INFO + "btrfsic: error, lookup dev_state failed!\n"); + btrfsic_block_free(block); + return NULL; + } + block->dev_state = dev_state; + block->dev_bytenr = block_ctx->dev_bytenr; + block->logical_bytenr = block_ctx->start; + block->is_metadata = is_metadata; + block->is_iodone = is_iodone; + block->never_written = never_written; + block->mirror_num = mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New %s%c-block @%llu (%s/%llu/%d)\n", + additional_string, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + dev_state->name, + (unsigned long long)block->dev_bytenr, + mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + if (NULL != was_created) + *was_created = 1; + } else { + if (NULL != was_created) + *was_created = 0; + } + + return block; +} + +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char *data) +{ + int num_copies; + int mirror_num; + int ret; + struct btrfsic_block_data_ctx block_ctx; + int match = 0; + + num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, + bytenr, PAGE_SIZE); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + &block_ctx, mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(logical @%llu," + " mirror %d) failed!\n", + (unsigned long long)bytenr, mirror_num); + continue; + } + + if (dev_state->bdev == block_ctx.dev->bdev && + dev_bytenr == block_ctx.dev_bytenr) { + match++; + btrfsic_release_block_ctx(&block_ctx); + break; + } + btrfsic_release_block_ctx(&block_ctx); + } + + if (!match) { + printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," + " buffer->log_bytenr=%llu, submit_bio(bdev=%s," + " phys_bytenr=%llu)!\n", + (unsigned long long)bytenr, dev_state->name, + (unsigned long long)dev_bytenr); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + &block_ctx, mirror_num); + if (ret) + continue; + + printk(KERN_INFO "Read logical bytenr @%llu maps to" + " (%s/%llu/%d)\n", + (unsigned long long)bytenr, + block_ctx.dev->name, + (unsigned long long)block_ctx.dev_bytenr, + mirror_num); + } + WARN_ON(1); + } +} + +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev) +{ + struct btrfsic_dev_state *ds; + + ds = btrfsic_dev_state_hashtable_lookup(bdev, + &btrfsic_dev_state_hashtable); + return ds; +} + +int btrfsic_submit_bh(int rw, struct buffer_head *bh) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return submit_bh(rw, bh); + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bh() might also be called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bh->b_bdev); + + /* Only called to write the superblock (incl. FLUSH/FUA) */ + if (NULL != dev_state && + (rw & WRITE) && bh->b_size > 0) { + u64 dev_bytenr; + + dev_bytenr = 4096 * bh->b_blocknr; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," + " size=%lu, data=%p, bdev=%p)\n", + rw, bh->b_blocknr, + (unsigned long long)dev_bytenr, bh->b_size, + bh->b_data, bh->b_bdev); + btrfsic_process_written_block(dev_state, dev_bytenr, + bh->b_data, bh->b_size, NULL, + NULL, bh, rw); + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", + rw, bh->b_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bh(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + return submit_bh(rw, bh); +} + +void btrfsic_submit_bio(int rw, struct bio *bio) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) { + submit_bio(rw, bio); + return; + } + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bio() is also called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); + if (NULL != dev_state && + (rw & WRITE) && NULL != bio->bi_io_vec) { + unsigned int i; + u64 dev_bytenr; + int bio_is_patched; + + dev_bytenr = 512 * bio->bi_sector; + bio_is_patched = 0; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x, bi_vcnt=%u," + " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", + rw, bio->bi_vcnt, bio->bi_sector, + (unsigned long long)dev_bytenr, + bio->bi_bdev); + + for (i = 0; i < bio->bi_vcnt; i++) { + u8 *mapped_data; + + mapped_data = kmap(bio->bi_io_vec[i].bv_page); + if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE) == + (dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "#%u: page=%p, mapped=%p, len=%u," + " offset=%u\n", + i, bio->bi_io_vec[i].bv_page, + mapped_data, + bio->bi_io_vec[i].bv_len, + bio->bi_io_vec[i].bv_offset); + btrfsic_process_written_block(dev_state, dev_bytenr, + mapped_data, + bio->bi_io_vec[i].bv_len, + bio, &bio_is_patched, + NULL, rw); + kunmap(bio->bi_io_vec[i].bv_page); + dev_bytenr += bio->bi_io_vec[i].bv_len; + } + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", + rw, bio->bi_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bio(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + + submit_bio(rw, bio); +} + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask) +{ + int ret; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + state = kzalloc(sizeof(*state), GFP_NOFS); + if (NULL == state) { + printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); + return -1; + } + + if (!btrfsic_is_initialized) { + mutex_init(&btrfsic_mutex); + btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); + btrfsic_is_initialized = 1; + } + mutex_lock(&btrfsic_mutex); + state->root = root; + state->print_mask = print_mask; + state->include_extent_data = including_extent_data; + state->csum_size = 0; + INIT_LIST_HEAD(&state->all_blocks_list); + btrfsic_block_hashtable_init(&state->block_hashtable); + btrfsic_block_link_hashtable_init(&state->block_link_hashtable); + state->max_superblock_generation = 0; + state->latest_superblock = NULL; + + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + char *p; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_alloc(); + if (NULL == ds) { + printk(KERN_INFO + "btrfs check-integrity: kmalloc() failed!\n"); + mutex_unlock(&btrfsic_mutex); + return -1; + } + ds->bdev = device->bdev; + ds->state = state; + bdevname(ds->bdev, ds->name); + ds->name[BDEVNAME_SIZE - 1] = '\0'; + for (p = ds->name; *p != '\0'; p++); + while (p > ds->name && *p != '/') + p--; + if (*p == '/') + p++; + strlcpy(ds->name, p, sizeof(ds->name)); + btrfsic_dev_state_hashtable_add(ds, + &btrfsic_dev_state_hashtable); + } + + ret = btrfsic_process_superblock(state, fs_devices); + if (0 != ret) { + mutex_unlock(&btrfsic_mutex); + btrfsic_unmount(root, fs_devices); + return ret; + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) + btrfsic_dump_database(state); + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) + btrfsic_dump_tree(state); + + mutex_unlock(&btrfsic_mutex); + return 0; +} + +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices) +{ + struct list_head *elem_all; + struct list_head *tmp_all; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (!btrfsic_is_initialized) + return; + + mutex_lock(&btrfsic_mutex); + + state = NULL; + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_hashtable_lookup( + device->bdev, + &btrfsic_dev_state_hashtable); + if (NULL != ds) { + state = ds->state; + btrfsic_dev_state_hashtable_remove(ds); + btrfsic_dev_state_free(ds); + } + } + + if (NULL == state) { + printk(KERN_INFO + "btrfsic: error, cannot find state information" + " on umount!\n"); + mutex_unlock(&btrfsic_mutex); + return; + } + + /* + * Don't care about keeping the lists' state up to date, + * just free all memory that was allocated dynamically. + * Free the blocks and the block_links. + */ + list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { + struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + list_for_each_safe(elem_ref_to, tmp_ref_to, + &b_all->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + + l->ref_cnt--; + if (0 == l->ref_cnt) + btrfsic_block_link_free(l); + } + + if (b_all->is_iodone) + btrfsic_block_free(b_all); + else + printk(KERN_INFO "btrfs: attempt to free %c-block" + " @%llu (%s/%llu/%d) on umount which is" + " not yet iodone!\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num); + } + + mutex_unlock(&btrfsic_mutex); + + kfree(state); +} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h new file mode 100644 index 00000000000..8b59175cc50 --- /dev/null +++ b/fs/btrfs/check-integrity.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_CHECK_INTEGRITY__) +#define __BTRFS_CHECK_INTEGRITY__ + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +int btrfsic_submit_bh(int rw, struct buffer_head *bh); +void btrfsic_submit_bio(int rw, struct bio *bio); +#else +#define btrfsic_submit_bh submit_bh +#define btrfsic_submit_bio submit_bio +#endif + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask); +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices); + +#endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dede441bdee..0639a555e16 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, 0, new_root_objectid, &disk_key, level, - buf->start, 0); + buf->start, 0, 1); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); if (ret) return ret; @@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1); + ret = btrfs_inc_ref(trans, root, buf, 1, 1); BUG_ON(ret); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0); + ret = btrfs_dec_ref(trans, root, buf, 0, 1); BUG_ON(ret); - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); BUG_ON(ret); } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); BUG_ON(ret); } if (new_flags != 0) { @@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, buf, 1); + ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); } clean_tree_block(trans, root, buf); @@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, root->root_key.objectid, &disk_key, - level, search_start, empty_size); + level, search_start, empty_size, 1); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + last_ref, 1); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(parent); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + last_ref, 1); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, root, mid, 0, 1, 0); /* once for the root ptr */ free_extent_buffer(mid); return 0; @@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1); + btrfs_free_tree_block(trans, root, right, 0, 1, 0); free_extent_buffer(right); right = NULL; } else { @@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, root, mid, 0, 1, 0); free_extent_buffer(mid); mid = NULL; } else { @@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, &lower_key, - level, root->node->start, 0); + level, root->node->start, 0, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - &disk_key, level, c->start, 0); + &disk_key, level, c->start, 0, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -2970,7 +2970,7 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0); + &disk_key, 0, l->start, 0, 0); if (IS_ERR(right)) return PTR_ERR(right); @@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); return 0; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323..27ebe61d3cc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -86,6 +86,9 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -692,6 +695,54 @@ struct btrfs_root_ref { __le16 name_len; } __attribute__ ((__packed__)); +struct btrfs_disk_balance_args { + /* + * profiles to operate on, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 profiles; + + /* usage filter */ + __le64 usage; + + /* devid filter */ + __le64 devid; + + /* devid subset filter [pstart..pend) */ + __le64 pstart; + __le64 pend; + + /* btrfs virtual address space subset filter [vstart..vend) */ + __le64 vstart; + __le64 vend; + + /* + * profile to convert to, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 target; + + /* BTRFS_BALANCE_ARGS_* */ + __le64 flags; + + __le64 unused[8]; +} __attribute__ ((__packed__)); + +/* + * store balance parameters to disk so that balance can be properly + * resumed after crash or unmount + */ +struct btrfs_balance_item { + /* BTRFS_BALANCE_* */ + __le64 flags; + + struct btrfs_disk_balance_args data; + struct btrfs_disk_balance_args meta; + struct btrfs_disk_balance_args sys; + + __le64 unused[4]; +} __attribute__ ((__packed__)); + #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 @@ -751,14 +802,32 @@ struct btrfs_csum_item { } __attribute__ ((__packed__)); /* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1 << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) -#define BTRFS_BLOCK_GROUP_DUP (1 << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) -#define BTRFS_NR_RAID_TYPES 5 +#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) +#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE +#define BTRFS_NR_RAID_TYPES 5 + +#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ + BTRFS_BLOCK_GROUP_SYSTEM | \ + BTRFS_BLOCK_GROUP_METADATA) + +#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID10) +/* + * We need a bit for restriper to be able to tell when chunks of type + * SINGLE are available. This "extended" profile format is used in + * fs_info->avail_*_alloc_bits (in-memory) and balance item fields + * (on-disk). The corresponding on-disk bit in chunk.type is reserved + * to avoid remappings between two formats in future. + */ +#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) struct btrfs_block_group_item { __le64 used; @@ -916,6 +985,7 @@ struct btrfs_block_group_cache { struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; +struct btrfs_balance_control; struct btrfs_delayed_root; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -971,7 +1041,7 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt:20; + unsigned long mount_opt:21; unsigned long compress_type:4; u64 max_inline; u64 alloc_start; @@ -1132,12 +1202,23 @@ struct btrfs_fs_info { spinlock_t ref_cache_lock; u64 total_ref_cache_size; + /* + * these three are in extended format (availability of single + * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other + * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) + */ u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; - u64 data_alloc_profile; - u64 metadata_alloc_profile; - u64 system_alloc_profile; + + /* restriper state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_running; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; unsigned data_chunk_allocations; unsigned metadata_ratio; @@ -1155,6 +1236,10 @@ struct btrfs_fs_info { int scrub_workers_refcnt; struct btrfs_workers scrub_workers; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* filesystem state */ u64 fs_state; @@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 +#define BTRFS_BALANCE_ITEM_KEY 248 + /* * string items are for debugging. They just store a short string of * data in the FS @@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) +#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) +#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); -/* struct btrfs_super_block */ +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); +static inline void btrfs_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void +btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); +} + +static inline void +btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); +} + +/* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, @@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } -static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } @@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size); + u64 hint, u64 empty_size, int for_cow); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref); + u64 parent, int last_ref, int for_cow); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int for_cow); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, @@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); + u64 root_objectid, u64 owner, u64 offset, int for_cow); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ + ++p->slots[0]; + if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) + return btrfs_next_leaf(root, p); + return 0; +} int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref); + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) } static inline void free_fs_info(struct btrfs_fs_info *fs_info) { + kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); kfree(fs_info->extent_root); kfree(fs_info->tree_root); @@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_for_commit); kfree(fs_info); } +/** + * profile_is_valid - tests whether a given profile is valid and reduced + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile + */ +static inline int profile_is_valid(u64 flags, int extended) +{ + u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; + if (extended) + mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & mask) + return 0; + /* true if zero or exactly one bit set */ + return (flags & (~flags + 1)) == flags; +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 9c1eccc2c50..fe4cd0f1cef 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, + num_bytes, 1); item->bytes_reserved = num_bytes; + } return ret; } @@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, item->bytes_reserved, + 0); btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - int release = false; + bool release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (ret == -EAGAIN) ret = -ENOSPC; - if (!ret) + if (!ret) { node->bytes_reserved = num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "delayed_inode", + btrfs_ino(inode), + num_bytes, 1); + } return ret; } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { spin_lock(&BTRFS_I(inode)->lock); @@ -707,11 +719,17 @@ out: * reservation here. I think it may be time for a documentation page on * how block rsvs. work. */ - if (!ret) + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + btrfs_ino(inode), num_bytes, 1); node->bytes_reserved = num_bytes; + } - if (release) + if (release) { + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), num_bytes, 0); btrfs_block_rsv_release(root, src_rsv, num_bytes); + } return ret; } @@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + node->inode_id, node->bytes_reserved, 0); btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); - delayed_item->key.objectid = btrfs_ino(dir); btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); delayed_item->key.offset = index; @@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, dir_item->type = type; memcpy((char *)(dir_item + 1), name, name_len); + ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + + mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 125cf76fcd0..66e4f29505a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; + /* merging of sequenced refs is not allowed */ + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), @@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, /* * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot + * head if it was able to find one, or NULL if nothing was in that spot. + * If return_bigger is given, the next bigger entry is returned if no exact + * match is found. */ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, u64 bytenr, - struct btrfs_delayed_ref_node **last) + struct btrfs_delayed_ref_node **last, + int return_bigger) { - struct rb_node *n = root->rb_node; + struct rb_node *n; struct btrfs_delayed_ref_node *entry; - int cmp; + int cmp = 0; +again: + n = root->rb_node; + entry = NULL; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); @@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, else return entry; } + if (entry && return_bigger) { + if (cmp > 0) { + n = rb_next(&entry->rb_node); + if (!n) + n = rb_first(root); + entry = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + bytenr = entry->bytenr; + return_bigger = 0; + goto again; + } + return entry; + } return NULL; } @@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } +int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + u64 seq) +{ + struct seq_list *elem; + + assert_spin_locked(&delayed_refs->lock); + if (list_empty(&delayed_refs->seq_head)) + return 0; + + elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); + if (seq >= elem->seq) { + pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", + seq, elem->seq, delayed_refs); + return 1; + } + return 0; +} + int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 start) { @@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, node = rb_first(&delayed_refs->root); } else { ref = NULL; - find_ref_head(&delayed_refs->root, start, &ref); + find_ref_head(&delayed_refs->root, start + 1, &ref, 1); if (ref) { - struct btrfs_delayed_ref_node *tmp; - - node = rb_prev(&ref->rb_node); - while (node) { - tmp = rb_entry(node, - struct btrfs_delayed_ref_node, - rb_node); - if (tmp->bytenr < start) - break; - ref = tmp; - node = rb_prev(&ref->rb_node); - } node = &ref->rb_node; } else node = rb_first(&delayed_refs->root); @@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, +static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, int action, int is_data) @@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, ref->action = 0; ref->is_head = 1; ref->in_tree = 1; + ref->seq = 0; head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; @@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, +static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action) + u64 ref_root, int level, int action, + int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref->is_head = 0; ref->in_tree = 1; + if (need_ref_seq(for_cow, ref_root)) + seq = inc_delayed_seq(delayed_refs); + ref->seq = seq; + full_ref = btrfs_delayed_node_to_tree_ref(ref); - if (parent) { - full_ref->parent = parent; + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) ref->type = BTRFS_SHARED_BLOCK_REF_KEY; - } else { - full_ref->root = ref_root; + else ref->type = BTRFS_TREE_BLOCK_REF_KEY; - } full_ref->level = level; trace_btrfs_delayed_tree_ref(ref, full_ref, action); @@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, /* * helper to insert a delayed data ref into the rbtree. */ -static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, +static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, - int action) + int action, int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, ref->is_head = 0; ref->in_tree = 1; + if (need_ref_seq(for_cow, ref_root)) + seq = inc_delayed_seq(delayed_refs); + ref->seq = seq; + full_ref = btrfs_delayed_node_to_data_ref(ref); - if (parent) { - full_ref->parent = parent; + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) ref->type = BTRFS_SHARED_DATA_REF_KEY; - } else { - full_ref->root = ref_root; + else ref->type = BTRFS_EXTENT_DATA_REF_KEY; - } + full_ref->objectid = owner; full_ref->offset = offset; @@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, * to make sure the delayed ref is eventually processed before this * transaction commits. */ -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int for_cow) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, - action, 0); + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + num_bytes, action, 0); BUG_ON(ret); - ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, level, action); + ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + num_bytes, parent, ref_root, level, action, + for_cow); BUG_ON(ret); + if (!need_ref_seq(for_cow, ref_root) && + waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, /* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ -int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int for_cow) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, - action, 1); + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + num_bytes, action, 1); BUG_ON(ret); - ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, owner, offset, action); + ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + num_bytes, parent, ref_root, owner, offset, + action, for_cow); BUG_ON(ret); + if (!need_ref_seq(for_cow, ref_root) && + waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } -int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op) { @@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); BUG_ON(ret); + if (waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); if (ref) return btrfs_delayed_node_to_head(ref); return NULL; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index e287e3b0eab..d8f244d9492 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node { /* the size of the extent */ u64 num_bytes; + /* seq number to keep track of insertion order */ + u64 seq; + /* ref count on this data structure */ atomic_t refs; @@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head { struct btrfs_delayed_tree_ref { struct btrfs_delayed_ref_node node; - union { - u64 root; - u64 parent; - }; + u64 root; + u64 parent; int level; }; struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_node node; - union { - u64 root; - u64 parent; - }; + u64 root; + u64 parent; u64 objectid; u64 offset; }; @@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * seq number of delayed refs. We need to know if a backref was being + * added before the currently processed ref or afterwards. + */ + u64 seq; + + /* + * seq_list holds a list of all seq numbers that are currently being + * added to the list. While walking backrefs (btrfs_find_all_roots, + * qgroups), which might take some time, no newer ref must be processed, + * as it might influence the outcome of the walk. + */ + struct list_head seq_head; + + /* + * when the only refs we have in the list must not be processed, we want + * to wait for more refs to show up or for the end of backref walking. + */ + wait_queue_head_t seq_wait; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op, + int for_cow); +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op, + int for_cow); +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); @@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); + +struct seq_list { + struct list_head list; + u64 seq; +}; + +static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) +{ + assert_spin_locked(&delayed_refs->lock); + ++delayed_refs->seq; + return delayed_refs->seq; +} + +static inline void +btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + struct seq_list *elem) +{ + assert_spin_locked(&delayed_refs->lock); + elem->seq = delayed_refs->seq; + list_add_tail(&elem->list, &delayed_refs->seq_head); +} + +static inline void +btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + struct seq_list *elem) +{ + spin_lock(&delayed_refs->lock); + list_del(&elem->list); + wake_up(&delayed_refs->seq_wait); + spin_unlock(&delayed_refs->lock); +} + +int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + u64 seq); + +/* + * delayed refs with a ref_seq > 0 must be held back during backref walking. + * this only applies to items in one of the fs-trees. for_cow items never need + * to be held back, so they won't get a ref_seq number. + */ +static inline int need_ref_seq(int for_cow, u64 rootid) +{ + if (for_cow) + return 0; + + if (rootid == BTRFS_FS_TREE_OBJECTID) + return 1; + + if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + + return 0; +} + /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d8525662ca7..7aa9cd36bf1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -43,6 +43,7 @@ #include "tree-log.h" #include "free-space-cache.h" #include "inode-map.h" +#include "check-integrity.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -1143,7 +1144,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->orphan_item_inserted = 0; root->orphan_cleanup_state = 0; - root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; @@ -1217,6 +1217,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); + if (root) + root->fs_info = fs_info; + return root; +} + static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1224,7 +1232,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - root = kzalloc(sizeof(*root), GFP_NOFS); + root = btrfs_alloc_root(fs_info); if (!root) return ERR_PTR(-ENOMEM); @@ -1244,7 +1252,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); + BTRFS_TREE_LOG_OBJECTID, NULL, + 0, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); @@ -1318,7 +1327,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, u32 blocksize; int ret = 0; - root = kzalloc(sizeof(*root), GFP_NOFS); + root = btrfs_alloc_root(fs_info); if (!root) return ERR_PTR(-ENOMEM); if (location->offset == (u64)-1) { @@ -1874,9 +1883,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) } -struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) { u32 sectorsize; u32 nodesize; @@ -1888,8 +1897,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; - struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; struct btrfs_root *extent_root; struct btrfs_root *csum_root; struct btrfs_root *chunk_root; @@ -1900,16 +1909,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; - extent_root = fs_info->extent_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - csum_root = fs_info->csum_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - chunk_root = fs_info->chunk_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - dev_root = fs_info->dev_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); + extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); + csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); + chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); + dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); - if (!extent_root || !csum_root || !chunk_root || !dev_root) { + if (!tree_root || !extent_root || !csum_root || + !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } @@ -1998,6 +2005,17 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->scrub_pause_wait); init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + fs_info->check_integrity_print_mask = 0; +#endif + + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); + fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2267,9 +2285,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); - mutex_unlock(&fs_info->chunk_mutex); if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); @@ -2318,9 +2334,6 @@ retry_root_backup: fs_info->generation = generation; fs_info->last_trans_committed = generation; - fs_info->data_alloc_profile = (u64)-1; - fs_info->metadata_alloc_profile = (u64)-1; - fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; ret = btrfs_init_space_info(fs_info); if (ret) { @@ -2353,6 +2366,19 @@ retry_root_backup: btrfs_set_opt(fs_info->mount_opt, SSD); } +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + ret = btrfsic_mount(tree_root, fs_devices, + btrfs_test_opt(tree_root, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? + 1 : 0, + fs_info->check_integrity_print_mask); + if (ret) + printk(KERN_WARNING "btrfs: failed to initialize" + " integrity check module %s\n", sb->s_id); + } +#endif + /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0 && !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { @@ -2368,7 +2394,7 @@ retry_root_backup: btrfs_level_size(tree_root, btrfs_super_log_root_level(disk_super)); - log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { err = -ENOMEM; goto fail_trans_kthread; @@ -2423,13 +2449,17 @@ retry_root_backup: if (!err) err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); + + if (!err) + err = btrfs_recover_balance(fs_info->tree_root); + if (err) { close_ctree(tree_root); - return ERR_PTR(err); + return err; } } - return tree_root; + return 0; fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); @@ -2475,8 +2505,7 @@ fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: btrfs_close_devices(fs_info->fs_devices); - free_fs_info(fs_info); - return ERR_PTR(err); + return err; recovery_tree_root: if (!btrfs_test_opt(tree_root, RECOVERY)) @@ -2631,7 +2660,7 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - ret = submit_bh(WRITE_FUA, bh); + ret = btrfsic_submit_bh(WRITE_FUA, bh); if (ret) errors++; } @@ -2708,7 +2737,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) device->flush_bio = bio; bio_get(bio); - submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(WRITE_FLUSH, bio); return 0; } @@ -2972,6 +3001,9 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + /* pause restriper - we want to resume on mount */ + btrfs_pause_balance(root->fs_info); + btrfs_scrub_cancel(root); /* wait for any defraggers to finish */ @@ -2979,7 +3011,7 @@ int close_ctree(struct btrfs_root *root) (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(root->fs_info); + btrfs_run_defrag_inodes(fs_info); /* * Here come 2 situations when btrfs is broken to flip readonly: @@ -3008,8 +3040,8 @@ int close_ctree(struct btrfs_root *root) btrfs_put_block_group_cache(fs_info); - kthread_stop(root->fs_info->transaction_kthread); - kthread_stop(root->fs_info->cleaner_kthread); + kthread_stop(fs_info->transaction_kthread); + kthread_stop(fs_info->cleaner_kthread); fs_info->closing = 2; smp_mb(); @@ -3027,14 +3059,14 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(fs_info->extent_root->commit_root); free_extent_buffer(fs_info->tree_root->node); free_extent_buffer(fs_info->tree_root->commit_root); - free_extent_buffer(root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - free_extent_buffer(root->fs_info->dev_root->node); - free_extent_buffer(root->fs_info->dev_root->commit_root); - free_extent_buffer(root->fs_info->csum_root->node); - free_extent_buffer(root->fs_info->csum_root->commit_root); + free_extent_buffer(fs_info->chunk_root->node); + free_extent_buffer(fs_info->chunk_root->commit_root); + free_extent_buffer(fs_info->dev_root->node); + free_extent_buffer(fs_info->dev_root->commit_root); + free_extent_buffer(fs_info->csum_root->node); + free_extent_buffer(fs_info->csum_root->commit_root); - btrfs_free_block_groups(root->fs_info); + btrfs_free_block_groups(fs_info); del_fs_roots(fs_info); @@ -3054,14 +3086,17 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY)) + btrfsic_unmount(root, fs_info->fs_devices); +#endif + btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - free_fs_info(fs_info); - return 0; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c99d0a8f13f..e4bc4741319 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); -struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options); +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); int close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1b8dc33778f..5f77166fd01 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, u64 root_objectid, u32 generation, int check_generation) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; struct inode *inode; struct btrfs_key key; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5fbe576d2b..700879ed64c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; - flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | - BTRFS_BLOCK_GROUP_METADATA; + flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { @@ -1872,20 +1871,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) + u64 root_objectid, u64 owner, u64 offset, int for_cow) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); + BTRFS_ADD_DELAYED_REF, NULL, for_cow); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL); + BTRFS_ADD_DELAYED_REF, NULL, for_cow); } return ret; } @@ -2233,6 +2236,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); + + if (ref && ref->seq && + btrfs_check_delayed_seq(delayed_refs, ref->seq)) { + /* + * there are still refs with lower seq numbers in the + * process of being added. Don't run this ref yet. + */ + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + locked_ref = NULL; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; + } + + /* * record the must insert reserved flag before we * drop the spin lock. */ @@ -2242,11 +2267,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; - /* - * locked_ref is the head node, so we have to go one - * node back for any delayed ref updates - */ - ref = select_delayed_ref(locked_ref); if (!ref) { /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, @@ -2267,9 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(extent_op); - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; + goto next; } list_del_init(&locked_ref->cluster); @@ -2279,7 +2297,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - + /* + * we modified num_entries, but as we're currently running + * delayed refs, skip + * wake_up(&delayed_refs->seq_wait); + * here. + */ spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, @@ -2289,13 +2312,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(ref); kfree(extent_op); count++; - +next: + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); cond_resched(); spin_lock(&delayed_refs->lock); } return count; } + +static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, + unsigned long num_refs) +{ + struct list_head *first_seq = delayed_refs->seq_head.next; + + spin_unlock(&delayed_refs->lock); + pr_debug("waiting for more refs (num %ld, first %p)\n", + num_refs, first_seq); + wait_event(delayed_refs->seq_wait, + num_refs != delayed_refs->num_entries || + delayed_refs->seq_head.next != first_seq); + pr_debug("done waiting for more refs (num %ld, first %p)\n", + delayed_refs->num_entries, delayed_refs->seq_head.next); + spin_lock(&delayed_refs->lock); +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2311,15 +2355,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct list_head cluster; int ret; + u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; + unsigned long num_refs = 0; + int consider_waiting; if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: + consider_waiting = 0; spin_lock(&delayed_refs->lock); if (count == 0) { count = delayed_refs->num_entries * 2; @@ -2336,11 +2388,35 @@ again: * of refs to process starting at the first one we are able to * lock */ + delayed_start = delayed_refs->run_delayed_start; ret = btrfs_find_ref_cluster(trans, &cluster, delayed_refs->run_delayed_start); if (ret) break; + if (delayed_start >= delayed_refs->run_delayed_start) { + if (consider_waiting == 0) { + /* + * btrfs_find_ref_cluster looped. let's do one + * more cycle. if we don't run any delayed ref + * during that cycle (because we can't because + * all of them are blocked) and if the number of + * refs doesn't change, we avoid busy waiting. + */ + consider_waiting = 1; + num_refs = delayed_refs->num_entries; + } else { + wait_for_more_refs(delayed_refs, num_refs); + /* + * after waiting, things have changed. we + * dropped the lock and someone else might have + * run some refs, built new clusters and so on. + * therefore, we restart staleness detection. + */ + consider_waiting = 0; + } + } + ret = run_clustered_refs(trans, root, &cluster); BUG_ON(ret < 0); @@ -2348,6 +2424,11 @@ again: if (count == 0) break; + + if (ret || delayed_refs->run_delayed_start == 0) { + /* refs were run, let's reset staleness detection */ + consider_waiting = 0; + } } if (run_all) { @@ -2405,7 +2486,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_key = 0; extent_op->is_data = is_data ? 1 : 0; - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, + num_bytes, extent_op); if (ret) kfree(extent_op); return ret; @@ -2590,7 +2672,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc) + int full_backref, int inc, int for_cow) { u64 bytenr; u64 num_bytes; @@ -2603,7 +2685,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, int); ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); @@ -2640,14 +2722,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset); + key.offset, for_cow); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0); + parent, ref_root, level - 1, 0, + for_cow); if (ret) goto fail; } @@ -2659,15 +2742,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, int full_backref, int for_cow) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, int full_backref, int for_cow) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -2993,9 +3076,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); - found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | - BTRFS_BLOCK_GROUP_SYSTEM | - BTRFS_BLOCK_GROUP_METADATA); + found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; found->total_bytes = total_bytes; found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; @@ -3016,20 +3097,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP); - if (extra_flags) { - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; - } + u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + /* chunk -> extended profile */ + if (extra_flags == 0) + extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; } +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Returns reduced profile in chunk format. If profile changing is in + * progress (either running or paused) picks the target profile (if it's + * already available), otherwise falls back to plain reducing. + */ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { /* @@ -3040,6 +3128,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + /* pick restriper's target profile if it's available */ + spin_lock(&root->fs_info->balance_lock); + if (root->fs_info->balance_ctl) { + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + u64 tgt = 0; + + if ((flags & BTRFS_BLOCK_GROUP_DATA) && + (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->data.target)) { + tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && + (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->sys.target)) { + tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && + (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->meta.target)) { + tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + if (tgt) { + spin_unlock(&root->fs_info->balance_lock); + flags = tgt; + goto out; + } + } + spin_unlock(&root->fs_info->balance_lock); + if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); if (num_devices < 4) @@ -3059,22 +3175,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) if ((flags & BTRFS_BLOCK_GROUP_RAID0) && ((flags & BTRFS_BLOCK_GROUP_RAID1) | (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) + (flags & BTRFS_BLOCK_GROUP_DUP))) { flags &= ~BTRFS_BLOCK_GROUP_RAID0; + } + +out: + /* extended -> chunk profile */ + flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; return flags; } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits & - root->fs_info->data_alloc_profile; + flags |= root->fs_info->avail_data_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits & - root->fs_info->system_alloc_profile; + flags |= root->fs_info->avail_system_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits & - root->fs_info->metadata_alloc_profile; + flags |= root->fs_info->avail_metadata_alloc_bits; + return btrfs_reduce_alloc_profile(root, flags); } @@ -3191,6 +3310,8 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + (u64)data_sinfo, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3210,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + (u64)data_sinfo, bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3257,27 +3380,15 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } - - /* - * we have two similar checks here, one based on percentage - * and once based on a hard number of 256MB. The idea - * is that if we have a good amount of free - * room, don't allocate a chunk. A good mount is - * less than 80% utilized of the chunks we have allocated, - * or more than 256MB free - */ - if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) - return 0; - - if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) - return 0; - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - /* 256MB or 5% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); + /* 256MB or 2% of the FS */ + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); + /* system chunks need a much small threshold */ + if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) + thresh = 32 * 1024 * 1024; - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) return 0; return 1; } @@ -3291,7 +3402,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; - flags = btrfs_reduce_alloc_profile(extent_root, flags); + BUG_ON(!profile_is_valid(flags, 0)); space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { @@ -3582,6 +3693,10 @@ again: if (used <= space_info->total_bytes) { if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", + (u64)space_info, + orig_bytes, 1); ret = 0; } else { /* @@ -3649,6 +3764,10 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", + (u64)space_info, + orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3755,7 +3874,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, +static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; @@ -3791,6 +3911,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)space_info, + num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -3947,7 +4070,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root, if (global_rsv->full || global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; - block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); + block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, + num_bytes); } /* @@ -4006,11 +4130,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)sinfo, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)sinfo, num_bytes, 0); sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4045,7 +4173,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { - block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); + block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, + (u64)-1); WARN_ON(fs_info->delalloc_block_rsv.size > 0); WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(fs_info->trans_block_rsv.size > 0); @@ -4062,6 +4191,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; + trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, + trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } @@ -4079,6 +4210,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, * when we are truly done with the orphan item. */ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 1); return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } @@ -4086,6 +4219,8 @@ void btrfs_orphan_release_metadata(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 0); btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } @@ -4213,12 +4348,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) /* Need to be holding the i_mutex here if we aren't free space cache */ if (btrfs_is_free_space_inode(root, inode)) flush = 0; - else - WARN_ON(!mutex_is_locked(&inode->i_mutex)); if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); @@ -4266,8 +4400,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (to_free) + if (to_free) { btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, + "delalloc", + btrfs_ino(inode), + to_free, 0); + } + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4278,7 +4418,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + if (to_reserve) + trace_btrfs_space_reservation(root->fs_info,"delalloc", + btrfs_ino(inode), to_reserve, 1); block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; @@ -4308,6 +4452,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } @@ -4562,7 +4708,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { - BUG_ON(space_info->bytes_may_use < num_bytes); + trace_btrfs_space_reservation(cache->fs_info, + "space_info", + (u64)space_info, + num_bytes, 0); space_info->bytes_may_use -= num_bytes; } } @@ -4928,6 +5077,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; + if (waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); /* * we don't take a ref on the node because we're removing it from the @@ -4955,16 +5106,17 @@ out: void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref) + u64 parent, int last_ref, int for_cow) { struct btrfs_block_group_cache *cache = NULL; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, - parent, root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL); + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL, for_cow); BUG_ON(ret); } @@ -4999,12 +5151,12 @@ out: btrfs_put_block_group(cache); } -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int for_cow) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; /* * tree log blocks never actually go into the extent allocation @@ -5016,14 +5168,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_pin_extent(root, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + BTRFS_DROP_DELAYED_REF, NULL, for_cow); BUG_ON(ret); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, - parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, NULL); + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, + NULL, for_cow); BUG_ON(ret); } return ret; @@ -5146,6 +5301,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = 0; ins->offset = 0; + trace_find_free_extent(orig_root, num_bytes, empty_size, data); + space_info = __find_space_info(root->fs_info, data); if (!space_info) { printk(KERN_ERR "No space info for %llu\n", data); @@ -5295,15 +5452,6 @@ alloc: if (unlikely(block_group->ro)) goto loop; - spin_lock(&block_group->free_space_ctl->tree_lock); - if (cached && - block_group->free_space_ctl->free_space < - num_bytes + empty_cluster + empty_size) { - spin_unlock(&block_group->free_space_ctl->tree_lock); - goto loop; - } - spin_unlock(&block_group->free_space_ctl->tree_lock); - /* * Ok we want to try and use the cluster allocator, so * lets look there @@ -5331,6 +5479,8 @@ alloc: if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + block_group, search_start, num_bytes); goto checks; } @@ -5349,8 +5499,15 @@ refill_cluster: * plenty of times and not have found * anything, so we are likely way too * fragmented for the clustering stuff to find - * anything. */ - if (loop >= LOOP_NO_EMPTY_SIZE) { + * anything. + * + * However, if the cluster is taken from the + * current block group, release the cluster + * first, so that we stand a better chance of + * succeeding in the unclustered + * allocation. */ + if (loop >= LOOP_NO_EMPTY_SIZE && + last_ptr->block_group != block_group) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; } @@ -5361,6 +5518,11 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5377,6 +5539,9 @@ refill_cluster: if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + block_group, search_start, + num_bytes); goto checks; } } else if (!cached && loop > LOOP_CACHING_NOWAIT @@ -5401,6 +5566,15 @@ refill_cluster: } unclustered_alloc: + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_cluster + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* @@ -5438,9 +5612,6 @@ checks: goto loop; } - ins->objectid = search_start; - ins->offset = num_bytes; - if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); @@ -5457,6 +5628,8 @@ checks: ins->objectid = search_start; ins->offset = num_bytes; + trace_btrfs_reserve_extent(orig_root, block_group, + search_start, num_bytes); if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); @@ -5842,9 +6015,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, - 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL); + ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, + ins->offset, 0, + root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL, 0); return ret; } @@ -5997,10 +6171,11 @@ use_block_rsv(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOSPC); } -static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) +static void unuse_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u32 blocksize) { block_rsv_add_bytes(block_rsv, blocksize, 0); - block_rsv_release_bytes(block_rsv, NULL, 0); + block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); } /* @@ -6014,7 +6189,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size) + u64 hint, u64 empty_size, int for_cow) { struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; @@ -6030,7 +6205,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, empty_size, hint, (u64)-1, &ins, 0); if (ret) { - unuse_block_rsv(block_rsv, blocksize); + unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); } @@ -6058,10 +6233,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->is_data = 0; - ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + extent_op, for_cow); BUG_ON(ret); } return buf; @@ -6078,6 +6254,7 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; + int for_reloc; }; #define DROP_REFERENCE 1 @@ -6216,9 +6393,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1); + ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); BUG_ON(ret); ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, 0); @@ -6362,7 +6539,7 @@ skip: } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0); + root->root_key.objectid, level - 1, 0, 0); BUG_ON(ret); } btrfs_tree_unlock(next); @@ -6436,9 +6613,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1); + ret = btrfs_dec_ref(trans, root, eb, 1, + wc->for_reloc); else - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, 0, + wc->for_reloc); BUG_ON(ret); } /* make block locked assertion in clean_tree_block happy */ @@ -6465,7 +6644,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); out: wc->refs[level] = 0; wc->flags[level] = 0; @@ -6549,7 +6728,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * blocks are properly updated. */ void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref) + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -6637,6 +6817,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root, wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; + wc->for_reloc = for_reloc; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -6721,6 +6902,7 @@ out: * drop subtree rooted at tree block 'node'. * * NOTE: this function will unlock and release tree block 'node' + * only used by relocation code */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -6765,6 +6947,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; + wc->for_reloc = 1; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -6792,6 +6975,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + if (root->fs_info->balance_ctl) { + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + u64 tgt = 0; + + /* pick restriper's target profile and return */ + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + if (tgt) { + /* extended -> chunk profile */ + tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; + return tgt; + } + } + /* * we add in the count of missing devices because we want * to make sure that any RAID levels on a degraded FS @@ -7085,7 +7291,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * space to fit our block group in. */ if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(NULL, device, min_free, + ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -7447,6 +7653,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); + update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; @@ -7466,6 +7673,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return 0; } +static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + /* chunk -> extended profile */ + if (extra_flags == 0) + extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits &= ~extra_flags; +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start) { @@ -7476,6 +7699,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_key key; struct inode *inode; int ret; + int index; int factor; root = root->fs_info->extent_root; @@ -7491,6 +7715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, block_group); memcpy(&key, &block_group->key, sizeof(key)); + index = get_block_group_index(block_group); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -7565,6 +7790,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); + if (list_empty(&block_group->space_info->block_groups[index])) + clear_avail_alloc_bits(root->fs_info, block_group->flags); up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 49f3c9dc09f..9d09a4f8187 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -18,6 +18,7 @@ #include "ctree.h" #include "btrfs_inode.h" #include "volumes.h" +#include "check-integrity.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, } bio->bi_bdev = dev->bdev; bio_add_page(bio, page, length, start-page_offset(page)); - submit_bio(WRITE_SYNC, bio); + btrfsic_submit_bio(WRITE_SYNC, bio); wait_for_completion(&compl); if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { @@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; @@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, atomic_set(&eb->blocking_writers, 0); atomic_set(&eb->spinning_readers, 0); atomic_set(&eb->spinning_writers, 0); + eb->lock_nested = 0; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7604c300132..bc6a042cb6f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -129,6 +129,7 @@ struct extent_buffer { struct list_head leak_list; struct rcu_head rcu_head; atomic_t refs; + pid_t lock_owner; /* count of read lock holders on the extent buffer */ atomic_t write_locks; @@ -137,6 +138,7 @@ struct extent_buffer { atomic_t blocking_readers; atomic_t spinning_readers; atomic_t spinning_writers; + int lock_nested; /* protects write locks */ rwlock_t lock; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 034d9850322..859ba2dd889 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -678,7 +678,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset); + start - extent_offset, 0); BUG_ON(ret); *hint_byte = disk_bytenr; } @@ -753,7 +753,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset); + extent_offset, 0); BUG_ON(ret); inode_sub_bytes(inode, extent_end - key.offset); @@ -962,7 +962,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); if (split == start) { @@ -989,7 +989,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); } other_start = 0; @@ -1006,7 +1006,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); } if (del_nr == 0) { @@ -1274,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, dirty_pages); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); - btrfs_throttle(root); pos += copied; num_written += copied; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9a897bf7953..d20ff87ca60 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) io_ctl_unmap_page(io_ctl); for (i = 0; i < io_ctl->num_pages; i++) { - ClearPageChecked(io_ctl->pages[i]); - unlock_page(io_ctl->pages[i]); - page_cache_release(io_ctl->pages[i]); + if (io_ctl->pages[i]) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } } } @@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) return 0; - io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return ret; + ret = readahead_cache(inode); if (ret) goto out; @@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; - u64 start, end, len; + u64 start, extent_start, extent_end, len; int entries = 0; int bitmaps = 0; int ret; @@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (!i_size_read(inode)) return -1; - io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return -1; /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_cluster, block_group_list); - /* - * We shouldn't have switched the pinned extents yet so this is the - * right one - */ - unpin = root->fs_info->pinned_extents; - /* Lock all pages first so we can lock the extent safely. */ io_ctl_prepare_pages(&io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); - /* - * When searching for pinned extents, we need to start at our start - * offset. - */ - if (block_group) - start = block_group->key.objectid; - node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); @@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * We want to add any pinned extents to our free space cache * so we don't leak the space */ + + /* + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + + if (block_group) + start = block_group->key.objectid; + while (block_group && (start < block_group->key.objectid + block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, &start, &end, + ret = find_first_extent_bit(unpin, start, + &extent_start, &extent_end, EXTENT_DIRTY); if (ret) { ret = 0; @@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* This pinned extent is out of our range */ - if (start >= block_group->key.objectid + + if (extent_start >= block_group->key.objectid + block_group->key.offset) break; - len = block_group->key.objectid + - block_group->key.offset - start; - len = min(len, end + 1 - start); + extent_start = max(extent_start, start); + extent_end = min(block_group->key.objectid + + block_group->key.offset, extent_end + 1); + len = extent_end - extent_start; entries++; - ret = io_ctl_add_entry(&io_ctl, start, len, NULL); + ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); if (ret) goto out_nospc; - start = end + 1; + start = extent_end; } /* Write out the bitmaps */ @@ -2283,23 +2289,23 @@ out: static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 min_bytes) + u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; - unsigned long search_bits; - unsigned long total_bits; + unsigned long want_bits; + unsigned long min_bits; unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; int ret; - bool found = false; i = offset_to_bit(entry->offset, block_group->sectorsize, max_t(u64, offset, entry->offset)); - search_bits = bytes_to_bits(bytes, block_group->sectorsize); - total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + want_bits = bytes_to_bits(bytes, block_group->sectorsize); + min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); again: found_bits = 0; @@ -2308,7 +2314,7 @@ again: i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); - if (next_zero - i >= search_bits) { + if (next_zero - i >= min_bits) { found_bits = next_zero - i; break; } @@ -2318,10 +2324,9 @@ again: if (!found_bits) return -ENOSPC; - if (!found) { + if (!total_found) { start = i; cluster->max_size = 0; - found = true; } total_found += found_bits; @@ -2329,13 +2334,8 @@ again: if (cluster->max_size < found_bits * block_group->sectorsize) cluster->max_size = found_bits * block_group->sectorsize; - if (total_found < total_bits) { - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); - if (i - start > total_bits * 2) { - total_found = 0; - cluster->max_size = 0; - found = false; - } + if (total_found < want_bits || cluster->max_size < cont1_bytes) { + i = next_zero + 1; goto again; } @@ -2346,28 +2346,31 @@ again: &entry->offset_index, 1); BUG_ON(ret); + trace_btrfs_setup_cluster(block_group, cluster, + total_found * block_group->sectorsize, 1); return 0; } /* * This searches the block group for just extents to fill the cluster with. + * Try to find a cluster with at least bytes total bytes, at least one + * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; - struct btrfs_free_space *prev = NULL; struct btrfs_free_space *last; struct rb_node *node; u64 window_start; u64 window_free; u64 max_extent; - u64 max_gap = 128 * 1024; + u64 total_size = 0; entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) @@ -2377,8 +2380,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * We don't want bitmaps, so just move along until we find a normal * extent entry. */ - while (entry->bitmap) { - if (list_empty(&entry->list)) + while (entry->bitmap || entry->bytes < min_bytes) { + if (entry->bitmap && list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) @@ -2391,12 +2394,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, max_extent = entry->bytes; first = entry; last = entry; - prev = entry; - while (window_free <= min_bytes) { - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; + for (node = rb_next(&entry->offset_index); node; + node = rb_next(&entry->offset_index)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bitmap) { @@ -2405,26 +2405,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, continue; } - /* - * we haven't filled the empty size and the window is - * very large. reset and try again - */ - if (entry->offset - (prev->offset + prev->bytes) > max_gap || - entry->offset - window_start > (min_bytes * 2)) { - first = entry; - window_start = entry->offset; - window_free = entry->bytes; - last = entry; + if (entry->bytes < min_bytes) + continue; + + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) max_extent = entry->bytes; - } else { - last = entry; - window_free += entry->bytes; - if (entry->bytes > max_extent) - max_extent = entry->bytes; - } - prev = entry; } + if (window_free < bytes || max_extent < cont1_bytes) + return -ENOSPC; + cluster->window_start = first->offset; node = &first->offset_index; @@ -2438,17 +2430,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); - if (entry->bitmap) + if (entry->bitmap || entry->bytes < min_bytes) continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); + total_size += entry->bytes; BUG_ON(ret); } while (node && entry != last); cluster->max_size = max_extent; - + trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); return 0; } @@ -2460,7 +2453,7 @@ static noinline int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; @@ -2485,7 +2478,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, if (entry->bytes < min_bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); + bytes, cont1_bytes, min_bytes); if (!ret) return 0; } @@ -2499,7 +2492,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, /* * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes free and up to empty_size + bytes free. + * is to find at least bytes+empty_size. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise @@ -2515,23 +2508,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); u64 min_bytes; + u64 cont1_bytes; int ret; - /* for metadata, allow allocates with more holes */ + /* + * Choose the minimum extent size we'll require for this + * cluster. For SSD_SPREAD, don't allow any fragmentation. + * For metadata, allow allocates with smaller extents. For + * data, keep it dense. + */ if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; + cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - /* - * we want to do larger allocations when we are - * flushing out the delayed refs, it helps prevent - * making more work as we go along. - */ - if (trans->transaction->delayed_refs.flushing) - min_bytes = max(bytes, (bytes + empty_size) >> 1); - else - min_bytes = max(bytes, (bytes + empty_size) >> 4); - } else - min_bytes = max(bytes, (bytes + empty_size) >> 2); + cont1_bytes = bytes; + min_bytes = block_group->sectorsize; + } else { + cont1_bytes = max(bytes, (bytes + empty_size) >> 2); + min_bytes = block_group->sectorsize; + } spin_lock(&ctl->tree_lock); @@ -2539,7 +2533,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, * If we know we don't have enough space to make a cluster don't even * bother doing all the work to try and find one. */ - if (ctl->free_space < min_bytes) { + if (ctl->free_space < bytes) { spin_unlock(&ctl->tree_lock); return -ENOSPC; } @@ -2552,11 +2546,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } + trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, + min_bytes); + + INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, - bytes, min_bytes); + bytes + empty_size, + cont1_bytes, min_bytes); if (ret) ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, - offset, bytes, min_bytes); + offset, bytes + empty_size, + cont1_bytes, min_bytes); /* Clear our temporary list */ list_for_each_entry_safe(entry, tmp, &bitmaps, list) @@ -2567,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; + } else { + trace_btrfs_failed_cluster_setup(block_group); } out: spin_unlock(&cluster->lock); @@ -2588,17 +2590,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) cluster->block_group = NULL; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) +static int do_trimming(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 bytes, + u64 reserved_start, u64 reserved_bytes) { - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry = NULL; + struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 bytes = 0; - u64 actually_trimmed; - int ret = 0; + int ret; + int update = 0; + u64 trimmed = 0; - *trimmed = 0; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += reserved_bytes; + space_info->bytes_reserved += reserved_bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, bytes, &trimmed); + if (!ret) + *total_trimmed += trimmed; + + btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += reserved_bytes; + block_group->reserved -= reserved_bytes; + space_info->bytes_reserved -= reserved_bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } + + return ret; +} + +static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = 0; + u64 extent_start; + u64 extent_bytes; + u64 bytes; while (start < end) { spin_lock(&ctl->tree_lock); @@ -2609,81 +2651,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, } entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) - entry = tree_search_offset(ctl, - offset_to_bitmap(ctl, start), - 1, 1); - - if (!entry || entry->offset >= end) { + if (!entry) { spin_unlock(&ctl->tree_lock); break; } - if (entry->bitmap) { - ret = search_bitmap(ctl, entry, &start, &bytes); - if (!ret) { - if (start >= end) { - spin_unlock(&ctl->tree_lock); - break; - } - bytes = min(bytes, end - start); - bitmap_clear_bits(ctl, entry, start, bytes); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - } else { - start = entry->offset + BITS_PER_BITMAP * - block_group->sectorsize; + /* skip bitmaps */ + while (entry->bitmap) { + node = rb_next(&entry->offset_index); + if (!node) { spin_unlock(&ctl->tree_lock); - ret = 0; - continue; + goto out; } - } else { - start = entry->offset; - bytes = min(entry->bytes, end - start); - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); + entry = rb_entry(node, struct btrfs_free_space, + offset_index); } + if (entry->offset >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + + extent_start = entry->offset; + extent_bytes = entry->bytes; + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; + } + + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + spin_unlock(&ctl->tree_lock); - if (bytes >= minlen) { - struct btrfs_space_info *space_info; - int update = 0; - - space_info = block_group->space_info; - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (!block_group->ro) { - block_group->reserved += bytes; - space_info->bytes_reserved += bytes; - update = 1; - } - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - ret = btrfs_error_discard_extent(fs_info->extent_root, - start, - bytes, - &actually_trimmed); - - btrfs_add_free_space(block_group, start, bytes); - if (update) { - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += bytes; - block_group->reserved -= bytes; - space_info->bytes_reserved -= bytes; - spin_unlock(&space_info->lock); - spin_unlock(&block_group->lock); - } + ret = do_trimming(block_group, total_trimmed, start, bytes, + extent_start, extent_bytes); + if (ret) + break; +next: + start += bytes; - if (ret) - break; - *trimmed += actually_trimmed; + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } +out: + return ret; +} + +static int trim_bitmaps(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + int ret = 0; + int ret2; + u64 bytes; + u64 offset = offset_to_bitmap(ctl, start); + + while (offset < end) { + bool next_bitmap = false; + + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); + break; + } + + entry = tree_search_offset(ctl, offset, 1, 0); + if (!entry) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; + } + + bytes = minlen; + ret2 = search_bitmap(ctl, entry, &start, &bytes); + if (ret2 || start >= end) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; + } + + bytes = min(bytes, end - start); + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; + } + + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + + spin_unlock(&ctl->tree_lock); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + start, bytes); + if (ret) + break; +next: + if (next_bitmap) { + offset += BITS_PER_BITMAP * ctl->unit; + } else { + start += bytes; + if (start >= offset + BITS_PER_BITMAP * ctl->unit) + offset += BITS_PER_BITMAP * ctl->unit; } - start += bytes; - bytes = 0; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; @@ -2696,6 +2775,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, return ret; } +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + int ret; + + *trimmed = 0; + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + if (ret) + return ret; + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen); + + return ret; +} + /* * Find the left-most item in the cache tree, and then return the * smallest inode number in the item. diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index f8962a957d6..213ffa86ce1 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, trans->bytes_reserved); if (ret) goto out; + trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -498,6 +500,8 @@ again: out_put: iput(inode); out_release: + trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: trans->block_rsv = rsv; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81b235a61f8..0da19a0ea00 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state { void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_block_rsv *block_rsv; int ret; if (!list_empty(&root->orphan_list) || root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) return; + spin_lock(&root->orphan_lock); + if (!list_empty(&root->orphan_list)) { + spin_unlock(&root->orphan_lock); + return; + } + + if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { + spin_unlock(&root->orphan_lock); + return; + } + + block_rsv = root->orphan_block_rsv; + root->orphan_block_rsv = NULL; + spin_unlock(&root->orphan_lock); + if (root->orphan_item_inserted && btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, @@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, root->orphan_item_inserted = 0; } - if (root->orphan_block_rsv) { - WARN_ON(root->orphan_block_rsv->size > 0); - btrfs_free_block_rsv(root, root->orphan_block_rsv); - root->orphan_block_rsv = NULL; + if (block_rsv) { + WARN_ON(block_rsv->size > 0); + btrfs_free_block_rsv(root, block_rsv); } } @@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; - /* - * Need to hold the imutex for reservation purposes, not - * a huge deal here but I have a WARN_ON in - * btrfs_delalloc_reserve_space to catch offenders. - */ - mutex_lock(&inode->i_mutex); ret = btrfs_truncate(inode); - mutex_unlock(&inode->i_mutex); } else { nr_unlink++; } @@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, BUG_ON(!root->fs_info->enospc_unlink); root->fs_info->enospc_unlink = 0; } - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; - int encoding; int ret; int err = 0; u64 ino = btrfs_ino(inode); @@ -3059,7 +3066,6 @@ search_again: leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); - encoding = 0; if (found_key.objectid != ino) break; @@ -3072,10 +3078,6 @@ search_again: fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); - encoding = btrfs_file_extent_compression(leaf, fi); - encoding |= btrfs_file_extent_encryption(leaf, fi); - encoding |= btrfs_file_extent_other_encoding(leaf, fi); - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); @@ -3103,7 +3105,7 @@ search_again: if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); - if (!del_item && !encoding) { + if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); extent_num_bytes = new_size - @@ -3179,7 +3181,7 @@ delete: ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset); + ino, extent_offset, 0); BUG_ON(ret); } @@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); } else { /* @@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } out_unlock: nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); if (drop_inode) { inode_dec_link_count(inode); @@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, } out_unlock: nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); fail: if (drop_inode) { inode_dec_link_count(inode); @@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) out_fail: nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -5121,7 +5123,7 @@ again: } flush_dcache_page(page); } else if (create && PageUptodate(page)) { - WARN_ON(1); + BUG(); if (!trans) { kunmap(page); free_extent_map(em); @@ -6402,10 +6404,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; - /* Need this to keep space reservations serialized */ - mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); - mutex_unlock(&inode->i_mutex); if (!ret) ret = btrfs_update_time(vma->vm_file); if (ret) { @@ -6494,8 +6493,8 @@ out_unlock: if (!ret) return VM_FAULT_LOCKED; unlock_page(page); - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out: + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); return ret; } @@ -6668,7 +6667,7 @@ end_trans: err = ret; nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); + ret = btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); } @@ -6749,6 +6748,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(&ei->io_tree, &inode->i_data); extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); mutex_init(&ei->log_mutex); + mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->delalloc_inodes); @@ -7074,7 +7074,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_end_log_trans(root); } out_fail: - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); @@ -7246,7 +7246,7 @@ out_unlock: if (!err) d_instantiate(dentry, inode); nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5441ff1480f..ab620014bcc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) struct btrfs_trans_handle *trans; unsigned int flags, oldflags; int ret; + u64 ip_oldflags; + unsigned int i_oldflags; if (btrfs_root_readonly(root)) return -EROFS; @@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) mutex_lock(&inode->i_mutex); + ip_oldflags = ip->flags; + i_oldflags = inode->i_flags; + flags = btrfs_mask_flags(inode->i_mode, flags); oldflags = btrfs_flags_to_ioctl(ip->flags); if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop; + } btrfs_update_iflags(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); btrfs_end_transaction(trans, root); + out_drop: + if (ret) { + ip->flags = ip_oldflags; + inode->i_flags = i_oldflags; + } mnt_drop_write_file(file); - - ret = 0; out_unlock: mutex_unlock(&inode->i_mutex); return ret; @@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) { - struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb); struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); - ret = btrfs_trim_fs(root, &range); + ret = btrfs_trim_fs(fs_info->tree_root, &range); if (ret < 0) return ret; @@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root, return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0); + 0, objectid, NULL, 0, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -858,10 +867,8 @@ static int cluster_pages_for_defrag(struct inode *inode, return 0; file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, num_pages << PAGE_CACHE_SHIFT); - mutex_unlock(&inode->i_mutex); if (ret) return ret; again: @@ -1203,13 +1210,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { @@ -1226,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; - goto out_unlock; + goto out_free; } if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; @@ -1241,7 +1256,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, new_size = memparse(sizestr, NULL); if (new_size == 0) { ret = -EINVAL; - goto out_unlock; + goto out_free; } } @@ -1250,7 +1265,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; - goto out_unlock; + goto out_free; } new_size = old_size - new_size; } else if (mod > 0) { @@ -1259,11 +1274,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size < 256 * 1024 * 1024) { ret = -EINVAL; - goto out_unlock; + goto out_free; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_unlock; + goto out_free; } do_div(new_size, root->sectorsize); @@ -1276,7 +1291,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_unlock; + goto out_free; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); @@ -1284,9 +1299,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ret = btrfs_shrink_device(device, new_size); } -out_unlock: - mutex_unlock(&root->fs_info->volume_mutex); +out_free: kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2052,14 +2068,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2074,14 +2101,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2427,7 +2465,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao); + new_key.offset - datao, + 0); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -2977,7 +3016,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, { int ret = 0; int size; - u64 extent_offset; + u64 extent_item_pos; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; @@ -3008,15 +3047,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, } ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -ENOENT; if (ret < 0) goto out; - extent_offset = loi->logical - key.objectid; + extent_item_pos = loi->logical - key.objectid; ret = iterate_extent_inodes(root->fs_info, path, key.objectid, - extent_offset, build_ino_list, inodes); + extent_item_pos, build_ino_list, + inodes); if (ret < 0) goto out; @@ -3034,6 +3075,163 @@ out: return ret; } +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + bargs->flags = bctl->flags; + + if (atomic_read(&fs_info->balance_running)) + bargs->state |= BTRFS_BALANCE_STATE_RUNNING; + if (atomic_read(&fs_info->balance_pause_req)) + bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; + if (atomic_read(&fs_info->balance_cancel_req)) + bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; + + memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); + memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); + memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); + + if (lock) { + spin_lock(&fs_info->balance_lock); + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + spin_unlock(&fs_info->balance_lock); + } else { + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + } +} + +static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + struct btrfs_balance_control *bctl; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (arg) { + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + goto out; + } + + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out_bargs; + } + + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + + goto do_balance; + } + } else { + bargs = NULL; + } + + if (fs_info->balance_ctl) { + ret = -EINPROGRESS; + goto out_bargs; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out_bargs; + } + + bctl->fs_info = fs_info; + if (arg) { + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + + bctl->flags = bargs->flags; + } else { + /* balance everything - no filters */ + bctl->flags |= BTRFS_BALANCE_TYPE_MASK; + } + +do_balance: + ret = btrfs_balance(bctl, bargs); + /* + * bctl is freed in __cancel_balance or in free_fs_info if + * restriper was paused all the way until unmount + */ + if (arg) { + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + } + +out_bargs: + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + return ret; +} + +static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case BTRFS_BALANCE_CTL_PAUSE: + return btrfs_pause_balance(root->fs_info); + case BTRFS_BALANCE_CTL_CANCEL: + return btrfs_cancel_balance(root->fs_info); + } + + return -EINVAL; +} + +static long btrfs_ioctl_balance_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out; + } + + bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + if (!bargs) { + ret = -ENOMEM; + goto out; + } + + update_ioctl_balance_args(fs_info, 1, bargs); + + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3078,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_balance(root->fs_info->dev_root); + return btrfs_ioctl_balance(root, NULL); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -3110,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); + case BTRFS_IOC_BALANCE_V2: + return btrfs_ioctl_balance(root, argp); + case BTRFS_IOC_BALANCE_CTL: + return btrfs_ioctl_balance_ctl(root, arg); + case BTRFS_IOC_BALANCE_PROGRESS: + return btrfs_ioctl_balance_progress(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 252ae9915de..4f69028a68c 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[124]; /* pad to 1k */ }; +/* balance control ioctl modes */ +#define BTRFS_BALANCE_CTL_PAUSE 1 +#define BTRFS_BALANCE_CTL_CANCEL 2 + +/* + * this is packed, because it should be exactly the same as its disk + * byte order counterpart (struct btrfs_disk_balance_args) + */ +struct btrfs_balance_args { + __u64 profiles; + __u64 usage; + __u64 devid; + __u64 pstart; + __u64 pend; + __u64 vstart; + __u64 vend; + + __u64 target; + + __u64 flags; + + __u64 unused[8]; +} __attribute__ ((__packed__)); + +/* report balance progress to userspace */ +struct btrfs_balance_progress { + __u64 expected; /* estimated # of chunks that will be + * relocated to fulfill the request */ + __u64 considered; /* # of chunks we have considered so far */ + __u64 completed; /* # of chunks relocated so far */ +}; + +#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) +#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) +#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) + +struct btrfs_ioctl_balance_args { + __u64 flags; /* in/out */ + __u64 state; /* out */ + + struct btrfs_balance_args data; /* in/out */ + struct btrfs_balance_args meta; /* in/out */ + struct btrfs_balance_args sys; /* in/out */ + + struct btrfs_balance_progress stat; /* out */ + + __u64 unused[72]; /* pad to 1k */ +}; + #define BTRFS_INO_LOOKUP_PATH_MAX 4080 struct btrfs_ioctl_ino_lookup_args { __u64 treeid; @@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ + struct btrfs_ioctl_balance_args) +#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) +#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ + struct btrfs_ioctl_balance_args) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d77b67c4b27..5e178d8f716 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb); */ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } if (rw == BTRFS_WRITE_LOCK) { if (atomic_read(&eb->blocking_writers) == 0) { WARN_ON(atomic_read(&eb->spinning_writers) != 1); @@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) */ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (&eb->lock_nested && current->pid == eb->lock_owner) { + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } if (rw == BTRFS_WRITE_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); @@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) void btrfs_tree_read_lock(struct extent_buffer *eb) { again: + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner) { + /* + * This extent is already write-locked by our thread. We allow + * an additional read lock to be added because it's for the same + * thread. btrfs_find_all_roots() depends on this as it may be + * called on a partly (write-)locked tree. + */ + BUG_ON(eb->lock_nested); + eb->lock_nested = 1; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); - wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); goto again; } atomic_inc(&eb->read_locks); @@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) } atomic_inc(&eb->write_locks); atomic_inc(&eb->spinning_writers); + eb->lock_owner = current->pid; return 1; } @@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->spinning_readers) == 0); atomic_dec(&eb->spinning_readers); @@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); if (atomic_dec_and_test(&eb->blocking_readers)) @@ -181,6 +229,7 @@ again: WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); atomic_inc(&eb->write_locks); + eb->lock_owner = current->pid; return 0; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cfb55434a46..8c1aae2c845 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, 1); BUG_ON(ret); ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, 1); BUG_ON(ret); } if (dirty) @@ -1778,21 +1778,23 @@ again: ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + src->root_key.objectid, level - 1, 0, + 1); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0); + 0, 1); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + src->root_key.objectid, level - 1, 0, + 1); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0); + 0, 1); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2244,7 +2246,7 @@ again: } else { list_del_init(&reloc_root->root_list); } - btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); } if (found) { @@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0); + node->level, 0, 1); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); @@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode, index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { - mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); - mutex_unlock(&inode->i_mutex); if (ret) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ddf2c90d3fc..9770cc5bfb7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -25,6 +25,7 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" +#include "check-integrity.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, u8 ref_level; unsigned long ptr = 0; const int bufsize = 4096; - u64 extent_offset; + u64 extent_item_pos; path = btrfs_alloc_path(); @@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, if (ret < 0) goto out; - extent_offset = swarn.logical - found_key.objectid; + extent_item_pos = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, } else { swarn.path = path; iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_offset, + extent_item_pos, scrub_print_warning_inode, &swarn); } @@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = scrub_fixup_end_io; bio->bi_private = &complete; - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); /* this will also unplug the queue */ wait_for_completion(&complete); @@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev) sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, sbio->bio); + btrfsic_submit_bio(READ, sbio->bio); return 0; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ae488aa1966..3ce97b217cb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - int ret; - - ret = close_ctree(root); - sb->s_fs_info = NULL; - - (void)ret; /* FIXME: need to fix VFS to return error? */ + (void)close_ctree(btrfs_sb(sb)->tree_root); + /* FIXME: need to fix VFS to return error? */ + /* AV: return it _where_? ->put_super() can be triggered by any number + * of async events, up to and including delivery of SIGKILL to the + * last process that kept it busy. Or segfault in the aforementioned + * process... Whom would you report that to? + */ } enum { @@ -163,8 +163,11 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, + Opt_no_space_cache, Opt_recovery, Opt_skip_balance, + Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_check_integrity_print_mask, + Opt_err, }; static match_table_t tokens = { @@ -199,6 +202,10 @@ static match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, + {Opt_skip_balance, "skip_balance"}, + {Opt_check_integrity, "check_int"}, + {Opt_check_integrity_including_extent_data, "check_int_data"}, + {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, {Opt_err, NULL}, }; @@ -397,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; + case Opt_skip_balance: + btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + break; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + case Opt_check_integrity_including_extent_data: + printk(KERN_INFO "btrfs: enabling check integrity" + " including extent data\n"); + btrfs_set_opt(info->mount_opt, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity: + printk(KERN_INFO "btrfs: enabling check integrity\n"); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity_print_mask: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->check_integrity_print_mask = intarg; + printk(KERN_INFO "btrfs:" + " check_integrity_print_mask 0x%x\n", + info->check_integrity_print_mask); + } + break; +#else + case Opt_check_integrity_including_extent_data: + case Opt_check_integrity: + case Opt_check_integrity_print_mask: + printk(KERN_ERR "btrfs: support for check_integrity*" + " not compiled in!\n"); + ret = -EINVAL; + goto out; +#endif case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -500,7 +541,8 @@ out: static struct dentry *get_default_root(struct super_block *sb, u64 subvol_objectid) { - struct btrfs_root *root = sb->s_fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_path *path; @@ -530,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. */ - dir_id = btrfs_super_root_dir(root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -544,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb, */ btrfs_free_path(path); dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = root->fs_info->fs_root; + new_root = fs_info->fs_root; goto setup_root; } @@ -552,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb, btrfs_free_path(path); find_root: - new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + new_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(new_root)) return ERR_CAST(new_root); @@ -588,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct dentry *root_dentry; - struct btrfs_root *tree_root; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_key key; int err; @@ -603,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif - tree_root = open_ctree(sb, fs_devices, (char *)data); - - if (IS_ERR(tree_root)) { + err = open_ctree(sb, fs_devices, (char *)data); + if (err) { printk("btrfs: open_ctree failed\n"); - return PTR_ERR(tree_root); + return err; } - sb->s_fs_info = tree_root; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); + inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; @@ -631,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb, save_mount_options(sb, data); cleancache_init_fs(sb); + sb->s_flags |= MS_ACTIVE; return 0; fail_close: - close_ctree(tree_root); + close_ctree(fs_info->tree_root); return err; } int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_root *root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; int ret; trace_btrfs_sync_fs(wait); if (!wait) { - filemap_flush(root->fs_info->btree_inode->i_mapping); + filemap_flush(fs_info->btree_inode->i_mapping); return 0; } @@ -663,8 +705,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_fs_info *info = root->fs_info; + struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); + struct btrfs_root *root = info->tree_root; char *compress_type; if (btrfs_test_opt(root, DEGRADED)) @@ -722,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(root, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); + if (btrfs_test_opt(root, SKIP_BALANCE)) + seq_puts(seq, ",skip_balance"); return 0; } static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_root *test_root = data; - struct btrfs_root *root = btrfs_sb(s); + struct btrfs_fs_info *p = data; + struct btrfs_fs_info *fs_info = btrfs_sb(s); - /* - * If this super block is going away, return false as it - * can't match as an existing super block. - */ - if (!atomic_read(&s->s_active)) - return 0; - return root->fs_info->fs_devices == test_root->fs_info->fs_devices; + return fs_info->fs_devices == p->fs_devices; } static int btrfs_set_super(struct super_block *s, void *data) { - s->s_fs_info = data; - - return set_anon_super(s, data); + int err = set_anon_super(s, data); + if (!err) + s->s_fs_info = data; + return err; } /* @@ -903,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (!fs_info) return ERR_PTR(-ENOMEM); - fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!fs_info->tree_root) { - error = -ENOMEM; - goto error_fs_info; - } - fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); @@ -928,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, - fs_info->tree_root); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; } if (s->s_root) { - if ((flags ^ s->s_flags) & MS_RDONLY) { - deactivate_locked_super(s); - error = -EBUSY; - goto error_close_devices; - } - btrfs_close_devices(fs_devices); free_fs_info(fs_info); + if ((flags ^ s->s_flags) & MS_RDONLY) + error = -EBUSY; } else { char b[BDEVNAME_SIZE]; s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - btrfs_sb(s)->fs_info->bdev_holder = fs_type; + btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - s->s_flags |= MS_ACTIVE; } - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { + root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); + if (IS_ERR(root)) deactivate_locked_super(s); - return root; - } return root; @@ -977,7 +997,8 @@ error_fs_info: static int btrfs_remount(struct super_block *sb, int *flags, char *data) { - struct btrfs_root *root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; int ret; ret = btrfs_parse_options(root, data); @@ -993,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = btrfs_commit_super(root); WARN_ON(ret); } else { - if (root->fs_info->fs_devices->rw_devices == 0) + if (fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(fs_info->super_copy) != 0) return -EINVAL; - ret = btrfs_cleanup_fs_roots(root->fs_info); + ret = btrfs_cleanup_fs_roots(fs_info); WARN_ON(ret); /* recover relocation */ @@ -1168,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = root->fs_info->super_copy; - struct list_head *head = &root->fs_info->space_info; + struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct list_head *head = &fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)root->fs_info->fsid; + __be32 *fsid = (__be32 *)fs_info->fsid; int ret; /* holding chunk_muext to avoid allocating new chunks */ - mutex_lock(&root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { @@ -1198,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bavail = total_free_data; - ret = btrfs_calc_avail_data_space(root, &total_free_data); + ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); if (ret) { - mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } buf->f_bavail += total_free_data; buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -1219,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static void btrfs_kill_super(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + kill_anon_super(sb); + free_fs_info(fs_info); +} + static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", .mount = btrfs_mount, - .kill_sb = kill_anon_super, + .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -1257,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - mutex_lock(&root->fs_info->transaction_kthread_mutex); - mutex_lock(&root->fs_info->cleaner_mutex); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + mutex_lock(&fs_info->transaction_kthread_mutex); + mutex_lock(&fs_info->cleaner_mutex); return 0; } static int btrfs_unfreeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - mutex_unlock(&root->fs_info->cleaner_mutex); - mutex_unlock(&root->fs_info->transaction_kthread_mutex); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + mutex_unlock(&fs_info->cleaner_mutex); + mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d94cd3..287a6728b1a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); + WARN_ON(transaction->delayed_refs.root.rb_node); + WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -108,8 +110,11 @@ loop: cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.seq = 1; + init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); + INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); @@ -321,6 +326,8 @@ again: } if (num_bytes) { + trace_btrfs_space_reservation(root->fs_info, "transaction", + (u64)h, num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } @@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - while (count < 4) { + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && trans->transaction->delayed_refs.num_heads_ready > 64) { trans->delayed_ref_updates = 0; - - /* - * do a full flush if the transaction is trying - * to close - */ - if (trans->transaction->delayed_refs.flushing) - cur = 0; btrfs_run_delayed_refs(trans, root, cur); } else { break; @@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, NULL, 0); + btrfs_drop_snapshot(root, NULL, 0, 0); else - btrfs_drop_snapshot(root, NULL, 1); + btrfs_drop_snapshot(root, NULL, 1, 0); } return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3568374d419..cb877e0886a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset); + key->objectid, offset, 0); BUG_ON(ret); } else { /* diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c new file mode 100644 index 00000000000..12f5147bd2b --- /dev/null +++ b/fs/btrfs/ulist.c @@ -0,0 +1,220 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen <sensille@gmx.net> + * Distributed under the GNU GPL license version 2. + */ + +#include <linux/slab.h> +#include <linux/module.h> +#include "ulist.h" + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * The implementation is preliminary and can probably be sped up + * significantly. A first step would be to store the values in an rbtree + * as soon as ULIST_SIZE is exceeded. + * + * A sample usage for ulists is the enumeration of directed graphs without + * visiting a node twice. The pseudo-code could look like this: + * + * ulist = ulist_alloc(); + * ulist_add(ulist, root); + * elem = NULL; + * + * while ((elem = ulist_next(ulist, elem)) { + * for (all child nodes n in elem) + * ulist_add(ulist, n); + * do something useful with the node; + * } + * ulist_free(ulist); + * + * This assumes the graph nodes are adressable by u64. This stems from the + * usage for tree enumeration in btrfs, where the logical addresses are + * 64 bit. + * + * It is also useful for tree enumeration which could be done elegantly + * recursively, but is not possible due to kernel stack limitations. The + * loop would be similar to the above. + */ + +/** + * ulist_init - freshly initialize a ulist + * @ulist: the ulist to initialize + * + * Note: don't use this function to init an already used ulist, use + * ulist_reinit instead. + */ +void ulist_init(struct ulist *ulist) +{ + ulist->nnodes = 0; + ulist->nodes = ulist->int_nodes; + ulist->nodes_alloced = ULIST_SIZE; +} +EXPORT_SYMBOL(ulist_init); + +/** + * ulist_fini - free up additionally allocated memory for the ulist + * @ulist: the ulist from which to free the additional memory + * + * This is useful in cases where the base 'struct ulist' has been statically + * allocated. + */ +void ulist_fini(struct ulist *ulist) +{ + /* + * The first ULIST_SIZE elements are stored inline in struct ulist. + * Only if more elements are alocated they need to be freed. + */ + if (ulist->nodes_alloced > ULIST_SIZE) + kfree(ulist->nodes); + ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ +} +EXPORT_SYMBOL(ulist_fini); + +/** + * ulist_reinit - prepare a ulist for reuse + * @ulist: ulist to be reused + * + * Free up all additional memory allocated for the list elements and reinit + * the ulist. + */ +void ulist_reinit(struct ulist *ulist) +{ + ulist_fini(ulist); + ulist_init(ulist); +} +EXPORT_SYMBOL(ulist_reinit); + +/** + * ulist_alloc - dynamically allocate a ulist + * @gfp_mask: allocation flags to for base allocation + * + * The allocated ulist will be returned in an initialized state. + */ +struct ulist *ulist_alloc(unsigned long gfp_mask) +{ + struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); + + if (!ulist) + return NULL; + + ulist_init(ulist); + + return ulist; +} +EXPORT_SYMBOL(ulist_alloc); + +/** + * ulist_free - free dynamically allocated ulist + * @ulist: ulist to free + * + * It is not necessary to call ulist_fini before. + */ +void ulist_free(struct ulist *ulist) +{ + if (!ulist) + return; + ulist_fini(ulist); + kfree(ulist); +} +EXPORT_SYMBOL(ulist_free); + +/** + * ulist_add - add an element to the ulist + * @ulist: ulist to add the element to + * @val: value to add to ulist + * @aux: auxiliary value to store along with val + * @gfp_mask: flags to use for allocation + * + * Note: locking must be provided by the caller. In case of rwlocks write + * locking is needed + * + * Add an element to a ulist. The @val will only be added if it doesn't + * already exist. If it is added, the auxiliary value @aux is stored along with + * it. In case @val already exists in the ulist, @aux is ignored, even if + * it differs from the already stored value. + * + * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been + * inserted. + * In case of allocation failure -ENOMEM is returned and the ulist stays + * unaltered. + */ +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long gfp_mask) +{ + int i; + + for (i = 0; i < ulist->nnodes; ++i) { + if (ulist->nodes[i].val == val) + return 0; + } + + if (ulist->nnodes >= ulist->nodes_alloced) { + u64 new_alloced = ulist->nodes_alloced + 128; + struct ulist_node *new_nodes; + void *old = NULL; + + /* + * if nodes_alloced == ULIST_SIZE no memory has been allocated + * yet, so pass NULL to krealloc + */ + if (ulist->nodes_alloced > ULIST_SIZE) + old = ulist->nodes; + + new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, + gfp_mask); + if (!new_nodes) + return -ENOMEM; + + if (!old) + memcpy(new_nodes, ulist->int_nodes, + sizeof(ulist->int_nodes)); + + ulist->nodes = new_nodes; + ulist->nodes_alloced = new_alloced; + } + ulist->nodes[ulist->nnodes].val = val; + ulist->nodes[ulist->nnodes].aux = aux; + ++ulist->nnodes; + + return 1; +} +EXPORT_SYMBOL(ulist_add); + +/** + * ulist_next - iterate ulist + * @ulist: ulist to iterate + * @prev: previously returned element or %NULL to start iteration + * + * Note: locking must be provided by the caller. In case of rwlocks only read + * locking is needed + * + * This function is used to iterate an ulist. The iteration is started with + * @prev = %NULL. It returns the next element from the ulist or %NULL when the + * end is reached. No guarantee is made with respect to the order in which + * the elements are returned. They might neither be returned in order of + * addition nor in ascending order. + * It is allowed to call ulist_add during an enumeration. Newly added items + * are guaranteed to show up in the running enumeration. + */ +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) +{ + int next; + + if (ulist->nnodes == 0) + return NULL; + + if (!prev) + return &ulist->nodes[0]; + + next = (prev - ulist->nodes) + 1; + if (next < 0 || next >= ulist->nnodes) + return NULL; + + return &ulist->nodes[next]; +} +EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h new file mode 100644 index 00000000000..2e25dec58ec --- /dev/null +++ b/fs/btrfs/ulist.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen <sensille@gmx.net> + * Distributed under the GNU GPL license version 2. + * + */ + +#ifndef __ULIST__ +#define __ULIST__ + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * The implementation is preliminary and can probably be sped up + * significantly. A first step would be to store the values in an rbtree + * as soon as ULIST_SIZE is exceeded. + */ + +/* + * number of elements statically allocated inside struct ulist + */ +#define ULIST_SIZE 16 + +/* + * element of the list + */ +struct ulist_node { + u64 val; /* value to store */ + unsigned long aux; /* auxiliary value saved along with the val */ +}; + +struct ulist { + /* + * number of elements stored in list + */ + unsigned long nnodes; + + /* + * number of nodes we already have room for + */ + unsigned long nodes_alloced; + + /* + * pointer to the array storing the elements. The first ULIST_SIZE + * elements are stored inline. In this case the it points to int_nodes. + * After exceeding ULIST_SIZE, dynamic memory is allocated. + */ + struct ulist_node *nodes; + + /* + * inline storage space for the first ULIST_SIZE entries + */ + struct ulist_node int_nodes[ULIST_SIZE]; +}; + +void ulist_init(struct ulist *ulist); +void ulist_fini(struct ulist *ulist); +void ulist_reinit(struct ulist *ulist); +struct ulist *ulist_alloc(unsigned long gfp_mask); +void ulist_free(struct ulist *ulist); +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long gfp_mask); +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..0b4e2af7954 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -23,6 +23,7 @@ #include <linux/random.h> #include <linux/iocontext.h> #include <linux/capability.h> +#include <linux/kthread.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -32,6 +33,7 @@ #include "print-tree.h" #include "volumes.h" #include "async-thread.h" +#include "check-integrity.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -246,7 +248,7 @@ loop_lock: sync_pending = 0; } - submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur->bi_rw, cur); num_run++; batch_run++; if (need_resched()) @@ -706,8 +708,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, u64 devid; u64 transid; - mutex_lock(&uuid_mutex); - flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); @@ -716,6 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error; } + mutex_lock(&uuid_mutex); ret = set_blocksize(bdev, 4096); if (ret) goto error_close; @@ -737,9 +738,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: + mutex_unlock(&uuid_mutex); blkdev_put(bdev, flags); error: - mutex_unlock(&uuid_mutex); return ret; } @@ -829,7 +830,6 @@ out: /* * find_free_dev_extent - find free space in the specified device - * @trans: transaction handler * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @start: store the start of the free space. @@ -848,8 +848,7 @@ out: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -893,7 +892,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0) { @@ -1282,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bool clear_super = false; mutex_lock(&uuid_mutex); - mutex_lock(&root->fs_info->volume_mutex); all_avail = root->fs_info->avail_data_alloc_bits | root->fs_info->avail_system_alloc_bits | @@ -1452,7 +1450,6 @@ error_close: if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: - mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; error_undo: @@ -1469,8 +1466,7 @@ error_undo: /* * does all the dirty work required for changing file system's UUID. */ -static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_prepare_sprout(struct btrfs_root *root) { struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; @@ -1629,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } filemap_write_and_wait(bdev->bd_inode->i_mapping); - mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; /* @@ -1695,7 +1690,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(trans, root); + ret = btrfs_prepare_sprout(root); BUG_ON(ret); } @@ -1757,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); BUG_ON(ret); } -out: - mutex_unlock(&root->fs_info->volume_mutex); + return ret; error: blkdev_put(bdev, FMODE_EXCL); @@ -1766,7 +1760,7 @@ error: mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } - goto out; + return ret; } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, @@ -2077,6 +2071,362 @@ error: return ret; } +static int insert_balance_item(struct btrfs_root *root, + struct btrfs_balance_control *bctl) +{ + struct btrfs_trans_handle *trans; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); + btrfs_set_balance_data(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); + btrfs_set_balance_meta(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); + btrfs_set_balance_sys(leaf, item, &disk_bargs); + + btrfs_set_balance_flags(leaf, item, bctl->flags); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +static int del_balance_item(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +/* + * This is a heuristic used to reduce the number of chunks balanced on + * resume after balance was interrupted. + */ +static void update_balance_args(struct btrfs_balance_control *bctl) +{ + /* + * Turn on soft mode for chunk types that were being converted. + */ + if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; + + /* + * Turn on usage filter if is not already used. The idea is + * that chunks that we have already balanced should be + * reasonably full. Don't do it for chunks that are being + * converted - that will keep us from relocating unconverted + * (albeit full) chunks. + */ + if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->data.usage = 90; + } + if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->sys.usage = 90; + } + if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->meta.usage = 90; + } +} + +/* + * Should be called with both balance and volume mutexes held to + * serialize other volume operations (add_dev/rm_dev/resize) with + * restriper. Same goes for unset_balance_control. + */ +static void set_balance_control(struct btrfs_balance_control *bctl) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + + BUG_ON(fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); +} + +static void unset_balance_control(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + BUG_ON(!fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = NULL; + spin_unlock(&fs_info->balance_lock); + + kfree(bctl); +} + +/* + * Balance filters. Return 1 if chunk should be filtered out + * (should not be balanced). + */ +static int chunk_profiles_filter(u64 chunk_profile, + struct btrfs_balance_args *bargs) +{ + chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (chunk_profile == 0) + chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (bargs->profiles & chunk_profile) + return 0; + + return 1; +} + +static u64 div_factor_fine(u64 num, int factor) +{ + if (factor <= 0) + return 0; + if (factor >= 100) + return num; + + num *= factor; + do_div(num, 100); + return num; +} + +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; + u64 chunk_used, user_thresh; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (chunk_used < user_thresh) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_devid_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + int i; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) + return 0; + } + + return 1; +} + +/* [pstart, pend) */ +static int chunk_drange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + u64 stripe_offset; + u64 stripe_length; + int factor; + int i; + + if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) + return 0; + + if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + factor = num_stripes / factor; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) + continue; + + stripe_offset = btrfs_stripe_offset(leaf, stripe); + stripe_length = btrfs_chunk_length(leaf, chunk); + do_div(stripe_length, factor); + + if (stripe_offset < bargs->pend && + stripe_offset + stripe_length > bargs->pstart) + return 0; + } + + return 1; +} + +/* [vstart, vend) */ +static int chunk_vrange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + if (chunk_offset < bargs->vend && + chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) + /* at least part of the chunk is inside this vrange */ + return 0; + + return 1; +} + +static int chunk_soft_convert_filter(u64 chunk_profile, + struct btrfs_balance_args *bargs) +{ + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return 0; + + chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (chunk_profile == 0) + chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (bargs->target & chunk_profile) + return 1; + + return 0; +} + +static int should_balance_chunk(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 chunk_offset) +{ + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + struct btrfs_balance_args *bargs = NULL; + u64 chunk_type = btrfs_chunk_type(leaf, chunk); + + /* type filter */ + if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & + (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { + return 0; + } + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + bargs = &bctl->data; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + bargs = &bctl->sys; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + bargs = &bctl->meta; + + /* profiles filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && + chunk_profiles_filter(chunk_type, bargs)) { + return 0; + } + + /* usage filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && + chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; + } + + /* devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && + chunk_devid_filter(leaf, chunk, bargs)) { + return 0; + } + + /* drange filter, makes sense only with devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && + chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* vrange filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && + chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* soft profile changing mode */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && + chunk_soft_convert_filter(chunk_type, bargs)) { + return 0; + } + + return 1; +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -2086,29 +2436,28 @@ static u64 div_factor(u64 num, int factor) return num; } -int btrfs_balance(struct btrfs_root *dev_root) +static int __btrfs_balance(struct btrfs_fs_info *fs_info) { - int ret; - struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + struct btrfs_root *chunk_root = fs_info->chunk_root; + struct btrfs_root *dev_root = fs_info->dev_root; + struct list_head *devices; struct btrfs_device *device; u64 old_size; u64 size_to_free; + struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; - struct btrfs_trans_handle *trans; struct btrfs_key found_key; - - if (dev_root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&dev_root->fs_info->volume_mutex); - dev_root = dev_root->fs_info->dev_root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + int slot; + int ret; + int enospc_errors = 0; + bool counting = true; /* step one make some room on all the devices */ + devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; size_to_free = div_factor(old_size, 1); @@ -2137,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root) ret = -ENOMEM; goto error; } + + /* zero out stat counters */ + spin_lock(&fs_info->balance_lock); + memset(&bctl->stat, 0, sizeof(bctl->stat)); + spin_unlock(&fs_info->balance_lock); +again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + if ((!counting && atomic_read(&fs_info->balance_pause_req)) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -ECANCELED; + goto error; + } + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; @@ -2151,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root) * failed */ if (ret == 0) - break; + BUG(); /* FIXME break ? */ ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) + if (ret) { + ret = 0; break; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); if (found_key.objectid != key.objectid) break; @@ -2167,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root) if (found_key.offset == 0) break; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + + if (!counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.considered++; + spin_unlock(&fs_info->balance_lock); + } + + ret = should_balance_chunk(chunk_root, leaf, chunk, + found_key.offset); btrfs_release_path(path); + if (!ret) + goto loop; + + if (counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.expected++; + spin_unlock(&fs_info->balance_lock); + goto loop; + } + ret = btrfs_relocate_chunk(chunk_root, chunk_root->root_key.objectid, found_key.objectid, found_key.offset); if (ret && ret != -ENOSPC) goto error; + if (ret == -ENOSPC) { + enospc_errors++; + } else { + spin_lock(&fs_info->balance_lock); + bctl->stat.completed++; + spin_unlock(&fs_info->balance_lock); + } +loop: key.offset = found_key.offset - 1; } - ret = 0; + + if (counting) { + btrfs_release_path(path); + counting = false; + goto again; + } error: btrfs_free_path(path); - mutex_unlock(&dev_root->fs_info->volume_mutex); + if (enospc_errors) { + printk(KERN_INFO "btrfs: %d enospc errors during balance\n", + enospc_errors); + if (!ret) + ret = -ENOSPC; + } + return ret; } +static inline int balance_need_close(struct btrfs_fs_info *fs_info) +{ + /* cancel requested || normal exit path */ + return atomic_read(&fs_info->balance_cancel_req) || + (atomic_read(&fs_info->balance_pause_req) == 0 && + atomic_read(&fs_info->balance_cancel_req) == 0); +} + +static void __cancel_balance(struct btrfs_fs_info *fs_info) +{ + int ret; + + unset_balance_control(fs_info); + ret = del_balance_item(fs_info->tree_root); + BUG_ON(ret); +} + +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs); + +/* + * Should be called with both balance and volume mutexes held + */ +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 allowed; + int ret; + + if (btrfs_fs_closing(fs_info) || + atomic_read(&fs_info->balance_pause_req) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -EINVAL; + goto out; + } + + /* + * In case of mixed groups both data and meta should be picked, + * and identical options should be given for both of them. + */ + allowed = btrfs_super_incompat_flags(fs_info->super_copy); + if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { + if (!(bctl->flags & BTRFS_BALANCE_DATA) || + !(bctl->flags & BTRFS_BALANCE_METADATA) || + memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { + printk(KERN_ERR "btrfs: with mixed groups data and " + "metadata balance options must be the same\n"); + ret = -EINVAL; + goto out; + } + } + + /* + * Profile changing sanity checks. Skip them if a simple + * balance is requested. + */ + if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & + BTRFS_BALANCE_ARGS_CONVERT)) + goto do_balance; + + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + if (fs_info->fs_devices->num_devices == 1) + allowed |= BTRFS_BLOCK_GROUP_DUP; + else if (fs_info->fs_devices->num_devices < 4) + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); + else + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10); + + if (!profile_is_valid(bctl->data.target, 1) || + bctl->data.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "data profile %llu\n", + (unsigned long long)bctl->data.target); + ret = -EINVAL; + goto out; + } + if (!profile_is_valid(bctl->meta.target, 1) || + bctl->meta.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "metadata profile %llu\n", + (unsigned long long)bctl->meta.target); + ret = -EINVAL; + goto out; + } + if (!profile_is_valid(bctl->sys.target, 1) || + bctl->sys.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "system profile %llu\n", + (unsigned long long)bctl->sys.target); + ret = -EINVAL; + goto out; + } + + if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { + printk(KERN_ERR "btrfs: dup for data is not allowed\n"); + ret = -EINVAL; + goto out; + } + + /* allow to reduce meta or sys integrity only if force set */ + allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10; + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + printk(KERN_INFO "btrfs: force reducing metadata " + "integrity\n"); + } else { + printk(KERN_ERR "btrfs: balance will reduce metadata " + "integrity, use force if you want this\n"); + ret = -EINVAL; + goto out; + } + } + +do_balance: + ret = insert_balance_item(fs_info->tree_root, bctl); + if (ret && ret != -EEXIST) + goto out; + + if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { + BUG_ON(ret == -EEXIST); + set_balance_control(bctl); + } else { + BUG_ON(ret != -EEXIST); + spin_lock(&fs_info->balance_lock); + update_balance_args(bctl); + spin_unlock(&fs_info->balance_lock); + } + + atomic_inc(&fs_info->balance_running); + mutex_unlock(&fs_info->balance_mutex); + + ret = __btrfs_balance(fs_info); + + mutex_lock(&fs_info->balance_mutex); + atomic_dec(&fs_info->balance_running); + + if (bargs) { + memset(bargs, 0, sizeof(*bargs)); + update_ioctl_balance_args(fs_info, 0, bargs); + } + + if ((ret && ret != -ECANCELED && ret != -ENOSPC) || + balance_need_close(fs_info)) { + __cancel_balance(fs_info); + } + + wake_up(&fs_info->balance_wait_q); + + return ret; +out: + if (bctl->flags & BTRFS_BALANCE_RESUME) + __cancel_balance(fs_info); + else + kfree(bctl); + return ret; +} + +static int balance_kthread(void *data) +{ + struct btrfs_balance_control *bctl = + (struct btrfs_balance_control *)data; + struct btrfs_fs_info *fs_info = bctl->fs_info; + int ret = 0; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + set_balance_control(bctl); + + if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + printk(KERN_INFO "btrfs: force skipping balance\n"); + } else { + printk(KERN_INFO "btrfs: continuing balance\n"); + ret = btrfs_balance(bctl, NULL); + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + return ret; +} + +int btrfs_recover_balance(struct btrfs_root *tree_root) +{ + struct task_struct *tsk; + struct btrfs_balance_control *bctl; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out; + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) + goto out_bctl; + if (ret > 0) { /* ret = -ENOENT; */ + ret = 0; + goto out_bctl; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + bctl->fs_info = tree_root->fs_info; + bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; + + btrfs_balance_data(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); + btrfs_balance_meta(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); + btrfs_balance_sys(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + + tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); + if (IS_ERR(tsk)) + ret = PTR_ERR(tsk); + else + goto out; + +out_bctl: + kfree(bctl); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_pause_balance(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + if (atomic_read(&fs_info->balance_running)) { + atomic_inc(&fs_info->balance_pause_req); + mutex_unlock(&fs_info->balance_mutex); + + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + + mutex_lock(&fs_info->balance_mutex); + /* we are good with balance_ctl ripped off from under us */ + BUG_ON(atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_pause_req); + } else { + ret = -ENOTCONN; + } + + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + atomic_inc(&fs_info->balance_cancel_req); + /* + * if we are running just wait and return, balance item is + * deleted in btrfs_balance in this case + */ + if (atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + mutex_lock(&fs_info->balance_mutex); + } else { + /* __cancel_balance needs volume_mutex */ + mutex_unlock(&fs_info->balance_mutex); + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl) + __cancel_balance(fs_info); + + mutex_unlock(&fs_info->volume_mutex); + } + + BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_cancel_req); + mutex_unlock(&fs_info->balance_mutex); + return 0; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -2323,8 +3041,7 @@ done: return ret; } -static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int btrfs_add_system_chunk(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { @@ -2441,10 +3158,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_stripe_size = 256 * 1024 * 1024; + /* for larger filesystems, use larger metadata chunks */ + if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) + max_stripe_size = 1024 * 1024 * 1024; + else + max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 8 * 1024 * 1024; + max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; } else { printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", @@ -2496,7 +3217,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(trans, device, + ret = find_free_dev_extent(device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -2687,7 +3408,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(ret); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + ret = btrfs_add_system_chunk(chunk_root, &key, chunk, item_size); BUG_ON(ret); } @@ -2752,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, return ret; alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - (fs_info->metadata_alloc_profile & - fs_info->avail_metadata_alloc_bits); + fs_info->avail_metadata_alloc_bits; alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, @@ -2763,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - (fs_info->system_alloc_profile & - fs_info->avail_system_alloc_bits); + fs_info->avail_system_alloc_bits; alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, @@ -2901,26 +3620,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; - int stripes_allocated = 8; - int stripes_required = 1; int stripe_index; int i; + int ret = 0; int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; - if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) - stripes_allocated = 1; -again: - if (bbio_ret) { - bbio = kzalloc(btrfs_bio_size(stripes_allocated), - GFP_NOFS); - if (!bbio) - return -ENOMEM; - - atomic_set(&bbio->error, 0); - } - read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); @@ -2939,32 +3645,6 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our btrfs_bio struct is too small, back off and try again */ - if (rw & REQ_WRITE) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP)) { - stripes_required = map->num_stripes; - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripes_required = map->sub_stripes; - max_errors = 1; - } - } - if (rw & REQ_DISCARD) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID10)) { - stripes_required = map->num_stripes; - } - } - if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && - stripes_allocated < stripes_required) { - stripes_allocated = map->num_stripes; - free_extent_map(em); - kfree(bbio); - goto again; - } stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -2980,10 +3660,7 @@ again: if (rw & REQ_DISCARD) *length = min_t(u64, em->len - offset, *length); - else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { + else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, map->stripe_len - stripe_offset); @@ -3059,81 +3736,55 @@ again: } BUG_ON(stripe_index >= map->num_stripes); + bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + atomic_set(&bbio->error, 0); + if (rw & REQ_DISCARD) { + int factor = 0; + int sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + + if (map->type & + (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + stripes_per_dev = div_u64_rem(stripe_nr_end - + stripe_nr_orig, + factor, + &remaining_stripes); + } + for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; bbio->stripes[i].dev = map->stripes[stripe_index].dev; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - u64 stripes; - u32 last_stripe = 0; - int j; - - div_u64_rem(stripe_nr_end - 1, - map->num_stripes, - &last_stripe); - - for (j = 0; j < map->num_stripes; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - map->num_stripes, &test); - if (test == stripe_index) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, map->num_stripes); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i == 0) { - bbio->stripes[i].length -= - stripe_offset; - stripe_offset = 0; - } - if (stripe_index == last_stripe) - bbio->stripes[i].length -= - stripe_end_offset; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - u64 stripes; - int j; - int factor = map->num_stripes / - map->sub_stripes; - u32 last_stripe = 0; - - div_u64_rem(stripe_nr_end - 1, - factor, &last_stripe); - last_stripe *= map->sub_stripes; - - for (j = 0; j < factor; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - factor, &test); - - if (test == - stripe_index / map->sub_stripes) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, factor); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i < map->sub_stripes) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + if (i < sub_stripes) bbio->stripes[i].length -= stripe_offset; - if (i == map->sub_stripes - 1) - stripe_offset = 0; - } - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - map->sub_stripes - 1)) { + if ((i / sub_stripes + 1) % + sub_stripes == remaining_stripes) bbio->stripes[i].length -= stripe_end_offset; - } + if (i == sub_stripes - 1) + stripe_offset = 0; } else bbio->stripes[i].length = *length; @@ -3155,15 +3806,22 @@ again: stripe_index++; } } - if (bbio_ret) { - *bbio_ret = bbio; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; + + if (rw & REQ_WRITE) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + max_errors = 1; + } } + + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; out: free_extent_map(em); - return 0; + return ret; } int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, @@ -3304,7 +3962,7 @@ static noinline int schedule_bio(struct btrfs_root *root, /* don't bother with additional async steps for reads, right now */ if (!(rw & REQ_WRITE)) { bio_get(bio); - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); bio_put(bio); return 0; } @@ -3399,7 +4057,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (async_submit) schedule_bio(root, dev, rw, bio); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); } else { bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; bio->bi_sector = logical >> 9; @@ -3568,7 +4226,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) struct btrfs_fs_devices *fs_devices; int ret; - mutex_lock(&uuid_mutex); + BUG_ON(!mutex_is_locked(&uuid_mutex)); fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { @@ -3606,7 +4264,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: - mutex_unlock(&uuid_mutex); return ret; } @@ -3749,6 +4406,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) if (!path) return -ENOMEM; + mutex_lock(&uuid_mutex); + lock_chunks(root); + /* first we search for all of the device items, and then we * read in all of the chunk items. This way we can create chunk * mappings that reference all of the devices that are afound @@ -3799,6 +4459,9 @@ again: } ret = 0; error: + unlock_chunks(root); + mutex_unlock(&uuid_mutex); + btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d4f37..19ac95048b8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -186,6 +186,51 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) +/* + * Restriper's general type filter + */ +#define BTRFS_BALANCE_DATA (1ULL << 0) +#define BTRFS_BALANCE_SYSTEM (1ULL << 1) +#define BTRFS_BALANCE_METADATA (1ULL << 2) + +#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ + BTRFS_BALANCE_SYSTEM | \ + BTRFS_BALANCE_METADATA) + +#define BTRFS_BALANCE_FORCE (1ULL << 3) +#define BTRFS_BALANCE_RESUME (1ULL << 4) + +/* + * Balance filters + */ +#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) +#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) +#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) +#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) +#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) + +/* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be + * half-filled). + */ +#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) +#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) + +struct btrfs_balance_args; +struct btrfs_balance_progress; +struct btrfs_balance_control { + struct btrfs_fs_info *fs_info; + + struct btrfs_balance_args data; + struct btrfs_balance_args meta; + struct btrfs_balance_args sys; + + u64 flags; + + struct btrfs_balance_progress stat; +}; + int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); @@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_balance(struct btrfs_root *dev_root); +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs); +int btrfs_recover_balance(struct btrfs_root *tree_root); +int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3848b04e310..e7a5659087e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); return ret; } diff --git a/fs/namei.c b/fs/namei.c index c283a1ec008..208c6aa4a98 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page) static char *getname_flags(const char __user *filename, int flags, int *empty) { - char *tmp, *result; - - result = ERR_PTR(-ENOMEM); - tmp = __getname(); - if (tmp) { - int retval = do_getname(filename, tmp); - - result = tmp; - if (retval < 0) { - if (retval == -ENOENT && empty) - *empty = 1; - if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { - __putname(tmp); - result = ERR_PTR(retval); - } + char *result = __getname(); + int retval; + + if (!result) + return ERR_PTR(-ENOMEM); + + retval = do_getname(filename, result); + if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(result); + return ERR_PTR(retval); } } audit_getname(result); diff --git a/fs/proc/base.c b/fs/proc/base.c index 5485a5388ec..9cde9edf9c4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -198,65 +198,7 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static struct mm_struct *__check_mem_permission(struct task_struct *task) -{ - struct mm_struct *mm; - - mm = get_task_mm(task); - if (!mm) - return ERR_PTR(-EINVAL); - - /* - * A task can always look at itself, in case it chooses - * to use system calls instead of load instructions. - */ - if (task == current) - return mm; - - /* - * If current is actively ptrace'ing, and would also be - * permitted to freshly attach with ptrace now, permit it. - */ - if (task_is_stopped_or_traced(task)) { - int match; - rcu_read_lock(); - match = (ptrace_parent(task) == current); - rcu_read_unlock(); - if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) - return mm; - } - - /* - * No one else is allowed. - */ - mmput(mm); - return ERR_PTR(-EPERM); -} - -/* - * If current may access user memory in @task return a reference to the - * corresponding mm, otherwise ERR_PTR. - */ -static struct mm_struct *check_mem_permission(struct task_struct *task) -{ - struct mm_struct *mm; - int err; - - /* - * Avoid racing if task exec's as we might get a new mm but validate - * against old credentials. - */ - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = __check_mem_permission(task); - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; -} - -struct mm_struct *mm_for_maps(struct task_struct *task) +static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; int err; @@ -267,7 +209,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, PTRACE_MODE_READ)) { + !ptrace_may_access(task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } @@ -276,6 +218,11 @@ struct mm_struct *mm_for_maps(struct task_struct *task) return mm; } +struct mm_struct *mm_for_maps(struct task_struct *task) +{ + return mm_access(task, PTRACE_MODE_READ); +} + static int proc_pid_cmdline(struct task_struct *task, char * buffer) { int res = 0; @@ -752,38 +699,39 @@ static const struct file_operations proc_single_file_operations = { static int mem_open(struct inode* inode, struct file* file) { - file->private_data = (void*)((long)current->self_exec_id); + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct mm_struct *mm; + + if (!task) + return -ESRCH; + + mm = mm_access(task, PTRACE_MODE_ATTACH); + put_task_struct(task); + + if (IS_ERR(mm)) + return PTR_ERR(mm); + /* OK to pass negative loff_t, we can catch out-of-range */ file->f_mode |= FMODE_UNSIGNED_OFFSET; + file->private_data = mm; + return 0; } static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + int ret; char *page; unsigned long src = *ppos; - int ret = -ESRCH; - struct mm_struct *mm; + struct mm_struct *mm = file->private_data; - if (!task) - goto out_no_task; + if (!mm) + return 0; - ret = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - goto out; - - mm = check_mem_permission(task); - ret = PTR_ERR(mm); - if (IS_ERR(mm)) - goto out_free; - - ret = -EIO; - - if (file->private_data != (void*)((long)current->self_exec_id)) - goto out_put; + return -ENOMEM; ret = 0; @@ -810,13 +758,7 @@ static ssize_t mem_read(struct file * file, char __user * buf, } *ppos = src; -out_put: - mmput(mm); -out_free: free_page((unsigned long) page); -out: - put_task_struct(task); -out_no_task: return ret; } @@ -825,27 +767,15 @@ static ssize_t mem_write(struct file * file, const char __user *buf, { int copied; char *page; - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); unsigned long dst = *ppos; - struct mm_struct *mm; + struct mm_struct *mm = file->private_data; - copied = -ESRCH; - if (!task) - goto out_no_task; + if (!mm) + return 0; - copied = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - goto out_task; - - mm = check_mem_permission(task); - copied = PTR_ERR(mm); - if (IS_ERR(mm)) - goto out_free; - - copied = -EIO; - if (file->private_data != (void *)((long)current->self_exec_id)) - goto out_mm; + return -ENOMEM; copied = 0; while (count > 0) { @@ -869,13 +799,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf, } *ppos = dst; -out_mm: - mmput(mm); -out_free: free_page((unsigned long) page); -out_task: - put_task_struct(task); -out_no_task: return copied; } @@ -895,11 +819,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) return file->f_pos; } +static int mem_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + mmput(mm); + return 0; +} + static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, + .release = mem_release, }; static ssize_t environ_read(struct file *file, char __user *buf, @@ -1199,9 +1132,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, ssize_t length; uid_t loginuid; - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); @@ -1230,7 +1160,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(current, loginuid); + length = audit_set_loginuid(loginuid); if (likely(length == 0)) length = count; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 574d4ee9b62..74b9baf36ac 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -111,8 +111,7 @@ xfs_ioend_new_eof( xfs_fsize_t bsize; bsize = ioend->io_offset + ioend->io_size; - isize = MAX(ip->i_size, ip->i_new_size); - isize = MIN(isize, bsize); + isize = MIN(i_size_read(VFS_I(ip)), bsize); return isize > ip->i_d.di_size ? isize : 0; } @@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) } /* - * Update on-disk file size now that data has been written to disk. The - * current in-memory file size is i_size. If a write is beyond eof i_new_size - * will be the intended file size until i_size is updated. If this write does - * not extend all the way to the valid file size then restrict this update to - * the end of the write. + * Update on-disk file size now that data has been written to disk. * * This function does not block as blocking on the inode lock in IO completion * can lead to IO completion order dependency deadlocks.. If it can't get the @@ -1279,6 +1274,15 @@ xfs_end_io_direct_write( struct xfs_ioend *ioend = iocb->private; /* + * While the generic direct I/O code updates the inode size, it does + * so only after the end_io handler is called, which means our + * end_io handler thinks the on-disk size is outside the in-core + * size. To prevent this just update it a little bit earlier here. + */ + if (offset + size > i_size_read(ioend->io_inode)) + i_size_write(ioend->io_inode, offset + size); + + /* * blockdev_direct_IO can return an error even after the I/O * completion handler was called. Thus we need to protect * against double-freeing. @@ -1340,12 +1344,11 @@ xfs_vm_write_failed( if (to > inode->i_size) { /* - * punch out the delalloc blocks we have already allocated. We - * don't call xfs_setattr() to do this as we may be in the - * middle of a multi-iovec write and so the vfs inode->i_size - * will not match the xfs ip->i_size and so it will zero too - * much. Hence we jus truncate the page cache to zero what is - * necessary and punch the delalloc blocks directly. + * Punch out the delalloc blocks we have already allocated. + * + * Don't bother with xfs_setattr given that nothing can have + * made it to disk yet as the page is still locked at this + * point. */ struct xfs_inode *ip = XFS_I(inode); xfs_fileoff_t start_fsb; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 1e5d97f86ea..08b9ac644c3 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; - /* - * Commit the last in the sequence of transactions. - */ - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index c1b55e59655..d25eafd4d28 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) dp = args->dp; mp = dp->i_mount; dp->i_d.di_forkoff = forkoff; - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - dp->i_afp->if_ext_max = - XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); @@ -326,7 +322,6 @@ xfs_attr_fork_reset( ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_afp == NULL); - ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) (args->op_flags & XFS_DA_OP_ADDNAME) || !(mp->m_flags & XFS_MOUNT_ATTR2) || dp->i_d.di_format == XFS_DINODE_FMT_BTREE); - dp->i_afp->if_ext_max = - XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index d0ab7883705..188ef2fbd62 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge( } /* -* Update the record referred to by cur to the value given + * Check if the inode needs to be converted to btree format. + */ +static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork); +} + +/* + * Check if the inode should be converted to extent format. + */ +static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork); +} + +/* + * Update the record referred to by cur to the value given * by [off, bno, len, state]. * This either works (return 0) or gets an EFSCORRUPTED error. */ @@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); @@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* * Make space in the inode incore. */ @@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset( ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - if (dfl_forkoff > ip->i_d.di_forkoff) { + if (dfl_forkoff > ip->i_d.di_forkoff) ip->i_d.di_forkoff = dfl_forkoff; - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t); - } } } @@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork( int error; /* error return value */ ASSERT(XFS_IFORK_Q(ip) == 0); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); @@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork( error = XFS_ERROR(EINVAL); goto error1; } - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; xfs_bmap_init(&flist, &firstblock); @@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork( } else spin_unlock(&mp->m_sb_lock); } - if ((error = xfs_bmap_finish(&tp, &flist, &committed))) + + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) goto error2; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); - return error; + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); error2: xfs_bmap_cancel(&flist); error1: xfs_iunlock(ip, XFS_ILOCK_EXCL); error0: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; } @@ -3994,11 +3997,8 @@ xfs_bmap_one_block( xfs_bmbt_irec_t s; /* internal version of extent */ #ifndef DEBUG - if (whichfork == XFS_DATA_FORK) { - return S_ISREG(ip->i_d.di_mode) ? - (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : - (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); - } + if (whichfork == XFS_DATA_FORK) + return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; #endif /* !DEBUG */ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) return 0; @@ -4010,7 +4010,7 @@ xfs_bmap_one_block( xfs_bmbt_get_all(ep, &s); rval = s.br_startoff == 0 && s.br_blockcount == 1; if (rval && whichfork == XFS_DATA_FORK) - ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); + ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); return rval; } @@ -4379,8 +4379,6 @@ xfs_bmapi_read( XFS_STATS_INC(xs_blk_mapr); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); @@ -4871,8 +4869,6 @@ xfs_bmapi_write( return XFS_ERROR(EIO); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); XFS_STATS_INC(xs_blk_mapw); @@ -4981,8 +4977,7 @@ xfs_bmapi_write( /* * Transform from btree to extents, give it cur. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + if (xfs_bmap_wants_extents(ip, whichfork)) { int tmp_logflags = 0; ASSERT(bma.cur); @@ -4992,10 +4987,10 @@ xfs_bmapi_write( if (error) goto error0; } - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork)); error = 0; error0: /* @@ -5095,8 +5090,7 @@ xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5322,7 +5316,8 @@ xfs_bunmapi( */ if (!wasdel && xfs_trans_get_block_res(tp) == 0 && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && + XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ + XFS_IFORK_MAXEXT(ip, whichfork) && del.br_startoff > got.br_startoff && del.br_startoff + del.br_blockcount < got.br_startoff + got.br_blockcount) { @@ -5353,13 +5348,11 @@ nodelete: } } *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* * Convert to a btree if necessary. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(ip, whichfork)) { ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, &tmp_logflags, whichfork); @@ -5370,8 +5363,7 @@ nodelete: /* * transform from btree to extents, give it cur */ - else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + else if (xfs_bmap_wants_extents(ip, whichfork)) { ASSERT(cur != NULL); error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, whichfork); @@ -5382,8 +5374,6 @@ nodelete: /* * transform from extents to local? */ - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); error = 0; error0: /* @@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole( if (startblock == HOLESTARTBLOCK) { mp = ip->i_mount; out->bmv_block = -1; - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); fixlen -= out->bmv_offset; if (prealloced && out->bmv_offset + out->bmv_length == end) { /* Came to hole at EOF. Trim it. */ @@ -5522,7 +5512,7 @@ xfs_getbmap( fixlen = XFS_MAXIOFFSET(mp); } else { prealloced = 0; - fixlen = ip->i_size; + fixlen = XFS_ISIZE(ip); } } @@ -5551,7 +5541,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { + if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); if (error) goto out_unlock_iolock; diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 654dc6f05ba..dd974a55c77 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -163,12 +163,14 @@ xfs_swap_extents_check_format( /* Check temp in extent form to max in target */ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return EINVAL; /* Check target in extent form to max in temp */ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return EINVAL; /* @@ -180,18 +182,25 @@ xfs_swap_extents_check_format( * (a common defrag case) which will occur when the temp inode is in * extent format... */ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && - ((XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) - return EINVAL; + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + return EINVAL; + if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return EINVAL; + } /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && - ((XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) - return EINVAL; + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + return EINVAL; + + if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return EINVAL; + } return 0; } @@ -349,16 +358,6 @@ xfs_swap_extents( *tifp = *tempifp; /* struct copy */ /* - * Fix the in-memory data fork values that are dependent on the fork - * offset in the inode. We can't assume they remain the same as attr2 - * has dynamic fork offsets. - */ - ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / - (uint)sizeof(xfs_bmbt_rec_t); - tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / - (uint)sizeof(xfs_bmbt_rec_t); - - /* * Fix the on-disk inode values */ tmp = (__uint64_t)ip->i_d.di_nblocks; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f675f3d9d7b..7e5bc872f2b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -327,7 +327,7 @@ xfs_file_aio_read( mp->m_rtdev_targp : mp->m_ddev_targp; if ((iocb->ki_pos & target->bt_smask) || (size & target->bt_smask)) { - if (iocb->ki_pos == ip->i_size) + if (iocb->ki_pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } @@ -412,51 +412,6 @@ xfs_file_splice_read( return ret; } -STATIC void -xfs_aio_write_isize_update( - struct inode *inode, - loff_t *ppos, - ssize_t bytes_written) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t isize = i_size_read(inode); - - if (bytes_written > 0) - XFS_STATS_ADD(xs_write_bytes, bytes_written); - - if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && - *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - } -} - -/* - * If this was a direct or synchronous I/O that failed (such as ENOSPC) then - * part of the I/O may have been written to disk before the error occurred. In - * this case the on-disk file size may have been adjusted beyond the in-memory - * file size and now needs to be truncated back. - */ -STATIC void -xfs_aio_write_newsize_update( - struct xfs_inode *ip, - xfs_fsize_t new_size) -{ - if (new_size == ip->i_new_size) { - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - if (new_size == ip->i_new_size) - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - } -} - /* * xfs_file_splice_write() does not use xfs_rw_ilock() because * generic_file_splice_write() takes the i_mutex itself. This, in theory, @@ -475,7 +430,6 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -489,19 +443,12 @@ xfs_file_splice_write( xfs_ilock(ip, XFS_IOLOCK_EXCL); - new_size = *ppos + count; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (new_size > ip->i_size) - ip->i_new_size = new_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_write_bytes, ret); - xfs_aio_write_isize_update(inode, ppos, ret); - xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -689,28 +636,26 @@ out_lock: /* * Common pre-write limit and setup checks. * - * Returns with iolock held according to @iolock. + * Called with the iolocked held either shared and exclusive according to + * @iolock, and returns with it held. Might upgrade the iolock to exclusive + * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, - xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t new_size; int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - *new_sizep = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); - *iolock = 0; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -720,36 +665,21 @@ restart: /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. There is no need to issue zeroing if another in-flght IO ends - * at or before this one If zeronig is needed and we are currently - * holding the iolock shared, we need to update it to exclusive which - * involves dropping all locks and relocking to maintain correct locking - * order. If we do this, restart the function to ensure all checks and - * values are still valid. + * write. If zeroing is needed and we are currently holding the + * iolock shared, we need to update it to exclusive which involves + * dropping all locks and relocking to maintain correct locking order. + * If we do this, restart the function to ensure all checks and values + * are still valid. */ - if ((ip->i_new_size && *pos > ip->i_new_size) || - (!ip->i_new_size && *pos > ip->i_size)) { + if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); goto restart; } - error = -xfs_zero_eof(ip, *pos, ip->i_size); + error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); } - - /* - * If this IO extends beyond EOF, we may need to update ip->i_new_size. - * We have already zeroed space beyond EOF (if necessary). Only update - * ip->i_new_size if this IO ends beyond any other in-flight writes. - */ - new_size = *pos + *count; - if (new_size > ip->i_size) { - if (new_size > ip->i_new_size) - ip->i_new_size = new_size; - *new_sizep = new_size; - } - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; @@ -794,9 +724,7 @@ xfs_file_dio_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount, - xfs_fsize_t *new_size, - int *iolock) + size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -806,10 +734,10 @@ xfs_file_dio_aio_write( ssize_t ret = 0; size_t count = ocount; int unaligned_io = 0; + int iolock; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - *iolock = 0; if ((pos & target->bt_smask) || (count & target->bt_smask)) return -XFS_ERROR(EINVAL); @@ -824,31 +752,31 @@ xfs_file_dio_aio_write( * EOF zeroing cases and fill out the new inode size as appropriate. */ if (unaligned_io || mapping->nrpages) - *iolock = XFS_IOLOCK_EXCL; + iolock = XFS_IOLOCK_EXCL; else - *iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, *iolock); + iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, iolock); /* * Recheck if there are cached pages that need invalidate after we got * the iolock to protect against other threads adding new pages while * we were waiting for the iolock. */ - if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, iolock); + iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, iolock); } - ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) - return ret; + goto out; if (mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) - return ret; + goto out; } /* @@ -857,15 +785,18 @@ xfs_file_dio_aio_write( */ if (unaligned_io) inode_dio_wait(inode); - else if (*iolock == XFS_IOLOCK_EXCL) { + else if (iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - *iolock = XFS_IOLOCK_SHARED; + iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, &nr_segs, pos, &iocb->ki_pos, count, ocount); +out: + xfs_rw_iunlock(ip, iolock); + /* No fallback to buffered IO on errors for XFS. */ ASSERT(ret < 0 || ret == count); return ret; @@ -877,9 +808,7 @@ xfs_file_buffered_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount, - xfs_fsize_t *new_size, - int *iolock) + size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -887,14 +816,14 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; + int iolock = XFS_IOLOCK_EXCL; size_t count = ocount; - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_rw_ilock(ip, iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) - return ret; + goto out; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -908,13 +837,15 @@ write_retry: * page locks and retry *once* */ if (ret == -ENOSPC && !enospc) { - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (ret) - return ret; enospc = 1; - goto write_retry; + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (!ret) + goto write_retry; } + current->backing_dev_info = NULL; +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -930,9 +861,7 @@ xfs_file_aio_write( struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int iolock; size_t ocount = 0; - xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -951,33 +880,22 @@ xfs_file_aio_write( return -EIO; if (unlikely(file->f_flags & O_DIRECT)) - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &new_size, &iolock); + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount, &new_size, &iolock); - - xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); + ocount); - if (ret <= 0) - goto out_unlock; + if (ret > 0) { + ssize_t err; - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error; + XFS_STATS_ADD(xs_write_bytes, ret); - xfs_rw_iunlock(ip, iolock); - error = xfs_file_fsync(file, pos, end, - (file->f_flags & __O_SYNC) ? 0 : 1); - xfs_rw_ilock(ip, iolock); - if (error) - ret = error; + /* Handle various SYNC-type writes */ + err = generic_write_sync(file, pos, ret); + if (err < 0) + ret = err; } -out_unlock: - xfs_aio_write_newsize_update(ip, new_size); - xfs_rw_iunlock(ip, iolock); return ret; } diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index ed88ed16811..652b875a9d4 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -90,7 +90,7 @@ xfs_wait_on_pages( if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { return -filemap_fdatawait_range(mapping, first, - last == -1 ? ip->i_size - 1 : last); + last == -1 ? XFS_ISIZE(ip) - 1 : last); } return 0; } diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 3960a066d7f..8c3e46394d4 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -77,7 +77,7 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); + ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); @@ -94,8 +94,6 @@ xfs_inode_alloc( ip->i_update_core = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); - ip->i_size = 0; - ip->i_new_size = 0; return ip; } @@ -150,7 +148,7 @@ xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); + ASSERT(!xfs_isiflocked(ip)); /* * Because we use RCU freeing we need to ensure the inode always @@ -450,8 +448,6 @@ again: *ipp = ip; - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); /* * If we have a real type for an on-disk inode, we can set ops(&unlock) * now. If it's a new inode being created, xfs_ialloc will handle it. @@ -715,3 +711,19 @@ xfs_isilocked( return 0; } #endif + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9dda7cc3284..b21022499c2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -299,11 +299,8 @@ xfs_iformat( { xfs_attr_shortform_t *atp; int size; - int error; + int error = 0; xfs_fsize_t di_size; - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - error = 0; if (unlikely(be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > @@ -350,7 +347,6 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } ip->i_d.di_size = 0; - ip->i_size = 0; ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); break; @@ -409,10 +405,10 @@ xfs_iformat( } if (!XFS_DFORK_Q(dip)) return 0; + ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); @@ -604,10 +600,11 @@ xfs_iformat_btree( * or the number of extents is greater than the number of * blocks. */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max - || XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) - || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, @@ -835,12 +832,6 @@ xfs_iread( * with the uninitialized part of it. */ ip->i_d.di_mode = 0; - /* - * Initialize the per-fork minima and maxima for a new - * inode here. xfs_iformat will do it for old inodes. - */ - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); } /* @@ -861,7 +852,6 @@ xfs_iread( } ip->i_delayed_blks = 0; - ip->i_size = ip->i_d.di_size; /* * Mark the buffer containing the inode as something to keep @@ -1051,7 +1041,6 @@ xfs_ialloc( } ip->i_d.di_size = 0; - ip->i_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); @@ -1166,52 +1155,6 @@ xfs_ialloc( } /* - * Check to make sure that there are no blocks allocated to the - * file beyond the size of the file. We don't check this for - * files with fixed size extents or real time extents, but we - * at least do it for regular files. - */ -#ifdef DEBUG -STATIC void -xfs_isize_check( - struct xfs_inode *ip, - xfs_fsize_t isize) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t map_first; - int nimaps; - xfs_bmbt_irec_t imaps[2]; - int error; - - if (!S_ISREG(ip->i_d.di_mode)) - return; - - if (XFS_IS_REALTIME_INODE(ip)) - return; - - if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) - return; - - nimaps = 2; - map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - /* - * The filesystem could be shutting down, so bmapi may return - * an error. - */ - error = xfs_bmapi_read(ip, map_first, - (XFS_B_TO_FSB(mp, - (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), - imaps, &nimaps, XFS_BMAPI_ENTIRE); - if (error) - return; - ASSERT(nimaps == 1); - ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); -} -#else /* DEBUG */ -#define xfs_isize_check(ip, isize) -#endif /* DEBUG */ - -/* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and * data fork, and does not modify the inode size, which is left to the caller. @@ -1252,12 +1195,14 @@ xfs_itruncate_extents( int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(new_size <= ip->i_size); + ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); ASSERT(ip->i_itemp->ili_lock_flags == 0); ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + trace_xfs_itruncate_extents_start(ip, new_size); + /* * Since it is possible for space to become allocated beyond * the end of the file (in a crash where the space is allocated @@ -1325,6 +1270,14 @@ xfs_itruncate_extents( goto out; } + /* + * Always re-log the inode so that our permanent transaction can keep + * on rolling it forward in the log. + */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + trace_xfs_itruncate_extents_end(ip, new_size); + out: *tpp = tp; return error; @@ -1338,74 +1291,6 @@ out_bmap_cancel: goto out; } -int -xfs_itruncate_data( - struct xfs_trans **tpp, - struct xfs_inode *ip, - xfs_fsize_t new_size) -{ - int error; - - trace_xfs_itruncate_data_start(ip, new_size); - - /* - * The first thing we do is set the size to new_size permanently on - * disk. This way we don't have to worry about anyone ever being able - * to look at the data being freed even in the face of a crash. - * What we're getting around here is the case where we free a block, it - * is allocated to another file, it is written to, and then we crash. - * If the new data gets written to the file but the log buffers - * containing the free and reallocation don't, then we'd end up with - * garbage in the blocks being freed. As long as we make the new_size - * permanent before actually freeing any blocks it doesn't matter if - * they get written to. - */ - if (ip->i_d.di_nextents > 0) { - /* - * If we are not changing the file size then do not update - * the on-disk file size - we may be called from - * xfs_inactive_free_eofblocks(). If we update the on-disk - * file size and then the system crashes before the contents - * of the file are flushed to disk then the files may be - * full of holes (ie NULL files bug). - */ - if (ip->i_size != new_size) { - ip->i_d.di_size = new_size; - ip->i_size = new_size; - xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); - } - } - - error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size); - if (error) - return error; - - /* - * If we are not changing the file size then do not update the on-disk - * file size - we may be called from xfs_inactive_free_eofblocks(). - * If we update the on-disk file size and then the system crashes - * before the contents of the file are flushed to disk then the files - * may be full of holes (ie NULL files bug). - */ - xfs_isize_check(ip, new_size); - if (ip->i_size != new_size) { - ip->i_d.di_size = new_size; - ip->i_size = new_size; - } - - ASSERT(new_size != 0 || ip->i_delayed_blks == 0); - ASSERT(new_size != 0 || ip->i_d.di_nextents == 0); - - /* - * Always re-log the inode so that our permanent transaction can keep - * on rolling it forward in the log. - */ - xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); - - trace_xfs_itruncate_data_end(ip, new_size); - return 0; -} - /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -1824,8 +1709,7 @@ xfs_ifree( ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || - (!S_ISREG(ip->i_d.di_mode))); + ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -1844,8 +1728,6 @@ xfs_ifree( ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; /* @@ -2151,7 +2033,7 @@ xfs_idestroy_fork( * once someone is waiting for it to be unpinned. */ static void -xfs_iunpin_nowait( +xfs_iunpin( struct xfs_inode *ip) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); @@ -2163,14 +2045,29 @@ xfs_iunpin_nowait( } +static void +__xfs_iunpin_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); + + xfs_iunpin(ip); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_ipincount(ip)) + io_schedule(); + } while (xfs_ipincount(ip)); + finish_wait(wq, &wait.wait); +} + void xfs_iunpin_wait( struct xfs_inode *ip) { - if (xfs_ipincount(ip)) { - xfs_iunpin_nowait(ip); - wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); - } + if (xfs_ipincount(ip)) + __xfs_iunpin_wait(ip); } /* @@ -2510,9 +2407,9 @@ xfs_iflush( XFS_STATS_INC(xs_iflush_count); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > ip->i_df.if_ext_max); + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); iip = ip->i_itemp; mp = ip->i_mount; @@ -2529,7 +2426,7 @@ xfs_iflush( * out for us if they occur after the log force completes. */ if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { - xfs_iunpin_nowait(ip); + xfs_iunpin(ip); xfs_ifunlock(ip); return EAGAIN; } @@ -2626,9 +2523,9 @@ xfs_iflush_int( #endif ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > ip->i_df.if_ext_max); + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); iip = ip->i_itemp; mp = ip->i_mount; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f0e6b151ba3..2f27b745408 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -66,7 +66,6 @@ typedef struct xfs_ifork { struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ - unsigned char if_ext_max; /* max # of extent records */ union { xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ @@ -206,12 +205,12 @@ typedef struct xfs_icdinode { ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_nextents = (n)) : \ ((ip)->i_d.di_anextents = (n))) - +#define XFS_IFORK_MAXEXT(ip, w) \ + (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) #ifdef __KERNEL__ -struct bhv_desc; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; @@ -220,12 +219,6 @@ struct xfs_mount; struct xfs_trans; struct xfs_dquot; -typedef struct dm_attrs_s { - __uint32_t da_dmevmask; /* DMIG event mask */ - __uint16_t da_dmstate; /* DMIG state info */ - __uint16_t da_pad; /* DMIG extra padding */ -} dm_attrs_t; - typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ @@ -244,27 +237,19 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ - struct completion i_flush; /* inode flush completion q */ atomic_t i_pincount; /* inode pin count */ - wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ - unsigned short i_flags; /* see defined flags below */ + unsigned long i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ - xfs_fsize_t i_size; /* in-memory size */ - xfs_fsize_t i_new_size; /* size when write completes */ - /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; -#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ - (ip)->i_size : (ip)->i_d.di_size; - /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { @@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) } /* + * For regular files we only update the on-disk filesize when actually + * writing data back to disk. Until then only the copy in the VFS inode + * is uptodate. + */ +static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) +{ + if (S_ISREG(ip->i_d.di_mode)) + return i_size_read(VFS_I(ip)); + return ip->i_d.di_size; +} + +/* * i_flags helper functions */ static inline void @@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) return ret; } +static inline int +xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (!ret) + ip->i_flags |= flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + /* * Project quota id helpers (previously projid was 16bit only * and using two 16bit values to hold new 32bit projid was chosen @@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip, } /* - * Manage the i_flush queue embedded in the inode. This completion - * queue synchronizes processes attempting to flush the in-core - * inode back to disk. - */ -static inline void xfs_iflock(xfs_inode_t *ip) -{ - wait_for_completion(&ip->i_flush); -} - -static inline int xfs_iflock_nowait(xfs_inode_t *ip) -{ - return try_wait_for_completion(&ip->i_flush); -} - -static inline void xfs_ifunlock(xfs_inode_t *ip) -{ - complete(&ip->i_flush); -} - -/* * In-core inode flags. */ -#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ -#define XFS_ISTALE 0x0002 /* inode has been staled */ -#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ -#define XFS_INEW 0x0008 /* inode has just been allocated */ -#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ -#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ -#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ +#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ +#define XFS_ISTALE (1 << 1) /* inode has been staled */ +#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ +#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */ +#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ +#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ +#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ +#define XFS_IPINNED (1 << __XFS_IPINNED_BIT) /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) XFS_IFILESTREAM); /* + * Synchronize processes attempting to flush the in-core inode back to disk. + */ + +extern void __xfs_iflock(struct xfs_inode *ip); + +static inline int xfs_iflock_nowait(struct xfs_inode *ip) +{ + return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); +} + +static inline void xfs_iflock(struct xfs_inode *ip) +{ + if (!xfs_iflock_nowait(ip)) + __xfs_iflock(ip); +} + +static inline void xfs_ifunlock(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_IFLOCK); + wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); +} + +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + +/* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) @@ -491,8 +513,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t); -int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *, - xfs_fsize_t); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index cfd6c7f8cc3..91d71dcd485 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -79,8 +79,6 @@ xfs_inode_item_size( break; case XFS_DINODE_FMT_BTREE: - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); iip->ili_format.ilf_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV | XFS_ILOG_UUID); @@ -557,7 +555,7 @@ xfs_inode_item_unpin( trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) - wake_up(&ip->i_ipin_wait); + wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } /* @@ -719,7 +717,7 @@ xfs_inode_item_pushbuf( * If a flush is not in progress anymore, chances are that the * inode was taken off the AIL. So, just get out. */ - if (completion_done(&ip->i_flush) || + if (!xfs_isiflocked(ip) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return true; @@ -752,7 +750,7 @@ xfs_inode_item_push( struct xfs_inode *ip = iip->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); /* * Since we were able to lock the inode's flush lock and diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 9afa282aa93..246c7d57c6f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb( xfs_fileoff_t *last_fsb) { xfs_fileoff_t new_last_fsb = 0; - xfs_extlen_t align; + xfs_extlen_t align = 0; int eof, error; - if (XFS_IS_REALTIME_INODE(ip)) - ; - /* - * If mounted with the "-o swalloc" option, roundup the allocation - * request to a stripe width boundary if the file size is >= - * stripe width and we are allocating past the allocation eof. - */ - else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && - (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) - new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); - /* - * Roundup the allocation request to a stripe unit (m_dalign) boundary - * if the file size is >= stripe unit size, and we are allocating past - * the allocation eof. - */ - else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) - new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); + if (!XFS_IS_REALTIME_INODE(ip)) { + /* + * Round up the allocation request to a stripe unit + * (m_dalign) boundary if the file size is >= stripe unit + * size, and we are allocating past the allocation eof. + * + * If mounted with the "-o swalloc" option the alignment is + * increased from the strip unit size to the stripe width. + */ + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + align = mp->m_swidth; + else if (mp->m_dalign) + align = mp->m_dalign; + + if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) + new_last_fsb = roundup_64(*last_fsb, align); + } /* * Always round up the allocation request to an extent boundary @@ -154,7 +154,7 @@ xfs_iomap_write_direct( offset_fsb = XFS_B_TO_FSBT(mp, offset); last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - if ((offset + count) > ip->i_size) { + if ((offset + count) > XFS_ISIZE(ip)) { error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) goto error_out; @@ -211,7 +211,7 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip, 0); bmapi_flag = 0; - if (offset < ip->i_size || extsz) + if (offset < XFS_ISIZE(ip) || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate( int found_delalloc = 0; *prealloc = 0; - if ((offset + count) <= ip->i_size) + if (offset + count <= XFS_ISIZE(ip)) return 0; /* @@ -340,7 +340,7 @@ xfs_iomap_prealloc_size( * if we pass in alloc_blocks = 0. Hence the "+ 1" to * ensure we always pass in a non-zero value. */ - alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; + alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); @@ -564,7 +564,7 @@ xfs_iomap_write_allocate( * back.... */ nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, ip->i_size); + end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); error = xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); if (error) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f9babd17922..ab302539e5b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -750,6 +750,7 @@ xfs_setattr_size( struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; + xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags; @@ -777,11 +778,13 @@ xfs_setattr_size( lock_flags |= XFS_IOLOCK_EXCL; xfs_ilock(ip, lock_flags); + oldsize = inode->i_size; + newsize = iattr->ia_size; + /* * Short circuit the truncate case for zero length files. */ - if (iattr->ia_size == 0 && - ip->i_size == 0 && ip->i_d.di_nextents == 0) { + if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { if (!(mask & (ATTR_CTIME|ATTR_MTIME))) goto out_unlock; @@ -807,14 +810,14 @@ xfs_setattr_size( * the inode to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ - if (iattr->ia_size > ip->i_size) { + if (newsize > oldsize) { /* * Do the first part of growing a file: zero any data in the * last block that is beyond the old EOF. We need to do this * before the inode is joined to the transaction to modify * i_size. */ - error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); + error = xfs_zero_eof(ip, newsize, oldsize); if (error) goto out_unlock; } @@ -833,8 +836,8 @@ xfs_setattr_size( * here and prevents waiting for other data not within the range we * care about here. */ - if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, + if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { + error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, FI_NONE); if (error) goto out_unlock; @@ -845,8 +848,7 @@ xfs_setattr_size( */ inode_dio_wait(inode); - error = -block_truncate_page(inode->i_mapping, iattr->ia_size, - xfs_get_blocks); + error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) goto out_unlock; @@ -857,7 +859,7 @@ xfs_setattr_size( if (error) goto out_trans_cancel; - truncate_setsize(inode, iattr->ia_size); + truncate_setsize(inode, newsize); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; @@ -876,19 +878,29 @@ xfs_setattr_size( * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (iattr->ia_size != ip->i_size && - (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); mask |= ATTR_CTIME | ATTR_MTIME; } - if (iattr->ia_size > ip->i_size) { - ip->i_d.di_size = iattr->ia_size; - ip->i_size = iattr->ia_size; - } else if (iattr->ia_size <= ip->i_size || - (iattr->ia_size == 0 && ip->i_d.di_nextents)) { - error = xfs_itruncate_data(&tp, ip, iattr->ia_size); + /* + * The first thing we do is set the size to new_size permanently on + * disk. This way we don't have to worry about anyone ever being able + * to look at the data being freed even in the face of a crash. + * What we're getting around here is the case where we free a block, it + * is allocated to another file, it is written to, and then we crash. + * If the new data gets written to the file but the log buffers + * containing the free and reallocation don't, then we'd end up with + * garbage in the blocks being freed. As long as we make the new size + * permanent before actually freeing any blocks it doesn't matter if + * they get written to. + */ + ip->i_d.di_size = newsize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (newsize <= oldsize) { + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) goto out_trans_abort; diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5cc3dde1bc9..eafbcff81f3 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -31,6 +31,7 @@ #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_itable.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_itruncate_data(&tp, ip, 0); + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); goto out_unlock; } + ASSERT(ip->i_d.di_nextents == 0); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 281961c1d81..ee5b695c99a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -828,14 +828,6 @@ xfs_fs_inode_init_once( /* xfs inode */ atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - init_waitqueue_head(&ip->i_ipin_wait); - /* - * Because we want to use a counting completion, complete - * the flush completion once to allow a single access to - * the flush completion without blocking. - */ - init_completion(&ip->i_flush); - complete(&ip->i_flush); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 72c01a1c16e..40b75eecd2b 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -707,14 +707,13 @@ xfs_reclaim_inode_grab( return 1; /* - * do some unlocked checks first to avoid unnecessary lock traffic. - * The first is a flush lock check, the second is a already in reclaim - * check. Only do these checks if we are not going to block on locks. + * If we are asked for non-blocking operation, do unlocked checks to + * see if the inode already is being flushed or in reclaim to avoid + * lock traffic. */ if ((flags & SYNC_TRYLOCK) && - (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { + __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) return 1; - } /* * The radix tree lock here protects a thread in xfs_iget from racing diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a9d5b1e06ef..6b6df5802e9 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -891,7 +891,6 @@ DECLARE_EVENT_CLASS(xfs_file_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) - __field(xfs_fsize_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, flags) @@ -900,17 +899,15 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx size 0x%llx " "offset 0x%llx count 0x%zx ioflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, - __entry->new_size, __entry->offset, __entry->count, __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) @@ -978,7 +975,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) - __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, type) @@ -990,7 +986,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->type = type; @@ -998,13 +993,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " - "offset 0x%llx count %zd type %s " - "startoff 0x%llx startblock %lld blockcount 0x%llx", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " + "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, - __entry->new_size, __entry->offset, __entry->count, __print_symbolic(__entry->type, XFS_IO_TYPES), @@ -1031,26 +1024,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __field(xfs_ino_t, ino) __field(loff_t, isize) __field(loff_t, disize) - __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->isize = ip->i_size; + __entry->isize = VFS_I(ip)->i_size; __entry->disize = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; ), - TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " "offset 0x%llx count %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->disize, - __entry->new_size, __entry->offset, __entry->count) ); @@ -1090,8 +1080,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, DEFINE_EVENT(xfs_itrunc_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ TP_ARGS(ip, new_size)) -DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); -DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); TRACE_EVENT(xfs_pagecache_inval, TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), @@ -1568,7 +1558,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __field(xfs_ino_t, ino) __field(int, format) __field(int, nex) - __field(int, max_nex) __field(int, broot_size) __field(int, fork_off) ), @@ -1578,18 +1567,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->ino = ip->i_ino; __entry->format = ip->i_d.di_format; __entry->nex = ip->i_d.di_nextents; - __entry->max_nex = ip->i_df.if_ext_max; __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " - "Max in-fork extents %d, broot size %d, fork offset %d", + "broot size %d, fork offset %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), __entry->nex, - __entry->max_nex, __entry->broot_size, __entry->fork_off) ) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index f2fea868d4d..0cf52da9d24 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -175,7 +175,7 @@ xfs_free_eofblocks( * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. */ - end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); if (last_fsb <= end_fsb) return 0; @@ -226,7 +226,14 @@ xfs_free_eofblocks( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_itruncate_data(&tp, ip, ip->i_size); + /* + * Do not update the on-disk file size. If we update the + * on-disk file size and then the system crashes before the + * contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip)); if (error) { /* * If we get an error at this point we simply don't @@ -540,8 +547,8 @@ xfs_release( return 0; if ((S_ISREG(ip->i_d.di_mode) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && + (VFS_I(ip)->i_size > 0 || + (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { @@ -618,7 +625,7 @@ xfs_inactive( * only one with a reference to the inode. */ truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || (ip->i_size != 0) || + ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 || (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && S_ISREG(ip->i_d.di_mode)); @@ -632,12 +639,12 @@ xfs_inactive( if (ip->i_d.di_nlink != 0) { if ((S_ISREG(ip->i_d.di_mode) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS) && - (!(ip->i_d.di_flags & + (VFS_I(ip)->i_size > 0 || + (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || - (ip->i_delayed_blks != 0)))) { + ip->i_delayed_blks != 0))) { error = xfs_free_eofblocks(mp, ip, 0); if (error) return VN_INACTIVE_CACHE; @@ -670,13 +677,18 @@ xfs_inactive( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_itruncate_data(&tp, ip, 0); + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; } + + ASSERT(ip->i_d.di_nextents == 0); } else if (S_ISLNK(ip->i_d.di_mode)) { /* @@ -1961,11 +1973,11 @@ xfs_zero_remaining_bytes( * since nothing can read beyond eof. The space will * be zeroed when the file is extended anyway. */ - if (startoff >= ip->i_size) + if (startoff >= XFS_ISIZE(ip)) return 0; - if (endoff > ip->i_size) - endoff = ip->i_size; + if (endoff > XFS_ISIZE(ip)) + endoff = XFS_ISIZE(ip); bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, @@ -2260,7 +2272,7 @@ xfs_change_file_space( bf->l_start += offset; break; case 2: /*SEEK_END*/ - bf->l_start += ip->i_size; + bf->l_start += XFS_ISIZE(ip); break; default: return XFS_ERROR(EINVAL); @@ -2277,7 +2289,7 @@ xfs_change_file_space( bf->l_whence = 0; startoffset = bf->l_start; - fsize = ip->i_size; + fsize = XFS_ISIZE(ip); /* * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 9eabffbc4e5..033f6aa670d 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -134,7 +134,7 @@ struct pl08x_txd { struct dma_async_tx_descriptor tx; struct list_head node; struct list_head dsg_list; - enum dma_data_direction direction; + enum dma_transfer_direction direction; dma_addr_t llis_bus; struct pl08x_lli *llis_va; /* Default cctl value for LLIs */ @@ -197,7 +197,7 @@ struct pl08x_dma_chan { dma_addr_t dst_addr; u32 src_cctl; u32 dst_cctl; - enum dma_data_direction runtime_direction; + enum dma_transfer_direction runtime_direction; dma_cookie_t lc; struct list_head pend_list; struct pl08x_txd *at; diff --git a/include/linux/audit.h b/include/linux/audit.h index 426ab9f4dd8..9ff7a2c48b5 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -26,6 +26,7 @@ #include <linux/types.h> #include <linux/elf-em.h> +#include <linux/ptrace.h> /* The netlink messages for the audit system is divided into blocks: * 1000 - 1099 are for commanding the audit system @@ -181,6 +182,40 @@ * AUDIT_UNUSED_BITS is updated if need be. */ #define AUDIT_UNUSED_BITS 0x07FFFC00 +/* AUDIT_FIELD_COMPARE rule list */ +#define AUDIT_COMPARE_UID_TO_OBJ_UID 1 +#define AUDIT_COMPARE_GID_TO_OBJ_GID 2 +#define AUDIT_COMPARE_EUID_TO_OBJ_UID 3 +#define AUDIT_COMPARE_EGID_TO_OBJ_GID 4 +#define AUDIT_COMPARE_AUID_TO_OBJ_UID 5 +#define AUDIT_COMPARE_SUID_TO_OBJ_UID 6 +#define AUDIT_COMPARE_SGID_TO_OBJ_GID 7 +#define AUDIT_COMPARE_FSUID_TO_OBJ_UID 8 +#define AUDIT_COMPARE_FSGID_TO_OBJ_GID 9 + +#define AUDIT_COMPARE_UID_TO_AUID 10 +#define AUDIT_COMPARE_UID_TO_EUID 11 +#define AUDIT_COMPARE_UID_TO_FSUID 12 +#define AUDIT_COMPARE_UID_TO_SUID 13 + +#define AUDIT_COMPARE_AUID_TO_FSUID 14 +#define AUDIT_COMPARE_AUID_TO_SUID 15 +#define AUDIT_COMPARE_AUID_TO_EUID 16 + +#define AUDIT_COMPARE_EUID_TO_SUID 17 +#define AUDIT_COMPARE_EUID_TO_FSUID 18 + +#define AUDIT_COMPARE_SUID_TO_FSUID 19 + +#define AUDIT_COMPARE_GID_TO_EGID 20 +#define AUDIT_COMPARE_GID_TO_FSGID 21 +#define AUDIT_COMPARE_GID_TO_SGID 22 + +#define AUDIT_COMPARE_EGID_TO_FSGID 23 +#define AUDIT_COMPARE_EGID_TO_SGID 24 +#define AUDIT_COMPARE_SGID_TO_FSGID 25 + +#define AUDIT_MAX_FIELD_COMPARE AUDIT_COMPARE_SGID_TO_FSGID /* Rule fields */ /* These are useful when checking the @@ -222,6 +257,9 @@ #define AUDIT_PERM 106 #define AUDIT_DIR 107 #define AUDIT_FILETYPE 108 +#define AUDIT_OBJ_UID 109 +#define AUDIT_OBJ_GID 110 +#define AUDIT_FIELD_COMPARE 111 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) @@ -408,28 +446,24 @@ struct audit_field { void *lsm_rule; }; -#define AUDITSC_INVALID 0 -#define AUDITSC_SUCCESS 1 -#define AUDITSC_FAILURE 2 -#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS ) extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); #ifdef CONFIG_AUDITSYSCALL /* These are defined in auditsc.c */ /* Public API */ -extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); -extern void audit_free(struct task_struct *task); -extern void audit_syscall_entry(int arch, - int major, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3); -extern void audit_syscall_exit(int failed, long return_code); +extern void __audit_free(struct task_struct *task); +extern void __audit_syscall_entry(int arch, + int major, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3); +extern void __audit_syscall_exit(int ret_success, long ret_value); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct dentry *dentry); extern void __audit_inode_child(const struct dentry *dentry, const struct inode *parent); +extern void __audit_seccomp(unsigned long syscall); extern void __audit_ptrace(struct task_struct *t); static inline int audit_dummy_context(void) @@ -437,6 +471,27 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } +static inline void audit_free(struct task_struct *task) +{ + if (unlikely(task->audit_context)) + __audit_free(task); +} +static inline void audit_syscall_entry(int arch, int major, unsigned long a0, + unsigned long a1, unsigned long a2, + unsigned long a3) +{ + if (unlikely(!audit_dummy_context())) + __audit_syscall_entry(arch, major, a0, a1, a2, a3); +} +static inline void audit_syscall_exit(void *pt_regs) +{ + if (unlikely(current->audit_context)) { + int success = is_syscall_success(pt_regs); + int return_code = regs_return_value(pt_regs); + + __audit_syscall_exit(success, return_code); + } +} static inline void audit_getname(const char *name) { if (unlikely(!audit_dummy_context())) @@ -453,6 +508,12 @@ static inline void audit_inode_child(const struct dentry *dentry, } void audit_core_dumps(long signr); +static inline void audit_seccomp(unsigned long syscall) +{ + if (unlikely(!audit_dummy_context())) + __audit_seccomp(syscall); +} + static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) @@ -463,17 +524,16 @@ static inline void audit_ptrace(struct task_struct *t) extern unsigned int audit_serial(void); extern int auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial); -extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); +extern int audit_set_loginuid(uid_t loginuid); #define audit_get_loginuid(t) ((t)->loginuid) #define audit_get_sessionid(t) ((t)->sessionid) extern void audit_log_task_context(struct audit_buffer *ab); extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); -extern int audit_bprm(struct linux_binprm *bprm); -extern void audit_socketcall(int nargs, unsigned long *args); -extern int audit_sockaddr(int len, void *addr); +extern int __audit_bprm(struct linux_binprm *bprm); +extern void __audit_socketcall(int nargs, unsigned long *args); +extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); -extern int audit_set_macxattr(const char *name); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); @@ -499,6 +559,23 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } +static inline int audit_bprm(struct linux_binprm *bprm) +{ + if (unlikely(!audit_dummy_context())) + return __audit_bprm(bprm); + return 0; +} +static inline void audit_socketcall(int nargs, unsigned long *args) +{ + if (unlikely(!audit_dummy_context())) + __audit_socketcall(nargs, args); +} +static inline int audit_sockaddr(int len, void *addr) +{ + if (unlikely(!audit_dummy_context())) + return __audit_sockaddr(len, addr); + return 0; +} static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context())) @@ -544,12 +621,11 @@ static inline void audit_mmap_fd(int fd, int flags) extern int audit_n_rules; extern int audit_signals; -#else -#define audit_finish_fork(t) +#else /* CONFIG_AUDITSYSCALL */ #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) -#define audit_syscall_exit(f,r) do { ; } while (0) +#define audit_syscall_exit(r) do { ; } while (0) #define audit_dummy_context() 1 #define audit_getname(n) do { ; } while (0) #define audit_putname(n) do { ; } while (0) @@ -558,6 +634,7 @@ extern int audit_signals; #define audit_inode(n,d) do { (void)(d); } while (0) #define audit_inode_child(i,p) do { ; } while (0) #define audit_core_dumps(i) do { ; } while (0) +#define audit_seccomp(i) do { ; } while (0) #define auditsc_get_stamp(c,t,s) (0) #define audit_get_loginuid(t) (-1) #define audit_get_sessionid(t) (-1) @@ -568,7 +645,6 @@ extern int audit_signals; #define audit_socketcall(n,a) ((void)0) #define audit_fd_pair(n,a) ((void)0) #define audit_sockaddr(len, addr) ({ 0; }) -#define audit_set_macxattr(n) do { ; } while (0) #define audit_mq_open(o,m,a) ((void)0) #define audit_mq_sendrecv(d,l,p,t) ((void)0) #define audit_mq_notify(d,n) ((void)0) @@ -579,7 +655,7 @@ extern int audit_signals; #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 -#endif +#endif /* CONFIG_AUDITSYSCALL */ #ifdef CONFIG_AUDIT /* These are defined in audit.c */ diff --git a/include/linux/digsig.h b/include/linux/digsig.h index efae755017d..b01558b1581 100644 --- a/include/linux/digsig.h +++ b/include/linux/digsig.h @@ -46,7 +46,7 @@ struct signature_hdr { char mpi[0]; } __packed; -#if defined(CONFIG_DIGSIG) || defined(CONFIG_DIGSIG_MODULE) +#if defined(CONFIG_SIGNATURE) || defined(CONFIG_SIGNATURE_MODULE) int digsig_verify(struct key *keyring, const char *sig, int siglen, const char *digest, int digestlen); @@ -59,6 +59,6 @@ static inline int digsig_verify(struct key *keyring, const char *sig, return -EOPNOTSUPP; } -#endif /* CONFIG_DIGSIG */ +#endif /* CONFIG_SIGNATURE */ #endif /* _DIGSIG_H */ diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 75f53f874b2..679b349d9b6 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -23,7 +23,6 @@ #include <linux/device.h> #include <linux/uio.h> -#include <linux/dma-direction.h> #include <linux/scatterlist.h> #include <linux/bitmap.h> #include <asm/page.h> @@ -72,11 +71,93 @@ enum dma_transaction_type { DMA_ASYNC_TX, DMA_SLAVE, DMA_CYCLIC, + DMA_INTERLEAVE, +/* last transaction type for creation of the capabilities mask */ + DMA_TX_TYPE_END, }; -/* last transaction type for creation of the capabilities mask */ -#define DMA_TX_TYPE_END (DMA_CYCLIC + 1) +/** + * enum dma_transfer_direction - dma transfer mode and direction indicator + * @DMA_MEM_TO_MEM: Async/Memcpy mode + * @DMA_MEM_TO_DEV: Slave mode & From Memory to Device + * @DMA_DEV_TO_MEM: Slave mode & From Device to Memory + * @DMA_DEV_TO_DEV: Slave mode & From Device to Device + */ +enum dma_transfer_direction { + DMA_MEM_TO_MEM, + DMA_MEM_TO_DEV, + DMA_DEV_TO_MEM, + DMA_DEV_TO_DEV, + DMA_TRANS_NONE, +}; + +/** + * Interleaved Transfer Request + * ---------------------------- + * A chunk is collection of contiguous bytes to be transfered. + * The gap(in bytes) between two chunks is called inter-chunk-gap(ICG). + * ICGs may or maynot change between chunks. + * A FRAME is the smallest series of contiguous {chunk,icg} pairs, + * that when repeated an integral number of times, specifies the transfer. + * A transfer template is specification of a Frame, the number of times + * it is to be repeated and other per-transfer attributes. + * + * Practically, a client driver would have ready a template for each + * type of transfer it is going to need during its lifetime and + * set only 'src_start' and 'dst_start' before submitting the requests. + * + * + * | Frame-1 | Frame-2 | ~ | Frame-'numf' | + * |====....==.===...=...|====....==.===...=...| ~ |====....==.===...=...| + * + * == Chunk size + * ... ICG + */ + +/** + * struct data_chunk - Element of scatter-gather list that makes a frame. + * @size: Number of bytes to read from source. + * size_dst := fn(op, size_src), so doesn't mean much for destination. + * @icg: Number of bytes to jump after last src/dst address of this + * chunk and before first src/dst address for next chunk. + * Ignored for dst(assumed 0), if dst_inc is true and dst_sgl is false. + * Ignored for src(assumed 0), if src_inc is true and src_sgl is false. + */ +struct data_chunk { + size_t size; + size_t icg; +}; +/** + * struct dma_interleaved_template - Template to convey DMAC the transfer pattern + * and attributes. + * @src_start: Bus address of source for the first chunk. + * @dst_start: Bus address of destination for the first chunk. + * @dir: Specifies the type of Source and Destination. + * @src_inc: If the source address increments after reading from it. + * @dst_inc: If the destination address increments after writing to it. + * @src_sgl: If the 'icg' of sgl[] applies to Source (scattered read). + * Otherwise, source is read contiguously (icg ignored). + * Ignored if src_inc is false. + * @dst_sgl: If the 'icg' of sgl[] applies to Destination (scattered write). + * Otherwise, destination is filled contiguously (icg ignored). + * Ignored if dst_inc is false. + * @numf: Number of frames in this template. + * @frame_size: Number of chunks in a frame i.e, size of sgl[]. + * @sgl: Array of {chunk,icg} pairs that make up a frame. + */ +struct dma_interleaved_template { + dma_addr_t src_start; + dma_addr_t dst_start; + enum dma_transfer_direction dir; + bool src_inc; + bool dst_inc; + bool src_sgl; + bool dst_sgl; + size_t numf; + size_t frame_size; + struct data_chunk sgl[0]; +}; /** * enum dma_ctrl_flags - DMA flags to augment operation preparation, @@ -269,7 +350,7 @@ enum dma_slave_buswidth { * struct, if applicable. */ struct dma_slave_config { - enum dma_data_direction direction; + enum dma_transfer_direction direction; dma_addr_t src_addr; dma_addr_t dst_addr; enum dma_slave_buswidth src_addr_width; @@ -433,6 +514,7 @@ struct dma_tx_state { * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. * The function takes a buffer of size buf_len. The callback function will * be called after period_len bytes have been transferred. + * @device_prep_interleaved_dma: Transfer expression in a generic way. * @device_control: manipulate all pending operations on a channel, returns * zero or error code * @device_tx_status: poll for transaction completion, the optional @@ -492,11 +574,14 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_slave_sg)( struct dma_chan *chan, struct scatterlist *sgl, - unsigned int sg_len, enum dma_data_direction direction, + unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_cyclic)( struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, - size_t period_len, enum dma_data_direction direction); + size_t period_len, enum dma_transfer_direction direction); + struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)( + struct dma_chan *chan, struct dma_interleaved_template *xt, + unsigned long flags); int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned long arg); @@ -522,7 +607,7 @@ static inline int dmaengine_slave_config(struct dma_chan *chan, static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single( struct dma_chan *chan, void *buf, size_t len, - enum dma_data_direction dir, unsigned long flags) + enum dma_transfer_direction dir, unsigned long flags) { struct scatterlist sg; sg_init_one(&sg, buf, len); diff --git a/include/linux/dw_dmac.h b/include/linux/dw_dmac.h index 4bfe0a2f7d5..f2c64f92c4a 100644 --- a/include/linux/dw_dmac.h +++ b/include/linux/dw_dmac.h @@ -127,7 +127,7 @@ struct dw_cyclic_desc { struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, size_t period_len, - enum dma_data_direction direction); + enum dma_transfer_direction direction); void dw_dma_cyclic_free(struct dma_chan *chan); int dw_dma_cyclic_start(struct dma_chan *chan); void dw_dma_cyclic_stop(struct dma_chan *chan); diff --git a/include/linux/key.h b/include/linux/key.h index 183a6af7715..bfc014c5735 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -293,6 +293,9 @@ static inline bool key_is_instantiated(const struct key *key) (rcu_dereference_protected((KEY)->payload.rcudata, \ rwsem_is_locked(&((struct key *)(KEY))->sem))) +#define rcu_assign_keypointer(KEY, PAYLOAD) \ + (rcu_assign_pointer((KEY)->payload.rcudata, PAYLOAD)) + #ifdef CONFIG_SYSCTL extern ctl_table key_sysctls[]; #endif diff --git a/include/linux/kref.h b/include/linux/kref.h index abc0120b09b..9c07dcebded 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -17,6 +17,7 @@ #include <linux/bug.h> #include <linux/atomic.h> +#include <linux/kernel.h> struct kref { atomic_t refcount; diff --git a/include/linux/mtd/gpmi-nand.h b/include/linux/mtd/gpmi-nand.h new file mode 100644 index 00000000000..69b6dbf46b5 --- /dev/null +++ b/include/linux/mtd/gpmi-nand.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2011 Freescale Semiconductor, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef __MACH_MXS_GPMI_NAND_H__ +#define __MACH_MXS_GPMI_NAND_H__ + +/* The size of the resources is fixed. */ +#define GPMI_NAND_RES_SIZE 6 + +/* Resource names for the GPMI NAND driver. */ +#define GPMI_NAND_GPMI_REGS_ADDR_RES_NAME "GPMI NAND GPMI Registers" +#define GPMI_NAND_GPMI_INTERRUPT_RES_NAME "GPMI NAND GPMI Interrupt" +#define GPMI_NAND_BCH_REGS_ADDR_RES_NAME "GPMI NAND BCH Registers" +#define GPMI_NAND_BCH_INTERRUPT_RES_NAME "GPMI NAND BCH Interrupt" +#define GPMI_NAND_DMA_CHANNELS_RES_NAME "GPMI NAND DMA Channels" +#define GPMI_NAND_DMA_INTERRUPT_RES_NAME "GPMI NAND DMA Interrupt" + +/** + * struct gpmi_nand_platform_data - GPMI NAND driver platform data. + * + * This structure communicates platform-specific information to the GPMI NAND + * driver that can't be expressed as resources. + * + * @platform_init: A pointer to a function the driver will call to + * initialize the platform (e.g., set up the pin mux). + * @min_prop_delay_in_ns: Minimum propagation delay of GPMI signals to and + * from the NAND Flash device, in nanoseconds. + * @max_prop_delay_in_ns: Maximum propagation delay of GPMI signals to and + * from the NAND Flash device, in nanoseconds. + * @max_chip_count: The maximum number of chips for which the driver + * should configure the hardware. This value most + * likely reflects the number of pins that are + * connected to a NAND Flash device. If this is + * greater than the SoC hardware can support, the + * driver will print a message and fail to initialize. + * @partitions: An optional pointer to an array of partition + * descriptions. + * @partition_count: The number of elements in the partitions array. + */ +struct gpmi_nand_platform_data { + /* SoC hardware information. */ + int (*platform_init)(void); + + /* NAND Flash information. */ + unsigned int min_prop_delay_in_ns; + unsigned int max_prop_delay_in_ns; + unsigned int max_chip_count; + + /* Medium information. */ + struct mtd_partition *partitions; + unsigned partition_count; +}; +#endif diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index a27e56ca41a..c2f1f6a5fcb 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -112,6 +112,7 @@ #include <linux/compiler.h> /* For unlikely. */ #include <linux/sched.h> /* For struct task_struct. */ +#include <linux/err.h> /* for IS_ERR_VALUE */ extern long arch_ptrace(struct task_struct *child, long request, @@ -266,6 +267,15 @@ static inline void ptrace_release_task(struct task_struct *task) #define force_successful_syscall_return() do { } while (0) #endif +#ifndef is_syscall_success +/* + * On most systems we can tell if a syscall is a success based on if the retval + * is an error value. On some systems like ia64 and powerpc they have different + * indicators of success/failure and must define their own. + */ +#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) +#endif + /* * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__. * diff --git a/include/linux/sh_dma.h b/include/linux/sh_dma.h index cb2dd118cc0..8cd7fe59cf1 100644 --- a/include/linux/sh_dma.h +++ b/include/linux/sh_dma.h @@ -30,7 +30,7 @@ struct sh_desc { struct sh_dmae_regs hw; struct list_head node; struct dma_async_tx_descriptor async_tx; - enum dma_data_direction direction; + enum dma_transfer_direction direction; dma_cookie_t cookie; size_t partial; int chunks; @@ -48,6 +48,7 @@ struct sh_dmae_channel { unsigned int offset; unsigned int dmars; unsigned int dmars_bit; + unsigned int chclr_offset; }; struct sh_dmae_pdata { @@ -68,6 +69,7 @@ struct sh_dmae_pdata { unsigned int dmaor_is_32bit:1; unsigned int needs_tend_set:1; unsigned int no_dmars:1; + unsigned int chclr_present:1; }; /* DMA register */ diff --git a/include/linux/sirfsoc_dma.h b/include/linux/sirfsoc_dma.h new file mode 100644 index 00000000000..29d959333d8 --- /dev/null +++ b/include/linux/sirfsoc_dma.h @@ -0,0 +1,6 @@ +#ifndef _SIRFSOC_DMA_H_ +#define _SIRFSOC_DMA_H_ + +bool sirfsoc_dma_filter_id(struct dma_chan *chan, void *chan_id); + +#endif diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index ecdaeb98b29..5cf685086dd 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -312,7 +312,6 @@ struct tty_driver { */ struct tty_struct **ttys; struct ktermios **termios; - struct ktermios **termios_locked; void *driver_state; /* diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b31702ac15b..84f3001a568 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -16,6 +16,8 @@ struct btrfs_delayed_ref_node; struct btrfs_delayed_tree_ref; struct btrfs_delayed_data_ref; struct btrfs_delayed_ref_head; +struct btrfs_block_group_cache; +struct btrfs_free_cluster; struct map_lookup; struct extent_buffer; @@ -44,6 +46,17 @@ struct extent_buffer; obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" +#define BTRFS_GROUP_FLAGS \ + { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ + { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ + { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ + { BTRFS_BLOCK_GROUP_RAID0, "RAID0"}, \ + { BTRFS_BLOCK_GROUP_RAID1, "RAID1"}, \ + { BTRFS_BLOCK_GROUP_DUP, "DUP"}, \ + { BTRFS_BLOCK_GROUP_RAID10, "RAID10"} + +#define BTRFS_UUID_SIZE 16 + TRACE_EVENT(btrfs_transaction_commit, TP_PROTO(struct btrfs_root *root), @@ -621,6 +634,34 @@ TRACE_EVENT(btrfs_cow_block, __entry->cow_level) ); +TRACE_EVENT(btrfs_space_reservation, + + TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val, + u64 bytes, int reserve), + + TP_ARGS(fs_info, type, val, bytes, reserve), + + TP_STRUCT__entry( + __array( u8, fsid, BTRFS_UUID_SIZE ) + __string( type, type ) + __field( u64, val ) + __field( u64, bytes ) + __field( int, reserve ) + ), + + TP_fast_assign( + memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE); + __assign_str(type, type); + __entry->val = val; + __entry->bytes = bytes; + __entry->reserve = reserve; + ), + + TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type), + __entry->val, __entry->reserve ? "reserve" : "release", + __entry->bytes) +); + DECLARE_EVENT_CLASS(btrfs__reserved_extent, TP_PROTO(struct btrfs_root *root, u64 start, u64 len), @@ -659,6 +700,168 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TP_ARGS(root, start, len) ); +TRACE_EVENT(find_free_extent, + + TP_PROTO(struct btrfs_root *root, u64 num_bytes, u64 empty_size, + u64 data), + + TP_ARGS(root, num_bytes, empty_size, data), + + TP_STRUCT__entry( + __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, data ) + ), + + TP_fast_assign( + __entry->root_objectid = root->root_key.objectid; + __entry->num_bytes = num_bytes; + __entry->empty_size = empty_size; + __entry->data = data; + ), + + TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, " + "flags = %Lu(%s)", show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->data, + __print_flags((unsigned long)__entry->data, "|", + BTRFS_GROUP_FLAGS)) +); + +DECLARE_EVENT_CLASS(btrfs__reserve_extent, + + TP_PROTO(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, u64 start, + u64 len), + + TP_ARGS(root, block_group, start, len), + + TP_STRUCT__entry( + __field( u64, root_objectid ) + __field( u64, bg_objectid ) + __field( u64, flags ) + __field( u64, start ) + __field( u64, len ) + ), + + TP_fast_assign( + __entry->root_objectid = root->root_key.objectid; + __entry->bg_objectid = block_group->key.objectid; + __entry->flags = block_group->flags; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), " + "start = %Lu, len = %Lu", + show_root_type(__entry->root_objectid), __entry->bg_objectid, + __entry->flags, __print_flags((unsigned long)__entry->flags, + "|", BTRFS_GROUP_FLAGS), + __entry->start, __entry->len) +); + +DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, + + TP_PROTO(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, u64 start, + u64 len), + + TP_ARGS(root, block_group, start, len) +); + +DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, + + TP_PROTO(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, u64 start, + u64 len), + + TP_ARGS(root, block_group, start, len) +); + +TRACE_EVENT(btrfs_find_cluster, + + TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start, + u64 bytes, u64 empty_size, u64 min_bytes), + + TP_ARGS(block_group, start, bytes, empty_size, min_bytes), + + TP_STRUCT__entry( + __field( u64, bg_objectid ) + __field( u64, flags ) + __field( u64, start ) + __field( u64, bytes ) + __field( u64, empty_size ) + __field( u64, min_bytes ) + ), + + TP_fast_assign( + __entry->bg_objectid = block_group->key.objectid; + __entry->flags = block_group->flags; + __entry->start = start; + __entry->bytes = bytes; + __entry->empty_size = empty_size; + __entry->min_bytes = min_bytes; + ), + + TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu," + " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid, + __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", + BTRFS_GROUP_FLAGS), __entry->start, + __entry->bytes, __entry->empty_size, __entry->min_bytes) +); + +TRACE_EVENT(btrfs_failed_cluster_setup, + + TP_PROTO(struct btrfs_block_group_cache *block_group), + + TP_ARGS(block_group), + + TP_STRUCT__entry( + __field( u64, bg_objectid ) + ), + + TP_fast_assign( + __entry->bg_objectid = block_group->key.objectid; + ), + + TP_printk("block_group = %Lu", __entry->bg_objectid) +); + +TRACE_EVENT(btrfs_setup_cluster, + + TP_PROTO(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 size, int bitmap), + + TP_ARGS(block_group, cluster, size, bitmap), + + TP_STRUCT__entry( + __field( u64, bg_objectid ) + __field( u64, flags ) + __field( u64, start ) + __field( u64, max_size ) + __field( u64, size ) + __field( int, bitmap ) + ), + + TP_fast_assign( + __entry->bg_objectid = block_group->key.objectid; + __entry->flags = block_group->flags; + __entry->start = cluster->window_start; + __entry->max_size = cluster->max_size; + __entry->size = size; + __entry->bitmap = bitmap; + ), + + TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, " + "size = %Lu, max_size = %Lu, bitmap = %d", + __entry->bg_objectid, + __entry->flags, + __print_flags((unsigned long)__entry->flags, "|", + BTRFS_GROUP_FLAGS), __entry->start, + __entry->size, __entry->max_size, __entry->bitmap) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ diff --git a/init/Kconfig b/init/Kconfig index 6ac2236244c..3f42cd66f0f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -355,7 +355,7 @@ config AUDIT config AUDITSYSCALL bool "Enable system-call auditing support" - depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH) + depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || ARM) default y if SECURITY_SELINUX help Enable low-overhead system-call auditing infrastructure that @@ -372,6 +372,20 @@ config AUDIT_TREE depends on AUDITSYSCALL select FSNOTIFY +config AUDIT_LOGINUID_IMMUTABLE + bool "Make audit loginuid immutable" + depends on AUDIT + help + The config option toggles if a task setting its loginuid requires + CAP_SYS_AUDITCONTROL or if that task should require no special permissions + but should instead only allow setting its loginuid if it was never + previously set. On systems which use systemd or a similar central + process to restart login services this should be set to true. On older + systems in which an admin would typically have to directly stop and + start processes this should be set to false. Setting this to true allows + one to drop potentially dangerous capabilites from the login tasks, + but may not be backwards compatible with older init systems. + source "kernel/irq/Kconfig" menu "RCU Subsystem" diff --git a/kernel/audit.c b/kernel/audit.c index 57e3f510793..bb0eb5bb9a0 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", + audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", pid, uid, auid, ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); @@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, char *p, *pathname; if (prefix) - audit_log_format(ab, " %s", prefix); + audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); diff --git a/kernel/audit.h b/kernel/audit.h index 91e7071c4d2..81676680337 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -36,12 +36,8 @@ enum audit_state { AUDIT_DISABLED, /* Do not create per-task audit_context. * No syscall-specific audit records can * be generated. */ - AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, - * but don't necessarily fill it in at - * syscall entry time (i.e., filter - * instead). */ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, - * and always fill it in at syscall + * and fill it in at syscall * entry time. This makes a full * syscall record available if some * other part of the kernel decides it diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f8277c80d67..a6c3f1abd20 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) switch(listnr) { default: goto exit_err; - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: + if (rule->action == AUDIT_ALWAYS) + goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { @@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) goto exit_free; break; case AUDIT_INODE: @@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: + case AUDIT_OBJ_UID: + case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; @@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILTERKEY: - err = -EINVAL; if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) goto exit_free; str = audit_unpack_string(&bufp, &remain, f->val); @@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) + goto exit_free; + break; + case AUDIT_FIELD_COMPARE: + if (f->val > AUDIT_MAX_FIELD_COMPARE) goto exit_free; break; default: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e7fe2b0d29b..caaea6e944f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -70,9 +70,15 @@ #include "audit.h" +/* flags stating the success for a syscall */ +#define AUDITSC_INVALID 0 +#define AUDITSC_SUCCESS 1 +#define AUDITSC_FAILURE 2 + /* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). */ -#define AUDIT_NAMES 20 + * for saving names from getname(). If we get more names we will allocate + * a name dynamically and also add those to the list anchored by names_list. */ +#define AUDIT_NAMES 5 /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 @@ -101,9 +107,8 @@ struct audit_cap_data { * * Further, in fs/namei.c:path_lookup() we store the inode and device. */ struct audit_names { + struct list_head list; /* audit_context->names_list */ const char *name; - int name_len; /* number of name's characters to log */ - unsigned name_put; /* call __putname() for this name */ unsigned long ino; dev_t dev; umode_t mode; @@ -113,6 +118,14 @@ struct audit_names { u32 osid; struct audit_cap_data fcap; unsigned int fcap_ver; + int name_len; /* number of name's characters to log */ + bool name_put; /* call __putname() for this name */ + /* + * This was an allocated audit_names and not from the array of + * names allocated in the task audit context. Thus this name + * should be freed on syscall exit + */ + bool should_free; }; struct audit_aux_data { @@ -174,8 +187,17 @@ struct audit_context { long return_code;/* syscall return code */ u64 prio; int return_valid; /* return code is valid */ - int name_count; - struct audit_names names[AUDIT_NAMES]; + /* + * The names_list is the list of all audit_names collected during this + * syscall. The first AUDIT_NAMES entries in the names_list will + * actually be from the preallocated_names array for performance + * reasons. Except during allocation they should never be referenced + * through the preallocated_names array and should only be found/used + * by running the names_list. + */ + struct audit_names preallocated_names[AUDIT_NAMES]; + int name_count; /* total records in names_list */ + struct list_head names_list; /* anchor for struct audit_names->list */ char * filterkey; /* key for rule that triggered record */ struct path pwd; struct audit_context *previous; /* For nested syscalls */ @@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask) } } -static int audit_match_filetype(struct audit_context *ctx, int which) +static int audit_match_filetype(struct audit_context *ctx, int val) { - unsigned index = which & ~S_IFMT; - umode_t mode = which & S_IFMT; + struct audit_names *n; + umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; - if (index >= ctx->name_count) - return 0; - if (ctx->names[index].ino == -1) - return 0; - if ((ctx->names[index].mode ^ mode) & S_IFMT) - return 0; - return 1; + list_for_each_entry(n, &ctx->names_list, list) { + if ((n->ino != -1) && + ((n->mode & S_IFMT) == mode)) + return 1; + } + + return 0; } /* @@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } +static int audit_compare_id(uid_t uid1, + struct audit_names *name, + unsigned long name_offset, + struct audit_field *f, + struct audit_context *ctx) +{ + struct audit_names *n; + unsigned long addr; + uid_t uid2; + int rc; + + BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); + + if (name) { + addr = (unsigned long)name; + addr += name_offset; + + uid2 = *(uid_t *)addr; + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + + if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + addr = (unsigned long)n; + addr += name_offset; + + uid2 = *(uid_t *)addr; + + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + } + return 0; +} + +static int audit_field_compare(struct task_struct *tsk, + const struct cred *cred, + struct audit_field *f, + struct audit_context *ctx, + struct audit_names *name) +{ + switch (f->val) { + /* process to file object comparisons */ + case AUDIT_COMPARE_UID_TO_OBJ_UID: + return audit_compare_id(cred->uid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_GID_TO_OBJ_GID: + return audit_compare_id(cred->gid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_EUID_TO_OBJ_UID: + return audit_compare_id(cred->euid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_EGID_TO_OBJ_GID: + return audit_compare_id(cred->egid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_AUID_TO_OBJ_UID: + return audit_compare_id(tsk->loginuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SUID_TO_OBJ_UID: + return audit_compare_id(cred->suid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SGID_TO_OBJ_GID: + return audit_compare_id(cred->sgid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_FSUID_TO_OBJ_UID: + return audit_compare_id(cred->fsuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_FSGID_TO_OBJ_GID: + return audit_compare_id(cred->fsgid, + name, offsetof(struct audit_names, gid), + f, ctx); + /* uid comparisons */ + case AUDIT_COMPARE_UID_TO_AUID: + return audit_comparator(cred->uid, f->op, tsk->loginuid); + case AUDIT_COMPARE_UID_TO_EUID: + return audit_comparator(cred->uid, f->op, cred->euid); + case AUDIT_COMPARE_UID_TO_SUID: + return audit_comparator(cred->uid, f->op, cred->suid); + case AUDIT_COMPARE_UID_TO_FSUID: + return audit_comparator(cred->uid, f->op, cred->fsuid); + /* auid comparisons */ + case AUDIT_COMPARE_AUID_TO_EUID: + return audit_comparator(tsk->loginuid, f->op, cred->euid); + case AUDIT_COMPARE_AUID_TO_SUID: + return audit_comparator(tsk->loginuid, f->op, cred->suid); + case AUDIT_COMPARE_AUID_TO_FSUID: + return audit_comparator(tsk->loginuid, f->op, cred->fsuid); + /* euid comparisons */ + case AUDIT_COMPARE_EUID_TO_SUID: + return audit_comparator(cred->euid, f->op, cred->suid); + case AUDIT_COMPARE_EUID_TO_FSUID: + return audit_comparator(cred->euid, f->op, cred->fsuid); + /* suid comparisons */ + case AUDIT_COMPARE_SUID_TO_FSUID: + return audit_comparator(cred->suid, f->op, cred->fsuid); + /* gid comparisons */ + case AUDIT_COMPARE_GID_TO_EGID: + return audit_comparator(cred->gid, f->op, cred->egid); + case AUDIT_COMPARE_GID_TO_SGID: + return audit_comparator(cred->gid, f->op, cred->sgid); + case AUDIT_COMPARE_GID_TO_FSGID: + return audit_comparator(cred->gid, f->op, cred->fsgid); + /* egid comparisons */ + case AUDIT_COMPARE_EGID_TO_SGID: + return audit_comparator(cred->egid, f->op, cred->sgid); + case AUDIT_COMPARE_EGID_TO_FSGID: + return audit_comparator(cred->egid, f->op, cred->fsgid); + /* sgid comparison */ + case AUDIT_COMPARE_SGID_TO_FSGID: + return audit_comparator(cred->sgid, f->op, cred->fsgid); + default: + WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); + return 0; + } + return 0; +} + /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. @@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk, bool task_creation) { const struct cred *cred; - int i, j, need_sid = 1; + int i, need_sid = 1; u32 sid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; + struct audit_names *n; int result = 0; switch (f->type) { @@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (name) - result = audit_comparator(MAJOR(name->dev), - f->op, f->val); - else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { + if (name) { + if (audit_comparator(MAJOR(name->dev), f->op, f->val) || + audit_comparator(MAJOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MAJOR(n->dev), f->op, f->val) || + audit_comparator(MAJOR(n->rdev), f->op, f->val)) { ++result; break; } @@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (name) - result = audit_comparator(MINOR(name->dev), - f->op, f->val); - else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { + if (name) { + if (audit_comparator(MINOR(name->dev), f->op, f->val) || + audit_comparator(MINOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MINOR(n->dev), f->op, f->val) || + audit_comparator(MINOR(n->rdev), f->op, f->val)) { ++result; break; } @@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk, if (name) result = (name->ino == f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->ino, f->op, f->val)) { + ++result; + break; + } + } + } + break; + case AUDIT_OBJ_UID: + if (name) { + result = audit_comparator(name->uid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->uid, f->op, f->val)) { + ++result; + break; + } + } + } + break; + case AUDIT_OBJ_GID: + if (name) { + result = audit_comparator(name->gid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->gid, f->op, f->val)) { ++result; break; } @@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk, name->osid, f->type, f->op, f->lsm_rule, ctx); } else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (security_audit_rule_match( - ctx->names[j].osid, - f->type, f->op, - f->lsm_rule, ctx)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (security_audit_rule_match(n->osid, f->type, + f->op, f->lsm_rule, + ctx)) { ++result; break; } @@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); break; + case AUDIT_FIELD_COMPARE: + result = audit_field_compare(tsk, cred, f, ctx, name); + break; } - if (!result) return 0; } @@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -/* At syscall exit time, this filter is called if any audit_names[] have been +/* + * Given an audit_name check the inode hash table to see if they match. + * Called holding the rcu read lock to protect the use of audit_inode_hash + */ +static int audit_filter_inode_name(struct task_struct *tsk, + struct audit_names *n, + struct audit_context *ctx) { + int word, bit; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; + struct audit_entry *e; + enum audit_state state; + + word = AUDIT_WORD(ctx->major); + bit = AUDIT_BIT(ctx->major); + + if (list_empty(list)) + return 0; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { + ctx->current_state = state; + return 1; + } + } + + return 0; +} + +/* At syscall exit time, this filter is called if any audit_names have been * collected during syscall processing. We only check rules in sublists at hash - * buckets applicable to the inode numbers in audit_names[]. + * buckets applicable to the inode numbers in audit_names. * Regarding audit_state, same rules apply as for audit_filter_syscall(). */ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { - int i; - struct audit_entry *e; - enum audit_state state; + struct audit_names *n; if (audit_pid && tsk->tgid == audit_pid) return; rcu_read_lock(); - for (i = 0; i < ctx->name_count; i++) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - struct audit_names *n = &ctx->names[i]; - int h = audit_hash_ino((u32)n->ino); - struct list_head *list = &audit_inode_hash[h]; - - if (list_empty(list)) - continue; - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_filter_inode_name(tsk, n, ctx)) + break; } rcu_read_unlock(); } @@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, { struct audit_context *context = tsk->audit_context; - if (likely(!context)) + if (!context) return NULL; context->return_valid = return_valid; @@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, static inline void audit_free_names(struct audit_context *context) { - int i; + struct audit_names *n, *next; #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { @@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context) context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); - for (i = 0; i < context->name_count; i++) { + list_for_each_entry(n, &context->names_list, list) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); + n->name, n->name ?: "(null)"); } dump_stack(); return; @@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count = 0; #endif - for (i = 0; i < context->name_count; i++) { - if (context->names[i].name && context->names[i].name_put) - __putname(context->names[i].name); + list_for_each_entry_safe(n, next, &context->names_list, list) { + list_del(&n->list); + if (n->name && n->name_put) + __putname(n->name); + if (n->should_free) + kfree(n); } context->name_count = 0; path_put(&context->pwd); @@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) return NULL; audit_zero_context(context, state); INIT_LIST_HEAD(&context->killed_trees); + INIT_LIST_HEAD(&context->names_list); return context; } @@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing. */ state = audit_filter_task(tsk, &key); - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return 0; if (!(context = audit_alloc_context(state))) { @@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk while (vma) { if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { - audit_log_d_path(ab, "exe=", + audit_log_d_path(ab, " exe=", &vma->vm_file->f_path); break; } @@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab, struct audit_aux_data_execve *axi) { - int i; - size_t len, len_sent = 0; + int i, len; + size_t len_sent = 0; const char __user *p; char *buf; @@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } +static void audit_log_name(struct audit_context *context, struct audit_names *n, + int record_num, int *call_panic) +{ + struct audit_buffer *ab; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; /* audit_panic has been called */ + + audit_log_format(ab, "item=%d", record_num); + + if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, " name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; + struct audit_names *n; /* tsk == current */ context->pid = tsk->pid; @@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, "cwd=", &context->pwd); + audit_log_d_path(ab, " cwd=", &context->pwd); audit_log_end(ab); } } - for (i = 0; i < context->name_count; i++) { - struct audit_names *n = &context->names[i]; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - continue; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", i); - - if (n->name) { - switch(n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, "name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - n->uid, - n->gid, - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - - audit_log_end(ab); - } + i = 0; + list_for_each_entry(n, &context->names_list, list) + audit_log_name(context, n, i++, &call_panic); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts * * Called from copy_process and do_exit */ -void audit_free(struct task_struct *tsk) +void __audit_free(struct task_struct *tsk) { struct audit_context *context; context = audit_get_context(tsk, 0, 0); - if (likely(!context)) + if (!context) return; /* Check for system calls that do not go through the exit @@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). */ -void audit_syscall_entry(int arch, int major, +void __audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { @@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (unlikely(!context)) + if (!context) return; /* @@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major, context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); } - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return; context->serial = 0; @@ -1658,30 +1861,9 @@ void audit_syscall_entry(int arch, int major, context->ppid = 0; } -void audit_finish_fork(struct task_struct *child) -{ - struct audit_context *ctx = current->audit_context; - struct audit_context *p = child->audit_context; - if (!p || !ctx) - return; - if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) - return; - p->arch = ctx->arch; - p->major = ctx->major; - memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); - p->ctime = ctx->ctime; - p->dummy = ctx->dummy; - p->in_syscall = ctx->in_syscall; - p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); - p->ppid = current->pid; - p->prio = ctx->prio; - p->current_state = ctx->current_state; -} - /** * audit_syscall_exit - deallocate audit context after a system call - * @valid: success/failure flag - * @return_code: syscall return value + * @pt_regs: syscall registers * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from @@ -1689,14 +1871,18 @@ void audit_finish_fork(struct task_struct *child) * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ -void audit_syscall_exit(int valid, long return_code) +void __audit_syscall_exit(int success, long return_code) { struct task_struct *tsk = current; struct audit_context *context; - context = audit_get_context(tsk, valid, return_code); + if (success) + success = AUDITSC_SUCCESS; + else + success = AUDITSC_FAILURE; - if (likely(!context)) + context = audit_get_context(tsk, success, return_code); + if (!context) return; if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) @@ -1821,6 +2007,30 @@ retry: #endif } +static struct audit_names *audit_alloc_name(struct audit_context *context) +{ + struct audit_names *aname; + + if (context->name_count < AUDIT_NAMES) { + aname = &context->preallocated_names[context->name_count]; + memset(aname, 0, sizeof(*aname)); + } else { + aname = kzalloc(sizeof(*aname), GFP_NOFS); + if (!aname) + return NULL; + aname->should_free = true; + } + + aname->ino = (unsigned long)-1; + list_add_tail(&aname->list, &context->names_list); + + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + return aname; +} + /** * audit_getname - add a name to the list * @name: name to add @@ -1831,9 +2041,7 @@ retry: void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; - - if (IS_ERR(name) || !name) - return; + struct audit_names *n; if (!context->in_syscall) { #if AUDIT_DEBUG == 2 @@ -1843,13 +2051,15 @@ void __audit_getname(const char *name) #endif return; } - BUG_ON(context->name_count >= AUDIT_NAMES); - context->names[context->name_count].name = name; - context->names[context->name_count].name_len = AUDIT_NAME_FULL; - context->names[context->name_count].name_put = 1; - context->names[context->name_count].ino = (unsigned long)-1; - context->names[context->name_count].osid = 0; - ++context->name_count; + + n = audit_alloc_name(context); + if (!n) + return; + + n->name = name; + n->name_len = AUDIT_NAME_FULL; + n->name_put = true; + if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); } @@ -1871,12 +2081,13 @@ void audit_putname(const char *name) printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { + struct audit_names *n; int i; - for (i = 0; i < context->name_count; i++) + + list_for_each_entry(n, &context->names_list, list) printk(KERN_ERR "name[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); - } + n->name, n->name ?: "(null)"); + } #endif __putname(name); } @@ -1897,39 +2108,11 @@ void audit_putname(const char *name) #endif } -static int audit_inc_name_count(struct audit_context *context, - const struct inode *inode) -{ - if (context->name_count >= AUDIT_NAMES) { - if (inode) - printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " - "dev=%02x:%02x, inode=%lu\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), - inode->i_ino); - - else - printk(KERN_DEBUG "name_count maxed, losing inode data\n"); - return 1; - } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - return 0; -} - - static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; - memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); - memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); - name->fcap.fE = 0; - name->fcap_ver = 0; - if (!dentry) return 0; @@ -1969,30 +2152,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent */ void __audit_inode(const char *name, const struct dentry *dentry) { - int idx; struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; + struct audit_names *n; if (!context->in_syscall) return; - if (context->name_count - && context->names[context->name_count-1].name - && context->names[context->name_count-1].name == name) - idx = context->name_count - 1; - else if (context->name_count > 1 - && context->names[context->name_count-2].name - && context->names[context->name_count-2].name == name) - idx = context->name_count - 2; - else { - /* FIXME: how much do we care about inodes that have no - * associated name? */ - if (audit_inc_name_count(context, inode)) - return; - idx = context->name_count - 1; - context->names[idx].name = NULL; + + list_for_each_entry_reverse(n, &context->names_list, list) { + if (n->name && (n->name == name)) + goto out; } + + /* unable to find the name from a previous getname() */ + n = audit_alloc_name(context); + if (!n) + return; +out: handle_path(dentry); - audit_copy_inode(&context->names[idx], dentry, inode); + audit_copy_inode(n, dentry, inode); } /** @@ -2011,11 +2189,11 @@ void __audit_inode(const char *name, const struct dentry *dentry) void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { - int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; + struct audit_names *n; int dirlen = 0; if (!context->in_syscall) @@ -2025,9 +2203,7 @@ void __audit_inode_child(const struct dentry *dentry, handle_one(inode); /* parent is more likely, look for it first */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2040,9 +2216,7 @@ void __audit_inode_child(const struct dentry *dentry, } /* no matching parent, look for matching child */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2060,34 +2234,29 @@ void __audit_inode_child(const struct dentry *dentry, add_names: if (!found_parent) { - if (audit_inc_name_count(context, parent)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; - context->names[idx].name = NULL; - audit_copy_inode(&context->names[idx], NULL, parent); + audit_copy_inode(n, NULL, parent); } if (!found_child) { - if (audit_inc_name_count(context, inode)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - context->names[idx].name = found_parent; - context->names[idx].name_len = AUDIT_NAME_FULL; + n->name = found_parent; + n->name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - context->names[idx].name_put = 0; - } else { - context->names[idx].name = NULL; + n->name_put = false; } if (inode) - audit_copy_inode(&context->names[idx], NULL, inode); - else - context->names[idx].ino = (unsigned long)-1; + audit_copy_inode(n, NULL, inode); } } EXPORT_SYMBOL_GPL(__audit_inode_child); @@ -2121,19 +2290,28 @@ int auditsc_get_stamp(struct audit_context *ctx, static atomic_t session_id = ATOMIC_INIT(0); /** - * audit_set_loginuid - set a task's audit_context loginuid - * @task: task whose audit context is being modified + * audit_set_loginuid - set current task's audit_context loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ -int audit_set_loginuid(struct task_struct *task, uid_t loginuid) +int audit_set_loginuid(uid_t loginuid) { - unsigned int sessionid = atomic_inc_return(&session_id); + struct task_struct *task = current; struct audit_context *context = task->audit_context; + unsigned int sessionid; + +#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE + if (task->loginuid != -1) + return -EPERM; +#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; +#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + sessionid = atomic_inc_return(&session_id); if (context && context->in_syscall) { struct audit_buffer *ab; @@ -2271,14 +2449,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int audit_bprm(struct linux_binprm *bprm) +int __audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; @@ -2299,13 +2474,10 @@ int audit_bprm(struct linux_binprm *bprm) * @args: args array * */ -void audit_socketcall(int nargs, unsigned long *args) +void __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return; - context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); @@ -2331,13 +2503,10 @@ void __audit_fd_pair(int fd1, int fd2) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_sockaddr(int len, void *a) +int __audit_sockaddr(int len, void *a) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return 0; - if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) @@ -2499,6 +2668,25 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + uid_t auid, uid; + gid_t gid; + unsigned int sessionid; + + auid = audit_get_loginuid(current); + sessionid = audit_get_sessionid(current); + current_uid_gid(&uid, &gid); + + audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", + auid, uid, gid, sessionid); + audit_log_task_context(ab); + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " reason="); + audit_log_string(ab, reason); + audit_log_format(ab, " sig=%ld", signr); +} /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value @@ -2509,10 +2697,6 @@ void __audit_mmap_fd(int fd, int flags) void audit_core_dumps(long signr) { struct audit_buffer *ab; - u32 sid; - uid_t auid = audit_get_loginuid(current), uid; - gid_t gid; - unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -2521,24 +2705,17 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - current_uid_gid(&uid, &gid); - audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); - security_task_getsecid(current, &sid); - if (sid) { - char *ctx = NULL; - u32 len; + audit_log_abend(ab, "memory violation", signr); + audit_log_end(ab); +} - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_format(ab, " sig=%ld", signr); +void __audit_seccomp(unsigned long syscall) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + audit_log_abend(ab, "seccomp", SIGKILL); + audit_log_format(ab, " syscall=%ld", syscall); audit_log_end(ab); } diff --git a/kernel/capability.c b/kernel/capability.c index 0fcf1c14a29..3f1adb6c647 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (has_ns_capability(current, ns, cap)) { + if (security_capable(current_cred(), ns, cap) == 0) { current->flags |= PF_SUPERPRIV; return true; } diff --git a/kernel/exit.c b/kernel/exit.c index c44738267be..294b1709170 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -964,8 +964,7 @@ void do_exit(long code) acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); - if (unlikely(tsk->audit_context)) - audit_free(tsk); + audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); diff --git a/kernel/fork.c b/kernel/fork.c index f3fa18887cc..051f090d40c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1527,8 +1527,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - audit_finish_fork(p); - /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 57d4b13b631..e8d76c5895e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -6,6 +6,7 @@ * This defines a simple but solid secure-computing mode. */ +#include <linux/audit.h> #include <linux/seccomp.h> #include <linux/sched.h> #include <linux/compat.h> @@ -54,6 +55,7 @@ void __secure_computing(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + audit_seccomp(this_syscall); do_exit(SIGKILL); } diff --git a/lib/Kconfig b/lib/Kconfig index 201e1b33d72..169eb7c598e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -286,25 +286,24 @@ config CORDIC calculations are in fixed point. Module will be called cordic. config MPILIB - tristate "Multiprecision maths library" + tristate help Multiprecision maths library from GnuPG. It is used to implement RSA digital signature verification, which is used by IMA/EVM digital signature extension. config MPILIB_EXTRA - bool "Multiprecision maths library - additional sources" + bool depends on MPILIB help - Multiprecision maths library from GnuPG. - It is used to implement RSA digital signature verification, - which is used by IMA/EVM digital signature extension. - This code in unnecessary for RSA digital signature verification, - and can be compiled if needed. + Additional sources of multiprecision maths library from GnuPG. + This code is unnecessary for RSA digital signature verification, + but can be compiled if needed. -config DIGSIG - tristate "In-kernel signature checker" - depends on KEYS +config SIGNATURE + tristate + depends on KEYS && CRYPTO + select CRYPTO_SHA1 select MPILIB help Digital signature verification. Currently only RSA is supported. diff --git a/lib/Makefile b/lib/Makefile index dace162c7e1..d71aae1b01b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -119,7 +119,7 @@ obj-$(CONFIG_CORDIC) += cordic.o obj-$(CONFIG_DQL) += dynamic_queue_limits.o obj-$(CONFIG_MPILIB) += mpi/ -obj-$(CONFIG_DIGSIG) += digsig.o +obj-$(CONFIG_SIGNATURE) += digsig.o hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/security/integrity/Kconfig b/security/integrity/Kconfig index d384ea92148..5bd1cc1b4a5 100644 --- a/security/integrity/Kconfig +++ b/security/integrity/Kconfig @@ -3,11 +3,11 @@ config INTEGRITY def_bool y depends on IMA || EVM -config INTEGRITY_DIGSIG +config INTEGRITY_SIGNATURE boolean "Digital signature verification using multiple keyrings" depends on INTEGRITY && KEYS default n - select DIGSIG + select SIGNATURE help This option enables digital signature verification support using multiple keyrings. It defines separate keyrings for each diff --git a/security/integrity/Makefile b/security/integrity/Makefile index bece0563ee5..d43799cc14f 100644 --- a/security/integrity/Makefile +++ b/security/integrity/Makefile @@ -3,7 +3,7 @@ # obj-$(CONFIG_INTEGRITY) += integrity.o -obj-$(CONFIG_INTEGRITY_DIGSIG) += digsig.o +obj-$(CONFIG_INTEGRITY_SIGNATURE) += digsig.o integrity-y := iint.o diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c index c5c5a72c30b..2ad942fb1e2 100644 --- a/security/integrity/ima/ima_audit.c +++ b/security/integrity/ima/ima_audit.c @@ -56,9 +56,11 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, audit_log_format(ab, " name="); audit_log_untrustedstring(ab, fname); } - if (inode) - audit_log_format(ab, " dev=%s ino=%lu", - inode->i_sb->s_id, inode->i_ino); + if (inode) { + audit_log_format(ab, " dev="); + audit_log_untrustedstring(ab, inode->i_sb->s_id); + audit_log_format(ab, " ino=%lu", inode->i_ino); + } audit_log_format(ab, " res=%d", !result ? 0 : 1); audit_log_end(ab); } diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index 4da6ba81d15..7a25ecec5aa 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -51,7 +51,7 @@ struct integrity_iint_cache *integrity_iint_find(struct inode *inode); #define INTEGRITY_KEYRING_IMA 2 #define INTEGRITY_KEYRING_MAX 3 -#ifdef CONFIG_INTEGRITY_DIGSIG +#ifdef CONFIG_INTEGRITY_SIGNATURE int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, const char *digest, int digestlen); @@ -65,7 +65,7 @@ static inline int integrity_digsig_verify(const unsigned int id, return -EOPNOTSUPP; } -#endif /* CONFIG_INTEGRITY_DIGSIG */ +#endif /* CONFIG_INTEGRITY_SIGNATURE */ /* set during initialization */ extern int iint_initialized; diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 41144f71d61..2d1bb8af769 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -314,7 +314,7 @@ static struct key *request_user_key(const char *master_desc, u8 **master_key, goto error; down_read(&ukey->sem); - upayload = rcu_dereference(ukey->payload.data); + upayload = ukey->payload.data; *master_key = upayload->data; *master_keylen = upayload->datalen; error: @@ -810,7 +810,7 @@ static int encrypted_instantiate(struct key *key, const void *data, goto out; } - rcu_assign_pointer(key->payload.data, epayload); + rcu_assign_keypointer(key, epayload); out: kfree(datablob); return ret; @@ -874,7 +874,7 @@ static int encrypted_update(struct key *key, const void *data, size_t datalen) memcpy(new_epayload->payload_data, epayload->payload_data, epayload->payload_datalen); - rcu_assign_pointer(key->payload.data, new_epayload); + rcu_assign_keypointer(key, new_epayload); call_rcu(&epayload->rcu, encrypted_rcu_free); out: kfree(buf); diff --git a/security/keys/encrypted-keys/masterkey_trusted.c b/security/keys/encrypted-keys/masterkey_trusted.c index df87272e3f5..013f7e5d3a2 100644 --- a/security/keys/encrypted-keys/masterkey_trusted.c +++ b/security/keys/encrypted-keys/masterkey_trusted.c @@ -18,6 +18,8 @@ #include <linux/module.h> #include <linux/err.h> #include <keys/trusted-type.h> +#include <keys/encrypted-type.h> +#include "encrypted.h" /* * request_trusted_key - request the trusted key @@ -37,7 +39,7 @@ struct key *request_trusted_key(const char *trusted_desc, goto error; down_read(&tkey->sem); - tpayload = rcu_dereference(tkey->payload.data); + tpayload = tkey->payload.data; *master_key = tpayload->key; *master_keylen = tpayload->key_len; error: diff --git a/security/keys/gc.c b/security/keys/gc.c index bf4d8da5a79..a42b45531aa 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -145,7 +145,9 @@ static void key_gc_keyring(struct key *keyring, time_t limit) if (!klist) goto unlock_dont_gc; - for (loop = klist->nkeys - 1; loop >= 0; loop--) { + loop = klist->nkeys; + smp_rmb(); + for (loop--; loop >= 0; loop--) { key = klist->keys[loop]; if (test_bit(KEY_FLAG_DEAD, &key->flags) || (key->expiry > 0 && key->expiry <= limit)) diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 37a7f3b2885..d605f75292e 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -319,7 +319,7 @@ key_ref_t keyring_search_aux(key_ref_t keyring_ref, struct key *keyring, *key; key_ref_t key_ref; long err; - int sp, kix; + int sp, nkeys, kix; keyring = key_ref_to_ptr(keyring_ref); possessed = is_key_possessed(keyring_ref); @@ -380,7 +380,9 @@ descend: goto not_this_keyring; /* iterate through the keys in this keyring first */ - for (kix = 0; kix < keylist->nkeys; kix++) { + nkeys = keylist->nkeys; + smp_rmb(); + for (kix = 0; kix < nkeys; kix++) { key = keylist->keys[kix]; kflags = key->flags; @@ -421,7 +423,9 @@ descend: /* search through the keyrings nested in this one */ kix = 0; ascend: - for (; kix < keylist->nkeys; kix++) { + nkeys = keylist->nkeys; + smp_rmb(); + for (; kix < nkeys; kix++) { key = keylist->keys[kix]; if (key->type != &key_type_keyring) continue; @@ -515,7 +519,7 @@ key_ref_t __keyring_search_one(key_ref_t keyring_ref, struct keyring_list *klist; unsigned long possessed; struct key *keyring, *key; - int loop; + int nkeys, loop; keyring = key_ref_to_ptr(keyring_ref); possessed = is_key_possessed(keyring_ref); @@ -524,7 +528,9 @@ key_ref_t __keyring_search_one(key_ref_t keyring_ref, klist = rcu_dereference(keyring->payload.subscriptions); if (klist) { - for (loop = 0; loop < klist->nkeys; loop++) { + nkeys = klist->nkeys; + smp_rmb(); + for (loop = 0; loop < nkeys ; loop++) { key = klist->keys[loop]; if (key->type == ktype && @@ -622,7 +628,7 @@ static int keyring_detect_cycle(struct key *A, struct key *B) struct keyring_list *keylist; struct key *subtree, *key; - int sp, kix, ret; + int sp, nkeys, kix, ret; rcu_read_lock(); @@ -645,7 +651,9 @@ descend: ascend: /* iterate through the remaining keys in this keyring */ - for (; kix < keylist->nkeys; kix++) { + nkeys = keylist->nkeys; + smp_rmb(); + for (; kix < nkeys; kix++) { key = keylist->keys[kix]; if (key == A) diff --git a/security/keys/trusted.c b/security/keys/trusted.c index 0ed5fdf238a..2d5d041f204 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -993,7 +993,7 @@ out: kfree(datablob); kfree(options); if (!ret) - rcu_assign_pointer(key->payload.data, payload); + rcu_assign_keypointer(key, payload); else kfree(payload); return ret; @@ -1067,7 +1067,7 @@ static int trusted_update(struct key *key, const void *data, size_t datalen) goto out; } } - rcu_assign_pointer(key->payload.data, new_p); + rcu_assign_keypointer(key, new_p); call_rcu(&p->rcu, trusted_rcu_free); out: kfree(datablob); diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 7bd6f138236..293b8c45b1d 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -232,13 +232,14 @@ static void dump_common_audit_data(struct audit_buffer *ab, case LSM_AUDIT_DATA_PATH: { struct inode *inode; - audit_log_d_path(ab, "path=", &a->u.path); + audit_log_d_path(ab, " path=", &a->u.path); inode = a->u.path.dentry->d_inode; - if (inode) - audit_log_format(ab, " dev=%s ino=%lu", - inode->i_sb->s_id, - inode->i_ino); + if (inode) { + audit_log_format(ab, " dev="); + audit_log_untrustedstring(ab, inode->i_sb->s_id); + audit_log_format(ab, " ino=%lu", inode->i_ino); + } break; } case LSM_AUDIT_DATA_DENTRY: { @@ -248,10 +249,11 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, a->u.dentry->d_name.name); inode = a->u.dentry->d_inode; - if (inode) - audit_log_format(ab, " dev=%s ino=%lu", - inode->i_sb->s_id, - inode->i_ino); + if (inode) { + audit_log_format(ab, " dev="); + audit_log_untrustedstring(ab, inode->i_sb->s_id); + audit_log_format(ab, " ino=%lu", inode->i_ino); + } break; } case LSM_AUDIT_DATA_INODE: { @@ -266,8 +268,9 @@ static void dump_common_audit_data(struct audit_buffer *ab, dentry->d_name.name); dput(dentry); } - audit_log_format(ab, " dev=%s ino=%lu", inode->i_sb->s_id, - inode->i_ino); + audit_log_format(ab, " dev="); + audit_log_untrustedstring(ab, inode->i_sb->s_id); + audit_log_format(ab, " ino=%lu", inode->i_ino); break; } case LSM_AUDIT_DATA_TASK: @@ -315,7 +318,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, .dentry = u->dentry, .mnt = u->mnt }; - audit_log_d_path(ab, "path=", &path); + audit_log_d_path(ab, " path=", &path); break; } if (!u->addr) diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c index 4a9b4b2eb75..867558c9833 100644 --- a/security/tomoyo/util.c +++ b/security/tomoyo/util.c @@ -492,13 +492,13 @@ static bool tomoyo_correct_word2(const char *string, size_t len) if (d < '0' || d > '7' || e < '0' || e > '7') break; c = tomoyo_make_byte(c, d, e); - if (tomoyo_invalid(c)) - continue; /* pattern is not \000 */ + if (c <= ' ' || c >= 127) + continue; } goto out; } else if (in_repetition && c == '/') { goto out; - } else if (tomoyo_invalid(c)) { + } else if (c <= ' ' || c >= 127) { goto out; } } diff --git a/sound/atmel/abdac.c b/sound/atmel/abdac.c index 6fd9391b3a6..4fa1dbd8ee8 100644 --- a/sound/atmel/abdac.c +++ b/sound/atmel/abdac.c @@ -133,7 +133,7 @@ static int atmel_abdac_prepare_dma(struct atmel_abdac *dac, period_len = frames_to_bytes(runtime, runtime->period_size); cdesc = dw_dma_cyclic_prep(chan, runtime->dma_addr, buffer_len, - period_len, DMA_TO_DEVICE); + period_len, DMA_MEM_TO_DEV); if (IS_ERR(cdesc)) { dev_dbg(&dac->pdev->dev, "could not prepare cyclic DMA\n"); return PTR_ERR(cdesc); diff --git a/sound/atmel/ac97c.c b/sound/atmel/ac97c.c index 73516f69ac7..61dade69835 100644 --- a/sound/atmel/ac97c.c +++ b/sound/atmel/ac97c.c @@ -102,7 +102,7 @@ static void atmel_ac97c_dma_capture_period_done(void *arg) static int atmel_ac97c_prepare_dma(struct atmel_ac97c *chip, struct snd_pcm_substream *substream, - enum dma_data_direction direction) + enum dma_transfer_direction direction) { struct dma_chan *chan; struct dw_cyclic_desc *cdesc; @@ -118,7 +118,7 @@ static int atmel_ac97c_prepare_dma(struct atmel_ac97c *chip, return -EINVAL; } - if (direction == DMA_TO_DEVICE) + if (direction == DMA_MEM_TO_DEV) chan = chip->dma.tx_chan; else chan = chip->dma.rx_chan; @@ -133,7 +133,7 @@ static int atmel_ac97c_prepare_dma(struct atmel_ac97c *chip, return PTR_ERR(cdesc); } - if (direction == DMA_TO_DEVICE) { + if (direction == DMA_MEM_TO_DEV) { cdesc->period_callback = atmel_ac97c_dma_playback_period_done; set_bit(DMA_TX_READY, &chip->flags); } else { @@ -393,7 +393,7 @@ static int atmel_ac97c_playback_prepare(struct snd_pcm_substream *substream) if (cpu_is_at32ap7000()) { if (!test_bit(DMA_TX_READY, &chip->flags)) retval = atmel_ac97c_prepare_dma(chip, substream, - DMA_TO_DEVICE); + DMA_MEM_TO_DEV); } else { /* Initialize and start the PDC */ writel(runtime->dma_addr, chip->regs + ATMEL_PDC_TPR); @@ -484,7 +484,7 @@ static int atmel_ac97c_capture_prepare(struct snd_pcm_substream *substream) if (cpu_is_at32ap7000()) { if (!test_bit(DMA_RX_READY, &chip->flags)) retval = atmel_ac97c_prepare_dma(chip, substream, - DMA_FROM_DEVICE); + DMA_DEV_TO_MEM); } else { /* Initialize and start the PDC */ writel(runtime->dma_addr, chip->regs + ATMEL_PDC_RPR); diff --git a/sound/core/Kconfig b/sound/core/Kconfig index ad409381f8c..b413ed05e74 100644 --- a/sound/core/Kconfig +++ b/sound/core/Kconfig @@ -12,6 +12,9 @@ config SND_HWDEP config SND_RAWMIDI tristate +config SND_COMPRESS_OFFLOAD + tristate + # To be effective this also requires INPUT - users should say: # select SND_JACK if INPUT=y || INPUT=SND # to avoid having to force INPUT on. @@ -154,16 +157,6 @@ config SND_DYNAMIC_MINORS If you are unsure about this, say N here. -config SND_COMPRESS_OFFLOAD - tristate "ALSA Compressed audio offload support" - default n - help - If you want support for offloading compressed audio and have such - a hardware, then you should say Y here and also to the DSP driver - of your platform. - - If you are unsure about this, say N here. - config SND_SUPPORT_OLD_API bool "Support old ALSA API" default y diff --git a/sound/pci/au88x0/au88x0.c b/sound/pci/au88x0/au88x0.c index 762bb108c51..f13ad536b2d 100644 --- a/sound/pci/au88x0/au88x0.c +++ b/sound/pci/au88x0/au88x0.c @@ -268,8 +268,14 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) card->shortname, chip->io, chip->irq); // (4) Alloc components. + err = snd_vortex_mixer(chip); + if (err < 0) { + snd_card_free(card); + return err; + } // ADB pcm. - if ((err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_ADB)) < 0) { + err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_PCM); + if (err < 0) { snd_card_free(card); return err; } @@ -299,11 +305,6 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) return err; } #endif - // snd_ac97_mixer and Vortex mixer. - if ((err = snd_vortex_mixer(chip)) < 0) { - snd_card_free(card); - return err; - } if ((err = snd_vortex_midi(chip)) < 0) { snd_card_free(card); return err; diff --git a/sound/pci/au88x0/au88x0.h b/sound/pci/au88x0/au88x0.h index 02f6e08f759..bb938153a96 100644 --- a/sound/pci/au88x0/au88x0.h +++ b/sound/pci/au88x0/au88x0.h @@ -105,6 +105,7 @@ #define MIX_SPDIF(x) (vortex->mixspdif[x]) #define NR_WTPB 0x20 /* WT channels per each bank. */ +#define NR_PCM 0x10 /* Structs */ typedef struct { diff --git a/sound/pci/au88x0/au88x0_pcm.c b/sound/pci/au88x0/au88x0_pcm.c index 0488633ea87..0ef2f971220 100644 --- a/sound/pci/au88x0/au88x0_pcm.c +++ b/sound/pci/au88x0/au88x0_pcm.c @@ -168,6 +168,7 @@ static int snd_vortex_pcm_open(struct snd_pcm_substream *substream) runtime->hw = snd_vortex_playback_hw_adb; #ifdef CHIP_AU8830 if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && + VORTEX_IS_QUAD(vortex) && VORTEX_PCM_TYPE(substream->pcm) == VORTEX_PCM_ADB) { runtime->hw.channels_max = 4; snd_pcm_hw_constraint_list(runtime, 0, diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 0852e204a4c..fb35474c120 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2498,6 +2498,7 @@ static struct snd_pci_quirk position_fix_list[] __devinitdata = { SND_PCI_QUIRK(0x1043, 0x81b3, "ASUS", POS_FIX_LPIB), SND_PCI_QUIRK(0x1043, 0x81e7, "ASUS M2V", POS_FIX_LPIB), SND_PCI_QUIRK(0x104d, 0x9069, "Sony VPCS11V9E", POS_FIX_LPIB), + SND_PCI_QUIRK(0x10de, 0xcb89, "Macbook Pro 7,1", POS_FIX_LPIB), SND_PCI_QUIRK(0x1297, 0x3166, "Shuttle", POS_FIX_LPIB), SND_PCI_QUIRK(0x1458, 0xa022, "ga-ma770-ud3", POS_FIX_LPIB), SND_PCI_QUIRK(0x1462, 0x1002, "MSI Wind U115", POS_FIX_LPIB), diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index 87e684fa830..3556408d6ec 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -1596,7 +1596,7 @@ static const struct snd_pci_quirk stac92hd73xx_cfg_tbl[] = { SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02bd, "Dell Studio 1557", STAC_DELL_M6_DMIC), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02fe, - "Dell Studio XPS 1645", STAC_DELL_M6_BOTH), + "Dell Studio XPS 1645", STAC_DELL_M6_DMIC), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0413, "Dell Studio 1558", STAC_DELL_M6_DMIC), {} /* terminator */ diff --git a/sound/pci/oxygen/xonar_wm87x6.c b/sound/pci/oxygen/xonar_wm87x6.c index 478303e6c2b..63cff90706b 100644 --- a/sound/pci/oxygen/xonar_wm87x6.c +++ b/sound/pci/oxygen/xonar_wm87x6.c @@ -177,6 +177,7 @@ static void wm8776_registers_init(struct oxygen *chip) struct xonar_wm87x6 *data = chip->model_data; wm8776_write(chip, WM8776_RESET, 0); + wm8776_write(chip, WM8776_PHASESWAP, WM8776_PH_MASK); wm8776_write(chip, WM8776_DACCTRL1, WM8776_DZCEN | WM8776_PL_LEFT_LEFT | WM8776_PL_RIGHT_RIGHT); wm8776_write(chip, WM8776_DACMUTE, chip->dac_mute ? WM8776_DMUTE : 0); diff --git a/sound/soc/ep93xx/ep93xx-pcm.c b/sound/soc/ep93xx/ep93xx-pcm.c index 3fc96130d1a..de839044987 100644 --- a/sound/soc/ep93xx/ep93xx-pcm.c +++ b/sound/soc/ep93xx/ep93xx-pcm.c @@ -113,9 +113,9 @@ static int ep93xx_pcm_open(struct snd_pcm_substream *substream) rtd->dma_data.name = dma_params->name; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) - rtd->dma_data.direction = DMA_TO_DEVICE; + rtd->dma_data.direction = DMA_MEM_TO_DEV; else - rtd->dma_data.direction = DMA_FROM_DEVICE; + rtd->dma_data.direction = DMA_DEV_TO_MEM; rtd->dma_chan = dma_request_channel(mask, ep93xx_pcm_dma_filter, &rtd->dma_data); diff --git a/sound/soc/imx/imx-pcm-dma-mx2.c b/sound/soc/imx/imx-pcm-dma-mx2.c index 1cf2fe889f6..aecdba9f65a 100644 --- a/sound/soc/imx/imx-pcm-dma-mx2.c +++ b/sound/soc/imx/imx-pcm-dma-mx2.c @@ -107,12 +107,12 @@ static int imx_ssi_dma_alloc(struct snd_pcm_substream *substream, } if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { - slave_config.direction = DMA_TO_DEVICE; + slave_config.direction = DMA_MEM_TO_DEV; slave_config.dst_addr = dma_params->dma_addr; slave_config.dst_addr_width = buswidth; slave_config.dst_maxburst = dma_params->burstsize; } else { - slave_config.direction = DMA_FROM_DEVICE; + slave_config.direction = DMA_DEV_TO_MEM; slave_config.src_addr = dma_params->dma_addr; slave_config.src_addr_width = buswidth; slave_config.src_maxburst = dma_params->burstsize; @@ -159,7 +159,7 @@ static int snd_imx_pcm_hw_params(struct snd_pcm_substream *substream, iprtd->period_bytes * iprtd->periods, iprtd->period_bytes, substream->stream == SNDRV_PCM_STREAM_PLAYBACK ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); + DMA_MEM_TO_DEV : DMA_DEV_TO_MEM); if (!iprtd->desc) { dev_err(&chan->dev->device, "cannot prepare slave dma\n"); return -EINVAL; diff --git a/sound/soc/mxs/mxs-pcm.c b/sound/soc/mxs/mxs-pcm.c index 0e12f4e0a76..105f42a394d 100644 --- a/sound/soc/mxs/mxs-pcm.c +++ b/sound/soc/mxs/mxs-pcm.c @@ -136,7 +136,7 @@ static int snd_mxs_pcm_hw_params(struct snd_pcm_substream *substream, iprtd->period_bytes * iprtd->periods, iprtd->period_bytes, substream->stream == SNDRV_PCM_STREAM_PLAYBACK ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); + DMA_MEM_TO_DEV : DMA_DEV_TO_MEM); if (!iprtd->desc) { dev_err(&chan->dev->device, "cannot prepare slave dma\n"); return -EINVAL; diff --git a/sound/soc/samsung/dma.c b/sound/soc/samsung/dma.c index 427ae0d9817..e4ba17ce6b3 100644 --- a/sound/soc/samsung/dma.c +++ b/sound/soc/samsung/dma.c @@ -86,7 +86,7 @@ static void dma_enqueue(struct snd_pcm_substream *substream) dma_info.cap = (samsung_dma_has_circular() ? DMA_CYCLIC : DMA_SLAVE); dma_info.direction = (substream->stream == SNDRV_PCM_STREAM_PLAYBACK - ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM); dma_info.fp = audio_buffdone; dma_info.fp_param = substream; dma_info.period = prtd->dma_period; @@ -171,7 +171,7 @@ static int dma_hw_params(struct snd_pcm_substream *substream, dma_info.client = prtd->params->client; dma_info.direction = (substream->stream == SNDRV_PCM_STREAM_PLAYBACK - ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM); dma_info.width = prtd->params->dma_size; dma_info.fifo = prtd->params->dma_addr; prtd->params->ch = prtd->params->ops->request( diff --git a/sound/soc/sh/siu_pcm.c b/sound/soc/sh/siu_pcm.c index f8f681690a7..0193e595d41 100644 --- a/sound/soc/sh/siu_pcm.c +++ b/sound/soc/sh/siu_pcm.c @@ -131,7 +131,7 @@ static int siu_pcm_wr_set(struct siu_port *port_info, sg_dma_address(&sg) = buff; desc = siu_stream->chan->device->device_prep_slave_sg(siu_stream->chan, - &sg, 1, DMA_TO_DEVICE, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + &sg, 1, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) { dev_err(dev, "Failed to allocate a dma descriptor\n"); return -ENOMEM; @@ -181,7 +181,7 @@ static int siu_pcm_rd_set(struct siu_port *port_info, sg_dma_address(&sg) = buff; desc = siu_stream->chan->device->device_prep_slave_sg(siu_stream->chan, - &sg, 1, DMA_FROM_DEVICE, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + &sg, 1, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) { dev_err(dev, "Failed to allocate dma descriptor\n"); return -ENOMEM; diff --git a/sound/soc/txx9/txx9aclc.c b/sound/soc/txx9/txx9aclc.c index 93931def0dc..21554611557 100644 --- a/sound/soc/txx9/txx9aclc.c +++ b/sound/soc/txx9/txx9aclc.c @@ -134,7 +134,7 @@ txx9aclc_dma_submit(struct txx9aclc_dmadata *dmadata, dma_addr_t buf_dma_addr) sg_dma_address(&sg) = buf_dma_addr; desc = chan->device->device_prep_slave_sg(chan, &sg, 1, dmadata->substream->stream == SNDRV_PCM_STREAM_PLAYBACK ? - DMA_TO_DEVICE : DMA_FROM_DEVICE, + DMA_MEM_TO_DEV : DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc) { dev_err(&chan->dev->device, "cannot prepare slave dma\n"); |