Diffstat (limited to 'lib'): 42 files changed, 2552 insertions, 377 deletions
diff --git a/lib/Kconfig b/lib/Kconfig index 5b916bc0fba..fa9bf2c0619 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -7,6 +7,9 @@ config BINARY_PRINTF menu "Library routines" +config RAID6_PQ + tristate + config BITREVERSE tristate diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 083b23d211d..28b42b9274d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -76,7 +76,6 @@ config UNUSED_SYMBOLS config DEBUG_FS bool "Debug Filesystem" - depends on SYSFS help debugfs is a virtual file system that kernel developers use to put debugging files into. Enable this option to be able to read and @@ -152,28 +151,33 @@ config DEBUG_SHIRQ Drivers ought to be able to handle interrupts coming in at those points; some don't and need to be caught. -config DETECT_SOFTLOCKUP - bool "Detect Soft Lockups" +config LOCKUP_DETECTOR + bool "Detect Hard and Soft Lockups" depends on DEBUG_KERNEL && !S390 - default y help - Say Y here to enable the kernel to detect "soft lockups", - which are bugs that cause the kernel to loop in kernel + Say Y here to enable the kernel to act as a watchdog to detect + hard and soft lockups. + + Softlockups are bugs that cause the kernel to loop in kernel mode for more than 60 seconds, without giving other tasks a - chance to run. + chance to run. The current stack trace is displayed upon + detection and the system will stay locked up. - When a soft-lockup is detected, the kernel will print the - current stack trace (which you should report), but the - system will stay locked up. This feature has negligible - overhead. + Hardlockups are bugs that cause the CPU to loop in kernel mode + for more than 60 seconds, without letting other interrupts have a + chance to run. The current stack trace is displayed upon detection + and the system will stay locked up. + + The overhead should be minimal. A periodic hrtimer runs to + generate interrupts and kick the watchdog task every 10-12 seconds. + An NMI is generated every 60 seconds or so to check for hardlockups. - (Note that "hard lockups" are separate type of bugs that - can be detected via the NMI-watchdog, on platforms that - support it.) +config HARDLOCKUP_DETECTOR + def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" - depends on DETECT_SOFTLOCKUP + depends on LOCKUP_DETECTOR help Say Y here to enable the kernel to panic on "soft lockups", which are bugs that cause the kernel to loop in kernel @@ -190,7 +194,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE int - depends on DETECT_SOFTLOCKUP + depends on LOCKUP_DETECTOR range 0 1 default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC default 1 if BOOTPARAM_SOFTLOCKUP_PANIC @@ -307,6 +311,20 @@ config DEBUG_OBJECTS_WORK work queue routines to track the life time of work objects and validate the work operations. +config DEBUG_OBJECTS_RCU_HEAD + bool "Debug RCU callbacks objects" + depends on DEBUG_OBJECTS && PREEMPT + help + Enable this to turn on debugging of RCU list heads (call_rcu() usage). + +config DEBUG_OBJECTS_PERCPU_COUNTER + bool "Debug percpu counter objects" + depends on DEBUG_OBJECTS + help + If you say Y here, additional code will be inserted into the + percpu counter routines to track the life time of percpu counter + objects and validate the percpu counter operations. 
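The new DEBUG_OBJECTS_PERCPU_COUNTER option above hooks object-lifetime tracking into the percpu counter API. For reference, a minimal sketch (not from this patch; the module and counter names are made up, and the two-argument percpu_counter_init() matches kernels of this era) of the init/add/destroy lifecycle the debug code validates:

#include <linux/module.h>
#include <linux/percpu_counter.h>

static struct percpu_counter nr_widgets;

static int __init widgets_init(void)
{
	int err = percpu_counter_init(&nr_widgets, 0);	/* object becomes tracked/active */

	if (err)
		return err;
	percpu_counter_add(&nr_widgets, 1);		/* cheap per-cpu fast path */
	pr_info("widgets: roughly %lld\n",
		(long long)percpu_counter_read(&nr_widgets));
	return 0;
}

static void __exit widgets_exit(void)
{
	percpu_counter_destroy(&nr_widgets);	/* a missing destroy is what debugobjects catches */
}

module_init(widgets_init);
module_exit(widgets_exit);
MODULE_LICENSE("GPL");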
+ config DEBUG_OBJECTS_ENABLE_DEFAULT int "debug_objects bootup default value (0-1)" range 0 1 @@ -343,7 +361,7 @@ config SLUB_DEBUG_ON config SLUB_STATS default n bool "Enable SLUB performance statistics" - depends on SLUB && SLUB_DEBUG && SYSFS + depends on SLUB && SYSFS help SLUB statistics are useful to debug SLUBs allocation behavior in order find ways to optimize the allocator. This should never be @@ -356,7 +374,7 @@ config SLUB_STATS config DEBUG_KMEMLEAK bool "Kernel memory leak detector" depends on DEBUG_KERNEL && EXPERIMENTAL && !MEMORY_HOTPLUG && \ - (X86 || ARM || PPC || S390 || SPARC64 || SUPERH || MICROBLAZE) + (X86 || ARM || PPC || S390 || SPARC64 || SUPERH || MICROBLAZE || TILE) select DEBUG_FS if SYSFS select STACKTRACE if STACKTRACE_SUPPORT @@ -400,6 +418,13 @@ config DEBUG_KMEMLEAK_TEST If unsure, say N. +config DEBUG_KMEMLEAK_DEFAULT_OFF + bool "Default kmemleak to off" + depends on DEBUG_KMEMLEAK + help + Say Y here to disable kmemleak by default. It can then be enabled + on the command line via kmemleak=on. + config DEBUG_PREEMPT bool "Debug preemptible kernel" depends on DEBUG_KERNEL && PREEMPT && TRACE_IRQFLAGS_SUPPORT @@ -444,6 +469,15 @@ config DEBUG_MUTEXES This feature allows mutex semantics violations to be detected and reported. +config BKL + bool "Big Kernel Lock" if (SMP || PREEMPT) + default y + help + This is the traditional lock that is used in old code instead + of proper locking. All drivers that use the BKL should depend + on this symbol. + Say Y here unless you are working on removing the BKL. + config DEBUG_LOCK_ALLOC bool "Lock debugging: detect incorrect freeing of live locks" depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT @@ -465,6 +499,7 @@ config PROVE_LOCKING select DEBUG_SPINLOCK select DEBUG_MUTEXES select DEBUG_LOCK_ALLOC + select TRACE_IRQFLAGS default n help This feature enables the kernel to prove that all locking @@ -522,13 +557,30 @@ config PROVE_RCU_REPEATEDLY disabling, allowing multiple RCU-lockdep warnings to be printed on a single reboot. + Say Y to allow multiple RCU-lockdep warnings per boot. + + Say N if you are unsure. + +config SPARSE_RCU_POINTER + bool "RCU debugging: sparse-based checks for pointer usage" + default n + help + This feature enables the __rcu sparse annotation for + RCU-protected pointers. This annotation will cause sparse + to flag any non-RCU used of annotated pointers. This can be + helpful when debugging RCU usage. Please note that this feature + is not intended to enforce code cleanliness; it is instead merely + a debugging aid. + + Say Y to make sparse flag questionable use of RCU-protected pointers + Say N if you are unsure. config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 + select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE select KALLSYMS select KALLSYMS_ALL @@ -562,11 +614,10 @@ config DEBUG_LOCKDEP of more runtime overhead. config TRACE_IRQFLAGS - depends on DEBUG_KERNEL bool - default y - depends on TRACE_IRQFLAGS_SUPPORT - depends on PROVE_LOCKING + help + Enables hooks to interrupt enabling and disabling for + either tracing or lock debugging. config DEBUG_SPINLOCK_SLEEP bool "Spinlock debugging: sleep-inside-spinlock checking" @@ -697,6 +748,15 @@ config DEBUG_LIST If unsure, say N. 
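SPARSE_RCU_POINTER, added above, makes sparse enforce the __rcu address space on RCU-protected pointers. A rough sketch of the access pattern it checks (struct foo and gp are illustrative names, not from the patch):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int a;
};

static struct foo __rcu *gp;	/* __rcu: only touch through the RCU accessors */

static int publish(int a)
{
	struct foo *p = kmalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	p->a = a;
	rcu_assign_pointer(gp, p);	/* legal store to an __rcu pointer */
	return 0;
}

static int read_a(void)
{
	struct foo *p;
	int a = -1;

	rcu_read_lock();
	p = rcu_dereference(gp);	/* a plain "p = gp" would now be flagged by sparse */
	if (p)
		a = p->a;
	rcu_read_unlock();
	return a;
}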
+config TEST_LIST_SORT + bool "Linked list sorting test" + depends on DEBUG_KERNEL + help + Enable this to turn on 'list_sort()' function test. This test is + executed only once during system boot, so affects only boot time. + + If unsure, say N. + config DEBUG_SG bool "Debug SG table operations" depends on DEBUG_KERNEL @@ -815,6 +875,30 @@ config RCU_CPU_STALL_DETECTOR Say Y if you are unsure. +config RCU_CPU_STALL_TIMEOUT + int "RCU CPU stall timeout in seconds" + depends on RCU_CPU_STALL_DETECTOR + range 3 300 + default 60 + help + If a given RCU grace period extends more than the specified + number of seconds, a CPU stall warning is printed. If the + RCU grace period persists, additional CPU stall warnings are + printed at more widely spaced intervals. + +config RCU_CPU_STALL_DETECTOR_RUNNABLE + bool "RCU CPU stall checking starts automatically at boot" + depends on RCU_CPU_STALL_DETECTOR + default y + help + If set, start checking for RCU CPU stalls immediately on + boot. Otherwise, RCU CPU stall checking must be manually + enabled. + + Say Y if you are unsure. + + Say N if you wish to suppress RCU CPU stall checking during boot. + config RCU_CPU_STALL_VERBOSE bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR" depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU @@ -950,7 +1034,7 @@ config FAIL_MAKE_REQUEST Provide fault-injection capability for disk IO. config FAIL_IO_TIMEOUT - bool "Faul-injection capability for faking disk interrupts" + bool "Fault-injection capability for faking disk interrupts" depends on FAULT_INJECTION && BLOCK help Provide fault-injection capability on end IO handling. This @@ -971,19 +1055,22 @@ config FAULT_INJECTION_STACKTRACE_FILTER depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT depends on !X86_64 select STACKTRACE - select FRAME_POINTER if !PPC && !S390 + select FRAME_POINTER if !PPC && !S390 && !MICROBLAZE help Provide stacktrace filter for fault-injection capabilities config LATENCYTOP bool "Latency measuring infrastructure" - select FRAME_POINTER if !MIPS && !PPC && !S390 + depends on HAVE_LATENCYTOP_SUPPORT + depends on DEBUG_KERNEL + depends on STACKTRACE_SUPPORT + depends on PROC_FS + select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE select KALLSYMS select KALLSYMS_ALL select STACKTRACE select SCHEDSTATS select SCHED_DEBUG - depends on HAVE_LATENCYTOP_SUPPORT help Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. @@ -1130,6 +1217,19 @@ config ATOMIC64_SELFTEST If unsure, say N. +config ASYNC_RAID6_TEST + tristate "Self test for hardware accelerated raid6 recovery" + depends on ASYNC_RAID6_RECOV + select ASYNC_MEMCPY + ---help--- + This is a one-shot self test that permutes through the + recovery of all the possible two disk failure scenarios for a + N-disk array. Recovery is performed with the asynchronous + raid6 recovery routines, and will optionally use an offload + engine if one is available. + + If unsure, say N. 
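RCU_CPU_STALL_TIMEOUT above bounds how long a single grace period may run before a warning is printed. The class of bug it flags is a CPU that never leaves an RCU read-side critical section; a deliberately broken sketch (do not copy) of such a stall:

#include <linux/rcupdate.h>
#include <linux/sched.h>

static bool done;	/* hypothetically set from elsewhere */

static void stall_this_cpu(void)
{
	rcu_read_lock();
	while (!done)		/* spins inside one read-side section, never yielding, */
		cpu_relax();	/* so after RCU_CPU_STALL_TIMEOUT seconds the detector
				 * prints a stall warning naming this CPU */
	rcu_read_unlock();
}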
+ source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/lib/Makefile b/lib/Makefile index 0bfabba1bb3..e6a3763b821 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_RAID6_PQ) += raid6/ lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 250ed11d3ed..44524cc8c32 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -114,7 +114,7 @@ static __init int test_atomic64(void) BUG_ON(v.counter != r); #if defined(CONFIG_X86) || defined(CONFIG_MIPS) || defined(CONFIG_PPC) || \ - defined(CONFIG_S390) || defined(_ASM_GENERIC_ATOMIC64_H) + defined(CONFIG_S390) || defined(_ASM_GENERIC_ATOMIC64_H) || defined(CONFIG_ARM) INIT(onestwos); BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1)); r -= one; diff --git a/lib/bitmap.c b/lib/bitmap.c index ffb78c916cc..741fae905ae 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -359,7 +359,6 @@ EXPORT_SYMBOL(bitmap_find_next_zero_area); #define CHUNKSZ 32 #define nbits_to_hold_value(val) fls(val) -#define unhex(c) (isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10)) #define BASEDEC 10 /* fancier cpuset lists input in decimal */ /** @@ -466,7 +465,7 @@ int __bitmap_parse(const char *buf, unsigned int buflen, if (chunk & ~((1UL << (CHUNKSZ - 4)) - 1)) return -EOVERFLOW; - chunk = (chunk << 4) | unhex(c); + chunk = (chunk << 4) | hex_to_bin(c); ndigits++; totaldigits++; } if (ndigits == 0) diff --git a/lib/bug.c b/lib/bug.c index f13daf43521..19552096d16 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -72,8 +72,8 @@ static const struct bug_entry *module_find_bug(unsigned long bugaddr) return NULL; } -int module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, - struct module *mod) +void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + struct module *mod) { char *secstrings; unsigned int i; @@ -97,8 +97,6 @@ int module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, * could potentially lead to deadlock and thus be counter-productive. 
*/ list_add(&mod->bug_list, &module_bug_list); - - return 0; } void module_bug_cleanup(struct module *mod) @@ -136,8 +134,6 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) bug = find_bug(bugaddr); - printk(KERN_EMERG "------------[ cut here ]------------\n"); - file = NULL; line = 0; warning = 0; @@ -156,19 +152,25 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) if (warning) { /* this is a WARN_ON rather than BUG/BUG_ON */ + printk(KERN_WARNING "------------[ cut here ]------------\n"); + if (file) - printk(KERN_ERR "Badness at %s:%u\n", + printk(KERN_WARNING "WARNING: at %s:%u\n", file, line); else - printk(KERN_ERR "Badness at %p " + printk(KERN_WARNING "WARNING: at %p " "[verbose debug info unavailable]\n", (void *)bugaddr); + print_modules(); show_regs(regs); + print_oops_end_marker(); add_taint(BUG_GET_TAINT(bug)); return BUG_TRAP_TYPE_WARN; } + printk(KERN_EMERG "------------[ cut here ]------------\n"); + if (file) printk(KERN_CRIT "kernel BUG at %s:%u!\n", file, line); diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index a4e971dee10..81c8bb1cc6a 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -107,6 +107,8 @@ struct bunzip_data { unsigned char selectors[32768]; /* nSelectors = 15 bits */ struct group_data groups[MAX_GROUPS]; /* Huffman coding tables */ int io_error; /* non-zero if we have IO error */ + int byteCount[256]; + unsigned char symToByte[256], mtfSymbol[256]; }; @@ -158,14 +160,16 @@ static int INIT get_next_block(struct bunzip_data *bd) int *base = NULL; int *limit = NULL; int dbufCount, nextSym, dbufSize, groupCount, selector, - i, j, k, t, runPos, symCount, symTotal, nSelectors, - byteCount[256]; - unsigned char uc, symToByte[256], mtfSymbol[256], *selectors; + i, j, k, t, runPos, symCount, symTotal, nSelectors, *byteCount; + unsigned char uc, *symToByte, *mtfSymbol, *selectors; unsigned int *dbuf, origPtr; dbuf = bd->dbuf; dbufSize = bd->dbufSize; selectors = bd->selectors; + byteCount = bd->byteCount; + symToByte = bd->symToByte; + mtfSymbol = bd->mtfSymbol; /* Read in header signature and CRC, then validate signature. (last block signature means CRC is for whole file, return now) */ diff --git a/lib/devres.c b/lib/devres.c index 49368608f98..6efddf53b90 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -328,7 +328,7 @@ EXPORT_SYMBOL(pcim_iomap_regions_request_all); * @pdev: PCI device to map IO resources for * @mask: Mask of BARs to unmap and release * - * Unamp and release regions specified by @mask. + * Unmap and release regions specified by @mask. */ void pcim_iounmap_regions(struct pci_dev *pdev, u16 mask) { diff --git a/lib/div64.c b/lib/div64.c index a111eb8de9c..5b491919177 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -77,26 +77,58 @@ s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) EXPORT_SYMBOL(div_s64_rem); #endif -/* 64bit divisor, dividend and result. dynamic precision */ +/** + * div64_u64 - unsigned 64bit divide with 64bit divisor + * @dividend: 64bit dividend + * @divisor: 64bit divisor + * + * This implementation is a modified version of the algorithm proposed + * by the book 'Hacker's Delight'. The original source and full proof + * can be found here and is available for use without restriction. 
+ * + * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c' + */ #ifndef div64_u64 u64 div64_u64(u64 dividend, u64 divisor) { - u32 high, d; + u32 high = divisor >> 32; + u64 quot; - high = divisor >> 32; - if (high) { - unsigned int shift = fls(high); + if (high == 0) { + quot = div_u64(dividend, divisor); + } else { + int n = 1 + fls(high); + quot = div_u64(dividend >> n, divisor >> n); - d = divisor >> shift; - dividend >>= shift; - } else - d = divisor; + if (quot != 0) + quot--; + if ((dividend - quot * divisor) >= divisor) + quot++; + } - return div_u64(dividend, d); + return quot; } EXPORT_SYMBOL(div64_u64); #endif +/** + * div64_s64 - signed 64bit divide with 64bit divisor + * @dividend: 64bit dividend + * @divisor: 64bit divisor + */ +#ifndef div64_s64 +s64 div64_s64(s64 dividend, s64 divisor) +{ + s64 quot, t; + + quot = div64_u64(abs64(dividend), abs64(divisor)); + t = (dividend ^ divisor) >> 63; + + return (quot ^ t) - t; +} +EXPORT_SYMBOL(div64_s64); +#endif + #endif /* BITS_PER_LONG == 32 */ /* diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 01e64270e24..4bfb0471f10 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -590,6 +590,7 @@ out_unlock: static const struct file_operations filter_fops = { .read = filter_read, .write = filter_write, + .llseek = default_llseek, }; static int dma_debug_fs_init(void) diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 02afc253372..3094318bfea 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -26,19 +26,11 @@ #include <linux/dynamic_debug.h> #include <linux/debugfs.h> #include <linux/slab.h> +#include <linux/jump_label.h> extern struct _ddebug __start___verbose[]; extern struct _ddebug __stop___verbose[]; -/* dynamic_debug_enabled, and dynamic_debug_enabled2 are bitmasks in which - * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They - * use independent hash functions, to reduce the chance of false positives. - */ -long long dynamic_debug_enabled; -EXPORT_SYMBOL_GPL(dynamic_debug_enabled); -long long dynamic_debug_enabled2; -EXPORT_SYMBOL_GPL(dynamic_debug_enabled2); - struct ddebug_table { struct list_head link; char *mod_name; @@ -88,26 +80,6 @@ static char *ddebug_describe_flags(struct _ddebug *dp, char *buf, } /* - * must be called with ddebug_lock held - */ - -static int disabled_hash(char hash, bool first_table) -{ - struct ddebug_table *dt; - char table_hash_value; - - list_for_each_entry(dt, &ddebug_tables, link) { - if (first_table) - table_hash_value = dt->ddebugs->primary_hash; - else - table_hash_value = dt->ddebugs->secondary_hash; - if (dt->num_enabled && (hash == table_hash_value)) - return 0; - } - return 1; -} - -/* * Search the tables for _ddebug's which match the given * `query' and apply the `flags' and `mask' to them. 
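The rewritten div64_u64() above normalizes both operands until the divisor fits in 32 bits, takes an estimate that is never too small and at most one too large, and then corrects it. A stand-alone userspace rendering of the same idea, using the gcc/clang __builtin_clz() in place of the kernel's fls(), may make the correction step easier to experiment with:

#include <stdint.h>
#include <stdio.h>

/* Userspace sketch of the normalize-and-correct division shown above. */
static uint64_t my_div64_u64(uint64_t dividend, uint64_t divisor)
{
	uint32_t high = divisor >> 32;
	uint64_t quot;
	int n;

	if (high == 0)
		return dividend / divisor;

	n = 33 - __builtin_clz(high);		/* == 1 + fls(high) */
	quot = (dividend >> n) / (divisor >> n);/* exact or one too large */
	if (quot != 0)
		quot--;				/* now exact or one too small */
	if (dividend - quot * divisor >= divisor)
		quot++;				/* final single correction */
	return quot;
}

int main(void)
{
	/* 0xffffffffffffffff / 0x100000001 == 0xffffffff exactly */
	printf("%llx\n", (unsigned long long)
	       my_div64_u64(0xffffffffffffffffULL, 0x100000001ULL));
	return 0;
}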
Tells * the user which ddebug's were changed, or whether none @@ -170,17 +142,9 @@ static void ddebug_change(const struct ddebug_query *query, dt->num_enabled++; dp->flags = newflags; if (newflags) { - dynamic_debug_enabled |= - (1LL << dp->primary_hash); - dynamic_debug_enabled2 |= - (1LL << dp->secondary_hash); + jump_label_enable(&dp->enabled); } else { - if (disabled_hash(dp->primary_hash, true)) - dynamic_debug_enabled &= - ~(1LL << dp->primary_hash); - if (disabled_hash(dp->secondary_hash, false)) - dynamic_debug_enabled2 &= - ~(1LL << dp->secondary_hash); + jump_label_disable(&dp->enabled); } if (verbose) printk(KERN_INFO @@ -429,6 +393,40 @@ static int ddebug_parse_flags(const char *str, unsigned int *flagsp, return 0; } +static int ddebug_exec_query(char *query_string) +{ + unsigned int flags = 0, mask = 0; + struct ddebug_query query; +#define MAXWORDS 9 + int nwords; + char *words[MAXWORDS]; + + nwords = ddebug_tokenize(query_string, words, MAXWORDS); + if (nwords <= 0) + return -EINVAL; + if (ddebug_parse_query(words, nwords-1, &query)) + return -EINVAL; + if (ddebug_parse_flags(words[nwords-1], &flags, &mask)) + return -EINVAL; + + /* actually go and implement the change */ + ddebug_change(&query, flags, mask); + return 0; +} + +static __initdata char ddebug_setup_string[1024]; +static __init int ddebug_setup_query(char *str) +{ + if (strlen(str) >= 1024) { + pr_warning("ddebug boot param string too large\n"); + return 0; + } + strcpy(ddebug_setup_string, str); + return 1; +} + +__setup("ddebug_query=", ddebug_setup_query); + /* * File_ops->write method for <debugfs>/dynamic_debug/conrol. Gathers the * command text from userspace, parses and executes it. @@ -436,12 +434,8 @@ static int ddebug_parse_flags(const char *str, unsigned int *flagsp, static ssize_t ddebug_proc_write(struct file *file, const char __user *ubuf, size_t len, loff_t *offp) { - unsigned int flags = 0, mask = 0; - struct ddebug_query query; -#define MAXWORDS 9 - int nwords; - char *words[MAXWORDS]; char tmpbuf[256]; + int ret; if (len == 0) return 0; @@ -455,16 +449,9 @@ static ssize_t ddebug_proc_write(struct file *file, const char __user *ubuf, printk(KERN_INFO "%s: read %d bytes from userspace\n", __func__, (int)len); - nwords = ddebug_tokenize(tmpbuf, words, MAXWORDS); - if (nwords <= 0) - return -EINVAL; - if (ddebug_parse_query(words, nwords-1, &query)) - return -EINVAL; - if (ddebug_parse_flags(words[nwords-1], &flags, &mask)) - return -EINVAL; - - /* actually go and implement the change */ - ddebug_change(&query, flags, mask); + ret = ddebug_exec_query(tmpbuf); + if (ret) + return ret; *offp += len; return len; @@ -725,13 +712,14 @@ static void ddebug_remove_all_tables(void) mutex_unlock(&ddebug_lock); } -static int __init dynamic_debug_init(void) +static __initdata int ddebug_init_success; + +static int __init dynamic_debug_init_debugfs(void) { struct dentry *dir, *file; - struct _ddebug *iter, *iter_start; - const char *modname = NULL; - int ret = 0; - int n = 0; + + if (!ddebug_init_success) + return -ENODEV; dir = debugfs_create_dir("dynamic_debug", NULL); if (!dir) @@ -742,6 +730,16 @@ static int __init dynamic_debug_init(void) debugfs_remove(dir); return -ENOMEM; } + return 0; +} + +static int __init dynamic_debug_init(void) +{ + struct _ddebug *iter, *iter_start; + const char *modname = NULL; + int ret = 0; + int n = 0; + if (__start___verbose != __stop___verbose) { iter = __start___verbose; modname = iter->modname; @@ -759,12 +757,26 @@ static int __init dynamic_debug_init(void) } ret = 
ddebug_add_module(iter_start, n, modname); } + + /* ddebug_query boot param got passed -> set it up */ + if (ddebug_setup_string[0] != '\0') { + ret = ddebug_exec_query(ddebug_setup_string); + if (ret) + pr_warning("Invalid ddebug boot param %s", + ddebug_setup_string); + else + pr_info("ddebug initialized with string %s", + ddebug_setup_string); + } + out_free: - if (ret) { + if (ret) ddebug_remove_all_tables(); - debugfs_remove(dir); - debugfs_remove(file); - } + else + ddebug_init_success = 1; return 0; } -module_init(dynamic_debug_init); +/* Allow early initialization for boot messages via boot param */ +arch_initcall(dynamic_debug_init); +/* Debugfs setup must be done later */ +module_init(dynamic_debug_init_debugfs); diff --git a/lib/flex_array.c b/lib/flex_array.c index 41b1804fa72..77a6fea7481 100644 --- a/lib/flex_array.c +++ b/lib/flex_array.c @@ -171,6 +171,8 @@ __fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags) * Note that this *copies* the contents of @src into * the array. If you are trying to store an array of * pointers, make sure to pass in &ptr instead of ptr. + * You may instead wish to use the flex_array_put_ptr() + * helper function. * * Locking must be provided by the caller. */ @@ -265,7 +267,8 @@ int flex_array_prealloc(struct flex_array *fa, unsigned int start, * * Returns a pointer to the data at index @element_nr. Note * that this is a copy of the data that was passed in. If you - * are using this to store pointers, you'll get back &ptr. + * are using this to store pointers, you'll get back &ptr. You + * may instead wish to use the flex_array_get_ptr helper. * * Locking must be provided by the caller. */ @@ -286,6 +289,26 @@ void *flex_array_get(struct flex_array *fa, unsigned int element_nr) return &part->elements[index_inside_part(fa, element_nr)]; } +/** + * flex_array_get_ptr - pull a ptr back out of the array + * @fa: the flex array from which to extract data + * @element_nr: index of the element to fetch from the array + * + * Returns the pointer placed in the flex array at element_nr using + * flex_array_put_ptr(). This function should not be called if the + * element in question was not set using the _put_ptr() helper. + */ +void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr) +{ + void **tmp; + + tmp = flex_array_get(fa, element_nr); + if (!tmp) + return NULL; + + return *tmp; +} + static int part_is_free(struct flex_array_part *part) { int i; diff --git a/lib/idr.c b/lib/idr.c index 7f1a4f0acf5..e15502e8b21 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -106,16 +106,17 @@ static void idr_mark_full(struct idr_layer **pa, int id) } /** - * idr_pre_get - reserver resources for idr allocation + * idr_pre_get - reserve resources for idr allocation * @idp: idr handle * @gfp_mask: memory allocation flags * - * This function should be called prior to locking and calling the - * idr_get_new* functions. It preallocates enough memory to satisfy - * the worst possible allocation. + * This function should be called prior to calling the idr_get_new* functions. + * It preallocates enough memory to satisfy the worst possible allocation. The + * caller should pass in GFP_KERNEL if possible. This of course requires that + * no spinning locks be held. * - * If the system is REALLY out of memory this function returns 0, - * otherwise 1. + * If the system is REALLY out of memory this function returns %0, + * otherwise %1. 
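The flex_array_put_ptr()/flex_array_get_ptr() helpers referenced above exist so callers storing plain pointers do not have to pass &ptr and dereference the returned slot by hand. A hypothetical caller, assuming the array was created with an element size of sizeof(void *):

#include <linux/flex_array.h>
#include <linux/slab.h>

static int store_name(struct flex_array *fa, unsigned int idx, const char *s)
{
	char *copy = kstrdup(s, GFP_KERNEL);

	if (!copy)
		return -ENOMEM;
	return flex_array_put_ptr(fa, idx, copy, GFP_KERNEL);	/* stores the pointer value */
}

static const char *load_name(struct flex_array *fa, unsigned int idx)
{
	return flex_array_get_ptr(fa, idx);	/* NULL if nothing was stored there */
}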
*/ int idr_pre_get(struct idr *idp, gfp_t gfp_mask) { @@ -284,17 +285,19 @@ static int idr_get_new_above_int(struct idr *idp, void *ptr, int starting_id) * idr_get_new_above - allocate new idr entry above or equal to a start id * @idp: idr handle * @ptr: pointer you want associated with the id - * @start_id: id to start search at + * @starting_id: id to start search at * @id: pointer to the allocated handle * * This is the allocate id function. It should be called with any * required locks. * - * If memory is required, it will return -EAGAIN, you should unlock - * and go back to the idr_pre_get() call. If the idr is full, it will - * return -ENOSPC. + * If allocation from IDR's private freelist fails, idr_get_new_above() will + * return %-EAGAIN. The caller should retry the idr_pre_get() call to refill + * IDR's preallocation and then retry the idr_get_new_above() call. + * + * If the idr is full idr_get_new_above() will return %-ENOSPC. * - * @id returns a value in the range @starting_id ... 0x7fffffff + * @id returns a value in the range @starting_id ... %0x7fffffff */ int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id) { @@ -318,14 +321,13 @@ EXPORT_SYMBOL(idr_get_new_above); * @ptr: pointer you want associated with the id * @id: pointer to the allocated handle * - * This is the allocate id function. It should be called with any - * required locks. + * If allocation from IDR's private freelist fails, idr_get_new_above() will + * return %-EAGAIN. The caller should retry the idr_pre_get() call to refill + * IDR's preallocation and then retry the idr_get_new_above() call. * - * If memory is required, it will return -EAGAIN, you should unlock - * and go back to the idr_pre_get() call. If the idr is full, it will - * return -ENOSPC. + * If the idr is full idr_get_new_above() will return %-ENOSPC. * - * @id returns a value in the range 0 ... 0x7fffffff + * @id returns a value in the range %0 ... %0x7fffffff */ int idr_get_new(struct idr *idp, void *ptr, int *id) { @@ -388,7 +390,7 @@ static void sub_remove(struct idr *idp, int shift, int id) } /** - * idr_remove - remove the given id and free it's slot + * idr_remove - remove the given id and free its slot * @idp: idr handle * @id: unique key */ @@ -437,7 +439,7 @@ EXPORT_SYMBOL(idr_remove); * function will remove all id mappings and leave all idp_layers * unused. * - * A typical clean-up sequence for objects stored in an idr tree, will + * A typical clean-up sequence for objects stored in an idr tree will * use idr_for_each() to free all objects, if necessay, then * idr_remove_all() to remove all ids, and idr_destroy() to free * up the cached idr_layers. @@ -479,7 +481,7 @@ EXPORT_SYMBOL(idr_remove_all); /** * idr_destroy - release all cached layers within an idr tree - * idp: idr handle + * @idp: idr handle */ void idr_destroy(struct idr *idp) { @@ -542,7 +544,7 @@ EXPORT_SYMBOL(idr_find); * not allowed. * * We check the return of @fn each time. If it returns anything other - * than 0, we break out and return that value. + * than %0, we break out and return that value. * * The caller must serialize idr_for_each() vs idr_get_new() and idr_remove(). */ @@ -586,10 +588,11 @@ EXPORT_SYMBOL(idr_for_each); /** * idr_get_next - lookup next object of id to given id. * @idp: idr handle - * @id: pointer to lookup key + * @nextidp: pointer to lookup key * * Returns pointer to registered object with id, which is next number to - * given id. + * given id. 
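The reworked idr kernel-doc above spells out the preallocate/allocate/retry protocol. A minimal sketch of a caller following it (the idr, lock and helper names are made up):

#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDR(my_idr);
static DEFINE_SPINLOCK(my_lock);

static int assign_id(void *obj)
{
	int id, err;

again:
	if (!idr_pre_get(&my_idr, GFP_KERNEL))	/* preallocate without the lock held */
		return -ENOMEM;

	spin_lock(&my_lock);
	err = idr_get_new(&my_idr, obj, &id);
	spin_unlock(&my_lock);

	if (err == -EAGAIN)			/* preallocation raced away: refill and retry */
		goto again;
	return err ? err : id;			/* id is in 0 ... 0x7fffffff */
}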
After being looked up, *@nextidp will be updated for the next + * iteration. */ void *idr_get_next(struct idr *idp, int *nextidp) @@ -636,8 +639,8 @@ EXPORT_SYMBOL(idr_get_next); * @id: lookup key * * Replace the pointer registered with an id and return the old value. - * A -ENOENT return indicates that @id was not found. - * A -EINVAL return indicates that @id was not within valid constraints. + * A %-ENOENT return indicates that @id was not found. + * A %-EINVAL return indicates that @id was not within valid constraints. * * The caller must serialize with writers. */ @@ -695,10 +698,11 @@ void idr_init(struct idr *idp) EXPORT_SYMBOL(idr_init); -/* +/** + * DOC: IDA description * IDA - IDR based ID allocator * - * this is id allocator without id -> pointer translation. Memory + * This is id allocator without id -> pointer translation. Memory * usage is much lower than full blown idr because each id only * occupies a bit. ida uses a custom leaf node which contains * IDA_BITMAP_BITS slots. @@ -731,8 +735,8 @@ static void free_bitmap(struct ida *ida, struct ida_bitmap *bitmap) * following function. It preallocates enough memory to satisfy the * worst possible allocation. * - * If the system is REALLY out of memory this function returns 0, - * otherwise 1. + * If the system is REALLY out of memory this function returns %0, + * otherwise %1. */ int ida_pre_get(struct ida *ida, gfp_t gfp_mask) { @@ -758,17 +762,17 @@ EXPORT_SYMBOL(ida_pre_get); /** * ida_get_new_above - allocate new ID above or equal to a start id * @ida: ida handle - * @staring_id: id to start search at + * @starting_id: id to start search at * @p_id: pointer to the allocated handle * * Allocate new ID above or equal to @ida. It should be called with * any required locks. * - * If memory is required, it will return -EAGAIN, you should unlock + * If memory is required, it will return %-EAGAIN, you should unlock * and go back to the ida_pre_get() call. If the ida is full, it will - * return -ENOSPC. + * return %-ENOSPC. * - * @p_id returns a value in the range @starting_id ... 0x7fffffff. + * @p_id returns a value in the range @starting_id ... %0x7fffffff. */ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) { @@ -850,11 +854,11 @@ EXPORT_SYMBOL(ida_get_new_above); * * Allocate new ID. It should be called with any required locks. * - * If memory is required, it will return -EAGAIN, you should unlock + * If memory is required, it will return %-EAGAIN, you should unlock * and go back to the idr_pre_get() call. If the idr is full, it will - * return -ENOSPC. + * return %-ENOSPC. * - * @id returns a value in the range 0 ... 0x7fffffff. + * @id returns a value in the range %0 ... %0x7fffffff. */ int ida_get_new(struct ida *ida, int *p_id) { @@ -912,7 +916,7 @@ EXPORT_SYMBOL(ida_remove); /** * ida_destroy - release all cached layers within an ida tree - * ida: ida handle + * @ida: ida handle */ void ida_destroy(struct ida *ida) { diff --git a/lib/inflate.c b/lib/inflate.c index 677b738c220..013a7619348 100644 --- a/lib/inflate.c +++ b/lib/inflate.c @@ -103,7 +103,9 @@ the two sets of lengths. 
*/ #include <linux/compiler.h> +#ifdef NO_INFLATE_MALLOC #include <linux/slab.h> +#endif #ifdef RCSID static char rcsid[] = "#Id: inflate.c,v 0.14 1993/06/10 13:27:04 jloup Exp #"; diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index c0251f4ad08..da053313ee5 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -38,12 +38,3 @@ again: return -1; } EXPORT_SYMBOL(iommu_area_alloc); - -unsigned long iommu_num_pages(unsigned long addr, unsigned long len, - unsigned long io_page_size) -{ - unsigned long size = (addr & (io_page_size - 1)) + len; - - return DIV_ROUND_UP(size, io_page_size); -} -EXPORT_SYMBOL(iommu_num_pages); diff --git a/lib/ioremap.c b/lib/ioremap.c index 14c6078f17a..5730ecd3eb6 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -13,10 +13,10 @@ #include <asm/pgtable.h> static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pte_t *pte; - unsigned long pfn; + u64 pfn; pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel(pmd, addr); @@ -31,7 +31,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, } static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pmd_t *pmd; unsigned long next; @@ -49,7 +49,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, } static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pud_t *pud; unsigned long next; @@ -67,7 +67,7 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, } int ioremap_page_range(unsigned long addr, - unsigned long end, unsigned long phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pgd_t *pgd; unsigned long start; diff --git a/lib/kobject.c b/lib/kobject.c index f07c57252e8..82dc34c095c 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -746,17 +746,56 @@ void kset_unregister(struct kset *k) */ struct kobject *kset_find_obj(struct kset *kset, const char *name) { + return kset_find_obj_hinted(kset, name, NULL); +} + +/** + * kset_find_obj_hinted - search for object in kset given a predecessor hint. + * @kset: kset we're looking in. + * @name: object's name. + * @hint: hint to possible object's predecessor. + * + * Check the hint's next object and if it is a match return it directly, + * otherwise, fall back to the behavior of kset_find_obj(). Either way + * a reference for the returned object is held and the reference on the + * hinted object is released. 
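kset_find_obj_hinted(), documented above, has slightly unusual reference semantics: it always releases the reference held on the hint and returns its own reference on the result. A hypothetical caller ("sda"/"sda1" are only examples):

#include <linux/kobject.h>

static struct kobject *find_sda1(struct kset *kset)
{
	struct kobject *prev = kset_find_obj(kset, "sda");		  /* takes a reference (or NULL) */
	struct kobject *next = kset_find_obj_hinted(kset, "sda1", prev);  /* drops prev's reference */

	/* only next (if non-NULL) is held now; the caller must kobject_put() it when done */
	return next;
}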
+ */ +struct kobject *kset_find_obj_hinted(struct kset *kset, const char *name, + struct kobject *hint) +{ struct kobject *k; struct kobject *ret = NULL; spin_lock(&kset->list_lock); + + if (!hint) + goto slow_search; + + /* end of list detection */ + if (hint->entry.next == kset->list.next) + goto slow_search; + + k = container_of(hint->entry.next, struct kobject, entry); + if (!kobject_name(k) || strcmp(kobject_name(k), name)) + goto slow_search; + + ret = kobject_get(k); + goto unlock_exit; + +slow_search: list_for_each_entry(k, &kset->list, entry) { if (kobject_name(k) && !strcmp(kobject_name(k), name)) { ret = kobject_get(k); break; } } + +unlock_exit: spin_unlock(&kset->list_lock); + + if (hint) + kobject_put(hint); + return ret; } diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index b93579504df..70af0a7f97c 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -123,7 +123,7 @@ static int kobj_usermode_filter(struct kobject *kobj) * @kobj: struct kobject that the action is happening to * @envp_ext: pointer to environmental data * - * Returns 0 if kobject_uevent() is completed with success or the + * Returns 0 if kobject_uevent_env() is completed with success or the * corresponding error when it fails. */ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, @@ -317,7 +317,7 @@ exit: EXPORT_SYMBOL_GPL(kobject_uevent_env); /** - * kobject_uevent - notify userspace by ending an uevent + * kobject_uevent - notify userspace by sending an uevent * * @action: action that is happening * @kobj: struct kobject that the action is happening to diff --git a/lib/list_debug.c b/lib/list_debug.c index 1a39f4e3ae1..344c710d16c 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -43,6 +43,12 @@ EXPORT_SYMBOL(__list_add); */ void list_del(struct list_head *entry) { + WARN(entry->next == LIST_POISON1, + "list_del corruption, next is LIST_POISON1 (%p)\n", + LIST_POISON1); + WARN(entry->next != LIST_POISON1 && entry->prev == LIST_POISON2, + "list_del corruption, prev is LIST_POISON2 (%p)\n", + LIST_POISON2); WARN(entry->prev->next != entry, "list_del corruption. prev->next should be %p, " "but was %p\n", entry, entry->prev->next); diff --git a/lib/list_sort.c b/lib/list_sort.c index 4b5cb794c38..d7325c6b103 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -70,7 +70,7 @@ static void merge_and_restore_back_links(void *priv, * element comparison is needed, so the client's cmp() * routine can invoke cond_resched() periodically. */ - (*cmp)(priv, tail, tail); + (*cmp)(priv, tail->next, tail->next); tail->next->prev = tail; tail = tail->next; @@ -141,77 +141,151 @@ void list_sort(void *priv, struct list_head *head, } EXPORT_SYMBOL(list_sort); -#ifdef DEBUG_LIST_SORT +#ifdef CONFIG_TEST_LIST_SORT + +#include <linux/random.h> + +/* + * The pattern of set bits in the list length determines which cases + * are hit in list_sort(). 
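The new TEST_LIST_SORT self-test below exercises list_sort() with poisoned, serial-numbered elements. For context, a minimal ordinary caller of list_sort() looks roughly like this (struct item and item_cmp are illustrative names):

#include <linux/list.h>
#include <linux/list_sort.h>

struct item {
	struct list_head node;
	int key;
};

/* Return <0, 0 or >0 like a classic comparator; list_sort() is stable,
 * so elements comparing equal keep their original relative order. */
static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	return container_of(a, struct item, node)->key -
	       container_of(b, struct item, node)->key;
}

static void sort_items(struct list_head *items)
{
	list_sort(NULL, items, item_cmp);
}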
+ */ +#define TEST_LIST_LEN (512+128+2) /* not including head */ + +#define TEST_POISON1 0xDEADBEEF +#define TEST_POISON2 0xA324354C + struct debug_el { - struct list_head l_h; + unsigned int poison1; + struct list_head list; + unsigned int poison2; int value; unsigned serial; }; -static int cmp(void *priv, struct list_head *a, struct list_head *b) +/* Array, containing pointers to all elements in the test list */ +static struct debug_el **elts __initdata; + +static int __init check(struct debug_el *ela, struct debug_el *elb) { - return container_of(a, struct debug_el, l_h)->value - - container_of(b, struct debug_el, l_h)->value; + if (ela->serial >= TEST_LIST_LEN) { + printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", + ela->serial); + return -EINVAL; + } + if (elb->serial >= TEST_LIST_LEN) { + printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", + elb->serial); + return -EINVAL; + } + if (elts[ela->serial] != ela || elts[elb->serial] != elb) { + printk(KERN_ERR "list_sort_test: error: phantom element\n"); + return -EINVAL; + } + if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { + printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); + return -EINVAL; + } + if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { + printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); + return -EINVAL; + } + return 0; } -/* - * The pattern of set bits in the list length determines which cases - * are hit in list_sort(). - */ -#define LIST_SORT_TEST_LENGTH (512+128+2) /* not including head */ +static int __init cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct debug_el *ela, *elb; + + ela = container_of(a, struct debug_el, list); + elb = container_of(b, struct debug_el, list); + + check(ela, elb); + return ela->value - elb->value; +} static int __init list_sort_test(void) { - int i, r = 1, count; - struct list_head *head = kmalloc(sizeof(*head), GFP_KERNEL); - struct list_head *cur; + int i, count = 1, err = -EINVAL; + struct debug_el *el; + struct list_head *cur, *tmp; + LIST_HEAD(head); + + printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); - printk(KERN_WARNING "testing list_sort()\n"); + elts = kmalloc(sizeof(void *) * TEST_LIST_LEN, GFP_KERNEL); + if (!elts) { + printk(KERN_ERR "list_sort_test: error: cannot allocate " + "memory\n"); + goto exit; + } - cur = head; - for (i = 0; i < LIST_SORT_TEST_LENGTH; i++) { - struct debug_el *el = kmalloc(sizeof(*el), GFP_KERNEL); - BUG_ON(!el); + for (i = 0; i < TEST_LIST_LEN; i++) { + el = kmalloc(sizeof(*el), GFP_KERNEL); + if (!el) { + printk(KERN_ERR "list_sort_test: error: cannot " + "allocate memory\n"); + goto exit; + } /* force some equivalencies */ - el->value = (r = (r * 725861) % 6599) % (LIST_SORT_TEST_LENGTH/3); + el->value = random32() % (TEST_LIST_LEN/3); el->serial = i; - - el->l_h.prev = cur; - cur->next = &el->l_h; - cur = cur->next; + el->poison1 = TEST_POISON1; + el->poison2 = TEST_POISON2; + elts[i] = el; + list_add_tail(&el->list, &head); } - head->prev = cur; - list_sort(NULL, head, cmp); + list_sort(NULL, &head, cmp); + + for (cur = head.next; cur->next != &head; cur = cur->next) { + struct debug_el *el1; + int cmp_result; - count = 1; - for (cur = head->next; cur->next != head; cur = cur->next) { - struct debug_el *el = container_of(cur, struct debug_el, l_h); - int cmp_result = cmp(NULL, cur, cur->next); if (cur->next->prev != cur) { - printk(KERN_EMERG "list_sort() 
returned " - "a corrupted list!\n"); - return 1; - } else if (cmp_result > 0) { - printk(KERN_EMERG "list_sort() failed to sort!\n"); - return 1; - } else if (cmp_result == 0 && - el->serial >= container_of(cur->next, - struct debug_el, l_h)->serial) { - printk(KERN_EMERG "list_sort() failed to preserve order" - " of equivalent elements!\n"); - return 1; + printk(KERN_ERR "list_sort_test: error: list is " + "corrupted\n"); + goto exit; + } + + cmp_result = cmp(NULL, cur, cur->next); + if (cmp_result > 0) { + printk(KERN_ERR "list_sort_test: error: list is not " + "sorted\n"); + goto exit; + } + + el = container_of(cur, struct debug_el, list); + el1 = container_of(cur->next, struct debug_el, list); + if (cmp_result == 0 && el->serial >= el1->serial) { + printk(KERN_ERR "list_sort_test: error: order of " + "equivalent elements not preserved\n"); + goto exit; + } + + if (check(el, el1)) { + printk(KERN_ERR "list_sort_test: error: element check " + "failed\n"); + goto exit; } - kfree(cur->prev); count++; } - kfree(cur); - if (count != LIST_SORT_TEST_LENGTH) { - printk(KERN_EMERG "list_sort() returned list of" - "different length!\n"); - return 1; + + if (count != TEST_LIST_LEN) { + printk(KERN_ERR "list_sort_test: error: bad list length %d", + count); + goto exit; } - return 0; + + err = 0; +exit: + kfree(elts); + list_for_each_safe(cur, tmp, &head) { + list_del(cur); + kfree(container_of(cur, struct debug_el, list)); + } + return err; } module_init(list_sort_test); -#endif +#endif /* CONFIG_TEST_LIST_SORT */ diff --git a/lib/parser.c b/lib/parser.c index fb34977246b..6e89eca5cca 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -128,12 +128,13 @@ static int match_number(substring_t *s, int *result, int base) char *endp; char *buf; int ret; + size_t len = s->to - s->from; - buf = kmalloc(s->to - s->from + 1, GFP_KERNEL); + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; - memcpy(buf, s->from, s->to - s->from); - buf[s->to - s->from] = '\0'; + memcpy(buf, s->from, len); + buf[len] = '\0'; *result = simple_strtol(buf, &endp, base); ret = 0; if (endp == buf) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index aeaa6d73444..604678d7d06 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -8,10 +8,53 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> +#include <linux/debugobjects.h> static LIST_HEAD(percpu_counters); static DEFINE_MUTEX(percpu_counters_lock); +#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER + +static struct debug_obj_descr percpu_counter_debug_descr; + +static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state) +{ + struct percpu_counter *fbc = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + percpu_counter_destroy(fbc); + debug_object_free(fbc, &percpu_counter_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr percpu_counter_debug_descr = { + .name = "percpu_counter", + .fixup_free = percpu_counter_fixup_free, +}; + +static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) +{ + debug_object_init(fbc, &percpu_counter_debug_descr); + debug_object_activate(fbc, &percpu_counter_debug_descr); +} + +static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) +{ + debug_object_deactivate(fbc, &percpu_counter_debug_descr); + debug_object_free(fbc, &percpu_counter_debug_descr); +} + +#else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ +static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) +{ } +static inline void 
debug_percpu_counter_deactivate(struct percpu_counter *fbc) +{ } +#endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ + void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; @@ -30,9 +73,9 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; s32 *pcount; - int cpu = get_cpu(); - pcount = per_cpu_ptr(fbc->counters, cpu); + preempt_disable(); + pcount = this_cpu_ptr(fbc->counters); count = *pcount + amount; if (count >= batch || count <= -batch) { spin_lock(&fbc->lock); @@ -42,7 +85,7 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) } else { *pcount = count; } - put_cpu(); + preempt_enable(); } EXPORT_SYMBOL(__percpu_counter_add); @@ -75,7 +118,11 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, fbc->counters = alloc_percpu(s32); if (!fbc->counters) return -ENOMEM; + + debug_percpu_counter_activate(fbc); + #ifdef CONFIG_HOTPLUG_CPU + INIT_LIST_HEAD(&fbc->list); mutex_lock(&percpu_counters_lock); list_add(&fbc->list, &percpu_counters); mutex_unlock(&percpu_counters_lock); @@ -89,6 +136,8 @@ void percpu_counter_destroy(struct percpu_counter *fbc) if (!fbc->counters) return; + debug_percpu_counter_deactivate(fbc); + #ifdef CONFIG_HOTPLUG_CPU mutex_lock(&percpu_counters_lock); list_del(&fbc->list); @@ -137,6 +186,33 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb, return NOTIFY_OK; } +/* + * Compare counter against given value. + * Return 1 if greater, 0 if equal and -1 if less + */ +int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) +{ + s64 count; + + count = percpu_counter_read(fbc); + /* Check to see if rough count will be sufficient for comparison */ + if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) { + if (count > rhs) + return 1; + else + return -1; + } + /* Need to use precise count */ + count = percpu_counter_sum(fbc); + if (count > rhs) + return 1; + else if (count < rhs) + return -1; + else + return 0; +} +EXPORT_SYMBOL(percpu_counter_compare); + static int __init percpu_counter_startup(void) { compute_batch_value(); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 05da38bcc29..6f412ab4c24 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -49,7 +49,7 @@ struct radix_tree_node { unsigned int height; /* Height from the bottom */ unsigned int count; struct rcu_head rcu_head; - void *slots[RADIX_TREE_MAP_SIZE]; + void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; @@ -174,14 +174,16 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); + int i; /* * must only free zeroed nodes into the slab. radix_tree_shrink * can leave us with a non-NULL entry in the first slot, so clear * that here to make sure. 
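percpu_counter_compare(), added below, only pays for an exact percpu sum when the approximate count is within one batch per CPU of the value being compared against. A hypothetical user in the spirit of a filesystem free-blocks check:

#include <linux/percpu_counter.h>

static bool has_free_blocks(struct percpu_counter *free_blocks, s64 needed)
{
	/* cheap read when the answer is clear-cut, exact sum only near the threshold */
	return percpu_counter_compare(free_blocks, needed) >= 0;
}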
*/ - tag_clear(node, 0, 0); - tag_clear(node, 1, 0); + for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) + tag_clear(node, i, 0); + node->slots[0] = NULL; node->count = 0; @@ -609,6 +611,134 @@ int radix_tree_tag_get(struct radix_tree_root *root, EXPORT_SYMBOL(radix_tree_tag_get); /** + * radix_tree_range_tag_if_tagged - for each item in given range set given + * tag if item has another tag set + * @root: radix tree root + * @first_indexp: pointer to a starting index of a range to scan + * @last_index: last index of a range to scan + * @nr_to_tag: maximum number items to tag + * @iftag: tag index to test + * @settag: tag index to set if tested tag is set + * + * This function scans range of radix tree from first_index to last_index + * (inclusive). For each item in the range if iftag is set, the function sets + * also settag. The function stops either after tagging nr_to_tag items or + * after reaching last_index. + * + * The tags must be set from the leaf level only and propagated back up the + * path to the root. We must do this so that we resolve the full path before + * setting any tags on intermediate nodes. If we set tags as we descend, then + * we can get to the leaf node and find that the index that has the iftag + * set is outside the range we are scanning. This reults in dangling tags and + * can lead to problems with later tag operations (e.g. livelocks on lookups). + * + * The function returns number of leaves where the tag was set and sets + * *first_indexp to the first unscanned index. + * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must + * be prepared to handle that. + */ +unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, + unsigned long *first_indexp, unsigned long last_index, + unsigned long nr_to_tag, + unsigned int iftag, unsigned int settag) +{ + unsigned int height = root->height; + struct radix_tree_path path[height]; + struct radix_tree_path *pathp = path; + struct radix_tree_node *slot; + unsigned int shift; + unsigned long tagged = 0; + unsigned long index = *first_indexp; + + last_index = min(last_index, radix_tree_maxindex(height)); + if (index > last_index) + return 0; + if (!nr_to_tag) + return 0; + if (!root_tag_get(root, iftag)) { + *first_indexp = last_index + 1; + return 0; + } + if (height == 0) { + *first_indexp = last_index + 1; + root_tag_set(root, settag); + return 1; + } + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = radix_tree_indirect_to_ptr(root->rnode); + + /* + * we fill the path from (root->height - 2) to 0, leaving the index at + * (root->height - 1) as a terminator. Zero the node in the terminator + * so that we can use this to end walk loops back up the path. 
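radix_tree_range_tag_if_tagged() above is meant to be driven in batches, restarting each call from the index the previous one stopped at. A sketch of such a loop in the spirit of pagecache writeback tagging (the tag names, batch size and the omitted tree locking are assumptions, not part of this patch):

#include <linux/radix-tree.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

static void tag_range_for_writeback(struct radix_tree_root *root,
				    unsigned long start, unsigned long end)
{
	unsigned long tagged;

	do {
		/* the caller is expected to hold the tree lock around this call */
		tagged = radix_tree_range_tag_if_tagged(root, &start, end,
				128,	/* batch size */
				PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
		cond_resched();
		/* start now holds the first unscanned index and may have
		 * wrapped to 0 when end == ULONG_MAX */
	} while (tagged && start && start <= end);
}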
+ */ + path[height - 1].node = NULL; + + for (;;) { + int offset; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + if (!slot->slots[offset]) + goto next; + if (!tag_get(slot, iftag, offset)) + goto next; + if (height > 1) { + /* Go down one level */ + height--; + shift -= RADIX_TREE_MAP_SHIFT; + path[height - 1].node = slot; + path[height - 1].offset = offset; + slot = slot->slots[offset]; + continue; + } + + /* tag the leaf */ + tagged++; + tag_set(slot, settag, offset); + + /* walk back up the path tagging interior nodes */ + pathp = &path[0]; + while (pathp->node) { + /* stop if we find a node with the tag already set */ + if (tag_get(pathp->node, settag, pathp->offset)) + break; + tag_set(pathp->node, settag, pathp->offset); + pathp++; + } + +next: + /* Go to next item at level determined by 'shift' */ + index = ((index >> shift) + 1) << shift; + /* Overflow can happen when last_index is ~0UL... */ + if (index > last_index || !index) + break; + if (tagged >= nr_to_tag) + break; + while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) { + /* + * We've fully scanned this node. Go up. Because + * last_index is guaranteed to be in the tree, what + * we do below cannot wander astray. + */ + slot = path[height - 1].node; + height++; + shift += RADIX_TREE_MAP_SHIFT; + } + } + /* + * The iftag must have been set somewhere because otherwise + * we would return immediated at the beginning of the function + */ + root_tag_set(root, settag); + *first_indexp = index; + + return tagged; +} +EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); + + +/** * radix_tree_next_hole - find the next hole (not-present entry) * @root: tree root * @index: index key diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore new file mode 100644 index 00000000000..162becacf97 --- /dev/null +++ b/lib/raid6/.gitignore @@ -0,0 +1,4 @@ +mktables +altivec*.c +int*.c +tables.c diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile new file mode 100644 index 00000000000..8a38102770f --- /dev/null +++ b/lib/raid6/Makefile @@ -0,0 +1,75 @@ +obj-$(CONFIG_RAID6_PQ) += raid6_pq.o + +raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ + int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ + altivec8.o mmx.o sse1.o sse2.o +hostprogs-y += mktables + +quiet_cmd_unroll = UNROLL $@ + cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \ + < $< > $@ || ( rm -f $@ && exit 1 ) + +ifeq ($(CONFIG_ALTIVEC),y) +altivec_flags := -maltivec -mabi=altivec +endif + +targets += int1.c +$(obj)/int1.c: UNROLL := 1 +$(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int2.c +$(obj)/int2.c: UNROLL := 2 +$(obj)/int2.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int4.c +$(obj)/int4.c: UNROLL := 4 +$(obj)/int4.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int8.c +$(obj)/int8.c: UNROLL := 8 +$(obj)/int8.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int16.c +$(obj)/int16.c: UNROLL := 16 +$(obj)/int16.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int32.c +$(obj)/int32.c: UNROLL := 32 +$(obj)/int32.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec1.o += $(altivec_flags) +targets += altivec1.c +$(obj)/altivec1.c: UNROLL := 1 +$(obj)/altivec1.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec2.o += $(altivec_flags) +targets += altivec2.c +$(obj)/altivec2.c: UNROLL := 2 
+$(obj)/altivec2.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec4.o += $(altivec_flags) +targets += altivec4.c +$(obj)/altivec4.c: UNROLL := 4 +$(obj)/altivec4.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec8.o += $(altivec_flags) +targets += altivec8.c +$(obj)/altivec8.c: UNROLL := 8 +$(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +quiet_cmd_mktable = TABLE $@ + cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) + +targets += tables.c +$(obj)/tables.c: $(obj)/mktables FORCE + $(call if_changed,mktable) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c new file mode 100644 index 00000000000..b595f560bee --- /dev/null +++ b/lib/raid6/algos.c @@ -0,0 +1,154 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/algos.c + * + * Algorithm list and algorithm selection for RAID-6 + */ + +#include <linux/raid/pq.h> +#ifndef __KERNEL__ +#include <sys/mman.h> +#include <stdio.h> +#else +#include <linux/gfp.h> +#if !RAID6_USE_EMPTY_ZERO_PAGE +/* In .bss so it's zeroed */ +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +EXPORT_SYMBOL(raid6_empty_zero_page); +#endif +#endif + +struct raid6_calls raid6_call; +EXPORT_SYMBOL_GPL(raid6_call); + +const struct raid6_calls * const raid6_algos[] = { + &raid6_intx1, + &raid6_intx2, + &raid6_intx4, + &raid6_intx8, +#if defined(__ia64__) + &raid6_intx16, + &raid6_intx32, +#endif +#if defined(__i386__) && !defined(__arch_um__) + &raid6_mmxx1, + &raid6_mmxx2, + &raid6_sse1x1, + &raid6_sse1x2, + &raid6_sse2x1, + &raid6_sse2x2, +#endif +#if defined(__x86_64__) && !defined(__arch_um__) + &raid6_sse2x1, + &raid6_sse2x2, + &raid6_sse2x4, +#endif +#ifdef CONFIG_ALTIVEC + &raid6_altivec1, + &raid6_altivec2, + &raid6_altivec4, + &raid6_altivec8, +#endif + NULL +}; + +#ifdef __KERNEL__ +#define RAID6_TIME_JIFFIES_LG2 4 +#else +/* Need more time to be stable in userspace */ +#define RAID6_TIME_JIFFIES_LG2 9 +#define time_before(x, y) ((x) < (y)) +#endif + +/* Try to pick the best algorithm */ +/* This code uses the gfmul table as convenient data set to abuse */ + +int __init raid6_select_algo(void) +{ + const struct raid6_calls * const * algo; + const struct raid6_calls * best; + char *syndromes; + void *dptrs[(65536/PAGE_SIZE)+2]; + int i, disks; + unsigned long perf, bestperf; + int bestprefer; + unsigned long j0, j1; + + disks = (65536/PAGE_SIZE)+2; + for ( i = 0 ; i < disks-2 ; i++ ) { + dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; + } + + /* Normal code - use a 2-page allocation to avoid D$ conflict */ + syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); + + if ( !syndromes ) { + printk("raid6: Yikes! 
No memory available.\n"); + return -ENOMEM; + } + + dptrs[disks-2] = syndromes; + dptrs[disks-1] = syndromes + PAGE_SIZE; + + bestperf = 0; bestprefer = 0; best = NULL; + + for ( algo = raid6_algos ; *algo ; algo++ ) { + if ( !(*algo)->valid || (*algo)->valid() ) { + perf = 0; + + preempt_disable(); + j0 = jiffies; + while ( (j1 = jiffies) == j0 ) + cpu_relax(); + while (time_before(jiffies, + j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { + (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); + perf++; + } + preempt_enable(); + + if ( (*algo)->prefer > bestprefer || + ((*algo)->prefer == bestprefer && + perf > bestperf) ) { + best = *algo; + bestprefer = best->prefer; + bestperf = perf; + } + printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, + (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + } + } + + if (best) { + printk("raid6: using algorithm %s (%ld MB/s)\n", + best->name, + (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + raid6_call = *best; + } else + printk("raid6: Yikes! No algorithm found!\n"); + + free_pages((unsigned long)syndromes, 1); + + return best ? 0 : -EINVAL; +} + +static void raid6_exit(void) +{ + do { } while (0); +} + +subsys_initcall(raid6_select_algo); +module_exit(raid6_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID6 Q-syndrome calculations"); diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc new file mode 100644 index 00000000000..2654d5c854b --- /dev/null +++ b/lib/raid6/altivec.uc @@ -0,0 +1,130 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6altivec$#.c + * + * $#-way unrolled portable integer math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + * + * <benh> hpa: in process, + * you can just "steal" the vec unit with enable_kernel_altivec() (but + * bracked this with preempt_disable/enable or in a lock) + */ + +#include <linux/raid/pq.h> + +#ifdef CONFIG_ALTIVEC + +#include <altivec.h> +#ifdef __KERNEL__ +# include <asm/system.h> +# include <asm/cputable.h> +#endif + +/* + * This is the C data type to use. We use a vector of + * signed char so vec_cmpgt() will generate the right + * instruction. + */ + +typedef vector signed char unative_t; + +#define NBYTES(x) ((vector signed char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) +#define NSIZE sizeof(unative_t) + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ + return vec_add(v,v); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. 
+ */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ + unative_t zv = NBYTES(0); + + /* vec_cmpgt returns a vector bool char; thus the need for the cast */ + return (unative_t)vec_cmpgt(zv, v); +} + + +/* This is noinline to make damned sure that gcc doesn't move any of the + Altivec code around the enable/disable code */ +static void noinline +raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + unative_t x1d = NBYTES(0x1d); + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ = vec_xor(wp$$, wd$$); + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ = vec_and(w2$$, x1d); + w1$$ = vec_xor(w1$$, w2$$); + wq$$ = vec_xor(w1$$, wd$$); + } + *(unative_t *)&p[d+NSIZE*$$] = wp$$; + *(unative_t *)&q[d+NSIZE*$$] = wq$$; + } +} + +static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + preempt_disable(); + enable_kernel_altivec(); + + raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs); + + preempt_enable(); +} + +int raid6_have_altivec(void); +#if $# == 1 +int raid6_have_altivec(void) +{ + /* This assumes either all CPUs have Altivec or none does */ +# ifdef __KERNEL__ + return cpu_has_feature(CPU_FTR_ALTIVEC); +# else + return 1; +# endif +} +#endif + +const struct raid6_calls raid6_altivec$# = { + raid6_altivec$#_gen_syndrome, + raid6_have_altivec, + "altivecx$#", + 0 +}; + +#endif /* CONFIG_ALTIVEC */ diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc new file mode 100644 index 00000000000..d1e276a14fa --- /dev/null +++ b/lib/raid6/int.uc @@ -0,0 +1,117 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6int$#.c + * + * $#-way unrolled portable integer math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + */ + +#include <linux/raid/pq.h> + +/* + * This is the C data type to use + */ + +/* Change this from BITS_PER_LONG if there is something better... */ +#if BITS_PER_LONG == 64 +# define NBYTES(x) ((x) * 0x0101010101010101UL) +# define NSIZE 8 +# define NSHIFT 3 +# define NSTRING "64" +typedef u64 unative_t; +#else +# define NBYTES(x) ((x) * 0x01010101U) +# define NSIZE 4 +# define NSHIFT 2 +# define NSTRING "32" +typedef u32 unative_t; +#endif + + + +/* + * IA-64 wants insane amounts of unrolling. On other architectures that + * is just a waste of space. + */ +#if ($# <= 8) || defined(__ia64__) + + +/* + * These sub-operations are separate inlines since they can sometimes be + * specially optimized using architecture-specific hacks. 
+ */ + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ + unative_t vv; + + vv = (v << 1) & NBYTES(0xfe); + return vv; +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ + unative_t vv; + + vv = v & NBYTES(0x80); + vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */ + return vv; +} + + +static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ ^= wd$$; + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + w1$$ ^= w2$$; + wq$$ = w1$$ ^ wd$$; + } + *(unative_t *)&p[d+NSIZE*$$] = wp$$; + *(unative_t *)&q[d+NSIZE*$$] = wq$$; + } +} + +const struct raid6_calls raid6_intx$# = { + raid6_int$#_gen_syndrome, + NULL, /* always valid */ + "int" NSTRING "x$#", + 0 +}; + +#endif diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c new file mode 100644 index 00000000000..3b1500843bb --- /dev/null +++ b/lib/raid6/mktables.c @@ -0,0 +1,132 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * mktables.c + * + * Make RAID-6 tables. This is a host user space program to be run at + * compile time. + */ + +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#include <stdlib.h> +#include <time.h> + +static uint8_t gfmul(uint8_t a, uint8_t b) +{ + uint8_t v = 0; + + while (b) { + if (b & 1) + v ^= a; + a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); + b >>= 1; + } + + return v; +} + +static uint8_t gfpow(uint8_t a, int b) +{ + uint8_t v = 1; + + b %= 255; + if (b < 0) + b += 255; + + while (b) { + if (b & 1) + v = gfmul(v, a); + a = gfmul(a, a); + b >>= 1; + } + + return v; +} + +int main(int argc, char *argv[]) +{ + int i, j, k; + uint8_t v; + uint8_t exptbl[256], invtbl[256]; + + printf("#include <linux/raid/pq.h>\n"); + + /* Compute multiplication table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfmul[256][256] =\n" + "{\n"); + for (i = 0; i < 256; i++) { + printf("\t{\n"); + for (j = 0; j < 256; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, j + k), + (k == 7) ? '\n' : ' '); + } + printf("\t},\n"); + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfmul);\n"); + printf("#endif\n"); + + /* Compute power-of-2 table (exponent) */ + v = 1; + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexp[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + exptbl[i + j] = v; + printf("0x%02x,%c", v, (j == 7) ? 
'\n' : ' '); + v = gfmul(v, 2); + if (v == 1) + v = 0; /* For entry 255, not a real entry */ + } + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexp);\n"); + printf("#endif\n"); + + /* Compute inverse table x^-1 == x^254 */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfinv[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + invtbl[i + j] = v = gfpow(i + j, 254); + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + } + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfinv);\n"); + printf("#endif\n"); + + /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexi[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) + printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1], + (j == 7) ? '\n' : ' '); + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexi);\n"); + printf("#endif\n"); + + return 0; +} diff --git a/lib/raid6/mmx.c b/lib/raid6/mmx.c new file mode 100644 index 00000000000..279347f2309 --- /dev/null +++ b/lib/raid6/mmx.c @@ -0,0 +1,142 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/mmx.c + * + * MMX implementation of RAID-6 syndrome functions + */ + +#if defined(__i386__) && !defined(__arch_um__) + +#include <linux/raid/pq.h> +#include "x86.h" + +/* Shared with raid6/sse1.c */ +const struct raid6_mmx_constants { + u64 x1d; +} raid6_mmx_constants = { + 0x1d1d1d1d1d1d1d1dULL, +}; + +static int raid6_have_mmx(void) +{ + /* Not really "boot_cpu" but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX); +} + +/* + * Plain MMX implementation + */ +static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 8 ) { + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + } + asm volatile("movq %%mm2,%0" : "=m" (p[d])); + asm volatile("pxor %mm2,%mm2"); + asm volatile("movq %%mm4,%0" : "=m" (q[d])); + asm volatile("pxor %mm4,%mm4"); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_mmxx1 = { + raid6_mmx1_gen_syndrome, + raid6_have_mmx, + "mmxx1", + 0 +}; + +/* + * Unrolled-by-2 MMX implementation + */ +static void 
raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + asm volatile("pxor %mm7,%mm7"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %mm3,%mm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("pcmpgtb %mm6,%mm7"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("paddb %mm6,%mm6"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pand %mm0,%mm7"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); + asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); + asm volatile("pxor %mm5,%mm2"); + asm volatile("pxor %mm7,%mm3"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm7,%mm7"); + } + asm volatile("movq %%mm2,%0" : "=m" (p[d])); + asm volatile("movq %%mm3,%0" : "=m" (p[d+8])); + asm volatile("movq %%mm4,%0" : "=m" (q[d])); + asm volatile("movq %%mm6,%0" : "=m" (q[d+8])); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_mmxx2 = { + raid6_mmx2_gen_syndrome, + raid6_have_mmx, + "mmxx2", + 0 +}; + +#endif diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c new file mode 100644 index 00000000000..8590d19cf52 --- /dev/null +++ b/lib/raid6/recov.c @@ -0,0 +1,132 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/recov.c + * + * RAID-6 data recovery in dual failure mode. In single failure mode, + * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct + * the syndrome.) + */ + +#include <linux/raid/pq.h> + +/* Recover two failed data blocks. 
*/ +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs) +{ + u8 *p, *q, *dp, *dq; + u8 px, qx, db; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... */ + while ( bytes-- ) { + px = *p ^ *dp; + qx = qmul[*q ^ *dq]; + *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ + *dp++ = db ^ px; /* Reconstructed A */ + p++; q++; + } +} +EXPORT_SYMBOL_GPL(raid6_2data_recov); + +/* Recover failure of one data block plus the P block */ +void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while ( bytes-- ) { + *p++ ^= *dq = qmul[*q ^ *dq]; + q++; dq++; + } +} +EXPORT_SYMBOL_GPL(raid6_datap_recov); + +#ifndef __KERNEL__ +/* Testing only */ + +/* Recover two failed blocks. */ +void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) +{ + if ( faila > failb ) { + int tmp = faila; + faila = failb; + failb = tmp; + } + + if ( failb == disks-1 ) { + if ( faila == disks-2 ) { + /* P+Q failure. Just rebuild the syndrome. */ + raid6_call.gen_syndrome(disks, bytes, ptrs); + } else { + /* data+Q failure. Reconstruct data from P, + then rebuild syndrome. */ + /* NOT IMPLEMENTED - equivalent to RAID-5 */ + } + } else { + if ( failb == disks-2 ) { + /* data+P failure. */ + raid6_datap_recov(disks, bytes, faila, ptrs); + } else { + /* data+data failure. */ + raid6_2data_recov(disks, bytes, faila, failb, ptrs); + } + } +} + +#endif diff --git a/lib/raid6/sse1.c b/lib/raid6/sse1.c new file mode 100644 index 00000000000..10dd91948c0 --- /dev/null +++ b/lib/raid6/sse1.c @@ -0,0 +1,162 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. 
+ * + * ----------------------------------------------------------------------- */ + +/* + * raid6/sse1.c + * + * SSE-1/MMXEXT implementation of RAID-6 syndrome functions + * + * This is really an MMX implementation, but it requires SSE-1 or + * AMD MMXEXT for prefetch support and a few other features. The + * support for nontemporal memory accesses is enough to make this + * worthwhile as a separate implementation. + */ + +#if defined(__i386__) && !defined(__arch_um__) + +#include <linux/raid/pq.h> +#include "x86.h" + +/* Defined in raid6/mmx.c */ +extern const struct raid6_mmx_constants { + u64 x1d; +} raid6_mmx_constants; + +static int raid6_have_sse1_or_mmxext(void) +{ + /* Not really boot_cpu but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX) && + (boot_cpu_has(X86_FEATURE_XMM) || + boot_cpu_has(X86_FEATURE_MMXEXT)); +} + +/* + * Plain SSE1 implementation + */ +static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 8 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d])); + for ( z = z0-2 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); + } + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + + asm volatile("movntq %%mm2,%0" : "=m" (p[d])); + asm volatile("movntq %%mm4,%0" : "=m" (q[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse1x1 = { + raid6_sse11_gen_syndrome, + raid6_have_sse1_or_mmxext, + "sse1x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 SSE1 implementation + */ +static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + asm volatile("pxor %mm7,%mm7"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 16 bytes */ + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */ + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %mm3,%mm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb 
%mm4,%mm5"); + asm volatile("pcmpgtb %mm6,%mm7"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("paddb %mm6,%mm6"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pand %mm0,%mm7"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); + asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); + asm volatile("pxor %mm5,%mm2"); + asm volatile("pxor %mm7,%mm3"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm7,%mm7"); + } + asm volatile("movntq %%mm2,%0" : "=m" (p[d])); + asm volatile("movntq %%mm3,%0" : "=m" (p[d+8])); + asm volatile("movntq %%mm4,%0" : "=m" (q[d])); + asm volatile("movntq %%mm6,%0" : "=m" (q[d+8])); + } + + asm volatile("sfence" : :: "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse1x2 = { + raid6_sse12_gen_syndrome, + raid6_have_sse1_or_mmxext, + "sse1x2", + 1 /* Has cache hints */ +}; + +#endif diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c new file mode 100644 index 00000000000..bc2d57daa58 --- /dev/null +++ b/lib/raid6/sse2.c @@ -0,0 +1,262 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/sse2.c + * + * SSE-2 implementation of RAID-6 syndrome functions + * + */ + +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) + +#include <linux/raid/pq.h> +#include "x86.h" + +static const struct raid6_sse_constants { + u64 x1d[2]; +} raid6_sse_constants __attribute__((aligned(16))) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL }, +}; + +static int raid6_have_sse2(void) +{ + /* Not really boot_cpu but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX) && + boot_cpu_has(X86_FEATURE_FXSR) && + boot_cpu_has(X86_FEATURE_XMM) && + boot_cpu_has(X86_FEATURE_XMM2); +} + +/* + * Plain SSE2 implementation + */ +static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ + asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); + for ( z = z0-2 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm6,%xmm2"); + asm volatile("pxor %xmm6,%xmm4"); + asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); + } + asm volatile("pcmpgtb 
%xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm6,%xmm2"); + asm volatile("pxor %xmm6,%xmm4"); + + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("pxor %xmm2,%xmm2"); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("pxor %xmm4,%xmm4"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse2x1 = { + raid6_sse21_gen_syndrome, + raid6_have_sse2, + "sse2x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 SSE2 implementation + */ +static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 32 bytes */ + for ( d = 0 ; d < bytes ; d += 32 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */ + asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ + asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse2x2 = { + raid6_sse22_gen_syndrome, + raid6_have_sse2, + "sse2x2", + 1 /* Has cache hints */ +}; + +#endif + +#if defined(__x86_64__) && !defined(__arch_um__) + +/* + * Unrolled-by-4 SSE2 implementation + */ +static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm2,%xmm2"); /* P[0] */ + asm volatile("pxor %xmm3,%xmm3"); /* P[1] */ + asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */ + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */ + asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ + asm volatile("pxor %xmm10,%xmm10"); /* P[2] */ + asm volatile("pxor %xmm11,%xmm11"); /* P[3] */ + asm volatile("pxor %xmm12,%xmm12"); /* Q[2] 
*/ + asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */ + asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */ + asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 64 ) { + for ( z = z0 ; z >= 0 ; z-- ) { + /* The second prefetch seems to improve performance... */ + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); + asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm13,%xmm10"); + asm volatile("pxor %xmm15,%xmm11"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("pxor %xmm2,%xmm2"); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("pxor %xmm3,%xmm3"); + asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); + asm volatile("pxor %xmm10,%xmm10"); + asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); + asm volatile("pxor %xmm11,%xmm11"); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("pxor %xmm4,%xmm4"); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("pxor %xmm6,%xmm6"); + asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); + asm volatile("pxor %xmm12,%xmm12"); + asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); + asm volatile("pxor %xmm14,%xmm14"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse2x4 = { + raid6_sse24_gen_syndrome, + raid6_have_sse2, + "sse2x4", + 1 /* Has cache hints */ +}; + +#endif diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile new file mode 100644 index 00000000000..aa651697b6d --- /dev/null +++ b/lib/raid6/test/Makefile @@ -0,0 +1,72 @@ +# +# This is a simple Makefile to test some of the RAID-6 code +# from userspace. +# + +CC = gcc +OPTFLAGS = -O2 # Adjust as desired +CFLAGS = -I.. 
-I ../../../include -g $(OPTFLAGS) +LD = ld +AWK = awk -f +AR = ar +RANLIB = ranlib + +.c.o: + $(CC) $(CFLAGS) -c -o $@ $< + +%.c: ../%.c + cp -f $< $@ + +%.uc: ../%.uc + cp -f $< $@ + +all: raid6.a raid6test + +raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ + altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \ + tables.o + rm -f $@ + $(AR) cq $@ $^ + $(RANLIB) $@ + +raid6test: test.c raid6.a + $(CC) $(CFLAGS) -o raid6test $^ + +altivec1.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ + +altivec2.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < altivec.uc > $@ + +altivec4.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < altivec.uc > $@ + +altivec8.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < altivec.uc > $@ + +int1.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < int.uc > $@ + +int2.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < int.uc > $@ + +int4.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < int.uc > $@ + +int8.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < int.uc > $@ + +int16.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=16 < int.uc > $@ + +int32.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=32 < int.uc > $@ + +tables.c: mktables + ./mktables > tables.c + +clean: + rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test + +spotless: clean + rm -f *~ diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c new file mode 100644 index 00000000000..7a930318b17 --- /dev/null +++ b/lib/raid6/test/test.c @@ -0,0 +1,124 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6test.c + * + * Test RAID-6 recovery with various algorithms + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <linux/raid/pq.h> + +#define NDISKS 16 /* Including P and Q */ + +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +struct raid6_calls raid6_call; + +char *dataptrs[NDISKS]; +char data[NDISKS][PAGE_SIZE]; +char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; + +static void makedata(void) +{ + int i, j; + + for (i = 0; i < NDISKS; i++) { + for (j = 0; j < PAGE_SIZE; j++) + data[i][j] = rand(); + + dataptrs[i] = data[i]; + } +} + +static char disk_type(int d) +{ + switch (d) { + case NDISKS-2: + return 'P'; + case NDISKS-1: + return 'Q'; + default: + return 'D'; + } +} + +static int test_disks(int i, int j) +{ + int erra, errb; + + memset(recovi, 0xf0, PAGE_SIZE); + memset(recovj, 0xba, PAGE_SIZE); + + dataptrs[i] = recovi; + dataptrs[j] = recovj; + + raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); + + erra = memcmp(data[i], recovi, PAGE_SIZE); + errb = memcmp(data[j], recovj, PAGE_SIZE); + + if (i < NDISKS-2 && j == NDISKS-1) { + /* We don't implement the DQ failure scenario, since it's + equivalent to a RAID-5 failure (XOR, then recompute Q) */ + erra = errb = 0; + } else { + printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", + raid6_call.name, + i, disk_type(i), + j, disk_type(j), + (!erra && !errb) ? "OK" : + !erra ? "ERRB" : + !errb ? 
"ERRA" : "ERRAB"); + } + + dataptrs[i] = data[i]; + dataptrs[j] = data[j]; + + return erra || errb; +} + +int main(int argc, char *argv[]) +{ + const struct raid6_calls *const *algo; + int i, j; + int err = 0; + + makedata(); + + for (algo = raid6_algos; *algo; algo++) { + if (!(*algo)->valid || (*algo)->valid()) { + raid6_call = **algo; + + /* Nuke syndromes */ + memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + + /* Generate assumed good syndrome */ + raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); + } + printf("\n"); + } + + printf("\n"); + /* Pick the best algorithm test */ + raid6_select_algo(); + + if (err) + printf("\n*** ERRORS FOUND ***\n"); + + return err; +} diff --git a/lib/raid6/unroll.awk b/lib/raid6/unroll.awk new file mode 100644 index 00000000000..c6aa03631df --- /dev/null +++ b/lib/raid6/unroll.awk @@ -0,0 +1,20 @@ + +# This filter requires one command line option of form -vN=n +# where n must be a decimal number. +# +# Repeat each input line containing $$ n times, replacing $$ with 0...n-1. +# Replace each $# with n, and each $* with a single $. + +BEGIN { + n = N + 0 +} +{ + if (/\$\$/) { rep = n } else { rep = 1 } + for (i = 0; i < rep; ++i) { + tmp = $0 + gsub(/\$\$/, i, tmp) + gsub(/\$\#/, n, tmp) + gsub(/\$\*/, "$", tmp) + print tmp + } +} diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h new file mode 100644 index 00000000000..cb2a8c91c88 --- /dev/null +++ b/lib/raid6/x86.h @@ -0,0 +1,61 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/x86.h + * + * Definitions common to x86 and x86-64 RAID-6 code only + */ + +#ifndef LINUX_RAID_RAID6X86_H +#define LINUX_RAID_RAID6X86_H + +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) + +#ifdef __KERNEL__ /* Real code */ + +#include <asm/i387.h> + +#else /* Dummy code for user space testing */ + +static inline void kernel_fpu_begin(void) +{ +} + +static inline void kernel_fpu_end(void) +{ +} + +#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions + * (fast save and restore) */ +#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ +#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ +#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ + +/* Should work well enough on modern CPUs for testing */ +static inline int boot_cpu_has(int flag) +{ + u32 eax = (flag >> 5) ? 0x80000001 : 1; + u32 edx; + + asm volatile("cpuid" + : "+a" (eax), "=d" (edx) + : : "ecx", "ebx"); + + return (edx >> (flag & 31)) & 1; +} + +#endif /* ndef __KERNEL__ */ + +#endif +#endif diff --git a/lib/random32.c b/lib/random32.c index 870dc3fc0f0..fc3545a3277 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -127,7 +127,7 @@ core_initcall(random32_init); /* * Generate better values after random number generator - * is fully initalized. + * is fully initialized. 
*/ static int __init random32_reseed(void) { diff --git a/lib/rwsem.c b/lib/rwsem.c index ceba8e28807..f236d7cd5cf 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -36,45 +36,56 @@ struct rwsem_waiter { #define RWSEM_WAITING_FOR_WRITE 0x00000002 }; +/* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and + * RWSEM_WAKE_READ_OWNED imply that the spinlock must have been kept held + * since the rwsem value was observed. + */ +#define RWSEM_WAKE_ANY 0 /* Wake whatever's at head of wait list */ +#define RWSEM_WAKE_NO_ACTIVE 1 /* rwsem was observed with no active thread */ +#define RWSEM_WAKE_READ_OWNED 2 /* rwsem was observed to be read owned */ + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then: * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) - * - there must be someone on the queue + * - there must be someone on the queue * - the spinlock must be held by the caller * - woken process blocks are discarded from the list after having task zeroed * - writers are only woken if downgrading is false */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int downgrading) +static struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, int wake_type) { struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long oldcount, woken, loop; - - if (downgrading) - goto dont_wake_writers; - - /* if we came through an up_xxxx() call, we only only wake someone up - * if we can transition the active part of the count from 0 -> 1 - */ - try_again: - oldcount = rwsem_atomic_update(RWSEM_ACTIVE_BIAS, sem) - - RWSEM_ACTIVE_BIAS; - if (oldcount & RWSEM_ACTIVE_MASK) - goto undo; + signed long oldcount, woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - /* try to grant a single write lock if there's a writer at the front - * of the queue - note we leave the 'active part' of the count - * incremented by 1 and the waiting part incremented by 0x00010000 - */ if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) goto readers_only; + if (wake_type == RWSEM_WAKE_READ_OWNED) + /* Another active reader was observed, so wakeup is not + * likely to succeed. Save the atomic op. + */ + goto out; + + /* There's a writer at the front of the queue - try to grant it the + * write lock. However, we only wake this writer if we can transition + * the active part of the count from 0 -> 1 + */ + adjustment = RWSEM_ACTIVE_WRITE_BIAS; + if (waiter->list.next == &sem->wait_list) + adjustment -= RWSEM_WAITING_BIAS; + + try_again_write: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (oldcount & RWSEM_ACTIVE_MASK) + /* Someone grabbed the sem already */ + goto undo_write; + /* We must be careful not to touch 'waiter' after we set ->task = NULL. * It is an allocated on the waiter's stack and may become invalid at * any time after that point (due to a wakeup from another source). 
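The wake-type rework in this rwsem.c hunk leans on the sign convention of the semaphore count word: active lockers occupy the low half, and the presence of queued waiters contributes one large negative bias. The exact constants are architecture-specific and are not part of this patch; the sketch below uses the common 32-bit generic values purely to illustrate why an observed count equal to RWSEM_WAITING_BIAS means "waiters are queued but nobody holds the lock" - the RWSEM_WAKE_NO_ACTIVE case.

/* Illustrative only: common 32-bit rwsem bias values (per-architecture in reality). */
#include <stdio.h>

#define RWSEM_ACTIVE_BIAS        0x00000001L
#define RWSEM_ACTIVE_MASK        0x0000ffffL
#define RWSEM_WAITING_BIAS       (-0x00010000L)
#define RWSEM_ACTIVE_READ_BIAS   RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS  (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	long count = RWSEM_ACTIVE_WRITE_BIAS;  /* a writer holds the lock */

	count += RWSEM_WAITING_BIAS;           /* a reader fails down_read() and queues
	                                        * (net effect, first waiter only)      */
	count -= RWSEM_ACTIVE_WRITE_BIAS;      /* the writer releases */

	/*
	 * No active lockers remain but the wait list is non-empty; this is
	 * the count == RWSEM_WAITING_BIAS case that maps to
	 * RWSEM_WAKE_NO_ACTIVE in rwsem_down_failed_common().
	 */
	printf("count=%#lx active=%ld\n",
	       (unsigned long)count & 0xffffffffUL, count & RWSEM_ACTIVE_MASK);
	return 0;
}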
@@ -87,18 +98,30 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) put_task_struct(tsk); goto out; - /* don't want to wake any writers */ - dont_wake_writers: - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (waiter->flags & RWSEM_WAITING_FOR_WRITE) + readers_only: + /* If we come here from up_xxxx(), another thread might have reached + * rwsem_down_failed_common() before we acquired the spinlock and + * woken up a waiter, making it now active. We prefer to check for + * this first in order to not spend too much time with the spinlock + * held if we're not going to be able to wake up readers in the end. + * + * Note that we do not need to update the rwsem count: any writer + * trying to acquire rwsem will run rwsem_down_write_failed() due + * to the waiting threads and block trying to acquire the spinlock. + * + * We use a dummy atomic update in order to acquire the cache line + * exclusively since we expect to succeed and run the final rwsem + * count adjustment pretty soon. + */ + if (wake_type == RWSEM_WAKE_ANY && + rwsem_atomic_update(0, sem) < RWSEM_WAITING_BIAS) + /* Someone grabbed the sem for write already */ goto out; - /* grant an infinite number of read locks to the readers at the front - * of the queue - * - note we increment the 'active part' of the count by the number of - * readers before waking any processes up + /* Grant an infinite number of read locks to the readers at the front + * of the queue. Note we increment the 'active part' of the count by + * the number of readers before waking any processes up. */ - readers_only: woken = 0; do { woken++; @@ -111,16 +134,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) } while (waiter->flags & RWSEM_WAITING_FOR_READ); - loop = woken; - woken *= RWSEM_ACTIVE_BIAS - RWSEM_WAITING_BIAS; - if (!downgrading) - /* we'd already done one increment earlier */ - woken -= RWSEM_ACTIVE_BIAS; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS; + if (waiter->flags & RWSEM_WAITING_FOR_READ) + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; - rwsem_atomic_add(woken, sem); + rwsem_atomic_add(adjustment, sem); next = sem->wait_list.next; - for (; loop > 0; loop--) { + for (loop = woken; loop > 0; loop--) { waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; @@ -138,10 +160,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) /* undo the change to the active count, but check for a transition * 1->0 */ - undo: - if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) & RWSEM_ACTIVE_MASK) + undo_write: + if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) goto out; - goto try_again; + goto try_again_write; } /* @@ -149,8 +171,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading) */ static struct rw_semaphore __sched * rwsem_down_failed_common(struct rw_semaphore *sem, - struct rwsem_waiter *waiter, signed long adjustment) + unsigned int flags, signed long adjustment) { + struct rwsem_waiter waiter; struct task_struct *tsk = current; signed long count; @@ -158,23 +181,34 @@ rwsem_down_failed_common(struct rw_semaphore *sem, /* set up my own style of waitqueue */ spin_lock_irq(&sem->wait_lock); - waiter->task = tsk; + waiter.task = tsk; + waiter.flags = flags; get_task_struct(tsk); - list_add_tail(&waiter->list, &sem->wait_list); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); - /* we're now waiting on the lock, but no longer actively 
read-locking */ + /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* if there are no active locks, wake the front queued process(es) up */ - if (!(count & RWSEM_ACTIVE_MASK)) - sem = __rwsem_do_wake(sem, 0); + /* If there are no active locks, wake the front queued process(es) up. + * + * Alternatively, if we're called from a failed down_write(), there + * were already threads queued before us and there are no active + * writers, the lock must be read owned; so we try to wake any read + * locks that were queued ahead of us. */ + if (count == RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); + else if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ for (;;) { - if (!waiter->task) + if (!waiter.task) break; schedule(); set_task_state(tsk, TASK_UNINTERRUPTIBLE); @@ -191,12 +225,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem, asmregparm struct rw_semaphore __sched * rwsem_down_read_failed(struct rw_semaphore *sem) { - struct rwsem_waiter waiter; - - waiter.flags = RWSEM_WAITING_FOR_READ; - rwsem_down_failed_common(sem, &waiter, - RWSEM_WAITING_BIAS - RWSEM_ACTIVE_BIAS); - return sem; + return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, + -RWSEM_ACTIVE_READ_BIAS); } /* @@ -205,12 +235,8 @@ rwsem_down_read_failed(struct rw_semaphore *sem) asmregparm struct rw_semaphore __sched * rwsem_down_write_failed(struct rw_semaphore *sem) { - struct rwsem_waiter waiter; - - waiter.flags = RWSEM_WAITING_FOR_WRITE; - rwsem_down_failed_common(sem, &waiter, -RWSEM_ACTIVE_BIAS); - - return sem; + return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, + -RWSEM_ACTIVE_WRITE_BIAS); } /* @@ -225,7 +251,7 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -245,7 +271,7 @@ asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); spin_unlock_irqrestore(&sem->wait_lock, flags); diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 9afa25b52a8..4ceb05d772a 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -10,6 +10,7 @@ #include <linux/slab.h> #include <linux/scatterlist.h> #include <linux/highmem.h> +#include <linux/kmemleak.h> /** * sg_next - return the next scatterlist entry in a list @@ -115,17 +116,29 @@ EXPORT_SYMBOL(sg_init_one); */ static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask) { - if (nents == SG_MAX_SINGLE_ALLOC) - return (struct scatterlist *) __get_free_page(gfp_mask); - else + if (nents == SG_MAX_SINGLE_ALLOC) { + /* + * Kmemleak doesn't track page allocations as they are not + * commonly used (in a raw form) for kernel data structures. + * As we chain together a list of pages and then a normal + * kmalloc (tracked by kmemleak), in order to for that last + * allocation not to become decoupled (and thus a + * false-positive) we need to inform kmemleak of all the + * intermediate allocations. 
+ */ + void *ptr = (void *) __get_free_page(gfp_mask); + kmemleak_alloc(ptr, PAGE_SIZE, 1, gfp_mask); + return ptr; + } else return kmalloc(nents * sizeof(struct scatterlist), gfp_mask); } static void sg_kfree(struct scatterlist *sg, unsigned int nents) { - if (nents == SG_MAX_SINGLE_ALLOC) + if (nents == SG_MAX_SINGLE_ALLOC) { + kmemleak_free(sg); free_page((unsigned long) sg); - else + } else kfree(sg); } @@ -235,8 +248,18 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents, left -= sg_size; sg = alloc_fn(alloc_size, gfp_mask); - if (unlikely(!sg)) - return -ENOMEM; + if (unlikely(!sg)) { + /* + * Adjust entry count to reflect that the last + * entry of the previous table won't be used for + * linkage. Without this, sg_kfree() may get + * confused. + */ + if (prv) + table->nents = ++table->orig_nents; + + return -ENOMEM; + } sg_init_table(sg, alloc_size); table->nents = table->orig_nents += sg_size; diff --git a/lib/swiotlb.c b/lib/swiotlb.c index a009055140e..7c06ee51a29 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -50,19 +50,11 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -/* - * Enumeration for sync targets - */ -enum dma_sync_target { - SYNC_FOR_CPU = 0, - SYNC_FOR_DEVICE = 1, -}; - int swiotlb_force; /* - * Used to do a quick range check in unmap_single and - * sync_single_*, to see if the memory was in fact allocated by this + * Used to do a quick range check in swiotlb_tbl_unmap_single and + * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ static char *io_tlb_start, *io_tlb_end; @@ -78,7 +70,7 @@ static unsigned long io_tlb_nslabs; */ static unsigned long io_tlb_overflow = 32*1024; -void *io_tlb_overflow_buffer; +static void *io_tlb_overflow_buffer; /* * This is a free list describing the number of free entries available from @@ -140,28 +132,14 @@ void swiotlb_print_info(void) (unsigned long long)pend); } -/* - * Statically reserve bounce buffer space and initialize bounce buffer data - * structures for the software IO TLB used to implement the DMA API. - */ -void __init -swiotlb_init_with_default_size(size_t default_size, int verbose) +void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { unsigned long i, bytes; - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } - - bytes = io_tlb_nslabs << IO_TLB_SHIFT; + bytes = nslabs << IO_TLB_SHIFT; - /* - * Get IO TLB memory from the low pages - */ - io_tlb_start = alloc_bootmem_low_pages(bytes); - if (!io_tlb_start) - panic("Cannot allocate SWIOTLB buffer"); + io_tlb_nslabs = nslabs; + io_tlb_start = tlb; io_tlb_end = io_tlb_start + bytes; /* @@ -169,22 +147,48 @@ swiotlb_init_with_default_size(size_t default_size, int verbose) * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. 
*/ - io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); + io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); for (i = 0; i < io_tlb_nslabs; i++) io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_index = 0; - io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t)); + io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); /* * Get the overflow emergency buffer */ - io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); + io_tlb_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow)); if (!io_tlb_overflow_buffer) panic("Cannot allocate SWIOTLB overflow buffer!\n"); if (verbose) swiotlb_print_info(); } +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the DMA API. + */ +void __init +swiotlb_init_with_default_size(size_t default_size, int verbose) +{ + unsigned long bytes; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + /* + * Get IO TLB memory from the low pages + */ + io_tlb_start = alloc_bootmem_low_pages(PAGE_ALIGN(bytes)); + if (!io_tlb_start) + panic("Cannot allocate SWIOTLB buffer"); + + swiotlb_init_with_tbl(io_tlb_start, io_tlb_nslabs, verbose); +} + void __init swiotlb_init(int verbose) { @@ -304,13 +308,13 @@ void __init swiotlb_free(void) get_order(io_tlb_nslabs << IO_TLB_SHIFT)); } else { free_bootmem_late(__pa(io_tlb_overflow_buffer), - io_tlb_overflow); + PAGE_ALIGN(io_tlb_overflow)); free_bootmem_late(__pa(io_tlb_orig_addr), - io_tlb_nslabs * sizeof(phys_addr_t)); + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); free_bootmem_late(__pa(io_tlb_list), - io_tlb_nslabs * sizeof(int)); + PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); free_bootmem_late(__pa(io_tlb_start), - io_tlb_nslabs << IO_TLB_SHIFT); + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } } @@ -323,8 +327,8 @@ static int is_swiotlb_buffer(phys_addr_t paddr) /* * Bounce: copy the swiotlb buffer back to the original dma location */ -static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, - enum dma_data_direction dir) +void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, + enum dma_data_direction dir) { unsigned long pfn = PFN_DOWN(phys); @@ -360,26 +364,25 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, memcpy(phys_to_virt(phys), dma_addr, size); } } +EXPORT_SYMBOL_GPL(swiotlb_bounce); -/* - * Allocates bounce buffer and returns its kernel virtual address. - */ -static void * -map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir) +void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, + phys_addr_t phys, size_t size, + enum dma_data_direction dir) { unsigned long flags; char *dma_addr; unsigned int nslots, stride, index, wrap; int i; - unsigned long start_dma_addr; unsigned long mask; unsigned long offset_slots; unsigned long max_slots; mask = dma_get_seg_boundary(hwdev); - start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start) & mask; - offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + tbl_dma_addr &= mask; + + offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; /* * Carefully handle integer overflow which can occur when mask == ~0UL. 
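The "mask == ~0UL" remark just above is about unsigned wrap-around: a device that reports no segment boundary hands back an all-ones mask, so mask + 1 overflows to zero and a naive slot-count computation collapses to nothing. The following standalone sketch shows the hazard and the usual guard, assuming the stock IO_TLB_SHIFT of 11 (2 KiB slots); it is an illustration, not the kernel's code verbatim.

#include <stdio.h>

#define IO_TLB_SHIFT	11
#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define ALIGN(x, a)	(((x) + ((a) - 1)) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long mask = ~0UL;	/* device has no segment boundary */

	/* Naive: mask + 1 wraps to 0, so the aligned slot count is bogus. */
	unsigned long bad = ALIGN(mask + 1, 1UL << IO_TLB_SHIFT) >> IO_TLB_SHIFT;

	/* Guarded: fall back to the largest representable slot count. */
	unsigned long max_slots = mask + 1
		? ALIGN(mask + 1, 1UL << IO_TLB_SHIFT) >> IO_TLB_SHIFT
		: 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);

	printf("naive=%lu guarded=%lu\n", bad, max_slots);
	return 0;
}

With the guard, an unlimited mask simply falls back to the largest representable slot count instead of zero.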
@@ -466,12 +469,27 @@ found: return dma_addr; } +EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); + +/* + * Allocates bounce buffer and returns its kernel virtual address. + */ + +static void * +map_single(struct device *hwdev, phys_addr_t phys, size_t size, + enum dma_data_direction dir) +{ + dma_addr_t start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start); + + return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir); +} /* * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ -static void -do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +void +swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, + enum dma_data_direction dir) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; @@ -509,10 +527,12 @@ do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) } spin_unlock_irqrestore(&io_tlb_lock, flags); } +EXPORT_SYMBOL_GPL(swiotlb_tbl_unmap_single); -static void -sync_single(struct device *hwdev, char *dma_addr, size_t size, - int dir, int target) +void +swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size, + enum dma_data_direction dir, + enum dma_sync_target target) { int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t phys = io_tlb_orig_addr[index]; @@ -536,6 +556,7 @@ sync_single(struct device *hwdev, char *dma_addr, size_t size, BUG(); } } +EXPORT_SYMBOL_GPL(swiotlb_tbl_sync_single); void * swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -559,8 +580,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, } if (!ret) { /* - * We are either out of memory or the device can't DMA - * to GFP_DMA memory; fall back on map_single(), which + * We are either out of memory or the device can't DMA to + * GFP_DMA memory; fall back on map_single(), which * will grab memory from the lowest available address range. */ ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); @@ -578,7 +599,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, (unsigned long long)dev_addr); /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); + swiotlb_tbl_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); return NULL; } *dma_handle = dev_addr; @@ -596,13 +617,14 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, if (!is_swiotlb_buffer(paddr)) free_pages((unsigned long)vaddr, get_order(size)); else - /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); + /* DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single */ + swiotlb_tbl_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); } EXPORT_SYMBOL(swiotlb_free_coherent); static void -swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) +swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, + int do_panic) { /* * Ran out of IOMMU space for this operation. This is very bad. @@ -680,14 +702,14 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); * whatever the device wrote there. 
@@ -680,14 +702,14 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
  * whatever the device wrote there.
  */
 static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
-                         size_t size, int dir)
+                         size_t size, enum dma_data_direction dir)
 {
         phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
 
         BUG_ON(dir == DMA_NONE);
 
         if (is_swiotlb_buffer(paddr)) {
-                do_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+                swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
                 return;
         }
 
@@ -723,14 +745,16 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
  */
 static void
 swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
-                    size_t size, int dir, int target)
+                    size_t size, enum dma_data_direction dir,
+                    enum dma_sync_target target)
 {
         phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
 
         BUG_ON(dir == DMA_NONE);
 
         if (is_swiotlb_buffer(paddr)) {
-                sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
+                swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir,
+                                        target);
                 return;
         }
 
@@ -809,7 +833,7 @@ EXPORT_SYMBOL(swiotlb_map_sg_attrs);
 int
 swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
-               int dir)
+               enum dma_data_direction dir)
 {
         return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
 }
@@ -836,7 +860,7 @@ EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
 void
 swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
-                 int dir)
+                 enum dma_data_direction dir)
 {
         return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
 }
@@ -851,7 +875,8 @@ EXPORT_SYMBOL(swiotlb_unmap_sg);
  */
 static void
 swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
-                int nelems, int dir, int target)
+                int nelems, enum dma_data_direction dir,
+                enum dma_sync_target target)
 {
         struct scatterlist *sg;
         int i;
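
The swiotlb.c hunks above also tighten the types: the DMA direction and sync target now travel as enum dma_data_direction and enum dma_sync_target instead of bare ints, so swapped arguments become visible at compile time. A minimal sketch of the streaming-DMA pattern that, on a swiotlb-backed platform, ends up in swiotlb_tbl_sync_single() with the appropriate sync target; the function name and buffer come from setup code that is not shown:

#include <linux/dma-mapping.h>

static void my_rx_poll(struct device *dev, dma_addr_t buf_bus,
                       void *buf_cpu, size_t len)
{
        /* Hand the buffer to the CPU; for a bounced DMA_FROM_DEVICE buffer
         * this copies the device's data back before we look at it. */
        dma_sync_single_for_cpu(dev, buf_bus, len, DMA_FROM_DEVICE);

        /* ... inspect buf_cpu ... */

        /* Hand the buffer back to the device for the next transfer. */
        dma_sync_single_for_device(dev, buf_bus, len, DMA_FROM_DEVICE);
}
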
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index b8a2f549ab0..c150d3dafff 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -146,19 +146,16 @@ int strict_strtoul(const char *cp, unsigned int base, unsigned long *res)
 {
         char *tail;
         unsigned long val;
-        size_t len;
 
         *res = 0;
-        len = strlen(cp);
-        if (len == 0)
+        if (!*cp)
                 return -EINVAL;
 
         val = simple_strtoul(cp, &tail, base);
         if (tail == cp)
                 return -EINVAL;
 
-        if ((*tail == '\0') ||
-            ((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) {
+        if ((tail[0] == '\0') || (tail[0] == '\n' && tail[1] == '\0')) {
                 *res = val;
                 return 0;
         }
@@ -220,18 +217,15 @@ int strict_strtoull(const char *cp, unsigned int base, unsigned long long *res)
 {
         char *tail;
         unsigned long long val;
-        size_t len;
 
         *res = 0;
-        len = strlen(cp);
-        if (len == 0)
+        if (!*cp)
                 return -EINVAL;
 
         val = simple_strtoull(cp, &tail, base);
         if (tail == cp)
                 return -EINVAL;
 
-        if ((*tail == '\0') ||
-            ((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) {
+        if ((tail[0] == '\0') || (tail[0] == '\n' && tail[1] == '\0')) {
                 *res = val;
                 return 0;
         }
@@ -980,6 +974,11 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
  *       [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15]
  *       little endian output byte order is:
  *       [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15]
+ * - 'V' For a struct va_format which contains a format string * and va_list *,
+ *       call vsnprintf(->format, *->va_list).
+ *       Implements a "recursive vsnprintf".
+ *       Do not use this feature without some mechanism to verify the
+ *       correctness of the format string and va_list arguments.
  *
  * Note: The difference between 'S' and 'F' is that on ia64 and ppc64
  * function pointers are really function descriptors, which contain a
@@ -989,8 +988,15 @@ static noinline_for_stack
 char *pointer(const char *fmt, char *buf, char *end, void *ptr,
               struct printf_spec spec)
 {
-        if (!ptr)
+        if (!ptr) {
+                /*
+                 * Print (null) with the same width as a pointer so it makes
+                 * tabular output look nice.
+                 */
+                if (spec.field_width == -1)
+                        spec.field_width = 2 * sizeof(void *);
                 return string(buf, end, "(null)", spec);
+        }
 
         switch (*fmt) {
         case 'F':
@@ -1025,10 +1031,14 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
                 break;
         case 'U':
                 return uuid_string(buf, end, ptr, spec, fmt);
+        case 'V':
+                return buf + vsnprintf(buf, end - buf,
+                                       ((struct va_format *)ptr)->fmt,
+                                       *(((struct va_format *)ptr)->va));
         }
         spec.flags |= SMALL;
         if (spec.field_width == -1) {
-                spec.field_width = 2*sizeof(void *);
+                spec.field_width = 2 * sizeof(void *);
                 spec.flags |= ZEROPAD;
         }
         spec.base = 16;
@@ -1494,7 +1504,7 @@ EXPORT_SYMBOL(snprintf);
  * @...: Arguments for the format string
  *
  * The return value is the number of characters written into @buf not including
- * the trailing '\0'. If @size is <= 0 the function returns 0.
+ * the trailing '\0'. If @size is == 0 the function returns 0.
  */
 
 int scnprintf(char *buf, size_t size, const char *fmt, ...)
@@ -1506,7 +1516,11 @@ int scnprintf(char *buf, size_t size, const char *fmt, ...)
         i = vsnprintf(buf, size, fmt, args);
         va_end(args);
-        return (i >= size) ? (size - 1) : i;
+        if (likely(i < size))
+                return i;
+        if (size != 0)
+                return size - 1;
+        return 0;
 }
 EXPORT_SYMBOL(scnprintf);
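
The new %pV case above expands a caller-supplied format string and va_list exactly once, which is what makes printk wrappers possible without an intermediate buffer. A minimal sketch, assuming a made-up "mydrv: " prefix; the struct va_format members fmt and va are the ones dereferenced in the hunk:

#include <linux/kernel.h>

static void mydrv_info(const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        /* The caller's whole format is expanded by the single %pV below. */
        printk(KERN_INFO "mydrv: %pV", &vaf);
        va_end(args);
}

/* Usage: mydrv_info("queue %d stalled for %lu ms\n", qid, delay); */
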
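
The simplified tail check in strict_strtoul()/strict_strtoull() above keeps the existing contract: one trailing newline is tolerated, anything else after the digits is rejected. A small self-test sketch; the function name and BUG_ON-style checks are illustrative only:

#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>

static int __init strict_strtoul_selftest(void)
{
        unsigned long val;

        BUG_ON(strict_strtoul("42", 10, &val) != 0 || val != 42);
        BUG_ON(strict_strtoul("42\n", 10, &val) != 0 || val != 42); /* lone '\n' OK */
        BUG_ON(strict_strtoul("42x", 10, &val) != -EINVAL);  /* trailing garbage */
        BUG_ON(strict_strtoul("x42", 10, &val) != -EINVAL);  /* no digits consumed */
        BUG_ON(strict_strtoul("", 10, &val) != -EINVAL);     /* empty string */
        return 0;
}
late_initcall(strict_strtoul_selftest);
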
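
The scnprintf() change above only affects the overflow paths: output that fits is reported unchanged, truncated output reports the number of characters actually stored, and size == 0 now returns 0 instead of the old (size - 1), which wrapped around and typically came back as -1 once converted to int. Illustrative values, assuming an 8-byte buffer:

#include <linux/kernel.h>

static void scnprintf_demo(void)
{
        char buf[8];
        int n;

        n = scnprintf(buf, sizeof(buf), "value=%d", 12345);
        /* vsnprintf() wanted 11 bytes; buf holds "value=1" and n == 7 */

        n = scnprintf(buf, 0, "value=%d", 12345);
        /* old code returned (size - 1), i.e. -1 on typical builds;
         * with this patch nothing is written and n == 0 */
        (void)n;
}
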