summaryrefslogtreecommitdiffstats
path: root/arch/tile/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/tile/lib')
-rw-r--r--arch/tile/lib/atomic_asm_32.S2
-rw-r--r--arch/tile/lib/cacheflush.c18
-rw-r--r--arch/tile/lib/memchr_64.c71
-rw-r--r--arch/tile/lib/memcpy_64.c220
-rw-r--r--arch/tile/lib/memcpy_user_64.c86
-rw-r--r--arch/tile/lib/memset_64.c145
-rw-r--r--arch/tile/lib/spinlock_64.c104
-rw-r--r--arch/tile/lib/strchr_64.c67
-rw-r--r--arch/tile/lib/strlen_64.c38
-rw-r--r--arch/tile/lib/usercopy_64.S196
10 files changed, 946 insertions, 1 deletions
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 82f64cc6365..24448734f6f 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -59,7 +59,7 @@
* bad kernel addresses).
*
* Note that if the value we would store is the same as what we
- * loaded, we bypass the load. Other platforms with true atomics can
+ * loaded, we bypass the store. Other platforms with true atomics can
* make the guarantee that a non-atomic __clear_bit(), for example,
* can safely race with an atomic test_and_set_bit(); this example is
* from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 35c1d8ca5f3..8928aace7a6 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -15,6 +15,7 @@
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <arch/icache.h>
+#include <arch/spr_def.h>
void __flush_icache_range(unsigned long start, unsigned long end)
@@ -39,6 +40,18 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh)
char *p, *base;
size_t step_size, load_count;
const unsigned long STRIPE_WIDTH = 8192;
+#ifdef __tilegx__
+ /*
+ * On TILE-Gx, we must disable the dstream prefetcher before doing
+ * a cache flush; otherwise, we could end up with data in the cache
+ * that we don't want there. Note that normally we'd do an mf
+ * after the SPR write to disabling the prefetcher, but we do one
+ * below, before any further loads, so there's no need to do it
+ * here.
+ */
+ uint_reg_t old_dstream_pf = __insn_mfspr(SPR_DSTREAM_PF);
+ __insn_mtspr(SPR_DSTREAM_PF, 0);
+#endif
/*
* Flush and invalidate the buffer out of the local L1/L2
@@ -122,4 +135,9 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh)
/* Wait for the load+inv's (and thus finvs) to have completed. */
__insn_mf();
+
+#ifdef __tilegx__
+ /* Reenable the prefetcher. */
+ __insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf);
+#endif
}
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
new file mode 100644
index 00000000000..84fdc8d8e73
--- /dev/null
+++ b/arch/tile/lib/memchr_64.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+void *memchr(const void *s, int c, size_t n)
+{
+ const uint64_t *last_word_ptr;
+ const uint64_t *p;
+ const char *last_byte_ptr;
+ uintptr_t s_int;
+ uint64_t goal, before_mask, v, bits;
+ char *ret;
+
+ if (__builtin_expect(n == 0, 0)) {
+ /* Don't dereference any memory if the array is empty. */
+ return NULL;
+ }
+
+ /* Get an aligned pointer. */
+ s_int = (uintptr_t) s;
+ p = (const uint64_t *)(s_int & -8);
+
+ /* Create eight copies of the byte for which we are looking. */
+ goal = 0x0101010101010101ULL * (uint8_t) c;
+
+ /* Read the first word, but munge it so that bytes before the array
+ * will not match goal.
+ *
+ * Note that this shift count expression works because we know
+ * shift counts are taken mod 64.
+ */
+ before_mask = (1ULL << (s_int << 3)) - 1;
+ v = (*p | before_mask) ^ (goal & before_mask);
+
+ /* Compute the address of the last byte. */
+ last_byte_ptr = (const char *)s + n - 1;
+
+ /* Compute the address of the word containing the last byte. */
+ last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8);
+
+ while ((bits = __insn_v1cmpeq(v, goal)) == 0) {
+ if (__builtin_expect(p == last_word_ptr, 0)) {
+ /* We already read the last word in the array,
+ * so give up.
+ */
+ return NULL;
+ }
+ v = *++p;
+ }
+
+ /* We found a match, but it might be in a byte past the end
+ * of the array.
+ */
+ ret = ((char *)p) + (__insn_ctz(bits) >> 3);
+ return (ret <= last_byte_ptr) ? ret : NULL;
+}
+EXPORT_SYMBOL(memchr);
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
new file mode 100644
index 00000000000..3fab9a6a2bb
--- /dev/null
+++ b/arch/tile/lib/memcpy_64.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#define __memcpy memcpy
+/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
+
+/* Must be 8 bytes in size. */
+#define word_t uint64_t
+
+#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128
+#error "Assumes 64 or 128 byte line size"
+#endif
+
+/* How many cache lines ahead should we prefetch? */
+#define PREFETCH_LINES_AHEAD 3
+
+/*
+ * Provide "base versions" of load and store for the normal code path.
+ * The kernel provides other versions for userspace copies.
+ */
+#define ST(p, v) (*(p) = (v))
+#define LD(p) (*(p))
+
+#ifndef USERCOPY_FUNC
+#define ST1 ST
+#define ST2 ST
+#define ST4 ST
+#define ST8 ST
+#define LD1 LD
+#define LD2 LD
+#define LD4 LD
+#define LD8 LD
+#define RETVAL dstv
+void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
+#else
+/*
+ * Special kernel version will provide implementation of the LDn/STn
+ * macros to return a count of uncopied bytes due to mm fault.
+ */
+#define RETVAL 0
+int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
+#endif
+{
+ char *__restrict dst1 = (char *)dstv;
+ const char *__restrict src1 = (const char *)srcv;
+ const char *__restrict src1_end;
+ const char *__restrict prefetch;
+ word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
+ word_t final; /* Final bytes to write to trailing word, if any */
+ long i;
+
+ if (n < 16) {
+ for (; n; n--)
+ ST1(dst1++, LD1(src1++));
+ return RETVAL;
+ }
+
+ /*
+ * Locate the end of source memory we will copy. Don't
+ * prefetch past this.
+ */
+ src1_end = src1 + n - 1;
+
+ /* Prefetch ahead a few cache lines, but not past the end. */
+ prefetch = src1;
+ for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
+ __insn_prefetch(prefetch);
+ prefetch += CHIP_L2_LINE_SIZE();
+ prefetch = (prefetch > src1_end) ? prefetch : src1;
+ }
+
+ /* Copy bytes until dst is word-aligned. */
+ for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--)
+ ST1(dst1++, LD1(src1++));
+
+ /* 8-byte pointer to destination memory. */
+ dst8 = (word_t *)dst1;
+
+ if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) {
+ /*
+ * Misaligned copy. Copy 8 bytes at a time, but don't
+ * bother with other fanciness.
+ *
+ * TODO: Consider prefetching and using wh64 as well.
+ */
+
+ /* Create an aligned src8. */
+ const word_t *__restrict src8 =
+ (const word_t *)((uintptr_t)src1 & -sizeof(word_t));
+ word_t b;
+
+ word_t a = LD8(src8++);
+ for (; n >= sizeof(word_t); n -= sizeof(word_t)) {
+ b = LD8(src8++);
+ a = __insn_dblalign(a, b, src1);
+ ST8(dst8++, a);
+ a = b;
+ }
+
+ if (n == 0)
+ return RETVAL;
+
+ b = ((const char *)src8 <= src1_end) ? *src8 : 0;
+
+ /*
+ * Final source bytes to write to trailing partial
+ * word, if any.
+ */
+ final = __insn_dblalign(a, b, src1);
+ } else {
+ /* Aligned copy. */
+
+ const word_t* __restrict src8 = (const word_t *)src1;
+
+ /* src8 and dst8 are both word-aligned. */
+ if (n >= CHIP_L2_LINE_SIZE()) {
+ /* Copy until 'dst' is cache-line-aligned. */
+ for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
+ n -= sizeof(word_t))
+ ST8(dst8++, LD8(src8++));
+
+ for (; n >= CHIP_L2_LINE_SIZE(); ) {
+ __insn_wh64(dst8);
+
+ /*
+ * Prefetch and advance to next line
+ * to prefetch, but don't go past the end
+ */
+ __insn_prefetch(prefetch);
+ prefetch += CHIP_L2_LINE_SIZE();
+ prefetch = (prefetch > src1_end) ? prefetch :
+ (const char *)src8;
+
+ /*
+ * Copy an entire cache line. Manually
+ * unrolled to avoid idiosyncracies of
+ * compiler unrolling.
+ */
+#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; })
+ COPY_WORD(0);
+ COPY_WORD(1);
+ COPY_WORD(2);
+ COPY_WORD(3);
+ COPY_WORD(4);
+ COPY_WORD(5);
+ COPY_WORD(6);
+ COPY_WORD(7);
+#if CHIP_L2_LINE_SIZE() == 128
+ COPY_WORD(8);
+ COPY_WORD(9);
+ COPY_WORD(10);
+ COPY_WORD(11);
+ COPY_WORD(12);
+ COPY_WORD(13);
+ COPY_WORD(14);
+ COPY_WORD(15);
+#elif CHIP_L2_LINE_SIZE() != 64
+# error Fix code that assumes particular L2 cache line sizes
+#endif
+
+ dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+ src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+ }
+ }
+
+ for (; n >= sizeof(word_t); n -= sizeof(word_t))
+ ST8(dst8++, LD8(src8++));
+
+ if (__builtin_expect(n == 0, 1))
+ return RETVAL;
+
+ final = LD8(src8);
+ }
+
+ /* n != 0 if we get here. Write out any trailing bytes. */
+ dst1 = (char *)dst8;
+ if (n & 4) {
+ ST4((uint32_t *)dst1, final);
+ dst1 += 4;
+ final >>= 32;
+ n &= 3;
+ }
+ if (n & 2) {
+ ST2((uint16_t *)dst1, final);
+ dst1 += 2;
+ final >>= 16;
+ n &= 1;
+ }
+ if (n)
+ ST1((uint8_t *)dst1, final);
+
+ return RETVAL;
+}
+
+
+#ifdef USERCOPY_FUNC
+#undef ST1
+#undef ST2
+#undef ST4
+#undef ST8
+#undef LD1
+#undef LD2
+#undef LD4
+#undef LD8
+#undef USERCOPY_FUNC
+#endif
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c
new file mode 100644
index 00000000000..4763b3aff1c
--- /dev/null
+++ b/arch/tile/lib/memcpy_user_64.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ *
+ * Do memcpy(), but trap and return "n" when a load or store faults.
+ *
+ * Note: this idiom only works when memcpy() compiles to a leaf function.
+ * If "sp" is updated during memcpy, the "jrp lr" will be incorrect.
+ *
+ * Also note that we are capturing "n" from the containing scope here.
+ */
+
+#define _ST(p, inst, v) \
+ ({ \
+ asm("1: " #inst " %0, %1;" \
+ ".pushsection .coldtext.memcpy,\"ax\";" \
+ "2: { move r0, %2; jrp lr };" \
+ ".section __ex_table,\"a\";" \
+ ".quad 1b, 2b;" \
+ ".popsection" \
+ : "=m" (*(p)) : "r" (v), "r" (n)); \
+ })
+
+#define _LD(p, inst) \
+ ({ \
+ unsigned long __v; \
+ asm("1: " #inst " %0, %1;" \
+ ".pushsection .coldtext.memcpy,\"ax\";" \
+ "2: { move r0, %2; jrp lr };" \
+ ".section __ex_table,\"a\";" \
+ ".quad 1b, 2b;" \
+ ".popsection" \
+ : "=r" (__v) : "m" (*(p)), "r" (n)); \
+ __v; \
+ })
+
+#define USERCOPY_FUNC __copy_to_user_inatomic
+#define ST1(p, v) _ST((p), st1, (v))
+#define ST2(p, v) _ST((p), st2, (v))
+#define ST4(p, v) _ST((p), st4, (v))
+#define ST8(p, v) _ST((p), st, (v))
+#define LD1 LD
+#define LD2 LD
+#define LD4 LD
+#define LD8 LD
+#include "memcpy_64.c"
+
+#define USERCOPY_FUNC __copy_from_user_inatomic
+#define ST1 ST
+#define ST2 ST
+#define ST4 ST
+#define ST8 ST
+#define LD1(p) _LD((p), ld1u)
+#define LD2(p) _LD((p), ld2u)
+#define LD4(p) _LD((p), ld4u)
+#define LD8(p) _LD((p), ld)
+#include "memcpy_64.c"
+
+#define USERCOPY_FUNC __copy_in_user_inatomic
+#define ST1(p, v) _ST((p), st1, (v))
+#define ST2(p, v) _ST((p), st2, (v))
+#define ST4(p, v) _ST((p), st4, (v))
+#define ST8(p, v) _ST((p), st, (v))
+#define LD1(p) _LD((p), ld1u)
+#define LD2(p) _LD((p), ld2u)
+#define LD4(p) _LD((p), ld4u)
+#define LD8(p) _LD((p), ld)
+#include "memcpy_64.c"
+
+unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
+ unsigned long n)
+{
+ unsigned long rc = __copy_from_user_inatomic(to, from, n);
+ if (unlikely(rc))
+ memset(to + n - rc, 0, rc);
+ return rc;
+}
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c
new file mode 100644
index 00000000000..3873085711d
--- /dev/null
+++ b/arch/tile/lib/memset_64.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <arch/chip.h>
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef memset
+
+void *memset(void *s, int c, size_t n)
+{
+ uint64_t *out64;
+ int n64, to_align64;
+ uint64_t v64;
+ uint8_t *out8 = s;
+
+ /* Experimentation shows that a trivial tight loop is a win up until
+ * around a size of 20, where writing a word at a time starts to win.
+ */
+#define BYTE_CUTOFF 20
+
+#if BYTE_CUTOFF < 7
+ /* This must be at least at least this big, or some code later
+ * on doesn't work.
+ */
+#error "BYTE_CUTOFF is too small"
+#endif
+
+ if (n < BYTE_CUTOFF) {
+ /* Strangely, this turns out to be the tightest way to
+ * write this loop.
+ */
+ if (n != 0) {
+ do {
+ /* Strangely, combining these into one line
+ * performs worse.
+ */
+ *out8 = c;
+ out8++;
+ } while (--n != 0);
+ }
+
+ return s;
+ }
+
+ /* Align 'out8'. We know n >= 7 so this won't write past the end. */
+ while (((uintptr_t) out8 & 7) != 0) {
+ *out8++ = c;
+ --n;
+ }
+
+ /* Align 'n'. */
+ while (n & 7)
+ out8[--n] = c;
+
+ out64 = (uint64_t *) out8;
+ n64 = n >> 3;
+
+ /* Tile input byte out to 64 bits. */
+ /* KLUDGE */
+ v64 = 0x0101010101010101ULL * (uint8_t)c;
+
+ /* This must be at least 8 or the following loop doesn't work. */
+#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
+
+ /* Determine how many words we need to emit before the 'out32'
+ * pointer becomes aligned modulo the cache line size.
+ */
+ to_align64 = (-((uintptr_t)out64 >> 3)) &
+ (CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1);
+
+ /* Only bother aligning and using wh64 if there is at least
+ * one full cache line to process. This check also prevents
+ * overrunning the end of the buffer with alignment words.
+ */
+ if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) {
+ int lines_left;
+
+ /* Align out64 mod the cache line size so we can use wh64. */
+ n64 -= to_align64;
+ for (; to_align64 != 0; to_align64--) {
+ *out64 = v64;
+ out64++;
+ }
+
+ /* Use unsigned divide to turn this into a right shift. */
+ lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS;
+
+ do {
+ /* Only wh64 a few lines at a time, so we don't
+ * exceed the maximum number of victim lines.
+ */
+ int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
+ ? lines_left
+ : CHIP_MAX_OUTSTANDING_VICTIMS());
+ uint64_t *wh = out64;
+ int i = x;
+ int j;
+
+ lines_left -= x;
+
+ do {
+ __insn_wh64(wh);
+ wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS;
+ } while (--i);
+
+ for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4);
+ j != 0; j--) {
+ *out64++ = v64;
+ *out64++ = v64;
+ *out64++ = v64;
+ *out64++ = v64;
+ }
+ } while (lines_left != 0);
+
+ /* We processed all full lines above, so only this many
+ * words remain to be processed.
+ */
+ n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1;
+ }
+
+ /* Now handle any leftover values. */
+ if (n64 != 0) {
+ do {
+ *out64 = v64;
+ out64++;
+ } while (--n64 != 0);
+ }
+
+ return s;
+}
+EXPORT_SYMBOL(memset);
diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c
new file mode 100644
index 00000000000..d6fb9581e98
--- /dev/null
+++ b/arch/tile/lib/spinlock_64.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+
+#include "spinlock_common.h"
+
+/*
+ * Read the spinlock value without allocating in our cache and without
+ * causing an invalidation to another cpu with a copy of the cacheline.
+ * This is important when we are spinning waiting for the lock.
+ */
+static inline u32 arch_spin_read_noalloc(void *lock)
+{
+ return atomic_cmpxchg((atomic_t *)lock, -1, -1);
+}
+
+/*
+ * Wait until the high bits (current) match my ticket.
+ * If we notice the overflow bit set on entry, we clear it.
+ */
+void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket)
+{
+ if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) {
+ __insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW);
+ my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW;
+ }
+
+ for (;;) {
+ u32 val = arch_spin_read_noalloc(lock);
+ u32 delta = my_ticket - arch_spin_current(val);
+ if (delta == 0)
+ return;
+ relax((128 / CYCLES_PER_RELAX_LOOP) * delta);
+ }
+}
+EXPORT_SYMBOL(arch_spin_lock_slow);
+
+/*
+ * Check the lock to see if it is plausible, and try to get it with cmpxchg().
+ */
+int arch_spin_trylock(arch_spinlock_t *lock)
+{
+ u32 val = arch_spin_read_noalloc(lock);
+ if (unlikely(arch_spin_current(val) != arch_spin_next(val)))
+ return 0;
+ return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW)
+ == val;
+}
+EXPORT_SYMBOL(arch_spin_trylock);
+
+void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ u32 iterations = 0;
+ while (arch_spin_is_locked(lock))
+ delay_backoff(iterations++);
+}
+EXPORT_SYMBOL(arch_spin_unlock_wait);
+
+/*
+ * If the read lock fails due to a writer, we retry periodically
+ * until the value is positive and we write our incremented reader count.
+ */
+void __read_lock_failed(arch_rwlock_t *rw)
+{
+ u32 val;
+ int iterations = 0;
+ do {
+ delay_backoff(iterations++);
+ val = __insn_fetchaddgez4(&rw->lock, 1);
+ } while (unlikely(arch_write_val_locked(val)));
+}
+EXPORT_SYMBOL(__read_lock_failed);
+
+/*
+ * If we failed because there were readers, clear the "writer" bit
+ * so we don't block additional readers. Otherwise, there was another
+ * writer anyway, so our "fetchor" made no difference. Then wait,
+ * issuing periodic fetchor instructions, till we get the lock.
+ */
+void __write_lock_failed(arch_rwlock_t *rw, u32 val)
+{
+ int iterations = 0;
+ do {
+ if (!arch_write_val_locked(val))
+ val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT);
+ delay_backoff(iterations++);
+ val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT);
+ } while (val != 0);
+}
+EXPORT_SYMBOL(__write_lock_failed);
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c
new file mode 100644
index 00000000000..617a9273aaa
--- /dev/null
+++ b/arch/tile/lib/strchr_64.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef strchr
+
+char *strchr(const char *s, int c)
+{
+ int z, g;
+
+ /* Get an aligned pointer. */
+ const uintptr_t s_int = (uintptr_t) s;
+ const uint64_t *p = (const uint64_t *)(s_int & -8);
+
+ /* Create eight copies of the byte for which we are looking. */
+ const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
+
+ /* Read the first aligned word, but force bytes before the string to
+ * match neither zero nor goal (we make sure the high bit of each
+ * byte is 1, and the low 7 bits are all the opposite of the goal
+ * byte).
+ *
+ * Note that this shift count expression works because we know shift
+ * counts are taken mod 64.
+ */
+ const uint64_t before_mask = (1ULL << (s_int << 3)) - 1;
+ uint64_t v = (*p | before_mask) ^
+ (goal & __insn_v1shrsi(before_mask, 1));
+
+ uint64_t zero_matches, goal_matches;
+ while (1) {
+ /* Look for a terminating '\0'. */
+ zero_matches = __insn_v1cmpeqi(v, 0);
+
+ /* Look for the goal byte. */
+ goal_matches = __insn_v1cmpeq(v, goal);
+
+ if (__builtin_expect((zero_matches | goal_matches) != 0, 0))
+ break;
+
+ v = *++p;
+ }
+
+ z = __insn_ctz(zero_matches);
+ g = __insn_ctz(goal_matches);
+
+ /* If we found c before '\0' we got a match. Note that if c == '\0'
+ * then g == z, and we correctly return the address of the '\0'
+ * rather than NULL.
+ */
+ return (g <= z) ? ((char *)p) + (g >> 3) : NULL;
+}
+EXPORT_SYMBOL(strchr);
diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c
new file mode 100644
index 00000000000..1c92d46202a
--- /dev/null
+++ b/arch/tile/lib/strlen_64.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef strlen
+
+size_t strlen(const char *s)
+{
+ /* Get an aligned pointer. */
+ const uintptr_t s_int = (uintptr_t) s;
+ const uint64_t *p = (const uint64_t *)(s_int & -8);
+
+ /* Read the first word, but force bytes before the string to be nonzero.
+ * This expression works because we know shift counts are taken mod 64.
+ */
+ uint64_t v = *p | ((1ULL << (s_int << 3)) - 1);
+
+ uint64_t bits;
+ while ((bits = __insn_v1cmpeqi(v, 0)) == 0)
+ v = *++p;
+
+ return ((const char *)p) + (__insn_ctz(bits) >> 3) - s;
+}
+EXPORT_SYMBOL(strlen);
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S
new file mode 100644
index 00000000000..2ff44f87b78
--- /dev/null
+++ b/arch/tile/lib/usercopy_64.S
@@ -0,0 +1,196 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/cache.h>
+#include <arch/chip.h>
+
+/* Access user memory, but use MMU to avoid propagating kernel exceptions. */
+
+ .pushsection .fixup,"ax"
+
+get_user_fault:
+ { movei r1, -EFAULT; move r0, zero }
+ jrp lr
+ ENDPROC(get_user_fault)
+
+put_user_fault:
+ { movei r0, -EFAULT; jrp lr }
+ ENDPROC(put_user_fault)
+
+ .popsection
+
+/*
+ * __get_user_N functions take a pointer in r0, and return 0 in r1
+ * on success, with the value in r0; or else -EFAULT in r1.
+ */
+#define __get_user_N(bytes, LOAD) \
+ STD_ENTRY(__get_user_##bytes); \
+1: { LOAD r0, r0; move r1, zero }; \
+ jrp lr; \
+ STD_ENDPROC(__get_user_##bytes); \
+ .pushsection __ex_table,"a"; \
+ .quad 1b, get_user_fault; \
+ .popsection
+
+__get_user_N(1, ld1u)
+__get_user_N(2, ld2u)
+__get_user_N(4, ld4u)
+__get_user_N(8, ld)
+
+/*
+ * __put_user_N functions take a value in r0 and a pointer in r1,
+ * and return 0 in r0 on success or -EFAULT on failure.
+ */
+#define __put_user_N(bytes, STORE) \
+ STD_ENTRY(__put_user_##bytes); \
+1: { STORE r1, r0; move r0, zero }; \
+ jrp lr; \
+ STD_ENDPROC(__put_user_##bytes); \
+ .pushsection __ex_table,"a"; \
+ .quad 1b, put_user_fault; \
+ .popsection
+
+__put_user_N(1, st1)
+__put_user_N(2, st2)
+__put_user_N(4, st4)
+__put_user_N(8, st)
+
+/*
+ * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
+ * It returns the length, including the terminating NUL, or zero on exception.
+ * If length is greater than the bound, returns one plus the bound.
+ */
+STD_ENTRY(strnlen_user_asm)
+ { beqz r1, 2f; addi r3, r0, -1 } /* bias down to include NUL */
+1: { ld1u r4, r0; addi r1, r1, -1 }
+ beqz r4, 2f
+ { bnezt r1, 1b; addi r0, r0, 1 }
+2: { sub r0, r0, r3; jrp lr }
+ STD_ENDPROC(strnlen_user_asm)
+ .pushsection .fixup,"ax"
+strnlen_user_fault:
+ { move r0, zero; jrp lr }
+ ENDPROC(strnlen_user_fault)
+ .section __ex_table,"a"
+ .quad 1b, strnlen_user_fault
+ .popsection
+
+/*
+ * strncpy_from_user_asm takes the kernel target pointer in r0,
+ * the userspace source pointer in r1, and the length bound (including
+ * the trailing NUL) in r2. On success, it returns the string length
+ * (not including the trailing NUL), or -EFAULT on failure.
+ */
+STD_ENTRY(strncpy_from_user_asm)
+ { beqz r2, 2f; move r3, r0 }
+1: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+ { st1 r0, r4; addi r0, r0, 1 }
+ beqz r2, 2f
+ bnezt r4, 1b
+ addi r0, r0, -1 /* don't count the trailing NUL */
+2: { sub r0, r0, r3; jrp lr }
+ STD_ENDPROC(strncpy_from_user_asm)
+ .pushsection .fixup,"ax"
+strncpy_from_user_fault:
+ { movei r0, -EFAULT; jrp lr }
+ ENDPROC(strncpy_from_user_fault)
+ .section __ex_table,"a"
+ .quad 1b, strncpy_from_user_fault
+ .popsection
+
+/*
+ * clear_user_asm takes the user target address in r0 and the
+ * number of bytes to zero in r1.
+ * It returns the number of uncopiable bytes (hopefully zero) in r0.
+ * Note that we don't use a separate .fixup section here since we fall
+ * through into the "fixup" code as the last straight-line bundle anyway.
+ */
+STD_ENTRY(clear_user_asm)
+ { beqz r1, 2f; or r2, r0, r1 }
+ andi r2, r2, 7
+ beqzt r2, .Lclear_aligned_user_asm
+1: { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 }
+ bnezt r1, 1b
+2: { move r0, r1; jrp lr }
+ .pushsection __ex_table,"a"
+ .quad 1b, 2b
+ .popsection
+
+.Lclear_aligned_user_asm:
+1: { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 }
+ bnezt r1, 1b
+2: { move r0, r1; jrp lr }
+ STD_ENDPROC(clear_user_asm)
+ .pushsection __ex_table,"a"
+ .quad 1b, 2b
+ .popsection
+
+/*
+ * flush_user_asm takes the user target address in r0 and the
+ * number of bytes to flush in r1.
+ * It returns the number of unflushable bytes (hopefully zero) in r0.
+ */
+STD_ENTRY(flush_user_asm)
+ beqz r1, 2f
+ { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+ { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+ { and r0, r0, r2; and r1, r1, r2 }
+ { sub r1, r1, r0 }
+1: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() }
+ { addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b }
+2: { move r0, r1; jrp lr }
+ STD_ENDPROC(flush_user_asm)
+ .pushsection __ex_table,"a"
+ .quad 1b, 2b
+ .popsection
+
+/*
+ * inv_user_asm takes the user target address in r0 and the
+ * number of bytes to invalidate in r1.
+ * It returns the number of not inv'able bytes (hopefully zero) in r0.
+ */
+STD_ENTRY(inv_user_asm)
+ beqz r1, 2f
+ { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+ { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+ { and r0, r0, r2; and r1, r1, r2 }
+ { sub r1, r1, r0 }
+1: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
+ { addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b }
+2: { move r0, r1; jrp lr }
+ STD_ENDPROC(inv_user_asm)
+ .pushsection __ex_table,"a"
+ .quad 1b, 2b
+ .popsection
+
+/*
+ * finv_user_asm takes the user target address in r0 and the
+ * number of bytes to flush-invalidate in r1.
+ * It returns the number of not finv'able bytes (hopefully zero) in r0.
+ */
+STD_ENTRY(finv_user_asm)
+ beqz r1, 2f
+ { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
+ { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
+ { and r0, r0, r2; and r1, r1, r2 }
+ { sub r1, r1, r0 }
+1: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() }
+ { addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b }
+2: { move r0, r1; jrp lr }
+ STD_ENDPROC(finv_user_asm)
+ .pushsection __ex_table,"a"
+ .quad 1b, 2b
+ .popsection