Diffstat (limited to 'arch/tile/lib')
-rw-r--r--  arch/tile/lib/Makefile            16
-rw-r--r--  arch/tile/lib/atomic_32.c        133
-rw-r--r--  arch/tile/lib/atomic_asm_32.S      1
-rw-r--r--  arch/tile/lib/cacheflush.c        16
-rw-r--r--  arch/tile/lib/exports.c            7
-rw-r--r--  arch/tile/lib/memchr_64.c          2
-rw-r--r--  arch/tile/lib/memcpy_32.S         63
-rw-r--r--  arch/tile/lib/memcpy_64.c        264
-rw-r--r--  arch/tile/lib/memcpy_tile64.c    276
-rw-r--r--  arch/tile/lib/memcpy_user_64.c     2
-rw-r--r--  arch/tile/lib/memset_32.c        110
-rw-r--r--  arch/tile/lib/memset_64.c          9
-rw-r--r--  arch/tile/lib/strchr_32.c          2
-rw-r--r--  arch/tile/lib/strchr_64.c          2
-rw-r--r--  arch/tile/lib/string-endian.h     13
-rw-r--r--  arch/tile/lib/strlen_32.c          2
-rw-r--r--  arch/tile/lib/strnlen_32.c        47
-rw-r--r--  arch/tile/lib/strnlen_64.c        48
-rw-r--r--  arch/tile/lib/usercopy_32.S       36
-rw-r--r--  arch/tile/lib/usercopy_64.S       36
20 files changed, 378 insertions, 707 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 985f59858234..c4211cbb2021 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -4,15 +4,15 @@
lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
- strchr_$(BITS).o strlen_$(BITS).o
-
-ifeq ($(CONFIG_TILEGX),y)
-CFLAGS_REMOVE_memcpy_user_64.o = -fno-omit-frame-pointer
-lib-y += memcpy_user_64.o
-else
-lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o
-endif
+ strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o
+lib-$(CONFIG_TILEGX) += memcpy_user_64.o
+lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o
lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o
obj-$(CONFIG_MODULES) += exports.o
+
+# The finv_buffer_remote() and copy_{to,from}_user() routines can't
+# have -pg added, since they both rely on being leaf functions.
+CFLAGS_REMOVE_cacheflush.o = -pg
+CFLAGS_REMOVE_memcpy_user_64.o = -pg
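
As background for the CFLAGS_REMOVE lines above: with gcc, -pg inserts a call to the profiling hook at function entry (__mcount on this port; see the exports.c hunk below), which forces a frame and clobbers the link register, so routines that must remain leaf functions cannot be built with it. A rough C-level sketch of the effect, not the actual generated code:

/* Hedged illustration: what -pg instrumentation conceptually adds. */
extern void __mcount(void);        /* profiling hook; exported in exports.c below */

int leaf(int x)
{
        /*
         * With -pg the compiler effectively prepends a call:
         *         __mcount();
         * Making that call requires saving the return address to a
         * frame, so the function is no longer a leaf.
         * finv_buffer_remote() and the copy_{to,from}_user() fixup
         * paths rely on staying leaf functions, hence the
         * CFLAGS_REMOVE_*.o = -pg lines above.
         */
        return x + 1;
}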
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index f5cada70c3c8..759efa337be8 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -20,50 +20,12 @@
#include <linux/atomic.h>
#include <arch/chip.h>
-/* See <asm/atomic_32.h> */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-/*
- * A block of memory containing locks for atomic ops. Each instance of this
- * struct will be homed on a different CPU.
- */
-struct atomic_locks_on_cpu {
- int lock[ATOMIC_HASH_L2_SIZE];
-} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4)));
-
-static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool);
-
-/* The locks we'll use until __init_atomic_per_cpu is called. */
-static struct atomic_locks_on_cpu __initdata initial_atomic_locks;
-
-/* Hash into this vector to get a pointer to lock for the given atomic. */
-struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
- __write_once = {
- [0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks)
-};
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
/* This page is remapped on startup to be hash-for-home. */
int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
int *__atomic_hashed_lock(volatile void *v)
{
/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
- unsigned long i =
- (unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
- unsigned long n = __insn_crc32_32(0, i);
-
- /* Grab high bits for L1 index. */
- unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT);
- /* Grab low bits for L2 index. */
- unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1);
-
- return &atomic_lock_ptr[l1_index]->lock[l2_index];
-#else
/*
* Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.
* Using mm works here because atomic_locks is page aligned.
@@ -72,26 +34,13 @@ int *__atomic_hashed_lock(volatile void *v)
(unsigned long)atomic_locks,
2, (ATOMIC_HASH_SHIFT + 2) - 1);
return (int *)ptr;
-#endif
}
#ifdef CONFIG_SMP
/* Return whether the passed pointer is a valid atomic lock pointer. */
static int is_atomic_lock(int *p)
{
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
- int i;
- for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-
- if (p >= &atomic_lock_ptr[i]->lock[0] &&
- p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) {
- return 1;
- }
- }
- return 0;
-#else
return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE];
-#endif
}
void __atomic_fault_unlock(int *irqlock_word)
@@ -110,33 +59,32 @@ static inline int *__atomic_setup(volatile void *v)
return __atomic_hashed_lock(v);
}
-int _atomic_xchg(atomic_t *v, int n)
+int _atomic_xchg(int *v, int n)
{
- return __atomic_xchg(&v->counter, __atomic_setup(v), n).val;
+ return __atomic_xchg(v, __atomic_setup(v), n).val;
}
EXPORT_SYMBOL(_atomic_xchg);
-int _atomic_xchg_add(atomic_t *v, int i)
+int _atomic_xchg_add(int *v, int i)
{
- return __atomic_xchg_add(&v->counter, __atomic_setup(v), i).val;
+ return __atomic_xchg_add(v, __atomic_setup(v), i).val;
}
EXPORT_SYMBOL(_atomic_xchg_add);
-int _atomic_xchg_add_unless(atomic_t *v, int a, int u)
+int _atomic_xchg_add_unless(int *v, int a, int u)
{
/*
* Note: argument order is switched here since it is easier
* to use the first argument consistently as the "old value"
* in the assembly, as is done for _atomic_cmpxchg().
*/
- return __atomic_xchg_add_unless(&v->counter, __atomic_setup(v), u, a)
- .val;
+ return __atomic_xchg_add_unless(v, __atomic_setup(v), u, a).val;
}
EXPORT_SYMBOL(_atomic_xchg_add_unless);
-int _atomic_cmpxchg(atomic_t *v, int o, int n)
+int _atomic_cmpxchg(int *v, int o, int n)
{
- return __atomic_cmpxchg(&v->counter, __atomic_setup(v), o, n).val;
+ return __atomic_cmpxchg(v, __atomic_setup(v), o, n).val;
}
EXPORT_SYMBOL(_atomic_cmpxchg);
@@ -159,33 +107,32 @@ unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask)
EXPORT_SYMBOL(_atomic_xor);
-u64 _atomic64_xchg(atomic64_t *v, u64 n)
+u64 _atomic64_xchg(u64 *v, u64 n)
{
- return __atomic64_xchg(&v->counter, __atomic_setup(v), n);
+ return __atomic64_xchg(v, __atomic_setup(v), n);
}
EXPORT_SYMBOL(_atomic64_xchg);
-u64 _atomic64_xchg_add(atomic64_t *v, u64 i)
+u64 _atomic64_xchg_add(u64 *v, u64 i)
{
- return __atomic64_xchg_add(&v->counter, __atomic_setup(v), i);
+ return __atomic64_xchg_add(v, __atomic_setup(v), i);
}
EXPORT_SYMBOL(_atomic64_xchg_add);
-u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u)
+u64 _atomic64_xchg_add_unless(u64 *v, u64 a, u64 u)
{
/*
* Note: argument order is switched here since it is easier
* to use the first argument consistently as the "old value"
* in the assembly, as is done for _atomic_cmpxchg().
*/
- return __atomic64_xchg_add_unless(&v->counter, __atomic_setup(v),
- u, a);
+ return __atomic64_xchg_add_unless(v, __atomic_setup(v), u, a);
}
EXPORT_SYMBOL(_atomic64_xchg_add_unless);
-u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
+u64 _atomic64_cmpxchg(u64 *v, u64 o, u64 n)
{
- return __atomic64_cmpxchg(&v->counter, __atomic_setup(v), o, n);
+ return __atomic64_cmpxchg(v, __atomic_setup(v), o, n);
}
EXPORT_SYMBOL(_atomic64_cmpxchg);
@@ -208,54 +155,8 @@ struct __get_user __atomic_bad_address(int __user *addr)
}
-#if CHIP_HAS_CBOX_HOME_MAP()
-static int __init noatomichash(char *str)
-{
- pr_warning("noatomichash is deprecated.\n");
- return 1;
-}
-__setup("noatomichash", noatomichash);
-#endif
-
void __init __init_atomic_per_cpu(void)
{
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
- unsigned int i;
- int actual_cpu;
-
- /*
- * Before this is called from setup, we just have one lock for
- * all atomic objects/operations. Here we replace the
- * elements of atomic_lock_ptr so that they point at per_cpu
- * integers. This seemingly over-complex approach stems from
- * the fact that DEFINE_PER_CPU defines an entry for each cpu
- * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1. But
- * for efficient hashing of atomics to their locks we want a
- * compile time constant power of 2 for the size of this
- * table, so we use ATOMIC_HASH_SIZE.
- *
- * Here we populate atomic_lock_ptr from the per cpu
- * atomic_lock_pool, interspersing by actual cpu so that
- * subsequent elements are homed on consecutive cpus.
- */
-
- actual_cpu = cpumask_first(cpu_possible_mask);
-
- for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
- /*
- * Preincrement to slightly bias against using cpu 0,
- * which has plenty of stuff homed on it already.
- */
- actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask);
- if (actual_cpu >= nr_cpu_ids)
- actual_cpu = cpumask_first(cpu_possible_mask);
-
- atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu);
- }
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
/* Validate power-of-two and "bigger than cpus" assumption */
BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
@@ -279,6 +180,4 @@ void __init __init_atomic_per_cpu(void)
* That should not produce more indices than ATOMIC_HASH_SIZE.
*/
BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
-
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
}
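
For context on the file above: TILEPro has no native 32-bit atomic read-modify-write instructions, so the kernel emulates them by hashing each object's address to one of the int locks in atomic_locks[] and doing the operation under that lock (the real fast path is the assembly in atomic_asm_32.S and kernel/intvec_32.S). A simplified, lock-based sketch of the scheme, assuming generic GCC builtins rather than the kernel's actual lock code; all names here are illustrative:

#define DEMO_HASH_SIZE 1024                     /* stand-in for ATOMIC_HASH_SIZE */

static int demo_locks[DEMO_HASH_SIZE];          /* 0 = unlocked */

static int *demo_hashed_lock(volatile void *v)
{
        /* As in __atomic_hashed_lock(): pick a lock from low address
         * bits above the word offset so nearby objects spread out. */
        unsigned long idx = ((unsigned long)v >> 3) & (DEMO_HASH_SIZE - 1);
        return &demo_locks[idx];
}

static int demo_cmpxchg(int *v, int o, int n)
{
        int *lock = demo_hashed_lock(v);
        int prev;

        while (__sync_lock_test_and_set(lock, 1))       /* spin until acquired */
                ;
        prev = *v;
        if (prev == o)
                *v = n;
        __sync_lock_release(lock);
        return prev;
}

Hashing to many locks rather than one keeps unrelated atomics from contending, which is why the code above insists ATOMIC_HASH_SIZE is a power of two and at least as large as the cpu count.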
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 30638042691d..6bda3132cd61 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -164,6 +164,7 @@ STD_ENTRY_SECTION(__atomic\name, .text.atomic)
STD_ENDPROC(__atomic\name)
.ifc \bitwidth,32
.pushsection __ex_table,"a"
+ .align 4
.word 1b, __atomic\name
.word 2b, __atomic\name
.word __atomic\name, __atomic_bad_address
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 8f8ad814b139..9c0ec22009a5 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -36,7 +36,8 @@ static inline void force_load(char *p)
* core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
* until the memory controller holds the flushed values.
*/
-void finv_buffer_remote(void *buffer, size_t size, int hfh)
+void __attribute__((optimize("omit-frame-pointer")))
+finv_buffer_remote(void *buffer, size_t size, int hfh)
{
char *p, *base;
size_t step_size, load_count;
@@ -147,18 +148,21 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh)
force_load(p);
/*
- * Repeat, but with inv's instead of loads, to get rid of the
+ * Repeat, but with finv's instead of loads, to get rid of the
* data we just loaded into our own cache and the old home L3.
- * No need to unroll since inv's don't target a register.
+ * No need to unroll since finv's don't target a register.
+ * The finv's are guaranteed not to actually flush the data in
+ * the buffer back to their home, since we just read it, so the
+ * lines are clean in cache; we will only invalidate those lines.
*/
p = (char *)buffer + size - 1;
- __insn_inv(p);
+ __insn_finv(p);
p -= step_size;
p = (char *)((unsigned long)p | (step_size - 1));
for (; p >= base; p -= step_size)
- __insn_inv(p);
+ __insn_finv(p);
- /* Wait for the load+inv's (and thus finvs) to have completed. */
+ /* Wait for these finv's (and thus the first finvs) to be done. */
__insn_mf();
#ifdef __tilegx__
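
A simplified sketch of the two-pass pattern finv_buffer_remote() implements above: pass one loads every cache line so the data migrates out of its remote home, pass two finv's every line so the now-clean local copies are dropped and memory holds the values. The striding, unrolling, and extra memory-controller fencing of the real routine are omitted here, so treat this only as an illustration of the structure:

#include <linux/types.h>

static void finv_buffer_sketch(void *buffer, size_t size, size_t line_size)
{
        char *base = (char *)buffer;
        char *end = base + size;
        char *p;

        for (p = base; p < end; p += line_size)
                (void)*(volatile char *)p;      /* pass 1: pull each line local */
        __insn_mf();                            /* wait for the loads */

        for (p = base; p < end; p += line_size)
                __insn_finv(p);                 /* pass 2: flush-and-invalidate */
        __insn_mf();                            /* wait for the finv's */
}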
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index a93b02a25222..82733c87d67e 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -22,7 +22,6 @@ EXPORT_SYMBOL(strnlen_user_asm);
EXPORT_SYMBOL(strncpy_from_user_asm);
EXPORT_SYMBOL(clear_user_asm);
EXPORT_SYMBOL(flush_user_asm);
-EXPORT_SYMBOL(inv_user_asm);
EXPORT_SYMBOL(finv_user_asm);
/* arch/tile/kernel/entry.S */
@@ -34,6 +33,12 @@ EXPORT_SYMBOL(dump_stack);
/* arch/tile/kernel/head.S */
EXPORT_SYMBOL(empty_zero_page);
+#ifdef CONFIG_FUNCTION_TRACER
+/* arch/tile/kernel/mcount_64.S */
+#include <asm/ftrace.h>
+EXPORT_SYMBOL(__mcount);
+#endif /* CONFIG_FUNCTION_TRACER */
+
/* arch/tile/lib/, various memcpy files */
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__copy_to_user_inatomic);
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
index 6f867dbf7c56..f8196b3a950e 100644
--- a/arch/tile/lib/memchr_64.c
+++ b/arch/tile/lib/memchr_64.c
@@ -36,7 +36,7 @@ void *memchr(const void *s, int c, size_t n)
p = (const uint64_t *)(s_int & -8);
/* Create eight copies of the byte for which we are looking. */
- goal = 0x0101010101010101ULL * (uint8_t) c;
+ goal = copy_byte(c);
/* Read the first word, but munge it so that bytes before the array
* will not match goal.
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 2a419a6122db..a2771ae5da53 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -22,14 +22,6 @@
#include <linux/linkage.h>
-/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-#define memcpy __memcpy_asm
-#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
-#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
-#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
-#endif
-
#define IS_MEMCPY 0
#define IS_COPY_FROM_USER 1
#define IS_COPY_FROM_USER_ZEROING 2
@@ -44,6 +36,7 @@
*/
#define EX \
.pushsection __ex_table, "a"; \
+ .align 4; \
.word 9f, memcpy_common_fixup; \
.popsection; \
9
@@ -158,12 +151,9 @@ EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
{ addi r3, r1, 60; andi r9, r9, -64 }
-#if CHIP_HAS_WH64()
/* No need to prefetch dst, we'll just do the wh64
* right before we copy a line.
*/
-#endif
-
EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 }
/* Intentionally stall for a few cycles to leave L2 cache alone. */
{ bnzt zero, .; move r27, lr }
@@ -171,21 +161,6 @@ EX: { lw r6, r3; addi r3, r3, 64 }
/* Intentionally stall for a few cycles to leave L2 cache alone. */
{ bnzt zero, . }
EX: { lw r7, r3; addi r3, r3, 64 }
-#if !CHIP_HAS_WH64()
- /* Prefetch the dest */
- /* Intentionally stall for a few cycles to leave L2 cache alone. */
- { bnzt zero, . }
- /* Use a real load to cause a TLB miss if necessary. We aren't using
- * r28, so this should be fine.
- */
-EX: { lw r28, r9; addi r9, r9, 64 }
- /* Intentionally stall for a few cycles to leave L2 cache alone. */
- { bnzt zero, . }
- { prefetch r9; addi r9, r9, 64 }
- /* Intentionally stall for a few cycles to leave L2 cache alone. */
- { bnzt zero, . }
- { prefetch r9; addi r9, r9, 64 }
-#endif
/* Intentionally stall for a few cycles to leave L2 cache alone. */
{ bz zero, .Lbig_loop2 }
@@ -286,13 +261,8 @@ EX: { lw r7, r3; addi r3, r3, 64 }
/* Fill second L1D line. */
EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
-#if CHIP_HAS_WH64()
/* Prepare destination line for writing. */
EX: { wh64 r9; addi r9, r9, 64 }
-#else
- /* Prefetch dest line */
- { prefetch r9; addi r9, r9, 64 }
-#endif
/* Load seven words that are L1D hits to cover wh64 L2 usage. */
/* Load the three remaining words from the last L1D line, which
@@ -330,16 +300,7 @@ EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */
EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
-#if CHIP_HAS_WH64()
EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
-#else
- /* Back up the r9 to a cache line we are already storing to
- * if it gets past the end of the dest vector. Strictly speaking,
- * we don't need to back up to the start of a cache line, but it's free
- * and tidy, so why not?
- */
-EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
-#endif
/* Store second L1D line. */
EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */
@@ -403,7 +364,6 @@ EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
.Ldest_is_word_aligned:
-#if CHIP_HAS_DWORD_ALIGN()
EX: { andi r8, r0, 63; lwadd_na r6, r1, 4}
{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
@@ -511,26 +471,6 @@ EX: { swadd r0, r13, 4; addi r2, r2, -32 }
/* Move r1 back to the point where it corresponds to r0. */
{ addi r1, r1, -4 }
-#else /* !CHIP_HAS_DWORD_ALIGN() */
-
- /* Compute right/left shift counts and load initial source words. */
- { andi r5, r1, -4; andi r3, r1, 3 }
-EX: { lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
-EX: { lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
-
- /* Load and store one word at a time, using shifts and ORs
- * to correct for the misaligned src.
- */
-.Lcopy_unaligned_src_loop:
- { shr r6, r6, r3; shl r8, r7, r4 }
-EX: { lw r7, r5; or r8, r8, r6; move r6, r7 }
-EX: { sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
- { addi r5, r5, 4; slti_u r8, r2, 8 }
- { bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }
-
- { bz r2, .Lcopy_unaligned_done }
-#endif /* !CHIP_HAS_DWORD_ALIGN() */
-
/* Fall through */
/*
@@ -614,5 +554,6 @@ memcpy_fixup_loop:
.size memcpy_common_fixup, . - memcpy_common_fixup
.section __ex_table,"a"
+ .align 4
.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
.word .Lctu, .Lcopy_to_user_fixup_done
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
index c79b8e7c6828..4815354b8cd2 100644
--- a/arch/tile/lib/memcpy_64.c
+++ b/arch/tile/lib/memcpy_64.c
@@ -18,14 +18,17 @@
/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
/* Must be 8 bytes in size. */
-#define word_t uint64_t
+#define op_t uint64_t
-#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128
-#error "Assumes 64 or 128 byte line size"
+/* Threshold value for when to enter the unrolled loops. */
+#define OP_T_THRES 16
+
+#if CHIP_L2_LINE_SIZE() != 64
+#error "Assumes 64 byte line size"
#endif
/* How many cache lines ahead should we prefetch? */
-#define PREFETCH_LINES_AHEAD 3
+#define PREFETCH_LINES_AHEAD 4
/*
* Provide "base versions" of load and store for the normal code path.
@@ -51,15 +54,16 @@ void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
* macros to return a count of uncopied bytes due to mm fault.
*/
#define RETVAL 0
-int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
+int __attribute__((optimize("omit-frame-pointer")))
+USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
#endif
{
char *__restrict dst1 = (char *)dstv;
const char *__restrict src1 = (const char *)srcv;
const char *__restrict src1_end;
const char *__restrict prefetch;
- word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
- word_t final; /* Final bytes to write to trailing word, if any */
+ op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
+ op_t final; /* Final bytes to write to trailing word, if any */
long i;
if (n < 16) {
@@ -79,104 +83,228 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
__insn_prefetch(prefetch);
prefetch += CHIP_L2_LINE_SIZE();
- prefetch = (prefetch > src1_end) ? prefetch : src1;
+ prefetch = (prefetch < src1_end) ? prefetch : src1;
}
/* Copy bytes until dst is word-aligned. */
- for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--)
+ for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--)
ST1(dst1++, LD1(src1++));
/* 8-byte pointer to destination memory. */
- dst8 = (word_t *)dst1;
-
- if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) {
- /*
- * Misaligned copy. Copy 8 bytes at a time, but don't
- * bother with other fanciness.
- *
- * TODO: Consider prefetching and using wh64 as well.
- */
-
- /* Create an aligned src8. */
- const word_t *__restrict src8 =
- (const word_t *)((uintptr_t)src1 & -sizeof(word_t));
- word_t b;
-
- word_t a = LD8(src8++);
- for (; n >= sizeof(word_t); n -= sizeof(word_t)) {
- b = LD8(src8++);
- a = __insn_dblalign(a, b, src1);
- ST8(dst8++, a);
- a = b;
+ dst8 = (op_t *)dst1;
+
+ if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) {
+ /* Unaligned copy. */
+
+ op_t tmp0 = 0, tmp1 = 0, tmp2, tmp3;
+ const op_t *src8 = (const op_t *) ((uintptr_t)src1 &
+ -sizeof(op_t));
+ const void *srci = (void *)src1;
+ int m;
+
+ m = (CHIP_L2_LINE_SIZE() << 2) -
+ (((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1));
+ m = (n < m) ? n : m;
+ m /= sizeof(op_t);
+
+ /* Copy until 'dst' is cache-line-aligned. */
+ n -= (sizeof(op_t) * m);
+
+ switch (m % 4) {
+ case 0:
+ if (__builtin_expect(!m, 0))
+ goto _M0;
+ tmp1 = LD8(src8++);
+ tmp2 = LD8(src8++);
+ goto _8B3;
+ case 2:
+ m += 2;
+ tmp3 = LD8(src8++);
+ tmp0 = LD8(src8++);
+ goto _8B1;
+ case 3:
+ m += 1;
+ tmp2 = LD8(src8++);
+ tmp3 = LD8(src8++);
+ goto _8B2;
+ case 1:
+ m--;
+ tmp0 = LD8(src8++);
+ tmp1 = LD8(src8++);
+ if (__builtin_expect(!m, 0))
+ goto _8B0;
+ }
+
+ do {
+ tmp2 = LD8(src8++);
+ tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+ ST8(dst8++, tmp0);
+_8B3:
+ tmp3 = LD8(src8++);
+ tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+ ST8(dst8++, tmp1);
+_8B2:
+ tmp0 = LD8(src8++);
+ tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+ ST8(dst8++, tmp2);
+_8B1:
+ tmp1 = LD8(src8++);
+ tmp3 = __insn_dblalign(tmp3, tmp0, srci);
+ ST8(dst8++, tmp3);
+ m -= 4;
+ } while (m);
+
+_8B0:
+ tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+ ST8(dst8++, tmp0);
+ src8--;
+
+_M0:
+ if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) {
+ op_t tmp4, tmp5, tmp6, tmp7, tmp8;
+
+ prefetch = ((const char *)src8) +
+ CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD;
+
+ for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE();
+ n -= CHIP_L2_LINE_SIZE()) {
+ /* Prefetch and advance to next line to
+ prefetch, but don't go past the end. */
+ __insn_prefetch(prefetch);
+
+ /* Make sure prefetch got scheduled
+ earlier. */
+ __asm__ ("" : : : "memory");
+
+ prefetch += CHIP_L2_LINE_SIZE();
+ prefetch = (prefetch < src1_end) ? prefetch :
+ (const char *) src8;
+
+ tmp1 = LD8(src8++);
+ tmp2 = LD8(src8++);
+ tmp3 = LD8(src8++);
+ tmp4 = LD8(src8++);
+ tmp5 = LD8(src8++);
+ tmp6 = LD8(src8++);
+ tmp7 = LD8(src8++);
+ tmp8 = LD8(src8++);
+
+ tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+ tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+ tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+ tmp3 = __insn_dblalign(tmp3, tmp4, srci);
+ tmp4 = __insn_dblalign(tmp4, tmp5, srci);
+ tmp5 = __insn_dblalign(tmp5, tmp6, srci);
+ tmp6 = __insn_dblalign(tmp6, tmp7, srci);
+ tmp7 = __insn_dblalign(tmp7, tmp8, srci);
+
+ __insn_wh64(dst8);
+
+ ST8(dst8++, tmp0);
+ ST8(dst8++, tmp1);
+ ST8(dst8++, tmp2);
+ ST8(dst8++, tmp3);
+ ST8(dst8++, tmp4);
+ ST8(dst8++, tmp5);
+ ST8(dst8++, tmp6);
+ ST8(dst8++, tmp7);
+
+ tmp0 = tmp8;
+ }
+ src8--;
+ }
+
+ /* Copy the rest 8-byte chunks. */
+ if (n >= sizeof(op_t)) {
+ tmp0 = LD8(src8++);
+ for (; n >= sizeof(op_t); n -= sizeof(op_t)) {
+ tmp1 = LD8(src8++);
+ tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+ ST8(dst8++, tmp0);
+ tmp0 = tmp1;
+ }
+ src8--;
}
if (n == 0)
return RETVAL;
- b = ((const char *)src8 <= src1_end) ? *src8 : 0;
+ tmp0 = LD8(src8++);
+ tmp1 = ((const char *)src8 <= src1_end)
+ ? LD8((op_t *)src8) : 0;
+ final = __insn_dblalign(tmp0, tmp1, srci);
- /*
- * Final source bytes to write to trailing partial
- * word, if any.
- */
- final = __insn_dblalign(a, b, src1);
} else {
/* Aligned copy. */
- const word_t* __restrict src8 = (const word_t *)src1;
+ const op_t *__restrict src8 = (const op_t *)src1;
/* src8 and dst8 are both word-aligned. */
if (n >= CHIP_L2_LINE_SIZE()) {
/* Copy until 'dst' is cache-line-aligned. */
for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
- n -= sizeof(word_t))
+ n -= sizeof(op_t))
ST8(dst8++, LD8(src8++));
for (; n >= CHIP_L2_LINE_SIZE(); ) {
- __insn_wh64(dst8);
+ op_t tmp0, tmp1, tmp2, tmp3;
+ op_t tmp4, tmp5, tmp6, tmp7;
/*
* Prefetch and advance to next line
- * to prefetch, but don't go past the end
+ * to prefetch, but don't go past the
+ * end.
*/
__insn_prefetch(prefetch);
+
+ /* Make sure prefetch got scheduled
+ earlier. */
+ __asm__ ("" : : : "memory");
+
prefetch += CHIP_L2_LINE_SIZE();
- prefetch = (prefetch > src1_end) ? prefetch :
+ prefetch = (prefetch < src1_end) ? prefetch :
(const char *)src8;
/*
- * Copy an entire cache line. Manually
- * unrolled to avoid idiosyncracies of
- * compiler unrolling.
+ * Do all the loads before wh64. This
+ * is necessary if [src8, src8+7] and
+ * [dst8, dst8+7] share the same cache
+ * line and dst8 <= src8, as can be
+ * the case when called from memmove,
+ * or with code tested on x86 whose
+ * memcpy always works with forward
+ * copies.
*/
-#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; })
- COPY_WORD(0);
- COPY_WORD(1);
- COPY_WORD(2);
- COPY_WORD(3);
- COPY_WORD(4);
- COPY_WORD(5);
- COPY_WORD(6);
- COPY_WORD(7);
-#if CHIP_L2_LINE_SIZE() == 128
- COPY_WORD(8);
- COPY_WORD(9);
- COPY_WORD(10);
- COPY_WORD(11);
- COPY_WORD(12);
- COPY_WORD(13);
- COPY_WORD(14);
- COPY_WORD(15);
-#elif CHIP_L2_LINE_SIZE() != 64
-# error Fix code that assumes particular L2 cache line sizes
-#endif
+ tmp0 = LD8(src8++);
+ tmp1 = LD8(src8++);
+ tmp2 = LD8(src8++);
+ tmp3 = LD8(src8++);
+ tmp4 = LD8(src8++);
+ tmp5 = LD8(src8++);
+ tmp6 = LD8(src8++);
+ tmp7 = LD8(src8++);
+
+ /* wh64 and wait for tmp7 load completion. */
+ __asm__ ("move %0, %0; wh64 %1\n"
+ : : "r"(tmp7), "r"(dst8));
- dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
- src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+ ST8(dst8++, tmp0);
+ ST8(dst8++, tmp1);
+ ST8(dst8++, tmp2);
+ ST8(dst8++, tmp3);
+ ST8(dst8++, tmp4);
+ ST8(dst8++, tmp5);
+ ST8(dst8++, tmp6);
+ ST8(dst8++, tmp7);
+
+ n -= CHIP_L2_LINE_SIZE();
}
+#if CHIP_L2_LINE_SIZE() != 64
+# error "Fix code that assumes particular L2 cache line size."
+#endif
}
- for (; n >= sizeof(word_t); n -= sizeof(word_t))
+ for (; n >= sizeof(op_t); n -= sizeof(op_t))
ST8(dst8++, LD8(src8++));
if (__builtin_expect(n == 0, 1))
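
The unaligned path above is built around __insn_dblalign, which funnel-shifts two adjacent aligned source words into one aligned destination word based on the low bits of the original source pointer, with the loop unrolled four ways and software-pipelined around prefetch and wh64. A portable, little-endian sketch of just that merging step, with none of the pipelining (names here are illustrative, not the kernel's):

#include <stdint.h>
#include <stddef.h>

/* Hedged sketch: copy from an unaligned src by reading aligned 8-byte
 * words and merging each adjacent pair, the job __insn_dblalign does
 * in a single instruction.  Assumes dst is 8-byte aligned, n is a
 * multiple of 8, and src is NOT 8-byte aligned (the aligned case takes
 * the other branch above); the final aligned load then stays within
 * the word holding the last source byte, as in the real code. */
static void copy_unaligned_sketch(uint64_t *dst, const char *src, size_t n)
{
        const uint64_t *src8 = (const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
        unsigned int shift = ((uintptr_t)src & 7) * 8;
        uint64_t a = *src8++;

        for (; n >= 8; n -= 8) {
                uint64_t b = *src8++;
                *dst++ = shift ? (a >> shift) | (b << (64 - shift)) : a;
                a = b;
        }
}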
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
deleted file mode 100644
index 3bc4b4e40d93..000000000000
--- a/arch/tile/lib/memcpy_tile64.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for
- * more details.
- */
-
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <asm/fixmap.h>
-#include <asm/kmap_types.h>
-#include <asm/tlbflush.h>
-#include <hv/hypervisor.h>
-#include <arch/chip.h>
-
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-
-/* Defined in memcpy.S */
-extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
-extern unsigned long __copy_to_user_inatomic_asm(
- void __user *to, const void *from, unsigned long n);
-extern unsigned long __copy_from_user_inatomic_asm(
- void *to, const void __user *from, unsigned long n);
-extern unsigned long __copy_from_user_zeroing_asm(
- void *to, const void __user *from, unsigned long n);
-
-typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
-
-/* Size above which to consider TLB games for performance */
-#define LARGE_COPY_CUTOFF 2048
-
-/* Communicate to the simulator what we are trying to do. */
-#define sim_allow_multiple_caching(b) \
- __insn_mtspr(SPR_SIM_CONTROL, \
- SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS))
-
-/*
- * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
- *
- * We set up our own source and destination PTEs that we fully control.
- * This is the only way to guarantee that we don't race with another
- * thread that is modifying the PTE; we can't afford to try the
- * copy_{to,from}_user() technique of catching the interrupt, since
- * we must run with interrupts disabled to avoid the risk of some
- * other code seeing the incoherent data in our cache. (Recall that
- * our cache is indexed by PA, so even if the other code doesn't use
- * our kmap_atomic virtual addresses, they'll still hit in cache using
- * the normal VAs that aren't supposed to hit in cache.)
- */
-static void memcpy_multicache(void *dest, const void *source,
- pte_t dst_pte, pte_t src_pte, int len)
-{
- int idx;
- unsigned long flags, newsrc, newdst;
- pmd_t *pmdp;
- pte_t *ptep;
- int type0, type1;
- int cpu = get_cpu();
-
- /*
- * Disable interrupts so that we don't recurse into memcpy()
- * in an interrupt handler, nor accidentally reference
- * the PA of the source from an interrupt routine. Also
- * notify the simulator that we're playing games so we don't
- * generate spurious coherency warnings.
- */
- local_irq_save(flags);
- sim_allow_multiple_caching(1);
-
- /* Set up the new dest mapping */
- type0 = kmap_atomic_idx_push();
- idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
- newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
- pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
- ptep = pte_offset_kernel(pmdp, newdst);
- if (pte_val(*ptep) != pte_val(dst_pte)) {
- set_pte(ptep, dst_pte);
- local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
- }
-
- /* Set up the new source mapping */
- type1 = kmap_atomic_idx_push();
- idx += (type0 - type1);
- src_pte = hv_pte_set_nc(src_pte);
- src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */
- newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
- pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
- ptep = pte_offset_kernel(pmdp, newsrc);
- __set_pte(ptep, src_pte); /* set_pte() would be confused by this */
- local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
- /* Actually move the data. */
- __memcpy_asm((void *)newdst, (const void *)newsrc, len);
-
- /*
- * Remap the source as locally-cached and not OLOC'ed so that
- * we can inval without also invaling the remote cpu's cache.
- * This also avoids known errata with inv'ing cacheable oloc data.
- */
- src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
- src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
- __set_pte(ptep, src_pte); /* set_pte() would be confused by this */
- local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
- /*
- * Do the actual invalidation, covering the full L2 cache line
- * at the end since __memcpy_asm() is somewhat aggressive.
- */
- __inv_buffer((void *)newsrc, len);
-
- /*
- * We're done: notify the simulator that all is back to normal,
- * and re-enable interrupts and pre-emption.
- */
- kmap_atomic_idx_pop();
- kmap_atomic_idx_pop();
- sim_allow_multiple_caching(0);
- local_irq_restore(flags);
- put_cpu();
-}
-
-/*
- * Identify large copies from remotely-cached memory, and copy them
- * via memcpy_multicache() if they look good, otherwise fall back
- * to the particular kind of copying passed as the memcpy_t function.
- */
-static unsigned long fast_copy(void *dest, const void *source, int len,
- memcpy_t func)
-{
- /*
- * Check if it's big enough to bother with. We may end up doing a
- * small copy via TLB manipulation if we're near a page boundary,
- * but presumably we'll make it up when we hit the second page.
- */
- while (len >= LARGE_COPY_CUTOFF) {
- int copy_size, bytes_left_on_page;
- pte_t *src_ptep, *dst_ptep;
- pte_t src_pte, dst_pte;
- struct page *src_page, *dst_page;
-
- /* Is the source page oloc'ed to a remote cpu? */
-retry_source:
- src_ptep = virt_to_pte(current->mm, (unsigned long)source);
- if (src_ptep == NULL)
- break;
- src_pte = *src_ptep;
- if (!hv_pte_get_present(src_pte) ||
- !hv_pte_get_readable(src_pte) ||
- hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
- break;
- if (get_remote_cache_cpu(src_pte) == smp_processor_id())
- break;
- src_page = pfn_to_page(pte_pfn(src_pte));
- get_page(src_page);
- if (pte_val(src_pte) != pte_val(*src_ptep)) {
- put_page(src_page);
- goto retry_source;
- }
- if (pte_huge(src_pte)) {
- /* Adjust the PTE to correspond to a small page */
- int pfn = pte_pfn(src_pte);
- pfn += (((unsigned long)source & (HPAGE_SIZE-1))
- >> PAGE_SHIFT);
- src_pte = pfn_pte(pfn, src_pte);
- src_pte = pte_mksmall(src_pte);
- }
-
- /* Is the destination page writable? */
-retry_dest:
- dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
- if (dst_ptep == NULL) {
- put_page(src_page);
- break;
- }
- dst_pte = *dst_ptep;
- if (!hv_pte_get_present(dst_pte) ||
- !hv_pte_get_writable(dst_pte)) {
- put_page(src_page);
- break;
- }
- dst_page = pfn_to_page(pte_pfn(dst_pte));
- if (dst_page == src_page) {
- /*
- * Source and dest are on the same page; this
- * potentially exposes us to incoherence if any
- * part of src and dest overlap on a cache line.
- * Just give up rather than trying to be precise.
- */
- put_page(src_page);
- break;
- }
- get_page(dst_page);
- if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
- put_page(dst_page);
- goto retry_dest;
- }
- if (pte_huge(dst_pte)) {
- /* Adjust the PTE to correspond to a small page */
- int pfn = pte_pfn(dst_pte);
- pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
- >> PAGE_SHIFT);
- dst_pte = pfn_pte(pfn, dst_pte);
- dst_pte = pte_mksmall(dst_pte);
- }
-
- /* All looks good: create a cachable PTE and copy from it */
- copy_size = len;
- bytes_left_on_page =
- PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
- if (copy_size > bytes_left_on_page)
- copy_size = bytes_left_on_page;
- bytes_left_on_page =
- PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
- if (copy_size > bytes_left_on_page)
- copy_size = bytes_left_on_page;
- memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);
-
- /* Release the pages */
- put_page(dst_page);
- put_page(src_page);
-
- /* Continue on the next page */
- dest += copy_size;
- source += copy_size;
- len -= copy_size;
- }
-
- return func(dest, source, len);
-}
-
-void *memcpy(void *to, const void *from, __kernel_size_t n)
-{
- if (n < LARGE_COPY_CUTOFF)
- return (void *)__memcpy_asm(to, from, n);
- else
- return (void *)fast_copy(to, from, n, __memcpy_asm);
-}
-
-unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
- unsigned long n)
-{
- if (n < LARGE_COPY_CUTOFF)
- return __copy_to_user_inatomic_asm(to, from, n);
- else
- return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
- unsigned long n)
-{
- if (n < LARGE_COPY_CUTOFF)
- return __copy_from_user_inatomic_asm(to, from, n);
- else
- return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
- unsigned long n)
-{
- if (n < LARGE_COPY_CUTOFF)
- return __copy_from_user_zeroing_asm(to, from, n);
- else
- return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
-}
-
-#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c
index 37440caa7370..88c7016492c4 100644
--- a/arch/tile/lib/memcpy_user_64.c
+++ b/arch/tile/lib/memcpy_user_64.c
@@ -31,6 +31,7 @@
".pushsection .coldtext.memcpy,\"ax\";" \
"2: { move r0, %2; jrp lr };" \
".section __ex_table,\"a\";" \
+ ".align 8;" \
".quad 1b, 2b;" \
".popsection" \
: "=m" (*(p)) : "r" (v), "r" (n)); \
@@ -43,6 +44,7 @@
".pushsection .coldtext.memcpy,\"ax\";" \
"2: { move r0, %2; jrp lr };" \
".section __ex_table,\"a\";" \
+ ".align 8;" \
".quad 1b, 2b;" \
".popsection" \
: "=r" (__v) : "m" (*(p)), "r" (n)); \
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 57dbb3a5bff8..2042bfe6595f 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -12,13 +12,10 @@
* more details.
*/
-#include <arch/chip.h>
-
#include <linux/types.h>
#include <linux/string.h>
#include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
void *memset(void *s, int c, size_t n)
{
@@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)
int n32;
uint32_t v16, v32;
uint8_t *out8 = s;
-#if !CHIP_HAS_WH64()
- int ahead32;
-#else
int to_align32;
-#endif
/* Experimentation shows that a trivial tight loop is a win up until
* around a size of 20, where writing a word at a time starts to win.
@@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)
return s;
}
-#if !CHIP_HAS_WH64()
- /* Use a spare issue slot to start prefetching the first cache
- * line early. This instruction is free as the store can be buried
- * in otherwise idle issue slots doing ALU ops.
- */
- __insn_prefetch(out8);
-
- /* We prefetch the end so that a short memset that spans two cache
- * lines gets some prefetching benefit. Again we believe this is free
- * to issue.
- */
- __insn_prefetch(&out8[n - 1]);
-#endif /* !CHIP_HAS_WH64() */
-
-
/* Align 'out8'. We know n >= 3 so this won't write past the end. */
while (((uintptr_t) out8 & 3) != 0) {
*out8++ = c;
@@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)
/* This must be at least 8 or the following loop doesn't work. */
#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
-#if !CHIP_HAS_WH64()
-
- ahead32 = CACHE_LINE_SIZE_IN_WORDS;
-
- /* We already prefetched the first and last cache lines, so
- * we only need to do more prefetching if we are storing
- * to more than two cache lines.
- */
- if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
- int i;
-
- /* Prefetch the next several cache lines.
- * This is the setup code for the software-pipelined
- * loop below.
- */
-#define MAX_PREFETCH 5
- ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
- if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
- ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
-
- for (i = CACHE_LINE_SIZE_IN_WORDS;
- i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
- __insn_prefetch(&out32[i]);
- }
-
- if (n32 > ahead32) {
- while (1) {
- int j;
-
- /* Prefetch by reading one word several cache lines
- * ahead. Since loads are non-blocking this will
- * cause the full cache line to be read while we are
- * finishing earlier cache lines. Using a store
- * here causes microarchitectural performance
- * problems where a victimizing store miss goes to
- * the head of the retry FIFO and locks the pipe for
- * a few cycles. So a few subsequent stores in this
- * loop go into the retry FIFO, and then later
- * stores see other stores to the same cache line
- * are already in the retry FIFO and themselves go
- * into the retry FIFO, filling it up and grinding
- * to a halt waiting for the original miss to be
- * satisfied.
- */
- __insn_prefetch(&out32[ahead32]);
-
-#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
-#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
-#endif
-
- n32 -= CACHE_LINE_SIZE_IN_WORDS;
-
- /* Save icache space by only partially unrolling
- * this loop.
- */
- for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
- *out32++ = v32;
- *out32++ = v32;
- *out32++ = v32;
- *out32++ = v32;
- }
-
- /* To save compiled code size, reuse this loop even
- * when we run out of prefetching to do by dropping
- * ahead32 down.
- */
- if (n32 <= ahead32) {
- /* Not even a full cache line left,
- * so stop now.
- */
- if (n32 < CACHE_LINE_SIZE_IN_WORDS)
- break;
-
- /* Choose a small enough value that we don't
- * prefetch past the end. There's no sense
- * in touching cache lines we don't have to.
- */
- ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
- }
- }
- }
-
-#else /* CHIP_HAS_WH64() */
-
/* Determine how many words we need to emit before the 'out32'
* pointer becomes aligned modulo the cache line size.
*/
@@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)
n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
}
-#endif /* CHIP_HAS_WH64() */
-
/* Now handle any leftover values. */
if (n32 != 0) {
do {
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c
index 3873085711d5..03ef69cd73de 100644
--- a/arch/tile/lib/memset_64.c
+++ b/arch/tile/lib/memset_64.c
@@ -12,13 +12,11 @@
* more details.
*/
-#include <arch/chip.h>
-
#include <linux/types.h>
#include <linux/string.h>
#include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
+#include "string-endian.h"
void *memset(void *s, int c, size_t n)
{
@@ -70,8 +68,7 @@ void *memset(void *s, int c, size_t n)
n64 = n >> 3;
/* Tile input byte out to 64 bits. */
- /* KLUDGE */
- v64 = 0x0101010101010101ULL * (uint8_t)c;
+ v64 = copy_byte(c);
/* This must be at least 8 or the following loop doesn't work. */
#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c
index c94e6f7ae7b5..841fe6963019 100644
--- a/arch/tile/lib/strchr_32.c
+++ b/arch/tile/lib/strchr_32.c
@@ -16,8 +16,6 @@
#include <linux/string.h>
#include <linux/module.h>
-#undef strchr
-
char *strchr(const char *s, int c)
{
int z, g;
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c
index f39f9dc422b0..fe6e31c06f8d 100644
--- a/arch/tile/lib/strchr_64.c
+++ b/arch/tile/lib/strchr_64.c
@@ -26,7 +26,7 @@ char *strchr(const char *s, int c)
const uint64_t *p = (const uint64_t *)(s_int & -8);
/* Create eight copies of the byte for which we are looking. */
- const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
+ const uint64_t goal = copy_byte(c);
/* Read the first aligned word, but force bytes before the string to
* match neither zero nor goal (we make sure the high bit of each
diff --git a/arch/tile/lib/string-endian.h b/arch/tile/lib/string-endian.h
index c0eed7ce69c3..2e49cbfe9371 100644
--- a/arch/tile/lib/string-endian.h
+++ b/arch/tile/lib/string-endian.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -31,3 +31,14 @@
#define CFZ(x) __insn_clz(x)
#define REVCZ(x) __insn_ctz(x)
#endif
+
+/*
+ * Create eight copies of the byte in a uint64_t. Byte Shuffle uses
+ * the bytes of srcB as the index into the dest vector to select a
+ * byte. With all indices of zero, the first byte is copied into all
+ * the other bytes.
+ */
+static inline uint64_t copy_byte(uint8_t byte)
+{
+ return __insn_shufflebytes(byte, 0, 0);
+}
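
copy_byte() centralizes the byte replication that memchr_64.c, memset_64.c and strchr_64.c open-coded before this change; as the comment above explains, with all shuffle indices zero the first byte is copied into every other byte of the word. For reference, the portable equivalent is exactly the multiply those callers used to carry:

#include <stdint.h>

/* Multiply-based equivalent of copy_byte(): each 0x01 lane of the
 * constant picks up one copy of the byte, replicating it across all
 * eight bytes of the result. */
static inline uint64_t copy_byte_generic(uint8_t byte)
{
        return 0x0101010101010101ULL * byte;
}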
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c
index 4974292a5534..f26f88e11e4a 100644
--- a/arch/tile/lib/strlen_32.c
+++ b/arch/tile/lib/strlen_32.c
@@ -16,8 +16,6 @@
#include <linux/string.h>
#include <linux/module.h>
-#undef strlen
-
size_t strlen(const char *s)
{
/* Get an aligned pointer. */
diff --git a/arch/tile/lib/strnlen_32.c b/arch/tile/lib/strnlen_32.c
new file mode 100644
index 000000000000..1434141d9e01
--- /dev/null
+++ b/arch/tile/lib/strnlen_32.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+size_t strnlen(const char *s, size_t count)
+{
+ /* Get an aligned pointer. */
+ const uintptr_t s_int = (uintptr_t) s;
+ const uint32_t *p = (const uint32_t *)(s_int & -4);
+ size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
+ size_t len;
+ uint32_t v, bits;
+
+ /* Avoid page fault risk by not reading any bytes when count is 0. */
+ if (count == 0)
+ return 0;
+
+ /* Read first word, but force bytes before the string to be nonzero. */
+ v = *p | ((1 << ((s_int << 3) & 31)) - 1);
+
+ while ((bits = __insn_seqb(v, 0)) == 0) {
+ if (bytes_read >= count) {
+ /* Read COUNT bytes and didn't find the terminator. */
+ return count;
+ }
+ v = *++p;
+ bytes_read += sizeof(v);
+ }
+
+ len = ((const char *) p) + (__insn_ctz(bits) >> 3) - s;
+ return (len < count ? len : count);
+}
+EXPORT_SYMBOL(strnlen);
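
The scan above checks a whole aligned word per iteration: __insn_seqb(v, 0) yields a per-byte zero mask and ctz locates the first zero byte. A portable sketch of the same detection step for readers without the TILEPro intrinsics (the helper name is illustrative):

#include <stdint.h>

/* Hedged sketch: nonzero iff the 32-bit word v contains a zero byte.
 * Lanes above the first zero byte can be flagged spuriously, but the
 * least significant set bit always marks a real zero byte, which is
 * all a first-zero search needs.  seqb + ctz above compute an exact
 * per-byte mask and index instead. */
static inline uint32_t has_zero_byte32(uint32_t v)
{
        return (v - 0x01010101u) & ~v & 0x80808080u;
}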
diff --git a/arch/tile/lib/strnlen_64.c b/arch/tile/lib/strnlen_64.c
new file mode 100644
index 000000000000..2e8de6a5136f
--- /dev/null
+++ b/arch/tile/lib/strnlen_64.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include "string-endian.h"
+
+size_t strnlen(const char *s, size_t count)
+{
+ /* Get an aligned pointer. */
+ const uintptr_t s_int = (uintptr_t) s;
+ const uint64_t *p = (const uint64_t *)(s_int & -8);
+ size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
+ size_t len;
+ uint64_t v, bits;
+
+ /* Avoid page fault risk by not reading any bytes when count is 0. */
+ if (count == 0)
+ return 0;
+
+ /* Read and MASK the first word. */
+ v = *p | MASK(s_int);
+
+ while ((bits = __insn_v1cmpeqi(v, 0)) == 0) {
+ if (bytes_read >= count) {
+ /* Read COUNT bytes and didn't find the terminator. */
+ return count;
+ }
+ v = *++p;
+ bytes_read += sizeof(v);
+ }
+
+ len = ((const char *) p) + (CFZ(bits) >> 3) - s;
+ return (len < count ? len : count);
+}
+EXPORT_SYMBOL(strnlen);
diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S
index b62d002af009..1bc162224638 100644
--- a/arch/tile/lib/usercopy_32.S
+++ b/arch/tile/lib/usercopy_32.S
@@ -36,6 +36,7 @@ strnlen_user_fault:
{ move r0, zero; jrp lr }
ENDPROC(strnlen_user_fault)
.section __ex_table,"a"
+ .align 4
.word 1b, strnlen_user_fault
.popsection
@@ -47,18 +48,20 @@ strnlen_user_fault:
*/
STD_ENTRY(strncpy_from_user_asm)
{ bz r2, 2f; move r3, r0 }
-1: { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+1: { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
{ sb r0, r4; addi r0, r0, 1 }
- bz r2, 2f
- bnzt r4, 1b
- addi r0, r0, -1 /* don't count the trailing NUL */
-2: { sub r0, r0, r3; jrp lr }
+ bz r4, 2f
+ bnzt r2, 1b
+ { sub r0, r0, r3; jrp lr }
+2: addi r0, r0, -1 /* don't count the trailing NUL */
+ { sub r0, r0, r3; jrp lr }
STD_ENDPROC(strncpy_from_user_asm)
.pushsection .fixup,"ax"
strncpy_from_user_fault:
{ movei r0, -EFAULT; jrp lr }
ENDPROC(strncpy_from_user_fault)
.section __ex_table,"a"
+ .align 4
.word 1b, strncpy_from_user_fault
.popsection
@@ -77,6 +80,7 @@ STD_ENTRY(clear_user_asm)
bnzt r1, 1b
2: { move r0, r1; jrp lr }
.pushsection __ex_table,"a"
+ .align 4
.word 1b, 2b
.popsection
@@ -86,6 +90,7 @@ STD_ENTRY(clear_user_asm)
2: { move r0, r1; jrp lr }
STD_ENDPROC(clear_user_asm)
.pushsection __ex_table,"a"
+ .align 4
.word 1b, 2b
.popsection
@@ -105,25 +110,7 @@ STD_ENTRY(flush_user_asm)
2: { move r0, r1; jrp lr }
STD_ENDPROC(flush_user_asm)
.pushsection __ex_table,"a"
- .word 1b, 2b
- .popsection
-
-/*
- * inv_user_asm takes the user target address in r0 and the
- * number of bytes to invalidate in r1.
- * It returns the number of not inv'able bytes (hopefully zero) in r0.
- */
-STD_ENTRY(inv_user_asm)
- bz r1, 2f
- { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
- { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
- { and r0, r0, r2; and r1, r1, r2 }
- { sub r1, r1, r0 }
-1: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
- { addi r0, r0, CHIP_INV_STRIDE(); bnzt r1, 1b }
-2: { move r0, r1; jrp lr }
- STD_ENDPROC(inv_user_asm)
- .pushsection __ex_table,"a"
+ .align 4
.word 1b, 2b
.popsection
@@ -143,5 +130,6 @@ STD_ENTRY(finv_user_asm)
2: { move r0, r1; jrp lr }
STD_ENDPROC(finv_user_asm)
.pushsection __ex_table,"a"
+ .align 4
.word 1b, 2b
.popsection
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S
index adb2dbbc70cd..b3b31a3306f8 100644
--- a/arch/tile/lib/usercopy_64.S
+++ b/arch/tile/lib/usercopy_64.S
@@ -36,6 +36,7 @@ strnlen_user_fault:
{ move r0, zero; jrp lr }
ENDPROC(strnlen_user_fault)
.section __ex_table,"a"
+ .align 8
.quad 1b, strnlen_user_fault
.popsection
@@ -47,18 +48,20 @@ strnlen_user_fault:
*/
STD_ENTRY(strncpy_from_user_asm)
{ beqz r2, 2f; move r3, r0 }
-1: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+1: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
{ st1 r0, r4; addi r0, r0, 1 }
- beqz r2, 2f
- bnezt r4, 1b
- addi r0, r0, -1 /* don't count the trailing NUL */
-2: { sub r0, r0, r3; jrp lr }
+ beqz r4, 2f
+ bnezt r2, 1b
+ { sub r0, r0, r3; jrp lr }
+2: addi r0, r0, -1 /* don't count the trailing NUL */
+ { sub r0, r0, r3; jrp lr }
STD_ENDPROC(strncpy_from_user_asm)
.pushsection .fixup,"ax"
strncpy_from_user_fault:
{ movei r0, -EFAULT; jrp lr }
ENDPROC(strncpy_from_user_fault)
.section __ex_table,"a"
+ .align 8
.quad 1b, strncpy_from_user_fault
.popsection
@@ -77,6 +80,7 @@ STD_ENTRY(clear_user_asm)
bnezt r1, 1b
2: { move r0, r1; jrp lr }
.pushsection __ex_table,"a"
+ .align 8
.quad 1b, 2b
.popsection
@@ -86,6 +90,7 @@ STD_ENTRY(clear_user_asm)
2: { move r0, r1; jrp lr }
STD_ENDPROC(clear_user_asm)
.pushsection __ex_table,"a"
+ .align 8
.quad 1b, 2b
.popsection
@@ -105,25 +110,7 @@ STD_ENTRY(flush_user_asm)
2: { move r0, r1; jrp lr }
STD_ENDPROC(flush_user_asm)
.pushsection __ex_table,"a"
- .quad 1b, 2b
- .popsection
-
-/*
- * inv_user_asm takes the user target address in r0 and the
- * number of bytes to invalidate in r1.
- * It returns the number of not inv'able bytes (hopefully zero) in r0.
- */
-STD_ENTRY(inv_user_asm)
- beqz r1, 2f
- { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
- { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
- { and r0, r0, r2; and r1, r1, r2 }
- { sub r1, r1, r0 }
-1: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
- { addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b }
-2: { move r0, r1; jrp lr }
- STD_ENDPROC(inv_user_asm)
- .pushsection __ex_table,"a"
+ .align 8
.quad 1b, 2b
.popsection
@@ -143,5 +130,6 @@ STD_ENTRY(finv_user_asm)
2: { move r0, r1; jrp lr }
STD_ENDPROC(finv_user_asm)
.pushsection __ex_table,"a"
+ .align 8
.quad 1b, 2b
.popsection