diff options
author | Claudiu Zissulescu <claziss@synopsys.com> | 2014-11-21 13:39:25 +0530 |
---|---|---|
committer | Vineet Gupta <vgupta@synopsys.com> | 2015-06-22 14:06:56 +0530 |
commit | 1f7e3dc0baaa41217dc06d3370e1efd1aecbc1f0 (patch) | |
tree | a771d4ea921ad357ec1a9540cfc351bffaa61e77 /arch/arc/lib/memcpy-archs.S | |
parent | bcc4d65abec2adb74157b34519e80331eb4427eb (diff) |
ARCv2: optimised string/mem lib routines
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
Diffstat (limited to 'arch/arc/lib/memcpy-archs.S')
-rw-r--r-- | arch/arc/lib/memcpy-archs.S | 236 |
1 files changed, 236 insertions, 0 deletions
diff --git a/arch/arc/lib/memcpy-archs.S b/arch/arc/lib/memcpy-archs.S new file mode 100644 index 000000000000..1b2b3acfed52 --- /dev/null +++ b/arch/arc/lib/memcpy-archs.S @@ -0,0 +1,236 @@ +/* + * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +#ifdef __LITTLE_ENDIAN__ +# define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; << +# define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >> +# define MERGE_1(RX,RY,IMM) asl RX, RY, IMM +# define MERGE_2(RX,RY,IMM) +# define EXTRACT_1(RX,RY,IMM) and RX, RY, 0xFFFF +# define EXTRACT_2(RX,RY,IMM) lsr RX, RY, IMM +#else +# define SHIFT_1(RX,RY,IMM) lsr RX, RY, IMM ; >> +# define SHIFT_2(RX,RY,IMM) asl RX, RY, IMM ; << +# define MERGE_1(RX,RY,IMM) asl RX, RY, IMM ; << +# define MERGE_2(RX,RY,IMM) asl RX, RY, IMM ; << +# define EXTRACT_1(RX,RY,IMM) lsr RX, RY, IMM +# define EXTRACT_2(RX,RY,IMM) lsr RX, RY, 0x08 +#endif + +#ifdef CONFIG_ARC_HAS_LL64 +# define PREFETCH_READ(RX) prefetch [RX, 56] +# define PREFETCH_WRITE(RX) prefetchw [RX, 64] +# define LOADX(DST,RX) ldd.ab DST, [RX, 8] +# define STOREX(SRC,RX) std.ab SRC, [RX, 8] +# define ZOLSHFT 5 +# define ZOLAND 0x1F +#else +# define PREFETCH_READ(RX) prefetch [RX, 28] +# define PREFETCH_WRITE(RX) prefetchw [RX, 32] +# define LOADX(DST,RX) ld.ab DST, [RX, 4] +# define STOREX(SRC,RX) st.ab SRC, [RX, 4] +# define ZOLSHFT 4 +# define ZOLAND 0xF +#endif + +ENTRY(memcpy) + prefetch [r1] ; Prefetch the read location + prefetchw [r0] ; Prefetch the write location + mov.f 0, r2 +;;; if size is zero + jz.d [blink] + mov r3, r0 ; don;t clobber ret val + +;;; if size <= 8 + cmp r2, 8 + bls.d @smallchunk + mov.f lp_count, r2 + + and.f r4, r0, 0x03 + rsub lp_count, r4, 4 + lpnz @aligndestination + ;; LOOP BEGIN + ldb.ab r5, [r1,1] + sub r2, r2, 1 + stb.ab r5, [r3,1] +aligndestination: + +;;; Check the alignment of the source + and.f r4, r1, 0x03 + bnz.d @sourceunaligned + +;;; CASE 0: Both source and destination are 32bit aligned +;;; Convert len to Dwords, unfold x4 + lsr.f lp_count, r2, ZOLSHFT + lpnz @copy32_64bytes + ;; LOOP START + LOADX (r6, r1) + PREFETCH_READ (r1) + PREFETCH_WRITE (r3) + LOADX (r8, r1) + LOADX (r10, r1) + LOADX (r4, r1) + STOREX (r6, r3) + STOREX (r8, r3) + STOREX (r10, r3) + STOREX (r4, r3) +copy32_64bytes: + + and.f lp_count, r2, ZOLAND ;Last remaining 31 bytes +smallchunk: + lpnz @copyremainingbytes + ;; LOOP START + ldb.ab r5, [r1,1] + stb.ab r5, [r3,1] +copyremainingbytes: + + j [blink] +;;; END CASE 0 + +sourceunaligned: + cmp r4, 2 + beq.d @unalignedOffby2 + sub r2, r2, 1 + + bhi.d @unalignedOffby3 + ldb.ab r5, [r1, 1] + +;;; CASE 1: The source is unaligned, off by 1 + ;; Hence I need to read 1 byte for a 16bit alignment + ;; and 2bytes to reach 32bit alignment + ldh.ab r6, [r1, 2] + sub r2, r2, 2 + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 + MERGE_1 (r6, r6, 8) + MERGE_2 (r5, r5, 24) + or r5, r5, r6 + + ;; Both src and dst are aligned + lpnz @copy8bytes_1 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetchw [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 24) + or r7, r7, r5 + SHIFT_2 (r5, r6, 8) + + SHIFT_1 (r9, r8, 24) + or r9, r9, r5 + SHIFT_2 (r5, r8, 8) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +copy8bytes_1: + + ;; Write back the remaining 16bits + EXTRACT_1 (r6, r5, 16) + sth.ab r6, [r3, 2] + ;; Write back the remaining 8bits + EXTRACT_2 (r5, r5, 16) + stb.ab r5, [r3, 1] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @copybytewise_1 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +copybytewise_1: + j [blink] + +unalignedOffby2: +;;; CASE 2: The source is unaligned, off by 2 + ldh.ab r5, [r1, 2] + sub r2, r2, 1 + + ;; Both src and dst are aligned + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 +#ifdef __BIG_ENDIAN__ + asl.nz r5, r5, 16 +#endif + lpnz @copy8bytes_2 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetchw [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 16) + or r7, r7, r5 + SHIFT_2 (r5, r6, 16) + + SHIFT_1 (r9, r8, 16) + or r9, r9, r5 + SHIFT_2 (r5, r8, 16) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +copy8bytes_2: + +#ifdef __BIG_ENDIAN__ + lsr.nz r5, r5, 16 +#endif + sth.ab r5, [r3, 2] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @copybytewise_2 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +copybytewise_2: + j [blink] + +unalignedOffby3: +;;; CASE 3: The source is unaligned, off by 3 +;;; Hence, I need to read 1byte for achieve the 32bit alignment + + ;; Both src and dst are aligned + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 +#ifdef __BIG_ENDIAN__ + asl.ne r5, r5, 24 +#endif + lpnz @copy8bytes_3 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetch [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 8) + or r7, r7, r5 + SHIFT_2 (r5, r6, 24) + + SHIFT_1 (r9, r8, 8) + or r9, r9, r5 + SHIFT_2 (r5, r8, 24) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +copy8bytes_3: + +#ifdef __BIG_ENDIAN__ + lsr.nz r5, r5, 24 +#endif + stb.ab r5, [r3, 1] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @copybytewise_3 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +copybytewise_3: + j [blink] + +END(memcpy) |