diff options
Diffstat (limited to 'arch/x86_64/lib/memset.S')
-rw-r--r-- | arch/x86_64/lib/memset.S | 125 |
1 files changed, 125 insertions, 0 deletions
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S new file mode 100644 index 000000000000..4b4c40638640 --- /dev/null +++ b/arch/x86_64/lib/memset.S @@ -0,0 +1,125 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs */ +/* + * ISO C memset - set a memory block to a byte value. + * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ + .globl __memset + .globl memset + .p2align 4 +memset: +__memset: + movq %rdi,%r10 + movq %rdx,%r11 + + /* expand byte value */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + mul %rcx /* with rax, clobbers rdx */ + + /* align dst */ + movl %edi,%r9d + andl $7,%r9d + jnz .Lbad_alignment +.Lafter_bad_alignment: + + movl %r11d,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + movq %rax,(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + movq %rax,24(%rdi) + movq %rax,32(%rdi) + movq %rax,40(%rdi) + movq %rax,48(%rdi) + movq %rax,56(%rdi) + leaq 64(%rdi),%rdi + jnz .Lloop_64 + + /* Handle tail in loops. The loops should be faster than hard + to predict jump tables. */ + .p2align 4 +.Lhandle_tail: + movl %r11d,%ecx + andl $63&(~7),%ecx + jz .Lhandle_7 + shrl $3,%ecx + .p2align 4 +.Lloop_8: + decl %ecx + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + jnz .Lloop_8 + +.Lhandle_7: + movl %r11d,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + decl %ecx + movb %al,(%rdi) + leaq 1(%rdi),%rdi + jnz .Lloop_1 + +.Lende: + movq %r10,%rax + ret + +.Lbad_alignment: + cmpq $7,%r11 + jbe .Lhandle_7 + movq %rax,(%rdi) /* unaligned store */ + movq $8,%r8 + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%r11 + jmp .Lafter_bad_alignment + + /* C stepping K8 run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include <asm/cpufeature.h> + + .section .altinstructions,"a" + .align 8 + .quad memset + .quad memset_c + .byte X86_FEATURE_K8_C + .byte memset_c_end-memset_c + .byte memset_c_end-memset_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi value + * rdx count + */ +memset_c: + movq %rdi,%r9 + movl %edx,%r8d + andl $7,%r8d + movl %edx,%ecx + shrl $3,%ecx + /* expand byte value */ + movzbl %sil,%esi + movabs $0x0101010101010101,%rax + mulq %rsi /* with rax, clobbers rdx */ + rep + stosq + movl %r8d,%ecx + rep + stosb + movq %r9,%rax + ret +memset_c_end: + .previous |