linux/arch/x86/lib/memset_64.S
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

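/* Marked weak so that a stronger memset definition elsewhere in the kernel can override this one. */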
.weak memset

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string instructions to get better performance than the original function.
 * The code is simpler and shorter than the original function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
ENTRY(memset)
ENTRY(__memset)
        /*
         * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature. It is
         * recommended to use it when available. If it is not available, use the
         * fast string instructions.
         *
         * Otherwise, fall back to the original memset function.
         */
        ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
                      "jmp memset_erms", X86_FEATURE_ERMS

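        /*
         * Fast-string (X86_FEATURE_REP_GOOD) path: at boot the alternatives
         * code NOPs out the "jmp memset_orig" above so execution falls
         * through to here; CPUs with X86_FEATURE_ERMS jump to memset_erms
         * instead, and CPUs with neither feature keep the jump to memset_orig.
         */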
        movq %rdi,%r9
        movq %rdx,%rcx
        andl $7,%edx
        shrq $3,%rcx
        /* expand byte value  */
        movzbl %sil,%esi
        movabs $0x0101010101010101,%rax
        imulq %rsi,%rax
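        /*
         * The multiply above replicated the fill byte into all eight bytes of
         * rax.  rep stosq stores rcx (= count / 8) qwords, then rep stosb
         * stores the remaining count % 8 bytes kept in edx.
         */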
        rep stosq
        movl %edx,%ecx
        rep stosb
        movq %r9,%rax
        ret
ENDPROC(memset)
ENDPROC(__memset)

/*
 * ISO C memset - set a memory block to a byte value. This function uses the
 * enhanced REP STOSB (ERMS) feature in place of the fast string code.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
ENTRY(memset_erms)
        movq %rdi,%r9
        movb %sil,%al
        movq %rdx,%rcx
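        /*
         * ERMS makes byte-granular rep stosb efficient, so the whole length
         * is stored in one go: al holds the fill byte, rcx the byte count.
         */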
        rep stosb
        movq %r9,%rax
        ret
ENDPROC(memset_erms)

ENTRY(memset_orig)
        CFI_STARTPROC
        movq %rdi,%r10

        /* expand byte value  */
        movzbl %sil,%ecx
        movabs $0x0101010101010101,%rax
        imulq  %rcx,%rax
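        /* Same trick as above: rax now holds the fill byte in every byte lane. */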

        /*
         * Align dst: if the destination is not 8-byte aligned, jump to the
         * fixup below, which does one unaligned qword store and advances the
         * pointer to the next 8-byte boundary.
         */
        movl  %edi,%r9d
        andl  $7,%r9d
        jnz  .Lbad_alignment
        CFI_REMEMBER_STATE
.Lafter_bad_alignment:

        movq  %rdx,%rcx
        shrq  $6,%rcx
        jz       .Lhandle_tail
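        /*
         * rcx = count / 64: each iteration of .Lloop_64 below stores eight
         * qwords, i.e. one 64-byte block.
         */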

        .p2align 4
.Lloop_64:
        decq  %rcx
        movq  %rax,(%rdi)
        movq  %rax,8(%rdi)
        movq  %rax,16(%rdi)
        movq  %rax,24(%rdi)
        movq  %rax,32(%rdi)
        movq  %rax,40(%rdi)
        movq  %rax,48(%rdi)
        movq  %rax,56(%rdi)
        leaq  64(%rdi),%rdi
        jnz    .Lloop_64

        /*
         * Handle the tail in loops. The loops should be faster than
         * hard-to-predict jump tables.
         */
        .p2align 4
.Lhandle_tail:
        movl    %edx,%ecx
        andl    $63&(~7),%ecx
        jz              .Lhandle_7
        shrl    $3,%ecx
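        /*
         * ecx now holds the number of whole qwords left after the 64-byte
         * blocks ($63&(~7) keeps bits 3-5 of the count); store them one
         * qword per iteration.
         */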
        .p2align 4
.Lloop_8:
        decl   %ecx
        movq  %rax,(%rdi)
        leaq  8(%rdi),%rdi
        jnz    .Lloop_8

.Lhandle_7:
        andl    $7,%edx
        jz      .Lende
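        /* Store the final count % 8 bytes one at a time. */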
        .p2align 4
.Lloop_1:
        decl    %edx
        movb    %al,(%rdi)
        leaq    1(%rdi),%rdi
        jnz     .Lloop_1

.Lende:
        movq    %r10,%rax
        ret

        CFI_RESTORE_STATE
.Lbad_alignment:
        cmpq $7,%rdx
        jbe     .Lhandle_7
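        /*
         * Counts of at most 7 bytes were sent to the byte loop above.
         * Otherwise store one unaligned qword at the original destination,
         * then advance rdi by 8 - (dst & 7) bytes and shrink the count by the
         * same amount; the overlapping bytes are simply written again by the
         * aligned code at .Lafter_bad_alignment.
         */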
        movq %rax,(%rdi)        /* unaligned store */
        movq $8,%r8
        subq %r9,%r8
        addq %r8,%rdi
        subq %r8,%rdx
        jmp .Lafter_bad_alignment
.Lfinal:
        CFI_ENDPROC
ENDPROC(memset_orig)
