linux/arch/sh/lib64/memcpy.S
<<
>>
Prefs
   1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
   2/* Modified by SuperH, Inc. September 2003 */
   3!
   4! Fast SH memcpy
   5!
   6! by Toshiyasu Morita (tm@netcom.com)
   7! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
   8! SH5 code Copyright 2002 SuperH Ltd.
   9!
  10! Entry: ARG0: destination pointer
  11!        ARG1: source pointer
  12!        ARG2: byte count
  13!
  14! Exit:  RESULT: destination pointer
  15!        any other registers in the range r0-r7: trashed
    !        (the SHmedia code below additionally writes r8, r9, r19-r25,
    !        r27, r36, tr0 and tr1)
  16!
  17! Notes: Usually one wants to do small reads and write a longword, but
  18!        unfortunately it is difficult in some cases to concatenate bytes
  19!        into a longword on the SH, so this does a longword read and small
  20!        writes.
  21!
  22! This implementation makes two assumptions about how it is called:
  23!
  24! 1.: If the byte count is nonzero, the address of the last byte to be
  25!     copied is unsigned greater than the address of the first byte to
  26!     be copied.  This could be easily swapped for a signed comparison,
  27!     but the algorithm used needs some comparison.
  28!
  29! 2.: When there are two or three bytes in the last word of an 11-or-more
  30!     bytes memory chunk to be copied, the rest of the word can be read
  31!     without side effects.
  32!     This could be easily changed by increasing the minimum size of
  33!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
  34!     however, this would cost a few extra cycles on average.
  35!     For SHmedia, the assumption is that any quadword can be read in its
  36!     entirety if at least one byte is included in the copy.
  37!
  38
  39        .section .text..SHmedia32,"ax"
  40        .globl  memcpy
  41        .type   memcpy, @function
  42        .align  5
  43
/*
 * memcpy (SHmedia): copy r4 bytes from the address in r3 to the address in r2.
 *
 * Register use, as visible in the code below:
 *   r2   destination pointer; never written, so it still holds the
 *        destination when the function returns
 *   r3   source pointer (only read)
 *   r4   byte count (only read)
 *   r18  return address; moved into tr1 with ptabs before the final blink
 *   written as scratch: r0, r1, r5-r9, r19-r25, r27, r36, tr0, tr1
 *   NOTE(review): that is a wider set than the "r0-r7" named in the file
 *   header; confirm against the SH-5 ABI that all of them are caller-saved.
 *
 * Structure:
 *   count <  25: computed jump into a table of 32-byte slots starting at L1,
 *                one slot per size class (0, 1, 2-3, 4-7, 8-15, 16-24).
 *                Each class copies the head and the tail of the buffer with
 *                overlapping unaligned accesses instead of looping.
 *   count >= 25: Large. Copies the first 8 bytes, then proceeds in steps of
 *                8 destination bytes with source reads that are 8-byte
 *                aligned (Loop_ua), using a 32-bytes-per-iteration loop
 *                (Loop_line) first when count >= 72, and finishes with an
 *                overlapping store of the last 8 bytes.
 *
 * Stores are allowed to overlap and to rewrite bytes already written; this
 * is harmless because the same source data is written both times.
 */
  44memcpy:
  45
/*
 * Unaligned access helpers: each expands to a lo/hi instruction pair at
 * offsets O and O+size-1.  For the loads the caller ORs D0 and D1 together
 * to obtain the complete value; for the stores the same register is normally
 * passed twice.  (The macros below are defined but the store variants are
 * not used in this file; the code spells the stlo/sthi pairs out by hand so
 * that they can be interleaved with other instructions.)
 */
  46#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  47#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  48#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  49#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  50
  51        ld.b r3,0,r63                   /* touch first source byte; result discarded in r63 (presumably a prefetch, confirm) */
  52        pta/l Large,tr0
  53        movi 25,r0
  54        bgeu/u r4,r0,tr0                /* count >= 25 (unsigned): take the Large path */
        /*
         * Small copy: turn the count into a size-class index with nsb
         * (63 for count 0, 62 for 1, 61 for 2-3, 60 for 4-7, 59 for 8-15,
         * 58 for 16-24) and jump to L1 + (63 - nsb) * 32.
         */
  55        nsb r4,r0
  56        shlli r0,5,r0                   /* scale by the 32-byte slot size */
  57        movi (L1-L0+63*32 + 1) & 0xffff,r1 /* offset from L0; the "+ 1" is presumably the SHmedia mode bit of the target, confirm */
  58        sub r1, r0, r0
  59L0:     ptrel r0,tr0                    /* tr0 = address of this instruction (L0) + r0 */
  60        add r2,r4,r5                    /* r5 = destination end (one past last byte) */
  61        ptabs r18,tr1                   /* tr1 = return address for the slot code */
  62        add r3,r4,r6                    /* r6 = source end (one past last byte) */
  63        blink tr0,r63                   /* jump to the selected slot, no link */
  64        
  65/* Rearranged to make cut2 safe */
  66        .balign 8
        /*
         * Continuation of the 4..7 byte slot.  On entry: r0 = first 4 source
         * bytes, r7/r6 = lo/hi halves of the last 4 source bytes; the hi part
         * of the first long has already been stored.
         */
  67L4_7:   /* 4..7 byte memcpy cntd. */
  68        stlo.l r2, 0, r0
  69        or r6, r7, r6                   /* r6 = last 4 source bytes */
  70        sthi.l r5, -1, r6               /* store them at dst_end-4 .. dst_end-1 */
  71        stlo.l r5, -4, r6
  72        blink tr1,r63                   /* return */
  73
        /*
         * Jump table.  Slot k starts at L1 + 32*k and every slot is exactly
         * eight instructions long; the L2_3 and L8_15 continuation fragments
         * fill the unused tail of the 0-byte and 1-byte slots.  Do not add
         * or remove instructions between L1 and Large without re-checking
         * the slot boundaries.
         */
  74        .balign 8
  75L1:     /* 0 byte memcpy */
  76        nop
  77        blink tr1,r63
  78        nop
  79        nop
  80        nop
  81        nop
  82
  83L2_3:   /* 2 or 3 byte memcpy cntd. */
  84        st.b r5,-1,r6                   /* last byte -> dst_end-1 (same as byte 1 when count is 2) */
  85        blink tr1,r63
  86
  87        /* 1 byte memcpy */
  88        ld.b r3,0,r0
  89        st.b r2,0,r0
  90        blink tr1,r63
  91
        /*
         * Continuation of the 8..15 byte slot; same scheme as L4_7 but with
         * quadwords: r0 = first 8 bytes, r7/r6 = lo/hi halves of the last 8.
         */
  92L8_15:  /* 8..15 byte memcpy cntd. */
  93        stlo.q r2, 0, r0
  94        or r6, r7, r6                   /* r6 = last 8 source bytes */
  95        sthi.q r5, -1, r6               /* store them at dst_end-8 .. dst_end-1 */
  96        stlo.q r5, -8, r6
  97        blink tr1,r63
  98        
  99        /* 2 or 3 byte memcpy */
 100        ld.b r3,0,r0                    /* byte 0 */
 101        ld.b r2,0,r63                   /* touch destination; result discarded */
 102        ld.b r3,1,r1                    /* byte 1 */
 103        st.b r2,0,r0
 104        pta/l L2_3,tr0
 105        ld.b r6,-1,r6                   /* last source byte (src_end-1) */
 106        st.b r2,1,r1
 107        blink tr0, r63                  /* finish in L2_3 */
 108
 109        /* 4 .. 7 byte memcpy */
 110        LDUAL (r3, 0, r0, r1)           /* r0/r1 = lo/hi parts of the first 4 source bytes */
 111        pta L4_7, tr0
 112        ldlo.l r6, -4, r7               /* lo part of the last 4 source bytes */
 113        or r0, r1, r0                   /* r0 = first 4 source bytes */
 114        sthi.l r2, 3, r0
 115        ldhi.l r6, -1, r6               /* hi part of the last 4 source bytes */
 116        blink tr0, r63                  /* finish in L4_7 */
 117
 118        /* 8 .. 15 byte memcpy */
 119        LDUAQ (r3, 0, r0, r1)           /* r0/r1 = lo/hi parts of the first 8 source bytes */
 120        pta L8_15, tr0
 121        ldlo.q r6, -8, r7               /* lo part of the last 8 source bytes */
 122        or r0, r1, r0                   /* r0 = first 8 source bytes */
 123        sthi.q r2, 7, r0
 124        ldhi.q r6, -1, r6               /* hi part of the last 8 source bytes */
 125        blink tr0, r63                  /* finish in L8_15 */
 126
        /*
         * Last slot, so it may exceed eight instructions.  Copies source
         * bytes 0..15 and the last 8 bytes; the two ranges overlap whenever
         * count < 24.  All loads are issued before the final stores.
         */
 127        /* 16 .. 24 byte memcpy */
 128        LDUAQ (r3, 0, r0, r1)
 129        LDUAQ (r3, 8, r8, r9)
 130        or r0, r1, r0                   /* r0 = source bytes 0..7 */
 131        sthi.q r2, 7, r0
 132        or r8, r9, r8                   /* r8 = source bytes 8..15 */
 133        sthi.q r2, 15, r8
 134        ldlo.q r6, -8, r7
 135        ldhi.q r6, -1, r6
 136        stlo.q r2, 8, r8
 137        stlo.q r2, 0, r0
 138        or r6, r7, r6                   /* r6 = last 8 source bytes */
 139        sthi.q r5, -1, r6
 140        stlo.q r5, -8, r6
 141        blink tr1,r63                   /* return */
 142
        /*
         * count >= 25.  Register roles from here on:
         *   r22  running destination pointer
         *   r6   src - dst, so ldx.q r22,r6 reads the source quad that
         *        belongs at destination address r22
         *   r5   dst_end - 16 (Loop_ua bound)
         *   r0   source quad loaded but not yet stored
         * r22 starts at dst + 8 - (src & 7), which makes r22 + r6 a multiple
         * of 8: all ldx.q source reads in the loops are aligned while the
         * destination stores use stlo/sthi pairs.
         */
 143Large:
 144        ld.b r2, 0, r63                 /* touch destination; result discarded */
 145        pta/l  Loop_ua, tr1
 146        ori r3, -8, r7                  /* r7 = (src & 7) - 8, a value in -8..-1 */
 147        sub r2, r7, r22                 /* r22 = dst + 8 - (src & 7) */
 148        sub r3, r2, r6                  /* r6 = src - dst */
 149        add r2, r4, r5
 150        ldlo.q r3, 0, r0                /* lo part of the first source quad (hi part is not loaded) */
 151        addi r5, -16, r5                /* r5 = dst_end - 16 */
 152        movi 64+8, r27 // could subtract r7 from that.
 153        stlo.q r2, 0, r0                /* write 8 bytes at dst; bytes from r22 onwards are rewritten by the loops below */
 154        sthi.q r2, 7, r0
 155        ldx.q r22, r6, r0               /* first aligned source quad, destined for r22 */
 156        bgtu/l r27, r4, tr1             /* count < 72: skip the 32-byte loop, go to Loop_ua */
 157
 158        addi r5, -48, r27               /* r27 = dst_end - 64 (Loop_line bound) */
 159        pta/l Loop_line, tr0
 160        addi r6, 64, r36                /* source offsets relative to r22 used inside Loop_line */
 161        addi r6, -24, r19
 162        addi r6, -16, r20
 163        addi r6, -8, r21
 164
        /*
         * 32 bytes per iteration.  r22 is advanced by 32 first, so the
         * negative offsets below address the 32 bytes just stepped over.
         * Loads and stores are interleaved by hand; keep the order.
         */
 165Loop_line:
 166        ldx.q r22, r36, r63             /* read 64 bytes ahead in the source, result discarded (presumably a prefetch) */
 167        alloco r22, 32                  /* cache-block allocate for destination address r22+32 (see SH-5 manual for exact semantics) */
 168        addi r22, 32, r22
 169        ldx.q r22, r19, r23             /* source quad for r22-24 */
 170        sthi.q r22, -25, r0             /* pending quad -> r22-32 .. r22-25 (with stlo.q below) */
 171        ldx.q r22, r20, r24             /* source quad for r22-16 */
 172        ldx.q r22, r21, r25             /* source quad for r22-8 */
 173        stlo.q r22, -32, r0
 174        ldx.q r22, r6,  r0              /* new pending quad, destined for r22 */
 175        sthi.q r22, -17, r23
 176        sthi.q r22,  -9, r24
 177        sthi.q r22,  -1, r25
 178        stlo.q r22, -24, r23
 179        stlo.q r22, -16, r24
 180        stlo.q r22,  -8, r25
 181        bgeu r27, r22, tr0              /* repeat while r22 <= dst_end - 64 */
 182
        /* 8 bytes per iteration: store the pending quad, load the next one. */
 183Loop_ua:
 184        addi r22, 8, r22
 185        sthi.q r22, -1, r0
 186        stlo.q r22, -8, r0
 187        ldx.q r22, r6, r0
 188        bgtu/l r5, r22, tr1             /* repeat while r22 < dst_end - 16 (tr1 = Loop_ua) */
 189
        /*
         * Tail: store the pending quad at r22, then copy the last 8 source
         * bytes to dst_end-8 .. dst_end-1 (r5 + 8 .. r5 + 15) with an
         * unaligned load/store; this may overlap bytes already written.
         */
 190        add r3, r4, r7                  /* r7 = source end */
 191        ldlo.q r7, -8, r1
 192        sthi.q r22, 7, r0
 193        ldhi.q r7, -1, r7
 194        ptabs r18,tr1                   /* tr1 was the loop target; reload it with the return address */
 195        stlo.q r22, 0, r0
 196        or r1, r7, r1                   /* r1 = last 8 source bytes */
 197        sthi.q r5, 15, r1
 198        stlo.q r5, 8, r1
 199        blink tr1, r63                  /* return; r2 still holds the destination pointer */
 200
 201        .size memcpy,.-memcpy
 202