linux/arch/sh/lib64/memcpy.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0 */
   2/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
   3/* Modified by SuperH, Inc. September 2003 */
   4!
   5! Fast SH memcpy
   6!
   7! by Toshiyasu Morita (tm@netcom.com)
   8! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
   9! SH5 code Copyright 2002 SuperH Ltd.
  10!
  11! Entry: ARG0: destination pointer
  12!        ARG1: source pointer
  13!        ARG2: byte count
  14!
  15! Exit:  RESULT: destination pointer
  16!        any other registers in the range r0-r7: trashed
  17!
  18! Notes: Usually one wants to do small reads and write a longword, but
  19!        unfortunately it is difficult in some cases to concatenate bytes
  20!        into a longword on the SH, so this does a longword read and small
  21!        writes.
  22!
  23! This implementation makes two assumptions about how it is called:
  24!
  25! 1.: If the byte count is nonzero, the address of the last byte to be
  26!     copied is unsigned greater than the address of the first byte to
  27!     be copied.  This could be easily swapped for a signed comparison,
  28!     but the algorithm used needs some comparison.
  29!
  30! 2.: When there are two or three bytes in the last word of an 11-or-more
  31!     bytes memory chunk to be copied, the rest of the word can be read
  32!     without side effects.
  33!     This could be easily changed by increasing the minimum size of
  34!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
  35!     however, this would cost a few extra cycles on average.
  36!     For SHmedia, the assumption is that any quadword can be read in its
  37!     entirety if at least one byte is included in the copy.
  38!
  39
  40        .section .text..SHmedia32,"ax"
  41        .globl  memcpy
  42        .type   memcpy, @function
  43        .align  5
  44
  45memcpy:
  46
        /*
         * Register use, as read from the code below:
         *   r2  = destination pointer (never written, so it still holds
         *         the destination pointer on return)
         *   r3  = source pointer
         *   r4  = byte count
         *   r5  = r2 + r4 (destination end); biased by -16 in the Large path
         *   r6  = r3 + r4 (source end) in the small-copy paths,
         *         r3 - r2 (source-minus-destination offset) in the Large path
         *   tr1 = return target taken from r18 (temporarily Loop_ua while
         *         the Large path is looping)
         *
         * Besides r0-r7 this code also writes r8, r9, r19-r25, r27, r36,
         * tr0 and tr1.
         * NOTE(review): the file header only lists r0-r7 as trashed; confirm
         * the extra registers are call-clobbered under the ABI in use.
         */

        /*
         * Unaligned access helpers.  Each expands to a "lo"/"hi" instruction
         * pair addressing P+O and P+O+7 (.q) or P+O+3 (.l).  The load forms
         * leave two partial values in D0 and D1, which the code below
         * combines with "or".  STUAQ and STUAL are defined here but are not
         * invoked anywhere in the code visible in this file.
         */
  47#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  48#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  49#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  50#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  51
        /*
         * Entry: counts of 25 or more go to Large.  Smaller counts are
         * dispatched through a computed branch: the target is
         * L1 + (63 - nsb(count)) * 32, i.e. one 32-byte slot per size class,
         * in the order the slot comments below label them
         * (0, 1, 2..3, 4..7, 8..15, 16..24 bytes).
         */
  52        ld.b r3,0,r63                /* read src[0]; the loaded value goes to r63 and is not used */
  53        pta/l Large,tr0
  54        movi 25,r0
  55        bgeu/u r4,r0,tr0             /* count >= 25 (unsigned compare): take the Large path */
  56        nsb r4,r0                    /* r0 = nsb(count), selects the size class */
  57        shlli r0,5,r0                /* times 32, the distance between slots */
        /* The "+ 1" is presumably the SHmedia mode bit of the branch target - TODO confirm. */
  58        movi (L1-L0+63*32 + 1) & 0xffff,r1
  59        sub r1, r0, r0               /* r0 = (L1 - L0) + (63 - nsb) * 32 + 1 */
  60L0:     ptrel r0,tr0
  61        add r2,r4,r5                 /* r5 = destination end */
  62        ptabs r18,tr1                /* tr1 = return target from r18 */
  63        add r3,r4,r6                 /* r6 = source end */
  64        blink tr0,r63                /* jump to the selected slot */
  65        
  66/* Rearranged to make cut2 safe */
  67        .balign 8
  68L4_7:   /* 4..7 byte memcpy cntd. */
  69        stlo.l r2, 0, r0             /* pairs with the sthi.l r2,3 in the 4..7 slot: first 4 bytes at dst */
  70        or r6, r7, r6                /* combine the two partial loads of the last 4 source bytes */
  71        sthi.l r5, -1, r6            /* last 4 bytes, ending at the destination end (r5) */
  72        stlo.l r5, -4, r6
  73        blink tr1,r63                /* return */
  74
        /*
         * Dispatch table.  Slots start at L1 and are 8 instructions
         * (32 bytes, matching the shlli by 5 above) apart.  Slots that need
         * fewer than 8 instructions are filled up with nops or with a
         * "cntd." fragment (L2_3, L8_15) that belongs to another slot.
         * Adding or removing an instruction anywhere between L1 and the
         * 16..24 slot would move the later slots off their computed
         * addresses.
         */
  75        .balign 8
  76L1:     /* 0 byte memcpy */
  77        nop
  78        blink tr1,r63                /* return, nothing to copy */
  79        nop
  80        nop
  81        nop
  82        nop
  83
  84L2_3:   /* 2 or 3 byte memcpy cntd. */
  85        st.b r5,-1,r6                /* last byte goes to destination end - 1 */
  86        blink tr1,r63                /* return */
  87
  88        /* 1 byte memcpy */
  89        ld.b r3,0,r0
  90        st.b r2,0,r0
  91        blink tr1,r63                /* return */
  92
  93L8_15:  /* 8..15 byte memcpy cntd. */
  94        stlo.q r2, 0, r0             /* pairs with the sthi.q r2,7 in the 8..15 slot: first 8 bytes at dst */
  95        or r6, r7, r6                /* combine the two partial loads of the last 8 source bytes */
  96        sthi.q r5, -1, r6            /* last 8 bytes, ending at the destination end (r5) */
  97        stlo.q r5, -8, r6
  98        blink tr1,r63                /* return */
  99        
 100        /* 2 or 3 byte memcpy */
 101        ld.b r3,0,r0                 /* first source byte */
 102        ld.b r2,0,r63                /* read dst[0]; the loaded value is not used */
 103        ld.b r3,1,r1                 /* second source byte */
 104        st.b r2,0,r0
 105        pta/l L2_3,tr0
 106        ld.b r6,-1,r6                /* last source byte (r6 = source end) */
 107        st.b r2,1,r1
 108        blink tr0, r63               /* L2_3 stores the last byte and returns */
 109
 110        /* 4 .. 7 byte memcpy */
 111        LDUAL (r3, 0, r0, r1)
 112        pta L4_7, tr0
 113        ldlo.l r6, -4, r7            /* last 4 source bytes, first partial load */
 114        or r0, r1, r0                /* r0 = first 4 source bytes */
 115        sthi.l r2, 3, r0
 116        ldhi.l r6, -1, r6            /* last 4 source bytes, second partial load */
 117        blink tr0, r63               /* L4_7 finishes the stores and returns */
 118
 119        /* 8 .. 15 byte memcpy */
 120        LDUAQ (r3, 0, r0, r1)
 121        pta L8_15, tr0
 122        ldlo.q r6, -8, r7            /* last 8 source bytes, first partial load */
 123        or r0, r1, r0                /* r0 = first 8 source bytes */
 124        sthi.q r2, 7, r0
 125        ldhi.q r6, -1, r6            /* last 8 source bytes, second partial load */
 126        blink tr0, r63               /* L8_15 finishes the stores and returns */
 127
 128        /* 16 .. 24 byte memcpy */
 129        LDUAQ (r3, 0, r0, r1)
 130        LDUAQ (r3, 8, r8, r9)
 131        or r0, r1, r0                /* r0 = source bytes 0..7 */
 132        sthi.q r2, 7, r0
 133        or r8, r9, r8                /* r8 = source bytes 8..15 */
 134        sthi.q r2, 15, r8
 135        ldlo.q r6, -8, r7            /* last 8 source bytes; they may overlap bytes 8..15 */
 136        ldhi.q r6, -1, r6
 137        stlo.q r2, 8, r8
 138        stlo.q r2, 0, r0
 139        or r6, r7, r6
 140        sthi.q r5, -1, r6            /* last 8 bytes, ending at the destination end (r5) */
 141        stlo.q r5, -8, r6
 142        blink tr1,r63                /* return */
 143
        /*
         * Copies of 25 bytes or more.  r22 walks the destination; r6 holds
         * source minus destination, so r22 + r6 is the matching source
         * address.  r22 is chosen so that r22 + r6 is 8-byte aligned, which
         * lets the loops read the source with plain ldx.q and write the
         * destination with sthi.q/stlo.q pairs.
         */
 144Large:
 145        ld.b r2, 0, r63              /* read dst[0]; the loaded value is not used */
 146        pta/l  Loop_ua, tr1          /* tr1 is reused as the Loop_ua branch target */
 147        ori r3, -8, r7               /* r7 = (src & 7) - 8, in the range -8..-1 */
 148        sub r2, r7, r22              /* r22 = dst + 8 - (src & 7) */
 149        sub r3, r2, r6               /* r6 = src - dst */
 150        add r2, r4, r5               /* r5 = destination end */
 151        ldlo.q r3, 0, r0             /* head bytes of the source, up to the next 8-byte boundary */
 152        addi r5, -16, r5             /* r5 = destination end - 16 (Loop_ua limit) */
 153        movi 64+8, r27 // could subtract r7 from that.
 154        stlo.q r2, 0, r0             /* store the head at dst; presumably any bytes written past */
 155        sthi.q r2, 7, r0             /* the head are rewritten by the stores that follow */
 156        ldx.q r22, r6, r0            /* first aligned source quadword */
 157        bgtu/l r27, r4, tr1          /* count < 72: skip Loop_line, go straight to Loop_ua */
 158
 159        addi r5, -48, r27            /* r27 = destination end - 64 (Loop_line limit) */
 160        pta/l Loop_line, tr0
 161        addi r6, 64, r36             /* offset for the look-ahead source read in Loop_line */
 162        addi r6, -24, r19            /* offsets of the three quadwords preceding r22 + r6 */
 163        addi r6, -16, r20
 164        addi r6, -8, r21
 165
        /*
         * 32 bytes per iteration.  r0 always carries the quadword loaded
         * for the current r22 by the previous step.
         */
 166Loop_line:
 167        ldx.q r22, r36, r63          /* read the source 64 bytes ahead; the loaded value is not used */
 168        alloco r22, 32               /* alloco on the destination at r22 + 32 (see the SHmedia manual) */
 169        addi r22, 32, r22            /* advance by 32 bytes */
 170        ldx.q r22, r19, r23          /* source quadword for destination r22 - 24 */
 171        sthi.q r22, -25, r0          /* with the stlo.q r22,-32 below: r0 to destination r22 - 32 */
 172        ldx.q r22, r20, r24          /* source quadword for destination r22 - 16 */
 173        ldx.q r22, r21, r25          /* source quadword for destination r22 - 8 */
 174        stlo.q r22, -32, r0
 175        ldx.q r22, r6,  r0           /* quadword for the next step */
 176        sthi.q r22, -17, r23
 177        sthi.q r22,  -9, r24
 178        sthi.q r22,  -1, r25
 179        stlo.q r22, -24, r23
 180        stlo.q r22, -16, r24
 181        stlo.q r22,  -8, r25
 182        bgeu r27, r22, tr0           /* loop while r22 <= destination end - 64 */
 183
        /* 8 bytes per iteration, same r0 hand-over as in Loop_line. */
 184Loop_ua:
 185        addi r22, 8, r22
 186        sthi.q r22, -1, r0           /* r0 to destination r22 - 8 */
 187        stlo.q r22, -8, r0
 188        ldx.q r22, r6, r0            /* quadword for the next step */
 189        bgtu/l r5, r22, tr1          /* loop (tr1 = Loop_ua) while r22 < destination end - 16 */
 190
        /* Tail: store the pending quadword, then the last 8 bytes of the copy. */
 191        add r3, r4, r7               /* r7 = source end */
 192        ldlo.q r7, -8, r1            /* last 8 source bytes, first partial load */
 193        sthi.q r22, 7, r0            /* pending r0 to destination r22 */
 194        ldhi.q r7, -1, r7            /* last 8 source bytes, second partial load */
 195        ptabs r18,tr1                /* tr1 = return target again (it held Loop_ua) */
 196        stlo.q r22, 0, r0
 197        or r1, r7, r1
 198        sthi.q r5, 15, r1            /* r5 + 16 is the destination end: last 8 bytes end there */
 199        stlo.q r5, 8, r1
 200        blink tr1, r63               /* return */
 201
 202        .size memcpy,.-memcpy
 203