linux/arch/sh/lib64/copy_user_memcpy.S
<<
>>
Prefs
   1!
   2! Fast SH memcpy
   3!
   4! by Toshiyasu Morita (tm@netcom.com)
   5! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
   6! SH5 code Copyright 2002 SuperH Ltd.
   7!
   8! Entry: ARG0: destination pointer
   9!        ARG1: source pointer
  10!        ARG2: byte count
  11!
  12! Exit:  RESULT: destination pointer
  13!        any other registers in the range r0-r7: trashed
  14!
  15! Notes: Usually one wants to do small reads and write a longword, but
  16!        unfortunately it is difficult in some cases to concatenate bytes
  17!        into a longword on the SH, so this does a longword read and small
  18!        writes.
  19!
  20! This implementation makes two assumptions about how it is called:
  21!
  22! 1.: If the byte count is nonzero, the address of the last byte to be
  23!     copied is unsigned greater than the address of the first byte to
  24!     be copied.  This could be easily swapped for a signed comparison,
  25!     but the algorithm used needs some comparison.
  26!
  27! 2.: When there are two or three bytes in the last word of an 11-or-more
  28!     bytes memory chunk to be copied, the rest of the word can be read
  29!     without side effects.
  30!     This could be easily changed by increasing the minimum size of
  31!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
  32!     however, this would cost a few extra cycles on average.
  33!     For SHmedia, the assumption is that any quadword can be read in its
  34!     entirety if at least one byte is included in the copy.
  35
  36/* Imported into Linux kernel by Richard Curnow.  This is used to implement the
  37   __copy_user function in the general case, so it has to be a distinct
  38   function from intra-kernel memcpy to allow for exception fix-ups in the
  39   event that the user pointer is bad somewhere in the copy (e.g. due to
  40   running off the end of the vma).
  41
  42   Note, this algorithm will be slightly wasteful in the case where the source
  43   and destination pointers are equally aligned, because the stlo/sthi pairs
  44   could then be merged back into single stores.  If there are a lot of cache
  45   misses, this is probably offset by the stall lengths on the preloads.
  46
  47*/
  48
  49/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
  50 * erratum.  The first two prefetches are nop-ed out to avoid upsetting the
  51 * instruction counts used in the jump address calculation.
  52 * */
  53
  54        .section .text..SHmedia32,"ax"
  55        .little
  56        .balign 32
  57        .global copy_user_memcpy
  58        .global copy_user_memcpy_end
            /*
             * copy_user_memcpy - copy r4 bytes from the address in r3 to the
             * address in r2.
             *
             * In:   r2 = destination, r3 = source, r4 = byte count,
             *       r18 = return address (moved into tr1 with ptabs).
             * Out:  r2 is never written in this file, so it still holds the
             *       destination pointer on return.
             *
             * Registers written somewhere in the routine: r0, r1, r5-r9,
             * r19-r25, r27, r36, tr0 and tr1.
             * NOTE(review): the header comment lists only r0-r7 as trashed -
             * confirm that callers treat the wider set as scratch.
             */
  59copy_user_memcpy:
  60
            /*
             * Unaligned-access helpers.  Each expands to a lo/hi instruction
             * pair covering the 8 bytes (Q) or 4 bytes (L) that start at P+O.
             * The load forms leave the two partial results in D0 and D1 and
             * the callers OR them together.  The store forms STUAQ and STUAL
             * are not invoked in the code visible here.
             */
  61#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  62#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  63#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  64#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  65
            /*
             * The nop below stands in for a disabled access (see the
             * TAKum03020 note above) so that the instruction count between
             * here and L0 is unchanged.
             */
  66        nop ! ld.b r3,0,r63 ! TAKum03020
            /* Byte counts of 25 or more (unsigned compare) are handled at Large. */
  67        pta/l Large,tr0
  68        movi 25,r0
  69        bgeu/u r4,r0,tr0
            /*
             * Counts 0..24: computed jump into the table of 32-byte slots that
             * starts at L1.
             *
             *   r0 = (L1 - L0) + (63 - nsb(r4)) * 32 + 1
             *
             * ptrel adds r0 to the address of L0, so the L1-L0 term rebases
             * the offset onto L1.  A count of zero must give nsb = 63 (slot 0,
             * the 0 byte case at L1), and each further significant bit in the
             * count moves one slot on: 1, 2..3, 4..7, 8..15, 16..24 bytes, as
             * labelled in the table.
             * NOTE(review): the trailing + 1 presumably sets the low address
             * bit that marks an SHmedia branch target - confirm against the
             * SH-5 architecture manual.
             */
  70        nsb r4,r0
  71        shlli r0,5,r0
  72        movi (L1-L0+63*32 + 1) & 0xffff,r1
  73        sub r1, r0, r0
  74L0:     ptrel r0,tr0
            /*
             * Values shared by every small-copy handler, set up while the
             * branch target is being prepared:
             *   r5  = dst + count   (one past the last destination byte)
             *   r6  = src + count   (one past the last source byte)
             *   tr1 = return address taken from r18
             */
  75        add r2,r4,r5
  76        ptabs r18,tr1
  77        add r3,r4,r6
            /* Jump to the selected slot.  The link value is discarded in r63. */
  78        blink tr0,r63
  79
  80/* Rearranged to make cut2 safe */
            /*
             * Handlers for copies of 0..24 bytes, reached through the computed
             * jump at L0.  State on entry to every table slot:
             *   r2 = dst, r3 = src, r5 = dst + count, r6 = src + count,
             *   tr1 = return address.
             *
             * The dispatcher scales the slot index by 32, so starting at L1
             * each slot must assemble to exactly 32 bytes (eight instructions
             * in the listing below).  Do not add or remove instructions
             * between L1 and the start of the 16..24 byte case.  The short
             * 0 byte and 1 byte slots are filled up by the continuation code
             * L2_3 and L8_15, and L4_7 is placed ahead of the table.
             */
  81        .balign 8
  82L4_7:   /* 4..7 byte memcpy cntd. */
            /*
             * Entered with r0 = the first 4 source bytes and r7, r6 = the
             * ldlo and ldhi parts of the last 4 source bytes.  Completes the
             * store of the first word, then writes the last word so that it
             * ends at dst + count - 1.  The two words overlap when count is
             * below 8.
             */
  83        stlo.l r2, 0, r0
  84        or r6, r7, r6
  85        sthi.l r5, -1, r6
  86        stlo.l r5, -4, r6
  87        blink tr1,r63
  88
  89        .balign 8
  90L1:     /* 0 byte memcpy */
            /*
             * Slot 0: nothing to copy, return at once.  The nops pad the slot
             * so that L2_3 ends exactly on the 32-byte boundary.
             */
  91        nop
  92        blink tr1,r63
  93        nop
  94        nop
  95        nop
  96        nop
  97
  98L2_3:   /* 2 or 3 byte memcpy cntd. */
            /*
             * Entered with r6 = the last source byte.  Stores it at
             * dst + count - 1 and returns.
             */
  99        st.b r5,-1,r6
 100        blink tr1,r63
 101
 102        /* 1 byte memcpy */
            /* Slot 1 (L1 + 32): copy the single byte through r0. */
 103        ld.b r3,0,r0
 104        st.b r2,0,r0
 105        blink tr1,r63
 106
 107L8_15:  /* 8..15 byte memcpy cntd. */
            /*
             * Quadword counterpart of L4_7.  Entered with r0 = the first 8
             * source bytes and r7, r6 = the ldlo and ldhi parts of the last 8
             * source bytes.  The two quadwords overlap when count is below 16.
             */
 108        stlo.q r2, 0, r0
 109        or r6, r7, r6
 110        sthi.q r5, -1, r6
 111        stlo.q r5, -8, r6
 112        blink tr1,r63
 113
 114        /* 2 or 3 byte memcpy */
            /*
             * Slot 2 (L1 + 64): copies byte 0 and byte 1 here and loads the
             * last source byte (src + count - 1) for L2_3 to store.  For a
             * count of 2 the last byte is byte 1 again.  The nop replaces a
             * disabled access and keeps the slot at eight instructions.
             */
 115        ld.b r3,0,r0
 116        nop ! ld.b r2,0,r63 ! TAKum03020
 117        ld.b r3,1,r1
 118        st.b r2,0,r0
 119        pta/l L2_3,tr0
 120        ld.b r6,-1,r6
 121        st.b r2,1,r1
 122        blink tr0, r63
 123
 124        /* 4 .. 7 byte memcpy */
            /*
             * Slot 3 (L1 + 96): unaligned load of the first 4 source bytes
             * (merged into r0) and of the last 4 source bytes (parts left in
             * r7 and r6).  Stores the high part of the first word, then
             * continues at L4_7.
             */
 125        LDUAL (r3, 0, r0, r1)
 126        pta L4_7, tr0
 127        ldlo.l r6, -4, r7
 128        or r0, r1, r0
 129        sthi.l r2, 3, r0
 130        ldhi.l r6, -1, r6
 131        blink tr0, r63
 132
 133        /* 8 .. 15 byte memcpy */
            /*
             * Slot 4 (L1 + 128): same scheme as the 4..7 byte case with
             * quadwords.  Continues at L8_15.
             */
 134        LDUAQ (r3, 0, r0, r1)
 135        pta L8_15, tr0
 136        ldlo.q r6, -8, r7
 137        or r0, r1, r0
 138        sthi.q r2, 7, r0
 139        ldhi.q r6, -1, r6
 140        blink tr0, r63
 141
 142        /* 16 .. 24 byte memcpy */
            /*
             * Slot 5 (L1 + 160), the last table entry, so its length is not
             * constrained by the slot size.  Copies the first 16 bytes as two
             * unaligned quadwords (r0 and r8) and the last 8 source bytes
             * (r6) to dst + count - 8.  The last quadword overlaps the second
             * one when count is below 24.  All loads are issued before the
             * final stores.
             */
 143        LDUAQ (r3, 0, r0, r1)
 144        LDUAQ (r3, 8, r8, r9)
 145        or r0, r1, r0
 146        sthi.q r2, 7, r0
 147        or r8, r9, r8
 148        sthi.q r2, 15, r8
 149        ldlo.q r6, -8, r7
 150        ldhi.q r6, -1, r6
 151        stlo.q r2, 8, r8
 152        stlo.q r2, 0, r0
 153        or r6, r7, r6
 154        sthi.q r5, -1, r6
 155        stlo.q r5, -8, r6
 156        blink tr1,r63
 157
 158Large:
            /*
             * Copies of 25 bytes or more (the entry code branches here when
             * r4 is at least 25).  In: r2 = dst, r3 = src, r4 = count,
             * r18 = return address.
             *
             * Scheme: the source is read with aligned quadword loads and the
             * destination is written with sthi/stlo pairs, so the destination
             * may have any alignment.
             *
             *   r7  = (src & 7) - 8, that is minus the number of bytes from
             *         src up to the next 8-byte boundary (range -8..-1)
             *   r22 = dst - r7, the destination cursor
             *   r6  = src - dst, so r22 + r6 is the source address matching
             *         the cursor, and it is 8-byte aligned
             *   r5  = dst + count - 16, end limit for Loop_ua
             *   r0  = source quadword waiting to be stored at the cursor
             *
             * tr1 is used as the Loop_ua branch target here and is reloaded
             * from r18 before the return.
             */
 159        ! ld.b r2, 0, r63 ! TAKum03020
 160        pta/l  Loop_ua, tr1
 161        ori r3, -8, r7
 162        sub r2, r7, r22
 163        sub r3, r2, r6
 164        add r2, r4, r5
            /*
             * Head: ldlo.q fetches the low part of the quadword at src, and
             * the stlo/sthi pair below writes the whole register at dst.
             * Destination bytes past the valid head bytes are written again
             * by the stores that start at r22.
             */
 165        ldlo.q r3, 0, r0
 166        addi r5, -16, r5
 167        movi 64+8, r27 ! could subtract r7 from that.
 168        stlo.q r2, 0, r0
 169        sthi.q r2, 7, r0
            /* Preload the first aligned source quadword for the loops. */
 170        ldx.q r22, r6, r0
            /* Counts below 72 (unsigned) skip the 32-byte loop. */
 171        bgtu/l r27, r4, tr1
 172
            /*
             * Set-up for Loop_line:
             *   r27 = dst + count - 64, end limit for the 32-byte loop
             *   r19, r20, r21 = r6 - 24, r6 - 16, r6 - 8, index registers for
             *         the three extra source quadwords of each iteration
             *   r36 = r6 + 64, referenced only by the disabled prefetch at
             *         the top of the loop
             */
 173        addi r5, -48, r27
 174        pta/l Loop_line, tr0
 175        addi r6, 64, r36
 176        addi r6, -24, r19
 177        addi r6, -16, r20
 178        addi r6, -8, r21
 179
 180Loop_line:
            /*
             * Moves 32 bytes per iteration.  r0 (loaded earlier) goes to the
             * old cursor, then r23, r24 and r25 go to the next three
             * quadwords, and r0 is reloaded for the following iteration.
             * Loads and stores are interleaved and every offset is relative
             * to the already advanced r22.  Repeats while r27 >= r22
             * (unsigned).
             *
             * alloco names the destination address 32 bytes ahead of the
             * cursor and is followed by synco, as described in the TAKum03020
             * note near the top of the file.
             */
 181        ! ldx.q r22, r36, r63 ! TAKum03020
 182        alloco r22, 32
 183        synco
 184        addi r22, 32, r22
 185        ldx.q r22, r19, r23
 186        sthi.q r22, -25, r0
 187        ldx.q r22, r20, r24
 188        ldx.q r22, r21, r25
 189        stlo.q r22, -32, r0
 190        ldx.q r22, r6,  r0
 191        sthi.q r22, -17, r23
 192        sthi.q r22,  -9, r24
 193        sthi.q r22,  -1, r25
 194        stlo.q r22, -24, r23
 195        stlo.q r22, -16, r24
 196        stlo.q r22,  -8, r25
 197        bgeu r27, r22, tr0
 198
 199Loop_ua:
            /*
             * Moves 8 bytes per iteration: advance the cursor, store r0 at
             * the old cursor position, load the next aligned source quadword.
             * Repeats while r5 > r22 (unsigned), r5 being dst + count - 16.
             */
 200        addi r22, 8, r22
 201        sthi.q r22, -1, r0
 202        stlo.q r22, -8, r0
 203        ldx.q r22, r6, r0
 204        bgtu/l r5, r22, tr1
 205
            /*
             * Tail: store the quadword still held in r0 at the cursor, then
             * copy the last 8 source bytes (unaligned load ending at
             * src + count - 1) to dst + count - 8.  r5 + 8 and r5 + 15 are the
             * first and last byte of that final quadword.  tr1 is reloaded
             * with the return address from r18 before the return.
             */
 206        add r3, r4, r7
 207        ldlo.q r7, -8, r1
 208        sthi.q r22, 7, r0
 209        ldhi.q r7, -1, r7
 210        ptabs r18,tr1
 211        stlo.q r22, 0, r0
 212        or r1, r7, r1
 213        sthi.q r5, 15, r1
 214        stlo.q r5, 8, r1
 215        blink tr1, r63
            /*
             * Exported end marker for the routine.
             * NOTE(review): presumably lets fault-handling code recognise
             * addresses inside copy_user_memcpy - confirm against the users
             * of this symbol.
             */
 216copy_user_memcpy_end:
 217        nop
 218