linux/arch/powerpc/lib/memcpy_64.S
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public License
   6 * as published by the Free Software Foundation; either version
   7 * 2 of the License, or (at your option) any later version.
   8 */
   9#include <asm/processor.h>
  10#include <asm/ppc_asm.h>
  11
  12        .align  7               /* PowerPC gas reads the operand as a power of two: 2^7 = 128-byte alignment */
            /*
             * memcpy entry point.  Register usage as seen in this file:
             *   r3 = destination pointer (also the value returned)
             *   r4 = source pointer
             *   r5 = number of bytes to copy
             *
             * BEGIN_FTR_SECTION / FTR_SECTION_ELSE / ALT_FTR_SECTION_END_IFCLR
             * wrap two alternative one-instruction sequences.  The macros are
             * defined outside this file; presumably the first alternative is
             * kept when CPU_FTR_VMX_COPY is clear and the branch to
             * memcpy_power7 is patched in when it is set -- TODO confirm
             * against the feature-fixup macro definitions.
             */
  13_GLOBAL_TOC(memcpy)
  14BEGIN_FTR_SECTION
  15#ifdef __LITTLE_ENDIAN__
  16        cmpdi   cr7,r5,0        /* cr7.eq = (length == 0); consumed by the beqlr in the byte loop below */
  17#else
  18        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* save destination pointer for return value */
  19#endif
  20FTR_SECTION_ELSE
  21        b       memcpy_power7   /* alternative: plain branch (no link), so memcpy_power7 returns to our caller */
  22ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
  23#ifdef __LITTLE_ENDIAN__
  24        /* dumb little-endian memcpy that will get replaced at runtime */
            /*
             * Copies one byte per iteration.  r3 is never written on this
             * path, so it still holds the destination pointer on return.
             * cr7 was set by the "cmpdi cr7,r5,0" in the entry sequence.
             */
  25        addi r9,r3,-1           /* r9 = dest - 1: the update-form store below adds 1 before storing */
  26        addi r4,r4,-1           /* r4 = src - 1, for the matching update-form load */
  27        beqlr cr7               /* length == 0: return now, before CTR would be loaded with zero */
  28        mtctr r5                /* CTR = number of bytes to copy */
  291:      lbzu r10,1(r4)          /* r10 = byte at r4 + 1, and r4 is advanced by 1 */
  30        stbu r10,1(r9)          /* store it at r9 + 1, and r9 is advanced by 1 */
  31        bdnz 1b                 /* decrement CTR, loop while it is non-zero */
  32        blr
  33#else
            /*
             * Big-endian generic path: classify the copy before dispatching.
             *   cr7    <- low four bits of the length; the later "bf cr7*4+n"
             *             tests use n = 0,1,2,3 for the 8-, 4-, 2- and 1-byte pieces
             *   cr1    <- unsigned comparison of the length with 16
             *   r6,cr0 <- (-dest) & 7, the bytes needed to reach an 8-byte
             *             destination boundary; cr0.eq means already aligned
             */
  34        PPC_MTOCRF(0x01,r5)
  35        cmpldi  cr1,r5,16
  36        neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
  37        andi.   r6,r6,7
  38        dcbt    0,r4            /* cache-touch hint for the start of the source */
  39        blt     cr1,.Lshort_copy        /* fewer than 16 bytes: use the straight-line path */
  40/* Below we want to nop out the bne if we're on a CPU that has the
  41   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
  42   cleared.
  43   At the time of writing the only CPU that has this combination of bits
  44   set is Power6. */
  45BEGIN_FTR_SECTION
  46        nop
  47FTR_SECTION_ELSE
  48        bne     .Ldst_unaligned         /* cr0 from the andi. above: r6 != 0, destination not 8-byte aligned */
  49ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
  50                    CPU_FTR_UNALIGNED_LD_STD)
  51.Ldst_aligned:
            /*
             * Main doubleword copy.  Reached by falling through from the
             * checks above, or from .Ldst_unaligned once it has copied the
             * leading bytes.  On entry r5 is the remaining length, cr7 holds
             * its low four bits and cr1 its comparison with 16.
             *
             * The loop moves 16 bytes per iteration with the loads running one
             * doubleword ahead of the stores (r8/r9 alternate as the pending
             * value).  r3 is biased by -16 and r4 by -8 so the update-form
             * ldu/stdu with displacement 16 step both pointers.
             */
  52        addi    r3,r3,-16       /* bias dest so the stores below use displacements 8 and 16 */
  53BEGIN_FTR_SECTION
  54        andi.   r0,r4,7         /* r0 = source offset within its doubleword */
  55        bne     .Lsrc_unaligned /* non-zero offset: use the shift-and-merge copy */
  56END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
  57        srdi    r7,r5,4         /* r7 = length / 16 = loop iterations */
  58        ld      r9,0(r4)        /* preload the first source doubleword */
  59        addi    r4,r4,-8        /* bias src for the ldu with displacement 16 */
  60        mtctr   r7
  61        andi.   r5,r5,7         /* r5 = tail bytes (0..7); cr0.eq is consumed by "beq 3f" after the loop */
  62        bf      cr7*4+0,2f      /* length bit 8 clear: even number of doublewords, enter the loop at 2: */
  63        addi    r3,r3,8         /* odd number of doublewords: shift both biases by one doubleword ... */
  64        addi    r4,r4,8
  65        mr      r8,r9           /* ... and make the preloaded value the pending store */
  66        blt     cr1,3f          /* under 16 bytes left: only that one doubleword, skip the loop */
  671:      ld      r9,8(r4)
  68        std     r8,8(r3)
  692:      ldu     r8,16(r4)
  70        stdu    r9,16(r3)
  71        bdnz    1b
  723:      std     r8,8(r3)        /* store the last pending doubleword */
  73        beq     3f              /* cr0 from the andi. above: no tail bytes, go to the return at the next 3: */
  74        addi    r3,r3,16        /* undo the bias: r3 = next destination byte */
  75.Ldo_tail:
            /*
             * Copy the final 0..7 bytes as 4-, 2- and 1-byte pieces selected
             * by cr7 bits 1, 2 and 3.  The source is read at 8(r4) because r4
             * still trails the next unread byte by 8.
             */
  76        bf      cr7*4+1,1f
  77        lwz     r9,8(r4)
  78        addi    r4,r4,4
  79        stw     r9,0(r3)
  80        addi    r3,r3,4
  811:      bf      cr7*4+2,2f
  82        lhz     r9,8(r4)
  83        addi    r4,r4,2
  84        sth     r9,0(r3)
  85        addi    r3,r3,2
  862:      bf      cr7*4+3,3f
  87        lbz     r9,8(r4)
  88        stb     r9,0(r3)
  893:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
  90        blr
  91
  92.Lsrc_unaligned:
            /*
             * Copy when the source is not 8-byte aligned, using only aligned
             * doubleword loads.  Entered from .Ldst_aligned with r0 = source
             * offset within its doubleword (non-zero) and r3 already biased
             * by -16.
             *
             * The source pointer is rounded down to a doubleword boundary and
             * each destination doubleword is assembled from two adjacent
             * source doublewords:  d[k] = (s[k] << r10) | (s[k+1] >> r11),
             * with r10 = 8 * offset and r11 = 64 - r10.  The "#" comments
             * below track which of these partial values sits in which register.
             */
  93        srdi    r6,r5,3         /* r6 = length / 8 = whole doublewords to store */
  94        addi    r5,r5,-16
  95        subf    r4,r0,r4        /* r4 = src - offset: round the source down to a doubleword boundary */
  96        srdi    r7,r5,4         /* r7 = (length - 16) / 16 = main-loop iterations */
  97        sldi    r10,r0,3        /* r10 = offset in bits = left-shift amount */
  98        cmpdi   cr6,r6,3        /* cr6 = doubleword count compared with 3; picks the short exits below */
  99        andi.   r5,r5,7         /* r5 = tail bytes; cr0.eq is consumed by "beq 4f" after the last store */
 100        mtctr   r7
 101        subfic  r11,r10,64      /* r11 = 64 - r10 = right-shift amount */
 102        add     r5,r5,r0        /* r5 = tail bytes + source offset, compared with 8 in the tail code */
 103
 104        bt      cr7*4+0,0f      /* length bit 8 set: odd number of doublewords, use the prologue at 0: */
 105
 106        ld      r9,0(r4)        # 3+2n loads, 2+2n stores
 107        ld      r0,8(r4)
 108        sld     r6,r9,r10
 109        ldu     r9,16(r4)
 110        srd     r7,r0,r11
 111        sld     r8,r0,r10
 112        or      r7,r7,r6
 113        blt     cr6,4f
 114        ld      r0,8(r4)
 115        # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
 116        b       2f
 117
 1180:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
 119        ldu     r9,8(r4)
 120        sld     r8,r0,r10
 121        addi    r3,r3,-8
 122        blt     cr6,5f
 123        ld      r0,8(r4)
 124        srd     r12,r9,r11
 125        sld     r6,r9,r10
 126        ldu     r9,16(r4)
 127        or      r12,r8,r12
 128        srd     r7,r0,r11
 129        sld     r8,r0,r10
 130        addi    r3,r3,16
 131        beq     cr6,3f
 132
 133        # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
 1341:      or      r7,r7,r6
 135        ld      r0,8(r4)
 136        std     r12,8(r3)
 1372:      srd     r12,r9,r11
 138        sld     r6,r9,r10
 139        ldu     r9,16(r4)
 140        or      r12,r8,r12
 141        stdu    r7,16(r3)
 142        srd     r7,r0,r11
 143        sld     r8,r0,r10
 144        bdnz    1b
 145
            /* Drain: store the doublewords still held in registers. */
 1463:      std     r12,8(r3)
 147        or      r7,r7,r6
 1484:      std     r7,16(r3)
 1495:      srd     r12,r9,r11
 150        or      r12,r8,r12
 151        std     r12,24(r3)
 152        beq     4f              /* cr0 from "andi. r5,r5,7": no tail; the next 4: ahead is the return under .Lshort_copy */
 153        cmpwi   cr1,r5,8        /* does tail + offset fit in the unused part of the last loaded doubleword? */
 154        addi    r3,r3,32        /* undo the store bias: r3 = next destination byte */
 155        sld     r9,r9,r10       /* move the not-yet-stored source bytes to the top of r9 */
 156        ble     cr1,6f          /* yes: r9 already holds every tail byte */
 157        ld      r0,8(r4)        /* no: fetch one more source doubleword ... */
 158        srd     r7,r0,r11
 159        or      r9,r7,r9        /* ... and merge its leading bytes in below them */
 1606:
            /*
             * Tail bytes are left-justified in r9.  Each rotate brings the next
             * 4, 2 or 1 bytes into the low-order end, where stw/sth/stb take
             * their data; cr7 bits 1, 2 and 3 select which pieces are needed.
             */
 161        bf      cr7*4+1,1f
 162        rotldi  r9,r9,32
 163        stw     r9,0(r3)
 164        addi    r3,r3,4
 1651:      bf      cr7*4+2,2f
 166        rotldi  r9,r9,16
 167        sth     r9,0(r3)
 168        addi    r3,r3,2
 1692:      bf      cr7*4+3,3f
 170        rotldi  r9,r9,8
 171        stb     r9,0(r3)
 1723:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
 173        blr
 174
 175.Ldst_unaligned:
            /*
             * Copy the r6 (1..7) leading bytes needed to bring the destination
             * to an 8-byte boundary, as 1-, 2- and 4-byte pieces selected by
             * the bits of r6, then rejoin .Ldst_aligned with the length, cr1
             * and cr7 recomputed for what is left.  r7 is the running byte
             * offset used by the indexed loads and stores.
             */
 176        PPC_MTOCRF(0x01,r6)             # put #bytes to 8B bdry into cr7
 177        subf    r5,r6,r5        /* r5 = length left after the alignment bytes */
 178        li      r7,0
 179        cmpldi  cr1,r5,16       /* refresh cr1 for the reduced length; .Ldst_aligned tests it */
 180        bf      cr7*4+3,1f      /* r6 bit 1: one byte */
 181        lbz     r0,0(r4)
 182        stb     r0,0(r3)
 183        addi    r7,r7,1
 1841:      bf      cr7*4+2,2f      /* r6 bit 2: one halfword at offset r7 */
 185        lhzx    r0,r7,r4
 186        sthx    r0,r7,r3
 187        addi    r7,r7,2
 1882:      bf      cr7*4+1,3f      /* r6 bit 4: one word at offset r7 */
 189        lwzx    r0,r7,r4
 190        stwx    r0,r7,r3
 1913:      PPC_MTOCRF(0x01,r5)     /* cr7 = low four bits of the remaining length */
 192        add     r4,r6,r4        /* step both pointers past the bytes just copied */
 193        add     r3,r6,r3
 194        b       .Ldst_aligned
 195
 196.Lshort_copy:
            /*
             * Straight-line copy for lengths below 16, reached from the
             * "blt cr1,.Lshort_copy" near the top.  cr7 holds the low four bits
             * of the length; bits 0..3 select an 8-byte (done as two words),
             * 4-byte, 2-byte and 1-byte piece in turn.  r3 is advanced while
             * copying, so the saved destination is reloaded before returning.
             */
 197        bf      cr7*4+0,1f
 198        lwz     r0,0(r4)
 199        lwz     r9,4(r4)
 200        addi    r4,r4,8
 201        stw     r0,0(r3)
 202        stw     r9,4(r3)
 203        addi    r3,r3,8
 2041:      bf      cr7*4+1,2f
 205        lwz     r0,0(r4)
 206        addi    r4,r4,4
 207        stw     r0,0(r3)
 208        addi    r3,r3,4
 2092:      bf      cr7*4+2,3f
 210        lhz     r0,0(r4)
 211        addi    r4,r4,2
 212        sth     r0,0(r3)
 213        addi    r3,r3,2
 2143:      bf      cr7*4+3,4f
 215        lbz     r0,0(r4)
 216        stb     r0,0(r3)
            /* 4: is also the target of the "beq 4f" after the last doubleword store in .Lsrc_unaligned. */
 2174:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
 218        blr
 219#endif
 220