linux/arch/powerpc/lib/memcpy_64.S
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public License
   6 * as published by the Free Software Foundation; either version
   7 * 2 of the License, or (at your option) any later version.
   8 */
   9#include <asm/processor.h>
  10#include <asm/ppc_asm.h>
  11
  12        .align  7
    /*
     * memcpy: copy r5 bytes from the buffer at r4 to the buffer at r3.
     *
     * In:   r3 = destination, r4 = source, r5 = byte count
     * Out:  r3 = destination pointer as passed in
     *
     * The first instruction slot is an alternative feature section keyed on
     * CPU_FTR_VMX_COPY. Going by the _IFCLR macro name, the first body is
     * used when that feature is clear and the branch to memcpy_power7
     * otherwise (the branch is compiled out when SELFTEST is defined).
     */
  13_GLOBAL_TOC(memcpy)
  14BEGIN_FTR_SECTION
  15#ifdef __LITTLE_ENDIAN__
            /* cr7 = count compared with zero, consumed by the beqlr below */
  16        cmpdi   cr7,r5,0
  17#else
  18        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* save destination pointer for return value */
  19#endif
  20FTR_SECTION_ELSE
  21#ifndef SELFTEST
  22        b       memcpy_power7
  23#endif
  24ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
  25#ifdef __LITTLE_ENDIAN__
  26        /* dumb little-endian memcpy that will get replaced at runtime */
            /*
             * Byte-at-a-time copy. Both pointers are biased by -1 because the
             * update-form lbzu/stbu add 1 before each access. r9 is the working
             * destination, so r3 still holds the return value at the blr.
             */
  27        addi r9,r3,-1
  28        addi r4,r4,-1
            /* zero count: return before loading CTR, since bdnz would wrap from 0 */
  29        beqlr cr7
  30        mtctr r5
  311:      lbzu r10,1(r4)
  32        stbu r10,1(r9)
  33        bdnz 1b
  34        blr
  35#else
            /*
             * Big-endian path setup:
             *   cr7 = low four bits of the count (tested later as 8/4/2/1 byte flags)
             *   cr1 = unsigned compare of the count against 16
             *   r6  = bytes needed to bring the destination to an 8-byte boundary,
             *         with cr0 recording whether that is zero
             * dcbt is a prefetch hint for the start of the source.
             */
  36        PPC_MTOCRF(0x01,r5)
  37        cmpldi  cr1,r5,16
  38        neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
  39        andi.   r6,r6,7
  40        dcbt    0,r4
            /* counts below 16 take the straight-line copy */
  41        blt     cr1,.Lshort_copy
  42/* Below we want to nop out the bne if we're on a CPU that has the
  43   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
  44   cleared.
  45   At the time of writing the only CPU that has this combination of bits
  46   set is Power6. */
  47BEGIN_FTR_SECTION
  48        nop
  49FTR_SECTION_ELSE
            /* cr0 is from the andi. above: taken when r6 is non-zero */
  50        bne     .Ldst_unaligned
  51ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
  52                    CPU_FTR_UNALIGNED_LD_STD)
  53.Ldst_aligned:
    /*
     * Doubleword copy loop, reached by fall-through or from .Ldst_unaligned.
     * Expects cr7 = low four bits of the remaining count in r5 and cr1 = that
     * count compared against 16. r3 is biased by -16 and r4 by -8 so the
     * update-form ldu/stdu in the loop can use fixed displacements.
     */
  54        addi    r3,r3,-16
  55BEGIN_FTR_SECTION
            /* r0 = source offset within its doubleword; non-zero needs the shifting copy */
  56        andi.   r0,r4,7
  57        bne     .Lsrc_unaligned
  58END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
            /* CTR = count / 16 (two doublewords per loop pass); r9 = first source doubleword */
  59        srdi    r7,r5,4
  60        ld      r9,0(r4)
  61        addi    r4,r4,-8
  62        mtctr   r7
            /* r5 = trailing byte count (0..7); cr0 set here feeds the beq after label 3 */
  63        andi.   r5,r5,7
            /* count bit 8 clear: even number of doublewords, enter the loop at 2 */
  64        bf      cr7*4+0,2f
            /* odd doubleword: step both pointers by 8 and treat the loaded value as pending */
  65        addi    r3,r3,8
  66        addi    r4,r4,8
  67        mr      r8,r9
            /* count below 16: only this one doubleword to store */
  68        blt     cr1,3f
            /* loads run ahead of stores; r8 and r9 alternate as the pending doubleword */
  691:      ld      r9,8(r4)
  70        std     r8,8(r3)
  712:      ldu     r8,16(r4)
  72        stdu    r9,16(r3)
  73        bdnz    1b
            /* flush the last pending doubleword, then finish if there is no tail */
  743:      std     r8,8(r3)
  75        beq     3f
            /* undo the bias so r3 points at the first tail byte; the tail source is at 8(r4) */
  76        addi    r3,r3,16
  77.Ldo_tail:
    /* copy the remaining 4, 2 and 1 bytes as flagged by cr7 bits 1, 2 and 3 */
  78        bf      cr7*4+1,1f
  79        lwz     r9,8(r4)
  80        addi    r4,r4,4
  81        stw     r9,0(r3)
  82        addi    r3,r3,4
  831:      bf      cr7*4+2,2f
  84        lhz     r9,8(r4)
  85        addi    r4,r4,2
  86        sth     r9,0(r3)
  87        addi    r3,r3,2
  882:      bf      cr7*4+3,3f
  89        lbz     r9,8(r4)
  90        stb     r9,0(r3)
  913:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
  92        blr
  93
  94.Lsrc_unaligned:
    /*
     * Copy for a source that is not 8-byte aligned. On entry r0 = r4 & 7 and
     * r3 is already biased by -16 (both set up in .Ldst_aligned). The source
     * is read as aligned doublewords and each output doubleword is assembled
     * from two neighbouring inputs: (s[n] << r10) | (s[n+1] >> r11).
     *
     *   r4  = source rounded down to a doubleword boundary
     *   r10 = source misalignment in bits, r11 = 64 - r10
     *   r6  = count / 8, compared against 3 into cr6
     *   CTR = (count - 16) / 16
     *   r5  = (count & 7) + r0; cr0 (from the andi.) is eq when count & 7 is zero
     */
  95        srdi    r6,r5,3
  96        addi    r5,r5,-16
  97        subf    r4,r0,r4
  98        srdi    r7,r5,4
  99        sldi    r10,r0,3
 100        cmpdi   cr6,r6,3
 101        andi.   r5,r5,7
 102        mtctr   r7
 103        subfic  r11,r10,64
 104        add     r5,r5,r0
 105
            /* count bit 8 set: odd number of doublewords, use the prologue at 0 */
 106        bt      cr7*4+0,0f
 107
 108        ld      r9,0(r4)        # 3+2n loads, 2+2n stores
 109        ld      r0,8(r4)
 110        sld     r6,r9,r10
 111        ldu     r9,16(r4)
 112        srd     r7,r0,r11
 113        sld     r8,r0,r10
 114        or      r7,r7,r6
            /* cr6 lt: fewer than three doublewords, skip the loop */
 115        blt     cr6,4f
 116        ld      r0,8(r4)
 117        # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
 118        b       2f
 119
 1200:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
 121        ldu     r9,8(r4)
 122        sld     r8,r0,r10
 123        addi    r3,r3,-8
            /* cr6 lt: fewer than three doublewords, only the final store at 5 is needed */
 124        blt     cr6,5f
 125        ld      r0,8(r4)
 126        srd     r12,r9,r11
 127        sld     r6,r9,r10
 128        ldu     r9,16(r4)
 129        or      r12,r8,r12
 130        srd     r7,r0,r11
 131        sld     r8,r0,r10
 132        addi    r3,r3,16
            /* exactly three doublewords: skip the loop and go straight to the final stores */
 133        beq     cr6,3f
 134
 135        # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
 1361:      or      r7,r7,r6
 137        ld      r0,8(r4)
 138        std     r12,8(r3)
 1392:      srd     r12,r9,r11
 140        sld     r6,r9,r10
 141        ldu     r9,16(r4)
 142        or      r12,r8,r12
 143        stdu    r7,16(r3)
 144        srd     r7,r0,r11
 145        sld     r8,r0,r10
 146        bdnz    1b
 147
 1483:      std     r12,8(r3)
 149        or      r7,r7,r6
 1504:      std     r7,16(r3)
 1515:      srd     r12,r9,r11
 152        or      r12,r8,r12
 153        std     r12,24(r3)
            /*
             * No trailing bytes: done. 4f binds to the next numeric label 4
             * below this point, which in this file is the reload-and-return
             * at the end of .Lshort_copy.
             */
 154        beq     4f
            /*
             * Tail. r3 is advanced past the doublewords just stored and the
             * unused source bits are shifted to the top of r9. If r5 (tail
             * bytes plus source offset) exceeds 8, the tail runs into the next
             * source doubleword, so it is loaded and merged in.
             */
 155        cmpwi   cr1,r5,8
 156        addi    r3,r3,32
 157        sld     r9,r9,r10
 158        ble     cr1,6f
 159        ld      r0,8(r4)
 160        srd     r7,r0,r11
 161        or      r9,r7,r9
 1626:
            /* each rotldi brings the next 4, 2 or 1 tail bytes into the low end of r9 for the store */
 163        bf      cr7*4+1,1f
 164        rotldi  r9,r9,32
 165        stw     r9,0(r3)
 166        addi    r3,r3,4
 1671:      bf      cr7*4+2,2f
 168        rotldi  r9,r9,16
 169        sth     r9,0(r3)
 170        addi    r3,r3,2
 1712:      bf      cr7*4+3,3f
 172        rotldi  r9,r9,8
 173        stb     r9,0(r3)
 1743:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
 175        blr
 176
 177.Ldst_unaligned:
    /*
     * Destination is not 8-byte aligned. r6 holds the number of bytes (1..7)
     * up to the next boundary. They are copied as 1, 2 and 4 byte pieces
     * selected by the bits of r6, with r7 as the running offset into both
     * buffers. Control then rejoins .Ldst_aligned with r5, cr1 and cr7
     * recomputed for the remaining count.
     */
 178        PPC_MTOCRF(0x01,r6)             # put #bytes to 8B bdry into cr7
            /* r5 = count left after alignment; cr1 = that count compared against 16 */
 179        subf    r5,r6,r5
 180        li      r7,0
 181        cmpldi  cr1,r5,16
 182        bf      cr7*4+3,1f
 183        lbz     r0,0(r4)
 184        stb     r0,0(r3)
 185        addi    r7,r7,1
 1861:      bf      cr7*4+2,2f
 187        lhzx    r0,r7,r4
 188        sthx    r0,r7,r3
 189        addi    r7,r7,2
 1902:      bf      cr7*4+1,3f
 191        lwzx    r0,r7,r4
 192        stwx    r0,r7,r3
            /* cr7 = low four bits of the remaining count; step both pointers past the aligned-off bytes */
 1933:      PPC_MTOCRF(0x01,r5)
 194        add     r4,r6,r4
 195        add     r3,r6,r3
 196        b       .Ldst_aligned
 197
 198.Lshort_copy:
    /*
     * Counts below 16. cr7 holds the low four bits of the count, so copy
     * 8 (as two words), 4, 2 and 1 bytes according to cr7 bits 0 to 3,
     * advancing r4 and r3 after every piece except the last.
     */
 199        bf      cr7*4+0,1f
 200        lwz     r0,0(r4)
 201        lwz     r9,4(r4)
 202        addi    r4,r4,8
 203        stw     r0,0(r3)
 204        stw     r9,4(r3)
 205        addi    r3,r3,8
 2061:      bf      cr7*4+1,2f
 207        lwz     r0,0(r4)
 208        addi    r4,r4,4
 209        stw     r0,0(r3)
 210        addi    r3,r3,4
 2112:      bf      cr7*4+2,3f
 212        lhz     r0,0(r4)
 213        addi    r4,r4,2
 214        sth     r0,0(r3)
 215        addi    r3,r3,2
 2163:      bf      cr7*4+3,4f
 217        lbz     r0,0(r4)
 218        stb     r0,0(r3)
            /* label 4 is also the target of the beq 4f in .Lsrc_unaligned */
 2194:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
 220        blr
 221#endif
 222