linux/arch/powerpc/lib/memcpy_64.S
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

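/*
 * void *memcpy(void *dest, const void *src, size_t n)
 * r3 = dest (also the return value), r4 = src, r5 = n.
 */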
        .align  7
_GLOBAL_TOC(memcpy)
BEGIN_FTR_SECTION
#ifdef __LITTLE_ENDIAN__
        cmpdi   cr7,r5,0
#else
        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* save destination pointer for return value */
#endif
FTR_SECTION_ELSE
#ifdef CONFIG_PPC_BOOK3S_64
#ifndef SELFTEST
        b       memcpy_power7
#endif
#endif
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
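/*
 * The feature section above is resolved at boot: with CPU_FTR_VMX_COPY
 * set (Book3S-64, non-selftest builds) it becomes a branch to
 * memcpy_power7, otherwise execution continues with the generic copy
 * below.
 */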
#ifdef __LITTLE_ENDIAN__
        /* dumb little-endian memcpy that will get replaced at runtime */
        addi r9,r3,-1
        addi r4,r4,-1
        beqlr cr7
        mtctr r5
1:      lbzu r10,1(r4)
        stbu r10,1(r9)
        bdnz 1b
        blr
#else
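/*
 * Big-endian path.  The entry code below puts the low four bits of the
 * length into cr7 (so cr7 bits 0..3 test len & 8, 4, 2 and 1), sets
 * cr1 from "len < 16", and computes in r6 the number of bytes needed
 * to reach an 8-byte destination boundary.
 */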
        PPC_MTOCRF(0x01,r5)
        cmpldi  cr1,r5,16
        neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
        andi.   r6,r6,7
        dcbt    0,r4
        blt     cr1,.Lshort_copy
/* Below we want to nop out the bne if we're on a CPU that has the
   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
   cleared.
   At the time of writing the only CPU that has this combination of bits
   set is Power6. */
BEGIN_FTR_SECTION
        nop
FTR_SECTION_ELSE
        bne     .Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
                    CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
        addi    r3,r3,-16
BEGIN_FTR_SECTION
        andi.   r0,r4,7
        bne     .Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
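/*
 * Both pointers are doubleword aligned here (or this CPU handles
 * unaligned ld/std efficiently): copy 16 bytes per loop iteration,
 * software pipelined so the next doubleword is loaded while the
 * previous one is stored.  cr7 bit 0 (len & 8) picks the entry point
 * for an odd doubleword count; the last 0-7 bytes go to .Ldo_tail.
 */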
        srdi    r7,r5,4
        ld      r9,0(r4)
        addi    r4,r4,-8
        mtctr   r7
        andi.   r5,r5,7
        bf      cr7*4+0,2f
        addi    r3,r3,8
        addi    r4,r4,8
        mr      r8,r9
        blt     cr1,3f
1:      ld      r9,8(r4)
        std     r8,8(r3)
2:      ldu     r8,16(r4)
        stdu    r9,16(r3)
        bdnz    1b
3:      std     r8,8(r3)
        beq     3f
        addi    r3,r3,16
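/*
 * Copy the remaining 0-7 bytes: cr7 bit 1 (len & 4), bit 2 (len & 2)
 * and bit 3 (len & 1) select a word, halfword and byte copy in turn.
 */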
.Ldo_tail:
        bf      cr7*4+1,1f
        lwz     r9,8(r4)
        addi    r4,r4,4
        stw     r9,0(r3)
        addi    r3,r3,4
1:      bf      cr7*4+2,2f
        lhz     r9,8(r4)
        addi    r4,r4,2
        sth     r9,0(r3)
        addi    r3,r3,2
2:      bf      cr7*4+3,3f
        lbz     r9,8(r4)
        stb     r9,0(r3)
3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
        blr

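/*
 * Source not doubleword aligned (r0 = misalignment, 1-7).  Round r4
 * down to an 8-byte boundary and build each destination doubleword
 * from two aligned loads, shifted by r10 = 8*offset and
 * r11 = 64 - r10 bits and OR-ed together.
 */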
.Lsrc_unaligned:
        srdi    r6,r5,3
        addi    r5,r5,-16
        subf    r4,r0,r4
        srdi    r7,r5,4
        sldi    r10,r0,3
        cmpdi   cr6,r6,3
        andi.   r5,r5,7
        mtctr   r7
        subfic  r11,r10,64
        add     r5,r5,r0

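/*
 * cr7 bit 0 (len & 8) chooses between the two preambles below, which
 * prime the load/shift pipeline so the main loop always stores
 * 16 bytes per iteration.
 */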
        bt      cr7*4+0,0f

        ld      r9,0(r4)        # 3+2n loads, 2+2n stores
        ld      r0,8(r4)
        sld     r6,r9,r10
        ldu     r9,16(r4)
        srd     r7,r0,r11
        sld     r8,r0,r10
        or      r7,r7,r6
        blt     cr6,4f
        ld      r0,8(r4)
        # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
        b       2f

0:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
        ldu     r9,8(r4)
        sld     r8,r0,r10
        addi    r3,r3,-8
        blt     cr6,5f
        ld      r0,8(r4)
        srd     r12,r9,r11
        sld     r6,r9,r10
        ldu     r9,16(r4)
        or      r12,r8,r12
        srd     r7,r0,r11
        sld     r8,r0,r10
        addi    r3,r3,16
        beq     cr6,3f

        # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:      or      r7,r7,r6
        ld      r0,8(r4)
        std     r12,8(r3)
2:      srd     r12,r9,r11
        sld     r6,r9,r10
        ldu     r9,16(r4)
        or      r12,r8,r12
        stdu    r7,16(r3)
        srd     r7,r0,r11
        sld     r8,r0,r10
        bdnz    1b

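/*
 * Drain the pipeline: store the doublewords still in flight, then
 * rotate the leftover bytes down from the top of r9 and store a word,
 * halfword and byte as directed by cr7.
 */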
3:      std     r12,8(r3)
        or      r7,r7,r6
4:      std     r7,16(r3)
5:      srd     r12,r9,r11
        or      r12,r8,r12
        std     r12,24(r3)
        beq     4f
        cmpwi   cr1,r5,8
        addi    r3,r3,32
        sld     r9,r9,r10
        ble     cr1,6f
        ld      r0,8(r4)
        srd     r7,r0,r11
        or      r9,r7,r9
6:
        bf      cr7*4+1,1f
        rotldi  r9,r9,32
        stw     r9,0(r3)
        addi    r3,r3,4
1:      bf      cr7*4+2,2f
        rotldi  r9,r9,16
        sth     r9,0(r3)
        addi    r3,r3,2
2:      bf      cr7*4+3,3f
        rotldi  r9,r9,8
        stb     r9,0(r3)
3:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
        blr

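/*
 * Destination not 8-byte aligned: copy 1, 2 and 4 bytes as needed
 * (r6 = bytes to the boundary, tested via cr7), reload cr7 from the
 * low bits of the remaining length and rejoin the aligned path.
 */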
.Ldst_unaligned:
        PPC_MTOCRF(0x01,r6)             # put #bytes to 8B bdry into cr7
        subf    r5,r6,r5
        li      r7,0
        cmpldi  cr1,r5,16
        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        stb     r0,0(r3)
        addi    r7,r7,1
1:      bf      cr7*4+2,2f
        lhzx    r0,r7,r4
        sthx    r0,r7,r3
        addi    r7,r7,2
2:      bf      cr7*4+1,3f
        lwzx    r0,r7,r4
        stwx    r0,r7,r3
3:      PPC_MTOCRF(0x01,r5)
        add     r4,r6,r4
        add     r3,r6,r3
        b       .Ldst_aligned

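/*
 * Fewer than 16 bytes: cr7 bits 0-3 (len & 8, 4, 2, 1) select an
 * 8, 4, 2 and 1 byte copy in turn.
 */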
.Lshort_copy:
        bf      cr7*4+0,1f
        lwz     r0,0(r4)
        lwz     r9,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r9,4(r3)
        addi    r3,r3,8
1:      bf      cr7*4+1,2f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4
2:      bf      cr7*4+2,3f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2
3:      bf      cr7*4+3,4f
        lbz     r0,0(r4)
        stb     r0,0(r3)
4:      ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)     /* return dest pointer */
        blr
#endif
EXPORT_SYMBOL(memcpy)