linux/arch/powerpc/lib/memcpy_power7.S
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

_GLOBAL(memcpy_power7)

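/*
 * LVS builds the permute control vector used to realign unaligned
 * quadword loads; on little-endian the vperm inputs are swapped so
 * that the same merge works in either endianness.
 */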
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

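	/*
	 * Entry: r3 = destination, r4 = source, r5 = length.  Copies of
	 * fewer than 16 bytes take the short path, copies of more than
	 * 4096 bytes use the VMX loop when CONFIG_ALTIVEC is set, and
	 * everything else falls through to the scalar GPR loop.  The
	 * destination is stashed so it can be returned as memcpy's result.
	 */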
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)
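	/*
	 * r6 = bytes needed to reach 8B source alignment.  mtocrf copied
	 * its low bits into cr7, so each bf below performs or skips the
	 * 1-, 2- and 4-byte head copies.
	 */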

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

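	/*
	 * At least 128B remain: set up a stack frame and save the
	 * nonvolatile GPRs r14-r22 so the fully unrolled cacheline
	 * loop below has enough registers.
	 */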
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6
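	/*
	 * cr7 now holds the remaining length divided by 16; the bf tests
	 * below copy an optional 64B, 32B and then 16B chunk.
	 */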

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
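	/*
	 * Fewer than 16 bytes remain; cr7 holds the low bits of the
	 * length, selecting the final 8-, 4-, 2- and 1-byte copies.
	 */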
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

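	/*
	 * Reached when enter_vmx_copy reports that VMX cannot be used:
	 * drop the frame created for that call and fall back to the
	 * scalar copy above.
	 */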
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
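	/*
	 * Save LR and the remaining arguments, then call enter_vmx_copy,
	 * which returns non-zero when it is safe to use VMX here.  The
	 * answer is kept in cr1 and only acted on once the prefetch
	 * streams below have been set up.
	 */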
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_copy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

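	/*
	 * For each stream, the TH=0b01000 touch supplies the start address
	 * and stream ID and the TH=0b01010 touch supplies the length in
	 * cachelines and the prefetch depth; the final dcbt after the
	 * eieio sets GO to start all streams.
	 */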
.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If the source and destination are not relatively aligned (i.e.
	 * they do not share the same offset within a 16B quadword), we
	 * use a slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48
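	/*
	 * r9, r10 and r11 (and r12, r14-r16 below) hold constant byte
	 * offsets used as index registers for the lvx/stvx pairs.
	 */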

	bf	cr7*4+3,5f
	lvx	v1,r0,r4
	addi	r4,r4,16
	stvx	v1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	v1,r0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,r0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,r0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,r0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,r0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,r0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,r0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,r0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,r0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,r0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,r0,r4
	addi	r4,r4,16
	stvx	v1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

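	/*
	 * Pop the frame, restore the original destination pointer as the
	 * return value and tail-call exit_vmx_copy so its return goes
	 * straight back to our caller.
	 */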
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_copy		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4
	addi	r4,r4,16

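	/*
	 * v16 is the permute control derived from the source misalignment.
	 * v0 always carries the previously loaded quadword: each new lvx
	 * fetches the next one and VPERM merges the pair into 16 aligned
	 * bytes for the store.  Note that r4 runs 16B ahead of the data
	 * consumed; this is unwound before the final sub-16B tail.
	 */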
	bf	cr7*4+3,5f
	lvx	v1,r0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,r0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
	lvx	v1,r0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,r0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,r0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,r0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,r0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,r0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,r0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,r0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,r0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,r0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,r0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */