linux/arch/powerpc/lib/memcpy_power7.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE   0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif
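/*
 * On little-endian we use lvsr and swap the vperm inputs so that the
 * unaligned permute loop below behaves the same as the big-endian
 * lvsl/vperm form.
 */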

_GLOBAL(memcpy_power7)
        cmpldi  r5,16
        cmpldi  cr1,r5,4096
        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        blt     .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
        bgt     cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)
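        /*
         * r6 = bytes needed to 8B align the source; its low bits are now in
         * CR7, so the bf tests below copy 1, 2 and 4 bytes as required.
         */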

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
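        /* 16 doublewords (r0, r6-r12, r14-r21) per iteration. */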
        .align  5
4:
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        ld      r14,64(r4)
        ld      r15,72(r4)
        ld      r16,80(r4)
        ld      r17,88(r4)
        ld      r18,96(r4)
        ld      r19,104(r4)
        ld      r20,112(r4)
        ld      r21,120(r4)
        addi    r4,r4,128
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        std     r14,64(r3)
        std     r15,72(r3)
        std     r16,80(r3)
        std     r17,88(r3)
        std     r18,96(r3)
        std     r19,104(r3)
        std     r20,112(r3)
        std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6
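        /* CR7 now selects the 64B, 32B and 16B tail copies below. */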

6:      bf      cr7*4+1,7f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        addi    r4,r4,64
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        addi    r4,r4,32
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
        ld      r0,0(r4)
        ld      r6,8(r4)
        addi    r4,r4,16
        std     r0,0(r3)
        std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
        mflr    r0
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      enter_vmx_ops
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
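        /*
         * The first dcbt/dcbtst of each pair (TH=0b01000) names a stream's
         * start address; the second (TH=0b01010) supplies its length in
         * cachelines (capped at 0x3FF), depth and stream ID. The final dcbt
         * with GO set starts both streams.
         */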
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

        dcbt    0,r6,0b01000
        dcbt    0,r7,0b01010
        dcbtst  0,r9,0b01000
        dcbtst  0,r10,0b01010
        eieio
        dcbt    0,r8,0b01010    /* GO */

        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
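        /* Relative misalignment = low 4 bits of (src XOR dst). */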
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)
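        /*
         * r6 = bytes to the next 128B boundary; CR7 (set from r6 >> 4)
         * selects the 16B, 32B and 64B vector copies below.
         */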

        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
        lvx     v1,0,r4
        addi    r4,r4,16
        stvx    v1,0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
        lvx     v1,0,r4
        lvx     v0,r4,r9
        addi    r4,r4,32
        stvx    v1,0,r3
        stvx    v0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     v3,0,r4
        lvx     v2,r4,r9
        lvx     v1,r4,r10
        lvx     v0,r4,r11
        addi    r4,r4,64
        stvx    v3,0,r3
        stvx    v2,r3,r9
        stvx    v1,r3,r10
        stvx    v0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
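        /* Eight 16B vectors per iteration = one 128B cacheline. */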
        .align  5
8:
        lvx     v7,0,r4
        lvx     v6,r4,r9
        lvx     v5,r4,r10
        lvx     v4,r4,r11
        lvx     v3,r4,r12
        lvx     v2,r4,r14
        lvx     v1,r4,r15
        lvx     v0,r4,r16
        addi    r4,r4,128
        stvx    v7,0,r3
        stvx    v6,r3,r9
        stvx    v5,r3,r10
        stvx    v4,r3,r11
        stvx    v3,r3,r12
        stvx    v2,r3,r14
        stvx    v1,r3,r15
        stvx    v0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     v3,0,r4
        lvx     v2,r4,r9
        lvx     v1,r4,r10
        lvx     v0,r4,r11
        addi    r4,r4,64
        stvx    v3,0,r3
        stvx    v2,r3,r9
        stvx    v1,r3,r10
        stvx    v0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     v1,0,r4
        lvx     v0,r4,r9
        addi    r4,r4,32
        stvx    v1,0,r3
        stvx    v0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     v1,0,r4
        addi    r4,r4,16
        stvx    v1,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        b       exit_vmx_ops            /* tail call optimise */

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r7,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        LVS(v16,0,r4)           /* Setup permute control vector */
        lvx     v0,0,r4
        addi    r4,r4,16
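        /*
         * Software-pipelined permute: v0 always holds the previously loaded
         * aligned 16B, so each VPERM merges it with the next load to produce
         * one aligned 16B of the unaligned source stream. r4 therefore runs
         * 16B ahead and is wound back at label 11 before the final tail copy.
         */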

        bf      cr7*4+3,5f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
        stvx    v8,0,r3
        addi    r3,r3,16
        vor     v0,v1,v1

5:      bf      cr7*4+2,6f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
        stvx    v8,0,r3
        stvx    v9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
        lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
        lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
        lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
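        /* Eight overlapping lvx/VPERM pairs produce each 128B cacheline. */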
        .align  5
8:
        lvx     v7,0,r4
        VPERM(v8,v0,v7,v16)
        lvx     v6,r4,r9
        VPERM(v9,v7,v6,v16)
        lvx     v5,r4,r10
        VPERM(v10,v6,v5,v16)
        lvx     v4,r4,r11
        VPERM(v11,v5,v4,v16)
        lvx     v3,r4,r12
        VPERM(v12,v4,v3,v16)
        lvx     v2,r4,r14
        VPERM(v13,v3,v2,v16)
        lvx     v1,r4,r15
        VPERM(v14,v2,v1,v16)
        lvx     v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi    r4,r4,128
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        stvx    v12,r3,r12
        stvx    v13,r3,r14
        stvx    v14,r3,r15
        stvx    v15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
        lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
        lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
        lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
        stvx    v8,0,r3
        stvx    v9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
        stvx    v8,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        b       exit_vmx_ops            /* tail call optimise */
#endif /* CONFIG_ALTIVEC */