linux/arch/powerpc/lib/copyuser_power7.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE   0
#endif

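/*
 * LVS/VPERM hide the endian difference in the unaligned (permute) copy
 * path: big endian uses lvsl and vperm(VRA,VRB), little endian uses lvsr
 * with the vperm inputs swapped so consecutive quadwords are merged in
 * the right order.
 */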
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif

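/*
 * The errN macros tag the load or store that follows them with an
 * exception table entry, so a user access fault there branches to the
 * matching .Ldo_errN fixup below.
 */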
        .macro err1
100:
        EX_TABLE(100b,.Ldo_err1)
        .endm

        .macro err2
200:
        EX_TABLE(200b,.Ldo_err2)
        .endm

#ifdef CONFIG_ALTIVEC
        .macro err3
300:
        EX_TABLE(300b,.Ldo_err3)
        .endm

        .macro err4
400:
        EX_TABLE(400b,.Ldo_err4)
        .endm


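/*
 * Fault fixups: restore any non-volatile registers we saved (and leave
 * VMX for the vector paths), then reload the original arguments and let
 * __copy_tofrom_user_base redo the copy and return the number of bytes
 * that could not be copied.
 */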
.Ldo_err4:
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Ldo_err3:
        bl      exit_vmx_usercopy
        ld      r0,STACKFRAMESIZE+16(r1)
        mtlr    r0
        b       .Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
        ld      r22,STK_REG(R22)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Lexit:
        addi    r1,r1,STACKFRAMESIZE
.Ldo_err1:
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        ld      r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        ld      r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        b       __copy_tofrom_user_base


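/*
 * __copy_tofrom_user_power7(to=r3, from=r4, size=r5)
 *
 * Returns 0 in r3 on success; on a fault the fixups above hand the copy
 * back to __copy_tofrom_user_base, which works out the uncopied count.
 */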
_GLOBAL(__copy_tofrom_user_power7)
        cmpldi  r5,16
        cmpldi  cr1,r5,3328

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt     .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
        bgt     cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

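        /*
         * Integer copy path.  mtocrf 0x01 moves the low four bits of a
         * count into cr7, so each "bf cr7*4+n" below conditionally copies
         * the corresponding power-of-two sized chunk.
         */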
.Lnonvmx_copy:
        /* Get the source 8B aligned */
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

        bf      cr7*4+3,1f
err1;   lbz     r0,0(r4)
        addi    r4,r4,1
err1;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

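        /*
         * At least 128B left: the unrolled loop below uses non-volatile
         * registers (r14 and up), so save them (along with LR) across it.
         */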
        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
        .align  5
4:
err2;   ld      r0,0(r4)
err2;   ld      r6,8(r4)
err2;   ld      r7,16(r4)
err2;   ld      r8,24(r4)
err2;   ld      r9,32(r4)
err2;   ld      r10,40(r4)
err2;   ld      r11,48(r4)
err2;   ld      r12,56(r4)
err2;   ld      r14,64(r4)
err2;   ld      r15,72(r4)
err2;   ld      r16,80(r4)
err2;   ld      r17,88(r4)
err2;   ld      r18,96(r4)
err2;   ld      r19,104(r4)
err2;   ld      r20,112(r4)
err2;   ld      r21,120(r4)
        addi    r4,r4,128
err2;   std     r0,0(r3)
err2;   std     r6,8(r3)
err2;   std     r7,16(r3)
err2;   std     r8,24(r3)
err2;   std     r9,32(r3)
err2;   std     r10,40(r3)
err2;   std     r11,48(r3)
err2;   std     r12,56(r3)
err2;   std     r14,64(r3)
err2;   std     r15,72(r3)
err2;   std     r16,80(r3)
err2;   std     r17,88(r3)
err2;   std     r18,96(r3)
err2;   std     r19,104(r3)
err2;   std     r20,112(r3)
err2;   std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6

6:      bf      cr7*4+1,7f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
err1;   ld      r9,32(r4)
err1;   ld      r10,40(r4)
err1;   ld      r11,48(r4)
err1;   ld      r12,56(r4)
        addi    r4,r4,64
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
err1;   std     r9,32(r3)
err1;   std     r10,40(r3)
err1;   std     r11,48(r3)
err1;   std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
        addi    r4,r4,32
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
        addi    r4,r4,16
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err1;   lwz     r6,4(r4)
        addi    r4,r4,8
err1;   stw     r0,0(r3)
err1;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err1;   lbz     r0,0(r4)
err1;   stb     r0,0(r3)

15:     li      r3,0
        blr

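        /*
         * enter_vmx_usercopy() returned 0 (VMX cannot be used), so drop the
         * frame we created for the VMX attempt and fall back to the integer
         * copy.
         */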
.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
        mflr    r0
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      enter_vmx_usercopy
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

        /* setup read stream 0 */
        dcbt    0,r6,0b01000   /* addr from */
        dcbt    0,r7,0b01010   /* length and depth from */
        /* setup write stream 1 */
        dcbtst  0,r9,0b01000   /* addr to */
        dcbtst  0,r10,0b01010  /* length and depth to */
        eieio
        dcbt    0,r8,0b01010    /* all streams GO */

        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

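        /*
         * Relatively aligned: bring the destination up to 16B and then 128B
         * alignment in power-of-two steps, after which whole cachelines can
         * be copied with lvx/stvx.
         */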
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

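        /* lvx/stvx take their offsets from registers: r9/r10/r11 = 16/32/48 */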
        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
err3;   lvx     v1,0,r4
        addi    r4,r4,16
err3;   stvx    v1,0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
err3;   lvx     v1,0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

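        /* Offsets 64/80/96/112 for the second half of each 128B line */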
        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     v7,0,r4
err4;   lvx     v6,r4,r9
err4;   lvx     v5,r4,r10
err4;   lvx     v4,r4,r11
err4;   lvx     v3,r4,r12
err4;   lvx     v2,r4,r14
err4;   lvx     v1,r4,r15
err4;   lvx     v0,r4,r16
        addi    r4,r4,128
err4;   stvx    v7,0,r3
err4;   stvx    v6,r3,r9
err4;   stvx    v5,r3,r10
err4;   stvx    v4,r3,r11
err4;   stvx    v3,r3,r12
err4;   stvx    v2,r3,r14
err4;   stvx    v1,r3,r15
err4;   stvx    v0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,0,r4
        addi    r4,r4,16
err3;   stvx    v1,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */

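        /*
         * Source and destination are misaligned relative to each other.
         * lvx ignores the low four bits of the address, so load aligned
         * quadwords from the source and use vperm (control vector from LVS)
         * to shift each consecutive pair into place before storing to the
         * aligned destination.
         */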
.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r7,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

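        /*
         * v0 always carries the previous source quadword; each step loads
         * the next one and vperm merges the pair.  The copy therefore reads
         * one quadword ahead, which is unwound before the final tail copy.
         */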
        LVS(v16,0,r4)           /* Setup permute control vector */
err3;   lvx     v0,0,r4
        addi    r4,r4,16

        bf      cr7*4+3,5f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,0,r3
        addi    r3,r3,16
        vor     v0,v1,v1

5:      bf      cr7*4+2,6f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     v7,0,r4
        VPERM(v8,v0,v7,v16)
err4;   lvx     v6,r4,r9
        VPERM(v9,v7,v6,v16)
err4;   lvx     v5,r4,r10
        VPERM(v10,v6,v5,v16)
err4;   lvx     v4,r4,r11
        VPERM(v11,v5,v4,v16)
err4;   lvx     v3,r4,r12
        VPERM(v12,v4,v3,v16)
err4;   lvx     v2,r4,r14
        VPERM(v13,v3,v2,v16)
err4;   lvx     v1,r4,r15
        VPERM(v14,v2,v1,v16)
err4;   lvx     v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi    r4,r4,128
err4;   stvx    v8,0,r3
err4;   stvx    v9,r3,r9
err4;   stvx    v10,r3,r10
err4;   stvx    v11,r3,r11
err4;   stvx    v12,r3,r12
err4;   stvx    v13,r3,r14
err4;   stvx    v14,r3,r15
err4;   stvx    v15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r6,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */
#endif /* CONFIG_ALTIVEC */