linux/arch/powerpc/lib/copyuser_power7.S
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif

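/*
 * On little endian, lvsr is used instead of lvsl to generate the permute
 * control vector and the vperm inputs are swapped; the LVS and VPERM macros
 * hide that difference so the unaligned copy code below is endian
 * independent.
 */
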
        .macro err1
100:
        EX_TABLE(100b,.Ldo_err1)
        .endm

        .macro err2
200:
        EX_TABLE(200b,.Ldo_err2)
        .endm

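/*
 * Each errN macro places a local label on the user access that follows it
 * and records an exception table entry for that address, so a fault in the
 * access branches to the matching .Ldo_errN fixup below.
 */
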
#ifdef CONFIG_ALTIVEC
        .macro err3
300:
        EX_TABLE(300b,.Ldo_err3)
        .endm

        .macro err4
400:
        EX_TABLE(400b,.Ldo_err4)
        .endm


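/*
 * Fault fixups: restore whatever non-volatile registers the faulting loop
 * had live (and leave VMX via exit_vmx_usercopy on the vector paths), then
 * reload the original dest/src/len and redo the copy with
 * __copy_tofrom_user_base so the number of uncopied bytes is reported.
 */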
.Ldo_err4:
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Ldo_err3:
        bl      exit_vmx_usercopy
        ld      r0,STACKFRAMESIZE+16(r1)
        mtlr    r0
        b       .Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
        ld      r22,STK_REG(R22)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Lexit:
        addi    r1,r1,STACKFRAMESIZE
.Ldo_err1:
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        ld      r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        ld      r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        b       __copy_tofrom_user_base


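/*
 * __copy_tofrom_user_power7(to=r3, from=r4, len=r5)
 *
 * Returns the number of bytes not copied in r3, 0 on success.
 */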
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
        cmpldi  r5,16
        cmpldi  cr1,r5,3328

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt     .Lshort_copy
        bge     cr1,.Lvmx_copy
#else
        cmpldi  r5,16

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt     .Lshort_copy
#endif

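/*
 * Scalar copy path: copies shorter than 16 bytes have already branched to
 * .Lshort_copy and, with CONFIG_ALTIVEC, copies of 3328 bytes or more have
 * gone to .Lvmx_copy; everything else is handled here with GPRs.
 */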
.Lnonvmx_copy:
        /* Get the source 8B aligned */
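        /*
         * neg gives the number of bytes needed to reach 8B alignment; its
         * low bits are transferred into cr7 with mtocrf so the bf below can
         * conditionally copy 1, 2 and 4 bytes.
         */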
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

        bf      cr7*4+3,1f
err1;   lbz     r0,0(r4)
        addi    r4,r4,1
err1;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
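        /*
         * All sixteen 8B loads of a cacheline (into r0, r6-r12, r14-r21)
         * are issued before the sixteen stores, so each source line is read
         * in full before the destination line is written.
         */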
        .align  5
4:
err2;   ld      r0,0(r4)
err2;   ld      r6,8(r4)
err2;   ld      r7,16(r4)
err2;   ld      r8,24(r4)
err2;   ld      r9,32(r4)
err2;   ld      r10,40(r4)
err2;   ld      r11,48(r4)
err2;   ld      r12,56(r4)
err2;   ld      r14,64(r4)
err2;   ld      r15,72(r4)
err2;   ld      r16,80(r4)
err2;   ld      r17,88(r4)
err2;   ld      r18,96(r4)
err2;   ld      r19,104(r4)
err2;   ld      r20,112(r4)
err2;   ld      r21,120(r4)
        addi    r4,r4,128
err2;   std     r0,0(r3)
err2;   std     r6,8(r3)
err2;   std     r7,16(r3)
err2;   std     r8,24(r3)
err2;   std     r9,32(r3)
err2;   std     r10,40(r3)
err2;   std     r11,48(r3)
err2;   std     r12,56(r3)
err2;   std     r14,64(r3)
err2;   std     r15,72(r3)
err2;   std     r16,80(r3)
err2;   std     r17,88(r3)
err2;   std     r18,96(r3)
err2;   std     r19,104(r3)
err2;   std     r20,112(r3)
err2;   std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6

6:      bf      cr7*4+1,7f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
err1;   ld      r9,32(r4)
err1;   ld      r10,40(r4)
err1;   ld      r11,48(r4)
err1;   ld      r12,56(r4)
        addi    r4,r4,64
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
err1;   std     r9,32(r3)
err1;   std     r10,40(r3)
err1;   std     r11,48(r3)
err1;   std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
        addi    r4,r4,32
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
        addi    r4,r4,16
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err1;   lwz     r6,4(r4)
        addi    r4,r4,8
err1;   stw     r0,0(r3)
err1;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err1;   lbz     r0,0(r4)
err1;   stb     r0,0(r3)

15:     li      r3,0
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
        mflr    r0
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      enter_vmx_usercopy
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

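        /*
         * The touches below use the enhanced data stream TH encodings:
         * TH=0b01000 sets a stream's start address, TH=0b01010 sets its
         * length, depth and ID, and the final dcbt with GO set starts both
         * streams.
         */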
        /* setup read stream 0 */
        dcbt    0,r6,0b01000   /* addr from */
        dcbt    0,r7,0b01010   /* length and depth from */
        /* setup write stream 1 */
        dcbtst  0,r9,0b01000   /* addr to */
        dcbtst  0,r10,0b01010  /* length and depth to */
        eieio
        dcbt    0,r8,0b01010    /* all streams GO */

        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
err3;   lvx     v1,0,r4
        addi    r4,r4,16
err3;   stvx    v1,0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
err3;   lvx     v1,0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
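        /*
         * Eight 16B vector loads followed by eight 16B vector stores move
         * one 128B cacheline per iteration; r9-r12 and r14-r16 hold the
         * 16..112 byte offsets.
         */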
        .align  5
8:
err4;   lvx     v7,0,r4
err4;   lvx     v6,r4,r9
err4;   lvx     v5,r4,r10
err4;   lvx     v4,r4,r11
err4;   lvx     v3,r4,r12
err4;   lvx     v2,r4,r14
err4;   lvx     v1,r4,r15
err4;   lvx     v0,r4,r16
        addi    r4,r4,128
err4;   stvx    v7,0,r3
err4;   stvx    v6,r3,r9
err4;   stvx    v5,r3,r10
err4;   stvx    v4,r3,r11
err4;   stvx    v3,r3,r12
err4;   stvx    v2,r3,r14
err4;   stvx    v1,r3,r15
err4;   stvx    v0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,0,r4
        addi    r4,r4,16
err3;   stvx    v1,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

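        /*
         * exit_vmx_usercopy is assumed to return 0, so the tail call also
         * provides the "0 bytes not copied" success return value.
         */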
15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r7,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        LVS(v16,0,r4)           /* Setup permute control vector */
err3;   lvx     v0,0,r4
        addi    r4,r4,16

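        /*
         * v16 is the permute control and v0 is primed with the first 16
         * source bytes; from here on each VPERM merges the previously
         * loaded vector with the newly loaded one, so every store is a
         * full 16B aligned vector even though the source is not aligned.
         */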
        bf      cr7*4+3,5f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,0,r3
        addi    r3,r3,16
        vor     v0,v1,v1

5:      bf      cr7*4+2,6f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
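        /*
         * Same cacheline loop as the aligned case, except each store value
         * comes from a VPERM of two consecutive source vectors, with the
         * final vector of one iteration carried over in v0 for the next.
         */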
        .align  5
8:
err4;   lvx     v7,0,r4
        VPERM(v8,v0,v7,v16)
err4;   lvx     v6,r4,r9
        VPERM(v9,v7,v6,v16)
err4;   lvx     v5,r4,r10
        VPERM(v10,v6,v5,v16)
err4;   lvx     v4,r4,r11
        VPERM(v11,v5,v4,v16)
err4;   lvx     v3,r4,r12
        VPERM(v12,v4,v3,v16)
err4;   lvx     v2,r4,r14
        VPERM(v13,v3,v2,v16)
err4;   lvx     v1,r4,r15
        VPERM(v14,v2,v1,v16)
err4;   lvx     v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi    r4,r4,128
err4;   stvx    v8,0,r3
err4;   stvx    v9,r3,r9
err4;   stvx    v10,r3,r10
err4;   stvx    v11,r3,r11
err4;   stvx    v12,r3,r12
err4;   stvx    v13,r3,r14
err4;   stvx    v14,r3,r15
err4;   stvx    v15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r6,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */
#endif /* CONFIG_ALTIVEC */