linux/arch/powerpc/lib/memcmp_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8    r6
#define off16   r7
#define off24   r8

#define rA      r9
#define rB      r10
#define rC      r11
#define rD      r27
#define rE      r28
#define rF      r29
#define rG      r30
#define rH      r31

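/*
 * On little endian the LH/LW/LD macros use the byte-reversed loads
 * (lhbrx/lwbrx/ldbrx), so an unsigned cmpld on the loaded doublewords
 * orders the data the same way a byte-by-byte memcmp() would.  The
 * lvsr/VPERM variants apply the same idea to the VMX path.
 */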
#ifdef __LITTLE_ENDIAN__
#define LH      lhbrx
#define LW      lwbrx
#define LD      ldbrx
#define LVS     lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
        vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH      lhzx
#define LW      lwzx
#define LD      ldx
#define LVS     lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
        vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
#define ENTER_VMX_OPS   \
        mflr    r0;     \
        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
        std     r0,16(r1); \
        stdu    r1,-STACKFRAMESIZE(r1); \
        bl      enter_vmx_ops; \
        cmpwi   cr1,r3,0; \
        ld      r0,STACKFRAMESIZE+16(r1); \
        ld      r3,STK_REG(R31)(r1); \
        ld      r4,STK_REG(R30)(r1); \
        ld      r5,STK_REG(R29)(r1); \
        addi    r1,r1,STACKFRAMESIZE; \
        mtlr    r0

#define EXIT_VMX_OPS \
        mflr    r0; \
        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
        std     r0,16(r1); \
        stdu    r1,-STACKFRAMESIZE(r1); \
        bl      exit_vmx_ops; \
        ld      r0,STACKFRAMESIZE+16(r1); \
        ld      r3,STK_REG(R31)(r1); \
        ld      r4,STK_REG(R30)(r1); \
        ld      r5,STK_REG(R29)(r1); \
        addi    r1,r1,STACKFRAMESIZE; \
        mtlr    r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * on a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
        lvx     _v2nd_qw,_vaddr,off16; \
        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 *    handlers are named .Lsameoffset_xxxx.
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 *    handlers are named .Ldiffoffset_xxxx.
 */
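/*
 * On entry r3 = s1, r4 = s2, r5 = n (the standard memcmp() arguments in
 * the PPC64 ELF ABI argument registers); the signed result is returned
 * in r3.
 */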
_GLOBAL_TOC(memcmp)
        cmpdi   cr1,r5,0

        /* Use the short loop if the src/dst addresses do not have the
         * same offset relative to an 8-byte alignment boundary.
         */
        xor     r6,r3,r4
        andi.   r6,r6,7

        /* Fall back to the short loop when comparing fewer than 8 bytes
         * at aligned addresses.
         */
        cmpdi   cr6,r5,7

        beq     cr1,.Lzero
        bgt     cr6,.Lno_short

.Lshort:
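        /* Compare byte by byte, unrolled 4x; CTR holds the remaining
         * byte count.
         */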
        mtctr   r5
1:      lbz     rA,0(r3)
        lbz     rB,0(r4)
        subf.   rC,rB,rA
        bne     .Lnon_zero
        bdz     .Lzero

        lbz     rA,1(r3)
        lbz     rB,1(r4)
        subf.   rC,rB,rA
        bne     .Lnon_zero
        bdz     .Lzero

        lbz     rA,2(r3)
        lbz     rB,2(r4)
        subf.   rC,rB,rA
        bne     .Lnon_zero
        bdz     .Lzero

        lbz     rA,3(r3)
        lbz     rB,3(r4)
        subf.   rC,rB,rA
        bne     .Lnon_zero

        addi    r3,r3,4
        addi    r4,r4,4

        bdnz    1b

.Lzero:
        li      r3,0
        blr

.Lno_short:
        dcbt    0,r3
        dcbt    0,r4
        bne     .Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
        /* Compare the bytes that are not 8-byte aligned so that the rest
         * of the comparison can run on 8-byte aligned addresses.
         */
        andi.   r6,r3,7

        /* Try to compare the first double word which is not 8-byte aligned:
         * load the first double word at (src & ~7UL) and shift left the
         * appropriate bits before the comparison.
         */
        rlwinm  r6,r3,3,26,28
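        /* r6 = (r3 & 7) << 3: bits taken up by the unwanted leading bytes */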
        beq     .Lsameoffset_8bytes_aligned
        clrrdi  r3,r3,3
        clrrdi  r4,r4,3
        LD      rA,0,r3
        LD      rB,0,r4
        sld     rA,rA,r6
        sld     rB,rB,r6
        cmpld   cr0,rA,rB
        srwi    r6,r6,3
        bne     cr0,.LcmpAB_lightweight
        subfic  r6,r6,8
        subf.   r5,r6,r5
        addi    r3,r3,8
        addi    r4,r4,8
        beq     .Lzero

.Lsameoffset_8bytes_aligned:
        /* Now we are 8-byte aligned.
         * Use the .Llong loop if 32 or more bytes remain to be compared.
         */
        cmpdi   cr6,r5,31
        bgt     cr6,.Llong

.Lcmp_lt32bytes:
        /* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
        cmpdi   cr5,r5,7
        srdi    r0,r5,3
        ble     cr5,.Lcmp_rest_lt8bytes

        /* handle 8 ~ 31 bytes */
        clrldi  r5,r5,61
        mtctr   r0
2:
        LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr0,rA,rB
        addi    r3,r3,8
        addi    r4,r4,8
        bne     cr0,.LcmpAB_lightweight
        bdnz    2b

        cmpwi   r5,0
        beq     .Lzero

.Lcmp_rest_lt8bytes:
        /*
         * Here we have less than 8 bytes to compare. At least s1 is aligned to
         * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
         * page boundary, otherwise we might read past the end of the buffer and
         * trigger a page fault. We use 4K as the conservative minimum page
         * size. If we detect that case we go to the byte-by-byte loop.
         *
         * Otherwise the next double word is loaded from s1 and s2, and shifted
         * right to compare the appropriate bits.
         */
        clrldi  r6,r4,(64-12)   // r6 = r4 & 0xfff
        cmpdi   r6,0xff8
        bgt     .Lshort

        subfic  r6,r5,8
        slwi    r6,r6,3
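        /* r6 = (8 - r5) << 3: srd below drops the bytes beyond the length */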
        LD      rA,0,r3
        LD      rB,0,r4
        srd     rA,rA,r6
        srd     rB,rB,r6
        cmpld   cr0,rA,rB
        bne     cr0,.LcmpAB_lightweight
        b       .Lzero

.Lnon_zero:
        mr      r3,rC
        blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
        /* Try to use the vmx loop if the length is equal to or greater than 4K */
        cmpldi  cr6,r5,VMX_THRESH
        bge     cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
        /* At least s1 addr is aligned with 8 bytes */
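        /* The loop below compares 32 bytes per iteration and keeps the
         * four cmpld results in cr0/cr1/cr6/cr7, so the loads for the
         * next iteration can be issued before branching on the previous
         * results.
         */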
        li      off8,8
        li      off16,16
        li      off24,24

        std     r31,-8(r1)
        std     r30,-16(r1)
        std     r29,-24(r1)
        std     r28,-32(r1)
        std     r27,-40(r1)

        srdi    r0,r5,5
        mtctr   r0
        andi.   r5,r5,31

        LD      rA,0,r3
        LD      rB,0,r4

        LD      rC,off8,r3
        LD      rD,off8,r4

        LD      rE,off16,r3
        LD      rF,off16,r4

        LD      rG,off24,r3
        LD      rH,off24,r4
        cmpld   cr0,rA,rB

        addi    r3,r3,32
        addi    r4,r4,32

        bdz     .Lfirst32

        LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr1,rC,rD

        LD      rC,off8,r3
        LD      rD,off8,r4
        cmpld   cr6,rE,rF

        LD      rE,off16,r3
        LD      rF,off16,r4
        cmpld   cr7,rG,rH
        bne     cr0,.LcmpAB

        LD      rG,off24,r3
        LD      rH,off24,r4
        cmpld   cr0,rA,rB
        bne     cr1,.LcmpCD

        addi    r3,r3,32
        addi    r4,r4,32

        bdz     .Lsecond32

        .balign 16

1:      LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr1,rC,rD
        bne     cr6,.LcmpEF

        LD      rC,off8,r3
        LD      rD,off8,r4
        cmpld   cr6,rE,rF
        bne     cr7,.LcmpGH

        LD      rE,off16,r3
        LD      rF,off16,r4
        cmpld   cr7,rG,rH
        bne     cr0,.LcmpAB

        LD      rG,off24,r3
        LD      rH,off24,r4
        cmpld   cr0,rA,rB
        bne     cr1,.LcmpCD

        addi    r3,r3,32
        addi    r4,r4,32

        bdnz    1b

.Lsecond32:
        cmpld   cr1,rC,rD
        bne     cr6,.LcmpEF

        cmpld   cr6,rE,rF
        bne     cr7,.LcmpGH

        cmpld   cr7,rG,rH
        bne     cr0,.LcmpAB

        bne     cr1,.LcmpCD
        bne     cr6,.LcmpEF
        bne     cr7,.LcmpGH

.Ltail:
        ld      r31,-8(r1)
        ld      r30,-16(r1)
        ld      r29,-24(r1)
        ld      r28,-32(r1)
        ld      r27,-40(r1)

        cmpdi   r5,0
        beq     .Lzero
        b       .Lshort

.Lfirst32:
        cmpld   cr1,rC,rD
        cmpld   cr6,rE,rF
        cmpld   cr7,rG,rH

        bne     cr0,.LcmpAB
        bne     cr1,.LcmpCD
        bne     cr6,.LcmpEF
        bne     cr7,.LcmpGH

        b       .Ltail

.LcmpAB:
        li      r3,1
        bgt     cr0,.Lout
        li      r3,-1
        b       .Lout

.LcmpCD:
        li      r3,1
        bgt     cr1,.Lout
        li      r3,-1
        b       .Lout

.LcmpEF:
        li      r3,1
        bgt     cr6,.Lout
        li      r3,-1
        b       .Lout

.LcmpGH:
        li      r3,1
        bgt     cr7,.Lout
        li      r3,-1

.Lout:
        ld      r31,-8(r1)
        ld      r30,-16(r1)
        ld      r29,-24(r1)
        ld      r28,-32(r1)
        ld      r27,-40(r1)
        blr

.LcmpAB_lightweight:   /* skip NV GPRS restore */
        li      r3,1
        bgtlr
        li      r3,-1
        blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
        /* Entered with src/dst addresses that have the same offset from
         * an 8-byte alignment boundary.
         *
         * There is an optimization based on the following fact: memcmp()
         * tends to fail early, within the first 32 bytes.
         * Before using VMX instructions, which incur a 32x128-bit
         * VMX register save/restore penalty, compare the first 32 bytes
         * so that the ~80% of calls that fail early are caught here.
         */

        li      r0,4
        mtctr   r0
.Lsameoffset_prechk_32B_loop:
        LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr0,rA,rB
        addi    r3,r3,8
        addi    r4,r4,8
        bne     cr0,.LcmpAB_lightweight
        addi    r5,r5,-8
        bdnz    .Lsameoffset_prechk_32B_loop

        ENTER_VMX_OPS
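        /* cr1 was set by ENTER_VMX_OPS from the enter_vmx_ops() return
         * value; zero means VMX cannot be used here, so fall back to
         * the integer loop.
         */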
        beq     cr1,.Llong_novmx_cmp

3:
        /* need to check whether r4 has the same offset as r3 relative
         * to a 16-byte boundary.
         */
        xor     r0,r3,r4
        andi.   r0,r0,0xf
        bne     .Ldiffoffset_vmx_cmp_start

        /* The length is no less than 4KB. r3 also needs to be aligned
         * to a 16-byte boundary.
         */
        andi.   rA,r3,8
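        /* the andi. result is only needed in cr0; rA itself is
         * immediately overwritten by the load below.
         */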
        LD      rA,0,r3
        beq     4f
        LD      rB,0,r4
        cmpld   cr0,rA,rB
        addi    r3,r3,8
        addi    r4,r4,8
        addi    r5,r5,-8

        beq     cr0,4f
        /* save and restore cr0 */
        mfocrf  r5,128
        EXIT_VMX_OPS
        mtocrf  128,r5
        b       .LcmpAB_lightweight

4:
        /* compare 32 bytes for each loop */
        srdi    r0,r5,5
        mtctr   r0
        clrldi  r5,r5,59
        li      off16,16

.balign 16
5:
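        /* VCMPEQUD_RC (vcmpequd.) sets the "all elements equal" bit in
         * cr6; bnl cr6 branches out as soon as any doubleword in the
         * 16-byte chunk differs.
         */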
        lvx     v0,0,r3
        lvx     v1,0,r4
        VCMPEQUD_RC(v0,v0,v1)
        bnl     cr6,7f
        lvx     v0,off16,r3
        lvx     v1,off16,r4
        VCMPEQUD_RC(v0,v0,v1)
        bnl     cr6,6f
        addi    r3,r3,32
        addi    r4,r4,32
        bdnz    5b

        EXIT_VMX_OPS
        cmpdi   r5,0
        beq     .Lzero
        b       .Lcmp_lt32bytes

6:
        addi    r3,r3,16
        addi    r4,r4,16

7:
        /* diff the last 16 bytes */
        EXIT_VMX_OPS
        LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr0,rA,rB
        li      off8,8
        bne     cr0,.LcmpAB_lightweight

        LD      rA,off8,r3
        LD      rB,off8,r4
        cmpld   cr0,rA,rB
        bne     cr0,.LcmpAB_lightweight
        b       .Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
        /* now try to align s1 with 8 bytes */
        rlwinm  r6,r3,3,26,28
        beq     .Ldiffoffset_align_s1_8bytes

        clrrdi  r3,r3,3
        LD      rA,0,r3
        LD      rB,0,r4  /* unaligned load */
        sld     rA,rA,r6
        srd     rA,rA,r6
        srd     rB,rB,r6
        cmpld   cr0,rA,rB
        srwi    r6,r6,3
        bne     cr0,.LcmpAB_lightweight

        subfic  r6,r6,8
        subf.   r5,r6,r5
        addi    r3,r3,8
        add     r4,r4,r6

        beq     .Lzero

.Ldiffoffset_align_s1_8bytes:
        /* now s1 is aligned with 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
        /* only do vmx ops when the size is equal to or greater than 4K bytes */
        cmpdi   cr5,r5,VMX_THRESH
        bge     cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


        cmpdi   cr5,r5,31
        ble     cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
        b       .Llong_novmx_cmp
#else
        b       .Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
        /* perform a 32-byte pre-check before
         * enabling VMX operations.
         */
        li      r0,4
        mtctr   r0
.Ldiffoffset_prechk_32B_loop:
        LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr0,rA,rB
        addi    r3,r3,8
        addi    r4,r4,8
        bne     cr0,.LcmpAB_lightweight
        addi    r5,r5,-8
        bdnz    .Ldiffoffset_prechk_32B_loop

        ENTER_VMX_OPS
        beq     cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
        /* First try to align r3 to a 16-byte boundary */
        andi.   r6,r3,0xf
        li      off16,16
        beq     .Ldiffoffset_vmx_s1_16bytes_align

        LVS     v3,0,r3
        LVS     v4,0,r4

        lvx     v5,0,r3
        lvx     v6,0,r4
        LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
        LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

        VCMPEQUB_RC(v7,v9,v10)
        bnl     cr6,.Ldiffoffset_vmx_diff_found

        subfic  r6,r6,16
        subf    r5,r6,r5
        add     r3,r3,r6
        add     r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
        /* now s1 is aligned with 16 bytes */
        lvx     v6,0,r4
        LVS     v4,0,r4
        srdi    r6,r5,5  /* loop for 32 bytes each */
        clrldi  r5,r5,59
        mtctr   r6

.balign 16
.Ldiffoffset_vmx_32bytesloop:
        /* the first qw of r4 was saved in v6 */
        lvx     v9,0,r3
        LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
        VCMPEQUB_RC(v7,v9,v10)
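        /* carry r4's 2nd quadword (v8) over so it becomes the 1st
         * quadword (v6) for the next 16-byte step.
         */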
        vor     v6,v8,v8
        bnl     cr6,.Ldiffoffset_vmx_diff_found

        addi    r3,r3,16
        addi    r4,r4,16

        lvx     v9,0,r3
        LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
        VCMPEQUB_RC(v7,v9,v10)
        vor     v6,v8,v8
        bnl     cr6,.Ldiffoffset_vmx_diff_found

        addi    r3,r3,16
        addi    r4,r4,16

        bdnz    .Ldiffoffset_vmx_32bytesloop

        EXIT_VMX_OPS

        cmpdi   r5,0
        beq     .Lzero
        b       .Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
        EXIT_VMX_OPS
        /* the difference is somewhere within the next 16 bytes */
        li      r5,16
        b       .Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)