linux/arch/sparc/lib/memcpy.S
/* memcpy.S: Sparc optimized memcpy and memmove code
 * Hand optimized from GNU libc's memcpy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */

#define FUNC(x)                 \
        .globl  x;              \
        .type   x,@function;    \
        .align  4;              \
x:

/* Both these macros have to start with exactly the same insn, because
 * the code below enters the aligned loop at 82f + 4 with that first
 * load already issued in a branch delay slot.
 */
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd     [%src + (offset) + 0x00], %t0; \
        ldd     [%src + (offset) + 0x08], %t2; \
        ldd     [%src + (offset) + 0x10], %t4; \
        ldd     [%src + (offset) + 0x18], %t6; \
        st      %t0, [%dst + (offset) + 0x00]; \
        st      %t1, [%dst + (offset) + 0x04]; \
        st      %t2, [%dst + (offset) + 0x08]; \
        st      %t3, [%dst + (offset) + 0x0c]; \
        st      %t4, [%dst + (offset) + 0x10]; \
        st      %t5, [%dst + (offset) + 0x14]; \
        st      %t6, [%dst + (offset) + 0x18]; \
        st      %t7, [%dst + (offset) + 0x1c];
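
/* Roughly what one MOVE_BIGCHUNK expansion does, as a hypothetical C
 * sketch (names not in this file): %src is 8-byte aligned by the time
 * this runs, so ldd is legal, while %dst may be only 4-byte aligned,
 * which is why every loaded double-word is stored as two single words.
 * SPARC is big-endian, so the even register (the high word) goes to
 * the lower address:
 *
 *      for (int i = 0; i < 4; i++) {
 *              unsigned long long d = *(unsigned long long *)(src + offset + 8*i);
 *              *(unsigned int *)(dst + offset + 8*i + 0) = d >> 32;
 *              *(unsigned int *)(dst + offset + 8*i + 4) = d;
 *      }
 */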

#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd     [%src + (offset) + 0x00], %t0; \
        ldd     [%src + (offset) + 0x08], %t2; \
        ldd     [%src + (offset) + 0x10], %t4; \
        ldd     [%src + (offset) + 0x18], %t6; \
        std     %t0, [%dst + (offset) + 0x00]; \
        std     %t2, [%dst + (offset) + 0x08]; \
        std     %t4, [%dst + (offset) + 0x10]; \
        std     %t6, [%dst + (offset) + 0x18];

#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd     [%src - (offset) - 0x10], %t0; \
        ldd     [%src - (offset) - 0x08], %t2; \
        st      %t0, [%dst - (offset) - 0x10]; \
        st      %t1, [%dst - (offset) - 0x0c]; \
        st      %t2, [%dst - (offset) - 0x08]; \
        st      %t3, [%dst - (offset) - 0x04];

#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd     [%src - (offset) - 0x10], %t0; \
        ldd     [%src - (offset) - 0x08], %t2; \
        std     %t0, [%dst - (offset) - 0x10]; \
        std     %t2, [%dst - (offset) - 0x08];

#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
        ldub    [%src - (offset) - 0x02], %t0; \
        ldub    [%src - (offset) - 0x01], %t1; \
        stb     %t0, [%dst - (offset) - 0x02]; \
        stb     %t1, [%dst - (offset) - 0x01];

/* Both these macros have to start with exactly the same insn */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd     [%src - (offset) - 0x20], %t0; \
        ldd     [%src - (offset) - 0x18], %t2; \
        ldd     [%src - (offset) - 0x10], %t4; \
        ldd     [%src - (offset) - 0x08], %t6; \
        st      %t0, [%dst - (offset) - 0x20]; \
        st      %t1, [%dst - (offset) - 0x1c]; \
        st      %t2, [%dst - (offset) - 0x18]; \
        st      %t3, [%dst - (offset) - 0x14]; \
        st      %t4, [%dst - (offset) - 0x10]; \
        st      %t5, [%dst - (offset) - 0x0c]; \
        st      %t6, [%dst - (offset) - 0x08]; \
        st      %t7, [%dst - (offset) - 0x04];

#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd     [%src - (offset) - 0x20], %t0; \
        ldd     [%src - (offset) - 0x18], %t2; \
        ldd     [%src - (offset) - 0x10], %t4; \
        ldd     [%src - (offset) - 0x08], %t6; \
        std     %t0, [%dst - (offset) - 0x20]; \
        std     %t2, [%dst - (offset) - 0x18]; \
        std     %t4, [%dst - (offset) - 0x10]; \
        std     %t6, [%dst - (offset) - 0x08];

#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd     [%src + (offset) + 0x00], %t0; \
        ldd     [%src + (offset) + 0x08], %t2; \
        st      %t0, [%dst + (offset) + 0x00]; \
        st      %t1, [%dst + (offset) + 0x04]; \
        st      %t2, [%dst + (offset) + 0x08]; \
        st      %t3, [%dst + (offset) + 0x0c];

#define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
        ldub    [%src + (offset) + 0x00], %t0; \
        ldub    [%src + (offset) + 0x01], %t1; \
        stb     %t0, [%dst + (offset) + 0x00]; \
        stb     %t1, [%dst + (offset) + 0x01];

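/* The two SMOVE macros below implement a shift-and-merge copy for a
 * source that is misaligned relative to the destination: with
 * shil + shir == 32, each output word is assembled from two adjacent
 * input words, roughly (a hypothetical C sketch, big-endian)
 *
 *      out = (prev_word << shil) | (cur_word >> shir);
 *
 * The non_aligned path further down open-codes the same idea.
 */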
#define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
        ldd     [%src + (offset) + 0x00], %t0; \
        ldd     [%src + (offset) + 0x08], %t2; \
        srl     %t0, shir, %t5; \
        srl     %t1, shir, %t6; \
        sll     %t0, shil, %t0; \
        or      %t5, %prev, %t5; \
        sll     %t1, shil, %prev; \
        or      %t6, %t0, %t0; \
        srl     %t2, shir, %t1; \
        srl     %t3, shir, %t6; \
        sll     %t2, shil, %t2; \
        or      %t1, %prev, %t1; \
        std     %t4, [%dst + (offset) + (offset2) - 0x04]; \
        std     %t0, [%dst + (offset) + (offset2) + 0x04]; \
        sll     %t3, shil, %prev; \
        or      %t6, %t2, %t4;

#define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
        ldd     [%src + (offset) + 0x00], %t0; \
        ldd     [%src + (offset) + 0x08], %t2; \
        srl     %t0, shir, %t4; \
        srl     %t1, shir, %t5; \
        sll     %t0, shil, %t6; \
        or      %t4, %prev, %t0; \
        sll     %t1, shil, %prev; \
        or      %t5, %t6, %t1; \
        srl     %t2, shir, %t4; \
        srl     %t3, shir, %t5; \
        sll     %t2, shil, %t6; \
        or      %t4, %prev, %t2; \
        sll     %t3, shil, %prev; \
        or      %t5, %t6, %t3; \
        std     %t0, [%dst + (offset) + (offset2) + 0x00]; \
        std     %t2, [%dst + (offset) + (offset2) + 0x08];

        .text
        .align  4

0:
        retl
         nop            ! Only bcopy returns here and it returns void...

#ifdef __KERNEL__
FUNC(amemmove)
FUNC(__memmove)
#endif
FUNC(memmove)
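        /* If dst <= src, or if the regions cannot overlap because
         * src + len <= dst, it is safe to copy forwards, so fall into
         * the memcpy body at 9f resp. 0f.  Only a genuinely
         * overlapping move with dst > src takes the byte-at-a-time
         * reverse loop below.
         */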
        cmp             %o0, %o1
        mov             %o0, %g7
        bleu            9f
         sub            %o0, %o1, %o4

        add             %o1, %o2, %o3
        cmp             %o3, %o0
        bleu            0f
         andcc          %o4, 3, %o5

        add             %o1, %o2, %o1
        add             %o0, %o2, %o0
        sub             %o1, 1, %o1
        sub             %o0, 1, %o0

1:      /* reverse_bytes */

        ldub            [%o1], %o4
        subcc           %o2, 1, %o2
        stb             %o4, [%o0]
        sub             %o1, 1, %o1
        bne             1b
         sub            %o0, 1, %o0

        retl
         mov            %g7, %o0

/* NOTE: This code is executed just for the cases
         where (%src (=%o1) & 3) != 0.
         We need to align it to 4. So, for (%src & 3) ==
         1 we need to do ldub,lduh
         2 lduh
         3 just ldub
         so even if it looks weird, the branches
         are correct here; the andcc %o1, 2 in the delay
         slot below tests the address before the byte
         copy advances it. -jj
 */
78:     /* dword_align */

        andcc           %o1, 1, %g0
        be              4f
         andcc          %o1, 2, %g0

        ldub            [%o1], %g2
        add             %o1, 1, %o1
        stb             %g2, [%o0]
        sub             %o2, 1, %o2
        bne             3f
         add            %o0, 1, %o0
4:
        lduh            [%o1], %g2
        add             %o1, 2, %o1
        sth             %g2, [%o0]
        sub             %o2, 2, %o2
        b               3f
         add            %o0, 2, %o0

FUNC(memcpy)    /* %o0=dst %o1=src %o2=len */

        sub             %o0, %o1, %o4
        mov             %o0, %g7
9:
        andcc           %o4, 3, %o5
0:
        bne             86f
         cmp            %o2, 15

        bleu            90f
         andcc          %o1, 3, %g0

        bne             78b
3:
         andcc          %o1, 4, %g0

        be              2f
         mov            %o2, %g1

        ld              [%o1], %o4
        sub             %g1, 4, %g1
        st              %o4, [%o0]
        add             %o1, 4, %o1
        add             %o0, 4, %o0
2:
        andcc           %g1, 0xffffff80, %g0
        be              3f
         andcc          %o0, 4, %g0

        be              82f + 4
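        /* The delay slot of the branch above is the first ldd of
         * MOVE_BIGCHUNK at 5: below, which is identical to the first
         * insn of MOVE_BIGALIGNCHUNK; jumping to 82f + 4 skips the
         * load that the delay slot has already done.
         */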
5:
        MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
        sub             %g1, 128, %g1
        add             %o1, 128, %o1
        cmp             %g1, 128
        bge             5b
         add            %o0, 128, %o0
3:
        andcc           %g1, 0x70, %g4
        be              80f
         andcc          %g1, 8, %g0

        sethi           %hi(80f), %o5
        srl             %g4, 1, %o4
        add             %g4, %o4, %o4
        add             %o1, %g4, %o1
        sub             %o5, %o4, %o5
        jmpl            %o5 + %lo(80f), %g0
         add            %o0, %g4, %o0
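
        /* %g4 = remaining length & 0x70 (a multiple of 16, at most
         * 0x70).  Each MOVE_LASTCHUNK below is six insns (24 bytes)
         * and copies 16 bytes, so the target is 80f - (%g4/16)*24 =
         * 80f - (%g4 + %g4/2), which the srl/add pair above computes.
         * The pointers are advanced by %g4 first because the table
         * copies with negative offsets.
         */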

79:     /* memcpy_table */

        MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:     /* memcpy_table_end */
        be              81f
         andcc          %g1, 4, %g0

        ldd             [%o1], %g2
        add             %o0, 8, %o0
        st              %g2, [%o0 - 0x08]
        add             %o1, 8, %o1
        st              %g3, [%o0 - 0x04]

81:     /* memcpy_last7 */

        be              1f
         andcc          %g1, 2, %g0

        ld              [%o1], %g2
        add             %o1, 4, %o1
        st              %g2, [%o0]
        add             %o0, 4, %o0
1:
        be              1f
         andcc          %g1, 1, %g0

        lduh            [%o1], %g2
        add             %o1, 2, %o1
        sth             %g2, [%o0]
        add             %o0, 2, %o0
1:
        be              1f
         nop

        ldub            [%o1], %g2
        stb             %g2, [%o0]
1:
        retl
         mov            %g7, %o0

82:     /* ldd_std */
        MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
        subcc           %g1, 128, %g1
        add             %o1, 128, %o1
        cmp             %g1, 128
        bge             82b
         add            %o0, 128, %o0

        andcc           %g1, 0x70, %g4
        be              84f
         andcc          %g1, 8, %g0

        sethi           %hi(84f), %o5
        add             %o1, %g4, %o1
        sub             %o5, %g4, %o5
        jmpl            %o5 + %lo(84f), %g0
         add            %o0, %g4, %o0
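
        /* Same trick as at 79: above, but each MOVE_LASTALIGNCHUNK is
         * four insns (16 bytes) for 16 bytes copied, so the offset
         * back from 84f is simply %g4 itself.
         */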

83:     /* amemcpy_table */

        MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:     /* amemcpy_table_end */
        be              85f
         andcc          %g1, 4, %g0

        ldd             [%o1], %g2
        add             %o0, 8, %o0
        std             %g2, [%o0 - 0x08]
        add             %o1, 8, %o1
85:     /* amemcpy_last7 */
        be              1f
         andcc          %g1, 2, %g0

        ld              [%o1], %g2
        add             %o1, 4, %o1
        st              %g2, [%o0]
        add             %o0, 4, %o0
1:
        be              1f
         andcc          %g1, 1, %g0

        lduh            [%o1], %g2
        add             %o1, 2, %o1
        sth             %g2, [%o0]
        add             %o0, 2, %o0
1:
        be              1f
         nop

        ldub            [%o1], %g2
        stb             %g2, [%o0]
1:
        retl
         mov            %g7, %o0

86:     /* non_aligned */
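        /* src and dst disagree modulo 4.  Tiny copies go to the byte
         * table at 88f; otherwise take a register window, byte-align
         * the destination, then copy words with the shift-and-merge
         * loop below.
         */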
        cmp             %o2, 6
        bleu            88f
         nop

        save            %sp, -96, %sp
        andcc           %i0, 3, %g0
        be              61f
         andcc          %i0, 1, %g0
        be              60f
         andcc          %i0, 2, %g0

        ldub            [%i1], %g5
        add             %i1, 1, %i1
        stb             %g5, [%i0]
        sub             %i2, 1, %i2
        bne             61f
         add            %i0, 1, %i0
60:
        ldub            [%i1], %g3
        add             %i1, 2, %i1
        stb             %g3, [%i0]
        sub             %i2, 2, %i2
        ldub            [%i1 - 1], %g3
        add             %i0, 2, %i0
        stb             %g3, [%i0 - 1]
61:
        and             %i1, 3, %g2
        and             %i2, 0xc, %g3
        and             %i1, -4, %i1
        cmp             %g3, 4
        sll             %g2, 3, %g4
        mov             32, %g2
        be              4f
         sub            %g2, %g4, %l0
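        /* %g4 = 8 * (src & 3) and %l0 = 32 - %g4 are the shift counts
         * for the merge: on this big-endian CPU each output word is
         * (word << %g4) | (next_word >> %l0), with %i1 already rounded
         * down to a word boundary.  %g3 = len & 0xc selects one of
         * four entry points into the unrolled loop so the odd words
         * are dealt with up front.
         */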

        blu             3f
         cmp            %g3, 0x8

        be              2f
         srl            %i2, 2, %g3

        ld              [%i1], %i3
        add             %i0, -8, %i0
        ld              [%i1 + 4], %i4
        b               8f
         add            %g3, 1, %g3
2:
        ld              [%i1], %i4
        add             %i0, -12, %i0
        ld              [%i1 + 4], %i5
        add             %g3, 2, %g3
        b               9f
         add            %i1, -4, %i1
3:
        ld              [%i1], %g1
        add             %i0, -4, %i0
        ld              [%i1 + 4], %i3
        srl             %i2, 2, %g3
        b               7f
         add            %i1, 4, %i1
4:
        ld              [%i1], %i5
        cmp             %i2, 7
        ld              [%i1 + 4], %g1
        srl             %i2, 2, %g3
        bleu            10f
         add            %i1, 8, %i1

        ld              [%i1], %i3
        add             %g3, -1, %g3
5:
        sll             %i5, %g4, %g2
        srl             %g1, %l0, %g5
        or              %g2, %g5, %g2
        st              %g2, [%i0]
7:
        ld              [%i1 + 4], %i4
        sll             %g1, %g4, %g2
        srl             %i3, %l0, %g5
        or              %g2, %g5, %g2
        st              %g2, [%i0 + 4]
8:
        ld              [%i1 + 8], %i5
        sll             %i3, %g4, %g2
        srl             %i4, %l0, %g5
        or              %g2, %g5, %g2
        st              %g2, [%i0 + 8]
9:
        ld              [%i1 + 12], %g1
        sll             %i4, %g4, %g2
        srl             %i5, %l0, %g5
        addcc           %g3, -4, %g3
        or              %g2, %g5, %g2
        add             %i1, 16, %i1
        st              %g2, [%i0 + 12]
        add             %i0, 16, %i0
        bne,a           5b
         ld             [%i1], %i3
10:
        sll             %i5, %g4, %g2
        srl             %g1, %l0, %g5
        srl             %l0, 3, %g3
        or              %g2, %g5, %g2
        sub             %i1, %g3, %i1
        andcc           %i2, 2, %g0
        st              %g2, [%i0]
        be              1f
         andcc          %i2, 1, %g0

        ldub            [%i1], %g2
        add             %i1, 2, %i1
        stb             %g2, [%i0 + 4]
        add             %i0, 2, %i0
        ldub            [%i1 - 1], %g2
        stb             %g2, [%i0 + 3]
1:
        be              1f
         nop
        ldub            [%i1], %g2
        stb             %g2, [%i0 + 4]
1:
        ret
         restore        %g7, %g0, %o0

88:     /* short_end */

        and             %o2, 0xe, %o3
20:
        sethi           %hi(89f), %o5
        sll             %o3, 3, %o4
        add             %o0, %o3, %o0
        sub             %o5, %o4, %o5
        add             %o1, %o3, %o1
        jmpl            %o5 + %lo(89f), %g0
         andcc          %o2, 1, %g0
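
        /* %o3 = len & 0xe; each MOVE_SHORTCHUNK below is four insns
         * (16 bytes) and copies two bytes, so the entry point is
         * 89f - 8 * %o3, computed by the sll above.  The pointers are
         * advanced by %o3 first since the table uses negative offsets.
         */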

        MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:     /* short_table_end */

        be              1f
         nop

        ldub            [%o1], %g2
        stb             %g2, [%o0]
1:
        retl
         mov            %g7, %o0

90:     /* short_aligned_end */
        bne             88b
         andcc          %o2, 8, %g0

        be              1f
         andcc          %o2, 4, %g0

        ld              [%o1 + 0x00], %g2
        ld              [%o1 + 0x04], %g3
        add             %o1, 8, %o1
        st              %g2, [%o0 + 0x00]
        st              %g3, [%o0 + 0x04]
        add             %o0, 8, %o0
1:
        b               81b
         mov            %o2, %g1