linux/arch/hexagon/lib/memcpy.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-only */
   2/*
   3 * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
   4 */
   5
   6/*
   7 * Description
   8 *
   9 *   library function for memcpy where length bytes are copied from
  10 *   ptr_in to ptr_out. ptr_out is returned unchanged.
  11 *   Allows any combination of alignment on input and output pointers
  12 *   and length from 0 to 2^32-1
  13 *
  14 * Restrictions
  15 *   The arrays should not overlap, the program will produce undefined output
  16 *   if they do.
  17 *   When both pointers are 8-byte aligned and the length is a multiple of
  18 *   8 below 96 bytes, a simple dword copy is performed. Otherwise, blocks
  19 *   of fewer than 24 bytes are copied byte by byte.
  20 * History
  21 *
  22 *   DJH  5/15/09 Initial version 1.0
  23 *   DJH  6/ 1/09 Version 1.1 modified ABI to include R16-R19
  24 *   DJH  7/12/09 Version 1.2 optimized codesize down to 760 was 840
  25 *   DJH 10/14/09 Version 1.3 added special loop for aligned case, was
  26 *                            overreading bloated codesize back up to 892
  27 *   DJH  4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads
  28 *                            occurring if only 1 left outstanding, fixes bug
  29 *                            # 3888, corrected for all alignments. Peeled off
  30 *                            1 32byte chunk from kernel loop and extended 8byte
  31 *                            loop at end to solve all combinations and prevent
  32 *                            over read.  Fixed Ldword_loop_prolog to prevent
  33 *                            overread for blocks less than 48bytes. Reduced
  34 *                            codesize to 752 bytes
  35 *   DJH  4/21/10 version 1.5 1.4 fix broke code for input block ends not
  36 *                            aligned to dword boundaries,underwriting by 1
  37 *                            byte, added detection for this and fixed. A
  38 *                            little bloat.
  39 *   DJH  4/23/10 version 1.6 corrected stack error, R20 was not being restored
  40 *                            always, fixed the error of R20 being modified
  41 *                            before it was being saved
  42 * Natural c model
  43 * ===============
  44 * void * memcpy(char * ptr_out, char * ptr_in, int length) {
  45 *   int i;
  46 *   if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
  47 *   return(ptr_out);
  48 * }
  49 *
  50 * Optimized memcpy function
  51 * =========================
  52 * void * memcpy(char * ptr_out, char * ptr_in, int len) {
  53 *   int i, prolog, kernel, epilog, mask;
  54 *   u8 offset;
  55 *   s64 data0, dataF8, data70;
  56 *
  57 *   s64 * ptr8_in;
  58 *   s64 * ptr8_out;
  59 *   s32 * ptr4;
  60 *   s16 * ptr2;
  61 *
  62 *   offset = ((int) ptr_in) & 7;
  63 *   ptr8_in = (s64 *) &ptr_in[-offset];   //read in the aligned pointers
  64 *
  65 *   data70 = *ptr8_in++;
  66 *   dataF8 = *ptr8_in++;
  67 *
  68 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
  69 *
  70 *   prolog = 32 - ((int) ptr_out);
  71 *   mask  = 0x7fffffff >> HEXAGON_R_cl0_R(len);
  72 *   prolog = prolog & mask;
  73 *   kernel = len - prolog;
  74 *   epilog = kernel & 0x1F;
  75 *   kernel = kernel>>5;
  76 *
  77 *   if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;}
  78 *   ptr2 = (s16 *) &ptr_out[0];
  79 *   if (prolog & 2) { ptr2[0] = (u16) data0;  data0 >>= 16; ptr_out += 2;}
  80 *   ptr4 = (s32 *) &ptr_out[0];
  81 *   if (prolog & 4) { ptr4[0] = (u32) data0;  data0 >>= 32; ptr_out += 4;}
  82 *
  83 *   offset = offset + (prolog & 7);
  84 *   if (offset >= 8) {
  85 *     data70 = dataF8;
  86 *     dataF8 = *ptr8_in++;
  87 *   }
  88 *   offset = offset & 0x7;
  89 *
  90 *   prolog = prolog >> 3;
  91 *   if (prolog) for (i=0; i < prolog; i++) {
  92 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
  93 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
  94 *       data70 = dataF8;
  95 *       dataF8 = *ptr8_in++;
  96 *   }
  97 *   if(kernel) { kernel -= 1; epilog += 32; }
  98 *   if(kernel) for(i=0; i < kernel; i++) {
  99 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 100 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 101 *       data70 = *ptr8_in++;
 102 *
 103 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 104 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 105 *       dataF8 = *ptr8_in++;
 106 *
 107 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 108 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 109 *       data70 = *ptr8_in++;
 110 *
 111 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 112 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 113 *       dataF8 = *ptr8_in++;
 114 *   }
 115 *   epilogdws = epilog >> 3;
 116 *   if (epilogdws) for (i=0; i < epilogdws; i++) {
 117 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 118 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 119 *       data70 = dataF8;
 120 *       dataF8 = *ptr8_in++;
 121 *   }
 122 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 123 *
 124 *   ptr4 = (s32 *) &ptr_out[0];
 125 *   if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
 126 *   ptr2 = (s16 *) &ptr_out[0];
 127 *   if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
 128 *   if (epilog & 1) { *ptr_out++ = (u8) data0; }
 129 *
 130 *   return(ptr_out - len);
 131 * }
 132 *
 133 * Codesize : 784 bytes
 134 */
 135
 136
 137#define ptr_out         R0      /*  destination pointer  */
 138#define ptr_in          R1      /*  source pointer  */
 139#define len             R2      /*  length of copy in bytes  */
 140
    /*
     * Data registers.  data70/dataF8 hold two consecutive aligned source
     * dwords; valignb extracts the 8 output bytes from them.
     * NOTE(review): data1 is R7, i.e. part of the ldata0 pair (R7:6), not of
     * ldata1 (R25:24), and it does not appear to be referenced in this file.
     */
 141#define data70          R13:12  /*  lo 8 bytes of non-aligned transfer  */
 142#define dataF8          R11:10  /*  hi 8 bytes of non-aligned transfer  */
 143#define ldata0          R7:6    /*  even 8 bytes chunks  */
 144#define ldata1          R25:24  /*  odd 8 bytes chunks  */
 145#define data1           R7      /*  32-bit register R7 (high word of ldata0)  */
 146#define data0           R6      /*  low 32-bit word of ldata0  */
 147
    /*  Predicate aliases; several names share p0 and are live at different times  */
 148#define ifbyte          p0      /*  if transfer has bytes in epilog/prolog  */
 149#define ifhword         p0      /*  if transfer has shorts in epilog/prolog  */
 150#define ifword          p0      /*  if transfer has words in epilog/prolog  */
 151#define noprolog        p0      /*  no prolog, xfer starts at 32byte  */
 152#define nokernel        p1      /*  no 32byte multiple block in the transfer  */
 153#define noepilog        p0      /*  no epilog, xfer ends on 32byte boundary  */
 154#define align           p2      /*  alignment of input rel to 8byte boundary  */
 155#define kernel1         p0      /*  kernel count > 1 (set with cmp.gtu(kernel, #1))  */
 156
    /*  Scalar aliases; note that several names map to the same register  */
 157#define dalign          R25     /*  rel alignment of input to output data  */
 158#define star3           R16     /*  prolog bytes not in whole dwords (prolog & 7)  */
 159#define rest            R8      /*  length - prolog bytes  */
 160#define back            R7      /*  nr bytes > dword boundary in src block  */
 161#define epilog          R3      /*  bytes in epilog  */
 162#define inc             R15:14  /*  adds -1 to kernel and 32 to the prefetch ptr  */
 163#define kernel          R4      /*  number of 32byte chunks in kernel  */
 164#define ptr_in_p_128    R5      /*  pointer for prefetch of input data  */
 165#define mask            R8      /*  mask used to determine prolog size  */
 166#define shift           R8      /*  used to work a shifter to extract bytes  */
 167#define shift2          R5      /*  in epilog to work shifter to extract bytes */
 168#define prolog          R15     /*  bytes in  prolog  */
 169#define epilogdws       R15     /*  number dwords in epilog  */
 170#define shiftb          R14     /*  used to extract bytes  */
 171#define offset          R9      /*  ptr_in & 7; copied into align (p2)  */
 172#define ptr_out_p_32    R17     /*  pointer to output dczero  */
 173#define align888        R14     /*  if simple dword loop can be used  */
 174#define len8            R9      /*  number of dwords in length  */
 175#define over            R20     /*  nr of bytes > last inp buf dword boundary */
 176
 177#define ptr_in_p_128kernel      R5:4    /*  packed fetch pointer & kernel cnt */
 178
 179        .section .text
 180        .p2align 4
 181        .global memcpy
 182        .type memcpy, @function
    /*
     * memcpy: copy len (R2) bytes from ptr_in (R1) to ptr_out (R0).
     *
     * Each exit path rewinds ptr_out by the distance it was advanced before
     * returning, so R0 holds the destination pointer on return.
     *
     * Paths, in the order they are tested below:
     *   - len == 0 or ptr_in == ptr_out: return immediately.
     *   - ptr_in, ptr_out and len all multiples of 8 and len <= 95:
     *     .Ldwordaligned, a plain 8-byte load/store loop.
     *   - len <= 23: .Lbytes23orless, a byte loop.
     *   - otherwise: allocate a frame, save R17:16, R21:20 and R25:24, then
     *     run prolog / 32-byte kernel / epilog.  r31 is pointed at
     *     .Lmemcpy_return first, so the "jumpr r31" exits of that path land
     *     in the code that restores the saved registers and the frame.
     *
     * Instructions grouped in { } form one packet: they read the register
     * values from before the packet unless a .new form is used.
     */
 183memcpy:
 184{
 185        p2 = cmp.eq(len, #0);           /*  =0: len == 0  */
 186        align888 = or(ptr_in, ptr_out); /*  %8 < 97: gather low bits of both ptrs  */
 187        p0 = cmp.gtu(len, #23);         /*  %1, <24: clear selects the byte loop  */
 188        p1 = cmp.eq(ptr_in, ptr_out);   /*  attempt to overwrite self */
 189}
 190{
 191        p1 = or(p2, p1);                /*  nothing to copy  */
 192        p3 = cmp.gtu(len, #95);         /*  %8 < 97: too long for the simple dword loop  */
 193        align888 = or(align888, len);   /*  %8 < 97: fold in the low bits of len  */
 194        len8 = lsr(len, #3);            /*  %8 < 97: dword count  */
 195}
 196{
 197        dcfetch(ptr_in);                /*  zero/ptrin=ptrout causes fetch */
 198        p2 = bitsclr(align888, #7);     /*  %8 < 97: both ptrs and len are multiples of 8  */
 199        if(p1) jumpr r31;               /*  =0 or in == out: return, R0 untouched  */
 200}
 201{
 202        p2 = and(p2,!p3);                       /*  %8 < 97  */
        /*  len - 8 matches how far ptr_out advances in .Ldwordaligned  */
 203        if (p2.new) len = add(len, #-8);        /*  %8 < 97  */
 204        if (p2.new) jump:NT .Ldwordaligned;     /*  %8 < 97  */
 205}
 206{
 207        if(!p0) jump .Lbytes23orless;   /*  %1, <24  */
 208        mask.l = #LO(0x7fffffff);
 209        /*  negated dest address; masked below to the bytes before the next 32-byte line  */
 210        prolog = sub(#0, ptr_out);
 211}
 212{
 213        /*  push r31:r30 and reserve 24 bytes for the three register saves below  */
 214        allocframe(#24);
 215        mask.h = #HI(0x7fffffff);
 216        ptr_in_p_128 = add(ptr_in, #32);
 217        back = cl0(len);                /*  leading zero count of len  */
 218}
 219{
 220        memd(sp+#0) = R17:16;           /*  save r16,r17 on stack  */
 221        r31.l = #LO(.Lmemcpy_return);   /*  set up final return pointer  */
 222        prolog &= lsr(mask, back);      /*  keep only bits below len's top set bit  */
 223        offset = and(ptr_in, #7);       /*  source misalignment within a dword  */
 224}
 225{
 226        memd(sp+#8) = R25:24;           /*  save r25,r24 on stack  */
 227        dalign = sub(ptr_out, ptr_in);
 228        r31.h = #HI(.Lmemcpy_return);   /*  set up final return pointer  */
 229}
 230{
 231        /*  see if the input buffer end is aligned (low 3 bits of over are kept below)  */
 232        over = add(len, ptr_in);
 233        back = add(len, offset);        /*  bytes from the aligned source dword to block end  */
 234        memd(sp+#16) = R21:20;          /*  save r20,r21 on stack  */
 235}
 236{
 237        noprolog = bitsclr(prolog, #7); /*  prolog has no sub-dword bytes  */
 238        prolog = and(prolog, #31);
 239        dcfetch(ptr_in_p_128);
 240        ptr_in_p_128 = add(ptr_in_p_128, #32);
 241}
 242{
 243        kernel = sub(len, prolog);      /*  bytes left after the prolog  */
 244        shift = asl(prolog, #3);        /*  bits 3..5 are tested below for byte/half/word  */
 245        star3 = and(prolog, #7);
 246        ptr_in = and(ptr_in, #-8);      /*  round source down to a dword boundary  */
 247}
 248{
 249        prolog = lsr(prolog, #3);       /*  prolog now counts dwords  */
 250        epilog = and(kernel, #31);
 251        ptr_out_p_32 = add(ptr_out, prolog);    /*  reads prolog from before this packet (bytes)  */
 252        over = and(over, #7);
 253}
 254{
 255        p3 = cmp.gtu(back, #8);         /*  gates the second source load at .Lskip64  */
 256        kernel = lsr(kernel, #5);       /*  number of 32-byte chunks  */
 257        dcfetch(ptr_in_p_128);
 258        ptr_in_p_128 = add(ptr_in_p_128, #32);
 259}
 260{
 261        p1 = cmp.eq(prolog, #0);
        /*  one extra pass: the sp1loop0 prolog loop skips its store on the first pass  */
 262        if(!p1.new) prolog = add(prolog, #1);
 263        dcfetch(ptr_in_p_128);  /*  reserve the line 64bytes on  */
 264        ptr_in_p_128 = add(ptr_in_p_128, #32);
 265}
 266{
 267        nokernel = cmp.eq(kernel,#0);
 268        dcfetch(ptr_in_p_128);  /* reserve the line 64bytes on  */
 269        ptr_in_p_128 = add(ptr_in_p_128, #32);
 270        shiftb = and(shift, #8);
 271}
 272{
 273        dcfetch(ptr_in_p_128);          /*  reserve the line 64bytes on  */
 274        ptr_in_p_128 = add(ptr_in_p_128, #32);
 275        if(nokernel) jump .Lskip64;     /*  skip the dczeroa packets if kernel == 0  */
 276        p2 = cmp.eq(kernel, #1);        /*  exactly one 32-byte chunk  */
 277}
 278{
 279        dczeroa(ptr_out_p_32);
 280        /*  don't advance pointer when kernel == 1  */
 281        if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32);
 282}
 283{
 284        dalign = and(dalign, #31);
 285        dczeroa(ptr_out_p_32);
 286}
 287.Lskip64:
 288{
 289        data70 = memd(ptr_in++#16);
 290        if(p3) dataF8 = memd(ptr_in+#8);        /*  second dword only when back > 8  */
 291        if(noprolog) jump .Lnoprolog32;
 292        align = offset;                 /*  p2 <- offset, the byte shift for valignb  */
 293}
 294/*  upto initial 7 bytes  */
 295{
 296        ldata0 = valignb(dataF8, data70, align);
 297        ifbyte = tstbit(shift,#3);
 298        offset = add(offset, star3);
 299}
 300{
 301        if(ifbyte) memb(ptr_out++#1) = data0;
 302        ldata0 = lsr(ldata0, shiftb);
 303        shiftb = and(shift, #16);
 304        ifhword = tstbit(shift,#4);
 305}
 306{
 307        if(ifhword) memh(ptr_out++#2) = data0;
 308        ldata0 = lsr(ldata0, shiftb);
 309        ifword = tstbit(shift,#5);
 310        p2 = cmp.gtu(offset, #7);       /*  offset + star3 crossed into the next dword  */
 311}
 312{
 313        if(ifword) memw(ptr_out++#4) = data0;
 314        if(p2) data70 = dataF8;
 315        if(p2) dataF8 = memd(ptr_in++#8);       /*  another 8 bytes  */
 316        align = offset;
 317}
 318.Lnoprolog32:
    /*  dword part of the prolog; p3 from sp1loop0 gates the store (skipped on the first pass)  */
 319{
 320        p3 = sp1loop0(.Ldword_loop_prolog, prolog)
 321        rest = sub(len, star3); /*  whats left after the loop  */
 322        p0 = cmp.gt(over, #0);
 323}
 324        if(p0) rest = add(rest, #16);
 325.Ldword_loop_prolog:
 326{
 327        if(p3) memd(ptr_out++#8) = ldata0;
 328        ldata0 = valignb(dataF8, data70, align);
 329        p0 = cmp.gt(rest, #16);         /*  gates the next source load  */
 330}
 331{
 332        data70 = dataF8;
 333        if(p0) dataF8 = memd(ptr_in++#8);
 334        rest = add(rest, #-8);
 335}:endloop0
 336.Lkernel:
 337{
 338        /*  kernel is at least 32bytes  */
 339        p3 = cmp.gtu(kernel, #0);
 340        /*  last itn. remove edge effects  */
 341        if(p3.new) kernel = add(kernel, #-1);
 342        /*  dealt with in last dword loop  */
 343        if(p3.new) epilog = add(epilog, #32);
 344}
 345{
 346        nokernel = cmp.eq(kernel, #0);          /*  after adjustment, recheck */
 347        if(nokernel.new) jump:NT .Lepilog;      /*  likely not taken  */
 348        inc = combine(#32, #-1);                /*  R15 = 32, R14 = -1  */
 349        p3 = cmp.gtu(dalign, #24);
 350}
    /*
     * NOTE(review): dalign > 24 branches to .Loword_loop_00to24 and
     * dalign <= 24 falls into .Loword_loop_25to31, so the label names look
     * inverted relative to this test - confirm the intended naming.
     */
 351{
 352        if(p3) jump .Lodd_alignment;
 353}
 354{
 355        loop0(.Loword_loop_25to31, kernel);
 356        kernel1 = cmp.gtu(kernel, #1);
 357        rest = kernel;                  /*  initial count, compared against kernel in the loop  */
 358}
 359        .falign
    /*
     * 32 bytes per iteration, alternating ldata0 and ldata1.  The leading
     * ldata1 store is suppressed on the first iteration and made up for by
     * the single store after the loop.
     */
 360.Loword_loop_25to31:
 361{
 362        dcfetch(ptr_in_p_128);  /*  prefetch 4 lines ahead  */
 363        if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
 364}
 365{
 366        dczeroa(ptr_out_p_32);  /*  reserve the next 32bytes in cache  */
 367        p3 = cmp.eq(kernel, rest);      /*  true only while kernel is still undecremented  */
 368}
 369{
 370        /*  kernel -= 1 and ptr_in_p_128 += 32 in one vector add  */
 371        ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
 372        /*  kill write on first iteration  */
 373        if(!p3) memd(ptr_out++#8) = ldata1;
 374        ldata1 = valignb(dataF8, data70, align);
 375        data70 = memd(ptr_in++#8);
 376}
 377{
 378        memd(ptr_out++#8) = ldata0;
 379        ldata0 = valignb(data70, dataF8, align);
 380        dataF8 = memd(ptr_in++#8);
 381}
 382{
 383        memd(ptr_out++#8) = ldata1;
 384        ldata1 = valignb(dataF8, data70, align);
 385        data70 = memd(ptr_in++#8);
 386}
 387{
 388        memd(ptr_out++#8) = ldata0;
 389        ldata0 = valignb(data70, dataF8, align);
 390        dataF8 = memd(ptr_in++#8);
 391        kernel1 = cmp.gtu(kernel, #1);
 392}:endloop0
 393{
 394        memd(ptr_out++#8) = ldata1;     /*  the ldata1 store held back by the loop  */
 395        jump .Lepilog;
 396}
 397.Lodd_alignment:
 398{
 399        loop0(.Loword_loop_00to24, kernel);
 400        kernel1 = cmp.gtu(kernel, #1);
 401        rest = add(kernel, #-1);
 402}
 403        .falign
    /*  32 bytes per iteration using ldata0 only; falls through to .Lepilog  */
 404.Loword_loop_00to24:
 405{
 406        dcfetch(ptr_in_p_128);  /*  prefetch 4 lines ahead  */
 407        ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);    /*  kernel -= 1, prefetch ptr += 32  */
 408        if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
 409}
 410{
 411        dczeroa(ptr_out_p_32);  /*  reserve the next 32bytes in cache  */
 412}
 413{
 414        memd(ptr_out++#8) = ldata0;
 415        ldata0 = valignb(dataF8, data70, align);
 416        data70 = memd(ptr_in++#8);
 417}
 418{
 419        memd(ptr_out++#8) = ldata0;
 420        ldata0 = valignb(data70, dataF8, align);
 421        dataF8 = memd(ptr_in++#8);
 422}
 423{
 424        memd(ptr_out++#8) = ldata0;
 425        ldata0 = valignb(dataF8, data70, align);
 426        data70 = memd(ptr_in++#8);
 427}
 428{
 429        memd(ptr_out++#8) = ldata0;
 430        ldata0 = valignb(data70, dataF8, align);
 431        dataF8 = memd(ptr_in++#8);
 432        kernel1 = cmp.gtu(kernel, #1);
 433}:endloop0
 434.Lepilog:
 435{
 436        noepilog = cmp.eq(epilog,#0);
 437        epilogdws = lsr(epilog, #3);
 438        kernel = and(epilog, #7);       /*  kernel (R4) reused for the sub-dword tail count  */
 439}
 440{
 441        if(noepilog) jumpr r31;         /*  r31 == .Lmemcpy_return on this path  */
 442        if(noepilog) ptr_out = sub(ptr_out, len);
 443        p3 = cmp.eq(epilogdws, #0);
 444        shift2 = asl(epilog, #3);
 445}
 446{
 447        shiftb = and(shift2, #32);
 448        ifword = tstbit(epilog,#2);
 449        if(p3) jump .Lepilog60;         /*  no whole dwords left  */
 450        if(!p3) epilog = add(epilog, #-16);
 451}
 452{
 453        loop0(.Ldword_loop_epilog, epilogdws);
 454        /*  stop criteria is lsbs unless = 0 then its 8  */
 455        p3 = cmp.eq(kernel, #0);
 456        if(p3.new) kernel= #8;
 457        p1 = cmp.gt(over, #0);
 458}
 459        /*  if not aligned to end of buffer execute 1 more iteration  */
 460        if(p1) kernel= #0;
 461.Ldword_loop_epilog:
 462{
 463        memd(ptr_out++#8) = ldata0;
 464        ldata0 = valignb(dataF8, data70, align);
 465        p3 = cmp.gt(epilog, kernel);    /*  gates the next source load  */
 466}
 467{
 468        data70 = dataF8;
 469        if(p3) dataF8 = memd(ptr_in++#8);
 470        epilog = add(epilog, #-8);
 471}:endloop0
 472/* copy last 7 bytes */
 473.Lepilog60:
 474{
 475        if(ifword) memw(ptr_out++#4) = data0;
 476        ldata0 = lsr(ldata0, shiftb);
 477        ifhword = tstbit(epilog,#1);
 478        shiftb = and(shift2, #16);
 479}
 480{
 481        if(ifhword) memh(ptr_out++#2) = data0;
 482        ldata0 = lsr(ldata0, shiftb);
 483        ifbyte = tstbit(epilog,#0);
        /*  the final byte store below does not post-increment ptr_out  */
 484        if(ifbyte.new) len = add(len, #-1);
 485}
 486{
 487        if(ifbyte) memb(ptr_out) = data0;
 488        ptr_out = sub(ptr_out, len);    /*  return dest pointer  */
 489        jumpr r31;                      /*  to .Lmemcpy_return  */
 490}
 491/*  do byte copy for small n; no frame was allocated, r31 is the caller's  */
 492.Lbytes23orless:
 493{
 494        p3 = sp1loop0(.Lbyte_copy, len);
 495        len = add(len, #-1);            /*  ptr_out advances len - 1 bytes in the loop  */
 496}
 497.Lbyte_copy:
 498{
 499        data0 = memb(ptr_in++#1);
 500        if(p3) memb(ptr_out++#1) = data0;       /*  p3 gates the store of the previously loaded byte  */
 501}:endloop0
 502{
 503        memb(ptr_out) = data0;          /*  last byte, no post-increment  */
 504        ptr_out = sub(ptr_out, len);
 505        jumpr r31;
 506}
 507/*  do dword copies for aligned in, out and length; len was reduced by 8 at entry  */
 508.Ldwordaligned:
 509{
 510        p3 = sp1loop0(.Ldword_copy, len8);
 511}
 512.Ldword_copy:
 513{
 514        if(p3) memd(ptr_out++#8) = ldata0;      /*  p3 gates the store of the previously loaded dword  */
 515        ldata0 = memd(ptr_in++#8);
 516}:endloop0
 517{
 518        memd(ptr_out) = ldata0;         /*  last dword, no post-increment  */
 519        ptr_out = sub(ptr_out, len);
 520        jumpr r31;      /*  return to function caller  */
 521}
    /*  common exit of the framed path: r31 was pointed here after allocframe  */
 522.Lmemcpy_return:
 523        r21:20 = memd(sp+#16);  /*  restore r20+r21  */
 524{
 525        r25:24 = memd(sp+#8);   /*  restore r24+r25  */
 526        r17:16 = memd(sp+#0);   /*  restore r16+r17  */
 527}
 528        deallocframe;   /*  restore r31:r30 and release the frame  */
 529        jumpr r31
 530