/* linux/arch/hexagon/lib/memcpy.S */
/*
 * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

/*
 * Description
 *
 *   library function for memcpy where length bytes are copied from
 *   ptr_in to ptr_out. ptr_out is returned unchanged.
 *   Allows any combination of alignment on input and output pointers
 *   and length from 0 to 2^32-1
 *
 * Restrictions
 *   The arrays should not overlap, the program will produce undefined output
 *   if they do.
 *   For blocks less than 16 bytes a byte by byte copy is performed. For
 *   8byte alignments, and length multiples, a dword copy is performed up to
 *   96bytes
 * History
 *
 *   DJH  5/15/09 Initial version 1.0
 *   DJH  6/ 1/09 Version 1.1 modified ABI to include R16-R19
 *   DJH  7/12/09 Version 1.2 optimized codesize down to 760 was 840
 *   DJH 10/14/09 Version 1.3 added special loop for aligned case, was
 *                            overreading bloated codesize back up to 892
 *   DJH  4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads
 *                            occurring if only 1 left outstanding, fixes bug
 *                            # 3888, corrected for all alignments. Peeled off
 *                            1 32byte chunk from kernel loop and extended 8byte
 *                            loop at end to solve all combinations and prevent
 *                            over read.  Fixed Ldword_loop_prolog to prevent
 *                            overread for blocks less than 48bytes. Reduced
 *                            codesize to 752 bytes
 *   DJH  4/21/10 version 1.5 1.4 fix broke code for input block ends not
 *                            aligned to dword boundaries, underwriting by 1
 *                            byte, added detection for this and fixed. A
 *                            little bloat.
 *   DJH  4/23/10 version 1.6 corrected stack error, R20 was not being restored
 *                            always, fixed the error of R20 being modified
 *                            before it was being saved
 * Natural c model
 * ===============
 * void * memcpy(char * ptr_out, char * ptr_in, int length) {
 *   int i;
 *   if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
 *   return(ptr_out);
 * }
 *
 * Optimized memcpy function
 * =========================
 * void * memcpy(char * ptr_out, char * ptr_in, int len) {
 *   int i, prolog, kernel, epilog, mask;
 *   u8 offset;
 *   s64 data0, dataF8, data70;
 *
 *   s64 * ptr8_in;
 *   s64 * ptr8_out;
 *   s32 * ptr4;
 *   s16 * ptr2;
 *
 *   offset = ((int) ptr_in) & 7;
 *   ptr8_in = (s64 *) &ptr_in[-offset];   //read in the aligned pointers
 *
 *   data70 = *ptr8_in++;
 *   dataF8 = *ptr8_in++;
 *
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   prolog = 32 - ((int) ptr_out);
 *   mask  = 0x7fffffff >> HEXAGON_R_cl0_R(len);
 *   prolog = prolog & mask;
 *   kernel = len - prolog;
 *   epilog = kernel & 0x1F;
 *   kernel = kernel>>5;
 *
 *   if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (prolog & 2) { ptr2[0] = (u16) data0;  data0 >>= 16; ptr_out += 2;}
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (prolog & 4) { ptr4[0] = (u32) data0;  data0 >>= 32; ptr_out += 4;}
 *
 *   offset = offset + (prolog & 7);
 *   if (offset >= 8) {
 *     data70 = dataF8;
 *     dataF8 = *ptr8_in++;
 *   }
 *   offset = offset & 0x7;
 *
 *   prolog = prolog >> 3;
 *   if (prolog) for (i=0; i < prolog; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   if(kernel) { kernel -= 1; epilog += 32; }
 *   if(kernel) for(i=0; i < kernel; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   epilogdws = epilog >> 3;
 *   if (epilogdws) for (i=0; i < epilogdws; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
 *   if (epilog & 1) { *ptr_out++ = (u8) data0; }
 *
 *   return(ptr_out - length);
 * }
 *
 * Codesize : 784 bytes
 */
 149
 150
 151#define ptr_out         R0      /*  destination  pounter  */
 152#define ptr_in          R1      /*  source pointer  */
 153#define len             R2      /*  length of copy in bytes  */
 154
 155#define data70          R13:12  /*  lo 8 bytes of non-aligned transfer  */
 156#define dataF8          R11:10  /*  hi 8 bytes of non-aligned transfer  */
 157#define ldata0          R7:6    /*  even 8 bytes chunks  */
 158#define ldata1          R25:24  /*  odd 8 bytes chunks  */
 159#define data1           R7      /*  lower 8 bytes of ldata1  */
 160#define data0           R6      /*  lower 8 bytes of ldata0  */
 161
 162#define ifbyte          p0      /*  if transfer has bytes in epilog/prolog  */
 163#define ifhword         p0      /*  if transfer has shorts in epilog/prolog  */
 164#define ifword          p0      /*  if transfer has words in epilog/prolog  */
 165#define noprolog        p0      /*  no prolog, xfer starts at 32byte  */
 166#define nokernel        p1      /*  no 32byte multiple block in the transfer  */
 167#define noepilog        p0      /*  no epilog, xfer ends on 32byte boundary  */
 168#define align           p2      /*  alignment of input rel to 8byte boundary  */
 169#define kernel1         p0      /*  kernel count == 1  */
 170
 171#define dalign          R25     /*  rel alignment of input to output data  */
 172#define star3           R16     /*  number bytes in prolog - dwords  */
 173#define rest            R8      /*  length - prolog bytes  */
 174#define back            R7      /*  nr bytes > dword boundary in src block  */
 175#define epilog          R3      /*  bytes in epilog  */
 176#define inc             R15:14  /*  inc kernel by -1 and defetch ptr by 32  */
 177#define kernel          R4      /*  number of 32byte chunks in kernel  */
 178#define ptr_in_p_128    R5      /*  pointer for prefetch of input data  */
 179#define mask            R8      /*  mask used to determine prolog size  */
 180#define shift           R8      /*  used to work a shifter to extract bytes  */
 181#define shift2          R5      /*  in epilog to workshifter to extract bytes */
 182#define prolog          R15     /*  bytes in  prolog  */
 183#define epilogdws       R15     /*  number dwords in epilog  */
 184#define shiftb          R14     /*  used to extract bytes  */
 185#define offset          R9      /*  same as align in reg  */
 186#define ptr_out_p_32    R17     /*  pointer to output dczero  */
 187#define align888        R14     /*  if simple dword loop can be used  */
 188#define len8            R9      /*  number of dwords in length  */
 189#define over            R20     /*  nr of bytes > last inp buf dword boundary */
 190
 191#define ptr_in_p_128kernel      R5:4    /*  packed fetch pointer & kernel cnt */
 192
	.section .text
	.p2align 4
	.global memcpy
	.type memcpy, @function
memcpy:
{
	p2 = cmp.eq(len, #0);		/*  =0 */
	align888 = or(ptr_in, ptr_out);	/*  %8 < 97 */
	p0 = cmp.gtu(len, #23);		/*  %1, <24 */
	p1 = cmp.eq(ptr_in, ptr_out);	/*  attempt to overwrite self */
}
{
	p1 = or(p2, p1);
	p3 = cmp.gtu(len, #95);		/*  %8 < 97 */
	align888 = or(align888, len);	/*  %8 < 97 */
	len8 = lsr(len, #3);		/*  %8 < 97 */
}
{
	dcfetch(ptr_in);		/*  zero/ptrin=ptrout causes fetch */
	p2 = bitsclr(align888, #7);	/*  %8 < 97  */
	if(p1) jumpr r31;		/*  =0  */
}
{
	p2 = and(p2,!p3);			/*  %8 < 97  */
	if (p2.new) len = add(len, #-8);	/*  %8 < 97  */
	if (p2.new) jump:NT .Ldwordaligned;	/*  %8 < 97  */
}
{
	if(!p0) jump .Lbytes23orless;	/*  %1, <24  */
	mask.l = #LO(0x7fffffff);
	/*  all bytes before line multiples of data  */
	prolog = sub(#0, ptr_out);
}
{
	/*  save r31 on stack, decrement sp by 16  */
	allocframe(#24);
	mask.h = #HI(0x7fffffff);
	ptr_in_p_128 = add(ptr_in, #32);
	back = cl0(len);
}
{
	memd(sp+#0) = R17:16;		/*  save r16,r17 on stack  */
	r31.l = #LO(.Lmemcpy_return);	/*  set up final return pointer  */
	prolog &= lsr(mask, back);
	offset = and(ptr_in, #7);
}
{
	memd(sp+#8) = R25:24;		/*  save r25,r24 on stack  */
	dalign = sub(ptr_out, ptr_in);
	r31.h = #HI(.Lmemcpy_return);	/*  set up final return pointer  */
}
{
	/*  see if the input buffer end is aligned  */
	over = add(len, ptr_in);
	back = add(len, offset);
	memd(sp+#16) = R21:20;		/*  save r20,r21 on stack  */
}
{
	noprolog = bitsclr(prolog, #7);
	prolog = and(prolog, #31);
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	kernel = sub(len, prolog);
	shift = asl(prolog, #3);
	star3 = and(prolog, #7);
	ptr_in = and(ptr_in, #-8);
}
{
	prolog = lsr(prolog, #3);
	epilog = and(kernel, #31);
	ptr_out_p_32 = add(ptr_out, prolog);
	over = and(over, #7);
}
{
	p3 = cmp.gtu(back, #8);
	kernel = lsr(kernel, #5);
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	p1 = cmp.eq(prolog, #0);
	if(!p1.new) prolog = add(prolog, #1);
	dcfetch(ptr_in_p_128);	/*  reserve the line 64bytes on  */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	nokernel = cmp.eq(kernel,#0);
	dcfetch(ptr_in_p_128);	/* reserve the line 64bytes on  */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	shiftb = and(shift, #8);
}
{
	dcfetch(ptr_in_p_128);		/*  reserve the line 64bytes on  */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	if(nokernel) jump .Lskip64;
	p2 = cmp.eq(kernel, #1);	/*  skip over if kernel == 0  */
}
{
	dczeroa(ptr_out_p_32);
	/*  don't advance pointer  */
	if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dalign = and(dalign, #31);
	dczeroa(ptr_out_p_32);
}
.Lskip64:
{
	data70 = memd(ptr_in++#16);
	if(p3) dataF8 = memd(ptr_in+#8);
	if(noprolog) jump .Lnoprolog32;
	align = offset;
}
/*  upto initial 7 bytes  */
{
	ldata0 = valignb(dataF8, data70, align);
	ifbyte = tstbit(shift,#3);
	offset = add(offset, star3);
}
{
	if(ifbyte) memb(ptr_out++#1) = data0;
	ldata0 = lsr(ldata0, shiftb);
	shiftb = and(shift, #16);
	ifhword = tstbit(shift,#4);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifword = tstbit(shift,#5);
	p2 = cmp.gtu(offset, #7);
}
{
	if(ifword) memw(ptr_out++#4) = data0;
	if(p2) data70 = dataF8;
	if(p2) dataF8 = memd(ptr_in++#8);	/*  another 8 bytes  */
	align = offset;
}
.Lnoprolog32:
{
	p3 = sp1loop0(.Ldword_loop_prolog, prolog)
	rest = sub(len, star3);	/*  whats left after the loop  */
	p0 = cmp.gt(over, #0);
}
	if(p0) rest = add(rest, #16);
.Ldword_loop_prolog:
{
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p0 = cmp.gt(rest, #16);
}
{
	data70 = dataF8;
	if(p0) dataF8 = memd(ptr_in++#8);
	rest = add(rest, #-8);
}:endloop0
.Lkernel:
{
	/*  kernel is at least 32bytes  */
	p3 = cmp.gtu(kernel, #0);
	/*  last itn. remove edge effects  */
	if(p3.new) kernel = add(kernel, #-1);
	/*  dealt with in last dword loop  */
	if(p3.new) epilog = add(epilog, #32);
}
{
	nokernel = cmp.eq(kernel, #0);		/*  after adjustment, recheck */
	if(nokernel.new) jump:NT .Lepilog;	/*  likely not taken  */
	inc = combine(#32, #-1);
	p3 = cmp.gtu(dalign, #24);
}
{
	if(p3) jump .Lodd_alignment;
}
{
	loop0(.Loword_loop_25to31, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = kernel;
}
	.falign
.Loword_loop_25to31:
{
	dcfetch(ptr_in_p_128);	/*  prefetch 4 lines ahead  */
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);	/*  reserve the next 32bytes in cache  */
	p3 = cmp.eq(kernel, rest);
}
{
	/*  kernel -= 1  */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	/*  kill write on first iteration  */
	if(!p3) memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
{
	memd(ptr_out++#8) = ldata1;
	jump .Lepilog;
}
.Lodd_alignment:
{
	loop0(.Loword_loop_00to24, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = add(kernel, #-1);
}
	.falign
.Loword_loop_00to24:
{
	dcfetch(ptr_in_p_128);	/*  prefetch 4 lines ahead  */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);	/*  reserve the next 32bytes in cache  */
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
.Lepilog:
{
	noepilog = cmp.eq(epilog,#0);
	epilogdws = lsr(epilog, #3);
	kernel = and(epilog, #7);
}
{
	if(noepilog) jumpr r31;
	if(noepilog) ptr_out = sub(ptr_out, len);
	p3 = cmp.eq(epilogdws, #0);
	shift2 = asl(epilog, #3);
}
{
	shiftb = and(shift2, #32);
	ifword = tstbit(epilog,#2);
	if(p3) jump .Lepilog60;
	if(!p3) epilog = add(epilog, #-16);
}
{
	loop0(.Ldword_loop_epilog, epilogdws);
	/*  stop criteria is lsbs unless = 0 then its 8  */
	p3 = cmp.eq(kernel, #0);
	if(p3.new) kernel= #8;
	p1 = cmp.gt(over, #0);
}
	/*  if not aligned to end of buffer execute 1 more iteration  */
	if(p1) kernel= #0;
.Ldword_loop_epilog:
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p3 = cmp.gt(epilog, kernel);
}
{
	data70 = dataF8;
	if(p3) dataF8 = memd(ptr_in++#8);
	epilog = add(epilog, #-8);
}:endloop0
/* copy last 7 bytes */
.Lepilog60:
{
	if(ifword) memw(ptr_out++#4) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifhword = tstbit(epilog,#1);
	shiftb = and(shift2, #16);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifbyte = tstbit(epilog,#0);
	if(ifbyte.new) len = add(len, #-1);
}
{
	if(ifbyte) memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);	/*  return dest pointer  */
	jumpr r31;
}
/*  do byte copy for small n  */
.Lbytes23orless:
{
	p3 = sp1loop0(.Lbyte_copy, len);
	len = add(len, #-1);
}
.Lbyte_copy:
{
	data0 = memb(ptr_in++#1);
	if(p3) memb(ptr_out++#1) = data0;
}:endloop0
{
	memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);
	jumpr r31;
}
/*  do dword copies for aligned in, out and length  */
.Ldwordaligned:
{
	p3 = sp1loop0(.Ldword_copy, len8);
}
.Ldword_copy:
{
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = memd(ptr_in++#8);
}:endloop0
{
	memd(ptr_out) = ldata0;
	ptr_out = sub(ptr_out, len);
	jumpr r31;	/*  return to function caller  */
}
.Lmemcpy_return:
	r21:20 = memd(sp+#16);	/*  restore r20+r21  */
{
	r25:24 = memd(sp+#8);	/*  restore r24+r25  */
	r17:16 = memd(sp+#0);	/*  restore r16+r17  */
}
	deallocframe;	/*  restore r31 and increment stack by 16  */
	jumpr r31