linux/arch/parisc/lib/memcpy.c
/*
 *    Optimized memory copy routines.
 *
 *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *    Portions derived from the GNU C Library
 *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop using
 * fp regs. This is followed by loops that copy 32 or 16 bytes at a time using
 * general registers.  Unaligned copies are handled either by aligning the
 * destination and then using a shift-and-write method, or in a few cases by
 * falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version
 * of memcpy (written in C) is actually quite fast already. This routine is
 * able to beat it by 30-40% for aligned copies because of the loop unrolling,
 * but in some cases the glibc version is still slightly faster. This lends
 * more credibility to the idea that gcc can generate very good code as long
 * as we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */

#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label)  do {                                    \
        volatile int dummy = 0;                                         \
        /* The following branch is never taken, it's just here to  */   \
        /* prevent gcc from optimizing away our exception code. */      \
        if (unlikely(dummy != dummy))                                   \
                goto label;                                             \
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

#define MERGE(w0, sh_1, w1, sh_2)  ({                                   \
        unsigned int _r;                                                \
        asm volatile (                                                  \
        "mtsar %3\n"                                                    \
        "shrpw %1, %2, %%sar, %0\n"                                     \
        : "=r"(_r)                                                      \
        : "r"(w0), "r"(w1), "r"(sh_2)                                   \
        );                                                              \
        _r;                                                             \
})
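
/*
 * For reference, MERGE() behaves roughly like the plain C helper below.
 * This is an illustrative sketch only and is not used by the code; it
 * assumes 32-bit words and 0 < sh_2 < 32, as set up by copy_dstaligned().
 * The asm form is preferred so that the shift amount is placed in %sar
 * and a single shrpw performs the extraction.
 */
static inline unsigned int merge_words_ref(unsigned int w0, int sh_1,
                                           unsigned int w1, int sh_2)
{
        /* take the low bytes of w0 and the high bytes of w1 */
        return (w0 << sh_1) | (w1 >> sh_2);
}
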
#define THRESHOLD       16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__); printk(KERN_DEBUG fmt, ##args); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)     \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : _tt(_t), "+r"(_a)                             \
        :                                               \
        : "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)    \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : "+r"(_a)                                      \
        : _tt(_t)                                       \
        : "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
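
/*
 * For example, ldwma(s_space, pws, t, pmc_load_exc) expands to roughly
 *
 *	1:	ldw,ma	4(%sr1,pws), t
 *		ASM_EXCEPTIONTABLE_ENTRY(1b, pmc_load_exc)
 *
 * i.e. "t = *pws++" through the source space register, with a fixup
 * table entry so that a faulting access branches to pmc_load_exc
 * instead of oopsing.  (This is only a sketch of the expansion; the
 * actual operand constraints and clobbers are as in the macros above.)
 */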

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)         \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"     \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : _tt(_t)                                       \
        : "r"(_a)                                       \
        : "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)        \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t"     \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        :                                               \
        : _tt(_t), "r"(_a)                              \
        : "r8")

#define ldw(_s,_o,_a,_t,_e)     def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)     def_store_insn(stw,"r",_s,_t,_o,_a,_e)

#ifdef CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
        __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
        __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif

/* Copy from an unaligned src to an aligned dst, using shifts. Handles 4 words
 * per loop.  This code is derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src,
        unsigned long len, unsigned long o_dst, unsigned long o_src,
        unsigned long o_len)
{
        /* gcc complains that a2 and a3 may be uninitialized, but actually
         * they cannot be.  Initialize a2/a3 to shut gcc up.
         */
        register unsigned int a0, a1, a2 = 0, a3 = 0;
        int sh_1, sh_2;
        struct exception_data *d;

        /* prefetch_src((const void *)src); */

        /* Calculate how to shift a word read at the memory operation
           aligned srcp to make it aligned for copy.  */
        sh_1 = 8 * (src % sizeof(unsigned int));
        sh_2 = 8 * sizeof(unsigned int) - sh_1;

        /* Make src aligned by rounding it down.  */
        src &= -sizeof(unsigned int);
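
        /*
         * Worked example (assuming the usual 32-bit words): for
         * src == 0x1001, sh_1 == 8 and sh_2 == 24, and src is rounded
         * down to 0x1000.  Each destination word is then produced as
         * MERGE(a, 8, b, 24) == (a << 8) | (b >> 24) from two adjacent
         * aligned source words a and b, which on PA's big-endian byte
         * order yields the four source bytes starting at 0x1001.
         */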

        switch (len % 4)
        {
                case 2:
                        /* a1 = ((unsigned int *) src)[0];
                           a2 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a1, cda_ldw_exc);
                        ldw(s_space, 4, src, a2, cda_ldw_exc);
                        src -= 1 * sizeof(unsigned int);
                        dst -= 3 * sizeof(unsigned int);
                        len += 2;
                        goto do1;
                case 3:
                        /* a0 = ((unsigned int *) src)[0];
                           a1 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a0, cda_ldw_exc);
                        ldw(s_space, 4, src, a1, cda_ldw_exc);
                        src -= 0 * sizeof(unsigned int);
                        dst -= 2 * sizeof(unsigned int);
                        len += 1;
                        goto do2;
                case 0:
                        if (len == 0)
                                return 0;
                        /* a3 = ((unsigned int *) src)[0];
                           a0 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a3, cda_ldw_exc);
                        ldw(s_space, 4, src, a0, cda_ldw_exc);
                        src -= -1 * sizeof(unsigned int);
                        dst -= 1 * sizeof(unsigned int);
                        len += 0;
                        goto do3;
                case 1:
                        /* a2 = ((unsigned int *) src)[0];
                           a3 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a2, cda_ldw_exc);
                        ldw(s_space, 4, src, a3, cda_ldw_exc);
                        src -= -2 * sizeof(unsigned int);
                        dst -= 0 * sizeof(unsigned int);
                        len -= 1;
                        if (len == 0)
                                goto do0;
                        goto do4;                       /* No-op.  */
        }

        do
        {
                /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
                /* a0 = ((unsigned int *) src)[0]; */
                ldw(s_space, 0, src, a0, cda_ldw_exc);
                /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
                stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
                /* a1 = ((unsigned int *) src)[1]; */
                ldw(s_space, 4, src, a1, cda_ldw_exc);
                /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
                stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
                /* a2 = ((unsigned int *) src)[2]; */
                ldw(s_space, 8, src, a2, cda_ldw_exc);
                /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
                stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
                /* a3 = ((unsigned int *) src)[3]; */
                ldw(s_space, 12, src, a3, cda_ldw_exc);
                /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
                stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

                src += 4 * sizeof(unsigned int);
                dst += 4 * sizeof(unsigned int);
                len -= 4;
        }
        while (len != 0);

do0:
        /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
        stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

        preserve_branch(handle_load_error);
        preserve_branch(handle_store_error);

        return 0;

handle_load_error:
        __asm__ __volatile__ ("cda_ldw_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
                o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
        return o_len * 4 - d->fault_addr + o_src;

handle_store_error:
        __asm__ __volatile__ ("cda_stw_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
                o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
        return o_len * 4 - d->fault_addr + o_dst;
}


/* Returns 0 for success; otherwise returns the number of bytes not
 * transferred. */
static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
        register unsigned long src, dst, t1, t2, t3;
        register unsigned char *pcs, *pcd;
        register unsigned int *pws, *pwd;
        register double *pds, *pdd;
        unsigned long ret = 0;
        unsigned long o_dst, o_src, o_len;
        struct exception_data *d;

        src = (unsigned long)srcp;
        dst = (unsigned long)dstp;
        pcs = (unsigned char *)srcp;
        pcd = (unsigned char *)dstp;

        o_dst = dst; o_src = src; o_len = len;

        /* prefetch_src((const void *)srcp); */

        if (len < THRESHOLD)
                goto byte_copy;

        /* Check alignment */
        t1 = (src ^ dst);
        if (unlikely(t1 & (sizeof(double)-1)))
                goto unaligned_copy;
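
        /*
         * Example: src ending in ...0x3 and dst ending in ...0xb differ
         * only above the low three address bits, so (src ^ dst) & 7 == 0
         * and both pointers reach an 8-byte boundary after the same
         * number of byte copies (five, in this case).
         */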

        /* src and dst have same alignment. */

        /* Copy bytes till we are double-aligned. */
        t2 = src & (sizeof(double) - 1);
        if (unlikely(t2 != 0)) {
                t2 = sizeof(double) - t2;
                while (t2 && len) {
                        /* *pcd++ = *pcs++; */
                        ldbma(s_space, pcs, t3, pmc_load_exc);
                        len--;
                        stbma(d_space, t3, pcd, pmc_store_exc);
                        t2--;
                }
        }

        pds = (double *)pcs;
        pdd = (double *)pcd;

#if 0
        /* Copy 8 doubles at a time */
        while (len >= 8*sizeof(double)) {
                register double r1, r2, r3, r4, r5, r6, r7, r8;
                /* prefetch_src((char *)pds + L1_CACHE_BYTES); */
                flddma(s_space, pds, r1, pmc_load_exc);
                flddma(s_space, pds, r2, pmc_load_exc);
                flddma(s_space, pds, r3, pmc_load_exc);
                flddma(s_space, pds, r4, pmc_load_exc);
                fstdma(d_space, r1, pdd, pmc_store_exc);
                fstdma(d_space, r2, pdd, pmc_store_exc);
                fstdma(d_space, r3, pdd, pmc_store_exc);
                fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
                if (L1_CACHE_BYTES <= 32)
                        prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
                flddma(s_space, pds, r5, pmc_load_exc);
                flddma(s_space, pds, r6, pmc_load_exc);
                flddma(s_space, pds, r7, pmc_load_exc);
                flddma(s_space, pds, r8, pmc_load_exc);
                fstdma(d_space, r5, pdd, pmc_store_exc);
                fstdma(d_space, r6, pdd, pmc_store_exc);
                fstdma(d_space, r7, pdd, pmc_store_exc);
                fstdma(d_space, r8, pdd, pmc_store_exc);
                len -= 8*sizeof(double);
        }
#endif

        pws = (unsigned int *)pds;
        pwd = (unsigned int *)pdd;

word_copy:
        while (len >= 8*sizeof(unsigned int)) {
                register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
                /* prefetch_src((char *)pws + L1_CACHE_BYTES); */
                ldwma(s_space, pws, r1, pmc_load_exc);
                ldwma(s_space, pws, r2, pmc_load_exc);
                ldwma(s_space, pws, r3, pmc_load_exc);
                ldwma(s_space, pws, r4, pmc_load_exc);
                stwma(d_space, r1, pwd, pmc_store_exc);
                stwma(d_space, r2, pwd, pmc_store_exc);
                stwma(d_space, r3, pwd, pmc_store_exc);
                stwma(d_space, r4, pwd, pmc_store_exc);

                ldwma(s_space, pws, r5, pmc_load_exc);
                ldwma(s_space, pws, r6, pmc_load_exc);
                ldwma(s_space, pws, r7, pmc_load_exc);
                ldwma(s_space, pws, r8, pmc_load_exc);
                stwma(d_space, r5, pwd, pmc_store_exc);
                stwma(d_space, r6, pwd, pmc_store_exc);
                stwma(d_space, r7, pwd, pmc_store_exc);
                stwma(d_space, r8, pwd, pmc_store_exc);
                len -= 8*sizeof(unsigned int);
        }

        while (len >= 4*sizeof(unsigned int)) {
                register unsigned int r1,r2,r3,r4;
                ldwma(s_space, pws, r1, pmc_load_exc);
                ldwma(s_space, pws, r2, pmc_load_exc);
                ldwma(s_space, pws, r3, pmc_load_exc);
                ldwma(s_space, pws, r4, pmc_load_exc);
                stwma(d_space, r1, pwd, pmc_store_exc);
                stwma(d_space, r2, pwd, pmc_store_exc);
                stwma(d_space, r3, pwd, pmc_store_exc);
                stwma(d_space, r4, pwd, pmc_store_exc);
                len -= 4*sizeof(unsigned int);
        }

        pcs = (unsigned char *)pws;
        pcd = (unsigned char *)pwd;

byte_copy:
        while (len) {
                /* *pcd++ = *pcs++; */
                ldbma(s_space, pcs, t3, pmc_load_exc);
                stbma(d_space, t3, pcd, pmc_store_exc);
                len--;
        }

        return 0;

unaligned_copy:
        /* possibly we are aligned on a word, but not on a double... */
        if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
                t2 = src & (sizeof(unsigned int) - 1);

                if (unlikely(t2 != 0)) {
                        t2 = sizeof(unsigned int) - t2;
                        while (t2) {
                                /* *pcd++ = *pcs++; */
                                ldbma(s_space, pcs, t3, pmc_load_exc);
                                stbma(d_space, t3, pcd, pmc_store_exc);
                                len--;
                                t2--;
                        }
                }

                pws = (unsigned int *)pcs;
                pwd = (unsigned int *)pcd;
                goto word_copy;
        }

        /* Align the destination.  */
        if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
                t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
                while (t2) {
                        /* *pcd++ = *pcs++; */
                        ldbma(s_space, pcs, t3, pmc_load_exc);
                        stbma(d_space, t3, pcd, pmc_store_exc);
                        len--;
                        t2--;
                }
                dst = (unsigned long)pcd;
                src = (unsigned long)pcs;
        }

        ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
                o_dst, o_src, o_len);
        if (ret)
                return ret;

        pcs += (len & -sizeof(unsigned int));
        pcd += (len & -sizeof(unsigned int));
        len %= sizeof(unsigned int);

        preserve_branch(handle_load_error);
        preserve_branch(handle_store_error);

        goto byte_copy;

handle_load_error:
        __asm__ __volatile__ ("pmc_load_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
                o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
        return o_len - d->fault_addr + o_src;

handle_store_error:
        __asm__ __volatile__ ("pmc_store_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
                o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
        return o_len - d->fault_addr + o_dst;
}

#ifdef __KERNEL__
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
        mtsp(get_kernel_space(), 1);
        mtsp(get_user_space(), 2);
        return pa_memcpy((void __force *)dst, src, len);
}
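
/*
 * Typical use (a minimal sketch; ubuf/kbuf/len are the caller's own
 * names): a nonzero return is the number of bytes that could not be
 * copied, so callers usually treat any short copy as -EFAULT:
 *
 *	if (copy_to_user(ubuf, kbuf, len))
 *		return -EFAULT;
 */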

EXPORT_SYMBOL(__copy_from_user);
unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len)
{
        mtsp(get_user_space(), 1);
        mtsp(get_kernel_space(), 2);
        return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
        mtsp(get_user_space(), 1);
        mtsp(get_user_space(), 2);
        return pa_memcpy((void __force *)dst, (void __force *)src, len);
}


void *memcpy(void *dst, const void *src, size_t count)
{
        mtsp(get_kernel_space(), 1);
        mtsp(get_kernel_space(), 2);
        pa_memcpy(dst, src, count);
        return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
#endif