linux/arch/parisc/lib/memcpy.c
/*
 *    Optimized memory copy routines.
 *
 *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *    Copyright (C) 2013 Helge Deller <deller@gmx.de>
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *    Portions derived from the GNU C Library
 *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop using
 * fp regs. This is followed by loops that copy 32 or 16 bytes at a time using
 * general registers.  Unaligned copies are handled either by aligning the
 * destination and then using a shift-and-write method, or in a few cases by
 * falling back to a byte-at-a-time copy (a plain C sketch of this overall
 * structure follows this comment).
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version
 * of memcpy (written in C) is actually quite fast already. This routine is
 * able to beat it by 30-40% for aligned copies because of the loop unrolling,
 * but in some cases the glibc version is still slightly faster. This lends
 * more credibility to the idea that gcc can generate very good code as long
 * as we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create additional
 *   interlocks
 * - replace byte-copy loops with stbys sequences
 */
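
/*
 * Illustration only (never compiled into the kernel): a plain C sketch of the
 * head-align / bulk-word / tail-byte structure described above, without the
 * space-register asm, unrolling or exception handling.  The name ref_memcpy
 * is hypothetical and exists only for this sketch.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static void *ref_memcpy(void *dstp, const void *srcp, size_t len)
{
        unsigned char *d = dstp;
        const unsigned char *s = srcp;

        /* Copy single bytes until the destination is word aligned. */
        while (len && ((uintptr_t)d & (sizeof(unsigned int) - 1))) {
                *d++ = *s++;
                len--;
        }

        /* Bulk copy a word at a time when the source is also aligned; the
         * real routine unrolls this and falls back to the shift-and-merge
         * loop in copy_dstaligned() when the source stays misaligned. */
        if (!((uintptr_t)s & (sizeof(unsigned int) - 1))) {
                while (len >= sizeof(unsigned int)) {
                        *(unsigned int *)d = *(const unsigned int *)s;
                        d += sizeof(unsigned int);
                        s += sizeof(unsigned int);
                        len -= sizeof(unsigned int);
                }
        }

        /* Tail bytes. */
        while (len--)
                *d++ = *s++;

        return dstp;
}
#endif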

#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label)  do {                                    \
        volatile int dummy = 0;                                         \
        /* The following branch is never taken, it's just here to  */   \
        /* prevent gcc from optimizing away our exception code. */      \
        if (unlikely(dummy != dummy))                                   \
                goto label;                                             \
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

#define MERGE(w0, sh_1, w1, sh_2)  ({                                   \
        unsigned int _r;                                                \
        asm volatile (                                                  \
        "mtsar %3\n"                                                    \
        "shrpw %1, %2, %%sar, %0\n"                                     \
        : "=r"(_r)                                                      \
        : "r"(w0), "r"(w1), "r"(sh_2)                                   \
        );                                                              \
        _r;                                                             \
})
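
/*
 * For reference: on big-endian PA-RISC, with sh_1 + sh_2 == 32 and %sar
 * loaded with sh_2, the shrpw above computes the same value as the plain C
 * expression (a sketch of the semantics, not an alternative implementation):
 *
 *      MERGE(w0, sh_1, w1, sh_2) == ((w0 << sh_1) | (w1 >> sh_2))
 *
 * i.e. the merged word takes its high bytes from w0 and its low bytes from
 * w1, which is what copy_dstaligned() needs when stitching two aligned
 * source words into one aligned destination word.
 */
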
#define THRESHOLD       16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)     \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : _tt(_t), "+r"(_a)                             \
        :                                               \
        : "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)    \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : "+r"(_a)                                      \
        : _tt(_t)                                       \
        : "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)         \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"     \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : _tt(_t)                                       \
        : "r"(_a)                                       \
        : "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)        \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t"     \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        :                                               \
        : _tt(_t), "r"(_a)                              \
        : "r8")

#define ldw(_s,_o,_a,_t,_e)     def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)     def_store_insn(stw,"r",_s,_t,_o,_a,_e)
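
/*
 * For illustration: ldwma(s_space, pws, r1, pmc_load_exc) expands to
 * (roughly)
 *
 *      1:      ldw,ma  4(%%sr1,%1), %0
 *              <exception table entry: 1b -> pmc_load_exc>
 *
 * with %0 = r1 and %1 = pws, i.e. a post-incrementing load through the
 * source space register plus a fixup entry so that an access fault
 * branches to the local pmc_load_exc label instead of oopsing.  The store
 * macros are the mirror image through the destination space register.
 */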

#ifdef  CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
        __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
        __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif
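
/*
 * Note: the prefetch helpers above load into %r0, which always reads as zero
 * and discards writes, so the loads have no architectural effect; they are
 * only there to pull the addressed cache line in through the appropriate
 * space register.
 */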

#define PA_MEMCPY_OK            0
#define PA_MEMCPY_LOAD_ERROR    1
#define PA_MEMCPY_STORE_ERROR   2

/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
 * per loop.  This code is derived from glibc.
 */
static noinline unsigned long copy_dstaligned(unsigned long dst,
                                        unsigned long src, unsigned long len)
{
        /* gcc complains that a2 and a3 may be uninitialized, but actually
         * they cannot be.  Initialize a2/a3 to shut gcc up.
         */
        register unsigned int a0, a1, a2 = 0, a3 = 0;
        int sh_1, sh_2;

        /* prefetch_src((const void *)src); */

        /* Calculate how to shift a word read at the memory operation
           aligned srcp to make it aligned for copy.  */
        sh_1 = 8 * (src % sizeof(unsigned int));
        sh_2 = 8 * sizeof(unsigned int) - sh_1;

        /* Make src aligned by rounding it down.  */
        src &= -sizeof(unsigned int);

        switch (len % 4)
        {
                case 2:
                        /* a1 = ((unsigned int *) src)[0];
                           a2 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a1, cda_ldw_exc);
                        ldw(s_space, 4, src, a2, cda_ldw_exc);
                        src -= 1 * sizeof(unsigned int);
                        dst -= 3 * sizeof(unsigned int);
                        len += 2;
                        goto do1;
                case 3:
                        /* a0 = ((unsigned int *) src)[0];
                           a1 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a0, cda_ldw_exc);
                        ldw(s_space, 4, src, a1, cda_ldw_exc);
                        src -= 0 * sizeof(unsigned int);
                        dst -= 2 * sizeof(unsigned int);
                        len += 1;
                        goto do2;
                case 0:
                        if (len == 0)
                                return PA_MEMCPY_OK;
                        /* a3 = ((unsigned int *) src)[0];
                           a0 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a3, cda_ldw_exc);
                        ldw(s_space, 4, src, a0, cda_ldw_exc);
                        src -= -1 * sizeof(unsigned int);
                        dst -= 1 * sizeof(unsigned int);
                        len += 0;
                        goto do3;
                case 1:
                        /* a2 = ((unsigned int *) src)[0];
                           a3 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a2, cda_ldw_exc);
                        ldw(s_space, 4, src, a3, cda_ldw_exc);
                        src -= -2 * sizeof(unsigned int);
                        dst -= 0 * sizeof(unsigned int);
                        len -= 1;
                        if (len == 0)
                                goto do0;
                        goto do4;                       /* No-op.  */
        }

        do
        {
                /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
                /* a0 = ((unsigned int *) src)[0]; */
                ldw(s_space, 0, src, a0, cda_ldw_exc);
                /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
                stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
                /* a1 = ((unsigned int *) src)[1]; */
                ldw(s_space, 4, src, a1, cda_ldw_exc);
                /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
                stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
                /* a2 = ((unsigned int *) src)[2]; */
                ldw(s_space, 8, src, a2, cda_ldw_exc);
                /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
                stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
                /* a3 = ((unsigned int *) src)[3]; */
                ldw(s_space, 12, src, a3, cda_ldw_exc);
                /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
                stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

                src += 4 * sizeof(unsigned int);
                dst += 4 * sizeof(unsigned int);
                len -= 4;
        }
        while (len != 0);

do0:
        /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
        stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

        preserve_branch(handle_load_error);
        preserve_branch(handle_store_error);

        return PA_MEMCPY_OK;

handle_load_error:
        __asm__ __volatile__ ("cda_ldw_exc:\n");
        return PA_MEMCPY_LOAD_ERROR;

handle_store_error:
        __asm__ __volatile__ ("cda_stw_exc:\n");
        return PA_MEMCPY_STORE_ERROR;
}
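
/*
 * A worked example of the switch bookkeeping above: for len % 4 == 2 (say
 * len == 2 words) the first two aligned source words are preloaded into
 * a1/a2, src is biased by -4 and dst by -12, and the loop is entered at do1.
 * The fixed 0/4/8/12 offsets in the loop body then line up with the real
 * addresses: the do1 step loads the third source word and stores the first
 * destination word, the loop exits after that single pass, and the final
 * store at do0 writes the second destination word.  The same bias trick
 * lets every remainder share one loop body.
 */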


/* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR.
 * In case of an access fault the faulty address can be read from the per_cpu
 * exception data struct. */
static noinline unsigned long pa_memcpy_internal(void *dstp, const void *srcp,
                                        unsigned long len)
{
        register unsigned long src, dst, t1, t2, t3;
        register unsigned char *pcs, *pcd;
        register unsigned int *pws, *pwd;
        register double *pds, *pdd;
        unsigned long ret;

        src = (unsigned long)srcp;
        dst = (unsigned long)dstp;
        pcs = (unsigned char *)srcp;
        pcd = (unsigned char *)dstp;

        /* prefetch_src((const void *)srcp); */

        if (len < THRESHOLD)
                goto byte_copy;

        /* Check alignment */
        t1 = (src ^ dst);
        if (unlikely(t1 & (sizeof(double)-1)))
                goto unaligned_copy;

        /* src and dst have same alignment. */

        /* Copy bytes till we are double-aligned. */
        t2 = src & (sizeof(double) - 1);
        if (unlikely(t2 != 0)) {
                t2 = sizeof(double) - t2;
                while (t2 && len) {
                        /* *pcd++ = *pcs++; */
                        ldbma(s_space, pcs, t3, pmc_load_exc);
                        len--;
                        stbma(d_space, t3, pcd, pmc_store_exc);
                        t2--;
                }
        }

        pds = (double *)pcs;
        pdd = (double *)pcd;

#if 0
        /* Copy 8 doubles at a time */
        while (len >= 8*sizeof(double)) {
                register double r1, r2, r3, r4, r5, r6, r7, r8;
                /* prefetch_src((char *)pds + L1_CACHE_BYTES); */
                flddma(s_space, pds, r1, pmc_load_exc);
                flddma(s_space, pds, r2, pmc_load_exc);
                flddma(s_space, pds, r3, pmc_load_exc);
                flddma(s_space, pds, r4, pmc_load_exc);
                fstdma(d_space, r1, pdd, pmc_store_exc);
                fstdma(d_space, r2, pdd, pmc_store_exc);
                fstdma(d_space, r3, pdd, pmc_store_exc);
                fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
                if (L1_CACHE_BYTES <= 32)
                        prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
                flddma(s_space, pds, r5, pmc_load_exc);
                flddma(s_space, pds, r6, pmc_load_exc);
                flddma(s_space, pds, r7, pmc_load_exc);
                flddma(s_space, pds, r8, pmc_load_exc);
                fstdma(d_space, r5, pdd, pmc_store_exc);
                fstdma(d_space, r6, pdd, pmc_store_exc);
                fstdma(d_space, r7, pdd, pmc_store_exc);
                fstdma(d_space, r8, pdd, pmc_store_exc);
                len -= 8*sizeof(double);
        }
#endif

        pws = (unsigned int *)pds;
        pwd = (unsigned int *)pdd;

word_copy:
        while (len >= 8*sizeof(unsigned int)) {
                register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
                /* prefetch_src((char *)pws + L1_CACHE_BYTES); */
                ldwma(s_space, pws, r1, pmc_load_exc);
                ldwma(s_space, pws, r2, pmc_load_exc);
                ldwma(s_space, pws, r3, pmc_load_exc);
                ldwma(s_space, pws, r4, pmc_load_exc);
                stwma(d_space, r1, pwd, pmc_store_exc);
                stwma(d_space, r2, pwd, pmc_store_exc);
                stwma(d_space, r3, pwd, pmc_store_exc);
                stwma(d_space, r4, pwd, pmc_store_exc);

                ldwma(s_space, pws, r5, pmc_load_exc);
                ldwma(s_space, pws, r6, pmc_load_exc);
                ldwma(s_space, pws, r7, pmc_load_exc);
                ldwma(s_space, pws, r8, pmc_load_exc);
                stwma(d_space, r5, pwd, pmc_store_exc);
                stwma(d_space, r6, pwd, pmc_store_exc);
                stwma(d_space, r7, pwd, pmc_store_exc);
                stwma(d_space, r8, pwd, pmc_store_exc);
                len -= 8*sizeof(unsigned int);
        }

        while (len >= 4*sizeof(unsigned int)) {
                register unsigned int r1,r2,r3,r4;
                ldwma(s_space, pws, r1, pmc_load_exc);
                ldwma(s_space, pws, r2, pmc_load_exc);
                ldwma(s_space, pws, r3, pmc_load_exc);
                ldwma(s_space, pws, r4, pmc_load_exc);
                stwma(d_space, r1, pwd, pmc_store_exc);
                stwma(d_space, r2, pwd, pmc_store_exc);
                stwma(d_space, r3, pwd, pmc_store_exc);
                stwma(d_space, r4, pwd, pmc_store_exc);
                len -= 4*sizeof(unsigned int);
        }

        pcs = (unsigned char *)pws;
        pcd = (unsigned char *)pwd;

byte_copy:
        while (len) {
                /* *pcd++ = *pcs++; */
                ldbma(s_space, pcs, t3, pmc_load_exc);
                stbma(d_space, t3, pcd, pmc_store_exc);
                len--;
        }

        return PA_MEMCPY_OK;

unaligned_copy:
        /* possibly we are aligned on a word, but not on a double... */
        if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
                t2 = src & (sizeof(unsigned int) - 1);

                if (unlikely(t2 != 0)) {
                        t2 = sizeof(unsigned int) - t2;
                        while (t2) {
                                /* *pcd++ = *pcs++; */
                                ldbma(s_space, pcs, t3, pmc_load_exc);
                                stbma(d_space, t3, pcd, pmc_store_exc);
                                len--;
                                t2--;
                        }
                }

                pws = (unsigned int *)pcs;
                pwd = (unsigned int *)pcd;
                goto word_copy;
        }

        /* Align the destination.  */
        if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
                t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
                while (t2) {
                        /* *pcd++ = *pcs++; */
                        ldbma(s_space, pcs, t3, pmc_load_exc);
                        stbma(d_space, t3, pcd, pmc_store_exc);
                        len--;
                        t2--;
                }
                dst = (unsigned long)pcd;
                src = (unsigned long)pcs;
        }

        ret = copy_dstaligned(dst, src, len / sizeof(unsigned int));
        if (ret)
                return ret;

        pcs += (len & -sizeof(unsigned int));
        pcd += (len & -sizeof(unsigned int));
        len %= sizeof(unsigned int);

        preserve_branch(handle_load_error);
        preserve_branch(handle_store_error);

        goto byte_copy;

handle_load_error:
        __asm__ __volatile__ ("pmc_load_exc:\n");
        return PA_MEMCPY_LOAD_ERROR;

handle_store_error:
        __asm__ __volatile__ ("pmc_store_exc:\n");
        return PA_MEMCPY_STORE_ERROR;
}


/* Returns 0 for success, otherwise returns the number of bytes not transferred. */
static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
        unsigned long ret, fault_addr, reference;
        struct exception_data *d;

        ret = pa_memcpy_internal(dstp, srcp, len);
        if (likely(ret == PA_MEMCPY_OK))
                return 0;

        /* if a load or store fault occurred we can get the faulting address */
        d = this_cpu_ptr(&exception_data);
        fault_addr = d->fault_addr;

        /* error in load or store? */
        if (ret == PA_MEMCPY_LOAD_ERROR)
                reference = (unsigned long) srcp;
        else
                reference = (unsigned long) dstp;

        DPRINTF("pa_memcpy: fault type = %lu, len=%lu fault_addr=%lu ref=%lu\n",
                ret, len, fault_addr, reference);

        if (fault_addr >= reference)
                return len - (fault_addr - reference);
        else
                return len;
}
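
/*
 * Example of the arithmetic above: for a 100 byte copy where the store
 * faults 60 bytes into the destination buffer, fault_addr - reference is 60
 * and the function reports 100 - 60 = 40 bytes as not transferred.  If the
 * recorded fault address lies below the reference pointer, the whole length
 * is reported as not copied.
 */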

#ifdef __KERNEL__
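
/*
 * Space-register convention used by these wrappers: %sr1 is loaded with the
 * space of the source and %sr2 with the space of the destination before
 * calling pa_memcpy(), matching the s_space/d_space definitions above.
 * Kernel space is space id 0; the current user space id is read from %sr3
 * via get_user_space().
 */
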
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
        mtsp(get_kernel_space(), 1);
        mtsp(get_user_space(), 2);
        return pa_memcpy((void __force *)dst, src, len);
}

EXPORT_SYMBOL(__copy_from_user);
unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len)
{
        mtsp(get_user_space(), 1);
        mtsp(get_kernel_space(), 2);
        return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
        mtsp(get_user_space(), 1);
        mtsp(get_user_space(), 2);
        return pa_memcpy((void __force *)dst, (void __force *)src, len);
}


void *memcpy(void *dst, const void *src, size_t count)
{
        mtsp(get_kernel_space(), 1);
        mtsp(get_kernel_space(), 2);
        pa_memcpy(dst, src, count);
        return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);

long probe_kernel_read(void *dst, const void *src, size_t size)
{
        unsigned long addr = (unsigned long)src;

        if (addr < PAGE_SIZE)
                return -EFAULT;

        /* check for I/O space F_EXTEND(0xfff00000) access as well? */

        return __probe_kernel_read(dst, src, size);
}

#endif