/*
 *  linux/arch/alpha/lib/memcpy.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

/*
 * This is a reasonably optimized memcpy() routine.
 */

/*
 * Note that the C code is written to be optimized into good assembly. However,
 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
 * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
 * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
 */

#include <linux/types.h>
#include <linux/export.h>

/*
 * This should be done in one go with ldq_u*2/mask/stq_u. Do it
 * with a macro so that we can fix it up later..
 */
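/*
 * (Only a sketch of that single-pass idea, not what the macros below do:
 * ldq_u the one or two quadwords covering the unaligned head, merge the
 * new bytes in with the Alpha byte mask/insert instructions, and write
 * the result back with stq_u, so each quadword is read and written once
 * instead of looping byte by byte.)
 */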
#define ALIGN_DEST_TO8_UP(d,s,n) \
        while (d & 7) { \
                if (n <= 0) return; \
                n--; \
                *(char *) d = *(char *) s; \
                d++; s++; \
        }
#define ALIGN_DEST_TO8_DN(d,s,n) \
        while (d & 7) { \
                if (n <= 0) return; \
                n--; \
                d--; s--; \
                *(char *) d = *(char *) s; \
        }

/*
 * This should similarly be done with ldq_u*2/mask/stq. The destination
 * is aligned, but we don't fill in a full quad-word
 */
#define DO_REST_UP(d,s,n) \
        while (n > 0) { \
                n--; \
                *(char *) d = *(char *) s; \
                d++; s++; \
        }
#define DO_REST_DN(d,s,n) \
        while (n > 0) { \
                n--; \
                d--; s--; \
                *(char *) d = *(char *) s; \
        }

/*
 * This should be done with ldq/mask/stq. The source and destination are
 * aligned, but we don't fill in a full quad-word
 */
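/*
 * (Likewise only a sketch of the intent: read the destination's last
 * quadword, merge in the remaining n source bytes under a byte mask, and
 * store it back once, instead of the byte-at-a-time loops below.)
 */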
#define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
#define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)

/*
 * This does unaligned memory copies. We want to avoid storing to
 * an unaligned address, as that would do a read-modify-write cycle.
 * We also want to avoid reading each unaligned source quadword twice.
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
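/*
 * (How the merge below works, roughly: with the source offset k bytes into
 * a quadword (k is non-zero on this path), extql shifts the current
 * quadword down by k bytes and extqh shifts the next one up by 8-k bytes;
 * OR-ing the two yields the eight source bytes starting at s, ready for an
 * aligned store to d.)
 */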
static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
                                          long n)
{
        ALIGN_DEST_TO8_UP(d,s,n);
        n -= 8;                 /* to avoid compare against 8 in the loop */
        if (n >= 0) {
                unsigned long low_word, high_word;
                __asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
                do {
                        unsigned long tmp;
                        __asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
                        n -= 8;
                        __asm__("extql %1,%2,%0"
                                :"=r" (low_word)
                                :"r" (low_word), "r" (s));
                        __asm__("extqh %1,%2,%0"
                                :"=r" (tmp)
                                :"r" (high_word), "r" (s));
                        s += 8;
                        *(unsigned long *) d = low_word | tmp;
                        d += 8;
                        low_word = high_word;
                } while (n >= 0);
        }
        n += 8;
        DO_REST_UP(d,s,n);
}

static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
                                          long n)
{
        /* I don't understand AXP assembler well enough for this. -Tim */
        s += n;
        d += n;
        while (n--)
                * (char *) --d = * (char *) --s;
}

/*
 * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
 * for the load-store. I don't know why, but using a floating point register
 * for the move seems to slow things down (a very small difference, though).
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
                                        long n)
{
        ALIGN_DEST_TO8_UP(d,s,n);
        n -= 8;
        while (n >= 0) {
                unsigned long tmp;
                __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
                n -= 8;
                s += 8;
                *(unsigned long *) d = tmp;
                d += 8;
        }
        n += 8;
        DO_REST_ALIGNED_UP(d,s,n);
}
static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
                                        long n)
{
        s += n;
        d += n;
        ALIGN_DEST_TO8_DN(d,s,n);
        n -= 8;
        while (n >= 0) {
                unsigned long tmp;
                s -= 8;
                __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
                n -= 8;
                d -= 8;
                *(unsigned long *) d = tmp;
        }
        n += 8;
        DO_REST_ALIGNED_DN(d,s,n);
}

void * memcpy(void * dest, const void *src, size_t n)
{
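        /*
         * If dest and src have the same offset within a quadword, i.e.
         * ((dest ^ src) & 7) == 0, aligning the destination also aligns
         * the source, so the all-quadword ldq/stq path can be used;
         * otherwise take the extql/extqh path.
         */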
        if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
                __memcpy_aligned_up ((unsigned long) dest, (unsigned long) src,
                                     n);
                return dest;
        }
        __memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n);
        return dest;
}
EXPORT_SYMBOL(memcpy);