linux/arch/alpha/lib/memcpy.c
/*
 *  linux/arch/alpha/lib/memcpy.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

/*
 * This is a reasonably optimized memcpy() routine.
 */

/*
 * Note that the C code is written to be optimized into good assembly. However,
 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
 * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
 * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
 */

#include <linux/types.h>

/*
 * This should be done in one go with ldq_u*2/mask/stq_u. Do it
 * with a macro so that we can fix it up later..
 */
#define ALIGN_DEST_TO8_UP(d,s,n) \
	while (d & 7) { \
		if (n <= 0) return; \
		n--; \
		*(char *) d = *(char *) s; \
		d++; s++; \
	}
#define ALIGN_DEST_TO8_DN(d,s,n) \
	while (d & 7) { \
		if (n <= 0) return; \
		n--; \
		d--; s--; \
		*(char *) d = *(char *) s; \
	}

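/*
 * A hedged sketch of the "one go" version that the ldq_u*2/mask/stq_u
 * comment above ALIGN_DEST_TO8_UP refers to (illustration only, not code
 * from this file): load the quadword containing the destination head with
 * ldq_u, clear the bytes about to be written with one of the msk*
 * instructions, merge in the suitably shifted source bytes, and write the
 * result back with a single stq_u, so the head is handled without a byte
 * loop.
 */
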
/*
 * This should similarly be done with ldq_u*2/mask/stq. The destination
 * is aligned, but we don't fill in a full quad-word
 */
#define DO_REST_UP(d,s,n) \
	while (n > 0) { \
		n--; \
		*(char *) d = *(char *) s; \
		d++; s++; \
	}
#define DO_REST_DN(d,s,n) \
	while (n > 0) { \
		n--; \
		d--; s--; \
		*(char *) d = *(char *) s; \
	}

/*
 * This should be done with ldq/mask/stq. The source and destination are
 * aligned, but we don't fill in a full quad-word
 */
#define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
#define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)

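/*
 * A hedged C model of the ldq/mask/stq tail store that the comments above
 * ask for (illustration only; this helper is hypothetical and not used by
 * the kernel).  It shows the idea: read the destination quadword, replace
 * only its low n bytes with already-positioned source bytes, and write the
 * whole quadword back in one store.  Like the real msk/stq sequence, this
 * rewrites the bytes past n with their old values.  Assumes a little-endian
 * quadword, d aligned to 8 and 0 < n < 8, as in DO_REST_ALIGNED_*.
 */
static inline void __rest_store_model(unsigned long d, unsigned long src_quad,
				      long n)
{
	unsigned long *q = (unsigned long *) d;
	unsigned long keep = ~0UL << (8 * n);	/* bytes of *q to preserve */

	*q = (*q & keep) | (src_quad & ~keep);
}
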
/*
 * This does unaligned memory copies. We want to avoid storing to
 * an unaligned address, as that would do a read-modify-write cycle.
 * We also want to avoid reading each unaligned source quadword twice.
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
					  long n)
{
	ALIGN_DEST_TO8_UP(d,s,n);
	n -= 8;			/* to avoid compare against 8 in the loop */
	if (n >= 0) {
		unsigned long low_word, high_word;
		__asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
		do {
			unsigned long tmp;
			__asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
			n -= 8;
			__asm__("extql %1,%2,%0"
				:"=r" (low_word)
				:"r" (low_word), "r" (s));
			__asm__("extqh %1,%2,%0"
				:"=r" (tmp)
				:"r" (high_word), "r" (s));
			s += 8;
			*(unsigned long *) d = low_word | tmp;
			d += 8;
			low_word = high_word;
		} while (n >= 0);
	}
	n += 8;
	DO_REST_UP(d,s,n);
}
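
/*
 * A hedged C model of the extql/extqh merge in the loop above (illustration
 * only; this helper is hypothetical and not used by the kernel).  On
 * little-endian Alpha, extql shifts a quadword right by 8*(s & 7) bits and
 * extqh shifts the following quadword left by 64 - 8*(s & 7) bits, so OR-ing
 * the two yields the unaligned quadword starting at s.  The offset s & 7 is
 * never 0 on this path, since memcpy() only takes the unaligned variant when
 * src and dest disagree mod 8 and the destination has just been aligned, so
 * both shift counts below stay strictly between 0 and 64.
 */
static inline unsigned long __unaligned_quad_model(unsigned long low_word,
						   unsigned long high_word,
						   unsigned long s)
{
	unsigned long shift = 8 * (s & 7);	/* bit offset of s in its quad */

	return (low_word >> shift) | (high_word << (64 - shift));
}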

static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
					  long n)
{
	/* I don't understand AXP assembler well enough for this. -Tim */
	s += n;
	d += n;
	while (n--)
		* (char *) --d = * (char *) --s;
}

/*
 * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
 * for the load-store. I don't know why, but using a floating point register
 * for the move seems to slow things down (very small difference, though).
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
					long n)
{
	ALIGN_DEST_TO8_UP(d,s,n);
	n -= 8;
	while (n >= 0) {
		unsigned long tmp;
		__asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
		n -= 8;
		s += 8;
		*(unsigned long *) d = tmp;
		d += 8;
	}
	n += 8;
	DO_REST_ALIGNED_UP(d,s,n);
}
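
/*
 * For reference (not from the original file): without the register-class
 * hint the loop above is just an ordinary aligned quadword copy, roughly
 * "tmp = *(unsigned long *) s; *(unsigned long *) d = tmp;" per iteration;
 * the __asm__ only exists to keep the load in an integer register, as the
 * comment before __memcpy_aligned_up explains.
 */
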
static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
					long n)
{
	s += n;
	d += n;
	ALIGN_DEST_TO8_DN(d,s,n);
	n -= 8;
	while (n >= 0) {
		unsigned long tmp;
		s -= 8;
		__asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
		n -= 8;
		d -= 8;
		*(unsigned long *) d = tmp;
	}
	n += 8;
	DO_REST_ALIGNED_DN(d,s,n);
}

void * memcpy(void * dest, const void *src, size_t n)
{
	if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
		__memcpy_aligned_up ((unsigned long) dest, (unsigned long) src,
				     n);
		return dest;
	}
	__memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n);
	return dest;
}
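
/*
 * Worked example of the dispatch test above (illustrative values): with
 * dest == 0x1003 and src == 0x2a03 both pointers sit at offset 3 within
 * their quadwords, so (dest ^ src) & 7 == 0 and the aligned path is taken;
 * after the byte head both pointers reach an 8-byte boundary together.
 * With dest == 0x1003 and src == 0x2a01 the offsets differ, the xor has
 * low bits set, and the unaligned path is used instead.
 */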

/* For backward compatibility with old modules, define __memcpy.  */
asm("__memcpy = memcpy; .globl __memcpy");