// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/arch/alpha/lib/memcpy.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

/*
 * This is a reasonably optimized memcpy() routine.
 */

/*
 * Note that the C code is written to be optimized into good assembly. However,
 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
 * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
 * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
 */

#include <linux/types.h>
#include <linux/export.h>

/*
 * This should be done in one go with ldq_u*2/mask/stq_u. Do it
 * with a macro so that we can fix it up later..
 */
#define ALIGN_DEST_TO8_UP(d,s,n) \
        while (d & 7) { \
                if (n <= 0) return; \
                n--; \
                *(char *) d = *(char *) s; \
                d++; s++; \
        }
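/*
 * Same thing for a copy that runs backwards: d and s point one past the
 * next byte to copy, so they are pre-decremented before each store.
 */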
#define ALIGN_DEST_TO8_DN(d,s,n) \
        while (d & 7) { \
                if (n <= 0) return; \
                n--; \
                d--; s--; \
                *(char *) d = *(char *) s; \
        }

/*
 * This should similarly be done with ldq_u*2/mask/stq. The destination
 * is aligned, but we don't fill in a full quad-word
 */
#define DO_REST_UP(d,s,n) \
        while (n > 0) { \
                n--; \
                *(char *) d = *(char *) s; \
                d++; s++; \
        }
#define DO_REST_DN(d,s,n) \
        while (n > 0) { \
                n--; \
                d--; s--; \
                *(char *) d = *(char *) s; \
        }

/*
 * This should be done with ldq/mask/stq. The source and destination are
 * aligned, but we don't fill in a full quad-word
 */
#define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
#define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)

/*
 * This does unaligned memory copies. We want to avoid storing to
 * an unaligned address, as that would do a read-modify-write cycle.
 * We also want to avoid reading the unaligned source data twice.
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
                                          long n)
{
        ALIGN_DEST_TO8_UP(d,s,n);
        n -= 8;                 /* to avoid compare against 8 in the loop */
        if (n >= 0) {
                unsigned long low_word, high_word;
                __asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
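                /*
                 * Merge loop: ldq_u fetches the aligned quadword that
                 * contains the (unaligned) source address, extql shifts the
                 * current quadword down by the byte offset of s, extqh shifts
                 * the following quadword up by the complementary amount, and
                 * OR-ing the two yields the unaligned source quadword, which
                 * is stored to the now 8-byte-aligned destination.
                 */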
                do {
                        unsigned long tmp;
                        __asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
                        n -= 8;
                        __asm__("extql %1,%2,%0"
                                :"=r" (low_word)
                                :"r" (low_word), "r" (s));
                        __asm__("extqh %1,%2,%0"
                                :"=r" (tmp)
                                :"r" (high_word), "r" (s));
                        s += 8;
                        *(unsigned long *) d = low_word | tmp;
                        d += 8;
                        low_word = high_word;
                } while (n >= 0);
        }
        n += 8;
        DO_REST_UP(d,s,n);
}

static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
                                          long n)
{
        /* I don't understand AXP assembler well enough for this. -Tim */
        s += n;
        d += n;
        while (n--)
                * (char *) --d = * (char *) --s;
}

/*
 * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
 * for the load-store. I don't know why, but using a floating point register
 * for the move seems to slow things down (very small difference, though).
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
                                        long n)
{
        ALIGN_DEST_TO8_UP(d,s,n);
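        /* bias the count by 8 so the loop can test n >= 0 instead of n >= 8 */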
        n -= 8;
        while (n >= 0) {
                unsigned long tmp;
                __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
                n -= 8;
                s += 8;
                *(unsigned long *) d = tmp;
                d += 8;
        }
        n += 8;
        DO_REST_ALIGNED_UP(d,s,n);
}
static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
                                        long n)
{
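        /* copy backwards: start with d and s pointing one past the end */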
        s += n;
        d += n;
        ALIGN_DEST_TO8_DN(d,s,n);
        n -= 8;
        while (n >= 0) {
                unsigned long tmp;
                s -= 8;
                __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
                n -= 8;
                d -= 8;
                *(unsigned long *) d = tmp;
        }
        n += 8;
        DO_REST_ALIGNED_DN(d,s,n);
}

void * memcpy(void * dest, const void *src, size_t n)
{
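        /*
         * If dest and src have the same alignment within a quadword
         * ((dest ^ src) & 7 == 0), take the aligned path: once the
         * destination is 8-byte aligned the source is too, and plain
         * ldq/stq can be used.  Otherwise take the unaligned path,
         * which merges pairs of source quadwords.
         */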
        if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
                __memcpy_aligned_up ((unsigned long) dest, (unsigned long) src,
                                     n);
                return dest;
        }
        __memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n);
        return dest;
}
EXPORT_SYMBOL(memcpy);