linux/arch/x86/lib/mmx_32.c
// SPDX-License-Identifier: GPL-2.0
/*
 *      MMX 3DNow! library helper functions
 *
 *      To do:
 *      We can use MMX just for prefetch in IRQs. This may be a win.
 *              (reported so on K6-III)
 *      We should use a better code-neutral filler for the short jump
 *              leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
 *      We also want to clobber the filler register so we don't get any
 *              register forwarding stalls on the filler.
 *
 *      Add *user handling. Checksums are not a win with MMX on any CPU
 *      tested so far for any MMX solution figured.
 *
 *      22/09/2000 - Arjan van de Ven
 *              Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/fpu/api.h>
#include <asm/asm.h>

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
        void *p;
        int i;

        if (unlikely(in_interrupt()))
                return __memcpy(to, from, len);

        p = to;
        i = len >> 6; /* len/64 */

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "1: prefetch (%0)\n"            /* This set is 28 bytes */
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                /*
                 * 0x1AEB is "jmp +26" (opcode 0xEB, displacement 0x1A): if a
                 * prefetch faults, patch the first prefetch into a short jmp
                 * that skips the remaining 26 bytes of the 28-byte block.
                 */
                "3: movw $0x1AEB, 1b\n"
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b)
                        : : "r" (from));

        /*
         * Main copy loop: move 64 bytes per iteration while prefetching
         * 320 bytes ahead.  Stop five blocks early so the prefetch does
         * not run past the area being copied.
         */
        for ( ; i > 5; i--) {
                __asm__ __volatile__ (
                "1:  prefetch 320(%0)\n"
                "2:  movq (%0), %%mm0\n"
                "  movq 8(%0), %%mm1\n"
                "  movq 16(%0), %%mm2\n"
                "  movq 24(%0), %%mm3\n"
                "  movq %%mm0, (%1)\n"
                "  movq %%mm1, 8(%1)\n"
                "  movq %%mm2, 16(%1)\n"
                "  movq %%mm3, 24(%1)\n"
                "  movq 32(%0), %%mm0\n"
                "  movq 40(%0), %%mm1\n"
                "  movq 48(%0), %%mm2\n"
                "  movq 56(%0), %%mm3\n"
                "  movq %%mm0, 32(%1)\n"
                "  movq %%mm1, 40(%1)\n"
                "  movq %%mm2, 48(%1)\n"
                "  movq %%mm3, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes: skip the faulting prefetch */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b)
                        : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        /*
         * Copy the remaining full 64-byte blocks without prefetching.
         */
        for ( ; i > 0; i--) {
                __asm__ __volatile__ (
                "  movq (%0), %%mm0\n"
                "  movq 8(%0), %%mm1\n"
                "  movq 16(%0), %%mm2\n"
                "  movq 24(%0), %%mm3\n"
                "  movq %%mm0, (%1)\n"
                "  movq %%mm1, 8(%1)\n"
                "  movq %%mm2, 16(%1)\n"
                "  movq %%mm3, 24(%1)\n"
                "  movq 32(%0), %%mm0\n"
                "  movq 40(%0), %%mm1\n"
                "  movq 48(%0), %%mm2\n"
                "  movq 56(%0), %%mm3\n"
                "  movq %%mm0, 32(%1)\n"
                "  movq %%mm1, 40(%1)\n"
                "  movq %%mm2, 48(%1)\n"
                "  movq %%mm3, 56(%1)\n"
                        : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }
        /*
         * Now do the tail of the block:
         */
        __memcpy(to, from, len & 63);
        kernel_fpu_end();

        return p;
}
EXPORT_SYMBOL(_mmx_memcpy);
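
/*
 * Usage sketch (illustrative, not part of this file): callers normally go
 * through the regular memcpy() interface, which on a 3DNow!-enabled 32-bit
 * build can dispatch large copies here and keep short ones on the plain
 * path, roughly like this (the helper name and the 512-byte cut-off are
 * assumptions for the sketch):
 *
 *      static inline void *memcpy_3dnow(void *to, const void *from, size_t len)
 *      {
 *              if (len < 512)          // FPU save/restore not worth it
 *                      return __memcpy(to, from, len);
 *              return _mmx_memcpy(to, from, len);
 *      }
 */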

#ifdef CONFIG_MK7

/*
 *      The K7 has streaming cache-bypass load/store (the movntq non-temporal
 *      stores used below, which is also why an sfence is needed afterwards).
 *      The Cyrix III, K6 and other MMX-using processors do not.
 */

static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "  movntq %%mm0, (%0)\n"
                "  movntq %%mm0, 8(%0)\n"
                "  movntq %%mm0, 16(%0)\n"
                "  movntq %%mm0, 24(%0)\n"
                "  movntq %%mm0, 32(%0)\n"
                "  movntq %%mm0, 40(%0)\n"
                "  movntq %%mm0, 48(%0)\n"
                "  movntq %%mm0, 56(%0)\n"
                : : "r" (page) : "memory");
                page += 64;
        }

        /*
         * Since movntq is weakly-ordered, a "sfence" is needed to become
         * ordered again:
         */
        __asm__ __volatile__("sfence\n"::);

        kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin();

        /*
         * maybe the prefetch stuff can go before the expensive fnsave...
         * but that is for later. -AV
         */
        __asm__ __volatile__(
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b) : : "r" (from));

        for (i = 0; i < (4096-320)/64; i++) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movntq %%mm0, (%1)\n"
                "   movq 8(%0), %%mm1\n"
                "   movntq %%mm1, 8(%1)\n"
                "   movq 16(%0), %%mm2\n"
                "   movntq %%mm2, 16(%1)\n"
                "   movq 24(%0), %%mm3\n"
                "   movntq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm4\n"
                "   movntq %%mm4, 32(%1)\n"
                "   movq 40(%0), %%mm5\n"
                "   movntq %%mm5, 40(%1)\n"
                "   movq 48(%0), %%mm6\n"
                "   movntq %%mm6, 48(%1)\n"
                "   movq 56(%0), %%mm7\n"
                "   movntq %%mm7, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        /*
         * Last 320 bytes of the page: copy without prefetching, so the
         * prefetch above never touches memory beyond the source page.
         */
        for (i = (4096-320)/64; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "2: movq (%0), %%mm0\n"
                "   movntq %%mm0, (%1)\n"
                "   movq 8(%0), %%mm1\n"
                "   movntq %%mm1, 8(%1)\n"
                "   movq 16(%0), %%mm2\n"
                "   movntq %%mm2, 16(%1)\n"
                "   movq 24(%0), %%mm3\n"
                "   movntq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm4\n"
                "   movntq %%mm4, 32(%1)\n"
                "   movq 40(%0), %%mm5\n"
                "   movntq %%mm5, 40(%1)\n"
                "   movq 48(%0), %%mm6\n"
                "   movntq %%mm6, 48(%1)\n"
                "   movq 56(%0), %%mm7\n"
                "   movntq %%mm7, 56(%1)\n"
                        : : "r" (from), "r" (to) : "memory");
                from += 64;
                to += 64;
        }
        /*
         * Since movntq is weakly-ordered, a "sfence" is needed to become
         * ordered again:
         */
        __asm__ __volatile__("sfence \n"::);
        kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 *      Generic MMX implementation without K7 specific streaming
 */
static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

        for (i = 0; i < 4096/128; i++) {
                __asm__ __volatile__ (
                "  movq %%mm0, (%0)\n"
                "  movq %%mm0, 8(%0)\n"
                "  movq %%mm0, 16(%0)\n"
                "  movq %%mm0, 24(%0)\n"
                "  movq %%mm0, 32(%0)\n"
                "  movq %%mm0, 40(%0)\n"
                "  movq %%mm0, 48(%0)\n"
                "  movq %%mm0, 56(%0)\n"
                "  movq %%mm0, 64(%0)\n"
                "  movq %%mm0, 72(%0)\n"
                "  movq %%mm0, 80(%0)\n"
                "  movq %%mm0, 88(%0)\n"
                "  movq %%mm0, 96(%0)\n"
                "  movq %%mm0, 104(%0)\n"
                "  movq %%mm0, 112(%0)\n"
                "  movq %%mm0, 120(%0)\n"
                        : : "r" (page) : "memory");
                page += 128;
        }

        kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b) : : "r" (from));

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b)
                        : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }
        kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 *      Favour MMX for page clear and copy:
 */
static void slow_zero_page(void *page)
{
        int d0, d1;

        __asm__ __volatile__(
                "cld\n\t"
                "rep ; stosl"

                        : "=&c" (d0), "=&D" (d1)
                        :"a" (0), "1" (page), "0" (1024)  /* 1024 dwords = one 4096-byte page */
                        :"memory");
}
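
/*
 * For reference, the inline asm above stores 1024 zero dwords with rep stosl,
 * i.e. it clears one 4096-byte page.  A plain-C sketch of the same operation
 * (illustrative only):
 *
 *      u32 *p = page;
 *      int n;
 *
 *      for (n = 0; n < 1024; n++)      // same effect as memset(page, 0, 4096)
 *              p[n] = 0;
 */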

void mmx_clear_page(void *page)
{
        if (unlikely(in_interrupt()))
                slow_zero_page(page);
        else
                fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
        int d0, d1, d2;

        __asm__ __volatile__(
                "cld\n\t"
                "rep ; movsl"
                : "=&c" (d0), "=&D" (d1), "=&S" (d2)
                : "0" (1024), "1" ((long) to), "2" ((long) from)  /* 1024 dwords = one page */
                : "memory");
}

void mmx_copy_page(void *to, void *from)
{
        if (unlikely(in_interrupt()))
                slow_copy_page(to, from);
        else
                fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);
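
/*
 * Usage sketch (illustrative): on a 32-bit build for a 3DNow!-capable CPU,
 * the arch page primitives can simply forward to these helpers.  Assumed
 * wiring, roughly what a header such as <asm/page_32.h> would do when the
 * relevant config option is enabled:
 *
 *      #include <asm/mmx.h>
 *
 *      static inline void clear_page(void *page)
 *      {
 *              mmx_clear_page(page);
 *      }
 *
 *      static inline void copy_page(void *to, void *from)
 *      {
 *              mmx_copy_page(to, from);
 *      }
 */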