linux/arch/x86/lib/mmx_32.c
/*
 *      MMX 3DNow! library helper functions
 *
 *      To do:
 *      We can use MMX just for prefetch in IRQs. This may be a win.
 *              (reported so on K6-III)
 *      We should use a better code-neutral filler for the short jump
 *              leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 *      We also want to clobber the filler register so we don't get any
 *              register forwarding stalls on the filler.
 *
 *      Add *user handling. Checksums are not a win with MMX on any CPU
 *      tested so far for any MMX solution figured.
 *
 *      22/09/2000 - Arjan van de Ven
 *              Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/i387.h>
#include <asm/asm.h>

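/*
 * _mmx_memcpy() moves data in 64-byte blocks through the MMX register
 * file.  MMX use requires kernel_fpu_begin()/kernel_fpu_end() around
 * the copy, which is avoided in interrupt context, so in_interrupt()
 * callers simply fall back to plain __memcpy().
 */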
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
        void *p;
        int i;

        if (unlikely(in_interrupt()))
                return __memcpy(to, from, len);

        p = to;
        i = len >> 6; /* len/64 */

        kernel_fpu_begin();

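        /*
         * Prefetch the first five cache lines of the source.  If a
         * prefetch faults (a problem on early engineering-sample
         * Athlons, per the header comment), the .fixup code patches
         * label 1 with a two-byte short jump (0x1AEB stored little
         * endian = "jmp" over the next 26 bytes), which skips the
         * remaining prefetch instructions from then on.
         */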
        __asm__ __volatile__ (
                "1: prefetch (%0)\n"            /* This set is 28 bytes */
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b)
                        : : "r" (from));

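        /*
         * Main loop: copy 64 bytes per iteration through mm0-mm3 while
         * prefetching five cache lines (320 bytes) ahead of the reads.
         */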
        for ( ; i > 5; i--) {
                __asm__ __volatile__ (
                "1:  prefetch 320(%0)\n"
                "2:  movq (%0), %%mm0\n"
                "  movq 8(%0), %%mm1\n"
                "  movq 16(%0), %%mm2\n"
                "  movq 24(%0), %%mm3\n"
                "  movq %%mm0, (%1)\n"
                "  movq %%mm1, 8(%1)\n"
                "  movq %%mm2, 16(%1)\n"
                "  movq %%mm3, 24(%1)\n"
                "  movq 32(%0), %%mm0\n"
                "  movq 40(%0), %%mm1\n"
                "  movq 48(%0), %%mm2\n"
                "  movq 56(%0), %%mm3\n"
                "  movq %%mm0, 32(%1)\n"
                "  movq %%mm1, 40(%1)\n"
                "  movq %%mm2, 48(%1)\n"
                "  movq %%mm3, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b)
                        : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

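        /*
         * Copy the last (at most five) 64-byte blocks without the
         * 320-byte-ahead prefetch, so we never prefetch past the end
         * of the source buffer.
         */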
        for ( ; i > 0; i--) {
                __asm__ __volatile__ (
                "  movq (%0), %%mm0\n"
                "  movq 8(%0), %%mm1\n"
                "  movq 16(%0), %%mm2\n"
                "  movq 24(%0), %%mm3\n"
                "  movq %%mm0, (%1)\n"
                "  movq %%mm1, 8(%1)\n"
                "  movq %%mm2, 16(%1)\n"
                "  movq %%mm3, 24(%1)\n"
                "  movq 32(%0), %%mm0\n"
                "  movq 40(%0), %%mm1\n"
                "  movq 48(%0), %%mm2\n"
                "  movq 56(%0), %%mm3\n"
                "  movq %%mm0, 32(%1)\n"
                "  movq %%mm1, 40(%1)\n"
                "  movq %%mm2, 48(%1)\n"
                "  movq %%mm3, 56(%1)\n"
                        : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }
        /*
         * Now do the tail of the block:
         */
        __memcpy(to, from, len & 63);
        kernel_fpu_end();

        return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

#ifdef CONFIG_MK7

/*
 *      The K7 has streaming cache-bypass load/store. The Cyrix III, K6 and
 *      other MMX-using processors do not.
 */

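/*
 * Clear a 4096-byte page with non-temporal movntq stores so the zeroed
 * page bypasses the cache instead of evicting useful data.
 */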
static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "  movntq %%mm0, (%0)\n"
                "  movntq %%mm0, 8(%0)\n"
                "  movntq %%mm0, 16(%0)\n"
                "  movntq %%mm0, 24(%0)\n"
                "  movntq %%mm0, 32(%0)\n"
                "  movntq %%mm0, 40(%0)\n"
                "  movntq %%mm0, 48(%0)\n"
                "  movntq %%mm0, 56(%0)\n"
                : : "r" (page) : "memory");
                page += 64;
        }

        /*
         * Since movntq stores are weakly ordered, an sfence is needed to
         * make them ordered again:
         */
        __asm__ __volatile__("sfence\n"::);

        kernel_fpu_end();
}

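/*
 * Copy a 4096-byte page: stream the source in with prefetch + movq and
 * write the destination with non-temporal movntq stores, keeping the
 * destination page out of the cache.
 */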
static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin();

        /*
         * maybe the prefetch stuff can go before the expensive fnsave...
         * but that is for later. -AV
         */
        __asm__ __volatile__(
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b) : : "r" (from));

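        /*
         * Copy all but the last 320 bytes with a 320-byte-ahead
         * prefetch; the tail loop below runs without prefetch so we
         * never touch memory past the end of the source page.
         */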
        for (i = 0; i < (4096-320)/64; i++) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movntq %%mm0, (%1)\n"
                "   movq 8(%0), %%mm1\n"
                "   movntq %%mm1, 8(%1)\n"
                "   movq 16(%0), %%mm2\n"
                "   movntq %%mm2, 16(%1)\n"
                "   movq 24(%0), %%mm3\n"
                "   movntq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm4\n"
                "   movntq %%mm4, 32(%1)\n"
                "   movq 40(%0), %%mm5\n"
                "   movntq %%mm5, 40(%1)\n"
                "   movq 48(%0), %%mm6\n"
                "   movntq %%mm6, 48(%1)\n"
                "   movq 56(%0), %%mm7\n"
                "   movntq %%mm7, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        for (i = (4096-320)/64; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "2: movq (%0), %%mm0\n"
                "   movntq %%mm0, (%1)\n"
                "   movq 8(%0), %%mm1\n"
                "   movntq %%mm1, 8(%1)\n"
                "   movq 16(%0), %%mm2\n"
                "   movntq %%mm2, 16(%1)\n"
                "   movq 24(%0), %%mm3\n"
                "   movntq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm4\n"
                "   movntq %%mm4, 32(%1)\n"
                "   movq 40(%0), %%mm5\n"
                "   movntq %%mm5, 40(%1)\n"
                "   movq 48(%0), %%mm6\n"
                "   movntq %%mm6, 48(%1)\n"
                "   movq 56(%0), %%mm7\n"
                "   movntq %%mm7, 56(%1)\n"
                        : : "r" (from), "r" (to) : "memory");
                from += 64;
                to += 64;
        }
        /*
         * Since movntq stores are weakly ordered, an sfence is needed to
         * make them ordered again:
         */
        __asm__ __volatile__("sfence\n"::);
        kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 *      Generic MMX implementation without K7-specific streaming
 */
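/*
 * Clear a 4096-byte page with ordinary movq stores, 128 bytes per loop
 * iteration; no sfence is needed here since these are not weakly-ordered
 * non-temporal stores.
 */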
static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

        for (i = 0; i < 4096/128; i++) {
                __asm__ __volatile__ (
                "  movq %%mm0, (%0)\n"
                "  movq %%mm0, 8(%0)\n"
                "  movq %%mm0, 16(%0)\n"
                "  movq %%mm0, 24(%0)\n"
                "  movq %%mm0, 32(%0)\n"
                "  movq %%mm0, 40(%0)\n"
                "  movq %%mm0, 48(%0)\n"
                "  movq %%mm0, 56(%0)\n"
                "  movq %%mm0, 64(%0)\n"
                "  movq %%mm0, 72(%0)\n"
                "  movq %%mm0, 80(%0)\n"
                "  movq %%mm0, 88(%0)\n"
                "  movq %%mm0, 96(%0)\n"
                "  movq %%mm0, 104(%0)\n"
                "  movq %%mm0, 112(%0)\n"
                "  movq %%mm0, 120(%0)\n"
                        : : "r" (page) : "memory");
                page += 128;
        }

        kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b) : : "r" (from));

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b)
                        : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }
        kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy; fall back to the rep-string
 * versions below when called from interrupt context:
 */
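/*
 * Interrupt-safe fallback: clear the page with "rep stosl"
 * (ECX = 1024 dwords = 4096 bytes); no FPU state is touched.
 */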
static void slow_zero_page(void *page)
{
        int d0, d1;

        __asm__ __volatile__(
                "cld\n\t"
                "rep ; stosl"

                        : "=&c" (d0), "=&D" (d1)
                        :"a" (0), "1" (page), "0" (1024)
                        :"memory");
}

void mmx_clear_page(void *page)
{
        if (unlikely(in_interrupt()))
                slow_zero_page(page);
        else
                fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

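/*
 * Interrupt-safe fallback: copy the page with "rep movsl"
 * (ECX = 1024 dwords = 4096 bytes).
 */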
static void slow_copy_page(void *to, void *from)
{
        int d0, d1, d2;

        __asm__ __volatile__(
                "cld\n\t"
                "rep ; movsl"
                : "=&c" (d0), "=&D" (d1), "=&S" (d2)
                : "0" (1024), "1" ((long) to), "2" ((long) from)
                : "memory");
}

void mmx_copy_page(void *to, void *from)
{
        if (unlikely(in_interrupt()))
                slow_copy_page(to, from);
        else
                fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);