linux/drivers/video/fbdev/atafb_utils.h
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0 */
   2#ifndef _VIDEO_ATAFB_UTILS_H
   3#define _VIDEO_ATAFB_UTILS_H
   4
   5/* ================================================================= */
   6/*                      Utility Assembler Functions                  */
   7/* ================================================================= */
   8
   9/* ====================================================================== */
  10
  11/* Those of a delicate disposition might like to skip the next couple of
  12 * pages.
  13 *
  14 * These functions are drop in replacements for memmove and
  15 * memset(_, 0, _). However their five instances add at least a kilobyte
  16 * to the object file. You have been warned.
  17 *
  18 * Not a great fan of assembler for the sake of it, but I think
  19 * that these routines are at least 10 times faster than their C
  20 * equivalents for large blits, and that's important to the lowest level of
  21 * a graphics driver. Question is whether some scheme with the blitter
  22 * would be faster. I suspect not for simple text system - not much
  23 * asynchrony.
  24 *
 * The code is very simple, just a gruesome expansion. The basic strategy is
 * to increase the data moved/cleared at each step to 16 bytes to reduce the
 * instruction-per-data-move overhead; movem might be faster still.
 * For more than 15 bytes, we try to align the write direction on a
 * longword boundary to get maximum speed. This is even more gruesome.
 * The unaligned reads/writes used require a 68020+ - is this a problem?
  31 *
  32 * Sorry!
  33 */
  34
  35
  36/* ++roman: I've optimized Robert's original versions in some minor
  37 * aspects, e.g. moveq instead of movel, let gcc choose the registers,
  38 * use movem in some places...
  39 * For other modes than 1 plane, lots of more such assembler functions
  40 * were needed (e.g. the ones using movep or expanding color values).
  41 */
  42
  43/* ++andreas: more optimizations:
  44   subl #65536,d0 replaced by clrw d0; subql #1,d0 for dbcc
  45   addal is faster than addaw
  46   movep is rather expensive compared to ordinary move's
  47   some functions rewritten in C for clarity, no speed loss */
  48
  49static inline void *fb_memclear_small(void *s, size_t count)
  50{
  51        if (!count)
  52                return 0;
  53
  54        asm volatile ("\n"
  55                "       lsr.l   #1,%1 ; jcc 1f ; move.b %2,-(%0)\n"
  56                "1:     lsr.l   #1,%1 ; jcc 1f ; move.w %2,-(%0)\n"
  57                "1:     lsr.l   #1,%1 ; jcc 1f ; move.l %2,-(%0)\n"
  58                "1:     lsr.l   #1,%1 ; jcc 1f ; move.l %2,-(%0) ; move.l %2,-(%0)\n"
  59                "1:"
  60                : "=a" (s), "=d" (count)
  61                : "d" (0), "0" ((char *)s + count), "1" (count));
  62        asm volatile ("\n"
  63                "       subq.l  #1,%1\n"
  64                "       jcs     3f\n"
  65                "       move.l  %2,%%d4; move.l %2,%%d5; move.l %2,%%d6\n"
  66                "2:     movem.l %2/%%d4/%%d5/%%d6,-(%0)\n"
  67                "       dbra    %1,2b\n"
  68                "3:"
  69                : "=a" (s), "=d" (count)
  70                : "d" (0), "0" (s), "1" (count)
  71                : "d4", "d5", "d6"
  72                );
  73
  74        return 0;
  75}
  76
  77
  78static inline void *fb_memclear(void *s, size_t count)
  79{
  80        if (!count)
  81                return 0;
  82
  83        if (count < 16) {
  84                asm volatile ("\n"
  85                        "       lsr.l   #1,%1 ; jcc 1f ; clr.b (%0)+\n"
  86                        "1:     lsr.l   #1,%1 ; jcc 1f ; clr.w (%0)+\n"
  87                        "1:     lsr.l   #1,%1 ; jcc 1f ; clr.l (%0)+\n"
  88                        "1:     lsr.l   #1,%1 ; jcc 1f ; clr.l (%0)+ ; clr.l (%0)+\n"
  89                        "1:"
  90                        : "=a" (s), "=d" (count)
  91                        : "0" (s), "1" (count));
  92        } else {
  93                long tmp;
  94                asm volatile ("\n"
  95                        "       move.l  %1,%2\n"
  96                        "       lsr.l   #1,%2 ; jcc 1f ; clr.b (%0)+ ; subq.w #1,%1\n"
  97                        "       lsr.l   #1,%2 ; jcs 2f\n"  /* %0 increased=>bit 2 switched*/
  98                        "       clr.w   (%0)+  ; subq.w  #2,%1 ; jra 2f\n"
  99                        "1:     lsr.l   #1,%2 ; jcc 2f\n"
 100                        "       clr.w   (%0)+  ; subq.w  #2,%1\n"
 101                        "2:     move.w  %1,%2; lsr.l #2,%1 ; jeq 6f\n"
 102                        "       lsr.l   #1,%1 ; jcc 3f ; clr.l (%0)+\n"
 103                        "3:     lsr.l   #1,%1 ; jcc 4f ; clr.l (%0)+ ; clr.l (%0)+\n"
 104                        "4:     subq.l  #1,%1 ; jcs 6f\n"
 105                        "5:     clr.l   (%0)+; clr.l (%0)+ ; clr.l (%0)+ ; clr.l (%0)+\n"
 106                        "       dbra    %1,5b ; clr.w %1; subq.l #1,%1; jcc 5b\n"
 107                        "6:     move.w  %2,%1; btst #1,%1 ; jeq 7f ; clr.w (%0)+\n"
 108                        "7:     btst    #0,%1 ; jeq 8f ; clr.b (%0)+\n"
 109                        "8:"
 110                        : "=a" (s), "=d" (count), "=d" (tmp)
 111                        : "0" (s), "1" (count));
 112        }
 113
 114        return 0;
 115}
 116
 117
 118static inline void *fb_memset255(void *s, size_t count)
 119{
 120        if (!count)
 121                return 0;
 122
 123        asm volatile ("\n"
 124                "       lsr.l   #1,%1 ; jcc 1f ; move.b %2,-(%0)\n"
 125                "1:     lsr.l   #1,%1 ; jcc 1f ; move.w %2,-(%0)\n"
 126                "1:     lsr.l   #1,%1 ; jcc 1f ; move.l %2,-(%0)\n"
 127                "1:     lsr.l   #1,%1 ; jcc 1f ; move.l %2,-(%0) ; move.l %2,-(%0)\n"
 128                "1:"
 129                : "=a" (s), "=d" (count)
 130                : "d" (-1), "0" ((char *)s+count), "1" (count));
 131        asm volatile ("\n"
 132                "       subq.l  #1,%1 ; jcs 3f\n"
 133                "       move.l  %2,%%d4; move.l %2,%%d5; move.l %2,%%d6\n"
 134                "2:     movem.l %2/%%d4/%%d5/%%d6,-(%0)\n"
 135                "       dbra    %1,2b\n"
 136                "3:"
 137                : "=a" (s), "=d" (count)
 138                : "d" (-1), "0" (s), "1" (count)
 139                : "d4", "d5", "d6");
 140
 141        return 0;
 142}
 143
 144
 145static inline void *fb_memmove(void *d, const void *s, size_t count)
 146{
 147        if (d < s) {
 148                if (count < 16) {
 149                        asm volatile ("\n"
 150                                "       lsr.l   #1,%2 ; jcc 1f ; move.b (%1)+,(%0)+\n"
 151                                "1:     lsr.l   #1,%2 ; jcc 1f ; move.w (%1)+,(%0)+\n"
 152                                "1:     lsr.l   #1,%2 ; jcc 1f ; move.l (%1)+,(%0)+\n"
 153                                "1:     lsr.l   #1,%2 ; jcc 1f ; move.l (%1)+,(%0)+ ; move.l (%1)+,(%0)+\n"
 154                                "1:"
 155                                : "=a" (d), "=a" (s), "=d" (count)
 156                                : "0" (d), "1" (s), "2" (count));
 157                } else {
 158                        long tmp;
 159                        asm volatile ("\n"
 160                                "       move.l  %0,%3\n"
 161                                "       lsr.l   #1,%3 ; jcc 1f ; move.b (%1)+,(%0)+ ; subqw #1,%2\n"
 162                                "       lsr.l   #1,%3 ; jcs 2f\n"  /* %0 increased=>bit 2 switched*/
 163                                "       move.w  (%1)+,(%0)+  ; subqw  #2,%2 ; jra 2f\n"
 164                                "1:     lsr.l   #1,%3 ; jcc 2f\n"
 165                                "       move.w  (%1)+,(%0)+  ; subqw  #2,%2\n"
 166                                "2:     move.w  %2,%-; lsr.l #2,%2 ; jeq 6f\n"
 167                                "       lsr.l   #1,%2 ; jcc 3f ; move.l (%1)+,(%0)+\n"
 168                                "3:     lsr.l   #1,%2 ; jcc 4f ; move.l (%1)+,(%0)+ ; move.l (%1)+,(%0)+\n"
 169                                "4:     subq.l  #1,%2 ; jcs 6f\n"
 170                                "5:     move.l  (%1)+,(%0)+; move.l (%1)+,(%0)+\n"
 171                                "       move.l  (%1)+,(%0)+; move.l (%1)+,(%0)+\n"
 172                                "       dbra    %2,5b ; clr.w %2; subq.l #1,%2; jcc 5b\n"
 173                                "6:     move.w  %+,%2; btst #1,%2 ; jeq 7f ; move.w (%1)+,(%0)+\n"
 174                                "7:     btst    #0,%2 ; jeq 8f ; move.b (%1)+,(%0)+\n"
 175                                "8:"
 176                                : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp)
 177                                : "0" (d), "1" (s), "2" (count));
 178                }
 179        } else {
 180                if (count < 16) {
 181                        asm volatile ("\n"
 182                                "       lsr.l   #1,%2 ; jcc 1f ; move.b -(%1),-(%0)\n"
 183                                "1:     lsr.l   #1,%2 ; jcc 1f ; move.w -(%1),-(%0)\n"
 184                                "1:     lsr.l   #1,%2 ; jcc 1f ; move.l -(%1),-(%0)\n"
 185                                "1:     lsr.l   #1,%2 ; jcc 1f ; move.l -(%1),-(%0) ; move.l -(%1),-(%0)\n"
 186                                "1:"
 187                                : "=a" (d), "=a" (s), "=d" (count)
 188                                : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count));
 189                } else {
 190                        long tmp;
 191
 192                        asm volatile ("\n"
 193                                "       move.l  %0,%3\n"
 194                                "       lsr.l   #1,%3 ; jcc 1f ; move.b -(%1),-(%0) ; subqw #1,%2\n"
 195                                "       lsr.l   #1,%3 ; jcs 2f\n"  /* %0 increased=>bit 2 switched*/
 196                                "       move.w  -(%1),-(%0) ; subqw  #2,%2 ; jra 2f\n"
 197                                "1:     lsr.l   #1,%3 ; jcc 2f\n"
 198                                "       move.w  -(%1),-(%0) ; subqw  #2,%2\n"
 199                                "2:     move.w  %2,%-; lsr.l #2,%2 ; jeq 6f\n"
 200                                "       lsr.l   #1,%2 ; jcc 3f ; move.l -(%1),-(%0)\n"
 201                                "3:     lsr.l   #1,%2 ; jcc 4f ; move.l -(%1),-(%0) ; move.l -(%1),-(%0)\n"
 202                                "4:     subq.l  #1,%2 ; jcs 6f\n"
 203                                "5:     move.l  -(%1),-(%0); move.l -(%1),-(%0)\n"
 204                                "       move.l  -(%1),-(%0); move.l -(%1),-(%0)\n"
 205                                "       dbra    %2,5b ; clr.w %2; subq.l #1,%2; jcc 5b\n"
 206                                "6:     move.w  %+,%2; btst #1,%2 ; jeq 7f ; move.w -(%1),-(%0)\n"
 207                                "7:     btst    #0,%2 ; jeq 8f ; move.b -(%1),-(%0)\n"
 208                                "8:"
 209                                : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp)
 210                                : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count));
 211                }
 212        }
 213
 214        return 0;
 215}
 216
 217
 218/* ++andreas: Simple and fast version of memmove, assumes size is
 219   divisible by 16, suitable for moving the whole screen bitplane */
 220static inline void fast_memmove(char *dst, const char *src, size_t size)
 221{
 222        if (!size)
 223                return;
 224        if (dst < src)
 225                asm volatile ("\n"
 226                        "1:     movem.l (%0)+,%%d0/%%d1/%%a0/%%a1\n"
 227                        "       movem.l %%d0/%%d1/%%a0/%%a1,%1@\n"
 228                        "       addq.l  #8,%1; addq.l #8,%1\n"
 229                        "       dbra    %2,1b\n"
 230                        "       clr.w   %2; subq.l #1,%2\n"
 231                        "       jcc     1b"
 232                        : "=a" (src), "=a" (dst), "=d" (size)
 233                        : "0" (src), "1" (dst), "2" (size / 16 - 1)
 234                        : "d0", "d1", "a0", "a1", "memory");
 235        else
 236                asm volatile ("\n"
 237                        "1:     subq.l  #8,%0; subq.l #8,%0\n"
 238                        "       movem.l %0@,%%d0/%%d1/%%a0/%%a1\n"
 239                        "       movem.l %%d0/%%d1/%%a0/%%a1,-(%1)\n"
 240                        "       dbra    %2,1b\n"
 241                        "       clr.w   %2; subq.l #1,%2\n"
 242                        "       jcc 1b"
 243                        : "=a" (src), "=a" (dst), "=d" (size)
 244                        : "0" (src + size), "1" (dst + size), "2" (size / 16 - 1)
 245                        : "d0", "d1", "a0", "a1", "memory");
 246}
 247
 248#ifdef BPL
 249
 250/*
 251 * This expands a up to 8 bit color into two longs
 252 * for movel operations.
 253 */
 254static const u32 four2long[] = {
 255        0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff,
 256        0x00ff0000, 0x00ff00ff, 0x00ffff00, 0x00ffffff,
 257        0xff000000, 0xff0000ff, 0xff00ff00, 0xff00ffff,
 258        0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff,
 259};
 260
 261static inline void expand8_col2mask(u8 c, u32 m[])
 262{
 263        m[0] = four2long[c & 15];
 264#if BPL > 4
 265        m[1] = four2long[c >> 4];
 266#endif
 267}
 268
 269static inline void expand8_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[])
 270{
 271        fgm[0] = four2long[fg & 15] ^ (bgm[0] = four2long[bg & 15]);
 272#if BPL > 4
 273        fgm[1] = four2long[fg >> 4] ^ (bgm[1] = four2long[bg >> 4]);
 274#endif
 275}
 276
 277/*
 278 * set an 8bit value to a color
 279 */
 280static inline void fill8_col(u8 *dst, u32 m[])
 281{
 282        u32 tmp = m[0];
 283        dst[0] = tmp;
 284        dst[2] = (tmp >>= 8);
 285#if BPL > 2
 286        dst[4] = (tmp >>= 8);
 287        dst[6] = tmp >> 8;
 288#endif
 289#if BPL > 4
 290        tmp = m[1];
 291        dst[8] = tmp;
 292        dst[10] = (tmp >>= 8);
 293        dst[12] = (tmp >>= 8);
 294        dst[14] = tmp >> 8;
 295#endif
 296}
 297
 298/*
 299 * set an 8bit value according to foreground/background color
 300 */
 301static inline void fill8_2col(u8 *dst, u8 fg, u8 bg, u32 mask)
 302{
 303        u32 fgm[2], bgm[2], tmp;
 304
 305        expand8_2col2mask(fg, bg, fgm, bgm);
 306
 307        mask |= mask << 8;
 308#if BPL > 2
 309        mask |= mask << 16;
 310#endif
 311        tmp = (mask & fgm[0]) ^ bgm[0];
 312        dst[0] = tmp;
 313        dst[2] = (tmp >>= 8);
 314#if BPL > 2
 315        dst[4] = (tmp >>= 8);
 316        dst[6] = tmp >> 8;
 317#endif
 318#if BPL > 4
 319        tmp = (mask & fgm[1]) ^ bgm[1];
 320        dst[8] = tmp;
 321        dst[10] = (tmp >>= 8);
 322        dst[12] = (tmp >>= 8);
 323        dst[14] = tmp >> 8;
 324#endif
 325}
 326
 327static const u32 two2word[] = {
 328        0x00000000, 0xffff0000, 0x0000ffff, 0xffffffff
 329};
 330
 331static inline void expand16_col2mask(u8 c, u32 m[])
 332{
 333        m[0] = two2word[c & 3];
 334#if BPL > 2
 335        m[1] = two2word[(c >> 2) & 3];
 336#endif
 337#if BPL > 4
 338        m[2] = two2word[(c >> 4) & 3];
 339        m[3] = two2word[c >> 6];
 340#endif
 341}
 342
 343static inline void expand16_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[])
 344{
 345        bgm[0] = two2word[bg & 3];
 346        fgm[0] = two2word[fg & 3] ^ bgm[0];
 347#if BPL > 2
 348        bgm[1] = two2word[(bg >> 2) & 3];
 349        fgm[1] = two2word[(fg >> 2) & 3] ^ bgm[1];
 350#endif
 351#if BPL > 4
 352        bgm[2] = two2word[(bg >> 4) & 3];
 353        fgm[2] = two2word[(fg >> 4) & 3] ^ bgm[2];
 354        bgm[3] = two2word[bg >> 6];
 355        fgm[3] = two2word[fg >> 6] ^ bgm[3];
 356#endif
 357}
 358
 359static inline u32 *fill16_col(u32 *dst, int rows, u32 m[])
 360{
 361        while (rows) {
 362                *dst++ = m[0];
 363#if BPL > 2
 364                *dst++ = m[1];
 365#endif
 366#if BPL > 4
 367                *dst++ = m[2];
 368                *dst++ = m[3];
 369#endif
 370                rows--;
 371        }
 372        return dst;
 373}
 374
 375static inline void memmove32_col(void *dst, void *src, u32 mask, u32 h, u32 bytes)
 376{
 377        u32 *s, *d, v;
 378
 379        s = src;
 380        d = dst;
 381        do {
 382                v = (*s++ & mask) | (*d  & ~mask);
 383                *d++ = v;
 384#if BPL > 2
 385                v = (*s++ & mask) | (*d  & ~mask);
 386                *d++ = v;
 387#endif
 388#if BPL > 4
 389                v = (*s++ & mask) | (*d  & ~mask);
 390                *d++ = v;
 391                v = (*s++ & mask) | (*d  & ~mask);
 392                *d++ = v;
 393#endif
 394                d = (u32 *)((u8 *)d + bytes);
 395                s = (u32 *)((u8 *)s + bytes);
 396        } while (--h);
 397}
 398
 399#endif
 400
 401#endif /* _VIDEO_ATAFB_UTILS_H */
 402