/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some, including store queue overflows, it doesn't: an
	overflow causes a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.
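
	(To make the fetch-rate argument concrete: since the fetcher
	delivers at most one aligned group of 4 instructions per cycle,
	padding the loop body out to N aligned 4-instruction groups with
	unops guarantees at least N cycles per iteration, no matter how
	few of those slots hold real work.  That is what the unops in the
	loop below are for.)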

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	Without care, this results in many nasty memory traps.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.
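
	(Presumably that 25% comes from the cache geometry: EV6's 64KB
	two-way dcache gives 32KB per way, while a page is 8KB, so two of
	the index bits lie above the page offset.  Two randomly chosen
	aligned pages therefore share the same dcache indices with
	probability 1 in 4.)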

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream. Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.
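
	(A sanity check against the loop as written below: the inner loop
	is 11 aligned groups of 4 instruction slots = 44 fetch slots per
	iteration, of which 23 hold real instructions -- 8 loads, 8 stores,
	1 read prefetch, 1 wh64, 3 pointer updates, the count decrement and
	the branch -- and 21 hold nops/unops, so the fetcher needs 11
	cycles to deliver each iteration.)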

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */
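
/* For orientation only, not part of the build: a rough C-level sketch of
   the copy structure that the assembly below implements, assuming 8KB
   pages and 64-byte (8-quadword) cache lines.  The 118 + 10 = 128 line
   split matches the loop counts used below; the first loop stands in for
   the prefetching/write-hinting main loop, the second for the cleanup
   loop, and what the sketch necessarily leaves out is exactly the
   prefetch, write-hint and issue scheduling that the comments above are
   about.

	void copy_page_sketch(unsigned long *dst, const unsigned long *src)
	{
		long line, q;

		for (line = 0; line < 118; line++) {
			for (q = 0; q < 8; q++)
				dst[q] = src[q];
			dst += 8;
			src += 8;
		}

		for (line = 0; line < 10; line++) {
			for (q = 0; q < 8; q++)
				dst[q] = src[q];
			dst += 8;
			src += 8;
		}
	}
 */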

#include <asm/export.h>
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cache lines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118		/* 118 + 10 (cleanup loop) = 128 lines = one 8KB page */
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)
	nop

	/* Main prefetching/write-hinting loop.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)	/* prefetch the read stream 5 lines (320 bytes) ahead */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)		/* write-hint the destination 10 lines ahead */
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	lda	$18,10
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)