linux/arch/alpha/lib/ev6-copy_page.S
<<
>>
Prefs
   1/*
   2 * arch/alpha/lib/ev6-copy_page.S
   3 *
   4 * Copy an entire page.
   5 */
   6
   7/* The following comparison of this routine vs the normal copy_page.S
   8   was written by an unnamed ev6 hardware designer and forwarded to me
   9   via Steven Hobbs <hobbs@steven.zko.dec.com>.
  10 
  11   First Problem: STQ overflows.
  12   -----------------------------
  13
  14        It would be nice if EV6 handled every resource overflow efficiently,
  15        but for some it doesn't, including store queue overflows.  Such an
  16        overflow causes a trap and a restart of the pipe.
  17
  18        To get around this we sometimes use (to borrow a term from a VSSAD
  19        researcher) "aeration".  The idea is to slow the rate at which the
  20        processor receives valid instructions by inserting nops in the fetch
  21        path.  In doing so, you can prevent the overflow and actually make
  22        the code run faster.  You can, of course, take advantage of the fact
  23        that the processor can fetch at most 4 aligned instructions per cycle.
  24
  25        I inserted enough nops to force it to take 10 cycles to fetch the
  26        loop code.  In theory, EV6 should be able to execute this loop in
  27        9 cycles but I was not able to get it to run that fast -- the initial
  28        conditions were such that I could not reach this optimum rate on
  29        (chaotic) EV6.  I wrote the code such that everything would issue
  30        in order. 
  31
  32   Second Problem: Dcache index matches.
  33   -------------------------------------
  34
  35        If you are going to use this routine on random aligned pages, there
  36        is a 25% chance that the pages will be at the same dcache indices.
  37        Without care, this results in many nasty memory traps.
  38
  39        The solution is to schedule the prefetches to avoid the memory
  40        conflicts.  I schedule the wh64 prefetches farther ahead of the
  41        read prefetches to avoid this problem.
  42
  43   Third Problem: Needs more prefetching.
  44   --------------------------------------
  45
  46        In order to improve the code I added deeper prefetching to take the
  47        most advantage of EV6's bandwidth.
  48
  49        I also prefetched the read stream. Note that adding the read prefetch
  50        forced me to add another cycle to the inner-most kernel - up to 11
  51        from the original 8 cycles per iteration.  We could improve performance
  52        further by unrolling the loop and doing multiple prefetches per cycle.
  53
  54   I think that the code below will be very robust and fast code for the
  55   purposes of copying aligned pages.  It is slower when both source and
  56   destination pages are in the dcache, but it is my guess that this is
  57   less important than the dcache miss case.  */
  58
  59
  60        .text                   /* copy_page: copies 128 lines of 64 bytes each.  */
  61        .align 4                /* Register roles, as used below:                 */
  62        .global copy_page       /*   $16 = store address, $17 = load address      */
  63        .ent copy_page          /*   $18 = loop counter,  $19 = wh64 address      */
  64copy_page:                      /*   $0-$7 = data in flight; $1/$2 double as      */
  65        .prologue 0             /*   scratch wh64 addresses before the main loop. */
  66        /* None of the registers written here are saved or restored.  */
  67        /* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
  68        wh64    ($16)           /* write-hint line 0 of the store stream */
  69        ldl     $31,0($17)      /* load into $31: used purely as a read prefetch */
  70        ldl     $31,64($17)
  71        lda     $1,1*64($16)    /* $1 = $16 + 64, the next line to write-hint */
  72
  73        wh64    ($1)
  74        ldl     $31,128($17)
  75        ldl     $31,192($17)
  76        lda     $1,2*64($16)
  77
  78        wh64    ($1)
  79        ldl     $31,256($17)    /* fifth and last read prefetch of the preamble */
  80        lda     $18,118         /* main loop count; 118 + 10 cleanup = 128 lines */
  81        lda     $1,3*64($16)
  82
  83        wh64    ($1)
  84        nop                     /* padding slot: the group stays four instructions */
  85        lda     $1,4*64($16)
  86        lda     $2,5*64($16)
  87
  88        wh64    ($1)
  89        wh64    ($2)
  90        lda     $1,6*64($16)
  91        lda     $2,7*64($16)
  92
  93        wh64    ($1)
  94        wh64    ($2)
  95        lda     $1,8*64($16)
  96        lda     $2,9*64($16)
  97
  98        wh64    ($1)
  99        wh64    ($2)            /* tenth write-hint: lines 0..9 are now hinted */
 100        lda     $19,10*64($16)  /* $19 = first line not yet write-hinted */
 101        nop                     /* padding slot */
 102        /* Each iteration below is 11 groups of four instructions; the unop slots are the aeration described in the header comment.  */
 103        /* Main prefetching/write-hinting loop.  */
 1041:      ldq     $0,0($17)       /* load the 8 quadwords of the current line into $0-$7 */
 105        ldq     $1,8($17)
 106        unop
 107        unop
 108
 109        unop
 110        unop
 111        ldq     $2,16($17)
 112        ldq     $3,24($17)
 113
 114        ldq     $4,32($17)
 115        ldq     $5,40($17)
 116        unop
 117        unop
 118
 119        unop
 120        unop
 121        ldq     $6,48($17)
 122        ldq     $7,56($17)
 123
 124        ldl     $31,320($17)    /* read prefetch, 5 lines ahead of the loads above */
 125        unop
 126        unop
 127        unop
 128
 129        /* This gives the extra cycle of aeration above the minimum.  */
 130        unop                    
 131        unop
 132        unop
 133        unop
 134
 135        wh64    ($19)           /* write-hint, 10 lines ahead of the stores below */
 136        unop
 137        unop
 138        unop
 139
 140        stq     $0,0($16)       /* store the line; pointer/counter updates fill the spare slots */
 141        subq    $18,1,$18       /* one fewer line left for this loop */
 142        stq     $1,8($16)
 143        unop
 144
 145        unop
 146        stq     $2,16($16)
 147        addq    $17,64,$17      /* advance load address to the next line */
 148        stq     $3,24($16)
 149
 150        stq     $4,32($16)
 151        stq     $5,40($16)
 152        addq    $19,64,$19      /* advance write-hint address to the next line */
 153        unop
 154
 155        stq     $6,48($16)
 156        stq     $7,56($16)
 157        addq    $16,64,$16      /* advance store address to the next line */
 158        bne     $18, 1b         /* repeat until all 118 lines are copied */
 159
 160        /* Prefetch the final 5 cache lines of the read stream.  */
 161        lda     $18,10          /* cleanup loop count: the 10 lines still to copy */
 162        ldl     $31,320($17)    /* $17 is at line 118 here, so 320..576 reach lines 123..127 */
 163        ldl     $31,384($17)
 164        ldl     $31,448($17)
 165
 166        ldl     $31,512($17)
 167        ldl     $31,576($17)
 168        nop                     /* padding slots */
 169        nop
 170
 171        /* Non-prefetching, non-write-hinting cleanup loop for the
 172           final 10 cache lines.  */
 1732:      ldq     $0,0($17)       /* same 64-byte load/store pattern, without the unop padding */
 174        ldq     $1,8($17)
 175        ldq     $2,16($17)
 176        ldq     $3,24($17)
 177
 178        ldq     $4,32($17)
 179        ldq     $5,40($17)
 180        ldq     $6,48($17)
 181        ldq     $7,56($17)
 182
 183        stq     $0,0($16)
 184        subq    $18,1,$18       /* one fewer line left for this loop */
 185        stq     $1,8($16)
 186        addq    $17,64,$17      /* advance load address to the next line */
 187
 188        stq     $2,16($16)
 189        stq     $3,24($16)
 190        stq     $4,32($16)
 191        stq     $5,40($16)
 192
 193        stq     $6,48($16)
 194        stq     $7,56($16)
 195        addq    $16,64,$16      /* advance store address to the next line */
 196        bne     $18, 2b         /* repeat until the last 10 lines are copied */
 197
 198        ret
 199        nop                     /* three no-ops fill out the group of four that ret starts */
 200        unop
 201        nop
 202
 203        .end copy_page
 204