linux/arch/sh/lib64/copy_page.S
<<
>>
Prefs
   1/*
   2   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
   3
   4   This file is subject to the terms and conditions of the GNU General Public
   5   License.  See the file "COPYING" in the main directory of this archive
   6   for more details.
   7
   8   Tight version of memcpy for the case of just copying a page.
   9   Prefetch strategy empirically optimised against RTL simulations
  10   of SH5-101 cut2 eval chip with Cayman board DDR memory.
  11
  12   Parameters:
  13   r2 : destination effective address (start of page)
  14   r3 : source effective address (start of page)
  15
  16   Always copies 4096 bytes.
  17
  18   Points to review.
  19   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
  20     It seems like the prefetch needs to be at least 4 lines ahead to get
  21     the data into the cache in time, and the allocos contend with outstanding
  22     prefetches for the same cache set, so it's better to have the numbers
  23     different.
  24   */
  25
  26        .section .text..SHmedia32,"ax"
  27        .little
  28
  29        .balign 8
  30        .global copy_page
  31copy_page:
  32
  33        /* Copy 4096 bytes from the page at r3 to the page at r2, one 32-byte
  34           line (four quadwords) per loop pass.  The source prefetch (4 lines
  35           ahead) is compiled out under "#if 0"; alloco runs 2 lines ahead.
  36           Writes r2, r6-r8, r22, r23, r36-r39, r60-r62 and tr0-tr3. */
  37        pta 1f, tr1        ! tr1 = label 1, the loop head
  38        pta 2f, tr2        ! tr2 = label 2, only branched to by the disabled prefetch code
  39        pta 3f, tr3        ! tr3 = label 3, the copy body (target when alloco is skipped)
  40        ptabs r18, tr0     ! tr0 = return target taken from r18 (used by the final blink)
  41
  42#if 0
  43        /* TAKum03020 */
  44        ld.q r3, 0x00, r63
  45        ld.q r3, 0x20, r63
  46        ld.q r3, 0x40, r63
  47        ld.q r3, 0x60, r63
  48#endif
  49        alloco r2, 0x00    ! alloc destination line 0 before entering the loop
  50        synco           ! TAKum03020
  51        alloco r2, 0x20    ! alloc destination line 1 (the loop allocs from 0x40 onwards)
  52        synco           ! TAKum03020
  53
  54        movi 3968, r6      ! 3968 = 4096 - 128, i.e. 4 lines of 32 bytes before page end
  55        add  r2, r6, r6    ! r6 = dest + 3968, prefetch cutoff (only read by disabled code)
  56        addi r6, 64, r7    ! r7 = dest + 4032, alloco cutoff 2 lines before page end
  57        addi r7, 64, r8    ! r8 = dest + 4096, end of the destination page
  58        sub r3, r2, r60    ! r60 = src - dest, so r2 + r60 addresses the matching source byte
  59        addi r60, 8, r61   ! r61 = r60 + 8, source offset of quadword 1 in the line
  60        addi r61, 8, r62   ! r62 = r60 + 16, source offset of quadword 2
  61        addi r62, 8, r23   ! r23 = r60 + 24, source offset of quadword 3
  62        addi r60, 0x80, r22 ! r22 = r60 + 128, source 4 lines ahead (only read by disabled code)
  63
  64/* Minimal code size.  The extra branches inside the loop don't cost much
  65   because they overlap with the time spent waiting for prefetches to
  66   complete. */
  671:
  68#if 0
  69        /* TAKum03020 */
  70        bge/u r2, r6, tr2  ! skip prefetch for last 4 lines
  71        ldx.q r2, r22, r63 ! prefetch 4 lines hence
  72#endif
  732:
  74        bge/u r2, r7, tr3  ! skip alloco for last 2 lines
  75        alloco r2, 0x40    ! alloc destination line 2 lines ahead
  76        synco           ! TAKum03020
  773:
  78        ldx.q r2, r60, r36 ! r36..r39 = the 32 source bytes matching the line at r2
  79        ldx.q r2, r61, r37
  80        ldx.q r2, r62, r38
  81        ldx.q r2, r23, r39
  82        st.q  r2,   0, r36 ! store the four quadwords to the destination line at r2
  83        st.q  r2,   8, r37
  84        st.q  r2,  16, r38
  85        st.q  r2,  24, r39
  86        addi r2, 32, r2    ! advance the destination pointer by one 32-byte line
  87        bgt/l r8, r2, tr1  ! loop while the page end in r8 is still above r2
  88
  89        blink tr0, r63     ! return
  90