LXR linux/arch/alpha/lib/ev6-strncpy_from

   1/*
   2 * arch/alpha/lib/ev6-strncpy_from_user.S
   3 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
   4 *
   5 * Just like strncpy except in the return value:
   6 *
   7 * -EFAULT       if an exception occurs before the terminator is copied.
   8 * N             if the buffer filled.
   9 *
  10 * Otherwise the length of the string is returned.
  11 *
  12 * Much of the information about 21264 scheduling/coding comes from:
  13 *      Compiler Writer's Guide for the Alpha 21264
  14 *      abbreviated as 'CWG' in other comments here
  15 *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  16 * Scheduling notation:
  17 *      E       - either cluster
  18 *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  19 *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  20 * A bunch of instructions got moved and temp registers were changed
  21 * to aid in scheduling.  Control flow was also re-arranged to eliminate
  22 * branches, and to provide longer code sequences to enable better scheduling.
  23 * A total rewrite (using byte load/stores for start & tail sequences)
  24 * is desirable, but very difficult to do without a from-scratch rewrite.
  25 * Save that for the future.
  26 */
  27
  28
  29#include <asm/errno.h>
  30#include <asm/regdef.h>
  31
  32
   33/* Allow an exception for an insn; exit if we get one.
      *
      * EX(insn) emits insn at local label 99 and adds a record for it to the
      * __ex_table section (flags "a", so the record is data, not code that
      * runs).  The record is a .long holding the self-relative offset of the
      * insn (99b - .) followed by an lda word whose displacement field is
      * $exception-99b and whose register fields are $31 and $0.
      * NOTE(review): presumably the fault fixup code decodes that lda to find
      * the resume address ($exception) and the register that receives the
      * error code ($0, i.e. v0); the handler is not in this file, so confirm
      * against the Alpha exception-table code.  */
   34#define EX(x,y...)                      \
   35        99: x,##y;                      \
   36        .section __ex_table,"a";        \
   37        .long 99b - .;                  \
   38        lda $31, $exception-99b($0);    \
   39        .previous
  40
  41
   42        .set noat
   43        .set noreorder
   44        .text
   45
             /*
              * __strncpy_from_user: copy a null-terminated string of at most
              * a2 bytes from a1 to a0.  Every load from the source is wrapped
              * in EX(); loads and stores on the destination are not.
              *
              * In:   a0 = destination, a1 = source, a2 = maximum byte count
              * Out:  v0 = string length (terminator not counted), or the
              *       count if the buffer filled first, or 0 when a2 is 0.
              *       The file header says -EFAULT is returned on a fault; no
              *       insn here produces that value.  NOTE(review): presumably
              *       the fault fixup stores it in v0 before resuming at
              *       $exception; confirm against the fixup code.
              *
              * Register roles established here and relied on by later code:
              *   v0  = original destination pointer (length = end - v0)
              *   a2  = whole destination quadwords before the one holding
              *         the last permitted byte, i.e.
              *         (count - 1 + dest misalignment) / 8
              *   t10 = single bit, in cmpbge bit order, marking the last
              *         permitted byte within that final quadword
              *   t8  = cmpbge result: bit i set iff byte i is zero
              */
   46        .globl __strncpy_from_user
   47        .ent __strncpy_from_user
   48        .frame $30, 0, $26
   49        .prologue 0
   50
   51        .align 4
   52__strncpy_from_user:
   53        and     a0, 7, t3       # E : find dest misalignment
   54        beq     a2, $zerolength # U : count == 0: return 0
   55
   56        /* Are source and destination co-aligned?  */
   57        mov     a0, v0          # E : save the string start
   58        xor     a0, a1, t4      # E : low 3 bits are nonzero iff not co-aligned
   59        EX( ldq_u t1, 0(a1) )   # L : Latency=3 load first quadword
   60        ldq_u   t0, 0(a0)       # L : load first (partial) aligned dest quadword
   61
   62        addq    a2, t3, a2      # E : bias count by dest misalignment
   63        subq    a2, 1, a3       # E : a3 = biased count - 1 (offset of last permitted byte)
   64        addq    zero, 1, t10    # E : t10 = 1, shifted into place below
   65        and     t4, 7, t4       # E : misalignment between the two
   66
   67        and     a3, 7, t6       # E : number of tail bytes
   68        sll     t10, t6, t10    # E : t10 = bitmask of last count byte
   69        bne     t4, $unaligned  # U : src and dst offsets differ
   70        lda     t2, -1          # E : build a mask against false zero
   71
   72        /*
   73         * We are co-aligned; take care of a partial first word.
   74         * On entry to this basic block:
   75         * t0 == the first destination word for masking back in
   76         * t1 == the first source word.
   77         */
   78
   79        srl     a3, 3, a2       # E : a2 = loop counter = (count - 1)/8
   80        addq    a1, 8, a1       # E : advance src; its low 3 bits are unchanged
   81        mskqh   t2, a1, t2      # U :   detection in the src word
   82        nop
   83
   84        /* Create the 1st output word and detect 0's in the 1st input word.  */
   85        mskqh   t1, a1, t3      # U : t3 = src bytes at and above the start offset
   86        mskql   t0, a1, t0      # U : assemble the first output word
   87        ornot   t1, t2, t2      # E : force the bytes before the string to 0xff
   88        nop
   89
   90        cmpbge  zero, t2, t8    # E : bits set iff null found
   91        or      t0, t3, t0      # E : t0 = old dst low bytes | src high bytes
   92        beq     a2, $a_eoc      # U : count ends inside this first word
   93        bne     t8, $a_eos      # U : 2nd branch in a quad.  Bad.
  94
   95        /* Co-aligned main loop.  On entry to this basic block:
   96         * t0 == a source quad not containing a null.
   97         * a0 - current aligned destination address
   98         * a1 - current aligned source address
   99         * a2 - count of quadwords to move.
  100         * NOTE: Loop improvement - unrolling this is going to be
  101         *      a huge win, since we're going to stall otherwise.
  102         *      Fix this later.  For _really_ large copies, look
  103         *      at using wh64 on a look-ahead basis.  See the code
  104         *      in clear_user.S and copy_user.S.
  105         * (a0) and (a1) presumably do not overlap (by C definition).
  106         * Lots of nops here:
  107         *      - Separate loads from stores
  108         *      - Keep it to 1 branch/quadpack so the branch predictor
  109         *        can train.
  110         */
  111$a_loop:
  112        stq_u   t0, 0(a0)       # L : store the previous full word
  113        addq    a0, 8, a0       # E : advance dst
  114        nop
  115        subq    a2, 1, a2       # E : one fewer quadword before end-of-count
  116
  117        EX( ldq_u t0, 0(a1) )   # L : next src word (covered by the exception table)
  118        addq    a1, 8, a1       # E : advance src
  119        cmpbge  zero, t0, t8    # E : Stall 2 cycles on t0
  120        beq     a2, $a_eoc      # U : this word holds the last permitted byte
  121
  122        beq     t8, $a_loop     # U : no null yet, keep copying
  123        nop
  124        nop
  125        nop
  126
  127        /* Take care of the final (partial) word store.  At this point
  128         * the end-of-count bit is set in t8 iff it applies.
  129         *
  130         * On entry to this basic block we have:
  131         * t0 == the source word containing the null
  132         * t8 == the cmpbge mask that found it.
  133         */
  134$a_eos:
  135        negq    t8, t12         # E : find low bit set
  136        and     t8, t12, t12    # E : t12 = lowest set bit of t8
  137
  138        /* We're doing a partial word store and so need to combine
  139           our source and original destination words.  */
  140        ldq_u   t1, 0(a0)       # L : original dst word
  141        subq    t12, 1, t6      # E : t6 = all bits below t12
  142
  143        or      t12, t6, t8     # E : t8 = bytemask up to and including t12
  144        zapnot  t0, t8, t0      # U : clear src bytes > null
  145        zap     t1, t8, t1      # U : clear dst bytes <= null
  146        or      t0, t1, t0      # E : merge the two
  147
  148        stq_u   t0, 0(a0)       # L : store the final word
  149        br      $finish_up      # L0 : go compute the return value
  150        nop
  151        nop
  152
  153        /* Add the end-of-count bit to the eos detection bitmask.  */
  154        .align 4
  155$a_eoc:
  156        or      t10, t8, t8     # treat the last permitted byte like a null
  157        br      $a_eos          # then do the partial store
  158        nop
  159        nop
 160
 161
  162/* The source and destination are not co-aligned.  Align the destination
  163   and cope.  We have to be very careful about not reading too much and
  164   causing a SEGV.  */
  165
  166        .align 4
  167$u_head:
  168        /* We know just enough now to be able to assemble the first
  169           full source word.  We can still find a zero at the end of it
  170           that prevents us from outputting the whole thing.
  171
  172           On entry to this basic block:
  173           t0 == the first dest word, unmasked
  174           t1 == the shifted low bits of the first source word
  175           t6 == bytemask that is -1 in dest word bytes */
  176
  177        EX( ldq_u t2, 8(a1) )   # L : load second src word
  178        addq    a1, 8, a1       # E : a1 now addresses that word
  179        mskql   t0, a0, t0      # U : mask trailing garbage in dst
  180        extqh   t2, a1, t4      # U : shift its low bytes up to join t1
  181
  182        or      t1, t4, t1      # E : first aligned src word complete
  183        mskqh   t1, a0, t1      # U : mask leading garbage in src
  184        or      t0, t1, t0      # E : first output word complete
  185        or      t0, t6, t6      # E : mask original data for zero test
  186
  187        cmpbge  zero, t6, t8    # E : t8 = zero bytes of t6 (output word | dst bytemask)
  188        beq     a2, $u_eocfin   # U : count ends inside this first word
  189        bne     t8, $u_final    # U : bad news - 2nd branch in a quad
  190        lda     t6, -1          # E : mask out the bits we have
  191
  192        mskql   t6, a1, t6      # U :   already seen
  193        stq_u   t0, 0(a0)       # L : store first output word
  194        or      t6, t2, t2      # E : consumed bytes can no longer look null
  195        cmpbge  zero, t2, t8    # E : find nulls in second partial
  196
  197        addq    a0, 8, a0               # E : advance dst
  198        subq    a2, 1, a2               # E : one output word done
  199        bne     t8, $u_late_head_exit   # U : null in the rest of the second word
  200        nop
  201
  202        /* Finally, we've got all the stupid leading edge cases taken care
  203           of and we can set up to enter the main loop.  */
  204
  205        extql   t2, a1, t1      # U : position hi-bits of lo word
  206        EX( ldq_u t2, 8(a1) )   # L : read next high-order source word
  207        addq    a1, 8, a1       # E : advance src
  208        cmpbge  zero, t2, t8    # E : any null in the new word?
  209
  210        beq     a2, $u_eoc      # U : the next output word is the last permitted
  211        bne     t8, $u_eos      # U : terminator seen in the new word
  212        nop
  213        nop
 214
  215        /* Unaligned copy main loop.  In order to avoid reading too much,
  216           the loop is structured to detect zeros in aligned source words.
  217           This has, unfortunately, effectively pulled half of a loop
  218           iteration out into the head and half into the tail, but it does
  219           prevent nastiness from accumulating in the very thing we want
  220           to run as fast as possible.
  221
  222           On entry to this basic block:
  223           t1 == the shifted high-order bits from the previous source word
  224           t2 == the unshifted current source word
  225
  226           We further know that t2 does not contain a null terminator.  */
  227
  228        /*
  229         * Extra nops here:
  230         *      separate load quads from store quads
  231         *      only one branch/quad to permit predictor training
  232         */
  233
  234        .align 4
  235$u_loop:
  236        extqh   t2, a1, t0      # U : extract high bits for current word
  237        addq    a1, 8, a1       # E : advance src
  238        extql   t2, a1, t3      # U : extract low bits for next time
  239        addq    a0, 8, a0       # E : advance dst (the store below uses -8)
  240
  241        or      t0, t1, t0      # E : current dst word now complete
  242        EX( ldq_u t2, 0(a1) )   # L : load high word for next time
  243        subq    a2, 1, a2       # E : one fewer word before end-of-count
  244        nop
  245
  246        stq_u   t0, -8(a0)      # L : save the current word
  247        mov     t3, t1          # E : carry the low bits into the next pass
  248        cmpbge  zero, t2, t8    # E : test new word for eos
  249        beq     a2, $u_eoc      # U : count runs out in the next output word
  250
  251        beq     t8, $u_loop     # U : no null in the new word, keep going
  252        nop
  253        nop
  254        nop
 255
  256        /* We've found a zero somewhere in the source word we just read.
  257           If it resides in the lower half, we have one (probably partial)
  258           word to write out, and if it resides in the upper half, we
  259           have one full and one partial word left to write out.
  260
  261           On entry to this basic block:
  262           t1 == the shifted high-order bits from the previous source word
  263           t2 == the unshifted current source word.  */
  264        .align 4
  265$u_eos:
  266        extqh   t2, a1, t0      # U : high bits for the current output word
  267        or      t0, t1, t0      # E : first (partial) source word complete
  268        cmpbge  zero, t0, t8    # E : is the null in this first bit?
  269        nop
  270
  271        bne     t8, $u_final    # U : yes, one partial word finishes it
  272        stq_u   t0, 0(a0)       # L : the null was in the high-order bits
  273        addq    a0, 8, a0       # E : advance dst
  274        subq    a2, 1, a2       # E : account for the word just stored
  275
  276        .align 4
  277$u_late_head_exit:
  278        extql   t2, a1, t0      # U : t0 = leftover bytes of the last src word
  279        cmpbge  zero, t0, t8    # E : find the null in them
  280        or      t8, t10, t6     # E : same mask plus the end-of-count bit
  281        cmoveq  a2, t6, t8      # E : use that if the count ends in this word
  282
  283        /* Take care of a final (probably partial) result word.
  284           On entry to this basic block:
  285           t0 == assembled source word
  286           t8 == cmpbge mask that found the null.  */
  287        .align 4
  288$u_final:
  289        negq    t8, t6          # E : isolate low bit set
  290        and     t6, t8, t12     # E : t12 = lowest set bit of t8
  291        ldq_u   t1, 0(a0)       # L : original dst word
  292        subq    t12, 1, t6      # E : t6 = all bits below t12
  293
  294        or      t6, t12, t8     # E : t8 = bytemask up to and including t12
  295        zapnot  t0, t8, t0      # U : kill source bytes > null
  296        zap     t1, t8, t1      # U : kill dest bytes <= null
  297        or      t0, t1, t0      # E : merge the two
  298
  299        stq_u   t0, 0(a0)       # L : store the final word
  300        br      $finish_up      # L0 : go compute the return value
  301        nop
  302        nop
  303
  304        .align 4
  305$u_eoc:                         # end-of-count
  306        extqh   t2, a1, t0      # U : assemble the last permitted output word
  307        or      t0, t1, t0      # E :
  308        cmpbge  zero, t0, t8    # E : find any null inside it
  309        nop
  310
  311        .align 4
  312$u_eocfin:                      # end-of-count, final word
  313        or      t10, t8, t8     # E : treat the last permitted byte like a null
  314        br      $u_final        # L0 : then do the partial store
  315        nop
  316        nop
 317
  318        /* Unaligned copy entry point: src and dst offsets within a quadword differ.  */
  319        .align 4
  320$unaligned:
  321
  322        srl     a3, 3, a2       # U : a2 = loop counter = (count - 1)/8
  323        and     a0, 7, t4       # E : find dest misalignment
  324        and     a1, 7, t5       # E : find src misalignment
  325        mov     zero, t0        # E : default for an aligned dst: no old dst bytes
  326
  327        /* Conditionally load the first destination word and a bytemask
  328           with 0xff indicating that the destination byte is sacrosanct.  */
  329
  330        mov     zero, t6        # E : default for an aligned dst: nothing to protect
  331        beq     t4, 1f          # U : dst is aligned, skip the load
  332        ldq_u   t0, 0(a0)       # L : first dst word
  333        lda     t6, -1          # E :
  334
  335        mskql   t6, a0, t6      # U : t6 = 0xff in the dst bytes below the dst offset
  336        nop
  337        nop
  338        nop
  339
  340        .align 4
  3411:
  342        subq    a1, t4, a1      # E : sub dest misalignment from src addr
  343        /* If source misalignment is larger than dest misalignment, we need
  344           extra startup checks to avoid SEGV.  */
  345        cmplt   t4, t5, t12     # E : t12 = (dest misalignment < src misalignment)
  346        extql   t1, a1, t1      # U : shift src into place
  347        lda     t2, -1          # E : for creating masks later
  348
  349        beq     t12, $u_head    # U : not larger: no extra checks needed
  350        mskqh   t2, t5, t2      # U : begin src byte validity mask
  351        cmpbge  zero, t1, t8    # E : is there a zero?
  352        nop
  353
  354        extql   t2, a1, t2      # U : shift the validity mask the same way as the data
  355        or      t8, t10, t5     # E : test for end-of-count too
  356        cmpbge  zero, t2, t3    # E : t3 = bytes that hold no src data
  357        cmoveq  a2, t5, t8      # E : Latency=2, extra map slot
  358
  359        nop                     # E : goes with cmov
  360        andnot  t8, t3, t8      # E : ignore hits in those bytes
  361        beq     t8, $u_head     # U : nothing ends here: continue with the head
  362        nop
  363
  364        /* At this point we've found a zero in the first partial word of
  365           the source.  We need to isolate the valid source data and mask
  366           it into the original destination data.  (Incidentally, we know
  367           that we'll need at least one byte of that original dest word.) */
  368
  369        ldq_u   t0, 0(a0)       # L : original dst word
  370        negq    t8, t6          # E : build bitmask of bytes <= zero
  371        mskqh   t1, t4, t1      # U : drop src bytes below the dst offset
  372        and     t6, t8, t12     # E : t12 = lowest set bit of t8
  373
  374        subq    t12, 1, t6      # E : t6 = all bits below t12
  375        or      t6, t12, t8     # E : t8 = bytemask up to and including t12
  376        zapnot  t2, t8, t2      # U : prepare source word; mirror changes
  377        zapnot  t1, t8, t1      # U : to source validity mask
  378
  379        andnot  t0, t2, t0      # E : zero place for source to reside
  380        or      t0, t1, t0      # E : and put it there
  381        stq_u   t0, 0(a0)       # L : store, then fall through to $finish_up
  382        nop
 383
  384        .align 4
             /*
              * Compute the return value.  Every path that arrives here has
              * just executed stq_u t0, 0(a0), with
              *   t0  == the word that was stored
              *   t12 == single bit selecting the last byte written in it
              *   v0  == the original destination pointer
              * The byte index of t12 (0..7) is rebuilt from three mask tests
              * (0xf0 -> 4, 0xcc -> 2, 0xaa -> 1) and added to the aligned
              * store address.
              */
  385$finish_up:
  386        zapnot  t0, t12, t4     # U : was last byte written null?
  387        and     t12, 0xf0, t3   # E : binary search for the address of the
  388        cmovne  t4, 1, t4       # E : Latency=2, extra map slot; t4 = 1 iff that byte was non-null
  389        nop                     # E : with cmovne
  390
  391        and     t12, 0xcc, t2   # E : last byte written
  392        and     t12, 0xaa, t1   # E :
  393        cmovne  t3, 4, t3       # E : Latency=2, extra map slot; t3 = 4 iff the index is 4..7
  394        nop                     # E : with cmovne
  395
  396        bic     a0, 7, t0       # t0 = aligned address of the quadword just stored
  397        cmovne  t2, 2, t2       # E : Latency=2, extra map slot; t2 = 2 iff index bit 1 is set
  398        nop                     # E : with cmovne
  399        nop
  400
  401        cmovne  t1, 1, t1       # E : Latency=2, extra map slot; t1 = 1 iff index bit 0 is set
  402        nop                     # E : with cmovne
  403        addq    t0, t3, t0      # E : base + the 4 part of the index
  404        addq    t1, t2, t1      # E : the 1 part + the 2 part of the index
  405
  406        addq    t0, t1, t0      # E : t0 = address of the last byte written
  407        addq    t0, t4, t0      # add one if we filled the buffer
  408        subq    t0, v0, v0      # find string length
  409        ret                     # L0 :
  410
  411        .align 4
  412$zerolength:
  413        nop
  414        nop
  415        nop
  416        clr     v0              # count was 0: return 0 via the ret below
  417
  418$exception:                     # resume target named in every EX() table entry
  419        nop
  420        nop
  421        nop
  422        ret                     # return with v0 as it stands; nothing here changes it
  423
  424        .end __strncpy_from_user
 425