linux/arch/x86/crypto/salsa20-i586-asm_32.S
# salsa20_pm.s version 20051229
# D. J. Bernstein
# Public domain.
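#
# Salsa20 stream cipher, 32-bit x86 implementation.  This file provides
# three cdecl entry points: salsa20_encrypt_bytes (generate keystream and
# XOR it into the message), salsa20_keysetup (load a 128- or 256-bit key
# plus the Salsa20 constants), and salsa20_ivsetup (load the 64-bit IV
# and reset the block counter).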

#include <linux/linkage.h>

.text

# enter salsa20_encrypt_bytes
ENTRY(salsa20_encrypt_bytes)
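        # Carve out a 32-byte-aligned scratch area (256 to 287 bytes) below
        # the caller's stack pointer.  %eax keeps the exact adjustment, so
        # the stack arguments stay reachable at 4(%esp,%eax) onward and the
        # epilogue can restore %esp with a single add.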
        mov     %esp,%eax
        and     $31,%eax
        add     $256,%eax
        sub     %eax,%esp
        # eax_stack = eax
        movl    %eax,80(%esp)
        # ebx_stack = ebx
        movl    %ebx,84(%esp)
        # esi_stack = esi
        movl    %esi,88(%esp)
        # edi_stack = edi
        movl    %edi,92(%esp)
        # ebp_stack = ebp
        movl    %ebp,96(%esp)
        # x = arg1
        movl    4(%esp,%eax),%edx
        # m = arg2
        movl    8(%esp,%eax),%esi
        # out = arg3
        movl    12(%esp,%eax),%edi
        # bytes = arg4
        movl    16(%esp,%eax),%ebx
        # bytes -= 0
        sub     $0,%ebx
        # goto done if unsigned<=
        jbe     ._done
._start:
        # in0 = *(uint32 *) (x + 0)
        movl    0(%edx),%eax
        # in1 = *(uint32 *) (x + 4)
        movl    4(%edx),%ecx
        # in2 = *(uint32 *) (x + 8)
        movl    8(%edx),%ebp
        # j0 = in0
        movl    %eax,164(%esp)
        # in3 = *(uint32 *) (x + 12)
        movl    12(%edx),%eax
        # j1 = in1
        movl    %ecx,168(%esp)
        # in4 = *(uint32 *) (x + 16)
        movl    16(%edx),%ecx
        # j2 = in2
        movl    %ebp,172(%esp)
        # in5 = *(uint32 *) (x + 20)
        movl    20(%edx),%ebp
        # j3 = in3
        movl    %eax,176(%esp)
        # in6 = *(uint32 *) (x + 24)
        movl    24(%edx),%eax
        # j4 = in4
        movl    %ecx,180(%esp)
        # in7 = *(uint32 *) (x + 28)
        movl    28(%edx),%ecx
        # j5 = in5
        movl    %ebp,184(%esp)
        # in8 = *(uint32 *) (x + 32)
        movl    32(%edx),%ebp
        # j6 = in6
        movl    %eax,188(%esp)
        # in9 = *(uint32 *) (x + 36)
        movl    36(%edx),%eax
        # j7 = in7
        movl    %ecx,192(%esp)
        # in10 = *(uint32 *) (x + 40)
        movl    40(%edx),%ecx
        # j8 = in8
        movl    %ebp,196(%esp)
        # in11 = *(uint32 *) (x + 44)
        movl    44(%edx),%ebp
        # j9 = in9
        movl    %eax,200(%esp)
        # in12 = *(uint32 *) (x + 48)
        movl    48(%edx),%eax
        # j10 = in10
        movl    %ecx,204(%esp)
        # in13 = *(uint32 *) (x + 52)
        movl    52(%edx),%ecx
        # j11 = in11
        movl    %ebp,208(%esp)
        # in14 = *(uint32 *) (x + 56)
        movl    56(%edx),%ebp
        # j12 = in12
        movl    %eax,212(%esp)
        # in15 = *(uint32 *) (x + 60)
        movl    60(%edx),%eax
        # j13 = in13
        movl    %ecx,216(%esp)
        # j14 = in14
        movl    %ebp,220(%esp)
        # j15 = in15
        movl    %eax,224(%esp)
        # x_backup = x
        movl    %edx,64(%esp)
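        # Top of the per-block loop: every pass produces one 64-byte
        # keystream block.  A tail shorter than 64 bytes is first copied
        # into the tmp area at 0(%esp) so the block can be read and
        # written at full width without touching bytes past the buffers.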
._bytesatleast1:
        #   bytes - 64
        cmp     $64,%ebx
        #   goto nocopy if unsigned>=
        jae     ._nocopy
        #     ctarget = out
        movl    %edi,228(%esp)
        #     out = &tmp
        leal    0(%esp),%edi
        #     i = bytes
        mov     %ebx,%ecx
        #     while (i) { *out++ = *m++; --i }
        rep     movsb
        #     out = &tmp
        leal    0(%esp),%edi
        #     m = &tmp
        leal    0(%esp),%esi
._nocopy:
        #   out_backup = out
        movl    %edi,72(%esp)
        #   m_backup = m
        movl    %esi,68(%esp)
        #   bytes_backup = bytes
        movl    %ebx,76(%esp)
        #   in0 = j0
        movl    164(%esp),%eax
        #   in1 = j1
        movl    168(%esp),%ecx
        #   in2 = j2
        movl    172(%esp),%edx
        #   in3 = j3
        movl    176(%esp),%ebx
        #   x0 = in0
        movl    %eax,100(%esp)
        #   x1 = in1
        movl    %ecx,104(%esp)
        #   x2 = in2
        movl    %edx,108(%esp)
        #   x3 = in3
        movl    %ebx,112(%esp)
        #   in4 = j4
        movl    180(%esp),%eax
        #   in5 = j5
        movl    184(%esp),%ecx
        #   in6 = j6
        movl    188(%esp),%edx
        #   in7 = j7
        movl    192(%esp),%ebx
        #   x4 = in4
        movl    %eax,116(%esp)
        #   x5 = in5
        movl    %ecx,120(%esp)
        #   x6 = in6
        movl    %edx,124(%esp)
        #   x7 = in7
        movl    %ebx,128(%esp)
        #   in8 = j8
        movl    196(%esp),%eax
        #   in9 = j9
        movl    200(%esp),%ecx
        #   in10 = j10
        movl    204(%esp),%edx
        #   in11 = j11
        movl    208(%esp),%ebx
        #   x8 = in8
        movl    %eax,132(%esp)
        #   x9 = in9
        movl    %ecx,136(%esp)
        #   x10 = in10
        movl    %edx,140(%esp)
        #   x11 = in11
        movl    %ebx,144(%esp)
        #   in12 = j12
        movl    212(%esp),%eax
        #   in13 = j13
        movl    216(%esp),%ecx
        #   in14 = j14
        movl    220(%esp),%edx
        #   in15 = j15
        movl    224(%esp),%ebx
        #   x12 = in12
        movl    %eax,148(%esp)
        #   x13 = in13
        movl    %ecx,152(%esp)
        #   x14 = in14
        movl    %edx,156(%esp)
        #   x15 = in15
        movl    %ebx,160(%esp)
        #   i = 20
        mov     $20,%ebp
        # p = x0
        movl    100(%esp),%eax
        # s = x5
        movl    120(%esp),%ecx
        # t = x10
        movl    140(%esp),%edx
        # w = x15
        movl    160(%esp),%ebx
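        # Main loop: 20 rounds in all, four per pass (two column/row
        # double-rounds), so the loop body runs five times.  The comment
        # columns below track the four independent quarter-round chains
        # carried in p, r/s, t and v/w.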
._mainloop:
        # x0 = p
        movl    %eax,100(%esp)
        #                               x10 = t
        movl    %edx,140(%esp)
        # p += x12
        addl    148(%esp),%eax
        #               x5 = s
        movl    %ecx,120(%esp)
        #                               t += x6
        addl    124(%esp),%edx
        #                                               x15 = w
        movl    %ebx,160(%esp)
        #               r = x1
        movl    104(%esp),%esi
        #               r += s
        add     %ecx,%esi
        #                                               v = x11
        movl    144(%esp),%edi
        #                                               v += w
        add     %ebx,%edi
        # p <<<= 7
        rol     $7,%eax
        # p ^= x4
        xorl    116(%esp),%eax
        #                               t <<<= 7
        rol     $7,%edx
        #                               t ^= x14
        xorl    156(%esp),%edx
        #               r <<<= 7
        rol     $7,%esi
        #               r ^= x9
        xorl    136(%esp),%esi
        #                                               v <<<= 7
        rol     $7,%edi
        #                                               v ^= x3
        xorl    112(%esp),%edi
        # x4 = p
        movl    %eax,116(%esp)
        #                               x14 = t
        movl    %edx,156(%esp)
        # p += x0
        addl    100(%esp),%eax
        #               x9 = r
        movl    %esi,136(%esp)
        #                               t += x10
        addl    140(%esp),%edx
        #                                               x3 = v
        movl    %edi,112(%esp)
        # p <<<= 9
        rol     $9,%eax
        # p ^= x8
        xorl    132(%esp),%eax
        #                               t <<<= 9
        rol     $9,%edx
        #                               t ^= x2
        xorl    108(%esp),%edx
        #               s += r
        add     %esi,%ecx
        #               s <<<= 9
        rol     $9,%ecx
        #               s ^= x13
        xorl    152(%esp),%ecx
        #                                               w += v
        add     %edi,%ebx
        #                                               w <<<= 9
        rol     $9,%ebx
        #                                               w ^= x7
        xorl    128(%esp),%ebx
        # x8 = p
        movl    %eax,132(%esp)
        #                               x2 = t
        movl    %edx,108(%esp)
        # p += x4
        addl    116(%esp),%eax
        #               x13 = s
        movl    %ecx,152(%esp)
        #                               t += x14
        addl    156(%esp),%edx
        #                                               x7 = w
        movl    %ebx,128(%esp)
        # p <<<= 13
        rol     $13,%eax
        # p ^= x12
        xorl    148(%esp),%eax
        #                               t <<<= 13
        rol     $13,%edx
        #                               t ^= x6
        xorl    124(%esp),%edx
        #               r += s
        add     %ecx,%esi
        #               r <<<= 13
        rol     $13,%esi
        #               r ^= x1
        xorl    104(%esp),%esi
        #                                               v += w
        add     %ebx,%edi
        #                                               v <<<= 13
        rol     $13,%edi
        #                                               v ^= x11
        xorl    144(%esp),%edi
        # x12 = p
        movl    %eax,148(%esp)
        #                               x6 = t
        movl    %edx,124(%esp)
        # p += x8
        addl    132(%esp),%eax
        #               x1 = r
        movl    %esi,104(%esp)
        #                               t += x2
        addl    108(%esp),%edx
        #                                               x11 = v
        movl    %edi,144(%esp)
        # p <<<= 18
        rol     $18,%eax
        # p ^= x0
        xorl    100(%esp),%eax
        #                               t <<<= 18
        rol     $18,%edx
        #                               t ^= x10
        xorl    140(%esp),%edx
        #               s += r
        add     %esi,%ecx
        #               s <<<= 18
        rol     $18,%ecx
        #               s ^= x5
        xorl    120(%esp),%ecx
        #                                               w += v
        add     %edi,%ebx
        #                                               w <<<= 18
        rol     $18,%ebx
        #                                               w ^= x15
        xorl    160(%esp),%ebx
        # x0 = p
        movl    %eax,100(%esp)
        #                               x10 = t
        movl    %edx,140(%esp)
        # p += x3
        addl    112(%esp),%eax
        # p <<<= 7
        rol     $7,%eax
        #               x5 = s
        movl    %ecx,120(%esp)
        #                               t += x9
        addl    136(%esp),%edx
        #                                               x15 = w
        movl    %ebx,160(%esp)
        #               r = x4
        movl    116(%esp),%esi
        #               r += s
        add     %ecx,%esi
        #                                               v = x14
        movl    156(%esp),%edi
        #                                               v += w
        add     %ebx,%edi
        # p ^= x1
        xorl    104(%esp),%eax
        #                               t <<<= 7
        rol     $7,%edx
        #                               t ^= x11
        xorl    144(%esp),%edx
        #               r <<<= 7
        rol     $7,%esi
        #               r ^= x6
        xorl    124(%esp),%esi
        #                                               v <<<= 7
        rol     $7,%edi
        #                                               v ^= x12
        xorl    148(%esp),%edi
        # x1 = p
        movl    %eax,104(%esp)
        #                               x11 = t
        movl    %edx,144(%esp)
        # p += x0
        addl    100(%esp),%eax
        #               x6 = r
        movl    %esi,124(%esp)
        #                               t += x10
        addl    140(%esp),%edx
        #                                               x12 = v
        movl    %edi,148(%esp)
        # p <<<= 9
        rol     $9,%eax
        # p ^= x2
        xorl    108(%esp),%eax
        #                               t <<<= 9
        rol     $9,%edx
        #                               t ^= x8
        xorl    132(%esp),%edx
        #               s += r
        add     %esi,%ecx
        #               s <<<= 9
        rol     $9,%ecx
        #               s ^= x7
        xorl    128(%esp),%ecx
        #                                               w += v
        add     %edi,%ebx
        #                                               w <<<= 9
        rol     $9,%ebx
        #                                               w ^= x13
        xorl    152(%esp),%ebx
        # x2 = p
        movl    %eax,108(%esp)
        #                               x8 = t
        movl    %edx,132(%esp)
        # p += x1
        addl    104(%esp),%eax
        #               x7 = s
        movl    %ecx,128(%esp)
        #                               t += x11
        addl    144(%esp),%edx
        #                                               x13 = w
        movl    %ebx,152(%esp)
        # p <<<= 13
        rol     $13,%eax
        # p ^= x3
        xorl    112(%esp),%eax
        #                               t <<<= 13
        rol     $13,%edx
        #                               t ^= x9
        xorl    136(%esp),%edx
        #               r += s
        add     %ecx,%esi
        #               r <<<= 13
        rol     $13,%esi
        #               r ^= x4
        xorl    116(%esp),%esi
        #                                               v += w
        add     %ebx,%edi
        #                                               v <<<= 13
        rol     $13,%edi
        #                                               v ^= x14
        xorl    156(%esp),%edi
        # x3 = p
        movl    %eax,112(%esp)
        #                               x9 = t
        movl    %edx,136(%esp)
        # p += x2
        addl    108(%esp),%eax
        #               x4 = r
        movl    %esi,116(%esp)
        #                               t += x8
        addl    132(%esp),%edx
        #                                               x14 = v
        movl    %edi,156(%esp)
        # p <<<= 18
        rol     $18,%eax
        # p ^= x0
        xorl    100(%esp),%eax
        #                               t <<<= 18
        rol     $18,%edx
        #                               t ^= x10
        xorl    140(%esp),%edx
        #               s += r
        add     %esi,%ecx
        #               s <<<= 18
        rol     $18,%ecx
        #               s ^= x5
        xorl    120(%esp),%ecx
        #                                               w += v
        add     %edi,%ebx
        #                                               w <<<= 18
        rol     $18,%ebx
        #                                               w ^= x15
        xorl    160(%esp),%ebx
        # x0 = p
        movl    %eax,100(%esp)
        #                               x10 = t
        movl    %edx,140(%esp)
        # p += x12
        addl    148(%esp),%eax
        #               x5 = s
        movl    %ecx,120(%esp)
        #                               t += x6
        addl    124(%esp),%edx
        #                                               x15 = w
        movl    %ebx,160(%esp)
        #               r = x1
        movl    104(%esp),%esi
        #               r += s
        add     %ecx,%esi
        #                                               v = x11
        movl    144(%esp),%edi
        #                                               v += w
        add     %ebx,%edi
        # p <<<= 7
        rol     $7,%eax
        # p ^= x4
        xorl    116(%esp),%eax
        #                               t <<<= 7
        rol     $7,%edx
        #                               t ^= x14
        xorl    156(%esp),%edx
        #               r <<<= 7
        rol     $7,%esi
        #               r ^= x9
        xorl    136(%esp),%esi
        #                                               v <<<= 7
        rol     $7,%edi
        #                                               v ^= x3
        xorl    112(%esp),%edi
        # x4 = p
        movl    %eax,116(%esp)
        #                               x14 = t
        movl    %edx,156(%esp)
        # p += x0
        addl    100(%esp),%eax
        #               x9 = r
        movl    %esi,136(%esp)
        #                               t += x10
        addl    140(%esp),%edx
        #                                               x3 = v
        movl    %edi,112(%esp)
        # p <<<= 9
        rol     $9,%eax
        # p ^= x8
        xorl    132(%esp),%eax
        #                               t <<<= 9
        rol     $9,%edx
        #                               t ^= x2
        xorl    108(%esp),%edx
        #               s += r
        add     %esi,%ecx
        #               s <<<= 9
        rol     $9,%ecx
        #               s ^= x13
        xorl    152(%esp),%ecx
        #                                               w += v
        add     %edi,%ebx
        #                                               w <<<= 9
        rol     $9,%ebx
        #                                               w ^= x7
        xorl    128(%esp),%ebx
        # x8 = p
        movl    %eax,132(%esp)
        #                               x2 = t
        movl    %edx,108(%esp)
        # p += x4
        addl    116(%esp),%eax
        #               x13 = s
        movl    %ecx,152(%esp)
        #                               t += x14
        addl    156(%esp),%edx
        #                                               x7 = w
        movl    %ebx,128(%esp)
        # p <<<= 13
        rol     $13,%eax
        # p ^= x12
        xorl    148(%esp),%eax
        #                               t <<<= 13
        rol     $13,%edx
        #                               t ^= x6
        xorl    124(%esp),%edx
        #               r += s
        add     %ecx,%esi
        #               r <<<= 13
        rol     $13,%esi
        #               r ^= x1
        xorl    104(%esp),%esi
        #                                               v += w
        add     %ebx,%edi
        #                                               v <<<= 13
        rol     $13,%edi
        #                                               v ^= x11
        xorl    144(%esp),%edi
        # x12 = p
        movl    %eax,148(%esp)
        #                               x6 = t
        movl    %edx,124(%esp)
        # p += x8
        addl    132(%esp),%eax
        #               x1 = r
        movl    %esi,104(%esp)
        #                               t += x2
        addl    108(%esp),%edx
        #                                               x11 = v
        movl    %edi,144(%esp)
        # p <<<= 18
        rol     $18,%eax
        # p ^= x0
        xorl    100(%esp),%eax
        #                               t <<<= 18
        rol     $18,%edx
        #                               t ^= x10
        xorl    140(%esp),%edx
        #               s += r
        add     %esi,%ecx
        #               s <<<= 18
        rol     $18,%ecx
        #               s ^= x5
        xorl    120(%esp),%ecx
        #                                               w += v
        add     %edi,%ebx
        #                                               w <<<= 18
        rol     $18,%ebx
        #                                               w ^= x15
        xorl    160(%esp),%ebx
        # x0 = p
        movl    %eax,100(%esp)
        #                               x10 = t
        movl    %edx,140(%esp)
        # p += x3
        addl    112(%esp),%eax
        # p <<<= 7
        rol     $7,%eax
        #               x5 = s
        movl    %ecx,120(%esp)
        #                               t += x9
        addl    136(%esp),%edx
        #                                               x15 = w
        movl    %ebx,160(%esp)
        #               r = x4
        movl    116(%esp),%esi
        #               r += s
        add     %ecx,%esi
        #                                               v = x14
        movl    156(%esp),%edi
        #                                               v += w
        add     %ebx,%edi
        # p ^= x1
        xorl    104(%esp),%eax
        #                               t <<<= 7
        rol     $7,%edx
        #                               t ^= x11
        xorl    144(%esp),%edx
        #               r <<<= 7
        rol     $7,%esi
        #               r ^= x6
        xorl    124(%esp),%esi
        #                                               v <<<= 7
        rol     $7,%edi
        #                                               v ^= x12
        xorl    148(%esp),%edi
        # x1 = p
        movl    %eax,104(%esp)
        #                               x11 = t
        movl    %edx,144(%esp)
        # p += x0
        addl    100(%esp),%eax
        #               x6 = r
        movl    %esi,124(%esp)
        #                               t += x10
        addl    140(%esp),%edx
        #                                               x12 = v
        movl    %edi,148(%esp)
        # p <<<= 9
        rol     $9,%eax
        # p ^= x2
        xorl    108(%esp),%eax
        #                               t <<<= 9
        rol     $9,%edx
        #                               t ^= x8
        xorl    132(%esp),%edx
        #               s += r
        add     %esi,%ecx
        #               s <<<= 9
        rol     $9,%ecx
        #               s ^= x7
        xorl    128(%esp),%ecx
        #                                               w += v
        add     %edi,%ebx
        #                                               w <<<= 9
        rol     $9,%ebx
        #                                               w ^= x13
        xorl    152(%esp),%ebx
        # x2 = p
        movl    %eax,108(%esp)
        #                               x8 = t
        movl    %edx,132(%esp)
        # p += x1
        addl    104(%esp),%eax
        #               x7 = s
        movl    %ecx,128(%esp)
        #                               t += x11
        addl    144(%esp),%edx
        #                                               x13 = w
        movl    %ebx,152(%esp)
        # p <<<= 13
        rol     $13,%eax
        # p ^= x3
        xorl    112(%esp),%eax
        #                               t <<<= 13
        rol     $13,%edx
        #                               t ^= x9
        xorl    136(%esp),%edx
        #               r += s
        add     %ecx,%esi
        #               r <<<= 13
        rol     $13,%esi
        #               r ^= x4
        xorl    116(%esp),%esi
        #                                               v += w
        add     %ebx,%edi
        #                                               v <<<= 13
        rol     $13,%edi
        #                                               v ^= x14
        xorl    156(%esp),%edi
        # x3 = p
        movl    %eax,112(%esp)
        #                               x9 = t
        movl    %edx,136(%esp)
        # p += x2
        addl    108(%esp),%eax
        #               x4 = r
        movl    %esi,116(%esp)
        #                               t += x8
        addl    132(%esp),%edx
        #                                               x14 = v
        movl    %edi,156(%esp)
        # p <<<= 18
        rol     $18,%eax
        # p ^= x0
        xorl    100(%esp),%eax
        #                               t <<<= 18
        rol     $18,%edx
        #                               t ^= x10
        xorl    140(%esp),%edx
        #               s += r
        add     %esi,%ecx
        #               s <<<= 18
        rol     $18,%ecx
        #               s ^= x5
        xorl    120(%esp),%ecx
        #                                               w += v
        add     %edi,%ebx
        #                                               w <<<= 18
        rol     $18,%ebx
        #                                               w ^= x15
        xorl    160(%esp),%ebx
        # i -= 4
        sub     $4,%ebp
        # goto mainloop if unsigned >
        ja      ._mainloop
        # x0 = p
        movl    %eax,100(%esp)
        # x5 = s
        movl    %ecx,120(%esp)
        # x10 = t
        movl    %edx,140(%esp)
        # x15 = w
        movl    %ebx,160(%esp)
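        # Feed-forward and output: add the saved input words j0..j15 back
        # into the permuted state, XOR with the 64-byte message block, and
        # store the result to out.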
        #   out = out_backup
        movl    72(%esp),%edi
        #   m = m_backup
        movl    68(%esp),%esi
        #   in0 = x0
        movl    100(%esp),%eax
        #   in1 = x1
        movl    104(%esp),%ecx
        #   in0 += j0
        addl    164(%esp),%eax
        #   in1 += j1
        addl    168(%esp),%ecx
        #   in0 ^= *(uint32 *) (m + 0)
        xorl    0(%esi),%eax
        #   in1 ^= *(uint32 *) (m + 4)
        xorl    4(%esi),%ecx
        #   *(uint32 *) (out + 0) = in0
        movl    %eax,0(%edi)
        #   *(uint32 *) (out + 4) = in1
        movl    %ecx,4(%edi)
        #   in2 = x2
        movl    108(%esp),%eax
        #   in3 = x3
        movl    112(%esp),%ecx
        #   in2 += j2
        addl    172(%esp),%eax
        #   in3 += j3
        addl    176(%esp),%ecx
        #   in2 ^= *(uint32 *) (m + 8)
        xorl    8(%esi),%eax
        #   in3 ^= *(uint32 *) (m + 12)
        xorl    12(%esi),%ecx
        #   *(uint32 *) (out + 8) = in2
        movl    %eax,8(%edi)
        #   *(uint32 *) (out + 12) = in3
        movl    %ecx,12(%edi)
        #   in4 = x4
        movl    116(%esp),%eax
        #   in5 = x5
        movl    120(%esp),%ecx
        #   in4 += j4
        addl    180(%esp),%eax
        #   in5 += j5
        addl    184(%esp),%ecx
        #   in4 ^= *(uint32 *) (m + 16)
        xorl    16(%esi),%eax
        #   in5 ^= *(uint32 *) (m + 20)
        xorl    20(%esi),%ecx
        #   *(uint32 *) (out + 16) = in4
        movl    %eax,16(%edi)
        #   *(uint32 *) (out + 20) = in5
        movl    %ecx,20(%edi)
        #   in6 = x6
        movl    124(%esp),%eax
        #   in7 = x7
        movl    128(%esp),%ecx
        #   in6 += j6
        addl    188(%esp),%eax
        #   in7 += j7
        addl    192(%esp),%ecx
        #   in6 ^= *(uint32 *) (m + 24)
        xorl    24(%esi),%eax
        #   in7 ^= *(uint32 *) (m + 28)
        xorl    28(%esi),%ecx
        #   *(uint32 *) (out + 24) = in6
        movl    %eax,24(%edi)
        #   *(uint32 *) (out + 28) = in7
        movl    %ecx,28(%edi)
        #   in8 = x8
        movl    132(%esp),%eax
        #   in9 = x9
        movl    136(%esp),%ecx
        #   in8 += j8
        addl    196(%esp),%eax
        #   in9 += j9
        addl    200(%esp),%ecx
        #   in8 ^= *(uint32 *) (m + 32)
        xorl    32(%esi),%eax
        #   in9 ^= *(uint32 *) (m + 36)
        xorl    36(%esi),%ecx
        #   *(uint32 *) (out + 32) = in8
        movl    %eax,32(%edi)
        #   *(uint32 *) (out + 36) = in9
        movl    %ecx,36(%edi)
        #   in10 = x10
        movl    140(%esp),%eax
        #   in11 = x11
        movl    144(%esp),%ecx
        #   in10 += j10
        addl    204(%esp),%eax
        #   in11 += j11
        addl    208(%esp),%ecx
        #   in10 ^= *(uint32 *) (m + 40)
        xorl    40(%esi),%eax
        #   in11 ^= *(uint32 *) (m + 44)
        xorl    44(%esi),%ecx
        #   *(uint32 *) (out + 40) = in10
        movl    %eax,40(%edi)
        #   *(uint32 *) (out + 44) = in11
        movl    %ecx,44(%edi)
        #   in12 = x12
        movl    148(%esp),%eax
        #   in13 = x13
        movl    152(%esp),%ecx
        #   in12 += j12
        addl    212(%esp),%eax
        #   in13 += j13
        addl    216(%esp),%ecx
        #   in12 ^= *(uint32 *) (m + 48)
        xorl    48(%esi),%eax
        #   in13 ^= *(uint32 *) (m + 52)
        xorl    52(%esi),%ecx
        #   *(uint32 *) (out + 48) = in12
        movl    %eax,48(%edi)
        #   *(uint32 *) (out + 52) = in13
        movl    %ecx,52(%edi)
        #   in14 = x14
        movl    156(%esp),%eax
        #   in15 = x15
        movl    160(%esp),%ecx
        #   in14 += j14
        addl    220(%esp),%eax
        #   in15 += j15
        addl    224(%esp),%ecx
        #   in14 ^= *(uint32 *) (m + 56)
        xorl    56(%esi),%eax
        #   in15 ^= *(uint32 *) (m + 60)
        xorl    60(%esi),%ecx
        #   *(uint32 *) (out + 56) = in14
        movl    %eax,56(%edi)
        #   *(uint32 *) (out + 60) = in15
        movl    %ecx,60(%edi)
        #   bytes = bytes_backup
        movl    76(%esp),%ebx
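        # Advance the 64-bit little-endian block counter held in state
        # words 8 and 9; adc carries into the high word.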
        #   in8 = j8
        movl    196(%esp),%eax
        #   in9 = j9
        movl    200(%esp),%ecx
        #   in8 += 1
        add     $1,%eax
        #   in9 += 0 + carry
        adc     $0,%ecx
        #   j8 = in8
        movl    %eax,196(%esp)
        #   j9 = in9
        movl    %ecx,200(%esp)
        #   bytes - 64
        cmp     $64,%ebx
        #   goto bytesatleast65 if unsigned>
        ja      ._bytesatleast65
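        # Here bytes <= 64: exactly 64 jumps ahead, while a shorter tail
        # was assembled in tmp and must be copied back to the real
        # destination saved in ctarget.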
        #     goto bytesatleast64 if unsigned>=
        jae     ._bytesatleast64
        #       m = out
        mov     %edi,%esi
        #       out = ctarget
        movl    228(%esp),%edi
        #       i = bytes
        mov     %ebx,%ecx
        #       while (i) { *out++ = *m++; --i }
        rep     movsb
._bytesatleast64:
        #     x = x_backup
        movl    64(%esp),%eax
        #     in8 = j8
        movl    196(%esp),%ecx
        #     in9 = j9
        movl    200(%esp),%edx
        #     *(uint32 *) (x + 32) = in8
        movl    %ecx,32(%eax)
        #     *(uint32 *) (x + 36) = in9
        movl    %edx,36(%eax)
._done:
        #     eax = eax_stack
        movl    80(%esp),%eax
        #     ebx = ebx_stack
        movl    84(%esp),%ebx
        #     esi = esi_stack
        movl    88(%esp),%esi
        #     edi = edi_stack
        movl    92(%esp),%edi
        #     ebp = ebp_stack
        movl    96(%esp),%ebp
        #     leave
        add     %eax,%esp
        ret
._bytesatleast65:
        #   bytes -= 64
        sub     $64,%ebx
        #   out += 64
        add     $64,%edi
        #   m += 64
        add     $64,%esi
        # goto bytesatleast1
        jmp     ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)

# enter salsa20_keysetup
ENTRY(salsa20_keysetup)
        mov     %esp,%eax
        and     $31,%eax
        add     $256,%eax
        sub     %eax,%esp
        #   eax_stack = eax
        movl    %eax,64(%esp)
        #   ebx_stack = ebx
        movl    %ebx,68(%esp)
        #   esi_stack = esi
        movl    %esi,72(%esp)
        #   edi_stack = edi
        movl    %edi,76(%esp)
        #   ebp_stack = ebp
        movl    %ebp,80(%esp)
        #   k = arg2
        movl    8(%esp,%eax),%ecx
        #   kbits = arg3
        movl    12(%esp,%eax),%edx
        #   x = arg1
        movl    4(%esp,%eax),%eax
        #   in1 = *(uint32 *) (k + 0)
        movl    0(%ecx),%ebx
        #   in2 = *(uint32 *) (k + 4)
        movl    4(%ecx),%esi
        #   in3 = *(uint32 *) (k + 8)
        movl    8(%ecx),%edi
        #   in4 = *(uint32 *) (k + 12)
        movl    12(%ecx),%ebp
        #   *(uint32 *) (x + 4) = in1
        movl    %ebx,4(%eax)
        #   *(uint32 *) (x + 8) = in2
        movl    %esi,8(%eax)
        #   *(uint32 *) (x + 12) = in3
        movl    %edi,12(%eax)
        #   *(uint32 *) (x + 16) = in4
        movl    %ebp,16(%eax)
        #   kbits - 256
        cmp     $256,%edx
        #   goto kbits128 if unsigned<
        jb      ._kbits128
._kbits256:
        #     in11 = *(uint32 *) (k + 16)
        movl    16(%ecx),%edx
        #     in12 = *(uint32 *) (k + 20)
        movl    20(%ecx),%ebx
        #     in13 = *(uint32 *) (k + 24)
        movl    24(%ecx),%esi
        #     in14 = *(uint32 *) (k + 28)
        movl    28(%ecx),%ecx
        #     *(uint32 *) (x + 44) = in11
        movl    %edx,44(%eax)
        #     *(uint32 *) (x + 48) = in12
        movl    %ebx,48(%eax)
        #     *(uint32 *) (x + 52) = in13
        movl    %esi,52(%eax)
        #     *(uint32 *) (x + 56) = in14
        movl    %ecx,56(%eax)
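        #     constants for a 256-bit key: the four words below spell
        #     "expa", "nd 3", "2-by", "te k" ("expand 32-byte k") as
        #     little-endian ASCII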
        #     in0 = 1634760805
        mov     $1634760805,%ecx
        #     in5 = 857760878
        mov     $857760878,%edx
        #     in10 = 2036477234
        mov     $2036477234,%ebx
        #     in15 = 1797285236
        mov     $1797285236,%esi
        #     *(uint32 *) (x + 0) = in0
        movl    %ecx,0(%eax)
        #     *(uint32 *) (x + 20) = in5
        movl    %edx,20(%eax)
        #     *(uint32 *) (x + 40) = in10
        movl    %ebx,40(%eax)
        #     *(uint32 *) (x + 60) = in15
        movl    %esi,60(%eax)
        #   goto keysetupdone
        jmp     ._keysetupdone
._kbits128:
        #     in11 = *(uint32 *) (k + 0)
        movl    0(%ecx),%edx
        #     in12 = *(uint32 *) (k + 4)
        movl    4(%ecx),%ebx
        #     in13 = *(uint32 *) (k + 8)
        movl    8(%ecx),%esi
        #     in14 = *(uint32 *) (k + 12)
        movl    12(%ecx),%ecx
        #     *(uint32 *) (x + 44) = in11
        movl    %edx,44(%eax)
        #     *(uint32 *) (x + 48) = in12
        movl    %ebx,48(%eax)
        #     *(uint32 *) (x + 52) = in13
        movl    %esi,52(%eax)
        #     *(uint32 *) (x + 56) = in14
        movl    %ecx,56(%eax)
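        #     constants for a 128-bit key: "expa", "nd 1", "6-by", "te k"
        #     ("expand 16-byte k") as little-endian ASCII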
        #     in0 = 1634760805
        mov     $1634760805,%ecx
        #     in5 = 824206446
        mov     $824206446,%edx
        #     in10 = 2036477238
        mov     $2036477238,%ebx
        #     in15 = 1797285236
        mov     $1797285236,%esi
        #     *(uint32 *) (x + 0) = in0
        movl    %ecx,0(%eax)
        #     *(uint32 *) (x + 20) = in5
        movl    %edx,20(%eax)
        #     *(uint32 *) (x + 40) = in10
        movl    %ebx,40(%eax)
        #     *(uint32 *) (x + 60) = in15
        movl    %esi,60(%eax)
._keysetupdone:
        #   eax = eax_stack
        movl    64(%esp),%eax
        #   ebx = ebx_stack
        movl    68(%esp),%ebx
        #   esi = esi_stack
        movl    72(%esp),%esi
        #   edi = edi_stack
        movl    76(%esp),%edi
        #   ebp = ebp_stack
        movl    80(%esp),%ebp
        # leave
        add     %eax,%esp
        ret
ENDPROC(salsa20_keysetup)

# enter salsa20_ivsetup
ENTRY(salsa20_ivsetup)
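        # Store the 8-byte IV into state words 6 and 7 and zero the 64-bit
        # block counter in words 8 and 9.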
        mov     %esp,%eax
        and     $31,%eax
        add     $256,%eax
        sub     %eax,%esp
        #   eax_stack = eax
        movl    %eax,64(%esp)
        #   ebx_stack = ebx
        movl    %ebx,68(%esp)
        #   esi_stack = esi
        movl    %esi,72(%esp)
        #   edi_stack = edi
        movl    %edi,76(%esp)
        #   ebp_stack = ebp
        movl    %ebp,80(%esp)
        #   iv = arg2
        movl    8(%esp,%eax),%ecx
        #   x = arg1
        movl    4(%esp,%eax),%eax
        #   in6 = *(uint32 *) (iv + 0)
        movl    0(%ecx),%edx
        #   in7 = *(uint32 *) (iv + 4)
        movl    4(%ecx),%ecx
        #   in8 = 0
        mov     $0,%ebx
        #   in9 = 0
        mov     $0,%esi
        #   *(uint32 *) (x + 24) = in6
        movl    %edx,24(%eax)
        #   *(uint32 *) (x + 28) = in7
        movl    %ecx,28(%eax)
        #   *(uint32 *) (x + 32) = in8
        movl    %ebx,32(%eax)
        #   *(uint32 *) (x + 36) = in9
        movl    %esi,36(%eax)
        #   eax = eax_stack
        movl    64(%esp),%eax
        #   ebx = ebx_stack
        movl    68(%esp),%ebx
        #   esi = esi_stack
        movl    72(%esp),%esi
        #   edi = edi_stack
        movl    76(%esp),%edi
        #   ebp = ebp_stack
        movl    80(%esp),%ebp
        # leave
        add     %eax,%esp
        ret
ENDPROC(salsa20_ivsetup)