linux/arch/x86/crypto/poly1305-sse2-x86_64.S
/*
 * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 16

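# ANMASK extracts a 26-bit limb in each of the two qword lanes; ORMASK sets
# bit 24 in each lane, which is where the 2^128 pad bit of a 16-byte block
# lands once the top limb has been shifted right by 8 bits.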
ANMASK: .octa 0x0000000003ffffff0000000003ffffff
ORMASK: .octa 0x00000000010000000000000001000000

.text

#define h0 0x00(%rdi)
#define h1 0x04(%rdi)
#define h2 0x08(%rdi)
#define h3 0x0c(%rdi)
#define h4 0x10(%rdi)
#define r0 0x00(%rdx)
#define r1 0x04(%rdx)
#define r2 0x08(%rdx)
#define r3 0x0c(%rdx)
#define r4 0x10(%rdx)
#define s1 0x00(%rsp)
#define s2 0x04(%rsp)
#define s3 0x08(%rsp)
#define s4 0x0c(%rsp)
#define m %rsi
#define h01 %xmm0
#define h23 %xmm1
#define h44 %xmm2
#define t1 %xmm3
#define t2 %xmm4
#define t3 %xmm5
#define t4 %xmm6
#define mask %xmm7
#define d0 %r8
#define d1 %r9
#define d2 %r10
#define d3 %r11
#define d4 %r12

ENTRY(poly1305_block_sse2)
        # %rdi: Accumulator h[5]
        # %rsi: 16 byte input block m
        # %rdx: Poly1305 key r[5]
        # %rcx: Block count

        # This single-block variant tries to improve performance by doing two
        # multiplications in parallel using SSE instructions. Quite a bit of
        # quadword packing is involved, hence the speedup is marginal.

        push            %rbx
        push            %r12
        sub             $0x10,%rsp

        # s1..s4 = r1..r4 * 5
        mov             r1,%eax
        lea             (%eax,%eax,4),%eax
        mov             %eax,s1
        mov             r2,%eax
        lea             (%eax,%eax,4),%eax
        mov             %eax,s2
        mov             r3,%eax
        lea             (%eax,%eax,4),%eax
        mov             %eax,s3
        mov             r4,%eax
        lea             (%eax,%eax,4),%eax
        mov             %eax,s4
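
        # Working modulo p = 2^130 - 5 means 2^130 == 5 (mod p), so any
        # product term that lands one limb above h4 can instead be folded
        # back into the low limbs multiplied by 5. Precomputing s_i = r_i * 5
        # (via lea's x + 4*x) makes each d_i below a plain sum of products.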

        movdqa          ANMASK(%rip),mask

.Ldoblock:
        # h01 = [0, h1, 0, h0]
        # h23 = [0, h3, 0, h2]
        # h44 = [0, h4, 0, h4]
        movd            h0,h01
        movd            h1,t1
        movd            h2,h23
        movd            h3,t2
        movd            h4,h44
        punpcklqdq      t1,h01
        punpcklqdq      t2,h23
        punpcklqdq      h44,h44

        # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
        movd            0x00(m),t1
        movd            0x03(m),t2
        psrld           $2,t2
        punpcklqdq      t2,t1
        pand            mask,t1
        paddd           t1,h01
        # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
        movd            0x06(m),t1
        movd            0x09(m),t2
        psrld           $4,t1
        psrld           $6,t2
        punpcklqdq      t2,t1
        pand            mask,t1
        paddd           t1,h23
        # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
        mov             0x0c(m),%eax
        shr             $8,%eax
        or              $0x01000000,%eax
        movd            %eax,t1
        pshufd          $0xc4,t1,t1
        paddd           t1,h44
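
        # At this point the 16-byte block has been absorbed into h as five
        # 26-bit limbs, with the 2^128 pad bit of the block showing up as
        # 1 << 24 in the top limb.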

        # t1[0] = h0 * r0 + h2 * s3
        # t1[1] = h1 * s4 + h3 * s2
        movd            r0,t1
        movd            s4,t2
        punpcklqdq      t2,t1
        pmuludq         h01,t1
        movd            s3,t2
        movd            s2,t3
        punpcklqdq      t3,t2
        pmuludq         h23,t2
        paddq           t2,t1
        # t2[0] = h0 * r1 + h2 * s4
        # t2[1] = h1 * r0 + h3 * s3
        movd            r1,t2
        movd            r0,t3
        punpcklqdq      t3,t2
        pmuludq         h01,t2
        movd            s4,t3
        movd            s3,t4
        punpcklqdq      t4,t3
        pmuludq         h23,t3
        paddq           t3,t2
        # t3[0] = h4 * s1
        # t3[1] = h4 * s2
        movd            s1,t3
        movd            s2,t4
        punpcklqdq      t4,t3
        pmuludq         h44,t3
        # d0 = t1[0] + t1[1] + t3[0]
        # d1 = t2[0] + t2[1] + t3[1]
        movdqa          t1,t4
        punpcklqdq      t2,t4
        punpckhqdq      t2,t1
        paddq           t4,t1
        paddq           t3,t1
        movq            t1,d0
        psrldq          $8,t1
        movq            t1,d1
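
        # Fully expanded, the two column sums just computed are
        #   d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1
        #   d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2
        # i.e. rows 0 and 1 of the 5x5 schoolbook multiply h * r mod p.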

        # t1[0] = h0 * r2 + h2 * r0
        # t1[1] = h1 * r1 + h3 * s4
        movd            r2,t1
        movd            r1,t2
        punpcklqdq      t2,t1
        pmuludq         h01,t1
        movd            r0,t2
        movd            s4,t3
        punpcklqdq      t3,t2
        pmuludq         h23,t2
        paddq           t2,t1
        # t2[0] = h0 * r3 + h2 * r1
        # t2[1] = h1 * r2 + h3 * r0
        movd            r3,t2
        movd            r2,t3
        punpcklqdq      t3,t2
        pmuludq         h01,t2
        movd            r1,t3
        movd            r0,t4
        punpcklqdq      t4,t3
        pmuludq         h23,t3
        paddq           t3,t2
        # t3[0] = h4 * s3
        # t3[1] = h4 * s4
        movd            s3,t3
        movd            s4,t4
        punpcklqdq      t4,t3
        pmuludq         h44,t3
        # d2 = t1[0] + t1[1] + t3[0]
        # d3 = t2[0] + t2[1] + t3[1]
        movdqa          t1,t4
        punpcklqdq      t2,t4
        punpckhqdq      t2,t1
        paddq           t4,t1
        paddq           t3,t1
        movq            t1,d2
        psrldq          $8,t1
        movq            t1,d3
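
        # Expanded:
        #   d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3
        #   d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4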

        # t1[0] = h0 * r4 + h2 * r2
        # t1[1] = h1 * r3 + h3 * r1
        movd            r4,t1
        movd            r3,t2
        punpcklqdq      t2,t1
        pmuludq         h01,t1
        movd            r2,t2
        movd            r1,t3
        punpcklqdq      t3,t2
        pmuludq         h23,t2
        paddq           t2,t1
        # t3[0] = h4 * r0
        movd            r0,t3
        pmuludq         h44,t3
        # d4 = t1[0] + t1[1] + t3[0]
        movdqa          t1,t4
        psrldq          $8,t4
        paddq           t4,t1
        paddq           t3,t1
        movq            t1,d4
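
        # Expanded:
        #   d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
        # which completes the multiply; d0..d4 now hold wide (well under
        # 64-bit) sums that the carry chain below reduces to 26-bit limbs.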

        # d1 += d0 >> 26
        mov             d0,%rax
        shr             $26,%rax
        add             %rax,d1
        # h0 = d0 & 0x3ffffff
        mov             d0,%rbx
        and             $0x3ffffff,%ebx

        # d2 += d1 >> 26
        mov             d1,%rax
        shr             $26,%rax
        add             %rax,d2
        # h1 = d1 & 0x3ffffff
        mov             d1,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h1

        # d3 += d2 >> 26
        mov             d2,%rax
        shr             $26,%rax
        add             %rax,d3
        # h2 = d2 & 0x3ffffff
        mov             d2,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h2

        # d4 += d3 >> 26
        mov             d3,%rax
        shr             $26,%rax
        add             %rax,d4
        # h3 = d3 & 0x3ffffff
        mov             d3,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h3

        # h0 += (d4 >> 26) * 5
        mov             d4,%rax
        shr             $26,%rax
        lea             (%eax,%eax,4),%eax
        add             %eax,%ebx
        # h4 = d4 & 0x3ffffff
        mov             d4,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h4

        # h1 += h0 >> 26
        mov             %ebx,%eax
        shr             $26,%eax
        add             %eax,h1
        # h0 = h0 & 0x3ffffff
        and             $0x3ffffff,%ebx
        mov             %ebx,h0
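
        # The final carry out of h4 is folded back into h0 multiplied by 5
        # (again using 2^130 == 5 mod p), followed by one more carry from h0
        # into h1. The limbs stay only partially reduced, which is
        # sufficient as input for the next iteration.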

        add             $0x10,m
        dec             %rcx
        jnz             .Ldoblock

        add             $0x10,%rsp
        pop             %r12
        pop             %rbx
        ret
ENDPROC(poly1305_block_sse2)


#define u0 0x00(%r8)
#define u1 0x04(%r8)
#define u2 0x08(%r8)
#define u3 0x0c(%r8)
#define u4 0x10(%r8)
#define hc0 %xmm0
#define hc1 %xmm1
#define hc2 %xmm2
#define hc3 %xmm5
#define hc4 %xmm6
#define ru0 %xmm7
#define ru1 %xmm8
#define ru2 %xmm9
#define ru3 %xmm10
#define ru4 %xmm11
#define sv1 %xmm12
#define sv2 %xmm13
#define sv3 %xmm14
#define sv4 %xmm15
#undef d0
#define d0 %r13

ENTRY(poly1305_2block_sse2)
        # %rdi: Accumulator h[5]
        # %rsi: 16 byte input block m
        # %rdx: Poly1305 key r[5]
        # %rcx: Doubleblock count
        # %r8:  Poly1305 derived key r^2 u[5]

        # This two-block variant further improves performance by unrolling
        # the block processing loop. It is more straightforward and does
        # less byte shuffling, but requires a second Poly1305 key r^2:
        # h = (h + m) * r    =>    h = (h + m1) * r^2 + m2 * r
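        # The identity follows from composing two single-block steps:
        # ((h + m1) * r + m2) * r = (h + m1) * r^2 + m2 * r, so both blocks
        # can be multiplied in parallel, one by u = r^2 and one by r.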

        push            %rbx
        push            %r12
        push            %r13

        # combine r0,u0
        movd            u0,ru0
        movd            r0,t1
        punpcklqdq      t1,ru0

        # combine r1,u1 and s1=r1*5,v1=u1*5
        movd            u1,ru1
        movd            r1,t1
        punpcklqdq      t1,ru1
        movdqa          ru1,sv1
        pslld           $2,sv1
        paddd           ru1,sv1

        # combine r2,u2 and s2=r2*5,v2=u2*5
        movd            u2,ru2
        movd            r2,t1
        punpcklqdq      t1,ru2
        movdqa          ru2,sv2
        pslld           $2,sv2
        paddd           ru2,sv2

        # combine r3,u3 and s3=r3*5,v3=u3*5
        movd            u3,ru3
        movd            r3,t1
        punpcklqdq      t1,ru3
        movdqa          ru3,sv3
        pslld           $2,sv3
        paddd           ru3,sv3

        # combine r4,u4 and s4=r4*5,v4=u4*5
        movd            u4,ru4
        movd            r4,t1
        punpcklqdq      t1,ru4
        movdqa          ru4,sv4
        pslld           $2,sv4
        paddd           ru4,sv4
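
        # Each sv register computes the *5 multiple in both lanes at once:
        # pslld $2 gives 4*x and the paddd adds x, so sv_i = [5*r_i, 5*u_i]
        # packed alongside ru_i = [r_i, u_i].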

.Ldoblock2:
        # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
        movd            0x00(m),hc0
        movd            0x10(m),t1
        punpcklqdq      t1,hc0
        pand            ANMASK(%rip),hc0
        movd            h0,t1
        paddd           t1,hc0
        # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
        movd            0x03(m),hc1
        movd            0x13(m),t1
        punpcklqdq      t1,hc1
        psrld           $2,hc1
        pand            ANMASK(%rip),hc1
        movd            h1,t1
        paddd           t1,hc1
        # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
        movd            0x06(m),hc2
        movd            0x16(m),t1
        punpcklqdq      t1,hc2
        psrld           $4,hc2
        pand            ANMASK(%rip),hc2
        movd            h2,t1
        paddd           t1,hc2
        # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
        movd            0x09(m),hc3
        movd            0x19(m),t1
        punpcklqdq      t1,hc3
        psrld           $6,hc3
        pand            ANMASK(%rip),hc3
        movd            h3,t1
        paddd           t1,hc3
        # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
        movd            0x0c(m),hc4
        movd            0x1c(m),t1
        punpcklqdq      t1,hc4
        psrld           $8,hc4
        por             ORMASK(%rip),hc4
        movd            h4,t1
        paddd           t1,hc4
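
        # Lane layout after loading: the low qword of each hc holds
        # h + (limbs of block 1), the high qword holds the limbs of block 2.
        # Each pmuludq below therefore multiplies block 1 by u = r^2 and
        # block 2 by r in a single instruction.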

        # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
        movdqa          ru0,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
        movdqa          sv3,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
        movdqa          sv2,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
        movdqa          sv1,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d0 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d0
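
        # Adding the two lanes merges both blocks' contributions:
        #   d0 = hc0[0]*u0 + hc1[0]*v4 + hc2[0]*v3 + hc3[0]*v2 + hc4[0]*v1
        #      + hc0[1]*r0 + hc1[1]*s4 + hc2[1]*s3 + hc3[1]*s2 + hc4[1]*s1
        # i.e. limb 0 of (h + m1) * r^2 + m2 * r.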

        # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
        movdqa          ru1,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
        movdqa          sv3,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
        movdqa          sv2,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d1 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d1

        # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
        movdqa          ru2,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
        movdqa          ru1,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
        movdqa          sv3,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d2 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d2

        # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
        movdqa          ru3,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
        movdqa          ru2,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
        movdqa          ru1,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d3 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d3

        # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
        movdqa          ru4,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
        movdqa          ru3,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
        movdqa          ru2,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
        movdqa          ru1,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d4 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d4
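
        # d1..d4 follow the same pattern as d0 with the coefficient
        # registers rotated one position per limb, mirroring the
        # single-block multiply above.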

        # d1 += d0 >> 26
        mov             d0,%rax
        shr             $26,%rax
        add             %rax,d1
        # h0 = d0 & 0x3ffffff
        mov             d0,%rbx
        and             $0x3ffffff,%ebx

        # d2 += d1 >> 26
        mov             d1,%rax
        shr             $26,%rax
        add             %rax,d2
        # h1 = d1 & 0x3ffffff
        mov             d1,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h1

        # d3 += d2 >> 26
        mov             d2,%rax
        shr             $26,%rax
        add             %rax,d3
        # h2 = d2 & 0x3ffffff
        mov             d2,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h2

        # d4 += d3 >> 26
        mov             d3,%rax
        shr             $26,%rax
        add             %rax,d4
        # h3 = d3 & 0x3ffffff
        mov             d3,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h3

        # h0 += (d4 >> 26) * 5
        mov             d4,%rax
        shr             $26,%rax
        lea             (%eax,%eax,4),%eax
        add             %eax,%ebx
        # h4 = d4 & 0x3ffffff
        mov             d4,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h4

        # h1 += h0 >> 26
        mov             %ebx,%eax
        shr             $26,%eax
        add             %eax,h1
        # h0 = h0 & 0x3ffffff
        and             $0x3ffffff,%ebx
        mov             %ebx,h0

        add             $0x20,m
        dec             %rcx
        jnz             .Ldoblock2

        pop             %r13
        pop             %r12
        pop             %rbx
        ret
ENDPROC(poly1305_2block_sse2)