/*
 * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.section        .rodata.cst16.ANMASK, "aM", @progbits, 16
.align 16
ANMASK: .octa 0x0000000003ffffff0000000003ffffff

.section        .rodata.cst16.ORMASK, "aM", @progbits, 16
.align 16
ORMASK: .octa 0x00000000010000000000000001000000
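# Both masks operate on the base 2^26 limb representation used throughout:
# a 130-bit value x is held as five 26-bit limbs,
#   x = x0 + x1*2^26 + x2*2^52 + x3*2^78 + x4*2^104.
# ANMASK truncates two packed limbs to 26 bits at once. ORMASK sets bit 24
# in each lane: the 2^128 padding bit of a full 16 byte block lands in
# limb 4 at bit position 128 - 104 = 24.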

.text

#define h0 0x00(%rdi)
#define h1 0x04(%rdi)
#define h2 0x08(%rdi)
#define h3 0x0c(%rdi)
#define h4 0x10(%rdi)
#define r0 0x00(%rdx)
#define r1 0x04(%rdx)
#define r2 0x08(%rdx)
#define r3 0x0c(%rdx)
#define r4 0x10(%rdx)
#define s1 0x00(%rsp)
#define s2 0x04(%rsp)
#define s3 0x08(%rsp)
#define s4 0x0c(%rsp)
#define m %rsi
#define h01 %xmm0
#define h23 %xmm1
#define h44 %xmm2
#define t1 %xmm3
#define t2 %xmm4
#define t3 %xmm5
#define t4 %xmm6
#define mask %xmm7
#define d0 %r8
#define d1 %r9
#define d2 %r10
#define d3 %r11
#define d4 %r12

ENTRY(poly1305_block_sse2)
        # %rdi: Accumulator h[5]
        # %rsi: 16 byte input block m
        # %rdx: Poly1305 key r[5]
        # %rcx: Block count
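        #
        # The corresponding C prototype, as declared by the SSE2 glue code
        # (an assumption based on arch/x86/crypto/poly1305_glue.c; verify
        # against the tree in use):
        #
        # asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
        #                                     const u32 *r,
        #                                     unsigned int blocks);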

        # This single block variant tries to improve performance by doing two
        # multiplications in parallel using SSE instructions. There is quite
        # a lot of quadword packing involved, hence the speedup is marginal.
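        #
        # With s1..s4 = 5*r1..5*r4 (set up below), each iteration computes
        # the schoolbook product reduced mod 2^130 - 5:
        #   d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1
        #   d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2
        #   d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3
        #   d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4
        #   d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0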

        push            %rbx
        push            %r12
        sub             $0x10,%rsp

        # s1..s4 = r1..r4 * 5
        mov             r1,%eax
        lea             (%eax,%eax,4),%eax
        mov             %eax,s1
        mov             r2,%eax
        lea             (%eax,%eax,4),%eax
        mov             %eax,s2
        mov             r3,%eax
        lea             (%eax,%eax,4),%eax
        mov             %eax,s3
        mov             r4,%eax
        lea             (%eax,%eax,4),%eax
        mov             %eax,s4
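
        # The factor 5 folds the modular reduction into the multiply: with
        # p = 2^130 - 5 we have 2^130 = 5 (mod p), so any product term that
        # reaches weight 2^130 wraps back to weight 1 carrying a factor of
        # 5. E.g. h1*r4 has weight 2^(26+104) = 2^130 and therefore enters
        # d0 as h1*(5*r4) = h1*s4.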

        movdqa          ANMASK(%rip),mask

.Ldoblock:
        # h01 = [0, h1, 0, h0]
        # h23 = [0, h3, 0, h2]
        # h44 = [0, h4, 0, h4]
        movd            h0,h01
        movd            h1,t1
        movd            h2,h23
        movd            h3,t2
        movd            h4,h44
        punpcklqdq      t1,h01
        punpcklqdq      t2,h23
        punpcklqdq      h44,h44
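
        # The 16 byte block m contributes five 26-bit limbs plus the 2^128
        # pad bit: limb i = (m >> 26*i) & 0x3ffffff for i = 0..3, and
        # limb 4 = (m >> 104) | (1 << 24). The unaligned 4-byte loads at
        # offsets 0,3,6,9,12 below, shifted right by 0,2,4,6,8 bits, yield
        # exactly these limbs.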

        # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
        movd            0x00(m),t1
        movd            0x03(m),t2
        psrld           $2,t2
        punpcklqdq      t2,t1
        pand            mask,t1
        paddd           t1,h01
        # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
        movd            0x06(m),t1
        movd            0x09(m),t2
        psrld           $4,t1
        psrld           $6,t2
        punpcklqdq      t2,t1
        pand            mask,t1
        paddd           t1,h23
        # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
        mov             0x0c(m),%eax
        shr             $8,%eax
        or              $0x01000000,%eax
        movd            %eax,t1
        pshufd          $0xc4,t1,t1
        paddd           t1,h44

        # t1[0] = h0 * r0 + h2 * s3
        # t1[1] = h1 * s4 + h3 * s2
        movd            r0,t1
        movd            s4,t2
        punpcklqdq      t2,t1
        pmuludq         h01,t1
        movd            s3,t2
        movd            s2,t3
        punpcklqdq      t3,t2
        pmuludq         h23,t2
        paddq           t2,t1
        # t2[0] = h0 * r1 + h2 * s4
        # t2[1] = h1 * r0 + h3 * s3
        movd            r1,t2
        movd            r0,t3
        punpcklqdq      t3,t2
        pmuludq         h01,t2
        movd            s4,t3
        movd            s3,t4
        punpcklqdq      t4,t3
        pmuludq         h23,t3
        paddq           t3,t2
        # t3[0] = h4 * s1
        # t3[1] = h4 * s2
        movd            s1,t3
        movd            s2,t4
        punpcklqdq      t4,t3
        pmuludq         h44,t3
        # d0 = t1[0] + t1[1] + t3[0]
        # d1 = t2[0] + t2[1] + t3[1]
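        # (the punpck pair transposes the lanes: t4 = [ t2[0], t1[0] ] and
        #  t1 = [ t2[1], t1[1] ], so one paddq forms both horizontal sums)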
        movdqa          t1,t4
        punpcklqdq      t2,t4
        punpckhqdq      t2,t1
        paddq           t4,t1
        paddq           t3,t1
        movq            t1,d0
        psrldq          $8,t1
        movq            t1,d1

        # t1[0] = h0 * r2 + h2 * r0
        # t1[1] = h1 * r1 + h3 * s4
        movd            r2,t1
        movd            r1,t2
        punpcklqdq      t2,t1
        pmuludq         h01,t1
        movd            r0,t2
        movd            s4,t3
        punpcklqdq      t3,t2
        pmuludq         h23,t2
        paddq           t2,t1
        # t2[0] = h0 * r3 + h2 * r1
        # t2[1] = h1 * r2 + h3 * r0
        movd            r3,t2
        movd            r2,t3
        punpcklqdq      t3,t2
        pmuludq         h01,t2
        movd            r1,t3
        movd            r0,t4
        punpcklqdq      t4,t3
        pmuludq         h23,t3
        paddq           t3,t2
        # t3[0] = h4 * s3
        # t3[1] = h4 * s4
        movd            s3,t3
        movd            s4,t4
        punpcklqdq      t4,t3
        pmuludq         h44,t3
        # d2 = t1[0] + t1[1] + t3[0]
        # d3 = t2[0] + t2[1] + t3[1]
        movdqa          t1,t4
        punpcklqdq      t2,t4
        punpckhqdq      t2,t1
        paddq           t4,t1
        paddq           t3,t1
        movq            t1,d2
        psrldq          $8,t1
        movq            t1,d3

        # t1[0] = h0 * r4 + h2 * r2
        # t1[1] = h1 * r3 + h3 * r1
        movd            r4,t1
        movd            r3,t2
        punpcklqdq      t2,t1
        pmuludq         h01,t1
        movd            r2,t2
        movd            r1,t3
        punpcklqdq      t3,t2
        pmuludq         h23,t2
        paddq           t2,t1
        # t3[0] = h4 * r0
        movd            r0,t3
        pmuludq         h44,t3
        # d4 = t1[0] + t1[1] + t3[0]
        movdqa          t1,t4
        psrldq          $8,t4
        paddq           t4,t1
        paddq           t3,t1
        movq            t1,d4
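
        # Each d above is a sum of five products of ~27-bit by <30-bit
        # values, so d stays well below 2^60 and fits a 64-bit register
        # with room to spare. The carries d >> 26 can still exceed 32 bits,
        # which is why the final d4 -> h0 fold below is done in 64-bit
        # registers.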

        # d1 += d0 >> 26
        mov             d0,%rax
        shr             $26,%rax
        add             %rax,d1
        # h0 = d0 & 0x3ffffff
        mov             d0,%rbx
        and             $0x3ffffff,%ebx

        # d2 += d1 >> 26
        mov             d1,%rax
        shr             $26,%rax
        add             %rax,d2
        # h1 = d1 & 0x3ffffff
        mov             d1,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h1

        # d3 += d2 >> 26
        mov             d2,%rax
        shr             $26,%rax
        add             %rax,d3
        # h2 = d2 & 0x3ffffff
        mov             d2,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h2

        # d4 += d3 >> 26
        mov             d3,%rax
        shr             $26,%rax
        add             %rax,d4
        # h3 = d3 & 0x3ffffff
        mov             d3,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h3

        # h0 += (d4 >> 26) * 5
        # The carry d4 >> 26 can exceed 32 bits, so compute the fold in
        # 64-bit registers to avoid truncation
        mov             d4,%rax
        shr             $26,%rax
        lea             (%rax,%rax,4),%rax
        add             %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov             d4,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h4

        # h1 += h0 >> 26
        mov             %rbx,%rax
        shr             $26,%rax
        add             %eax,h1
        # h0 = h0 & 0x3ffffff
        andl            $0x3ffffff,%ebx
        mov             %ebx,h0

        add             $0x10,m
        dec             %rcx
        jnz             .Ldoblock

        add             $0x10,%rsp
        pop             %r12
        pop             %rbx
        ret
ENDPROC(poly1305_block_sse2)


#define u0 0x00(%r8)
#define u1 0x04(%r8)
#define u2 0x08(%r8)
#define u3 0x0c(%r8)
#define u4 0x10(%r8)
#define hc0 %xmm0
#define hc1 %xmm1
#define hc2 %xmm2
#define hc3 %xmm5
#define hc4 %xmm6
#define ru0 %xmm7
#define ru1 %xmm8
#define ru2 %xmm9
#define ru3 %xmm10
#define ru4 %xmm11
#define sv1 %xmm12
#define sv2 %xmm13
#define sv3 %xmm14
#define sv4 %xmm15
#undef d0
#define d0 %r13

ENTRY(poly1305_2block_sse2)
        # %rdi: Accumulator h[5]
        # %rsi: 16 byte input blocks m (two per iteration)
        # %rdx: Poly1305 key r[5]
        # %rcx: Doubleblock count
        # %r8:  Poly1305 derived key r^2 u[5]
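        #
        # The corresponding C prototype, as declared by the SSE2 glue code
        # (an assumption based on arch/x86/crypto/poly1305_glue.c; verify
        # against the tree in use):
        #
        # asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src,
        #                                      const u32 *r,
        #                                      unsigned int blocks,
        #                                      const u32 *u);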

        # This two-block variant further improves performance by using
        # loop-unrolled block processing. It is more straightforward and does
        # less byte shuffling, but requires a second Poly1305 key r^2:
        # h = (h + m) * r    =>    h = (h + m1) * r^2 + m2 * r
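        # which follows from expanding two applications of h = (h + m) * r:
        #   ((h + m1) * r + m2) * r = (h + m1) * r^2 + m2 * r
        # One SSE lane runs the r^2 = u chain, the other the r chain, and a
        # horizontal add merges them at the end of each iteration.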

        push            %rbx
        push            %r12
        push            %r13

        # combine r0,u0
        movd            u0,ru0
        movd            r0,t1
        punpcklqdq      t1,ru0

        # combine r1,u1 and s1=r1*5,v1=u1*5
        movd            u1,ru1
        movd            r1,t1
        punpcklqdq      t1,ru1
        movdqa          ru1,sv1
        pslld           $2,sv1
        paddd           ru1,sv1
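        # (x << 2) + x = 5*x per 32-bit lane: the same s = 5*r folding
        # trick as in the single block version, applied to both keys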

        # combine r2,u2 and s2=r2*5,v2=u2*5
        movd            u2,ru2
        movd            r2,t1
        punpcklqdq      t1,ru2
        movdqa          ru2,sv2
        pslld           $2,sv2
        paddd           ru2,sv2

        # combine r3,u3 and s3=r3*5,v3=u3*5
        movd            u3,ru3
        movd            r3,t1
        punpcklqdq      t1,ru3
        movdqa          ru3,sv3
        pslld           $2,sv3
        paddd           ru3,sv3

        # combine r4,u4 and s4=r4*5,v4=u4*5
        movd            u4,ru4
        movd            r4,t1
        punpcklqdq      t1,ru4
        movdqa          ru4,sv4
        pslld           $2,sv4
        paddd           ru4,sv4

.Ldoblock2:
        # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
        movd            0x00(m),hc0
        movd            0x10(m),t1
        punpcklqdq      t1,hc0
        pand            ANMASK(%rip),hc0
        movd            h0,t1
        paddd           t1,hc0
        # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
        movd            0x03(m),hc1
        movd            0x13(m),t1
        punpcklqdq      t1,hc1
        psrld           $2,hc1
        pand            ANMASK(%rip),hc1
        movd            h1,t1
        paddd           t1,hc1
        # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
        movd            0x06(m),hc2
        movd            0x16(m),t1
        punpcklqdq      t1,hc2
        psrld           $4,hc2
        pand            ANMASK(%rip),hc2
        movd            h2,t1
        paddd           t1,hc2
        # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
        movd            0x09(m),hc3
        movd            0x19(m),t1
        punpcklqdq      t1,hc3
        psrld           $6,hc3
        pand            ANMASK(%rip),hc3
        movd            h3,t1
        paddd           t1,hc3
        # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
        movd            0x0c(m),hc4
        movd            0x1c(m),t1
        punpcklqdq      t1,hc4
        psrld           $8,hc4
        por             ORMASK(%rip),hc4
        movd            h4,t1
        paddd           t1,hc4
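
        # From here on every xmm register holds [ block 2 term, block 1
        # term ]: the low lane accumulates (h + m1) * u = (h + m1) * r^2,
        # the high lane m2 * r. The horizontal add per d then yields
        # (h + m1) * r^2 + m2 * r, folding two blocks at once.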

        # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
        movdqa          ru0,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
        movdqa          sv3,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
        movdqa          sv2,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
        movdqa          sv1,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d0 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d0

        # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
        movdqa          ru1,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
        movdqa          sv3,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
        movdqa          sv2,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d1 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d1

        # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
        movdqa          ru2,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
        movdqa          ru1,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
        movdqa          sv3,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d2 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d2

        # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
        movdqa          ru3,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
        movdqa          ru2,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
        movdqa          ru1,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d3 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d3

        # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
        movdqa          ru4,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
        movdqa          ru3,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
        movdqa          ru2,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
        movdqa          ru1,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d4 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d4

        # d1 += d0 >> 26
        mov             d0,%rax
        shr             $26,%rax
        add             %rax,d1
        # h0 = d0 & 0x3ffffff
        mov             d0,%rbx
        and             $0x3ffffff,%ebx

        # d2 += d1 >> 26
        mov             d1,%rax
        shr             $26,%rax
        add             %rax,d2
        # h1 = d1 & 0x3ffffff
        mov             d1,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h1

        # d3 += d2 >> 26
        mov             d2,%rax
        shr             $26,%rax
        add             %rax,d3
        # h2 = d2 & 0x3ffffff
        mov             d2,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h2

        # d4 += d3 >> 26
        mov             d3,%rax
        shr             $26,%rax
        add             %rax,d4
        # h3 = d3 & 0x3ffffff
        mov             d3,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h3

        # h0 += (d4 >> 26) * 5
        # As in the single block version, the carry d4 >> 26 can exceed
        # 32 bits, so compute the fold in 64-bit registers
        mov             d4,%rax
        shr             $26,%rax
        lea             (%rax,%rax,4),%rax
        add             %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov             d4,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h4

        # h1 += h0 >> 26
        mov             %rbx,%rax
        shr             $26,%rax
        add             %eax,h1
        # h0 = h0 & 0x3ffffff
        andl            $0x3ffffff,%ebx
        mov             %ebx,h0

        add             $0x20,m
        dec             %rcx
        jnz             .Ldoblock2

        pop             %r13
        pop             %r12
        pop             %rbx
        ret
ENDPROC(poly1305_2block_sse2)