linux/lib/crypto/curve25519-hacl64.c
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
 * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * This is a machine-generated formally verified implementation of Curve25519
 * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
 * generated, it has been tweaked to be suitable for use in the kernel. It is
 * optimized for 64-bit machines that can efficiently work with 128-bit
 * integer types.
 */

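/*
 * Field elements are held in radix 2^51: five u64 limbs, each normally below
 * 2^51, representing a value modulo p = 2^255 - 19. The recurring constant
 * 0x7ffffffffffff is the 51-bit limb mask, and carries out of the top limb
 * are folded back into limb 0 multiplied by 19, since 2^255 == 19 (mod p).
 */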
#include <asm/unaligned.h>
#include <crypto/curve25519.h>
#include <linux/string.h>

typedef __uint128_t u128;

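/*
 * Constant-time comparison masks: each returns all ones (~(u64)0) when the
 * relation holds and 0 otherwise, without data-dependent branches.
 */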
static __always_inline u64 u64_eq_mask(u64 a, u64 b)
{
        u64 x = a ^ b;
        u64 minus_x = ~x + (u64)1U;
        u64 x_or_minus_x = x | minus_x;
        u64 xnx = x_or_minus_x >> (u32)63U;
        u64 c = xnx - (u64)1U;
        return c;
}

static __always_inline u64 u64_gte_mask(u64 a, u64 b)
{
        u64 x = a;
        u64 y = b;
        u64 x_xor_y = x ^ y;
        u64 x_sub_y = x - y;
        u64 x_sub_y_xor_y = x_sub_y ^ y;
        u64 q = x_xor_y | x_sub_y_xor_y;
        u64 x_xor_q = x ^ q;
        u64 x_xor_q_ = x_xor_q >> (u32)63U;
        u64 c = x_xor_q_ - (u64)1U;
        return c;
}

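/* Fold the carry above bit 51 of the top limb back into limb 0 (times 19). */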
static __always_inline void modulo_carry_top(u64 *b)
{
        u64 b4 = b[4];
        u64 b0 = b[0];
        u64 b4_ = b4 & 0x7ffffffffffffLLU;
        u64 b0_ = b0 + 19 * (b4 >> 51);
        b[4] = b4_;
        b[0] = b0_;
}

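/* Narrow a wide (u128) accumulator back to u64 limbs by truncation. */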
static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
{
        {
                u128 xi = input[0];
                output[0] = ((u64)(xi));
        }
        {
                u128 xi = input[1];
                output[1] = ((u64)(xi));
        }
        {
                u128 xi = input[2];
                output[2] = ((u64)(xi));
        }
        {
                u128 xi = input[3];
                output[3] = ((u64)(xi));
        }
        {
                u128 xi = input[4];
                output[4] = ((u64)(xi));
        }
}

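/* Multiply-accumulate: output[i] += input[i] * s, using 128-bit products. */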
static __always_inline void
fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
{
        output[0] += (u128)input[0] * s;
        output[1] += (u128)input[1] * s;
        output[2] += (u128)input[2] * s;
        output[3] += (u128)input[3] * s;
        output[4] += (u128)input[4] * s;
}

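/* Propagate the carries above bit 51 from limbs 0..3 into the next wide limb. */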
static __always_inline void fproduct_carry_wide_(u128 *tmp)
{
        {
                u32 ctr = 0;
                u128 tctr = tmp[ctr];
                u128 tctrp1 = tmp[ctr + 1];
                u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
                u128 c = ((tctr) >> (51));
                tmp[ctr] = ((u128)(r0));
                tmp[ctr + 1] = ((tctrp1) + (c));
        }
        {
                u32 ctr = 1;
                u128 tctr = tmp[ctr];
                u128 tctrp1 = tmp[ctr + 1];
                u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
                u128 c = ((tctr) >> (51));
                tmp[ctr] = ((u128)(r0));
                tmp[ctr + 1] = ((tctrp1) + (c));
        }
        {
                u32 ctr = 2;
                u128 tctr = tmp[ctr];
                u128 tctrp1 = tmp[ctr + 1];
                u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
                u128 c = ((tctr) >> (51));
                tmp[ctr] = ((u128)(r0));
                tmp[ctr + 1] = ((tctrp1) + (c));
        }
        {
                u32 ctr = 3;
                u128 tctr = tmp[ctr];
                u128 tctrp1 = tmp[ctr + 1];
                u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
                u128 c = ((tctr) >> (51));
                tmp[ctr] = ((u128)(r0));
                tmp[ctr + 1] = ((tctrp1) + (c));
        }
}

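/*
 * Multiply a field element in place by 2^51 mod p: rotate the limbs up by
 * one position and multiply the limb that wraps around by 19.
 */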
static __always_inline void fmul_shift_reduce(u64 *output)
{
        u64 tmp = output[4];
        u64 b0;
        {
                u32 ctr = 5 - 0 - 1;
                u64 z = output[ctr - 1];
                output[ctr] = z;
        }
        {
                u32 ctr = 5 - 1 - 1;
                u64 z = output[ctr - 1];
                output[ctr] = z;
        }
        {
                u32 ctr = 5 - 2 - 1;
                u64 z = output[ctr - 1];
                output[ctr] = z;
        }
        {
                u32 ctr = 5 - 3 - 1;
                u64 z = output[ctr - 1];
                output[ctr] = z;
        }
        output[0] = tmp;
        b0 = output[0];
        output[0] = 19 * b0;
}

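/*
 * Schoolbook multiplication in radix 2^51: for each limb of input21,
 * accumulate input * input21[i] into the wide output, shifting input by
 * 2^51 mod p between steps.
 */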
static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
                                                   u64 *input21)
{
        u32 i;
        u64 input2i;
        {
                u64 input2i = input21[0];
                fproduct_sum_scalar_multiplication_(output, input, input2i);
                fmul_shift_reduce(input);
        }
        {
                u64 input2i = input21[1];
                fproduct_sum_scalar_multiplication_(output, input, input2i);
                fmul_shift_reduce(input);
        }
        {
                u64 input2i = input21[2];
                fproduct_sum_scalar_multiplication_(output, input, input2i);
                fmul_shift_reduce(input);
        }
        {
                u64 input2i = input21[3];
                fproduct_sum_scalar_multiplication_(output, input, input2i);
                fmul_shift_reduce(input);
        }
        i = 4;
        input2i = input21[i];
        fproduct_sum_scalar_multiplication_(output, input, input2i);
}

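/*
 * Field multiplication: output = input * input21 mod p. Works on a copy of
 * input, then carries and reduces the wide result back to five u64 limbs.
 */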
static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
{
        u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
        {
                u128 b4;
                u128 b0;
                u128 b4_;
                u128 b0_;
                u64 i0;
                u64 i1;
                u64 i0_;
                u64 i1_;
                u128 t[5] = { 0 };
                fmul_mul_shift_reduce_(t, tmp, input21);
                fproduct_carry_wide_(t);
                b4 = t[4];
                b0 = t[0];
                b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
                b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
                t[4] = b4_;
                t[0] = b0_;
                fproduct_copy_from_wide_(output, t);
                i0 = output[0];
                i1 = output[1];
                i0_ = i0 & 0x7ffffffffffffLLU;
                i1_ = i1 + (i0 >> 51);
                output[0] = i0_;
                output[1] = i1_;
        }
}

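/* Wide squaring core: tmp = output^2, exploiting the symmetry of the square. */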
static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
{
        u64 r0 = output[0];
        u64 r1 = output[1];
        u64 r2 = output[2];
        u64 r3 = output[3];
        u64 r4 = output[4];
        u64 d0 = r0 * 2;
        u64 d1 = r1 * 2;
        u64 d2 = r2 * 2 * 19;
        u64 d419 = r4 * 19;
        u64 d4 = d419 * 2;
        u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
                   (((u128)(d2) * (r3))));
        u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
                   (((u128)(r3 * 19) * (r3))));
        u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
                   (((u128)(d4) * (r3))));
        u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
                   (((u128)(r4) * (d419))));
        u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
                   (((u128)(r2) * (r2))));
        tmp[0] = s0;
        tmp[1] = s1;
        tmp[2] = s2;
        tmp[3] = s3;
        tmp[4] = s4;
}

static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
{
        u128 b4;
        u128 b0;
        u128 b4_;
        u128 b0_;
        u64 i0;
        u64 i1;
        u64 i0_;
        u64 i1_;
        fsquare_fsquare__(tmp, output);
        fproduct_carry_wide_(tmp);
        b4 = tmp[4];
        b0 = tmp[0];
        b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
        b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
        tmp[4] = b4_;
        tmp[0] = b0_;
        fproduct_copy_from_wide_(output, tmp);
        i0 = output[0];
        i1 = output[1];
        i0_ = i0 & 0x7ffffffffffffLLU;
        i1_ = i1 + (i0 >> 51);
        output[0] = i0_;
        output[1] = i1_;
}

static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
                                                   u32 count1)
{
        u32 i;
        fsquare_fsquare_(tmp, output);
        for (i = 1; i < count1; ++i)
                fsquare_fsquare_(tmp, output);
}

static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
                                                  u32 count1)
{
        u128 t[5];
        memcpy(output, input, 5 * sizeof(*input));
        fsquare_fsquare_times_(output, t, count1);
}

static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
                                                          u32 count1)
{
        u128 t[5];
        fsquare_fsquare_times_(output, t, count1);
}

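/*
 * Field inversion via Fermat's little theorem: out = z^(p - 2) mod p,
 * computed with a fixed chain of squarings and multiplications.
 */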
static __always_inline void crecip_crecip(u64 *out, u64 *z)
{
        u64 buf[20] = { 0 };
        u64 *a0 = buf;
        u64 *t00 = buf + 5;
        u64 *b0 = buf + 10;
        u64 *t01;
        u64 *b1;
        u64 *c0;
        u64 *a;
        u64 *t0;
        u64 *b;
        u64 *c;
        fsquare_fsquare_times(a0, z, 1);
        fsquare_fsquare_times(t00, a0, 2);
        fmul_fmul(b0, t00, z);
        fmul_fmul(a0, b0, a0);
        fsquare_fsquare_times(t00, a0, 1);
        fmul_fmul(b0, t00, b0);
        fsquare_fsquare_times(t00, b0, 5);
        t01 = buf + 5;
        b1 = buf + 10;
        c0 = buf + 15;
        fmul_fmul(b1, t01, b1);
        fsquare_fsquare_times(t01, b1, 10);
        fmul_fmul(c0, t01, b1);
        fsquare_fsquare_times(t01, c0, 20);
        fmul_fmul(t01, t01, c0);
        fsquare_fsquare_times_inplace(t01, 10);
        fmul_fmul(b1, t01, b1);
        fsquare_fsquare_times(t01, b1, 50);
        a = buf;
        t0 = buf + 5;
        b = buf + 10;
        c = buf + 15;
        fmul_fmul(c, t0, b);
        fsquare_fsquare_times(t0, c, 100);
        fmul_fmul(t0, t0, c);
        fsquare_fsquare_times_inplace(t0, 50);
        fmul_fmul(t0, t0, b);
        fsquare_fsquare_times_inplace(t0, 5);
        fmul_fmul(out, t0, a);
}

static __always_inline void fsum(u64 *a, u64 *b)
{
        a[0] += b[0];
        a[1] += b[1];
        a[2] += b[2];
        a[3] += b[3];
        a[4] += b[4];
}

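/*
 * Compute a = b - a. Each limb of 8*p is added to b first so that the
 * per-limb subtractions cannot underflow.
 */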
static __always_inline void fdifference(u64 *a, u64 *b)
{
        u64 tmp[5] = { 0 };
        u64 b0;
        u64 b1;
        u64 b2;
        u64 b3;
        u64 b4;
        memcpy(tmp, b, 5 * sizeof(*b));
        b0 = tmp[0];
        b1 = tmp[1];
        b2 = tmp[2];
        b3 = tmp[3];
        b4 = tmp[4];
        tmp[0] = b0 + 0x3fffffffffff68LLU;
        tmp[1] = b1 + 0x3ffffffffffff8LLU;
        tmp[2] = b2 + 0x3ffffffffffff8LLU;
        tmp[3] = b3 + 0x3ffffffffffff8LLU;
        tmp[4] = b4 + 0x3ffffffffffff8LLU;
        {
                u64 xi = a[0];
                u64 yi = tmp[0];
                a[0] = yi - xi;
        }
        {
                u64 xi = a[1];
                u64 yi = tmp[1];
                a[1] = yi - xi;
        }
        {
                u64 xi = a[2];
                u64 yi = tmp[2];
                a[2] = yi - xi;
        }
        {
                u64 xi = a[3];
                u64 yi = tmp[3];
                a[3] = yi - xi;
        }
        {
                u64 xi = a[4];
                u64 yi = tmp[4];
                a[4] = yi - xi;
        }
}

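/* Multiply a field element by a small scalar s and reduce. */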
static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
{
        u128 tmp[5];
        u128 b4;
        u128 b0;
        u128 b4_;
        u128 b0_;
        {
                u64 xi = b[0];
                tmp[0] = ((u128)(xi) * (s));
        }
        {
                u64 xi = b[1];
                tmp[1] = ((u128)(xi) * (s));
        }
        {
                u64 xi = b[2];
                tmp[2] = ((u128)(xi) * (s));
        }
        {
                u64 xi = b[3];
                tmp[3] = ((u128)(xi) * (s));
        }
        {
                u64 xi = b[4];
                tmp[4] = ((u128)(xi) * (s));
        }
        fproduct_carry_wide_(tmp);
        b4 = tmp[4];
        b0 = tmp[0];
        b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
        b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
        tmp[4] = b4_;
        tmp[0] = b0_;
        fproduct_copy_from_wide_(output, tmp);
}

static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
{
        fmul_fmul(output, a, b);
}

static __always_inline void crecip(u64 *output, u64 *input)
{
        crecip_crecip(output, input);
}

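/*
 * Constant-time conditional swap of limb (ctr - 1) of a and b, controlled by
 * the all-ones/all-zeros mask swap1.
 */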
static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
                                                        u64 swap1, u32 ctr)
{
        u32 i = ctr - 1;
        u64 ai = a[i];
        u64 bi = b[i];
        u64 x = swap1 & (ai ^ bi);
        u64 ai1 = ai ^ x;
        u64 bi1 = bi ^ x;
        a[i] = ai1;
        b[i] = bi1;
}

static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
{
        point_swap_conditional_step(a, b, swap1, 5);
        point_swap_conditional_step(a, b, swap1, 4);
        point_swap_conditional_step(a, b, swap1, 3);
        point_swap_conditional_step(a, b, swap1, 2);
        point_swap_conditional_step(a, b, swap1, 1);
}

static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
{
        u64 swap1 = 0 - iswap;
        point_swap_conditional5(a, b, swap1);
        point_swap_conditional5(a + 5, b + 5, swap1);
}

static __always_inline void point_copy(u64 *output, u64 *input)
{
        memcpy(output, input, 5 * sizeof(*input));
        memcpy(output + 5, input + 5, 5 * sizeof(*input));
}

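/*
 * Combined Montgomery differential addition and doubling: given p and pq
 * (both as x:z pairs) and qmqp (the affine x-coordinate of their difference,
 * i.e. the base point), compute pp = 2*p and ppq = p + pq.
 */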
static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
                                                u64 *pq, u64 *qmqp)
{
        u64 *qx = qmqp;
        u64 *x2 = pp;
        u64 *z2 = pp + 5;
        u64 *x3 = ppq;
        u64 *z3 = ppq + 5;
        u64 *x = p;
        u64 *z = p + 5;
        u64 *xprime = pq;
        u64 *zprime = pq + 5;
        u64 buf[40] = { 0 };
        u64 *origx = buf;
        u64 *origxprime0 = buf + 5;
        u64 *xxprime0;
        u64 *zzprime0;
        u64 *origxprime;
        xxprime0 = buf + 25;
        zzprime0 = buf + 30;
        memcpy(origx, x, 5 * sizeof(*x));
        fsum(x, z);
        fdifference(z, origx);
        memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
        fsum(xprime, zprime);
        fdifference(zprime, origxprime0);
        fmul(xxprime0, xprime, z);
        fmul(zzprime0, x, zprime);
        origxprime = buf + 5;
        {
                u64 *xx0;
                u64 *zz0;
                u64 *xxprime;
                u64 *zzprime;
                u64 *zzzprime;
                xx0 = buf + 15;
                zz0 = buf + 20;
                xxprime = buf + 25;
                zzprime = buf + 30;
                zzzprime = buf + 35;
                memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
                fsum(xxprime, zzprime);
                fdifference(zzprime, origxprime);
                fsquare_fsquare_times(x3, xxprime, 1);
                fsquare_fsquare_times(zzzprime, zzprime, 1);
                fmul(z3, zzzprime, qx);
                fsquare_fsquare_times(xx0, x, 1);
                fsquare_fsquare_times(zz0, z, 1);
                {
                        u64 *zzz;
                        u64 *xx;
                        u64 *zz;
                        u64 scalar;
                        zzz = buf + 10;
                        xx = buf + 15;
                        zz = buf + 20;
                        fmul(x2, xx, zz);
                        fdifference(zz, xx);
                        scalar = 121665;
                        fscalar(zzz, zz, scalar);
                        fsum(zzz, xx);
                        fmul(z2, zzz, zz);
                }
        }
}

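/*
 * One ladder step driven by the top bit of byt: conditionally swap the two
 * working points, do the combined add/double, then swap back.
 */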
static __always_inline void
ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
                                       u64 *q, u8 byt)
{
        u64 bit0 = (u64)(byt >> 7);
        u64 bit;
        point_swap_conditional(nq, nqpq, bit0);
        addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
        bit = (u64)(byt >> 7);
        point_swap_conditional(nq2, nqpq2, bit);
}

static __always_inline void
ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
                                              u64 *nqpq2, u64 *q, u8 byt)
{
        u8 byt1;
        ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
        byt1 = byt << 1;
        ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
}

static __always_inline void
ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
                                  u64 *q, u8 byt, u32 i)
{
        while (i--) {
                ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
                                                              nqpq2, q, byt);
                byt <<= 2;
        }
}

static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
                                                          u64 *nqpq, u64 *nq2,
                                                          u64 *nqpq2, u64 *q,
                                                          u32 i)
{
        while (i--) {
                u8 byte = n1[i];
                ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
                                                  byte, 4);
        }
}

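/*
 * Montgomery ladder: result = [n1]q. The 32 scalar bytes are consumed from
 * the most significant byte down, eight bits per byte, most significant bit
 * first.
 */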
static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
{
        u64 point_buf[40] = { 0 };
        u64 *nq = point_buf;
        u64 *nqpq = point_buf + 10;
        u64 *nq2 = point_buf + 20;
        u64 *nqpq2 = point_buf + 30;
        point_copy(nqpq, q);
        nq[0] = 1;
        ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
        point_copy(result, nq);
}

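/*
 * Load a 32-byte little-endian field element into five 51-bit limbs, using
 * overlapping unaligned 64-bit loads.
 */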
static __always_inline void format_fexpand(u64 *output, const u8 *input)
{
        const u8 *x00 = input + 6;
        const u8 *x01 = input + 12;
        const u8 *x02 = input + 19;
        const u8 *x0 = input + 24;
        u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
        i0 = get_unaligned_le64(input);
        i1 = get_unaligned_le64(x00);
        i2 = get_unaligned_le64(x01);
        i3 = get_unaligned_le64(x02);
        i4 = get_unaligned_le64(x0);
        output0 = i0 & 0x7ffffffffffffLLU;
        output1 = i1 >> 3 & 0x7ffffffffffffLLU;
        output2 = i2 >> 6 & 0x7ffffffffffffLLU;
        output3 = i3 >> 1 & 0x7ffffffffffffLLU;
        output4 = i4 >> 12 & 0x7ffffffffffffLLU;
        output[0] = output0;
        output[1] = output1;
        output[2] = output2;
        output[3] = output3;
        output[4] = output4;
}

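/*
 * The fcontract carry passes below bring every limb under 2^51 before the
 * final reduction and serialization.
 */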
static __always_inline void format_fcontract_first_carry_pass(u64 *input)
{
        u64 t0 = input[0];
        u64 t1 = input[1];
        u64 t2 = input[2];
        u64 t3 = input[3];
        u64 t4 = input[4];
        u64 t1_ = t1 + (t0 >> 51);
        u64 t0_ = t0 & 0x7ffffffffffffLLU;
        u64 t2_ = t2 + (t1_ >> 51);
        u64 t1__ = t1_ & 0x7ffffffffffffLLU;
        u64 t3_ = t3 + (t2_ >> 51);
        u64 t2__ = t2_ & 0x7ffffffffffffLLU;
        u64 t4_ = t4 + (t3_ >> 51);
        u64 t3__ = t3_ & 0x7ffffffffffffLLU;
        input[0] = t0_;
        input[1] = t1__;
        input[2] = t2__;
        input[3] = t3__;
        input[4] = t4_;
}

static __always_inline void format_fcontract_first_carry_full(u64 *input)
{
        format_fcontract_first_carry_pass(input);
        modulo_carry_top(input);
}

static __always_inline void format_fcontract_second_carry_pass(u64 *input)
{
        u64 t0 = input[0];
        u64 t1 = input[1];
        u64 t2 = input[2];
        u64 t3 = input[3];
        u64 t4 = input[4];
        u64 t1_ = t1 + (t0 >> 51);
        u64 t0_ = t0 & 0x7ffffffffffffLLU;
        u64 t2_ = t2 + (t1_ >> 51);
        u64 t1__ = t1_ & 0x7ffffffffffffLLU;
        u64 t3_ = t3 + (t2_ >> 51);
        u64 t2__ = t2_ & 0x7ffffffffffffLLU;
        u64 t4_ = t4 + (t3_ >> 51);
        u64 t3__ = t3_ & 0x7ffffffffffffLLU;
        input[0] = t0_;
        input[1] = t1__;
        input[2] = t2__;
        input[3] = t3__;
        input[4] = t4_;
}

static __always_inline void format_fcontract_second_carry_full(u64 *input)
{
        u64 i0;
        u64 i1;
        u64 i0_;
        u64 i1_;
        format_fcontract_second_carry_pass(input);
        modulo_carry_top(input);
        i0 = input[0];
        i1 = input[1];
        i0_ = i0 & 0x7ffffffffffffLLU;
        i1_ = i1 + (i0 >> 51);
        input[0] = i0_;
        input[1] = i1_;
}

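/*
 * Constant-time final reduction: subtract p if and only if the value is in
 * [p, 2^255), leaving the unique representative in [0, p).
 */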
static __always_inline void format_fcontract_trim(u64 *input)
{
        u64 a0 = input[0];
        u64 a1 = input[1];
        u64 a2 = input[2];
        u64 a3 = input[3];
        u64 a4 = input[4];
        u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
        u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
        u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
        u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
        u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
        u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
        u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
        u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
        u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
        u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
        u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
        input[0] = a0_;
        input[1] = a1_;
        input[2] = a2_;
        input[3] = a3_;
        input[4] = a4_;
}

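/* Pack five 51-bit limbs into the 32-byte little-endian wire format. */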
static __always_inline void format_fcontract_store(u8 *output, u64 *input)
{
        u64 t0 = input[0];
        u64 t1 = input[1];
        u64 t2 = input[2];
        u64 t3 = input[3];
        u64 t4 = input[4];
        u64 o0 = t1 << 51 | t0;
        u64 o1 = t2 << 38 | t1 >> 13;
        u64 o2 = t3 << 25 | t2 >> 26;
        u64 o3 = t4 << 12 | t3 >> 39;
        u8 *b0 = output;
        u8 *b1 = output + 8;
        u8 *b2 = output + 16;
        u8 *b3 = output + 24;
        put_unaligned_le64(o0, b0);
        put_unaligned_le64(o1, b1);
        put_unaligned_le64(o2, b2);
        put_unaligned_le64(o3, b3);
}

static __always_inline void format_fcontract(u8 *output, u64 *input)
{
        format_fcontract_first_carry_full(input);
        format_fcontract_second_carry_full(input);
        format_fcontract_trim(input);
        format_fcontract_store(output, input);
}

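/*
 * Convert a projective point (x:z) to its affine x-coordinate, x * z^-1,
 * and serialize it to 32 bytes.
 */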
static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
{
        u64 *x = point;
        u64 *z = point + 5;
        u64 buf[10] __aligned(32) = { 0 };
        u64 *zmone = buf;
        u64 *sc = buf + 5;
        crecip(zmone, z);
        fmul(sc, x, zmone);
        format_fcontract(scalar, sc);
}

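/**
 * curve25519_generic() - portable X25519 scalar multiplication
 * @mypublic: output buffer for the 32-byte result
 * @secret: the 32-byte private scalar (clamped on a local copy)
 * @basepoint: the 32-byte u-coordinate to be multiplied
 *
 * Illustrative call (names invented for this example; the standard base
 * point is the u-coordinate 9):
 *
 *      u8 pub[CURVE25519_KEY_SIZE];
 *      static const u8 base[CURVE25519_KEY_SIZE] = { 9 };
 *
 *      curve25519_generic(pub, my_secret_key, base);
 */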
void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
                        const u8 secret[CURVE25519_KEY_SIZE],
                        const u8 basepoint[CURVE25519_KEY_SIZE])
{
        u64 buf0[10] __aligned(32) = { 0 };
        u64 *x0 = buf0;
        u64 *z = buf0 + 5;
        u64 *q;
        format_fexpand(x0, basepoint);
        z[0] = 1;
        q = buf0;
        {
                u8 e[32] __aligned(32) = { 0 };
                u8 *scalar;
                memcpy(e, secret, 32);
                curve25519_clamp_secret(e);
                scalar = e;
                {
                        u64 buf[15] = { 0 };
                        u64 *nq = buf;
                        u64 *x = nq;
                        x[0] = 1;
                        ladder_cmult(nq, scalar, q);
                        format_scalar_of_point(mypublic, nq);
                        memzero_explicit(buf, sizeof(buf));
                }
                memzero_explicit(e, sizeof(e));
        }
        memzero_explicit(buf0, sizeof(buf0));
}