linux/arch/x86/crypto/curve25519-x86_64.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2/*
   3 * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
   4 * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
   5 * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
   6 */
   7
   8#include <crypto/curve25519.h>
   9#include <crypto/internal/kpp.h>
  10
  11#include <linux/types.h>
  12#include <linux/jump_label.h>
  13#include <linux/kernel.h>
  14#include <linux/module.h>
  15
  16#include <asm/cpufeature.h>
  17#include <asm/processor.h>
  18
  /*
   * Two static keys, both defined in the "false" state and marked
   * __ro_after_init. Judging by their names they gate the BMI2 and ADX code
   * paths; the code that enables and tests them is outside this excerpt.
   */
  19static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2);
  20static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx);
  21
  /* Length, in u64 words, of the eltfp25519_1w array type (4 * 64 = 256 bits). */
  22enum { NUM_WORDS_ELTFP25519 = 4 };
  /* Array of NUM_WORDS_ELTFP25519 u64 words, declared with __aligned(32). */
  23typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
  /*
   * Same alignment, twice as many words (2 * NUM_WORDS_ELTFP25519 = 8).
   * NOTE(review): presumably sized to hold an unreduced double-width
   * product -- confirm against the users of this type.
   */
  24typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
  25
  26#define mul_eltfp25519_1w_adx(c, a, b) do { \
  27        mul_256x256_integer_adx(m.buffer, a, b); \
  28        red_eltfp25519_1w_adx(c, m.buffer); \
  29} while (0)
  30
  31#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
  32        mul_256x256_integer_bmi2(m.buffer, a, b); \
  33        red_eltfp25519_1w_bmi2(c, m.buffer); \
  34} while (0)
  35
  36#define sqr_eltfp25519_1w_adx(a) do { \
  37        sqr_256x256_integer_adx(m.buffer, a); \
  38        red_eltfp25519_1w_adx(a, m.buffer); \
  39} while (0)
  40
  41#define sqr_eltfp25519_1w_bmi2(a) do { \
  42        sqr_256x256_integer_bmi2(m.buffer, a); \
  43        red_eltfp25519_1w_bmi2(a, m.buffer); \
  44} while (0)
  45
  46#define mul_eltfp25519_2w_adx(c, a, b) do { \
  47        mul2_256x256_integer_adx(m.buffer, a, b); \
  48        red_eltfp25519_2w_adx(c, m.buffer); \
  49} while (0)
  50
  51#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
  52        mul2_256x256_integer_bmi2(m.buffer, a, b); \
  53        red_eltfp25519_2w_bmi2(c, m.buffer); \
  54} while (0)
  55
  56#define sqr_eltfp25519_2w_adx(a) do { \
  57        sqr2_256x256_integer_adx(m.buffer, a); \
  58        red_eltfp25519_2w_adx(a, m.buffer); \
  59} while (0)
  60
  61#define sqr_eltfp25519_2w_bmi2(a) do { \
  62        sqr2_256x256_integer_bmi2(m.buffer, a); \
  63        red_eltfp25519_2w_bmi2(a, m.buffer); \
  64} while (0)
  65
  66#define sqrn_eltfp25519_1w_adx(a, times) do { \
  67        int ____counter = (times); \
  68        while (____counter-- > 0) \
  69                sqr_eltfp25519_1w_adx(a); \
  70} while (0)
  71
  72#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
  73        int ____counter = (times); \
  74        while (____counter-- > 0) \
  75                sqr_eltfp25519_1w_bmi2(a); \
  76} while (0)
  77
  78#define copy_eltfp25519_1w(C, A) do { \
  79        (C)[0] = (A)[0]; \
  80        (C)[1] = (A)[1]; \
  81        (C)[2] = (A)[2]; \
  82        (C)[3] = (A)[3]; \
  83} while (0)
  84
  85#define setzero_eltfp25519_1w(C) do { \
  86        (C)[0] = 0; \
  87        (C)[1] = 0; \
  88        (C)[2] = 0; \
  89        (C)[3] = 0; \
  90} while (0)
  91
  92__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
  93        /*   1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
  94                  0xffffffffffffffffUL, 0x5fffffffffffffffUL,
  95        /*   2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
  96                  0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
  97        /*   3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
  98                  0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
  99        /*   4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
 100                  0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
 101        /*   5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
 102                  0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
 103        /*   6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
 104                  0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
 105        /*   7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
 106                  0xc1c20d06231f7614UL, 0x2938218da274f972UL,
 107        /*   8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
 108                  0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
 109        /*   9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
 110                  0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
 111        /*  10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
 112                  0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
 113        /*  11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
 114                  0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
 115        /*  12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
 116                  0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL,
 117        /*  13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL,
 118                  0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL,
 119        /*  14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL,
 120                  0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL,
 121        /*  15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL,
 122                  0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL,
 123        /*  16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL,
 124                  0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL,
 125        /*  17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL,
 126                  0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL,
 127        /*  18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL,
 128                  0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL,
 129        /*  19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL,
 130                  0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL,
 131        /*  20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL,
 132                  0x9d4935467caaf22eUL, 0x5166408eee85ff49UL,
 133        /*  21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL,
 134                  0x5259729241159b1cUL, 0x6a621892d5b0ab33UL,
 135        /*  22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL,
 136                  0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL,
 137        /*  23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL,
 138                  0x23758739f630a257UL, 0x295a407a01a78580UL,
 139        /*  24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL,
 140                  0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL,
 141        /*  25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL,
 142                  0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL,
 143        /*  26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL,
 144                  0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL,
 145        /*  27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL,
 146                  0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL,
 147        /*  28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL,
 148                  0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL,
 149        /*  29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL,
 150                  0x74b4c4ceab102f64UL, 0x183abadd10139845UL,
 151        /*  30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL,
 152                  0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL,
 153        /*  31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL,
 154                  0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL,
 155        /*  32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL,
 156                  0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL,
 157        /*  33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL,
 158                  0xd88768e4904032d8UL, 0x131384427b3aaeecUL,
 159        /*  34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL,
 160                  0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL,
 161        /*  35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL,
 162                  0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL,
 163        /*  36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL,
 164                  0xa401760b882c797aUL, 0x1fc223e28dc88730UL,
 165        /*  37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL,
 166                  0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL,
 167        /*  38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL,
 168                  0xfdbf177988bbc586UL, 0x2959894fcad81df5UL,
 169        /*  39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL,
 170                  0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL,
 171        /*  40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL,
 172                  0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL,
 173        /*  41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL,
 174                  0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL,
 175        /*  42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL,
 176                  0x5c217736fa279374UL, 0x7dde05734afeb1faUL,
 177        /*  43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL,
 178                  0xe6053bf89595bf7aUL, 0x394faf38da245530UL,
 179        /*  44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL,
 180                  0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL,
 181        /*  45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL,
 182                  0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL,
 183        /*  46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL,
 184                  0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL,
 185        /*  47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL,
 186                  0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL,
 187        /*  48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL,
 188                  0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL,
 189        /*  49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL,
 190                  0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL,
 191        /*  50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL,
 192                  0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL,
 193        /*  51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL,
 194                  0xc189218075e91436UL, 0x6d9284169b3b8484UL,
 195        /*  52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL,
 196                  0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL,
 197        /*  53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL,
 198                  0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL,
 199        /*  54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL,
 200                  0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL,
 201        /*  55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL,
 202                  0x19346a65d3224a08UL, 0x0f5034e49b9af466UL,
 203        /*  56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL,
 204                  0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL,
 205        /*  57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL,
 206                  0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL,
 207        /*  58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL,
 208                  0xf826842130f5ad28UL, 0x3ea988f75301a441UL,
 209        /*  59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL,
 210                  0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL,
 211        /*  60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL,
 212                  0xd01469df811d644bUL, 0x77fea47d81a5d71fUL,
 213        /*  61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL,
 214                  0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL,
 215        /*  62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL,
 216                  0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL,
 217        /*  63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL,
 218                  0xbea450e1dbd885d5UL, 0x61b68649320f712cUL,
 219        /*  64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL,
 220                  0x25232973322dbef4UL, 0x445dc4758c17f770UL,
 221        /*  65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL,
 222                  0x1efebefdc053db34UL, 0x4adbe867c65daf99UL,
 223        /*  66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL,
 224                  0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL,
 225        /*  67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL,
 226                  0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL,
 227        /*  68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL,
 228                  0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL,
 229        /*  69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL,
 230                  0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL,
 231        /*  70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL,
 232                  0xbdaacb805831ca6fUL, 0x445b652dc916694fUL,
 233        /*  71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL,
 234                  0xa1823aafe04c314aUL, 0x790a2d94437cf586UL,
 235        /*  72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL,
 236                  0xbf70903b204f5169UL, 0x2f7a89891ba319feUL,
 237        /*  73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL,
 238                  0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL,
 239        /*  74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL,
 240                  0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL,
 241        /*  75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL,
 242                  0x674f1288f8e11217UL, 0x5682250f329f93d0UL,
 243        /*  76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL,
 244                  0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL,
 245        /*  77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL,
 246                  0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL,
 247        /*  78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL,
 248                  0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL,
 249        /*  79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL,
 250                  0x5deadacec9f04973UL, 0x29275b5d41d29b27UL,
 251        /*  80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL,
 252                  0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL,
 253        /*  81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL,
 254                  0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL,
 255        /*  82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL,
 256                  0x894d1d855ae52359UL, 0x68e122157b743d69UL,
 257        /*  83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL,
 258                  0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL,
 259        /*  84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL,
 260                  0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL,
 261        /*  85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL,
 262                  0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL,
 263        /*  86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL,
 264                  0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL,
 265        /*  87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL,
 266                  0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL,
 267        /*  88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL,
 268                  0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL,
 269        /*  89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL,
 270                  0x45adb16e76cefcf2UL, 0x01f768aead232999UL,
 271        /*  90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL,
 272                  0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL,
 273        /*  91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL,
 274                  0x5eefa966de2a701dUL, 0x23b20565de55e3efUL,
 275        /*  92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL,
 276                  0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL,
 277        /*  93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL,
 278                  0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL,
 279        /*  94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL,
 280                  0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL,
 281        /*  95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL,
 282                  0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL,
 283        /*  96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL,
 284                  0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL,
 285        /*  97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL,
 286                  0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL,
 287        /*  98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL,
 288                  0xb9886314844006b1UL, 0x40d2a72ab454cc60UL,
 289        /*  99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL,
 290                  0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL,
 291        /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL,
 292                  0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL,
 293        /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL,
 294                  0x40727064c416d74fUL, 0x6e15c6114b502ef0UL,
 295        /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL,
 296                  0x4a497962066e6043UL, 0x705b3aab41355b44UL,
 297        /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL,
 298                  0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL,
 299        /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL,
 300                  0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL,
 301        /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL,
 302                  0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL,
 303        /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL,
 304                  0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL,
 305        /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL,
 306                  0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL,
 307        /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL,
 308                  0x2088ce1570033c68UL, 0x7fba1f495c837987UL,
 309        /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL,
 310                  0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL,
 311        /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL,
 312                  0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL,
 313        /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL,
 314                  0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL,
 315        /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL,
 316                  0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL,
 317        /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL,
 318                  0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL,
 319        /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL,
 320                  0x00f52e3f67280294UL, 0x566d4fc14730c509UL,
 321        /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL,
 322                  0x216730fba68d6095UL, 0x22e8c3843f69cea7UL,
 323        /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL,
 324                  0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL,
 325        /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL,
 326                  0x508e862f121692fcUL, 0x3a81907fa093c291UL,
 327        /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL,
 328                  0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL,
 329        /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL,
 330                  0xbee595ce8a9df2e5UL, 0x25e496c722422236UL,
 331        /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL,
 332                  0xe488de11d761e352UL, 0x0e878a01a085545cUL,
 333        /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL,
 334                  0x9ea37a487ae80d67UL, 0x67a9958011e41794UL,
 335        /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL,
 336                  0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL,
 337        /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL,
 338                  0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL,
 339        /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL,
 340                  0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL,
 341        /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL,
 342                  0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL,
 343        /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL,
 344                  0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL,
 345        /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL,
 346                  0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL,
 347        /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL,
 348                  0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL,
 349        /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL,
 350                  0x97134556a9832d06UL, 0x269bb0360a84f8a0UL,
 351        /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL,
 352                  0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL,
 353        /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL,
 354                  0x904659bb686e3772UL, 0x7215c371746ba8c8UL,
 355        /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL,
 356                  0x266fd5809208f294UL, 0x5c847085619a26b9UL,
 357        /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL,
 358                  0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL,
 359        /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL,
 360                  0x2b1641917b307614UL, 0x117c554fc4f45b7cUL,
 361        /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL,
 362                  0xd7e803f4171b2827UL, 0x1015e87487d225eaUL,
 363        /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL,
 364                  0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL,
 365        /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL,
 366                  0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL,
 367        /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL,
 368                  0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL,
 369        /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL,
 370                  0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL,
 371        /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL,
 372                  0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL,
 373        /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL,
 374                  0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL,
 375        /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL,
 376                  0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL,
 377        /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL,
 378                  0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL,
 379        /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL,
 380                  0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL,
 381        /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL,
 382                  0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL,
 383        /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL,
 384                  0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL,
 385        /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL,
 386                  0xec219c48fbd21604UL, 0x1aaf1af517c36731UL,
 387        /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL,
 388                  0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL,
 389        /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL,
 390                  0x4dbbc207f531561aUL, 0x0253b7f082128a27UL,
 391        /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL,
 392                  0x52d17436309d4253UL, 0x356f97e13efae576UL,
 393        /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL,
 394                  0x0c776128bed92c98UL, 0x1d34ae93032885b8UL,
 395        /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL,
 396                  0x66124c6f97bda770UL, 0x0f81a0290654124aUL,
 397        /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL,
 398                  0xff08d03f93d8c20aUL, 0x52a148199faef26bUL,
 399        /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL,
 400                  0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL,
 401        /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL,
 402                  0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL,
 403        /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL,
 404                  0x5da643cb4bf30035UL, 0x77db28d63940f721UL,
 405        /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL,
 406                  0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL,
 407        /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL,
 408                  0x140a69245ca575edUL, 0x0cf1c37134273a4cUL,
 409        /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL,
 410                  0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL,
 411        /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL,
 412                  0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL,
 413        /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL,
 414                  0x497d723f802e88e1UL, 0x30684dea602f408dUL,
 415        /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL,
 416                  0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL,
 417        /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL,
 418                  0x7468c7423a543258UL, 0x4a7f11464eb5642fUL,
 419        /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL,
 420                  0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL,
 421        /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL,
 422                  0x026df551dbb85c20UL, 0x74fcd91047e21901UL,
 423        /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL,
 424                  0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL,
 425        /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL,
 426                  0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL,
 427        /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL,
 428                  0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL,
 429        /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL,
 430                  0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL,
 431        /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL,
 432                  0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL,
 433        /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL,
 434                  0x13033ac001f66697UL, 0x273b24fe3b367d75UL,
 435        /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL,
 436                  0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL,
 437        /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL,
 438                  0xacc63ca34b8ec145UL, 0x74621888fee66574UL,
 439        /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL,
 440                  0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL,
 441        /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL,
 442                  0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL,
 443        /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL,
 444                  0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL,
 445        /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL,
 446                  0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL,
 447        /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL,
 448                  0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL,
 449        /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL,
 450                  0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL,
 451        /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL,
 452                  0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL,
 453        /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL,
 454                  0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL,
 455        /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL,
 456                  0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL,
 457        /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL,
 458                  0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL,
 459        /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL,
 460                  0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL,
 461        /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL,
 462                  0x81004b71e33cc191UL, 0x44e6be345122803cUL,
 463        /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL,
 464                  0x49c8c4281af60c29UL, 0x21edb518de701aeeUL,
 465        /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL,
 466                  0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL,
 467        /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL,
 468                  0x12bc8d6915783712UL, 0x498194c0fc620abbUL,
 469        /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL,
 470                  0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL,
 471        /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL,
 472                  0x1e60c24598c71fffUL, 0x59f2f014979983efUL,
 473        /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL,
 474                  0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL,
 475        /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL,
 476                  0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL,
 477        /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL,
 478                  0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL,
 479        /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL,
 480                  0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL,
 481        /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL,
 482                  0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL,
 483        /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL,
 484                  0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL,
 485        /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL,
 486                  0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL,
 487        /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL,
 488                  0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL,
 489        /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL,
 490                  0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL,
 491        /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL,
 492                  0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL,
 493        /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL,
 494                  0x883ada83a6a1652cUL, 0x585f1974034d6c17UL,
 495        /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL,
 496                  0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL,
 497        /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL,
 498                  0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL,
 499        /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL,
 500                  0x33979624f0e917beUL, 0x2c018dc527356b30UL,
 501        /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL,
 502                  0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL,
 503        /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL,
 504                  0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL,
 505        /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL,
 506                  0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL,
 507        /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL,
 508                  0x345ead5e972d091eUL, 0x18c8df11a83103baUL,
 509        /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL,
 510                  0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL,
 511        /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL,
 512                  0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL,
 513        /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL,
 514                  0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL,
 515        /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL,
 516                  0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL,
 517        /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL,
 518                  0x79952a008221e738UL, 0x4322e1a7535cd2bbUL,
 519        /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL,
 520                  0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL,
 521        /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL,
 522                  0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL,
 523        /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL,
 524                  0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL,
 525        /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL,
 526                  0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL,
 527        /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL,
 528                  0x1df4c0af01314a60UL, 0x09a62dab89289527UL,
 529        /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL,
 530                  0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL,
 531        /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL,
 532                  0x49b96853d7a7084aUL, 0x4980a319601420a8UL,
 533        /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL,
 534                  0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL,
 535        /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL,
 536                  0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL,
 537        /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL,
 538                  0xddeb34a061615d99UL, 0x5129cecceb64b773UL,
 539        /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL,
 540                  0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL,
 541        /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL,
 542                  0x680bd77c73edad2eUL, 0x487c02354edd9041UL,
 543        /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL,
 544                  0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL,
 545        /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL,
 546                  0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL,
 547        /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL,
 548                  0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL,
 549        /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL,
 550                  0xe9834262d13921edUL, 0x27fedafaa54bb592UL,
 551        /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL,
 552                  0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL,
 553        /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL,
 554                  0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL,
 555        /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL,
 556                  0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL,
 557        /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL,
 558                  0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL,
 559        /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL,
 560                  0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL,
 561        /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL,
 562                  0x645b426f3d1d58acUL, 0x4804a82227a557bcUL,
 563        /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL,
 564                  0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL,
 565        /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL,
 566                  0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL,
 567        /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL,
 568                  0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL,
 569        /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
 570                  0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
 571        /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
 572                  0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
 573        /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
 574                  0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
 575        /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
 576                  0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
 577        /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
 578                  0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
 579        /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
 580                  0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
 581        /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
 582                  0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
 583        /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
 584                  0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
 585        /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
 586                  0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
 587        /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
 588                  0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
 589        /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
 590                  0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
 591        /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
 592                  0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
 593        /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
 594                  0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
 595        /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
 596                  0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
 597};
 598
  599/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
  600 * a is two 256-bit integers: a0[0:3] and a1[4:7]
  601 * b is two 256-bit integers: b0[0:3] and b1[4:7]
 *
 * Each product is built row by row, one row per 64-bit word of the first
 * operand, using mulx for the 64x64 partial products and adox/adcx to
 * propagate carries.  The asm statement has no output operands: the sixteen
 * result words are stored through c at byte offsets 0..120, and the
 * "memory" clobber tells the compiler about those stores.
  602 */
  603static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
  604                                     const u64 *const b)
  605{
  606        asm volatile(
                     /* r14 = 0; used below as a zero addend that absorbs pending carries */
  607                "xorl %%r14d, %%r14d ;"
  608                "movq   (%1), %%rdx; "  /* A[0] */
  609                "mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
                     /* xorl resets CF and OF; r10 itself is overwritten by the next mulx */
  610                "xorl %%r10d, %%r10d ;"
  611                "movq %%r8, (%0) ;"
  612                "mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
  613                "adox %%r10, %%r15 ;"
  614                "mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
  615                "adox  %%r8, %%rax ;"
  616                "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
  617                "adox %%r10, %%rbx ;"
  618                /******************************************/
                     /* fold the last OF carry of row 0 into its top word */
  619                "adox %%r14, %%rcx ;"
  620
                     /*
                      * Rows 1..3: the adox chain joins the high half of one
                      * partial product to the low half of the next; the adcx
                      * chain adds the row into the running words held in
                      * rax, rbx, rcx and r15.
                      *
                      * NOTE(review): this row has no flag-clearing xorl,
                      * unlike rows A[2], A[3] and every C row below.  CF has
                      * not been written since the xorl in row A[0] (adox only
                      * updates OF; mulx and movq do not write flags); OF is
                      * presumably clear after the final adox of row A[0] --
                      * confirm.
                      */
  621                "movq  8(%1), %%rdx; "  /* A[1] */
  622                "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
  623                "adox %%r15,  %%r8 ;"
  624                "movq  %%r8, 8(%0) ;"
  625                "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
  626                "adox %%r10,  %%r9 ;"
  627                "adcx  %%r9, %%rax ;"
  628                "mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
  629                "adox  %%r8, %%r11 ;"
  630                "adcx %%r11, %%rbx ;"
  631                "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
  632                "adox %%r10, %%r13 ;"
  633                "adcx %%r13, %%rcx ;"
  634                /******************************************/
                     /* both pending carries (OF, then CF) end up in the new top word */
  635                "adox %%r14, %%r15 ;"
  636                "adcx %%r14, %%r15 ;"
  637
  638                "movq 16(%1), %%rdx; " /* A[2] */
  639                "xorl %%r10d, %%r10d ;"
  640                "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
  641                "adox %%rax,  %%r8 ;"
  642                "movq %%r8, 16(%0) ;"
  643                "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
  644                "adox %%r10,  %%r9 ;"
  645                "adcx  %%r9, %%rbx ;"
  646                "mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
  647                "adox  %%r8, %%r11 ;"
  648                "adcx %%r11, %%rcx ;"
  649                "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
  650                "adox %%r10, %%r13 ;"
  651                "adcx %%r13, %%r15 ;"
  652                /******************************************/
  653                "adox %%r14, %%rax ;"
  654                "adcx %%r14, %%rax ;"
  655
                     /* last row of the first product: remaining words are stored as they complete */
  656                "movq 24(%1), %%rdx; " /* A[3] */
  657                "xorl %%r10d, %%r10d ;"
  658                "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
  659                "adox %%rbx,  %%r8 ;"
  660                "movq %%r8, 24(%0) ;"
  661                "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
  662                "adox %%r10,  %%r9 ;"
  663                "adcx  %%r9, %%rcx ;"
  664                "movq %%rcx, 32(%0) ;"
  665                "mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
  666                "adox  %%r8, %%r11 ;"
  667                "adcx %%r11, %%r15 ;"
  668                "movq %%r15, 40(%0) ;"
  669                "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
  670                "adox %%r10, %%r13 ;"
  671                "adcx %%r13, %%rax ;"
  672                "movq %%rax, 48(%0) ;"
  673                /******************************************/
  674                "adox %%r14, %%rbx ;"
  675                "adcx %%r14, %%rbx ;"
  676                "movq %%rbx, 56(%0) ;"
  677
                     /*
                      * Second product: same schedule applied to the words at
                      * byte offsets 32..56 of a and b (named C and D in the
                      * comments), stored at byte offsets 64..120 of c.
                      */
  678                "movq 32(%1), %%rdx; "  /* C[0] */
  679                "mulx 32(%2),  %%r8, %%r15; " /* C[0]*D[0] */
  680                "xorl %%r10d, %%r10d ;"
  681                "movq %%r8, 64(%0);"
  682                "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
  683                "adox %%r10, %%r15 ;"
  684                "mulx 48(%2),  %%r8, %%rbx; " /* C[0]*D[2] */
  685                "adox  %%r8, %%rax ;"
  686                "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
  687                "adox %%r10, %%rbx ;"
  688                /******************************************/
  689                "adox %%r14, %%rcx ;"
  690
  691                "movq 40(%1), %%rdx; " /* C[1] */
  692                "xorl %%r10d, %%r10d ;"
  693                "mulx 32(%2),  %%r8,  %%r9; " /* C[1]*D[0] */
  694                "adox %%r15,  %%r8 ;"
  695                "movq  %%r8, 72(%0);"
  696                "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
  697                "adox %%r10,  %%r9 ;"
  698                "adcx  %%r9, %%rax ;"
  699                "mulx 48(%2),  %%r8, %%r13; " /* C[1]*D[2] */
  700                "adox  %%r8, %%r11 ;"
  701                "adcx %%r11, %%rbx ;"
  702                "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
  703                "adox %%r10, %%r13 ;"
  704                "adcx %%r13, %%rcx ;"
  705                /******************************************/
  706                "adox %%r14, %%r15 ;"
  707                "adcx %%r14, %%r15 ;"
  708
  709                "movq 48(%1), %%rdx; " /* C[2] */
  710                "xorl %%r10d, %%r10d ;"
  711                "mulx 32(%2),  %%r8,  %%r9; " /* C[2]*D[0] */
  712                "adox %%rax,  %%r8 ;"
  713                "movq  %%r8, 80(%0);"
  714                "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
  715                "adox %%r10,  %%r9 ;"
  716                "adcx  %%r9, %%rbx ;"
  717                "mulx 48(%2),  %%r8, %%r13; " /* C[2]*D[2] */
  718                "adox  %%r8, %%r11 ;"
  719                "adcx %%r11, %%rcx ;"
  720                "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
  721                "adox %%r10, %%r13 ;"
  722                "adcx %%r13, %%r15 ;"
  723                /******************************************/
  724                "adox %%r14, %%rax ;"
  725                "adcx %%r14, %%rax ;"
  726
  727                "movq 56(%1), %%rdx; " /* C[3] */
  728                "xorl %%r10d, %%r10d ;"
  729                "mulx 32(%2),  %%r8,  %%r9; " /* C[3]*D[0] */
  730                "adox %%rbx,  %%r8 ;"
  731                "movq  %%r8, 88(%0);"
  732                "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
  733                "adox %%r10,  %%r9 ;"
  734                "adcx  %%r9, %%rcx ;"
  735                "movq %%rcx,  96(%0) ;"
  736                "mulx 48(%2),  %%r8, %%r13; " /* C[3]*D[2] */
  737                "adox  %%r8, %%r11 ;"
  738                "adcx %%r11, %%r15 ;"
  739                "movq %%r15, 104(%0) ;"
  740                "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
  741                "adox %%r10, %%r13 ;"
  742                "adcx %%r13, %%rax ;"
  743                "movq %%rax, 112(%0) ;"
  744                /******************************************/
  745                "adox %%r14, %%rbx ;"
  746                "adcx %%r14, %%rbx ;"
  747                "movq %%rbx, 120(%0) ;"
  748                :
                     /* operands: %0 = c (destination), %1 = a, %2 = b; all passed in registers */
  749                : "r"(c), "r"(a), "r"(b)
  750                : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
  751                  "%r10", "%r11", "%r13", "%r14", "%r15");
  752}
 753
  /*
   * Computes two independent 256x256 -> 512-bit products with mulx and a
   * single addq/adcq carry chain (no adcx/adox).
   *
   * a and b are each read as eight 64-bit words (byte offsets 0..56); the
   * first product uses words 0..3 of both, the second uses words 4..7.
   * The sixteen result words are stored through c at byte offsets 0..120.
   * The asm has no output operands; the stores are covered by the "memory"
   * clobber.
   */
  754static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
  755                                      const u64 *const b)
  756{
  757        asm volatile(
                     /* row 0: chain the four partial products into r15, rax, rbx, rcx */
  758                "movq   (%1), %%rdx; "  /* A[0] */
  759                "mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
  760                "movq %%r8,  (%0) ;"
  761                "mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
  762                "addq %%r10, %%r15 ;"
  763                "mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
  764                "adcq  %%r8, %%rax ;"
  765                "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
  766                "adcq %%r10, %%rbx ;"
  767                /******************************************/
  768                "adcq    $0, %%rcx ;"
  769
                     /*
                      * Rows 1..3 work in two passes: first the row's own
                      * partial products are chained together (r8, r9, r11,
                      * r13 plus a new top word), then that row is added to
                      * the running words with a second addq/adcq chain.
                      */
  770                "movq  8(%1), %%rdx; "  /* A[1] */
  771                "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
  772                "addq %%r15,  %%r8 ;"
  773                "movq %%r8, 8(%0) ;"
  774                "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
  775                "adcq %%r10,  %%r9 ;"
  776                "mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
  777                "adcq  %%r8, %%r11 ;"
  778                "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
  779                "adcq %%r10, %%r13 ;"
  780                /******************************************/
  781                "adcq    $0, %%r15 ;"
  782
  783                "addq  %%r9, %%rax ;"
  784                "adcq %%r11, %%rbx ;"
  785                "adcq %%r13, %%rcx ;"
  786                "adcq    $0, %%r15 ;"
  787
  788                "movq 16(%1), %%rdx; "  /* A[2] */
  789                "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
  790                "addq %%rax,  %%r8 ;"
  791                "movq %%r8, 16(%0) ;"
  792                "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
  793                "adcq %%r10,  %%r9 ;"
  794                "mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
  795                "adcq  %%r8, %%r11 ;"
  796                "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
  797                "adcq %%r10, %%r13 ;"
  798                /******************************************/
  799                "adcq    $0, %%rax ;"
  800
  801                "addq  %%r9, %%rbx ;"
  802                "adcq %%r11, %%rcx ;"
  803                "adcq %%r13, %%r15 ;"
  804                "adcq    $0, %%rax ;"
  805
  806                "movq 24(%1), %%rdx; "  /* A[3] */
  807                "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
  808                "addq %%rbx,  %%r8 ;"
  809                "movq %%r8, 24(%0) ;"
  810                "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
  811                "adcq %%r10,  %%r9 ;"
  812                "mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
  813                "adcq  %%r8, %%r11 ;"
  814                "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
  815                "adcq %%r10, %%r13 ;"
  816                /******************************************/
  817                "adcq    $0, %%rbx ;"
  818
                     /* final accumulation of the first product; movq does not disturb the carry between adcq steps */
  819                "addq  %%r9, %%rcx ;"
  820                "movq %%rcx, 32(%0) ;"
  821                "adcq %%r11, %%r15 ;"
  822                "movq %%r15, 40(%0) ;"
  823                "adcq %%r13, %%rax ;"
  824                "movq %%rax, 48(%0) ;"
  825                "adcq    $0, %%rbx ;"
  826                "movq %%rbx, 56(%0) ;"
  827
                     /*
                      * Second product: same schedule on the words at byte
                      * offsets 32..56 of a and b (named C and D in the
                      * comments), stored at byte offsets 64..120 of c.
                      */
  828                "movq 32(%1), %%rdx; "  /* C[0] */
  829                "mulx 32(%2),  %%r8, %%r15; " /* C[0]*D[0] */
  830                "movq %%r8, 64(%0) ;"
  831                "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
  832                "addq %%r10, %%r15 ;"
  833                "mulx 48(%2),  %%r8, %%rbx; " /* C[0]*D[2] */
  834                "adcq  %%r8, %%rax ;"
  835                "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
  836                "adcq %%r10, %%rbx ;"
  837                /******************************************/
  838                "adcq    $0, %%rcx ;"
  839
  840                "movq 40(%1), %%rdx; "  /* C[1] */
  841                "mulx 32(%2),  %%r8,  %%r9; " /* C[1]*D[0] */
  842                "addq %%r15,  %%r8 ;"
  843                "movq %%r8, 72(%0) ;"
  844                "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
  845                "adcq %%r10,  %%r9 ;"
  846                "mulx 48(%2),  %%r8, %%r13; " /* C[1]*D[2] */
  847                "adcq  %%r8, %%r11 ;"
  848                "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
  849                "adcq %%r10, %%r13 ;"
  850                /******************************************/
  851                "adcq    $0, %%r15 ;"
  852
  853                "addq  %%r9, %%rax ;"
  854                "adcq %%r11, %%rbx ;"
  855                "adcq %%r13, %%rcx ;"
  856                "adcq    $0, %%r15 ;"
  857
  858                "movq 48(%1), %%rdx; "  /* C[2] */
  859                "mulx 32(%2),  %%r8,  %%r9; " /* C[2]*D[0] */
  860                "addq %%rax,  %%r8 ;"
  861                "movq %%r8, 80(%0) ;"
  862                "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
  863                "adcq %%r10,  %%r9 ;"
  864                "mulx 48(%2),  %%r8, %%r13; " /* C[2]*D[2] */
  865                "adcq  %%r8, %%r11 ;"
  866                "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
  867                "adcq %%r10, %%r13 ;"
  868                /******************************************/
  869                "adcq    $0, %%rax ;"
  870
  871                "addq  %%r9, %%rbx ;"
  872                "adcq %%r11, %%rcx ;"
  873                "adcq %%r13, %%r15 ;"
  874                "adcq    $0, %%rax ;"
  875
  876                "movq 56(%1), %%rdx; "  /* C[3] */
  877                "mulx 32(%2),  %%r8,  %%r9; " /* C[3]*D[0] */
  878                "addq %%rbx,  %%r8 ;"
  879                "movq %%r8, 88(%0) ;"
  880                "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
  881                "adcq %%r10,  %%r9 ;"
  882                "mulx 48(%2),  %%r8, %%r13; " /* C[3]*D[2] */
  883                "adcq  %%r8, %%r11 ;"
  884                "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
  885                "adcq %%r10, %%r13 ;"
  886                /******************************************/
  887                "adcq    $0, %%rbx ;"
  888
  889                "addq  %%r9, %%rcx ;"
  890                "movq %%rcx,  96(%0) ;"
  891                "adcq %%r11, %%r15 ;"
  892                "movq %%r15, 104(%0) ;"
  893                "adcq %%r13, %%rax ;"
  894                "movq %%rax, 112(%0) ;"
  895                "adcq    $0, %%rbx ;"
  896                "movq %%rbx, 120(%0) ;"
  897                :
                     /* operands: %0 = c (destination), %1 = a, %2 = b; all passed in registers */
  898                : "r"(c), "r"(a), "r"(b)
  899                : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
  900                  "%r10", "%r11", "%r13", "%r15");
  901}
 902
  /*
   * Squares two 256-bit integers, producing two 512-bit results.
   *
   * a is read as eight 64-bit words: words 0..3 (byte offsets 0..24) are the
   * first integer, words 4..7 (byte offsets 32..56) the second, named A and
   * B in the comments.  The squares are stored through c at byte offsets
   * 0..56 and 64..120 respectively.
   *
   * Each square is computed in three steps: sum the off-diagonal products
   * X[i]*X[j] (i != j, each taken once), double that sum, then add the
   * diagonal squares X[i]^2.  The asm has no output operands; the stores are
   * covered by the "memory" clobber.
   */
  903static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a)
  904{
  905        asm volatile(
                     /* step 1: off-diagonal products, chained with adcx into r8..r14 */
  906                "movq   (%1), %%rdx        ;" /* A[0]      */
  907                "mulx  8(%1),  %%r8, %%r14 ;" /* A[1]*A[0] */
                     /* r15 = 0 (zero addend); xorl also resets CF and OF */
  908                "xorl %%r15d, %%r15d;"
  909                "mulx 16(%1),  %%r9, %%r10 ;" /* A[2]*A[0] */
  910                "adcx %%r14,  %%r9 ;"
  911                "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
  912                "adcx %%rax, %%r10 ;"
  913                "movq 24(%1), %%rdx        ;" /* A[3]      */
  914                "mulx  8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
  915                "adcx %%rcx, %%r11 ;"
  916                "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
  917                "adcx %%rax, %%rbx ;"
  918                "movq  8(%1), %%rdx        ;" /* A[1]      */
  919                "adcx %%r15, %%r13 ;"
  920                "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
                     /* movq (not xor) so the carry from the adcx chain survives */
  921                "movq    $0, %%r14 ;"
  922                /******************************************/
  923                "adcx %%r15, %%r14 ;"
  924
                     /*
                      * step 2: two interleaved chains.  adox adds A[2]*A[1]
                      * (rax:rcx) into r10/r11 and ripples upward; adcx adds
                      * each word to itself, i.e. doubles the whole sum.
                      * Each word receives its adox addition before it is
                      * doubled.
                      */
  925                "xorl %%r15d, %%r15d;"
  926                "adox %%rax, %%r10 ;"
  927                "adcx  %%r8,  %%r8 ;"
  928                "adox %%rcx, %%r11 ;"
  929                "adcx  %%r9,  %%r9 ;"
  930                "adox %%r15, %%rbx ;"
  931                "adcx %%r10, %%r10 ;"
  932                "adox %%r15, %%r13 ;"
  933                "adcx %%r11, %%r11 ;"
  934                "adox %%r15, %%r14 ;"
  935                "adcx %%rbx, %%rbx ;"
  936                "adcx %%r13, %%r13 ;"
  937                "adcx %%r14, %%r14 ;"
  938
                     /* step 3: add the diagonal squares with one addq/adcq chain, storing as we go */
  939                "movq   (%1), %%rdx ;"
  940                "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
  941                /*******************/
  942                "movq %%rax,  0(%0) ;"
  943                "addq %%rcx,  %%r8 ;"
  944                "movq  %%r8,  8(%0) ;"
  945                "movq  8(%1), %%rdx ;"
  946                "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
  947                "adcq %%rax,  %%r9 ;"
  948                "movq  %%r9, 16(%0) ;"
  949                "adcq %%rcx, %%r10 ;"
  950                "movq %%r10, 24(%0) ;"
  951                "movq 16(%1), %%rdx ;"
  952                "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
  953                "adcq %%rax, %%r11 ;"
  954                "movq %%r11, 32(%0) ;"
  955                "adcq %%rcx, %%rbx ;"
  956                "movq %%rbx, 40(%0) ;"
  957                "movq 24(%1), %%rdx ;"
  958                "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
  959                "adcq %%rax, %%r13 ;"
  960                "movq %%r13, 48(%0) ;"
  961                "adcq %%rcx, %%r14 ;"
  962                "movq %%r14, 56(%0) ;"
  963
  964
                     /* second integer: identical schedule on byte offsets 32..56 of a, stored at 64..120 of c */
  965                "movq 32(%1), %%rdx        ;" /* B[0]      */
  966                "mulx 40(%1),  %%r8, %%r14 ;" /* B[1]*B[0] */
  967                "xorl %%r15d, %%r15d;"
  968                "mulx 48(%1),  %%r9, %%r10 ;" /* B[2]*B[0] */
  969                "adcx %%r14,  %%r9 ;"
  970                "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
  971                "adcx %%rax, %%r10 ;"
  972                "movq 56(%1), %%rdx        ;" /* B[3]      */
  973                "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */
  974                "adcx %%rcx, %%r11 ;"
  975                "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
  976                "adcx %%rax, %%rbx ;"
  977                "movq 40(%1), %%rdx        ;" /* B[1]      */
  978                "adcx %%r15, %%r13 ;"
  979                "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
  980                "movq    $0, %%r14 ;"
  981                /******************************************/
  982                "adcx %%r15, %%r14 ;"
  983
  984                "xorl %%r15d, %%r15d;"
  985                "adox %%rax, %%r10 ;"
  986                "adcx  %%r8,  %%r8 ;"
  987                "adox %%rcx, %%r11 ;"
  988                "adcx  %%r9,  %%r9 ;"
  989                "adox %%r15, %%rbx ;"
  990                "adcx %%r10, %%r10 ;"
  991                "adox %%r15, %%r13 ;"
  992                "adcx %%r11, %%r11 ;"
  993                "adox %%r15, %%r14 ;"
  994                "adcx %%rbx, %%rbx ;"
  995                "adcx %%r13, %%r13 ;"
  996                "adcx %%r14, %%r14 ;"
  997
  998                "movq 32(%1), %%rdx ;"
  999                "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
 1000                /*******************/
 1001                "movq %%rax,  64(%0) ;"
 1002                "addq %%rcx,  %%r8 ;"
 1003                "movq  %%r8,  72(%0) ;"
 1004                "movq 40(%1), %%rdx ;"
 1005                "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
 1006                "adcq %%rax,  %%r9 ;"
 1007                "movq  %%r9,  80(%0) ;"
 1008                "adcq %%rcx, %%r10 ;"
 1009                "movq %%r10,  88(%0) ;"
 1010                "movq 48(%1), %%rdx ;"
 1011                "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
 1012                "adcq %%rax, %%r11 ;"
 1013                "movq %%r11,  96(%0) ;"
 1014                "adcq %%rcx, %%rbx ;"
 1015                "movq %%rbx, 104(%0) ;"
 1016                "movq 56(%1), %%rdx ;"
 1017                "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
 1018                "adcq %%rax, %%r13 ;"
 1019                "movq %%r13, 112(%0) ;"
 1020                "adcq %%rcx, %%r14 ;"
 1021                "movq %%r14, 120(%0) ;"
 1022                :
                     /* operands: %0 = c (destination), %1 = a; both passed in registers */
 1023                : "r"(c), "r"(a)
 1024                : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
 1025                  "%r10", "%r11", "%r13", "%r14", "%r15");
 1026}
1027
 /*
  * Squares two 256-bit integers, producing two 512-bit results, using mulx
  * with addq/adcq and a shldq/shlq doubling step (no adcx/adox).
  *
  * a is read as eight 64-bit words: words 0..3 (byte offsets 0..24) are the
  * first integer, words 4..7 (byte offsets 32..56) the second, named A and
  * B in the comments.  The squares are stored through c at byte offsets
  * 0..56 and 64..120 respectively.  The asm has no output operands; the
  * stores are covered by the "memory" clobber.
  */
 1028static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a)
 1029{
 1030        asm volatile(
                     /* off-diagonal products involving A[1] and A[2] */
 1031                "movq  8(%1), %%rdx        ;" /* A[1]      */
 1032                "mulx   (%1),  %%r8,  %%r9 ;" /* A[0]*A[1] */
 1033                "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
 1034                "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
 1035
 1036                "movq 16(%1), %%rdx        ;" /* A[2]      */
 1037                "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
                     /* the high half lands in rdx itself, so A[2] is no longer in rdx after this */
 1038                "mulx   (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
 1039
                     /* merge the partial products into r8..r13, with r14 catching the final carry */
 1040                "addq %%rax,  %%r9 ;"
 1041                "adcq %%rdx, %%r10 ;"
 1042                "adcq %%rcx, %%r11 ;"
 1043                "adcq %%r14, %%r15 ;"
 1044                "adcq    $0, %%r13 ;"
                     /* movq (not xor) so the carry from the chain above survives */
 1045                "movq    $0, %%r14 ;"
 1046                "adcq    $0, %%r14 ;"
 1047
 1048                "movq   (%1), %%rdx        ;" /* A[0]      */
 1049                "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
 1050
 1051                "addq %%rax, %%r10 ;"
 1052                "adcq %%rcx, %%r11 ;"
 1053                "adcq    $0, %%r15 ;"
 1054                "adcq    $0, %%r13 ;"
 1055                "adcq    $0, %%r14 ;"
 1056
                     /* double the off-diagonal sum: shift r14:r13:r15:r11:r10:r9:r8 left by one bit, top word first */
 1057                "shldq $1, %%r13, %%r14 ;"
 1058                "shldq $1, %%r15, %%r13 ;"
 1059                "shldq $1, %%r11, %%r15 ;"
 1060                "shldq $1, %%r10, %%r11 ;"
 1061                "shldq $1,  %%r9, %%r10 ;"
 1062                "shldq $1,  %%r8,  %%r9 ;"
 1063                "shlq  $1,  %%r8        ;"
 1064
 1065                /*******************/
                     /* rdx still holds A[0] from the load before the A[0]*A[3] product */
 1066                "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
 1067                /*******************/
                     /* add the diagonal squares with one addq/adcq chain, storing as we go */
 1068                "movq %%rax,  0(%0) ;"
 1069                "addq %%rcx,  %%r8 ;"
 1070                "movq  %%r8,  8(%0) ;"
 1071                "movq  8(%1), %%rdx ;"
 1072                "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
 1073                "adcq %%rax,  %%r9 ;"
 1074                "movq  %%r9, 16(%0) ;"
 1075                "adcq %%rcx, %%r10 ;"
 1076                "movq %%r10, 24(%0) ;"
 1077                "movq 16(%1), %%rdx ;"
 1078                "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
 1079                "adcq %%rax, %%r11 ;"
 1080                "movq %%r11, 32(%0) ;"
 1081                "adcq %%rcx, %%r15 ;"
 1082                "movq %%r15, 40(%0) ;"
 1083                "movq 24(%1), %%rdx ;"
 1084                "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
 1085                "adcq %%rax, %%r13 ;"
 1086                "movq %%r13, 48(%0) ;"
 1087                "adcq %%rcx, %%r14 ;"
 1088                "movq %%r14, 56(%0) ;"
 1089
                     /* second integer: identical schedule on byte offsets 32..56 of a, stored at 64..120 of c */
 1090                "movq 40(%1), %%rdx        ;" /* B[1]      */
 1091                "mulx 32(%1),  %%r8,  %%r9 ;" /* B[0]*B[1] */
 1092                "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
 1093                "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
 1094
 1095                "movq 48(%1), %%rdx        ;" /* B[2]      */
 1096                "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */
 1097                "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
 1098
 1099                "addq %%rax,  %%r9 ;"
 1100                "adcq %%rdx, %%r10 ;"
 1101                "adcq %%rcx, %%r11 ;"
 1102                "adcq %%r14, %%r15 ;"
 1103                "adcq    $0, %%r13 ;"
 1104                "movq    $0, %%r14 ;"
 1105                "adcq    $0, %%r14 ;"
 1106
 1107                "movq 32(%1), %%rdx        ;" /* B[0]      */
 1108                "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
 1109
 1110                "addq %%rax, %%r10 ;"
 1111                "adcq %%rcx, %%r11 ;"
 1112                "adcq    $0, %%r15 ;"
 1113                "adcq    $0, %%r13 ;"
 1114                "adcq    $0, %%r14 ;"
 1115
 1116                "shldq $1, %%r13, %%r14 ;"
 1117                "shldq $1, %%r15, %%r13 ;"
 1118                "shldq $1, %%r11, %%r15 ;"
 1119                "shldq $1, %%r10, %%r11 ;"
 1120                "shldq $1,  %%r9, %%r10 ;"
 1121                "shldq $1,  %%r8,  %%r9 ;"
 1122                "shlq  $1,  %%r8        ;"
 1123
 1124                /*******************/
                     /* rdx still holds B[0] from the load before the B[0]*B[3] product */
 1125                "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
 1126                /*******************/
 1127                "movq %%rax,  64(%0) ;"
 1128                "addq %%rcx,  %%r8 ;"
 1129                "movq  %%r8,  72(%0) ;"
 1130                "movq 40(%1), %%rdx ;"
 1131                "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
 1132                "adcq %%rax,  %%r9 ;"
 1133                "movq  %%r9,  80(%0) ;"
 1134                "adcq %%rcx, %%r10 ;"
 1135                "movq %%r10,  88(%0) ;"
 1136                "movq 48(%1), %%rdx ;"
 1137                "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
 1138                "adcq %%rax, %%r11 ;"
 1139                "movq %%r11,  96(%0) ;"
 1140                "adcq %%rcx, %%r15 ;"
 1141                "movq %%r15, 104(%0) ;"
 1142                "movq 56(%1), %%rdx ;"
 1143                "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
 1144                "adcq %%rax, %%r13 ;"
 1145                "movq %%r13, 112(%0) ;"
 1146                "adcq %%rcx, %%r14 ;"
 1147                "movq %%r14, 120(%0) ;"
 1148                :
                     /* operands: %0 = c (destination), %1 = a; both passed in registers */
 1149                : "r"(c), "r"(a)
 1150                : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
 1151                  "%r11", "%r13", "%r14", "%r15");
 1152}
1153
 /*
  * Reduces two 512-bit values to two 256-bit values by folding the upper
  * four words of each into the lower four with the multiplier 38 (the
  * existing comment notes "38 = 2^256"; presumably this is meant modulo
  * 2^255-19 -- confirm against the field definition).
  *
  * a is read as sixteen 64-bit words: byte offsets 0..56 hold the first
  * value, 64..120 the second.  The two four-word results are stored through
  * c at byte offsets 0..24 and 32..56.  The asm has no output operands; the
  * stores are covered by the "memory" clobber.
  */
 1154static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a)
 1155{
 1156        asm volatile(
                     /* rdx = 38 stays live for the whole block: mulx reads it implicitly and nothing below writes it */
 1157                "movl    $38, %%edx; "  /* 2*c = 38 = 2^256 */
 1158                "mulx 32(%1),  %%r8, %%r10; " /* c*C[4] */
                     /* rbx = 0 (zero addend); xorl also resets CF and OF for the two chains */
 1159                "xorl %%ebx, %%ebx ;"
                     /* adox chain adds the low words C[0..3]; adcx chain links the 38*C[4..7] partial products */
 1160                "adox   (%1),  %%r8 ;"
 1161                "mulx 40(%1),  %%r9, %%r11; " /* c*C[5] */
 1162                "adcx %%r10,  %%r9 ;"
 1163                "adox  8(%1),  %%r9 ;"
 1164                "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
 1165                "adcx %%r11, %%r10 ;"
 1166                "adox 16(%1), %%r10 ;"
 1167                "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
 1168                "adcx %%rax, %%r11 ;"
 1169                "adox 24(%1), %%r11 ;"
 1170                /***************************************/
                     /* rcx collects the overflow word plus both pending carries, then is folded again by 38 */
 1171                "adcx %%rbx, %%rcx ;"
 1172                "adox  %%rbx, %%rcx ;"
 1173                "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
 1174                "adcx %%rcx,  %%r8 ;"
 1175                "adcx %%rbx,  %%r9 ;"
 1176                "movq  %%r9,  8(%0) ;"
 1177                "adcx %%rbx, %%r10 ;"
 1178                "movq %%r10, 16(%0) ;"
 1179                "adcx %%rbx, %%r11 ;"
 1180                "movq %%r11, 24(%0) ;"
                     /* mov (not xor) keeps CF intact; if the chain carried out, add 38 once more to the low word */
 1181                "mov     $0, %%ecx ;"
 1182                "cmovc %%edx, %%ecx ;"
 1183                "addq %%rcx,  %%r8 ;"
 1184                "movq  %%r8,   (%0) ;"
 1185
                     /*
                      * Second value: same sequence with the low words at byte
                      * offsets 64..88 and the high words at 96..120 of a,
                      * stored at byte offsets 32..56 of c.  The C[4]..C[7]
                      * labels below refer to this second value's upper words.
                      */
 1186                "mulx  96(%1),  %%r8, %%r10; " /* c*C[4] */
 1187                "xorl %%ebx, %%ebx ;"
 1188                "adox 64(%1),  %%r8 ;"
 1189                "mulx 104(%1),  %%r9, %%r11; " /* c*C[5] */
 1190                "adcx %%r10,  %%r9 ;"
 1191                "adox 72(%1),  %%r9 ;"
 1192                "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
 1193                "adcx %%r11, %%r10 ;"
 1194                "adox 80(%1), %%r10 ;"
 1195                "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
 1196                "adcx %%rax, %%r11 ;"
 1197                "adox 88(%1), %%r11 ;"
 1198                /****************************************/
 1199                "adcx %%rbx, %%rcx ;"
 1200                "adox  %%rbx, %%rcx ;"
 1201                "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
 1202                "adcx %%rcx,  %%r8 ;"
 1203                "adcx %%rbx,  %%r9 ;"
 1204                "movq  %%r9, 40(%0) ;"
 1205                "adcx %%rbx, %%r10 ;"
 1206                "movq %%r10, 48(%0) ;"
 1207                "adcx %%rbx, %%r11 ;"
 1208                "movq %%r11, 56(%0) ;"
 1209                "mov     $0, %%ecx ;"
 1210                "cmovc %%edx, %%ecx ;"
 1211                "addq %%rcx,  %%r8 ;"
 1212                "movq  %%r8, 32(%0) ;"
 1213                :
                     /* operands: %0 = c (destination), %1 = a; both passed in registers */
 1214                : "r"(c), "r"(a)
 1215                : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
 1216                  "%r10", "%r11");
 1217}
1218
1219static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a)
1220{
1221        asm volatile(
1222                "movl    $38, %%edx ; "       /* 2*c = 38 = 2^256 */
1223                "mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
1224                "mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
1225                "addq %%r10,  %%r9 ;"
1226                "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1227                "adcq %%r11, %%r10 ;"
1228                "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1229                "adcq %%rax, %%r11 ;"
1230                /***************************************/
1231                "adcq    $0, %%rcx ;"
1232                "addq   (%1),  %%r8 ;"
1233                "adcq  8(%1),  %%r9 ;"
1234                "adcq 16(%1), %%r10 ;"
1235                "adcq 24(%1), %%r11 ;"
1236                "adcq     $0, %%rcx ;"
1237                "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1238                "addq %%rcx,  %%r8 ;"
1239                "adcq    $0,  %%r9 ;"
1240                "movq  %%r9,  8(%0) ;"
1241                "adcq    $0, %%r10 ;"
1242                "movq %%r10, 16(%0) ;"
1243                "adcq    $0, %%r11 ;"
1244                "movq %%r11, 24(%0) ;"
1245                "mov     $0, %%ecx ;"
1246                "cmovc %%edx, %%ecx ;"
1247                "addq %%rcx,  %%r8 ;"
1248                "movq  %%r8,   (%0) ;"
1249
1250                "mulx  96(%1),  %%r8, %%r10 ;" /* c*C[4] */
1251                "mulx 104(%1),  %%r9, %%r11 ;" /* c*C[5] */
1252                "addq %%r10,  %%r9 ;"
1253                "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
1254                "adcq %%r11, %%r10 ;"
1255                "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
1256                "adcq %%rax, %%r11 ;"
1257                /****************************************/
1258                "adcq    $0, %%rcx ;"
1259                "addq 64(%1),  %%r8 ;"
1260                "adcq 72(%1),  %%r9 ;"
1261                "adcq 80(%1), %%r10 ;"
1262                "adcq 88(%1), %%r11 ;"
1263                "adcq     $0, %%rcx ;"
1264                "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1265                "addq %%rcx,  %%r8 ;"
1266                "adcq    $0,  %%r9 ;"
1267                "movq  %%r9, 40(%0) ;"
1268                "adcq    $0, %%r10 ;"
1269                "movq %%r10, 48(%0) ;"
1270                "adcq    $0, %%r11 ;"
1271                "movq %%r11, 56(%0) ;"
1272                "mov     $0, %%ecx ;"
1273                "cmovc %%edx, %%ecx ;"
1274                "addq %%rcx,  %%r8 ;"
1275                "movq  %%r8, 32(%0) ;"
1276                :
1277                : "r"(c), "r"(a)
1278                : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1279                  "%r11");
1280}
1281
1282static void mul_256x256_integer_adx(u64 *const c, const u64 *const a,
1283                                    const u64 *const b)
1284{
1285        asm volatile(
1286                "movq   (%1), %%rdx; "  /* A[0] */
1287                "mulx   (%2),  %%r8,  %%r9; " /* A[0]*B[0] */
1288                "xorl %%r10d, %%r10d ;"
1289                "movq  %%r8,  (%0) ;"
1290                "mulx  8(%2), %%r10, %%r11; " /* A[0]*B[1] */
1291                "adox  %%r9, %%r10 ;"
1292                "movq %%r10, 8(%0) ;"
1293                "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */
1294                "adox %%r11, %%r15 ;"
1295                "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
1296                "adox %%r13, %%r14 ;"
1297                "movq $0, %%rax ;"
1298                /******************************************/
1299                "adox %%rdx, %%rax ;"
1300
1301                "movq  8(%1), %%rdx; "  /* A[1] */
1302                "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
1303                "xorl %%r10d, %%r10d ;"
1304                "adcx 8(%0),  %%r8 ;"
1305                "movq  %%r8,  8(%0) ;"
1306                "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
1307                "adox  %%r9, %%r10 ;"
1308                "adcx %%r15, %%r10 ;"
1309                "movq %%r10, 16(%0) ;"
1310                "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */
1311                "adox %%r11, %%r15 ;"
1312                "adcx %%r14, %%r15 ;"
1313                "movq $0, %%r8  ;"
1314                "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
1315                "adox %%r13, %%r14 ;"
1316                "adcx %%rax, %%r14 ;"
1317                "movq $0, %%rax ;"
1318                /******************************************/
1319                "adox %%rdx, %%rax ;"
1320                "adcx  %%r8, %%rax ;"
1321
1322                "movq 16(%1), %%rdx; "  /* A[2] */
1323                "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
1324                "xorl %%r10d, %%r10d ;"
1325                "adcx 16(%0), %%r8 ;"
1326                "movq  %%r8, 16(%0) ;"
1327                "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
1328                "adox  %%r9, %%r10 ;"
1329                "adcx %%r15, %%r10 ;"
1330                "movq %%r10, 24(%0) ;"
1331                "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */
1332                "adox %%r11, %%r15 ;"
1333                "adcx %%r14, %%r15 ;"
1334                "movq $0, %%r8  ;"
1335                "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
1336                "adox %%r13, %%r14 ;"
1337                "adcx %%rax, %%r14 ;"
1338                "movq $0, %%rax ;"
1339                /******************************************/
1340                "adox %%rdx, %%rax ;"
1341                "adcx  %%r8, %%rax ;"
1342
1343                "movq 24(%1), %%rdx; "  /* A[3] */
1344                "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
1345                "xorl %%r10d, %%r10d ;"
1346                "adcx 24(%0), %%r8 ;"
1347                "movq  %%r8, 24(%0) ;"
1348                "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
1349                "adox  %%r9, %%r10 ;"
1350                "adcx %%r15, %%r10 ;"
1351                "movq %%r10, 32(%0) ;"
1352                "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */
1353                "adox %%r11, %%r15 ;"
1354                "adcx %%r14, %%r15 ;"
1355                "movq %%r15, 40(%0) ;"
1356                "movq $0, %%r8  ;"
1357                "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
1358                "adox %%r13, %%r14 ;"
1359                "adcx %%rax, %%r14 ;"
1360                "movq %%r14, 48(%0) ;"
1361                "movq $0, %%rax ;"
1362                /******************************************/
1363                "adox %%rdx, %%rax ;"
1364                "adcx  %%r8, %%rax ;"
1365                "movq %%rax, 56(%0) ;"
1366                :
1367                : "r"(c), "r"(a), "r"(b)
1368                : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11",
1369                  "%r13", "%r14", "%r15");
1370}
1371
1372static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a,
1373                                     const u64 *const b)
1374{
1375        asm volatile(
1376                "movq   (%1), %%rdx; "  /* A[0] */
1377                "mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
1378                "movq %%r8,  (%0) ;"
1379                "mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
1380                "addq %%r10, %%r15 ;"
1381                "mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
1382                "adcq  %%r8, %%rax ;"
1383                "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
1384                "adcq %%r10, %%rbx ;"
1385                /******************************************/
1386                "adcq    $0, %%rcx ;"
1387
1388                "movq  8(%1), %%rdx; "  /* A[1] */
1389                "mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
1390                "addq %%r15,  %%r8 ;"
1391                "movq %%r8, 8(%0) ;"
1392                "mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
1393                "adcq %%r10,  %%r9 ;"
1394                "mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
1395                "adcq  %%r8, %%r11 ;"
1396                "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
1397                "adcq %%r10, %%r13 ;"
1398                /******************************************/
1399                "adcq    $0, %%r15 ;"
1400
1401                "addq  %%r9, %%rax ;"
1402                "adcq %%r11, %%rbx ;"
1403                "adcq %%r13, %%rcx ;"
1404                "adcq    $0, %%r15 ;"
1405
1406                "movq 16(%1), %%rdx; "  /* A[2] */
1407                "mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
1408                "addq %%rax,  %%r8 ;"
1409                "movq %%r8, 16(%0) ;"
1410                "mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
1411                "adcq %%r10,  %%r9 ;"
1412                "mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
1413                "adcq  %%r8, %%r11 ;"
1414                "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
1415                "adcq %%r10, %%r13 ;"
1416                /******************************************/
1417                "adcq    $0, %%rax ;"
1418
1419                "addq  %%r9, %%rbx ;"
1420                "adcq %%r11, %%rcx ;"
1421                "adcq %%r13, %%r15 ;"
1422                "adcq    $0, %%rax ;"
1423
1424                "movq 24(%1), %%rdx; "  /* A[3] */
1425                "mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
1426                "addq %%rbx,  %%r8 ;"
1427                "movq %%r8, 24(%0) ;"
1428                "mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
1429                "adcq %%r10,  %%r9 ;"
1430                "mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
1431                "adcq  %%r8, %%r11 ;"
1432                "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
1433                "adcq %%r10, %%r13 ;"
1434                /******************************************/
1435                "adcq    $0, %%rbx ;"
1436
1437                "addq  %%r9, %%rcx ;"
1438                "movq %%rcx, 32(%0) ;"
1439                "adcq %%r11, %%r15 ;"
1440                "movq %%r15, 40(%0) ;"
1441                "adcq %%r13, %%rax ;"
1442                "movq %%rax, 48(%0) ;"
1443                "adcq    $0, %%rbx ;"
1444                "movq %%rbx, 56(%0) ;"
1445                :
1446                : "r"(c), "r"(a), "r"(b)
1447                : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1448                  "%r10", "%r11", "%r13", "%r15");
1449}
1450
1451static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a)
1452{
1453        asm volatile(
1454                "movq   (%1), %%rdx        ;" /* A[0]      */
1455                "mulx  8(%1),  %%r8, %%r14 ;" /* A[1]*A[0] */
1456                "xorl %%r15d, %%r15d;"
1457                "mulx 16(%1),  %%r9, %%r10 ;" /* A[2]*A[0] */
1458                "adcx %%r14,  %%r9 ;"
1459                "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
1460                "adcx %%rax, %%r10 ;"
1461                "movq 24(%1), %%rdx        ;" /* A[3]      */
1462                "mulx  8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
1463                "adcx %%rcx, %%r11 ;"
1464                "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
1465                "adcx %%rax, %%rbx ;"
1466                "movq  8(%1), %%rdx        ;" /* A[1]      */
1467                "adcx %%r15, %%r13 ;"
1468                "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
1469                "movq    $0, %%r14 ;"
1470                /******************************************/
1471                "adcx %%r15, %%r14 ;"
1472
1473                "xorl %%r15d, %%r15d;"
1474                "adox %%rax, %%r10 ;"
1475                "adcx  %%r8,  %%r8 ;"
1476                "adox %%rcx, %%r11 ;"
1477                "adcx  %%r9,  %%r9 ;"
1478                "adox %%r15, %%rbx ;"
1479                "adcx %%r10, %%r10 ;"
1480                "adox %%r15, %%r13 ;"
1481                "adcx %%r11, %%r11 ;"
1482                "adox %%r15, %%r14 ;"
1483                "adcx %%rbx, %%rbx ;"
1484                "adcx %%r13, %%r13 ;"
1485                "adcx %%r14, %%r14 ;"
1486
1487                "movq   (%1), %%rdx ;"
1488                "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1489                /*******************/
1490                "movq %%rax,  0(%0) ;"
1491                "addq %%rcx,  %%r8 ;"
1492                "movq  %%r8,  8(%0) ;"
1493                "movq  8(%1), %%rdx ;"
1494                "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1495                "adcq %%rax,  %%r9 ;"
1496                "movq  %%r9, 16(%0) ;"
1497                "adcq %%rcx, %%r10 ;"
1498                "movq %%r10, 24(%0) ;"
1499                "movq 16(%1), %%rdx ;"
1500                "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1501                "adcq %%rax, %%r11 ;"
1502                "movq %%r11, 32(%0) ;"
1503                "adcq %%rcx, %%rbx ;"
1504                "movq %%rbx, 40(%0) ;"
1505                "movq 24(%1), %%rdx ;"
1506                "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1507                "adcq %%rax, %%r13 ;"
1508                "movq %%r13, 48(%0) ;"
1509                "adcq %%rcx, %%r14 ;"
1510                "movq %%r14, 56(%0) ;"
1511                :
1512                : "r"(c), "r"(a)
1513                : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1514                  "%r10", "%r11", "%r13", "%r14", "%r15");
1515}
1516
1517static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a)
1518{
1519        asm volatile(
1520                "movq  8(%1), %%rdx        ;" /* A[1]      */
1521                "mulx   (%1),  %%r8,  %%r9 ;" /* A[0]*A[1] */
1522                "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
1523                "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
1524
1525                "movq 16(%1), %%rdx        ;" /* A[2]      */
1526                "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
1527                "mulx   (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
1528
1529                "addq %%rax,  %%r9 ;"
1530                "adcq %%rdx, %%r10 ;"
1531                "adcq %%rcx, %%r11 ;"
1532                "adcq %%r14, %%r15 ;"
1533                "adcq    $0, %%r13 ;"
1534                "movq    $0, %%r14 ;"
1535                "adcq    $0, %%r14 ;"
1536
1537                "movq   (%1), %%rdx        ;" /* A[0]      */
1538                "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
1539
1540                "addq %%rax, %%r10 ;"
1541                "adcq %%rcx, %%r11 ;"
1542                "adcq    $0, %%r15 ;"
1543                "adcq    $0, %%r13 ;"
1544                "adcq    $0, %%r14 ;"
1545
1546                "shldq $1, %%r13, %%r14 ;"
1547                "shldq $1, %%r15, %%r13 ;"
1548                "shldq $1, %%r11, %%r15 ;"
1549                "shldq $1, %%r10, %%r11 ;"
1550                "shldq $1,  %%r9, %%r10 ;"
1551                "shldq $1,  %%r8,  %%r9 ;"
1552                "shlq  $1,  %%r8        ;"
1553
1554                /*******************/
1555                "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1556                /*******************/
1557                "movq %%rax,  0(%0) ;"
1558                "addq %%rcx,  %%r8 ;"
1559                "movq  %%r8,  8(%0) ;"
1560                "movq  8(%1), %%rdx ;"
1561                "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1562                "adcq %%rax,  %%r9 ;"
1563                "movq  %%r9, 16(%0) ;"
1564                "adcq %%rcx, %%r10 ;"
1565                "movq %%r10, 24(%0) ;"
1566                "movq 16(%1), %%rdx ;"
1567                "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1568                "adcq %%rax, %%r11 ;"
1569                "movq %%r11, 32(%0) ;"
1570                "adcq %%rcx, %%r15 ;"
1571                "movq %%r15, 40(%0) ;"
1572                "movq 24(%1), %%rdx ;"
1573                "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1574                "adcq %%rax, %%r13 ;"
1575                "movq %%r13, 48(%0) ;"
1576                "adcq %%rcx, %%r14 ;"
1577                "movq %%r14, 56(%0) ;"
1578                :
1579                : "r"(c), "r"(a)
1580                : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1581                  "%r11", "%r13", "%r14", "%r15");
1582}
1583
1584static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
1585{
1586        asm volatile(
1587                "movl    $38, %%edx ;"  /* 2*c = 38 = 2^256 */
1588                "mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
1589                "xorl %%ebx, %%ebx ;"
1590                "adox   (%1),  %%r8 ;"
1591                "mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
1592                "adcx %%r10,  %%r9 ;"
1593                "adox  8(%1),  %%r9 ;"
1594                "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1595                "adcx %%r11, %%r10 ;"
1596                "adox 16(%1), %%r10 ;"
1597                "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1598                "adcx %%rax, %%r11 ;"
1599                "adox 24(%1), %%r11 ;"
1600                /***************************************/
1601                "adcx %%rbx, %%rcx ;"
1602                "adox  %%rbx, %%rcx ;"
1603                "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
1604                "adcx %%rcx,  %%r8 ;"
1605                "adcx %%rbx,  %%r9 ;"
1606                "movq  %%r9,  8(%0) ;"
1607                "adcx %%rbx, %%r10 ;"
1608                "movq %%r10, 16(%0) ;"
1609                "adcx %%rbx, %%r11 ;"
1610                "movq %%r11, 24(%0) ;"
1611                "mov     $0, %%ecx ;"
1612                "cmovc %%edx, %%ecx ;"
1613                "addq %%rcx,  %%r8 ;"
1614                "movq  %%r8,   (%0) ;"
1615                :
1616                : "r"(c), "r"(a)
1617                : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1618                  "%r10", "%r11");
1619}
1620
1621static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
1622{
1623        asm volatile(
1624                "movl    $38, %%edx ;"  /* 2*c = 38 = 2^256 */
1625                "mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
1626                "mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
1627                "addq %%r10,  %%r9 ;"
1628                "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1629                "adcq %%r11, %%r10 ;"
1630                "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1631                "adcq %%rax, %%r11 ;"
1632                /***************************************/
1633                "adcq    $0, %%rcx ;"
1634                "addq   (%1),  %%r8 ;"
1635                "adcq  8(%1),  %%r9 ;"
1636                "adcq 16(%1), %%r10 ;"
1637                "adcq 24(%1), %%r11 ;"
1638                "adcq     $0, %%rcx ;"
1639                "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
1640                "addq %%rcx,  %%r8 ;"
1641                "adcq    $0,  %%r9 ;"
1642                "movq  %%r9,  8(%0) ;"
1643                "adcq    $0, %%r10 ;"
1644                "movq %%r10, 16(%0) ;"
1645                "adcq    $0, %%r11 ;"
1646                "movq %%r11, 24(%0) ;"
1647                "mov     $0, %%ecx ;"
1648                "cmovc %%edx, %%ecx ;"
1649                "addq %%rcx,  %%r8 ;"
1650                "movq  %%r8,   (%0) ;"
1651                :
1652                : "r"(c), "r"(a)
1653                : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1654                  "%r11");
1655}
1656
1657static __always_inline void
1658add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b)
1659{
1660        asm volatile(
1661                "mov     $38, %%eax ;"
1662                "xorl  %%ecx, %%ecx ;"
1663                "movq   (%2),  %%r8 ;"
1664                "adcx   (%1),  %%r8 ;"
1665                "movq  8(%2),  %%r9 ;"
1666                "adcx  8(%1),  %%r9 ;"
1667                "movq 16(%2), %%r10 ;"
1668                "adcx 16(%1), %%r10 ;"
1669                "movq 24(%2), %%r11 ;"
1670                "adcx 24(%1), %%r11 ;"
1671                "cmovc %%eax, %%ecx ;"
1672                "xorl %%eax, %%eax  ;"
1673                "adcx %%rcx,  %%r8  ;"
1674                "adcx %%rax,  %%r9  ;"
1675                "movq  %%r9,  8(%0) ;"
1676                "adcx %%rax, %%r10  ;"
1677                "movq %%r10, 16(%0) ;"
1678                "adcx %%rax, %%r11  ;"
1679                "movq %%r11, 24(%0) ;"
1680                "mov     $38, %%ecx ;"
1681                "cmovc %%ecx, %%eax ;"
1682                "addq %%rax,  %%r8  ;"
1683                "movq  %%r8,   (%0) ;"
1684                :
1685                : "r"(c), "r"(a), "r"(b)
1686                : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1687}
1688
1689static __always_inline void
1690add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b)
1691{
1692        asm volatile(
1693                "mov     $38, %%eax ;"
1694                "movq   (%2),  %%r8 ;"
1695                "addq   (%1),  %%r8 ;"
1696                "movq  8(%2),  %%r9 ;"
1697                "adcq  8(%1),  %%r9 ;"
1698                "movq 16(%2), %%r10 ;"
1699                "adcq 16(%1), %%r10 ;"
1700                "movq 24(%2), %%r11 ;"
1701                "adcq 24(%1), %%r11 ;"
1702                "mov      $0, %%ecx ;"
1703                "cmovc %%eax, %%ecx ;"
1704                "addq %%rcx,  %%r8  ;"
1705                "adcq    $0,  %%r9  ;"
1706                "movq  %%r9,  8(%0) ;"
1707                "adcq    $0, %%r10  ;"
1708                "movq %%r10, 16(%0) ;"
1709                "adcq    $0, %%r11  ;"
1710                "movq %%r11, 24(%0) ;"
1711                "mov     $0, %%ecx  ;"
1712                "cmovc %%eax, %%ecx ;"
1713                "addq %%rcx,  %%r8  ;"
1714                "movq  %%r8,   (%0) ;"
1715                :
1716                : "r"(c), "r"(a), "r"(b)
1717                : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1718}
1719
1720static __always_inline void
1721sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b)
1722{
1723        asm volatile(
1724                "mov     $38, %%eax ;"
1725                "movq   (%1),  %%r8 ;"
1726                "subq   (%2),  %%r8 ;"
1727                "movq  8(%1),  %%r9 ;"
1728                "sbbq  8(%2),  %%r9 ;"
1729                "movq 16(%1), %%r10 ;"
1730                "sbbq 16(%2), %%r10 ;"
1731                "movq 24(%1), %%r11 ;"
1732                "sbbq 24(%2), %%r11 ;"
1733                "mov      $0, %%ecx ;"
1734                "cmovc %%eax, %%ecx ;"
1735                "subq %%rcx,  %%r8  ;"
1736                "sbbq    $0,  %%r9  ;"
1737                "movq  %%r9,  8(%0) ;"
1738                "sbbq    $0, %%r10  ;"
1739                "movq %%r10, 16(%0) ;"
1740                "sbbq    $0, %%r11  ;"
1741                "movq %%r11, 24(%0) ;"
1742                "mov     $0, %%ecx  ;"
1743                "cmovc %%eax, %%ecx ;"
1744                "subq %%rcx,  %%r8  ;"
1745                "movq  %%r8,   (%0) ;"
1746                :
1747                : "r"(c), "r"(a), "r"(b)
1748                : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1749}
1750
1751/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */
1752static __always_inline void
1753mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
1754{
1755        const u64 a24 = 121666;
1756        asm volatile(
1757                "movq     %2, %%rdx ;"
1758                "mulx   (%1),  %%r8, %%r10 ;"
1759                "mulx  8(%1),  %%r9, %%r11 ;"
1760                "addq %%r10,  %%r9 ;"
1761                "mulx 16(%1), %%r10, %%rax ;"
1762                "adcq %%r11, %%r10 ;"
1763                "mulx 24(%1), %%r11, %%rcx ;"
1764                "adcq %%rax, %%r11 ;"
1765                /**************************/
1766                "adcq    $0, %%rcx ;"
1767                "movl   $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
1768                "imul %%rdx, %%rcx ;"
1769                "addq %%rcx,  %%r8 ;"
1770                "adcq    $0,  %%r9 ;"
1771                "movq  %%r9,  8(%0) ;"
1772                "adcq    $0, %%r10 ;"
1773                "movq %%r10, 16(%0) ;"
1774                "adcq    $0, %%r11 ;"
1775                "movq %%r11, 24(%0) ;"
1776                "mov     $0, %%ecx ;"
1777                "cmovc %%edx, %%ecx ;"
1778                "addq %%rcx,  %%r8 ;"
1779                "movq  %%r8,   (%0) ;"
1780                :
1781                : "r"(c), "r"(a), "r"(a24)
1782                : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1783                  "%r11");
1784}
1785
1786static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
1787{
1788        struct {
1789                eltfp25519_1w_buffer buffer;
1790                eltfp25519_1w x0, x1, x2;
1791        } __aligned(32) m;
1792        u64 *T[4];
1793
1794        T[0] = m.x0;
1795        T[1] = c; /* x^(-1) */
1796        T[2] = m.x1;
1797        T[3] = m.x2;
1798
1799        copy_eltfp25519_1w(T[1], a);
1800        sqrn_eltfp25519_1w_adx(T[1], 1);
1801        copy_eltfp25519_1w(T[2], T[1]);
1802        sqrn_eltfp25519_1w_adx(T[2], 2);
1803        mul_eltfp25519_1w_adx(T[0], a, T[2]);
1804        mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
1805        copy_eltfp25519_1w(T[2], T[1]);
1806        sqrn_eltfp25519_1w_adx(T[2], 1);
1807        mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
1808        copy_eltfp25519_1w(T[2], T[0]);
1809        sqrn_eltfp25519_1w_adx(T[2], 5);
1810        mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
1811        copy_eltfp25519_1w(T[2], T[0]);
1812        sqrn_eltfp25519_1w_adx(T[2], 10);
1813        mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
1814        copy_eltfp25519_1w(T[3], T[2]);
1815        sqrn_eltfp25519_1w_adx(T[3], 20);
1816        mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
1817        sqrn_eltfp25519_1w_adx(T[3], 10);
1818        mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
1819        copy_eltfp25519_1w(T[0], T[3]);
1820        sqrn_eltfp25519_1w_adx(T[0], 50);
1821        mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
1822        copy_eltfp25519_1w(T[2], T[0]);
1823        sqrn_eltfp25519_1w_adx(T[2], 100);
1824        mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
1825        sqrn_eltfp25519_1w_adx(T[2], 50);
1826        mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
1827        sqrn_eltfp25519_1w_adx(T[2], 5);
1828        mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
1829
1830        memzero_explicit(&m, sizeof(m));
1831}
1832
1833static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
1834{
1835        struct {
1836                eltfp25519_1w_buffer buffer;
1837                eltfp25519_1w x0, x1, x2;
1838        } __aligned(32) m;
1839        u64 *T[5];
1840
1841        T[0] = m.x0;
1842        T[1] = c; /* x^(-1) */
1843        T[2] = m.x1;
1844        T[3] = m.x2;
1845
1846        copy_eltfp25519_1w(T[1], a);
1847        sqrn_eltfp25519_1w_bmi2(T[1], 1);
1848        copy_eltfp25519_1w(T[2], T[1]);
1849        sqrn_eltfp25519_1w_bmi2(T[2], 2);
1850        mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
1851        mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
1852        copy_eltfp25519_1w(T[2], T[1]);
1853        sqrn_eltfp25519_1w_bmi2(T[2], 1);
1854        mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
1855        copy_eltfp25519_1w(T[2], T[0]);
1856        sqrn_eltfp25519_1w_bmi2(T[2], 5);
1857        mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
1858        copy_eltfp25519_1w(T[2], T[0]);
1859        sqrn_eltfp25519_1w_bmi2(T[2], 10);
1860        mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
1861        copy_eltfp25519_1w(T[3], T[2]);
1862        sqrn_eltfp25519_1w_bmi2(T[3], 20);
1863        mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
1864        sqrn_eltfp25519_1w_bmi2(T[3], 10);
1865        mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
1866        copy_eltfp25519_1w(T[0], T[3]);
1867        sqrn_eltfp25519_1w_bmi2(T[0], 50);
1868        mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
1869        copy_eltfp25519_1w(T[2], T[0]);
1870        sqrn_eltfp25519_1w_bmi2(T[2], 100);
1871        mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
1872        sqrn_eltfp25519_1w_bmi2(T[2], 50);
1873        mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
1874        sqrn_eltfp25519_1w_bmi2(T[2], 5);
1875        mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
1876
1877        memzero_explicit(&m, sizeof(m));
1878}
1879
1880/* Given c, a 256-bit number, fred_eltfp25519_1w updates c
1881 * with a number such that 0 <= C < 2**255-19.
1882 */
1883static __always_inline void fred_eltfp25519_1w(u64 *const c)
1884{
1885        u64 tmp0 = 38, tmp1 = 19;
1886        asm volatile(
1887                "btrq   $63,    %3 ;" /* Put bit 255 in carry flag and clear */
1888                "cmovncl %k5,   %k4 ;" /* c[255] ? 38 : 19 */
1889
1890                /* Add either 19 or 38 to c */
1891                "addq    %4,   %0 ;"
1892                "adcq    $0,   %1 ;"
1893                "adcq    $0,   %2 ;"
1894                "adcq    $0,   %3 ;"
1895
1896                /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
1897                "movl    $0,  %k4 ;"
1898                "cmovnsl %k5,  %k4 ;" /* c[255] ? 0 : 19 */
1899                "btrq   $63,   %3 ;" /* Clear bit 255 */
1900
1901                /* Subtract 19 if necessary */
1902                "subq    %4,   %0 ;"
1903                "sbbq    $0,   %1 ;"
1904                "sbbq    $0,   %2 ;"
1905                "sbbq    $0,   %3 ;"
1906
1907                : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0),
1908                  "+r"(tmp1)
1909                :
1910                : "memory", "cc");
1911}
1912
1913static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
1914{
1915        u64 temp;
1916        asm volatile(
1917                "test %9, %9 ;"
1918                "movq %0, %8 ;"
1919                "cmovnzq %4, %0 ;"
1920                "cmovnzq %8, %4 ;"
1921                "movq %1, %8 ;"
1922                "cmovnzq %5, %1 ;"
1923                "cmovnzq %8, %5 ;"
1924                "movq %2, %8 ;"
1925                "cmovnzq %6, %2 ;"
1926                "cmovnzq %8, %6 ;"
1927                "movq %3, %8 ;"
1928                "cmovnzq %7, %3 ;"
1929                "cmovnzq %8, %7 ;"
1930                : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]),
1931                  "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]),
1932                  "=r"(temp)
1933                : "r"(bit)
1934                : "cc"
1935        );
1936}
1937
1938static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
1939{
1940        asm volatile(
1941                "test %4, %4 ;"
1942                "cmovnzq %5, %0 ;"
1943                "cmovnzq %6, %1 ;"
1944                "cmovnzq %7, %2 ;"
1945                "cmovnzq %8, %3 ;"
1946                : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3])
1947                : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3])
1948                : "cc"
1949        );
1950}
1951
1952static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE],
1953                           const u8 private_key[CURVE25519_KEY_SIZE],
1954                           const u8 session_key[CURVE25519_KEY_SIZE])
1955{
1956        struct {
1957                u64 buffer[4 * NUM_WORDS_ELTFP25519];
1958                u64 coordinates[4 * NUM_WORDS_ELTFP25519];
1959                u64 workspace[6 * NUM_WORDS_ELTFP25519];
1960                u8 session[CURVE25519_KEY_SIZE];
1961                u8 private[CURVE25519_KEY_SIZE];
1962        } __aligned(32) m;
1963
1964        int i = 0, j = 0;
1965        u64 prev = 0;
1966        u64 *const X1 = (u64 *)m.session;
1967        u64 *const key = (u64 *)m.private;
1968        u64 *const Px = m.coordinates + 0;
1969        u64 *const Pz = m.coordinates + 4;
1970        u64 *const Qx = m.coordinates + 8;
1971        u64 *const Qz = m.coordinates + 12;
1972        u64 *const X2 = Qx;
1973        u64 *const Z2 = Qz;
1974        u64 *const X3 = Px;
1975        u64 *const Z3 = Pz;
1976        u64 *const X2Z2 = Qx;
1977        u64 *const X3Z3 = Px;
1978
1979        u64 *const A = m.workspace + 0;
1980        u64 *const B = m.workspace + 4;
1981        u64 *const D = m.workspace + 8;
1982        u64 *const C = m.workspace + 12;
1983        u64 *const DA = m.workspace + 16;
1984        u64 *const CB = m.workspace + 20;
1985        u64 *const AB = A;
1986        u64 *const DC = D;
1987        u64 *const DACB = DA;
1988
1989        memcpy(m.private, private_key, sizeof(m.private));
1990        memcpy(m.session, session_key, sizeof(m.session));
1991
1992        curve25519_clamp_secret(m.private);
1993
1994        /* As in the draft:
1995         * When receiving such an array, implementations of curve25519
1996         * MUST mask the most-significant bit in the final byte. This
1997         * is done to preserve compatibility with point formats which
1998         * reserve the sign bit for use in other protocols and to
1999         * increase resistance to implementation fingerprinting
2000         */
2001        m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
2002
2003        copy_eltfp25519_1w(Px, X1);
2004        setzero_eltfp25519_1w(Pz);
2005        setzero_eltfp25519_1w(Qx);
2006        setzero_eltfp25519_1w(Qz);
2007
2008        Pz[0] = 1;
2009        Qx[0] = 1;
2010
2011        /* main-loop */
2012        prev = 0;
2013        j = 62;
2014        for (i = 3; i >= 0; --i) {
2015                while (j >= 0) {
2016                        u64 bit = (key[i] >> j) & 0x1;
2017                        u64 swap = bit ^ prev;
2018                        prev = bit;
2019
2020                        add_eltfp25519_1w_adx(A, X2, Z2);       /* A = (X2+Z2) */
2021                        sub_eltfp25519_1w(B, X2, Z2);           /* B = (X2-Z2) */
2022                        add_eltfp25519_1w_adx(C, X3, Z3);       /* C = (X3+Z3) */
2023                        sub_eltfp25519_1w(D, X3, Z3);           /* D = (X3-Z3) */
2024                        mul_eltfp25519_2w_adx(DACB, AB, DC);    /* [DA|CB] = [A|B]*[D|C] */
2025
2026                        cselect(swap, A, C);
2027                        cselect(swap, B, D);
2028
2029                        sqr_eltfp25519_2w_adx(AB);              /* [AA|BB] = [A^2|B^2] */
2030                        add_eltfp25519_1w_adx(X3, DA, CB);      /* X3 = (DA+CB) */
2031                        sub_eltfp25519_1w(Z3, DA, CB);          /* Z3 = (DA-CB) */
2032                        sqr_eltfp25519_2w_adx(X3Z3);            /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
2033
2034                        copy_eltfp25519_1w(X2, B);              /* X2 = B^2 */
2035                        sub_eltfp25519_1w(Z2, A, B);            /* Z2 = E = AA-BB */
2036
2037                        mul_a24_eltfp25519_1w(B, Z2);           /* B = a24*E */
2038                        add_eltfp25519_1w_adx(B, B, X2);        /* B = a24*E+B */
2039                        mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB);  /* [X2|Z2] = [B|E]*[A|a24*E+B] */
2040                        mul_eltfp25519_1w_adx(Z3, Z3, X1);      /* Z3 = Z3*X1 */
2041                        --j;
2042                }
2043                j = 63;
2044        }
2045
2046        inv_eltfp25519_1w_adx(A, Qz);
2047        mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
2048        fred_eltfp25519_1w((u64 *)shared);
2049
2050        memzero_explicit(&m, sizeof(m));
2051}
2052
2053static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE],
2054                                const u8 private_key[CURVE25519_KEY_SIZE])
2055{
2056        struct {
2057                u64 buffer[4 * NUM_WORDS_ELTFP25519];
2058                u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2059                u64 workspace[4 * NUM_WORDS_ELTFP25519];
2060                u8 private[CURVE25519_KEY_SIZE];
2061        } __aligned(32) m;
2062
2063        const int ite[4] = { 64, 64, 64, 63 };
2064        const int q = 3;
2065        u64 swap = 1;
2066
2067        int i = 0, j = 0, k = 0;
2068        u64 *const key = (u64 *)m.private;
2069        u64 *const Ur1 = m.coordinates + 0;
2070        u64 *const Zr1 = m.coordinates + 4;
2071        u64 *const Ur2 = m.coordinates + 8;
2072        u64 *const Zr2 = m.coordinates + 12;
2073
2074        u64 *const UZr1 = m.coordinates + 0;
2075        u64 *const ZUr2 = m.coordinates + 8;
2076
2077        u64 *const A = m.workspace + 0;
2078        u64 *const B = m.workspace + 4;
2079        u64 *const C = m.workspace + 8;
2080        u64 *const D = m.workspace + 12;
2081
2082        u64 *const AB = m.workspace + 0;
2083        u64 *const CD = m.workspace + 8;
2084
2085        const u64 *const P = table_ladder_8k;
2086
2087        memcpy(m.private, private_key, sizeof(m.private));
2088
2089        curve25519_clamp_secret(m.private);
2090
2091        setzero_eltfp25519_1w(Ur1);
2092        setzero_eltfp25519_1w(Zr1);
2093        setzero_eltfp25519_1w(Zr2);
2094        Ur1[0] = 1;
2095        Zr1[0] = 1;
2096        Zr2[0] = 1;
2097
2098        /* G-S */
2099        Ur2[3] = 0x1eaecdeee27cab34UL;
2100        Ur2[2] = 0xadc7a0b9235d48e2UL;
2101        Ur2[1] = 0xbbf095ae14b2edf8UL;
2102        Ur2[0] = 0x7e94e1fec82faabdUL;
2103
2104        /* main-loop */
2105        j = q;
2106        for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
2107                while (j < ite[i]) {
2108                        u64 bit = (key[i] >> j) & 0x1;
2109                        k = (64 * i + j - q);
2110                        swap = swap ^ bit;
2111                        cswap(swap, Ur1, Ur2);
2112                        cswap(swap, Zr1, Zr2);
2113                        swap = bit;
2114                        /* Addition */
2115                        sub_eltfp25519_1w(B, Ur1, Zr1);         /* B = Ur1-Zr1 */
2116                        add_eltfp25519_1w_adx(A, Ur1, Zr1);     /* A = Ur1+Zr1 */
2117                        mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */
2118                        sub_eltfp25519_1w(B, A, C);             /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
2119                        add_eltfp25519_1w_adx(A, A, C);         /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
2120                        sqr_eltfp25519_2w_adx(AB);              /* A = A^2      |  B = B^2 */
2121                        mul_eltfp25519_2w_adx(UZr1, ZUr2, AB);  /* Ur1 = Zr2*A  |  Zr1 = Ur2*B */
2122                        ++j;
2123                }
2124                j = 0;
2125        }
2126
2127        /* Doubling */
2128        for (i = 0; i < q; ++i) {
2129                add_eltfp25519_1w_adx(A, Ur1, Zr1);     /*  A = Ur1+Zr1 */
2130                sub_eltfp25519_1w(B, Ur1, Zr1);         /*  B = Ur1-Zr1 */
2131                sqr_eltfp25519_2w_adx(AB);              /*  A = A**2     B = B**2 */
2132                copy_eltfp25519_1w(C, B);               /*  C = B */
2133                sub_eltfp25519_1w(B, A, B);             /*  B = A-B */
2134                mul_a24_eltfp25519_1w(D, B);            /*  D = my_a24*B */
2135                add_eltfp25519_1w_adx(D, D, C);         /*  D = D+C */
2136                mul_eltfp25519_2w_adx(UZr1, AB, CD);    /*  Ur1 = A*B   Zr1 = Zr1*A */
2137        }
2138
2139        /* Convert to affine coordinates */
2140        inv_eltfp25519_1w_adx(A, Zr1);
2141        mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
2142        fred_eltfp25519_1w((u64 *)session_key);
2143
2144        memzero_explicit(&m, sizeof(m));
2145}
2146
2147static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE],
2148                            const u8 private_key[CURVE25519_KEY_SIZE],
2149                            const u8 session_key[CURVE25519_KEY_SIZE])
2150{
2151        struct {
2152                u64 buffer[4 * NUM_WORDS_ELTFP25519];
2153                u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2154                u64 workspace[6 * NUM_WORDS_ELTFP25519];
2155                u8 session[CURVE25519_KEY_SIZE];
2156                u8 private[CURVE25519_KEY_SIZE];
2157        } __aligned(32) m;
2158
2159        int i = 0, j = 0;
2160        u64 prev = 0;
2161        u64 *const X1 = (u64 *)m.session;
2162        u64 *const key = (u64 *)m.private;
2163        u64 *const Px = m.coordinates + 0;
2164        u64 *const Pz = m.coordinates + 4;
2165        u64 *const Qx = m.coordinates + 8;
2166        u64 *const Qz = m.coordinates + 12;
2167        u64 *const X2 = Qx;
2168        u64 *const Z2 = Qz;
2169        u64 *const X3 = Px;
2170        u64 *const Z3 = Pz;
2171        u64 *const X2Z2 = Qx;
2172        u64 *const X3Z3 = Px;
2173
2174        u64 *const A = m.workspace + 0;
2175        u64 *const B = m.workspace + 4;
2176        u64 *const D = m.workspace + 8;
2177        u64 *const C = m.workspace + 12;
2178        u64 *const DA = m.workspace + 16;
2179        u64 *const CB = m.workspace + 20;
2180        u64 *const AB = A;
2181        u64 *const DC = D;
2182        u64 *const DACB = DA;
2183
2184        memcpy(m.private, private_key, sizeof(m.private));
2185        memcpy(m.session, session_key, sizeof(m.session));
2186
2187        curve25519_clamp_secret(m.private);
2188
2189        /* As in the draft:
2190         * When receiving such an array, implementations of curve25519
2191         * MUST mask the most-significant bit in the final byte. This
2192         * is done to preserve compatibility with point formats which
2193         * reserve the sign bit for use in other protocols and to
2194         * increase resistance to implementation fingerprinting
2195         */
2196        m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
2197
2198        copy_eltfp25519_1w(Px, X1);
2199        setzero_eltfp25519_1w(Pz);
2200        setzero_eltfp25519_1w(Qx);
2201        setzero_eltfp25519_1w(Qz);
2202
2203        Pz[0] = 1;
2204        Qx[0] = 1;
2205
2206        /* main-loop */
2207        prev = 0;
2208        j = 62;
2209        for (i = 3; i >= 0; --i) {
2210                while (j >= 0) {
2211                        u64 bit = (key[i] >> j) & 0x1;
2212                        u64 swap = bit ^ prev;
2213                        prev = bit;
2214
2215                        add_eltfp25519_1w_bmi2(A, X2, Z2);      /* A = (X2+Z2) */
2216                        sub_eltfp25519_1w(B, X2, Z2);           /* B = (X2-Z2) */
2217                        add_eltfp25519_1w_bmi2(C, X3, Z3);      /* C = (X3+Z3) */
2218                        sub_eltfp25519_1w(D, X3, Z3);           /* D = (X3-Z3) */
2219                        mul_eltfp25519_2w_bmi2(DACB, AB, DC);   /* [DA|CB] = [A|B]*[D|C] */
2220
2221                        cselect(swap, A, C);
2222                        cselect(swap, B, D);
2223
2224                        sqr_eltfp25519_2w_bmi2(AB);             /* [AA|BB] = [A^2|B^2] */
2225                        add_eltfp25519_1w_bmi2(X3, DA, CB);     /* X3 = (DA+CB) */
2226                        sub_eltfp25519_1w(Z3, DA, CB);          /* Z3 = (DA-CB) */
2227                        sqr_eltfp25519_2w_bmi2(X3Z3);           /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
2228
2229                        copy_eltfp25519_1w(X2, B);              /* X2 = B^2 */
2230                        sub_eltfp25519_1w(Z2, A, B);            /* Z2 = E = AA-BB */
2231
2232                        mul_a24_eltfp25519_1w(B, Z2);           /* B = a24*E */
2233                        add_eltfp25519_1w_bmi2(B, B, X2);       /* B = a24*E+B */
2234                        mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
2235                        mul_eltfp25519_1w_bmi2(Z3, Z3, X1);     /* Z3 = Z3*X1 */
2236                        --j;
2237                }
2238                j = 63;
2239        }
2240
2241        inv_eltfp25519_1w_bmi2(A, Qz);
2242        mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
2243        fred_eltfp25519_1w((u64 *)shared);
2244
2245        memzero_explicit(&m, sizeof(m));
2246}
2247
2248static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE],
2249                                 const u8 private_key[CURVE25519_KEY_SIZE])
2250{
2251        struct {
2252                u64 buffer[4 * NUM_WORDS_ELTFP25519];
2253                u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2254                u64 workspace[4 * NUM_WORDS_ELTFP25519];
2255                u8 private[CURVE25519_KEY_SIZE];
2256        } __aligned(32) m;
2257
2258        const int ite[4] = { 64, 64, 64, 63 };
2259        const int q = 3;
2260        u64 swap = 1;
2261
2262        int i = 0, j = 0, k = 0;
2263        u64 *const key = (u64 *)m.private;
2264        u64 *const Ur1 = m.coordinates + 0;
2265        u64 *const Zr1 = m.coordinates + 4;
2266        u64 *const Ur2 = m.coordinates + 8;
2267        u64 *const Zr2 = m.coordinates + 12;
2268
2269        u64 *const UZr1 = m.coordinates + 0;
2270        u64 *const ZUr2 = m.coordinates + 8;
2271
2272        u64 *const A = m.workspace + 0;
2273        u64 *const B = m.workspace + 4;
2274        u64 *const C = m.workspace + 8;
2275        u64 *const D = m.workspace + 12;
2276
2277        u64 *const AB = m.workspace + 0;
2278        u64 *const CD = m.workspace + 8;
2279
2280        const u64 *const P = table_ladder_8k;
2281
2282        memcpy(m.private, private_key, sizeof(m.private));
2283
2284        curve25519_clamp_secret(m.private);
2285
2286        setzero_eltfp25519_1w(Ur1);
2287        setzero_eltfp25519_1w(Zr1);
2288        setzero_eltfp25519_1w(Zr2);
2289        Ur1[0] = 1;
2290        Zr1[0] = 1;
2291        Zr2[0] = 1;
2292
2293        /* G-S */
2294        Ur2[3] = 0x1eaecdeee27cab34UL;
2295        Ur2[2] = 0xadc7a0b9235d48e2UL;
2296        Ur2[1] = 0xbbf095ae14b2edf8UL;
2297        Ur2[0] = 0x7e94e1fec82faabdUL;
2298
2299        /* main-loop */
2300        j = q;
2301        for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
2302                while (j < ite[i]) {
2303                        u64 bit = (key[i] >> j) & 0x1;
2304                        k = (64 * i + j - q);
2305                        swap = swap ^ bit;
2306                        cswap(swap, Ur1, Ur2);
2307                        cswap(swap, Zr1, Zr2);
2308                        swap = bit;
2309                        /* Addition */
2310                        sub_eltfp25519_1w(B, Ur1, Zr1);         /* B = Ur1-Zr1 */
2311                        add_eltfp25519_1w_bmi2(A, Ur1, Zr1);    /* A = Ur1+Zr1 */
2312                        mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */
2313                        sub_eltfp25519_1w(B, A, C);             /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
2314                        add_eltfp25519_1w_bmi2(A, A, C);        /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
2315                        sqr_eltfp25519_2w_bmi2(AB);             /* A = A^2      |  B = B^2 */
2316                        mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A  |  Zr1 = Ur2*B */
2317                        ++j;
2318                }
2319                j = 0;
2320        }
2321
2322        /* Doubling */
2323        for (i = 0; i < q; ++i) {
2324                add_eltfp25519_1w_bmi2(A, Ur1, Zr1);    /*  A = Ur1+Zr1 */
2325                sub_eltfp25519_1w(B, Ur1, Zr1);         /*  B = Ur1-Zr1 */
2326                sqr_eltfp25519_2w_bmi2(AB);             /*  A = A**2     B = B**2 */
2327                copy_eltfp25519_1w(C, B);               /*  C = B */
2328                sub_eltfp25519_1w(B, A, B);             /*  B = A-B */
2329                mul_a24_eltfp25519_1w(D, B);            /*  D = my_a24*B */
2330                add_eltfp25519_1w_bmi2(D, D, C);        /*  D = D+C */
2331                mul_eltfp25519_2w_bmi2(UZr1, AB, CD);   /*  Ur1 = A*B   Zr1 = Zr1*A */
2332        }
2333
2334        /* Convert to affine coordinates */
2335        inv_eltfp25519_1w_bmi2(A, Zr1);
2336        mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
2337        fred_eltfp25519_1w((u64 *)session_key);
2338
2339        memzero_explicit(&m, sizeof(m));
2340}
2341
2342void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
2343                     const u8 secret[CURVE25519_KEY_SIZE],
2344                     const u8 basepoint[CURVE25519_KEY_SIZE])
2345{
2346        if (static_branch_likely(&curve25519_use_adx))
2347                curve25519_adx(mypublic, secret, basepoint);
2348        else if (static_branch_likely(&curve25519_use_bmi2))
2349                curve25519_bmi2(mypublic, secret, basepoint);
2350        else
2351                curve25519_generic(mypublic, secret, basepoint);
2352}
2353EXPORT_SYMBOL(curve25519_arch);
2354
2355void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
2356                          const u8 secret[CURVE25519_KEY_SIZE])
2357{
2358        if (static_branch_likely(&curve25519_use_adx))
2359                curve25519_adx_base(pub, secret);
2360        else if (static_branch_likely(&curve25519_use_bmi2))
2361                curve25519_bmi2_base(pub, secret);
2362        else
2363                curve25519_generic(pub, secret, curve25519_base_point);
2364}
2365EXPORT_SYMBOL(curve25519_base_arch);
2366
2367static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
2368                                 unsigned int len)
2369{
2370        u8 *secret = kpp_tfm_ctx(tfm);
2371
2372        if (!len)
2373                curve25519_generate_secret(secret);
2374        else if (len == CURVE25519_KEY_SIZE &&
2375                 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
2376                memcpy(secret, buf, CURVE25519_KEY_SIZE);
2377        else
2378                return -EINVAL;
2379        return 0;
2380}
2381
2382static int curve25519_generate_public_key(struct kpp_request *req)
2383{
2384        struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
2385        const u8 *secret = kpp_tfm_ctx(tfm);
2386        u8 buf[CURVE25519_KEY_SIZE];
2387        int copied, nbytes;
2388
2389        if (req->src)
2390                return -EINVAL;
2391
2392        curve25519_base_arch(buf, secret);
2393
2394        /* might want less than we've got */
2395        nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
2396        copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
2397                                                                nbytes),
2398                                     buf, nbytes);
2399        if (copied != nbytes)
2400                return -EINVAL;
2401        return 0;
2402}
2403
2404static int curve25519_compute_shared_secret(struct kpp_request *req)
2405{
2406        struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
2407        const u8 *secret = kpp_tfm_ctx(tfm);
2408        u8 public_key[CURVE25519_KEY_SIZE];
2409        u8 buf[CURVE25519_KEY_SIZE];
2410        int copied, nbytes;
2411
2412        if (!req->src)
2413                return -EINVAL;
2414
2415        copied = sg_copy_to_buffer(req->src,
2416                                   sg_nents_for_len(req->src,
2417                                                    CURVE25519_KEY_SIZE),
2418                                   public_key, CURVE25519_KEY_SIZE);
2419        if (copied != CURVE25519_KEY_SIZE)
2420                return -EINVAL;
2421
2422        curve25519_arch(buf, secret, public_key);
2423
2424        /* might want less than we've got */
2425        nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
2426        copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
2427                                                                nbytes),
2428                                     buf, nbytes);
2429        if (copied != nbytes)
2430                return -EINVAL;
2431        return 0;
2432}
2433
2434static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
2435{
2436        return CURVE25519_KEY_SIZE;
2437}
2438
2439static struct kpp_alg curve25519_alg = {
2440        .base.cra_name          = "curve25519",
2441        .base.cra_driver_name   = "curve25519-x86",
2442        .base.cra_priority      = 200,
2443        .base.cra_module        = THIS_MODULE,
2444        .base.cra_ctxsize       = CURVE25519_KEY_SIZE,
2445
2446        .set_secret             = curve25519_set_secret,
2447        .generate_public_key    = curve25519_generate_public_key,
2448        .compute_shared_secret  = curve25519_compute_shared_secret,
2449        .max_size               = curve25519_max_size,
2450};
2451
2452static int __init curve25519_mod_init(void)
2453{
2454        if (boot_cpu_has(X86_FEATURE_BMI2))
2455                static_branch_enable(&curve25519_use_bmi2);
2456        else if (boot_cpu_has(X86_FEATURE_ADX))
2457                static_branch_enable(&curve25519_use_adx);
2458        else
2459                return 0;
2460        return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
2461                crypto_register_kpp(&curve25519_alg) : 0;
2462}
2463
2464static void __exit curve25519_mod_exit(void)
2465{
2466        if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
2467            (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX)))
2468                crypto_unregister_kpp(&curve25519_alg);
2469}
2470
2471module_init(curve25519_mod_init);
2472module_exit(curve25519_mod_exit);
2473
/*
 * Module aliases for both the algorithm name and the driver name declared
 * in curve25519_alg, followed by the module license tag.
 */
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-x86");
MODULE_LICENSE("GPL v2");
2477