1
2
3
4
5
6#include "tls.h"
7
8#define SP_DEBUG 0
9#define FIXED_SECRET 0
10#define FIXED_PEER_PUBKEY 0
11
12#define ALLOW_ASM 1
13
#if SP_DEBUG
# define dbg(...) fprintf(stderr, __VA_ARGS__)
/* Debug helper: print "len" bytes at "vp" as a hex string, through printf
 * format "fmt" (which must contain exactly one %s).
 */
static void dump_hex(const char *fmt, const void *vp, int len)
{
	char hexbuf[32 * 1024 + 4];
	const uint8_t *p = vp;

	/* bin2hex returns a pointer past the last char written; terminate there */
	bin2hex(hexbuf, (void*)p, len)[0] = '\0';
	dbg(fmt, hexbuf);
}
#else
# define dbg(...) ((void)0)
# define dump_hex(...) ((void)0)
#endif
28
/* One limb ("digit") of a 256-bit number, and its signed counterpart */
typedef uint32_t sp_digit;
typedef int32_t signed_sp_digit;

/* On GNU C / x86-64 we additionally access pairs of 32-bit limbs as
 * little-endian 64-bit words (possibly unaligned).
 */
#if defined(__GNUC__) && defined(__x86_64__)
# define UNALIGNED_LE_64BIT 1
#else
# define UNALIGNED_LE_64BIT 0
#endif

/* A point on the P256 curve, in Jacobian projective coordinates.
 * Each coordinate is eight 32-bit limbs, least significant limb first.
 */
typedef struct sp_point {
	sp_digit x[8]
#if ULONG_MAX > 0xffffffff
	/* 64-bit code casts coordinate arrays to uint64_t*: keep them aligned */
	ALIGNED(8)
#endif
	;
	sp_digit y[8];
	sp_digit z[8];
	int infinity; /* nonzero: this is the point at infinity */
} sp_point;
62
63
/* The P256 field prime: p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
 * least significant 32-bit limb first.
 */
static const sp_digit p256_mod[8] ALIGNED(8) = {
	0xffffffff,0xffffffff,0xffffffff,0x00000000,
	0x00000000,0x00000000,0x00000001,0xffffffff,
};

/* Montgomery multiplier mu = -1/p mod 2^32; it happens to be 1 for P256 */
#define p256_mp_mod ((sp_digit)0x000001)

/* Limbs are full 32-bit words and values are kept fully carried,
 * so "normalization" is a no-op in this representation.
 */
#define sp_256_norm_8(a) ((void)0)
73
74
75
76
77
78
79
/* Write the 256-bit number r (little-endian limbs) to a[0..31]
 * as big-endian bytes.
 */
#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
static void sp_256_to_bin_8(const sp_digit* rr, uint8_t* a)
{
	int i;
	const uint64_t* r = (void*)rr; /* view as four 64-bit limbs */

	sp_256_norm_8(rr);

	/* Emit limbs most significant first, each byte-swapped to big-endian */
	r += 4;
	for (i = 0; i < 4; i++) {
		r--;
		move_to_unaligned64(a, SWAP_BE64(*r));
		a += 8;
	}
}
#else
static void sp_256_to_bin_8(const sp_digit* r, uint8_t* a)
{
	int i;

	sp_256_norm_8(r);

	/* Emit 32-bit limbs most significant first, byte-swapped to big-endian */
	r += 8;
	for (i = 0; i < 8; i++) {
		r--;
		move_to_unaligned32(a, SWAP_BE32(*r));
		a += 4;
	}
}
#endif
110
111
112
113
114
115
116
/* Read a 256-bit big-endian byte string a[0..31] into r
 * (little-endian limbs). Inverse of sp_256_to_bin_8.
 */
#if BB_UNALIGNED_MEMACCESS_OK && ULONG_MAX > 0xffffffff
static void sp_256_from_bin_8(sp_digit* rr, const uint8_t* a)
{
	int i;
	uint64_t* r = (void*)rr; /* view as four 64-bit limbs */

	/* First bytes are most significant: fill limbs from the top down */
	r += 4;
	for (i = 0; i < 4; i++) {
		uint64_t v;
		move_from_unaligned64(v, a);
		*--r = SWAP_BE64(v);
		a += 8;
	}
}
#else
static void sp_256_from_bin_8(sp_digit* r, const uint8_t* a)
{
	int i;

	/* First bytes are most significant: fill limbs from the top down */
	r += 8;
	for (i = 0; i < 8; i++) {
		sp_digit v;
		move_from_unaligned32(v, a);
		*--r = SWAP_BE32(v);
		a += 4;
	}
}
#endif
145
#if SP_DEBUG
/* Debug: print a 256-bit number (big-endian hex) via fmt's %s */
static void dump_256(const char *fmt, const sp_digit* r)
{
	uint8_t b32[32];
	sp_256_to_bin_8(r, b32);
	dump_hex(fmt, b32, 32);
}
/* Debug: print a 512-bit number; high half (r[8..15]) is serialized first
 * so the whole value reads most-significant-first.
 */
static void dump_512(const char *fmt, const sp_digit* r)
{
	uint8_t b64[64];
	sp_256_to_bin_8(r, b64 + 32);
	sp_256_to_bin_8(r+8, b64);
	dump_hex(fmt, b64, 64);
}
#else
# define dump_256(...) ((void)0)
# define dump_512(...) ((void)0)
#endif
164
165
166static void sp_256_point_from_bin2x32(sp_point* p, const uint8_t *bin2x32)
167{
168 memset(p, 0, sizeof(*p));
169
170 sp_256_from_bin_8(p->x, bin2x32);
171 sp_256_from_bin_8(p->y, bin2x32 + 32);
172 p->z[0] = 1;
173}
174
175
176
177
178
179
/* Compare two 256-bit numbers: returns +1 if a > b, -1 if a < b, 0 if equal.
 * Scans from the most significant limb down and returns at the first
 * differing limb (timing is data-dependent).
 */
#if UNALIGNED_LE_64BIT
static signed_sp_digit sp_256_cmp_8(const sp_digit* aa, const sp_digit* bb)
{
	const uint64_t* a = (void*)aa; /* compare as four 64-bit limbs */
	const uint64_t* b = (void*)bb;
	int i;
	for (i = 3; i >= 0; i--) {
		if (a[i] == b[i])
			continue;
		return (a[i] > b[i]) * 2 - 1; /* maps true/false to +1/-1 */
	}
	return 0;
}
#else
static signed_sp_digit sp_256_cmp_8(const sp_digit* a, const sp_digit* b)
{
	int i;
	for (i = 7; i >= 0; i--) {
		if (a[i] == b[i])
			continue;
		return (a[i] > b[i]) * 2 - 1; /* maps true/false to +1/-1 */
	}
	return 0;
}
#endif
210
211
212
213
214
215static int sp_256_cmp_equal_8(const sp_digit* a, const sp_digit* b)
216{
217 return sp_256_cmp_8(a, b) == 0;
218}
219
220
/* r = a + b (256 bits). Returns nonzero if the addition carried out of
 * bit 255, else 0. (The asm paths return 0/-1, the portable path 0/1;
 * callers only test for nonzero.)
 */
static int sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
	/* Eight 32-bit adds, carry chained through the flags register */
	sp_digit reg;
	asm volatile (
"\n		movl	(%0), %3"
"\n		addl	(%1), %3"
"\n		movl	%3, (%2)"
"\n"
"\n		movl	1*4(%0), %3"
"\n		adcl	1*4(%1), %3"
"\n		movl	%3, 1*4(%2)"
"\n"
"\n		movl	2*4(%0), %3"
"\n		adcl	2*4(%1), %3"
"\n		movl	%3, 2*4(%2)"
"\n"
"\n		movl	3*4(%0), %3"
"\n		adcl	3*4(%1), %3"
"\n		movl	%3, 3*4(%2)"
"\n"
"\n		movl	4*4(%0), %3"
"\n		adcl	4*4(%1), %3"
"\n		movl	%3, 4*4(%2)"
"\n"
"\n		movl	5*4(%0), %3"
"\n		adcl	5*4(%1), %3"
"\n		movl	%3, 5*4(%2)"
"\n"
"\n		movl	6*4(%0), %3"
"\n		adcl	6*4(%1), %3"
"\n		movl	%3, 6*4(%2)"
"\n"
"\n		movl	7*4(%0), %3"
"\n		adcl	7*4(%1), %3"
"\n		movl	%3, 7*4(%2)"
"\n"
"\n		sbbl	%3, %3" /* reg = 0 or -1 = final carry, sign-extended */
"\n"
		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
		: "0" (a), "1" (b), "2" (r)
		: "memory"
	);
	return reg;
#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
	/* Same idea with four 64-bit adds */
	uint64_t reg;
	asm volatile (
"\n		movq	(%0), %3"
"\n		addq	(%1), %3"
"\n		movq	%3, (%2)"
"\n"
"\n		movq	1*8(%0), %3"
"\n		adcq	1*8(%1), %3"
"\n		movq	%3, 1*8(%2)"
"\n"
"\n		movq	2*8(%0), %3"
"\n		adcq	2*8(%1), %3"
"\n		movq	%3, 2*8(%2)"
"\n"
"\n		movq	3*8(%0), %3"
"\n		adcq	3*8(%1), %3"
"\n		movq	%3, 3*8(%2)"
"\n"
"\n		sbbq	%3, %3" /* reg = 0 or -1 = final carry, sign-extended */
"\n"
		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
		: "0" (a), "1" (b), "2" (r)
		: "memory"
	);
	return reg;
#else
	/* Portable version: detect carry by unsigned wraparound */
	int i;
	sp_digit carry;

	carry = 0;
	for (i = 0; i < 8; i++) {
		sp_digit w, v;
		w = b[i] + carry;
		v = a[i];
		if (w != 0) {
			v = a[i] + w;
			carry = (v < a[i]); /* wrapped => carry out */
		}
		/* else: either b[i]==0 and carry==0 (nothing to add, carry
		 * stays 0), or b[i]==0xffffffff and carry==1 (w wrapped to 0:
		 * r[i] = a[i] and carry stays 1) - both already correct */
		r[i] = v;
	}
	return carry;
#endif
}
315
316
/* r = a - b (256 bits). Returns nonzero if the subtraction borrowed,
 * else 0. (Asm paths return 0/-1, portable path 0/1; callers only
 * test for nonzero.)
 */
static int sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
	/* Eight 32-bit subtracts, borrow chained through the flags register */
	sp_digit reg;
	asm volatile (
"\n		movl	(%0), %3"
"\n		subl	(%1), %3"
"\n		movl	%3, (%2)"
"\n"
"\n		movl	1*4(%0), %3"
"\n		sbbl	1*4(%1), %3"
"\n		movl	%3, 1*4(%2)"
"\n"
"\n		movl	2*4(%0), %3"
"\n		sbbl	2*4(%1), %3"
"\n		movl	%3, 2*4(%2)"
"\n"
"\n		movl	3*4(%0), %3"
"\n		sbbl	3*4(%1), %3"
"\n		movl	%3, 3*4(%2)"
"\n"
"\n		movl	4*4(%0), %3"
"\n		sbbl	4*4(%1), %3"
"\n		movl	%3, 4*4(%2)"
"\n"
"\n		movl	5*4(%0), %3"
"\n		sbbl	5*4(%1), %3"
"\n		movl	%3, 5*4(%2)"
"\n"
"\n		movl	6*4(%0), %3"
"\n		sbbl	6*4(%1), %3"
"\n		movl	%3, 6*4(%2)"
"\n"
"\n		movl	7*4(%0), %3"
"\n		sbbl	7*4(%1), %3"
"\n		movl	%3, 7*4(%2)"
"\n"
"\n		sbbl	%3, %3" /* reg = 0 or -1 = final borrow, sign-extended */
"\n"
		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
		: "0" (a), "1" (b), "2" (r)
		: "memory"
	);
	return reg;
#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
	/* Same idea with four 64-bit subtracts */
	uint64_t reg;
	asm volatile (
"\n		movq	(%0), %3"
"\n		subq	(%1), %3"
"\n		movq	%3, (%2)"
"\n"
"\n		movq	1*8(%0), %3"
"\n		sbbq	1*8(%1), %3"
"\n		movq	%3, 1*8(%2)"
"\n"
"\n		movq	2*8(%0), %3"
"\n		sbbq	2*8(%1), %3"
"\n		movq	%3, 2*8(%2)"
"\n"
"\n		movq	3*8(%0), %3"
"\n		sbbq	3*8(%1), %3"
"\n		movq	%3, 3*8(%2)"
"\n"
"\n		sbbq	%3, %3" /* reg = 0 or -1 = final borrow, sign-extended */
"\n"
		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
		: "0" (a), "1" (b), "2" (r)
		: "memory"
	);
	return reg;
#else
	/* Portable version: detect borrow by unsigned wraparound */
	int i;
	sp_digit borrow;

	borrow = 0;
	for (i = 0; i < 8; i++) {
		sp_digit w, v;
		w = b[i] + borrow;
		v = a[i];
		if (w != 0) {
			v = a[i] - w;
			borrow = (v > a[i]); /* wrapped => borrow out */
		}
		/* else: either b[i]==0 and borrow==0 (nothing to subtract),
		 * or b[i]==0xffffffff and borrow==1 (w wrapped to 0:
		 * r[i] = a[i] and borrow stays 1) - both already correct */
		r[i] = v;
	}
	return borrow;
#endif
}
411
412
/* r -= p256_mod, in place, modulo 2^256 (any borrow out of the top limb
 * is deliberately dropped - callers use this to reduce a value that
 * overflowed or is known to be >= p).
 */
#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
static void sp_256_sub_8_p256_mod(sp_digit* r)
{
	/* Subtract p's limbs { ffffffff,ffffffff,ffffffff,0,0,0,1,ffffffff }
	 * as immediates, borrow chained through the flags.
	 */
	asm volatile (
"\n		subl	$0xffffffff, (%0)"
"\n		sbbl	$0xffffffff, 1*4(%0)"
"\n		sbbl	$0xffffffff, 2*4(%0)"
"\n		sbbl	$0, 3*4(%0)"
"\n		sbbl	$0, 4*4(%0)"
"\n		sbbl	$0, 5*4(%0)"
"\n		sbbl	$1, 6*4(%0)"
"\n		sbbl	$0xffffffff, 7*4(%0)"
"\n"
		: "=r" (r)
		: "0" (r)
		: "memory"
	);
}
#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
static void sp_256_sub_8_p256_mod(sp_digit* r)
{
	uint64_t reg;
	uint64_t ooff;

	/* p as 64-bit limbs: { ffffffffffffffff, 00000000ffffffff, 0,
	 * ffffffff00000001 }. Avoids loading 64-bit immediates:
	 * - limb 0: r[0] - (2^64-1) == r[0] + 1 with the borrow flag being
	 *   the complement of the add's carry, hence addq $1 + cmc;
	 * - limb 3: subtracting ffffffff00000001 equals adding 00000000ffffffff
	 *   (mod 2^64); the final carry out is dropped on purpose.
	 */
	asm volatile (
"\n		addq	$1, (%0)" /* adding 1 == subtracting ffffffffffffffff */
"\n		cmc" /* invert carry: it becomes the borrow */
"\n"
"\n		sbbq	%1, 1*8(%0)" /* %1 holds 00000000ffffffff */
"\n"
"\n		sbbq	$0, 2*8(%0)"
"\n"
"\n		movq	3*8(%0), %2"
"\n		sbbq	$0, %2" /* subtract only the borrow here... */
"\n		addq	%1, %2" /* ...then add -p[3] mod 2^64 == 00000000ffffffff */
"\n		movq	%2, 3*8(%0)"
"\n"
		: "=r" (r), "=r" (ooff), "=r" (reg)
		: "0" (r), "1" (0x00000000ffffffff)
		: "memory"
	);
}
#else
static void sp_256_sub_8_p256_mod(sp_digit* r)
{
	sp_256_sub_8(r, r, p256_mod);
}
#endif
462
463
464
465
/* r = a * b: full 256x256 -> 512 bit multiply.
 * All variants use product scanning ("comba"): for each output column k,
 * accumulate a[i]*b[j] over all i+j == k into a wide accumulator, emit
 * the low limb, and shift the accumulator down one limb.
 */
static void sp_256to512_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
	int k;
	uint32_t accl; /* low 32 bits of the column accumulator */
	uint32_t acch; /* middle 32 bits */

	acch = accl = 0;
	for (k = 0; k < 15; k++) {
		int i, j;
		uint32_t acc_hi; /* top (overflow) bits of the accumulator */
		i = k - 7;
		if (i < 0)
			i = 0;
		j = k - i;
		acc_hi = 0;
		do {
			/* (acc_hi:acch:accl) += a[i] * b[j]; a[i] is preloaded
			 * into eax, mull leaves the product in edx:eax
			 */
			asm volatile (
"\n		mull	%7"
"\n		addl	%%eax, %0"
"\n		adcl	%%edx, %1"
"\n		adcl	$0, %2"
				: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
				: "0" (accl), "1" (acch), "2" (acc_hi), "a" (a[i]), "m" (b[j])
				: "cc", "dx" /* "dx" = edx clobbered by mull */
			);
			j--;
			i++;
		} while (i != 8 && i <= k);
		r[k] = accl;
		accl = acch; /* shift accumulator down one 32-bit limb */
		acch = acc_hi;
	}
	r[15] = accl;
#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
	/* Same scheme over four 64-bit limbs per operand (7 columns) */
	const uint64_t* aa = (const void*)a;
	const uint64_t* bb = (const void*)b;
	uint64_t* rr = (void*)r;
	int k;
	uint64_t accl;
	uint64_t acch;

	acch = accl = 0;
	for (k = 0; k < 7; k++) {
		int i, j;
		uint64_t acc_hi;
		i = k - 3;
		if (i < 0)
			i = 0;
		j = k - i;
		acc_hi = 0;
		do {
			/* (acc_hi:acch:accl) += aa[i] * bb[j], via mulq (rdx:rax) */
			asm volatile (
"\n		mulq	%7"
"\n		addq	%%rax, %0"
"\n		adcq	%%rdx, %1"
"\n		adcq	$0, %2"
				: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
				: "0" (accl), "1" (acch), "2" (acc_hi), "a" (aa[i]), "m" (bb[j])
				: "cc", "dx" /* "dx" = rdx clobbered by mulq */
			);
			j--;
			i++;
		} while (i != 4 && i <= k);
		rr[k] = accl;
		accl = acch;
		acch = acc_hi;
	}
	rr[7] = accl;
#elif 0
	/* Disabled 32-bit ARM version, kept for reference */
	asm volatile (
"\n		mov	r5, #0"
"\n		mov	r6, #0"
"\n		mov	r7, #0"
"\n		mov	r8, #0"
"\n	1:"
"\n		subs	r3, r5, #28"
"\n		movcc	r3, #0"
"\n		sub	r4, r5, r3"
"\n	2:"
"\n		ldr	r14, [%[a], r3]"
"\n		ldr	r12, [%[b], r4]"
"\n		umull	r9, r10, r14, r12"
"\n		adds	r6, r6, r9"
"\n		adcs	r7, r7, r10"
"\n		adc	r8, r8, #0"
"\n		add	r3, r3, #4"
"\n		sub	r4, r4, #4"
"\n		cmp	r3, #32"
"\n		beq	3f"
"\n		cmp	r3, r5"
"\n		ble	2b"
"\n	3:"
"\n		str	r6, [%[r], r5]"
"\n		mov	r6, r7"
"\n		mov	r7, r8"
"\n		mov	r8, #0"
"\n		add	r5, r5, #4"
"\n		cmp	r5, #56"
"\n		ble	1b"
"\n		str	r6, [%[r], r5]"
		: [r] "r" (r), [a] "r" (a), [b] "r" (b)
		: "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14"
	);
#else
	/* Portable version: 64-bit accumulator plus explicit overflow counter */
	int i, j, k;
	uint64_t acc;

	acc = 0;
	for (k = 0; k < 15; k++) {
		uint32_t acc_hi;
		i = k - 7;
		if (i < 0)
			i = 0;
		j = k - i;
		acc_hi = 0;
		do {
			uint64_t m = ((uint64_t)a[i]) * b[j];
			acc += m;
			if (acc < m) /* 64-bit accumulator overflowed */
				acc_hi++;
			j--;
			i++;
		} while (i != 8 && i <= k);
		r[k] = acc;
		acc = (acc >> 32) | ((uint64_t)acc_hi << 32);
	}
	r[15] = acc;
#endif
}
607
608
/* Shift the 256-bit number r right by one bit, shifting "carry" (treated
 * as a boolean) into the vacated top bit.
 */
#if UNALIGNED_LE_64BIT
static void sp_256_rshift1_8(sp_digit* rr, uint64_t carry)
{
	uint64_t *r = (void*)rr; /* view as four 64-bit limbs */
	int i;

	carry = (((uint64_t)!!carry) << 63); /* incoming bit -> top of limb */
	for (i = 3; i >= 0; i--) {
		uint64_t c = r[i] << 63; /* save this limb's low bit */
		r[i] = (r[i] >> 1) | carry;
		carry = c; /* it becomes the next (lower) limb's top bit */
	}
}
#else
static void sp_256_rshift1_8(sp_digit* r, sp_digit carry)
{
	int i;

	carry = (((sp_digit)!!carry) << 31); /* incoming bit -> top of limb */
	for (i = 7; i >= 0; i--) {
		sp_digit c = r[i] << 31; /* save this limb's low bit */
		r[i] = (r[i] >> 1) | carry;
		carry = c; /* it becomes the next (lower) limb's top bit */
	}
}
#endif
635
636
637static void sp_256_div2_8(sp_digit* r )
638{
639 const sp_digit* m = p256_mod;
640
641 int carry = 0;
642 if (r[0] & 1)
643 carry = sp_256_add_8(r, r, m);
644 sp_256_norm_8(r);
645 sp_256_rshift1_8(r, carry);
646}
647
648
649static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b
650 )
651{
652
653
654 int carry = sp_256_add_8(r, a, b);
655 sp_256_norm_8(r);
656 if (carry) {
657 sp_256_sub_8_p256_mod(r);
658 sp_256_norm_8(r);
659 }
660}
661
662
663static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b
664 )
665{
666 const sp_digit* m = p256_mod;
667
668 int borrow;
669 borrow = sp_256_sub_8(r, a, b);
670 sp_256_norm_8(r);
671 if (borrow) {
672 sp_256_add_8(r, r, m);
673 sp_256_norm_8(r);
674 }
675}
676
677
/* r = 2*a mod p256_mod, Montgomery form (lazy reduction on overflow) */
static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a )
{
	int carry = sp_256_add_8(r, a, a);
	sp_256_norm_8(r);
	if (carry)
		sp_256_sub_8_p256_mod(r);
	sp_256_norm_8(r);
}
688
689
/* r = 3*a mod p256_mod, Montgomery form, computed as (a+a)+a.
 * Reduction is lazy (only on 256-bit overflow), consistent with the rest
 * of this file: intermediate values may exceed p but always fit 256 bits.
 */
static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a )
{
	int carry = sp_256_add_8(r, a, a);
	sp_256_norm_8(r);
	if (carry) {
		sp_256_sub_8_p256_mod(r);
		sp_256_norm_8(r);
	}
	carry = sp_256_add_8(r, r, a);
	sp_256_norm_8(r);
	if (carry) {
		sp_256_sub_8_p256_mod(r);
		sp_256_norm_8(r);
	}
}
707
708
709static void sp_512to256_mont_shift_8(sp_digit* r, sp_digit* a)
710{
711 memcpy(r, a + 8, sizeof(*r) * 8);
712}
713
714#if UNALIGNED_LE_64BIT
715
716
717
718
719
720
721
/* One Montgomery reduction step, 64-bit limb version:
 * r[0..4] += p256_mod * b, where b = r[0] (mu == 1 for P256, so the
 * step multiplier is just the lowest remaining limb).
 * Returns the carry out of r[4].
 */
static int sp_256_mul_add_4(uint64_t *r )
{
	uint64_t b = r[0];

# if 0
	/* Generic schoolbook version, kept for reference */
	const uint64_t* a = (const void*)p256_mod;

	uint128_t t;
	int i;
	t = 0;
	for (i = 0; i < 4; i++) {
		uint32_t t_hi;
		uint128_t m = ((uint128_t)b * a[i]) + r[i];
		t += m;
		t_hi = (t < m);
		r[i] = (uint64_t)t;
		t = (t >> 64) | ((uint128_t)t_hi << 64);
	}
	r[4] += (uint64_t)t;
	return (r[4] < (uint64_t)t);
# else
	/* Optimized version exploiting p256_mod's 64-bit limbs:
	 *   { ffffffffffffffff, 00000000ffffffff, 0000000000000000, ffffffff00000001 }
	 *
	 * Limb 0: b*ffffffffffffffff + r[0]  ==  b*(2^64-1) + b  ==  b << 64
	 *   (because r[0] == b). The low 64 bits are 0; they are never read
	 *   again and get dropped by the final 256-bit shift, so r[0] is not
	 *   stored. The carry b is folded into the limb-1 step:
	 * Limb 1: b*00000000ffffffff == (b<<32) - b; plus the carry b this is
	 *   exactly (b << 32), with high part (b >> 32).
	 */
	uint64_t t64, t64u;

	r[1] += (b << 32);
	/* carry out of r[1], then the high half of b*ffffffff */
	t64 = (r[1] < (b << 32));
	t64 += (b >> 32);

	/* Limb 2: modulus limb is 0 - only propagate the carry */
	r[2] += t64;
	t64 = (r[2] < t64);

	/* Limb 3: r[3] += carry + b*ffffffff00000001.
	 * b*ffffffff00000001 == b + ((b*ffffffff) << 32), accumulated into
	 * the 128-bit pair (t64u:t64).
	 */
	t64 += b;
	t64u = (t64 < b);
	t64 += r[3];
	t64u += (t64 < r[3]);
	{
		uint64_t lo, hi;
		/* b*ffffffff == (b<<32) - b, as a 96-bit value in (b:hi):
		 * hi = low 64 bits, b = high bits (with borrow) */
		hi = (b << 32) - b;
		b = (b >> 32) - ((b << 32) < b);
		/* now shift that 96-bit value left by 32 and add it in */
		lo = hi << 32;
		t64 += lo;
		t64u += (t64 < lo);
		hi >>= 32;
		hi |= (b << 32);
		t64u += hi;
	}
	r[3] = t64;
	/* carry into limb 4; report any overflow out of r[4] to the caller */
	r[4] += t64u;
	return (r[4] < t64u);
# endif
}
823
/* Montgomery-reduce the 512-bit number aa to 256 bits mod p256_mod,
 * 64-bit limb version. Result goes to r (may overlap aa's high half).
 */
static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* aa)
{
	int i;
	uint64_t *a = (void*)aa;

	sp_digit carry = 0;
	/* Four reduction steps zero the four low 64-bit limbs in turn */
	for (i = 0; i < 4; i++) {
		if (sp_256_mul_add_4(a+i )) {
			/* carry out of limb i+4: ripple it through the high limbs */
			int j = i + 4;
 inc_next_word:
			if (++j > 7) {
				carry++; /* carried out of the whole 512-bit value */
				continue;
			}
			if (++a[j] == 0) /* this limb wrapped: keep rippling */
				goto inc_next_word;
		}
	}
	sp_512to256_mont_shift_8(r, aa); /* drop the (now zero) low 256 bits */
	if (carry != 0)
		sp_256_sub_8_p256_mod(r);
	sp_256_norm_8(r);
}
849
850#else
851
852
853
854
855
/* One Montgomery reduction step, 32-bit limb version:
 * r[0..8] += p256_mod * b, where b = r[0] (mu == 1 for P256).
 * Returns the carry out of r[8].
 */
static int sp_256_mul_add_8(sp_digit* r )
{
	sp_digit b = r[0];
	uint64_t t;

# if 0
	/* Generic schoolbook version, kept for reference */
	const sp_digit* a = p256_mod;

	int i;
	t = 0;
	for (i = 0; i < 8; i++) {
		uint32_t t_hi;
		uint64_t m = ((uint64_t)b * a[i]) + r[i];
		t += m;
		t_hi = (t < m);
		r[i] = (sp_digit)t;
		t = (t >> 32) | ((uint64_t)t_hi << 32);
	}
	r[8] += (sp_digit)t;
	return (r[8] < (sp_digit)t);
# else
	/* Optimized version exploiting p256_mod's limbs:
	 *   { ffffffff, ffffffff, ffffffff, 0, 0, 0, 1, ffffffff }
	 *
	 * Limbs 0..2: b * { ffffffff,ffffffff,ffffffff } == (b << 96) - b.
	 * Adding that to r[0..2] (where r[0] == b) zeroes r[0], leaves
	 * r[1], r[2] unchanged, and carries b into limb 3. r[0]'s new value
	 * is never read again (it is dropped by the final 256-bit shift),
	 * so limbs 0..2 need no stores at all:
	 */
	uint64_t m;
	uint32_t t32;

	/* carry b into limb 3 */
	r[3] = r[3] + b;
	t32 = (r[3] < b);
	/* modulus limbs 4 and 5 are 0: just ripple the carry */
	if (t32 != 0) {
		r[4]++;
		t32 = (r[4] == 0);
		if (t32 != 0) {
			r[5]++;
			t32 = (r[5] == 0);
		}
	}
	/* limb 6: modulus limb is 1 - add b plus the pending carry */
	t = t32 + (uint64_t)b + r[6];
	r[6] = (sp_digit)t;
	t = (t >> 32);
	/* limb 7: modulus limb is ffffffff - b*ffffffff == (b << 32) - b */
	m = ((uint64_t)b << 32) - b + r[7];
	t += m;
	r[7] = (sp_digit)t;
	t = (t >> 32);
	/* final carry into r[8]; report overflow out of r[8] */
	r[8] += (sp_digit)t;
	return (r[8] < (sp_digit)t);
# endif
}
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009static void sp_512to256_mont_reduce_8(sp_digit* r, sp_digit* a)
1010{
1011
1012 sp_digit mp = p256_mp_mod;
1013
1014 int i;
1015
1016
1017 if (mp != 1) {
1018 sp_digit word16th = 0;
1019 for (i = 0; i < 8; i++) {
1020
1021 if (sp_256_mul_add_8(a+i )) {
1022 int j = i + 8;
1023 inc_next_word0:
1024 if (++j > 15) {
1025 word16th++;
1026 continue;
1027 }
1028 if (++a[j] == 0)
1029 goto inc_next_word0;
1030 }
1031 }
1032 sp_512to256_mont_shift_8(r, a);
1033 if (word16th != 0)
1034 sp_256_sub_8_p256_mod(r);
1035 sp_256_norm_8(r);
1036 }
1037 else {
1038 sp_digit word16th = 0;
1039 for (i = 0; i < 8; i++) {
1040
1041 if (sp_256_mul_add_8(a+i )) {
1042 int j = i + 8;
1043 inc_next_word:
1044 if (++j > 15) {
1045 word16th++;
1046 continue;
1047 }
1048 if (++a[j] == 0)
1049 goto inc_next_word;
1050 }
1051 }
1052 sp_512to256_mont_shift_8(r, a);
1053 if (word16th != 0)
1054 sp_256_sub_8_p256_mod(r);
1055 sp_256_norm_8(r);
1056 }
1057}
1058#endif
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
1070 )
1071{
1072
1073
1074 sp_digit t[2 * 8];
1075 sp_256to512_mul_8(t, a, b);
1076 sp_512to256_mont_reduce_8(r, t );
1077}
1078
1079
1080
1081
1082
1083
1084
1085
1086static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a
1087 )
1088{
1089
1090
1091 sp_256_mont_mul_8(r, a, a );
1092}
1093
/* Montgomery-multiply a and b, then run one extra Montgomery reduction
 * over the (zero-extended) 256-bit result. Net effect: r = a*b*R^-2 mod p,
 * i.e. for Montgomery-form inputs this both multiplies and strips the
 * Montgomery factor - used by sp_256_map_8 to produce plain values.
 * NOINLINE: presumably to limit callers' stack growth (64-byte local).
 */
static NOINLINE void sp_256_mont_mul_and_reduce_8(sp_digit* r,
		const sp_digit* a, const sp_digit* b
		)
{
	sp_digit rr[2 * 8];

	sp_256_mont_mul_8(rr, a, b );
	memset(rr + 8, 0, sizeof(rr) / 2); /* zero-extend to 512 bits */
	sp_512to256_mont_reduce_8(r, rr );
}
1104
1105
1106
1107
1108
1109
1110
/* Invert a mod p256_mod, in Montgomery form: r = a^-1 = a^(p-2) mod p
 * (Fermat's little theorem), by square-and-multiply, MSB first.
 * p-2 = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff fffffffd,
 * i.e. bits 255..224 set, bit 192 set, and bits 95..0 all set except bit 1.
 * r starts as a (accounting for bit 255); the loop handles bits 254..0,
 * and the condition below is exactly "bit i of p-2 is set".
 */
static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a)
{
	int i;

	memcpy(r, a, sizeof(sp_digit) * 8);
	for (i = 254; i >= 0; i--) {
		sp_256_mont_sqr_8(r, r );
		if (i >= 224 || i == 192 || (i <= 95 && i != 1))
			sp_256_mont_mul_8(r, r, a );
	}
}
1131
1132
1133
1134
1135
1136
/* Multiply a by the Montgomery normalizer mod p256_mod, i.e. convert a
 * into Montgomery form: r = a * 2^256 mod p.
 * Uses the NIST P-256 fast-reduction identities: each 32-bit output word
 * is a small signed combination of the input words A(0)..A(7); carries
 * are then propagated, and the final overflow o is folded back using
 * o * 2^256 == o * (1 + 2^224 - 2^192 - 2^96) (mod p).
 */
static void sp_256_mod_mul_norm_8(sp_digit* r, const sp_digit* a)
{
	int64_t t[8];
	int32_t o;

#define A(n) ((uint64_t)a[n])
	/* Signed per-word combinations (values stay well within int64_t) */
	t[0] = 0 + A(0) + A(1) - A(3) - A(4) - A(5) - A(6);

	t[1] = 0 + A(1) + A(2) - A(4) - A(5) - A(6) - A(7);

	t[2] = 0 + A(2) + A(3) - A(5) - A(6) - A(7);

	t[3] = 0 - A(0) - A(1) + 2 * A(3) + 2 * A(4) + A(5) - A(7);

	t[4] = 0 - A(1) - A(2) + 2 * A(4) + 2 * A(5) + A(6);

	t[5] = 0 - A(2) - A(3) + 2 * A(5) + 2 * A(6) + A(7);

	t[6] = 0 - A(0) - A(1) + A(5) + 3 * A(6) + 2 * A(7);

	t[7] = 0 + A(0) - A(2) - A(3) - A(4) - A(5) + 3 * A(7);
#undef A

	/* Propagate (signed) carries up through the words */
	t[1] += t[0] >> 32; t[0] &= 0xffffffff;
	t[2] += t[1] >> 32; t[1] &= 0xffffffff;
	t[3] += t[2] >> 32; t[2] &= 0xffffffff;
	t[4] += t[3] >> 32; t[3] &= 0xffffffff;
	t[5] += t[4] >> 32; t[4] &= 0xffffffff;
	t[6] += t[5] >> 32; t[5] &= 0xffffffff;
	t[7] += t[6] >> 32; t[6] &= 0xffffffff;
	/* Fold the overflow o back in at words 0 (+), 3 (-), 6 (-), 7 (+),
	 * matching o * (1 + 2^224 - 2^192 - 2^96) */
	o = t[7] >> 32;
	t[0] += o;
	t[3] -= o;
	t[6] -= o;
	t[7] += o;
	/* Final carry propagation while writing out the 32-bit limbs */
	r[0] = (sp_digit)t[0];
	t[1] += t[0] >> 32;
	r[1] = (sp_digit)t[1];
	t[2] += t[1] >> 32;
	r[2] = (sp_digit)t[2];
	t[3] += t[2] >> 32;
	r[3] = (sp_digit)t[3];
	t[4] += t[3] >> 32;
	r[4] = (sp_digit)t[4];
	t[5] += t[4] >> 32;
	r[5] = (sp_digit)t[5];
	t[6] += t[5] >> 32;
	r[6] = (sp_digit)t[6];

	r[7] = (sp_digit)t[7] + (sp_digit)(t[6] >> 32);
}
1190
1191
1192
1193
1194
1195
/* Map the Montgomery-form projective point p to an affine point r:
 * r->x = p->x / z^2, r->y = p->y / z^3, both converted out of Montgomery
 * form and fully reduced below p; r->z is set to 1.
 */
static void sp_256_map_8(sp_point* r, sp_point* p)
{
	sp_digit t1[8];
	sp_digit t2[8];

	sp_256_mont_inv_8(t1, p->z);

	sp_256_mont_sqr_8(t2, t1 ); /* t2 = 1/z^2 */
	sp_256_mont_mul_8(t1, t2, t1 ); /* t1 = 1/z^3 */

	/* x = x/z^2, also stripping the Montgomery factor */
	sp_256_mont_mul_and_reduce_8(r->x, p->x, t2 );
	/* reduce to the canonical residue < p */
	if (sp_256_cmp_8(r->x, p256_mod) >= 0)
		sp_256_sub_8_p256_mod(r->x);
	sp_256_norm_8(r->x);

	/* y = y/z^3, also stripping the Montgomery factor */
	sp_256_mont_mul_and_reduce_8(r->y, p->y, t1 );

	if (sp_256_cmp_8(r->y, p256_mod) >= 0)
		sp_256_sub_8_p256_mod(r->y);
	sp_256_norm_8(r->y);

	memset(r->z, 0, sizeof(r->z));
	r->z[0] = 1;
}
1223
1224
1225
1226
1227
1228
/* Double the Montgomery-form Jacobian point p into r (r may alias p).
 * Standard Jacobian doubling formulas; the step-by-step values are noted
 * before each call.
 */
static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
{
	sp_digit t1[8];
	sp_digit t2[8];

	/* Put point to double into result */
	if (r != p)
		*r = *p;

	if (r->infinity) /* doubling infinity is infinity */
		return;

	/* T1 = Z * Z */
	sp_256_mont_sqr_8(t1, r->z );
	/* Z = Y * Z */
	sp_256_mont_mul_8(r->z, r->y, r->z );
	/* Z = 2Z */
	sp_256_mont_dbl_8(r->z, r->z );
	/* T2 = X - T1 */
	sp_256_mont_sub_8(t2, r->x, t1 );
	/* T1 = X + T1 */
	sp_256_mont_add_8(t1, r->x, t1 );
	/* T2 = T1 * T2 */
	sp_256_mont_mul_8(t2, t1, t2 );
	/* T1 = 3T2 */
	sp_256_mont_tpl_8(t1, t2 );
	/* Y = 2Y */
	sp_256_mont_dbl_8(r->y, r->y );
	/* Y = Y * Y */
	sp_256_mont_sqr_8(r->y, r->y );
	/* T2 = Y * Y */
	sp_256_mont_sqr_8(t2, r->y );
	/* T2 = T2/2 */
	sp_256_div2_8(t2 );
	/* Y = Y * X */
	sp_256_mont_mul_8(r->y, r->y, r->x );
	/* X = T1 * T1 */
	sp_256_mont_mul_8(r->x, t1, t1 );
	/* X = X - Y */
	sp_256_mont_sub_8(r->x, r->x, r->y );
	/* X = X - Y */
	sp_256_mont_sub_8(r->x, r->x, r->y );
	/* Y = Y - X */
	sp_256_mont_sub_8(r->y, r->y, r->x );
	/* Y = Y * T1 */
	sp_256_mont_mul_8(r->y, r->y, t1 );
	/* Y = Y - T2 */
	sp_256_mont_sub_8(r->y, r->y, t2 );
	dump_512("y2 %s\n", r->y);
}
1279
1280
1281
1282
1283
1284
1285
/* Add two Montgomery-form Jacobian points: r = p + q.
 *
 * NOTE(review): after the swap below, the code reads the first addend's
 * coordinates through r (r->x, r->y, r->z), so it relies on r aliasing p.
 * The only call site (sp_256_ecc_mulmod_8) passes r == &t[y^1] with
 * p == &t[0], q == &t[1]: r is always one of p/q, and the swap makes
 * r == p in both cases. Confirm this invariant before adding new callers.
 */
static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point* q)
{
	sp_digit t1[8];
	sp_digit t2[8];
	sp_digit t3[8];
	sp_digit t4[8];
	sp_digit t5[8];

	/* Ensure only the first point is the same as the result */
	if (q == r) {
		sp_point* a = p;
		p = q;
		q = a;
	}

	/* Check for doubling: same x and z, and y equal or negated
	 * (t1 = p - q->y, i.e. -q->y mod p) */
	sp_256_sub_8(t1, p256_mod, q->y);
	sp_256_norm_8(t1);
	if (sp_256_cmp_equal_8(p->x, q->x)
	 && sp_256_cmp_equal_8(p->z, q->z)
	 && (sp_256_cmp_equal_8(p->y, q->y) || sp_256_cmp_equal_8(p->y, t1))
	) {
		sp_256_proj_point_dbl_8(r, p);
		return;
	}

	/* Adding infinity yields the other point */
	if (p->infinity || q->infinity) {
		*r = p->infinity ? *q : *p;
		return;
	}

	/* U1 = X1*Z2^2 */
	sp_256_mont_sqr_8(t1, q->z );
	/* Z2^3 */
	sp_256_mont_mul_8(t3, t1, q->z );
	sp_256_mont_mul_8(t1, t1, r->x );
	/* U2 = X2*Z1^2 */
	sp_256_mont_sqr_8(t2, r->z );
	/* Z1^3 */
	sp_256_mont_mul_8(t4, t2, r->z );
	sp_256_mont_mul_8(t2, t2, q->x );
	/* S1 = Y1*Z2^3 */
	sp_256_mont_mul_8(t3, t3, r->y );
	/* S2 = Y2*Z1^3 */
	sp_256_mont_mul_8(t4, t4, q->y );
	/* H = U2 - U1 */
	sp_256_mont_sub_8(t2, t2, t1 );
	/* R = S2 - S1 */
	sp_256_mont_sub_8(t4, t4, t3 );
	/* Z3 = H*Z1*Z2 */
	sp_256_mont_mul_8(r->z, r->z, q->z );
	sp_256_mont_mul_8(r->z, r->z, t2 );
	/* X3 = R^2 - H^3 - 2*U1*H^2 */
	sp_256_mont_sqr_8(r->x, t4 );
	sp_256_mont_sqr_8(t5, t2 ); /* t5 = H^2 */
	sp_256_mont_mul_8(r->y, t1, t5 ); /* r->y = U1*H^2 */
	sp_256_mont_mul_8(t5, t5, t2 ); /* t5 = H^3 */
	sp_256_mont_sub_8(r->x, r->x, t5 );
	sp_256_mont_dbl_8(t1, r->y );
	sp_256_mont_sub_8(r->x, r->x, t1 );
	/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
	sp_256_mont_sub_8(r->y, r->y, r->x );
	sp_256_mont_mul_8(r->y, r->y, t4 );
	sp_256_mont_mul_8(t5, t5, t3 );
	sp_256_mont_sub_8(r->y, r->y, t5 );
}
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359static void sp_256_ecc_mulmod_8(sp_point* r, const sp_point* g, const sp_digit* k )
1360{
1361 enum { map = 1 };
1362 sp_point t[3];
1363 sp_digit n = n;
1364 int c, y;
1365
1366 memset(t, 0, sizeof(t));
1367
1368
1369 t[0].infinity = 1;
1370
1371 sp_256_mod_mul_norm_8(t[1].x, g->x);
1372 sp_256_mod_mul_norm_8(t[1].y, g->y);
1373 sp_256_mod_mul_norm_8(t[1].z, g->z);
1374
1375
1376 k += 7;
1377 c = 256;
1378 for (;;) {
1379 if ((c & 0x1f) == 0) {
1380 if (c == 0)
1381 break;
1382 n = *k--;
1383 }
1384
1385 y = (n >> 31);
1386 dbg("y:%d t[%d] = t[0]+t[1]\n", y, y^1);
1387 sp_256_proj_point_add_8(&t[y^1], &t[0], &t[1]);
1388 dump_512("t[0].x %s\n", t[0].x);
1389 dump_512("t[0].y %s\n", t[0].y);
1390 dump_512("t[0].z %s\n", t[0].z);
1391 dump_512("t[1].x %s\n", t[1].x);
1392 dump_512("t[1].y %s\n", t[1].y);
1393 dump_512("t[1].z %s\n", t[1].z);
1394 dbg("t[2] = t[%d]\n", y);
1395 t[2] = t[y];
1396 dbg("t[2] *= 2\n");
1397 sp_256_proj_point_dbl_8(&t[2], &t[2]);
1398 dump_512("t[2].x %s\n", t[2].x);
1399 dump_512("t[2].y %s\n", t[2].y);
1400 dump_512("t[2].z %s\n", t[2].z);
1401 t[y] = t[2];
1402
1403 n <<= 1;
1404 c--;
1405 }
1406
1407 if (map)
1408 sp_256_map_8(r, &t[0]);
1409 else
1410 *r = t[0];
1411
1412 memset(t, 0, sizeof(t));
1413}
1414
1415
1416
1417
1418
1419
1420
1421
/* Multiply the P256 base point (generator G) by the scalar k: r = k*G */
static void sp_256_ecc_mulmod_base_8(sp_point* r, sp_digit* k )
{
	/* The generator G as two big-endian 32-byte coordinates, X then Y */
	static const uint8_t p256_base_bin[] = {
		/* X */
		0x6b,0x17,0xd1,0xf2,0xe1,0x2c,0x42,0x47,0xf8,0xbc,0xe6,0xe5,0x63,0xa4,0x40,0xf2,0x77,0x03,0x7d,0x81,0x2d,0xeb,0x33,0xa0,0xf4,0xa1,0x39,0x45,0xd8,0x98,0xc2,0x96,
		/* Y */
		0x4f,0xe3,0x42,0xe2,0xfe,0x1a,0x7f,0x9b,0x8e,0xe7,0xeb,0x4a,0x7c,0x0f,0x9e,0x16,0x2b,0xce,0x33,0x57,0x6b,0x31,0x5e,0xce,0xcb,0xb6,0x40,0x68,0x37,0xbf,0x51,0xf5,
	};
	sp_point p256_base;

	sp_256_point_from_bin2x32(&p256_base, p256_base_bin);

	sp_256_ecc_mulmod_8(r, &p256_base, k );
}
1441
1442
1443
1444
1445
1446
1447
1448
/* ECDH shared-secret generation:
 * out32 = big-endian X coordinate of (priv * peer_public_point).
 * pub2x32 is the peer's public point as 2x32 big-endian bytes (X then Y).
 * NOTE(review): the peer's point is not validated to lie on the curve
 * here - confirm the caller/protocol makes that acceptable.
 */
static void sp_ecc_secret_gen_256(const sp_digit priv[8], const uint8_t *pub2x32, uint8_t* out32)
{
	sp_point point[1];

#if FIXED_PEER_PUBKEY
	/* debug aid: replace the peer key with a fixed pattern */
	memset((void*)pub2x32, 0x55, 64);
#endif
	dump_hex("peerkey %s\n", pub2x32, 32);
	dump_hex("        %s\n", pub2x32 + 32, 32);

	sp_256_point_from_bin2x32(point, pub2x32);
	dump_512("point->x %s\n", point->x);
	dump_512("point->y %s\n", point->y);

	sp_256_ecc_mulmod_8(point, point, priv);

	sp_256_to_bin_8(point->x, out32);
	dump_hex("out32: %s\n", out32, 32);
}
1468
1469
/* Generate a random scalar k for use as an ephemeral ECDHE private key */
static void sp_256_ecc_gen_k_8(sp_digit k[8])
{
	tls_get_random(k, 8 * sizeof(k[0]));
#if FIXED_SECRET
	/* debug aid: fixed private key */
	memset(k, 0x77, 8 * sizeof(k[0]));
#endif
	/* A valid scalar should be in [1, order-1]. Instead of re-rolling,
	 * crudely clamp the extreme values:
	 * avoid k with a zero low limb (also excludes k == 0)...
	 */
	if (k[0] == 0)
		k[0] = 1;
	/* ...and keep the top limb below 0xffffffff, which guarantees
	 * k < group order (the order's top 32-bit limb is 0xffffffff).
	 * NOTE(review): this does not give a uniform distribution over
	 * [1, order-1]; presumably an accepted trade-off here - confirm. */
	if (k[7] >= 0xffffffff)
		k[7] = 0xfffffffe;
}
1492
1493
/* Generate a fresh P256 keypair: privkey gets 8 random limbs, pubkey gets
 * the 64-byte public point (big-endian X then Y of privkey * G).
 */
static void sp_ecc_make_key_256(sp_digit privkey[8], uint8_t *pubkey)
{
	sp_point point[1];

	sp_256_ecc_gen_k_8(privkey);
	dump_256("privkey %s\n", privkey);
	sp_256_ecc_mulmod_base_8(point, privkey);
	dump_512("point->x %s\n", point->x);
	dump_512("point->y %s\n", point->y);
	sp_256_to_bin_8(point->x, pubkey);
	sp_256_to_bin_8(point->y, pubkey + 32);

	/* Scrub the stack copy of the point.
	 * NOTE(review): a plain memset before return may be elided by the
	 * compiler; an explicit-scrub primitive would be more robust. */
	memset(point, 0, sizeof(point));
}
1508
/* Public entry point: run our side of an ECDHE key exchange on P256.
 * pubkey2x32:  out - our ephemeral public key, X|Y, 2x32 big-endian bytes
 * premaster32: out - the shared secret (X coordinate), 32 bytes
 * peerkey2x32: in  - the peer's public key, X|Y, 2x32 big-endian bytes
 */
void FAST_FUNC curve_P256_compute_pubkey_and_premaster(
		uint8_t *pubkey2x32, uint8_t *premaster32,
		const uint8_t *peerkey2x32)
{
	sp_digit privkey[8];

	dump_hex("peerkey2x32: %s\n", peerkey2x32, 64);
	sp_ecc_make_key_256(privkey, pubkey2x32);
	dump_hex("pubkey: %s\n", pubkey2x32, 32);
	dump_hex("        %s\n", pubkey2x32 + 32, 32);

	/* Combine our private key with the peer's public key */
	sp_ecc_secret_gen_256(privkey, peerkey2x32, premaster32);
	dump_hex("premaster: %s\n", premaster32, 32);
	/* NOTE(review): privkey (a secret) is left unscrubbed on the stack
	 * at return - consider an explicit scrub. */
}
1524