1
2
3
4
5
6#include "tls.h"
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48#ifndef DISABLE_PSTM
49
50
51
52#if defined(PSTM_X86)
53
54#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
55#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
56#endif
57
58
59#define MONT_START
60#define MONT_FINI
61#define LOOP_END
62#define LOOP_START \
63 mu = c[x] * mp
64
65#if 0
66#define INNERMUL \
67asm( \
68 "movl %5,%%eax \n\t" \
69 "mull %4 \n\t" \
70 "addl %1,%%eax \n\t" \
71 "adcl $0,%%edx \n\t" \
72 "addl %%eax,%0 \n\t" \
73 "adcl $0,%%edx \n\t" \
74 "movl %%edx,%1 \n\t" \
75:"=g"(_c[LO]), "=r"(cy) \
76:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
77: "%eax", "%edx", "cc")
78
79
80
81
82#else
83
84
85
86#define INNERMUL \
87asm( \
88 "mull %4 \n\t" \
89 "addl %3,%%eax \n\t" \
90 "adcl $0,%%edx \n\t" \
91 "addl %%eax,%0 \n\t" \
92 "adcl $0,%%edx \n\t" \
93:"=g"(_c[LO]), "=&d"(cy) \
94:"0"(_c[LO]), "g"(cy), "g"(mu), "a"(*tmpm++) \
95:"cc")
96
97
98
99
100#endif
101
102#define PROPCARRY \
103asm( \
104 "addl %1,%0 \n\t" \
105 "sbb %1,%1 \n\t" \
106 "neg %1 \n\t" \
107:"=g"(_c[LO]), "=r"(cy) \
108:"0"(_c[LO]), "1"(cy) \
109:"cc")
110
111
112#elif defined(PSTM_X86_64)
113
114#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
115#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
116#endif
117
118
119#define MONT_START
120#define MONT_FINI
121#define LOOP_END
122#define LOOP_START \
123mu = c[x] * mp
124
125#define INNERMUL \
126asm( \
127 "movq %5,%%rax \n\t" \
128 "mulq %4 \n\t" \
129 "addq %1,%%rax \n\t" \
130 "adcq $0,%%rdx \n\t" \
131 "addq %%rax,%0 \n\t" \
132 "adcq $0,%%rdx \n\t" \
133 "movq %%rdx,%1 \n\t" \
134 :"=g"(_c[LO]), "=r"(cy) \
135 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
136 : "%rax", "%rdx", "cc")
137
138#define INNERMUL8 \
139asm( \
140 "movq 0(%5),%%rax \n\t" \
141 "movq 0(%2),%%r10 \n\t" \
142 "movq 0x8(%5),%%r11 \n\t" \
143 "mulq %4 \n\t" \
144 "addq %%r10,%%rax \n\t" \
145 "adcq $0,%%rdx \n\t" \
146 "movq 0x8(%2),%%r10 \n\t" \
147 "addq %3,%%rax \n\t" \
148 "adcq $0,%%rdx \n\t" \
149 "movq %%rax,0(%0) \n\t" \
150 "movq %%rdx,%1 \n\t" \
151 \
152 "movq %%r11,%%rax \n\t" \
153 "movq 0x10(%5),%%r11 \n\t" \
154 "mulq %4 \n\t" \
155 "addq %%r10,%%rax \n\t" \
156 "adcq $0,%%rdx \n\t" \
157 "movq 0x10(%2),%%r10 \n\t" \
158 "addq %3,%%rax \n\t" \
159 "adcq $0,%%rdx \n\t" \
160 "movq %%rax,0x8(%0) \n\t" \
161 "movq %%rdx,%1 \n\t" \
162 \
163 "movq %%r11,%%rax \n\t" \
164 "movq 0x18(%5),%%r11 \n\t" \
165 "mulq %4 \n\t" \
166 "addq %%r10,%%rax \n\t" \
167 "adcq $0,%%rdx \n\t" \
168 "movq 0x18(%2),%%r10 \n\t" \
169 "addq %3,%%rax \n\t" \
170 "adcq $0,%%rdx \n\t" \
171 "movq %%rax,0x10(%0) \n\t" \
172 "movq %%rdx,%1 \n\t" \
173 \
174 "movq %%r11,%%rax \n\t" \
175 "movq 0x20(%5),%%r11 \n\t" \
176 "mulq %4 \n\t" \
177 "addq %%r10,%%rax \n\t" \
178 "adcq $0,%%rdx \n\t" \
179 "movq 0x20(%2),%%r10 \n\t" \
180 "addq %3,%%rax \n\t" \
181 "adcq $0,%%rdx \n\t" \
182 "movq %%rax,0x18(%0) \n\t" \
183 "movq %%rdx,%1 \n\t" \
184 \
185 "movq %%r11,%%rax \n\t" \
186 "movq 0x28(%5),%%r11 \n\t" \
187 "mulq %4 \n\t" \
188 "addq %%r10,%%rax \n\t" \
189 "adcq $0,%%rdx \n\t" \
190 "movq 0x28(%2),%%r10 \n\t" \
191 "addq %3,%%rax \n\t" \
192 "adcq $0,%%rdx \n\t" \
193 "movq %%rax,0x20(%0) \n\t" \
194 "movq %%rdx,%1 \n\t" \
195 \
196 "movq %%r11,%%rax \n\t" \
197 "movq 0x30(%5),%%r11 \n\t" \
198 "mulq %4 \n\t" \
199 "addq %%r10,%%rax \n\t" \
200 "adcq $0,%%rdx \n\t" \
201 "movq 0x30(%2),%%r10 \n\t" \
202 "addq %3,%%rax \n\t" \
203 "adcq $0,%%rdx \n\t" \
204 "movq %%rax,0x28(%0) \n\t" \
205 "movq %%rdx,%1 \n\t" \
206 \
207 "movq %%r11,%%rax \n\t" \
208 "movq 0x38(%5),%%r11 \n\t" \
209 "mulq %4 \n\t" \
210 "addq %%r10,%%rax \n\t" \
211 "adcq $0,%%rdx \n\t" \
212 "movq 0x38(%2),%%r10 \n\t" \
213 "addq %3,%%rax \n\t" \
214 "adcq $0,%%rdx \n\t" \
215 "movq %%rax,0x30(%0) \n\t" \
216 "movq %%rdx,%1 \n\t" \
217 \
218 "movq %%r11,%%rax \n\t" \
219 "mulq %4 \n\t" \
220 "addq %%r10,%%rax \n\t" \
221 "adcq $0,%%rdx \n\t" \
222 "addq %3,%%rax \n\t" \
223 "adcq $0,%%rdx \n\t" \
224 "movq %%rax,0x38(%0) \n\t" \
225 "movq %%rdx,%1 \n\t" \
226 \
227 :"=r"(_c), "=r"(cy) \
228 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
229 : "%rax", "%rdx", "%r10", "%r11", "cc")
230
231#define PROPCARRY \
232asm( \
233 "addq %1,%0 \n\t" \
234 "setb %%al \n\t" \
235 "movzbq %%al,%1 \n\t" \
236 :"=g"(_c[LO]), "=r"(cy) \
237 :"0"(_c[LO]), "1"(cy) \
238 : "%rax", "cc")
239
240
241#elif defined(PSTM_ARM)
242
243#define MONT_START
244#define MONT_FINI
245#define LOOP_END
246#define LOOP_START \
247mu = c[x] * mp
248
249#ifdef __thumb2__
250
251#define INNERMUL \
252asm( \
253 " LDR r0,%1 \n\t" \
254 " ADDS r0,r0,%0 \n\t" \
255 " ITE CS \n\t" \
256 " MOVCS %0,#1 \n\t" \
257 " MOVCC %0,#0 \n\t" \
258 " UMLAL r0,%0,%3,%4 \n\t" \
259 " STR r0,%1 \n\t" \
260 :"=r"(cy),"=m"(_c[0])\
261 :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
262 :"r0","cc");
263#define PROPCARRY \
264asm( \
265 " LDR r0,%1 \n\t" \
266 " ADDS r0,r0,%0 \n\t" \
267 " STR r0,%1 \n\t" \
268 " ITE CS \n\t" \
269 " MOVCS %0,#1 \n\t" \
270 " MOVCC %0,#0 \n\t" \
271 :"=r"(cy),"=m"(_c[0])\
272 :"0"(cy),"m"(_c[0])\
273 :"r0","cc");
274#else
275
276#define INNERMUL \
277asm( \
278 " LDR r0,%1 \n\t" \
279 " ADDS r0,r0,%0 \n\t" \
280 " MOVCS %0,#1 \n\t" \
281 " MOVCC %0,#0 \n\t" \
282 " UMLAL r0,%0,%3,%4 \n\t" \
283 " STR r0,%1 \n\t" \
284 :"=r"(cy),"=m"(_c[0])\
285 :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
286 :"r0","cc");
287#define PROPCARRY \
288asm( \
289 " LDR r0,%1 \n\t" \
290 " ADDS r0,r0,%0 \n\t" \
291 " STR r0,%1 \n\t" \
292 " MOVCS %0,#1 \n\t" \
293 " MOVCC %0,#0 \n\t" \
294 :"=r"(cy),"=m"(_c[0])\
295 :"0"(cy),"m"(_c[0])\
296 :"r0","cc");
297#endif
298
299
300
301#elif defined(PSTM_MIPS)
302
303
304#define MONT_START
305#define MONT_FINI
306#define LOOP_END
307#define LOOP_START \
308mu = c[x] * mp
309
310#define INNERMUL \
311asm( \
312 " multu %3,%4 \n\t" \
313 " mflo $12 \n\t" \
314 " mfhi $13 \n\t" \
315 " addu $12,$12,%0 \n\t" \
316 " sltu $10,$12,%0 \n\t" \
317 " addu $13,$13,$10 \n\t" \
318 " lw $10,%1 \n\t" \
319 " addu $12,$12,$10 \n\t" \
320 " sltu $10,$12,$10 \n\t" \
321 " addu %0,$13,$10 \n\t" \
322 " sw $12,%1 \n\t" \
323 :"=r"(cy),"=m"(_c[0])\
324 :"r"(cy),"r"(mu),"r"(tmpm[0]),"r"(_c[0])\
325 :"$10","$12","$13")\
326; ++tmpm;
327
328#define PROPCARRY \
329asm( \
330 " lw $10,%1 \n\t" \
331 " addu $10,$10,%0 \n\t" \
332 " sw $10,%1 \n\t" \
333 " sltu %0,$10,%0 \n\t" \
334 :"=r"(cy),"=m"(_c[0])\
335 :"r"(cy),"r"(_c[0])\
336 :"$10");
337
338
339
340#else
341
342
343#define MONT_START
344#define MONT_FINI
345#define LOOP_END
346#define LOOP_START \
347 mu = c[x] * mp
348
349#define INNERMUL \
350 do { pstm_word t; \
351 t = ((pstm_word)_c[0] + (pstm_word)cy) + \
352 (((pstm_word)mu) * ((pstm_word)*tmpm++)); \
353 _c[0] = (pstm_digit)t; \
354 cy = (pstm_digit)(t >> DIGIT_BIT); \
355 } while (0)
356
357#define PROPCARRY \
358 do { pstm_digit t = _c[0] += cy; cy = (t < cy); } while (0)
359
360#endif
361
362
363
364#define LO 0
365
366
367int32 FAST_FUNC pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
368 pstm_digit mp, pstm_digit *paD, uint32 paDlen)
369{
370 pstm_digit *c, *_c, *tmpm, mu;
371 int32 oldused, x, y;
372 int pa;
373
374 pa = m->used;
375 if (pa > a->alloc) {
376
377 return PS_LIMIT_FAIL;
378 }
379
380 if (paD && paDlen >= (uint32)2*pa+1) {
381 c = paD;
382 memset(c, 0x0, paDlen);
383 } else {
384 c = xzalloc(2*pa+1);
385 }
386
387 oldused = a->used;
388 for (x = 0; x < oldused; x++) {
389 c[x] = a->dp[x];
390 }
391
392 MONT_START;
393
394 for (x = 0; x < pa; x++) {
395 pstm_digit cy = 0;
396
397 LOOP_START;
398 _c = c + x;
399 tmpm = m->dp;
400 y = 0;
401#ifdef PSTM_X86_64
402 for (; y < (pa & ~7); y += 8) {
403 INNERMUL8;
404 _c += 8;
405 tmpm += 8;
406 }
407#endif
408 for (; y < pa; y++) {
409 INNERMUL;
410 ++_c;
411 }
412 LOOP_END;
413 while (cy) {
414 PROPCARRY;
415 ++_c;
416 }
417 }
418
419
420 _c = c + pa;
421 tmpm = a->dp;
422 for (x = 0; x < pa+1; x++) {
423 *tmpm++ = *_c++;
424 }
425
426 for (; x < oldused; x++) {
427 *tmpm++ = 0;
428 }
429
430 MONT_FINI;
431
432 a->used = pa+1;
433 pstm_clamp(a);
434
435
436 x = PSTM_OKAY;
437
438
439 if (pstm_cmp_mag (a, m) != PSTM_LT) {
440 if (s_pstm_sub (a, m, a) != PSTM_OKAY) {
441 x = PS_MEM_FAIL;
442 }
443 }
444 if (paDlen < (uint32)2*pa+1) {
445 psFree(c, pool);
446 }
447 return x;
448}
449
450#endif
451
452