1
2
3
4
5
6
/* Mask selecting the whole-word part of a byte count within one block
 * (0x3c = 60 = largest multiple of 4 below 64).
 */
#define MASK_U32 0x3c
#define CHACHA20_BLOCK_SIZE 64
/* Stack bytes reserved to spill the eight callee-saved regs $s0-$s7. */
#define STACK_SIZE 32
10
/* The sixteen ChaCha state words X0..X15 live in registers for the
 * whole computation: temporaries $t0-$t9, $v1 and callee-saved $s2-$s6.
 */
#define X0 $t0
#define X1 $t1
#define X2 $t2
#define X3 $t3
#define X4 $t4
#define X5 $t5
#define X6 $t6
#define X7 $t7
#define X8 $t8
#define X9 $t9
#define X10 $v1
#define X11 $s6
#define X12 $s5
#define X13 $s4
#define X14 $s3
#define X15 $s2

/* Scratch registers. */
#define T0 $s1
#define T1 $s0
/* Map a numeric index onto its register alias: T(0) -> T0, X(12) -> X12. */
#define T(n) T ## n
#define X(n) X ## n
32
33
/* Function arguments (MIPS o32 calling convention). */
#define STATE $a0
#define OUT $a1
#define IN $a2
#define BYTES $a3

/* NONCE[0] (state word 12, the block counter) is kept in a register,
 * not in memory: the caller's in-memory state must stay untouched, and
 * the register copy is incremented once per 64-byte block.
 */
#define NONCE_0 $v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Only registers that are overwritten on exit are used, so no clear
 * data leaks.  They handle the trailing bytes that are not a multiple
 * of 4.
 */
#define SAVED_X X15
#define SAVED_CA $s7

/* Nonzero when IN or OUT is not 4-byte aligned.  Shares $s7 with
 * SAVED_CA; the alignment test happens before SAVED_CA is set.
 */
#define IS_UNALIGNED $s7
54
55
/* ChaCha keystream words are little-endian; big-endian cores must
 * byte-swap each word before the XOR.  MSB/LSB give the byte offsets
 * used by the lwl/lwr unaligned-access pairs.
 * (Restores the conditional opener lost with the matching #else/#endif.)
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif
70
/* Expand macro x once per state-word index, 0 through 15. */
#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);
88
/* Expand macro x once per state-word index in reverse, 15 down to 0,
 * so the jump table can enter the chain at any word and fall through
 * to word 0.
 */
#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);
106
/* PLUS_ONE(x) expands to the literal value x+1 at preprocessing time;
 * the xor/store entry labels are numbered 1-based (index + 1).
 */
#define PLUS_ONE_0 1
#define PLUS_ONE_1 2
#define PLUS_ONE_2 3
#define PLUS_ONE_3 4
#define PLUS_ONE_4 5
#define PLUS_ONE_5 6
#define PLUS_ONE_6 7
#define PLUS_ONE_7 8
#define PLUS_ONE_8 9
#define PLUS_ONE_9 10
#define PLUS_ONE_10 11
#define PLUS_ONE_11 12
#define PLUS_ONE_12 13
#define PLUS_ONE_13 14
#define PLUS_ONE_14 15
#define PLUS_ONE_15 16
#define PLUS_ONE(x) PLUS_ONE_ ## x
/* Two-level paste so the arguments are macro-expanded before ## joins
 * them into one label token.
 */
#define _CONCAT3(a,b,c) a ## b ## c
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
126
/* Feed-forward add, byte-swap if needed, XOR with input and store one
 * keystream word when IN/OUT may be unaligned (lwl/lwr + swl/swr).
 * Word 12 is the block counter and is added from NONCE_0, not memory.
 * The label .Lchacha_mips_xor_unaligned_<x+1>_b is the jump-table
 * target that processes the last x+1 words of a partial block.
 */
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);
143
/* Same as STORE_UNALIGNED but for word-aligned IN/OUT: plain lw/sw.
 * Word 12 is the block counter and is added from NONCE_0, not memory.
 */
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4)(IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4)(OUT);
158
159
160
161
162
163
/* Jump-table entry for the aligned tail path.  Entry x branches to the
 * store chain that handles the final words; the addu in the branch
 * delay slot computes SAVED_X (keystream word + carry value) for the
 * trailing 1-3 bytes.  noreorder keeps every entry the same size so
 * the entry address can be computed from the byte count.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder
174
/* Jump-table entry for the unaligned tail path; mirrors JMPTBL_ALIGNED
 * but branches into the lwl/lwr store chain.
 */
#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder
185
/* Four interleaved ChaCha quarter-round steps:
 *   X[A..D] += X[K..N];  X[V..Z] ^= X[A..D];  X[V..Z] <<<= S
 * Invoked four times with S = 16, 12, 8, 7 per column/diagonal round
 * (see the round loops below).
 */
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
	addu X(A), X(K); \
	addu X(B), X(L); \
	addu X(C), X(M); \
	addu X(D), X(N); \
	xor X(V), X(A); \
	xor X(W), X(B); \
	xor X(Y), X(C); \
	xor X(Z), X(D); \
	rotl X(V), S; \
	rotl X(W), S; \
	rotl X(Y), S; \
	rotl X(Z), S;
199
.text
.set reorder
.set noat
/*
 * chacha_crypt_arch(state, out, in, bytes, nrounds)
 *   STATE=$a0, OUT=$a1, IN=$a2, BYTES=$a3, nrounds on the stack.
 * XORs 'in' with the ChaCha keystream into 'out'.  The block counter
 * (state word 12) is kept in NONCE_0 and written back once at the end;
 * the in-memory state is otherwise only read.
 */
.globl chacha_crypt_arch
.ent chacha_crypt_arch
chacha_crypt_arch:
	.frame $sp, STACK_SIZE, $ra

	/* 5th argument (round count) arrives on the stack (o32 ABI). */
	lw $at, 16($sp)

	addiu $sp, -STACK_SIZE

	/* bytes == 0: nothing to do. */
	beqz BYTES, .Lchacha_mips_end

	lw NONCE_0, 48(STATE)

	/* Save callee-saved $s0-$s7. */
	sw $s0, 0($sp)
	sw $s1, 4($sp)
	sw $s2, 8($sp)
	sw $s3, 12($sp)
	sw $s4, 16($sp)
	sw $s5, 20($sp)
	sw $s6, 24($sp)
	sw $s7, 28($sp)

	/* Test whether IN or OUT is unaligned:
	 * IS_UNALIGNED = (IN | OUT) & 0x00000003
	 */
	or IS_UNALIGNED, IN, OUT
	andi IS_UNALIGNED, 0x3

	b .Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	/* Advance to the next 64-byte block and bump the counter. */
	addiu IN, CHACHA20_BLOCK_SIZE
	addiu OUT, CHACHA20_BLOCK_SIZE
	addiu NONCE_0, 1

.Lchacha_rounds_start:
	/* Load the sixteen state words; word 12 comes from NONCE_0. */
	lw X0, 0(STATE)
	lw X1, 4(STATE)
	lw X2, 8(STATE)
	lw X3, 12(STATE)

	lw X4, 16(STATE)
	lw X5, 20(STATE)
	lw X6, 24(STATE)
	lw X7, 28(STATE)
	lw X8, 32(STATE)
	lw X9, 36(STATE)
	lw X10, 40(STATE)
	lw X11, 44(STATE)

	move X12, NONCE_0
	lw X13, 52(STATE)
	lw X14, 56(STATE)
	lw X15, 60(STATE)

.Loop_chacha_xor_rounds:
	/* $at = rounds remaining; each pass is one column round plus one
	 * diagonal round (a ChaCha double round).
	 */
	addiu $at, -2
	AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
	AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
	AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
	AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
	bnez $at, .Loop_chacha_xor_rounds

	/* BYTES goes negative when less than a full block remains. */
	addiu BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Unaligned src/dst? Take the lwl/lwr path. */
	bnez IS_UNALIGNED, .Loop_chacha_unaligned

	/* Reload the round count for the next block (fills delay slot). */
	lw $at, (STACK_SIZE+16)($sp)

	/* BYTES < 0: no full block remains — handle the partial block. */
	bltz BYTES, .Lchacha_mips_no_full_block_aligned

	/* Feed-forward add, XOR and store words 15 down to 0. */
	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop for the next full block. */
	bgtz BYTES, .Loop_chacha_rounds

	/* Placed here to fill the branch delay slot. */
	addiu NONCE_0, 1

	/* BYTES < 0? XOR the final 1-3 bytes. */
	bltz BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore callee-saved registers. */
	lw $s0, 0($sp)
	lw $s1, 4($sp)
	lw $s2, 8($sp)
	lw $s3, 12($sp)
	lw $s4, 16($sp)
	lw $s5, 20($sp)
	lw $s6, 24($sp)
	lw $s7, 28($sp)

	/* Write the incremented block counter back to state word 12. */
	sw NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu $sp, STACK_SIZE
	jr $ra

.Lchacha_mips_no_full_block_aligned:
	/* Undo the block-size decrement: BYTES = bytes left (0..63). */
	addiu BYTES, CHACHA20_BLOCK_SIZE

	/* $at = number of bytes in whole words. */
	andi $at, BYTES, MASK_U32

	/* Upper half of the jump-table base address. */
	lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Insert $at<<1 into T0: word-offset * 2 = 8 bytes per entry
	 * (each jump-table entry is two instructions).
	 */
	ins T0, $at, 1, 6

	/* Address of the matching state word. */
	addu T1, STATE, $at

	/* Lower half of the jump-table base address. */
	addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Carry value for the partial word, read from STATE. */
	lw SAVED_CA, 0(T1)

	/* Keep the remaining byte count as a negative value. */
	subu BYTES, $at, BYTES

	jr T0

	/* Jump table. */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Reload the round count for the next block (fills delay slot). */
	lw $at, (STACK_SIZE+16)($sp)

	/* BYTES < 0: no full block remains — handle the partial block. */
	bltz BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop for the next full block. */
	bgtz BYTES, .Loop_chacha_rounds

	/* Write the incremented block counter back to state word 12. */
	sw NONCE_0, 48(STATE)

	.set noreorder
	/* Exactly at a block boundary: done; otherwise fall through to
	 * the byte tail.
	 */
	bgez BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Placed here to fill the branch delay slot. */
	addiu NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	/* Skip past the whole words already handled ($at = word bytes). */
	addu IN, $at
	addu OUT, $at
	/* First byte.  SAVED_X holds keystream-word + carry; rotate its
	 * low byte into place.
	 */
	lbu T1, 0(IN)
	addiu $at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor T1, SAVED_X
	sb T1, 0(OUT)
	beqz $at, .Lchacha_mips_xor_done
	/* Second byte. */
	lbu T1, 1(IN)
	addiu $at, BYTES, 2
	ROTx SAVED_X, 8
	xor T1, SAVED_X
	sb T1, 1(OUT)
	beqz $at, .Lchacha_mips_xor_done
	/* Third byte. */
	lbu T1, 2(IN)
	ROTx SAVED_X, 8
	xor T1, SAVED_X
	sb T1, 2(OUT)
	b .Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Undo the block-size decrement: BYTES = bytes left (0..63). */
	addiu BYTES, CHACHA20_BLOCK_SIZE

	/* $at = number of bytes in whole words. */
	andi $at, BYTES, MASK_U32

	/* Upper half of the jump-table base address. */
	lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Insert $at<<1: 8 bytes per two-instruction table entry. */
	ins T0, $at, 1, 6

	/* Address of the matching state word. */
	addu T1, STATE, $at

	/* Lower half of the jump-table base address. */
	addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Carry value for the partial word, read from STATE. */
	lw SAVED_CA, 0(T1)

	/* Keep the remaining byte count as a negative value. */
	subu BYTES, $at, BYTES

	jr T0

	/* Jump table. */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at
425
426
427
428
429
430
431
#undef X12
#undef X13
#undef X14
#undef X15

/* hchacha_block_arch re-maps words 12-15 onto registers that need no
 * spilling here.  X15 aliases STATE ($a0): the pointer is consumed by
 * the loads below before word 15 overwrites it.
 */
#define X12 $a3
#define X13 $at
#define X14 $v0
#define X15 STATE
441
.set noat
/*
 * hchacha_block_arch(state, out, nrounds)
 *   STATE=$a0, OUT=$a1, round count in $a2.
 * Runs the ChaCha permutation over the 16-word state and stores words
 * 0-3 and 12-15 to OUT — no feed-forward addition (HChaCha).
 */
.globl hchacha_block_arch
.ent hchacha_block_arch
hchacha_block_arch:
	.frame $sp, STACK_SIZE, $ra

	addiu $sp, -STACK_SIZE

	/* Save X11 ($s6), the only callee-saved register clobbered here. */
	sw X11, 0($sp)

	lw X0, 0(STATE)
	lw X1, 4(STATE)
	lw X2, 8(STATE)
	lw X3, 12(STATE)
	lw X4, 16(STATE)
	lw X5, 20(STATE)
	lw X6, 24(STATE)
	lw X7, 28(STATE)
	lw X8, 32(STATE)
	lw X9, 36(STATE)
	lw X10, 40(STATE)
	lw X11, 44(STATE)
	lw X12, 48(STATE)
	lw X13, 52(STATE)
	lw X14, 56(STATE)
	/* X15 aliases STATE — this final load overwrites the pointer. */
	lw X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	/* $a2 = rounds remaining; each pass is one double round. */
	addiu $a2, -2
	AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
	AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
	AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
	AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
	bnez $a2, .Loop_hchacha_xor_rounds

	/* Restore $s6. */
	lw X11, 0($sp)

	/* Output = permuted words 0-3 and 12-15. */
	sw X0, 0(OUT)
	sw X1, 4(OUT)
	sw X2, 8(OUT)
	sw X3, 12(OUT)
	sw X12, 16(OUT)
	sw X13, 20(OUT)
	sw X14, 24(OUT)
	sw X15, 28(OUT)

	addiu $sp, STACK_SIZE
	jr $ra
.end hchacha_block_arch
.set at
498