1
2
3
4
5
6
7
8
9
10
11
12
/*
 * load_16way - read eight consecutive 32-byte chunks (256 bytes) from src.
 *
 *   src:     register holding the base address of the input
 *   x0..x7:  destination vector registers; xN receives the 32 bytes at
 *            byte offset N*32 from src
 *
 * The loads are unaligned (vmovdqu), so src needs no particular alignment.
 * Only x0..x7 are written.
 */
#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu 0*32(src), x0; \
	vmovdqu 1*32(src), x1; \
	vmovdqu 2*32(src), x2; \
	vmovdqu 3*32(src), x3; \
	vmovdqu 4*32(src), x4; \
	vmovdqu 5*32(src), x5; \
	vmovdqu 6*32(src), x6; \
	vmovdqu 7*32(src), x7;
22
/*
 * store_16way - write eight vector registers to dst as consecutive 32-byte
 * chunks (256 bytes).
 *
 *   dst:     register holding the base address of the output
 *   x0..x7:  source vector registers; xN is written to byte offset N*32
 *
 * The stores are unaligned (vmovdqu), so dst needs no particular alignment.
 * No register is modified.
 */
#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, 0*32(dst); \
	vmovdqu x1, 1*32(dst); \
	vmovdqu x2, 2*32(dst); \
	vmovdqu x3, 3*32(dst); \
	vmovdqu x4, 4*32(dst); \
	vmovdqu x5, 5*32(dst); \
	vmovdqu x6, 6*32(dst); \
	vmovdqu x7, 7*32(dst);
32
/*
 * store_cbc_16way - XOR each 16-byte block held in x0..x7 with the block
 * that precedes it in src, then store the 256-byte result to dst.
 *
 *   src:     base address of the data supplying the chaining blocks
 *   dst:     base address of the output
 *   x0..x7:  blocks to be chained, modified in place and then stored
 *   t0:      scratch vector register, clobbered
 *
 * Numbering the 16-byte blocks 0..15, output block k (k >= 1) is XORed with
 * src block k-1.  Block 0 is XORed with zero and is therefore stored
 * unchanged -- presumably the caller applies the IV to it; that is not
 * visible from this macro.
 *
 * Bytes 0..239 of src are read.  All reads of src happen before the first
 * write to dst.
 */
#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
	/* t0 = { low lane: 0, high lane: src block 0 } */ \
	vpxor t0, t0, t0; \
	vinserti128 $1, (src), t0, t0; \
	vpxor t0, x0, x0; \
	/* remaining pairs: the operand starts one block (16 bytes) early */ \
	vpxor (16+0*32)(src), x1, x1; \
	vpxor (16+1*32)(src), x2, x2; \
	vpxor (16+2*32)(src), x3, x3; \
	vpxor (16+3*32)(src), x4, x4; \
	vpxor (16+4*32)(src), x5, x5; \
	vpxor (16+5*32)(src), x6, x6; \
	vpxor (16+6*32)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
45
/*
 * inc_le128 - add 1 to the 128-bit little-endian integer in every 128-bit
 * lane of x (qword 0 is the low half, qword 1 the high half).
 *
 *   x:          counter register, updated in place
 *   minus_one:  register expected to hold -1 in qword 0 and 0 in qword 1 of
 *               each lane; not modified
 *   tmp:        scratch register, clobbered
 *
 * x, minus_one and tmp must be three different registers.
 */
#define inc_le128(x, minus_one, tmp) \
	/* tmp.q0 = all-ones iff x.q0 == -1, i.e. the low half is about to wrap */ \
	vpcmpeqq minus_one, x, tmp; \
	/* shift that flag up into qword 1; qword 0 becomes 0 */ \
	vpslldq $8, tmp, tmp; \
	/* x.q0 -= -1; x.q1 -= 0 */ \
	vpsubq minus_one, x, x; \
	/* x.q1 -= -1 when the low half wrapped (carry propagation) */ \
	vpsubq tmp, x, x;
51
/*
 * add2_le128 - add 2 to the 128-bit little-endian integer in every 128-bit
 * lane of x (qword 0 is the low half, qword 1 the high half).
 *
 *   x:           counter register, updated in place
 *   minus_one:   register expected to hold -1 in qword 0, 0 in qword 1
 *   minus_two:   register expected to hold -2 in qword 0, 0 in qword 1
 *   tmp1, tmp2:  scratch registers, clobbered
 *
 * All five registers must be distinct.  minus_one and minus_two are not
 * modified.
 */
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	/* the low half wraps when it is -1 or -2 before the addition */ \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpor tmp2, tmp1, tmp1; \
	/* move the wrap flag (-1 or 0) into qword 1; qword 0 becomes 0 */ \
	vpslldq $8, tmp1, tmp1; \
	/* x.q0 -= -2; x.q1 -= 0 */ \
	vpsubq minus_two, x, x; \
	/* x.q1 -= -1 when the low half wrapped */ \
	vpsubq tmp1, x, x;
59
/*
 * Private step of load_ctr_16way: add 2 to both 128-bit counters held in
 * ctr, then write a byte-shuffled copy (control bytes in shuf) to out.
 * tmp1 and tmp2 are clobbered.
 */
#define __ctr_step_16way(ctr, minus_one, minus_two, tmp1, tmp2, shuf, out) \
	add2_le128(ctr, minus_one, minus_two, tmp1, tmp2); \
	vpshufb shuf, ctr, out;

/*
 * load_ctr_16way - generate 16 consecutive counter blocks from the 128-bit
 * counter stored at (iv) and advance the stored counter by 16.
 *
 *   iv:        address register; 16 bytes are read from and written back to
 *              (iv).  The value is incremented as a little-endian 128-bit
 *              integer.
 *   bswap:     memory operand supplying 16 vpshufb control bytes, which are
 *              broadcast to both lanes (presumably a byte-reversal mask,
 *              going by the name -- the constant is not visible here)
 *   x0..x7:    outputs; xN holds the shuffled counters +2N (low lane) and
 *              +2N+1 (high lane)
 *   t0..t5:    scratch vector registers, all clobbered
 *   t0x..t3x:  128-bit names of t0..t3.  The code depends on this aliasing:
 *              e.g. t3x is written and t3 is then read by vinserti128.
 *
 * On exit (iv) holds the original counter plus 16.
 */
#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
		       t1x, t2, t2x, t3, t3x, t4, t5) \
	/* per lane: t0 = { q0: -1, q1: 0 }, t4 = { q0: -2, q1: 0 } */ \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; \
	vpaddq t0, t0, t4; \
	\
	/* t2 = { low lane: counter, high lane: counter + 1 } */ \
	vmovdqu (iv), t2x; \
	vmovdqa t2x, t3x; \
	inc_le128(t2x, t0x, t1x); \
	/* t1 may only be loaded now: t1x was inc_le128's scratch */ \
	vbroadcasti128 bswap, t1; \
	vinserti128 $1, t2x, t3, t2; \
	vpshufb t1, t2, x0; \
	\
	/* seven more pairs, each 2 further on than the previous one */ \
	__ctr_step_16way(t2, t0, t4, t3, t5, t1, x1); \
	__ctr_step_16way(t2, t0, t4, t3, t5, t1, x2); \
	__ctr_step_16way(t2, t0, t4, t3, t5, t1, x3); \
	__ctr_step_16way(t2, t0, t4, t3, t5, t1, x4); \
	__ctr_step_16way(t2, t0, t4, t3, t5, t1, x5); \
	__ctr_step_16way(t2, t0, t4, t3, t5, t1, x6); \
	__ctr_step_16way(t2, t0, t4, t3, t5, t1, x7); \
	\
	/* high lane is counter + 15; one more increment gives the next start */ \
	vextracti128 $1, t2, t2x; \
	inc_le128(t2x, t0x, t3x); \
	vmovdqu t2x, (iv);
92
/*
 * store_ctr_16way - XOR x0..x7 with 256 bytes read from src and store the
 * result to dst.
 *
 *   src:     base address of the data to combine with the registers
 *   dst:     base address of the output
 *   x0..x7:  vector registers; xN is XORed with the 32 bytes at src + N*32
 *            (in place) and then written to dst + N*32
 *
 * All reads of src happen before the first write to dst.
 */
#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor 0*32(src), x0, x0; \
	vpxor 1*32(src), x1, x1; \
	vpxor 2*32(src), x2, x2; \
	vpxor 3*32(src), x3, x3; \
	vpxor 4*32(src), x4, x4; \
	vpxor 5*32(src), x5, x5; \
	vpxor 6*32(src), x6, x6; \
	vpxor 7*32(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
103
/*
 * gf128mul_x_ble - shift the 128-bit value in every 128-bit lane of iv left
 * by one bit and fold the shifted-out bits back in under control of mask.
 *
 *   iv:    value register, updated in place (qword 0 is the low half)
 *   mask:  per-lane constant ANDed with the routed carry masks; not
 *          modified.  Presumably qword 0 holds the GF(2^128) reduction
 *          constant and qword 1 the value 1 -- the constant is supplied by
 *          the caller and is not visible here.
 *   tmp:   scratch register, clobbered
 *
 * iv, mask and tmp must be three different registers.
 */
#define gf128mul_x_ble(iv, mask, tmp) \
	/* every dword of tmp = all-ones iff that dword's top bit is set */ \
	vpsrad $31, iv, tmp; \
	/* route: dword 0 <- bit 127's mask, dword 2 <- bit 63's mask */ \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	/* double each qword independently, then apply the carries */ \
	vpaddq iv, iv, iv; \
	vpxor tmp, iv, iv;
110
/*
 * gf128mul_x2_ble - shift the 128-bit value in every 128-bit lane of iv left
 * by two bits and fold both shifted-out bit pairs back in.
 *
 *   iv:          value register, updated in place (qword 0 is the low half)
 *   mask1:       constant ANDed with the carries of the second doubling
 *                (original bits 126 and 62); not modified
 *   mask2:       constant ANDed with the carries of the first doubling
 *                (original bits 127 and 63); not modified
 *   tmp0, tmp1:  scratch registers, clobbered
 *
 * The mask values come from the caller and are not visible here.  All five
 * registers must be distinct.
 */
#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	/* tmp0: carry masks for the top bit of each qword of iv */ \
	vpsrad $31, iv, tmp0; \
	vpshufd $0x13, tmp0, tmp0; \
	vpand mask2, tmp0, tmp0; \
	/* tmp1: the same for iv doubled once, i.e. the second-highest bits */ \
	vpaddq iv, iv, tmp1; \
	vpsrad $31, tmp1, tmp1; \
	vpshufd $0x13, tmp1, tmp1; \
	vpand mask1, tmp1, tmp1; \
	/* shift each qword by two and fold in both sets of carries */ \
	vpsllq $2, iv, iv; \
	vpxor tmp0, iv, iv; \
	vpxor tmp1, iv, iv;
122
/*
 * Private step of load_xts_16way: advance both tweaks in tiv with
 * gf128mul_x2_ble, XOR them with the 32 source bytes of slot idx into out,
 * and save the tweaks in slot idx of dst.  tmp0 and tmp1 are clobbered.
 */
#define __xts_step_16way(tiv, mask1, mask2, tmp0, tmp1, src, dst, idx, out) \
	gf128mul_x2_ble(tiv, mask1, mask2, tmp0, tmp1); \
	vpxor (idx*32)(src), tiv, out; \
	vmovdqu tiv, (idx*32)(dst);

/*
 * load_xts_16way - derive 16 successive tweaks from the one stored at (iv),
 * XOR them with 256 bytes of src into x0..x7, save the tweaks in dst, and
 * write the following tweak back to (iv).
 *
 *   iv:        address register; 16 bytes are read from and written to (iv)
 *   src:       base address of the input data
 *   dst:       base address of a 256-byte area that receives the 16 tweaks.
 *              store_xts_16way in this file reads the same offsets back.
 *   x0..x7:    outputs; xN = 32 bytes at src + N*32, XORed with tweaks 2N
 *              (low lane) and 2N+1 (high lane)
 *   tiv, tivx: register holding the running tweak pair, and its 128-bit name
 *   t0..t3:    scratch vector registers, clobbered
 *   t0x..t2x:  128-bit names of t0..t2 (t0x is written and t0 is then read
 *              by vinserti128, so the aliasing is required)
 *   xts_gf128mul_and_shl1_mask_0,
 *   xts_gf128mul_and_shl1_mask_1:
 *              memory operands with the 16-byte constants used by
 *              gf128mul_x_ble / gf128mul_x2_ble.  Their values are defined
 *              by the caller and are not visible here.
 *
 * Each 32-byte slot of src is read before the same slot of dst is written.
 */
#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
		       xts_gf128mul_and_shl1_mask_0, \
		       xts_gf128mul_and_shl1_mask_1) \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
	\
	/* tiv = { low lane: tweak, high lane: next tweak } */ \
	vmovdqu (iv), tivx; \
	vmovdqa tivx, t0x; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	/* t2 may only be loaded now: t2x was gf128mul_x_ble's scratch */ \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
	vinserti128 $1, tivx, t0, tiv; \
	vpxor (0*32)(src), tiv, x0; \
	vmovdqu tiv, (0*32)(dst); \
	\
	/* slots 1..7: both lanes move two tweaks ahead per step */ \
	__xts_step_16way(tiv, t1, t2, t0, t3, src, dst, 1, x1); \
	__xts_step_16way(tiv, t1, t2, t0, t3, src, dst, 2, x2); \
	__xts_step_16way(tiv, t1, t2, t0, t3, src, dst, 3, x3); \
	__xts_step_16way(tiv, t1, t2, t0, t3, src, dst, 4, x4); \
	__xts_step_16way(tiv, t1, t2, t0, t3, src, dst, 5, x5); \
	__xts_step_16way(tiv, t1, t2, t0, t3, src, dst, 6, x6); \
	__xts_step_16way(tiv, t1, t2, t0, t3, src, dst, 7, x7); \
	\
	/* high lane is the 16th tweak; one more step gives the next start */ \
	vextracti128 $1, tiv, tivx; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vmovdqu tivx, (iv);
170
/*
 * store_xts_16way - XOR x0..x7 with the 256 bytes currently stored at dst
 * and write the result back to dst.
 *
 *   dst:     base address; slot N (offset N*32) is read, XORed into xN and
 *            then overwritten with xN
 *   x0..x7:  vector registers, modified in place and then stored
 *
 * load_xts_16way in this file leaves the per-block tweaks at these dst
 * offsets, so this is the second tweak XOR of the pair.  All reads of dst
 * happen before the first write.
 */
#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor 0*32(dst), x0, x0; \
	vpxor 1*32(dst), x1, x1; \
	vpxor 2*32(dst), x2, x2; \
	vpxor 3*32(dst), x3, x3; \
	vpxor 4*32(dst), x4, x4; \
	vpxor 5*32(dst), x5, x5; \
	vpxor 6*32(dst), x6, x6; \
	vpxor 7*32(dst), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
181