1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * Implementation of POLYVAL using ARMv8 Crypto Extensions. |
4 | * |
5 | * Copyright 2021 Google LLC |
6 | */ |
7 | /* |
 * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions.
 * It works on 8 blocks at a time, by precomputing the first 8 key powers h^8,
10 | * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split |
11 | * finite field multiplication into two steps. |
12 | * |
13 | * In the first step, we consider h^i, m_i as normal polynomials of degree less |
14 | * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication |
15 | * is simply polynomial multiplication. |
16 | * |
17 | * In the second step, we compute the reduction of p(x) modulo the finite field |
18 | * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. |
19 | * |
20 | * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where |
21 | * multiplication is finite field multiplication. The advantage is that the |
22 | * two-step process only requires 1 finite field reduction for every 8 |
23 | * polynomial multiplications. Further parallelism is gained by interleaving the |
24 | * multiplications and polynomial reductions. |
25 | */ |
26 | |
27 | #include <linux/linkage.h> |
#define STRIDE_BLOCKS 8

// Scalar register roles.  All of these are caller-saved under AAPCS64,
// so the functions below need no save/restore for them.
KEY_POWERS	.req	x0	// cursor into the precomputed key powers h^8..h^1
MSG	.req	x1	// cursor into the message blocks
BLOCKS_LEFT	.req	x2	// count of 16-byte blocks still to be hashed
ACCUMULATOR	.req	x3	// pointer to the 16-byte POLYVAL accumulator
KEY_START	.req	x10	// saved base of the key-powers array
EXTRA_BYTES	.req	x11	// not referenced in this chunk of the file
TMP	.req	x13	// scratch (holds the address of .Lgstar)

// Vector register roles.
// NOTE(review): AAPCS64 makes the low 64 bits of v8-v15 callee-saved, and
// KEY8..KEY1 alias them; presumably the caller owns the FP/SIMD state
// (e.g. via kernel_neon_begin()) -- confirm against the C glue code.
M0	.req	v0	// message blocks m_0..m_7 for one 8-block stride
M1	.req	v1
M2	.req	v2
M3	.req	v3
M4	.req	v4	// v4 is also reused as scratch by karatsuba2
M5	.req	v5	// v5 is also reused as scratch by karatsuba2
M6	.req	v6
M7	.req	v7
KEY8	.req	v8	// key powers h^8..h^1, loaded once per update call
KEY7	.req	v9
KEY6	.req	v10
KEY5	.req	v11
KEY4	.req	v12
KEY3	.req	v13
KEY2	.req	v14
KEY1	.req	v15
PL	.req	v16	// low 128 bits of the unreduced 256-bit product
PH	.req	v17	// high 128 bits of the unreduced 256-bit product
TMP_V	.req	v18	// vector scratch
LO	.req	v20	// running low Karatsuba partial product
MI	.req	v21	// running middle Karatsuba partial product
HI	.req	v22	// running high Karatsuba partial product
SUM	.req	v23	// current POLYVAL accumulator value
GSTAR	.req	v24	// g*(x) reduction constant in both 64-bit lanes
62 | |
	.text

	.arch	armv8-a+crypto
	.align	4

// g*(x) = bits 64..127 of the field modulus
// g(x) = x^128 + x^127 + x^126 + x^121 + 1; relative to x^64 the nonzero
// terms are x^63 + x^62 + x^57 = 0xc200000000000000.  The constant is
// replicated into both 64-bit lanes so both pmull (low lane) and pmull2
// (high lane) can use GSTAR directly.
.Lgstar:
	.quad	0xc200000000000000, 0xc200000000000000
70 | |
71 | /* |
72 | * Computes the product of two 128-bit polynomials in X and Y and XORs the |
73 | * components of the 256-bit product into LO, MI, HI. |
74 | * |
75 | * Given: |
76 | * X = [X_1 : X_0] |
77 | * Y = [Y_1 : Y_0] |
78 | * |
79 | * We compute: |
80 | * LO += X_0 * Y_0 |
81 | * MI += (X_0 + X_1) * (Y_0 + Y_1) |
82 | * HI += X_1 * Y_1 |
83 | * |
84 | * Later, the 256-bit result can be extracted as: |
85 | * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0] |
86 | * This step is done when computing the polynomial reduction for efficiency |
87 | * reasons. |
88 | * |
89 | * Karatsuba multiplication is used instead of Schoolbook multiplication because |
90 | * it was found to be slightly faster on ARM64 CPUs. |
91 | * |
92 | */ |
.macro karatsuba1 X Y
	X .req \X
	Y .req \Y
	// v25 = [X_0 : X_1], v26 = [Y_0 : Y_1] (64-bit halves swapped)
	ext	v25.16b, X.16b, X.16b, #8
	ext	v26.16b, Y.16b, Y.16b, #8
	// Both lanes of v25 now hold X_0 + X_1; likewise v26 for Y
	eor	v25.16b, v25.16b, X.16b
	eor	v26.16b, v26.16b, Y.16b
	// v28 = X_1 * Y_1 (carryless product of high halves)
	pmull2	v28.1q, X.2d, Y.2d
	// v29 = X_0 * Y_0 (carryless product of low halves)
	pmull	v29.1q, X.1d, Y.1d
	// v27 = (X_0 + X_1) * (Y_0 + Y_1) (middle Karatsuba term)
	pmull	v27.1q, v25.1d, v26.1d
	// XOR-accumulate the three partial products into HI/LO/MI
	eor	HI.16b, HI.16b, v28.16b
	eor	LO.16b, LO.16b, v29.16b
	eor	MI.16b, MI.16b, v27.16b
	.unreq X
	.unreq Y
.endm
109 | |
110 | /* |
111 | * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into |
112 | * them. |
113 | */ |
.macro karatsuba1_store X Y
	X .req \X
	Y .req \Y
	// v25 = [X_0 : X_1], v26 = [Y_0 : Y_1] (64-bit halves swapped)
	ext	v25.16b, X.16b, X.16b, #8
	ext	v26.16b, Y.16b, Y.16b, #8
	// Both lanes of v25 now hold X_0 + X_1; likewise v26 for Y
	eor	v25.16b, v25.16b, X.16b
	eor	v26.16b, v26.16b, Y.16b
	// Unlike karatsuba1, write the products directly -- this starts a
	// fresh accumulation, so HI/LO/MI need no prior clearing
	pmull2	HI.1q, X.2d, Y.2d
	pmull	LO.1q, X.1d, Y.1d
	pmull	MI.1q, v25.1d, v26.1d
	.unreq X
	.unreq Y
.endm
127 | |
128 | /* |
129 | * Computes the 256-bit polynomial represented by LO, HI, MI. Stores |
130 | * the result in PL, PH. |
131 | * [PH : PL] = |
132 | * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0] |
133 | */ |
.macro karatsuba2
	// v4/v5 alias M4/M5, but every message block has already been
	// consumed by karatsuba1 before this macro runs, so reusing them
	// as scratch here is safe.
	// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
	eor	v4.16b, HI.16b, MI.16b
	// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
	eor	v4.16b, v4.16b, LO.16b
	// v5 = [HI_0 : LO_1]
	ext	v5.16b, LO.16b, HI.16b, #8
	// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
	eor	v4.16b, v4.16b, v5.16b
	// HI = [HI_0 : HI_1]
	ext	HI.16b, HI.16b, HI.16b, #8
	// LO = [LO_0 : LO_1]
	ext	LO.16b, LO.16b, LO.16b, #8
	// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
	ext	PH.16b, v4.16b, HI.16b, #8
	// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
	ext	PL.16b, LO.16b, v4.16b, #8
.endm
152 | |
153 | /* |
154 | * Computes the 128-bit reduction of PH : PL. Stores the result in dest. |
155 | * |
 * This macro computes p(x) mod g(x) where p(x) is in Montgomery form and g(x) =
 * x^128 + x^127 + x^126 + x^121 + 1.
158 | * |
159 | * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the |
160 | * product of two 128-bit polynomials in Montgomery form. We need to reduce it |
161 | * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor |
162 | * of x^128, this product has two extra factors of x^128. To get it back into |
163 | * Montgomery form, we need to remove one of these factors by dividing by x^128. |
164 | * |
165 | * To accomplish both of these goals, we add multiples of g(x) that cancel out |
166 | * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low |
167 | * bits are zero, the polynomial division by x^128 can be done by right |
168 | * shifting. |
169 | * |
170 | * Since the only nonzero term in the low 64 bits of g(x) is the constant term, |
171 | * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can |
172 | * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + |
173 | * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to |
174 | * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T |
175 | * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. |
176 | * |
177 | * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits |
178 | * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 |
179 | * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * |
180 | * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : |
181 | * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). |
182 | * |
183 | * So our final computation is: |
184 | * T = T_1 : T_0 = g*(x) * P_0 |
185 | * V = V_1 : V_0 = g*(x) * (P_1 + T_0) |
186 | * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 |
187 | * |
188 | * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 |
189 | * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : |
190 | * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. |
191 | */ |
.macro montgomery_reduction dest
	DEST .req \dest
	// TMP_V = T_1 : T_0 = P_0 * g*(x)
	pmull	TMP_V.1q, PL.1d, GSTAR.1d
	// TMP_V = T_0 : T_1
	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
	// TMP_V = P_1 + T_0 : P_0 + T_1
	eor	TMP_V.16b, PL.16b, TMP_V.16b
	// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
	eor	PH.16b, PH.16b, TMP_V.16b
	// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
	// (pmull2 uses the high lanes, hence GSTAR holding g*(x) in both)
	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
	// DEST = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
	eor	DEST.16b, PH.16b, TMP_V.16b
	.unreq DEST
.endm
207 | |
208 | /* |
209 | * Compute Polyval on 8 blocks. |
210 | * |
 * If reduce is set, also computes the Montgomery reduction of the
 * previous full_stride call and XORs it with the first message block:
213 | * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. |
214 | * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. |
215 | * |
216 | * Sets PL, PH. |
217 | */ |
.macro full_stride reduce
	// Clear the Karatsuba accumulators for this stride.
	eor	LO.16b, LO.16b, LO.16b
	eor	MI.16b, MI.16b, MI.16b
	eor	HI.16b, HI.16b, HI.16b

	// Load the next 8 message blocks m_0..m_7.
	ld1	{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
	ld1	{M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64

	// The .if \reduce fragments below are exactly the steps of
	// montgomery_reduction applied to the PREVIOUS stride's PL:PH,
	// interleaved one instruction at a time between the independent
	// multiplications to hide latency.  The reduced value lands in
	// SUM just before it is folded into m_0.
	karatsuba1 M7 KEY1
	.if \reduce
	pmull	TMP_V.1q, PL.1d, GSTAR.1d
	.endif

	karatsuba1 M6 KEY2
	.if \reduce
	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
	.endif

	karatsuba1 M5 KEY3
	.if \reduce
	eor	TMP_V.16b, PL.16b, TMP_V.16b
	.endif

	karatsuba1 M4 KEY4
	.if \reduce
	eor	PH.16b, PH.16b, TMP_V.16b
	.endif

	karatsuba1 M3 KEY5
	.if \reduce
	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
	.endif

	karatsuba1 M2 KEY6
	.if \reduce
	eor	SUM.16b, PH.16b, TMP_V.16b
	.endif

	karatsuba1 M1 KEY7
	// Fold the running hash into the first message block: m_0 += SUM.
	// For \reduce == 0, SUM was loaded/computed before this macro ran.
	eor	M0.16b, M0.16b, SUM.16b

	karatsuba1 M0 KEY8
	// Recombine LO/MI/HI into the new unreduced 256-bit product PL:PH.
	karatsuba2
.endm
262 | |
263 | /* |
264 | * Handle any extra blocks after full_stride loop. |
265 | */ |
.macro partial_stride
	// With n = BLOCKS_LEFT (1..7), point KEY_POWERS at h^n so that the
	// remaining blocks pair with descending powers h^n..h^1:
	// KEY_POWERS = KEY_START + (STRIDE_BLOCKS - n) * 16
	add	KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
	sub	KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
	ld1	{KEY1.16b}, [KEY_POWERS], #16

	// The first remaining block absorbs the running accumulator,
	// computing the partial products of (m_0 + SUM) * h^n.
	ld1	{TMP_V.16b}, [MSG], #16
	eor	SUM.16b, SUM.16b, TMP_V.16b
	karatsuba1_store KEY1 SUM
	sub	BLOCKS_LEFT, BLOCKS_LEFT, #1

	// Handle the remaining 0..6 blocks in groups of 4, 2 and 1,
	// keyed off the bits of the updated BLOCKS_LEFT.  Each group
	// consumes the next key powers in descending order.
	tst	BLOCKS_LEFT, #4
	beq	.Lpartial4BlocksDone
	ld1	{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
	karatsuba1 M0 KEY8
	karatsuba1 M1 KEY7
	karatsuba1 M2 KEY6
	karatsuba1 M3 KEY5
.Lpartial4BlocksDone:
	tst	BLOCKS_LEFT, #2
	beq	.Lpartial2BlocksDone
	ld1	{M0.16b, M1.16b}, [MSG], #32
	ld1	{KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
	karatsuba1 M0 KEY8
	karatsuba1 M1 KEY7
.Lpartial2BlocksDone:
	tst	BLOCKS_LEFT, #1
	beq	.LpartialDone
	ld1	{M0.16b}, [MSG], #16
	ld1	{KEY8.16b}, [KEY_POWERS], #16
	karatsuba1 M0 KEY8
.LpartialDone:
	// Recombine and reduce everything accumulated above into SUM.
	karatsuba2
	montgomery_reduction SUM
.endm
301 | |
302 | /* |
303 | * Perform montgomery multiplication in GF(2^128) and store result in op1. |
304 | * |
305 | * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 |
306 | * If op1, op2 are in montgomery form, this computes the montgomery |
307 | * form of op1*op2. |
308 | * |
309 | * void pmull_polyval_mul(u8 *op1, const u8 *op2); |
310 | */ |
SYM_FUNC_START(pmull_polyval_mul)
	adr	TMP, .Lgstar
	ld1	{GSTAR.2d}, [TMP]	// load reduction constant g*(x)
	ld1	{v0.16b}, [x0]		// v0 = *op1
	ld1	{v1.16b}, [x1]		// v1 = *op2
	karatsuba1_store v0 v1		// LO/MI/HI = partial products of op1*op2
	karatsuba2			// PL:PH = 256-bit carryless product
	montgomery_reduction SUM	// SUM = op1*op2*x^-128 mod g(x)
	st1	{SUM.16b}, [x0]		// result overwrites *op1
	ret
SYM_FUNC_END(pmull_polyval_mul)
322 | |
323 | /* |
324 | * Perform polynomial evaluation as specified by POLYVAL. This computes: |
325 | * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} |
326 | * where n=nblocks, h is the hash key, and m_i are the message blocks. |
327 | * |
328 | * x0 - pointer to precomputed key powers h^8 ... h^1 |
329 | * x1 - pointer to message blocks |
330 | * x2 - number of blocks to hash |
331 | * x3 - pointer to accumulator |
332 | * |
333 | * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in, |
334 | * size_t nblocks, u8 *accumulator); |
335 | */ |
SYM_FUNC_START(pmull_polyval_update)
	adr	TMP, .Lgstar
	mov	KEY_START, KEY_POWERS	// remember array base for partial_stride
	ld1	{GSTAR.2d}, [TMP]	// load reduction constant g*(x)
	ld1	{SUM.16b}, [ACCUMULATOR]
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	blt	.LstrideLoopExit	// fewer than 8 blocks: tail only
	// Load all 8 precomputed key powers h^8..h^1 for the stride loop.
	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
	ld1	{KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
	// First stride has no prior PL:PH to reduce (reduce=0); each later
	// stride interleaves the previous stride's reduction (reduce=1).
	full_stride 0
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	blt	.LstrideLoopExitReduce
.LstrideLoop:
	full_stride 1
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	bge	.LstrideLoop
.LstrideLoopExitReduce:
	// Reduce the final stride's PL:PH into SUM.
	montgomery_reduction SUM
.LstrideLoopExit:
	// Undo the last subtraction; BLOCKS_LEFT is now 0..7.
	adds	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	beq	.LskipPartial
	partial_stride
.LskipPartial:
	st1	{SUM.16b}, [ACCUMULATOR]
	ret
SYM_FUNC_END(pmull_polyval_update)
362 | |