/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19
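
	/*
	 * Register usage, as far as it can be derived from this file and the
	 * C glue code that calls it: SHASH holds the GHASH key H, and HH,
	 * HH3 and HH4 are expected to hold the precomputed powers H^2, H^3
	 * and H^4 used for 4-way aggregation. SHASH2 and HH34 hold the XOR
	 * of the high and low halves of those keys, which feeds the middle
	 * Karatsuba product. XL, XM and XH accumulate the low, middle and
	 * high partial products of each 128-bit multiplication.
	 */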

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm
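
	/*
	 * The __pmull_p64/__pmull2_p64 macros above are thin wrappers around
	 * the 64x64->128 bit PMULL instruction. The __pmull_p8 variants
	 * below emulate the same operation using only the 8-bit polynomial
	 * multiply, by operating on rotated copies of the operands and
	 * recombining the masked and shifted partial products (the A1..K
	 * comments follow that decomposition).
	 */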

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
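	// The 256-bit product is reduced modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1. GHASH operates on bit-reflected
	// values, so the low polynomial terms appear here as the constant
	// MASK = 0xe1 << 57 (set up by __pmull_pre_p64) rather than as
	// their plain encoding 0x87.
	//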
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
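	// This version performs the same reduction without PMULL: in the
	// reflected domain, multiplying by the low polynomial terms
	// x^7 + x^2 + x + 1 becomes a combination of the 64-bit left
	// shifts by 57, 62 and 63 (64 - 7, 64 - 2 and 64 - 1) and the
	// matching right shifts used below.
	//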
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
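	/*
	 * Per the AAPCS64 calling convention, the arguments arrive as
	 * x0 = blocks, x1 = dg, x2 = src, x3 = k and x4 = head, which is
	 * how the __pmull_ghash macro above consumes them.
	 */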
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

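	/*
	 * Load the AES round keys of the key schedule at \rk: K0-K5 get the
	 * first six round keys, while KK/KL/KM are loaded relative to
	 * \rounds so that they always hold the final three round keys
	 * regardless of key size. K6-K9 are (re)loaded on demand by the
	 * users of this macro.
	 */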
	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

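	/*
	 * Encrypt a single block in \state. The round count in \rounds
	 * selects the key length: bit 2 is set for 12 or 14 rounds
	 * (AES-192/AES-256) but clear for 10 rounds (AES-128), and bit 1
	 * then distinguishes 14 rounds from 12.
	 */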
	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	frame_push	1

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte        |        |        |        |x       |
	 * 16 bytes       |        |        |        |xxxxxxxx|
	 * 17 bytes       |        |        |xxxxxxxx|x       |
	 * 47 bytes       |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldr		x10, [sp, #.Lframe_local_offset]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	frame_pop
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
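	/*
	 * Note that pmull_gcm_do_crypt treats x0 as a byte count rather
	 * than a block count: it rounds it up to a block count itself and
	 * handles any partial tail block via the permute table.
	 */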
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

	.section	".rodata", "a"
	.align		6
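	/*
	 * Permute vectors for handling partial blocks and truncated tags:
	 * loading 16 bytes from a varying offset into this table yields a
	 * tbl/tbx index vector that keeps only the wanted bytes, since the
	 * out-of-range 0xff entries return zero with tbl and leave the
	 * destination unchanged with tbx.
	 */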
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous
