/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_FUNC_START(func)	SYM_FUNC_START(neon_ ## func)
#define AES_FUNC_END(func)	SYM_FUNC_END(neon_ ## func)
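	/*
	 * Note: AES_FUNC_START/END give the mode routines emitted by the
	 * included aes-modes.S a "neon_" symbol prefix (e.g.
	 * neon_aes_ecb_encrypt), so the glue code can select this NEON
	 * implementation.
	 */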

	xtsmask		.req	v7
	cbciv		.req	v7
	vctr		.req	v4
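	/*
	 * Note: these aliases name the registers expected by the generic mode
	 * code in aes-modes.S; xtsmask and cbciv can share v7 since a single
	 * call only ever runs one mode.
	 */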

	.macro		xts_reload_mask, tmp
	xts_load_mask	\tmp
	.endm

	/* special case for the neon-bs driver calling into this one for CTS */
	.macro		xts_cts_skip_tw, reg, lbl
	tbnz		\reg, #1, \lbl
	.endm

	/* multiply by polynomial 'x' in GF(2^8) */
	.macro		mul_by_x, out, in, temp, const
	sshr		\temp, \in, #7
	shl		\out, \in, #1
	and		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm
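	/*
	 * Note: this is the AES "xtime" step. sshr #7 broadcasts each byte's
	 * top bit into a 0x00/0xff mask, which selects the reduction constant
	 * 0x1b (x^8 = x^4 + x^3 + x + 1 in GF(2^8)) held in \const.
	 * For example, 0x57 * x = 0xae and 0x80 * x = 0x1b.
	 */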

	/* multiply by polynomial 'x^2' in GF(2^8) */
	.macro		mul_by_x2, out, in, temp, const
	ushr		\temp, \in, #6
	shl		\out, \in, #2
	pmul		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm
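	/*
	 * Note: the two bits shifted out of the top are reduced in one go:
	 * ushr #6 leaves them as a value 0..3, and the carryless pmul by 0x1b
	 * produces 0x00, 0x1b, 0x36 or 0x2d, i.e. the x^8/x^9 reduction term
	 * that the final eor folds back in.
	 */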

	/* preload the entire Sbox */
	.macro		prepare, sbox, shiftrows, temp
	movi		v12.16b, #0x1b
	ldr_l		q13, \shiftrows, \temp
	ldr_l		q14, .Lror32by8, \temp
	adr_l		\temp, \sbox
	ld1		{v16.16b-v19.16b}, [\temp], #64
	ld1		{v20.16b-v23.16b}, [\temp], #64
	ld1		{v24.16b-v27.16b}, [\temp], #64
	ld1		{v28.16b-v31.16b}, [\temp]
	.endm
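	/*
	 * Note: after this macro the fixed register layout is
	 *   v12        - GF(2^8) reduction constant 0x1b in every byte
	 *   v13        - ShiftRows (or inverse ShiftRows) permutation
	 *   v14        - rotate-each-32-bit-word-right-by-8 permutation
	 *   v16 - v31  - the full 256-byte (inverse) S-box
	 * leaving the lower registers for block state, scratch and round keys.
	 */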

	/* do preload for encryption */
	.macro		enc_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_sbox, .LForward_ShiftRows, \temp
	.endm

	.macro		enc_switch_key, ignore0, ignore1, temp
	/* do nothing */
	.endm

	/* do preload for decryption */
	.macro		dec_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
	.endm

	/* apply SubBytes transformation using the preloaded Sbox */
	.macro		sub_bytes, in
	sub		v9.16b, \in\().16b, v15.16b
	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
	sub		v10.16b, v9.16b, v15.16b
	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v11.16b, v10.16b, v15.16b
	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
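	/*
	 * Note: tbl/tbx can index at most 64 table bytes at a time, so the
	 * 256-byte S-box is consumed in four 64-byte slices. v15 holds 0x40
	 * (set up by the caller), and each sub rebases the index into the
	 * next slice; out-of-range indices return 0 for tbl and leave the
	 * lane untouched for tbx, so exactly one lookup contributes per byte.
	 */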

	/* apply MixColumns transformation */
	.macro		mix_columns, in, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
	eor		\in\().16b, \in\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in\().16b, \in\().16b, v8.16b
	.endif

	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
	rev32		v8.8h, \in\().8h
	eor		v8.16b, v8.16b, v9.16b
	eor		\in\().16b, \in\().16b, v8.16b
	tbl		\in\().16b, {\in\().16b}, v14.16b
	eor		\in\().16b, \in\().16b, v8.16b
	.endm
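	/*
	 * Note: the forward path computes each output byte as
	 * 2*a[i] ^ 3*a[i+1] ^ a[i+2] ^ a[i+3] using one mul_by_x, a 16-bit
	 * rotate of each word (rev32 .8h) and a byte rotate (tbl with v14).
	 * For decryption, the { 0e 0b 0d 09 } matrix factors into the forward
	 * { 02 03 01 01 } matrix times { 05 00 04 00 }, so the column is
	 * first replaced by a*5 ^ rot16(a*4) and then fed through the same
	 * forward code to obtain the inverse transform.
	 */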

	.macro		do_block, enc, in, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns	\in, \enc
	b		1111b
2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	.endm
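	/*
	 * Note: each iteration is AddRoundKey, then ShiftRows, then SubBytes
	 * (the two commute since SubBytes is byte-wise), then MixColumns for
	 * every round but the last; the movi #0x40 reuses v15 as the slice
	 * offset for sub_bytes before the next round key is loaded into it.
	 */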

	.macro		encrypt_block, in, rounds, rk, rkp, i
	do_block	1, \in, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block, in, rounds, rk, rkp, i
	do_block	0, \in, \rounds, \rk, \rkp, \i
	.endm

	/*
	 * Interleaved versions: functionally equivalent to the
	 * ones above, but applied to AES states in parallel.
	 */

	.macro		sub_bytes_4x, in0, in1, in2, in3
	sub		v8.16b, \in0\().16b, v15.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub		v9.16b, \in1\().16b, v15.16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, \in2\().16b, v15.16b
	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
	sub		v11.16b, \in3\().16b, v15.16b
	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
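	/*
	 * Note: same slice-by-slice S-box lookup as sub_bytes, but with the
	 * four states interleaved so the cheap index adjustments (sub) can
	 * overlap with the comparatively slow tbl/tbx lookups.
	 */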

	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	sshr		\tmp0\().16b, \in0\().16b, #7
	shl		\out0\().16b, \in0\().16b, #1
	sshr		\tmp1\().16b, \in1\().16b, #7
	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #1
	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
	ushr		\tmp0\().16b, \in0\().16b, #6
	shl		\out0\().16b, \in0\().16b, #2
	ushr		\tmp1\().16b, \in1\().16b, #6
	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #2
	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mix_columns_2x, in0, in1, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
	eor		\in0\().16b, \in0\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in1\().16b, \in1\().16b, v9.16b
	rev32		v9.8h, v9.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	.endif

	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
	rev32		v10.8h, \in0\().8h
	rev32		v11.8h, \in1\().8h
	eor		v10.16b, v10.16b, v8.16b
	eor		v11.16b, v11.16b, v9.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	tbl		\in0\().16b, {\in0\().16b}, v14.16b
	tbl		\in1\().16b, {\in1\().16b}, v14.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	.endm

	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
	sub_bytes_4x	\in0, \in1, \in2, \in3
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns_2x	\in0, \in1, \enc
	mix_columns_2x	\in2, \in3, \enc
	b		1111b
2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	.endm

	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

#include "aes-modes.S"

	.section	".rodata", "a"
	.align		4
.LForward_ShiftRows:
	.octa		0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
	.octa		0x0306090c0f0205080b0e0104070a0d00

.Lror32by8:
	.octa		0x0c0f0e0d080b0a090407060500030201
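	/*
	 * Note: each .octa is a tbl permutation, stored least significant
	 * byte first: the ShiftRows tables map each destination byte to its
	 * source byte in the column-major AES state, and .Lror32by8 rotates
	 * every 32-bit word right by one byte (used by mix_columns).
	 */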