// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2016 Intel Corporation
 *
 * Author: Gayatri Kammela <gayatri.kammela@intel.com>
 * Author: Megha Dey <megha.dey@linux.intel.com>
 */

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

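/*
 * The asm below needs AVX512F for vpandq/vpxorq/vmovdqa64, AVX512BW for
 * the byte-granular vpbroadcastb/vpsraw/vpshufb on 512-bit registers,
 * and AVX512DQ for vbroadcasti64x2.  The AVX, AVX2 and AVX512VL checks
 * appear to be belt-and-braces rather than strictly required by this
 * instruction mix.
 */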
static int raid6_has_avx512(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX) &&
		boot_cpu_has(X86_FEATURE_AVX512F) &&
		boot_cpu_has(X86_FEATURE_AVX512BW) &&
		boot_cpu_has(X86_FEATURE_AVX512VL) &&
		boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
				     int failb, void **ptrs)
{
	u8 *p, *q, *dp, *dq;
	const u8 *pbmul;	/* P multiplier table for B data */
	const u8 *qmul;		/* Q multiplier table (for both) */
	const u8 x0f = 0x0f;

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/*
	 * Compute syndrome with zero for the missing data pages
	 * Use the dead data pages as temporary storage for
	 * delta p and delta q
	 */

	dp = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-2] = dp;
	dq = (u8 *)ptrs[failb];
	ptrs[failb] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dp;
	ptrs[failb] = dq;
	ptrs[disks-2] = p;
	ptrs[disks-1] = q;

	/* Now, pick the proper data tables */
	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
		raid6_gfexp[failb]]];
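
	/*
	 * With P' and Q' (syndromes over the zeroed pages) now in
	 * dp/dq, qmul is the table multiplying by
	 * (g^faila ^ g^failb)^-1 and pbmul the one multiplying by
	 * (g^(failb-faila) ^ 1)^-1 in GF(2^8).  Per byte, the vector
	 * loop below computes the same thing as the generic loop in
	 * lib/raid6/recov.c:
	 *
	 *	px    = *p ^ *dp;
	 *	qx    = qmul[*q ^ *dq];
	 *	*dq++ = db = pbmul[px] ^ qx;	// reconstructed B
	 *	*dp++ = db ^ px;		// reconstructed A
	 */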

	kernel_fpu_begin();

	/* zmm7 = x0f[64] */
	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));

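	/*
	 * Each GF(2^8) multiplication by a constant is done as two
	 * 16-entry nibble lookups: raid6_vgfmul[c] holds a 16-byte
	 * table for the low nibbles and one for the high nibbles,
	 * split out with the 0x0f mask in zmm7.  vbroadcasti64x2
	 * replicates a table into all four 128-bit lanes of a zmm
	 * register, so one vpshufb performs 64 lookups in parallel;
	 * the two nibble halves are then combined with vpxorq.
	 */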
	while (bytes) {
#ifdef CONFIG_X86_64
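		/*
		 * 64-bit mode makes zmm8-zmm15 encodable, so two
		 * 64-byte chunks are processed per iteration.
		 */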
		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
			     "vmovdqa64 %1, %%zmm9\n\t"
			     "vmovdqa64 %2, %%zmm0\n\t"
			     "vmovdqa64 %3, %%zmm8\n\t"
			     "vpxorq %4, %%zmm1, %%zmm1\n\t"
			     "vpxorq %5, %%zmm9, %%zmm9\n\t"
			     "vpxorq %6, %%zmm0, %%zmm0\n\t"
			     "vpxorq %7, %%zmm8, %%zmm8"
			     :
			     : "m" (q[0]), "m" (q[64]), "m" (p[0]),
			       "m" (p[64]), "m" (dq[0]), "m" (dq[64]),
			       "m" (dp[0]), "m" (dp[64]));

		/*
		 * 1 = dq[0] ^ q[0]
		 * 9 = dq[64] ^ q[64]
		 * 0 = dp[0] ^ p[0]
		 * 8 = dp[64] ^ p[64]
		 */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm5"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
			     "vpsraw $4, %%zmm9, %%zmm12\n\t"
			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
			     "vpandq %%zmm7, %%zmm9, %%zmm9\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
			     "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t"
			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t"
			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
			     "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t"
			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
			     :
			     : );

		/*
		 * 5 = qx[0]
		 * 15 = qx[64]
		 */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm1\n\t"
			     "vpsraw $4, %%zmm0, %%zmm2\n\t"
			     "vpsraw $4, %%zmm8, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm8, %%zmm14\n\t"
			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t"
			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t"
			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm12, %%zmm13, %%zmm13"
			     :
			     : "m" (pbmul[0]), "m" (pbmul[16]));

		/*
		 * 1 = pbmul[px[0]]
		 * 13 = pbmul[px[64]]
		 */
		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm15, %%zmm13, %%zmm13"
			     :
			     : );

		/*
		 * 1 = db = DQ
		 * 13 = db[64] = DQ[64]
		 */
		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm13, %1\n\t"
			     "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
			     "vpxorq %%zmm13, %%zmm8, %%zmm8"
			     :
			     : "m" (dq[0]), "m" (dq[64]));

		asm volatile("vmovdqa64 %%zmm0, %0\n\t"
			     "vmovdqa64 %%zmm8, %1"
			     :
			     : "m" (dp[0]), "m" (dp[64]));

		bytes -= 128;
		p += 128;
		q += 128;
		dp += 128;
		dq += 128;
#else
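		/*
		 * 32-bit mode can only encode zmm0-zmm7, so a single
		 * 64-byte chunk is processed per iteration.
		 */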
		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
			     "vmovdqa64 %1, %%zmm0\n\t"
			     "vpxorq %2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %3, %%zmm0, %%zmm0"
			     :
			     : "m" (*q), "m" (*p), "m" (*dq), "m" (*dp));

		/* 1 = dq ^ q; 0 = dp ^ p */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm5"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));
		/*
		 * 1 = dq ^ q
		 * 3 = (dq ^ q) >> 4
		 */
		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
			     :
			     : );

		/* 5 = qx */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm1"
			     :
			     : "m" (pbmul[0]), "m" (pbmul[16]));

		asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t"
			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm4, %%zmm1, %%zmm1"
			     :
			     : );

		/* 1 = pbmul[px] */
		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
			     /* 1 = db = DQ */
			     "vmovdqa64 %%zmm1, %0\n\t"
			     :
			     : "m" (dq[0]));

		asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
			     "vmovdqa64 %%zmm0, %0"
			     :
			     : "m" (dp[0]));

		bytes -= 64;
		p += 64;
		q += 64;
		dp += 64;
		dq += 64;
#endif
	}

	kernel_fpu_end();
}

static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
				     void **ptrs)
{
	u8 *p, *q, *dq;
	const u8 *qmul;		/* Q multiplier table */
	const u8 x0f = 0x0f;

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/*
	 * Compute syndrome with zero for the missing data page
	 * Use the dead data page as temporary storage for delta q
	 */

	dq = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila] = dq;
	ptrs[disks-1] = q;

	/* Now, pick the proper data tables */
	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
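
	/*
	 * gen_syndrome() above also regenerated P into the dead parity
	 * page, so p now holds P ^ D.  qmul multiplies by g^-faila,
	 * making the loop below the vector form of the generic
	 * lib/raid6/recov.c computation:
	 *
	 *	*p++ ^= *dq = qmul[*q ^ *dq];
	 */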

	kernel_fpu_begin();

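	/* zmm7 = x0f[64] */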
	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));

	while (bytes) {
#ifdef CONFIG_X86_64
		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
			     "vmovdqa64 %1, %%zmm8\n\t"
			     "vpxorq %2, %%zmm3, %%zmm3\n\t"
			     "vpxorq %3, %%zmm8, %%zmm8"
			     :
			     : "m" (dq[0]), "m" (dq[64]), "m" (q[0]),
			       "m" (q[64]));

		/*
		 * 3 = q[0] ^ dq[0]
		 * 8 = q[64] ^ dq[64]
		 */
		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
			     "vmovapd %%zmm0, %%zmm13\n\t"
			     "vbroadcasti64x2 %1, %%zmm1\n\t"
			     "vmovapd %%zmm1, %%zmm14"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
			     "vpsraw $4, %%zmm8, %%zmm12\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm8, %%zmm8\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
			     "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
			     "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t"
			     "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm13, %%zmm14, %%zmm14"
			     :
			     : );

		/*
		 * 1 = qmul[q[0] ^ dq[0]]
		 * 14 = qmul[q[64] ^ dq[64]]
		 */
		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
			     "vmovdqa64 %1, %%zmm12\n\t"
			     "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t"
			     "vpxorq %%zmm14, %%zmm12, %%zmm12"
			     :
			     : "m" (p[0]), "m" (p[64]));

		/*
		 * 2 = p[0] ^ qmul[q[0] ^ dq[0]]
		 * 12 = p[64] ^ qmul[q[64] ^ dq[64]]
		 */

		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm14, %1\n\t"
			     "vmovdqa64 %%zmm2, %2\n\t"
			     "vmovdqa64 %%zmm12, %3"
			     :
			     : "m" (dq[0]), "m" (dq[64]), "m" (p[0]),
			       "m" (p[64]));

		bytes -= 128;
		p += 128;
		q += 128;
		dq += 128;
#else
		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
			     "vpxorq %1, %%zmm3, %%zmm3"
			     :
			     : "m" (dq[0]), "m" (q[0]));

		/* 3 = q ^ dq */

		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
			     "vbroadcasti64x2 %1, %%zmm1"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm0, %%zmm1, %%zmm1"
			     :
			     : );

		/* 1 = qmul[q ^ dq] */

		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
			     "vpxorq %%zmm1, %%zmm2, %%zmm2"
			     :
			     : "m" (p[0]));

		/* 2 = p ^ qmul[q ^ dq] */

		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm2, %1"
			     :
			     : "m" (dq[0]), "m" (p[0]));

		bytes -= 64;
		p += 64;
		q += 64;
		dq += 64;
#endif
	}

	kernel_fpu_end();
}

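/*
 * Registered in raid6_recov_algos[] (lib/raid6/algos.c), where the
 * highest-priority entry whose ->valid() passes is chosen; priority 3
 * ranks this above the AVX2 (2) and SSSE3 (1) recovery variants.
 */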
const struct raid6_recov_calls raid6_recov_avx512 = {
	.data2 = raid6_2data_recov_avx512,
	.datap = raid6_datap_recov_avx512,
	.valid = raid6_has_avx512,
#ifdef CONFIG_X86_64
	.name = "avx512x2",
#else
	.name = "avx512x1",
#endif
	.priority = 3,
};

#else
#warning "your version of binutils lacks AVX512 support"
#endif