// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * raid6/sse2.c
 *
 * SSE-2 implementation of RAID-6 syndrome functions
 */

#include <linux/raid/pq.h>
#include "x86.h"

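/*
 * 0x1d is the low byte of the RAID-6 field generator polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (0x11d), replicated across all 16 bytes of
 * an XMM register for the vectorized multiply-by-2 in GF(2^8) below.
 */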
static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants __attribute__((aligned(16))) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
};

static int raid6_have_sse2(void)
{
	/* Not really boot_cpu but "all_cpus" */
	return boot_cpu_has(X86_FEATURE_MMX) &&
		boot_cpu_has(X86_FEATURE_FXSR) &&
		boot_cpu_has(X86_FEATURE_XMM) &&
		boot_cpu_has(X86_FEATURE_XMM2);
}

/*
 * Plain SSE2 implementation
 */
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */

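	/*
	 * Per 16-byte lane this is Horner's rule over GF(2^8); a scalar
	 * sketch of one byte position (illustrative only, not part of the
	 * build):
	 *
	 *	u8 p = d[z0], q = d[z0];
	 *	for (z = z0 - 1; z >= 0; z--) {
	 *		q = (q & 0x80) ? (u8)(q << 1) ^ 0x1d : (u8)(q << 1);
	 *		p ^= d[z];	// XOR parity
	 *		q ^= d[z];	// Reed-Solomon syndrome
	 *	}
	 *
	 * In the vector code, pcmpgtb against the zeroed temp leaves 0xff
	 * in each byte whose top bit is set, paddb doubles every byte, and
	 * pand/pxor with the 0x1d constant apply the field reduction.
	 */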
	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));	/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("movdqa %xmm2,%xmm4");	/* Q[0] */
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
		for ( z = z0-2 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm6,%xmm2");
			asm volatile("pxor %xmm6,%xmm4");
			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
		}
		asm volatile("pcmpgtb %xmm4,%xmm5");
		asm volatile("paddb %xmm4,%xmm4");
		asm volatile("pand %xmm0,%xmm5");
		asm volatile("pxor %xmm5,%xmm4");
		asm volatile("pxor %xmm5,%xmm5");
		asm volatile("pxor %xmm6,%xmm2");
		asm volatile("pxor %xmm6,%xmm4");

		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

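/*
 * XOR the partial syndrome of data disks @start..@stop into the existing
 * P/Q pages: P ^= D_start ^ ... ^ D_stop and Q ^= g^start * D_start ^ ...
 * ^ g^stop * D_stop, where g = 2 is the GF(2^8) generator.
 */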
static void raid6_sse21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 16 ) {
		asm volatile("movdqa %0,%%xmm4" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("pxor %xmm4,%xmm2");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm5,%xmm4");
		}
		/* P/Q left side optimization */
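		/*
		 * Disks below @start carry no new data, so per skipped disk
		 * only the multiply-by-2 of the accumulated Q term remains.
		 */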
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,
	raid6_sse21_xor_syndrome,
	raid6_have_sse2,
	"sse2x1",
	1			/* Has cache hints */
};

/*
 * Unrolled-by-2 SSE2 implementation
 */
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
	asm volatile("pxor %xmm7,%xmm7");	/* Zero temp */
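	/* Two 16-byte lanes per pass: P accumulates in xmm2/xmm3, Q in xmm4/xmm6 */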

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));	/* P[0] */
		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16]));	/* P[1] */
		asm volatile("movdqa %xmm2,%xmm4");	/* Q[0] */
		asm volatile("movdqa %xmm3,%xmm6");	/* Q[1] */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_sse22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

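	/* Same delta-update scheme as raid6_sse21_xor_syndrome, two lanes at a time */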
	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("movdqa %0,%%xmm4" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
		}
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x2 = {
	raid6_sse22_gen_syndrome,
	raid6_sse22_xor_syndrome,
	raid6_have_sse2,
	"sse2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 SSE2 implementation
 */
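/* 64-bit mode adds xmm8-xmm15, enough registers to keep four lanes in flight */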
static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
	asm volatile("pxor %xmm4,%xmm4");	/* Q[0] */
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
	asm volatile("pxor %xmm6,%xmm6");	/* Q[1] */
	asm volatile("pxor %xmm7,%xmm7");	/* Zero temp */
	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
	asm volatile("pxor %xmm12,%xmm12");	/* Q[2] */
	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
	asm volatile("pxor %xmm14,%xmm14");	/* Q[3] */
	asm volatile("pxor %xmm15,%xmm15");	/* Zero temp */

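	/*
	 * Unlike the 1x/2x versions, the accumulators start out zero and
	 * the inner loop runs over every data disk; the movntdq block at
	 * the bottom re-zeroes each register for the next 64-byte chunk.
	 */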
	for ( d = 0 ; d < bytes ; d += 64 ) {
		for ( z = z0 ; z >= 0 ; z-- ) {
			/* The second prefetch seems to improve performance... */
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" : : "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" : : "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
		}
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("pxor %xmm3,%xmm3");
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("pxor %xmm10,%xmm10");
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %xmm11,%xmm11");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("pxor %xmm6,%xmm6");
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("pxor %xmm12,%xmm12");
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
		asm volatile("pxor %xmm14,%xmm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_sse24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));

	for ( d = 0 ; d < bytes ; d += 64 ) {
		asm volatile("movdqa %0,%%xmm4" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0][d+16]));
		asm volatile("movdqa %0,%%xmm12" : : "m" (dptr[z0][d+32]));
		asm volatile("movdqa %0,%%xmm14" : : "m" (dptr[z0][d+48]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
		asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
		asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
		asm volatile("pxor %xmm4,%xmm2");
		asm volatile("pxor %xmm6,%xmm3");
		asm volatile("pxor %xmm12,%xmm10");
		asm volatile("pxor %xmm14,%xmm11");
		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" : : "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" : : "m" (dptr[z][d+48]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
		asm volatile("prefetchnta %0" : : "m" (q[d]));
		asm volatile("prefetchnta %0" : : "m" (q[d+32]));
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
		}
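		/*
		 * Here each pass updates a full 64-byte cache line, so the
		 * non-temporal stores are safe (cf. the movdqa note in the
		 * 1x/2x versions).
		 */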
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
		asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
		asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,
	raid6_sse24_xor_syndrome,
	raid6_have_sse2,
	"sse2x4",
	1			/* Has cache hints */
};

#endif /* CONFIG_X86_64 */