1#ifndef _ASM_X86_XOR_H
2#define _ASM_X86_XOR_H
3
4/*
5 * Optimized RAID-5 checksumming functions for SSE.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2, or (at your option)
10 * any later version.
11 *
12 * You should have received a copy of the GNU General Public License
13 * (for example /usr/src/linux/COPYING); if not, write to the Free
14 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * Cache avoiding checksumming functions utilizing KNI instructions
19 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
20 */
21
22/*
23 * Based on
24 * High-speed RAID5 checksumming functions utilizing SSE instructions.
25 * Copyright (C) 1998 Ingo Molnar.
26 */
27
28/*
29 * x86-64 changes / gcc fixes from Andi Kleen.
30 * Copyright 2002 Andi Kleen, SuSE Labs.
31 *
32 * This hasn't been optimized for the hammer yet, but there are likely
33 * no advantages to be gotten from x86-64 here anyways.
34 */
35
36#include <asm/fpu/api.h>
37
#ifdef CONFIG_X86_32
/*
 * reduce register pressure: force the 256-byte line-size operand to be
 * an immediate ("i") rather than occupying one of the few 32-bit GPRs.
 */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
/* x86-64 has registers to spare: register or 32-bit signed immediate. */
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/* Byte offset (as asm text) of the x'th 16-byte chunk in the current line. */
#define OFFS(x)		"16*("#x")"
/* Same chunk, but one 256-byte line ahead -- prefetch target. */
#define PF_OFFS(x)	"256+16*("#x")"
/* Non-temporal prefetch of chunk x (next line) from each source pointer. */
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
/* Load/store 16-byte chunk x of p1 to/from %%xmm<y> (movaps: needs 16-byte alignment). */
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
/* XOR chunk x of p2..p5 into %%xmm<y>. */
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
/* Expands to nothing -- used where a BLK64 slot needs no prefetch. */
#define NOP(x)

/*
 * One 64-byte group for the *_pf64 variants: a single prefetch for the
 * group, then op applied to its four 16-byte chunks in %xmm0-%xmm3.
 */
#define BLK64(pf, op, i)	\
	pf(i)			\
	op(i, 0)		\
	op(i + 1, 1)		\
	op(i + 2, 2)		\
	op(i + 3, 3)
/*
 * xor_sse_2 - p1[] ^= p2[] over 'bytes' bytes using SSE.
 *
 * Processes one 256-byte "line" (16 x 16-byte chunks) per loop trip,
 * four chunks in flight in %xmm0-%xmm3, with prefetchnta issued one
 * line ahead so source data streams past the caches.
 *
 * Assumes 'bytes' is a multiple of 256 and both buffers are 16-byte
 * aligned (movaps faults on misalignment) -- NOTE(review): the RAID
 * callers presumably pass page-aligned, page-sized buffers; confirm.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */

	/* Own the FPU state so %xmm0-%xmm3 may be clobbered. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	/* Prime the prefetch of p1's first line before entering the loop. */
	PF0(0)
	PF0(2)

	"	.align 32			;\n"
	"	1:				;\n"

	/* Four BLOCKs = one full 256-byte line per trip. */
	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	/* Advance both pointers one line, loop until all lines done. */
	"	add %[inc], %[p1]		;\n"
	"	add %[inc], %[p2]		;\n"
	"	dec %[cnt]			;\n"
	"	jnz 1b				;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)	/* all modified by the loop */
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)	/* line size in bytes */
	: "memory");				/* buffers read and written */

	kernel_fpu_end();
}
117
/*
 * xor_sse_2_pf64 - as xor_sse_2 (p1[] ^= p2[]), but restructured via
 * BLK64 to issue one prefetchnta per 64-byte group instead of one per
 * 32 bytes.  Selected through the "prefetch64-sse" template below --
 * presumably for CPUs with 64-byte (or larger) prefetch granularity.
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */

	/* Own the FPU state so %xmm0-%xmm3 may be clobbered. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(NOP, ST, i)	\


	"	.align 32			;\n"
	"	1:				;\n"

	/* Four BLOCKs = one full 256-byte line per trip. */
	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]		;\n"
	"	add %[inc], %[p2]		;\n"
	"	dec %[cnt]			;\n"
	"	jnz 1b				;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)	/* all modified by the loop */
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)	/* line size in bytes */
	: "memory");				/* buffers read and written */

	kernel_fpu_end();
}
151
/*
 * xor_sse_3 - p1[] ^= p2[] ^ p3[] over 'bytes' bytes using SSE.
 *
 * Same 256-byte-line structure as xor_sse_2, with an extra xorps pass
 * (XO2) and prefetch stream (PF2) for the third buffer.  Same alignment
 * and size assumptions as xor_sse_2.
 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */

	/* Own the FPU state so %xmm0-%xmm3 may be clobbered. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	/* Prime the prefetch of p1's first line before entering the loop. */
	PF0(0)
	PF0(2)

	"	.align 32			;\n"
	"	1:				;\n"

	/* Four BLOCKs = one full 256-byte line per trip. */
	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]		;\n"
	"	add %[inc], %[p2]		;\n"
	"	add %[inc], %[p3]		;\n"
	"	dec %[cnt]			;\n"
	"	jnz 1b				;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)	/* line size in bytes */
	: "memory");				/* buffers read and written */

	kernel_fpu_end();
}
210
/*
 * xor_sse_3_pf64 - as xor_sse_3 (p1[] ^= p2[] ^ p3[]), but with one
 * prefetchnta per 64-byte group via BLK64; used by the
 * "prefetch64-sse" template below.
 */
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */

	/* Own the FPU state so %xmm0-%xmm3 may be clobbered. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(NOP, ST, i)	\


	"	.align 32			;\n"
	"	1:				;\n"

	/* Four BLOCKs = one full 256-byte line per trip. */
	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]		;\n"
	"	add %[inc], %[p2]		;\n"
	"	add %[inc], %[p3]		;\n"
	"	dec %[cnt]			;\n"
	"	jnz 1b				;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)	/* line size in bytes */
	: "memory");				/* buffers read and written */

	kernel_fpu_end();
}
247
/*
 * xor_sse_4 - p1[] ^= p2[] ^ p3[] ^ p4[] over 'bytes' bytes using SSE.
 *
 * Same 256-byte-line structure as xor_sse_2, with xorps passes XO1-XO3
 * and prefetch streams PF1-PF3 for the three source buffers.  Same
 * alignment and size assumptions as xor_sse_2.
 */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */

	/* Own the FPU state so %xmm0-%xmm3 may be clobbered. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	/* Prime the prefetch of p1's first line before entering the loop. */
	PF0(0)
	PF0(2)

	"	.align 32			;\n"
	"	1:				;\n"

	/* Four BLOCKs = one full 256-byte line per trip. */
	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]		;\n"
	"	add %[inc], %[p2]		;\n"
	"	add %[inc], %[p3]		;\n"
	"	add %[inc], %[p4]		;\n"
	"	dec %[cnt]			;\n"
	"	jnz 1b				;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)	/* line size in bytes */
	: "memory");				/* buffers read and written */

	kernel_fpu_end();
}
313
/*
 * xor_sse_4_pf64 - as xor_sse_4 (p1[] ^= p2[] ^ p3[] ^ p4[]), but with
 * one prefetchnta per 64-byte group via BLK64; used by the
 * "prefetch64-sse" template below.
 */
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */

	/* Own the FPU state so %xmm0-%xmm3 may be clobbered. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(NOP, ST, i)	\


	"	.align 32			;\n"
	"	1:				;\n"

	/* Four BLOCKs = one full 256-byte line per trip. */
	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]		;\n"
	"	add %[inc], %[p2]		;\n"
	"	add %[inc], %[p3]		;\n"
	"	add %[inc], %[p4]		;\n"
	"	dec %[cnt]			;\n"
	"	jnz 1b				;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)	/* line size in bytes */
	: "memory");				/* buffers read and written */

	kernel_fpu_end();
}
352
/*
 * xor_sse_5 - p1[] ^= p2[] ^ p3[] ^ p4[] ^ p5[] over 'bytes' bytes
 * using SSE.
 *
 * Same 256-byte-line structure as xor_sse_2, with xorps passes XO1-XO4
 * and prefetch streams PF1-PF4 for the four source buffers.  Same
 * alignment and size assumptions as xor_sse_2.
 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */

	/* Own the FPU state so %xmm0-%xmm3 may be clobbered. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	PF4(i)			\
	PF4(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	XO4(i + 1, 1)		\
	XO4(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	/* Prime the prefetch of p1's first line before entering the loop. */
	PF0(0)
	PF0(2)

	"	.align 32			;\n"
	"	1:				;\n"

	/* Four BLOCKs = one full 256-byte line per trip. */
	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]		;\n"
	"	add %[inc], %[p2]		;\n"
	"	add %[inc], %[p3]		;\n"
	"	add %[inc], %[p4]		;\n"
	"	add %[inc], %[p5]		;\n"
	"	dec %[cnt]			;\n"
	"	jnz 1b				;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)	/* line size in bytes */
	: "memory");				/* buffers read and written */

	kernel_fpu_end();
}
425
/*
 * xor_sse_5_pf64 - as xor_sse_5 (p1[] ^= p2[] ^ p3[] ^ p4[] ^ p5[]),
 * but with one prefetchnta per 64-byte group via BLK64; used by the
 * "prefetch64-sse" template below.
 */
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;	/* number of 256-byte lines */

	/* Own the FPU state so %xmm0-%xmm3 may be clobbered. */
	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(PF4, XO4, i)	\
	BLK64(NOP, ST, i)	\


	"	.align 32			;\n"
	"	1:				;\n"

	/* Four BLOCKs = one full 256-byte line per trip. */
	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]		;\n"
	"	add %[inc], %[p2]		;\n"
	"	add %[inc], %[p3]		;\n"
	"	add %[inc], %[p4]		;\n"
	"	add %[inc], %[p5]		;\n"
	"	dec %[cnt]			;\n"
	"	jnz 1b				;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)	/* line size in bytes */
	: "memory");				/* buffers read and written */

	kernel_fpu_end();
}
466
/*
 * XOR template wiring up the *_pf64 variants (one prefetch per 64-byte
 * group).  Presumably benchmarked against the other templates at boot
 * by the generic xor code -- confirm against the xor template selector.
 */
static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
474
/*
 * Undefine the asm-fragment helpers so the width-specific headers
 * included below start from a clean macro namespace.
 */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

/* Pull in the remaining 32-bit or 64-bit specific xor templates. */
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

/*
 * Final template choice: AVX_SELECT() -- presumably provided by the
 * headers above -- may override the benchmark winner FASTEST when an
 * AVX implementation is usable; verify against asm/xor_avx.h.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)
495
496#endif /* _ASM_X86_XOR_H */
497