/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

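/*
 * BLOCK(offset, reg) is redefined inside each routine below to XOR one
 * 32-byte (ymm-sized) chunk at the given byte offset.  BLOCK4() expands
 * to four consecutive chunks and BLOCK16() to sixteen, so one BLOCK16()
 * pass covers 16 * 32 = 512 bytes; hence the bytes >> 9 loop counts and
 * the 512-byte pointer increments below.
 */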
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

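/* p0 ^= p1, processed in 512-byte lines using ymm0-ymm3. */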
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

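/* p0 ^= p1 ^ p2, processed in 512-byte lines using ymm0-ymm3. */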
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

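/* p0 ^= p1 ^ p2 ^ p3, processed in 512-byte lines using ymm0-ymm3. */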
static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

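/* p0 ^= p1 ^ p2 ^ p3 ^ p4, processed in 512-byte lines using ymm0-ymm3. */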
static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

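/* Template exposing these routines to xor_speed() and AVX_SELECT() below. */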
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

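/*
 * Benchmark the AVX routines only when the CPU supports AVX and the OS
 * has enabled XSAVE state management (X86_FEATURE_OSXSAVE).
 */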
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

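/* Prefer the AVX template over FASTEST when the same AVX/OSXSAVE check passes. */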
#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif