/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE and
      loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE bytes, one
      VEC at a time, to check for early mismatches.  Only do this if
      it is guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
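
/* NB: __memcmpeq only has to distinguish "equal" from "not equal";
   unlike memcmp, the sign/ordering of the result is unspecified.  A
   rough C model of the contract (illustrative only, not this
   implementation):

       int __memcmpeq (const void *s1, const void *s2, size_t n)
       {
         const unsigned char *a = s1, *b = s2;
         unsigned int acc = 0;
         while (n--)
           acc |= *a++ ^ *b++;   // any non-zero value may be returned
         return acc;
       }

   This is what allows mismatches below to be reduced with xor/or
   accumulation, `popcntq`, or `mask + 1` instead of locating the
   first differing byte.  */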

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ __memcmpeq_evex
# endif

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif
# include "reg-macros.h"


# if VEC_SIZE == 32

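        /* For VEC_SIZE == 32, VEC_CMP below is VPCMPEQ, so a full
           match yields an all-ones 32-bit mask; `inc` then wraps it
           to zero (setting ZF) exactly when every byte matched, and
           on a mismatch the incremented mask doubles as a convenient
           non-zero return value.  VPCMPNEQ masks are simply tested
           against zero.  */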
#  define TEST_ZERO_VCMP(reg) inc %VGPR(reg)
#  define TEST_ZERO(reg) test %VGPR(reg), %VGPR(reg)

#  define TO_32BIT_P1(reg) /* Do nothing. */
#  define TO_32BIT_P2(reg) /* Do nothing. */
#  define TO_32BIT(reg) /* Do nothing. */

#  define VEC_CMP VPCMPEQ

# elif VEC_SIZE == 64

#  define TEST_ZERO_VCMP(reg) TEST_ZERO(reg)
#  define TEST_ZERO(reg) neg %VGPR(reg)


        /* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
           int.  We have two methods for this.  If the mask was
           branched on, we use `neg` for the branch and then `sbb` to
           get the 32-bit return.  If the mask was not branched on, we
           just use `popcntq`.  */
#  define TO_32BIT_P1(reg) TEST_ZERO(reg)
#  define TO_32BIT_P2(reg) sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
#  define TO_32BIT(reg) popcntq %reg, %reg
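        /* NB: `neg` sets CF exactly when the mask is non-zero, so the
           following `sbb reg32, reg32` materializes 0 (all bytes
           equal) or -1 (mismatch) as the int return value.  `popcntq`
           is likewise zero only for an all-zero mask, and its result
           (at most 64) fits in the 32-bit return.  */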

#  define VEC_CMP VPCMPNEQ

# else
#  error "Unsupported VEC_SIZE"
# endif


# define VMOVU_MASK vmovdqu8
# define VPCMPNEQ vpcmpneqb
# define VPCMPEQ vpcmpeqb
# define VPTEST vptestmb

# define PAGE_SIZE 4096

        .section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl    %edx, %edx
# endif
        cmp     $VEC_SIZE, %RDX_LP
        /* Fall through for [0, VEC_SIZE] as it is the hottest case.  */
        ja      L(more_1x_vec)

        /* Create mask of bytes that are guaranteed to be valid
           because of length (edx).  Using masked movs allows us to
           skip checks for page crosses/zero size.  */
        mov     $-1, %VRAX
        bzhi    %VRDX, %VRAX, %VRAX
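        /* For example, with edx == 5 this leaves VRAX == 0x1f, so %k2
           below enables only the low 5 bytes of the masked load and
           compare; with edx == VEC_SIZE, bzhi keeps all bits set.  */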
        /* NB: A `jz` might be useful here.  Page-faults that are
           suppressed by predicate execution (the evex mask) can be
           very slow.  The expectation is this is not the norm and
           "most" code will not regularly call 'memcmp' with length
           = 0 and memory that is not wired up.  */
        KMOV    %VRAX, %k2

        /* Use masked loads as a full VEC_SIZE load could page cross
           where the length (edx) would not.  */
        VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
        VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
        KMOV    %k1, %VRAX
        TO_32BIT (VRAX)
        ret

        .p2align 4,, 3
L(last_1x_vec):
        VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
        VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
        KMOV    %k1, %VRAX
        TO_32BIT_P1 (rax)
L(return_neq0):
        TO_32BIT_P2 (rax)
        ret


        .p2align 4,, 12
L(more_1x_vec):
        /* From VEC + 1 to 2 * VEC.  */
        VMOVU   (%rsi), %VMM(1)
        /* Use compare not equals to directly check for mismatch.  */
        VPCMPNEQ (%rdi), %VMM(1), %k1
        KMOV    %k1, %VRAX
        TEST_ZERO (rax)
        jnz     L(return_neq0)

        cmpq    $(VEC_SIZE * 2), %rdx
        jbe     L(last_1x_vec)

        /* Check second VEC no matter what.  */
        VMOVU   VEC_SIZE(%rsi), %VMM(2)
        VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
        KMOV    %k1, %VRAX
        TEST_ZERO (rax)
        jnz     L(return_neq0)

        /* Less than 4 * VEC.  */
        cmpq    $(VEC_SIZE * 4), %rdx
        jbe     L(last_2x_vec)

        /* Check third and fourth VEC no matter what.  */
        VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(3)
        VEC_CMP (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
        KMOV    %k1, %VRAX
        TEST_ZERO_VCMP (rax)
        jnz     L(return_neq0)

        VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(4)
        VEC_CMP (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
        KMOV    %k1, %VRAX
        TEST_ZERO_VCMP (rax)
        jnz     L(return_neq0)

        /* Go to 4x VEC loop.  */
        cmpq    $(VEC_SIZE * 8), %rdx
        ja      L(more_8x_vec)

        /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without
           any branches.  */

        VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
        VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
        addq    %rdx, %rdi

        /* Wait to load from s1 until after the address adjustment,
           to avoid unlamination of the loads.  */

        /* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
           will have some 1s.  */
        vpxorq  -(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
        /* Ternary logic to xor -(VEC_SIZE * 2)(%rdi) with VEC(2)
           while oring with VEC(1).  Result is stored in VEC(2).  */
        vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
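        /* NB: imm8 0xde computes B | (A ^ C), with A = VEC(2) (the
           destination), B = VEC(1), and C = the memory operand, i.e.
           the s1/s2 xor for this VEC or'd with the previous xor
           result.  */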

        cmpl    $(VEC_SIZE * 6), %edx
        jbe     L(4x_last_2x_vec)

        VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
        vpxorq  -(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)

        VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
        vpxorq  -(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)

        /* Or together VEC(4), VEC(3), and VEC(2) into VEC(2).  */
        vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
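        /* NB: imm8 0xfe is the three-way OR A | B | C.  */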

        /* Compare VEC(2) with 0.  If any 1s, s1 and s2 don't match.  */
L(4x_last_2x_vec):
        VPTEST  %VMM(2), %VMM(2), %k1
        KMOV    %k1, %VRAX
        TO_32BIT (VRAX)
        ret


        .p2align 4,, 10
L(more_8x_vec):
        /* Set rdx to the end of s1 minus 4 * VEC_SIZE (the loop
           bound).  */
        leaq    -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
        /* rsi stores s2 - s1.  This allows the loop to only update
           one pointer.  */
        subq    %rdi, %rsi
        /* Align the s1 pointer.  */
        andq    $-VEC_SIZE, %rdi
        /* Adjust because the first 4x VEC were already checked.  */
        subq    $-(VEC_SIZE * 4), %rdi
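        /* NB: subtracting -(VEC_SIZE * 4) rather than adding keeps
           the immediate encodable as a sign-extended imm8 when
           VEC_SIZE == 32 (-128 fits, +128 does not).  */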
        .p2align 5,, 12
        .p2align 4,, 8
L(loop_4x_vec):
        VMOVU   (%rsi, %rdi), %VMM(1)
        vpxorq  (%rdi), %VMM(1), %VMM(1)

        VMOVU   VEC_SIZE(%rsi, %rdi), %VMM(2)
        vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)

        VMOVU   (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
        vpxorq  (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)

        VMOVU   (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
        vpxorq  (VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)

        vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
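        /* Or together the xor results in VEC(2), VEC(3), and VEC(4)
           into VEC(4); any set bit means a mismatch somewhere in this
           4x VEC block.  */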
        VPTEST  %VMM(4), %VMM(4), %k1
        KMOV    %k1, %VRAX
        TEST_ZERO (rax)
        jnz     L(return_neq2)
        subq    $-(VEC_SIZE * 4), %rdi
        cmpq    %rdx, %rdi
        jb      L(loop_4x_vec)
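        /* At most 4 * VEC_SIZE bytes remain before the end of s1.
           Handle the final 4 VECs relative to rdx (the end of s1
           minus 4 * VEC_SIZE); these loads may overlap data that was
           already checked.  */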

        subq    %rdx, %rdi

        VMOVU   (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
        vpxorq  (VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)
        /* rdi has 4 * VEC_SIZE - remaining length.  */

        /* Load regardless of branch.  */
        VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
        /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3)
           while oring with VEC(4).  Result is stored in VEC(4).  */
        vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)

        /* Separate logic as we can only use testb for VEC_SIZE == 64.  */
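        /* For VEC_SIZE == 64 the difference in edi is in [0, 256), so
           the sign bit of %dil is set exactly when at most
           2 * VEC_SIZE bytes remain.  */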
# if VEC_SIZE == 64
        testb   %dil, %dil
        js      L(8x_last_2x_vec)
# else
        cmpl    $(VEC_SIZE * 2), %edi
        jge     L(8x_last_2x_vec)
# endif
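        /* If the branch was taken, at most 2 * VEC_SIZE bytes remain
           and the last two VECs, already folded into VEC(4), cover
           them; otherwise the third- and fourth-to-last VECs are
           folded in below as well.  */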

        VMOVU   VEC_SIZE(%rsi, %rdx), %VMM(2)
        vpxorq  VEC_SIZE(%rdx), %VMM(2), %VMM(2)

        VMOVU   (%rsi, %rdx), %VMM(1)
        vpxorq  (%rdx), %VMM(1), %VMM(1)

        vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)
L(8x_last_1x_vec):
L(8x_last_2x_vec):
        VPTEST  %VMM(4), %VMM(4), %k1
        KMOV    %k1, %VRAX
        TO_32BIT_P1 (rax)
L(return_neq2):
        TO_32BIT_P2 (rax)
        ret

        .p2align 4,, 4
L(last_2x_vec):
        VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
        vpxorq  -(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
        VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
        vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
        VPTEST  %VMM(2), %VMM(2), %k1
        KMOV    %k1, %VRAX
        TO_32BIT (VRAX)
        ret

        /* evex256: 1 byte from next cache line.  evex512: 15 bytes
           from next cache line.  */
END (MEMCMPEQ)
#endif


source code of glibc/sysdeps/x86_64/multiarch/memcmpeq-evex.S