/* fast SSE2 memrchr with a 64-byte (4x VEC) loop.

   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#define VEC_SIZE 16
#define PAGE_SIZE 4096

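/* For reference, the semantics implemented below match this minimal C
   sketch (illustration only; this is not part of the build):

       void *
       memrchr (const void *s, int c, size_t n)
       {
         const unsigned char *p = (const unsigned char *) s + n;
         while (n--)
           if (*--p == (unsigned char) c)
             return (void *) p;
         return NULL;
       }

   The assembly performs the same search VEC_SIZE (16) bytes at a time,
   walking backwards from the end pointer.  */
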
        .text
ENTRY_P2ALIGN(__memrchr, 6)
#ifdef __ILP32__
        /* Clear upper bits.  */
        mov %RDX_LP, %RDX_LP
#endif
        movd %esi, %xmm0

        /* Get end pointer.  */
        leaq (%rdx, %rdi), %rcx

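        /* Broadcast CHAR to all 16 bytes of xmm0: punpcklbw doubles the
           byte into a word, punpcklwd widens that to a dword of four
           copies, and pshufd $0 splats the dword across the register.  */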
        punpcklbw %xmm0, %xmm0
        punpcklwd %xmm0, %xmm0
        pshufd $0, %xmm0, %xmm0

        /* Check if we can load 1x VEC without crossing a page.  ZF is
           set iff (rcx % PAGE_SIZE) < VEC_SIZE, i.e. iff the VEC_SIZE-byte
           load ending at rcx could touch the preceding page (an exactly
           page-aligned rcx is treated conservatively as a page cross).  */
        testl $(PAGE_SIZE - VEC_SIZE), %ecx
        jz L(page_cross)

        /* NB: This load happens regardless of whether rdx (len) is zero.
           Since it doesn't cross a page and the standard guarantees that
           any valid pointer has at least one addressable byte, this load
           must be safe.  For the entire history of the x86 memrchr
           implementation this has been possible, so no code "should" be
           relying on a zero-length check before this load.  The
           zero-length check is moved to the page-cross case because it is
           pretty cold, and including it here would push the hot case
           (len <= VEC_SIZE) onto 2 cache lines.  */
        movups -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax
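        /* Bit i of eax is now set iff byte i of the window
           [rcx - VEC_SIZE, rcx) equals CHAR, so the highest set bit
           marks the last match in the window.  */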

        subq $VEC_SIZE, %rdx
        ja L(more_1x_vec)
L(ret_vec_x0_test):
        /* bsr sets ZF if eax (src) is zero and leaves the destination
           unchanged in that case.  */
        bsrl %eax, %eax
        jz L(ret_0)
        /* Check if the CHAR match is in bounds.  rdx is <= 0 here and
           the match at bit eax lives at rdi + rdx + eax, so a negative
           sum means it precedes the buffer and eax must be truly
           zeroed.  */
        addl %edx, %eax
        jl L(zero_0)
        /* Since we already subtracted VEC_SIZE from rdx, we can just add
           the result to the base pointer.  */
        addq %rdi, %rax
L(ret_0):
        ret

        .p2align 4,, 5
L(ret_vec_x0):
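        /* The VEC just scanned ends at rcx, so the last match is at
           rcx - VEC_SIZE + bsr(eax).  */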
        bsrl %eax, %eax
        leaq -(VEC_SIZE)(%rcx, %rax), %rax
        ret

        .p2align 4,, 2
L(zero_0):
        xorl %eax, %eax
        ret


        .p2align 4,, 8
L(more_1x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x0)

        /* Align rcx (the end pointer) down to a VEC_SIZE boundary.  The
           decq first means an already-aligned rcx still steps back a
           full VEC; bytes that overlap the unaligned first VEC simply
           get rechecked, which is harmless.  */
        decq %rcx
        andq $-VEC_SIZE, %rcx

        movq %rcx, %rdx
        /* NB: We could consistently save 1 byte in this pattern with
           `movaps %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`.  The reason
           against it is that it adds more frontend uops (even if the
           moves can be eliminated) and, some percentage of the time,
           actual backend uops.  */
        movaps -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        subq %rdi, %rdx
        pmovmskb %xmm1, %eax

        cmpq $(VEC_SIZE * 2), %rdx
        ja L(more_2x_vec)
L(last_2x_vec):
        subl $VEC_SIZE, %edx
        jbe L(ret_vec_x0_test)

        testl %eax, %eax
        jnz L(ret_vec_x0)

        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

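        /* After the subl below, edx = (aligned end) - rdi - 2 * VEC_SIZE;
           a match at bit b of eax sits at rdi + edx + b, so jl drops
           matches that precede the buffer start.  */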
        subl $VEC_SIZE, %edx
        bsrl %eax, %eax
        jz L(ret_1)
        addl %edx, %eax
        jl L(zero_0)
        addq %rdi, %rax
L(ret_1):
        ret

        /* Don't align.  Otherwise we lose the 2-byte encoding of the
           jump to L(page_cross), which causes the hot path
           (length <= VEC_SIZE) to span multiple cache lines.  Naturally
           aligned % 16 to 8 bytes.  */
L(page_cross):
        /* Zero length check.  */
        testq %rdx, %rdx
        jz L(zero_0)

        leaq -1(%rcx), %r8
        andq $-(VEC_SIZE), %r8

        movaps (%r8), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %esi
        /* Shift out the out-of-bounds bytes; the count is the negative
           alignment of the end pointer (because we are starting from
           endptr and working backwards).  */
        negl %ecx
        /* The shift is 32-bit but VEC_SIZE is 16, so the hardware's
           count masking (mod 32) doesn't help; mask the count to
           VEC_SIZE - 1 explicitly.  */
        andl $(VEC_SIZE - 1), %ecx
        shl %cl, %esi
        movzwl %si, %eax
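        /* Worked example: if rcx % VEC_SIZE == 5, the shift count is 11;
           the bits for the 5 valid bytes (r8 .. rcx - 1) land in bits
           11..15 of si, while bits for bytes at or past rcx are pushed
           above bit 15 and dropped by the movzwl.  The mask now looks as
           if the load had ended exactly at rcx.  */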
        leaq (%rdi, %rdx), %rcx
        cmpq %rdi, %r8
        ja L(more_1x_vec)
        subl $VEC_SIZE, %edx
        bsrl %eax, %eax
        jz L(ret_2)
        addl %edx, %eax
        jl L(zero_1)
        addq %rdi, %rax
L(ret_2):
        ret

        /* Fits in the aligning bytes.  */
L(zero_1):
        xorl %eax, %eax
        ret

        .p2align 4,, 5
L(ret_vec_x1):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
        ret

        .p2align 4,, 8
L(more_2x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x0)

        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        testl %eax, %eax
        jnz L(ret_vec_x1)


        movaps -(VEC_SIZE * 3)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subq $(VEC_SIZE * 4), %rdx
        ja L(more_4x_vec)

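        /* edx is (aligned end) - rdi - 4 * VEC_SIZE here; adding
           VEC_SIZE back re-biases it for the VEC at
           (aligned end) - 3 * VEC_SIZE that L(ret_vec_x2_test) checks.  */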
        addl $(VEC_SIZE), %edx
        jle L(ret_vec_x2_test)

L(last_vec):
        testl %eax, %eax
        jnz L(ret_vec_x2)

        movaps -(VEC_SIZE * 4)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subl $(VEC_SIZE), %edx
        bsrl %eax, %eax
        jz L(ret_3)
        addl %edx, %eax
        jl L(zero_2)
        addq %rdi, %rax
L(ret_3):
        ret

        .p2align 4,, 6
L(ret_vec_x2_test):
        bsrl %eax, %eax
        jz L(zero_2)
        addl %edx, %eax
        jl L(zero_2)
        addq %rdi, %rax
        ret

L(zero_2):
        xorl %eax, %eax
        ret


        .p2align 4,, 5
L(ret_vec_x2):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
        ret

        .p2align 4,, 5
L(ret_vec_x3):
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
        ret

        .p2align 4,, 8
L(more_4x_vec):
        testl %eax, %eax
        jnz L(ret_vec_x2)

        movaps -(VEC_SIZE * 4)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        testl %eax, %eax
        jnz L(ret_vec_x3)

        addq $-(VEC_SIZE * 4), %rcx
        cmpq $(VEC_SIZE * 4), %rdx
        jbe L(last_4x_vec)

        /* Offset everything by 4x VEC_SIZE here to save a few bytes at
           the end, keeping the code from spilling onto the next cache
           line.  */
        addq $(VEC_SIZE * 4 - 1), %rcx
        andq $-(VEC_SIZE * 4), %rcx
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        andq $-(VEC_SIZE * 4), %rdx
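        /* rcx is now rounded up to a (VEC_SIZE * 4) boundary, and rdx is
           the first such boundary strictly above rdi.  The loop walks
           64-byte blocks downward until the two meet; L(last_4x_vec)
           handles the remaining tail.  */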

        .p2align 4,, 11
L(loop_4x_vec):
        movaps (VEC_SIZE * -1)(%rcx), %xmm1
        movaps (VEC_SIZE * -2)(%rcx), %xmm2
        movaps (VEC_SIZE * -3)(%rcx), %xmm3
        movaps (VEC_SIZE * -4)(%rcx), %xmm4
        pcmpeqb %xmm0, %xmm1
        pcmpeqb %xmm0, %xmm2
        pcmpeqb %xmm0, %xmm3
        pcmpeqb %xmm0, %xmm4

        por %xmm1, %xmm2
        por %xmm3, %xmm4
        por %xmm2, %xmm4
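        /* The three por instructions fold the four compare results
           together so a single pmovmskb/test detects a match anywhere in
           the 64-byte block; L(loop_end) re-derives which VEC actually
           matched.  */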

        pmovmskb %xmm4, %esi
        testl %esi, %esi
        jnz L(loop_end)

        addq $-(VEC_SIZE * 4), %rcx
        cmpq %rdx, %rcx
        jne L(loop_4x_vec)

        subl %edi, %edx
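        /* The loop exited with rcx == rdx (the aligned lower bound), so
           edx now becomes the count of unsearched bytes between rdi and
           rcx, which the tail code below expects.  */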

        /* Ends up being 1-byte nop.  */
        .p2align 4,, 2
L(last_4x_vec):
        movaps -(VEC_SIZE)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        cmpl $(VEC_SIZE * 2), %edx
        jbe L(last_2x_vec)

        testl %eax, %eax
        jnz L(ret_vec_x0)


        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        testl %eax, %eax
        jnz L(ret_vec_end)

        movaps -(VEC_SIZE * 3)(%rcx), %xmm1
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %eax

        subl $(VEC_SIZE * 3), %edx
        ja L(last_vec)
        bsrl %eax, %eax
        jz L(ret_4)
        addl %edx, %eax
        jl L(zero_3)
        addq %rdi, %rax
L(ret_4):
        ret

        /* Ends up being 1-byte nop.  */
        .p2align 4,, 3
L(loop_end):
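        /* VEC1's mask is shifted into bits 16..31 below so that the
           single (VEC_SIZE * -2) displacement in L(ret_vec_end) is
           correct for a match in either VEC1 or VEC2; sall also sets ZF,
           so the branch needs no separate test.  */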
        pmovmskb %xmm1, %eax
        sall $16, %eax
        jnz L(ret_vec_end)

        pmovmskb %xmm2, %eax
        testl %eax, %eax
        jnz L(ret_vec_end)

        pmovmskb %xmm3, %eax
        /* Combine the last 2 VEC matches.  eax holds VEC3's mask and esi
           the combined mask (only VEC3/VEC4 bits can still be set here).
           If eax is zero (no CHAR in VEC3) the or leaves esi's VEC4 bits
           to decide the result; if eax is non-zero, CHAR is in VEC3 and
           bsrl will use that (higher) position.  */
        sall $16, %eax
        orl %esi, %eax
        bsrl %eax, %eax
        leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
        ret

L(ret_vec_end):
        bsrl %eax, %eax
        leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
        ret
        /* Used in L(last_4x_vec); in the same cache line.  This just
           uses the spare aligning bytes.  */
L(zero_3):
        xorl %eax, %eax
        ret
        /* 2 bytes from the next cache line.  */
END(__memrchr)
weak_alias (__memrchr, memrchr)