1 | /* memcmp with SSE2 |
2 | Copyright (C) 2009-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | #ifdef USE_AS_WMEMCMP |
22 | # define PCMPEQ pcmpeqd |
23 | # define CHAR_SIZE 4 |
24 | # define SIZE_OFFSET (0) |
25 | #else |
26 | # define PCMPEQ pcmpeqb |
27 | # define CHAR_SIZE 1 |
28 | #endif |
29 | |
30 | #ifdef USE_AS_MEMCMPEQ |
31 | # define SIZE_OFFSET (0) |
32 | # define CHECK_CMP(x, y) subl x, y |
33 | #else |
34 | # ifndef SIZE_OFFSET |
35 | # define SIZE_OFFSET (CHAR_PER_VEC * 2) |
36 | # endif |
37 | # define CHECK_CMP(x, y) cmpl x, y |
38 | #endif |
39 | |
40 | #define VEC_SIZE 16 |
41 | #define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) |
42 | |
43 | #ifndef MEMCMP |
44 | # define MEMCMP memcmp |
45 | #endif |
46 | |
47 | .text |
48 | ENTRY(MEMCMP) |
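 | /* Input (SysV ABI): rdi = s1, rsi = s2, rdx = length in CHAR_SIZE |
 | units (bytes for memcmp/__memcmpeq, wide chars for wmemcmp). */ |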
49 | # ifdef __ILP32__ |
50 | /* Clear the upper 32 bits. */ |
51 | movl %edx, %edx |
52 | # endif |
53 | #ifdef USE_AS_WMEMCMP |
54 | /* Use 0xffff to test for mismatches in the pmovmskb bitmask. |
55 | Store it in ecx for code size. This is preferable to using |
56 | `incw` as it avoids partial register stalls on older hardware |
57 | (pre-SnB). */ |
58 | movl $0xffff, %ecx |
59 | #endif |
60 | cmpq $CHAR_PER_VEC, %rdx |
61 | ja L(more_1x_vec) |
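 | /* n <= CHAR_PER_VEC: compare one chunk anchored at the start of |
 | the buffers and one anchored at the end (the two overlap for |
 | in-between sizes). n = [0, 1] is handled via L(cmp_0_1). */ |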
62 | |
63 | #ifdef USE_AS_WMEMCMP |
64 | /* Saves a byte of code by keeping the fall-through path for |
65 | n = [2, 4] in the initial cache line. */ |
66 | decl %edx |
67 | jle L(cmp_0_1) |
68 | |
69 | movq (%rsi), %xmm0 |
70 | movq (%rdi), %xmm1 |
71 | PCMPEQ %xmm0, %xmm1 |
72 | pmovmskb %xmm1, %eax |
73 | subl %ecx, %eax |
74 | jnz L(ret_nonzero_vec_start_0) |
75 | |
76 | movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0 |
77 | movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1 |
78 | PCMPEQ %xmm0, %xmm1 |
79 | pmovmskb %xmm1, %eax |
80 | subl %ecx, %eax |
81 | jnz L(ret_nonzero_vec_end_0_adj) |
82 | #else |
83 | cmpl $8, %edx |
84 | ja L(cmp_9_16) |
85 | |
86 | cmpl $4, %edx |
87 | jb L(cmp_0_3) |
88 | |
89 | # ifdef USE_AS_MEMCMPEQ |
90 | movl (%rsi), %eax |
91 | subl (%rdi), %eax |
92 | |
93 | movl -4(%rsi, %rdx), %esi |
94 | subl -4(%rdi, %rdx), %esi |
95 | |
96 | orl %esi, %eax |
97 | ret |
98 | # else |
99 | /* Combine comparisons for lo and hi 4-byte comparisons. */ |
100 | movl -4(%rsi, %rdx), %ecx |
101 | movl -4(%rdi, %rdx), %eax |
102 | shlq $32, %rcx |
103 | shlq $32, %rax |
104 | movl (%rsi), %esi |
105 | movl (%rdi), %edi |
106 | orq %rsi, %rcx |
107 | orq %rdi, %rax |
108 | /* Only compute proper return if not-equal. */ |
109 | cmpq %rcx, %rax |
110 | jnz L(ret_nonzero) |
111 | xorl %eax, %eax |
112 | ret |
113 | # endif |
114 | |
115 | .p2align 4,, 10 |
116 | L(cmp_9_16): |
117 | # ifdef USE_AS_MEMCMPEQ |
118 | movq (%rsi), %rax |
119 | subq (%rdi), %rax |
120 | |
121 | movq -8(%rsi, %rdx), %rcx |
122 | subq -8(%rdi, %rdx), %rcx |
123 | orq %rcx, %rax |
124 | /* Convert 64 bit -> 32 bit boolean (we should have made the ABI |
125 | return long). */ |
126 | setnz %cl |
127 | movzbl %cl, %eax |
128 | # else |
129 | movq (%rsi), %rcx |
130 | movq (%rdi), %rax |
131 | /* Only compute proper return if not-equal. */ |
132 | cmpq %rcx, %rax |
133 | jnz L(ret_nonzero) |
134 | |
135 | movq -8(%rsi, %rdx, CHAR_SIZE), %rcx |
136 | movq -8(%rdi, %rdx, CHAR_SIZE), %rax |
137 | /* Only compute proper return if not-equal. */ |
138 | cmpq %rcx, %rax |
139 | jnz L(ret_nonzero) |
140 | xorl %eax, %eax |
141 | # endif |
142 | #endif |
143 | ret |
144 | |
145 | .p2align 4,, 8 |
146 | L(cmp_0_1): |
147 | /* Flag set by earlier comparison against 1. */ |
148 | jne L(cmp_0_0) |
149 | #ifdef USE_AS_WMEMCMP |
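 | /* Branchless sign: edx is zeroed, setg makes it 1 iff the s1 |
 | element is greater, and the lea computes 2 * edx - 1, i.e. 1 or |
 | -1. Elements are compared as signed ints (wchar_t is signed |
 | here). */ |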
150 | movl (%rdi), %ecx |
151 | xorl %edx, %edx |
152 | cmpl (%rsi), %ecx |
153 | je L(cmp_0_0) |
154 | setg %dl |
155 | leal -1(%rdx, %rdx), %eax |
156 | #else |
157 | movzbl (%rdi), %eax |
158 | movzbl (%rsi), %ecx |
159 | subl %ecx, %eax |
160 | #endif |
161 | ret |
162 | |
163 | /* Fits in the alignment padding bytes. */ |
164 | L(cmp_0_0): |
165 | xorl %eax, %eax |
166 | ret |
167 | |
168 | #ifdef USE_AS_WMEMCMP |
169 | .p2align 4 |
170 | L(ret_nonzero_vec_start_0): |
171 | bsfl %eax, %eax |
172 | movl (%rdi, %rax), %ecx |
173 | xorl %edx, %edx |
174 | cmpl (%rsi, %rax), %ecx |
175 | /* NB: no partial register stall here because of the xorl zero |
176 | idiom above. */ |
177 | setg %dl |
178 | leal -1(%rdx, %rdx), %eax |
179 | ret |
180 | #else |
181 | |
182 | # ifndef USE_AS_MEMCMPEQ |
183 | .p2align 4,, 14 |
184 | L(ret_nonzero): |
185 | /* Need to bswap to get a proper return value without a branch. */ |
186 | bswapq %rcx |
187 | bswapq %rax |
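 | /* After bswap the first byte in memory is the most significant, |
 | so an unsigned 64-bit compare orders the qwords the way memcmp |
 | orders bytes. subq sets CF iff s1 < s2, sbb turns that into 0 or |
 | -1, and `or $1` maps 0 to 1 (this path is only reached when the |
 | values differ). */ |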
188 | subq %rcx, %rax |
189 | sbbl %eax, %eax |
190 | orl $1, %eax |
191 | ret |
192 | # endif |
193 | |
194 | .p2align 4 |
195 | L(cmp_0_3): |
196 | # ifdef USE_AS_MEMCMPEQ |
197 | /* No reason to add to the dependency chain on rdx. Saving the |
198 | bytes here doesn't change the number of fetch blocks. */ |
199 | cmpl $1, %edx |
200 | jbe L(cmp_0_1) |
201 | # else |
202 | /* We need the code-size saving here to prevent taking an extra |
203 | fetch block. */ |
204 | decl %edx |
205 | jle L(cmp_0_1) |
206 | # endif |
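 | /* n = [2, 3]: compare the 2-byte prefix and the final byte (the |
 | final byte overlaps the prefix when n == 2). */ |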
207 | movzwl (%rsi), %ecx |
208 | movzwl (%rdi), %eax |
209 | |
210 | # ifdef USE_AS_MEMCMPEQ |
211 | subl %ecx, %eax |
212 | |
213 | movzbl -1(%rsi, %rdx), %esi |
214 | movzbl -1(%rdi, %rdx), %edi |
215 | subl %edi, %esi |
216 | orl %esi, %eax |
217 | # else |
218 | bswapl %ecx |
219 | bswapl %eax |
220 | |
221 | /* Implicit right shift by one. We just need to displace the |
222 | sign bits. */ |
223 | shrl %ecx |
224 | shrl %eax |
225 | |
226 | /* Eat a partial register stall here. This saves code size, |
227 | stopping L(cmp_0_3) from bleeding into the next fetch block, |
228 | and saves an ALU op. */ |
229 | movb (%rsi, %rdx), %cl |
230 | movzbl (%rdi, %rdx), %edi |
231 | orl %edi, %eax |
232 | subl %ecx, %eax |
233 | # endif |
234 | ret |
235 | #endif |
236 | |
237 | .p2align 5 |
238 | L(more_1x_vec): |
239 | #ifndef USE_AS_WMEMCMP |
240 | /* Use 0xffff to test for mismatches in the pmovmskb bitmask. |
241 | Store it in ecx for code size. This is preferable to using |
242 | `incw` as it avoids partial register stalls on older hardware |
243 | (pre-SnB). */ |
244 | movl $0xffff, %ecx |
245 | #endif |
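 | /* Match test used throughout: PCMPEQ leaves all-ones in every |
 | matching element, so pmovmskb yields 0xffff when the whole vector |
 | matches and subtracting the 0xffff kept in ecx gives zero iff |
 | equal. On a mismatch the low 16 bits become mask + 1, whose |
 | lowest set bit (found with bsf in the return paths) is the first |
 | mismatching byte. E.g. mask 0xf0ff: 0xf0ff - 0xffff = 0xfffff100 |
 | and bsf gives 8. */ |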
246 | movups (%rsi), %xmm0 |
247 | movups (%rdi), %xmm1 |
248 | PCMPEQ %xmm0, %xmm1 |
249 | pmovmskb %xmm1, %eax |
250 | subl %ecx, %eax |
251 | jnz L(ret_nonzero_vec_start_0) |
252 | #if SIZE_OFFSET == 0 |
253 | cmpq $(CHAR_PER_VEC * 2), %rdx |
254 | #else |
255 | /* Offset rdx. Saves just enough code size to keep the |
256 | L(last_2x_vec) case and the non-zero return in a single |
257 | cache line. */ |
258 | subq $(CHAR_PER_VEC * 2), %rdx |
259 | #endif |
260 | ja L(more_2x_vec) |
261 | |
262 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 |
263 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 |
264 | PCMPEQ %xmm0, %xmm1 |
265 | pmovmskb %xmm1, %eax |
266 | subl %ecx, %eax |
267 | #ifndef USE_AS_MEMCMPEQ |
268 | /* Don't use `incw ax` as machines this code runs on are liable |
269 | to have partial register stalls. */ |
270 | jnz L(ret_nonzero_vec_end_0) |
271 | #else |
272 | /* Various return targets for memcmpeq. Will always be hot in |
273 | Icache and get short encoding. */ |
274 | L(ret_nonzero_vec_start_1): |
275 | L(ret_nonzero_vec_start_0): |
276 | L(ret_nonzero_vec_end_0): |
277 | #endif |
278 | ret |
279 | |
280 | #ifndef USE_AS_MEMCMPEQ |
281 | # ifdef USE_AS_WMEMCMP |
282 | .p2align 4 |
283 | L(ret_nonzero_vec_end_0_adj): |
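 | /* The small-size wmemcmp path decremented edx and loaded the last |
 | two wide chars with movq; add 3 so the shared code below, which |
 | is written for a full-vector load ending at the buffer end, |
 | addresses the same bytes. */ |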
284 | addl $3, %edx |
285 | # else |
286 | .p2align 4,, 8 |
287 | # endif |
288 | L(ret_nonzero_vec_end_0): |
289 | bsfl %eax, %eax |
290 | # ifdef USE_AS_WMEMCMP |
291 | leal (%rax, %rdx, CHAR_SIZE), %eax |
292 | movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx |
293 | xorl %edx, %edx |
294 | cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx |
295 | /* NB: no partial register stall here because of the xorl zero |
296 | idiom above. */ |
297 | setg %dl |
298 | leal -1(%rdx, %rdx), %eax |
299 | # else |
300 | /* Use `addq` instead of `addl` here so that even if `rax` + `rdx` |
301 | is negative, the value of the sum is still usable as a 64-bit |
302 | offset (negative 32-bit numbers zero-extend to large and often |
303 | out-of-bounds 64-bit offsets). Note that `rax` + `rdx` >= 0 is |
304 | an invariant when `memcmp` is used correctly, but if the input |
305 | strings `rsi`/`rdi` are concurrently modified while the function |
306 | runs (i.e. there is a data race) it is possible for `rax` + `rdx` |
307 | to be negative. Given that there is virtually no extra cost to |
308 | using `addq` instead of `addl`, we may as well protect the |
309 | data-race case. */ |
310 | addq %rdx, %rax |
311 | movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx |
312 | movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax |
313 | subl %ecx, %eax |
314 | # endif |
315 | ret |
316 | # ifndef USE_AS_WMEMCMP |
317 | .p2align 4,, 10 |
318 | L(ret_nonzero_vec_start_0): |
319 | bsfl %eax, %eax |
320 | movzbl (%rsi, %rax), %ecx |
321 | movzbl (%rdi, %rax), %eax |
322 | subl %ecx, %eax |
323 | ret |
324 | # endif |
325 | #else |
326 | #endif |
327 | |
328 | .p2align 5 |
329 | L(more_2x_vec): |
330 | movups (VEC_SIZE * 1)(%rsi), %xmm0 |
331 | movups (VEC_SIZE * 1)(%rdi), %xmm1 |
332 | PCMPEQ %xmm0, %xmm1 |
333 | pmovmskb %xmm1, %eax |
334 | subl %ecx, %eax |
335 | jnz L(ret_nonzero_vec_start_1) |
336 | |
337 | cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx |
338 | jbe L(last_2x_vec) |
339 | |
340 | cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx |
341 | ja L(more_8x_vec) |
342 | |
343 | /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time. |
344 | This can harm performance if there is a non-zero return in |
345 | [65, 80] or [97, 112], but helps performance otherwise. |
346 | Generally the zero-return case is hotter. */ |
347 | movups (VEC_SIZE * 2)(%rsi), %xmm0 |
348 | movups (VEC_SIZE * 2)(%rdi), %xmm1 |
349 | PCMPEQ %xmm0, %xmm1 |
350 | movups (VEC_SIZE * 3)(%rsi), %xmm2 |
351 | movups (VEC_SIZE * 3)(%rdi), %xmm3 |
352 | PCMPEQ %xmm2, %xmm3 |
353 | pand %xmm1, %xmm3 |
354 | |
355 | pmovmskb %xmm3, %eax |
356 | CHECK_CMP (%ecx, %eax) |
357 | jnz L(ret_nonzero_vec_start_2_3) |
358 | |
359 | cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx |
360 | jbe L(last_2x_vec) |
361 | |
362 | movups (VEC_SIZE * 4)(%rsi), %xmm0 |
363 | movups (VEC_SIZE * 4)(%rdi), %xmm1 |
364 | PCMPEQ %xmm0, %xmm1 |
365 | movups (VEC_SIZE * 5)(%rsi), %xmm2 |
366 | movups (VEC_SIZE * 5)(%rdi), %xmm3 |
367 | PCMPEQ %xmm2, %xmm3 |
368 | pand %xmm1, %xmm3 |
369 | |
370 | pmovmskb %xmm3, %eax |
371 | CHECK_CMP (%ecx, %eax) |
372 | #ifdef USE_AS_MEMCMPEQ |
373 | jz L(last_2x_vec) |
374 | ret |
375 | #else |
376 | jnz L(ret_nonzero_vec_start_4_5) |
377 | #endif |
378 | .p2align 4 |
379 | L(last_2x_vec): |
380 | movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 |
381 | movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 |
382 | PCMPEQ %xmm0, %xmm1 |
383 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2 |
384 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3 |
385 | PCMPEQ %xmm2, %xmm3 |
386 | pand %xmm1, %xmm3 |
387 | pmovmskb %xmm3, %eax |
388 | subl %ecx, %eax |
389 | #ifdef USE_AS_MEMCMPEQ |
390 | /* Various return targets for memcmpeq. Will always be hot in |
391 | Icache and get short encoding. */ |
392 | L(ret_nonzero_vec_start_2_3): |
393 | L(ret_nonzero_vec_start_4_5): |
394 | ret |
395 | #else |
396 | jnz L(ret_nonzero_vec_end_1) |
397 | ret |
398 | |
399 | .p2align 4,, 8 |
400 | L(ret_nonzero_vec_end_1): |
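 | /* eax holds the AND'ed pair's mask minus 0xffff: its low 16 bits |
 | (the mask plus one) have their lowest set bit at the pair's first |
 | mismatch. Rotating them into the high half and xor'ing in xmm1's |
 | mask puts the second-to-last vector's mismatch bits in bits 0-15, |
 | so a single bsf picks whichever mismatch comes first in memory. */ |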
401 | pmovmskb %xmm1, %ecx |
402 | /* High 16 bits of eax are guaranteed to be all ones. Rotate them |
403 | in so we can do `or + not` with just `xor`. */ |
404 | rorl $16, %eax |
405 | xorl %ecx, %eax |
406 | /* Partial register stall. */ |
407 | |
408 | bsfl %eax, %eax |
409 | # ifdef USE_AS_WMEMCMP |
410 | leal (%rax, %rdx, CHAR_SIZE), %eax |
411 | movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx |
412 | xorl %edx, %edx |
413 | cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx |
414 | /* NB: no partial register stall here because of the xorl zero |
415 | idiom above. */ |
416 | setg %dl |
417 | leal -1(%rdx, %rdx), %eax |
418 | # else |
419 | addl %edx, %eax |
420 | movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx |
421 | movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax |
422 | subl %ecx, %eax |
423 | # endif |
424 | ret |
425 | |
426 | .p2align 4 |
427 | L(ret_nonzero_vec_start_4_5): |
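 | /* eax is the AND'ed mask of the vectors at (VEC_SIZE * 4) and |
 | (VEC_SIZE * 5), edx the mask of the first of the two. After the |
 | shift, the +1 in the lea carries through edx's trailing ones, so |
 | the lowest set bit of the sum is the first mismatching byte |
 | counted from (VEC_SIZE * 4)(%rdi). */ |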
428 | pmovmskb %xmm1, %edx |
429 | sall $16, %eax |
430 | leal 1(%rax, %rdx), %eax |
431 | bsfl %eax, %eax |
432 | # ifdef USE_AS_WMEMCMP |
433 | movl (VEC_SIZE * 4)(%rdi, %rax), %ecx |
434 | xorl %edx, %edx |
435 | cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx |
436 | /* NB: no partial register stall here because of the xorl zero |
437 | idiom above. */ |
438 | setg %dl |
439 | leal -1(%rdx, %rdx), %eax |
440 | # else |
441 | movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx |
442 | movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax |
443 | subl %ecx, %eax |
444 | # endif |
445 | ret |
446 | |
447 | .p2align 4,, 8 |
448 | L(ret_nonzero_vec_start_1): |
449 | bsfl %eax, %eax |
450 | # ifdef USE_AS_WMEMCMP |
451 | movl (VEC_SIZE * 1)(%rdi, %rax), %ecx |
452 | xorl %edx, %edx |
453 | cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx |
454 | /* NB: no partial register stall here because of the xorl zero |
455 | idiom above. */ |
456 | setg %dl |
457 | leal -1(%rdx, %rdx), %eax |
458 | # else |
459 | movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx |
460 | movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax |
461 | subl %ecx, %eax |
462 | # endif |
463 | ret |
464 | #endif |
465 | |
466 | .p2align 4 |
467 | L(more_8x_vec): |
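 | /* Rewrite rsi as an offset from rdi, align rdi down to VEC_SIZE |
 | so the loop's PCMPEQ memory operands are aligned, then rebuild |
 | rsi at the same relative distance. rdx is converted from a length |
 | into the pointer that bounds the 4x loop below. */ |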
468 | subq %rdi, %rsi |
469 | leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx |
470 | andq $(VEC_SIZE * -1), %rdi |
471 | addq %rdi, %rsi |
472 | .p2align 4 |
473 | L(loop_4x): |
474 | movups (VEC_SIZE * 2)(%rsi), %xmm0 |
475 | movups (VEC_SIZE * 3)(%rsi), %xmm1 |
476 | |
477 | PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0 |
478 | PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1 |
479 | |
480 | movups (VEC_SIZE * 4)(%rsi), %xmm2 |
481 | movups (VEC_SIZE * 5)(%rsi), %xmm3 |
482 | |
483 | PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2 |
484 | PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3 |
485 | |
486 | pand %xmm0, %xmm1 |
487 | pand %xmm2, %xmm3 |
488 | pand %xmm1, %xmm3 |
489 | |
490 | pmovmskb %xmm3, %eax |
491 | subl %ecx, %eax |
492 | jnz L(ret_nonzero_loop) |
493 | |
494 | addq $(VEC_SIZE * 4), %rdi |
495 | addq $(VEC_SIZE * 4), %rsi |
496 | cmpq %rdi, %rdx |
497 | ja L(loop_4x) |
498 | /* Get remaining length in edx. */ |
499 | subl %edi, %edx |
500 | /* Restore offset so we can reuse L(last_2x_vec). */ |
501 | addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx |
502 | #ifdef USE_AS_WMEMCMP |
503 | shrl $2, %edx |
504 | #endif |
505 | cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx |
506 | jbe L(last_2x_vec) |
507 | |
508 | |
509 | movups (VEC_SIZE * 2)(%rsi), %xmm0 |
510 | movups (VEC_SIZE * 2)(%rdi), %xmm1 |
511 | PCMPEQ %xmm0, %xmm1 |
512 | movups (VEC_SIZE * 3)(%rsi), %xmm2 |
513 | movups (VEC_SIZE * 3)(%rdi), %xmm3 |
514 | PCMPEQ %xmm2, %xmm3 |
515 | pand %xmm1, %xmm3 |
516 | |
517 | pmovmskb %xmm3, %eax |
518 | CHECK_CMP (%ecx, %eax) |
519 | jz L(last_2x_vec) |
520 | #ifdef USE_AS_MEMCMPEQ |
521 | L(ret_nonzero_loop): |
522 | ret |
523 | #else |
524 | |
525 | .p2align 4 |
526 | L(ret_nonzero_vec_start_2_3): |
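 | /* Same mask-folding trick as L(ret_nonzero_vec_start_4_5): edx is |
 | the mask of the vector at (VEC_SIZE * 2), eax the AND'ed pair's |
 | mask, and the +1 carry makes bsf land on the first mismatching |
 | byte from (VEC_SIZE * 2)(%rdi). */ |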
527 | pmovmskb %xmm1, %edx |
528 | sall $16, %eax |
529 | leal 1(%rax, %rdx), %eax |
530 | |
531 | bsfl %eax, %eax |
532 | # ifdef USE_AS_WMEMCMP |
533 | movl (VEC_SIZE * 2)(%rdi, %rax), %ecx |
534 | xorl %edx, %edx |
535 | cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
536 | /* NB: no partial register stall here because of the xorl zero |
537 | idiom above. */ |
538 | setg %dl |
539 | leal -1(%rdx, %rdx), %eax |
540 | # else |
541 | movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
542 | movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax |
543 | subl %ecx, %eax |
544 | # endif |
545 | ret |
546 | |
547 | .p2align 4 |
548 | L(ret_nonzero_loop): |
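 | /* Locate the first mismatching byte across the four vectors. |
 | ecx/edx get the equality masks of the vectors at (VEC_SIZE * 2) |
 | and (VEC_SIZE * 3); the shift plus the +1 in the lea turn them |
 | into a word whose lowest set bit is their first mismatch. The |
 | rotate/xor on eax leaves the inverted mask of the vector at |
 | (VEC_SIZE * 4) in its low half and the all-four AND'ed mask plus |
 | one in its high half. Merged into one 64-bit value, bsfq yields |
 | a byte offset from (VEC_SIZE * 2)(%rdi). */ |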
549 | pmovmskb %xmm0, %ecx |
550 | pmovmskb %xmm1, %edx |
551 | sall $(VEC_SIZE * 1), %edx |
552 | leal 1(%rcx, %rdx), %edx |
553 | pmovmskb %xmm2, %ecx |
554 | /* High 16 bits of eax are guaranteed to be all ones. Rotate them |
555 | in so we can do `or + not` with just `xor`. */ |
556 | rorl $16, %eax |
557 | xorl %ecx, %eax |
558 | |
559 | salq $32, %rax |
560 | orq %rdx, %rax |
561 | |
562 | bsfq %rax, %rax |
563 | # ifdef USE_AS_WMEMCMP |
564 | movl (VEC_SIZE * 2)(%rdi, %rax), %ecx |
565 | xorl %edx, %edx |
566 | cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
567 | /* NB: no partial register stall here because of the xorl zero |
568 | idiom above. */ |
569 | setg %dl |
570 | leal -1(%rdx, %rdx), %eax |
571 | # else |
572 | movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
573 | movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax |
574 | subl %ecx, %eax |
575 | # endif |
576 | ret |
577 | #endif |
578 | END(MEMCMP) |
579 | |
580 | #ifndef USE_AS_WMEMCMP |
581 | # ifdef USE_AS_MEMCMPEQ |
582 | libc_hidden_def (MEMCMP) |
583 | # else |
584 | # undef bcmp |
585 | weak_alias (MEMCMP, bcmp) |
586 | libc_hidden_builtin_def (MEMCMP) |
587 | # endif |
588 | #endif |
589 | |