1 | /* Optimized wcscmp for x86-64 with SSE2. |
2 | Copyright (C) 2011-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | /* Note: wcscmp uses signed comparison, not unsigned as in strcmp function. */ |
22 | |
23 | .text |
24 | ENTRY (__wcscmp) /* wcscmp (s1 in %rdi, s2 in %rsi); result in %eax: 0 if equal, 1 if s1 > s2, -1 if s1 < s2 (elements compared as signed 32-bit values) */ |
25 | /* |
26 | * This implementation uses SSE to compare up to 16 bytes at a time. |
27 | */ |
28 | mov %esi, %eax /* eax = low 32 bits of the s2 pointer */ |
29 | mov %edi, %edx /* edx = low 32 bits of the s1 pointer */ |
30 | pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ |
31 | mov %al, %ch /* ch = low byte of rsi; reduced to (rsi & 15) later by the L(continue_N) dispatchers */ |
32 | mov %dl, %cl /* cl = low byte of rdi */ |
33 | and $63, %eax /* rsi alignment in cache line */ |
34 | and $63, %edx /* rdi alignment in cache line */ |
35 | and $15, %cl /* cl = rdi & 15 */ |
36 | jz L(continue_00) /* rdi is 16-byte aligned */ |
37 | cmp $16, %edx |
38 | jb L(continue_0) /* rdi offset (mod 64) in 1..15 */ |
39 | cmp $32, %edx |
40 | jb L(continue_16) /* rdi offset (mod 64) in 17..31 */ |
41 | cmp $48, %edx |
42 | jb L(continue_32) /* rdi offset (mod 64) in 33..47; otherwise fall through with offset in 49..63 */ |
43 | |
44 | L(continue_48): /* rdi is unaligned with offset (mod 64) in 49..63; dispatch on the offset of rsi (eax = rsi & 63, ch = low byte of rsi) */ |
45 | and $15, %ch /* ch = rsi & 15 */ |
46 | jz L(continue_48_00) /* rsi is 16-byte aligned */ |
47 | cmp $16, %eax |
48 | jb L(continue_0_48) /* rsi offset in 1..15 */ |
49 | cmp $32, %eax |
50 | jb L(continue_16_48) /* rsi offset in 17..31 */ |
51 | cmp $48, %eax |
52 | jb L(continue_32_48) /* rsi offset in 33..47; otherwise fall through with rsi offset in 49..63 */ |
53 | |
54 | .p2align 4 |
55 | L(continue_48_48): /* Loop for both pointers unaligned with offset (mod 64) in 49..63: 4 scalar compares, then 16-byte vector compares at +16, +32, +48, then advance both pointers by 64. Expects %xmm0 == 0 on entry. */ |
56 | mov (%rsi), %ecx /* scalar here: a 16-byte load at this offset would extend past the current 64-byte block */ |
57 | cmp %ecx, (%rdi) /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
58 | jne L(nequal) |
59 | test %ecx, %ecx |
60 | jz L(equal) /* elements equal and null: both strings end here */ |
61 | |
62 | mov 4(%rsi), %ecx |
63 | cmp %ecx, 4(%rdi) |
64 | jne L(nequal) |
65 | test %ecx, %ecx |
66 | jz L(equal) |
67 | |
68 | mov 8(%rsi), %ecx |
69 | cmp %ecx, 8(%rdi) |
70 | jne L(nequal) |
71 | test %ecx, %ecx |
72 | jz L(equal) |
73 | |
74 | mov 12(%rsi), %ecx |
75 | cmp %ecx, 12(%rdi) |
76 | jne L(nequal) |
77 | test %ecx, %ecx |
78 | jz L(equal) |
79 | |
80 | movdqu 16(%rdi), %xmm1 |
81 | movdqu 16(%rsi), %xmm2 |
82 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
83 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
84 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
85 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
86 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
87 | jnz L(less4_double_words_16) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
88 | |
89 | movdqu 32(%rdi), %xmm1 |
90 | movdqu 32(%rsi), %xmm2 |
91 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
92 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
93 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
94 | pmovmskb %xmm1, %edx |
95 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
96 | jnz L(less4_double_words_32) |
97 | |
98 | movdqu 48(%rdi), %xmm1 |
99 | movdqu 48(%rsi), %xmm2 |
100 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
101 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
102 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
103 | pmovmskb %xmm1, %edx |
104 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
105 | jnz L(less4_double_words_48) |
106 | |
107 | add $64, %rsi /* advancing by 64 keeps both offsets (mod 64) unchanged */ |
108 | add $64, %rdi |
109 | jmp L(continue_48_48) |
110 | |
111 | L(continue_0): /* rdi is unaligned with offset (mod 64) in 1..15; dispatch on the offset of rsi (eax = rsi & 63, ch = low byte of rsi) */ |
112 | and $15, %ch /* ch = rsi & 15 */ |
113 | jz L(continue_0_00) /* rsi is 16-byte aligned */ |
114 | cmp $16, %eax |
115 | jb L(continue_0_0) /* rsi offset in 1..15 */ |
116 | cmp $32, %eax |
117 | jb L(continue_0_16) /* rsi offset in 17..31 */ |
118 | cmp $48, %eax |
119 | jb L(continue_0_32) /* rsi offset in 33..47; otherwise fall through with rsi offset in 49..63 */ |
120 | |
121 | .p2align 4 |
122 | L(continue_0_48): /* Loop for one pointer at offset (mod 64) 1..15 and the other at 49..63 (either order): 4 scalar compares, vector compares at +16 and +32, 4 scalar compares at +48..+60, then advance both pointers by 64. Expects %xmm0 == 0 on entry. */ |
123 | mov (%rsi), %ecx /* scalar: the pointer at offset 49..63 cannot take a 16-byte load without leaving its 64-byte block */ |
124 | cmp %ecx, (%rdi) /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
125 | jne L(nequal) |
126 | test %ecx, %ecx |
127 | jz L(equal) /* elements equal and null: both strings end here */ |
128 | |
129 | mov 4(%rsi), %ecx |
130 | cmp %ecx, 4(%rdi) |
131 | jne L(nequal) |
132 | test %ecx, %ecx |
133 | jz L(equal) |
134 | |
135 | mov 8(%rsi), %ecx |
136 | cmp %ecx, 8(%rdi) |
137 | jne L(nequal) |
138 | test %ecx, %ecx |
139 | jz L(equal) |
140 | |
141 | mov 12(%rsi), %ecx |
142 | cmp %ecx, 12(%rdi) |
143 | jne L(nequal) |
144 | test %ecx, %ecx |
145 | jz L(equal) |
146 | |
147 | movdqu 16(%rdi), %xmm1 |
148 | movdqu 16(%rsi), %xmm2 |
149 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
150 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
151 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
152 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
153 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
154 | jnz L(less4_double_words_16) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
155 | |
156 | movdqu 32(%rdi), %xmm1 |
157 | movdqu 32(%rsi), %xmm2 |
158 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
159 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
160 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
161 | pmovmskb %xmm1, %edx |
162 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
163 | jnz L(less4_double_words_32) |
164 | |
165 | mov 48(%rsi), %ecx /* scalar again: at +48 the pointer that started at offset 1..15 is now at 49..63 */ |
166 | cmp %ecx, 48(%rdi) |
167 | jne L(nequal) |
168 | test %ecx, %ecx |
169 | jz L(equal) |
170 | |
171 | mov 52(%rsi), %ecx |
172 | cmp %ecx, 52(%rdi) |
173 | jne L(nequal) |
174 | test %ecx, %ecx |
175 | jz L(equal) |
176 | |
177 | mov 56(%rsi), %ecx |
178 | cmp %ecx, 56(%rdi) |
179 | jne L(nequal) |
180 | test %ecx, %ecx |
181 | jz L(equal) |
182 | |
183 | mov 60(%rsi), %ecx |
184 | cmp %ecx, 60(%rdi) |
185 | jne L(nequal) |
186 | test %ecx, %ecx |
187 | jz L(equal) |
188 | |
189 | add $64, %rsi /* advancing by 64 keeps both offsets (mod 64) unchanged */ |
190 | add $64, %rdi |
191 | jmp L(continue_0_48) |
192 | |
193 | .p2align 4 |
194 | L(continue_00): /* rdi is 16-byte aligned; dispatch on the offset of rsi (eax = rsi & 63, ch = low byte of rsi) */ |
195 | and $15, %ch /* ch = rsi & 15 */ |
196 | jz L(continue_00_00) /* rsi is 16-byte aligned as well */ |
197 | cmp $16, %eax |
198 | jb L(continue_00_0) /* rsi offset in 1..15 */ |
199 | cmp $32, %eax |
200 | jb L(continue_00_16) /* rsi offset in 17..31 */ |
201 | cmp $48, %eax |
202 | jb L(continue_00_32) /* rsi offset in 33..47; otherwise fall through with rsi offset in 49..63 */ |
203 | |
204 | .p2align 4 |
205 | L(continue_00_48): /* Loop for rdi 16-byte aligned and rsi at offset (mod 64) 49..63: null-check 16 bytes of s1, compare 4 elements one at a time, then vector compares at +16, +32, +48, then advance both pointers by 64. Expects %xmm0 == 0 on entry. */ |
206 | pcmpeqd (%rdi), %xmm0 /* xmm0 = all-ones in every double_word of the first 16 bytes of s1 that is null */ |
207 | mov (%rdi), %eax /* eax = first s1 element; L(less4_double_words1) expects it there */ |
208 | pmovmskb %xmm0, %ecx |
209 | test %ecx, %ecx |
210 | jnz L(less4_double_words1) /* s1 has a null in its next 4 elements: take the scalar path that tests for the terminator */ |
211 | |
212 | cmp (%rsi), %eax /* no null tests needed below: s1 has none in these 4 elements, so a null in s2 shows up as a mismatch */ |
213 | jne L(nequal) /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
214 | |
215 | mov 4(%rdi), %eax |
216 | cmp 4(%rsi), %eax |
217 | jne L(nequal) |
218 | |
219 | mov 8(%rdi), %eax |
220 | cmp 8(%rsi), %eax |
221 | jne L(nequal) |
222 | |
223 | mov 12(%rdi), %eax |
224 | cmp 12(%rsi), %eax |
225 | jne L(nequal) |
226 | |
227 | movdqu 16(%rsi), %xmm2 |
228 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
229 | pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
230 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
231 | pmovmskb %xmm2, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
232 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
233 | jnz L(less4_double_words_16) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
234 | |
235 | movdqu 32(%rsi), %xmm2 |
236 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
237 | pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
238 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
239 | pmovmskb %xmm2, %edx |
240 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
241 | jnz L(less4_double_words_32) |
242 | |
243 | movdqu 48(%rsi), %xmm2 |
244 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
245 | pcmpeqd 48(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
246 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
247 | pmovmskb %xmm2, %edx |
248 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
249 | jnz L(less4_double_words_48) |
250 | |
251 | add $64, %rsi /* advancing by 64 keeps both offsets (mod 64) unchanged */ |
252 | add $64, %rdi |
253 | jmp L(continue_00_48) |
254 | |
255 | .p2align 4 |
256 | L(continue_32): /* rdi is unaligned with offset (mod 64) in 33..47; dispatch on the offset of rsi (eax = rsi & 63, ch = low byte of rsi) */ |
257 | and $15, %ch /* ch = rsi & 15 */ |
258 | jz L(continue_32_00) /* rsi is 16-byte aligned */ |
259 | cmp $16, %eax |
260 | jb L(continue_0_32) /* rsi offset in 1..15 */ |
261 | cmp $32, %eax |
262 | jb L(continue_16_32) /* rsi offset in 17..31 */ |
263 | cmp $48, %eax |
264 | jb L(continue_32_32) /* rsi offset in 33..47; otherwise fall through with rsi offset in 49..63 */ |
265 | |
266 | .p2align 4 |
267 | L(continue_32_48): /* Loop for one pointer at offset (mod 64) 33..47 and the other at 49..63 (either order): 8 scalar compares, vector compares at +32 and +48, then advance both pointers by 64. Expects %xmm0 == 0 on entry. */ |
268 | mov (%rsi), %ecx /* scalar: the pointer at offset 49..63 cannot take a 16-byte load without leaving its 64-byte block */ |
269 | cmp %ecx, (%rdi) /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
270 | jne L(nequal) |
271 | test %ecx, %ecx |
272 | jz L(equal) /* elements equal and null: both strings end here */ |
273 | |
274 | mov 4(%rsi), %ecx |
275 | cmp %ecx, 4(%rdi) |
276 | jne L(nequal) |
277 | test %ecx, %ecx |
278 | jz L(equal) |
279 | |
280 | mov 8(%rsi), %ecx |
281 | cmp %ecx, 8(%rdi) |
282 | jne L(nequal) |
283 | test %ecx, %ecx |
284 | jz L(equal) |
285 | |
286 | mov 12(%rsi), %ecx |
287 | cmp %ecx, 12(%rdi) |
288 | jne L(nequal) |
289 | test %ecx, %ecx |
290 | jz L(equal) |
291 | |
292 | mov 16(%rsi), %ecx /* scalar again: at +16 the pointer that started at offset 33..47 is now at 49..63 */ |
293 | cmp %ecx, 16(%rdi) |
294 | jne L(nequal) |
295 | test %ecx, %ecx |
296 | jz L(equal) |
297 | |
298 | mov 20(%rsi), %ecx |
299 | cmp %ecx, 20(%rdi) |
300 | jne L(nequal) |
301 | test %ecx, %ecx |
302 | jz L(equal) |
303 | |
304 | mov 24(%rsi), %ecx |
305 | cmp %ecx, 24(%rdi) |
306 | jne L(nequal) |
307 | test %ecx, %ecx |
308 | jz L(equal) |
309 | |
310 | mov 28(%rsi), %ecx |
311 | cmp %ecx, 28(%rdi) |
312 | jne L(nequal) |
313 | test %ecx, %ecx |
314 | jz L(equal) |
315 | |
316 | movdqu 32(%rdi), %xmm1 |
317 | movdqu 32(%rsi), %xmm2 |
318 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
319 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
320 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
321 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
322 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
323 | jnz L(less4_double_words_32) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
324 | |
325 | movdqu 48(%rdi), %xmm1 |
326 | movdqu 48(%rsi), %xmm2 |
327 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
328 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
329 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
330 | pmovmskb %xmm1, %edx |
331 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
332 | jnz L(less4_double_words_48) |
333 | |
334 | add $64, %rsi /* advancing by 64 keeps both offsets (mod 64) unchanged */ |
335 | add $64, %rdi |
336 | jmp L(continue_32_48) |
337 | |
338 | .p2align 4 |
339 | L(continue_16): /* rdi is unaligned with offset (mod 64) in 17..31; dispatch on the offset of rsi (eax = rsi & 63, ch = low byte of rsi) */ |
340 | and $15, %ch /* ch = rsi & 15 */ |
341 | jz L(continue_16_00) /* rsi is 16-byte aligned */ |
342 | cmp $16, %eax |
343 | jb L(continue_0_16) /* rsi offset in 1..15 */ |
344 | cmp $32, %eax |
345 | jb L(continue_16_16) /* rsi offset in 17..31 */ |
346 | cmp $48, %eax |
347 | jb L(continue_16_32) /* rsi offset in 33..47; otherwise fall through with rsi offset in 49..63 */ |
348 | |
349 | .p2align 4 |
350 | L(continue_16_48): /* Loop for one pointer at offset (mod 64) 17..31 and the other at 49..63 (either order): 4 scalar compares, vector compare at +16, 4 scalar compares at +32..+44, vector compare at +48, then advance both pointers by 64. Expects %xmm0 == 0 on entry. */ |
351 | mov (%rsi), %ecx /* scalar: the pointer at offset 49..63 cannot take a 16-byte load without leaving its 64-byte block */ |
352 | cmp %ecx, (%rdi) /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
353 | jne L(nequal) |
354 | test %ecx, %ecx |
355 | jz L(equal) /* elements equal and null: both strings end here */ |
356 | |
357 | mov 4(%rsi), %ecx |
358 | cmp %ecx, 4(%rdi) |
359 | jne L(nequal) |
360 | test %ecx, %ecx |
361 | jz L(equal) |
362 | |
363 | mov 8(%rsi), %ecx |
364 | cmp %ecx, 8(%rdi) |
365 | jne L(nequal) |
366 | test %ecx, %ecx |
367 | jz L(equal) |
368 | |
369 | mov 12(%rsi), %ecx |
370 | cmp %ecx, 12(%rdi) |
371 | jne L(nequal) |
372 | test %ecx, %ecx |
373 | jz L(equal) |
374 | |
375 | movdqu 16(%rdi), %xmm1 |
376 | movdqu 16(%rsi), %xmm2 |
377 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
378 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
379 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
380 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
381 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
382 | jnz L(less4_double_words_16) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
383 | |
384 | mov 32(%rsi), %ecx /* scalar again: at +32 the pointer that started at offset 17..31 is now at 49..63 */ |
385 | cmp %ecx, 32(%rdi) |
386 | jne L(nequal) |
387 | test %ecx, %ecx |
388 | jz L(equal) |
389 | |
390 | mov 36(%rsi), %ecx |
391 | cmp %ecx, 36(%rdi) |
392 | jne L(nequal) |
393 | test %ecx, %ecx |
394 | jz L(equal) |
395 | |
396 | mov 40(%rsi), %ecx |
397 | cmp %ecx, 40(%rdi) |
398 | jne L(nequal) |
399 | test %ecx, %ecx |
400 | jz L(equal) |
401 | |
402 | mov 44(%rsi), %ecx |
403 | cmp %ecx, 44(%rdi) |
404 | jne L(nequal) |
405 | test %ecx, %ecx |
406 | jz L(equal) |
407 | |
408 | movdqu 48(%rdi), %xmm1 |
409 | movdqu 48(%rsi), %xmm2 |
410 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
411 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
412 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
413 | pmovmskb %xmm1, %edx |
414 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
415 | jnz L(less4_double_words_48) |
416 | |
417 | add $64, %rsi /* advancing by 64 keeps both offsets (mod 64) unchanged */ |
418 | add $64, %rdi |
419 | jmp L(continue_16_48) |
420 | |
421 | .p2align 4 |
422 | L(continue_00_00): /* Loop for both pointers 16-byte aligned: four aligned 16-byte vector compares at +0, +16, +32, +48, then advance both pointers by 64. Expects %xmm0 == 0 on entry. */ |
423 | movdqa (%rdi), %xmm1 /* aligned load is valid because rdi is 16-byte aligned here */ |
424 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
425 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
426 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
427 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
428 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
429 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
430 | |
431 | movdqa 16(%rdi), %xmm3 |
432 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
433 | pcmpeqd 16(%rsi), %xmm3 /* compare first 4 double_words for equality */ |
434 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
435 | pmovmskb %xmm3, %edx |
436 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
437 | jnz L(less4_double_words_16) |
438 | |
439 | movdqa 32(%rdi), %xmm5 |
440 | pcmpeqd %xmm5, %xmm0 /* Any null double_word? */ |
441 | pcmpeqd 32(%rsi), %xmm5 /* compare first 4 double_words for equality */ |
442 | psubb %xmm0, %xmm5 /* packed sub of comparison results*/ |
443 | pmovmskb %xmm5, %edx |
444 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
445 | jnz L(less4_double_words_32) |
446 | |
447 | movdqa 48(%rdi), %xmm1 |
448 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
449 | pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
450 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
451 | pmovmskb %xmm1, %edx |
452 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
453 | jnz L(less4_double_words_48) |
454 | |
455 | add $64, %rsi /* both pointers stay 16-byte aligned */ |
456 | add $64, %rdi |
457 | jmp L(continue_00_00) |
458 | |
459 | .p2align 4 |
460 | L(continue_00_32): /* Lead-in for rdi 16-byte aligned and rsi at offset (mod 64) 33..47: one vector compare, then advance 16 so that rsi lands at offset 49..63 and enter the L(continue_00_48) loop. Expects %xmm0 == 0 on entry. */ |
461 | movdqu (%rsi), %xmm2 |
462 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
463 | pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ |
464 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
465 | pmovmskb %xmm2, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
466 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
467 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
468 | |
469 | add $16, %rsi |
470 | add $16, %rdi |
471 | jmp L(continue_00_48) |
472 | |
473 | .p2align 4 |
474 | L(continue_00_16): /* Lead-in for rdi 16-byte aligned and rsi at offset (mod 64) 17..31: two vector compares, then advance 32 so that rsi lands at offset 49..63 and enter the L(continue_00_48) loop. Expects %xmm0 == 0 on entry. */ |
475 | movdqu (%rsi), %xmm2 |
476 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
477 | pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ |
478 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
479 | pmovmskb %xmm2, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
480 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
481 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
482 | |
483 | movdqu 16(%rsi), %xmm2 |
484 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
485 | pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
486 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
487 | pmovmskb %xmm2, %edx |
488 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
489 | jnz L(less4_double_words_16) |
490 | |
491 | add $32, %rsi |
492 | add $32, %rdi |
493 | jmp L(continue_00_48) |
494 | |
495 | .p2align 4 |
496 | L(continue_00_0): /* Lead-in for rdi 16-byte aligned and rsi at offset (mod 64) 1..15: three vector compares, then advance 48 so that rsi lands at offset 49..63 and enter the L(continue_00_48) loop. Expects %xmm0 == 0 on entry. */ |
497 | movdqu (%rsi), %xmm2 |
498 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
499 | pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */ |
500 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
501 | pmovmskb %xmm2, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
502 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
503 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
504 | |
505 | movdqu 16(%rsi), %xmm2 |
506 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
507 | pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
508 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
509 | pmovmskb %xmm2, %edx |
510 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
511 | jnz L(less4_double_words_16) |
512 | |
513 | movdqu 32(%rsi), %xmm2 |
514 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
515 | pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */ |
516 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
517 | pmovmskb %xmm2, %edx |
518 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
519 | jnz L(less4_double_words_32) |
520 | |
521 | add $48, %rsi |
522 | add $48, %rdi |
523 | jmp L(continue_00_48) |
524 | |
525 | .p2align 4 |
526 | L(continue_48_00): /* Loop for rsi 16-byte aligned and rdi at offset (mod 64) 49..63: null-check 16 bytes of s2, compare 4 elements one at a time, then vector compares at +16, +32, +48, then advance both pointers by 64. Expects %xmm0 == 0 on entry. */ |
527 | pcmpeqd (%rsi), %xmm0 /* xmm0 = all-ones in every double_word of the first 16 bytes of s2 that is null */ |
528 | mov (%rdi), %eax /* eax = first s1 element; L(less4_double_words1) expects it there */ |
529 | pmovmskb %xmm0, %ecx |
530 | test %ecx, %ecx |
531 | jnz L(less4_double_words1) /* s2 has a null in its next 4 elements: take the scalar path that tests for the terminator */ |
532 | |
533 | cmp (%rsi), %eax /* no null tests needed below: s2 has none in these 4 elements, so a null in s1 shows up as a mismatch */ |
534 | jne L(nequal) /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
535 | |
536 | mov 4(%rdi), %eax |
537 | cmp 4(%rsi), %eax |
538 | jne L(nequal) |
539 | |
540 | mov 8(%rdi), %eax |
541 | cmp 8(%rsi), %eax |
542 | jne L(nequal) |
543 | |
544 | mov 12(%rdi), %eax |
545 | cmp 12(%rsi), %eax |
546 | jne L(nequal) |
547 | |
548 | movdqu 16(%rdi), %xmm1 |
549 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
550 | pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
551 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
552 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
553 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
554 | jnz L(less4_double_words_16) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
555 | |
556 | movdqu 32(%rdi), %xmm1 |
557 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
558 | pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
559 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
560 | pmovmskb %xmm1, %edx |
561 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
562 | jnz L(less4_double_words_32) |
563 | |
564 | movdqu 48(%rdi), %xmm1 |
565 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
566 | pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
567 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
568 | pmovmskb %xmm1, %edx |
569 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
570 | jnz L(less4_double_words_48) |
571 | |
572 | add $64, %rsi /* advancing by 64 keeps both offsets (mod 64) unchanged */ |
573 | add $64, %rdi |
574 | jmp L(continue_48_00) |
575 | |
576 | .p2align 4 |
577 | L(continue_32_00): /* Lead-in for rsi 16-byte aligned and rdi at offset (mod 64) 33..47: one vector compare, then advance 16 so that rdi lands at offset 49..63 and enter the L(continue_48_00) loop. Expects %xmm0 == 0 on entry. */ |
578 | movdqu (%rdi), %xmm1 |
579 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
580 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
581 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
582 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
583 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
584 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
585 | |
586 | add $16, %rsi |
587 | add $16, %rdi |
588 | jmp L(continue_48_00) |
589 | |
590 | .p2align 4 |
591 | L(continue_16_00): /* Lead-in for rsi 16-byte aligned and rdi at offset (mod 64) 17..31: two vector compares, then advance 32 so that rdi lands at offset 49..63 and enter the L(continue_48_00) loop. Expects %xmm0 == 0 on entry. */ |
592 | movdqu (%rdi), %xmm1 |
593 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
594 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
595 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
596 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
597 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
598 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
599 | |
600 | movdqu 16(%rdi), %xmm1 |
601 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
602 | pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
603 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
604 | pmovmskb %xmm1, %edx |
605 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
606 | jnz L(less4_double_words_16) |
607 | |
608 | add $32, %rsi |
609 | add $32, %rdi |
610 | jmp L(continue_48_00) |
611 | |
612 | .p2align 4 |
613 | L(continue_0_00): /* Lead-in for rsi 16-byte aligned and rdi at offset (mod 64) 1..15: three vector compares, then advance 48 so that rdi lands at offset 49..63 and enter the L(continue_48_00) loop. Expects %xmm0 == 0 on entry. */ |
614 | movdqu (%rdi), %xmm1 |
615 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
616 | pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */ |
617 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
618 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
619 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
620 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
621 | |
622 | movdqu 16(%rdi), %xmm1 |
623 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
624 | pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
625 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
626 | pmovmskb %xmm1, %edx |
627 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
628 | jnz L(less4_double_words_16) |
629 | |
630 | movdqu 32(%rdi), %xmm1 |
631 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
632 | pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */ |
633 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
634 | pmovmskb %xmm1, %edx |
635 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
636 | jnz L(less4_double_words_32) |
637 | |
638 | add $48, %rsi |
639 | add $48, %rdi |
640 | jmp L(continue_48_00) |
641 | |
642 | .p2align 4 |
643 | L(continue_32_32): /* Lead-in for both pointers unaligned at offset (mod 64) 33..47: one vector compare, then advance 16 so that both land at offset 49..63 and enter the L(continue_48_48) loop. Expects %xmm0 == 0 on entry. */ |
644 | movdqu (%rdi), %xmm1 |
645 | movdqu (%rsi), %xmm2 |
646 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
647 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
648 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
649 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
650 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
651 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
652 | |
653 | add $16, %rsi |
654 | add $16, %rdi |
655 | jmp L(continue_48_48) |
656 | |
657 | .p2align 4 |
658 | L(continue_16_16): /* Lead-in for both pointers unaligned at offset (mod 64) 17..31: two vector compares, then advance 32 so that both land at offset 49..63 and enter the L(continue_48_48) loop. Expects %xmm0 == 0 on entry. */ |
659 | movdqu (%rdi), %xmm1 |
660 | movdqu (%rsi), %xmm2 |
661 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
662 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
663 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
664 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
665 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
666 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
667 | |
668 | movdqu 16(%rdi), %xmm3 |
669 | movdqu 16(%rsi), %xmm4 |
670 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
671 | pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ |
672 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
673 | pmovmskb %xmm3, %edx |
674 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
675 | jnz L(less4_double_words_16) |
676 | |
677 | add $32, %rsi |
678 | add $32, %rdi |
679 | jmp L(continue_48_48) |
680 | |
681 | .p2align 4 |
682 | L(continue_0_0): /* Lead-in for both pointers unaligned at offset (mod 64) 1..15: three vector compares, then advance 48 so that both land at offset 49..63 and enter the L(continue_48_48) loop. Expects %xmm0 == 0 on entry. */ |
683 | movdqu (%rdi), %xmm1 |
684 | movdqu (%rsi), %xmm2 |
685 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
686 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
687 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
688 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
689 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
690 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
691 | |
692 | movdqu 16(%rdi), %xmm3 |
693 | movdqu 16(%rsi), %xmm4 |
694 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
695 | pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ |
696 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
697 | pmovmskb %xmm3, %edx |
698 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
699 | jnz L(less4_double_words_16) |
700 | |
701 | movdqu 32(%rdi), %xmm1 |
702 | movdqu 32(%rsi), %xmm2 |
703 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
704 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
705 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
706 | pmovmskb %xmm1, %edx |
707 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
708 | jnz L(less4_double_words_32) |
709 | |
710 | add $48, %rsi |
711 | add $48, %rdi |
712 | jmp L(continue_48_48) |
713 | |
714 | .p2align 4 |
715 | L(continue_0_16): /* Lead-in for one pointer at offset (mod 64) 1..15 and the other at 17..31 (either order): two vector compares, then advance 32 so the offsets become 33..47 and 49..63 and enter the L(continue_32_48) loop. Expects %xmm0 == 0 on entry. */ |
716 | movdqu (%rdi), %xmm1 |
717 | movdqu (%rsi), %xmm2 |
718 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
719 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
720 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
721 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
722 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
723 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
724 | |
725 | movdqu 16(%rdi), %xmm1 |
726 | movdqu 16(%rsi), %xmm2 |
727 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
728 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
729 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
730 | pmovmskb %xmm1, %edx |
731 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
732 | jnz L(less4_double_words_16) |
733 | |
734 | add $32, %rsi |
735 | add $32, %rdi |
736 | jmp L(continue_32_48) |
737 | |
738 | .p2align 4 |
739 | L(continue_0_32): /* Lead-in for one pointer at offset (mod 64) 1..15 and the other at 33..47 (either order): one vector compare, then advance 16 so the offsets become 17..31 and 49..63 and enter the L(continue_16_48) loop. Expects %xmm0 == 0 on entry. */ |
740 | movdqu (%rdi), %xmm1 |
741 | movdqu (%rsi), %xmm2 |
742 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
743 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
744 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
745 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
746 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
747 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
748 | |
749 | add $16, %rsi |
750 | add $16, %rdi |
751 | jmp L(continue_16_48) |
752 | |
753 | .p2align 4 |
754 | L(continue_16_32): /* Lead-in for one pointer at offset (mod 64) 17..31 and the other at 33..47 (either order): one vector compare, then advance 16 so the offsets become 33..47 and 49..63 and enter the L(continue_32_48) loop. Expects %xmm0 == 0 on entry. */ |
755 | movdqu (%rdi), %xmm1 |
756 | movdqu (%rsi), %xmm2 |
757 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
758 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
759 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
760 | pmovmskb %xmm1, %edx /* mask bit is 1 only for bytes of double_words that are equal and not null */ |
761 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
762 | jnz L(less4_double_words) /* some double_word differs or is null; when not taken %xmm0 is still all zero */ |
763 | |
764 | add $16, %rsi |
765 | add $16, %rdi |
766 | jmp L(continue_32_48) |
767 | |
768 | .p2align 4 |
769 | L(less4_double_words1): /* Scalar tail used by L(continue_00_48) and L(continue_48_00) when the aligned string has a null within its next 4 elements. On entry eax = first s1 element, (%rdi). Compares up to 4 elements with terminator checks and returns. */ |
770 | cmp (%rsi), %eax /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
771 | jne L(nequal) |
772 | test %eax, %eax |
773 | jz L(equal) /* elements equal and null: both strings end here */ |
774 | |
775 | mov 4(%rsi), %ecx |
776 | cmp %ecx, 4(%rdi) |
777 | jne L(nequal) |
778 | test %ecx, %ecx |
779 | jz L(equal) |
780 | |
781 | mov 8(%rsi), %ecx |
782 | cmp %ecx, 8(%rdi) |
783 | jne L(nequal) |
784 | test %ecx, %ecx |
785 | jz L(equal) |
786 | |
787 | mov 12(%rsi), %ecx |
788 | cmp %ecx, 12(%rdi) |
789 | jne L(nequal) |
790 | xor %eax, %eax /* the first three pairs were equal and non-null, so the known null is this fourth element and it matched: return 0 */ |
791 | ret |
792 | |
793 | .p2align 4 |
794 | L(less4_double_words): /* Vector-compare tail for the 16 bytes at offset 0. On entry edx = (mask - 0xffff) != 0; in its low 16 bits each nibble stands for one double_word, and the lowest non-zero nibble marks the first double_word that differs or is null. */ |
795 | xor %eax, %eax /* every path below reloads %eax before returning */ |
796 | test %dl, %dl |
797 | jz L(next_two_double_words) /* dl == 0: double_words 0 and 1 are equal and non-null */ |
798 | and $15, %dl |
799 | jz L(second_double_word) /* low nibble == 0: double_word 0 is fine, so double_word 1 is the one */ |
800 | mov (%rdi), %eax |
801 | cmp (%rsi), %eax /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
802 | jne L(nequal) |
803 | ret /* equal here means this element is the shared terminator, so eax == 0 */ |
804 | |
805 | .p2align 4 |
806 | L(second_double_word): |
807 | mov 4(%rdi), %eax |
808 | cmp 4(%rsi), %eax |
809 | jne L(nequal) |
810 | ret /* equal: shared terminator, eax == 0 */ |
811 | |
812 | .p2align 4 |
813 | L(next_two_double_words): |
814 | and $15, %dh |
815 | jz L(fourth_double_word) /* low nibble of dh == 0: double_word 2 is fine, so double_word 3 is the one */ |
816 | mov 8(%rdi), %eax |
817 | cmp 8(%rsi), %eax |
818 | jne L(nequal) |
819 | ret /* equal: shared terminator, eax == 0 */ |
820 | |
821 | .p2align 4 |
822 | L(fourth_double_word): |
823 | mov 12(%rdi), %eax |
824 | cmp 12(%rsi), %eax |
825 | jne L(nequal) |
826 | ret /* equal: shared terminator, eax == 0 */ |
827 | |
828 | .p2align 4 |
829 | L(less4_double_words_16): /* Vector-compare tail for the 16 bytes at offset 16. On entry edx = (mask - 0xffff) != 0; in its low 16 bits each nibble stands for one double_word, and the lowest non-zero nibble marks the first double_word that differs or is null. */ |
830 | xor %eax, %eax /* every path below reloads %eax before returning */ |
831 | test %dl, %dl |
832 | jz L(next_two_double_words_16) /* dl == 0: double_words at +16 and +20 are equal and non-null */ |
833 | and $15, %dl |
834 | jz L(second_double_word_16) /* low nibble == 0: the double_word at +16 is fine, so +20 is the one */ |
835 | mov 16(%rdi), %eax |
836 | cmp 16(%rsi), %eax /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
837 | jne L(nequal) |
838 | ret /* equal here means this element is the shared terminator, so eax == 0 */ |
839 | |
840 | .p2align 4 |
841 | L(second_double_word_16): |
842 | mov 20(%rdi), %eax |
843 | cmp 20(%rsi), %eax |
844 | jne L(nequal) |
845 | ret /* equal: shared terminator, eax == 0 */ |
846 | |
847 | .p2align 4 |
848 | L(next_two_double_words_16): |
849 | and $15, %dh |
850 | jz L(fourth_double_word_16) /* low nibble of dh == 0: the double_word at +24 is fine, so +28 is the one */ |
851 | mov 24(%rdi), %eax |
852 | cmp 24(%rsi), %eax |
853 | jne L(nequal) |
854 | ret /* equal: shared terminator, eax == 0 */ |
855 | |
856 | .p2align 4 |
857 | L(fourth_double_word_16): |
858 | mov 28(%rdi), %eax |
859 | cmp 28(%rsi), %eax |
860 | jne L(nequal) |
861 | ret /* equal: shared terminator, eax == 0 */ |
862 | |
863 | .p2align 4 |
864 | L(less4_double_words_32): /* Vector-compare tail for the 16 bytes at offset 32. On entry edx = (mask - 0xffff) != 0; in its low 16 bits each nibble stands for one double_word, and the lowest non-zero nibble marks the first double_word that differs or is null. */ |
865 | xor %eax, %eax /* every path below reloads %eax before returning */ |
866 | test %dl, %dl |
867 | jz L(next_two_double_words_32) /* dl == 0: double_words at +32 and +36 are equal and non-null */ |
868 | and $15, %dl |
869 | jz L(second_double_word_32) /* low nibble == 0: the double_word at +32 is fine, so +36 is the one */ |
870 | mov 32(%rdi), %eax |
871 | cmp 32(%rsi), %eax /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
872 | jne L(nequal) |
873 | ret /* equal here means this element is the shared terminator, so eax == 0 */ |
874 | |
875 | .p2align 4 |
876 | L(second_double_word_32): |
877 | mov 36(%rdi), %eax |
878 | cmp 36(%rsi), %eax |
879 | jne L(nequal) |
880 | ret /* equal: shared terminator, eax == 0 */ |
881 | |
882 | .p2align 4 |
883 | L(next_two_double_words_32): |
884 | and $15, %dh |
885 | jz L(fourth_double_word_32) /* low nibble of dh == 0: the double_word at +40 is fine, so +44 is the one */ |
886 | mov 40(%rdi), %eax |
887 | cmp 40(%rsi), %eax |
888 | jne L(nequal) |
889 | ret /* equal: shared terminator, eax == 0 */ |
890 | |
891 | .p2align 4 |
892 | L(fourth_double_word_32): |
893 | mov 44(%rdi), %eax |
894 | cmp 44(%rsi), %eax |
895 | jne L(nequal) |
896 | ret /* equal: shared terminator, eax == 0 */ |
897 | |
898 | .p2align 4 |
899 | L(less4_double_words_48): /* Vector-compare tail for the 16 bytes at offset 48. On entry edx = (mask - 0xffff) != 0; in its low 16 bits each nibble stands for one double_word, and the lowest non-zero nibble marks the first double_word that differs or is null. */ |
900 | xor %eax, %eax /* every path below reloads %eax before returning */ |
901 | test %dl, %dl |
902 | jz L(next_two_double_words_48) /* dl == 0: double_words at +48 and +52 are equal and non-null */ |
903 | and $15, %dl |
904 | jz L(second_double_word_48) /* low nibble == 0: the double_word at +48 is fine, so +52 is the one */ |
905 | mov 48(%rdi), %eax |
906 | cmp 48(%rsi), %eax /* flags = s1 element - s2 element; L(nequal) tests them as signed */ |
907 | jne L(nequal) |
908 | ret /* equal here means this element is the shared terminator, so eax == 0 */ |
909 | |
910 | .p2align 4 |
911 | L(second_double_word_48): |
912 | mov 52(%rdi), %eax |
913 | cmp 52(%rsi), %eax |
914 | jne L(nequal) |
915 | ret /* equal: shared terminator, eax == 0 */ |
916 | |
917 | .p2align 4 |
918 | L(next_two_double_words_48): |
919 | and $15, %dh |
920 | jz L(fourth_double_word_48) /* low nibble of dh == 0: the double_word at +56 is fine, so +60 is the one */ |
921 | mov 56(%rdi), %eax |
922 | cmp 56(%rsi), %eax |
923 | jne L(nequal) |
924 | ret /* equal: shared terminator, eax == 0 */ |
925 | |
926 | .p2align 4 |
927 | L(fourth_double_word_48): |
928 | mov 60(%rdi), %eax |
929 | cmp 60(%rsi), %eax |
930 | jne L(nequal) |
931 | ret /* equal: shared terminator, eax == 0 */ |
932 | |
933 | .p2align 4 |
934 | L(nequal): /* Common exit for a mismatch. Reached only via jne right after a cmp that computed (s1 element - s2 element), so the flags still describe that signed comparison. */ |
935 | mov $1, %eax /* mov leaves the flags untouched */ |
936 | jg L(nequal_bigger) /* signed greater: s1 element > s2 element, return 1 */ |
937 | neg %eax /* otherwise s1 element < s2 element, return -1 */ |
938 | |
939 | L(nequal_bigger): |
940 | ret |
941 | |
942 | .p2align 4 |
943 | L(equal): /* Common exit when both strings reached their terminator with no mismatch: return 0. */ |
944 | xor %eax, %eax /* 32-bit xor clears all of %rax (upper half is zeroed implicitly) and needs no REX.W prefix; same form as every other zeroing site in this file */ |
945 | ret |
946 | |
947 | END (__wcscmp) /* closes the function opened by ENTRY (__wcscmp) */ |
948 | #ifndef __wcscmp /* the alias definitions below are skipped when __wcscmp is defined as a macro -- presumably by a file that includes this one under another name; TODO confirm */ |
949 | libc_hidden_def (__wcscmp) /* NOTE(review): presumably marks __wcscmp for hidden internal use; see the macro definition to confirm */ |
950 | weak_alias (__wcscmp, wcscmp) /* presumably exports the public name wcscmp as a weak alias of __wcscmp */ |
951 | #endif |
952 | |