1 | /* wcscmp with SSE2 |
2 | Copyright (C) 2011-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
/* CFI_PUSH and CFI_POP emit the unwind annotations that go with a
   4-byte push or pop of REG.  PUSH and POP pair the actual pushl or
   popl with the matching annotation.  The cfi_* helpers are not
   defined in this file - presumably they come from sysdep.h.  */
23 | # define CFI_PUSH(REG) \ |
24 | cfi_adjust_cfa_offset (4); \ |
25 | cfi_rel_offset (REG, 0) |
26 | |
27 | # define CFI_POP(REG) \ |
28 | cfi_adjust_cfa_offset (-4); \ |
29 | cfi_restore (REG) |
30 | |
31 | # define PUSH(REG) pushl REG; CFI_PUSH (REG) |
32 | # define POP(REG) popl REG; CFI_POP (REG) |
33 | |
/* ENTRANCE saves %esi and %edi.  RETURN restores them in the reverse
   order and returns.  The two CFI_PUSH annotations placed after the
   ret declare both registers as pushed again - presumably so that the
   code following a RETURN in the file is still described as running
   with them saved (TODO confirm intent).  */
34 | # define ENTRANCE PUSH(%esi); PUSH(%edi) |
35 | # define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi); |
/* %esp-relative offsets of the two string pointer arguments.  They are
   used at function entry only, before ENTRANCE has pushed anything.  */
36 | # define PARMS 4 |
37 | # define STR1 PARMS |
38 | # define STR2 STR1+4 |
39 | |
40 | /* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function. */ |
41 | |
42 | .text |
/* Compare two zero-terminated strings of 4-byte elements.
   In:   STR1(%esp) = pointer to the first string (held in %edx, later
                      in %edi)
         STR2(%esp) = pointer to the second string (held in %eax, later
                      in %esi)
   Out:  %eax = 0 when the strings match up to and including the
         terminating zero element, 1 when the first differing element
         of the first string is greater (signed compare), -1 when it
         is smaller.
   %esi and %edi are saved by ENTRANCE and restored by RETURN.  The
   early exits L(neq) and L(eq) are taken before anything is pushed
   and use a plain ret.  Scratch: %ecx, %edx, %xmm0 to %xmm5, flags.  */
43 | ENTRY (__wcscmp_sse2) |
44 | /* |
45 | * This implementation uses SSE to compare up to 16 bytes at a time. |
46 | */ |
47 | mov STR1(%esp), %edx |
48 | mov STR2(%esp), %eax |
49 | |
/* Compare the first four elements one at a time.  Each cmp computes
   first-string element minus second-string element, so the flags are
   ready for the signed jg in L(neq).  An equal pair that is zero is
   the terminator of both strings.  */
50 | mov (%eax), %ecx |
51 | cmp %ecx, (%edx) |
52 | jne L(neq) |
53 | test %ecx, %ecx |
54 | jz L(eq) |
55 | |
56 | mov 4(%eax), %ecx |
57 | cmp %ecx, 4(%edx) |
58 | jne L(neq) |
59 | test %ecx, %ecx |
60 | jz L(eq) |
61 | |
62 | mov 8(%eax), %ecx |
63 | cmp %ecx, 8(%edx) |
64 | jne L(neq) |
65 | test %ecx, %ecx |
66 | jz L(eq) |
67 | |
68 | mov 12(%eax), %ecx |
69 | cmp %ecx, 12(%edx) |
70 | jne L(neq) |
71 | test %ecx, %ecx |
72 | jz L(eq) |
73 | |
/* The first 16 bytes match and hold no terminator.  Save %esi and
   %edi, step both pointers past those 16 bytes, and keep the second
   string in %esi and the first string in %edi from here on.  */
74 | ENTRANCE |
75 | add $16, %eax |
76 | add $16, %edx |
77 | |
78 | mov %eax, %esi |
79 | mov %edx, %edi |
80 | pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ |
/* %ch = low byte of the second pointer, %cl = low byte of the first
   pointer.  They are used for the 16-byte alignment tests, while %eax
   and %edx are reduced to the pointer offsets modulo 64.  */
81 | mov %al, %ch |
82 | mov %dl, %cl |
83 | and $63, %eax /* esi alignment in cache line */ |
84 | and $63, %edx /* edi alignment in cache line */ |
85 | and $15, %cl |
86 | jz L(continue_00) |
/* The first pointer is not 16-byte aligned.  Dispatch on its offset
   modulo 64.  Offsets of 48 and above fall through into the code that
   follows this block.  */
87 | cmp $16, %edx |
88 | jb L(continue_0) |
89 | cmp $32, %edx |
90 | jb L(continue_16) |
91 | cmp $48, %edx |
92 | jb L(continue_32) |
93 | |
/* The first pointer (%edi) is not 16-byte aligned and its offset
   modulo 64 is 48 or more.  Dispatch on the second pointer (%esi):
   a 16-byte aligned %esi goes to L(continue_48_00), otherwise branch
   on its offset modulo 64 held in %eax.  Offsets of 48 and above fall
   through into L(continue_48_48).  */
94 | L(continue_48): |
95 | and $15, %ch |
96 | jz L(continue_48_00) |
97 | cmp $16, %eax |
98 | jb L(continue_0_48) |
99 | cmp $32, %eax |
100 | jb L(continue_16_48) |
101 | cmp $48, %eax |
102 | jb L(continue_32_48) |
103 | |
/* Loop for both pointers at unaligned offsets of 48 or more modulo
   64.  Each pass covers 64 bytes of both strings.  */
104 | .p2align 4 |
105 | L(continue_48_48): |
/* Bytes 0..15 are compared element by element.  A 16-byte load that
   starts at such an offset would run past the 64-byte boundary, which
   is presumably why no vector load is used for this part.  */
106 | mov (%esi), %ecx |
107 | cmp %ecx, (%edi) |
108 | jne L(nequal) |
109 | test %ecx, %ecx |
110 | jz L(equal) |
111 | |
112 | mov 4(%esi), %ecx |
113 | cmp %ecx, 4(%edi) |
114 | jne L(nequal) |
115 | test %ecx, %ecx |
116 | jz L(equal) |
117 | |
118 | mov 8(%esi), %ecx |
119 | cmp %ecx, 8(%edi) |
120 | jne L(nequal) |
121 | test %ecx, %ecx |
122 | jz L(equal) |
123 | |
124 | mov 12(%esi), %ecx |
125 | cmp %ecx, 12(%edi) |
126 | jne L(nequal) |
127 | test %ecx, %ecx |
128 | jz L(equal) |
129 | |
/* Bytes 16..63 are handled by three unaligned 16-byte steps.  %xmm0 is
   expected to be all zero before each step.  After the two pcmpeqd and
   the psubb, the byte mask in %edx is 0xffff only when all four
   element pairs are equal and none of the %edi elements is zero.  Any
   other mask leaves the sub non-zero and the handler for that offset
   locates the element.  */
130 | movdqu 16(%edi), %xmm1 |
131 | movdqu 16(%esi), %xmm2 |
132 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
133 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
134 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
135 | pmovmskb %xmm1, %edx |
136 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
137 | jnz L(less4_double_words_16) |
138 | |
139 | movdqu 32(%edi), %xmm1 |
140 | movdqu 32(%esi), %xmm2 |
141 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
142 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
143 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
144 | pmovmskb %xmm1, %edx |
145 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
146 | jnz L(less4_double_words_32) |
147 | |
148 | movdqu 48(%edi), %xmm1 |
149 | movdqu 48(%esi), %xmm2 |
150 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
151 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
152 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
153 | pmovmskb %xmm1, %edx |
154 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
155 | jnz L(less4_double_words_48) |
156 | |
/* All 64 bytes matched without a terminator.  Advance and repeat.  */
157 | add $64, %esi |
158 | add $64, %edi |
159 | jmp L(continue_48_48) |
160 | |
/* The first pointer (%edi) is not 16-byte aligned and its offset
   modulo 64 is below 16.  Dispatch on the second pointer (%esi):
   a 16-byte aligned %esi goes to L(continue_0_00), otherwise branch
   on its offset modulo 64 held in %eax.  Offsets of 48 and above fall
   through into L(continue_0_48).  */
161 | L(continue_0): |
162 | and $15, %ch |
163 | jz L(continue_0_00) |
164 | cmp $16, %eax |
165 | jb L(continue_0_0) |
166 | cmp $32, %eax |
167 | jb L(continue_0_16) |
168 | cmp $48, %eax |
169 | jb L(continue_0_32) |
170 | |
/* Loop for one pointer at an offset below 16 and the other at an
   offset of 48 or more modulo 64, both unaligned.  It is entered from
   L(continue_0) and also from L(continue_48), so either pointer may be
   the one near the end of the 64-byte block.  Each pass covers 64
   bytes.  */
171 | .p2align 4 |
172 | L(continue_0_48): |
/* Bytes 0..15 element by element.  A 16-byte load from the pointer at
   offset 48 or more would run past the 64-byte boundary, which is
   presumably the reason.  */
173 | mov (%esi), %ecx |
174 | cmp %ecx, (%edi) |
175 | jne L(nequal) |
176 | test %ecx, %ecx |
177 | jz L(equal) |
178 | |
179 | mov 4(%esi), %ecx |
180 | cmp %ecx, 4(%edi) |
181 | jne L(nequal) |
182 | test %ecx, %ecx |
183 | jz L(equal) |
184 | |
185 | mov 8(%esi), %ecx |
186 | cmp %ecx, 8(%edi) |
187 | jne L(nequal) |
188 | test %ecx, %ecx |
189 | jz L(equal) |
190 | |
191 | mov 12(%esi), %ecx |
192 | cmp %ecx, 12(%edi) |
193 | jne L(nequal) |
194 | test %ecx, %ecx |
195 | jz L(equal) |
196 | |
/* Bytes 16..47 as two unaligned 16-byte steps.  With %xmm0 all zero
   beforehand, the mask in %edx is 0xffff only when all four element
   pairs are equal and none of the %edi elements is zero.  */
197 | movdqu 16(%edi), %xmm1 |
198 | movdqu 16(%esi), %xmm2 |
199 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
200 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
201 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
202 | pmovmskb %xmm1, %edx |
203 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
204 | jnz L(less4_double_words_16) |
205 | |
206 | movdqu 32(%edi), %xmm1 |
207 | movdqu 32(%esi), %xmm2 |
208 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
209 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
210 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
211 | pmovmskb %xmm1, %edx |
212 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
213 | jnz L(less4_double_words_32) |
214 | |
/* Bytes 48..63 element by element again.  Here the pointer that
   started below offset 16 is the one that would run past the 64-byte
   boundary with a 16-byte load.  */
215 | mov 48(%esi), %ecx |
216 | cmp %ecx, 48(%edi) |
217 | jne L(nequal) |
218 | test %ecx, %ecx |
219 | jz L(equal) |
220 | |
221 | mov 52(%esi), %ecx |
222 | cmp %ecx, 52(%edi) |
223 | jne L(nequal) |
224 | test %ecx, %ecx |
225 | jz L(equal) |
226 | |
227 | mov 56(%esi), %ecx |
228 | cmp %ecx, 56(%edi) |
229 | jne L(nequal) |
230 | test %ecx, %ecx |
231 | jz L(equal) |
232 | |
233 | mov 60(%esi), %ecx |
234 | cmp %ecx, 60(%edi) |
235 | jne L(nequal) |
236 | test %ecx, %ecx |
237 | jz L(equal) |
238 | |
239 | add $64, %esi |
240 | add $64, %edi |
241 | jmp L(continue_0_48) |
242 | |
/* The first pointer (%edi) is 16-byte aligned.  Dispatch on the second
   pointer (%esi): if it is 16-byte aligned too, go to
   L(continue_00_00), otherwise branch on its offset modulo 64 held in
   %eax.  Offsets of 48 and above fall through into
   L(continue_00_48).  */
243 | .p2align 4 |
244 | L(continue_00): |
245 | and $15, %ch |
246 | jz L(continue_00_00) |
247 | cmp $16, %eax |
248 | jb L(continue_00_0) |
249 | cmp $32, %eax |
250 | jb L(continue_00_16) |
251 | cmp $48, %eax |
252 | jb L(continue_00_32) |
253 | |
/* Loop for %edi 16-byte aligned and %esi at an unaligned offset of 48
   or more modulo 64.  Each pass covers 64 bytes.  */
254 | .p2align 4 |
255 | L(continue_00_48): |
/* Bytes 0..15: look for a zero element in the aligned %edi block
   first (%xmm0 is expected to be all zero beforehand).  If there is
   one, L(less4_double_words1) finishes element by element with the
   first %edi element already in %eax.  */
256 | pcmpeqd (%edi), %xmm0 |
257 | mov (%edi), %eax |
258 | pmovmskb %xmm0, %ecx |
259 | test %ecx, %ecx |
260 | jnz L(less4_double_words1) |
261 | |
/* No zero among these four %edi elements, so the four compares below
   need no terminator test.  Each cmp computes the %edi element minus
   the %esi element, as L(nequal) expects.  */
262 | cmp (%esi), %eax |
263 | jne L(nequal) |
264 | |
265 | mov 4(%edi), %eax |
266 | cmp 4(%esi), %eax |
267 | jne L(nequal) |
268 | |
269 | mov 8(%edi), %eax |
270 | cmp 8(%esi), %eax |
271 | jne L(nequal) |
272 | |
273 | mov 12(%edi), %eax |
274 | cmp 12(%esi), %eax |
275 | jne L(nequal) |
276 | |
/* Bytes 16..63 as three 16-byte steps.  %esi is read with an unaligned
   load, the aligned %edi is used directly as a memory operand.  The
   zero check is made on the %esi data here.  The mask in %edx is
   0xffff only when all four pairs are equal and non-zero.  */
277 | movdqu 16(%esi), %xmm2 |
278 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
279 | pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ |
280 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
281 | pmovmskb %xmm2, %edx |
282 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
283 | jnz L(less4_double_words_16) |
284 | |
285 | movdqu 32(%esi), %xmm2 |
286 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
287 | pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */ |
288 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
289 | pmovmskb %xmm2, %edx |
290 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
291 | jnz L(less4_double_words_32) |
292 | |
293 | movdqu 48(%esi), %xmm2 |
294 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
295 | pcmpeqd 48(%edi), %xmm2 /* compare first 4 double_words for equality */ |
296 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
297 | pmovmskb %xmm2, %edx |
298 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
299 | jnz L(less4_double_words_48) |
300 | |
301 | add $64, %esi |
302 | add $64, %edi |
303 | jmp L(continue_00_48) |
304 | |
/* The first pointer (%edi) is not 16-byte aligned and its offset
   modulo 64 is in 32..47.  Dispatch on the second pointer (%esi):
   a 16-byte aligned %esi goes to L(continue_32_00), otherwise branch
   on its offset modulo 64 held in %eax.  Offsets of 48 and above fall
   through into L(continue_32_48).  */
305 | .p2align 4 |
306 | L(continue_32): |
307 | and $15, %ch |
308 | jz L(continue_32_00) |
309 | cmp $16, %eax |
310 | jb L(continue_0_32) |
311 | cmp $32, %eax |
312 | jb L(continue_16_32) |
313 | cmp $48, %eax |
314 | jb L(continue_32_32) |
315 | |
/* Loop for one pointer at an offset in 32..47 and the other at 48 or
   more modulo 64, both unaligned.  Entered from the dispatch above
   and from L(continue_0_16) and L(continue_16_32).  Each pass covers
   64 bytes.  */
316 | .p2align 4 |
317 | L(continue_32_48): |
/* Bytes 0..31 element by element.  In the first 16 bytes a vector load
   from the pointer at offset 48 or more would run past the 64-byte
   boundary, in the second 16 bytes the same holds for the other
   pointer - presumably the reason for the scalar compares.  */
318 | mov (%esi), %ecx |
319 | cmp %ecx, (%edi) |
320 | jne L(nequal) |
321 | test %ecx, %ecx |
322 | jz L(equal) |
323 | |
324 | mov 4(%esi), %ecx |
325 | cmp %ecx, 4(%edi) |
326 | jne L(nequal) |
327 | test %ecx, %ecx |
328 | jz L(equal) |
329 | |
330 | mov 8(%esi), %ecx |
331 | cmp %ecx, 8(%edi) |
332 | jne L(nequal) |
333 | test %ecx, %ecx |
334 | jz L(equal) |
335 | |
336 | mov 12(%esi), %ecx |
337 | cmp %ecx, 12(%edi) |
338 | jne L(nequal) |
339 | test %ecx, %ecx |
340 | jz L(equal) |
341 | |
342 | mov 16(%esi), %ecx |
343 | cmp %ecx, 16(%edi) |
344 | jne L(nequal) |
345 | test %ecx, %ecx |
346 | jz L(equal) |
347 | |
348 | mov 20(%esi), %ecx |
349 | cmp %ecx, 20(%edi) |
350 | jne L(nequal) |
351 | test %ecx, %ecx |
352 | jz L(equal) |
353 | |
354 | mov 24(%esi), %ecx |
355 | cmp %ecx, 24(%edi) |
356 | jne L(nequal) |
357 | test %ecx, %ecx |
358 | jz L(equal) |
359 | |
360 | mov 28(%esi), %ecx |
361 | cmp %ecx, 28(%edi) |
362 | jne L(nequal) |
363 | test %ecx, %ecx |
364 | jz L(equal) |
365 | |
/* Bytes 32..63 as two unaligned 16-byte steps.  With %xmm0 all zero
   beforehand, the mask in %edx is 0xffff only when all four element
   pairs are equal and none of the %edi elements is zero.  */
366 | movdqu 32(%edi), %xmm1 |
367 | movdqu 32(%esi), %xmm2 |
368 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
369 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
370 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
371 | pmovmskb %xmm1, %edx |
372 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
373 | jnz L(less4_double_words_32) |
374 | |
375 | movdqu 48(%edi), %xmm1 |
376 | movdqu 48(%esi), %xmm2 |
377 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
378 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
379 | psubb %xmm0, %xmm1 /* packed sub of comparison results */ |
380 | pmovmskb %xmm1, %edx |
381 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
382 | jnz L(less4_double_words_48) |
383 | |
384 | add $64, %esi |
385 | add $64, %edi |
386 | jmp L(continue_32_48) |
387 | |
/* The first pointer (%edi) is not 16-byte aligned and its offset
   modulo 64 is in 16..31.  Dispatch on the second pointer (%esi):
   a 16-byte aligned %esi goes to L(continue_16_00), otherwise branch
   on its offset modulo 64 held in %eax.  Offsets of 48 and above fall
   through into L(continue_16_48).  */
388 | .p2align 4 |
389 | L(continue_16): |
390 | and $15, %ch |
391 | jz L(continue_16_00) |
392 | cmp $16, %eax |
393 | jb L(continue_0_16) |
394 | cmp $32, %eax |
395 | jb L(continue_16_16) |
396 | cmp $48, %eax |
397 | jb L(continue_16_32) |
398 | |
/* Loop for one pointer at an offset in 16..31 and the other at 48 or
   more modulo 64, both unaligned.  Entered from the dispatch above,
   from L(continue_48) and from L(continue_0_32).  Each pass covers 64
   bytes, alternating scalar and vector parts.  */
399 | .p2align 4 |
400 | L(continue_16_48): |
/* Bytes 0..15 element by element.  A 16-byte load from the pointer at
   offset 48 or more would run past the 64-byte boundary here.  */
401 | mov (%esi), %ecx |
402 | cmp %ecx, (%edi) |
403 | jne L(nequal) |
404 | test %ecx, %ecx |
405 | jz L(equal) |
406 | |
407 | mov 4(%esi), %ecx |
408 | cmp %ecx, 4(%edi) |
409 | jne L(nequal) |
410 | test %ecx, %ecx |
411 | jz L(equal) |
412 | |
413 | mov 8(%esi), %ecx |
414 | cmp %ecx, 8(%edi) |
415 | jne L(nequal) |
416 | test %ecx, %ecx |
417 | jz L(equal) |
418 | |
419 | mov 12(%esi), %ecx |
420 | cmp %ecx, 12(%edi) |
421 | jne L(nequal) |
422 | test %ecx, %ecx |
423 | jz L(equal) |
424 | |
/* Bytes 16..31 as one unaligned 16-byte step.  With %xmm0 all zero
   beforehand, the mask in %edx is 0xffff only when all four element
   pairs are equal and none of the %edi elements is zero.  */
425 | movdqu 16(%edi), %xmm1 |
426 | movdqu 16(%esi), %xmm2 |
427 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
428 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
429 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
430 | pmovmskb %xmm1, %edx |
431 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
432 | jnz L(less4_double_words_16) |
433 | |
/* Bytes 32..47 element by element.  Here the pointer that started in
   16..31 is the one a 16-byte load would carry past the boundary.  */
434 | mov 32(%esi), %ecx |
435 | cmp %ecx, 32(%edi) |
436 | jne L(nequal) |
437 | test %ecx, %ecx |
438 | jz L(equal) |
439 | |
440 | mov 36(%esi), %ecx |
441 | cmp %ecx, 36(%edi) |
442 | jne L(nequal) |
443 | test %ecx, %ecx |
444 | jz L(equal) |
445 | |
446 | mov 40(%esi), %ecx |
447 | cmp %ecx, 40(%edi) |
448 | jne L(nequal) |
449 | test %ecx, %ecx |
450 | jz L(equal) |
451 | |
452 | mov 44(%esi), %ecx |
453 | cmp %ecx, 44(%edi) |
454 | jne L(nequal) |
455 | test %ecx, %ecx |
456 | jz L(equal) |
457 | |
/* Bytes 48..63 as one more unaligned 16-byte step.  */
458 | movdqu 48(%edi), %xmm1 |
459 | movdqu 48(%esi), %xmm2 |
460 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
461 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
462 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
463 | pmovmskb %xmm1, %edx |
464 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
465 | jnz L(less4_double_words_48) |
466 | |
467 | add $64, %esi |
468 | add $64, %edi |
469 | jmp L(continue_16_48) |
470 | |
/* Loop for both pointers 16-byte aligned.  Each pass covers 64 bytes
   as four steps.  %edi is read with aligned loads and %esi is used
   directly as a memory operand.  With %xmm0 all zero before a step,
   the mask in %edx is 0xffff only when all four element pairs are
   equal and none of the %edi elements is zero.  Any other mask leaves
   the sub non-zero and the handler for that offset locates the
   element.  */
471 | .p2align 4 |
472 | L(continue_00_00): |
473 | movdqa (%edi), %xmm1 |
474 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
475 | pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ |
476 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
477 | pmovmskb %xmm1, %edx |
478 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
479 | jnz L(less4_double_words) |
480 | |
481 | movdqa 16(%edi), %xmm3 |
482 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
483 | pcmpeqd 16(%esi), %xmm3 /* compare first 4 double_words for equality */ |
484 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
485 | pmovmskb %xmm3, %edx |
486 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
487 | jnz L(less4_double_words_16) |
488 | |
489 | movdqa 32(%edi), %xmm5 |
490 | pcmpeqd %xmm5, %xmm0 /* Any null double_word? */ |
491 | pcmpeqd 32(%esi), %xmm5 /* compare first 4 double_words for equality */ |
492 | psubb %xmm0, %xmm5 /* packed sub of comparison results*/ |
493 | pmovmskb %xmm5, %edx |
494 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
495 | jnz L(less4_double_words_32) |
496 | |
497 | movdqa 48(%edi), %xmm1 |
498 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
499 | pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */ |
500 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
501 | pmovmskb %xmm1, %edx |
502 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
503 | jnz L(less4_double_words_48) |
504 | |
505 | add $64, %esi |
506 | add $64, %edi |
507 | jmp L(continue_00_00) |
508 | |
/* Lead-in for %edi 16-byte aligned and %esi at an unaligned offset in
   32..47 modulo 64.  One 16-byte step (zero check on the %esi data,
   mask 0xffff means four equal non-zero pairs) moves %esi to an offset
   of 48 or more, where the L(continue_00_48) loop takes over.  */
509 | .p2align 4 |
510 | L(continue_00_32): |
511 | movdqu (%esi), %xmm2 |
512 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
513 | pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ |
514 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
515 | pmovmskb %xmm2, %edx |
516 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
517 | jnz L(less4_double_words) |
518 | |
519 | add $16, %esi |
520 | add $16, %edi |
521 | jmp L(continue_00_48) |
522 | |
/* Lead-in for %edi 16-byte aligned and %esi at an unaligned offset in
   16..31 modulo 64.  Two 16-byte steps (zero check on the %esi data,
   mask 0xffff means four equal non-zero pairs) move %esi to an offset
   of 48 or more, where the L(continue_00_48) loop takes over.  */
523 | .p2align 4 |
524 | L(continue_00_16): |
525 | movdqu (%esi), %xmm2 |
526 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
527 | pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ |
528 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
529 | pmovmskb %xmm2, %edx |
530 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
531 | jnz L(less4_double_words) |
532 | |
533 | movdqu 16(%esi), %xmm2 |
534 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
535 | pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ |
536 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
537 | pmovmskb %xmm2, %edx |
538 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
539 | jnz L(less4_double_words_16) |
540 | |
541 | add $32, %esi |
542 | add $32, %edi |
543 | jmp L(continue_00_48) |
544 | |
/* Lead-in for %edi 16-byte aligned and %esi at an unaligned offset
   below 16 modulo 64.  Three 16-byte steps (zero check on the %esi
   data, mask 0xffff means four equal non-zero pairs) move %esi to an
   offset of 48 or more, where the L(continue_00_48) loop takes
   over.  */
545 | .p2align 4 |
546 | L(continue_00_0): |
547 | movdqu (%esi), %xmm2 |
548 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
549 | pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ |
550 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
551 | pmovmskb %xmm2, %edx |
552 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
553 | jnz L(less4_double_words) |
554 | |
555 | movdqu 16(%esi), %xmm2 |
556 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
557 | pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ |
558 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
559 | pmovmskb %xmm2, %edx |
560 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
561 | jnz L(less4_double_words_16) |
562 | |
563 | movdqu 32(%esi), %xmm2 |
564 | pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ |
565 | pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */ |
566 | psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
567 | pmovmskb %xmm2, %edx |
568 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
569 | jnz L(less4_double_words_32) |
570 | |
571 | add $48, %esi |
572 | add $48, %edi |
573 | jmp L(continue_00_48) |
574 | |
/* Loop for %esi 16-byte aligned and %edi at an unaligned offset of 48
   or more modulo 64.  Each pass covers 64 bytes.  */
575 | .p2align 4 |
576 | L(continue_48_00): |
/* Bytes 0..15: look for a zero element in the aligned %esi block
   first (%xmm0 is expected to be all zero beforehand).  If there is
   one, L(less4_double_words1) finishes element by element with the
   first %edi element already in %eax.  */
577 | pcmpeqd (%esi), %xmm0 |
578 | mov (%edi), %eax |
579 | pmovmskb %xmm0, %ecx |
580 | test %ecx, %ecx |
581 | jnz L(less4_double_words1) |
582 | |
/* No zero among these four %esi elements, so the four compares below
   need no terminator test.  Each cmp computes the %edi element minus
   the %esi element, as L(nequal) expects.  */
583 | cmp (%esi), %eax |
584 | jne L(nequal) |
585 | |
586 | mov 4(%edi), %eax |
587 | cmp 4(%esi), %eax |
588 | jne L(nequal) |
589 | |
590 | mov 8(%edi), %eax |
591 | cmp 8(%esi), %eax |
592 | jne L(nequal) |
593 | |
594 | mov 12(%edi), %eax |
595 | cmp 12(%esi), %eax |
596 | jne L(nequal) |
597 | |
/* Bytes 16..63 as three 16-byte steps.  %edi is read with an unaligned
   load, the aligned %esi is used directly as a memory operand.  The
   zero check is made on the %edi data.  The mask in %edx is 0xffff
   only when all four pairs are equal and non-zero.  */
598 | movdqu 16(%edi), %xmm1 |
599 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
600 | pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ |
601 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
602 | pmovmskb %xmm1, %edx |
603 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
604 | jnz L(less4_double_words_16) |
605 | |
606 | movdqu 32(%edi), %xmm1 |
607 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
608 | pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */ |
609 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
610 | pmovmskb %xmm1, %edx |
611 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
612 | jnz L(less4_double_words_32) |
613 | |
614 | movdqu 48(%edi), %xmm1 |
615 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
616 | pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */ |
617 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
618 | pmovmskb %xmm1, %edx |
619 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
620 | jnz L(less4_double_words_48) |
621 | |
622 | add $64, %esi |
623 | add $64, %edi |
624 | jmp L(continue_48_00) |
625 | |
/* Lead-in for %esi 16-byte aligned and %edi at an unaligned offset in
   32..47 modulo 64.  One 16-byte step (zero check on the %edi data,
   mask 0xffff means four equal non-zero pairs) moves %edi to an offset
   of 48 or more, where the L(continue_48_00) loop takes over.  */
626 | .p2align 4 |
627 | L(continue_32_00): |
628 | movdqu (%edi), %xmm1 |
629 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
630 | pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ |
631 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
632 | pmovmskb %xmm1, %edx |
633 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
634 | jnz L(less4_double_words) |
635 | |
636 | add $16, %esi |
637 | add $16, %edi |
638 | jmp L(continue_48_00) |
639 | |
/* Lead-in for %esi 16-byte aligned and %edi at an unaligned offset in
   16..31 modulo 64.  Two 16-byte steps (zero check on the %edi data,
   mask 0xffff means four equal non-zero pairs) move %edi to an offset
   of 48 or more, where the L(continue_48_00) loop takes over.  */
640 | .p2align 4 |
641 | L(continue_16_00): |
642 | movdqu (%edi), %xmm1 |
643 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
644 | pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ |
645 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
646 | pmovmskb %xmm1, %edx |
647 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
648 | jnz L(less4_double_words) |
649 | |
650 | movdqu 16(%edi), %xmm1 |
651 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
652 | pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ |
653 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
654 | pmovmskb %xmm1, %edx |
655 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
656 | jnz L(less4_double_words_16) |
657 | |
658 | add $32, %esi |
659 | add $32, %edi |
660 | jmp L(continue_48_00) |
661 | |
/* Lead-in for %esi 16-byte aligned and %edi at an unaligned offset
   below 16 modulo 64.  Three 16-byte steps (zero check on the %edi
   data, mask 0xffff means four equal non-zero pairs) move %edi to an
   offset of 48 or more, where the L(continue_48_00) loop takes
   over.  */
662 | .p2align 4 |
663 | L(continue_0_00): |
664 | movdqu (%edi), %xmm1 |
665 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
666 | pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ |
667 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
668 | pmovmskb %xmm1, %edx |
669 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
670 | jnz L(less4_double_words) |
671 | |
672 | movdqu 16(%edi), %xmm1 |
673 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
674 | pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ |
675 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
676 | pmovmskb %xmm1, %edx |
677 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
678 | jnz L(less4_double_words_16) |
679 | |
680 | movdqu 32(%edi), %xmm1 |
681 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
682 | pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */ |
683 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
684 | pmovmskb %xmm1, %edx |
685 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
686 | jnz L(less4_double_words_32) |
687 | |
688 | add $48, %esi |
689 | add $48, %edi |
690 | jmp L(continue_48_00) |
691 | |
/* Lead-in for both pointers at unaligned offsets in 32..47 modulo 64.
   One unaligned 16-byte step (zero check on the %edi data, mask 0xffff
   means four equal non-zero pairs) moves both to offsets of 48 or
   more, where the L(continue_48_48) loop takes over.  */
692 | .p2align 4 |
693 | L(continue_32_32): |
694 | movdqu (%edi), %xmm1 |
695 | movdqu (%esi), %xmm2 |
696 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
697 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
698 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
699 | pmovmskb %xmm1, %edx |
700 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
701 | jnz L(less4_double_words) |
702 | |
703 | add $16, %esi |
704 | add $16, %edi |
705 | jmp L(continue_48_48) |
706 | |
/* Lead-in for both pointers at unaligned offsets in 16..31 modulo 64.
   Two unaligned 16-byte steps (zero check on the %edi data, mask
   0xffff means four equal non-zero pairs) move both to offsets of 48
   or more, where the L(continue_48_48) loop takes over.  */
707 | .p2align 4 |
708 | L(continue_16_16): |
709 | movdqu (%edi), %xmm1 |
710 | movdqu (%esi), %xmm2 |
711 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
712 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
713 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
714 | pmovmskb %xmm1, %edx |
715 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
716 | jnz L(less4_double_words) |
717 | |
718 | movdqu 16(%edi), %xmm3 |
719 | movdqu 16(%esi), %xmm4 |
720 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
721 | pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ |
722 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
723 | pmovmskb %xmm3, %edx |
724 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
725 | jnz L(less4_double_words_16) |
726 | |
727 | add $32, %esi |
728 | add $32, %edi |
729 | jmp L(continue_48_48) |
730 | |
/* Lead-in for both pointers at unaligned offsets below 16 modulo 64.
   Three unaligned 16-byte steps (zero check on the %edi data, mask
   0xffff means four equal non-zero pairs) move both to offsets of 48
   or more, where the L(continue_48_48) loop takes over.  */
731 | .p2align 4 |
732 | L(continue_0_0): |
733 | movdqu (%edi), %xmm1 |
734 | movdqu (%esi), %xmm2 |
735 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
736 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
737 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
738 | pmovmskb %xmm1, %edx |
739 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
740 | jnz L(less4_double_words) |
741 | |
742 | movdqu 16(%edi), %xmm3 |
743 | movdqu 16(%esi), %xmm4 |
744 | pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ |
745 | pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ |
746 | psubb %xmm0, %xmm3 /* packed sub of comparison results*/ |
747 | pmovmskb %xmm3, %edx |
748 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
749 | jnz L(less4_double_words_16) |
750 | |
751 | movdqu 32(%edi), %xmm1 |
752 | movdqu 32(%esi), %xmm2 |
753 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
754 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
755 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
756 | pmovmskb %xmm1, %edx |
757 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
758 | jnz L(less4_double_words_32) |
759 | |
760 | add $48, %esi |
761 | add $48, %edi |
762 | jmp L(continue_48_48) |
763 | |
/* Lead-in for one pointer at an unaligned offset below 16 and the
   other in 16..31 modulo 64.  It is entered from L(continue_0) and
   from L(continue_16), so either pointer may be the lower one.  Two
   unaligned 16-byte steps (zero check on the %edi data, mask 0xffff
   means four equal non-zero pairs) bring the pair to 32..47 and 48 or
   more, where the L(continue_32_48) loop takes over.  */
764 | .p2align 4 |
765 | L(continue_0_16): |
766 | movdqu (%edi), %xmm1 |
767 | movdqu (%esi), %xmm2 |
768 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
769 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
770 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
771 | pmovmskb %xmm1, %edx |
772 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
773 | jnz L(less4_double_words) |
774 | |
775 | movdqu 16(%edi), %xmm1 |
776 | movdqu 16(%esi), %xmm2 |
777 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
778 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
779 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
780 | pmovmskb %xmm1, %edx |
781 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
782 | jnz L(less4_double_words_16) |
783 | |
784 | add $32, %esi |
785 | add $32, %edi |
786 | jmp L(continue_32_48) |
787 | |
/* Lead-in for one pointer at an unaligned offset below 16 and the
   other in 32..47 modulo 64.  It is entered from L(continue_0) and
   from L(continue_32), so either pointer may be the lower one.  One
   unaligned 16-byte step (zero check on the %edi data, mask 0xffff
   means four equal non-zero pairs) brings the pair to 16..31 and 48 or
   more, where the L(continue_16_48) loop takes over.  */
788 | .p2align 4 |
789 | L(continue_0_32): |
790 | movdqu (%edi), %xmm1 |
791 | movdqu (%esi), %xmm2 |
792 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
793 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
794 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
795 | pmovmskb %xmm1, %edx |
796 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
797 | jnz L(less4_double_words) |
798 | |
799 | add $16, %esi |
800 | add $16, %edi |
801 | jmp L(continue_16_48) |
802 | |
/* Lead-in for one pointer at an unaligned offset in 16..31 and the
   other in 32..47 modulo 64.  It is entered from L(continue_16) and
   from L(continue_32), so either pointer may be the lower one.  One
   unaligned 16-byte step (zero check on the %edi data, mask 0xffff
   means four equal non-zero pairs) brings the pair to 32..47 and 48 or
   more, where the L(continue_32_48) loop takes over.  */
803 | .p2align 4 |
804 | L(continue_16_32): |
805 | movdqu (%edi), %xmm1 |
806 | movdqu (%esi), %xmm2 |
807 | pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ |
808 | pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ |
809 | psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
810 | pmovmskb %xmm1, %edx |
811 | sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ |
812 | jnz L(less4_double_words) |
813 | |
814 | add $16, %esi |
815 | add $16, %edi |
816 | jmp L(continue_32_48) |
817 | |
/* Tail handler for L(continue_00_48) and L(continue_48_00).  It is
   entered when the aligned 16-byte block at offset 0 contains a zero
   element, with %eax = first element at %edi.  The first three
   elements are compared one at a time with a terminator test.  If all
   three are equal and non-zero the zero must be the fourth element,
   so after the last compare the result is 0 without a further test.
   Every cmp computes the %edi element minus the %esi element, as
   L(nequal) expects.  */
818 | .p2align 4 |
819 | L(less4_double_words1): |
820 | cmp (%esi), %eax |
821 | jne L(nequal) |
822 | test %eax, %eax |
823 | jz L(equal) |
824 | |
825 | mov 4(%esi), %ecx |
826 | cmp %ecx, 4(%edi) |
827 | jne L(nequal) |
828 | test %ecx, %ecx |
829 | jz L(equal) |
830 | |
831 | mov 8(%esi), %ecx |
832 | cmp %ecx, 8(%edi) |
833 | jne L(nequal) |
834 | test %ecx, %ecx |
835 | jz L(equal) |
836 | |
837 | mov 12(%esi), %ecx |
838 | cmp %ecx, 12(%edi) |
839 | jne L(nequal) |
840 | xor %eax, %eax |
841 | RETURN |
842 | |
/* Tail handler for a 16-byte step at offset 0 whose mask was not
   0xffff.  On entry %edx = mask - 0xffff.  Each element owns four
   mask bits.  The tests on %dl and %dh pick the lowest element whose
   four bits are not all set, that is the first element that differs
   or is a terminator.  That element is compared once more: a
   difference goes to L(nequal) with the flags of the cmp (%edi element
   minus %esi element), otherwise the function returns 0, since %eax
   was cleared on entry.  */
843 | .p2align 4 |
844 | L(less4_double_words): |
845 | xor %eax, %eax |
846 | test %dl, %dl |
847 | jz L(next_two_double_words) |
848 | and $15, %dl |
849 | jz L(second_double_word) |
850 | mov (%esi), %ecx |
851 | cmp %ecx, (%edi) |
852 | jne L(nequal) |
853 | RETURN |
854 | |
855 | .p2align 4 |
856 | L(second_double_word): |
857 | mov 4(%esi), %ecx |
858 | cmp %ecx, 4(%edi) |
859 | jne L(nequal) |
860 | RETURN |
861 | |
/* The first two elements passed, so the candidate is the third or the
   fourth.  */
862 | .p2align 4 |
863 | L(next_two_double_words): |
864 | and $15, %dh |
865 | jz L(fourth_double_word) |
866 | mov 8(%esi), %ecx |
867 | cmp %ecx, 8(%edi) |
868 | jne L(nequal) |
869 | RETURN |
870 | |
871 | .p2align 4 |
872 | L(fourth_double_word): |
873 | mov 12(%esi), %ecx |
874 | cmp %ecx, 12(%edi) |
875 | jne L(nequal) |
876 | RETURN |
877 | |
/* Tail handler for a 16-byte step at offset 16 whose mask was not
   0xffff.  On entry %edx = mask - 0xffff.  Each element owns four
   mask bits.  The tests on %dl and %dh pick the lowest element whose
   four bits are not all set, that is the first element that differs
   or is a terminator.  That element (at offset 16, 20, 24 or 28) is
   compared once more: a difference goes to L(nequal) with the flags
   of the cmp (%edi element minus %esi element), otherwise the
   function returns 0, since %eax was cleared on entry.  */
878 | .p2align 4 |
879 | L(less4_double_words_16): |
880 | xor %eax, %eax |
881 | test %dl, %dl |
882 | jz L(next_two_double_words_16) |
883 | and $15, %dl |
884 | jz L(second_double_word_16) |
885 | mov 16(%esi), %ecx |
886 | cmp %ecx, 16(%edi) |
887 | jne L(nequal) |
888 | RETURN |
889 | |
890 | .p2align 4 |
891 | L(second_double_word_16): |
892 | mov 20(%esi), %ecx |
893 | cmp %ecx, 20(%edi) |
894 | jne L(nequal) |
895 | RETURN |
896 | |
/* The first two elements passed, so the candidate is the third or the
   fourth.  */
897 | .p2align 4 |
898 | L(next_two_double_words_16): |
899 | and $15, %dh |
900 | jz L(fourth_double_word_16) |
901 | mov 24(%esi), %ecx |
902 | cmp %ecx, 24(%edi) |
903 | jne L(nequal) |
904 | RETURN |
905 | |
906 | .p2align 4 |
907 | L(fourth_double_word_16): |
908 | mov 28(%esi), %ecx |
909 | cmp %ecx, 28(%edi) |
910 | jne L(nequal) |
911 | RETURN |
912 | |
/* Tail handler for a 16-byte step at offset 32 whose mask was not
   0xffff.  On entry %edx = mask - 0xffff.  Each element owns four
   mask bits.  The tests on %dl and %dh pick the lowest element whose
   four bits are not all set, that is the first element that differs
   or is a terminator.  That element (at offset 32, 36, 40 or 44) is
   compared once more: a difference goes to L(nequal) with the flags
   of the cmp (%edi element minus %esi element), otherwise the
   function returns 0, since %eax was cleared on entry.  */
913 | .p2align 4 |
914 | L(less4_double_words_32): |
915 | xor %eax, %eax |
916 | test %dl, %dl |
917 | jz L(next_two_double_words_32) |
918 | and $15, %dl |
919 | jz L(second_double_word_32) |
920 | mov 32(%esi), %ecx |
921 | cmp %ecx, 32(%edi) |
922 | jne L(nequal) |
923 | RETURN |
924 | |
925 | .p2align 4 |
926 | L(second_double_word_32): |
927 | mov 36(%esi), %ecx |
928 | cmp %ecx, 36(%edi) |
929 | jne L(nequal) |
930 | RETURN |
931 | |
/* The first two elements passed, so the candidate is the third or the
   fourth.  */
932 | .p2align 4 |
933 | L(next_two_double_words_32): |
934 | and $15, %dh |
935 | jz L(fourth_double_word_32) |
936 | mov 40(%esi), %ecx |
937 | cmp %ecx, 40(%edi) |
938 | jne L(nequal) |
939 | RETURN |
940 | |
941 | .p2align 4 |
942 | L(fourth_double_word_32): |
943 | mov 44(%esi), %ecx |
944 | cmp %ecx, 44(%edi) |
945 | jne L(nequal) |
946 | RETURN |
947 | |
/* Tail handler for a 16-byte step at offset 48 whose mask was not
   0xffff.  On entry %edx = mask - 0xffff.  Each element owns four
   mask bits.  The tests on %dl and %dh pick the lowest element whose
   four bits are not all set, that is the first element that differs
   or is a terminator.  That element (at offset 48, 52, 56 or 60) is
   compared once more: a difference goes to L(nequal) with the flags
   of the cmp (%edi element minus %esi element), otherwise the
   function returns 0, since %eax was cleared on entry.  */
948 | .p2align 4 |
949 | L(less4_double_words_48): |
950 | xor %eax, %eax |
951 | test %dl, %dl |
952 | jz L(next_two_double_words_48) |
953 | and $15, %dl |
954 | jz L(second_double_word_48) |
955 | mov 48(%esi), %ecx |
956 | cmp %ecx, 48(%edi) |
957 | jne L(nequal) |
958 | RETURN |
959 | |
960 | .p2align 4 |
961 | L(second_double_word_48): |
962 | mov 52(%esi), %ecx |
963 | cmp %ecx, 52(%edi) |
964 | jne L(nequal) |
965 | RETURN |
966 | |
/* The first two elements passed, so the candidate is the third or the
   fourth.  */
967 | .p2align 4 |
968 | L(next_two_double_words_48): |
969 | and $15, %dh |
970 | jz L(fourth_double_word_48) |
971 | mov 56(%esi), %ecx |
972 | cmp %ecx, 56(%edi) |
973 | jne L(nequal) |
974 | RETURN |
975 | |
976 | .p2align 4 |
977 | L(fourth_double_word_48): |
978 | mov 60(%esi), %ecx |
979 | cmp %ecx, 60(%edi) |
980 | jne L(nequal) |
981 | RETURN |
982 | |
/* Common exit for a differing element once %esi and %edi have been
   pushed.  It is entered with the flags of a cmp that computed the
   first-string element minus the second-string element.  The mov does
   not change the flags, so jg still sees that signed comparison:
   greater returns 1, otherwise the neg turns the result into -1.  */
983 | .p2align 4 |
984 | L(nequal): |
985 | mov $1, %eax |
986 | jg L(return) |
987 | neg %eax |
988 | RETURN |
989 | |
990 | .p2align 4 |
991 | L(return): |
992 | RETURN |
993 | |
/* Common exit for equal strings once %esi and %edi have been pushed:
   return 0.  */
994 | .p2align 4 |
995 | L(equal): |
996 | xorl %eax, %eax |
997 | RETURN |
998 | |
/* Unwind annotations only, no instructions.  They undo the CFI_PUSH
   pair that RETURN leaves behind - presumably because the code that
   follows runs before %esi and %edi are pushed (TODO confirm).  */
999 | CFI_POP (%edi) |
1000 | CFI_POP (%esi) |
1001 | |
/* Early exit for a difference found in the first four elements, taken
   before ENTRANCE has pushed anything, hence the plain ret.  It is
   entered with the flags of a cmp that computed the first-string
   element minus the second-string element.  The mov does not change
   the flags: signed greater returns 1, otherwise the neg turns the
   result into -1.  */
1002 | .p2align 4 |
1003 | L(neq): |
1004 | mov $1, %eax |
1005 | jg L(neq_bigger) |
1006 | neg %eax |
1007 | |
1008 | L(neq_bigger): |
1009 | ret |
1010 | |
/* Early exit for a terminator found in the first four elements with
   everything before it equal: return 0.  Nothing has been pushed on
   this path either.  */
1011 | .p2align 4 |
1012 | L(eq): |
1013 | xorl %eax, %eax |
1014 | ret |
1015 | |
1016 | END (__wcscmp_sse2) |
1017 | #endif |
1018 | |