/* Optimized memcmp implementation for POWER8/PowerPC64.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		     const char *s2 [r4],
		     size_t size [r5])  */
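
/* Overview: the code below picks a strategy from the length and the
   relative alignment of the two inputs: a byte-at-a-time loop for
   lengths below 8 bytes, 16-byte (QW) vector loops when the inputs can
   be handled with vector loads without risking a read into an unmapped
   page, and 8-byte (DW) integer loops (with shift/merge for mismatched
   alignment) otherwise; see the branches below for the exact conditions.

   For reference only, a minimal C sketch of the semantics this routine
   implements (the sign of the first differing byte, 0 if none).  It is
   not the algorithm used below:

   int simple_memcmp (const void *s1, const void *s2, size_t n)
   {
     const unsigned char *p1 = s1, *p2 = s2;
     while (n-- != 0)
       {
	 if (*p1 != *p2)
	   return *p1 < *p2 ? -1 : 1;
	 p1++;
	 p2++;
       }
     return 0;
   }  */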

#ifndef MEMCMP
# define MEMCMP memcmp
#endif
	.machine power8
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

#define rRTN	r3
#define rSTR1	r3	/* First string arg.  */
#define rSTR2	r4	/* Second string arg.  */
#define rN	r5	/* Max string length.  */
#define rWORD1	r6	/* Current word in s1.  */
#define rWORD2	r7	/* Current word in s2.  */
#define rWORD3	r8	/* Next word in s1.  */
#define rWORD4	r9	/* Next word in s2.  */
#define rWORD5	r10	/* Next word in s1.  */
#define rWORD6	r11	/* Next word in s2.  */

#define rOFF8	r20	/* 8 bytes offset.  */
#define rOFF16	r21	/* 16 bytes offset.  */
#define rOFF24	r22	/* 24 bytes offset.  */
#define rOFF32	r23	/* 32 bytes offset.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rSHR	r28	/* Unaligned shift right count.  */
#define rSHL	r29	/* Unaligned shift left count.  */
#define rWORD7	r30	/* Next word in s1.  */
#define rWORD8	r31	/* Next word in s2.  */

#define rWORD8SAVE	(-8)
#define rWORD7SAVE	(-16)
#define rOFF8SAVE	(-24)
#define rOFF16SAVE	(-32)
#define rOFF24SAVE	(-40)
#define rOFF32SAVE	(-48)
#define rSHRSAVE	(-56)
#define rSHLSAVE	(-64)
#define rWORD8SHIFTSAVE	(-72)
#define rWORD2SHIFTSAVE	(-80)
#define rWORD4SHIFTSAVE	(-88)
#define rWORD6SHIFTSAVE	(-96)

#ifdef __LITTLE_ENDIAN__
# define LD	ldbrx
#else
# define LD	ldx
#endif
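
/* On little-endian, LD is a byte-reversed load (ldbrx), so each
   doubleword is compared as if it were stored big-endian; an unsigned
   cmpld on the loaded values then orders the strings by their first
   differing byte, which is exactly what memcmp needs.  */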

	xor	r10, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 8
	clrldi.	r0, r10, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	bne	L(unalignedqw)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.  */

	.align	4
L(samealignment):
	or	r11, rSTR2, rSTR1
	clrldi.	r11, r11, 60
	beq	L(qw_align)
	/* Try to align to QW else proceed to DW loop.  */
	clrldi.	r10, r10, 60
	bne	L(DW)
	/* For the difference to reach QW alignment, load as DW.  */
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	subfic	r10, r12, 8
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	sldi	r9, r10, 3
	subfic	r9, r9, 64
	sld	rWORD1, rWORD1, r9
	sld	rWORD2, rWORD2, r9
	cmpld	cr6, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(ret_diff)
	subf	rN, r10, rN

	cmpld	cr6, r11, r12
	bgt	cr6, L(qw_align)
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpld	cr6, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(different)
	cmpldi	cr6, rN, 8
	ble	cr6, L(zeroLength)
	addi	rN, rN, -8
	/* Now both rSTR1 and rSTR2 are aligned to QW.  */
	.align	4
L(qw_align):
	vspltisb	v0, 0
	srdi.	r6, rN, 6
	li	r8, 16
	li	r10, 32
	li	r11, 48
	ble	cr0, L(lessthan64)
	mtctr	r6
	vspltisb	v8, 0
	vspltisb	v6, 0
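	/* v6/v8 were just zeroed, so the first vcmpequb in the loop below
	   compares equal and falls through; thereafter each compare checks
	   a pair of vectors loaded one step earlier, overlapping the
	   compare latency with the next loads.  */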
	/* Aligned vector loop.  */
	.align	4
L(aligned_loop):
	lvx	v4, 0, rSTR1
	lvx	v5, 0, rSTR2
	vcmpequb.	v7, v6, v8
	bnl	cr6, L(different3)
	lvx	v6, rSTR1, r8
	lvx	v8, rSTR2, r8
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	lvx	v4, rSTR1, r10
	lvx	v5, rSTR2, r10
	vcmpequb.	v7, v6, v8
	bnl	cr6, L(different3)
	lvx	v6, rSTR1, r11
	lvx	v8, rSTR2, r11
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	addi	rSTR1, rSTR1, 64
	addi	rSTR2, rSTR2, 64
	bdnz	L(aligned_loop)
	vcmpequb.	v7, v6, v8
	bnl	cr6, L(different3)
	clrldi	rN, rN, 58
	/* Handle remainder for aligned loop.  */
	.align	4
L(lessthan64):
	mr	r9, rSTR1
	cmpdi	cr6, rN, 0
	li	rSTR1, 0
	blelr	cr6
	lvx	v4, 0, r9
	lvx	v5, 0, rSTR2
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r8
	lvx	v5, rSTR2, r8
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r10
	lvx	v5, rSTR2, r10
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r11
	lvx	v5, rSTR2, r11
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	blr

	/* Calculate and return the difference.  */
	.align	4
L(different1):
	cmpdi	cr6, rN, 16
	bge	cr6, L(different2)
	/* Discard unwanted bytes.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v1, 0, rN
	vperm	v4, v4, v0, v1
	vperm	v5, v5, v0, v1
#else
	lvsl	v1, 0, rN
	vperm	v4, v0, v4, v1
	vperm	v5, v0, v5, v1
#endif
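	/* The lvsr/lvsl and vperm above pull zero bytes from v0 into the
	   lanes at and beyond the remaining length rN, so stale bytes in
	   the last pair of vectors cannot produce a spurious difference
	   in the compare below.  */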
	vcmpequb.	v7, v4, v5
	li	rRTN, 0
	bltlr	cr6
	.align	4
L(different2):
#ifdef __LITTLE_ENDIAN__
	/* Reverse bytes for direct comparison.  */
	lvsl	v10, r0, r0
	vspltisb	v8, 15
	vsububm	v9, v8, v10
	vperm	v4, v4, v0, v9
	vperm	v5, v5, v0, v9
#endif
	mfvrd	r7, v4
	mfvrd	r9, v5
	cmpld	cr6, r7, r9
	bne	cr6, L(ret_diff)
	/* Difference in second DW.  */
	vsldoi	v4, v4, v4, 8
	vsldoi	v5, v5, v5, 8
	mfvrd	r7, v4
	mfvrd	r9, v5
	cmpld	cr6, r7, r9
L(ret_diff):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(different3):
#ifdef __LITTLE_ENDIAN__
	/* Reverse bytes for direct comparison.  */
	vspltisb	v9, 15
	lvsl	v10, r0, r0
	vsububm	v9, v9, v10
	vperm	v6, v6, v0, v9
	vperm	v8, v8, v0, v9
#endif
	mfvrd	r7, v6
	mfvrd	r9, v8
	cmpld	cr6, r7, r9
	bne	cr6, L(ret_diff)
	/* Difference in second DW.  */
	vsldoi	v6, v6, v6, 8
	vsldoi	v8, v8, v8, 8
	mfvrd	r7, v6
	mfvrd	r9, v8
	cmpld	cr6, r7, r9
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

	.align	4
L(different):
	cmpldi	cr7, rN, 8
	bgt	cr7, L(end)
	/* Skip unwanted bytes.  */
	sldi	r8, rN, 3
	subfic	r8, r8, 64
	srd	rWORD1, rWORD1, r8
	srd	rWORD2, rWORD2, r8
	cmpld	cr6, rWORD1, rWORD2
	li	rRTN, 0
	beqlr	cr6
L(end):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

	.align	4
L(unalignedqw):
	/* Proceed to the DW unaligned loop if there is a chance of a
	   page cross.  */
	rldicl	r9, rSTR1, 0, 52
	add	r9, r9, rN
	cmpldi	cr0, r9, 4096-16
	bgt	cr0, L(unaligned)
	rldicl	r9, rSTR2, 0, 52
	add	r9, r9, rN
	cmpldi	cr0, r9, 4096-16
	bgt	cr0, L(unaligned)
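	/* r9 = (address & 0xfff) + length for each string; if that reaches
	   into the last 16 bytes of a 4K page, the vector code could read
	   past the data into the next page, so the DW unaligned path is
	   used instead.  */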
	li	r0, 0
	li	r8, 16
	vspltisb	v0, 0
	/* Check if rSTR1 is aligned to QW.  */
	andi.	r11, rSTR1, 0xF
	beq	L(s1_align)

	/* Compare 16B and align S1 to QW.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v10, 0, rSTR1	/* Compute mask.  */
	lvsr	v6, 0, rSTR2	/* Compute mask.  */
#else
	lvsl	v10, 0, rSTR1	/* Compute mask.  */
	lvsl	v6, 0, rSTR2	/* Compute mask.  */
#endif
	lvx	v5, 0, rSTR2
	lvx	v9, rSTR2, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v9, v5, v6
#else
	vperm	v5, v5, v9, v6
#endif
	lvx	v4, 0, rSTR1
	lvx	v9, rSTR1, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v4, v9, v4, v10
#else
	vperm	v4, v4, v9, v10
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	cmpldi	cr6, rN, 16
	ble	cr6, L(zeroLength)
	subfic	r11, r11, 16
	subf	rN, r11, rN
	add	rSTR1, rSTR1, r11
	add	rSTR2, rSTR2, r11

	/* As s1 is QW aligned, prepare for the unaligned loop.  */
	.align	4
L(s1_align):
#ifdef __LITTLE_ENDIAN__
	lvsr	v6, 0, rSTR2
#else
	lvsl	v6, 0, rSTR2
#endif
	lvx	v5, 0, rSTR2
	srdi.	r6, rN, 6
	li	r10, 32
	li	r11, 48
	ble	cr0, L(lessthan64_unalign)
	mtctr	r6
	li	r9, 64
	/* Unaligned vector loop.  */
	.align	4
L(unalign_qwloop):
	lvx	v4, 0, rSTR1
	lvx	v10, rSTR2, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	lvx	v4, rSTR1, r8
	lvx	v10, rSTR2, r10
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	lvx	v4, rSTR1, r10
	lvx	v10, rSTR2, r11
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	lvx	v4, rSTR1, r11
	lvx	v10, rSTR2, r9
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different2)
	vor	v5, v10, v10
	addi	rSTR1, rSTR1, 64
	addi	rSTR2, rSTR2, 64
	bdnz	L(unalign_qwloop)
	clrldi	rN, rN, 58
	/* Handle remainder for unaligned loop.  */
	.align	4
L(lessthan64_unalign):
	mr	r9, rSTR1
	cmpdi	cr6, rN, 0
	li	rSTR1, 0
	blelr	cr6
	lvx	v4, 0, r9
	lvx	v10, rSTR2, r8
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	vor	v5, v10, v10
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r8
	lvx	v10, rSTR2, r10
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	vor	v5, v10, v10
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r10
	lvx	v10, rSTR2, r11
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	vor	v5, v10, v10
	addi	rN, rN, -16

	cmpdi	cr6, rN, 0
	blelr	cr6
	lvx	v4, r9, r11
	addi	r11, r11, 16
	lvx	v10, rSTR2, r11
#ifdef __LITTLE_ENDIAN__
	vperm	v5, v10, v5, v6
#else
	vperm	v5, v5, v10, v6
#endif
	vcmpequb.	v7, v5, v4
	bnl	cr6, L(different1)
	blr

/* Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
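/* For example, with rSTR1 ending in 0b101 (offset 5 within its DW),
   both pointers are rounded down by 5, rN is increased by 5, and the
   first pair of DWs is shifted left by 5 * 8 = 40 bits so the 5 bytes
   that precede the true start of the strings drop out of the compare.
   The 32-byte loop versioning below is then computed from the adjusted
   length.  */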
	.align	4
L(DW):
	std	rWORD8, rWORD8SAVE(r1)
	std	rWORD7, rWORD7SAVE(r1)
	std	rOFF8, rOFF8SAVE(r1)
	std	rOFF16, rOFF16SAVE(r1)
	std	rOFF24, rOFF24SAVE(r1)
	std	rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)

	li	rOFF8,8
	li	rOFF16,16
	li	rOFF24,24
	li	rOFF32,32
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32.  */
	andi.	r12, rN, 24	/* Get the DW remainder.  */
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8.  */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16.  */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24.  */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(dPs4):
	mtctr	r0
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder.  */
	srdi	r0, rN, 5	/* Divide by 32.  */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8.  */
	.align	4
L(dP1):
	mtctr	r0
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 16.  */
	.align	4
L(dP2):
	mtctr	r0
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
	.align	4
L(dP2x):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 24.  */
	.align	4
L(dP3):
	mtctr	r0
	LD	rWORD3, 0, rSTR1
	LD	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(dP4):
	mtctr	r0
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4.  */
/* This is the primary loop.  */
	.align	4
L(dLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
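/* For example, with 3 bytes left, rN now holds 64 - 3 * 8 = 40, and the
   srd below discards the 40 low-order bits of each (byte-reversed on LE)
   doubleword, leaving only the 3 remaining bytes in the compare.  */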
L(d00):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

	.align	4
L(dLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latency (load to
   compare to conditional branch) is 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next loop.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
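
/* The counter starts at rN, so for lengths 1, 2 and 3 the bdz exits
   below fire before the main loop is entered: L(b11), L(b12) and L(b13)
   finish the compares that are still pending for those short lengths.  */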

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz	L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all pending bytes have been
   tested), so we must complete the pending operations before
   returning.  */
L(b1i):
	bne	cr7, L(bLcr7)
	bne	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne	cr6, L(bLcr6)
	bne	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne	cr1, L(bLcr1)
	bne	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne	cr7, L(bx12)
	bne	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr

	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is double word
   aligned and we can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not already DW aligned yet.
   So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
L(unaligned):
	std	rWORD8, rWORD8SAVE(r1)
	std	rWORD7, rWORD7SAVE(r1)
	std	rOFF8, rOFF8SAVE(r1)
	std	rOFF16, rOFF16SAVE(r1)
	std	rOFF24, rOFF24SAVE(r1)
	std	rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)
	li	rOFF8,8
	li	rOFF16,16
	li	rOFF24,24
	li	rOFF32,32
	std	rSHL, rSHLSAVE(r1)
	cfi_offset(rSHL, rSHLSAVE)
	clrldi	rSHL, rSTR2, 61
	beq	cr6, L(duzeroLength)
	std	rSHR, rSHRSAVE(r1)
	cfi_offset(rSHR, rSHRSAVE)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before that DW that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32.  */
	andi.	r12, rN, 24	/* Get the DW remainder.  */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
	LD	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	sld	rWORD8, rWORD8, rSHL

L(dus0):
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8.  */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16.  */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24.  */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(duPs4):
	mtctr	r0
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	srdi	r0, rN, 5	/* Divide by 32.  */
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	andi.	r12, rN, 24	/* Get the DW remainder.  */
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	sldi	rSHL, rSHL, 3
	LD	rWORD6, 0, rSTR2
	LD	rWORD8, rOFF8, rSTR2
	addi	rSTR2, rSTR2, 8
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8.  */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
	LD	rWORD7, 0, rSTR1
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16.  */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
	LD	rWORD5, 0, rSTR1
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24.  */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
	LD	rWORD3, 0, rSTR1
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0.  */
	.align	4
L(duP4):
	mtctr	r0
	srd	r0, rWORD8, rSHR
	LD	rWORD1, 0, rSTR1
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz	L(du24)	/* Adjust CTR as we start with +4.  */
/* This is the primary loop.  */
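/* Each rSTR2 doubleword used in the compares is assembled from two
   loads: the current doubleword shifted right by rSHR, OR-ed with the
   previous doubleword's leftover bits, which were shifted left by rSHL
   and parked in the corresponding *_SHIFT register.  */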
	.align	4
L(duLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	cmpld	cr7, rWORD1, rWORD2
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz	L(duLoop)

L(duL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
	LD	rWORD1, rOFF8, rSTR1
	ld	rWORD8, rWORD8SAVE(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, rWORD7SAVE(r1)
	ld	rSHL, rSHLSAVE(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, rSHRSAVE(r1)
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(duLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)

	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dureturn29):
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	blr

L(duzeroLength):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)
