/* Optimized memcmp implementation for PowerPC32.
   Copyright (C) 2003-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5]) */
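
/* For reference: the routine below implements memcmp semantics, i.e. the
   sign of the first differing byte pair, or 0 when all "size" bytes match.
   A minimal byte-at-a-time C sketch (illustrative only, not part of the
   build; the name simple_memcmp is made up):

     #include <stddef.h>

     static int
     simple_memcmp (const void *p1, const void *p2, size_t n)
     {
       const unsigned char *s1 = p1, *s2 = p2;
       for (; n != 0; n--, s1++, s2++)
         if (*s1 != *s2)
           return *s1 < *s2 ? -1 : 1;
       return 0;
     }

   The optimized code gets the same answer a word (and, for short or
   unaligned inputs, a byte) at a time.  */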

	.machine power4
EALIGN (memcmp, 4, 0)
	CALL_MCOUNT

#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
#define rN r5 /* max string length */
#define rWORD1 r6 /* current word in s1 */
#define rWORD2 r7 /* current word in s2 */
#define rWORD3 r8 /* next word in s1 */
#define rWORD4 r9 /* next word in s2 */
#define rWORD5 r10 /* next word in s1 */
#define rWORD6 r11 /* next word in s2 */
#define rWORD7 r30 /* next word in s1 */
#define rWORD8 r31 /* next word in s2 */

	xor r0, rSTR2, rSTR1
	cmplwi cr6, rN, 0
	cmplwi cr1, rN, 12
	clrlwi. r0, r0, 30
	clrlwi r12, rSTR1, 30
	cmplwi cr5, r12, 0
	beq- cr6, L(zeroLength)
	dcbt 0, rSTR1
	dcbt 0, rSTR2
/* If less than 12 bytes, use the unaligned byte loop. */
	blt cr1, L(bytealigned)
	stwu 1, -64(r1)
	cfi_adjust_cfa_offset(64)
	stw rWORD8, 48(r1)
	stw rWORD7, 44(r1)
	cfi_offset(rWORD8, (48-64))
	cfi_offset(rWORD7, (44-64))
	bne L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes. r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0. If r12 == 0 then we are already word
   aligned and can perform the word aligned loop.

   Otherwise we know the two strings have the same alignment (but are not
   yet word aligned). So we force the string addresses to the next lower
   word boundary and special case this first word using shift left to
   eliminate bits preceding the first byte. Since we want to join the
   normal (word aligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first word. This ensures that the loop count is
   correct and the first word (shifted) is in the expected register pair. */
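
/* Roughly, the first-word setup above and at L(samealignment) below does
   the following (illustrative C sketch only, not part of the build;
   assumes the big-endian lwz paths and 32-bit words; names are
   placeholders):

     unsigned long off = (unsigned long) s1 & 3;             // r12
     const unsigned int *w1 = (const void *) ((unsigned long) s1 - off);
     const unsigned int *w2 = (const void *) ((unsigned long) s2 - off);
     n += off;                          // count the bytes we shift away
     unsigned int first1 = w1[0] << (off * 8);  // drop bytes before start
     unsigned int first2 = w2[0] << (off * 8);

   first1/first2 stand in for the shifted pair compared before entering
   the word loop.  */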
	.align 4
L(samealignment):
	clrrwi rSTR1, rSTR1, 2
	clrrwi rSTR2, rSTR2, 2
	beq cr5, L(Waligned)
	add rN, rN, r12
	slwi rWORD6, r12, 3
	srwi r0, rN, 4 /* Divide by 16 */
	andi. r12, rN, 12 /* Get the word remainder */
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 0(rSTR1)
	lwz rWORD2, 0(rSTR2)
#endif
	cmplwi cr1, r12, 8
	cmplwi cr7, rN, 16
	clrlwi rN, rN, 30
	beq L(dPs4)
	mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
	bgt cr1, L(dPs3)
	beq cr1, L(dPs2)

/* Remainder is 4 */
	.align 3
L(dsP1):
	slw rWORD5, rWORD1, rWORD6
	slw rWORD6, rWORD2, rWORD6
	cmplw cr5, rWORD5, rWORD6
	blt cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway. */
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 4(rSTR1)
	lwz rWORD2, 4(rSTR2)
#endif
	cmplw cr7, rWORD1, rWORD2
	b L(dP1e)
/* Remainder is 8 */
	.align 4
L(dPs2):
	slw rWORD5, rWORD1, rWORD6
	slw rWORD6, rWORD2, rWORD6
	cmplw cr6, rWORD5, rWORD6
	blt cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway. */
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	lwbrx rWORD8, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD7, 4(rSTR1)
	lwz rWORD8, 4(rSTR2)
#endif
	cmplw cr5, rWORD7, rWORD8
	b L(dP2e)
/* Remainder is 12 */
	.align 4
L(dPs3):
	slw rWORD3, rWORD1, rWORD6
	slw rWORD4, rWORD2, rWORD6
	cmplw cr1, rWORD3, rWORD4
	b L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align 4
L(dPs4):
	mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
	slw rWORD1, rWORD1, rWORD6
	slw rWORD2, rWORD2, rWORD6
	cmplw cr7, rWORD1, rWORD2
	b L(dP4e)

/* At this point we know both strings are word aligned and the
   compare length is at least 8 bytes. */
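
/* The dispatch below is plain loop versioning. In rough C terms
   (illustrative only, not part of the build):

     size_t iters = n >> 4;   // full 16-byte (4-word) passes of L(dLoop)
     size_t wrem  = n & 12;   // whole words handled before entering the loop
     size_t brem  = n & 3;    // 1-3 trailing bytes, finished at L(d00)

   wrem selects the entry point (L(dP1)/L(dP2)/L(dP3)/L(dP4)) so that the
   loop always finds the most recent compare results in the expected
   condition-register fields.  */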
	.align 4
L(Waligned):
	andi. r12, rN, 12 /* Get the word remainder */
	srwi r0, rN, 4 /* Divide by 16 */
	cmplwi cr1, r12, 8
	cmplwi cr7, rN, 16
	clrlwi rN, rN, 30
	beq L(dP4)
	bgt cr1, L(dP3)
	beq cr1, L(dP2)

/* Remainder is 4 */
	.align 4
L(dP1):
	mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers. This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path. The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used. */
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	lwbrx rWORD6, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD5, 0(rSTR1)
	lwz rWORD6, 0(rSTR2)
#endif
	cmplw cr5, rWORD5, rWORD6
	blt cr7, L(dP1x)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 4(rSTR1)
	lwz rWORD2, 4(rSTR2)
#endif
	cmplw cr7, rWORD1, rWORD2
L(dP1e):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	lwbrx rWORD4, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD3, 8(rSTR1)
	lwz rWORD4, 8(rSTR2)
#endif
	cmplw cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	lwbrx rWORD6, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD5, 12(rSTR1)
	lwz rWORD6, 12(rSTR2)
#endif
	cmplw cr6, rWORD5, rWORD6
	bne cr5, L(dLcr5x)
	bne cr7, L(dLcr7x)

#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	lwbrx rWORD8, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwzu rWORD7, 16(rSTR1)
	lwzu rWORD8, 16(rSTR2)
#endif
	bne cr1, L(dLcr1)
	cmplw cr5, rWORD7, rWORD8
	bdnz L(dLoop)
	bne cr6, L(dLcr6)
	lwz rWORD7, 44(r1)
	lwz rWORD8, 48(r1)
	.align 3
L(dP1x):
	slwi. r12, rN, 3
	bne cr5, L(dLcr5x)
	subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
	addi 1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne L(d00)
	li rRTN, 0
	blr

/* Remainder is 8 */
	.align 4
	cfi_adjust_cfa_offset(64)
L(dP2):
	mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	lwbrx rWORD6, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD5, 0(rSTR1)
	lwz rWORD6, 0(rSTR2)
#endif
	cmplw cr6, rWORD5, rWORD6
	blt cr7, L(dP2x)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	lwbrx rWORD8, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD7, 4(rSTR1)
	lwz rWORD8, 4(rSTR2)
#endif
	cmplw cr5, rWORD7, rWORD8
L(dP2e):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 8(rSTR1)
	lwz rWORD2, 8(rSTR2)
#endif
	cmplw cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	lwbrx rWORD4, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD3, 12(rSTR1)
	lwz rWORD4, 12(rSTR2)
#endif
	cmplw cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#endif
	bne cr6, L(dLcr6)
	bne cr5, L(dLcr5)
	b L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare), so we want
   to use only volatile registers and avoid restoring non-volatile
   registers. */
	.align 4
L(dP2x):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	lwbrx rWORD4, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD3, 4(rSTR1)
	lwz rWORD4, 4(rSTR2)
#endif
	cmplw cr1, rWORD3, rWORD4
	slwi. r12, rN, 3
	bne cr6, L(dLcr6x)
#ifndef __LITTLE_ENDIAN__
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#endif
	bne cr1, L(dLcr1x)
	subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
	addi 1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne L(d00)
	li rRTN, 0
	blr

/* Remainder is 12 */
	.align 4
	cfi_adjust_cfa_offset(64)
L(dP3):
	mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	lwbrx rWORD4, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD3, 0(rSTR1)
	lwz rWORD4, 0(rSTR2)
#endif
	cmplw cr1, rWORD3, rWORD4
L(dP3e):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	lwbrx rWORD6, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD5, 4(rSTR1)
	lwz rWORD6, 4(rSTR2)
#endif
	cmplw cr6, rWORD5, rWORD6
	blt cr7, L(dP3x)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	lwbrx rWORD8, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD7, 8(rSTR1)
	lwz rWORD8, 8(rSTR2)
#endif
	cmplw cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 12(rSTR1)
	lwz rWORD2, 12(rSTR2)
#endif
	cmplw cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi rSTR1, rSTR1, 8
	addi rSTR2, rSTR2, 8
#endif
	bne cr1, L(dLcr1)
	bne cr6, L(dLcr6)
	b L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), so we want
   to use only volatile registers and avoid restoring non-volatile
   registers. */
	.align 4
L(dP3x):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 8(rSTR1)
	lwz rWORD2, 8(rSTR2)
#endif
	cmplw cr7, rWORD1, rWORD2
	slwi. r12, rN, 3
	bne cr1, L(dLcr1x)
#ifndef __LITTLE_ENDIAN__
	addi rSTR1, rSTR1, 8
	addi rSTR2, rSTR2, 8
#endif
	bne cr6, L(dLcr6x)
	subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
	bne cr7, L(dLcr7x)
	addi 1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne L(d00)
	li rRTN, 0
	blr

/* Count is a multiple of 16, remainder is 0 */
	.align 4
	cfi_adjust_cfa_offset(64)
L(dP4):
	mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 0(rSTR1)
	lwz rWORD2, 0(rSTR2)
#endif
	cmplw cr7, rWORD1, rWORD2
L(dP4e):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	lwbrx rWORD4, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD3, 4(rSTR1)
	lwz rWORD4, 4(rSTR2)
#endif
	cmplw cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	lwbrx rWORD6, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD5, 8(rSTR1)
	lwz rWORD6, 8(rSTR2)
#endif
	cmplw cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	lwbrx rWORD8, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwzu rWORD7, 12(rSTR1)
	lwzu rWORD8, 12(rSTR2)
#endif
	cmplw cr5, rWORD7, rWORD8
	bne cr7, L(dLcr7)
	bne cr1, L(dLcr1)
	bdz- L(d24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
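
/* Shape of the unrolled loop (illustrative C only, not part of the build;
   each pass covers 16 bytes and every load pair is tested one step later,
   which is why four cr fields rotate through the loop):

     do
       {
         unsigned int a0 = w1[0], b0 = w2[0];
         unsigned int a1 = w1[1], b1 = w2[1];
         unsigned int a2 = w1[2], b2 = w2[2];
         unsigned int a3 = w1[3], b3 = w2[3];
         if (a0 != b0 || a1 != b1 || a2 != b2 || a3 != b3)
           break;               // the asm branches per pair via cr7/cr1/cr6/cr5
         w1 += 4;
         w2 += 4;
       }
     while (--iters != 0);
*/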
	.align 4
L(dLoop):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 4(rSTR1)
	lwz rWORD2, 4(rSTR2)
#endif
	cmplw cr1, rWORD3, rWORD4
	bne cr6, L(dLcr6)
L(dLoop1):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	lwbrx rWORD4, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD3, 8(rSTR1)
	lwz rWORD4, 8(rSTR2)
#endif
	cmplw cr6, rWORD5, rWORD6
	bne cr5, L(dLcr5)
L(dLoop2):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	lwbrx rWORD6, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD5, 12(rSTR1)
	lwz rWORD6, 12(rSTR2)
#endif
	cmplw cr5, rWORD7, rWORD8
	bne cr7, L(dLcr7)
L(dLoop3):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	lwbrx rWORD8, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwzu rWORD7, 16(rSTR1)
	lwzu rWORD8, 16(rSTR2)
#endif
	bne- cr1, L(dLcr1)
	cmplw cr7, rWORD1, rWORD2
	bdnz+ L(dLoop)

L(dL4):
	cmplw cr1, rWORD3, rWORD4
	bne cr6, L(dLcr6)
	cmplw cr6, rWORD5, rWORD6
	bne cr5, L(dLcr5)
	cmplw cr5, rWORD7, rWORD8
L(d44):
	bne cr7, L(dLcr7)
L(d34):
	bne cr1, L(dLcr1)
L(d24):
	bne cr6, L(dLcr6)
L(d14):
	slwi. r12, rN, 3
	bne cr5, L(dLcr5)
L(d04):
	lwz rWORD7, 44(r1)
	lwz rWORD8, 48(r1)
	addi 1, 1, 64
	cfi_adjust_cfa_offset(-64)
	subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
	beq L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare. Since
   we are aligned it is safe to load the whole word, and use
   shift right to eliminate bits beyond the compare length. */
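
/* In C terms the tail handling at L(d00) amounts to (illustrative only,
   not part of the build; rem is the 1-3 remaining bytes, loads shown as
   the big-endian variant):

     unsigned int a = w1[1], b = w2[1];       // safe: both word aligned
     unsigned int sh = 32 - 8 * rem;          // the subfic above
     return (int) ((a >> sh) - (b >> sh));    // bytes past the end shifted out
*/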
L(d00):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 4(rSTR1)
	lwz rWORD2, 4(rSTR2)
#endif
	srw rWORD1, rWORD1, rN
	srw rWORD2, rWORD2, rN
	sub rRTN, rWORD1, rWORD2
	blr

	.align 4
	cfi_adjust_cfa_offset(64)
L(dLcr7):
	lwz rWORD7, 44(r1)
	lwz rWORD8, 48(r1)
L(dLcr7x):
	li rRTN, 1
	addi 1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr cr7
	li rRTN, -1
	blr
	.align 4
	cfi_adjust_cfa_offset(64)
L(dLcr1):
	lwz rWORD7, 44(r1)
	lwz rWORD8, 48(r1)
L(dLcr1x):
	li rRTN, 1
	addi 1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr cr1
	li rRTN, -1
	blr
	.align 4
	cfi_adjust_cfa_offset(64)
L(dLcr6):
	lwz rWORD7, 44(r1)
	lwz rWORD8, 48(r1)
L(dLcr6x):
	li rRTN, 1
	addi 1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr cr6
	li rRTN, -1
	blr
	.align 4
	cfi_adjust_cfa_offset(64)
L(dLcr5):
	lwz rWORD7, 44(r1)
	lwz rWORD8, 48(r1)
L(dLcr5x):
	li rRTN, 1
	addi 1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr cr5
	li rRTN, -1
	blr

	.align 4
L(bytealigned):
	mtctr rN /* Power4 wants mtctr 1st in dispatch group */

/* We need to prime this loop. This loop is swing modulo scheduled
   to avoid pipe delays. The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles. In this loop each
   dispatch group ends in a branch and takes 1 cycle. Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration. */
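
/* Logically the byte loop below is just the following (illustrative C
   only, not part of the build); it is unrolled by three and software
   pipelined, so each group loads one byte pair while branching on a pair
   loaded earlier:

     while (cnt-- != 0)
       {
         unsigned char c1 = *s1++, c2 = *s2++;
         if (c1 != c2)
           return c1 < c2 ? -1 : 1;
       }
     return 0;
*/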

	lbz rWORD1, 0(rSTR1)
	lbz rWORD2, 0(rSTR2)
	bdz- L(b11)
	cmplw cr7, rWORD1, rWORD2
	lbz rWORD3, 1(rSTR1)
	lbz rWORD4, 1(rSTR2)
	bdz- L(b12)
	cmplw cr1, rWORD3, rWORD4
	lbzu rWORD5, 2(rSTR1)
	lbzu rWORD6, 2(rSTR2)
	bdz- L(b13)
	.align 4
L(bLoop):
	lbzu rWORD1, 1(rSTR1)
	lbzu rWORD2, 1(rSTR2)
	bne- cr7, L(bLcr7)

	cmplw cr6, rWORD5, rWORD6
	bdz- L(b3i)

	lbzu rWORD3, 1(rSTR1)
	lbzu rWORD4, 1(rSTR2)
	bne- cr1, L(bLcr1)

	cmplw cr7, rWORD1, rWORD2
	bdz- L(b2i)

	lbzu rWORD5, 1(rSTR1)
	lbzu rWORD6, 1(rSTR2)
	bne- cr6, L(bLcr6)

	cmplw cr1, rWORD3, rWORD4
	bdnz+ L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes. But we must avoid overrunning the length (in the CTR) to
   prevent these speculative loads from causing a segfault. In that
   case the loop will exit early (before all pending bytes are
   tested), so we must complete the pending compares before
   returning. */
L(b1i):
	bne- cr7, L(bLcr7)
	bne- cr1, L(bLcr1)
	b L(bx56)
	.align 4
L(b2i):
	bne- cr6, L(bLcr6)
	bne- cr7, L(bLcr7)
	b L(bx34)
	.align 4
L(b3i):
	bne- cr1, L(bLcr1)
	bne- cr6, L(bLcr6)
	b L(bx12)
	.align 4
L(bLcr7):
	li rRTN, 1
	bgtlr cr7
	li rRTN, -1
	blr
L(bLcr1):
	li rRTN, 1
	bgtlr cr1
	li rRTN, -1
	blr
L(bLcr6):
	li rRTN, 1
	bgtlr cr6
	li rRTN, -1
	blr

L(b13):
	bne- cr7, L(bx12)
	bne- cr1, L(bx34)
L(bx56):
	sub rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne- cr7, L(bx12)
L(bx34):
	sub rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub rRTN, rWORD1, rWORD2
	blr
	.align 4
L(zeroLength):
	li rRTN, 0
	blr

	.align 4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes. r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0. If r12 == 0 then rSTR1 is word aligned and we can
   perform the Wunaligned loop.

   Otherwise we know that rSTR1 is not yet word aligned.
   So we can force the string addresses to the next lower word
   boundary and special case this first word using shift left to
   eliminate bits preceding the first byte. Since we want to join the
   normal (Wunaligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first W. This ensures that the loop count is
   correct and the first W (shifted) is in the expected register pair. */
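
/* The key idea of the unaligned path: rSTR2 is read a word at a time from
   word-aligned addresses, and each logical rSTR2 word is reassembled from
   two adjacent loads before being compared with an aligned rSTR1 word.
   Illustrative C sketch only (shl/shr are the bit shift counts set up
   below, shl + shr == 32; prev/next are consecutive aligned rSTR2 words):

     unsigned int merged = (prev << shl) | (next >> shr);
     // merged is what the loop compares against the aligned rSTR1 word
*/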
#define rSHL r29 /* Unaligned shift left count. */
#define rSHR r28 /* Unaligned shift right count. */
#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
	cfi_adjust_cfa_offset(64)
L(unaligned):
	stw rSHL, 40(r1)
	cfi_offset(rSHL, (40-64))
	clrlwi rSHL, rSTR2, 30
	stw rSHR, 36(r1)
	cfi_offset(rSHR, (36-64))
	beq cr5, L(Wunaligned)
	stw rWORD8_SHIFT, 32(r1)
	cfi_offset(rWORD8_SHIFT, (32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 W. */
	sub rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the W before the W that contains
   the actual start of rSTR2. */
	clrrwi rSTR2, rSTR2, 2
	stw rWORD2_SHIFT, 28(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (W aligned) start of rSTR1. */
	clrlwi rSHL, rWORD8_SHIFT, 30
	clrrwi rSTR1, rSTR1, 2
	stw rWORD4_SHIFT, 24(r1)
	slwi rSHL, rSHL, 3
	cmplw cr5, rWORD8_SHIFT, rSTR2
	add rN, rN, r12
	slwi rWORD6, r12, 3
	stw rWORD6_SHIFT, 20(r1)
	cfi_offset(rWORD2_SHIFT, (28-64))
	cfi_offset(rWORD4_SHIFT, (24-64))
	cfi_offset(rWORD6_SHIFT, (20-64))
	subfic rSHR, rSHL, 32
	srwi r0, rN, 4 /* Divide by 16 */
	andi. r12, rN, 12 /* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway. Also we
   must avoid loading a W where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault. */
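
/* That guard corresponds roughly to (illustrative C only, not part of
   the build):

     unsigned int prev = 0;                 // the li below
     if (logical_start >= aligned_s2)       // otherwise blt cr5 skips the load
       prev = *aligned_s2++ << shl;

   i.e. the word in front of the true start of rSTR2 is only loaded when
   at least one of its bytes really belongs to rSTR2, so we never touch a
   page the caller did not ask us to read.  */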
	li rWORD8, 0
	blt cr5, L(dus0)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD8, 0, rSTR2
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD8, 0(rSTR2)
	addi rSTR2, rSTR2, 4
#endif
	slw rWORD8, rWORD8, rSHL

L(dus0):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 0(rSTR1)
	lwz rWORD2, 0(rSTR2)
#endif
	cmplwi cr1, r12, 8
	cmplwi cr7, rN, 16
	srw r12, rWORD2, rSHR
	clrlwi rN, rN, 30
	beq L(duPs4)
	mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
	or rWORD8, r12, rWORD8
	bgt cr1, L(duPs3)
	beq cr1, L(duPs2)

/* Remainder is 4 */
	.align 4
L(dusP1):
	slw rWORD8_SHIFT, rWORD2, rSHL
	slw rWORD7, rWORD1, rWORD6
	slw rWORD8, rWORD8, rWORD6
	bge cr7, L(duP1e)
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes. See L(du14) for details on
   how we handle the remaining bytes. */
	cmplw cr5, rWORD7, rWORD8
	slwi. rN, rN, 3
	bne cr5, L(duLcr5)
	cmplw cr7, rN, rSHR
	beq L(duZeroReturn)
	li r0, 0
	ble cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD2, 0, rSTR2
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD2, 4(rSTR2)
#endif
	srw r0, rWORD2, rSHR
	b L(dutrim)
/* Remainder is 8 */
	.align 4
L(duPs2):
	slw rWORD6_SHIFT, rWORD2, rSHL
	slw rWORD5, rWORD1, rWORD6
	slw rWORD6, rWORD8, rWORD6
	b L(duP2e)
/* Remainder is 12 */
	.align 4
L(duPs3):
	slw rWORD4_SHIFT, rWORD2, rSHL
	slw rWORD3, rWORD1, rWORD6
	slw rWORD4, rWORD8, rWORD6
	b L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align 4
L(duPs4):
	mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
	or rWORD8, r12, rWORD8
	slw rWORD2_SHIFT, rWORD2, rSHL
	slw rWORD1, rWORD1, rWORD6
	slw rWORD2, rWORD8, rWORD6
	b L(duP4e)

/* At this point we know rSTR1 is word aligned and the
   compare length is at least 8 bytes. */
	.align 4
L(Wunaligned):
	stw rWORD8_SHIFT, 32(r1)
	clrrwi rSTR2, rSTR2, 2
	stw rWORD2_SHIFT, 28(r1)
	srwi r0, rN, 4 /* Divide by 16 */
	stw rWORD4_SHIFT, 24(r1)
	andi. r12, rN, 12 /* Get the W remainder */
	stw rWORD6_SHIFT, 20(r1)
	cfi_offset(rWORD8_SHIFT, (32-64))
	cfi_offset(rWORD2_SHIFT, (28-64))
	cfi_offset(rWORD4_SHIFT, (24-64))
	cfi_offset(rWORD6_SHIFT, (20-64))
	slwi rSHL, rSHL, 3
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD6, 0, rSTR2
	addi rSTR2, rSTR2, 4
	lwbrx rWORD8, 0, rSTR2
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD6, 0(rSTR2)
	lwzu rWORD8, 4(rSTR2)
#endif
	cmplwi cr1, r12, 8
	cmplwi cr7, rN, 16
	clrlwi rN, rN, 30
	subfic rSHR, rSHL, 32
	slw rWORD6_SHIFT, rWORD6, rSHL
	beq L(duP4)
	mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
	bgt cr1, L(duP3)
	beq cr1, L(duP2)

/* Remainder is 4 */
	.align 4
L(duP1):
	srw r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	addi rSTR1, rSTR1, 4
#else
	lwz rWORD7, 0(rSTR1)
#endif
	slw rWORD8_SHIFT, rWORD8, rSHL
	or rWORD8, r12, rWORD6_SHIFT
	blt cr7, L(duP1x)
L(duP1e):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 4(rSTR1)
	lwz rWORD2, 4(rSTR2)
#endif
	cmplw cr5, rWORD7, rWORD8
	srw r0, rWORD2, rSHR
	slw rWORD2_SHIFT, rWORD2, rSHL
	or rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	lwbrx rWORD4, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD3, 8(rSTR1)
	lwz rWORD4, 8(rSTR2)
#endif
	cmplw cr7, rWORD1, rWORD2
	srw r12, rWORD4, rSHR
	slw rWORD4_SHIFT, rWORD4, rSHL
	bne cr5, L(duLcr5)
	or rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	lwbrx rWORD6, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD5, 12(rSTR1)
	lwz rWORD6, 12(rSTR2)
#endif
	cmplw cr1, rWORD3, rWORD4
	srw r0, rWORD6, rSHR
	slw rWORD6_SHIFT, rWORD6, rSHL
	bne cr7, L(duLcr7)
	or rWORD6, r0, rWORD4_SHIFT
	cmplw cr6, rWORD5, rWORD6
	b L(duLoop3)
	.align 4
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes. See L(du14) for details on
   how we handle the remaining bytes. */
L(duP1x):
	cmplw cr5, rWORD7, rWORD8
	slwi. rN, rN, 3
	bne cr5, L(duLcr5)
	cmplw cr7, rN, rSHR
	beq L(duZeroReturn)
	li r0, 0
	ble cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD2, 0, rSTR2
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD2, 8(rSTR2)
#endif
	srw r0, rWORD2, rSHR
	b L(dutrim)
/* Remainder is 8 */
	.align 4
L(duP2):
	srw r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	addi rSTR1, rSTR1, 4
#else
	lwz rWORD5, 0(rSTR1)
#endif
	or rWORD6, r0, rWORD6_SHIFT
	slw rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	lwbrx rWORD8, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD7, 4(rSTR1)
	lwz rWORD8, 4(rSTR2)
#endif
	cmplw cr6, rWORD5, rWORD6
	srw r12, rWORD8, rSHR
	slw rWORD8_SHIFT, rWORD8, rSHL
	or rWORD8, r12, rWORD6_SHIFT
	blt cr7, L(duP2x)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 8(rSTR1)
	lwz rWORD2, 8(rSTR2)
#endif
	cmplw cr5, rWORD7, rWORD8
	bne cr6, L(duLcr6)
	srw r0, rWORD2, rSHR
	slw rWORD2_SHIFT, rWORD2, rSHL
	or rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	lwbrx rWORD4, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD3, 12(rSTR1)
	lwz rWORD4, 12(rSTR2)
#endif
	cmplw cr7, rWORD1, rWORD2
	bne cr5, L(duLcr5)
	srw r12, rWORD4, rSHR
	slw rWORD4_SHIFT, rWORD4, rSHL
	or rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#endif
	cmplw cr1, rWORD3, rWORD4
	b L(duLoop2)
	.align 4
L(duP2x):
	cmplw cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#endif
	bne cr6, L(duLcr6)
	slwi. rN, rN, 3
	bne cr5, L(duLcr5)
	cmplw cr7, rN, rSHR
	beq L(duZeroReturn)
	li r0, 0
	ble cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD2, 0, rSTR2
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD2, 4(rSTR2)
#endif
	srw r0, rWORD2, rSHR
	b L(dutrim)

/* Remainder is 12 */
	.align 4
L(duP3):
	srw r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	addi rSTR1, rSTR1, 4
#else
	lwz rWORD3, 0(rSTR1)
#endif
	slw rWORD4_SHIFT, rWORD8, rSHL
	or rWORD4, r12, rWORD6_SHIFT
L(duP3e):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	lwbrx rWORD6, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD5, 4(rSTR1)
	lwz rWORD6, 4(rSTR2)
#endif
	cmplw cr1, rWORD3, rWORD4
	srw r0, rWORD6, rSHR
	slw rWORD6_SHIFT, rWORD6, rSHL
	or rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	lwbrx rWORD8, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD7, 8(rSTR1)
	lwz rWORD8, 8(rSTR2)
#endif
	cmplw cr6, rWORD5, rWORD6
	bne cr1, L(duLcr1)
	srw r12, rWORD8, rSHR
	slw rWORD8_SHIFT, rWORD8, rSHL
	or rWORD8, r12, rWORD6_SHIFT
	blt cr7, L(duP3x)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 12(rSTR1)
	lwz rWORD2, 12(rSTR2)
#endif
	cmplw cr5, rWORD7, rWORD8
	bne cr6, L(duLcr6)
	srw r0, rWORD2, rSHR
	slw rWORD2_SHIFT, rWORD2, rSHL
	or rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi rSTR1, rSTR1, 8
	addi rSTR2, rSTR2, 8
#endif
	cmplw cr7, rWORD1, rWORD2
	b L(duLoop1)
	.align 4
L(duP3x):
#ifndef __LITTLE_ENDIAN__
	addi rSTR1, rSTR1, 8
	addi rSTR2, rSTR2, 8
#endif
#if 0
/* Huh? We've already branched on cr1! */
	bne cr1, L(duLcr1)
#endif
	cmplw cr5, rWORD7, rWORD8
	bne cr6, L(duLcr6)
	slwi. rN, rN, 3
	bne cr5, L(duLcr5)
	cmplw cr7, rN, rSHR
	beq L(duZeroReturn)
	li r0, 0
	ble cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD2, 0, rSTR2
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD2, 4(rSTR2)
#endif
	srw r0, rWORD2, rSHR
	b L(dutrim)

/* Count is a multiple of 16, remainder is 0 */
	.align 4
L(duP4):
	mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
	srw r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	addi rSTR1, rSTR1, 4
#else
	lwz rWORD1, 0(rSTR1)
#endif
	slw rWORD2_SHIFT, rWORD8, rSHL
	or rWORD2, r0, rWORD6_SHIFT
L(duP4e):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	lwbrx rWORD4, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD3, 4(rSTR1)
	lwz rWORD4, 4(rSTR2)
#endif
	cmplw cr7, rWORD1, rWORD2
	srw r12, rWORD4, rSHR
	slw rWORD4_SHIFT, rWORD4, rSHL
	or rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	lwbrx rWORD6, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD5, 8(rSTR1)
	lwz rWORD6, 8(rSTR2)
#endif
	cmplw cr1, rWORD3, rWORD4
	bne cr7, L(duLcr7)
	srw r0, rWORD6, rSHR
	slw rWORD6_SHIFT, rWORD6, rSHL
	or rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	lwbrx rWORD8, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwzu rWORD7, 12(rSTR1)
	lwzu rWORD8, 12(rSTR2)
#endif
	cmplw cr6, rWORD5, rWORD6
	bne cr1, L(duLcr1)
	srw r12, rWORD8, rSHR
	slw rWORD8_SHIFT, rWORD8, rSHL
	or rWORD8, r12, rWORD6_SHIFT
	cmplw cr5, rWORD7, rWORD8
	bdz- L(du24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align 4
L(duLoop):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
	lwbrx rWORD2, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD1, 4(rSTR1)
	lwz rWORD2, 4(rSTR2)
#endif
	cmplw cr1, rWORD3, rWORD4
	bne cr6, L(duLcr6)
	srw r0, rWORD2, rSHR
	slw rWORD2_SHIFT, rWORD2, rSHL
	or rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD3, 0, rSTR1
	lwbrx rWORD4, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD3, 8(rSTR1)
	lwz rWORD4, 8(rSTR2)
#endif
	cmplw cr6, rWORD5, rWORD6
	bne cr5, L(duLcr5)
	srw r12, rWORD4, rSHR
	slw rWORD4_SHIFT, rWORD4, rSHL
	or rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD5, 0, rSTR1
	lwbrx rWORD6, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD5, 12(rSTR1)
	lwz rWORD6, 12(rSTR2)
#endif
	cmplw cr5, rWORD7, rWORD8
	bne cr7, L(duLcr7)
	srw r0, rWORD6, rSHR
	slw rWORD6_SHIFT, rWORD6, rSHL
	or rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD7, 0, rSTR1
	lwbrx rWORD8, 0, rSTR2
	addi rSTR1, rSTR1, 4
	addi rSTR2, rSTR2, 4
#else
	lwzu rWORD7, 16(rSTR1)
	lwzu rWORD8, 16(rSTR2)
#endif
	cmplw cr7, rWORD1, rWORD2
	bne- cr1, L(duLcr1)
	srw r12, rWORD8, rSHR
	slw rWORD8_SHIFT, rWORD8, rSHL
	or rWORD8, r12, rWORD6_SHIFT
	bdnz+ L(duLoop)

L(duL4):
#if 0
/* Huh? We've already branched on cr1! */
	bne cr1, L(duLcr1)
#endif
	cmplw cr1, rWORD3, rWORD4
	bne cr6, L(duLcr6)
	cmplw cr6, rWORD5, rWORD6
	bne cr5, L(duLcr5)
	cmplw cr5, rWORD7, rWORD8
L(du44):
	bne cr7, L(duLcr7)
L(du34):
	bne cr1, L(duLcr1)
L(du24):
	bne cr6, L(duLcr6)
L(du14):
	slwi. rN, rN, 3
	bne cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare. We use
   shift right to eliminate bits beyond the compare length.
   This allows the use of word subtract to compute the final result.

   However it may not be safe to load rWORD2 which may be beyond the
   string length. So we compare the bit length of the remainder to
   the right shift count (rSHR). If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT). */
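
/* In rough C terms the decision below is (illustrative only, not part
   of the build):

     unsigned int rembits = rem * 8;        // the slwi. above
     unsigned int next = 0;
     if (rembits > shr)                     // otherwise ble cr7 skips the load
       next = *s2_word >> shr;              // only then is the extra load safe
     // rSTR2 tail = next | (prev << shl), trimmed and subtracted at L(dutrim)
*/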
	cmplw cr7, rN, rSHR
	beq L(duZeroReturn)
	li r0, 0
	ble cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD2, 0, rSTR2
	addi rSTR2, rSTR2, 4
#else
	lwz rWORD2, 4(rSTR2)
#endif
	srw r0, rWORD2, rSHR
	.align 4
L(dutrim):
#ifdef __LITTLE_ENDIAN__
	lwbrx rWORD1, 0, rSTR1
#else
	lwz rWORD1, 4(rSTR1)
#endif
	lwz rWORD8, 48(r1)
	subfic rN, rN, 32 /* Shift count is 32 - (rN * 8). */
	or rWORD2, r0, rWORD8_SHIFT
	lwz rWORD7, 44(r1)
	lwz rSHL, 40(r1)
	srw rWORD1, rWORD1, rN
	srw rWORD2, rWORD2, rN
	lwz rSHR, 36(r1)
	lwz rWORD8_SHIFT, 32(r1)
	sub rRTN, rWORD1, rWORD2
	b L(dureturn26)
	.align 4
L(duLcr7):
	lwz rWORD8, 48(r1)
	lwz rWORD7, 44(r1)
	li rRTN, 1
	bgt cr7, L(dureturn29)
	lwz rSHL, 40(r1)
	lwz rSHR, 36(r1)
	li rRTN, -1
	b L(dureturn27)
	.align 4
L(duLcr1):
	lwz rWORD8, 48(r1)
	lwz rWORD7, 44(r1)
	li rRTN, 1
	bgt cr1, L(dureturn29)
	lwz rSHL, 40(r1)
	lwz rSHR, 36(r1)
	li rRTN, -1
	b L(dureturn27)
	.align 4
L(duLcr6):
	lwz rWORD8, 48(r1)
	lwz rWORD7, 44(r1)
	li rRTN, 1
	bgt cr6, L(dureturn29)
	lwz rSHL, 40(r1)
	lwz rSHR, 36(r1)
	li rRTN, -1
	b L(dureturn27)
	.align 4
L(duLcr5):
	lwz rWORD8, 48(r1)
	lwz rWORD7, 44(r1)
	li rRTN, 1
	bgt cr5, L(dureturn29)
	lwz rSHL, 40(r1)
	lwz rSHR, 36(r1)
	li rRTN, -1
	b L(dureturn27)
	.align 3
L(duZeroReturn):
	li rRTN, 0
	.align 4
L(dureturn):
	lwz rWORD8, 48(r1)
	lwz rWORD7, 44(r1)
L(dureturn29):
	lwz rSHL, 40(r1)
	lwz rSHR, 36(r1)
L(dureturn27):
	lwz rWORD8_SHIFT, 32(r1)
L(dureturn26):
	lwz rWORD2_SHIFT, 28(r1)
L(dureturn25):
	lwz rWORD4_SHIFT, 24(r1)
	lwz rWORD6_SHIFT, 20(r1)
	addi 1, 1, 64
	cfi_adjust_cfa_offset(-64)
	blr
END (memcmp)

libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)