memcmp.S source code [glibc/sysdeps/powerpc/powerpc64/power7/memcmp.S]

1	/ Optimized memcmp implementation for POWER7/PowerPC64.*
2	Copyright (C) 2010-2024 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <sysdep.h>
20
21	/* int [r3] memcmp (const char *s1 [r3],
22	const char *s2 [r4],
23	size_t size [r5])  */
24	#ifndef MEMCMP
25	# define MEMCMP memcmp
26	#endif
27	.machine power7
28	ENTRY_TOCLESS (MEMCMP, `4`)
29	CALL_MCOUNT `3`
30
31	#define rRTN r3
32	#define rSTR1 r3 /* first string arg */
33	#define rSTR2 r4 /* second string arg */
34	#define rN r5 /* max string length */
35	#define rWORD1 r6 /* current word in s1 */
36	#define rWORD2 r7 /* current word in s2 */
37	#define rWORD3 r8 /* next word in s1 */
38	#define rWORD4 r9 /* next word in s2 */
39	#define rWORD5 r10 /* next word in s1 */
40	#define rWORD6 r11 /* next word in s2 */
41
42	#define rOFF8 r20 /* 8 bytes offset (index register for LD). */
43	#define rOFF16 r21 /* 16 bytes offset (index register for LD). */
44	#define rOFF24 r22 /* 24 bytes offset (index register for LD). */
45	#define rOFF32 r23 /* 32 bytes offset (index register for LD). */
46	#define rWORD6_SHIFT r24 /* rWORD6 << rSHL; OR-ed into the next rWORD8. */
47	#define rWORD4_SHIFT r25 /* rWORD4 << rSHL; OR-ed into the next rWORD6. */
48	#define rWORD2_SHIFT r26 /* rWORD2 << rSHL; OR-ed into the next rWORD4. */
49	#define rWORD8_SHIFT r27 /* rWORD8 << rSHL; OR-ed into the next rWORD2. */
50	#define rSHR r28 /* Unaligned shift right count (64 - rSHL, in bits). */
51	#define rSHL r29 /* Unaligned shift left count (in bits). */
52	#define rWORD7 r30 /* next word in s1 */
53	#define rWORD8 r31 /* next word in s2 */
54
55	#define rWORD8SAVE (-8)
56	#define rWORD7SAVE (-16)
57	#define rOFF8SAVE (-24)
58	#define rOFF16SAVE (-32)
59	#define rOFF24SAVE (-40)
60	#define rOFF32SAVE (-48)
61	#define rSHRSAVE (-56)
62	#define rSHLSAVE (-64)
63	#define rWORD8SHIFTSAVE (-72)
64	#define rWORD2SHIFTSAVE (-80)
65	#define rWORD4SHIFTSAVE (-88)
66	#define rWORD6SHIFTSAVE (-96)
67
68	#ifdef __LITTLE_ENDIAN__
69	# define LD ldbrx
70	#else
71	# define LD ldx
72	#endif
73
74	xor r0, rSTR2, rSTR1 /* r0 = address bits in which s1 and s2 differ */
75	cmpldi cr6, rN, `0` /* cr6: is the length zero? */
76	cmpldi cr1, rN, `8` /* cr1: is the length below one doubleword? */
77	clrldi. r0, r0, `61` /* cr0.eq <=> both pointers share the same offset within a DW */
78	clrldi r12, rSTR1, `61` /* r12 = low order 3 bits of rSTR1 */
79	cmpldi cr5, r12, `0` /* cr5: is rSTR1 already doubleword aligned? */
80	beq- cr6, L(zeroLength)
81	dcbt `0`, rSTR1
82	dcbt `0`, rSTR2
83	/* If less than 8 bytes, use the byte loop regardless of alignment.
84	The doubleword paths below all assume a length of at least 8. */
85	blt cr1, L(bytealigned)
86	std rWORD8, rWORD8SAVE(r1)
87	std rWORD7, rWORD7SAVE(r1)
88	std rOFF8, rOFF8SAVE(r1)
89	std rOFF16, rOFF16SAVE(r1)
90	std rOFF24, rOFF24SAVE(r1)
91	std rOFF32, rOFF32SAVE(r1)
92	cfi_offset(rWORD8, rWORD8SAVE)
93	cfi_offset(rWORD7, rWORD7SAVE)
94	cfi_offset(rOFF8, rOFF8SAVE)
95	cfi_offset(rOFF16, rOFF16SAVE)
96	cfi_offset(rOFF24, rOFF24SAVE)
97	cfi_offset(rOFF32, rOFF32SAVE)
98
99	li rOFF8,`8`
100	li rOFF16,`16`
101	li rOFF24,`24`
102	li rOFF32,`32`
103
104	bne L(unaligned)
105	/ At this point we know both strings have the same alignment and the*
106	compare length is at least 8 bytes. r12 contains the low order
107	3 bits of rSTR1 and cr5 contains the result of the logical compare
108	of r12 to 0. If r12 == 0 then we are already double word
109	aligned and can perform the DW aligned loop.
110
111	Otherwise we know the two strings have the same alignment (but not
112	yet DW). So we force the string addresses to the next lower DW
113	boundary and special case this first DW using shift left to
114	eliminate bits preceding the first byte. Since we want to join the
115	normal (DW aligned) compare loop, starting at the second double word,
116	we need to adjust the length (rN) and special case the loop
117	versioning for the first DW. This ensures that the loop count is
118	correct and the first DW (shifted) is in the expected register pair. /*
119	.align `4`
120	L(samealignment):
121	clrrdi rSTR1, rSTR1, `3`
122	clrrdi rSTR2, rSTR2, `3`
123	beq cr5, L(DWaligned)
124	add rN, rN, r12
125	sldi rWORD6, r12, `3`
126	srdi r0, rN, `5` / Divide by 32 /
127	andi. r12, rN, `24` / Get the DW remainder /
128	LD rWORD1, `0`, rSTR1
129	LD rWORD2, `0`, rSTR2
130	cmpldi cr1, r12, `16`
131	cmpldi cr7, rN, `32`
132	clrldi rN, rN, `61`
133	beq L(dPs4)
134	mtctr r0
135	bgt cr1, L(dPs3)
136	beq cr1, L(dPs2)
137
138	/ Remainder is 8 /
139	.align `3`
140	L(dsP1):
141	sld rWORD5, rWORD1, rWORD6
142	sld rWORD6, rWORD2, rWORD6
143	cmpld cr5, rWORD5, rWORD6
144	blt cr7, L(dP1x)
145	/ Do something useful in this cycle since we have to branch anyway. /
146	LD rWORD1, rOFF8, rSTR1
147	LD rWORD2, rOFF8, rSTR2
148	cmpld cr7, rWORD1, rWORD2
149	b L(dP1e)
150	/ Remainder is 16 /
151	.align `4`
152	L(dPs2):
153	sld rWORD5, rWORD1, rWORD6
154	sld rWORD6, rWORD2, rWORD6
155	cmpld cr6, rWORD5, rWORD6
156	blt cr7, L(dP2x)
157	/ Do something useful in this cycle since we have to branch anyway. /
158	LD rWORD7, rOFF8, rSTR1
159	LD rWORD8, rOFF8, rSTR2
160	cmpld cr5, rWORD7, rWORD8
161	b L(dP2e)
162	/ Remainder is 24 /
163	.align `4`
164	L(dPs3):
165	sld rWORD3, rWORD1, rWORD6
166	sld rWORD4, rWORD2, rWORD6
167	cmpld cr1, rWORD3, rWORD4
168	b L(dP3e)
169	/ Count is a multiple of 32, remainder is 0 /
170	.align `4`
171	L(dPs4):
172	mtctr r0
173	sld rWORD1, rWORD1, rWORD6
174	sld rWORD2, rWORD2, rWORD6
175	cmpld cr7, rWORD1, rWORD2
176	b L(dP4e)
177
178	/ At this point we know both strings are double word aligned and the*
179	compare length is at least 8 bytes. /*
180	.align `4`
181	L(DWaligned):
182	andi. r12, rN, `24` / Get the DW remainder /
183	srdi r0, rN, `5` / Divide by 32 /
184	cmpldi cr1, r12, `16`
185	cmpldi cr7, rN, `32`
186	clrldi rN, rN, `61`
187	beq L(dP4)
188	bgt cr1, L(dP3)
189	beq cr1, L(dP2)
190
191	/ Remainder is 8 /
192	.align `4`
193	L(dP1):
194	mtctr r0
195	/ Normally we'd use rWORD7/rWORD8 here, but since we might exit early*
196	(8-15 byte compare), we want to use only volatile registers. This
197	means we can avoid restoring non-volatile registers since we did not
198	change any on the early exit path. The key here is the non-early
199	exit path only cares about the condition code (cr5), not about which
200	register pair was used. /*
201	LD rWORD5, `0`, rSTR1
202	LD rWORD6, `0`, rSTR2
203	cmpld cr5, rWORD5, rWORD6
204	blt cr7, L(dP1x)
205	LD rWORD1, rOFF8, rSTR1
206	LD rWORD2, rOFF8, rSTR2
207	cmpld cr7, rWORD1, rWORD2
208	L(dP1e):
209	LD rWORD3, rOFF16, rSTR1
210	LD rWORD4, rOFF16, rSTR2
211	cmpld cr1, rWORD3, rWORD4
212	LD rWORD5, rOFF24, rSTR1
213	LD rWORD6, rOFF24, rSTR2
214	cmpld cr6, rWORD5, rWORD6
215	bne cr5, L(dLcr5x)
216	bne cr7, L(dLcr7x)
217
218	LD rWORD7, rOFF32, rSTR1
219	LD rWORD8, rOFF32, rSTR2
220	addi rSTR1, rSTR1, `32`
221	addi rSTR2, rSTR2, `32`
222	bne cr1, L(dLcr1)
223	cmpld cr5, rWORD7, rWORD8
224	bdnz L(dLoop)
225	bne cr6, L(dLcr6)
226	ld rWORD8, rWORD8SAVE(r1)
227	ld rWORD7, rWORD7SAVE(r1)
228	.align `3`
229	L(dP1x):
230	sldi. r12, rN, `3`
231	bne cr5, L(dLcr5x)
232	subfic rN, r12, `64` / Shift count is 64 - (rN * 8). /
233	bne L(d00)
234	ld rOFF8, rOFF8SAVE(r1)
235	ld rOFF16, rOFF16SAVE(r1)
236	ld rOFF24, rOFF24SAVE(r1)
237	ld rOFF32, rOFF32SAVE(r1)
238	li rRTN, `0`
239	blr
240
241	/ Remainder is 16 /
242	.align `4`
243	L(dP2):
244	mtctr r0
245	LD rWORD5, `0`, rSTR1
246	LD rWORD6, `0`, rSTR2
247	cmpld cr6, rWORD5, rWORD6
248	blt cr7, L(dP2x)
249	LD rWORD7, rOFF8, rSTR1
250	LD rWORD8, rOFF8, rSTR2
251	cmpld cr5, rWORD7, rWORD8
252	L(dP2e):
253	LD rWORD1, rOFF16, rSTR1
254	LD rWORD2, rOFF16, rSTR2
255	cmpld cr7, rWORD1, rWORD2
256	LD rWORD3, rOFF24, rSTR1
257	LD rWORD4, rOFF24, rSTR2
258	cmpld cr1, rWORD3, rWORD4
259	addi rSTR1, rSTR1, `8`
260	addi rSTR2, rSTR2, `8`
261	bne cr6, L(dLcr6)
262	bne cr5, L(dLcr5)
263	b L(dLoop2)
264	.align `4`
265	L(dP2x):
266	LD rWORD3, rOFF8, rSTR1
267	LD rWORD4, rOFF8, rSTR2
268	cmpld cr1, rWORD3, rWORD4
269	sldi. r12, rN, `3`
270	bne cr6, L(dLcr6x)
271	addi rSTR1, rSTR1, `8`
272	addi rSTR2, rSTR2, `8`
273	bne cr1, L(dLcr1x)
274	subfic rN, r12, `64` / Shift count is 64 - (rN * 8). /
275	bne L(d00)
276	ld rOFF8, rOFF8SAVE(r1)
277	ld rOFF16, rOFF16SAVE(r1)
278	ld rOFF24, rOFF24SAVE(r1)
279	ld rOFF32, rOFF32SAVE(r1)
280	li rRTN, `0`
281	blr
282
283	/ Remainder is 24 /
284	.align `4`
285	L(dP3):
286	mtctr r0
287	LD rWORD3, `0`, rSTR1
288	LD rWORD4, `0`, rSTR2
289	cmpld cr1, rWORD3, rWORD4
290	L(dP3e):
291	LD rWORD5, rOFF8, rSTR1
292	LD rWORD6, rOFF8, rSTR2
293	cmpld cr6, rWORD5, rWORD6
294	blt cr7, L(dP3x)
295	LD rWORD7, rOFF16, rSTR1
296	LD rWORD8, rOFF16, rSTR2
297	cmpld cr5, rWORD7, rWORD8
298	LD rWORD1, rOFF24, rSTR1
299	LD rWORD2, rOFF24, rSTR2
300	cmpld cr7, rWORD1, rWORD2
301	addi rSTR1, rSTR1, `16`
302	addi rSTR2, rSTR2, `16`
303	bne cr1, L(dLcr1)
304	bne cr6, L(dLcr6)
305	b L(dLoop1)
306	/ Again we are on a early exit path (24-31 byte compare), we want to*
307	only use volatile registers and avoid restoring non-volatile
308	registers. /*
309	.align `4`
310	L(dP3x):
311	LD rWORD1, rOFF16, rSTR1
312	LD rWORD2, rOFF16, rSTR2
313	cmpld cr7, rWORD1, rWORD2
314	sldi. r12, rN, `3`
315	bne cr1, L(dLcr1x)
316	addi rSTR1, rSTR1, `16`
317	addi rSTR2, rSTR2, `16`
318	bne cr6, L(dLcr6x)
319	subfic rN, r12, `64` / Shift count is 64 - (rN * 8). /
320	bne cr7, L(dLcr7x)
321	bne L(d00)
322	ld rOFF8, rOFF8SAVE(r1)
323	ld rOFF16, rOFF16SAVE(r1)
324	ld rOFF24, rOFF24SAVE(r1)
325	ld rOFF32, rOFF32SAVE(r1)
326	li rRTN, `0`
327	blr
328
329	/ Count is a multiple of 32, remainder is 0 /
330	.align `4`
331	L(dP4):
332	mtctr r0
333	LD rWORD1, `0`, rSTR1
334	LD rWORD2, `0`, rSTR2
335	cmpld cr7, rWORD1, rWORD2
336	L(dP4e):
337	LD rWORD3, rOFF8, rSTR1
338	LD rWORD4, rOFF8, rSTR2
339	cmpld cr1, rWORD3, rWORD4
340	LD rWORD5, rOFF16, rSTR1
341	LD rWORD6, rOFF16, rSTR2
342	cmpld cr6, rWORD5, rWORD6
343	LD rWORD7, rOFF24, rSTR1
344	LD rWORD8, rOFF24, rSTR2
345	addi rSTR1, rSTR1, `24`
346	addi rSTR2, rSTR2, `24`
347	cmpld cr5, rWORD7, rWORD8
348	bne cr7, L(dLcr7)
349	bne cr1, L(dLcr1)
350	bdz- L(d24) / Adjust CTR as we start with +4 /
351	/ This is the primary loop /
352	.align `4`
353	L(dLoop):
354	LD rWORD1, rOFF8, rSTR1
355	LD rWORD2, rOFF8, rSTR2
356	cmpld cr1, rWORD3, rWORD4
357	bne cr6, L(dLcr6)
358	L(dLoop1):
359	LD rWORD3, rOFF16, rSTR1
360	LD rWORD4, rOFF16, rSTR2
361	cmpld cr6, rWORD5, rWORD6
362	bne cr5, L(dLcr5)
363	L(dLoop2):
364	LD rWORD5, rOFF24, rSTR1
365	LD rWORD6, rOFF24, rSTR2
366	cmpld cr5, rWORD7, rWORD8
367	bne cr7, L(dLcr7)
368	L(dLoop3):
369	LD rWORD7, rOFF32, rSTR1
370	LD rWORD8, rOFF32, rSTR2
371	addi rSTR1, rSTR1, `32`
372	addi rSTR2, rSTR2, `32`
373	bne cr1, L(dLcr1)
374	cmpld cr7, rWORD1, rWORD2
375	bdnz L(dLoop)
376
377	L(dL4):
378	cmpld cr1, rWORD3, rWORD4
379	bne cr6, L(dLcr6)
380	cmpld cr6, rWORD5, rWORD6
381	bne cr5, L(dLcr5)
382	cmpld cr5, rWORD7, rWORD8
383	L(d44):
384	bne cr7, L(dLcr7)
385	L(d34):
386	bne cr1, L(dLcr1)
387	L(d24):
388	bne cr6, L(dLcr6)
389	L(d14):
390	sldi. r12, rN, `3`
391	bne cr5, L(dLcr5)
392	L(d04):
393	ld rWORD8, rWORD8SAVE(r1)
394	ld rWORD7, rWORD7SAVE(r1)
395	subfic rN, r12, `64` / Shift count is 64 - (rN * 8). /
396	beq L(duzeroLength)
397	/ At this point we have a remainder of 1 to 7 bytes to compare. Since*
398	we are aligned it is safe to load the whole double word, and use
399	shift right double to eliminate bits beyond the compare length. /*
400	L(d00):
401	LD rWORD1, rOFF8, rSTR1
402	LD rWORD2, rOFF8, rSTR2
403	srd rWORD1, rWORD1, rN
404	srd rWORD2, rWORD2, rN
405	cmpld cr7, rWORD1, rWORD2
406	bne cr7, L(dLcr7x)
407	ld rOFF8, rOFF8SAVE(r1)
408	ld rOFF16, rOFF16SAVE(r1)
409	ld rOFF24, rOFF24SAVE(r1)
410	ld rOFF32, rOFF32SAVE(r1)
411	li rRTN, `0`
412	blr
413
414	.align `4`
415	L(dLcr7):
416	ld rWORD8, rWORD8SAVE(r1)
417	ld rWORD7, rWORD7SAVE(r1)
418	L(dLcr7x):
419	ld rOFF8, rOFF8SAVE(r1)
420	ld rOFF16, rOFF16SAVE(r1)
421	ld rOFF24, rOFF24SAVE(r1)
422	ld rOFF32, rOFF32SAVE(r1)
423	li rRTN, `1`
424	bgtlr cr7
425	li rRTN, -`1`
426	blr
427	.align `4`
428	L(dLcr1):
429	ld rWORD8, rWORD8SAVE(r1)
430	ld rWORD7, rWORD7SAVE(r1)
431	L(dLcr1x):
432	ld rOFF8, rOFF8SAVE(r1)
433	ld rOFF16, rOFF16SAVE(r1)
434	ld rOFF24, rOFF24SAVE(r1)
435	ld rOFF32, rOFF32SAVE(r1)
436	li rRTN, `1`
437	bgtlr cr1
438	li rRTN, -`1`
439	blr
440	.align `4`
441	L(dLcr6):
442	ld rWORD8, rWORD8SAVE(r1)
443	ld rWORD7, rWORD7SAVE(r1)
444	L(dLcr6x):
445	ld rOFF8, rOFF8SAVE(r1)
446	ld rOFF16, rOFF16SAVE(r1)
447	ld rOFF24, rOFF24SAVE(r1)
448	ld rOFF32, rOFF32SAVE(r1)
449	li rRTN, `1`
450	bgtlr cr6
451	li rRTN, -`1`
452	blr
453	.align `4`
454	L(dLcr5):
455	ld rWORD8, rWORD8SAVE(r1)
456	ld rWORD7, rWORD7SAVE(r1)
457	L(dLcr5x):
458	ld rOFF8, rOFF8SAVE(r1)
459	ld rOFF16, rOFF16SAVE(r1)
460	ld rOFF24, rOFF24SAVE(r1)
461	ld rOFF32, rOFF32SAVE(r1)
462	li rRTN, `1`
463	bgtlr cr5
464	li rRTN, -`1`
465	blr
466
467	.align `4`
468	L(bytealigned):
469	mtctr rN
470
471	/ We need to prime this loop. This loop is swing modulo scheduled*
472	to avoid pipe delays. The dependent instruction latencies (load to
473	compare to conditional branch) is 2 to 3 cycles. In this loop each
474	dispatch group ends in a branch and takes 1 cycle. Effectively
475	the first iteration of the loop only serves to load operands and
476	branches based on compares are delayed until the next loop.
477
478	So we must precondition some registers and condition codes so that
479	we don't exit the loop early on the first iteration. /*
480
481	lbz rWORD1, `0`(rSTR1)
482	lbz rWORD2, `0`(rSTR2)
483	bdz L(b11)
484	cmpld cr7, rWORD1, rWORD2
485	lbz rWORD3, `1`(rSTR1)
486	lbz rWORD4, `1`(rSTR2)
487	bdz L(b12)
488	cmpld cr1, rWORD3, rWORD4
489	lbzu rWORD5, `2`(rSTR1)
490	lbzu rWORD6, `2`(rSTR2)
491	bdz L(b13)
492	.align `4`
493	L(bLoop):
494	lbzu rWORD1, `1`(rSTR1)
495	lbzu rWORD2, `1`(rSTR2)
496	bne cr7, L(bLcr7)
497
498	cmpld cr6, rWORD5, rWORD6
499	bdz L(b3i)
500
501	lbzu rWORD3, `1`(rSTR1)
502	lbzu rWORD4, `1`(rSTR2)
503	bne cr1, L(bLcr1)
504
505	cmpld cr7, rWORD1, rWORD2
506	bdz L(b2i)
507
508	lbzu rWORD5, `1`(rSTR1)
509	lbzu rWORD6, `1`(rSTR2)
510	bne cr6, L(bLcr6)
511
512	cmpld cr1, rWORD3, rWORD4
513	bdnz L(bLoop)
514
515	/* We speculatively load bytes before we have tested the previous
516	bytes. But we must avoid overrunning the length (in the ctr) to
517	prevent these speculative loads from causing a segfault. In this
518	case the loop will exit early (before all the pending bytes are
519	tested). In this case we must complete the pending operations
520	before returning. */
521	L(b1i):
522	bne cr7, L(bLcr7)
523	bne cr1, L(bLcr1)
524	b L(bx56)
525	.align `4`
526	L(b2i):
527	bne cr6, L(bLcr6)
528	bne cr7, L(bLcr7)
529	b L(bx34)
530	.align `4`
531	L(b3i):
532	bne cr1, L(bLcr1)
533	bne cr6, L(bLcr6)
534	b L(bx12)
535	.align `4`
536	L(bLcr7):
537	li rRTN, `1`
538	bgtlr cr7
539	li rRTN, -`1`
540	blr
541	L(bLcr1):
542	li rRTN, `1`
543	bgtlr cr1
544	li rRTN, -`1`
545	blr
546	L(bLcr6):
547	li rRTN, `1`
548	bgtlr cr6
549	li rRTN, -`1`
550	blr
551
552	L(b13):
553	bne cr7, L(bx12)
554	bne cr1, L(bx34)
555	L(bx56):
556	sub rRTN, rWORD5, rWORD6
557	blr
558	nop
559	L(b12):
560	bne cr7, L(bx12)
561	L(bx34):
562	sub rRTN, rWORD3, rWORD4
563	blr
564	L(b11):
565	L(bx12):
566	sub rRTN, rWORD1, rWORD2
567	blr
568
569	.align `4`
570	L(zeroLength):
571	li rRTN, `0`
572	blr
573
574	.align `4`
575	/* At this point we know the strings have different alignment and the
576	compare length is at least 8 bytes. r12 contains the low order
577	3 bits of rSTR1 and cr5 contains the result of the logical compare
578	of r12 to 0. If r12 == 0 then rSTR1 is double word
579	aligned and we can perform the DWunaligned loop.
580
581	Otherwise we know that rSTR1 is not yet DW aligned.
582	So we can force the string addresses to the next lower DW
583	boundary and special case this first DW using shift left to
584	eliminate bits preceding the first byte. Since we want to join the
585	normal (DWaligned) compare loop, starting at the second double word,
586	we need to adjust the length (rN) and special case the loop
587	versioning for the first DW. This ensures that the loop count is
588	correct and the first DW (shifted) is in the expected register pair. */
589	L(unaligned):
590	std rSHL, rSHLSAVE(r1) /* save r29 before it is reused as the shift count */
591	cfi_offset(rSHL, rSHLSAVE)
592	clrldi rSHL, rSTR2, `61` /* rSHL = low order 3 bits of rSTR2 (byte offset in its DW) */
593	beq cr6, L(duzeroLength) /* NOTE(review): cr6 still holds the entry compare of rN with 0, and the entry code already branched to L(zeroLength) on eq, so this looks never taken; L(duzeroLength) would not reload rSHL -- confirm before relying on it */
594	std rSHR, rSHRSAVE(r1) /* save r28 before it is reused as the shift count */
595	cfi_offset(rSHR, rSHRSAVE)
596	beq cr5, L(DWunaligned) /* rSTR1 itself is DW aligned: no first-DW fixup needed */
597	std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
598	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
599	/ Adjust the logical start of rSTR2 to compensate for the extra bits*
600	in the 1st rSTR1 DW. /*
601	sub rWORD8_SHIFT, rSTR2, r12
602	/ But do not attempt to address the DW before that DW that contains*
603	the actual start of rSTR2. /*
604	clrrdi rSTR2, rSTR2, `3`
605	std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
606	/ Compute the left/right shift counts for the unaligned rSTR2,*
607	compensating for the logical (DW aligned) start of rSTR1. /*
608	clrldi rSHL, rWORD8_SHIFT, `61`
609	clrrdi rSTR1, rSTR1, `3`
610	std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
611	sldi rSHL, rSHL, `3`
612	cmpld cr5, rWORD8_SHIFT, rSTR2
613	add rN, rN, r12
614	sldi rWORD6, r12, `3`
615	std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
616	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
617	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
618	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
619	subfic rSHR, rSHL, `64`
620	srdi r0, rN, `5` / Divide by 32 /
621	andi. r12, rN, `24` / Get the DW remainder /
622	/ We normally need to load 2 DWs to start the unaligned rSTR2, but in*
623	this special case those bits may be discarded anyway. Also we
624	must avoid loading a DW where none of the bits are part of rSTR2 as
625	this may cross a page boundary and cause a page fault. /*
626	li rWORD8, `0`
627	blt cr5, L(dus0)
628	LD rWORD8, `0`, rSTR2
629	addi rSTR2, rSTR2, `8`
630	sld rWORD8, rWORD8, rSHL
631
632	L(dus0):
633	LD rWORD1, `0`, rSTR1
634	LD rWORD2, `0`, rSTR2
635	cmpldi cr1, r12, `16`
636	cmpldi cr7, rN, `32`
637	srd r12, rWORD2, rSHR
638	clrldi rN, rN, `61`
639	beq L(duPs4)
640	mtctr r0
641	or rWORD8, r12, rWORD8
642	bgt cr1, L(duPs3)
643	beq cr1, L(duPs2)
644
645	/ Remainder is 8 /
646	.align `4`
647	L(dusP1):
648	sld rWORD8_SHIFT, rWORD2, rSHL
649	sld rWORD7, rWORD1, rWORD6
650	sld rWORD8, rWORD8, rWORD6
651	bge cr7, L(duP1e)
652	/ At this point we exit early with the first double word compare*
653	complete and remainder of 0 to 7 bytes. See L(du14) for details on
654	how we handle the remaining bytes. /*
655	cmpld cr5, rWORD7, rWORD8
656	sldi. rN, rN, `3`
657	bne cr5, L(duLcr5)
658	cmpld cr7, rN, rSHR
659	beq L(duZeroReturn)
660	li r0, `0`
661	ble cr7, L(dutrim)
662	LD rWORD2, rOFF8, rSTR2
663	srd r0, rWORD2, rSHR
664	b L(dutrim)
665	/ Remainder is 16 /
666	.align `4`
667	L(duPs2):
668	sld rWORD6_SHIFT, rWORD2, rSHL
669	sld rWORD5, rWORD1, rWORD6
670	sld rWORD6, rWORD8, rWORD6
671	b L(duP2e)
672	/ Remainder is 24 /
673	.align `4`
674	L(duPs3):
675	sld rWORD4_SHIFT, rWORD2, rSHL
676	sld rWORD3, rWORD1, rWORD6
677	sld rWORD4, rWORD8, rWORD6
678	b L(duP3e)
679	/ Count is a multiple of 32, remainder is 0 /
680	.align `4`
681	L(duPs4):
682	mtctr r0
683	or rWORD8, r12, rWORD8
684	sld rWORD2_SHIFT, rWORD2, rSHL
685	sld rWORD1, rWORD1, rWORD6
686	sld rWORD2, rWORD8, rWORD6
687	b L(duP4e)
688
689	/ At this point we know rSTR1 is double word aligned and the*
690	compare length is at least 8 bytes. /*
691	.align `4`
692	L(DWunaligned):
693	std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
694	clrrdi rSTR2, rSTR2, `3`
695	std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
696	srdi r0, rN, `5` / Divide by 32 /
697	std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
698	andi. r12, rN, `24` / Get the DW remainder /
699	std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
700	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
701	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
702	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
703	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
704	sldi rSHL, rSHL, `3`
705	LD rWORD6, `0`, rSTR2
706	LD rWORD8, rOFF8, rSTR2
707	addi rSTR2, rSTR2, `8`
708	cmpldi cr1, r12, `16`
709	cmpldi cr7, rN, `32`
710	clrldi rN, rN, `61`
711	subfic rSHR, rSHL, `64`
712	sld rWORD6_SHIFT, rWORD6, rSHL
713	beq L(duP4)
714	mtctr r0
715	bgt cr1, L(duP3)
716	beq cr1, L(duP2)
717
718	/ Remainder is 8 /
719	.align `4`
720	L(duP1):
721	srd r12, rWORD8, rSHR
722	LD rWORD7, `0`, rSTR1
723	sld rWORD8_SHIFT, rWORD8, rSHL
724	or rWORD8, r12, rWORD6_SHIFT
725	blt cr7, L(duP1x)
726	L(duP1e):
727	LD rWORD1, rOFF8, rSTR1
728	LD rWORD2, rOFF8, rSTR2
729	cmpld cr5, rWORD7, rWORD8
730	srd r0, rWORD2, rSHR
731	sld rWORD2_SHIFT, rWORD2, rSHL
732	or rWORD2, r0, rWORD8_SHIFT
733	LD rWORD3, rOFF16, rSTR1
734	LD rWORD4, rOFF16, rSTR2
735	cmpld cr7, rWORD1, rWORD2
736	srd r12, rWORD4, rSHR
737	sld rWORD4_SHIFT, rWORD4, rSHL
738	bne cr5, L(duLcr5)
739	or rWORD4, r12, rWORD2_SHIFT
740	LD rWORD5, rOFF24, rSTR1
741	LD rWORD6, rOFF24, rSTR2
742	cmpld cr1, rWORD3, rWORD4
743	srd r0, rWORD6, rSHR
744	sld rWORD6_SHIFT, rWORD6, rSHL
745	bne cr7, L(duLcr7)
746	or rWORD6, r0, rWORD4_SHIFT
747	cmpld cr6, rWORD5, rWORD6
748	b L(duLoop3)
749	.align `4`
750	/ At this point we exit early with the first double word compare*
751	complete and remainder of 0 to 7 bytes. See L(du14) for details on
752	how we handle the remaining bytes. /*
753	L(duP1x):
754	cmpld cr5, rWORD7, rWORD8
755	sldi. rN, rN, `3`
756	bne cr5, L(duLcr5)
757	cmpld cr7, rN, rSHR
758	beq L(duZeroReturn)
759	li r0, `0`
760	ble cr7, L(dutrim)
761	LD rWORD2, rOFF8, rSTR2
762	srd r0, rWORD2, rSHR
763	b L(dutrim)
764	/ Remainder is 16 /
765	.align `4`
766	L(duP2):
767	srd r0, rWORD8, rSHR
768	LD rWORD5, `0`, rSTR1
769	or rWORD6, r0, rWORD6_SHIFT
770	sld rWORD6_SHIFT, rWORD8, rSHL
771	L(duP2e):
772	LD rWORD7, rOFF8, rSTR1
773	LD rWORD8, rOFF8, rSTR2
774	cmpld cr6, rWORD5, rWORD6
775	srd r12, rWORD8, rSHR
776	sld rWORD8_SHIFT, rWORD8, rSHL
777	or rWORD8, r12, rWORD6_SHIFT
778	blt cr7, L(duP2x)
779	LD rWORD1, rOFF16, rSTR1
780	LD rWORD2, rOFF16, rSTR2
781	cmpld cr5, rWORD7, rWORD8
782	bne cr6, L(duLcr6)
783	srd r0, rWORD2, rSHR
784	sld rWORD2_SHIFT, rWORD2, rSHL
785	or rWORD2, r0, rWORD8_SHIFT
786	LD rWORD3, rOFF24, rSTR1
787	LD rWORD4, rOFF24, rSTR2
788	cmpld cr7, rWORD1, rWORD2
789	bne cr5, L(duLcr5)
790	srd r12, rWORD4, rSHR
791	sld rWORD4_SHIFT, rWORD4, rSHL
792	or rWORD4, r12, rWORD2_SHIFT
793	addi rSTR1, rSTR1, `8`
794	addi rSTR2, rSTR2, `8`
795	cmpld cr1, rWORD3, rWORD4
796	b L(duLoop2)
797	.align `4`
798	L(duP2x):
799	cmpld cr5, rWORD7, rWORD8
800	addi rSTR1, rSTR1, `8`
801	addi rSTR2, rSTR2, `8`
802	bne cr6, L(duLcr6)
803	sldi. rN, rN, `3`
804	bne cr5, L(duLcr5)
805	cmpld cr7, rN, rSHR
806	beq L(duZeroReturn)
807	li r0, `0`
808	ble cr7, L(dutrim)
809	LD rWORD2, rOFF8, rSTR2
810	srd r0, rWORD2, rSHR
811	b L(dutrim)
812
813	/ Remainder is 24 /
814	.align `4`
815	L(duP3):
816	srd r12, rWORD8, rSHR
817	LD rWORD3, `0`, rSTR1
818	sld rWORD4_SHIFT, rWORD8, rSHL
819	or rWORD4, r12, rWORD6_SHIFT
820	L(duP3e):
821	LD rWORD5, rOFF8, rSTR1
822	LD rWORD6, rOFF8, rSTR2
823	cmpld cr1, rWORD3, rWORD4
824	srd r0, rWORD6, rSHR
825	sld rWORD6_SHIFT, rWORD6, rSHL
826	or rWORD6, r0, rWORD4_SHIFT
827	LD rWORD7, rOFF16, rSTR1
828	LD rWORD8, rOFF16, rSTR2
829	cmpld cr6, rWORD5, rWORD6
830	bne cr1, L(duLcr1)
831	srd r12, rWORD8, rSHR
832	sld rWORD8_SHIFT, rWORD8, rSHL
833	or rWORD8, r12, rWORD6_SHIFT
834	blt cr7, L(duP3x)
835	LD rWORD1, rOFF24, rSTR1
836	LD rWORD2, rOFF24, rSTR2
837	cmpld cr5, rWORD7, rWORD8
838	bne cr6, L(duLcr6)
839	srd r0, rWORD2, rSHR
840	sld rWORD2_SHIFT, rWORD2, rSHL
841	or rWORD2, r0, rWORD8_SHIFT
842	addi rSTR1, rSTR1, `16`
843	addi rSTR2, rSTR2, `16`
844	cmpld cr7, rWORD1, rWORD2
845	b L(duLoop1)
846	.align `4`
847	L(duP3x):
848	addi rSTR1, rSTR1, `16`
849	addi rSTR2, rSTR2, `16`
850	cmpld cr5, rWORD7, rWORD8
851	bne cr6, L(duLcr6)
852	sldi. rN, rN, `3`
853	bne cr5, L(duLcr5)
854	cmpld cr7, rN, rSHR
855	beq L(duZeroReturn)
856	li r0, `0`
857	ble cr7, L(dutrim)
858	LD rWORD2, rOFF8, rSTR2
859	srd r0, rWORD2, rSHR
860	b L(dutrim)
861
862	/ Count is a multiple of 32, remainder is 0 /
863	.align `4`
864	L(duP4):
865	mtctr r0
866	srd r0, rWORD8, rSHR
867	LD rWORD1, `0`, rSTR1
868	sld rWORD2_SHIFT, rWORD8, rSHL
869	or rWORD2, r0, rWORD6_SHIFT
870	L(duP4e):
871	LD rWORD3, rOFF8, rSTR1
872	LD rWORD4, rOFF8, rSTR2
873	cmpld cr7, rWORD1, rWORD2
874	srd r12, rWORD4, rSHR
875	sld rWORD4_SHIFT, rWORD4, rSHL
876	or rWORD4, r12, rWORD2_SHIFT
877	LD rWORD5, rOFF16, rSTR1
878	LD rWORD6, rOFF16, rSTR2
879	cmpld cr1, rWORD3, rWORD4
880	bne cr7, L(duLcr7)
881	srd r0, rWORD6, rSHR
882	sld rWORD6_SHIFT, rWORD6, rSHL
883	or rWORD6, r0, rWORD4_SHIFT
884	LD rWORD7, rOFF24, rSTR1
885	LD rWORD8, rOFF24, rSTR2
886	addi rSTR1, rSTR1, `24`
887	addi rSTR2, rSTR2, `24`
888	cmpld cr6, rWORD5, rWORD6
889	bne cr1, L(duLcr1)
890	srd r12, rWORD8, rSHR
891	sld rWORD8_SHIFT, rWORD8, rSHL
892	or rWORD8, r12, rWORD6_SHIFT
893	cmpld cr5, rWORD7, rWORD8
894	bdz L(du24) / Adjust CTR as we start with +4 /
895	/ This is the primary loop /
896	.align `4`
897	L(duLoop):
898	LD rWORD1, rOFF8, rSTR1
899	LD rWORD2, rOFF8, rSTR2
900	cmpld cr1, rWORD3, rWORD4
901	bne cr6, L(duLcr6)
902	srd r0, rWORD2, rSHR
903	sld rWORD2_SHIFT, rWORD2, rSHL
904	or rWORD2, r0, rWORD8_SHIFT
905	L(duLoop1):
906	LD rWORD3, rOFF16, rSTR1
907	LD rWORD4, rOFF16, rSTR2
908	cmpld cr6, rWORD5, rWORD6
909	bne cr5, L(duLcr5)
910	srd r12, rWORD4, rSHR
911	sld rWORD4_SHIFT, rWORD4, rSHL
912	or rWORD4, r12, rWORD2_SHIFT
913	L(duLoop2):
914	LD rWORD5, rOFF24, rSTR1
915	LD rWORD6, rOFF24, rSTR2
916	cmpld cr5, rWORD7, rWORD8
917	bne cr7, L(duLcr7)
918	srd r0, rWORD6, rSHR
919	sld rWORD6_SHIFT, rWORD6, rSHL
920	or rWORD6, r0, rWORD4_SHIFT
921	L(duLoop3):
922	LD rWORD7, rOFF32, rSTR1
923	LD rWORD8, rOFF32, rSTR2
924	addi rSTR1, rSTR1, `32`
925	addi rSTR2, rSTR2, `32`
926	cmpld cr7, rWORD1, rWORD2
927	bne cr1, L(duLcr1)
928	srd r12, rWORD8, rSHR
929	sld rWORD8_SHIFT, rWORD8, rSHL
930	or rWORD8, r12, rWORD6_SHIFT
931	bdnz L(duLoop)
932
933	L(duL4):
934	cmpld cr1, rWORD3, rWORD4
935	bne cr6, L(duLcr6)
936	cmpld cr6, rWORD5, rWORD6
937	bne cr5, L(duLcr5)
938	cmpld cr5, rWORD7, rWORD8
939	L(du44):
940	bne cr7, L(duLcr7)
941	L(du34):
942	bne cr1, L(duLcr1)
943	L(du24):
944	bne cr6, L(duLcr6)
945	L(du14):
946	sldi. rN, rN, `3`
947	bne cr5, L(duLcr5)
948	/ At this point we have a remainder of 1 to 7 bytes to compare. We use*
949	shift right double to eliminate bits beyond the compare length.
950
951	However it may not be safe to load rWORD2 which may be beyond the
952	string length. So we compare the bit length of the remainder to
953	the right shift count (rSHR). If the bit count is less than or equal
954	we do not need to load rWORD2 (all significant bits are already in
955	rWORD8_SHIFT). /*
956	cmpld cr7, rN, rSHR
957	beq L(duZeroReturn)
958	li r0, `0`
959	ble cr7, L(dutrim)
960	LD rWORD2, rOFF8, rSTR2
961	srd r0, rWORD2, rSHR
962	.align `4`
963	L(dutrim): /* Compare the final 1-7 bytes. In: rN = remaining bits, r0 = bits from the next s2 DW (or 0). */
964	LD rWORD1, rOFF8, rSTR1 /* last s1 doubleword */
965	ld rWORD8, rWORD8SAVE(r1) /* start restoring saved registers, interleaved with the compare */
966	subfic rN, rN, `64` /* Shift count is 64 - remaining bits. */
967	or rWORD2, r0, rWORD8_SHIFT /* assemble the matching s2 doubleword */
968	ld rWORD7, rWORD7SAVE(r1)
969	ld rSHL, rSHLSAVE(r1)
970	srd rWORD1, rWORD1, rN /* drop the bits beyond the compare length */
971	srd rWORD2, rWORD2, rN
972	ld rSHR, rSHRSAVE(r1)
973	ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
974	li rRTN, `0` /* r3 (rSTR1) is no longer needed as an address */
975	cmpld cr7, rWORD1, rWORD2
976	ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
977	ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
978	beq cr7, L(dureturn24) /* equal: return 0 through the remaining restores */
979	li rRTN, `1`
980	ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
981	ld rOFF8, rOFF8SAVE(r1)
982	ld rOFF16, rOFF16SAVE(r1)
983	ld rOFF24, rOFF24SAVE(r1)
984	ld rOFF32, rOFF32SAVE(r1)
985	bgtlr cr7 /* s1 > s2 (unsigned): return 1 */
986	li rRTN, -`1` /* otherwise s1 < s2: return -1 */
987	blr
988	.align `4`
989	L(duLcr7):
990	ld rWORD8, rWORD8SAVE(r1)
991	ld rWORD7, rWORD7SAVE(r1)
992	li rRTN, `1`
993	bgt cr7, L(dureturn29)
994	ld rSHL, rSHLSAVE(r1)
995	ld rSHR, rSHRSAVE(r1)
996	li rRTN, -`1`
997	b L(dureturn27)
998	.align `4`
999	L(duLcr1):
1000	ld rWORD8, rWORD8SAVE(r1)
1001	ld rWORD7, rWORD7SAVE(r1)
1002	li rRTN, `1`
1003	bgt cr1, L(dureturn29)
1004	ld rSHL, rSHLSAVE(r1)
1005	ld rSHR, rSHRSAVE(r1)
1006	li rRTN, -`1`
1007	b L(dureturn27)
1008	.align `4`
1009	L(duLcr6):
1010	ld rWORD8, rWORD8SAVE(r1)
1011	ld rWORD7, rWORD7SAVE(r1)
1012	li rRTN, `1`
1013	bgt cr6, L(dureturn29)
1014	ld rSHL, rSHLSAVE(r1)
1015	ld rSHR, rSHRSAVE(r1)
1016	li rRTN, -`1`
1017	b L(dureturn27)
1018	.align `4`
1019	L(duLcr5):
1020	ld rWORD8, rWORD8SAVE(r1)
1021	ld rWORD7, rWORD7SAVE(r1)
1022	li rRTN, `1`
1023	bgt cr5, L(dureturn29)
1024	ld rSHL, rSHLSAVE(r1)
1025	ld rSHR, rSHRSAVE(r1)
1026	li rRTN, -`1`
1027	b L(dureturn27)
1028
1029	.align `3`
1030	L(duZeroReturn): /* strings compared equal on the unaligned path: return 0 */
1031	li rRTN, `0`
1032	.align `4`
1033	L(dureturn): /* restore every register saved by the unaligned path; rRTN is already set */
1034	ld rWORD8, rWORD8SAVE(r1)
1035	ld rWORD7, rWORD7SAVE(r1)
1036	L(dureturn29): /* entry for callers that have already reloaded rWORD8/rWORD7 */
1037	ld rSHL, rSHLSAVE(r1)
1038	ld rSHR, rSHRSAVE(r1)
1039	L(dureturn27): /* entry for callers that have also reloaded rSHL/rSHR */
1040	ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
1041	ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
1042	ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
1043	L(dureturn24): /* entry for L(dutrim), which has reloaded everything above */
1044	ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1045	ld rOFF8, rOFF8SAVE(r1)
1046	ld rOFF16, rOFF16SAVE(r1)
1047	ld rOFF24, rOFF24SAVE(r1)
1048	ld rOFF32, rOFF32SAVE(r1)
1049	blr
1050
1051	L(duzeroLength):
1052	ld rOFF8, rOFF8SAVE(r1)
1053	ld rOFF16, rOFF16SAVE(r1)
1054	ld rOFF24, rOFF24SAVE(r1)
1055	ld rOFF32, rOFF32SAVE(r1)
1056	li rRTN, `0`
1057	blr
1058
1059	END (MEMCMP)
1060	libc_hidden_builtin_def (memcmp)
1061	weak_alias (memcmp, bcmp)
1062	strong_alias (memcmp, __memcmpeq)
1063	libc_hidden_def (__memcmpeq)
1064

source code of glibc/sysdeps/powerpc/powerpc64/power7/memcmp.S