/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
   Copyright (C) 2012-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <arm-features.h>
#include <sysdep.h>

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
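
/* For illustration only (this comment is never assembled): a rough C
   sketch of the word-at-a-time strategy used below, assuming both
   pointers are word aligned and ignoring the misaligned cases handled
   later in this file.  The helper names are hypothetical and the code
   glosses over strict-aliasing concerns.

   #include <stdint.h>

   // Exact bit-twiddling test: nonzero iff some byte of W is zero.
   static int has_nul_byte (uint32_t w)
   {
     return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
   }

   // Simplified model: both pointers assumed 4-byte aligned.
   static int strcmp_words (const char *s1, const char *s2)
   {
     const uint32_t *w1 = (const uint32_t *) s1;
     const uint32_t *w2 = (const uint32_t *) s2;

     while (*w1 == *w2 && !has_nul_byte (*w1))
       {
         w1++;
         w2++;
       }

     // Resolve the final word a byte at a time.
     const unsigned char *p1 = (const unsigned char *) w1;
     const unsigned char *p2 = (const unsigned char *) w2;
     while (*p1 != 0 && *p1 == *p2)
       {
         p1++;
         p2++;
       }
     return *p1 - *p2;
   }  */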

/* Build Options:
   STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
   string.  If comparing completely random strings the pre-check will
   save time, since there is a very high probability of a mismatch in
   the first character: we save significant overhead if this is the
   common case.  However, if strings are likely to be identical (e.g.
   because we're verifying a hit in a hash table), then this check
   is largely redundant.  */

#define STRCMP_PRECHECK 1
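
/* For illustration only (not assembled): a hypothetical C model of the
   STRCMP_PRECHECK fast path.  It returns 1 and stores the result when
   the first bytes already decide the comparison.

   // Hypothetical helper modelling the pre-check.
   static int strcmp_precheck (const char *s1, const char *s2, int *result)
   {
     unsigned char c1 = *(const unsigned char *) s1;
     unsigned char c2 = *(const unsigned char *) s2;
     if (c1 == 0 || c1 != c2)
       {
         *result = c1 - c2;
         return 1;   // decided by the first byte
       }
     return 0;       // fall through to the main comparison loops
   }  */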

        .syntax unified

#ifdef __ARM_BIG_ENDIAN
# define S2LO lsl
# define S2LOEQ lsleq
# define S2HI lsr
# define MSB 0x000000ff
# define LSB 0xff000000
# define BYTE0_OFFSET 24
# define BYTE1_OFFSET 16
# define BYTE2_OFFSET 8
# define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
# define S2LO lsr
# define S2LOEQ lsreq
# define S2HI lsl
# define BYTE0_OFFSET 0
# define BYTE1_OFFSET 8
# define BYTE2_OFFSET 16
# define BYTE3_OFFSET 24
# define MSB 0xff000000
# define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */
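
/* For illustration only (not assembled): how the BYTEn_OFFSET values are
   used.  Shifting a naturally aligned 32-bit load right by BYTEn_OFFSET
   and masking with 0xff yields string byte n (in memory order) on either
   endianness, because the offsets above are defined per-endianness.  The
   helper below is hypothetical.

   #include <stdint.h>

   // Extract the N'th string byte (memory order) from an aligned load.
   static unsigned char string_byte (uint32_t word, unsigned offset_bits)
   {
     return (word >> offset_bits) & 0xff;   // e.g. offset_bits == BYTE2_OFFSET
   }  */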

/* Parameters and result.  */
#define src1 r0
#define src2 r1
#define result r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1 r4
#define tmp2 r5
#define const_m1 r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a r2
#define data1b r3
#define data2a r6
#define data2b r7
#define syndrome_a tmp1
#define syndrome_b tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1 r2
#define data2 r3
#define syndrome tmp2


        .thumb

/* In Thumb code we can't use MVN with a register shift, but we do
   have ORN.  */
.macro prepare_mask mask_reg, nbits_reg
        S2HI \mask_reg, const_m1, \nbits_reg
.endm
.macro apply_mask data_reg, mask_reg
        orn \data_reg, \data_reg, \mask_reg
.endm
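
/* For illustration only (not assembled): a little-endian C model of the
   two macros above.  NBITS is the number of low-order bits (unwanted
   leading string bytes * 8) that must be forced to compare equal in both
   strings.  Names are hypothetical.

   #include <stdint.h>

   static uint32_t prepare_mask_le (uint32_t nbits)
   {
     return 0xffffffffu << nbits;   // S2HI is LSL on little-endian
   }

   static uint32_t apply_mask_le (uint32_t data, uint32_t mask)
   {
     return data | ~mask;           // ORN: set the unwanted bytes to 0xff
   }  */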

/* Macro to compute and return the result value for word-aligned
   cases.  */
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
        clz tmp1, \synd
        lsl r1, \d2, tmp1
        .if \restore_r6
        ldrd r6, r7, [sp, #8]
        .endif
        lsl \d1, \d1, tmp1
        lsr result, \d1, #24
        ldrd r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        sub result, result, r1, lsr #24
        bx lr
#else
        /* To use the big-endian trick we'd have to reverse all three words.
           That's slower than this approach.  */
        rev \synd, \synd
        clz tmp1, \synd
        bic tmp1, tmp1, #7
        lsr r1, \d2, tmp1
        .if \restore_r6
        ldrd r6, r7, [sp, #8]
        .endif
        lsr \d1, \d1, tmp1
        and result, \d1, #255
        and r1, r1, #255
        ldrd r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        sub result, result, r1

        bx lr
#endif
        .endm
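
/* For illustration only (not assembled): a C model of the little-endian
   epilogue path above, written with the usual GCC builtins.  SYNDROME must
   be non-zero; it has a non-zero byte at every string position where the
   two data words differ or where data1 holds a NUL, and the result is the
   byte difference at the first such position.

   #include <stdint.h>

   static int epilogue_le (uint32_t syndrome, uint32_t data1, uint32_t data2)
   {
     uint32_t rsynd = __builtin_bswap32 (syndrome);   // models the REV insn
     unsigned shift = __builtin_clz (rsynd) & ~7u;    // CLZ then BIC #7
     unsigned char b1 = (data1 >> shift) & 0xff;
     unsigned char b2 = (data2 >> shift) & 0xff;
     return b1 - b2;
   }  */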

        .text
        .p2align 5
.Lstrcmp_start_addr:
#if STRCMP_PRECHECK == 1
.Lfastpath_exit:
        sub r0, r2, r3
        bx lr
        nop
#endif
ENTRY (strcmp)
#if STRCMP_PRECHECK == 1
        ldrb r2, [src1]
        ldrb r3, [src2]
        cmp r2, #1
        it cs
        cmpcs r2, r3
        bne .Lfastpath_exit
#endif
        strd r4, r5, [sp, #-16]!
        cfi_def_cfa_offset (16)
        cfi_offset (r4, -16)
        cfi_offset (r5, -12)
        orr tmp1, src1, src2
        strd r6, r7, [sp, #8]
        cfi_offset (r6, -8)
        cfi_offset (r7, -4)
        mvn const_m1, #0
        lsl r2, tmp1, #29
        cbz r2, .Lloop_aligned8

.Lnot_aligned:
        eor tmp1, src1, src2
        tst tmp1, #7
        bne .Lmisaligned8

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        and tmp1, src1, #7
        bic src1, src1, #7
        and tmp2, tmp1, #3
        bic src2, src2, #7
        lsl tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd data1a, data1b, [src1], #16
        tst tmp1, #4
        ldrd data2a, data2b, [src2], #16
        prepare_mask tmp1, tmp2
        apply_mask data1a, tmp1
        apply_mask data2a, tmp1
        beq .Lstart_realigned8
        apply_mask data1b, tmp1
        mov data1a, const_m1
        apply_mask data2b, tmp1
        mov data2a, const_m1
        b .Lstart_realigned8
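
/* For illustration only (not assembled): the effect of the realignment
   above, sketched in C with 32-bit loads for brevity (the code uses
   64-bit ldrd pairs).  Little-endian assumed; the aligned under-read is
   valid in the assembly but not strictly portable C.

   #include <stdint.h>

   // Load the aligned word containing P and force the bytes before P to
   // 0xff so they cannot create a spurious difference between the strings.
   static uint32_t load_aligned_masked (const char *p)
   {
     uintptr_t addr = (uintptr_t) p;
     uint32_t word = *(const uint32_t *) (addr & ~(uintptr_t) 3);
     unsigned lead = (addr & 3) * 8;          // unwanted leading bits
     return word | ~(0xffffffffu << lead);    // pad them with 0xff
   }  */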

        /* Unwind the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2  /* Always word aligned.  */
.Lloop_aligned8:
        ldrd data1a, data1b, [src1], #16
        ldrd data2a, data2b, [src2], #16
.Lstart_realigned8:
        uadd8 syndrome_b, data1a, const_m1  /* Only want GE bits.  */
        eor syndrome_a, data1a, data2a
        sel syndrome_a, syndrome_a, const_m1
        cbnz syndrome_a, .Ldiff_in_a
        uadd8 syndrome_b, data1b, const_m1  /* Only want GE bits.  */
        eor syndrome_b, data1b, data2b
        sel syndrome_b, syndrome_b, const_m1
        cbnz syndrome_b, .Ldiff_in_b

        ldrd data1a, data1b, [src1, #-8]
        ldrd data2a, data2b, [src2, #-8]
        uadd8 syndrome_b, data1a, const_m1  /* Only want GE bits.  */
        eor syndrome_a, data1a, data2a
        sel syndrome_a, syndrome_a, const_m1
        uadd8 syndrome_b, data1b, const_m1  /* Only want GE bits.  */
        eor syndrome_b, data1b, data2b
        sel syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch.  */
        orrs syndrome_b, syndrome_b, syndrome_a  /* Only need if s_a == 0.  */
        beq .Lloop_aligned8
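
/* For illustration only (not assembled): a C model of how UADD8 + SEL
   build the syndrome above.  UADD8 of a data word with 0xffffffff sets the
   GE flag for every byte lane that is non-zero; SEL then keeps the XOR
   difference in those lanes and substitutes 0xff wherever data1 has a NUL.
   The helper name is hypothetical.

   #include <stdint.h>

   static uint32_t compute_syndrome (uint32_t data1, uint32_t data2)
   {
     uint32_t syndrome = 0;
     for (int i = 0; i < 4; i++)
       {
         unsigned char b1 = (data1 >> (8 * i)) & 0xff;
         unsigned char b2 = (data2 >> (8 * i)) & 0xff;
         // GE for this lane: b1 + 0xff carries out iff b1 != 0.
         unsigned char lane = (b1 != 0) ? (unsigned char) (b1 ^ b2) : 0xff;
         syndrome |= (uint32_t) lane << (8 * i);
       }
     return syndrome;   // non-zero iff the words differ or data1 has a NUL
   }  */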

.Ldiff_found:
        cbnz syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
        cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        cfi_restore_state
.Lmisaligned8:
        tst tmp1, #3
        bne .Lmisaligned4
        ands tmp1, src1, #3
        bne .Lmutual_align4

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
.Lloop_aligned4:
        ldr data1, [src1], #8
        ldr data2, [src2], #8
.Lstart_realigned4:
        uadd8 syndrome, data1, const_m1  /* Only need GE bits.  */
        eor syndrome, data1, data2
        sel syndrome, syndrome, const_m1
        cbnz syndrome, .Laligned4_done
        ldr data1, [src1, #-4]
        ldr data2, [src2, #-4]
        uadd8 syndrome, data1, const_m1
        eor syndrome, data1, data2
        sel syndrome, syndrome, const_m1
        cmp syndrome, #0
        beq .Lloop_aligned4

.Laligned4_done:
        strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
        cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic src1, src1, #3
        ldr data1, [src1], #8
        bic src2, src2, #3
        ldr data2, [src2], #8

        prepare_mask tmp1, tmp1
        apply_mask data1, tmp1
        apply_mask data2, tmp1
        b .Lstart_realigned4

.Lmisaligned4:
        ands tmp1, src1, #3
        beq .Lsrc1_aligned
        sub src2, src2, tmp1
        bic src1, src1, #3
        lsls tmp1, tmp1, #31
        ldr data1, [src1], #4
        beq .Laligned_m2
        bcs .Laligned_m1

#if STRCMP_PRECHECK == 0
        ldrb data2, [src2, #1]
        uxtb tmp1, data1, ror #BYTE1_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbz data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb data2, [src2, #2]
        uxtb tmp1, data1, ror #BYTE2_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbz data2, .Lmisaligned_exit

.Laligned_m1:
        ldrb data2, [src2, #3]
        uxtb tmp1, data1, ror #BYTE3_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        add src2, src2, #4
        cbnz data2, .Lsrc1_aligned
#else  /* STRCMP_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb data2, [src2, #2]
        uxtb tmp1, data1, ror #BYTE2_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbz data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb data2, [src2, #3]
        uxtb tmp1, data1, ror #BYTE3_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbnz data2, .Laligned_m1
#endif
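
/* For illustration only (not assembled): a hypothetical C sketch of the
   byte-at-a-time catch-up performed above when src1 starts misaligned.
   Bytes are compared until src1 reaches a word boundary, bailing out on a
   mismatch or a NUL; the code above does this with one aligned load of
   data1 and per-byte extracts.

   #include <stdint.h>

   static int catch_up_to_alignment (const unsigned char **s1,
                                     const unsigned char **s2, int *result)
   {
     while (((uintptr_t) *s1 & 3) != 0)
       {
         unsigned char c1 = **s1, c2 = **s2;
         if (c1 != c2 || c1 == 0)
           {
             *result = c1 - c2;
             return 1;            // comparison decided here
           }
         (*s1)++;
         (*s2)++;
       }
     return 0;                    // s1 is now word aligned; keep going
   }  */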

.Lmisaligned_exit:
        mov result, tmp1
        ldr r4, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        bx lr

#if STRCMP_PRECHECK == 1
.Laligned_m1:
        add src2, src2, #4
#endif
.Lsrc1_aligned:
        cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
        ldr data1, [src1], #4
        lsls tmp1, src2, #31  /* C=src2[1], Z=src2[0].  */

        bic src2, src2, #3
        ldr data2, [src2], #4
        bhi .Loverlap1  /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs .Loverlap2  /* C=1, Z=1 => src2[1:0] = 0b10.  */
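
/* For illustration only (not assembled): the idea behind the overlapN
   cases, sketched in C with little-endian assumed.  src1 is word aligned
   while src2 is OFS bytes (1..3) into its word, so each aligned src1 word
   is compared against the trailing bytes of one aligned src2 word combined
   with the leading bytes of the next.  The helper is hypothetical and
   requires 1 <= ofs <= 3.

   #include <stdint.h>

   static int words_match (uint32_t w1, uint32_t w2_lo, uint32_t w2_hi,
                           unsigned ofs)
   {
     uint32_t combined = (w2_lo >> (8 * ofs)) | (w2_hi << (32 - 8 * ofs));
     return combined == w1;
   }  */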

        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
        bic tmp1, data1, #MSB
        uadd8 syndrome, data1, const_m1
        eors syndrome, tmp1, data2, S2LO #8
        sel syndrome, syndrome, const_m1
        bne 4f
        cbnz syndrome, 5f
        ldr data2, [src2], #4
        eor tmp1, tmp1, data1
        cmp tmp1, data2, S2HI #24
        bne 6f
        ldr data1, [src1], #4
        b .Loverlap3
4:
        S2LO data2, data2, #8
        b .Lstrcmp_tail

5:
        bics syndrome, syndrome, #MSB
        bne .Lstrcmp_done_equal

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb result, [src2]
        ldrd r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        neg result, result
        bx lr

6:
        cfi_restore_state
        S2LO data1, data1, #24
        and data2, data2, #LSB
        b .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
        and tmp1, data1, const_m1, S2LO #16
        uadd8 syndrome, data1, const_m1
        eors syndrome, tmp1, data2, S2LO #16
        sel syndrome, syndrome, const_m1
        bne 4f
        cbnz syndrome, 5f
        ldr data2, [src2], #4
        eor tmp1, tmp1, data1
        cmp tmp1, data2, S2HI #16
        bne 6f
        ldr data1, [src1], #4
        b .Loverlap2
4:
        S2LO data2, data2, #16
        b .Lstrcmp_tail
5:
        ands syndrome, syndrome, const_m1, S2LO #16
        bne .Lstrcmp_done_equal

        ldrh data2, [src2]
        S2LO data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl data2, data2, #16
#endif
        b .Lstrcmp_tail

6:
        S2LO data1, data1, #16
        and data2, data2, const_m1, S2LO #16
        b .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
        and tmp1, data1, #LSB
        uadd8 syndrome, data1, const_m1
        eors syndrome, tmp1, data2, S2LO #24
        sel syndrome, syndrome, const_m1
        bne 4f
        cbnz syndrome, 5f
        ldr data2, [src2], #4
        eor tmp1, tmp1, data1
        cmp tmp1, data2, S2HI #8
        bne 6f
        ldr data1, [src1], #4
        b .Loverlap1
4:
        S2LO data2, data2, #24
        b .Lstrcmp_tail
5:
        tst syndrome, #LSB
        bne .Lstrcmp_done_equal
        ldr data2, [src2]
6:
        S2LO data1, data1, #8
        bic data2, data2, #MSB
        b .Lstrcmp_tail

.Lstrcmp_done_equal:
        mov result, #0
        ldrd r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        bx lr

.Lstrcmp_tail:
        cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev data1, data1
        rev data2, data2
        /* Now everything looks big-endian...  */
#endif
        uadd8 tmp1, data1, const_m1
        eor tmp1, data1, data2
        sel syndrome, tmp1, const_m1
        clz tmp1, syndrome
        lsl data1, data1, tmp1
        lsl data2, data2, tmp1
        lsr result, data1, #24
        ldrd r4, r5, [sp], #16
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        sub result, result, data2, lsr #24
        bx lr
END (strcmp)
libc_hidden_builtin_def (strcmp)