/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
   Copyright (C) 2012-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <arm-features.h>
#include <sysdep.h>

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
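
/* The core trick below detects, one 32-bit word at a time, whether the
   two strings differ or whether data1 contains a NUL byte.  A rough,
   illustrative C sketch of what the UADD8/SEL pair computes (this is
   not the generated code, just the idea):

     uint32_t x = data1 ^ data2;        // non-zero bytes mark differences
     uint32_t syndrome = 0;
     for (int i = 0; i < 4; i++)
       {
         uint32_t b1 = (data1 >> (8 * i)) & 0xff;
         uint32_t bx = (x >> (8 * i)) & 0xff;
         // 0xff where data1 has a NUL byte, the xor byte elsewhere.
         syndrome |= (b1 == 0 ? 0xff : bx) << (8 * i);
       }
     // syndrome != 0  <=>  a difference or the end of the string.

   UADD8 of data1 with 0xffffffff sets a per-byte GE flag exactly where
   the byte of data1 is non-zero (the addition of 0xff carries), and SEL
   then picks the xor byte there and 0xff elsewhere.  */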

/* Build Options:
   STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
   string.  If comparing completely random strings the pre-check will
   save time, since there is a very high probability of a mismatch in
   the first character: we save significant overhead if this is the
   common case.  However, if strings are likely to be identical (e.g.
   because we're verifying a hit in a hash table), then this check
   is largely redundant.  */
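
/* With the pre-check enabled, the entry sequence is roughly equivalent
   to the following C (illustrative only; the real code keeps the loaded
   bytes in r2/r3 for the fall-through path):

     if (s1[0] == '\0' || s1[0] != s2[0])
       return s1[0] - s2[0];

   The "cmp r2, #1" below sets the carry flag exactly when the first
   byte of s1 is non-zero, so the second compare is skipped for a NUL.  */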

#define STRCMP_PRECHECK 1

        .syntax unified

#ifdef __ARM_BIG_ENDIAN
# define S2LO lsl
# define S2LOEQ lsleq
# define S2HI lsr
# define MSB 0x000000ff
# define LSB 0xff000000
# define BYTE0_OFFSET 24
# define BYTE1_OFFSET 16
# define BYTE2_OFFSET 8
# define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
# define S2LO lsr
# define S2LOEQ lsreq
# define S2HI lsl
# define BYTE0_OFFSET 0
# define BYTE1_OFFSET 8
# define BYTE2_OFFSET 16
# define BYTE3_OFFSET 24
# define MSB 0xff000000
# define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1 r0
#define src2 r1
#define result r0       /* Overlaps src1.  */

/* Internal variables.  */
#define tmp1 r4
#define tmp2 r5
#define const_m1 r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a r2
#define data1b r3
#define data2a r6
#define data2b r7
#define syndrome_a tmp1
#define syndrome_b tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1 r2
#define data2 r3
#define syndrome tmp2


        .thumb

/* In Thumb code we can't use MVN with a register shift, but we do have ORN.  */
.macro prepare_mask mask_reg, nbits_reg
        S2HI    \mask_reg, const_m1, \nbits_reg
.endm
.macro apply_mask data_reg, mask_reg
        orn     \data_reg, \data_reg, \mask_reg
.endm
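
/* prepare_mask builds a mask whose clear bits cover the \nbits_reg bits
   that precede the true start of the string once the pointers have been
   aligned downwards; apply_mask then forces those bytes to 0xff in both
   data words so they can neither compare unequal nor look like a NUL.
   For example (little-endian, misalignment of 2 bytes, so nbits = 16):
   the mask is 0xffff0000 and ORN sets the two low bytes of the data
   word to 0xff.  */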

        /* Macro to compute and return the result value for word-aligned
           cases.  */
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
        clz     tmp1, \synd
        lsl     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        lsl     \d1, \d1, tmp1
        lsr     result, \d1, #24
        ldrd    r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        sub     result, result, r1, lsr #24
        bx      lr
#else
        /* To use the big-endian trick we'd have to reverse all three words,
           but that's slower than this approach.  */
        rev     \synd, \synd
        clz     tmp1, \synd
        bic     tmp1, tmp1, #7
        lsr     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        lsr     \d1, \d1, tmp1
        and     result, \d1, #255
        and     r1, r1, #255
        ldrd    r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        sub     result, result, r1

        bx      lr
#endif
        .endm
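
        /* For reference, the little-endian branch of the macro above is
           roughly (illustrative C sketch, not the generated code):

             int shift = __builtin_ctz (syndrome) & ~7;  // first marked byte
             return ((data1 >> shift) & 0xff) - ((data2 >> shift) & 0xff);

           i.e. it returns the difference of the first pair of bytes, in
           string order, that either differ or where data1 holds a NUL.  */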

        .text
        .p2align 5
.Lstrcmp_start_addr:
#if STRCMP_PRECHECK == 1
.Lfastpath_exit:
        sub     r0, r2, r3
        bx      lr
        nop
#endif
ENTRY (strcmp)
#if STRCMP_PRECHECK == 1
        ldrb    r2, [src1]
        ldrb    r3, [src2]
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        bne     .Lfastpath_exit
#endif
        strd    r4, r5, [sp, #-16]!
        cfi_def_cfa_offset (16)
        cfi_offset (r4, -16)
        cfi_offset (r5, -12)
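        /* OR the two pointers together and shift the low three bits to the
           top: the result is zero only if both src1 and src2 are already
           8-byte aligned, in which case we can go straight to the ldrd
           loop.  */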
        orr     tmp1, src1, src2
        strd    r6, r7, [sp, #8]
        cfi_offset (r6, -8)
        cfi_offset (r7, -4)
        mvn     const_m1, #0
        lsl     r2, tmp1, #29
        cbz     r2, .Lloop_aligned8

.Lnot_aligned:
        eor     tmp1, src1, src2
        tst     tmp1, #7
        bne     .Lmisaligned8

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
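        /* Only the low two bits of the offset select the byte mask; bit 2
           (tested below) says whether the whole first 32-bit word precedes
           the start of the strings, in which case that word is simply
           replaced by all-ones in both operands.  */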
        and     tmp1, src1, #7
        bic     src1, src1, #7
        and     tmp2, tmp1, #3
        bic     src2, src2, #7
        lsl     tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd    data1a, data1b, [src1], #16
        tst     tmp1, #4
        ldrd    data2a, data2b, [src2], #16
        prepare_mask tmp1, tmp2
        apply_mask data1a, tmp1
        apply_mask data2a, tmp1
        beq     .Lstart_realigned8
        apply_mask data1b, tmp1
        mov     data1a, const_m1
        apply_mask data2b, tmp1
        mov     data2a, const_m1
        b       .Lstart_realigned8

        /* Unwind the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
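        /* Each half of an iteration computes a per-word syndrome (see the
           sketch near the top of the file): it is non-zero as soon as the
           words differ or data1 contains a NUL byte.  */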
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2      /* Always word aligned.  */
.Lloop_aligned8:
        ldrd    data1a, data1b, [src1], #16
        ldrd    data2a, data2b, [src2], #16
.Lstart_realigned8:
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        cbnz    syndrome_a, .Ldiff_in_a
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        cbnz    syndrome_b, .Ldiff_in_b

        ldrd    data1a, data1b, [src1, #-8]
        ldrd    data2a, data2b, [src2, #-8]
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch.  */
        orrs    syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
        beq     .Lloop_aligned8

.Ldiff_found:
        cbnz    syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
        cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        cfi_restore_state
.Lmisaligned8:
        tst     tmp1, #3
        bne     .Lmisaligned4
        ands    tmp1, src1, #3
        bne     .Lmutual_align4

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
.Lloop_aligned4:
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
.Lstart_realigned4:
        uadd8   syndrome, data1, const_m1       /* Only need GE bits.  */
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cbnz    syndrome, .Laligned4_done
        ldr     data1, [src1, #-4]
        ldr     data2, [src2, #-4]
        uadd8   syndrome, data1, const_m1
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cmp     syndrome, #0
        beq     .Lloop_aligned4

.Laligned4_done:
        strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
        cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl     tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic     src1, src1, #3
        ldr     data1, [src1], #8
        bic     src2, src2, #3
        ldr     data2, [src2], #8

        prepare_mask tmp1, tmp1
        apply_mask data1, tmp1
        apply_mask data2, tmp1
        b       .Lstart_realigned4

.Lmisaligned4:
        ands    tmp1, src1, #3
        beq     .Lsrc1_aligned
        sub     src2, src2, tmp1
        bic     src1, src1, #3
        lsls    tmp1, tmp1, #31
        ldr     data1, [src1], #4
        beq     .Laligned_m2
        bcs     .Laligned_m1
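        /* After "lsls tmp1, tmp1, #31" the carry flag holds bit 1 of the
           original src1 offset and the Z flag is set when bit 0 was clear,
           so Z selects the two-bytes-to-go case and C the one-byte-to-go
           case; otherwise three bytes remain before src1 becomes word
           aligned.  */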

#if STRCMP_PRECHECK == 0
        ldrb    data2, [src2, #1]
        uxtb    tmp1, data1, ror #BYTE1_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m1:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        add     src2, src2, #4
        cbnz    data2, .Lsrc1_aligned
#else /* STRCMP_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbnz    data2, .Laligned_m1
#endif

.Lmisaligned_exit:
        mov     result, tmp1
        ldr     r4, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        bx      lr

#if STRCMP_PRECHECK == 1
.Laligned_m1:
        add     src2, src2, #4
#endif
.Lsrc1_aligned:
        cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
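        /* In the .LoverlapN blocks below, N is the number of bytes of each
           aligned src2 word that line up with the current src1 word.  Each
           iteration compares the usable part of data2 against data1, checks
           data1 for a NUL with UADD8/SEL, then fetches the next src2 word
           and compares the bytes that spilled over into it.  */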
        ldr     data1, [src1], #4
        lsls    tmp1, src2, #31         /* C=src2[1], Z=src2[0].  */

        bic     src2, src2, #3
        ldr     data2, [src2], #4
        bhi     .Loverlap1              /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs     .Loverlap2              /* C=1, Z=1 => src2[1:0] = 0b10.  */

        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
        bic     tmp1, data1, #MSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #8
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #24
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap3
4:
        S2LO    data2, data2, #8
        b       .Lstrcmp_tail

5:
        bics    syndrome, syndrome, #MSB
        bne     .Lstrcmp_done_equal

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb    result, [src2]
        ldrd    r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 Not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        neg     result, result
        bx      lr

6:
        cfi_restore_state
        S2LO    data1, data1, #24
        and     data2, data2, #LSB
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
        and     tmp1, data1, const_m1, S2LO #16
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #16
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #16
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap2
4:
        S2LO    data2, data2, #16
        b       .Lstrcmp_tail
5:
        ands    syndrome, syndrome, const_m1, S2LO #16
        bne     .Lstrcmp_done_equal

        ldrh    data2, [src2]
        S2LO    data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl     data2, data2, #16
#endif
        b       .Lstrcmp_tail

6:
        S2LO    data1, data1, #16
        and     data2, data2, const_m1, S2LO #16
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
        and     tmp1, data1, #LSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #24
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #8
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap1
4:
        S2LO    data2, data2, #24
        b       .Lstrcmp_tail
5:
        tst     syndrome, #LSB
        bne     .Lstrcmp_done_equal
        ldr     data2, [src2]
6:
        S2LO    data1, data1, #8
        bic     data2, data2, #MSB
        b       .Lstrcmp_tail

.Lstrcmp_done_equal:
        mov     result, #0
        ldrd    r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        bx      lr

.Lstrcmp_tail:
        cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev     data1, data1
        rev     data2, data2
        /* Now everything looks big-endian...  */
#endif
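        /* With both words now in big-endian byte order, the first marked
           byte can be located directly with CLZ, just as in the big-endian
           branch of strcmp_epilogue_aligned above.  */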
        uadd8   tmp1, data1, const_m1
        eor     tmp1, data1, data2
        sel     syndrome, tmp1, const_m1
        clz     tmp1, syndrome
        lsl     data1, data1, tmp1
        lsl     data2, data2, tmp1
        lsr     result, data1, #24
        ldrd    r4, r5, [sp], #16
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        sub     result, result, data2, lsr #24
        bx      lr
END (strcmp)
libc_hidden_builtin_def (strcmp)