/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
   Copyright (C) 2012-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <arm-features.h>
#include <sysdep.h>

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
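
/* For illustration only (this comment is never assembled): a rough C
   sketch of the word-at-a-time strategy used below, assuming both
   pointers are word aligned and ignoring the misaligned cases handled
   later in this file.  The helper names are hypothetical and the code
   glosses over strict-aliasing concerns.

   #include <stdint.h>

   // Exact bit-twiddling test: nonzero iff some byte of W is zero.
   static int has_nul_byte (uint32_t w)
   {
     return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
   }

   // Simplified model: both pointers assumed 4-byte aligned.
   static int strcmp_words (const char *s1, const char *s2)
   {
     const uint32_t *w1 = (const uint32_t *) s1;
     const uint32_t *w2 = (const uint32_t *) s2;

     while (*w1 == *w2 && !has_nul_byte (*w1))
       {
         w1++;
         w2++;
       }

     // Resolve the final word a byte at a time.
     const unsigned char *p1 = (const unsigned char *) w1;
     const unsigned char *p2 = (const unsigned char *) w2;
     while (*p1 != 0 && *p1 == *p2)
       {
         p1++;
         p2++;
       }
     return *p1 - *p2;
   }  */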

/* Build Options:
   STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
   string.  If comparing completely random strings the pre-check will
   save time, since there is a very high probability of a mismatch in
   the first character: we save significant overhead if this is the
   common case.  However, if strings are likely to be identical (e.g.
   because we're verifying a hit in a hash table), then this check
   is largely redundant.  */

#define STRCMP_PRECHECK 1
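
/* For illustration only (not assembled): a hypothetical C model of the
   STRCMP_PRECHECK fast path.  It returns 1 and stores the result when
   the first bytes already decide the comparison.

   // Hypothetical helper modelling the pre-check.
   static int strcmp_precheck (const char *s1, const char *s2, int *result)
   {
     unsigned char c1 = *(const unsigned char *) s1;
     unsigned char c2 = *(const unsigned char *) s2;
     if (c1 == 0 || c1 != c2)
       {
         *result = c1 - c2;
         return 1;   // decided by the first byte
       }
     return 0;       // fall through to the main comparison loops
   }  */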

        .syntax unified

#ifdef __ARM_BIG_ENDIAN
# define S2LO lsl
# define S2LOEQ lsleq
# define S2HI lsr
# define MSB 0x000000ff
# define LSB 0xff000000
# define BYTE0_OFFSET 24
# define BYTE1_OFFSET 16
# define BYTE2_OFFSET 8
# define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
# define S2LO lsr
# define S2LOEQ lsreq
# define S2HI lsl
# define BYTE0_OFFSET 0
# define BYTE1_OFFSET 8
# define BYTE2_OFFSET 16
# define BYTE3_OFFSET 24
# define MSB 0xff000000
# define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */
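
/* For illustration only (not assembled): how the BYTEn_OFFSET values are
   used.  Shifting a naturally aligned 32-bit load right by BYTEn_OFFSET
   and masking with 0xff yields string byte n (in memory order) on either
   endianness, because the offsets above are defined per-endianness.  The
   helper below is hypothetical.

   #include <stdint.h>

   // Extract the N'th string byte (memory order) from an aligned load.
   static unsigned char string_byte (uint32_t word, unsigned offset_bits)
   {
     return (word >> offset_bits) & 0xff;   // e.g. offset_bits == BYTE2_OFFSET
   }  */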

/* Parameters and result.  */
#define src1 r0
#define src2 r1
#define result r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1 r4
#define tmp2 r5
#define const_m1 r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a r2
#define data1b r3
#define data2a r6
#define data2b r7
#define syndrome_a tmp1
#define syndrome_b tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1 r2
#define data2 r3
#define syndrome tmp2


        .thumb

/* In Thumb code we can't use MVN with a register shift, but we do
   have ORN.  */
.macro prepare_mask mask_reg, nbits_reg
        S2HI \mask_reg, const_m1, \nbits_reg
.endm
.macro apply_mask data_reg, mask_reg
        orn \data_reg, \data_reg, \mask_reg
.endm
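
/* For illustration only (not assembled): a little-endian C model of the
   two macros above.  NBITS is the number of low-order bits (unwanted
   leading string bytes * 8) that must be forced to compare equal in both
   strings.  Names are hypothetical.

   #include <stdint.h>

   static uint32_t prepare_mask_le (uint32_t nbits)
   {
     return 0xffffffffu << nbits;   // S2HI is LSL on little-endian
   }

   static uint32_t apply_mask_le (uint32_t data, uint32_t mask)
   {
     return data | ~mask;           // ORN: set the unwanted bytes to 0xff
   }  */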

/* Macro to compute and return the result value for word-aligned
   cases.  */
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
        clz tmp1, \synd
        lsl r1, \d2, tmp1
        .if \restore_r6
        ldrd r6, r7, [sp, #8]
        .endif
        lsl \d1, \d1, tmp1
        lsr result, \d1, #24
        ldrd r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        sub result, result, r1, lsr #24
        bx lr
#else
        /* To use the big-endian trick we'd have to reverse all three words.
           That's slower than this approach.  */
        rev \synd, \synd
        clz tmp1, \synd
        bic tmp1, tmp1, #7
        lsr r1, \d2, tmp1
        .if \restore_r6
        ldrd r6, r7, [sp, #8]
        .endif
        lsr \d1, \d1, tmp1
        and result, \d1, #255
        and r1, r1, #255
        ldrd r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        sub result, result, r1

        bx lr
#endif
        .endm
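
/* For illustration only (not assembled): a C model of the little-endian
   epilogue path above, written with the usual GCC builtins.  SYNDROME must
   be non-zero; it has a non-zero byte at every string position where the
   two data words differ or where data1 holds a NUL, and the result is the
   byte difference at the first such position.

   #include <stdint.h>

   static int epilogue_le (uint32_t syndrome, uint32_t data1, uint32_t data2)
   {
     uint32_t rsynd = __builtin_bswap32 (syndrome);   // models the REV insn
     unsigned shift = __builtin_clz (rsynd) & ~7u;    // CLZ then BIC #7
     unsigned char b1 = (data1 >> shift) & 0xff;
     unsigned char b2 = (data2 >> shift) & 0xff;
     return b1 - b2;
   }  */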

        .text
        .p2align 5
.Lstrcmp_start_addr:
#if STRCMP_PRECHECK == 1
.Lfastpath_exit:
        sub r0, r2, r3
        bx lr
        nop
#endif
ENTRY (strcmp)
#if STRCMP_PRECHECK == 1
        ldrb r2, [src1]
        ldrb r3, [src2]
        cmp r2, #1
        it cs
        cmpcs r2, r3
        bne .Lfastpath_exit
#endif
        strd r4, r5, [sp, #-16]!
        cfi_def_cfa_offset (16)
        cfi_offset (r4, -16)
        cfi_offset (r5, -12)
        orr tmp1, src1, src2
        strd r6, r7, [sp, #8]
        cfi_offset (r6, -8)
        cfi_offset (r7, -4)
        mvn const_m1, #0
        lsl r2, tmp1, #29
        cbz r2, .Lloop_aligned8

.Lnot_aligned:
        eor tmp1, src1, src2
        tst tmp1, #7
        bne .Lmisaligned8

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        and tmp1, src1, #7
        bic src1, src1, #7
        and tmp2, tmp1, #3
        bic src2, src2, #7
        lsl tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd data1a, data1b, [src1], #16
        tst tmp1, #4
        ldrd data2a, data2b, [src2], #16
        prepare_mask tmp1, tmp2
        apply_mask data1a, tmp1
        apply_mask data2a, tmp1
        beq .Lstart_realigned8
        apply_mask data1b, tmp1
        mov data1a, const_m1
        apply_mask data2b, tmp1
        mov data2a, const_m1
        b .Lstart_realigned8
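
/* For illustration only (not assembled): the effect of the realignment
   above, sketched in C with 32-bit loads for brevity (the code uses
   64-bit ldrd pairs).  Little-endian assumed; the aligned under-read is
   valid in the assembly but not strictly portable C.

   #include <stdint.h>

   // Load the aligned word containing P and force the bytes before P to
   // 0xff so they cannot create a spurious difference between the strings.
   static uint32_t load_aligned_masked (const char *p)
   {
     uintptr_t addr = (uintptr_t) p;
     uint32_t word = *(const uint32_t *) (addr & ~(uintptr_t) 3);
     unsigned lead = (addr & 3) * 8;          // unwanted leading bits
     return word | ~(0xffffffffu << lead);    // pad them with 0xff
   }  */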

        /* Unwind the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2  /* Always word aligned.  */
.Lloop_aligned8:
        ldrd data1a, data1b, [src1], #16
        ldrd data2a, data2b, [src2], #16
.Lstart_realigned8:
        uadd8 syndrome_b, data1a, const_m1  /* Only want GE bits.  */
        eor syndrome_a, data1a, data2a
        sel syndrome_a, syndrome_a, const_m1
        cbnz syndrome_a, .Ldiff_in_a
        uadd8 syndrome_b, data1b, const_m1  /* Only want GE bits.  */
        eor syndrome_b, data1b, data2b
        sel syndrome_b, syndrome_b, const_m1
        cbnz syndrome_b, .Ldiff_in_b

        ldrd data1a, data1b, [src1, #-8]
        ldrd data2a, data2b, [src2, #-8]
        uadd8 syndrome_b, data1a, const_m1  /* Only want GE bits.  */
        eor syndrome_a, data1a, data2a
        sel syndrome_a, syndrome_a, const_m1
        uadd8 syndrome_b, data1b, const_m1  /* Only want GE bits.  */
        eor syndrome_b, data1b, data2b
        sel syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch.  */
        orrs syndrome_b, syndrome_b, syndrome_a  /* Only need if s_a == 0.  */
        beq .Lloop_aligned8
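
/* For illustration only (not assembled): a C model of how UADD8 + SEL
   build the syndrome above.  UADD8 of a data word with 0xffffffff sets the
   GE flag for every byte lane that is non-zero; SEL then keeps the XOR
   difference in those lanes and substitutes 0xff wherever data1 has a NUL.
   The helper name is hypothetical.

   #include <stdint.h>

   static uint32_t compute_syndrome (uint32_t data1, uint32_t data2)
   {
     uint32_t syndrome = 0;
     for (int i = 0; i < 4; i++)
       {
         unsigned char b1 = (data1 >> (8 * i)) & 0xff;
         unsigned char b2 = (data2 >> (8 * i)) & 0xff;
         // GE for this lane: b1 + 0xff carries out iff b1 != 0.
         unsigned char lane = (b1 != 0) ? (unsigned char) (b1 ^ b2) : 0xff;
         syndrome |= (uint32_t) lane << (8 * i);
       }
     return syndrome;   // non-zero iff the words differ or data1 has a NUL
   }  */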

.Ldiff_found:
        cbnz syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
        cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        cfi_restore_state
.Lmisaligned8:
        tst tmp1, #3
        bne .Lmisaligned4
        ands tmp1, src1, #3
        bne .Lmutual_align4

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
.Lloop_aligned4:
        ldr data1, [src1], #8
        ldr data2, [src2], #8
.Lstart_realigned4:
        uadd8 syndrome, data1, const_m1  /* Only need GE bits.  */
        eor syndrome, data1, data2
        sel syndrome, syndrome, const_m1
        cbnz syndrome, .Laligned4_done
        ldr data1, [src1, #-4]
        ldr data2, [src2, #-4]
        uadd8 syndrome, data1, const_m1
        eor syndrome, data1, data2
        sel syndrome, syndrome, const_m1
        cmp syndrome, #0
        beq .Lloop_aligned4

.Laligned4_done:
        strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
        cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic src1, src1, #3
        ldr data1, [src1], #8
        bic src2, src2, #3
        ldr data2, [src2], #8

        prepare_mask tmp1, tmp1
        apply_mask data1, tmp1
        apply_mask data2, tmp1
        b .Lstart_realigned4

.Lmisaligned4:
        ands tmp1, src1, #3
        beq .Lsrc1_aligned
        sub src2, src2, tmp1
        bic src1, src1, #3
        lsls tmp1, tmp1, #31
        ldr data1, [src1], #4
        beq .Laligned_m2
        bcs .Laligned_m1

#if STRCMP_PRECHECK == 0
        ldrb data2, [src2, #1]
        uxtb tmp1, data1, ror #BYTE1_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbz data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb data2, [src2, #2]
        uxtb tmp1, data1, ror #BYTE2_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbz data2, .Lmisaligned_exit

.Laligned_m1:
        ldrb data2, [src2, #3]
        uxtb tmp1, data1, ror #BYTE3_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        add src2, src2, #4
        cbnz data2, .Lsrc1_aligned
#else  /* STRCMP_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb data2, [src2, #2]
        uxtb tmp1, data1, ror #BYTE2_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbz data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb data2, [src2, #3]
        uxtb tmp1, data1, ror #BYTE3_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbnz data2, .Laligned_m1
#endif
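
/* For illustration only (not assembled): a hypothetical C sketch of the
   byte-at-a-time catch-up performed above when src1 starts misaligned.
   Bytes are compared until src1 reaches a word boundary, bailing out on a
   mismatch or a NUL; the code above does this with one aligned load of
   data1 and per-byte extracts.

   #include <stdint.h>

   static int catch_up_to_alignment (const unsigned char **s1,
                                     const unsigned char **s2, int *result)
   {
     while (((uintptr_t) *s1 & 3) != 0)
       {
         unsigned char c1 = **s1, c2 = **s2;
         if (c1 != c2 || c1 == 0)
           {
             *result = c1 - c2;
             return 1;            // comparison decided here
           }
         (*s1)++;
         (*s2)++;
       }
     return 0;                    // s1 is now word aligned; keep going
   }  */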

.Lmisaligned_exit:
        mov result, tmp1
        ldr r4, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        bx lr

#if STRCMP_PRECHECK == 1
.Laligned_m1:
        add src2, src2, #4
#endif
.Lsrc1_aligned:
        cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
        ldr data1, [src1], #4
        lsls tmp1, src2, #31  /* C=src2[1], Z=src2[0].  */

        bic src2, src2, #3
        ldr data2, [src2], #4
        bhi .Loverlap1  /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs .Loverlap2  /* C=1, Z=1 => src2[1:0] = 0b10.  */
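
/* For illustration only (not assembled): the idea behind the overlapN
   cases, sketched in C with little-endian assumed.  src1 is word aligned
   while src2 is OFS bytes (1..3) into its word, so each aligned src1 word
   is compared against the trailing bytes of one aligned src2 word combined
   with the leading bytes of the next.  The helper is hypothetical and
   requires 1 <= ofs <= 3.

   #include <stdint.h>

   static int words_match (uint32_t w1, uint32_t w2_lo, uint32_t w2_hi,
                           unsigned ofs)
   {
     uint32_t combined = (w2_lo >> (8 * ofs)) | (w2_hi << (32 - 8 * ofs));
     return combined == w1;
   }  */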

        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
        bic tmp1, data1, #MSB
        uadd8 syndrome, data1, const_m1
        eors syndrome, tmp1, data2, S2LO #8
        sel syndrome, syndrome, const_m1
        bne 4f
        cbnz syndrome, 5f
        ldr data2, [src2], #4
        eor tmp1, tmp1, data1
        cmp tmp1, data2, S2HI #24
        bne 6f
        ldr data1, [src1], #4
        b .Loverlap3
4:
        S2LO data2, data2, #8
        b .Lstrcmp_tail

5:
        bics syndrome, syndrome, #MSB
        bne .Lstrcmp_done_equal

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb result, [src2]
        ldrd r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        neg result, result
        bx lr

6:
        cfi_restore_state
        S2LO data1, data1, #24
        and data2, data2, #LSB
        b .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
        and tmp1, data1, const_m1, S2LO #16
        uadd8 syndrome, data1, const_m1
        eors syndrome, tmp1, data2, S2LO #16
        sel syndrome, syndrome, const_m1
        bne 4f
        cbnz syndrome, 5f
        ldr data2, [src2], #4
        eor tmp1, tmp1, data1
        cmp tmp1, data2, S2HI #16
        bne 6f
        ldr data1, [src1], #4
        b .Loverlap2
4:
        S2LO data2, data2, #16
        b .Lstrcmp_tail
5:
        ands syndrome, syndrome, const_m1, S2LO #16
        bne .Lstrcmp_done_equal

        ldrh data2, [src2]
        S2LO data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl data2, data2, #16
#endif
        b .Lstrcmp_tail

6:
        S2LO data1, data1, #16
        and data2, data2, const_m1, S2LO #16
        b .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
        and tmp1, data1, #LSB
        uadd8 syndrome, data1, const_m1
        eors syndrome, tmp1, data2, S2LO #24
        sel syndrome, syndrome, const_m1
        bne 4f
        cbnz syndrome, 5f
        ldr data2, [src2], #4
        eor tmp1, tmp1, data1
        cmp tmp1, data2, S2HI #8
        bne 6f
        ldr data1, [src1], #4
        b .Loverlap1
4:
        S2LO data2, data2, #24
        b .Lstrcmp_tail
5:
        tst syndrome, #LSB
        bne .Lstrcmp_done_equal
        ldr data2, [src2]
6:
        S2LO data1, data1, #8
        bic data2, data2, #MSB
        b .Lstrcmp_tail

.Lstrcmp_done_equal:
        mov result, #0
        ldrd r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        bx lr

.Lstrcmp_tail:
        cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev data1, data1
        rev data2, data2
        /* Now everything looks big-endian...  */
#endif
        uadd8 tmp1, data1, const_m1
        eor tmp1, data1, data2
        sel syndrome, tmp1, const_m1
        clz tmp1, syndrome
        lsl data1, data1, tmp1
        lsl data2, data2, tmp1
        lsr result, data1, #24
        ldrd r4, r5, [sp], #16
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        sub result, result, data2, lsr #24
        bx lr
END (strcmp)
libc_hidden_builtin_def (strcmp)