1 | /* Optimized strcasecmp implementation for PowerPC64. |
2 | Copyright (C) 2016-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | #include <locale-defines.h> |
21 | |
/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4])
   For strncasecmp, the maximum length n is passed in r5.  */
23 | |
24 | #ifndef USE_AS_STRNCASECMP |
25 | # define __STRCASECMP __strcasecmp |
26 | # define STRCASECMP strcasecmp |
27 | #else |
28 | # define __STRCASECMP __strncasecmp |
29 | # define STRCASECMP strncasecmp |
30 | #endif |
/* Convert 16 bytes to lowercase and compare.  */
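/* v1 = 0xbf, v2 = 0x19 and v3 = 0x20 are expected to be replicated in every
   byte (they are set up before the first use).  Adding 0xbf maps 'A'..'Z'
   (0x41..0x5a) to 0x00..0x19, so any byte whose sum exceeds 0x19 is not an
   uppercase ASCII letter and is kept unchanged, while uppercase bytes are
   replaced by byte + 0x20, their lowercase form.  Per byte this is roughly:
       if ((unsigned char) (c + 0xbf) <= 0x19) c += 0x20;
   The final vcmpequb. compares the two lowercased vectors and sets CR6 for
   the branches that follow.  */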
32 | #define TOLOWER() \ |
33 | vaddubm v8, v4, v1; \ |
34 | vaddubm v7, v4, v3; \ |
35 | vcmpgtub v8, v8, v2; \ |
36 | vsel v4, v7, v4, v8; \ |
37 | vaddubm v8, v5, v1; \ |
38 | vaddubm v7, v5, v3; \ |
39 | vcmpgtub v8, v8, v2; \ |
40 | vsel v5, v7, v5, v8; \ |
41 | vcmpequb. v7, v5, v4; |
42 | |
43 | /* |
44 | * Get 16 bytes for unaligned case. |
45 | * reg1: Vector to hold next 16 bytes. |
46 | * reg2: Address to read from. |
47 | * reg3: Permute control vector. |
48 | * v8: Tmp vector used to mask unwanted bytes. |
49 | * v9: Tmp vector,0 when null is found on first 16 bytes |
50 | */ |
51 | #ifdef __LITTLE_ENDIAN__ |
52 | #define GET16BYTES(reg1, reg2, reg3) \ |
53 | lvx reg1, 0, reg2; \ |
54 | vspltisb v8, -1; \ |
55 | vperm v8, v8, reg1, reg3; \ |
56 | vcmpequb. v8, v0, v8; \ |
57 | beq cr6, 1f; \ |
58 | vspltisb v9, 0; \ |
59 | b 2f; \ |
60 | .align 4; \ |
61 | 1: \ |
62 | addi r6, reg2, 16; \ |
63 | lvx v9, 0, r6; \ |
64 | 2: \ |
65 | vperm reg1, v9, reg1, reg3; |
66 | #else |
67 | #define GET16BYTES(reg1, reg2, reg3) \ |
68 | lvx reg1, 0, reg2; \ |
69 | vspltisb v8, -1; \ |
70 | vperm v8, reg1, v8, reg3; \ |
71 | vcmpequb. v8, v0, v8; \ |
72 | beq cr6, 1f; \ |
73 | vspltisb v9, 0; \ |
74 | b 2f; \ |
75 | .align 4; \ |
76 | 1: \ |
77 | addi r6, reg2, 16; \ |
78 | lvx v9, 0, r6; \ |
79 | 2: \ |
80 | vperm reg1, reg1, v9, reg3; |
81 | #endif |
82 | |
/* Check for null in v4 and v5, and convert to lowercase and compare.
   L(null_found) is reached only when both vectors contain a null byte;
   a null in just one of them shows up as a byte difference below.  */
84 | #define CHECKNULLANDCONVERT() \ |
85 | vcmpequb. v7, v0, v5; \ |
86 | beq cr6, 3f; \ |
87 | vcmpequb. v7, v0, v4; \ |
88 | beq cr6, 3f; \ |
89 | b L(null_found); \ |
90 | .align 4; \ |
91 | 3: \ |
92 | TOLOWER() |
93 | |
94 | .machine power8 |
95 | |
96 | ENTRY (__STRCASECMP) |
97 | #ifdef USE_AS_STRNCASECMP |
98 | CALL_MCOUNT 3 |
99 | #else |
100 | CALL_MCOUNT 2 |
101 | #endif |
102 | #define rRTN r3 /* Return value */ |
103 | #define rSTR1 r10 /* 1st string */ |
104 | #define rSTR2 r4 /* 2nd string */ |
105 | #define rCHAR1 r6 /* Byte read from 1st string */ |
106 | #define rCHAR2 r7 /* Byte read from 2nd string */ |
107 | #define rADDR1 r8 /* Address of tolower(rCHAR1) */ |
108 | #define rADDR2 r12 /* Address of tolower(rCHAR2) */ |
109 | #define rLWR1 r8 /* Word tolower(rCHAR1) */ |
110 | #define rLWR2 r12 /* Word tolower(rCHAR2) */ |
111 | #define rTMP r9 |
112 | #define rLOC r11 /* Default locale address */ |
113 | |
114 | cmpd cr7, rRTN, rSTR2 |
115 | |
116 | /* Get locale address. */ |
117 | ld rTMP, __libc_tsd_LOCALE@got@tprel(r2) |
118 | add rLOC, rTMP, __libc_tsd_LOCALE@tls |
119 | ld rLOC, 0(rLOC) |
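	/* rLOC now holds the calling thread's locale pointer
	   (__libc_tsd_LOCALE): the TLS offset of the variable is loaded
	   from the GOT, added to the thread pointer, and the resulting
	   address is dereferenced.  */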
120 | |
121 | mr rSTR1, rRTN |
122 | li rRTN, 0 |
123 | beqlr cr7 |
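	/* strncasecmp only: n == 0 compares equal by definition, and for
	   n < 16 the 16-bytes-at-a-time vector path cannot be used, so go
	   straight to the byte-by-byte loop.  */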
124 | #ifdef USE_AS_STRNCASECMP |
125 | cmpdi cr7, r5, 0 |
126 | beq cr7, L(retnull) |
127 | cmpdi cr7, r5, 16 |
128 | blt cr7, L(bytebybyte) |
129 | #endif |
130 | vspltisb v0, 0 |
131 | vspltisb v8, -1 |
	/* Check for a null byte among the initial characters.
	   At most 16 characters are checked, depending on the alignment.
	   If a null is present, proceed byte by byte.  */
135 | lvx v4, 0, rSTR1 |
136 | #ifdef __LITTLE_ENDIAN__ |
137 | lvsr v10, 0, rSTR1 /* Compute mask. */ |
	vperm	v9, v8, v4, v10	/* Mask out bytes that are not part of the string.  */
139 | #else |
140 | lvsl v10, 0, rSTR1 |
141 | vperm v9, v4, v8, v10 |
142 | #endif |
143 | vcmpequb. v9, v0, v9 /* Check for null bytes. */ |
144 | bne cr6, L(bytebybyte) |
145 | lvx v5, 0, rSTR2 |
146 | /* Calculate alignment. */ |
147 | #ifdef __LITTLE_ENDIAN__ |
148 | lvsr v6, 0, rSTR2 |
	vperm	v9, v8, v5, v6	/* Mask out bytes that are not part of the string.  */
150 | #else |
151 | lvsl v6, 0, rSTR2 |
152 | vperm v9, v5, v8, v6 |
153 | #endif |
154 | vcmpequb. v9, v0, v9 /* Check for null bytes. */ |
155 | bne cr6, L(bytebybyte) |
	/* Check whether the locale has non-ASCII case mappings; if so, the
	   ASCII-only vector path cannot be used and we fall back to the
	   byte-by-byte table lookup.  */
157 | ld rTMP, 0(rLOC) |
	addi	r6, rTMP, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
159 | lwz rTMP, 0(r6) |
160 | cmpdi cr7, rTMP, 1 |
161 | beq cr7, L(bytebybyte) |
162 | |
	/* Load vector registers with the constants used by TOLOWER:
	   v1 = 0xbf, v2 = 0x19, v3 = 0x20 replicated in each byte.  */
165 | vspltisb v3, 2 |
166 | vspltisb v9, 4 |
167 | vsl v3, v3, v9 |
168 | vaddubm v1, v3, v3 |
169 | vnor v1, v1, v1 |
170 | vspltisb v2, 7 |
171 | vsububm v2, v3, v2 |
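	/* vspltisb only encodes a 5-bit signed immediate, so the constants
	   are synthesized: v3 = 2 << 4 = 0x20, v1 = ~(0x20 + 0x20) = 0xbf,
	   v2 = 0x20 - 7 = 0x19.  */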
172 | |
173 | andi. rADDR1, rSTR1, 0xF |
174 | beq cr0, L(align) |
175 | addi r6, rSTR1, 16 |
176 | lvx v9, 0, r6 |
177 | /* Compute 16 bytes from previous two loads. */ |
178 | #ifdef __LITTLE_ENDIAN__ |
179 | vperm v4, v9, v4, v10 |
180 | #else |
181 | vperm v4, v4, v9, v10 |
182 | #endif |
183 | L(align): |
184 | andi. rADDR2, rSTR2, 0xF |
185 | beq cr0, L(align1) |
186 | addi r6, rSTR2, 16 |
187 | lvx v9, 0, r6 |
188 | /* Compute 16 bytes from previous two loads. */ |
189 | #ifdef __LITTLE_ENDIAN__ |
190 | vperm v5, v9, v5, v6 |
191 | #else |
192 | vperm v5, v5, v9, v6 |
193 | #endif |
194 | L(align1): |
195 | CHECKNULLANDCONVERT() |
196 | blt cr6, L(match) |
197 | b L(different) |
198 | .align 4 |
199 | L(match): |
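	/* The first, possibly partial, 16-byte compare matched.  Advance s1
	   to its next 16-byte boundary and s2 by the same amount, then back
	   both pointers off by 16 so the addi at the top of the loops below
	   produces the right addresses.  For strncasecmp, r5 is reduced by
	   the number of bytes advanced.  */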
200 | clrldi r6, rSTR1, 60 |
201 | subfic r7, r6, 16 |
202 | #ifdef USE_AS_STRNCASECMP |
203 | sub r5, r5, r7 |
204 | #endif |
205 | add rSTR1, rSTR1, r7 |
206 | add rSTR2, rSTR2, r7 |
207 | andi. rADDR2, rSTR2, 0xF |
208 | addi rSTR1, rSTR1, -16 |
209 | addi rSTR2, rSTR2, -16 |
210 | beq cr0, L(aligned) |
211 | #ifdef __LITTLE_ENDIAN__ |
212 | lvsr v6, 0, rSTR2 |
213 | #else |
214 | lvsl v6, 0, rSTR2 |
215 | #endif |
	/* There are two loops, selected by the alignment of the inputs.
	   Each iteration gets 16 bytes from s1 and s2, checks for null,
	   converts to lowercase and compares.  Loop until a difference
	   or a null byte is found.  */
220 | L(s1_align): |
221 | addi rSTR1, rSTR1, 16 |
222 | addi rSTR2, rSTR2, 16 |
223 | #ifdef USE_AS_STRNCASECMP |
224 | cmpdi cr7, r5, 16 |
225 | blt cr7, L(bytebybyte) |
226 | addi r5, r5, -16 |
227 | #endif |
228 | lvx v4, 0, rSTR1 |
229 | GET16BYTES(v5, rSTR2, v6) |
230 | CHECKNULLANDCONVERT() |
231 | blt cr6, L(s1_align) |
232 | b L(different) |
233 | .align 4 |
234 | L(aligned): |
235 | addi rSTR1, rSTR1, 16 |
236 | addi rSTR2, rSTR2, 16 |
237 | #ifdef USE_AS_STRNCASECMP |
238 | cmpdi cr7, r5, 16 |
239 | blt cr7, L(bytebybyte) |
240 | addi r5, r5, -16 |
241 | #endif |
242 | lvx v4, 0, rSTR1 |
243 | lvx v5, 0, rSTR2 |
244 | CHECKNULLANDCONVERT() |
245 | blt cr6, L(aligned) |
246 | |
247 | /* Calculate and return the difference. */ |
248 | L(different): |
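	/* v7 holds the bytewise equality mask from TOLOWER (0xff where the
	   lowercased bytes matched).  Invert it, locate the first mismatch
	   by counting leading (BE) or trailing (LE) zero bits per doubleword,
	   then shift both lowercased vectors so the mismatching bytes land
	   in a fixed lane, move them to a GPR with mfvrd and return their
	   difference.  v1 is set to 0x40 in every byte: it is used both as
	   the value 64 (to test which doubleword holds the mismatch) and as
	   an 8-byte shift count for vsro/vslo.  */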
249 | vaddubm v1, v3, v3 |
250 | vcmpequb v7, v0, v7 |
251 | #ifdef __LITTLE_ENDIAN__ |
	/* Count trailing zeros.  */
253 | vspltisb v8, -1 |
254 | vadduqm v9, v7, v8 |
255 | vandc v8, v9, v7 |
256 | vpopcntd v8, v8 |
257 | vspltb v6, v8, 15 |
258 | vcmpequb. v6, v6, v1 |
259 | blt cr6, L(shift8) |
260 | #else |
	/* Count leading zeros.  */
262 | vclzd v8, v7 |
263 | vspltb v6, v8, 7 |
264 | vcmpequb. v6, v6, v1 |
265 | blt cr6, L(shift8) |
266 | vsro v8, v8, v1 |
267 | #endif |
268 | b L(skipsum) |
269 | .align 4 |
270 | L(shift8): |
271 | vsumsws v8, v8, v0 |
272 | L(skipsum): |
273 | #ifdef __LITTLE_ENDIAN__ |
	/* Shift registers based on the zero count.  */
275 | vsro v6, v5, v8 |
276 | vsro v7, v4, v8 |
277 | /* Merge and move to GPR. */ |
278 | vmrglb v6, v6, v7 |
279 | vslo v1, v6, v1 |
280 | mfvrd r3, v1 |
	/* Extract the two differing characters into separate registers.  */
282 | sldi rSTR2, rRTN, 56 |
283 | srdi rSTR2, rSTR2, 56 |
284 | sldi rSTR1, rRTN, 48 |
285 | srdi rSTR1, rSTR1, 56 |
286 | #else |
287 | vslo v6, v5, v8 |
288 | vslo v7, v4, v8 |
289 | vmrghb v1, v6, v7 |
290 | mfvrd r3, v1 |
291 | srdi rSTR2, rRTN, 48 |
292 | sldi rSTR2, rSTR2, 56 |
293 | srdi rSTR2, rSTR2, 56 |
294 | srdi rSTR1, rRTN, 56 |
295 | #endif |
296 | subf rRTN, rSTR1, rSTR2 |
297 | extsw rRTN, rRTN |
298 | blr |
299 | |
300 | .align 4 |
301 | /* OK. We've hit the end of the string. We need to be careful that |
302 | we don't compare two strings as different because of junk beyond |
303 | the end of the strings... */ |
304 | L(null_found): |
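	/* v7 holds the mask of null positions from CHECKNULLANDCONVERT.
	   Compute the position of the first null byte, shift both source
	   vectors so that everything beyond it is discarded, then lowercase
	   and compare only the remaining bytes: if they match, the strings
	   are equal up to the terminator and 0 is returned, otherwise branch
	   to the difference calculation.  */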
305 | vaddubm v10, v3, v3 |
306 | #ifdef __LITTLE_ENDIAN__ |
	/* Count trailing zeros.  */
308 | vspltisb v8, -1 |
309 | vadduqm v9, v7, v8 |
310 | vandc v8, v9, v7 |
311 | vpopcntd v8, v8 |
312 | vspltb v6, v8, 15 |
313 | vcmpequb. v6, v6, v10 |
314 | blt cr6, L(shift_8) |
315 | #else |
	/* Count leading zeros.  */
317 | vclzd v8, v7 |
318 | vspltb v6, v8, 7 |
319 | vcmpequb. v6, v6, v10 |
320 | blt cr6, L(shift_8) |
321 | vsro v8, v8, v10 |
322 | #endif |
323 | b L(skipsum1) |
324 | .align 4 |
325 | L(shift_8): |
326 | vsumsws v8, v8, v0 |
327 | L(skipsum1): |
	/* Calculate the shift count from the zero count.  */
329 | vspltisb v10, 7 |
330 | vslb v10, v10, v10 |
331 | vsldoi v9, v0, v10, 1 |
332 | vsubudm v9, v9, v8 |
333 | vspltisb v8, 8 |
334 | vsldoi v8, v0, v8, 1 |
335 | vsubudm v9, v9, v8 |
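	/* v9 now holds 128 - count - 8 bits.  Since count is 8 times the
	   index of the first null byte, the vslo/vsro below shift by exactly
	   the amount needed to push every byte after the terminator out of
	   the vectors.  */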
336 | /* Shift and remove junk after null character. */ |
337 | #ifdef __LITTLE_ENDIAN__ |
338 | vslo v5, v5, v9 |
339 | vslo v4, v4, v9 |
340 | #else |
341 | vsro v5, v5, v9 |
342 | vsro v4, v4, v9 |
343 | #endif |
344 | /* Convert and compare 16 bytes. */ |
345 | TOLOWER() |
346 | blt cr6, L(retnull) |
347 | b L(different) |
348 | .align 4 |
349 | L(retnull): |
350 | li rRTN, 0 |
351 | blr |
352 | .align 4 |
353 | L(bytebybyte): |
	/* Unrolled loop for POWER: loads are done with 'lbz' plus an
	   offset, and the string pointers are only updated at the end of
	   each unrolled iteration.  */
357 | ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC) |
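	/* rLOC now points to the locale's tolower table, an array of 32-bit
	   entries indexed by character value, hence the 'sldi ..., 2' used
	   below to scale each byte into a word offset for lwzx.  Per pair of
	   characters this is roughly:
	       l1 = tab[(unsigned char) c1];  l2 = tab[(unsigned char) c2];  */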
358 | lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ |
359 | lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ |
360 | #ifdef USE_AS_STRNCASECMP |
361 | rldicl rTMP, r5, 62, 2 |
362 | cmpdi cr7, rTMP, 0 |
363 | beq cr7, L(lessthan4) |
364 | mtctr rTMP |
365 | #endif |
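	/* The loop below handles four characters per iteration.  For
	   strncasecmp the CTR register was loaded with n / 4 above, and the
	   remaining n % 4 characters are handled in L(lessthan4).  */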
366 | L(loop): |
367 | cmpdi rCHAR1, 0 /* *s1 == '\0' ? */ |
368 | sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */ |
369 | sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */ |
370 | lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */ |
371 | lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */ |
372 | cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */ |
	crorc	4*cr1+eq,eq,4*cr1+eq   /* cr1.eq = (*s1 == '\0') || (r == 0) */
374 | beq cr1, L(done) |
375 | lbz rCHAR1, 1(rSTR1) |
376 | lbz rCHAR2, 1(rSTR2) |
377 | cmpdi rCHAR1, 0 |
378 | sldi rADDR1, rCHAR1, 2 |
379 | sldi rADDR2, rCHAR2, 2 |
380 | lwzx rLWR1, rLOC, rADDR1 |
381 | lwzx rLWR2, rLOC, rADDR2 |
382 | cmpw cr1, rLWR1, rLWR2 |
383 | crorc 4*cr1+eq,eq,4*cr1+eq |
384 | beq cr1, L(done) |
385 | lbz rCHAR1, 2(rSTR1) |
386 | lbz rCHAR2, 2(rSTR2) |
387 | cmpdi rCHAR1, 0 |
388 | sldi rADDR1, rCHAR1, 2 |
389 | sldi rADDR2, rCHAR2, 2 |
390 | lwzx rLWR1, rLOC, rADDR1 |
391 | lwzx rLWR2, rLOC, rADDR2 |
392 | cmpw cr1, rLWR1, rLWR2 |
393 | crorc 4*cr1+eq,eq,4*cr1+eq |
394 | beq cr1, L(done) |
395 | lbz rCHAR1, 3(rSTR1) |
396 | lbz rCHAR2, 3(rSTR2) |
397 | cmpdi rCHAR1, 0 |
	/* Advance both string pointers.  */
399 | addi rSTR1, rSTR1, 4 |
400 | addi rSTR2, rSTR2, 4 |
401 | sldi rADDR1, rCHAR1, 2 |
402 | sldi rADDR2, rCHAR2, 2 |
403 | lwzx rLWR1, rLOC, rADDR1 |
404 | lwzx rLWR2, rLOC, rADDR2 |
405 | cmpw cr1, rLWR1, rLWR2 |
406 | crorc 4*cr1+eq,eq,4*cr1+eq |
407 | beq cr1, L(done) |
408 | lbz rCHAR1, 0(rSTR1) /* Load char from s1 */ |
409 | lbz rCHAR2, 0(rSTR2) /* Load char from s2 */ |
410 | #ifdef USE_AS_STRNCASECMP |
411 | bdnz L(loop) |
412 | #else |
413 | b L(loop) |
414 | #endif |
415 | #ifdef USE_AS_STRNCASECMP |
416 | L(lessthan4): |
417 | clrldi r5, r5, 62 |
418 | cmpdi cr7, r5, 0 |
419 | beq cr7, L(retnull) |
420 | mtctr r5 |
421 | L(loop1): |
422 | cmpdi rCHAR1, 0 |
423 | sldi rADDR1, rCHAR1, 2 |
424 | sldi rADDR2, rCHAR2, 2 |
425 | lwzx rLWR1, rLOC, rADDR1 |
426 | lwzx rLWR2, rLOC, rADDR2 |
427 | cmpw cr1, rLWR1, rLWR2 |
428 | crorc 4*cr1+eq,eq,4*cr1+eq |
429 | beq cr1, L(done) |
430 | addi rSTR1, rSTR1, 1 |
431 | addi rSTR2, rSTR2, 1 |
432 | lbz rCHAR1, 0(rSTR1) |
433 | lbz rCHAR2, 0(rSTR2) |
434 | bdnz L(loop1) |
435 | #endif |
436 | L(done): |
437 | subf r0, rLWR2, rLWR1 |
438 | extsw rRTN, r0 |
439 | blr |
440 | END (__STRCASECMP) |
441 | |
442 | weak_alias (__STRCASECMP, STRCASECMP) |
443 | libc_hidden_builtin_def (__STRCASECMP) |
444 | |