1 | /* Optimized strncmp implementation for PowerPC64/POWER8. |
2 | Copyright (C) 2015-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | #ifndef STRNCMP |
22 | # define STRNCMP strncmp |
23 | #endif |
24 | |
/* Implements the function

   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)

   The implementation uses unaligned doubleword accesses to avoid
   specialized code paths depending on data alignment.  Although recent
   powerpc64 kernels use 64K pages by default, the page-cross handling
   assumes a minimum page size of 4K.  */
33 | |
34 | .machine power8 |
/* int strncmp (const char *s1 [r3], const char *s2 [r4], size_t n [r5])

   Returns (in r3) zero when the first N bytes of S1 and S2 are equal up
   to and including a terminating NUL, otherwise the difference between
   the first pair of bytes that differ, each taken as an unsigned byte.

   Register roles used throughout:
     r3, r4   current positions in s1 and s2
     r10      number of bytes that may still be compared
     r7, r9   data loaded from s1 and s2
     r8       cmpb/orc mask: a byte is non-zero where s1 holds a NUL or
	      where the two sources differ
     r5       zero operand for cmpb inside the loops (scratch before)
     r6       scratch (byte-equality mask)  */
ENTRY_TOCLESS (STRNCMP, 4)
	/* Keep the size in r10 and return 0 immediately if it is 0.  */
	mr.	r10,r5
	beq	cr0,L(ret0)

	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
	   the code:

	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))

	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  r8 keeps the
	   page offset of s1; L(pagecross) relies on it.  */
	rldicl	r8,r3,0,52
	cmpldi	cr7,r8,4096-16
	bgt	cr7,L(pagecross)
	rldicl	r9,r4,0,52
	cmpldi	cr7,r9,4096-16
	bgt	cr7,L(pagecross)

	/* For short strings up to 16 bytes, load both s1 and s2 using
	   unaligned doublewords and compare.  */
	ld	r7,0(r3)
	ld	r9,0(r4)
	li	r8,0
	cmpb	r8,r7,r8	/* 0xff in every byte where s1 is NUL.  */
	cmpb	r6,r7,r9	/* 0xff in every byte where s1 == s2.  */
	orc.	r8,r8,r6	/* Non-zero if a NUL or a mismatch exists.  */
	bne	cr0,L(different1)

	/* If the strings compared are equal, but size is less than or
	   equal to 8, return 0.  */
	cmpldi	cr7,r10,8
	li	r9,0
	ble	cr7,L(ret1)
	addi	r5,r10,-8

	/* Second doubleword.  r8 is known to be zero here because the
	   branch above was not taken, so it serves as the NUL pattern.  */
	ld	r7,8(r3)
	ld	r9,8(r4)
	cmpb	r8,r7,r8
	cmpb	r6,r7,r9
	orc.	r8,r8,r6
	bne	cr0,L(different0)

	/* Equal so far; if no more than 16 bytes were requested return
	   r8, which is zero at this point.  */
	cmpldi	cr7,r5,8
	mr	r9,r8
	ble	cr7,L(ret1)

	/* Update pointers and size.  */
	addi	r10,r10,-16
	addi	r3,r3,16
	addi	r4,r4,16

	/* Now that the first 16 bytes have been checked, align source1 to
	   doubleword and adjust source2 address.  Both pointers move back
	   by (s1 & 7) bytes and the size grows by the same amount.  */
L(align_8b):
	rldicl	r5,r3,0,61
	rldicr	r3,r3,0,60
	subf	r4,r5,r4
	add	r10,r10,r5

	/* At this point, source1 alignment is 0 and source2 alignment is
	   between 0 and 7.  Check if source2 alignment is 0, meaning both
	   sources have the same alignment.  */
	andi.	r8,r4,0x7
	beq	cr0,L(loop_eq_align_0)

	li	r5,0
	b	L(loop_ne_align_1)

	/* If source2 is unaligned to doubleword, the code needs to check
	   on each iteration if the unaligned doubleword access will cross
	   a 4K page boundary.  */
	.align	4
L(loop_ne_align_0):
	ld	r7,0(r3)
	ld	r9,0(r4)
	cmpb	r8,r7,r5
	cmpb	r6,r7,r9
	orc.	r8,r8,r6
	bne	cr0,L(different1)

	cmpldi	cr7,r10,8
	ble	cr7,L(ret0)
	addi	r10,r10,-8
	addi	r3,r3,8
	addi	r4,r4,8
L(loop_ne_align_1):
	/* An 8-byte load from source2 stays inside its 4K page as long as
	   the page offset is not above 4088.  */
	rldicl	r9,r4,0,52
	cmpldi	cr7,r9,4088
	ble	cr7,L(loop_ne_align_0)
	cmpdi	cr7,r10,0
	beq	cr7,L(ret0)

	/* Compare the first byte of this doubleword on its own.  */
	lbz	r9,0(r3)
	lbz	r8,0(r4)
	cmplw	cr7,r9,r8
	bne	cr7,L(byte_ne_4)
	cmpdi	cr7,r9,0
	beq	cr7,L(size_reached_0)

	/* Set CTR to 7, the number of bytes left in this doubleword.  r8
	   becomes the byte cursor for source1 while r3 already points to
	   the next aligned doubleword.  */
	li	r9,7
	addi	r8,r3,1
	mtctr	r9
	addi	r4,r4,1
	addi	r10,r10,-1
	addi	r3,r3,8

	/* The unaligned read of source2 would cross a 4K page boundary,
	   and the different byte or NUL may be in the remaining page
	   bytes.  Since it can not use the unaligned load, the algorithm
	   reads and compares 8 bytes one at a time to keep source1
	   doubleword aligned.  */
	.align	4
L(loop_ne_align_byte):
	cmpdi	cr7,r10,0
	addi	r10,r10,-1
	beq	cr7,L(ret0)
	lbz	r9,0(r8)
	lbz	r7,0(r4)
	addi	r8,r8,1
	addi	r4,r4,1
	cmplw	cr7,r9,r7
	cmpdi	cr5,r9,0
	bne	cr7,L(size_reached_2)
	beq	cr5,L(size_reached_0)
	bdnz	L(loop_ne_align_byte)

	/* Doubleword finished; resume the doubleword loop unless the size
	   has been exhausted, in which case fall through and return 0.  */
	cmpdi	cr7,r10,0
	bne+	cr7,L(loop_ne_align_0)

	.align	4
L(ret0):
	li	r9,0
L(ret1):
	/* Common exit: the result is expected in r9.  */
	mr	r3,r9
	blr

	/* A NUL or a mismatch was flagged in the mask r8 (z1 below) for the
	   doublewords in r7 and r9 (r1 and r2 below), with r10 (n below)
	   bytes still allowed to be compared.  Find the first flagged
	   byte, clamp its position to the last byte permitted by the
	   size, and return the difference of the two bytes there:

	#ifdef __LITTLE_ENDIAN__
	  leadzero = (__builtin_ffsl (z1) - 1);
	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
	  r1 = (r1 >> leadzero) & 0xFFUL;
	  r2 = (r2 >> leadzero) & 0xFFUL;
	#else
	  leadzero = __builtin_clzl (z1);
	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
	  r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
	  r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
	#endif
	  return r1 - r2;  */

	.align	4
L(different0):
	/* Entered from the second doubleword of the 16-byte prologue,
	   where the remaining size is held in r5.  */
	mr	r10,r5
#ifdef __LITTLE_ENDIAN__
L(different1):
	neg	r11,r8
	sldi	r10,r10,3
	and	r8,r11,r8	/* Isolate the lowest set bit of the mask.  */
	addi	r10,r10,-8	/* r10 = (n - 1) * 8.  */
	cntlzd	r8,r8
	subfic	r8,r8,63	/* Bit index of that lowest set bit.  */
	extsw	r8,r8
	cmpld	cr7,r8,r10
	ble	cr7,L(different2)
	mr	r8,r10
L(different2):
	extsw	r8,r8
#else
L(different1):
	addi	r10,r10,-1
	cntlzd	r8,r8
	sldi	r10,r10,3	/* r10 = (n - 1) * 8.  */
	cmpld	cr7,r8,r10
	blt	cr7,L(different2)
	mr	r8,r10
L(different2):
	subfic	r8,r8,56
#endif
	/* Move the selected byte of each source into the low 8 bits and
	   subtract: result = s1 byte - s2 byte.  */
	srd	r7,r7,r8
	srd	r9,r9,r8
	rldicl	r3,r7,0,56
	rldicl	r9,r9,0,56
	subf	r9,r9,r3
	extsw	r9,r9
	mr	r3,r9
	blr

	/* If an unaligned 16-byte read would cross a 4K page boundary, use
	   a simple byte by byte comparison until the page alignment for s1
	   is reached.  On entry r8 holds the page offset of s1.  */
	.align	4
L(pagecross):
	lbz	r7,0(r3)
	lbz	r9,0(r4)
	subfic	r8,r8,4095	/* Bytes of s1 left in the page after this one.  */
	cmplw	cr7,r9,r7
	bne	cr7,L(byte_ne_3)
	cmpdi	cr7,r9,0
	beq	cr7,L(byte_ne_0)
	addi	r10,r10,-1
	subf	r7,r8,r10	/* r7 = size left once the page end is reached.  */
	subf	r9,r7,r10
	addi	r9,r9,1
	mtctr	r9
	b	L(pagecross_loop1)

	.align	4
L(pagecross_loop0):
	/* cr7 was set by the size check in L(pagecross_loop1).  */
	beq	cr7,L(ret0)
	lbz	r9,0(r3)
	lbz	r8,0(r4)
	addi	r10,r10,-1
	cmplw	cr7,r9,r8
	cmpdi	cr5,r9,0
	bne	cr7,L(byte_ne_2)
	beq	cr5,L(byte_ne_0)
L(pagecross_loop1):
	cmpdi	cr7,r10,0
	addi	r3,r3,1
	addi	r4,r4,1
	bdnz	L(pagecross_loop0)
	/* Page boundary of s1 reached.  Continue with the doubleword code
	   if there is size left (r7 != 0), otherwise return 0.  */
	cmpdi	cr7,r7,0
	li	r9,0
	bne+	cr7,L(align_8b)
	b	L(ret1)

	/* If both source1 and source2 are doubleword aligned, there is no
	   need for page boundary cross checks.  r8 is zero on entry (set
	   by the andi. in L(align_8b)) and is used as the NUL pattern for
	   the first compare.  */
	.align	4
L(loop_eq_align_0):
	ld	r7,0(r3)
	ld	r9,0(r4)
	cmpb	r8,r7,r8
	cmpb	r6,r7,r9
	orc.	r8,r8,r6
	bne	cr0,L(different1)

	cmpldi	cr7,r10,8
	ble	cr7,L(ret0)
	addi	r9,r10,-9

	/* CTR = ((size - 9) >> 3) + 1 further doublewords to compare.  */
	li	r5,0
	srdi	r9,r9,3
	addi	r9,r9,1
	mtctr	r9
	b	L(loop_eq_align_2)

	.align	4
L(loop_eq_align_1):
	bdz	L(ret0)
L(loop_eq_align_2):
	ldu	r7,8(r3)
	addi	r10,r10,-8
	ldu	r9,8(r4)
	cmpb	r8,r7,r5
	cmpb	r6,r7,r9
	orc.	r8,r8,r6
	beq	cr0,L(loop_eq_align_1)
	b	L(different1)

	/* Exit stubs for the byte-wise paths.  Each one arranges the s1
	   byte as minuend and the s2 byte in r9 as subtrahend, then
	   returns their sign-extended difference through L(ret1).  */
	.align	4
L(byte_ne_0):
	li	r7,0
L(byte_ne_1):
	subf	r9,r9,r7
	extsw	r9,r9
	b	L(ret1)

	.align	4
L(byte_ne_2):
	extsw	r7,r9
	mr	r9,r8
	b	L(byte_ne_1)
L(size_reached_0):
	li	r10,0
L(size_reached_1):
	subf	r9,r9,r10
	extsw	r9,r9
	b	L(ret1)
L(size_reached_2):
	extsw	r10,r9
	mr	r9,r7
	b	L(size_reached_1)
L(byte_ne_3):
	extsw	r7,r7
	b	L(byte_ne_1)
L(byte_ne_4):
	extsw	r10,r9
	mr	r9,r8
	b	L(size_reached_1)
END(STRNCMP)
libc_hidden_builtin_def(strncmp)
328 | |