1 | /* Optimized strcmp implementation for PowerPC64/POWER8. |
2 | Copyright (C) 2015-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
/* STRCMP is the symbol name under which this implementation is emitted
   (it is the name passed to ENTRY_TOCLESS and END below).  It may be
   defined before this point to override the name; otherwise it defaults
   to strcmp.  */
#ifndef STRCMP
# define STRCMP strcmp
#endif
24 | |
/* Implements the function

   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   The implementation uses unaligned doubleword accesses to avoid specialized
   code paths depending on data alignment.  Although recent powerpc64 uses
   64K pages as default, the page cross handling assumes a minimum page size
   of 4K.  */
33 | |
34 | .machine power8 |
35 | ENTRY_TOCLESS (STRCMP, 4) |
36 | li r0,0 |
37 | |
38 | /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using |
39 | the code: |
40 | |
41 | (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) |
42 | |
43 | with PAGE_SIZE being 4096 and ITER_SIZE begin 16. */ |
44 | |
45 | rldicl r7,r3,0,52 |
46 | rldicl r9,r4,0,52 |
47 | cmpldi cr7,r7,4096-16 |
48 | bgt cr7,L(pagecross_check) |
49 | cmpldi cr5,r9,4096-16 |
50 | bgt cr5,L(pagecross_check) |
51 | |
52 | /* For short string up to 16 bytes, load both s1 and s2 using |
53 | unaligned dwords and compare. */ |
54 | ld r8,0(r3) |
55 | ld r10,0(r4) |
56 | cmpb r12,r8,r0 |
57 | cmpb r11,r8,r10 |
58 | orc. r9,r12,r11 |
59 | bne cr0,L(different_nocmpb) |
60 | |
61 | ld r8,8(r3) |
62 | ld r10,8(r4) |
63 | cmpb r12,r8,r0 |
64 | cmpb r11,r8,r10 |
65 | orc. r9,r12,r11 |
66 | bne cr0,L(different_nocmpb) |
67 | |
68 | addi r7,r3,16 |
69 | addi r4,r4,16 |
70 | |
71 | L(align_8b): |
72 | /* Now it has checked for first 16 bytes, align source1 to doubleword |
73 | and adjust source2 address. */ |
74 | rldicl r9,r7,0,61 /* source1 alignment to doubleword */ |
75 | subf r4,r9,r4 /* Adjust source2 address based on source1 |
76 | alignment. */ |
77 | rldicr r7,r7,0,60 /* Align source1 to doubleword. */ |
78 | |
79 | /* At this point, source1 alignment is 0 and source2 alignment is |
80 | between 0 and 7. Check is source2 alignment is 0, meaning both |
81 | sources have the same alignment. */ |
82 | andi. r9,r4,0x7 |
83 | bne cr0,L(loop_diff_align) |
84 | |
85 | /* If both source1 and source2 are doubleword aligned, there is no |
86 | need for page boundary cross checks. */ |
87 | |
88 | ld r8,0(r7) |
89 | ld r10,0(r4) |
90 | cmpb r12,r8,r0 |
91 | cmpb r11,r8,r10 |
92 | orc. r9,r12,r11 |
93 | bne cr0,L(different_nocmpb) |
94 | |
95 | .align 4 |
96 | L(loop_equal_align): |
97 | ld r8,8(r7) |
98 | ld r10,8(r4) |
99 | cmpb r12,r8,r0 |
100 | cmpb r11,r8,r10 |
101 | orc. r9,r12,r11 |
102 | bne cr0,L(different_nocmpb) |
103 | |
104 | ld r8,16(r7) |
105 | ld r10,16(r4) |
106 | cmpb r12,r8,r0 |
107 | cmpb r11,r8,r10 |
108 | orc. r9,r12,r11 |
109 | bne cr0,L(different_nocmpb) |
110 | |
111 | ldu r8,24(r7) |
112 | ldu r10,24(r4) |
113 | cmpb r12,r8,r0 |
114 | cmpb r11,r8,r10 |
115 | orc. r9,r12,r11 |
116 | bne cr0,L(different_nocmpb) |
117 | |
118 | b L(loop_equal_align) |
119 | |
120 | /* A zero byte was found in r8 (s1 dword), r9 contains the cmpb |
121 | result and r10 the dword from s2. To code isolate the byte |
122 | up to end (including the '\0'), masking with 0xFF the remaining |
123 | ones: |
124 | |
125 | #if __LITTLE_ENDIAN__ |
126 | (__builtin_ffsl (x) - 1) = counting trailing zero bits |
127 | r9 = (__builtin_ffsl (r9) - 1) + 8; |
128 | r9 = -1UL << r9 |
129 | #else |
130 | r9 = __builtin_clzl (r9) + 8; |
131 | r9 = -1UL >> r9 |
132 | #endif |
133 | r8 = r8 | r9 |
134 | r10 = r10 | r9 */ |
135 | |
136 | #ifdef __LITTLE_ENDIAN__ |
137 | nor r9,r9,r9 |
138 | L(different_nocmpb): |
139 | neg r3,r9 |
140 | and r9,r9,r3 |
141 | cntlzd r9,r9 |
142 | subfic r9,r9,63 |
143 | #else |
144 | not r9,r9 |
145 | L(different_nocmpb): |
146 | cntlzd r9,r9 |
147 | subfic r9,r9,56 |
148 | #endif |
149 | srd r3,r8,r9 |
150 | srd r10,r10,r9 |
151 | rldicl r10,r10,0,56 |
152 | rldicl r3,r3,0,56 |
153 | subf r3,r10,r3 |
154 | extsw r3,r3 |
155 | blr |
156 | |
157 | .align 4 |
158 | L(pagecross_check): |
159 | subfic r9,r9,4096 |
160 | subfic r7,r7,4096 |
161 | cmpld cr7,r7,r9 |
162 | bge cr7,L(pagecross) |
163 | mr r7,r9 |
164 | |
165 | /* If unaligned 16 bytes reads across a 4K page boundary, it uses |
166 | a simple byte a byte comparison until the page alignment for s1 |
167 | is reached. */ |
168 | L(pagecross): |
169 | add r7,r3,r7 |
170 | subf r9,r3,r7 |
171 | mtctr r9 |
172 | |
173 | .align 4 |
174 | L(pagecross_loop): |
175 | /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2 |
176 | and if *s1 is '\0'. */ |
177 | lbz r9,0(r3) |
178 | lbz r10,0(r4) |
179 | addi r3,r3,1 |
180 | addi r4,r4,1 |
181 | cmplw cr7,r9,r10 |
182 | cmpdi cr5,r9,r0 |
183 | bne cr7,L(pagecross_ne) |
184 | beq cr5,L(pagecross_nullfound) |
185 | bdnz L(pagecross_loop) |
186 | b L(align_8b) |
187 | |
188 | .align 4 |
189 | /* The unaligned read of source2 will cross a 4K page boundary, |
190 | and the different byte or NULL maybe be in the remaining page |
191 | bytes. Since it can not use the unaligned load, the algorithm |
192 | reads and compares 8 bytes to keep source1 doubleword aligned. */ |
193 | L(check_source2_byte): |
194 | li r9,8 |
195 | mtctr r9 |
196 | |
197 | .align 4 |
198 | L(check_source2_byte_loop): |
199 | lbz r9,0(r7) |
200 | lbz r10,0(r4) |
201 | addi r7,r7,1 |
202 | addi r4,r4,1 |
203 | cmplw cr7,r9,10 |
204 | cmpdi r5,r9,0 |
205 | bne cr7,L(pagecross_ne) |
206 | beq cr5,L(pagecross_nullfound) |
207 | bdnz L(check_source2_byte_loop) |
208 | |
209 | /* If source2 is unaligned to doubleword, the code needs to check |
210 | on each iteration if the unaligned doubleword access will cross |
211 | a 4k page boundary. */ |
212 | .align 5 |
213 | L(loop_unaligned): |
214 | ld r8,0(r7) |
215 | ld r10,0(r4) |
216 | cmpb r12,r8,r0 |
217 | cmpb r11,r8,r10 |
218 | orc. r9,r12,r11 |
219 | bne cr0,L(different_nocmpb) |
220 | addi r7,r7,8 |
221 | addi r4,r4,8 |
222 | |
223 | L(loop_diff_align): |
224 | /* Check if [src2]+8 cross a 4k page boundary: |
225 | |
226 | srcin2 % PAGE_SIZE > (PAGE_SIZE - 8) |
227 | |
228 | with PAGE_SIZE being 4096. */ |
229 | rldicl r9,r4,0,52 |
230 | cmpldi cr7,r9,4088 |
231 | ble cr7,L(loop_unaligned) |
232 | b L(check_source2_byte) |
233 | |
234 | .align 4 |
235 | L(pagecross_ne): |
236 | extsw r3,r9 |
237 | mr r9,r10 |
238 | L(pagecross_retdiff): |
239 | subf r9,r9,r3 |
240 | extsw r3,r9 |
241 | blr |
242 | |
243 | .align 4 |
244 | L(pagecross_nullfound): |
245 | li r3,0 |
246 | b L(pagecross_retdiff) |
247 | END (STRCMP) |
248 | libc_hidden_builtin_def (strcmp) |
249 | |