/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
   loop.
   Copyright (C) 2016-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* size_t [r3] strlen (const char *s [r3])  */

#ifndef STRLEN
# define STRLEN strlen
#endif
	.machine power8
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 1
	dcbt	0,r3
	clrrdi	r4,r3,3       /* Align the address to doubleword boundary.  */
	rlwinm	r6,r3,3,26,28 /* Calculate padding: (s & 7) * 8, the bit
				 offset of the string within its first
				 doubleword.  */
	li	r0,0	      /* Doubleword with null chars to use
				 with cmpb.  */
	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
	ld	r12,0(r4)     /* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
	sld	r5,r5,r6      /* MASK = MASK << padding.  */
#else
	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
#endif
	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
	cmpdi	cr7,r10,0     /* If r10 == 0, no nulls were found.  */
	bne	cr7,L(done)
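
	/* An illustrative C sketch of the first-doubleword check above,
	   assuming little-endian; cmpb_zero () is a hypothetical stand-in
	   for the cmpb instruction compared against zero (it yields 0xff
	   in every byte position that holds a null byte):

	     uint64_t *aligned = (uint64_t *) ((uintptr_t) s & ~7UL);
	     uint64_t pad  = ((uintptr_t) s & 7) * 8;
	     uint64_t word = *aligned | ~((uint64_t) -1 << pad);
	     uint64_t hits = cmpb_zero (word);
	     if (hits != 0)
	       goto done;

	   ORing in ~MASK turns the bytes before the string into 0xff,
	   so they can never compare equal to zero.  */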

	/* For shorter strings (< 64 bytes), we will not use vector
	   registers, as the overhead isn't worth it.  Instead, use GPRs,
	   the same way the POWER7 implementation does.  Check whether we
	   are aligned to a quadword boundary: if so, jump to the first
	   (non-vectorized) loop; otherwise, handle the next doubleword
	   first.  */
	mtcrf	0x01,r4
	mr	r9,r4
	addi	r9,r9,8
	bt	28,L(align64)
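
	/* In C terms, the test above is roughly (illustrative only):

	     if ((uintptr_t) addr & 8)
	       goto align64;

	   i.e. when the rounded-down address falls on the high doubleword
	   of a quadword, the next doubleword to read already sits on a
	   quadword boundary, so the extra 8-byte step below can be
	   skipped.  */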

	/* Handle the next 8 bytes so we are aligned to a quadword
	   boundary.  */
	ldu	r5,8(r4)
	cmpb	r10,r5,r0
	cmpdi	cr7,r10,0
	addi	r9,r9,8
	bne	cr7,L(done)

L(align64):
	/* Proceed to the old (POWER7) implementation, checking two
	   doublewords per iteration.  For the first 56 bytes, we will just
	   check for null characters.  After that, we will also check if we
	   are 64-byte aligned so we can jump to the vectorized
	   implementation.  We will unroll these loops to avoid excessive
	   branching.  */
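
	/* Each unrolled iteration below is roughly this C (an
	   illustrative sketch; cmpb_zero () is the same hypothetical
	   helper as in the sketch further up):

	     uint64_t a = addr[1];
	     uint64_t b = addr[2];
	     addr += 2;
	     if ((cmpb_zero (a) | cmpb_zero (b)) != 0)
	       goto dword_zero;  */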
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary will necessarily slow down
	   performance for strings around 64 bytes in length, due to the
	   extra comparisons required to check alignment for the vectorized
	   loop.  This is a tradeoff we are willing to make in order to
	   speed up the computation for larger strings.  */
	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16

	/* At this point, we are necessarily 64-byte aligned.  If no zeroes
	   were found, jump to the vectorized loop.  */
	beq	cr7,L(preloop)

L(dword_zero):
	/* OK, one (or both) of the doublewords contains a null byte.  Check
	   the first doubleword, decrementing the address first so that it
	   points at the first doubleword if that is where the null byte
	   is.  */

	cmpdi	cr6,r10,0
	addi	r4,r4,-8
	bne	cr6,L(done)

	/* The null byte must be in the second doubleword.  Adjust the
	   address again and move the result of cmpb to r10 so we can
	   calculate the length.  */

	mr	r10,r11
	addi	r4,r4,8

	/* If the null byte was found in the non-vectorized code, compute
	   the final length.  r10 has the output of the cmpb instruction,
	   that is, it contains 0xff in the same position as the null byte
	   in the original doubleword from the string.  Use that to
	   calculate the length.  */
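
	/* In C, the computation below is roughly (illustrative only; hits,
	   the cmpb result, is known to be nonzero here, so the builtins
	   are well defined):

	     LE:  len = (addr - s) + __builtin_ctzll (hits) / 8;
	     BE:  len = (addr - s) + __builtin_clzll (hits) / 8;  */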
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1     /* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9	      /* Count the bits in the mask.  */
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
	add	r3,r5,r0      /* Compute final length.  */
	blr

	/* Vectorized implementation starts here.  */
	.p2align  4
L(preloop):
	/* Set up for the loop.  */
	mr	r4,r9
	li	r7,16	      /* Load required offsets.  */
	li	r8,32
	li	r9,48
	li	r12,8
	vxor	v0,v0,v0      /* VR with null chars to use
				 with vcmpequb.  */

	/* Main loop to look for the end of the string.  We will read in
	   64-byte chunks.  The loop is aligned to 32 bytes and unrolled 3
	   times to make good use of the icache.  */
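
	/* Each unrolled iteration below corresponds roughly to this C with
	   Altivec intrinsics (an illustrative sketch using <altivec.h>
	   names):

	     vector unsigned char z = vec_splats ((unsigned char) 0);
	     vector unsigned char a = vec_ld (0,  addr);
	     vector unsigned char b = vec_ld (16, addr);
	     vector unsigned char c = vec_ld (32, addr);
	     vector unsigned char d = vec_ld (48, addr);
	     vector unsigned char m = vec_min (vec_min (a, b),
					       vec_min (c, d));
	     addr += 64;
	     if (vec_any_eq (m, z))
	       goto vmx_zero;

	   vec_min works here because a null byte is the unsigned minimum,
	   so it survives the merge of all four quadwords.  */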
	.p2align  5
L(loop):
	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs.  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs.  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0      /* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2      /* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0    /* Check for NULLs.  */
	addi	r4,r4,64      /* Adjust address for the next iteration.  */
	beq	cr6,L(loop)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current
	   64-byte block and mark it in its corresponding VR.  */
	vcmpequb v1,v1,v0
	vcmpequb v2,v2,v0
	vcmpequb v3,v3,v0
	vcmpequb v4,v4,v0

	/* We will now 'compress' the result into a single doubleword, so
	   it can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits
	   into the first halfword.  */
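
	/* Concretely: lvsl with zero inputs yields the byte sequence
	   { 0, 1, ..., 15 }, and shifting each byte left by 3 multiplies
	   it by 8, so v10 ends up holding the bit indices
	   { 0, 8, 16, ..., 120 }, i.e. the most significant bit of each
	   byte.  */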
	vspltisb v10,3	      /* v10 = { 3, 3, ..., 3 }.  */
	lvsl	v11,r0,r0     /* v11 = { 0, 1, ..., 15 }.  */
	vslb	v10,v11,v10   /* v10 = { 0, 8, ..., 120 }.  */

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v1,v1,v10
	vbpermq	v2,v2,v10
	vbpermq	v3,v3,v10
	vbpermq	v4,v4,v10

	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6
#else
	vsldoi	v1,v1,v1,6
	vsldoi	v2,v2,v2,4
	vsldoi	v3,v3,v3,2
#endif

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
	mfvrd	r10,v4
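
	/* For little-endian, the merged value is, in illustrative C
	   (m1..m4 being the 16-bit vbpermq results for the four quadwords,
	   held in uint64_t variables):

	     uint64_t hits = m1 | (m2 << 16) | (m3 << 32) | (m4 << 48);

	   so bit i of hits is set iff byte i of the 64-byte block is null.
	   Big-endian mirrors this with the four fields in the opposite
	   order.  */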

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	r4,r4,-64

#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1     /* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9	      /* Count the bits in the mask.  */
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	add	r3,r5,r0      /* Compute final length.  */
	blr

END (STRLEN)
libc_hidden_builtin_def (strlen)