/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
   loop.
   Copyright (C) 2016-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
/* size_t [r3] strlen (const char *s [r3])  */

#ifndef STRLEN
# define STRLEN strlen
#endif
        .machine power8
ENTRY_TOCLESS (STRLEN, 4)
        CALL_MCOUNT 1
        dcbt    0,r3
        clrrdi  r4,r3,3       /* Align the address to doubleword boundary.  */
        rlwinm  r6,r3,3,26,28 /* Calculate padding.  */
        li      r0,0          /* Doubleword with null chars to use
                                 with cmpb.  */
        li      r5,-1         /* MASK = 0xffffffffffffffff.  */
        ld      r12,0(r4)     /* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
        sld     r5,r5,r6
#else
        srd     r5,r5,r6      /* MASK = MASK >> padding.  */
#endif
        orc     r9,r12,r5     /* Mask bits that are not part of the string.  */
        cmpb    r10,r9,r0     /* Check for null bytes in DWORD1.  */
        cmpdi   cr7,r10,0     /* If r10 == 0, no nulls were found.  */
        bne     cr7,L(done)
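
        /* In C terms, the startup above does roughly the following (a
           sketch for the little-endian case; the names are illustrative,
           and cmpb stands for the per-byte compare: each result byte is
           0xff iff the corresponding input byte equals zero):

             uint64_t *aligned = (uint64_t *) ((uintptr_t) s & ~7UL);
             unsigned int pad = ((uintptr_t) s & 7) * 8;
             uint64_t mask = (uint64_t) -1 << pad;    // sld r5,r5,r6
             uint64_t dw = *aligned | ~mask;          // orc r9,r12,r5
             uint64_t hit = cmpb (dw, 0);             // 0xff at null bytes
             if (hit != 0)
               goto done;

           Big endian shifts the mask right instead (srd), matching the
           byte order within the doubleword.  */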

        /* For shorter strings (< 64 bytes), we will not use vector registers,
           as the overhead is not worth it.  So we use GPRs instead, in the
           same way as the POWER7 implementation does.  Let's see if we are
           aligned to a quadword boundary.  If so, we can jump to the first
           (non-vectorized) loop.  Otherwise, we have to handle the next
           DWORD first.  */
        mtcrf   0x01,r4
        mr      r9,r4
        addi    r9,r9,8
        bt      28,L(align64)

        /* Handle the next 8 bytes so we are aligned to a quadword
           boundary.  */
        ldu     r5,8(r4)
        cmpb    r10,r5,r0
        cmpdi   cr7,r10,0
        addi    r9,r9,8
        bne     cr7,L(done)

L(align64):
        /* Proceed to the old (POWER7) implementation, checking two doublewords
           per iteration.  For the first 56 bytes, we will just check for null
           characters.  After that, we will also check if we are 64-byte aligned
           so we can jump to the vectorized implementation.  We will unroll
           these loops to avoid excessive branching.  */
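        /* Each unrolled step below is, in C terms (a sketch; has_zero
           stands for the cmpb-against-zero result described earlier):

             uint64_t m1 = has_zero (p[1]);   // ld  r6,8(r4)
             uint64_t m2 = has_zero (p[2]);   // ldu r5,16(r4)
             p += 2;
             if (m1 | m2)
               goto dword_zero;
        */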
        ld      r6,8(r4)
        ldu     r5,16(r4)
        cmpb    r10,r6,r0
        cmpb    r11,r5,r0
        or      r5,r10,r11
        cmpdi   cr7,r5,0
        addi    r9,r9,16
        bne     cr7,L(dword_zero)

        ld      r6,8(r4)
        ldu     r5,16(r4)
        cmpb    r10,r6,r0
        cmpb    r11,r5,r0
        or      r5,r10,r11
        cmpdi   cr7,r5,0
        addi    r9,r9,16
        bne     cr7,L(dword_zero)

        ld      r6,8(r4)
        ldu     r5,16(r4)
        cmpb    r10,r6,r0
        cmpb    r11,r5,r0
        or      r5,r10,r11
        cmpdi   cr7,r5,0
        addi    r9,r9,16
        bne     cr7,L(dword_zero)

        /* Are we 64-byte aligned?  If so, jump to the vectorized loop.
           Note: aligning to a 64-byte boundary necessarily slows down
           performance for strings of around 64 bytes, due to the extra
           comparisons required to check alignment for the vectorized loop.
           This is a tradeoff we are willing to take in order to speed up
           the calculation for larger strings.  */
        andi.   r10,r9,63
        beq     cr0,L(preloop)
        ld      r6,8(r4)
        ldu     r5,16(r4)
        cmpb    r10,r6,r0
        cmpb    r11,r5,r0
        or      r5,r10,r11
        cmpdi   cr7,r5,0
        addi    r9,r9,16
        bne     cr7,L(dword_zero)

        andi.   r10,r9,63
        beq     cr0,L(preloop)
        ld      r6,8(r4)
        ldu     r5,16(r4)
        cmpb    r10,r6,r0
        cmpb    r11,r5,r0
        or      r5,r10,r11
        cmpdi   cr7,r5,0
        addi    r9,r9,16
        bne     cr7,L(dword_zero)

        andi.   r10,r9,63
        beq     cr0,L(preloop)
        ld      r6,8(r4)
        ldu     r5,16(r4)
        cmpb    r10,r6,r0
        cmpb    r11,r5,r0
        or      r5,r10,r11
        cmpdi   cr7,r5,0
        addi    r9,r9,16

        /* At this point, we are necessarily 64-byte aligned.  If no zeroes
           were found, jump to the vectorized loop.  */
        beq     cr7,L(preloop)

L(dword_zero):
        /* OK, one (or both) of the doublewords contains a null byte.  Check
           the first doubleword and decrement the address in case the first
           doubleword really contains a null byte.  */

        cmpdi   cr6,r10,0
        addi    r4,r4,-8
        bne     cr6,L(done)

        /* The null byte must be in the second doubleword.  Adjust the address
           again and move the result of cmpb to r10 so we can calculate the
           length.  */

        mr      r10,r11
        addi    r4,r4,8

        /* If the null byte was found in the non-vectorized code, compute the
           final length.  r10 has the output of the cmpb instruction, that is,
           it contains 0xff in the same position as the null byte in the
           original doubleword from the string.  Use that to calculate the
           length.  */
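        /* In C, the computation below is roughly (a sketch for the
           little-endian path; popcount64 is illustrative):

             uint64_t t = (r10 - 1) & ~r10;   // ones below the first 0xff
             size_t bits = popcount64 (t);    // == trailing zero count
             return (r4 - s) + bits / 8;

           Big endian counts leading zeros (cntlzd) instead, since the
           first string byte is the most significant one there.  */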
L(done):
#ifdef __LITTLE_ENDIAN__
        addi    r9,r10,-1     /* Form a mask from trailing zeros.  */
        andc    r9,r9,r10
        popcntd r0,r9         /* Count the bits in the mask.  */
#else
        cntlzd  r0,r10        /* Count leading zeros before the match.  */
#endif
        subf    r5,r3,r4
        srdi    r0,r0,3       /* Convert leading/trailing zeros to bytes.  */
        add     r3,r5,r0      /* Compute final length.  */
        blr

        /* Vectorized implementation starts here.  */
        .p2align 4
L(preloop):
        /* Set up for the loop.  */
        mr      r4,r9
        li      r7,16         /* Load required offsets.  */
        li      r8,32
        li      r9,48
        li      r12,8
        vxor    v0,v0,v0      /* VR with null chars to use with
                                 vcmpequb.  */

        /* Main loop to look for the end of the string.  We will read in
           64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
           leverage the icache performance.  */
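        /* Each unrolled step below, sketched with standard Altivec
           builtins (illustration only, not the exact source; p is the
           64-byte-aligned cursor, so lvx's low-bit truncation is moot):

             const vector unsigned char zeroes
               = vec_splats ((unsigned char) 0);
             vector unsigned char a = vec_ld (0,  p);  // lvx v1,r4,r0
             vector unsigned char b = vec_ld (16, p);  // lvx v2,r4,r7
             vector unsigned char c = vec_ld (32, p);  // lvx v3,r4,r8
             vector unsigned char d = vec_ld (48, p);  // lvx v4,r4,r9
             // A null byte anywhere becomes 0x00 in the byte minimum.
             vector unsigned char m
               = vec_min (vec_min (a, b), vec_min (c, d));
             p += 64;
             if (vec_any_eq (m, zeroes))               // vcmpequb. / cr6
               goto vmx_zero;
        */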
        .p2align 5
L(loop):
        lvx     v1,r4,r0      /* Load 4 quadwords.  */
        lvx     v2,r4,r7
        lvx     v3,r4,r8
        lvx     v4,r4,r9
        vminub  v5,v1,v2      /* Compare and merge into one VR for speed.  */
        vminub  v6,v3,v4
        vminub  v7,v5,v6
        vcmpequb. v7,v7,v0    /* Check for NULLs.  */
        addi    r4,r4,64      /* Adjust address for the next iteration.  */
        bne     cr6,L(vmx_zero)

        lvx     v1,r4,r0      /* Load 4 quadwords.  */
        lvx     v2,r4,r7
        lvx     v3,r4,r8
        lvx     v4,r4,r9
        vminub  v5,v1,v2      /* Compare and merge into one VR for speed.  */
        vminub  v6,v3,v4
        vminub  v7,v5,v6
        vcmpequb. v7,v7,v0    /* Check for NULLs.  */
        addi    r4,r4,64      /* Adjust address for the next iteration.  */
        bne     cr6,L(vmx_zero)

        lvx     v1,r4,r0      /* Load 4 quadwords.  */
        lvx     v2,r4,r7
        lvx     v3,r4,r8
        lvx     v4,r4,r9
        vminub  v5,v1,v2      /* Compare and merge into one VR for speed.  */
        vminub  v6,v3,v4
        vminub  v7,v5,v6
        vcmpequb. v7,v7,v0    /* Check for NULLs.  */
        addi    r4,r4,64      /* Adjust address for the next iteration.  */
        beq     cr6,L(loop)

L(vmx_zero):
        /* OK, we found a null byte.  Let's look for it in the current 64-byte
           block and mark it in its corresponding VR.  */
        vcmpequb v1,v1,v0
        vcmpequb v2,v2,v0
        vcmpequb v3,v3,v0
        vcmpequb v4,v4,v0

        /* We will now 'compress' the result into a single doubleword, so it
           can be moved to a GPR for the final calculation.  First, we
           generate an appropriate mask for vbpermq, so we can permute bits
           into the first halfword.  */
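        /* The three instructions below build the vbpermq selector: lvsl
           with a zero address yields the bytes 0..15, and shifting each
           left by 3 turns them into the bit indices 0,8,16,...,120, i.e.
           bit 0 of every byte.  In C terms, each vbpermq then computes
           (a sketch, in vector element order):

             uint16_t bits = 0;
             for (int i = 0; i < 16; i++)
               bits = (bits << 1) | (v[i] == 0xff);  // one bit per byte

           leaving the 16 bits in bits 48-63 of the result register.  */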
        vspltisb v10,3
        lvsl    v11,r0,r0
        vslb    v10,v11,v10

        /* Permute the first bit of each byte into bits 48-63.  */
        vbpermq v1,v1,v10
        vbpermq v2,v2,v10
        vbpermq v3,v3,v10
        vbpermq v4,v4,v10

        /* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
        vsldoi  v2,v2,v2,2
        vsldoi  v3,v3,v3,4
        vsldoi  v4,v4,v4,6
#else
        vsldoi  v1,v1,v1,6
        vsldoi  v2,v2,v2,4
        vsldoi  v3,v3,v3,2
#endif

        /* Merge the results and move to a GPR.  */
        vor     v1,v2,v1
        vor     v2,v3,v4
        vor     v4,v1,v2
        mfvrd   r10,v4
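        /* r10 now holds one bit per byte of the 64-byte block, set where
           a null byte lives, so unlike L(done) the count below needs no
           division by 8.  In C terms (little-endian path, a sketch):

             size_t off = trailing_zeros (r10);  // cntlzd on big endian
             return (block_start - s) + off;
        */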

        /* Adjust address to the beginning of the current 64-byte block.  */
        addi    r4,r4,-64

#ifdef __LITTLE_ENDIAN__
        addi    r9,r10,-1     /* Form a mask from trailing zeros.  */
        andc    r9,r9,r10
        popcntd r0,r9         /* Count the bits in the mask.  */
#else
        cntlzd  r0,r10        /* Count leading zeros before the match.  */
#endif
        subf    r5,r3,r4
        add     r3,r5,r0      /* Compute final length.  */
        blr

END (STRLEN)
libc_hidden_builtin_def (strlen)
