1 | /* Vector optimized 32/64 bit S/390 version of strcspn. |
2 | Copyright (C) 2015-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <ifunc-strcspn.h> |
20 | |
21 | #if HAVE_STRCSPN_Z13 |
22 | |
23 | # include "sysdep.h" |
24 | # include "asm-syntax.h" |
25 | |
26 | .text |
27 | |
28 | /* size_t strcspn (const char *s, const char * reject) |
29 | The strcspn() function calculates the length of the initial segment |
30 | of s which consists entirely of characters not in reject. |
31 | |
32 | This method checks the length of the reject-string. If it fits entirely |
33 | in one vector register, a fast algorithm is used, which does not need |
34 | to check multiple parts of the reject-string. Otherwise a slower full |
35 | check of the reject-string is used. |
36 | |
37 | register overview: |
38 | r3: pointer to start of reject-string |
39 | r2: pointer to start of search-string |
40 | r0: loaded byte count of vlbb search-string |
41 | r4: found byte index |
42 | r1: current return len |
43 | v16: search-string |
44 | v17: reject-string |
45 | v18: temp-vreg |
46 | |
47 | ONLY FOR SLOW: |
48 | v19: first reject-string |
49 | v20: zero for preparing the reject-vector (named "acc" in the labels) |
50 | v21: global mask; 1 indicates a match between |
51 | search-string-vreg and any reject-character |
52 | v22: current mask; 1 indicates a match between |
53 | search-string-vreg and any reject-character in current reject-vreg |
54 | v24: all ones (set by vone) for result-checking of former string-part |
55 | v30, v31: for re-/storing registers r6, r8, r9 |
56 | r5: current len of reject-string |
57 | r6: zero-index in search-string or 16 if no zero |
58 | or min(zero-index, loaded byte count) |
59 | r8: >0, if former reject-string-part contains a zero, |
60 | otherwise =0; |
61 | r9: loaded byte count of vlbb reject-string |
62 | */ |
63 | ENTRY(STRCSPN_Z13) |
64 | .machine "z13" |
65 | .machinemode "zarch_nohighgprs" |
66 | |
67 | /* |
68 | Check if reject-string fits in one vreg: |
69 | ---------------------------------------- |
70 | */ |
71 | vlbb %v17,0(%r3),6 /* Load reject. */ |
72 | lghi %r1,0 /* Zero out current len. */ |
73 | lcbb %r0,0(%r3),6 /* Get loaded byte count of reject. */ |
74 | jo .Lcheck_onbb /* Special case if reject |
75 | lies on a block-boundary. */ |
76 | .Lcheck_notonbb: |
77 | vistrbs %v17,%v17 /* Fill with zeros after first zero. */ |
78 | je .Lfast /* Zero found -> reject fits in one vreg. */ |
79 | j .Lslow /* No zero -> reject exceeds one vreg. */ |
80 | |
81 | |
82 | .Lcheck_onbb: |
83 | /* Reject lies on a block-boundary. */ |
84 | vfenezb %v18,%v17,%v17 /* Search zero in loaded reject bytes. */ |
85 | vlgvb %r4,%v18,7 /* Get index of zero or 16 if not found. */ |
86 | clrjl %r4,%r0,.Lcheck_notonbb /* Zero index < loaded bytes count -> |
87 | Reject fits in one vreg; |
88 | Fill with zeros and proceed |
89 | with FAST. */ |
90 | vl %v17,0(%r3) /* Load reject, which exceeds loaded bytes. */ |
91 | j .Lcheck_notonbb /* Check if reject fits in one vreg. */ |
92 | |
93 | |
94 | /* |
95 | Search s for reject in one vreg |
96 | ------------------------------- |
97 | */ |
98 | .Lfast: |
99 | /* Complete reject-string in v17 and remaining bytes are zero. */ |
100 | |
101 | vlbb %v16,0(%r2),6 /* Load s until next 4k-byte boundary. */ |
102 | lcbb %r0,0(%r2),6 /* Get bytes to 4k-byte boundary or 16. */ |
103 | |
104 | vfaezbs %v18,%v16,%v17,0 /* Find first element in v16 |
105 | equal to any in v17 |
106 | or first zero element. */ |
107 | |
108 | vlgvb %r4,%v18,7 /* Load byte index of found element. */ |
109 | clrjl %r4,%r0,.Lfast_loop_found2 /* If found index is within loaded |
110 | bytes, return with found element |
111 | index (= length of initial segment). */ |
112 | |
113 | /* Align s to 16 byte. */ |
114 | risbgn %r4,%r2,60,128+63,0 /* %r4 = bits 60-63 of %r2, i.e. %r2 'and' 15. */ |
115 | lghi %r1,16 /* current_len = 16. */ |
116 | slr %r1,%r4 /* Compute bytes to 16bytes boundary. */ |
117 | |
118 | /* Process s in 16byte aligned loop. */ |
119 | .Lfast_loop: |
120 | vl %v16,0(%r1,%r2) /* Load search-string. */ |
121 | vfaezbs %v18,%v16,%v17,0 /* Find first element in v16 equal to any |
122 | in v17 or first zero element. */ |
123 | jno .Lfast_loop_found /* Leave loop on equal or zero element. */ |
124 | |
125 | vl %v16,16(%r1,%r2) /* Unrolled: same check at offset 16. */ |
126 | vfaezbs %v18,%v16,%v17,0 |
127 | jno .Lfast_loop_found16 |
128 | |
129 | vl %v16,32(%r1,%r2) /* Unrolled: same check at offset 32. */ |
130 | vfaezbs %v18,%v16,%v17,0 |
131 | jno .Lfast_loop_found32 |
132 | |
133 | vl %v16,48(%r1,%r2) /* Unrolled: same check at offset 48. */ |
134 | vfaezbs %v18,%v16,%v17,0 |
135 | jno .Lfast_loop_found48 |
136 | |
137 | aghi %r1,64 /* Add the 64 checked bytes to current len. */ |
138 | j .Lfast_loop /* Loop if no element was equal to any |
139 | reject character and none was zero. */ |
140 | |
141 | /* Found equal or zero element. */ |
142 | .Lfast_loop_found48: |
143 | aghi %r1,16 /* Labels fall through: 16 is added per skipped part. */ |
144 | .Lfast_loop_found32: |
145 | aghi %r1,16 |
146 | .Lfast_loop_found16: |
147 | aghi %r1,16 |
148 | .Lfast_loop_found: |
149 | vlgvb %r4,%v18,7 /* Load byte index of found element or zero. */ |
150 | .Lfast_loop_found2: |
151 | algrk %r2,%r1,%r4 /* Return value = current len + found index. */ |
152 | br %r14 |
153 | |
154 | |
155 | |
156 | /* |
157 | Search s for reject in multiple vregs |
158 | ------------------------------------- |
159 | */ |
160 | .Lslow: |
161 | /* Save registers. */ |
162 | vlvgg %v30,%r6,0 /* Keep r6 in v30. */ |
163 | vlvgp %v31,%r8,%r9 /* Keep r8 and r9 in v31. */ |
164 | |
165 | /* Reject in v17 without zero. */ |
166 | vlr %v19,%v17 /* Save first reject-part for a fast reload. */ |
167 | vzero %v20 /* Zero for preparing reject-vector. */ |
168 | vone %v24 /* One for checking result of former |
169 | string-part. */ |
170 | |
171 | /* Align s to 16 byte. */ |
172 | risbg %r4,%r2,60,128+63,0 /* Test if s is aligned and |
173 | %r4 = bits 60-63 'and' 15. */ |
174 | je .Lslow_loop_str /* If s is aligned, loop aligned. */ |
175 | lghi %r0,15 |
176 | slr %r0,%r4 /* Compute highest index to load (15-x). */ |
177 | vll %v16,%r0,0(%r2) /* Load up to 16 byte boundary (vll needs |
178 | highest index, remaining bytes are 0). */ |
179 | ahi %r0,1 /* Work with loaded byte count. */ |
180 | vzero %v21 /* Zero out global mask. */ |
181 | lghi %r5,0 /* Set current len of reject-string to zero. */ |
182 | vfenezb %v18,%v16,%v16 /* Find zero in current string-part. */ |
183 | lghi %r8,0 /* There is no zero in first reject-part. */ |
184 | vlgvb %r6,%v18,7 /* Load byte index of zero or 16 if no zero. */ |
185 | clije %r6,0,.Lslow_end /* If first element is zero -> return 0. */ |
186 | clr %r0,%r6 /* cc==1 if loaded byte count < zero-index. */ |
187 | locrl %r6,%r0 /* Load on cc==1; zero-index = lbc. */ |
188 | j .Lslow_loop_acc |
189 | |
190 | |
191 | /* Process s in 16byte aligned loop. */ |
192 | .Lslow_next_str: |
193 | /* Check results of former processed str-part. */ |
194 | vfeeb %v18,%v21,%v24 /* Find first equal match in global mask |
195 | (ones in element). */ |
196 | vlgvb %r4,%v18,7 /* Get index of first one (=equal) or 16. */ |
197 | /* Equal-index < min(zero-index, loaded byte count) |
198 | -> Return current len + equal-index. */ |
199 | clrjl %r4,%r6,.Lslow_index_found |
200 | /* Zero-index < loaded byte count |
201 | -> Former str-part was last str-part |
202 | -> Return current len + zero-index. */ |
203 | clrjl %r6,%r0,.Lslow_end_not_found |
204 | |
205 | /* All elements are zero (=no match) -> Proceed with next str-part. */ |
206 | vlr %v17,%v19 /* Load first part of reject (no zero). */ |
207 | algfr %r1,%r0 /* Add loaded byte count to current len. */ |
208 | |
209 | .Lslow_loop_str: |
210 | vl %v16,0(%r1,%r2) /* Load search-string. */ |
211 | lghi %r0,16 /* Loaded byte count is 16. */ |
212 | vzero %v21 /* Zero out global mask. */ |
213 | lghi %r5,0 /* Set current len of reject to zero. */ |
214 | vfenezb %v18,%v16,%v16 /* Find zero in current string-part. */ |
215 | lghi %r8,0 /* There is no zero in first reject-part. */ |
216 | vlgvb %r6,%v18,7 /* Load byte index of zero or 16 if no zero. */ |
217 | clije %r6,0,.Lslow_end /* If first element is zero (end of string) |
218 | -> Return current length. */ |
219 | |
220 | .Lslow_loop_acc: |
221 | vfaeb %v22,%v16,%v17,4 /* Create matching-mask (1 in mask -> |
222 | Character matches any rejected character in |
223 | this reject-string-part) IN=0, RT=1. */ |
224 | vlgvb %r4,%v22,0 /* Get result of first element. */ |
225 | /* First element is equal to any rejected characters? |
226 | (all other parts of reject cannot lead to a match before this one) |
227 | -> Return current len, which is pointing to this element. */ |
228 | clijh %r4,0,.Lslow_end |
229 | vo %v21,%v21,%v22 /* Global-mask = global-|matching-mask. */ |
230 | /* Proceed with next reject-part until end of reject is reached. */ |
231 | |
232 | |
233 | .Lslow_next_acc: |
234 | clijh %r8,0,.Lslow_next_str /* There was a zero in last reject-part |
235 | -> Reject is completely processed; |
236 | check results of this str-part. */ |
237 | vlbb %v17,16(%r5,%r3),6 /* Load next reject part. */ |
238 | aghi %r5,16 /* Increment current len of reject-string. */ |
239 | lcbb %r9,0(%r5,%r3),6 /* Get loaded byte count of reject-string. */ |
240 | jo .Lslow_next_acc_onbb /* Jump away if reject-string is |
241 | on block-boundary. */ |
242 | .Lslow_next_acc_notonbb: |
243 | vistrbs %v17,%v17 /* Fill with zeros after first zero. */ |
244 | jo .Lslow_loop_acc /* No zero found -> no preparation needed. */ |
245 | |
246 | .Lslow_next_acc_prepare_zero: |
247 | /* Zero in reject-part: fill zeros with first-reject-character. */ |
248 | vlgvb %r8,%v17,0 /* Load first element of reject-part. */ |
249 | clije %r8,0,.Lslow_next_str /* Process next str-part if first |
250 | character in this part of reject |
251 | is a zero. */ |
252 | /* r8>0 -> zero found in this reject-part. */ |
253 | vrepb %v18,%v17,0 /* Replicate first char across all chars. */ |
254 | vceqb %v22,%v20,%v17 /* Create a mask (v22) of null chars |
255 | by comparing with 0 (v20). */ |
256 | vsel %v17,%v18,%v17,%v22 /* Replace null chars with first char. */ |
257 | j .Lslow_loop_acc /* Reject-string part is prepared. */ |
258 | |
259 | .Lslow_next_acc_onbb: |
260 | vfenezb %v18,%v17,%v17 /* Find zero in loaded bytes of reject part. */ |
261 | vlgvb %r8,%v18,7 /* Load byte index of zero. */ |
262 | clrjl %r8,%r9,.Lslow_next_acc_notonbb /* Found a zero in loaded bytes |
263 | -> Prepare vreg. */ |
264 | vl %v17,0(%r5,%r3) /* Load over boundary ... */ |
265 | lghi %r8,0 /* r8=0 -> no zero in this part of reject, |
266 | check for zero is in jump-target. */ |
267 | j .Lslow_next_acc_notonbb /* ... and search for zero in |
268 | fully loaded vreg again. */ |
269 | |
270 | .Lslow_end_not_found: |
271 | algfr %r1,%r6 /* Add zero-index to current len. */ |
272 | j .Lslow_end |
273 | .Lslow_index_found: |
274 | algfr %r1,%r4 /* Add found index of char to current len. */ |
275 | .Lslow_end: |
276 | lgr %r2,%r1 /* Return current len. */ |
277 | /* Restore registers. */ |
278 | vlgvg %r6,%v30,0 |
279 | vlgvg %r8,%v31,0 |
280 | vlgvg %r9,%v31,1 |
281 | br %r14 |
282 | END(STRCSPN_Z13) |
283 | |
284 | # if ! HAVE_STRCSPN_IFUNC |
285 | strong_alias (STRCSPN_Z13, strcspn) |
286 | # endif |
287 | |
288 | # if ! HAVE_STRCSPN_C && defined SHARED && IS_IN (libc) |
289 | strong_alias (STRCSPN_Z13, __GI_strcspn) |
290 | # endif |
291 | |
292 | #endif /* HAVE_STRCSPN_Z13 */ |
293 | |