1 | /* Vector optimized 32/64 bit S/390 version of wcscspn. |
2 | Copyright (C) 2015-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <ifunc-wcscspn.h> |
20 | #if HAVE_WCSCSPN_Z13 |
21 | |
22 | # include "sysdep.h" |
23 | # include "asm-syntax.h" |
24 | |
25 | .text |
26 | |
/* size_t wcscspn (const wchar_t *s, const wchar_t * reject)
   The wcscspn() function calculates the length of the initial segment
   of s which consists entirely of characters not in reject.

   This method checks the length of reject string.  If it fits entirely
   in one vector register, a fast algorithm is used, which does not need
   to check multiple parts of reject-string.  Otherwise a slower full
   check of reject-string is used.

   If s is not 4-byte aligned, the common-code variant WCSCSPN_C is
   used instead.

   register overview:
   r3:  pointer to start of reject-string
   r2:  pointer to start of search-string; return value on exit
   r0:  loaded byte count of vlbb search-string
   r4:  found byte index
   r1:  current return len (in bytes)
   v16: search-string
   v17: reject-string
   v18: temp-vreg

   ONLY FOR SLOW:
   v19: first reject-string
   v20: zero for preparing acc-vector
   v21: global mask; 1 indicates a match between
	search-string-vreg and any reject-character
   v22: current mask; 1 indicates a match between
	search-string-vreg and any reject-character in current acc-vreg
   v24: all ones; compared against the global mask to locate the
	first matching element
   v30, v31: for re-/storing registers r6, r8, r9
   r5:  current len of reject-string
   r6:  zero-index in search-string or 16 if no zero
	or min(zero-index, loaded byte count)
   r8:  >0, if former reject-string-part contains a zero,
	otherwise =0;
   r9:  loaded byte count of vlbb reject-string
*/
ENTRY(WCSCSPN_Z13)
	.machine "z13"
	.machinemode "zarch_nohighgprs"

	tmll	%r2,3		/* Test if s is 4-byte aligned?  */
	jne	.Lfallback	/* And use common-code variant if not.  */

	/*
	  Check if reject-string fits in one vreg:
	  ----------------------------------------
	*/
	/* Use boundary code 6 (4k-byte) like every other vlbb/lcbb in this
	   routine, so that only a real page boundary triggers the
	   on-block-boundary handling below.  */
	vlbb	%v17,0(%r3),6	/* Load reject until next 4k-byte boundary.  */
	lcbb	%r0,0(%r3),6	/* Get bytes to 4k-byte boundary or 16.  */
	jo	.Lcheck_onbb	/* Special case if reject
				   lies on block-boundary.  */

.Lcheck_notonbb:
	lghi	%r1,0		/* Zero out current len.  */
	vistrfs	%v17,%v17	/* Fill with zeros after first zero.  */
	je	.Lfast		/* Zero found -> reject fits in one vreg.  */
	j	.Lslow		/* No zero -> reject exceeds one vreg.  */


.Lcheck_onbb:
	/* Reject lies on block-boundary.  */
	nill	%r0,65532	/* Recognize only fully loaded characters.  */
	je	.Lcheck_onbb2	/* Reload vr, if we loaded no full wchar_t.  */
	vfenezf	%v18,%v17,%v17	/* Search zero in loaded reject bytes.  */
	vlgvb	%r4,%v18,7	/* Get index of zero or 16 if not found.  */
	clrjl	%r4,%r0,.Lcheck_notonbb /* Zero index < loaded bytes count ->
					   Reject fits in one vreg;
					   Fill with zeros and proceed
					   with FAST.  */
.Lcheck_onbb2:
	vl	%v17,0(%r3)	/* Load reject, which exceeds loaded bytes.  */
	j	.Lcheck_notonbb /* Check if reject fits in one vreg.  */


	/*
	  Search s for reject in one vreg
	  -------------------------------
	*/
.Lfast:
	/* Complete reject-string in v17 and remaining bytes are zero.  */

	vlbb	%v16,0(%r2),6	/* Load s until next 4k-byte boundary.  */
	lcbb	%r0,0(%r2),6	/* Get bytes to 4k-byte boundary or 16.  */

	vfaezfs	%v18,%v16,%v17,0 /* Find first element in v16
				    equal to any in v17
				    or first zero element.  */
	vlgvb	%r4,%v18,7	/* Load byte index of found element.  */
	clrjl	%r4,%r0,.Lfast_loop_found2 /* If found index is within loaded
					      bytes, return with found element
					      index; current len (r1) is
					      still zero here.  */

	/* Align s to 16 byte.  */
	risbgn	%r4,%r2,60,128+63,0 /* %r4 = bits 60-63 of %r2 'and' 15.  */
	lghi	%r1,16		/* current_len = 16.  */
	slr	%r1,%r4		/* Compute bytes to 16bytes boundary.  */

	/* Process s in 16byte aligned loop (four vregs per iteration).  */
.Lfast_loop:
	vl	%v16,0(%r1,%r2)	/* Load search-string.  */
	vfaezfs	%v18,%v16,%v17,0 /* Find first element in v16 equal to any
				    in v17 or first zero element.  */
	jno	.Lfast_loop_found

	vl	%v16,16(%r1,%r2)
	vfaezfs	%v18,%v16,%v17,0
	jno	.Lfast_loop_found16

	vl	%v16,32(%r1,%r2)
	vfaezfs	%v18,%v16,%v17,0
	jno	.Lfast_loop_found32

	vl	%v16,48(%r1,%r2)
	vfaezfs	%v18,%v16,%v17,0
	jno	.Lfast_loop_found48

	aghi	%r1,64
	j	.Lfast_loop	/* Loop if no element was equal to reject
				   and none was zero.  */

	/* Found equal or zero element.  The labels below fall through so
	   that r1 is advanced by the offset of the vreg that hit.  */
.Lfast_loop_found48:
	aghi	%r1,16
.Lfast_loop_found32:
	aghi	%r1,16
.Lfast_loop_found16:
	aghi	%r1,16
.Lfast_loop_found:
	vlgvb	%r4,%v18,7	/* Load byte index of found element or zero.  */
.Lfast_loop_found2:
	algrk	%r2,%r1,%r4	/* Add found index to current len.  */
	srlg	%r2,%r2,2	/* Convert byte-count to character-count.  */
	br	%r14



	/*
	  Search s for reject in multiple vregs
	  -------------------------------------
	*/
.Lslow:
	/* Save registers r6, r8, r9 in vregs; restored at .Lslow_end.  */
	vlvgg	%v30,%r6,0
	vlvgp	%v31,%r8,%r9

	/* Reject in v17 without zero.  */
	vlr	%v19,%v17	/* Save first acc-part for a fast reload.  */
	vzero	%v20		/* Zero for preparing acc-vector.  */
	vone	%v24		/* One for checking result of former
				   string-part.  */

	/* Align s to 16 byte.  */
	risbg	%r4,%r2,60,128+63,0 /* Test if s is aligned and
				       %r4 = bits 60-63 'and' 15.  */
	je	.Lslow_loop_str /* If s is aligned, loop aligned.  */
	lghi	%r0,15
	slr	%r0,%r4		/* Compute highest index to load (15-x).  */
	vll	%v16,%r0,0(%r2)	/* Load up to 16byte boundary (vll needs
				   highest index, remaining bytes are 0).  */
	ahi	%r0,1		/* Work with loaded byte count.  */
	vzero	%v21		/* Zero out global mask.  */
	lghi	%r5,0		/* Set current len of reject-string to zero.  */
	vfenezf	%v18,%v16,%v16	/* Find zero in current string-part.  */
	lghi	%r8,0		/* There is no zero in first reject-part.  */
	vlgvb	%r6,%v18,7	/* Load byte index of zero or 16 if no zero.  */
	clije	%r6,0,.Lslow_end /* If first element is zero -> return 0.  */
	clr	%r0,%r6		/* cc==1 if loaded byte count < zero-index.  */
	locrl	%r6,%r0		/* Load on cc==1; zero-index = lbc.  */
	j	.Lslow_loop_acc


	/* Process s in 16byte aligned loop.  */
.Lslow_next_str:
	/* Check results of former processed str-part.  */
	vfeef	%v18,%v21,%v24	/* Find first equal match in global mask
				   (ones in element).  */
	vlgvb	%r4,%v18,7	/* Get index of first one (=equal) or 16.  */
	/* Equal-index < min(zero-index, loaded byte count)
	   -> Return length up to the equal element.  */
	clrjl	%r4,%r6,.Lslow_index_found
	/* Zero-index < loaded byte count
	   -> Former str-part was last str-part
	   -> Return length up to the terminating zero.  */
	clrjl	%r6,%r0,.Lslow_end_not_found

	/* All elements are zero (=no match) -> proceed with next str-part.  */
	vlr	%v17,%v19	/* Load first part of reject (no zero).  */
	algfr	%r1,%r0		/* Add loaded byte count to current len.  */

.Lslow_loop_str:
	vl	%v16,0(%r1,%r2)	/* Load search-string.  */
	lghi	%r0,16		/* Loaded byte count is 16.  */
	vzero	%v21		/* Zero out global mask.  */
	lghi	%r5,0		/* Set current len of reject to zero.  */
	vfenezf	%v18,%v16,%v16	/* Find zero in current string-part.  */
	lghi	%r8,0		/* There is no zero in first reject-part.  */
	vlgvb	%r6,%v18,7	/* Load byte index of zero or 16 if no zero.  */
	clije	%r6,0,.Lslow_end /* If first element is zero (end of string)
				    -> Return current length.  */

.Lslow_loop_acc:
	vfaef	%v22,%v16,%v17,4 /* Create matching-mask (1 in mask ->
				    Character matches any rejected character in
				    this reject-string-part) IN=0, RT=1.  */
	vlgvf	%r4,%v22,0	/* Get result of first element.  */
	/* First element is equal to any rejected characters?
	   (All other parts of reject cannot lead to a match before this one)
	   -> Return current len, which is pointing to this element.  */
	clijh	%r4,0,.Lslow_end
	vo	%v21,%v21,%v22	/* Global-mask = global-|matching-mask.  */
	/* Proceed with next acc until end of acc is reached.  */


.Lslow_next_acc:
	clijh	%r8,0,.Lslow_next_str /* There was a zero in last reject-part
					 -> reject is completely processed;
					 evaluate results of this str-part.  */
	vlbb	%v17,16(%r5,%r3),6 /* Load next reject part.  */
	aghi	%r5,16		/* Increment current len of reject-string.  */
	lcbb	%r9,0(%r5,%r3),6 /* Get loaded byte count of reject-string.  */
	jo	.Lslow_next_acc_onbb /* Jump away if reject-string is
					on block-boundary.  */
.Lslow_next_acc_notonbb:
	vistrfs	%v17,%v17	/* Fill with zeros after first zero.  */
	jo	.Lslow_loop_acc	/* No zero found -> no preparation needed.  */

.Lslow_next_acc_prepare_zero:
	/* Zero in reject-part: fill zeros with first-reject-character.  */
	vlgvf	%r8,%v17,0	/* Load first element of reject-part.  */
	clije	%r8,0,.Lslow_next_str /* Process next str-part if first
					 character in this part of reject
					 is a zero.  */
	/* r8>0 -> zero found in this acc-part.  */
	vrepf	%v18,%v17,0	/* Replicate first char across all chars.  */
	vceqf	%v22,%v20,%v17	/* Create a mask (v22) of null chars
				   by comparing with 0 (v20).  */
	vsel	%v17,%v18,%v17,%v22 /* Replace null chars with first char.  */
	j	.Lslow_loop_acc	/* Reject-string part is prepared.  */

.Lslow_next_acc_onbb:
	nill	%r9,65532	/* Recognize only fully loaded characters.  */
	je	.Lslow_next_acc_onbb2 /* Reload vr, if no full wchar_t
					 loaded.  */
	vfenezf	%v18,%v17,%v17	/* Find zero in loaded bytes of reject part.  */
	vlgvb	%r8,%v18,7	/* Load byte index of zero.  */
	clrjl	%r8,%r9,.Lslow_next_acc_notonbb	/* Found a zero in loaded bytes
						   -> Prepare vreg.  */
.Lslow_next_acc_onbb2:
	vl	%v17,0(%r5,%r3)	/* Load over boundary ...  */
	lghi	%r8,0		/* r8=0 -> no zero in this part of acc,
				   check for zero is in jump-target.  */
	j	.Lslow_next_acc_notonbb	/* ... and search for zero in
					   fully loaded vreg again.  */

.Lslow_end_not_found:
	algfr	%r1,%r6		/* Add zero-index to current len.  */
	j	.Lslow_end
.Lslow_index_found:
	algfr	%r1,%r4		/* Add found index of char to current len.  */
.Lslow_end:
	srlg	%r2,%r1,2	/* Convert byte-count to character-count.  */
	/* Restore registers.  */
	vlgvg	%r6,%v30,0
	vlgvg	%r8,%v31,0
	vlgvg	%r9,%v31,1
	br	%r14
.Lfallback:
	jg	WCSCSPN_C	/* Tail-jump to the common-code variant.  */
END(WCSCSPN_Z13)
294 | |
/* When HAVE_WCSCSPN_IFUNC is not set, alias WCSCSPN_Z13 to the public
   name wcscspn.  */
295 | # if ! HAVE_WCSCSPN_IFUNC |
296 | strong_alias (WCSCSPN_Z13, wcscspn) |
297 | # endif |
298 | #endif |
299 | |