1 | /* memchr - find a character in a memory zone using base integer registers |
2 | |
3 | Copyright (C) 2018-2022 Free Software Foundation, Inc. |
4 | |
5 | This file is part of the GNU C Library. |
6 | |
7 | The GNU C Library is free software; you can redistribute it and/or |
8 | modify it under the terms of the GNU Lesser General Public |
9 | License as published by the Free Software Foundation; either |
10 | version 2.1 of the License, or (at your option) any later version. |
11 | |
12 | The GNU C Library is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | Lesser General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU Lesser General Public |
18 | License along with the GNU C Library. If not, see |
19 | <https://www.gnu.org/licenses/>. */ |
20 | |
21 | #include <sysdep.h> |
22 | |
23 | /* Assumptions: |
24 | * |
25 | * ARMv8-a, AArch64 |
26 | * Use base integer registers. |
27 | */ |
28 | |
29 | #ifndef MEMCHR |
30 | # define MEMCHR __memchr_nosimd |
31 | #endif |
32 | |
33 | /* Arguments and results. */ |
34 | #define srcin x0 |
35 | #define chrin x1 |
36 | #define cntin x2 |
37 | |
38 | #define result x0 |
39 | |
40 | #define repchr x1 |
41 | |
42 | #define tmp1 x2 |
43 | #define tmp2 x3 |
44 | #define tmp3 x4 |
45 | #define tmp4 x5 |
46 | |
47 | #define src x6 |
48 | #define srcend x7 |
49 | #define srcend16 x8 |
50 | |
51 | #define anymore x9 |
52 | |
53 | #define zeroones x10 |
54 | |
55 | #define data1 x11 |
56 | #define data2 x12 |
57 | |
58 | #define has_chr1 x13 |
59 | #define has_chr2 x14 |
60 | |
61 | #define REP8_01 0x0101010101010101 |
62 | #define REP8_7f 0x7f7f7f7f7f7f7f7f |
63 | |
64 | |
65 | ENTRY_ALIGN (MEMCHR, 6) |
66 | |
67 | PTR_ARG (0) |
68 | SIZE_ARG (2) |
69 | |
70 | /* Do not dereference srcin if no bytes to compare. */ |
71 | cbz cntin, L(none_chr) |
72 | |
73 | /* Start address is 16-byte aligned or not? */ |
74 | tst srcin, 15 |
75 | bic src, srcin, 15 |
76 | |
77 | mov zeroones, REP8_01 |
78 | and repchr, chrin, 255 |
79 | /* Generate a qword integer as |c|c|c|c|c|c|c|c|. */ |
80 | mul repchr, repchr, zeroones |
81 | |
82 | add srcend, srcin, cntin |
83 | /* |
84 | * srcend16 is address of the block following the last block. |
85 | * |
86 | * [A block is 16-byte aligned and sized.] |
87 | */ |
88 | add srcend16, srcend, 15 |
89 | bic srcend16, srcend16, 15 |
90 | |
91 | b.eq L(loop) |
92 | |
93 | /* Load the first block containing start address. */ |
94 | ldp data1, data2, [src], 16 |
95 | |
96 | lsl tmp1, srcin, 3 |
97 | mov tmp2, ~0 |
98 | #ifdef __AARCH64EB__ |
99 | lsr tmp3, tmp2, tmp1 |
100 | #else |
101 | lsl tmp3, tmp2, tmp1 |
102 | #endif |
103 | /* Start address is in the first or the second qword? */ |
104 | tst srcin, 8 |
105 | |
106 | /* |
107 | * Transform any byte in the block to zero using XOR operation, |
108 | * if that byte equals the char to search. In this way, searching |
109 | * the char becomes detecting zero in the resulting two qwords. |
110 | */ |
111 | eor data1, data1, repchr |
112 | eor data2, data2, repchr |
113 | |
114 | /* |
115 | * Set those unused bytes(before start address) to 0xff, so |
116 | * that they will not hit any zero detection. |
117 | */ |
118 | orn tmp1, data1, tmp3 |
119 | orn tmp2, data2, tmp3 |
120 | |
121 | csinv data1, tmp1, xzr, eq |
122 | csel data2, data2, tmp2, eq |
123 | |
124 | /* |
125 | * When the first and last block are the same, there are two cases: |
126 | * o. Memory range to search is just in one block. |
127 | * ( start address - end address) < 0 |
128 | * |
129 | * o. Memory range is so large that end address wrap-around. |
130 | * ( start address - end address) > 0 |
131 | */ |
132 | cmp srcin, srcend |
133 | ccmp src, srcend16, 0, mi |
134 | csetm anymore, ne |
135 | b L(find_chr) |
136 | |
137 | .p2align 4 |
138 | L(loop): |
139 | ldp data1, data2, [src], 16 |
140 | |
141 | subs anymore, src, srcend16 |
142 | |
143 | /* |
144 | * Transform any byte in the block to zero using XOR operation, |
145 | * if that byte equals the char to search. |
146 | */ |
147 | eor data1, data1, repchr |
148 | eor data2, data2, repchr |
149 | |
150 | L(find_chr): |
151 | /* |
152 | * Use the following integer test to find out if any byte in a |
153 | * qword is zero. If do not contain zero-valued byte, test result |
154 | * is zero. |
155 | * |
156 | * (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080 |
157 | * = |
158 | * (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f) |
159 | * |
160 | */ |
161 | sub tmp1, data1, zeroones |
162 | sub tmp2, data2, zeroones |
163 | |
164 | orr tmp3, data1, REP8_7f |
165 | orr tmp4, data2, REP8_7f |
166 | |
167 | bic has_chr1, tmp1, tmp3 |
168 | bic has_chr2, tmp2, tmp4 |
169 | |
170 | orr tmp1, has_chr1, has_chr2 |
171 | ccmp tmp1, 0, 0, ne |
172 | |
173 | b.eq L(loop) |
174 | |
175 | cbz has_chr1, 1f |
176 | sub result, src, 16 |
177 | #ifdef __AARCH64EB__ |
178 | rev data1, data1 |
179 | #else |
180 | rev has_chr1, has_chr1 |
181 | #endif |
182 | b L(done) |
183 | |
184 | 1: cbz has_chr2, L(none_chr) |
185 | sub result, src, 8 |
186 | #ifdef __AARCH64EB__ |
187 | rev data1, data2 |
188 | #else |
189 | rev has_chr1, has_chr2 |
190 | #endif |
191 | |
192 | L(done): |
193 | #ifdef __AARCH64EB__ |
194 | /* |
195 | * For big-endian, can not directly use has_chr1/has_chr2 because |
196 | * two qwords has been reversed after loading from memory. |
197 | * Thus, have to perform char detection on two qwords again, which |
198 | * should be byte-swapped this time. |
199 | */ |
200 | sub tmp1, data1, zeroones |
201 | orr tmp3, data1, REP8_7f |
202 | bic has_chr1, tmp1, tmp3 |
203 | rev has_chr1, has_chr1 |
204 | #endif |
205 | |
206 | /* |
207 | * If the specified char is found in a qword, the corresponding |
208 | * byte of in has_chr has value of 1, while this is only true for |
209 | * the first occurrence, not other occurrences. |
210 | */ |
211 | cmp anymore, 0 |
212 | clz tmp1, has_chr1 |
213 | add result, result, tmp1, lsr 3 |
214 | ccmp result, srcend, 8, eq /* NZCV = 8000 */ |
215 | csel result, result, xzr, mi |
216 | ret |
217 | |
218 | L(none_chr): |
219 | mov result, 0 |
220 | ret |
221 | |
222 | END (MEMCHR) |
223 | libc_hidden_builtin_def (MEMCHR) |
224 | |