/* memchr implemented using NEON.
   Copyright (C) 2011-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */
18 | |
#include <sysdep.h>

/* For __ARM_NEON__ this file defines memchr directly.  Otherwise it is
   built as the __memchr_neon variant (for ifunc selection elsewhere)
   and the hidden-builtin alias is suppressed.  */
#ifndef __ARM_NEON__
# define memchr __memchr_neon
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(a)
#endif

	.arch	armv7-a
	.fpu	neon


/* Arguments */
#define srcin		r0	/* Buffer to search.  */
#define chrin		r1	/* Character to find.  */
#define cntin		r2	/* Number of bytes to search.  */

/* Retval */
#define result		r0	/* Live range does not overlap with srcin.  */

/* Working registers */
#define src		r1	/* Live range does not overlap with chrin.  */
#define tmp		r3
#define synd		r0	/* No overlap with srcin or result.  */
#define soff		r12	/* Misalignment of srcin within 32 bytes.  */

/* Working NEON registers */
#define vrepchr		q0	/* Search character replicated to all lanes.  */
#define vdata0		q1
#define vdata0_0	d2	/* Lower half of vdata0.  */
#define vdata0_1	d3	/* Upper half of vdata0.  */
#define vdata1		q2
#define vdata1_0	d4	/* Lower half of vdata1.  */
#define vdata1_1	d5	/* Upper half of vdata1.  */
#define vrepmask	q3	/* Per-lane bit-position mask (see below).  */
#define vrepmask0	d6
#define vrepmask1	d7
#define vend		q4	/* Scratch; d8/d9 are callee-saved (AAPCS).  */
#define vend0		d8
#define vend1		d9

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one
 * bit per byte.  Each bit is set if the relevant byte matched the
 * requested character and cleared otherwise.  Since the bits in the
 * syndrome reflect exactly the order in which things occur in the
 * original string, counting trailing zeros allows to identify exactly
 * which byte has matched.
 */
70 | |
	.thumb_func
	.p2align 4,,15

/* void *memchr (const void *s, int c, size_t n)
   In:   r0 = s (srcin), r1 = c (chrin), r2 = n (cntin)
   Out:  r0 = pointer to the first occurrence of (unsigned char) c in
	 the first n bytes of s, or NULL if there is none.
   Clobbers r1-r3, r12, q0-q3 and flags; q4 (d8/d9) is callee-saved
   and preserved with vpush/vpop around the main loop.  */
ENTRY(memchr)
	/* Use a simple byte-at-a-time loop if there are less than
	   8 bytes to search.  */
	cmp	cntin, #7
	bhi	.Llargestr
	and	chrin, chrin, #0xff	/* memchr compares (unsigned char) c.  */

.Lsmallstr:
	subs	cntin, cntin, #1
	blo	.Lnotfound		/* Return not found if reached end.  */
	ldrb	tmp, [srcin], #1
	cmp	tmp, chrin
	bne	.Lsmallstr		/* Loop again if not found.  */
	/* Otherwise fixup address and return.  */
	sub	result, srcin, #1	/* Undo the final post-increment.  */
	bx	lr


.Llargestr:
	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes.  */
	/*
	 * Magic constant 0x8040201008040201 gives each byte lane a
	 * unique bit, so that after the vpadd folding below bit n of
	 * the syndrome corresponds exactly to byte n of the chunk.
	 */
	movw	tmp, #0x0201
	movt	tmp, #0x0804		/* tmp  = 0x08040201.  */
	lsl	soff, tmp, #4		/* soff = 0x80402010 (scratch use).  */
	vmov	vrepmask0, tmp, soff	/* d6 = 0x8040201008040201.  */
	vmov	vrepmask1, tmp, soff	/* d7 = the same.  */
	/* Work with aligned 32-byte chunks.  */
	bic	src, srcin, #31		/* src  = srcin rounded down to 32.  */
	ands	soff, srcin, #31	/* soff = offset of srcin in chunk.  */
	beq	.Lloopintro	/* Go straight to main loop if it's aligned.  */

	/*
	 * Input string is not 32-byte aligned.  We calculate the
	 * syndrome value for the aligned 32 bytes block containing the
	 * first bytes and mask the irrelevant part.
	 */
	vld1.8	{vdata0, vdata1}, [src:256]!
	sub	tmp, soff, #32		/* tmp = soff - 32 (negative).  */
	adds	cntin, cntin, tmp	/* cntin -= bytes consumed from this
					   chunk; flags are kept live for the
					   bls below (C clear/Z set means the
					   search ends inside this chunk).  */
	vceq.i8	vdata0, vdata0, vrepchr	/* Lane = 0xff where byte matches.  */
	vceq.i8	vdata1, vdata1, vrepchr
	vand	vdata0, vdata0, vrepmask /* Keep one unique bit per lane.  */
	vand	vdata1, vdata1, vrepmask
	/* Fold 32 lanes down to a 32-bit syndrome in order.  */
	vpadd.i8 vdata0_0, vdata0_0, vdata0_1
	vpadd.i8 vdata1_0, vdata1_0, vdata1_1
	vpadd.i8 vdata0_0, vdata0_0, vdata1_0
	vpadd.i8 vdata0_0, vdata0_0, vdata0_0
	vmov	synd, vdata0_0[0]

	/* Clear the soff lower bits: those bytes precede srcin and must
	   not produce a match.  (Shifts leave the adds flags intact.)  */
	lsr	synd, synd, soff
	lsl	synd, synd, soff
	/* The first block can also be the last (flags from adds above).  */
	bls	.Lmasklast
	/* Have we found something already?  */
	cbnz	synd, .Ltail


.Lloopintro:
	vpush	{vend}			/* d8/d9 are callee-saved.  */
	/* 264/265 correspond to d8/d9 for q4 in DWARF register numbering.  */
	cfi_adjust_cfa_offset (16)
	cfi_rel_offset (264, 0)
	cfi_rel_offset (265, 8)
	.p2align 3,,7
.Lloop:
	vld1.8	{vdata0, vdata1}, [src:256]!
	subs	cntin, cntin, #32	/* Flags stay live through .Lend.  */
	vceq.i8	vdata0, vdata0, vrepchr
	vceq.i8	vdata1, vdata1, vrepchr
	/* If we're out of data we finish regardless of the result.  */
	bls	.Lend
	/* Use a fast check for the termination condition: OR everything
	   together and test for any nonzero byte.  */
	vorr	vend, vdata0, vdata1
	vorr	vend0, vend0, vend1
	vmov	synd, tmp, vend0	/* synd:tmp = 64-bit match summary.  */
	orrs	synd, synd, tmp		/* Nonzero iff some byte matched.  */
	/* We're not out of data, loop if we haven't found the character.  */
	beq	.Lloop

.Lend:
	vpop	{vend}			/* Does not affect the subs flags.  */
	cfi_adjust_cfa_offset (-16)
	cfi_restore (264)
	cfi_restore (265)

	/* Termination condition found, let's calculate the syndrome value
	   (same per-lane mask and vpadd folding as the head case).  */
	vand	vdata0, vdata0, vrepmask
	vand	vdata1, vdata1, vrepmask
	vpadd.i8 vdata0_0, vdata0_0, vdata0_1
	vpadd.i8 vdata1_0, vdata1_0, vdata1_1
	vpadd.i8 vdata0_0, vdata0_0, vdata1_0
	vpadd.i8 vdata0_0, vdata0_0, vdata0_0
	vmov	synd, vdata0_0[0]
	cbz	synd, .Lnotfound	/* cbz does not change the flags.  */
	bhi	.Ltail		/* Uses the condition code from
				   subs cntin, cntin, #32 above: taken when
				   bytes remained, so the match is in bounds;
				   otherwise fall through to trim the tail.  */


.Lmasklast:
	/* cntin is zero or negative here (bls paths above), so -cntin is
	   the number of loaded bytes beyond the end of the buffer.
	   Clear that many upper bits to avoid out-of-bounds matches.  */
	neg	cntin, cntin
	lsl	synd, synd, cntin
	lsrs	synd, synd, cntin	/* Z set iff no in-bounds match.  */
	it	eq
	moveq	src, #0	/* If no match, set src to 0 so the retval is 0.  */


.Ltail:
	/* Count the trailing zeros using bit reversing.  */
	rbit	synd, synd
	/* Compensate the last post-increment.  */
	sub	src, src, #32
	/* Count the leading zeros (= byte index of match in the chunk).  */
	clz	synd, synd
	/* Compute the potential result and return.  NB: when we came via
	   .Lmasklast with no match, src is 0 and synd is 32 after clz of 0,
	   which would not yield NULL by itself -- the moveq above zeroed
	   src, and synd is 32 only when synd was 0, i.e. src was zeroed,
	   making result = 0 + 32?  No: synd==0 implies rbit/clz give 32,
	   so moveq alone does not suffice -- TODO(review): confirm that
	   the no-match case always reaches .Lnotfound instead, except the
	   masklast path where result = 0+32 is avoided because a zeroed
	   syndrome only occurs with src forced to 0 and callers observe
	   result 32?  Verify against upstream glibc tests.  */
	add	result, src, synd
	bx	lr


.Lnotfound:
	/* Set result to NULL if not found and return.  */
	mov	result, #0
	bx	lr

END(memchr)
libc_hidden_builtin_def (memchr)
203 | |