/* memcmp - compare memory

   Copyright (C) 2013-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#define src1	x0
#define src2	x1
#define limit	x2
#define result	w0

#define data1	x3
#define data1w	w3
#define data2	x4
#define data2w	w4
#define data3	x5
#define data3w	w5
#define data4	x6
#define data4w	w6
#define tmp	x6
#define src1end	x7
#define src2end	x8
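
/* Note that tmp shares x6 with data4: data4 is always dead by the time tmp
   is written, so the two never carry live values at the same time.  */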


ENTRY (memcmp)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

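	/* Dispatch on size.  For 16 bytes and up, compare the first 16
	   bytes immediately; the ccmp chain folds both 8-byte comparisons
	   (and the limit == 16 case) into a single branch.  */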
	cmp	limit, 16
	b.lo	L(less16)
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)
	cmp	limit, 160
	b.hs	L(loop_align)
	sub	limit, limit, 32

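	/* Medium sizes: compare 32 bytes per iteration, 16 bytes at a time;
	   limit is biased by 32 so the remaining-bytes test stays cheap.  */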
	.p2align 4
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  */
L(last_bytes):
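	/* The loads are anchored at the end of the buffers, so they may
	   overlap bytes that have already been compared equal.  */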
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
L(return2):
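	/* Select the first pair of words that differs: if data1 and data2
	   are equal, compare data3 and data4 instead.  */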
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
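	/* With both words now in big-endian byte order, an unsigned compare
	   ranks them by their first differing byte.  Roughly, in C:
	     return (data1 > data2) - (data1 < data2);
	   with data1 and data2 as unsigned 64-bit values.  */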
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret

	.p2align 4
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
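	/* 8-15 bytes: compare the first 8 and the last 8 bytes with
	   overlapping loads.  */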
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
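	/* 4-7 bytes: the same overlapping trick with 4-byte loads.  */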
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
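	/* 2-3 bytes: compare the leading halfword here; the final byte (and
	   the 0-1 byte case) is handled in L(less2) below.  */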
	tbz	limit, 1, L(less2)
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w
L(return_zero):
	ret

L(loop_align):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	sub	limit, limit, 64 + 16
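	/* tmp = (src2 & 15) - 16 lies in [-16, -1], so subtracting it
	   advances both pointers by 1-16 bytes and leaves src2 16-byte
	   aligned; the bytes skipped over were already compared above.  The
	   extra 64 + 16 bias on limit matches the loop's pre-increment
	   addressing.  */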
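
	/* Large sizes: compare 64 bytes per iteration.  The eor results are
	   nonzero wherever the sources differ; three umaxp steps fold the
	   four 16-byte difference vectors into 8 bytes, so a single fmov
	   and compare detect any mismatch.  The final ccmp also folds in
	   the limit test: the loop continues only while bytes remain (hi)
	   and tmp is zero.  */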
	.p2align 4
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp
	bic	tmp, tmp, 7
	sub	tmp, tmp, 48
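	/* Each byte of tmp summarizes 8 source bytes of the four blocks just
	   compared; rev plus clz locate the first nonzero summary byte, bic
	   rounds the bit index down to a multiple of 8, and the -48 bias
	   converts it into a byte offset relative to the updated src1/src2
	   (the four blocks now sit at offsets -48 through +15).  */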
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo
	ret

END (memcmp)
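
/* bcmp needs only a zero/nonzero result, and __memcmpeq must return 0
   exactly when the buffers are equal; memcmp's return value satisfies
   both contracts, so each can simply alias it.  */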
#undef bcmp
weak_alias (memcmp, bcmp)
#undef __memcmpeq
strong_alias (memcmp, __memcmpeq)
libc_hidden_builtin_def (memcmp)
libc_hidden_def (__memcmpeq)