1/* memcmp - compare memory
2
3 Copyright (C) 2013-2022 Free Software Foundation, Inc.
4
5 This file is part of the GNU C Library.
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library. If not, see
19 <https://www.gnu.org/licenses/>. */
20
21#include <sysdep.h>
22
23/* Assumptions:
24 *
25 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
26 */
27
/* Parameters and result.  */
#define src1 x0		/* First buffer (advanced by the loops).  */
#define src2 x1		/* Second buffer (advanced by the loops).  */
#define limit x2	/* Byte count; later biased/decremented.  */
#define result w0	/* Return value: <0, 0 or >0.  */

/* Internal variables.  */
#define data1 x3	/* 8-byte chunk from src1.  */
#define data1w w3	/* 32-bit view of data1.  */
#define data2 x4	/* 8-byte chunk from src2.  */
#define data2w w4	/* 32-bit view of data2.  */
#define data3 x5	/* Second 8-byte chunk from src1.  */
#define data3w w5	/* 32-bit view of data3.  */
#define data4 x6	/* Second 8-byte chunk from src2.  */
#define data4w w6	/* 32-bit view of data4.  */
#define tmp x6		/* Scratch; aliases data4 — only live where data4 is dead.  */
#define src1end x7	/* src1 + limit: for overlapping tail loads.  */
#define src2end x8	/* src2 + limit: for overlapping tail loads.  */
44
45
/* int memcmp (const void *src1, const void *src2, size_t limit)

   Compare the first LIMIT bytes of SRC1 and SRC2; return 0 if they are
   equal, otherwise the sign of the difference of the first differing
   byte (compared as unsigned chars).

   Strategy: sizes < 16 use branchy overlapping head/tail loads;
   16..159 bytes use a 32-bytes-per-iteration GP-register loop;
   >= 160 bytes align src2 and use a 64-bytes-per-iteration SIMD loop.  */
ENTRY (memcmp)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp limit, 16
	b.lo L(less16)
	/* Compare the first 16 bytes unconditionally.  The ccmp chain
	   folds everything into one branch: if either 8-byte pair
	   differs, OR limit == 16 (so nothing remains after these 16
	   bytes), the final flags are "ne" and we go straight to the
	   return path, which recomputes the result from data1..data4.  */
	ldp data1, data3, [src1]
	ldp data2, data4, [src2]
	ccmp data1, data2, 0, ne
	ccmp data3, data4, 0, eq
	b.ne L(return2)

	/* End pointers allow the final 16 bytes to be compared with a
	   single (possibly overlapping) unaligned load pair.  */
	add src1end, src1, limit
	add src2end, src2, limit
	cmp limit, 32
	b.ls L(last_bytes)		/* 17..32 bytes: only the tail is left.  */
	cmp limit, 160
	b.hs L(loop_align)		/* Large inputs: SIMD loop.  */
	sub limit, limit, 32		/* Bias limit for the 32-byte loop below.  */

	.p2align 4
L(loop32):
	/* Medium sizes: compare 32 bytes per iteration.  Offsets start
	   at 16 because bytes [0,16) were already checked above; the
	   pointers are advanced by 32 at the end of each iteration.  */
	ldp data1, data3, [src1, 16]
	ldp data2, data4, [src2, 16]
	cmp data1, data2
	ccmp data3, data4, 0, eq
	b.ne L(return2)
	cmp limit, 16
	b.ls L(last_bytes)		/* <= 16 bytes remain: finish via tail.  */

	ldp data1, data3, [src1, 32]
	ldp data2, data4, [src2, 32]
	cmp data1, data2
	ccmp data3, data4, 0, eq
	b.ne L(return2)
	add src1, src1, 32
	add src2, src2, 32
L(last64):
	/* Also the re-entry point from the SIMD loop when all compared
	   bytes matched and 1-64 bytes remain.  */
	subs limit, limit, 32
	b.hi L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  */
L(last_bytes):
	ldp data1, data3, [src1end, -16]
	ldp data2, data4, [src2end, -16]
L(return2):
	/* Reduce the two 8-byte pairs to one: keep (data1,data2) if
	   they differ, otherwise fall back to (data3,data4).  */
	cmp data1, data2
	csel data1, data1, data3, ne
	csel data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/* On little-endian, byte-reverse both chunks so an unsigned
	   64-bit compare orders by the lowest-addressed byte first,
	   matching memcmp semantics.  */
	rev data1, data1
	rev data2, data2
#endif
	cmp data1, data2
	cset result, ne			/* 0 if equal, 1 otherwise...  */
	cneg result, result, lo		/* ...negated when data1 < data2.  */
	ret

	.p2align 4
L(less16):
	/* 0-15 bytes: test size bits and compare overlapping head and
	   tail chunks so no per-byte loop is needed.  */
	add src1end, src1, limit
	add src2end, src2, limit
	tbz limit, 3, L(less8)
	/* 8-15 bytes: first 8 and last 8 bytes (may overlap).  */
	ldr data1, [src1]
	ldr data2, [src2]
	ldr data3, [src1end, -8]
	ldr data4, [src2end, -8]
	b L(return2)

	.p2align 4
L(less8):
	tbz limit, 2, L(less4)
	/* 4-7 bytes: first 4 and last 4 bytes (may overlap); the w
	   loads zero-extend, so the 64-bit return path still works.  */
	ldr data1w, [src1]
	ldr data2w, [src2]
	ldr data3w, [src1end, -4]
	ldr data4w, [src2end, -4]
	b L(return2)

L(less4):
	tbz limit, 1, L(less2)
	/* 2-3 bytes: compare the leading halfword.  */
	ldrh data1w, [src1]
	ldrh data2w, [src2]
	cmp data1w, data2w
	b.ne L(return)
L(less2):
	mov result, 0
	tbz limit, 0, L(return_zero)	/* Even size: everything matched.  */
	/* Odd size: the final byte is the only one not yet compared.  */
	ldrb data1w, [src1end, -1]
	ldrb data2w, [src2end, -1]
	sub result, data1w, data2w	/* Byte difference is the result.  */
L(return_zero):
	ret

L(loop_align):
	/* >= 160 bytes.  First compare bytes [16,32); this also covers
	   the up-to-16 bytes skipped by the alignment step below.  */
	ldp data1, data3, [src1, 16]
	ldp data2, data4, [src2, 16]
	cmp data1, data2
	ccmp data3, data4, 0, eq
	b.ne L(return2)

	/* Align src2 and adjust src1, src2 and limit.  */
	and tmp, src2, 15
	sub tmp, tmp, 16		/* tmp = -(bytes to next 16-boundary), in [-16,-1].  */
	sub src2, src2, tmp		/* src2 rounded up to a 16-byte boundary.  */
	add limit, limit, tmp
	sub src1, src1, tmp		/* Keep src1 in step (may stay unaligned).  */
	sub limit, limit, 64 + 16	/* Bias: each iteration covers [src+16, src+80).  */

	.p2align 4
L(loop64):
	/* Compare 64 bytes per iteration: XOR corresponding 16-byte
	   quads, then reduce with pairwise unsigned max so d0 is
	   nonzero iff any of the 64 bytes differ.  src2 loads are now
	   16-byte aligned.  */
	ldr q0, [src1, 16]
	ldr q1, [src2, 16]
	subs limit, limit, 64
	ldr q2, [src1, 32]
	ldr q3, [src2, 32]
	eor v0.16b, v0.16b, v1.16b
	eor v1.16b, v2.16b, v3.16b
	ldr q2, [src1, 48]
	ldr q3, [src2, 48]
	umaxp v0.16b, v0.16b, v1.16b
	ldr q4, [src1, 64]!		/* Writeback advances src1/src2 by 64.  */
	ldr q5, [src2, 64]!
	eor v1.16b, v2.16b, v3.16b
	eor v2.16b, v4.16b, v5.16b
	umaxp v1.16b, v1.16b, v2.16b
	umaxp v0.16b, v0.16b, v1.16b
	umaxp v0.16b, v0.16b, v0.16b	/* Fold to 8 bytes in d0.  */
	fmov tmp, d0
	/* Stay in the loop only while more than 64 bytes remain (hi)
	   AND tmp == 0 (no difference found); otherwise flags force ne.  */
	ccmp tmp, 0, 0, hi
	b.eq L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
	add limit, limit, 64 + 16	/* Undo the bias applied above.  */
	cbz tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  */
#ifdef __AARCH64EB__
	/* NOTE(review): rev16 compensates for byte order within the
	   umaxp pairing on big-endian — assumed per upstream; verify
	   on an EB target.  */
	rev16 tmp, tmp
#endif
	rev tmp, tmp			/* Lowest-addressed nonzero byte -> MSB.  */
	clz tmp, tmp			/* Bit index of that byte * 8.  */
	bic tmp, tmp, 7			/* Round down to a multiple of 8.  */
	/* Each byte of tmp maps to one 8-byte chunk of the 64-byte
	   window, which now lies at offsets [-48, 16) from the updated
	   pointers; convert to that byte offset (in [-48, 8]).  */
	sub tmp, tmp, 48
	ldr data1, [src1, tmp]
	ldr data2, [src2, tmp]
#ifndef __AARCH64EB__
	/* Byte-reverse so unsigned compare orders by lowest address.  */
	rev data1, data1
	rev data2, data2
#endif
	mov result, 1
	cmp data1, data2
	cneg result, result, lo		/* -1 if data1 < data2, else 1.  */
	ret

END (memcmp)
/* bcmp only needs zero/nonzero, so plain memcmp satisfies it; #undef
   first in case a header defined it as a macro.  */
#undef bcmp
weak_alias (memcmp, bcmp)
/* __memcmpeq (equality-only memcmp used by the compiler/libc
   internally) is likewise satisfied by the full memcmp.  */
#undef __memcmpeq
strong_alias (memcmp, __memcmpeq)
/* Hidden definitions let intra-libc callers bind directly, avoiding
   PLT indirection.  */
libc_hidden_builtin_def (memcmp)
libc_hidden_def (__memcmpeq)
211

/* Source: glibc, sysdeps/aarch64/memcmp.S.  */