memcmp.S source code [glibc/sysdeps/ia64/memcmp.S]

1	/* Optimized version of the standard memcmp() function.
2	This file is part of the GNU C Library.
3	Copyright (C) 2000-2022 Free Software Foundation, Inc.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. */
18
19	/* Return: the result of the comparison
20
21	Inputs:
22	in0: dest (aka s1)
23	in1: src (aka s2)
24	in2: byte count
25
26	In this form, it assumes little endian mode. For big endian mode,
27	the two shifts in .l2 must be inverted:
28
29	shl tmp1[0] = r[1 + MEMLAT], sh1 // tmp1 = w0 << sh1
30	shr.u tmp2[0] = r[0 + MEMLAT], sh2 // tmp2 = w1 >> sh2
31
32	and all the mux1 instructions should be replaced by plain mov's. */
33
34	#include <sysdep.h>
35	#undef ret
36
37	#define OP_T_THRES 16 /* lengths <= this go straight to the byte loop at .cmpfew */
38	#define OPSIZ 8 /* word size in bytes; in the code visible here it appears only in comments */
39	#define MEMLAT 2 /* stage offset between a pipelined load (p[0]) and its first use (p[MEMLAT]); also sizes the rotating arrays */
40
41	#define start r15 /* not referenced by the code visible in this file */
42	#define saved_pr r17 /* copy of pr taken on entry, restored before every return */
43	#define saved_lc r18 /* copy of ar.lc taken on entry, restored before every return */
44	#define dest r19 /* running pointer into the first buffer (in0) */
45	#define src r20 /* running pointer into the second buffer (in1) */
46	#define len r21 /* bytes still to compare (in2, reduced as the routine progresses) */
47	#define asrc r22 /* src rounded down to a multiple of 8; load pointer for the .l2 loop */
48	#define tmp r23 /* scratch */
49	#define value1 r24 /* datum taken from the src side for the current comparison */
50	#define value2 r25 /* datum taken from the dest side for the current comparison */
51	#define sh2 r28 /* 64 - sh1 */
52	#define sh1 r29 /* 8 * (src % 8): bit offset of src within its aligned word */
53	#define loopcnt r30 /* iteration count (minus one) that gets loaded into ar.lc */
54
// memcmp: compare in2 bytes at in0 (dest, s1) with in2 bytes at in1 (src, s2).
// ret0 is 0 when every byte matches.  On a mismatch found in a byte loop
// ret0 = dest byte - src byte; on a mismatch found in a word loop ret0 is
// -1 when the dest side is lower (unsigned) and 1 otherwise.
// Strategy: lengths <= OP_T_THRES use the byte loop only.  Otherwise bytes
// are compared until dest is 8-byte aligned, then 8 bytes at a time (.l3 if
// src is aligned as well, .l2 with shift-and-merge if not), then the
// remaining len % 8 bytes in the byte loop.
55	ENTRY(memcmp)
56	.prologue
57	alloc r2 = ar.pfs, `3`, `37`, `0`, `40` // frame: 3 inputs, 37 locals, 0 outputs, 40 rotating
58
59	.rotr r[MEMLAT + `2`], q[MEMLAT + `5`], tmp1[`4`], tmp2[`4`], val[`2`] // names for the rotating general registers
60	.rotp p[MEMLAT + `4` + `1`] // names for the rotating stage predicates
61
62	mov ret0 = r0 // by default return value = 0
63	.save pr, saved_pr
64	mov saved_pr = pr // save the predicate registers
65	.save ar.lc, saved_lc
66	mov saved_lc = ar.lc // save the loop counter
67	.body
68	mov dest = in0 // dest
69	mov src = in1 // src
70	mov len = in2 // len
71	sub tmp = r0, in0 // tmp = -dest
72	;;
73	and loopcnt = `7`, tmp // loopcnt = -dest % 8 (bytes until dest is 8-byte aligned)
74	cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
75	(p6) br.cond.spnt .cmpfew // compare byte by byte
76	;;
77	cmp.eq p6, p0 = loopcnt, r0 // is dest already 8-byte aligned?
78	(p6) br.cond.sptk .dest_aligned
79	sub len = len, loopcnt // len -= -dest % 8
80	adds loopcnt = -`1`, loopcnt // --loopcnt
81	;;
82	mov ar.lc = loopcnt
83	.l1: // compare -dest % 8 bytes
84	ld1 value1 = [src], `1` // value1 = *src++
85	ld1 value2 = [dest], `1` // value2 = *dest++
86	;;
87	cmp.ne p6, p0 = value1, value2
88	(p6) br.cond.spnt .done // mismatch: .done computes the result
89	br.cloop.dptk .l1 // falls through to .dest_aligned once LC is exhausted
90	.dest_aligned:
91	and sh1 = `7`, src // sh1 = src % 8
92	and tmp = -`8`, len // tmp = len & -OPSIZ
93	and asrc = -`8`, src // asrc = src & -OPSIZ -- align src
94	shr.u loopcnt = len, `3` // loopcnt = len / 8
95	and len = `7`, len ;; // len = len % 8
96	shl sh1 = sh1, `3` // sh1 = 8 * (src % 8)
97	adds loopcnt = -`1`, loopcnt // --loopcnt
98	mov pr.rot = `1` << `16` ;; // set rotating predicates: only p16 (= p[0]) is on
99	sub sh2 = `64`, sh1 // sh2 = 64 - sh1
100	mov ar.lc = loopcnt // set LC
101	cmp.eq p6, p0 = sh1, r0 // is the src aligned?
102	(p6) br.cond.sptk .src_aligned
103	add src = src, tmp // src += len & -OPSIZ (.l2 reads through asrc; src is only used again at .cmpfew)
104	mov ar.ec = MEMLAT + `4` + `1` // four more passes needed
105	ld8 r[`1`] = [asrc], `8` ;; // r[1] = w0
106	.align `32`
107
108	// We enter this loop with p6 cleared by the above comparison
109
110	.l2:
111	(p[`0`]) ld8 r[`0`] = [asrc], `8` // r[0] = w1
112	(p[`0`]) ld8 q[`0`] = [dest], `8` // q[0] = next dest word
113	(p[MEMLAT]) shr.u tmp1[`0`] = r[`1` + MEMLAT], sh1 // tmp1 = w0 >> sh1
114	(p[MEMLAT]) shl tmp2[`0`] = r[`0` + MEMLAT], sh2 // tmp2 = w1 << sh2
115	(p[MEMLAT+`4`]) cmp.ne p6, p0 = q[MEMLAT + `4`], val[`1`] // dest word vs. the src word merged one stage earlier
116	(p[MEMLAT+`3`]) or val[`0`] = tmp1[`3`], tmp2[`3`] // val = tmp1 | tmp2
117	(p6) br.cond.spnt .l2exit // mismatch: leave with p6 still set
118	br.ctop.sptk .l2
119	br.cond.sptk .cmpfew // all words equal: compare the len % 8 tail bytes
120	.l3exit:
121	mux1 value1 = r[MEMLAT], @rev // byte-reverse so the unsigned compare below orders by the first differing byte
122	mux1 value2 = q[MEMLAT], @rev
123	cmp.ne p6, p0 = r0, r0 ;; // clear p6 so the predicated mux1 pair below is skipped
124	.l2exit:
125	(p6) mux1 value1 = val[`1`], @rev // reached from .l2 with p6 set: take the merged src word
126	(p6) mux1 value2 = q[MEMLAT + `4`], @rev ;; // ... and the dest word it was compared with
127	cmp.ltu p6, p7 = value2, value1 ;; // p6: dest side < src side (unsigned); p7: otherwise
128	(p6) mov ret0 = -`1`
129	(p7) mov ret0 = `1`
130	mov pr = saved_pr, -`1` // restore the predicate registers
131	mov ar.lc = saved_lc // restore the loop counter
132	br.ret.sptk.many b0
133	.src_aligned:
134	cmp.ne p6, p0 = r0, r0 // clear p6
135	mov ar.ec = MEMLAT + `1` ;; // set EC
136	.l3:
137	(p[`0`]) ld8 r[`0`] = [src], `8` // both pointers aligned: load the words directly
138	(p[`0`]) ld8 q[`0`] = [dest], `8`
139	(p[MEMLAT]) cmp.ne p6, p0 = r[MEMLAT], q[MEMLAT] // compare the pair loaded MEMLAT stages ago
140	(p6) br.cond.spnt .l3exit
141	br.ctop.dptk .l3 ;; // loop finished with no mismatch: fall through to .cmpfew
142	.cmpfew:
143	cmp.eq p6, p0 = len, r0 // is len == 0 ?
144	adds len = -`1`, len // --len;
145	(p6) br.cond.spnt .restore_and_exit ;; // nothing left to compare: ret0 is still 0
146	mov ar.lc = len // LC = byte count - 1
147	.l4:
148	ld1 value1 = [src], `1` // value1 = *src++
149	ld1 value2 = [dest], `1` // value2 = *dest++
150	;;
151	cmp.ne p6, p0 = value1, value2
152	(p6) br.cond.spnt .done
153	br.cloop.dptk .l4 ;;
154	.done:
155	(p6) sub ret0 = value2, value1 // don't execute it if falling thru
156	.restore_and_exit:
157	mov pr = saved_pr, -`1` // restore the predicate registers
158	mov ar.lc = saved_lc // restore the loop counter
159	br.ret.sptk.many b0
160	END(memcmp)
161
162	weak_alias (memcmp, bcmp) /* macro defined outside this file; by its name, makes bcmp a weak alias of memcmp */
163	strong_alias (memcmp, __memcmpeq) /* macro defined outside this file; by its name, makes __memcmpeq a strong alias of memcmp */
164	libc_hidden_builtin_def (memcmp) /* NOTE(review): presumably the hidden-visibility definition for intra-library calls -- confirm against the macro's definition */
165	libc_hidden_def (__memcmpeq) /* NOTE(review): presumably the same for __memcmpeq -- confirm against the macro's definition */
166

source code of glibc/sysdeps/ia64/memcmp.S