strlen.S source code [glibc/sysdeps/powerpc/powerpc64/strlen.S]

1	/ Optimized strlen implementation for PowerPC64.*
2	Copyright (C) 1997-2024 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <sysdep.h>
20
21	/ The algorithm here uses the following techniques:*
22
23	1) Given a word 'x', we can test to see if it contains any 0 bytes
24	by subtracting 0x01010101, and seeing if any of the high bits of each
25	byte changed from 0 to 1. This works because the least significant
26	0 byte must have had no incoming carry (otherwise it's not the least
27	significant), so it is 0x00 - 0x01 == 0xff. For all other
28	byte values, either they have the high bit set initially, or when
29	1 is subtracted you get a value in the range 0x00-0x7f, none of which
30	have their high bit set. The expression here is
31	(x + 0xfefefeff) & ~(x \| 0x7f7f7f7f), which gives 0x00000000 when
32	there were no 0x00 bytes in the word. You get 0x80 in bytes that
33	match, but possibly false 0x80 matches in the next more significant
34	byte to a true match due to carries. For little-endian this is
35	of no consequence since the least significant match is the one
36	we're interested in, but big-endian needs method 2 to find which
37	byte matches.
38
39	2) Given a word 'x', we can test to see _which_ byte was zero by
40	calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) \| x \| 0x7f7f7f7f).
41	This produces 0x80 in each byte that was zero, and 0x00 in all
42	the other bytes. The '\| 0x7f7f7f7f' clears the low 7 bits in each
43	byte, and the '\| x' part ensures that bytes with the high bit set
44	produce 0x00. The addition will carry into the high bit of each byte
45	iff that byte had one of its low 7 bits set. We can then just see
46	which was the most significant bit set and divide by 8 to find how
47	many to add to the index.
48	This is from the book 'The PowerPC Compiler Writer's Guide',
49	by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
50
51	We deal with strings not aligned to a word boundary by taking the
52	first word and ensuring that bytes not part of the string
53	are treated as nonzero. To allow for memory latency, we unroll the
54	loop a few times, being careful to ensure that we do not read ahead
55	across cache line boundaries.
56
57	Questions to answer:
58	1) How long are strings passed to strlen? If they're often really long,
59	we should probably use cache management instructions and/or unroll the
60	loop more. If they're often quite short, it might be better to use
61	fact (2) in the inner loop than have to recalculate it.
62	2) How popular are bytes with the high bit set? If they are very rare,
63	on some processors it might be useful to use the simpler expression
64	~((x - 0x01010101) \| 0x7f7f7f7f) (that is, on processors with only one
65	ALU), but this fails when any character has its high bit set.
66
67	Answer:
68	1) Added a Data Cache Block Touch early to prefetch the first 128
69	byte cache line. Adding dcbt instructions to the loop would not be
70	effective since most strings will be shorter than the cache line. /*
71
72	/ Some notes on register usage: Under the SVR4 ABI, we can use registers*
73	0 and 3 through 12 (so long as we don't call any procedures) without
74	saving them. We can also use registers 14 through 31 if we save them.
75	We can't use r1 (it's the stack pointer), r2 nor r13 because the user
76	program may expect them to hold their usual value if we get sent
77	a signal. Integer parameters are passed in r3 through r10.
78	We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving
79	them, the others we must save. /*
80
81	/ int [r3] strlen (char s [r3]) /*
82
83	#ifndef STRLEN
84	# define STRLEN strlen
85	#endif
86
87	ENTRY_TOCLESS (STRLEN)
88	CALL_MCOUNT `1`
89
90	#define rTMP4 r0
91	#define rRTN r3 /* incoming STR arg, outgoing result */
92	#define rSTR r4 /* current string position */
93	#define rPADN r5 /* number of padding bits we prepend to the
94	string to make it start at a word boundary */
95	#define rFEFE r6 /* constant 0xfefefefefefefeff (-0x0101010101010101) */
96	#define r7F7F r7 /* constant 0x7f7f7f7f7f7f7f7f */
97	#define rWORD1 r8 /* current string doubleword */
98	#define rWORD2 r9 /* next string doubleword */
99	#define rMASK r9 /* mask for first string doubleword */
100	#define rTMP1 r10
101	#define rTMP2 r11
102	#define rTMP3 r12
103
104	dcbt `0`,rRTN
105	clrrdi rSTR, rRTN, `3`
106	lis r7F7F, `0x7f7f`
107	rlwinm rPADN, rRTN, `3`, `26`, `28`
108	ld rWORD1, `0`(rSTR)
109	addi r7F7F, r7F7F, `0x7f7f`
110	li rMASK, -`1`
111	insrdi r7F7F, r7F7F, `32`, `0`
112	/ We use method (2) on the first two doublewords, because rFEFE isn't*
113	required which reduces setup overhead. Also gives a faster return
114	for small strings on big-endian due to needing to recalculate with
115	method (2) anyway. /*
116	#ifdef __LITTLE_ENDIAN__
117	sld rMASK, rMASK, rPADN
118	#else
119	srd rMASK, rMASK, rPADN
120	#endif
121	and rTMP1, r7F7F, rWORD1
122	or rTMP2, r7F7F, rWORD1
123	lis rFEFE, -`0x101`
124	add rTMP1, rTMP1, r7F7F
125	addi rFEFE, rFEFE, -`0x101`
126	nor rTMP3, rTMP2, rTMP1
127	and. rTMP3, rTMP3, rMASK
128	mtcrf `0x01`, rRTN
129	bne L(done0)
130	sldi rTMP1, rFEFE, `32`
131	add rFEFE, rFEFE, rTMP1
132	/ Are we now aligned to a doubleword boundary? /
133	bt `28`, L(loop)
134
135	/ Handle second doubleword of pair. /
136	/ Perhaps use method (1) here for little-endian, saving one instruction? /
137	ldu rWORD1, `8`(rSTR)
138	and rTMP1, r7F7F, rWORD1
139	or rTMP2, r7F7F, rWORD1
140	add rTMP1, rTMP1, r7F7F
141	nor. rTMP3, rTMP2, rTMP1
142	bne L(done0)
143
144	/ The loop. /
145
146	L(loop):
147	ld rWORD1, `8`(rSTR)
148	ldu rWORD2, `16`(rSTR)
149	add rTMP1, rFEFE, rWORD1
150	nor rTMP2, r7F7F, rWORD1
151	and. rTMP1, rTMP1, rTMP2
152	add rTMP3, rFEFE, rWORD2
153	nor rTMP4, r7F7F, rWORD2
154	bne L(done1)
155	and. rTMP3, rTMP3, rTMP4
156	beq L(loop)
157
158	#ifndef __LITTLE_ENDIAN__
159	and rTMP1, r7F7F, rWORD2
160	add rTMP1, rTMP1, r7F7F
161	andc rTMP3, rTMP4, rTMP1
162	b L(done0)
163
164	L(done1):
165	and rTMP1, r7F7F, rWORD1
166	subi rSTR, rSTR, `8`
167	add rTMP1, rTMP1, r7F7F
168	andc rTMP3, rTMP2, rTMP1
169
170	/ When we get to here, rSTR points to the first doubleword in the string that*
171	contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, and 0x00
172	otherwise. /*
173	L(done0):
174	cntlzd rTMP3, rTMP3
175	subf rTMP1, rRTN, rSTR
176	srdi rTMP3, rTMP3, `3`
177	add rRTN, rTMP1, rTMP3
178	blr
179	#else
180
181	L(done0):
182	addi rTMP1, rTMP3, -`1` / Form a mask from trailing zeros. /
183	andc rTMP1, rTMP1, rTMP3
184	cntlzd rTMP1, rTMP1 / Count bits not in the mask. /
185	subf rTMP3, rRTN, rSTR
186	subfic rTMP1, rTMP1, `64`-`7`
187	srdi rTMP1, rTMP1, `3`
188	add rRTN, rTMP1, rTMP3
189	blr
190
191	L(done1):
192	addi rTMP3, rTMP1, -`1`
193	andc rTMP3, rTMP3, rTMP1
194	cntlzd rTMP3, rTMP3
195	subf rTMP1, rRTN, rSTR
196	subfic rTMP3, rTMP3, `64`-`7`-`64`
197	sradi rTMP3, rTMP3, `3`
198	add rRTN, rTMP1, rTMP3
199	blr
200	#endif
201
202	END (STRLEN)
203	libc_hidden_builtin_def (strlen)
204

source code of glibc/sysdeps/powerpc/powerpc64/strlen.S