1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * Author: Anton Blanchard <anton@au.ibm.com> |
4 | * Copyright 2015 IBM Corporation. |
5 | */ |
6 | #include <linux/export.h> |
7 | #include <asm/ppc_asm.h> |
8 | #include <asm/ppc-opcode.h> |
9 | |
10 | #define off8 r6 |
11 | #define off16 r7 |
12 | #define off24 r8 |
13 | |
14 | #define rA r9 |
15 | #define rB r10 |
16 | #define rC r11 |
17 | #define rD r27 |
18 | #define rE r28 |
19 | #define rF r29 |
20 | #define rG r30 |
21 | #define rH r31 |
22 | |
23 | #ifdef __LITTLE_ENDIAN__ |
24 | #define LH lhbrx |
25 | #define LW lwbrx |
26 | #define LD ldbrx |
27 | #define LVS lvsr |
28 | #define VPERM(_VRT,_VRA,_VRB,_VRC) \ |
29 | vperm _VRT,_VRB,_VRA,_VRC |
30 | #else |
31 | #define LH lhzx |
32 | #define LW lwzx |
33 | #define LD ldx |
34 | #define LVS lvsl |
35 | #define VPERM(_VRT,_VRA,_VRB,_VRC) \ |
36 | vperm _VRT,_VRA,_VRB,_VRC |
37 | #endif |
38 | |
39 | #define VMX_THRESH 4096 |
40 | #define ENTER_VMX_OPS \ |
41 | mflr r0; \ |
42 | std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ |
43 | std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ |
44 | std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ |
45 | std r0,16(r1); \ |
46 | stdu r1,-STACKFRAMESIZE(r1); \ |
47 | bl CFUNC(enter_vmx_ops); \ |
48 | cmpwi cr1,r3,0; \ |
49 | ld r0,STACKFRAMESIZE+16(r1); \ |
50 | ld r3,STK_REG(R31)(r1); \ |
51 | ld r4,STK_REG(R30)(r1); \ |
52 | ld r5,STK_REG(R29)(r1); \ |
53 | addi r1,r1,STACKFRAMESIZE; \ |
54 | mtlr r0 |
55 | |
56 | #define EXIT_VMX_OPS \ |
57 | mflr r0; \ |
58 | std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ |
59 | std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ |
60 | std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ |
61 | std r0,16(r1); \ |
62 | stdu r1,-STACKFRAMESIZE(r1); \ |
63 | bl CFUNC(exit_vmx_ops); \ |
64 | ld r0,STACKFRAMESIZE+16(r1); \ |
65 | ld r3,STK_REG(R31)(r1); \ |
66 | ld r4,STK_REG(R30)(r1); \ |
67 | ld r5,STK_REG(R29)(r1); \ |
68 | addi r1,r1,STACKFRAMESIZE; \ |
69 | mtlr r0 |
70 | |
71 | /* |
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned to
 * a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 * ^                                 ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
78 | * ^ |
79 | * _vaddr |
80 | * |
81 | * |
82 | * _vmask is the mask generated by LVS |
83 | * _v1st_qw is the 1st aligned QW of current addr which is already loaded. |
84 | * for example: 0xyyyyyyyyyyyyy012 for big endian |
85 | * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded. |
86 | * for example: 0x3456789abcdefzzz for big endian |
87 | * The permute result is saved in _v_res. |
88 | * for example: 0x0123456789abcdef for big endian. |
89 | */ |
90 | #define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \ |
91 | lvx _v2nd_qw,_vaddr,off16; \ |
92 | VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask) |
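
/*
 * Illustrative only, not part of the build: a rough C sketch of what the two
 * aligned lvx loads plus VPERM above reconstruct. The names (addr, tmp,
 * result) are made up for the sketch.
 *
 *	unsigned char tmp[32], result[16];
 *	unsigned long base = addr & ~15UL;
 *	size_t off = addr & 15;
 *	memcpy(tmp,      (void *)base,        16);	// 1st aligned QW
 *	memcpy(tmp + 16, (void *)(base + 16), 16);	// 2nd aligned QW
 *	memcpy(result,   tmp + off,           16);	// VPERM with the LVS mask
 */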
93 | |
94 | /* |
95 | * There are 2 categories for memcmp: |
96 | * 1) src/dst has the same offset to the 8 bytes boundary. The handlers |
97 | * are named like .Lsameoffset_xxxx |
98 | * 2) src/dst has different offset to the 8 bytes boundary. The handlers |
99 | * are named like .Ldiffoffset_xxxx |
100 | */ |
101 | _GLOBAL_TOC(memcmp) |
102 | cmpdi cr1,r5,0 |
103 | |
	/* Check whether the src/dst addresses have the same offset from
	 * an 8-byte alignment boundary; the result (cr0) is used at
	 * .Lno_short to pick the sameoffset vs diffoffset path.
	 */
107 | xor r6,r3,r4 |
108 | andi. r6,r6,7 |
109 | |
	/* Fall back to the short byte-by-byte loop if fewer than
	 * 8 bytes are to be compared.
	 */
113 | cmpdi cr6,r5,7 |
114 | |
115 | beq cr1,.Lzero |
116 | bgt cr6,.Lno_short |
117 | |
118 | .Lshort: |
119 | mtctr r5 |
120 | 1: lbz rA,0(r3) |
121 | lbz rB,0(r4) |
122 | subf. rC,rB,rA |
123 | bne .Lnon_zero |
124 | bdz .Lzero |
125 | |
126 | lbz rA,1(r3) |
127 | lbz rB,1(r4) |
128 | subf. rC,rB,rA |
129 | bne .Lnon_zero |
130 | bdz .Lzero |
131 | |
132 | lbz rA,2(r3) |
133 | lbz rB,2(r4) |
134 | subf. rC,rB,rA |
135 | bne .Lnon_zero |
136 | bdz .Lzero |
137 | |
138 | lbz rA,3(r3) |
139 | lbz rB,3(r4) |
140 | subf. rC,rB,rA |
141 | bne .Lnon_zero |
142 | |
143 | addi r3,r3,4 |
144 | addi r4,r4,4 |
145 | |
146 | bdnz 1b |
147 | |
148 | .Lzero: |
149 | li r3,0 |
150 | blr |
151 | |
152 | .Lno_short: |
153 | dcbt 0,r3 |
154 | dcbt 0,r4 |
155 | bne .Ldiffoffset_8bytes_make_align_start |
156 | |
157 | |
158 | .Lsameoffset_8bytes_make_align_start: |
	/* Compare the leading bytes that precede an 8-byte boundary so
	 * that the rest of the comparison can run on 8-byte aligned
	 * addresses.
	 */
162 | andi. r6,r3,7 |
163 | |
	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	 */
168 | rlwinm r6,r3,3,26,28 |
169 | beq .Lsameoffset_8bytes_aligned |
170 | clrrdi r3,r3,3 |
171 | clrrdi r4,r4,3 |
172 | LD rA,0,r3 |
173 | LD rB,0,r4 |
174 | sld rA,rA,r6 |
175 | sld rB,rB,r6 |
176 | cmpld cr0,rA,rB |
177 | srwi r6,r6,3 |
178 | bne cr0,.LcmpAB_lightweight |
179 | subfic r6,r6,8 |
180 | subf. r5,r6,r5 |
181 | addi r3,r3,8 |
182 | addi r4,r4,8 |
183 | beq .Lzero |
184 | |
185 | .Lsameoffset_8bytes_aligned: |
	/* Now both addresses are 8-byte aligned.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
189 | cmpdi cr6,r5,31 |
190 | bgt cr6,.Llong |
191 | |
192 | .Lcmp_lt32bytes: |
193 | /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */ |
194 | cmpdi cr5,r5,7 |
195 | srdi r0,r5,3 |
196 | ble cr5,.Lcmp_rest_lt8bytes |
197 | |
198 | /* handle 8 ~ 31 bytes */ |
199 | clrldi r5,r5,61 |
200 | mtctr r0 |
201 | 2: |
202 | LD rA,0,r3 |
203 | LD rB,0,r4 |
204 | cmpld cr0,rA,rB |
205 | addi r3,r3,8 |
206 | addi r4,r4,8 |
207 | bne cr0,.LcmpAB_lightweight |
208 | bdnz 2b |
209 | |
210 | cmpwi r5,0 |
211 | beq .Lzero |
212 | |
213 | .Lcmp_rest_lt8bytes: |
214 | /* |
215 | * Here we have less than 8 bytes to compare. At least s1 is aligned to |
216 | * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a |
217 | * page boundary, otherwise we might read past the end of the buffer and |
218 | * trigger a page fault. We use 4K as the conservative minimum page |
219 | * size. If we detect that case we go to the byte-by-byte loop. |
220 | * |
221 | * Otherwise the next double word is loaded from s1 and s2, and shifted |
222 | * right to compare the appropriate bits. |
223 | */ |
224 | clrldi r6,r4,(64-12) // r6 = r4 & 0xfff |
225 | cmpdi r6,0xff8 |
226 | bgt .Lshort |
227 | |
228 | subfic r6,r5,8 |
229 | slwi r6,r6,3 |
230 | LD rA,0,r3 |
231 | LD rB,0,r4 |
232 | srd rA,rA,r6 |
233 | srd rB,rB,r6 |
234 | cmpld cr0,rA,rB |
235 | bne cr0,.LcmpAB_lightweight |
236 | b .Lzero |
237 | |
238 | .Lnon_zero: |
239 | mr r3,rC |
240 | blr |
241 | |
242 | .Llong: |
243 | #ifdef CONFIG_ALTIVEC |
244 | BEGIN_FTR_SECTION |
	/* Try the vmx loop if the length is 4K bytes or more */
246 | cmpldi cr6,r5,VMX_THRESH |
247 | bge cr6,.Lsameoffset_vmx_cmp |
248 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) |
249 | |
250 | .Llong_novmx_cmp: |
251 | #endif |
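
	/* Illustrative only, not part of the build: each iteration of the
	 * unrolled loop below compares 32 bytes, keeping one CR field per
	 * doubleword pair (cr0: A/B, cr1: C/D, cr6: E/F, cr7: G/H) and
	 * issuing the loads for the next 32 bytes before the current compare
	 * results are consumed, to hide load latency. Roughly:
	 *
	 *	while (iterations--) {
	 *		load next A..H from s1/s2;	// 4 doublewords each
	 *		if (A != B) goto cmpAB;
	 *		if (C != D) goto cmpCD;
	 *		if (E != F) goto cmpEF;
	 *		if (G != H) goto cmpGH;
	 *		s1 += 32; s2 += 32;
	 *	}
	 */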
252 | /* At least s1 addr is aligned with 8 bytes */ |
253 | li off8,8 |
254 | li off16,16 |
255 | li off24,24 |
256 | |
257 | std r31,-8(r1) |
258 | std r30,-16(r1) |
259 | std r29,-24(r1) |
260 | std r28,-32(r1) |
261 | std r27,-40(r1) |
262 | |
263 | srdi r0,r5,5 |
264 | mtctr r0 |
265 | andi. r5,r5,31 |
266 | |
267 | LD rA,0,r3 |
268 | LD rB,0,r4 |
269 | |
270 | LD rC,off8,r3 |
271 | LD rD,off8,r4 |
272 | |
273 | LD rE,off16,r3 |
274 | LD rF,off16,r4 |
275 | |
276 | LD rG,off24,r3 |
277 | LD rH,off24,r4 |
278 | cmpld cr0,rA,rB |
279 | |
280 | addi r3,r3,32 |
281 | addi r4,r4,32 |
282 | |
283 | bdz .Lfirst32 |
284 | |
285 | LD rA,0,r3 |
286 | LD rB,0,r4 |
287 | cmpld cr1,rC,rD |
288 | |
289 | LD rC,off8,r3 |
290 | LD rD,off8,r4 |
291 | cmpld cr6,rE,rF |
292 | |
293 | LD rE,off16,r3 |
294 | LD rF,off16,r4 |
295 | cmpld cr7,rG,rH |
296 | bne cr0,.LcmpAB |
297 | |
298 | LD rG,off24,r3 |
299 | LD rH,off24,r4 |
300 | cmpld cr0,rA,rB |
301 | bne cr1,.LcmpCD |
302 | |
303 | addi r3,r3,32 |
304 | addi r4,r4,32 |
305 | |
306 | bdz .Lsecond32 |
307 | |
308 | .balign 16 |
309 | |
310 | 1: LD rA,0,r3 |
311 | LD rB,0,r4 |
312 | cmpld cr1,rC,rD |
313 | bne cr6,.LcmpEF |
314 | |
315 | LD rC,off8,r3 |
316 | LD rD,off8,r4 |
317 | cmpld cr6,rE,rF |
318 | bne cr7,.LcmpGH |
319 | |
320 | LD rE,off16,r3 |
321 | LD rF,off16,r4 |
322 | cmpld cr7,rG,rH |
323 | bne cr0,.LcmpAB |
324 | |
325 | LD rG,off24,r3 |
326 | LD rH,off24,r4 |
327 | cmpld cr0,rA,rB |
328 | bne cr1,.LcmpCD |
329 | |
330 | addi r3,r3,32 |
331 | addi r4,r4,32 |
332 | |
333 | bdnz 1b |
334 | |
335 | .Lsecond32: |
336 | cmpld cr1,rC,rD |
337 | bne cr6,.LcmpEF |
338 | |
339 | cmpld cr6,rE,rF |
340 | bne cr7,.LcmpGH |
341 | |
342 | cmpld cr7,rG,rH |
343 | bne cr0,.LcmpAB |
344 | |
345 | bne cr1,.LcmpCD |
346 | bne cr6,.LcmpEF |
347 | bne cr7,.LcmpGH |
348 | |
349 | .Ltail: |
350 | ld r31,-8(r1) |
351 | ld r30,-16(r1) |
352 | ld r29,-24(r1) |
353 | ld r28,-32(r1) |
354 | ld r27,-40(r1) |
355 | |
356 | cmpdi r5,0 |
357 | beq .Lzero |
358 | b .Lshort |
359 | |
360 | .Lfirst32: |
361 | cmpld cr1,rC,rD |
362 | cmpld cr6,rE,rF |
363 | cmpld cr7,rG,rH |
364 | |
365 | bne cr0,.LcmpAB |
366 | bne cr1,.LcmpCD |
367 | bne cr6,.LcmpEF |
368 | bne cr7,.LcmpGH |
369 | |
370 | b .Ltail |
371 | |
372 | .LcmpAB: |
373 | li r3,1 |
374 | bgt cr0,.Lout |
375 | li r3,-1 |
376 | b .Lout |
377 | |
378 | .LcmpCD: |
379 | li r3,1 |
380 | bgt cr1,.Lout |
381 | li r3,-1 |
382 | b .Lout |
383 | |
384 | .LcmpEF: |
385 | li r3,1 |
386 | bgt cr6,.Lout |
387 | li r3,-1 |
388 | b .Lout |
389 | |
390 | .LcmpGH: |
391 | li r3,1 |
392 | bgt cr7,.Lout |
393 | li r3,-1 |
394 | |
395 | .Lout: |
396 | ld r31,-8(r1) |
397 | ld r30,-16(r1) |
398 | ld r29,-24(r1) |
399 | ld r28,-32(r1) |
400 | ld r27,-40(r1) |
401 | blr |
402 | |
403 | .LcmpAB_lightweight: /* skip NV GPRS restore */ |
404 | li r3,1 |
405 | bgtlr |
406 | li r3,-1 |
407 | blr |
408 | |
409 | #ifdef CONFIG_ALTIVEC |
410 | .Lsameoffset_vmx_cmp: |
	/* Enter with src/dst addrs that have the same offset from an
	 * 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before using VMX instructions, which incur the penalty of saving
	 * and restoring 32 x 128-bit VMX registers, compare the first
	 * 32 bytes so that the ~80% of calls that fail early are caught
	 * cheaply.
	 */
420 | |
421 | li r0,4 |
422 | mtctr r0 |
423 | .Lsameoffset_prechk_32B_loop: |
424 | LD rA,0,r3 |
425 | LD rB,0,r4 |
426 | cmpld cr0,rA,rB |
427 | addi r3,r3,8 |
428 | addi r4,r4,8 |
429 | bne cr0,.LcmpAB_lightweight |
430 | addi r5,r5,-8 |
431 | bdnz .Lsameoffset_prechk_32B_loop |
432 | |
433 | ENTER_VMX_OPS |
434 | beq cr1,.Llong_novmx_cmp |
435 | |
436 | 3: |
	/* Check whether r4 has the same offset as r3 from a 16-byte
	 * boundary.
	 */
440 | xor r0,r3,r4 |
441 | andi. r0,r0,0xf |
442 | bne .Ldiffoffset_vmx_cmp_start |
443 | |
	/* len is at least VMX_THRESH minus the 32 bytes consumed by the
	 * pre-check. Align r3 further, to a 16-byte boundary.
	 */
446 | andi. rA,r3,8 |
447 | LD rA,0,r3 |
448 | beq 4f |
449 | LD rB,0,r4 |
450 | cmpld cr0,rA,rB |
451 | addi r3,r3,8 |
452 | addi r4,r4,8 |
453 | addi r5,r5,-8 |
454 | |
455 | beq cr0,4f |
	/* The call in EXIT_VMX_OPS may clobber cr0, so save and restore it
	 * around the call.
	 */
457 | mfocrf r5,128 |
458 | EXIT_VMX_OPS |
459 | mtocrf 128,r5 |
460 | b .LcmpAB_lightweight |
461 | |
462 | 4: |
	/* compare 32 bytes per loop iteration */
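
	/* Illustrative only, not part of the build: each pass of the vector
	 * loop below is roughly
	 *
	 *	if (sixteen_bytes_differ(s1, s2))	// lvx + vcmpequd., cr6
	 *		goto redo_with_scalar_loads;	// label 7
	 *	if (sixteen_bytes_differ(s1 + 16, s2 + 16))
	 *		goto advance_16_then_redo;	// label 6
	 *	s1 += 32; s2 += 32;
	 *
	 * The vector compare only reports equal/not-equal, so on a mismatch
	 * the offending 16 bytes are re-read with two scalar 8-byte loads
	 * (label 7) to compute the memcmp() ordering.
	 */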
464 | srdi r0,r5,5 |
465 | mtctr r0 |
466 | clrldi r5,r5,59 |
467 | li off16,16 |
468 | |
469 | .balign 16 |
470 | 5: |
471 | lvx v0,0,r3 |
472 | lvx v1,0,r4 |
473 | VCMPEQUD_RC(v0,v0,v1) |
474 | bnl cr6,7f |
475 | lvx v0,off16,r3 |
476 | lvx v1,off16,r4 |
477 | VCMPEQUD_RC(v0,v0,v1) |
478 | bnl cr6,6f |
479 | addi r3,r3,32 |
480 | addi r4,r4,32 |
481 | bdnz 5b |
482 | |
483 | EXIT_VMX_OPS |
484 | cmpdi r5,0 |
485 | beq .Lzero |
486 | b .Lcmp_lt32bytes |
487 | |
488 | 6: |
489 | addi r3,r3,16 |
490 | addi r4,r4,16 |
491 | |
492 | 7: |
	/* re-compare the mismatching 16 bytes with scalar loads to find the
	 * ordering
	 */
494 | EXIT_VMX_OPS |
495 | LD rA,0,r3 |
496 | LD rB,0,r4 |
497 | cmpld cr0,rA,rB |
498 | li off8,8 |
499 | bne cr0,.LcmpAB_lightweight |
500 | |
501 | LD rA,off8,r3 |
502 | LD rB,off8,r4 |
503 | cmpld cr0,rA,rB |
504 | bne cr0,.LcmpAB_lightweight |
505 | b .Lzero |
506 | #endif |
507 | |
508 | .Ldiffoffset_8bytes_make_align_start: |
	/* now try to align s1 to an 8-byte boundary */
510 | rlwinm r6,r3,3,26,28 |
511 | beq .Ldiffoffset_align_s1_8bytes |
512 | |
513 | clrrdi r3,r3,3 |
514 | LD rA,0,r3 |
515 | LD rB,0,r4 /* unaligned load */ |
516 | sld rA,rA,r6 |
517 | srd rA,rA,r6 |
518 | srd rB,rB,r6 |
519 | cmpld cr0,rA,rB |
520 | srwi r6,r6,3 |
521 | bne cr0,.LcmpAB_lightweight |
522 | |
523 | subfic r6,r6,8 |
524 | subf. r5,r6,r5 |
525 | addi r3,r3,8 |
526 | add r4,r4,r6 |
527 | |
528 | beq .Lzero |
529 | |
530 | .Ldiffoffset_align_s1_8bytes: |
531 | /* now s1 is aligned with 8 bytes. */ |
532 | #ifdef CONFIG_ALTIVEC |
533 | BEGIN_FTR_SECTION |
	/* only do vmx ops when the size is 4K bytes or more */
535 | cmpdi cr5,r5,VMX_THRESH |
536 | bge cr5,.Ldiffoffset_vmx_cmp |
537 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) |
538 | |
539 | .Ldiffoffset_novmx_cmp: |
540 | #endif |
541 | |
542 | |
543 | cmpdi cr5,r5,31 |
544 | ble cr5,.Lcmp_lt32bytes |
545 | |
546 | #ifdef CONFIG_ALTIVEC |
547 | b .Llong_novmx_cmp |
548 | #else |
549 | b .Llong |
550 | #endif |
551 | |
552 | #ifdef CONFIG_ALTIVEC |
553 | .Ldiffoffset_vmx_cmp: |
	/* Perform a 32-byte pre-check before enabling VMX operations.
	 */
557 | li r0,4 |
558 | mtctr r0 |
559 | .Ldiffoffset_prechk_32B_loop: |
560 | LD rA,0,r3 |
561 | LD rB,0,r4 |
562 | cmpld cr0,rA,rB |
563 | addi r3,r3,8 |
564 | addi r4,r4,8 |
565 | bne cr0,.LcmpAB_lightweight |
566 | addi r5,r5,-8 |
567 | bdnz .Ldiffoffset_prechk_32B_loop |
568 | |
569 | ENTER_VMX_OPS |
570 | beq cr1,.Ldiffoffset_novmx_cmp |
571 | |
572 | .Ldiffoffset_vmx_cmp_start: |
	/* First try to align r3 to a 16-byte boundary */
574 | andi. r6,r3,0xf |
575 | li off16,16 |
576 | beq .Ldiffoffset_vmx_s1_16bytes_align |
577 | |
578 | LVS v3,0,r3 |
579 | LVS v4,0,r4 |
580 | |
581 | lvx v5,0,r3 |
582 | lvx v6,0,r4 |
583 | LD_VSR_CROSS16B(r3,v3,v5,v7,v9) |
584 | LD_VSR_CROSS16B(r4,v4,v6,v8,v10) |
585 | |
586 | VCMPEQUB_RC(v7,v9,v10) |
587 | bnl cr6,.Ldiffoffset_vmx_diff_found |
588 | |
589 | subfic r6,r6,16 |
590 | subf r5,r6,r5 |
591 | add r3,r3,r6 |
592 | add r4,r4,r6 |
593 | |
594 | .Ldiffoffset_vmx_s1_16bytes_align: |
595 | /* now s1 is aligned with 16 bytes */ |
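
	/* Illustrative only, not part of the build: each 16-byte step of the
	 * loop below is roughly
	 *
	 *	a = load_16B_aligned(s1);
	 *	next = load_16B_aligned((s2 & ~15UL) + 16);	// lvx via off16
	 *	b = permute(prev, next, mask);	// unaligned 16 bytes of s2
	 *	prev = next;			// vor v6,v8,v8
	 *	if (a != b)
	 *		goto diff_found;
	 *	s1 += 16; s2 += 16;
	 *
	 * so each iteration issues only one new aligned load of s2; prev is
	 * seeded by the lvx into v6 just below and mask by LVS into v4.
	 */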
596 | lvx v6,0,r4 |
597 | LVS v4,0,r4 |
598 | srdi r6,r5,5 /* loop for 32 bytes each */ |
599 | clrldi r5,r5,59 |
600 | mtctr r6 |
601 | |
602 | .balign 16 |
603 | .Ldiffoffset_vmx_32bytesloop: |
604 | /* the first qw of r4 was saved in v6 */ |
605 | lvx v9,0,r3 |
606 | LD_VSR_CROSS16B(r4,v4,v6,v8,v10) |
607 | VCMPEQUB_RC(v7,v9,v10) |
608 | vor v6,v8,v8 |
609 | bnl cr6,.Ldiffoffset_vmx_diff_found |
610 | |
611 | addi r3,r3,16 |
612 | addi r4,r4,16 |
613 | |
614 | lvx v9,0,r3 |
615 | LD_VSR_CROSS16B(r4,v4,v6,v8,v10) |
616 | VCMPEQUB_RC(v7,v9,v10) |
617 | vor v6,v8,v8 |
618 | bnl cr6,.Ldiffoffset_vmx_diff_found |
619 | |
620 | addi r3,r3,16 |
621 | addi r4,r4,16 |
622 | |
623 | bdnz .Ldiffoffset_vmx_32bytesloop |
624 | |
625 | EXIT_VMX_OPS |
626 | |
627 | cmpdi r5,0 |
628 | beq .Lzero |
629 | b .Lcmp_lt32bytes |
630 | |
631 | .Ldiffoffset_vmx_diff_found: |
632 | EXIT_VMX_OPS |
	/* the difference is known to lie within the next 16 bytes */
634 | li r5,16 |
635 | b .Lcmp_lt32bytes |
636 | |
637 | #endif |
638 | EXPORT_SYMBOL(memcmp) |
639 | |