/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <linux/export.h>
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
#define ENTER_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	CFUNC(enter_vmx_ops); \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	CFUNC(exit_vmx_ops); \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0
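
/*
 * Note: ENTER_VMX_OPS leaves the result of enter_vmx_ops() compared against
 * zero in cr1 (via the cmpwi above), so callers branch back to the scalar
 * path with "beq cr1,..." when VMX cannot be used. The C helpers called by
 * both macros may clobber the volatile CR fields, which is why
 * .Lsameoffset_vmx_cmp saves cr0 around EXIT_VMX_OPS with mfocrf/mtocrf.
 */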

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes of _vaddr, which is not aligned to
 * a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 * ^                                 ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                             ^
 *                             _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
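
/*
 * Rough C-style sketch of LD_VSR_CROSS16B (illustrative only, not compiled;
 * operand order shown for big endian):
 *
 *	// _v1st_qw already holds the quadword at (_vaddr & ~0xfUL)
 *	v2nd_qw = *(vector unsigned char *)((_vaddr & ~0xfUL) + 16);
 *	v_res   = vec_perm(v1st_qw, v2nd_qw, vmask); // 16 bytes starting at _vaddr
 *
 * lvx ignores the low four bits of its effective address, so _vaddr+off16
 * fetches the next aligned quadword, and the LVS-generated mask selects the
 * bytes that begin at the unaligned _vaddr.
 */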

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 *    handlers are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 *    handlers are named like .Ldiffoffset_xxxx
 */
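/*
 * For example (addresses illustrative only): src = 0x1003 / dst = 0x2003,
 * both three bytes past an 8-byte boundary, take the sameoffset path, while
 * src = 0x1003 / dst = 0x2004 take the diffoffset path.
 */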
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Check whether the src/dst addresses have the same offset relative
	 * to an 8-byte alignment boundary; the cr0 result is consumed at
	 * .Lno_short to choose the sameoffset/diffoffset path.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop when comparing fewer than 8 bytes,
	 * even at aligned addresses.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short
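
/*
 * C-equivalent of the dispatch above (illustrative only, not compiled):
 *
 *	if (n == 0)
 *		return 0;		// .Lzero
 *	if (n <= 7)
 *		goto short_loop;	// byte-by-byte .Lshort below
 *	if ((s1 ^ s2) & 7)
 *		goto diffoffset;	// via .Lno_short
 *	goto sameoffset;		// via .Lno_short
 */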

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start

.Lsameoffset_8bytes_make_align_start:
	/* Attempt to compare the bytes that precede the 8-byte alignment
	 * boundary so that the rest of the comparison can run on 8-byte
	 * aligned data.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	 */
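	/* Worked example (illustrative; byte order as seen after LD, i.e.
	 * big-endian register order): if src & 7 == 3, then r6 below is
	 * 3*8 = 24, the aligned-down load picks up 3 bytes that precede the
	 * requested start, and shifting both values left by 24 bits discards
	 * them so only the 5 in-buffer bytes take part in the compare.
	 */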
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are 8-byte aligned. Use the .Llong loop if the remaining
	 * length is 32 bytes or more.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
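	/*
	 * C-equivalent sketch of the code below (illustrative only, not
	 * compiled; "load8" is a stand-in for the LD doubleword load, which
	 * is byte-reversed on little endian so the compare works in memory
	 * order):
	 *
	 *	if ((s2 & 0xfff) > 0xff8)	// s2 + 7 might cross a 4K page
	 *		goto byte_loop;		// .Lshort
	 *	shift = (8 - n) * 8;
	 *	a = load8(s1) >> shift;		// keep only the n leading bytes
	 *	b = load8(s2) >> shift;
	 *	return a == b ? 0 : (a > b ? 1 : -1);	// unsigned compare
	 */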
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the VMX loop if the length is 4K or greater */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least s1 addr is aligned with 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter here with src/dst addresses that have the same offset
	 * relative to an 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to find a difference early, within the first 32 bytes.
	 * Before using VMX instructions, which incur the penalty of saving
	 * and restoring 32 x 128-bit VMX registers, compare the first
	 * 32 bytes so that the ~80% of calls that differ early are caught
	 * cheaply.
	 */
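	/* Rough C equivalent of the pre-check loop below (illustrative only,
	 * not compiled; "load8" stands for the LD doubleword load):
	 *
	 *	for (i = 0; i < 4; i++) {	// 4 x 8 = 32 bytes
	 *		if (load8(s1) != load8(s2))
	 *			return compare;	// .LcmpAB_lightweight
	 *		s1 += 8; s2 += 8; n -= 8;
	 *	}
	 *	// only now pay the cost of enter_vmx_ops()
	 */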

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* Need to check whether r4 has the same offset as r3 relative to
	 * a 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* The length is at least 4KB. Align further, to 16 bytes.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes per loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

.balign 16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* The difference is within the 16 bytes at the current r3/r4;
	 * redo the compare with scalar loads to compute the return value.
	 */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 with 8 bytes */
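	/* Head handling for the different-offset case: s1 is loaded from its
	 * aligned-down address and the (s1 & 7) leading junk bytes are
	 * cleared by the sld/srd pair, while s2 is loaded unaligned and
	 * shifted right so the same number of bytes line up for the compare.
	 * Illustrative sketch (not compiled), with off = s1 & 7 and "load8"
	 * standing for the LD doubleword load:
	 *
	 *	a = (load8(s1 & ~7UL) << off*8) >> off*8; // low (8-off) bytes of s1
	 *	b =  load8(s2) >> off*8;                  // first (8-off) bytes of s2
	 *	if (a != b) return compare;
	 *	s1 += 8 - off; s2 += 8 - off; n -= 8 - off;
	 */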
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4		/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do VMX ops when the size is 4K bytes or greater */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before enabling VMX operations */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First, try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned with 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5	/* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6
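	/* Loop note: v6 always carries the previously loaded aligned
	 * quadword of s2, so each LD_VSR_CROSS16B below only issues one new
	 * lvx for s2; "vor v6,v8,v8" copies the freshly loaded quadword
	 * forward for the next iteration.
	 */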

.balign 16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* in any case, the difference is within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)
