1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * Author: Anton Blanchard <anton@au.ibm.com> |
4 | * Copyright 2015 IBM Corporation. |
5 | */ |
6 | #include <linux/export.h> |
7 | #include <asm/ppc_asm.h> |
8 | #include <asm/ppc-opcode.h> |
9 | |
10 | #define off8 r6 |
11 | #define off16 r7 |
12 | #define off24 r8 |
13 | |
14 | #define rA r9 |
15 | #define rB r10 |
16 | #define rC r11 |
17 | #define rD r27 |
18 | #define rE r28 |
19 | #define rF r29 |
20 | #define rG r30 |
21 | #define rH r31 |
22 | |
23 | #ifdef __LITTLE_ENDIAN__ |
24 | #define LH lhbrx |
25 | #define LW lwbrx |
26 | #define LD ldbrx |
27 | #define LVS lvsr |
28 | #define VPERM(_VRT,_VRA,_VRB,_VRC) \ |
29 | vperm _VRT,_VRB,_VRA,_VRC |
30 | #else |
31 | #define LH lhzx |
32 | #define LW lwzx |
33 | #define LD ldx |
34 | #define LVS lvsl |
35 | #define VPERM(_VRT,_VRA,_VRB,_VRC) \ |
36 | vperm _VRT,_VRA,_VRB,_VRC |
37 | #endif |
38 | |
39 | #define VMX_THRESH 4096 |
40 | #define ENTER_VMX_OPS \ |
41 | mflr r0; \ |
42 | std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ |
43 | std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ |
44 | std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ |
45 | std r0,16(r1); \ |
46 | stdu r1,-STACKFRAMESIZE(r1); \ |
47 | bl CFUNC(enter_vmx_ops); \ |
48 | cmpwi cr1,r3,0; \ |
49 | ld r0,STACKFRAMESIZE+16(r1); \ |
50 | ld r3,STK_REG(R31)(r1); \ |
51 | ld r4,STK_REG(R30)(r1); \ |
52 | ld r5,STK_REG(R29)(r1); \ |
53 | addi r1,r1,STACKFRAMESIZE; \ |
54 | mtlr r0 |
55 | |
56 | #define EXIT_VMX_OPS \ |
57 | mflr r0; \ |
58 | std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \ |
59 | std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \ |
60 | std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \ |
61 | std r0,16(r1); \ |
62 | stdu r1,-STACKFRAMESIZE(r1); \ |
63 | bl CFUNC(exit_vmx_ops); \ |
64 | ld r0,STACKFRAMESIZE+16(r1); \ |
65 | ld r3,STK_REG(R31)(r1); \ |
66 | ld r4,STK_REG(R30)(r1); \ |
67 | ld r5,STK_REG(R29)(r1); \ |
68 | addi r1,r1,STACKFRAMESIZE; \ |
69 | mtlr r0 |
70 | |
71 | /* |
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned to
 * a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 * ^                                 ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
78 | * ^ |
79 | * _vaddr |
80 | * |
81 | * |
82 | * _vmask is the mask generated by LVS |
83 | * _v1st_qw is the 1st aligned QW of current addr which is already loaded. |
84 | * for example: 0xyyyyyyyyyyyyy012 for big endian |
85 | * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded. |
86 | * for example: 0x3456789abcdefzzz for big endian |
87 | * The permute result is saved in _v_res. |
88 | * for example: 0x0123456789abcdef for big endian. |
89 | */ |
90 | #define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \ |
91 | lvx _v2nd_qw,_vaddr,off16; \ |
92 | VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask) |
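
/*
 * Illustrative only, not part of the build: a rough C sketch of what the two
 * aligned lvx loads plus VPERM above reconstruct. The names (addr, tmp,
 * result) are made up for the sketch.
 *
 *	unsigned char tmp[32], result[16];
 *	unsigned long base = addr & ~15UL;
 *	size_t off = addr & 15;
 *	memcpy(tmp,      (void *)base,        16);	// 1st aligned QW
 *	memcpy(tmp + 16, (void *)(base + 16), 16);	// 2nd aligned QW
 *	memcpy(result,   tmp + off,           16);	// VPERM with the LVS mask
 */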
93 | |
94 | /* |
95 | * There are 2 categories for memcmp: |
96 | * 1) src/dst has the same offset to the 8 bytes boundary. The handlers |
97 | * are named like .Lsameoffset_xxxx |
98 | * 2) src/dst has different offset to the 8 bytes boundary. The handlers |
99 | * are named like .Ldiffoffset_xxxx |
100 | */ |
101 | _GLOBAL_TOC(memcmp) |
102 | cmpdi cr1,r5,0 |
103 | |
	/* Check whether the src/dst addresses have the same offset from
	 * an 8-byte alignment boundary; the result (cr0) is used at
	 * .Lno_short to pick the sameoffset vs diffoffset path.
	 */
107 | xor r6,r3,r4 |
108 | andi. r6,r6,7 |
109 | |
	/* Fall back to the short byte-by-byte loop if fewer than
	 * 8 bytes are to be compared.
	 */
113 | cmpdi cr6,r5,7 |
114 | |
115 | beq cr1,.Lzero |
116 | bgt cr6,.Lno_short |
117 | |
118 | .Lshort: |
119 | mtctr r5 |
120 | 1: lbz rA,0(r3) |
121 | lbz rB,0(r4) |
122 | subf. rC,rB,rA |
123 | bne .Lnon_zero |
124 | bdz .Lzero |
125 | |
126 | lbz rA,1(r3) |
127 | lbz rB,1(r4) |
128 | subf. rC,rB,rA |
129 | bne .Lnon_zero |
130 | bdz .Lzero |
131 | |
132 | lbz rA,2(r3) |
133 | lbz rB,2(r4) |
134 | subf. rC,rB,rA |
135 | bne .Lnon_zero |
136 | bdz .Lzero |
137 | |
138 | lbz rA,3(r3) |
139 | lbz rB,3(r4) |
140 | subf. rC,rB,rA |
141 | bne .Lnon_zero |
142 | |
143 | addi r3,r3,4 |
144 | addi r4,r4,4 |
145 | |
146 | bdnz 1b |
147 | |
148 | .Lzero: |
149 | li r3,0 |
150 | blr |
151 | |
152 | .Lno_short: |
153 | dcbt 0,r3 |
154 | dcbt 0,r4 |
155 | bne .Ldiffoffset_8bytes_make_align_start |
156 | |
157 | |
158 | .Lsameoffset_8bytes_make_align_start: |
	/* Compare the leading bytes that precede an 8-byte boundary so
	 * that the rest of the comparison can run on 8-byte aligned
	 * addresses.
	 */
162 | andi. r6,r3,7 |
163 | |
	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	 */
168 | rlwinm r6,r3,3,26,28 |
169 | beq .Lsameoffset_8bytes_aligned |
170 | clrrdi r3,r3,3 |
171 | clrrdi r4,r4,3 |
172 | LD rA,0,r3 |
173 | LD rB,0,r4 |
174 | sld rA,rA,r6 |
175 | sld rB,rB,r6 |
176 | cmpld cr0,rA,rB |
177 | srwi r6,r6,3 |
178 | bne cr0,.LcmpAB_lightweight |
179 | subfic r6,r6,8 |
180 | subf. r5,r6,r5 |
181 | addi r3,r3,8 |
182 | addi r4,r4,8 |
183 | beq .Lzero |
184 | |
185 | .Lsameoffset_8bytes_aligned: |
	/* Now both addresses are 8-byte aligned.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
189 | cmpdi cr6,r5,31 |
190 | bgt cr6,.Llong |
191 | |
192 | .Lcmp_lt32bytes: |
193 | /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */ |
194 | cmpdi cr5,r5,7 |
195 | srdi r0,r5,3 |
196 | ble cr5,.Lcmp_rest_lt8bytes |
197 | |
198 | /* handle 8 ~ 31 bytes */ |
199 | clrldi r5,r5,61 |
200 | mtctr r0 |
201 | 2: |
202 | LD rA,0,r3 |
203 | LD rB,0,r4 |
204 | cmpld cr0,rA,rB |
205 | addi r3,r3,8 |
206 | addi r4,r4,8 |
207 | bne cr0,.LcmpAB_lightweight |
208 | bdnz 2b |
209 | |
210 | cmpwi r5,0 |
211 | beq .Lzero |
212 | |
213 | .Lcmp_rest_lt8bytes: |
214 | /* |
215 | * Here we have less than 8 bytes to compare. At least s1 is aligned to |
216 | * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a |
217 | * page boundary, otherwise we might read past the end of the buffer and |
218 | * trigger a page fault. We use 4K as the conservative minimum page |
219 | * size. If we detect that case we go to the byte-by-byte loop. |
220 | * |
221 | * Otherwise the next double word is loaded from s1 and s2, and shifted |
222 | * right to compare the appropriate bits. |
223 | */ |
224 | clrldi r6,r4,(64-12) // r6 = r4 & 0xfff |
225 | cmpdi r6,0xff8 |
226 | bgt .Lshort |
227 | |
228 | subfic r6,r5,8 |
229 | slwi r6,r6,3 |
230 | LD rA,0,r3 |
231 | LD rB,0,r4 |
232 | srd rA,rA,r6 |
233 | srd rB,rB,r6 |
234 | cmpld cr0,rA,rB |
235 | bne cr0,.LcmpAB_lightweight |
236 | b .Lzero |
237 | |
238 | .Lnon_zero: |
239 | mr r3,rC |
240 | blr |
241 | |
242 | .Llong: |
243 | #ifdef CONFIG_ALTIVEC |
244 | BEGIN_FTR_SECTION |
	/* Try the vmx loop if the length is 4K bytes or more */
246 | cmpldi cr6,r5,VMX_THRESH |
247 | bge cr6,.Lsameoffset_vmx_cmp |
248 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) |
249 | |
250 | .Llong_novmx_cmp: |
251 | #endif |
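
	/* Illustrative only, not part of the build: each iteration of the
	 * unrolled loop below compares 32 bytes, keeping one CR field per
	 * doubleword pair (cr0: A/B, cr1: C/D, cr6: E/F, cr7: G/H) and
	 * issuing the loads for the next 32 bytes before the current compare
	 * results are consumed, to hide load latency. Roughly:
	 *
	 *	while (iterations--) {
	 *		load next A..H from s1/s2;	// 4 doublewords each
	 *		if (A != B) goto cmpAB;
	 *		if (C != D) goto cmpCD;
	 *		if (E != F) goto cmpEF;
	 *		if (G != H) goto cmpGH;
	 *		s1 += 32; s2 += 32;
	 *	}
	 */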
252 | /* At least s1 addr is aligned with 8 bytes */ |
253 | li off8,8 |
254 | li off16,16 |
255 | li off24,24 |
256 | |
257 | std r31,-8(r1) |
258 | std r30,-16(r1) |
259 | std r29,-24(r1) |
260 | std r28,-32(r1) |
261 | std r27,-40(r1) |
262 | |
263 | srdi r0,r5,5 |
264 | mtctr r0 |
265 | andi. r5,r5,31 |
266 | |
267 | LD rA,0,r3 |
268 | LD rB,0,r4 |
269 | |
270 | LD rC,off8,r3 |
271 | LD rD,off8,r4 |
272 | |
273 | LD rE,off16,r3 |
274 | LD rF,off16,r4 |
275 | |
276 | LD rG,off24,r3 |
277 | LD rH,off24,r4 |
278 | cmpld cr0,rA,rB |
279 | |
280 | addi r3,r3,32 |
281 | addi r4,r4,32 |
282 | |
283 | bdz .Lfirst32 |
284 | |
285 | LD rA,0,r3 |
286 | LD rB,0,r4 |
287 | cmpld cr1,rC,rD |
288 | |
289 | LD rC,off8,r3 |
290 | LD rD,off8,r4 |
291 | cmpld cr6,rE,rF |
292 | |
293 | LD rE,off16,r3 |
294 | LD rF,off16,r4 |
295 | cmpld cr7,rG,rH |
296 | bne cr0,.LcmpAB |
297 | |
298 | LD rG,off24,r3 |
299 | LD rH,off24,r4 |
300 | cmpld cr0,rA,rB |
301 | bne cr1,.LcmpCD |
302 | |
303 | addi r3,r3,32 |
304 | addi r4,r4,32 |
305 | |
306 | bdz .Lsecond32 |
307 | |
308 | .balign 16 |
309 | |
310 | 1: LD rA,0,r3 |
311 | LD rB,0,r4 |
312 | cmpld cr1,rC,rD |
313 | bne cr6,.LcmpEF |
314 | |
315 | LD rC,off8,r3 |
316 | LD rD,off8,r4 |
317 | cmpld cr6,rE,rF |
318 | bne cr7,.LcmpGH |
319 | |
320 | LD rE,off16,r3 |
321 | LD rF,off16,r4 |
322 | cmpld cr7,rG,rH |
323 | bne cr0,.LcmpAB |
324 | |
325 | LD rG,off24,r3 |
326 | LD rH,off24,r4 |
327 | cmpld cr0,rA,rB |
328 | bne cr1,.LcmpCD |
329 | |
330 | addi r3,r3,32 |
331 | addi r4,r4,32 |
332 | |
333 | bdnz 1b |
334 | |
335 | .Lsecond32: |
336 | cmpld cr1,rC,rD |
337 | bne cr6,.LcmpEF |
338 | |
339 | cmpld cr6,rE,rF |
340 | bne cr7,.LcmpGH |
341 | |
342 | cmpld cr7,rG,rH |
343 | bne cr0,.LcmpAB |
344 | |
345 | bne cr1,.LcmpCD |
346 | bne cr6,.LcmpEF |
347 | bne cr7,.LcmpGH |
348 | |
349 | .Ltail: |
350 | ld r31,-8(r1) |
351 | ld r30,-16(r1) |
352 | ld r29,-24(r1) |
353 | ld r28,-32(r1) |
354 | ld r27,-40(r1) |
355 | |
356 | cmpdi r5,0 |
357 | beq .Lzero |
358 | b .Lshort |
359 | |
360 | .Lfirst32: |
361 | cmpld cr1,rC,rD |
362 | cmpld cr6,rE,rF |
363 | cmpld cr7,rG,rH |
364 | |
365 | bne cr0,.LcmpAB |
366 | bne cr1,.LcmpCD |
367 | bne cr6,.LcmpEF |
368 | bne cr7,.LcmpGH |
369 | |
370 | b .Ltail |
371 | |
372 | .LcmpAB: |
373 | li r3,1 |
374 | bgt cr0,.Lout |
375 | li r3,-1 |
376 | b .Lout |
377 | |
378 | .LcmpCD: |
379 | li r3,1 |
380 | bgt cr1,.Lout |
381 | li r3,-1 |
382 | b .Lout |
383 | |
384 | .LcmpEF: |
385 | li r3,1 |
386 | bgt cr6,.Lout |
387 | li r3,-1 |
388 | b .Lout |
389 | |
390 | .LcmpGH: |
391 | li r3,1 |
392 | bgt cr7,.Lout |
393 | li r3,-1 |
394 | |
395 | .Lout: |
396 | ld r31,-8(r1) |
397 | ld r30,-16(r1) |
398 | ld r29,-24(r1) |
399 | ld r28,-32(r1) |
400 | ld r27,-40(r1) |
401 | blr |
402 | |
403 | .LcmpAB_lightweight: /* skip NV GPRS restore */ |
404 | li r3,1 |
405 | bgtlr |
406 | li r3,-1 |
407 | blr |
408 | |
409 | #ifdef CONFIG_ALTIVEC |
410 | .Lsameoffset_vmx_cmp: |
	/* Enter with src/dst addrs that have the same offset from an
	 * 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before using VMX instructions, which incur the penalty of saving
	 * and restoring 32 x 128-bit VMX registers, compare the first
	 * 32 bytes so that the ~80% of calls that fail early are caught
	 * cheaply.
	 */
420 | |
421 | li r0,4 |
422 | mtctr r0 |
423 | .Lsameoffset_prechk_32B_loop: |
424 | LD rA,0,r3 |
425 | LD rB,0,r4 |
426 | cmpld cr0,rA,rB |
427 | addi r3,r3,8 |
428 | addi r4,r4,8 |
429 | bne cr0,.LcmpAB_lightweight |
430 | addi r5,r5,-8 |
431 | bdnz .Lsameoffset_prechk_32B_loop |
432 | |
433 | ENTER_VMX_OPS |
434 | beq cr1,.Llong_novmx_cmp |
435 | |
436 | 3: |
	/* Check whether r4 has the same offset as r3 from a 16-byte
	 * boundary.
	 */
440 | xor r0,r3,r4 |
441 | andi. r0,r0,0xf |
442 | bne .Ldiffoffset_vmx_cmp_start |
443 | |
	/* len is at least VMX_THRESH minus the 32 bytes consumed by the
	 * pre-check. Align r3 further, to a 16-byte boundary.
	 */
446 | andi. rA,r3,8 |
447 | LD rA,0,r3 |
448 | beq 4f |
449 | LD rB,0,r4 |
450 | cmpld cr0,rA,rB |
451 | addi r3,r3,8 |
452 | addi r4,r4,8 |
453 | addi r5,r5,-8 |
454 | |
455 | beq cr0,4f |
	/* The call in EXIT_VMX_OPS may clobber cr0, so save and restore it
	 * around the call.
	 */
457 | mfocrf r5,128 |
458 | EXIT_VMX_OPS |
459 | mtocrf 128,r5 |
460 | b .LcmpAB_lightweight |
461 | |
462 | 4: |
	/* compare 32 bytes per loop iteration */
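
	/* Illustrative only, not part of the build: each pass of the vector
	 * loop below is roughly
	 *
	 *	if (sixteen_bytes_differ(s1, s2))	// lvx + vcmpequd., cr6
	 *		goto redo_with_scalar_loads;	// label 7
	 *	if (sixteen_bytes_differ(s1 + 16, s2 + 16))
	 *		goto advance_16_then_redo;	// label 6
	 *	s1 += 32; s2 += 32;
	 *
	 * The vector compare only reports equal/not-equal, so on a mismatch
	 * the offending 16 bytes are re-read with two scalar 8-byte loads
	 * (label 7) to compute the memcmp() ordering.
	 */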
464 | srdi r0,r5,5 |
465 | mtctr r0 |
466 | clrldi r5,r5,59 |
467 | li off16,16 |
468 | |
469 | .balign 16 |
470 | 5: |
471 | lvx v0,0,r3 |
472 | lvx v1,0,r4 |
473 | VCMPEQUD_RC(v0,v0,v1) |
474 | bnl cr6,7f |
475 | lvx v0,off16,r3 |
476 | lvx v1,off16,r4 |
477 | VCMPEQUD_RC(v0,v0,v1) |
478 | bnl cr6,6f |
479 | addi r3,r3,32 |
480 | addi r4,r4,32 |
481 | bdnz 5b |
482 | |
483 | EXIT_VMX_OPS |
484 | cmpdi r5,0 |
485 | beq .Lzero |
486 | b .Lcmp_lt32bytes |
487 | |
488 | 6: |
489 | addi r3,r3,16 |
490 | addi r4,r4,16 |
491 | |
492 | 7: |
	/* re-compare the mismatching 16 bytes with scalar loads to find the
	 * ordering
	 */
494 | EXIT_VMX_OPS |
495 | LD rA,0,r3 |
496 | LD rB,0,r4 |
497 | cmpld cr0,rA,rB |
498 | li off8,8 |
499 | bne cr0,.LcmpAB_lightweight |
500 | |
501 | LD rA,off8,r3 |
502 | LD rB,off8,r4 |
503 | cmpld cr0,rA,rB |
504 | bne cr0,.LcmpAB_lightweight |
505 | b .Lzero |
506 | #endif |
507 | |
508 | .Ldiffoffset_8bytes_make_align_start: |
	/* now try to align s1 to an 8-byte boundary */
510 | rlwinm r6,r3,3,26,28 |
511 | beq .Ldiffoffset_align_s1_8bytes |
512 | |
513 | clrrdi r3,r3,3 |
514 | LD rA,0,r3 |
515 | LD rB,0,r4 /* unaligned load */ |
516 | sld rA,rA,r6 |
517 | srd rA,rA,r6 |
518 | srd rB,rB,r6 |
519 | cmpld cr0,rA,rB |
520 | srwi r6,r6,3 |
521 | bne cr0,.LcmpAB_lightweight |
522 | |
523 | subfic r6,r6,8 |
524 | subf. r5,r6,r5 |
525 | addi r3,r3,8 |
526 | add r4,r4,r6 |
527 | |
528 | beq .Lzero |
529 | |
530 | .Ldiffoffset_align_s1_8bytes: |
531 | /* now s1 is aligned with 8 bytes. */ |
532 | #ifdef CONFIG_ALTIVEC |
533 | BEGIN_FTR_SECTION |
	/* only do vmx ops when the size is 4K bytes or more */
535 | cmpdi cr5,r5,VMX_THRESH |
536 | bge cr5,.Ldiffoffset_vmx_cmp |
537 | END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) |
538 | |
539 | .Ldiffoffset_novmx_cmp: |
540 | #endif |
541 | |
542 | |
543 | cmpdi cr5,r5,31 |
544 | ble cr5,.Lcmp_lt32bytes |
545 | |
546 | #ifdef CONFIG_ALTIVEC |
547 | b .Llong_novmx_cmp |
548 | #else |
549 | b .Llong |
550 | #endif |
551 | |
552 | #ifdef CONFIG_ALTIVEC |
553 | .Ldiffoffset_vmx_cmp: |
	/* Perform a 32-byte pre-check before enabling VMX operations.
	 */
557 | li r0,4 |
558 | mtctr r0 |
559 | .Ldiffoffset_prechk_32B_loop: |
560 | LD rA,0,r3 |
561 | LD rB,0,r4 |
562 | cmpld cr0,rA,rB |
563 | addi r3,r3,8 |
564 | addi r4,r4,8 |
565 | bne cr0,.LcmpAB_lightweight |
566 | addi r5,r5,-8 |
567 | bdnz .Ldiffoffset_prechk_32B_loop |
568 | |
569 | ENTER_VMX_OPS |
570 | beq cr1,.Ldiffoffset_novmx_cmp |
571 | |
572 | .Ldiffoffset_vmx_cmp_start: |
	/* First try to align r3 to a 16-byte boundary */
574 | andi. r6,r3,0xf |
575 | li off16,16 |
576 | beq .Ldiffoffset_vmx_s1_16bytes_align |
577 | |
578 | LVS v3,0,r3 |
579 | LVS v4,0,r4 |
580 | |
581 | lvx v5,0,r3 |
582 | lvx v6,0,r4 |
583 | LD_VSR_CROSS16B(r3,v3,v5,v7,v9) |
584 | LD_VSR_CROSS16B(r4,v4,v6,v8,v10) |
585 | |
586 | VCMPEQUB_RC(v7,v9,v10) |
587 | bnl cr6,.Ldiffoffset_vmx_diff_found |
588 | |
589 | subfic r6,r6,16 |
590 | subf r5,r6,r5 |
591 | add r3,r3,r6 |
592 | add r4,r4,r6 |
593 | |
594 | .Ldiffoffset_vmx_s1_16bytes_align: |
595 | /* now s1 is aligned with 16 bytes */ |
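
	/* Illustrative only, not part of the build: each 16-byte step of the
	 * loop below is roughly
	 *
	 *	a = load_16B_aligned(s1);
	 *	next = load_16B_aligned((s2 & ~15UL) + 16);	// lvx via off16
	 *	b = permute(prev, next, mask);	// unaligned 16 bytes of s2
	 *	prev = next;			// vor v6,v8,v8
	 *	if (a != b)
	 *		goto diff_found;
	 *	s1 += 16; s2 += 16;
	 *
	 * so each iteration issues only one new aligned load of s2; prev is
	 * seeded by the lvx into v6 just below and mask by LVS into v4.
	 */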
596 | lvx v6,0,r4 |
597 | LVS v4,0,r4 |
598 | srdi r6,r5,5 /* loop for 32 bytes each */ |
599 | clrldi r5,r5,59 |
600 | mtctr r6 |
601 | |
602 | .balign 16 |
603 | .Ldiffoffset_vmx_32bytesloop: |
604 | /* the first qw of r4 was saved in v6 */ |
605 | lvx v9,0,r3 |
606 | LD_VSR_CROSS16B(r4,v4,v6,v8,v10) |
607 | VCMPEQUB_RC(v7,v9,v10) |
608 | vor v6,v8,v8 |
609 | bnl cr6,.Ldiffoffset_vmx_diff_found |
610 | |
611 | addi r3,r3,16 |
612 | addi r4,r4,16 |
613 | |
614 | lvx v9,0,r3 |
615 | LD_VSR_CROSS16B(r4,v4,v6,v8,v10) |
616 | VCMPEQUB_RC(v7,v9,v10) |
617 | vor v6,v8,v8 |
618 | bnl cr6,.Ldiffoffset_vmx_diff_found |
619 | |
620 | addi r3,r3,16 |
621 | addi r4,r4,16 |
622 | |
623 | bdnz .Ldiffoffset_vmx_32bytesloop |
624 | |
625 | EXIT_VMX_OPS |
626 | |
627 | cmpdi r5,0 |
628 | beq .Lzero |
629 | b .Lcmp_lt32bytes |
630 | |
631 | .Ldiffoffset_vmx_diff_found: |
632 | EXIT_VMX_OPS |
	/* the difference is known to lie within the next 16 bytes */
634 | li r5,16 |
635 | b .Lcmp_lt32bytes |
636 | |
637 | #endif |
638 | EXPORT_SYMBOL(memcmp) |
639 | |