/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

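/*
 * SELFTEST_CASE is only expected to be overridden when this file is built
 * into the user-space copyloops selftests, so each path can be forced and
 * exercised; kernel builds take the default below.
 */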
#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

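/*
 * On little endian the bytes loaded by lvx land in the opposite order
 * within the vector, so the unaligned path below builds its permute
 * control vector with lvsr instead of lvsl and swaps the vperm inputs.
 * These macros hide that difference so the realignment code reads the
 * same for both endians.
 */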
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

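/*
 * The errN macros attach an exception table entry to the access that
 * follows them. Which fixup is used depends on how much state is live at
 * the faulting instruction: err1 = no stack frame, err2 = frame with
 * r14-r22 saved, err3 = VMX enabled, err4 = VMX enabled with r14-r16 saved.
 */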
	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


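/*
 * Fault fixups: restore any saved non-volatiles and leave VMX if it was
 * entered, then .Ldo_err1 reloads the original dest/src/len and hands the
 * whole copy to __copy_tofrom_user_base, which has its own exception
 * handling and returns the number of bytes not copied.
 */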
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	CFUNC(exit_vmx_usercopy)
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


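/*
 * __copy_tofrom_user_power7(to=r3, from=r4, len=r5)
 * Returns 0 on success, otherwise the number of bytes not copied.
 *
 * Copies of fewer than 16 bytes go straight to .Lshort_copy. Larger copies
 * use the unrolled integer loops, and copies of more than 3328 bytes take
 * the VMX path when Altivec is available, where the overhead of
 * enter/exit_vmx_usercopy() can be amortised.
 */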
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

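	/*
	 * The low bits of r6 (bytes needed to reach 8B source alignment)
	 * are now in cr7: bit 3 selects a byte copy, bit 2 a halfword,
	 * bit 1 a word.
	 */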
	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6
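	/* ctr = number of complete 128B cachelines left to copy */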

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

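	/*
	 * cr7 now holds the low bits of (r5 >> 4): bit 1 selects a 64B
	 * block, bit 2 a 32B block and bit 3 a 16B block in the tail
	 * copies below.
	 */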
6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	CFUNC(enter_vmx_usercopy)
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

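	/*
	 * r6/r7 describe the load stream (stream 0) and r9/r10 the store
	 * stream (stream 1); DCBT_SETUP_STREAMS (asm/ppc_asm.h) emits the
	 * dcbt/dcbtst touch sequence that starts both hardware prefetch
	 * streams, using r8 as a scratch register.
	 */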
	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

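	/*
	 * enter_vmx_usercopy() returned 0 (tested into cr1 above): VMX
	 * cannot be used here, so unwind our frame and fall back to the
	 * integer copy.
	 */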
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

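	/*
	 * cr7 holds the number of 16B vectors needed to reach 128B
	 * destination alignment: bit 3 = one, bit 2 = two, bit 1 = four.
	 */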
	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

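	/*
	 * Copy complete: pop the frame and tail-call exit_vmx_usercopy(),
	 * whose zero return value becomes our "all bytes copied" result.
	 */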
15:	addi	r1,r1,STACKFRAMESIZE
	b	CFUNC(exit_vmx_usercopy)	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16
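	/*
	 * The permute loop keeps one vector of source data in hand: v0
	 * holds the most recently loaded vector and each new load is merged
	 * with it via vperm to form an aligned 16B store. The 16 bytes read
	 * ahead here are given back at label 11 below ("Unwind the +16 load
	 * offset").
	 */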

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	CFUNC(exit_vmx_usercopy)	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */
