1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated |
4 | * |
5 | * Copyright 2018 Google LLC |
6 | * |
7 | * Author: Eric Biggers <ebiggers@google.com> |
8 | */ |
9 | |
10 | #include <linux/linkage.h> |
11 | #include <linux/cfi_types.h> |
12 | |
13 | #define PASS0_SUMS %xmm0 /* PASSn_SUMS: two 64-bit partial sums (lanes A and B) for pass n */ |
14 | #define PASS1_SUMS %xmm1 |
15 | #define PASS2_SUMS %xmm2 |
16 | #define PASS3_SUMS %xmm3 |
17 | #define K0 %xmm4 /* K0-K3: 16-byte key strides; nh_sse2 rotates their roles on every message stride */ |
18 | #define K1 %xmm5 |
19 | #define K2 %xmm6 |
20 | #define K3 %xmm7 |
21 | #define T0 %xmm8 /* T0-T7: scratch; T0 is used only by the final reduction in nh_sse2 */ |
22 | #define T1 %xmm9 |
23 | #define T2 %xmm10 |
24 | #define T3 %xmm11 |
25 | #define T4 %xmm12 |
26 | #define T5 %xmm13 |
27 | #define T6 %xmm14 |
28 | #define T7 %xmm15 |
29 | #define KEY %rdi /* 1st argument of nh_sse2: key pointer, advanced as key strides are consumed */ |
30 | #define MESSAGE %rsi /* 2nd argument: message pointer, advanced by 0x40 per main-loop iteration */ |
31 | #define MESSAGE_LEN %rdx /* 3rd argument: message length in bytes, reused as the loop counter */ |
32 | #define HASH %rcx /* 4th argument: output buffer, receives 32 bytes (four 64-bit sums) */ |
33 | |
34 | .macro _nh_stride k0, k1, k2, k3, offset |
35 | /* NH step for one 16-byte message stride: updates all four PASSn_SUMS. Loads \k3; clobbers \k0 and T1-T7. */ |
36 | // Load next message stride (16 bytes at \offset from MESSAGE, unaligned load) |
37 | movdqu \offset(MESSAGE), T1 |
38 | |
39 | // Load next key stride into \k3; \k0, \k1 and \k2 are expected to already hold the three strides before it |
40 | movdqu \offset(KEY), \k3 |
41 | |
42 | // Add message words to key words: four 32-bit adds per pass, pass n uses key stride \kn |
43 | movdqa T1, T2 |
44 | movdqa T1, T3 |
45 | paddd T1, \k0 // reuse k0 to avoid a move; this overwrites the key stride that was in \k0 (pass 0) |
46 | paddd \k1, T1 /* T1 = message + \k1 (pass 1) */ |
47 | paddd \k2, T2 /* T2 = message + \k2 (pass 2) */ |
48 | paddd \k3, T3 /* T3 = message + \k3 (pass 3) */ |
49 | |
50 | // Multiply 32x32 => 64 and accumulate: per pass, word 0 times word 2 and word 1 times word 3 |
51 | pshufd $0x10, \k0, T4 /* $0x10: words 0 and 1 go to the low dword of each 64-bit lane */ |
52 | pshufd $0x32, \k0, \k0 /* $0x32: words 2 and 3 go to the low dword of each 64-bit lane */ |
53 | pshufd $0x10, T1, T5 |
54 | pshufd $0x32, T1, T1 |
55 | pshufd $0x10, T2, T6 |
56 | pshufd $0x32, T2, T2 |
57 | pshufd $0x10, T3, T7 |
58 | pshufd $0x32, T3, T3 |
59 | pmuludq T4, \k0 /* pmuludq multiplies the low dwords of each lane: two 64-bit products per pass */ |
60 | pmuludq T5, T1 |
61 | pmuludq T6, T2 |
62 | pmuludq T7, T3 |
63 | paddq \k0, PASS0_SUMS /* lane-wise 64-bit add into the per-pass accumulators */ |
64 | paddq T1, PASS1_SUMS |
65 | paddq T2, PASS2_SUMS |
66 | paddq T3, PASS3_SUMS |
67 | .endm |
68 | |
69 | /* |
70 | * void nh_sse2(const u32 *key, const u8 *message, size_t message_len, |
71 | * __le64 hash[NH_NUM_PASSES]) |
72 | * Runs four NH passes over the message and stores the four 64-bit sums (32 bytes) to hash. |
73 | * It's guaranteed that message_len % 16 == 0. |
74 | */ |
75 | SYM_TYPED_FUNC_START(nh_sse2) |
76 | /* Preload the first three key strides; every _nh_stride then loads the fourth one it needs. */ |
77 | movdqu 0x00(KEY), K0 |
78 | movdqu 0x10(KEY), K1 |
79 | movdqu 0x20(KEY), K2 |
80 | add $0x30, KEY /* KEY now points at the next key stride to be loaded */ |
81 | pxor PASS0_SUMS, PASS0_SUMS /* zero all four accumulators */ |
82 | pxor PASS1_SUMS, PASS1_SUMS |
83 | pxor PASS2_SUMS, PASS2_SUMS |
84 | pxor PASS3_SUMS, PASS3_SUMS |
85 | /* Main loop: four strides (0x40 message bytes) per iteration. */ |
86 | sub $0x40, MESSAGE_LEN /* MESSAGE_LEN = unprocessed bytes minus 0x40 */ |
87 | jl .Lloop4_done /* fewer than 0x40 bytes; NOTE(review): signed test, assumes message_len < 2^63 */ |
88 | .Lloop4: |
89 | _nh_stride K0, K1, K2, K3, 0x00 |
90 | _nh_stride K1, K2, K3, K0, 0x10 /* key registers rotate by one position per stride */ |
91 | _nh_stride K2, K3, K0, K1, 0x20 |
92 | _nh_stride K3, K0, K1, K2, 0x30 /* after four strides K0-K2 again hold the three strides before KEY + 0x40 */ |
93 | add $0x40, KEY |
94 | add $0x40, MESSAGE |
95 | sub $0x40, MESSAGE_LEN |
96 | jge .Lloop4 /* repeat while at least 0x40 unprocessed bytes remain */ |
97 | |
98 | .Lloop4_done: |
99 | and $0x3f, MESSAGE_LEN /* low 6 bits = leftover bytes: 0x00, 0x10, 0x20 or 0x30 since message_len % 16 == 0 */ |
100 | jz .Ldone |
101 | _nh_stride K0, K1, K2, K3, 0x00 |
102 | |
103 | sub $0x10, MESSAGE_LEN |
104 | jz .Ldone |
105 | _nh_stride K1, K2, K3, K0, 0x10 |
106 | |
107 | sub $0x10, MESSAGE_LEN |
108 | jz .Ldone |
109 | _nh_stride K2, K3, K0, K1, 0x20 /* at most three tail strides: a fourth would have been handled by the main loop */ |
110 | |
111 | .Ldone: |
112 | // Sum the accumulators for each pass (lane A + lane B), then store the sums to 'hash' |
113 | movdqa PASS0_SUMS, T0 |
114 | movdqa PASS2_SUMS, T1 |
115 | punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A) |
116 | punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A) |
117 | punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B) |
118 | punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B) |
119 | paddq PASS0_SUMS, T0 /* T0 = (pass 0 total, pass 1 total) */ |
120 | paddq PASS2_SUMS, T1 /* T1 = (pass 2 total, pass 3 total) */ |
121 | movdqu T0, 0x00(HASH) /* first 16 bytes of hash: pass 0 and pass 1 */ |
122 | movdqu T1, 0x10(HASH) /* next 16 bytes of hash: pass 2 and pass 3 */ |
123 | RET |
124 | SYM_FUNC_END(nh_sse2) |
125 | |