1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_RECIPROCAL_DIV_H
3#define _LINUX_RECIPROCAL_DIV_H
4
5#include <linux/types.h>
6
7/*
8 * This algorithm is based on the paper "Division by Invariant
9 * Integers Using Multiplication" by Torbjörn Granlund and Peter
10 * L. Montgomery.
11 *
12 * The assembler implementation from Agner Fog, which this code is
13 * based on, can be found here:
14 * http://www.agner.org/optimize/asmlib.zip
15 *
16 * This optimization for A/B is helpful if the divisor B is mostly
17 * runtime invariant. The reciprocal of B is calculated in the
18 * slow-path with reciprocal_value(). The fast-path can then just use
19 * a much faster multiplication operation with a variable dividend A
20 * to calculate the division A/B.
21 */
22
23struct reciprocal_value {
24 u32 m;
25 u8 sh1, sh2;
26};
27
28/* "reciprocal_value" and "reciprocal_divide" together implement the basic
29 * version of the algorithm described in Figure 4.1 of the paper.
30 */
31struct reciprocal_value reciprocal_value(u32 d);
32
33static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
34{
35 u32 t = (u32)(((u64)a * R.m) >> 32);
36 return (t + ((a - t) >> R.sh1)) >> R.sh2;
37}
38
39struct reciprocal_value_adv {
40 u32 m;
41 u8 sh, exp;
42 bool is_wide_m;
43};
44
45/* "reciprocal_value_adv" implements the advanced version of the algorithm
46 * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
47 * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
48 * exception case could be easily handled before calling "reciprocal_value_adv".
49 *
50 * The advanced version requires more complex calculation to get the reciprocal
51 * multiplier and other control variables, but then could reduce the required
52 * emulation operations.
53 *
54 * It makes no sense to use this advanced version for host divide emulation,
55 * those extra complexities for calculating multiplier etc could completely
56 * waive our saving on emulation operations.
57 *
58 * However, it makes sense to use it for JIT divide code generation for which
59 * we are willing to trade performance of JITed code with that of host. As shown
60 * by the following pseudo code, the required emulation operations could go down
61 * from 6 (the basic version) to 3 or 4.
62 *
63 * To use the result of "reciprocal_value_adv", suppose we want to calculate
64 * n/d, the pseudo C code will be:
65 *
66 * struct reciprocal_value_adv rvalue;
67 * u8 pre_shift, exp;
68 *
69 * // handle exception case.
70 * if (d >= (1U << 31)) {
71 * result = n >= d;
72 * return;
73 * }
74 *
75 * rvalue = reciprocal_value_adv(d, 32)
76 * exp = rvalue.exp;
77 * if (rvalue.is_wide_m && !(d & 1)) {
78 * // floor(log2(d & (2^32 -d)))
79 * pre_shift = fls(d & -d) - 1;
80 * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
81 * } else {
82 * pre_shift = 0;
83 * }
84 *
85 * // code generation starts.
86 * if (imm == 1U << exp) {
87 * result = n >> exp;
88 * } else if (rvalue.is_wide_m) {
89 * // pre_shift must be zero when reached here.
90 * t = (n * rvalue.m) >> 32;
91 * result = n - t;
92 * result >>= 1;
93 * result += t;
94 * result >>= rvalue.sh - 1;
95 * } else {
96 * if (pre_shift)
97 * result = n >> pre_shift;
98 * result = ((u64)result * rvalue.m) >> 32;
99 * result >>= rvalue.sh;
100 * }
101 */
102struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
103
104#endif /* _LINUX_RECIPROCAL_DIV_H */
105