reciprocal_div.h source code [linux/include/linux/reciprocal_div.h]

1	/* SPDX-License-Identifier: GPL-2.0 */
2	#ifndef _LINUX_RECIPROCAL_DIV_H
3	#define _LINUX_RECIPROCAL_DIV_H
4
5	#include <linux/types.h>
6
7	/*
8	* This algorithm is based on the paper "Division by Invariant
9	* Integers Using Multiplication" by Torbjörn Granlund and Peter
10	* L. Montgomery.
11	*
12	* The assembler implementation from Agner Fog, which this code is
13	* based on, can be found here:
14	* http://www.agner.org/optimize/asmlib.zip
15	*
16	* This optimization for A/B is helpful if the divisor B is mostly
17	* runtime invariant. The reciprocal of B is calculated in the
18	* slow-path with reciprocal_value(). The fast-path can then just use
19	* a much faster multiplication operation with a variable dividend A
20	* to calculate the division A/B.
21	*/
22
23	struct reciprocal_value {
24	u32 m;
25	u8 sh1, sh2;
26	};
27
28	/ "reciprocal_value" and "reciprocal_divide" together implement the basic*
29	* version of the algorithm described in Figure 4.1 of the paper.
30	*/
31	struct reciprocal_value reciprocal_value(u32 d);
32
33	static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
34	{
35	u32 t = (u32)(((u64)a * R.m) >> `32`);
36	return (t + ((a - t) >> R.sh1)) >> R.sh2;
37	}
38
39	struct reciprocal_value_adv {
40	u32 m;
41	u8 sh, exp;
42	bool is_wide_m;
43	};
44
45	/ "reciprocal_value_adv" implements the advanced version of the algorithm*
46	* described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
47	* ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
48	* exception case could be easily handled before calling "reciprocal_value_adv".
49	*
50	* The advanced version requires more complex calculation to get the reciprocal
51	* multiplier and other control variables, but then could reduce the required
52	* emulation operations.
53	*
54	* It makes no sense to use this advanced version for host divide emulation,
55	* those extra complexities for calculating multiplier etc could completely
56	* waive our saving on emulation operations.
57	*
58	* However, it makes sense to use it for JIT divide code generation for which
59	* we are willing to trade performance of JITed code with that of host. As shown
60	* by the following pseudo code, the required emulation operations could go down
61	* from 6 (the basic version) to 3 or 4.
62	*
63	* To use the result of "reciprocal_value_adv", suppose we want to calculate
64	* n/d, the pseudo C code will be:
65	*
66	* struct reciprocal_value_adv rvalue;
67	* u8 pre_shift, exp;
68	*
69	* // handle exception case.
70	* if (d >= (1U << 31)) {
71	* result = n >= d;
72	* return;
73	* }
74	*
75	* rvalue = reciprocal_value_adv(d, 32)
76	* exp = rvalue.exp;
77	* if (rvalue.is_wide_m && !(d & 1)) {
78	* // floor(log2(d & (2^32 -d)))
79	* pre_shift = fls(d & -d) - 1;
80	* rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
81	* } else {
82	* pre_shift = 0;
83	* }
84	*
85	* // code generation starts.
86	* if (imm == 1U << exp) {
87	* result = n >> exp;
88	* } else if (rvalue.is_wide_m) {
89	* // pre_shift must be zero when reached here.
90	* t = (n * rvalue.m) >> 32;
91	* result = n - t;
92	* result >>= 1;
93	* result += t;
94	* result >>= rvalue.sh - 1;
95	* } else {
96	* if (pre_shift)
97	* result = n >> pre_shift;
98	* result = ((u64)result * rvalue.m) >> 32;
99	* result >>= rvalue.sh;
100	* }
101	*/
102	struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
103
104	#endif /* _LINUX_RECIPROCAL_DIV_H */
105

source code of linux/include/linux/reciprocal_div.h