1 | //===- BalancedPartitioning.h ---------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements BalancedPartitioning, a recursive balanced graph |
10 | // partitioning algorithm. |
11 | // |
12 | // The algorithm is used to find an ordering of FunctionNodes while optimizing |
13 | // a specified objective. The algorithm uses recursive bisection; it starts |
14 | // with a collection of unordered FunctionNodes and tries to split them into |
15 | // two sets (buckets) of equal cardinality. Each bisection step is comprised of |
16 | // iterations that greedily swap the FunctionNodes between the two buckets while |
17 | // there is an improvement of the objective. Once the process converges, the |
18 | // problem is divided into two sub-problems of half the size, which are |
19 | // recursively applied for the two buckets. The final ordering of the |
20 | // FunctionNodes is obtained by concatenating the two (recursively computed) |
21 | // orderings. |
22 | // |
23 | // In order to speed up the computation, we limit the depth of the recursive |
24 | // tree by a specified constant (SplitDepth) and apply at most a constant |
25 | // number of greedy iterations per split (IterationsPerSplit). The worst-case |
26 | // time complexity of the implementation is bounded by O(M*log^2 N), where |
27 | // N is the number of FunctionNodes and M is the number of |
28 | // FunctionNode-UtilityNode edges; (assuming that any collection of D |
29 | // FunctionNodes contains O(D) UtilityNodes). Notice that the two different |
30 | // recursive sub-problems are independent and thus can be efficiently processed |
31 | // in parallel. |
32 | // |
33 | // Reference: |
34 | // * Optimizing Function Layout for Mobile Applications, |
35 | // https://arxiv.org/abs/2211.09285 |
36 | // |
37 | //===----------------------------------------------------------------------===// |
38 | |
39 | #ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H |
40 | #define LLVM_SUPPORT_BALANCED_PARTITIONING_H |
41 | |
42 | #include "raw_ostream.h" |
43 | #include "llvm/ADT/ArrayRef.h" |
44 | |
45 | #include <atomic> |
46 | #include <condition_variable> |
47 | #include <mutex> |
48 | #include <random> |
49 | #include <vector> |
50 | |
51 | namespace llvm { |
52 | |
53 | class ThreadPoolInterface; |
54 | /// A function with a set of utility nodes where it is beneficial to order two |
55 | /// functions close together if they have similar utility nodes |
56 | class BPFunctionNode { |
57 | friend class BalancedPartitioning; |
58 | |
59 | public: |
60 | using IDT = uint64_t; |
61 | using UtilityNodeT = uint32_t; |
62 | |
63 | /// \param UtilityNodes the set of utility nodes (must be unique'd) |
64 | BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes) |
65 | : Id(Id), UtilityNodes(UtilityNodes) {} |
66 | |
67 | /// The ID of this node |
68 | IDT Id; |
69 | |
70 | void dump(raw_ostream &OS) const; |
71 | |
72 | protected: |
73 | /// The list of utility nodes associated with this node |
74 | SmallVector<UtilityNodeT, 4> UtilityNodes; |
75 | /// The bucket assigned by balanced partitioning |
76 | std::optional<unsigned> Bucket; |
77 | /// The index of the input order of the FunctionNodes |
78 | uint64_t InputOrderIndex = 0; |
79 | |
80 | friend class BPFunctionNodeTest_Basic_Test; |
81 | friend class BalancedPartitioningTest_Basic_Test; |
82 | friend class BalancedPartitioningTest_Large_Test; |
83 | }; |
84 | |
85 | /// Algorithm parameters; default values are tuned on real-world binaries |
86 | struct BalancedPartitioningConfig { |
87 | /// The depth of the recursive bisection |
88 | unsigned SplitDepth = 18; |
89 | /// The maximum number of bp iterations per split |
90 | unsigned IterationsPerSplit = 40; |
91 | /// The probability for a vertex to skip a move from its current bucket to |
92 | /// another bucket; it often helps to escape from a local optima |
93 | float SkipProbability = 0.1f; |
94 | /// Recursive subtasks up to the given depth are added to the queue and |
95 | /// distributed among threads by ThreadPool; all subsequent calls are executed |
96 | /// on the same thread |
97 | unsigned TaskSplitDepth = 9; |
98 | }; |
99 | |
100 | class BalancedPartitioning { |
101 | public: |
102 | BalancedPartitioning(const BalancedPartitioningConfig &Config); |
103 | |
104 | /// Run recursive graph partitioning that optimizes a given objective. |
105 | void run(std::vector<BPFunctionNode> &Nodes) const; |
106 | |
107 | private: |
108 | struct UtilitySignature; |
109 | using SignaturesT = SmallVector<UtilitySignature, 4>; |
110 | using FunctionNodeRange = |
111 | iterator_range<std::vector<BPFunctionNode>::iterator>; |
112 | |
113 | /// A special ThreadPool that allows for spawning new tasks after blocking on |
114 | /// wait(). BalancedPartitioning recursively spawns new threads inside other |
115 | /// threads, so we need to track how many active threads that could spawn more |
116 | /// threads. |
117 | struct BPThreadPool { |
118 | ThreadPoolInterface &TheThreadPool; |
119 | std::mutex mtx; |
120 | std::condition_variable cv; |
121 | /// The number of threads that could spawn more threads |
122 | std::atomic<int> NumActiveThreads = 0; |
123 | /// Only true when all threads are down spawning new threads |
124 | bool IsFinishedSpawning = false; |
125 | /// Asynchronous submission of the task to the pool |
126 | template <typename Func> void async(Func &&F); |
127 | /// Blocking wait for all threads to complete. Unlike ThreadPool, it is |
128 | /// acceptable for other threads to add more tasks while blocking on this |
129 | /// call. |
130 | void wait(); |
131 | BPThreadPool(ThreadPoolInterface &TheThreadPool) |
132 | : TheThreadPool(TheThreadPool) {} |
133 | }; |
134 | |
135 | /// Run a recursive bisection of a given list of FunctionNodes |
136 | /// \param RecDepth the current depth of recursion |
137 | /// \param RootBucket the initial bucket of the dataVertices |
138 | /// \param Offset the assigned buckets are the range [Offset, Offset + |
139 | /// Nodes.size()] |
140 | void bisect(const FunctionNodeRange Nodes, unsigned RecDepth, |
141 | unsigned RootBucket, unsigned Offset, |
142 | std::optional<BPThreadPool> &TP) const; |
143 | |
144 | /// Run bisection iterations |
145 | void runIterations(const FunctionNodeRange Nodes, unsigned LeftBucket, |
146 | unsigned RightBucket, std::mt19937 &RNG) const; |
147 | |
148 | /// Run a bisection iteration to improve the optimization goal |
149 | /// \returns the total number of moved FunctionNodes |
150 | unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket, |
151 | unsigned RightBucket, SignaturesT &Signatures, |
152 | std::mt19937 &RNG) const; |
153 | |
154 | /// Try to move \p N from one bucket to another |
155 | /// \returns true iff \p N is moved |
156 | bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket, |
157 | unsigned RightBucket, SignaturesT &Signatures, |
158 | std::mt19937 &RNG) const; |
159 | |
160 | /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket + |
161 | /// 1 The method is used for an initial assignment before a bisection step |
162 | void split(const FunctionNodeRange Nodes, unsigned StartBucket) const; |
163 | |
164 | /// The cost of the uniform log-gap cost, assuming a utility node has \p X |
165 | /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one. |
166 | float logCost(unsigned X, unsigned Y) const; |
167 | |
168 | float log2Cached(unsigned i) const; |
169 | |
170 | const BalancedPartitioningConfig &Config; |
171 | |
172 | /// Precomputed values of log2(x). Table size is small enough to fit in cache. |
173 | static constexpr unsigned LOG_CACHE_SIZE = 16384; |
174 | float Log2Cache[LOG_CACHE_SIZE]; |
175 | |
176 | /// The signature of a particular utility node used for the bisection step, |
177 | /// i.e., the number of \p FunctionNodes in each of the two buckets |
178 | struct UtilitySignature { |
179 | /// The number of \p FunctionNodes in the left bucket |
180 | unsigned LeftCount = 0; |
181 | /// The number of \p FunctionNodes in the right bucket |
182 | unsigned RightCount = 0; |
183 | /// The cached gain of moving a \p FunctionNode from the left bucket to the |
184 | /// right bucket |
185 | float CachedGainLR; |
186 | /// The cached gain of moving a \p FunctionNode from the right bucket to the |
187 | /// left bucket |
188 | float CachedGainRL; |
189 | /// Whether \p CachedGainLR and \p CachedGainRL are valid |
190 | bool CachedGainIsValid = false; |
191 | }; |
192 | |
193 | protected: |
194 | /// Compute the move gain for uniform log-gap cost |
195 | static float moveGain(const BPFunctionNode &N, bool FromLeftToRight, |
196 | const SignaturesT &Signatures); |
197 | friend class BalancedPartitioningTest_MoveGain_Test; |
198 | }; |
199 | |
200 | } // end namespace llvm |
201 | |
202 | #endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H |
203 | |