1 | //===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H |
10 | #define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H |
11 | #include "CSPreInliner.h" |
12 | #include "ErrorHandling.h" |
13 | #include "PerfReader.h" |
14 | #include "ProfiledBinary.h" |
15 | #include "llvm/IR/DebugInfoMetadata.h" |
16 | #include "llvm/ProfileData/SampleProfWriter.h" |
17 | #include <memory> |
18 | #include <unordered_set> |
19 | |
20 | using namespace llvm; |
21 | using namespace sampleprof; |
22 | |
23 | namespace llvm { |
24 | namespace sampleprof { |
25 | |
26 | using ProbeCounterMap = |
27 | std::unordered_map<const MCDecodedPseudoProbe *, uint64_t>; |
28 | |
29 | // This base class for profile generation of sample-based PGO. We reuse all |
30 | // structures relating to function profiles and profile writers as seen in |
31 | // /ProfileData/SampleProf.h. |
32 | class ProfileGeneratorBase { |
33 | |
34 | public: |
35 | ProfileGeneratorBase(ProfiledBinary *Binary) : Binary(Binary){}; |
36 | ProfileGeneratorBase(ProfiledBinary *Binary, |
37 | const ContextSampleCounterMap *Counters) |
38 | : Binary(Binary), SampleCounters(Counters){}; |
39 | ProfileGeneratorBase(ProfiledBinary *Binary, |
40 | const SampleProfileMap &&Profiles) |
41 | : Binary(Binary), ProfileMap(std::move(Profiles)){}; |
42 | |
43 | virtual ~ProfileGeneratorBase() = default; |
44 | static std::unique_ptr<ProfileGeneratorBase> |
45 | create(ProfiledBinary *Binary, const ContextSampleCounterMap *Counters, |
46 | bool profileIsCS); |
47 | static std::unique_ptr<ProfileGeneratorBase> |
48 | create(ProfiledBinary *Binary, SampleProfileMap &ProfileMap, |
49 | bool profileIsCS); |
50 | virtual void generateProfile() = 0; |
51 | void write(); |
52 | |
53 | static uint32_t |
54 | getDuplicationFactor(unsigned Discriminator, |
55 | bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { |
56 | return UseFSD ? 1 |
57 | : llvm::DILocation::getDuplicationFactorFromDiscriminator( |
58 | D: Discriminator); |
59 | } |
60 | |
61 | static uint32_t |
62 | getBaseDiscriminator(unsigned Discriminator, |
63 | bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { |
64 | return UseFSD ? Discriminator |
65 | : DILocation::getBaseDiscriminatorFromDiscriminator( |
66 | D: Discriminator, /* IsFSDiscriminator */ IsFSDiscriminator: false); |
67 | } |
68 | |
69 | static bool UseFSDiscriminator; |
70 | |
71 | protected: |
72 | // Use SampleProfileWriter to serialize profile map |
73 | void write(std::unique_ptr<SampleProfileWriter> Writer, |
74 | SampleProfileMap &ProfileMap); |
75 | /* |
76 | For each region boundary point, mark if it is begin or end (or both) of |
77 | the region. Boundary points are inclusive. Log the sample count as well |
78 | so we can use it when we compute the sample count of each disjoint region |
79 | later. Note that there might be multiple ranges with different sample |
80 | count that share same begin/end point. We need to accumulate the sample |
81 | count for the boundary point for such case, because for the example |
82 | below, |
83 | |
84 | |<--100-->| |
85 | |<------200------>| |
86 | A B C |
87 | |
88 | sample count for disjoint region [A,B] would be 300. |
89 | */ |
90 | void findDisjointRanges(RangeSample &DisjointRanges, |
91 | const RangeSample &Ranges); |
92 | |
93 | // Go through each address from range to extract the top frame probe by |
94 | // looking up in the Address2ProbeMap |
95 | void (const RangeSample &RangeCounter, |
96 | ProbeCounterMap &ProbeCounter, |
97 | bool FindDisjointRanges = true); |
98 | |
99 | // Helper function for updating body sample for a leaf location in |
100 | // FunctionProfile |
101 | void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, |
102 | const SampleContextFrame &LeafLoc, |
103 | uint64_t Count); |
104 | |
105 | void updateFunctionSamples(); |
106 | |
107 | void updateTotalSamples(); |
108 | |
109 | void updateCallsiteSamples(); |
110 | |
111 | void filterAmbiguousProfile(SampleProfileMap &Profiles); |
112 | |
113 | bool filterAmbiguousProfile(FunctionSamples &FS); |
114 | |
115 | StringRef getCalleeNameForAddress(uint64_t TargetAddress); |
116 | |
117 | void computeSummaryAndThreshold(SampleProfileMap &ProfileMap); |
118 | |
119 | void calculateAndShowDensity(const SampleProfileMap &Profiles); |
120 | |
121 | double calculateDensity(const SampleProfileMap &Profiles, |
122 | uint64_t HotCntThreshold); |
123 | |
124 | void showDensitySuggestion(double Density); |
125 | |
126 | void collectProfiledFunctions(); |
127 | |
128 | bool collectFunctionsFromRawProfile( |
129 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions); |
130 | |
131 | // Collect profiled Functions for llvm sample profile input. |
132 | virtual bool collectFunctionsFromLLVMProfile( |
133 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) = 0; |
134 | |
135 | // List of function prefix to filter out. |
136 | static constexpr const char *FuncPrefixsToFilter[] = {"__cxx_global_var_init" , |
137 | "__tls_init" }; |
138 | |
139 | // Thresholds from profile summary to answer isHotCount/isColdCount queries. |
140 | uint64_t HotCountThreshold; |
141 | |
142 | uint64_t ColdCountThreshold; |
143 | |
144 | ProfiledBinary *Binary = nullptr; |
145 | |
146 | std::unique_ptr<ProfileSummary> Summary; |
147 | |
148 | // Used by SampleProfileWriter |
149 | SampleProfileMap ProfileMap; |
150 | |
151 | const ContextSampleCounterMap *SampleCounters = nullptr; |
152 | }; |
153 | |
154 | class ProfileGenerator : public ProfileGeneratorBase { |
155 | |
156 | public: |
157 | ProfileGenerator(ProfiledBinary *Binary, |
158 | const ContextSampleCounterMap *Counters) |
159 | : ProfileGeneratorBase(Binary, Counters){}; |
160 | ProfileGenerator(ProfiledBinary *Binary, const SampleProfileMap &&Profiles) |
161 | : ProfileGeneratorBase(Binary, std::move(Profiles)){}; |
162 | void generateProfile() override; |
163 | |
164 | private: |
165 | void generateLineNumBasedProfile(); |
166 | void generateProbeBasedProfile(); |
167 | RangeSample preprocessRangeCounter(const RangeSample &RangeCounter); |
168 | FunctionSamples &getTopLevelFunctionProfile(FunctionId FuncName); |
169 | // Helper function to get the leaf frame's FunctionProfile by traversing the |
170 | // inline stack and meanwhile it adds the total samples for each frame's |
171 | // function profile. |
172 | FunctionSamples & |
173 | getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec, |
174 | uint64_t Count); |
175 | void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter); |
176 | void |
177 | populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters); |
178 | void |
179 | populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter); |
180 | void populateBoundarySamplesWithProbesForAllFunctions( |
181 | const BranchSample &BranchCounters); |
182 | void postProcessProfiles(); |
183 | void trimColdProfiles(const SampleProfileMap &Profiles, |
184 | uint64_t ColdCntThreshold); |
185 | bool collectFunctionsFromLLVMProfile( |
186 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override; |
187 | }; |
188 | |
189 | class CSProfileGenerator : public ProfileGeneratorBase { |
190 | public: |
191 | CSProfileGenerator(ProfiledBinary *Binary, |
192 | const ContextSampleCounterMap *Counters) |
193 | : ProfileGeneratorBase(Binary, Counters){}; |
194 | CSProfileGenerator(ProfiledBinary *Binary, SampleProfileMap &Profiles) |
195 | : ProfileGeneratorBase(Binary), ContextTracker(Profiles, nullptr){}; |
196 | void generateProfile() override; |
197 | |
198 | // Trim the context stack at a given depth. |
199 | template <typename T> |
200 | static void trimContext(SmallVectorImpl<T> &S, int Depth = MaxContextDepth) { |
201 | if (Depth < 0 || static_cast<size_t>(Depth) >= S.size()) |
202 | return; |
203 | std::copy(S.begin() + S.size() - static_cast<size_t>(Depth), S.end(), |
204 | S.begin()); |
205 | S.resize(Depth); |
206 | } |
207 | |
208 | // Remove adjacent repeated context sequences up to a given sequence length, |
209 | // -1 means no size limit. Note that repeated sequences are identified based |
210 | // on the exact call site, this is finer granularity than function recursion. |
211 | template <typename T> |
212 | static void compressRecursionContext(SmallVectorImpl<T> &Context, |
213 | int32_t CSize = MaxCompressionSize) { |
214 | uint32_t I = 1; |
215 | uint32_t HS = static_cast<uint32_t>(Context.size() / 2); |
216 | uint32_t MaxDedupSize = |
217 | CSize == -1 ? HS : std::min(a: static_cast<uint32_t>(CSize), b: HS); |
218 | auto BeginIter = Context.begin(); |
219 | // Use an in-place algorithm to save memory copy |
220 | // End indicates the end location of current iteration's data |
221 | uint32_t End = 0; |
222 | // Deduplicate from length 1 to the max possible size of a repeated |
223 | // sequence. |
224 | while (I <= MaxDedupSize) { |
225 | // This is a linear algorithm that deduplicates adjacent repeated |
226 | // sequences of size I. The deduplication detection runs on a sliding |
227 | // window whose size is 2*I and it keeps sliding the window to deduplicate |
228 | // the data inside. Once duplication is detected, deduplicate it by |
229 | // skipping the right half part of the window, otherwise just copy back |
230 | // the new one by appending them at the back of End pointer(for the next |
231 | // iteration). |
232 | // |
233 | // For example: |
234 | // Input: [a1, a2, b1, b2] |
235 | // (Added index to distinguish the same char, the origin is [a, a, b, |
236 | // b], the size of the dedup window is 2(I = 1) at the beginning) |
237 | // |
238 | // 1) The initial status is a dummy window[null, a1], then just copy the |
239 | // right half of the window(End = 0), then slide the window. |
240 | // Result: [a1], a2, b1, b2 (End points to the element right before ], |
241 | // after ] is the data of the previous iteration) |
242 | // |
243 | // 2) Next window is [a1, a2]. Since a1 == a2, then skip the right half of |
244 | // the window i.e the duplication happen. Only slide the window. |
245 | // Result: [a1], a2, b1, b2 |
246 | // |
247 | // 3) Next window is [a2, b1], copy the right half of the window(b1 is |
248 | // new) to the End and slide the window. |
249 | // Result: [a1, b1], b1, b2 |
250 | // |
251 | // 4) Next window is [b1, b2], same to 2), skip b2. |
252 | // Result: [a1, b1], b1, b2 |
253 | // After resize, it will be [a, b] |
254 | |
255 | // Use pointers like below to do comparison inside the window |
256 | // [a b c a b c] |
257 | // | | | | | |
258 | // LeftBoundary Left Right Left+I Right+I |
259 | // A duplication found if Left < LeftBoundry. |
260 | |
261 | int32_t Right = I - 1; |
262 | End = I; |
263 | int32_t LeftBoundary = 0; |
264 | while (Right + I < Context.size()) { |
265 | // To avoids scanning a part of a sequence repeatedly, it finds out |
266 | // the common suffix of two hald in the window. The common suffix will |
267 | // serve as the common prefix of next possible pair of duplicate |
268 | // sequences. The non-common part will be ignored and never scanned |
269 | // again. |
270 | |
271 | // For example. |
272 | // Input: [a, b1], c1, b2, c2 |
273 | // I = 2 |
274 | // |
275 | // 1) For the window [a, b1, c1, b2], non-common-suffix for the right |
276 | // part is 'c1', copy it and only slide the window 1 step. |
277 | // Result: [a, b1, c1], b2, c2 |
278 | // |
279 | // 2) Next window is [b1, c1, b2, c2], so duplication happen. |
280 | // Result after resize: [a, b, c] |
281 | |
282 | int32_t Left = Right; |
283 | while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) { |
284 | // Find the longest suffix inside the window. When stops, Left points |
285 | // at the diverging point in the current sequence. |
286 | Left--; |
287 | } |
288 | |
289 | bool DuplicationFound = (Left < LeftBoundary); |
290 | // Don't need to recheck the data before Right |
291 | LeftBoundary = Right + 1; |
292 | if (DuplicationFound) { |
293 | // Duplication found, skip right half of the window. |
294 | Right += I; |
295 | } else { |
296 | // Copy the non-common-suffix part of the adjacent sequence. |
297 | std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1, |
298 | BeginIter + End); |
299 | End += Left + I - Right; |
300 | // Only slide the window by the size of non-common-suffix |
301 | Right = Left + I; |
302 | } |
303 | } |
304 | // Don't forget the remaining part that's not scanned. |
305 | std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End); |
306 | End += Context.size() - Right - 1; |
307 | I++; |
308 | Context.resize(End); |
309 | MaxDedupSize = std::min(a: static_cast<uint32_t>(End / 2), b: MaxDedupSize); |
310 | } |
311 | } |
312 | |
313 | private: |
314 | void generateLineNumBasedProfile(); |
315 | |
316 | FunctionSamples *getOrCreateFunctionSamples(ContextTrieNode *ContextNode, |
317 | bool WasLeafInlined = false); |
318 | |
319 | // Lookup or create ContextTrieNode for the context, FunctionSamples is |
320 | // created inside this function. |
321 | ContextTrieNode *getOrCreateContextNode(const SampleContextFrames Context, |
322 | bool WasLeafInlined = false); |
323 | |
324 | // For profiled only functions, on-demand compute their inline context |
325 | // function byte size which is used by the pre-inliner. |
326 | void computeSizeForProfiledFunctions(); |
327 | // Post processing for profiles before writing out, such as mermining |
328 | // and trimming cold profiles, running preinliner on profiles. |
329 | void postProcessProfiles(); |
330 | |
331 | void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, |
332 | const RangeSample &RangeCounters); |
333 | |
334 | void populateBoundarySamplesForFunction(ContextTrieNode *CallerNode, |
335 | const BranchSample &BranchCounters); |
336 | |
337 | void populateInferredFunctionSamples(ContextTrieNode &Node); |
338 | |
339 | void updateFunctionSamples(); |
340 | |
341 | void generateProbeBasedProfile(); |
342 | |
343 | // Fill in function body samples from probes |
344 | void populateBodySamplesWithProbes(const RangeSample &RangeCounter, |
345 | const AddrBasedCtxKey *CtxKey); |
346 | // Fill in boundary samples for a call probe |
347 | void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter, |
348 | const AddrBasedCtxKey *CtxKey); |
349 | |
350 | ContextTrieNode * |
351 | getContextNodeForLeafProbe(const AddrBasedCtxKey *CtxKey, |
352 | const MCDecodedPseudoProbe *LeafProbe); |
353 | |
354 | // Helper function to get FunctionSamples for the leaf probe |
355 | FunctionSamples & |
356 | getFunctionProfileForLeafProbe(const AddrBasedCtxKey *CtxKey, |
357 | const MCDecodedPseudoProbe *LeafProbe); |
358 | |
359 | void convertToProfileMap(ContextTrieNode &Node, |
360 | SampleContextFrameVector &Context); |
361 | |
362 | void convertToProfileMap(); |
363 | |
364 | void computeSummaryAndThreshold(); |
365 | |
366 | bool collectFunctionsFromLLVMProfile( |
367 | std::unordered_set<const BinaryFunction *> &ProfiledFunctions) override; |
368 | |
369 | void initializeMissingFrameInferrer(); |
370 | |
371 | // Given an input `Context`, output `NewContext` with inferred missing tail |
372 | // call frames. |
373 | void inferMissingFrames(const SmallVectorImpl<uint64_t> &Context, |
374 | SmallVectorImpl<uint64_t> &NewContext); |
375 | |
376 | ContextTrieNode &getRootContext() { return ContextTracker.getRootContext(); }; |
377 | |
378 | // The container for holding the FunctionSamples used by context trie. |
379 | std::list<FunctionSamples> FSamplesList; |
380 | |
381 | // Underlying context table serves for sample profile writer. |
382 | std::unordered_set<SampleContextFrameVector, SampleContextFrameHash> Contexts; |
383 | |
384 | SampleContextTracker ContextTracker; |
385 | |
386 | bool IsProfileValidOnTrie = true; |
387 | |
388 | public: |
389 | // Deduplicate adjacent repeated context sequences up to a given sequence |
390 | // length. -1 means no size limit. |
391 | static int32_t MaxCompressionSize; |
392 | static int MaxContextDepth; |
393 | }; |
394 | |
395 | } // end namespace sampleprof |
396 | } // end namespace llvm |
397 | |
398 | #endif |
399 | |