1 | //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the SampleProfileLoader transformation. This pass |
10 | // reads a profile file generated by a sampling profiler (e.g. Linux Perf - |
11 | // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the |
12 | // profile information in the given profile. |
13 | // |
14 | // This pass generates branch weight annotations on the IR: |
15 | // |
16 | // - prof: Represents branch weights. This annotation is added to branches |
17 | // to indicate the weights of each edge coming out of the branch. |
18 | // The weight of each edge is the weight of the target block for |
19 | // that edge. The weight of a block B is computed as the maximum |
20 | // number of samples found in B. |
21 | // |
22 | //===----------------------------------------------------------------------===// |
23 | |
24 | #include "llvm/Transforms/IPO/SampleProfile.h" |
25 | #include "llvm/ADT/ArrayRef.h" |
26 | #include "llvm/ADT/DenseMap.h" |
27 | #include "llvm/ADT/DenseSet.h" |
28 | #include "llvm/ADT/MapVector.h" |
29 | #include "llvm/ADT/PriorityQueue.h" |
30 | #include "llvm/ADT/SCCIterator.h" |
31 | #include "llvm/ADT/SmallVector.h" |
32 | #include "llvm/ADT/Statistic.h" |
33 | #include "llvm/ADT/StringMap.h" |
34 | #include "llvm/ADT/StringRef.h" |
35 | #include "llvm/ADT/Twine.h" |
36 | #include "llvm/Analysis/AssumptionCache.h" |
37 | #include "llvm/Analysis/BlockFrequencyInfoImpl.h" |
38 | #include "llvm/Analysis/InlineAdvisor.h" |
39 | #include "llvm/Analysis/InlineCost.h" |
40 | #include "llvm/Analysis/LazyCallGraph.h" |
41 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
42 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
43 | #include "llvm/Analysis/ReplayInlineAdvisor.h" |
44 | #include "llvm/Analysis/TargetLibraryInfo.h" |
45 | #include "llvm/Analysis/TargetTransformInfo.h" |
46 | #include "llvm/IR/BasicBlock.h" |
47 | #include "llvm/IR/DebugLoc.h" |
48 | #include "llvm/IR/DiagnosticInfo.h" |
49 | #include "llvm/IR/Function.h" |
50 | #include "llvm/IR/GlobalValue.h" |
51 | #include "llvm/IR/InstrTypes.h" |
52 | #include "llvm/IR/Instruction.h" |
53 | #include "llvm/IR/Instructions.h" |
54 | #include "llvm/IR/IntrinsicInst.h" |
55 | #include "llvm/IR/LLVMContext.h" |
56 | #include "llvm/IR/MDBuilder.h" |
57 | #include "llvm/IR/Module.h" |
58 | #include "llvm/IR/PassManager.h" |
59 | #include "llvm/IR/ProfDataUtils.h" |
60 | #include "llvm/IR/PseudoProbe.h" |
61 | #include "llvm/IR/ValueSymbolTable.h" |
62 | #include "llvm/ProfileData/InstrProf.h" |
63 | #include "llvm/ProfileData/SampleProf.h" |
64 | #include "llvm/ProfileData/SampleProfReader.h" |
65 | #include "llvm/Support/Casting.h" |
66 | #include "llvm/Support/CommandLine.h" |
67 | #include "llvm/Support/Debug.h" |
68 | #include "llvm/Support/ErrorOr.h" |
69 | #include "llvm/Support/VirtualFileSystem.h" |
70 | #include "llvm/Support/raw_ostream.h" |
71 | #include "llvm/Transforms/IPO.h" |
72 | #include "llvm/Transforms/IPO/ProfiledCallGraph.h" |
73 | #include "llvm/Transforms/IPO/SampleContextTracker.h" |
74 | #include "llvm/Transforms/IPO/SampleProfileProbe.h" |
75 | #include "llvm/Transforms/Instrumentation.h" |
76 | #include "llvm/Transforms/Utils/CallPromotionUtils.h" |
77 | #include "llvm/Transforms/Utils/Cloning.h" |
78 | #include "llvm/Transforms/Utils/MisExpect.h" |
79 | #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" |
80 | #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" |
81 | #include <algorithm> |
82 | #include <cassert> |
83 | #include <cstdint> |
84 | #include <functional> |
85 | #include <limits> |
86 | #include <map> |
87 | #include <memory> |
88 | #include <queue> |
89 | #include <string> |
90 | #include <system_error> |
91 | #include <utility> |
92 | #include <vector> |
93 | |
94 | using namespace llvm; |
95 | using namespace sampleprof; |
96 | using namespace llvm::sampleprofutil; |
97 | using ProfileCount = Function::ProfileCount; |
98 | #define DEBUG_TYPE "sample-profile" |
99 | #define CSINLINE_DEBUG DEBUG_TYPE "-inline" |
100 | |
// Pass-wide statistics, printed with -stats. They record how profile
// annotation matched the CFG and how the sample-loader (FDO/CSSPGO) inliner
// behaved for the current module.
STATISTIC(NumCSInlined,
          "Number of functions inlined with context sensitive profile");
STATISTIC(NumCSNotInlined,
          "Number of functions not inlined with context sensitive profile");
STATISTIC(NumMismatchedProfile,
          "Number of functions with CFG mismatched profile");
STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
STATISTIC(NumDuplicatedInlinesite,
          "Number of inlined callsites with a partial distribution factor");

// Counters for the three ways the priority-based inliner can stop early.
STATISTIC(NumCSInlinedHitMinLimit,
          "Number of functions with FDO inline stopped due to min size limit");
STATISTIC(NumCSInlinedHitMaxLimit,
          "Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
    NumCSInlinedHitGrowthLimit,
    "Number of functions with FDO inline stopped due to growth size limit");
118 | |
119 | // Command line option to specify the file to read samples from. This is |
120 | // mainly used for debugging. |
121 | static cl::opt<std::string> SampleProfileFile( |
122 | "sample-profile-file" , cl::init(Val: "" ), cl::value_desc("filename" ), |
123 | cl::desc("Profile file loaded by -sample-profile" ), cl::Hidden); |
124 | |
125 | // The named file contains a set of transformations that may have been applied |
126 | // to the symbol names between the program from which the sample data was |
127 | // collected and the current program's symbols. |
128 | static cl::opt<std::string> SampleProfileRemappingFile( |
129 | "sample-profile-remapping-file" , cl::init(Val: "" ), cl::value_desc("filename" ), |
130 | cl::desc("Profile remapping file loaded by -sample-profile" ), cl::Hidden); |
131 | |
132 | static cl::opt<bool> SalvageStaleProfile( |
133 | "salvage-stale-profile" , cl::Hidden, cl::init(Val: false), |
134 | cl::desc("Salvage stale profile by fuzzy matching and use the remapped " |
135 | "location for sample profile query." )); |
136 | |
137 | static cl::opt<bool> ReportProfileStaleness( |
138 | "report-profile-staleness" , cl::Hidden, cl::init(Val: false), |
139 | cl::desc("Compute and report stale profile statistical metrics." )); |
140 | |
141 | static cl::opt<bool> PersistProfileStaleness( |
142 | "persist-profile-staleness" , cl::Hidden, cl::init(Val: false), |
143 | cl::desc("Compute stale profile statistical metrics and write it into the " |
144 | "native object file(.llvm_stats section)." )); |
145 | |
146 | static cl::opt<bool> ProfileSampleAccurate( |
147 | "profile-sample-accurate" , cl::Hidden, cl::init(Val: false), |
148 | cl::desc("If the sample profile is accurate, we will mark all un-sampled " |
149 | "callsite and function as having 0 samples. Otherwise, treat " |
150 | "un-sampled callsites and functions conservatively as unknown. " )); |
151 | |
152 | static cl::opt<bool> ProfileSampleBlockAccurate( |
153 | "profile-sample-block-accurate" , cl::Hidden, cl::init(Val: false), |
154 | cl::desc("If the sample profile is accurate, we will mark all un-sampled " |
155 | "branches and calls as having 0 samples. Otherwise, treat " |
156 | "them conservatively as unknown. " )); |
157 | |
158 | static cl::opt<bool> ProfileAccurateForSymsInList( |
159 | "profile-accurate-for-symsinlist" , cl::Hidden, cl::init(Val: true), |
160 | cl::desc("For symbols in profile symbol list, regard their profiles to " |
161 | "be accurate. It may be overriden by profile-sample-accurate. " )); |
162 | |
163 | static cl::opt<bool> ProfileMergeInlinee( |
164 | "sample-profile-merge-inlinee" , cl::Hidden, cl::init(Val: true), |
165 | cl::desc("Merge past inlinee's profile to outline version if sample " |
166 | "profile loader decided not to inline a call site. It will " |
167 | "only be enabled when top-down order of profile loading is " |
168 | "enabled. " )); |
169 | |
170 | static cl::opt<bool> ProfileTopDownLoad( |
171 | "sample-profile-top-down-load" , cl::Hidden, cl::init(Val: true), |
172 | cl::desc("Do profile annotation and inlining for functions in top-down " |
173 | "order of call graph during sample profile loading. It only " |
174 | "works for new pass manager. " )); |
175 | |
176 | static cl::opt<bool> |
177 | UseProfiledCallGraph("use-profiled-call-graph" , cl::init(Val: true), cl::Hidden, |
178 | cl::desc("Process functions in a top-down order " |
179 | "defined by the profiled call graph when " |
180 | "-sample-profile-top-down-load is on." )); |
181 | |
182 | static cl::opt<bool> ProfileSizeInline( |
183 | "sample-profile-inline-size" , cl::Hidden, cl::init(Val: false), |
184 | cl::desc("Inline cold call sites in profile loader if it's beneficial " |
185 | "for code size." )); |
186 | |
187 | // Since profiles are consumed by many passes, turning on this option has |
188 | // side effects. For instance, pre-link SCC inliner would see merged profiles |
189 | // and inline the hot functions (that are skipped in this pass). |
190 | static cl::opt<bool> DisableSampleLoaderInlining( |
191 | "disable-sample-loader-inlining" , cl::Hidden, cl::init(Val: false), |
192 | cl::desc("If true, artifically skip inline transformation in sample-loader " |
193 | "pass, and merge (or scale) profiles (as configured by " |
194 | "--sample-profile-merge-inlinee)." )); |
195 | |
196 | namespace llvm { |
197 | cl::opt<bool> |
198 | SortProfiledSCC("sort-profiled-scc-member" , cl::init(Val: true), cl::Hidden, |
199 | cl::desc("Sort profiled recursion by edge weights." )); |
200 | |
201 | cl::opt<int> ProfileInlineGrowthLimit( |
202 | "sample-profile-inline-growth-limit" , cl::Hidden, cl::init(Val: 12), |
203 | cl::desc("The size growth ratio limit for proirity-based sample profile " |
204 | "loader inlining." )); |
205 | |
206 | cl::opt<int> ProfileInlineLimitMin( |
207 | "sample-profile-inline-limit-min" , cl::Hidden, cl::init(Val: 100), |
208 | cl::desc("The lower bound of size growth limit for " |
209 | "proirity-based sample profile loader inlining." )); |
210 | |
211 | cl::opt<int> ProfileInlineLimitMax( |
212 | "sample-profile-inline-limit-max" , cl::Hidden, cl::init(Val: 10000), |
213 | cl::desc("The upper bound of size growth limit for " |
214 | "proirity-based sample profile loader inlining." )); |
215 | |
216 | cl::opt<int> SampleHotCallSiteThreshold( |
217 | "sample-profile-hot-inline-threshold" , cl::Hidden, cl::init(Val: 3000), |
218 | cl::desc("Hot callsite threshold for proirity-based sample profile loader " |
219 | "inlining." )); |
220 | |
221 | cl::opt<int> SampleColdCallSiteThreshold( |
222 | "sample-profile-cold-inline-threshold" , cl::Hidden, cl::init(Val: 45), |
223 | cl::desc("Threshold for inlining cold callsites" )); |
224 | } // namespace llvm |
225 | |
226 | static cl::opt<unsigned> ProfileICPRelativeHotness( |
227 | "sample-profile-icp-relative-hotness" , cl::Hidden, cl::init(Val: 25), |
228 | cl::desc( |
229 | "Relative hotness percentage threshold for indirect " |
230 | "call promotion in proirity-based sample profile loader inlining." )); |
231 | |
232 | static cl::opt<unsigned> ProfileICPRelativeHotnessSkip( |
233 | "sample-profile-icp-relative-hotness-skip" , cl::Hidden, cl::init(Val: 1), |
234 | cl::desc( |
235 | "Skip relative hotness check for ICP up to given number of targets." )); |
236 | |
237 | static cl::opt<bool> CallsitePrioritizedInline( |
238 | "sample-profile-prioritized-inline" , cl::Hidden, |
239 | |
240 | cl::desc("Use call site prioritized inlining for sample profile loader." |
241 | "Currently only CSSPGO is supported." )); |
242 | |
243 | static cl::opt<bool> UsePreInlinerDecision( |
244 | "sample-profile-use-preinliner" , cl::Hidden, |
245 | |
246 | cl::desc("Use the preinliner decisions stored in profile context." )); |
247 | |
248 | static cl::opt<bool> AllowRecursiveInline( |
249 | "sample-profile-recursive-inline" , cl::Hidden, |
250 | |
251 | cl::desc("Allow sample loader inliner to inline recursive calls." )); |
252 | |
253 | static cl::opt<std::string> ProfileInlineReplayFile( |
254 | "sample-profile-inline-replay" , cl::init(Val: "" ), cl::value_desc("filename" ), |
255 | cl::desc( |
256 | "Optimization remarks file containing inline remarks to be replayed " |
257 | "by inlining from sample profile loader." ), |
258 | cl::Hidden); |
259 | |
260 | static cl::opt<ReplayInlinerSettings::Scope> ProfileInlineReplayScope( |
261 | "sample-profile-inline-replay-scope" , |
262 | cl::init(Val: ReplayInlinerSettings::Scope::Function), |
263 | cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function" , |
264 | "Replay on functions that have remarks associated " |
265 | "with them (default)" ), |
266 | clEnumValN(ReplayInlinerSettings::Scope::Module, "Module" , |
267 | "Replay on the entire module" )), |
268 | cl::desc("Whether inline replay should be applied to the entire " |
269 | "Module or just the Functions (default) that are present as " |
270 | "callers in remarks during sample profile inlining." ), |
271 | cl::Hidden); |
272 | |
273 | static cl::opt<ReplayInlinerSettings::Fallback> ProfileInlineReplayFallback( |
274 | "sample-profile-inline-replay-fallback" , |
275 | cl::init(Val: ReplayInlinerSettings::Fallback::Original), |
276 | cl::values( |
277 | clEnumValN( |
278 | ReplayInlinerSettings::Fallback::Original, "Original" , |
279 | "All decisions not in replay send to original advisor (default)" ), |
280 | clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, |
281 | "AlwaysInline" , "All decisions not in replay are inlined" ), |
282 | clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline" , |
283 | "All decisions not in replay are not inlined" )), |
284 | cl::desc("How sample profile inline replay treats sites that don't come " |
285 | "from the replay. Original: defers to original advisor, " |
286 | "AlwaysInline: inline all sites not in replay, NeverInline: " |
287 | "inline no sites not in replay" ), |
288 | cl::Hidden); |
289 | |
290 | static cl::opt<CallSiteFormat::Format> ProfileInlineReplayFormat( |
291 | "sample-profile-inline-replay-format" , |
292 | cl::init(Val: CallSiteFormat::Format::LineColumnDiscriminator), |
293 | cl::values( |
294 | clEnumValN(CallSiteFormat::Format::Line, "Line" , "<Line Number>" ), |
295 | clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn" , |
296 | "<Line Number>:<Column Number>" ), |
297 | clEnumValN(CallSiteFormat::Format::LineDiscriminator, |
298 | "LineDiscriminator" , "<Line Number>.<Discriminator>" ), |
299 | clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, |
300 | "LineColumnDiscriminator" , |
301 | "<Line Number>:<Column Number>.<Discriminator> (default)" )), |
302 | cl::desc("How sample profile inline replay file is formatted" ), cl::Hidden); |
303 | |
304 | static cl::opt<unsigned> |
305 | MaxNumPromotions("sample-profile-icp-max-prom" , cl::init(Val: 3), cl::Hidden, |
306 | cl::desc("Max number of promotions for a single indirect " |
307 | "call callsite in sample profile loader" )); |
308 | |
309 | static cl::opt<bool> OverwriteExistingWeights( |
310 | "overwrite-existing-weights" , cl::Hidden, cl::init(Val: false), |
311 | cl::desc("Ignore existing branch weights on IR and always overwrite." )); |
312 | |
313 | static cl::opt<bool> AnnotateSampleProfileInlinePhase( |
314 | "annotate-sample-profile-inline-phase" , cl::Hidden, cl::init(Val: false), |
315 | cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " |
316 | "sample-profile inline pass name." )); |
317 | |
318 | namespace llvm { |
319 | extern cl::opt<bool> EnableExtTspBlockPlacement; |
320 | } |
321 | |
322 | namespace { |
323 | |
324 | using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; |
325 | using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>; |
326 | using Edge = std::pair<const BasicBlock *, const BasicBlock *>; |
327 | using EdgeWeightMap = DenseMap<Edge, uint64_t>; |
328 | using BlockEdgeMap = |
329 | DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>; |
330 | |
331 | class GUIDToFuncNameMapper { |
332 | public: |
333 | GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader, |
334 | DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap) |
335 | : CurrentReader(Reader), CurrentModule(M), |
336 | CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) { |
337 | if (!CurrentReader.useMD5()) |
338 | return; |
339 | |
340 | for (const auto &F : CurrentModule) { |
341 | StringRef OrigName = F.getName(); |
342 | CurrentGUIDToFuncNameMap.insert( |
343 | KV: {Function::getGUID(GlobalName: OrigName), OrigName}); |
344 | |
345 | // Local to global var promotion used by optimization like thinlto |
346 | // will rename the var and add suffix like ".llvm.xxx" to the |
347 | // original local name. In sample profile, the suffixes of function |
348 | // names are all stripped. Since it is possible that the mapper is |
349 | // built in post-thin-link phase and var promotion has been done, |
350 | // we need to add the substring of function name without the suffix |
351 | // into the GUIDToFuncNameMap. |
352 | StringRef CanonName = FunctionSamples::getCanonicalFnName(F); |
353 | if (CanonName != OrigName) |
354 | CurrentGUIDToFuncNameMap.insert( |
355 | KV: {Function::getGUID(GlobalName: CanonName), CanonName}); |
356 | } |
357 | |
358 | // Update GUIDToFuncNameMap for each function including inlinees. |
359 | SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap); |
360 | } |
361 | |
362 | ~GUIDToFuncNameMapper() { |
363 | if (!CurrentReader.useMD5()) |
364 | return; |
365 | |
366 | CurrentGUIDToFuncNameMap.clear(); |
367 | |
368 | // Reset GUIDToFuncNameMap for of each function as they're no |
369 | // longer valid at this point. |
370 | SetGUIDToFuncNameMapForAll(nullptr); |
371 | } |
372 | |
373 | private: |
374 | void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) { |
375 | std::queue<FunctionSamples *> FSToUpdate; |
376 | for (auto &IFS : CurrentReader.getProfiles()) { |
377 | FSToUpdate.push(x: &IFS.second); |
378 | } |
379 | |
380 | while (!FSToUpdate.empty()) { |
381 | FunctionSamples *FS = FSToUpdate.front(); |
382 | FSToUpdate.pop(); |
383 | FS->GUIDToFuncNameMap = Map; |
384 | for (const auto &ICS : FS->getCallsiteSamples()) { |
385 | const FunctionSamplesMap &FSMap = ICS.second; |
386 | for (const auto &IFS : FSMap) { |
387 | FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second); |
388 | FSToUpdate.push(x: &FS); |
389 | } |
390 | } |
391 | } |
392 | } |
393 | |
394 | SampleProfileReader &CurrentReader; |
395 | Module &CurrentModule; |
396 | DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap; |
397 | }; |
398 | |
399 | // Inline candidate used by iterative callsite prioritized inliner |
// Inline candidate used by the iterative callsite-prioritized inliner.
struct InlineCandidate {
  // The call/invoke instruction being considered for inlining.
  CallBase *CallInstr;
  // Profile samples of the callee at this call site (may describe a nested
  // inlinee context).
  const FunctionSamples *CalleeSamples;
  // Prorated callsite count, which will be used to guide inlining. For example,
  // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
  // copies will get their own distribution factors and their prorated counts
  // will be used to decide if they should be inlined independently.
  uint64_t CallsiteCount;
  // Call site distribution factor to prorate the profile samples for a
  // duplicated callsite. Default value is 1.0.
  float CallsiteDistribution;
};
412 | |
413 | // Inline candidate comparer using call site weight |
414 | struct CandidateComparer { |
415 | bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) { |
416 | if (LHS.CallsiteCount != RHS.CallsiteCount) |
417 | return LHS.CallsiteCount < RHS.CallsiteCount; |
418 | |
419 | const FunctionSamples *LCS = LHS.CalleeSamples; |
420 | const FunctionSamples *RCS = RHS.CalleeSamples; |
421 | assert(LCS && RCS && "Expect non-null FunctionSamples" ); |
422 | |
423 | // Tie breaker using number of samples try to favor smaller functions first |
424 | if (LCS->getBodySamples().size() != RCS->getBodySamples().size()) |
425 | return LCS->getBodySamples().size() > RCS->getBodySamples().size(); |
426 | |
427 | // Tie breaker using GUID so we have stable/deterministic inlining order |
428 | return LCS->getGUID() < RCS->getGUID(); |
429 | } |
430 | }; |
431 | |
432 | using CandidateQueue = |
433 | PriorityQueue<InlineCandidate, std::vector<InlineCandidate>, |
434 | CandidateComparer>; |
435 | |
436 | // Sample profile matching - fuzzy match. |
437 | class SampleProfileMatcher { |
438 | Module &M; |
439 | SampleProfileReader &Reader; |
440 | const PseudoProbeManager *ProbeManager; |
441 | SampleProfileMap FlattenedProfiles; |
442 | // For each function, the matcher generates a map, of which each entry is a |
443 | // mapping from the source location of current build to the source location in |
444 | // the profile. |
445 | StringMap<LocToLocMap> FuncMappings; |
446 | |
447 | // Profile mismatching statstics. |
448 | uint64_t TotalProfiledCallsites = 0; |
449 | uint64_t NumMismatchedCallsites = 0; |
450 | uint64_t MismatchedCallsiteSamples = 0; |
451 | uint64_t TotalCallsiteSamples = 0; |
452 | uint64_t TotalProfiledFunc = 0; |
453 | uint64_t NumMismatchedFuncHash = 0; |
454 | uint64_t MismatchedFuncHashSamples = 0; |
455 | uint64_t TotalFuncHashSamples = 0; |
456 | |
457 | // A dummy name for unknown indirect callee, used to differentiate from a |
458 | // non-call instruction that also has an empty callee name. |
459 | static constexpr const char *UnknownIndirectCallee = |
460 | "unknown.indirect.callee" ; |
461 | |
462 | public: |
463 | SampleProfileMatcher(Module &M, SampleProfileReader &Reader, |
464 | const PseudoProbeManager *ProbeManager) |
465 | : M(M), Reader(Reader), ProbeManager(ProbeManager){}; |
466 | void runOnModule(); |
467 | |
468 | private: |
469 | FunctionSamples *getFlattenedSamplesFor(const Function &F) { |
470 | StringRef CanonFName = FunctionSamples::getCanonicalFnName(F); |
471 | auto It = FlattenedProfiles.find(Ctx: FunctionId(CanonFName)); |
472 | if (It != FlattenedProfiles.end()) |
473 | return &It->second; |
474 | return nullptr; |
475 | } |
476 | void runOnFunction(const Function &F); |
477 | void findIRAnchors(const Function &F, |
478 | std::map<LineLocation, StringRef> &IRAnchors); |
479 | void findProfileAnchors( |
480 | const FunctionSamples &FS, |
481 | std::map<LineLocation, std::unordered_set<FunctionId>> |
482 | &ProfileAnchors); |
483 | void countMismatchedSamples(const FunctionSamples &FS); |
484 | void countProfileMismatches( |
485 | const Function &F, const FunctionSamples &FS, |
486 | const std::map<LineLocation, StringRef> &IRAnchors, |
487 | const std::map<LineLocation, std::unordered_set<FunctionId>> |
488 | &ProfileAnchors); |
489 | void countProfileCallsiteMismatches( |
490 | const FunctionSamples &FS, |
491 | const std::map<LineLocation, StringRef> &IRAnchors, |
492 | const std::map<LineLocation, std::unordered_set<FunctionId>> |
493 | &ProfileAnchors, |
494 | uint64_t &FuncMismatchedCallsites, uint64_t &FuncProfiledCallsites); |
495 | LocToLocMap &getIRToProfileLocationMap(const Function &F) { |
496 | auto Ret = FuncMappings.try_emplace( |
497 | Key: FunctionSamples::getCanonicalFnName(FnName: F.getName()), Args: LocToLocMap()); |
498 | return Ret.first->second; |
499 | } |
500 | void distributeIRToProfileLocationMap(); |
501 | void distributeIRToProfileLocationMap(FunctionSamples &FS); |
502 | void runStaleProfileMatching( |
503 | const Function &F, const std::map<LineLocation, StringRef> &IRAnchors, |
504 | const std::map<LineLocation, std::unordered_set<FunctionId>> |
505 | &ProfileAnchors, |
506 | LocToLocMap &IRToProfileLocationMap); |
507 | }; |
508 | |
509 | /// Sample profile pass. |
510 | /// |
511 | /// This pass reads profile data from the file specified by |
512 | /// -sample-profile-file and annotates every affected function with the |
513 | /// profile information found in that file. |
514 | class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> { |
515 | public: |
516 | SampleProfileLoader( |
517 | StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase, |
518 | IntrusiveRefCntPtr<vfs::FileSystem> FS, |
519 | std::function<AssumptionCache &(Function &)> GetAssumptionCache, |
520 | std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo, |
521 | std::function<const TargetLibraryInfo &(Function &)> GetTLI) |
522 | : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName), |
523 | std::move(FS)), |
524 | GetAC(std::move(GetAssumptionCache)), |
525 | GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)), |
526 | LTOPhase(LTOPhase), |
527 | AnnotatedPassName(AnnotateSampleProfileInlinePhase |
528 | ? llvm::AnnotateInlinePassName(IC: InlineContext{ |
529 | .LTOPhase: LTOPhase, .Pass: InlinePass::SampleProfileInliner}) |
530 | : CSINLINE_DEBUG) {} |
531 | |
532 | bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr); |
533 | bool runOnModule(Module &M, ModuleAnalysisManager *AM, |
534 | ProfileSummaryInfo *_PSI, LazyCallGraph &CG); |
535 | |
536 | protected: |
537 | bool runOnFunction(Function &F, ModuleAnalysisManager *AM); |
538 | bool emitAnnotations(Function &F); |
539 | ErrorOr<uint64_t> getInstWeight(const Instruction &I) override; |
540 | const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const; |
541 | const FunctionSamples * |
542 | findFunctionSamples(const Instruction &I) const override; |
543 | std::vector<const FunctionSamples *> |
544 | findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; |
545 | void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples, |
546 | DenseSet<GlobalValue::GUID> &InlinedGUIDs, |
547 | uint64_t Threshold); |
548 | // Attempt to promote indirect call and also inline the promoted call |
549 | bool tryPromoteAndInlineCandidate( |
550 | Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, |
551 | uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); |
552 | |
553 | bool inlineHotFunctions(Function &F, |
554 | DenseSet<GlobalValue::GUID> &InlinedGUIDs); |
555 | std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB); |
556 | bool getExternalInlineAdvisorShouldInline(CallBase &CB); |
557 | InlineCost shouldInlineCandidate(InlineCandidate &Candidate); |
558 | bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB); |
559 | bool |
560 | tryInlineCandidate(InlineCandidate &Candidate, |
561 | SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); |
562 | bool |
563 | inlineHotFunctionsWithPriority(Function &F, |
564 | DenseSet<GlobalValue::GUID> &InlinedGUIDs); |
565 | // Inline cold/small functions in addition to hot ones |
566 | bool shouldInlineColdCallee(CallBase &CallInst); |
567 | void emitOptimizationRemarksForInlineCandidates( |
568 | const SmallVectorImpl<CallBase *> &Candidates, const Function &F, |
569 | bool Hot); |
570 | void promoteMergeNotInlinedContextSamples( |
571 | MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites, |
572 | const Function &F); |
573 | std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG); |
574 | std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M); |
575 | void generateMDProfMetadata(Function &F); |
576 | |
577 | /// Map from function name to Function *. Used to find the function from |
578 | /// the function name. If the function name contains suffix, additional |
579 | /// entry is added to map from the stripped name to the function if there |
580 | /// is one-to-one mapping. |
581 | HashKeyMap<std::unordered_map, FunctionId, Function *> SymbolMap; |
582 | |
583 | std::function<AssumptionCache &(Function &)> GetAC; |
584 | std::function<TargetTransformInfo &(Function &)> GetTTI; |
585 | std::function<const TargetLibraryInfo &(Function &)> GetTLI; |
586 | |
587 | /// Profile tracker for different context. |
588 | std::unique_ptr<SampleContextTracker> ContextTracker; |
589 | |
590 | /// Flag indicating which LTO/ThinLTO phase the pass is invoked in. |
591 | /// |
592 | /// We need to know the LTO phase because for example in ThinLTOPrelink |
593 | /// phase, in annotation, we should not promote indirect calls. Instead, |
594 | /// we will mark GUIDs that needs to be annotated to the function. |
595 | const ThinOrFullLTOPhase LTOPhase; |
596 | const std::string AnnotatedPassName; |
597 | |
598 | /// Profle Symbol list tells whether a function name appears in the binary |
599 | /// used to generate the current profile. |
600 | std::unique_ptr<ProfileSymbolList> PSL; |
601 | |
602 | /// Total number of samples collected in this profile. |
603 | /// |
604 | /// This is the sum of all the samples collected in all the functions executed |
605 | /// at runtime. |
606 | uint64_t TotalCollectedSamples = 0; |
607 | |
608 | // Information recorded when we declined to inline a call site |
609 | // because we have determined it is too cold is accumulated for |
610 | // each callee function. Initially this is just the entry count. |
611 | struct NotInlinedProfileInfo { |
612 | uint64_t entryCount; |
613 | }; |
614 | DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo; |
615 | |
616 | // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for |
617 | // all the function symbols defined or declared in current module. |
618 | DenseMap<uint64_t, StringRef> GUIDToFuncNameMap; |
619 | |
620 | // All the Names used in FunctionSamples including outline function |
621 | // names, inline instance names and call target names. |
622 | StringSet<> NamesInProfile; |
623 | // MD5 version of NamesInProfile. Either NamesInProfile or GUIDsInProfile is |
624 | // populated, depends on whether the profile uses MD5. Because the name table |
625 | // generally contains several magnitude more entries than the number of |
626 | // functions, we do not want to convert all names from one form to another. |
627 | llvm::DenseSet<uint64_t> GUIDsInProfile; |
628 | |
629 | // For symbol in profile symbol list, whether to regard their profiles |
630 | // to be accurate. It is mainly decided by existance of profile symbol |
631 | // list and -profile-accurate-for-symsinlist flag, but it can be |
632 | // overriden by -profile-sample-accurate or profile-sample-accurate |
633 | // attribute. |
634 | bool ProfAccForSymsInList; |
635 | |
636 | // External inline advisor used to replay inline decision from remarks. |
637 | std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor; |
638 | |
639 | // A helper to implement the sample profile matching algorithm. |
640 | std::unique_ptr<SampleProfileMatcher> MatchingManager; |
641 | |
642 | private: |
643 | const char *() const { |
644 | return AnnotatedPassName.c_str(); |
645 | } |
646 | }; |
647 | } // end anonymous namespace |
648 | |
649 | namespace llvm { |
650 | template <> |
651 | inline bool SampleProfileInference<Function>::isExit(const BasicBlock *BB) { |
652 | return succ_empty(BB); |
653 | } |
654 | |
655 | template <> |
656 | inline void SampleProfileInference<Function>::findUnlikelyJumps( |
657 | const std::vector<const BasicBlockT *> &BasicBlocks, |
658 | BlockEdgeMap &Successors, FlowFunction &Func) { |
659 | for (auto &Jump : Func.Jumps) { |
660 | const auto *BB = BasicBlocks[Jump.Source]; |
661 | const auto *Succ = BasicBlocks[Jump.Target]; |
662 | const Instruction *TI = BB->getTerminator(); |
663 | // Check if a block ends with InvokeInst and mark non-taken branch unlikely. |
664 | // In that case block Succ should be a landing pad |
665 | if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) { |
666 | if (isa<InvokeInst>(Val: TI)) { |
667 | Jump.IsUnlikely = true; |
668 | } |
669 | } |
670 | const Instruction *SuccTI = Succ->getTerminator(); |
671 | // Check if the target block contains UnreachableInst and mark it unlikely |
672 | if (SuccTI->getNumSuccessors() == 0) { |
673 | if (isa<UnreachableInst>(Val: SuccTI)) { |
674 | Jump.IsUnlikely = true; |
675 | } |
676 | } |
677 | } |
678 | } |
679 | |
680 | template <> |
681 | void SampleProfileLoaderBaseImpl<Function>::computeDominanceAndLoopInfo( |
682 | Function &F) { |
683 | DT.reset(p: new DominatorTree); |
684 | DT->recalculate(Func&: F); |
685 | |
686 | PDT.reset(p: new PostDominatorTree(F)); |
687 | |
688 | LI.reset(p: new LoopInfo); |
689 | LI->analyze(DomTree: *DT); |
690 | } |
691 | } // namespace llvm |
692 | |
693 | ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { |
694 | if (FunctionSamples::ProfileIsProbeBased) |
695 | return getProbeWeight(Inst); |
696 | |
697 | const DebugLoc &DLoc = Inst.getDebugLoc(); |
698 | if (!DLoc) |
699 | return std::error_code(); |
700 | |
701 | // Ignore all intrinsics, phinodes and branch instructions. |
702 | // Branch and phinodes instruction usually contains debug info from sources |
703 | // outside of the residing basic block, thus we ignore them during annotation. |
704 | if (isa<BranchInst>(Val: Inst) || isa<IntrinsicInst>(Val: Inst) || isa<PHINode>(Val: Inst)) |
705 | return std::error_code(); |
706 | |
707 | // For non-CS profile, if a direct call/invoke instruction is inlined in |
708 | // profile (findCalleeFunctionSamples returns non-empty result), but not |
709 | // inlined here, it means that the inlined callsite has no sample, thus the |
710 | // call instruction should have 0 count. |
711 | // For CS profile, the callsite count of previously inlined callees is |
712 | // populated with the entry count of the callees. |
713 | if (!FunctionSamples::ProfileIsCS) |
714 | if (const auto *CB = dyn_cast<CallBase>(Val: &Inst)) |
715 | if (!CB->isIndirectCall() && findCalleeFunctionSamples(I: *CB)) |
716 | return 0; |
717 | |
718 | return getInstWeightImpl(Inst); |
719 | } |
720 | |
721 | /// Get the FunctionSamples for a call instruction. |
722 | /// |
723 | /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined |
724 | /// instance in which that call instruction is calling to. It contains |
725 | /// all samples that resides in the inlined instance. We first find the |
726 | /// inlined instance in which the call instruction is from, then we |
727 | /// traverse its children to find the callsite with the matching |
728 | /// location. |
729 | /// |
730 | /// \param Inst Call/Invoke instruction to query. |
731 | /// |
732 | /// \returns The FunctionSamples pointer to the inlined instance. |
733 | const FunctionSamples * |
734 | SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { |
735 | const DILocation *DIL = Inst.getDebugLoc(); |
736 | if (!DIL) { |
737 | return nullptr; |
738 | } |
739 | |
740 | StringRef CalleeName; |
741 | if (Function *Callee = Inst.getCalledFunction()) |
742 | CalleeName = Callee->getName(); |
743 | |
744 | if (FunctionSamples::ProfileIsCS) |
745 | return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName); |
746 | |
747 | const FunctionSamples *FS = findFunctionSamples(I: Inst); |
748 | if (FS == nullptr) |
749 | return nullptr; |
750 | |
751 | return FS->findFunctionSamplesAt(Loc: FunctionSamples::getCallSiteIdentifier(DIL), |
752 | CalleeName, Remapper: Reader->getRemapper()); |
753 | } |
754 | |
755 | /// Returns a vector of FunctionSamples that are the indirect call targets |
756 | /// of \p Inst. The vector is sorted by the total number of samples. Stores |
757 | /// the total call count of the indirect call in \p Sum. |
758 | std::vector<const FunctionSamples *> |
759 | SampleProfileLoader::findIndirectCallFunctionSamples( |
760 | const Instruction &Inst, uint64_t &Sum) const { |
761 | const DILocation *DIL = Inst.getDebugLoc(); |
762 | std::vector<const FunctionSamples *> R; |
763 | |
764 | if (!DIL) { |
765 | return R; |
766 | } |
767 | |
768 | auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) { |
769 | assert(L && R && "Expect non-null FunctionSamples" ); |
770 | if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate()) |
771 | return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate(); |
772 | return L->getGUID() < R->getGUID(); |
773 | }; |
774 | |
775 | if (FunctionSamples::ProfileIsCS) { |
776 | auto CalleeSamples = |
777 | ContextTracker->getIndirectCalleeContextSamplesFor(DIL); |
778 | if (CalleeSamples.empty()) |
779 | return R; |
780 | |
781 | // For CSSPGO, we only use target context profile's entry count |
782 | // as that already includes both inlined callee and non-inlined ones.. |
783 | Sum = 0; |
784 | for (const auto *const FS : CalleeSamples) { |
785 | Sum += FS->getHeadSamplesEstimate(); |
786 | R.push_back(x: FS); |
787 | } |
788 | llvm::sort(C&: R, Comp: FSCompare); |
789 | return R; |
790 | } |
791 | |
792 | const FunctionSamples *FS = findFunctionSamples(I: Inst); |
793 | if (FS == nullptr) |
794 | return R; |
795 | |
796 | auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL); |
797 | Sum = 0; |
798 | if (auto T = FS->findCallTargetMapAt(CallSite)) |
799 | for (const auto &T_C : *T) |
800 | Sum += T_C.second; |
801 | if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(Loc: CallSite)) { |
802 | if (M->empty()) |
803 | return R; |
804 | for (const auto &NameFS : *M) { |
805 | Sum += NameFS.second.getHeadSamplesEstimate(); |
806 | R.push_back(x: &NameFS.second); |
807 | } |
808 | llvm::sort(C&: R, Comp: FSCompare); |
809 | } |
810 | return R; |
811 | } |
812 | |
813 | const FunctionSamples * |
814 | SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { |
815 | if (FunctionSamples::ProfileIsProbeBased) { |
816 | std::optional<PseudoProbe> Probe = extractProbe(Inst); |
817 | if (!Probe) |
818 | return nullptr; |
819 | } |
820 | |
821 | const DILocation *DIL = Inst.getDebugLoc(); |
822 | if (!DIL) |
823 | return Samples; |
824 | |
825 | auto it = DILocation2SampleMap.try_emplace(Key: DIL,Args: nullptr); |
826 | if (it.second) { |
827 | if (FunctionSamples::ProfileIsCS) |
828 | it.first->second = ContextTracker->getContextSamplesFor(DIL); |
829 | else |
830 | it.first->second = |
831 | Samples->findFunctionSamples(DIL, Remapper: Reader->getRemapper()); |
832 | } |
833 | return it.first->second; |
834 | } |
835 | |
836 | /// Check whether the indirect call promotion history of \p Inst allows |
837 | /// the promotion for \p Candidate. |
838 | /// If the profile count for the promotion candidate \p Candidate is |
839 | /// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted |
840 | /// for \p Inst. If we already have at least MaxNumPromotions |
841 | /// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we |
842 | /// cannot promote for \p Inst anymore. |
843 | static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) { |
844 | uint32_t NumVals = 0; |
845 | uint64_t TotalCount = 0; |
846 | std::unique_ptr<InstrProfValueData[]> ValueData = |
847 | std::make_unique<InstrProfValueData[]>(num: MaxNumPromotions); |
848 | bool Valid = |
849 | getValueProfDataFromInst(Inst, ValueKind: IPVK_IndirectCallTarget, MaxNumValueData: MaxNumPromotions, |
850 | ValueData: ValueData.get(), ActualNumValueData&: NumVals, TotalC&: TotalCount, GetNoICPValue: true); |
851 | // No valid value profile so no promoted targets have been recorded |
852 | // before. Ok to do ICP. |
853 | if (!Valid) |
854 | return true; |
855 | |
856 | unsigned NumPromoted = 0; |
857 | for (uint32_t I = 0; I < NumVals; I++) { |
858 | if (ValueData[I].Count != NOMORE_ICP_MAGICNUM) |
859 | continue; |
860 | |
861 | // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the |
862 | // metadata, it means the candidate has been promoted for this |
863 | // indirect call. |
864 | if (ValueData[I].Value == Function::getGUID(GlobalName: Candidate)) |
865 | return false; |
866 | NumPromoted++; |
867 | // If already have MaxNumPromotions promotion, don't do it anymore. |
868 | if (NumPromoted == MaxNumPromotions) |
869 | return false; |
870 | } |
871 | return true; |
872 | } |
873 | |
/// Update indirect call target profile metadata for \p Inst.
///
/// Usually \p Sum is the sum of counts of all the targets for \p Inst.
/// If it is 0, updateIDTMetaData is being used only to mark a certain
/// target as already promoted (\p CallTargets then holds exactly one entry
/// whose count is NOMORE_ICP_MAGICNUM). If it is not zero, it is used to
/// update the total count in the value profile.
static void
updateIDTMetaData(Instruction &Inst,
                  const SmallVectorImpl<InstrProfValueData> &CallTargets,
                  uint64_t Sum) {
  // Bail out early if MaxNumPromotions is zero.
  // This prevents allocating an array of zero length below.
  //
  // Note `updateIDTMetaData` is called in two places so check
  // `MaxNumPromotions` inside it.
  if (MaxNumPromotions == 0)
    return;
  uint32_t NumVals = 0;
  // OldSum is the existing total count in the value profile data.
  uint64_t OldSum = 0;
  std::unique_ptr<InstrProfValueData[]> ValueData =
      std::make_unique<InstrProfValueData[]>(num: MaxNumPromotions);
  bool Valid =
      getValueProfDataFromInst(Inst, ValueKind: IPVK_IndirectCallTarget, MaxNumValueData: MaxNumPromotions,
                               ValueData: ValueData.get(), ActualNumValueData&: NumVals, TotalC&: OldSum, GetNoICPValue: true);

  // Merge the incoming CallTargets with the existing value profile into
  // ValueCountMap, adjusting Sum as promoted targets are discovered.
  DenseMap<uint64_t, uint64_t> ValueCountMap;
  if (Sum == 0) {
    assert((CallTargets.size() == 1 &&
            CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
           "If sum is 0, assume only one element in CallTargets "
           "with count being NOMORE_ICP_MAGICNUM" );
    // Initialize ValueCountMap with existing value profile data.
    if (Valid) {
      for (uint32_t I = 0; I < NumVals; I++)
        ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
    }
    auto Pair =
        ValueCountMap.try_emplace(Key: CallTargets[0].Value, Args: CallTargets[0].Count);
    // If the target already exists in value profile, decrease the total
    // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
    if (!Pair.second) {
      OldSum -= Pair.first->second;
      Pair.first->second = NOMORE_ICP_MAGICNUM;
    }
    Sum = OldSum;
  } else {
    // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
    // counts in the value profile.
    if (Valid) {
      for (uint32_t I = 0; I < NumVals; I++) {
        if (ValueData[I].Count == NOMORE_ICP_MAGICNUM)
          ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
      }
    }

    for (const auto &Data : CallTargets) {
      auto Pair = ValueCountMap.try_emplace(Key: Data.Value, Args: Data.Count);
      if (Pair.second)
        continue;
      // The target represented by Data.Value has already been promoted.
      // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
      // Sum by Data.Count.
      assert(Sum >= Data.Count && "Sum should never be less than Data.Count" );
      Sum -= Data.Count;
    }
  }

  // Re-annotate the instruction with the merged targets, sorted by
  // descending count (ties broken by descending value for determinism).
  SmallVector<InstrProfValueData, 8> NewCallTargets;
  for (const auto &ValueCount : ValueCountMap) {
    NewCallTargets.emplace_back(
        Args: InstrProfValueData{.Value: ValueCount.first, .Count: ValueCount.second});
  }

  llvm::sort(C&: NewCallTargets,
             Comp: [](const InstrProfValueData &L, const InstrProfValueData &R) {
               if (L.Count != R.Count)
                 return L.Count > R.Count;
               return L.Value > R.Value;
             });

  uint32_t MaxMDCount =
      std::min(a: NewCallTargets.size(), b: static_cast<size_t>(MaxNumPromotions));
  annotateValueSite(M&: *Inst.getParent()->getParent()->getParent(), Inst,
                    VDs: NewCallTargets, Sum, ValueKind: IPVK_IndirectCallTarget, MaxMDCount);
}
959 | |
/// Attempt to promote indirect call and also inline the promoted call.
///
/// \param F Caller function.
/// \param Candidate ICP and inline candidate.
/// \param SumOrigin Original sum of target counts for indirect call before
/// promoting given candidate.
/// \param Sum Prorated sum of remaining target counts for indirect call
/// after promoting given candidate.
/// \param InlinedCallSite Output vector for new call sites exposed after
/// inlining.
///
/// \returns true only if the candidate was both promoted and inlined.
bool SampleProfileLoader::tryPromoteAndInlineCandidate(
    Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
    SmallVector<CallBase *, 8> *InlinedCallSite) {
  // Bail out early if sample-loader inliner is disabled.
  if (DisableSampleLoaderInlining)
    return false;

  // Bail out early if MaxNumPromotions is zero.
  // This prevents allocating an array of zero length in callees below.
  if (MaxNumPromotions == 0)
    return false;
  // The promotion target must resolve to a known symbol.
  auto CalleeFunctionName = Candidate.CalleeSamples->getFunction();
  auto R = SymbolMap.find(Key: CalleeFunctionName);
  if (R == SymbolMap.end() || !R->second)
    return false;

  // Respect promotion decisions already recorded in the value profile.
  auto &CI = *Candidate.CallInstr;
  if (!doesHistoryAllowICP(Inst: CI, Candidate: R->second->getName()))
    return false;

  const char *Reason = "Callee function not available" ;
  // R->second != &F is to prevent promoting a recursive call.
  // If it is a recursive call, we do not inline it as it could bloat
  // the code exponentially. There are ways to handle this better, e.g.
  // clone the caller first, and inline the cloned caller if it is
  // recursive. As llvm does not inline recursive calls, we will
  // simply ignore it instead of handling it explicitly.
  if (!R->second->isDeclaration() && R->second->getSubprogram() &&
      R->second->hasFnAttribute(Kind: "use-sample-profile" ) &&
      R->second != &F && isLegalToPromote(CB: CI, Callee: R->second, FailureReason: &Reason)) {
    // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
    // in the value profile metadata so the target won't be promoted again.
    SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
        .Value: Function::getGUID(GlobalName: R->second->getName()), .Count: NOMORE_ICP_MAGICNUM}};
    updateIDTMetaData(Inst&: CI, CallTargets: SortedCallTargets, Sum: 0);

    auto *DI = &pgo::promoteIndirectCall(
        CB&: CI, F: R->second, Count: Candidate.CallsiteCount, TotalCount: Sum, AttachProfToDirectCall: false, ORE);
    if (DI) {
      Sum -= Candidate.CallsiteCount;
      // Do not prorate the indirect callsite distribution since the original
      // distribution will be used to scale down non-promoted profile target
      // counts later. By doing this we lose track of the real callsite count
      // for the leftover indirect callsite as a trade off for accurate call
      // target counts.
      // TODO: Ideally we would have two separate factors, one for call site
      // counts and one is used to prorate call target counts.
      // Do not update the promoted direct callsite distribution at this
      // point since the original distribution combined with the callee profile
      // will be used to prorate callsites from the callee if inlined. Once not
      // inlined, the direct callsite distribution should be prorated so that
      // it will reflect the real callsite counts.
      Candidate.CallInstr = DI;
      if (isa<CallInst>(Val: DI) || isa<InvokeInst>(Val: DI)) {
        bool Inlined = tryInlineCandidate(Candidate, InlinedCallSites: InlinedCallSite);
        if (!Inlined) {
          // Prorate the direct callsite distribution so that it reflects real
          // callsite counts.
          setProbeDistributionFactor(
              Inst&: *DI, Factor: static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
        }
        return Inlined;
      }
    }
  } else {
    LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
                      << FunctionSamples::getCanonicalFnName(
                             Candidate.CallInstr->getName())<< " because "
                      << Reason << "\n" );
  }
  return false;
}
1042 | |
1043 | bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) { |
1044 | if (!ProfileSizeInline) |
1045 | return false; |
1046 | |
1047 | Function *Callee = CallInst.getCalledFunction(); |
1048 | if (Callee == nullptr) |
1049 | return false; |
1050 | |
1051 | InlineCost Cost = getInlineCost(Call&: CallInst, Params: getInlineParams(), CalleeTTI&: GetTTI(*Callee), |
1052 | GetAssumptionCache: GetAC, GetTLI); |
1053 | |
1054 | if (Cost.isNever()) |
1055 | return false; |
1056 | |
1057 | if (Cost.isAlways()) |
1058 | return true; |
1059 | |
1060 | return Cost.getCost() <= SampleColdCallSiteThreshold; |
1061 | } |
1062 | |
1063 | void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( |
1064 | const SmallVectorImpl<CallBase *> &Candidates, const Function &F, |
1065 | bool Hot) { |
1066 | for (auto *I : Candidates) { |
1067 | Function *CalledFunction = I->getCalledFunction(); |
1068 | if (CalledFunction) { |
1069 | ORE->emit(OptDiag&: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), |
1070 | "InlineAttempt" , I->getDebugLoc(), |
1071 | I->getParent()) |
1072 | << "previous inlining reattempted for " |
1073 | << (Hot ? "hotness: '" : "size: '" ) |
1074 | << ore::NV("Callee" , CalledFunction) << "' into '" |
1075 | << ore::NV("Caller" , &F) << "'" ); |
1076 | } |
1077 | } |
1078 | } |
1079 | |
/// Collect GUIDs of functions that should be imported (ThinLTO prelink)
/// because the profile indicates they were inlined at \p CB in the
/// profiled binary.
///
/// \param CB Callsite being considered; may be consulted by the replay
/// advisor and used for a direct-GUID fallback.
/// \param Samples Callee profile at \p CB, if any.
/// \param InlinedGUIDs Output set of GUIDs to import.
/// \param Threshold Minimum sample count for importing a callee; dropped to
/// 0 when the replay advisor wants \p CB inlined.
void SampleProfileLoader::findExternalInlineCandidate(
    CallBase *CB, const FunctionSamples *Samples,
    DenseSet<GlobalValue::GUID> &InlinedGUIDs, uint64_t Threshold) {

  // If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external
  // function make sure it's imported
  if (CB && getExternalInlineAdvisorShouldInline(CB&: *CB)) {
    // Samples may not exist for replayed function, if so
    // just add the direct GUID and move on
    if (!Samples) {
      InlinedGUIDs.insert(
          V: Function::getGUID(GlobalName: CB->getCalledFunction()->getName()));
      return;
    }
    // Otherwise, drop the threshold to import everything that we can
    Threshold = 0;
  }

  // In some rare cases, call instruction could be changed after being pushed
  // into inline candidate queue, this is because earlier inlining may expose
  // constant propagation which can change indirect call to direct call. When
  // this happens, we may fail to find matching function samples for the
  // candidate later, even if a match was found when the candidate was enqueued.
  if (!Samples)
    return;

  // For AutoFDO profile, retrieve candidate profiles by walking over
  // the nested inlinee profiles.
  if (!FunctionSamples::ProfileIsCS) {
    Samples->findInlinedFunctions(S&: InlinedGUIDs, SymbolMap, Threshold);
    return;
  }

  // For CSSPGO, do a breadth-first walk over the context trie rooted at
  // this profile's node, importing every hot enough callee context.
  ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(FSamples: Samples);
  std::queue<ContextTrieNode *> CalleeList;
  CalleeList.push(x: Caller);
  while (!CalleeList.empty()) {
    ContextTrieNode *Node = CalleeList.front();
    CalleeList.pop();
    FunctionSamples *CalleeSample = Node->getFunctionSamples();
    // For CSSPGO profile, retrieve candidate profile by walking over the
    // trie built for context profile. Note that also take call targets
    // even if callee doesn't have a corresponding context profile.
    if (!CalleeSample)
      continue;

    // If pre-inliner decision is used, honor that for importing as well.
    bool PreInline =
        UsePreInlinerDecision &&
        CalleeSample->getContext().hasAttribute(A: ContextShouldBeInlined);
    if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
      continue;

    Function *Func = SymbolMap.lookup(Key: CalleeSample->getFunction());
    // Add to the import list only when it's defined out of module.
    if (!Func || Func->isDeclaration())
      InlinedGUIDs.insert(V: CalleeSample->getGUID());

    // Import hot CallTargets, which may not be available in IR because full
    // profile annotation cannot be done until backend compilation in ThinLTO.
    for (const auto &BS : CalleeSample->getBodySamples())
      for (const auto &TS : BS.second.getCallTargets())
        if (TS.second > Threshold) {
          const Function *Callee = SymbolMap.lookup(Key: TS.first);
          if (!Callee || Callee->isDeclaration())
            InlinedGUIDs.insert(V: TS.first.getHashCode());
        }

    // Import hot child context profiles associated with callees. Note that
    // this may have some overlap with the call target loop above, but doing
    // this based on child context profiles again effectively allows us to
    // use the max of entry count and call target count to determine
    // importing.
    for (auto &Child : Node->getAllChildContext()) {
      ContextTrieNode *CalleeNode = &Child.second;
      CalleeList.push(x: CalleeNode);
    }
  }
}
1158 | |
/// Iteratively inline hot callsites of a function.
///
/// Iteratively traverse all callsites of the function \p F, so as to
/// find out callsites with corresponding inline instances.
///
/// For such callsites,
/// - If it is hot enough, inline the callsites and add callsites of the
///   callee into the caller. If the call is an indirect call, first promote
///   it to direct call. Each indirect call is limited to a single target.
///
/// - If a callsite is not inlined, merge its profile into the outline
///   version (if --sample-profile-merge-inlinee is true), or scale the
///   counters of standalone function based on the profile of inlined
///   instances (if --sample-profile-merge-inlinee is false).
///
/// Later passes may consume the updated profiles.
///
/// \param F function to perform iterative inlining.
/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
///     inlined in the profiled binary.
///
/// \returns True if there is any inline happened.
bool SampleProfileLoader::inlineHotFunctions(
    Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
  // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
  // Profile symbol list is ignored when profile-sample-accurate is on.
  assert((!ProfAccForSymsInList ||
          (!ProfileSampleAccurate &&
           !F.hasFnAttribute("profile-sample-accurate" ))) &&
         "ProfAccForSymsInList should be false when profile-sample-accurate "
         "is enabled" );

  // Callsites seen with samples but not (yet) inlined; processed at the end
  // by promoteMergeNotInlinedContextSamples for non-CS profiles.
  MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
  bool Changed = false;
  bool LocalChanged = true;
  // Iterate to a fixed point: each round of inlining can expose new
  // candidate callsites inside the inlined bodies.
  while (LocalChanged) {
    LocalChanged = false;
    SmallVector<CallBase *, 10> CIS;
    for (auto &BB : F) {
      bool Hot = false;
      SmallVector<CallBase *, 10> AllCandidates;
      SmallVector<CallBase *, 10> ColdCandidates;
      for (auto &I : BB) {
        const FunctionSamples *FS = nullptr;
        if (auto *CB = dyn_cast<CallBase>(Val: &I)) {
          if (!isa<IntrinsicInst>(Val: I)) {
            if ((FS = findCalleeFunctionSamples(Inst: *CB))) {
              assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
                     "GUIDToFuncNameMap has to be populated" );
              AllCandidates.push_back(Elt: CB);
              if (FS->getHeadSamplesEstimate() > 0 ||
                  FunctionSamples::ProfileIsCS)
                LocalNotInlinedCallSites.insert(KV: {CB, FS});
              if (callsiteIsHot(CallsiteFS: FS, PSI, ProfAccForSymsInList))
                Hot = true;
              else if (shouldInlineColdCallee(CallInst&: *CB))
                ColdCandidates.push_back(Elt: CB);
            } else if (getExternalInlineAdvisorShouldInline(CB&: *CB)) {
              AllCandidates.push_back(Elt: CB);
            }
          }
        }
      }
      // One hot callsite (or an active replay advisor) makes the whole
      // block's candidates eligible; otherwise only the size-based cold
      // candidates are considered.
      if (Hot || ExternalInlineAdvisor) {
        CIS.insert(I: CIS.begin(), From: AllCandidates.begin(), To: AllCandidates.end());
        emitOptimizationRemarksForInlineCandidates(Candidates: AllCandidates, F, Hot: true);
      } else {
        CIS.insert(I: CIS.begin(), From: ColdCandidates.begin(), To: ColdCandidates.end());
        emitOptimizationRemarksForInlineCandidates(Candidates: ColdCandidates, F, Hot: false);
      }
    }
    for (CallBase *I : CIS) {
      Function *CalledFunction = I->getCalledFunction();
      InlineCandidate Candidate = {.CallInstr: I, .CalleeSamples: LocalNotInlinedCallSites.lookup(Key: I),
                                   .CallsiteCount: 0 /* dummy count */,
                                   .CallsiteDistribution: 1.0 /* dummy distribution factor */};
      // Do not inline recursive calls.
      if (CalledFunction == &F)
        continue;
      if (I->isIndirectCall()) {
        uint64_t Sum;
        for (const auto *FS : findIndirectCallFunctionSamples(Inst: *I, Sum)) {
          uint64_t SumOrigin = Sum;
          // In ThinLTO prelink, only record import candidates instead of
          // promoting/inlining here.
          if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
            findExternalInlineCandidate(CB: I, Samples: FS, InlinedGUIDs,
                                        Threshold: PSI->getOrCompHotCountThreshold());
            continue;
          }
          if (!callsiteIsHot(CallsiteFS: FS, PSI, ProfAccForSymsInList))
            continue;

          Candidate = {.CallInstr: I, .CalleeSamples: FS, .CallsiteCount: FS->getHeadSamplesEstimate(), .CallsiteDistribution: 1.0};
          if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
            LocalNotInlinedCallSites.erase(Key: I);
            LocalChanged = true;
          }
        }
      } else if (CalledFunction && CalledFunction->getSubprogram() &&
                 !CalledFunction->isDeclaration()) {
        if (tryInlineCandidate(Candidate)) {
          LocalNotInlinedCallSites.erase(Key: I);
          LocalChanged = true;
        }
      } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
        findExternalInlineCandidate(CB: I, Samples: findCalleeFunctionSamples(Inst: *I),
                                    InlinedGUIDs,
                                    Threshold: PSI->getOrCompHotCountThreshold());
      }
    }
    Changed |= LocalChanged;
  }

  // For CS profile, profile for not inlined context will be merged when
  // base profile is being retrieved.
  if (!FunctionSamples::ProfileIsCS)
    promoteMergeNotInlinedContextSamples(NonInlinedCallSites: LocalNotInlinedCallSites, F);
  return Changed;
}
1277 | |
/// Inline \p Candidate's callsite if it is both legal and profitable.
///
/// \param Candidate Callsite to inline; its CallsiteDistribution is used to
/// prorate pseudo probes copied into the caller.
/// \param InlinedCallSites If non-null, filled with the call sites newly
/// exposed inside the inlined body.
///
/// \returns true if the callsite was inlined.
bool SampleProfileLoader::tryInlineCandidate(
    InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
  // Do not attempt to inline a candidate if
  // --disable-sample-loader-inlining is true.
  if (DisableSampleLoaderInlining)
    return false;

  CallBase &CB = *Candidate.CallInstr;
  Function *CalledFunction = CB.getCalledFunction();
  assert(CalledFunction && "Expect a callee with definition" );
  // Capture location/block now; CB is erased by InlineFunction below.
  DebugLoc DLoc = CB.getDebugLoc();
  BasicBlock *BB = CB.getParent();

  InlineCost Cost = shouldInlineCandidate(Candidate);
  if (Cost.isNever()) {
    ORE->emit(OptDiag&: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
                                       "InlineFail" , DLoc, BB)
              << "incompatible inlining" );
    return false;
  }

  if (!Cost)
    return false;

  InlineFunctionInfo IFI(GetAC);
  IFI.UpdateProfile = false;
  InlineResult IR = InlineFunction(CB, IFI,
                                   /*MergeAttributes=*/true);
  if (!IR.isSuccess())
    return false;

  // The call to InlineFunction erases I, so we can't pass it here.
  emitInlinedIntoBasedOnCost(ORE&: *ORE, DLoc, Block: BB, Callee: *CalledFunction, Caller: *BB->getParent(),
                             IC: Cost, ForProfileContext: true, PassName: getAnnotatedRemarkPassName());

  // Now populate the list of newly exposed call sites.
  if (InlinedCallSites) {
    InlinedCallSites->clear();
    for (auto &I : IFI.InlinedCallSites)
      InlinedCallSites->push_back(Elt: I);
  }

  if (FunctionSamples::ProfileIsCS)
    ContextTracker->markContextSamplesInlined(InlinedSamples: Candidate.CalleeSamples);
  ++NumCSInlined;

  // Prorate inlined probes for a duplicated inlining callsite which probably
  // has a distribution less than 100%. Samples for an inlinee should be
  // distributed among the copies of the original callsite based on each
  // callsite's distribution factor for counts accuracy. Note that an inlined
  // probe may come with its own distribution factor if it has been duplicated
  // in the inlinee body. The two factors are multiplied to reflect the
  // aggregation of duplication.
  if (Candidate.CallsiteDistribution < 1) {
    for (auto &I : IFI.InlinedCallSites) {
      if (std::optional<PseudoProbe> Probe = extractProbe(Inst: *I))
        setProbeDistributionFactor(Inst&: *I, Factor: Probe->Factor *
                                   Candidate.CallsiteDistribution);
    }
    NumDuplicatedInlinesite++;
  }

  return true;
}
1342 | |
1343 | bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, |
1344 | CallBase *CB) { |
1345 | assert(CB && "Expect non-null call instruction" ); |
1346 | |
1347 | if (isa<IntrinsicInst>(Val: CB)) |
1348 | return false; |
1349 | |
1350 | // Find the callee's profile. For indirect call, find hottest target profile. |
1351 | const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(Inst: *CB); |
1352 | // If ExternalInlineAdvisor wants to inline this site, do so even |
1353 | // if Samples are not present. |
1354 | if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(CB&: *CB)) |
1355 | return false; |
1356 | |
1357 | float Factor = 1.0; |
1358 | if (std::optional<PseudoProbe> Probe = extractProbe(Inst: *CB)) |
1359 | Factor = Probe->Factor; |
1360 | |
1361 | uint64_t CallsiteCount = |
1362 | CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0; |
1363 | *NewCandidate = {.CallInstr: CB, .CalleeSamples: CalleeSamples, .CallsiteCount: CallsiteCount, .CallsiteDistribution: Factor}; |
1364 | return true; |
1365 | } |
1366 | |
1367 | std::optional<InlineCost> |
1368 | SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) { |
1369 | std::unique_ptr<InlineAdvice> Advice = nullptr; |
1370 | if (ExternalInlineAdvisor) { |
1371 | Advice = ExternalInlineAdvisor->getAdvice(CB); |
1372 | if (Advice) { |
1373 | if (!Advice->isInliningRecommended()) { |
1374 | Advice->recordUnattemptedInlining(); |
1375 | return InlineCost::getNever(Reason: "not previously inlined" ); |
1376 | } |
1377 | Advice->recordInlining(); |
1378 | return InlineCost::getAlways(Reason: "previously inlined" ); |
1379 | } |
1380 | } |
1381 | |
1382 | return {}; |
1383 | } |
1384 | |
1385 | bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) { |
1386 | std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB); |
1387 | return Cost ? !!*Cost : false; |
1388 | } |
1389 | |
/// Decide whether \p Candidate should be inlined.
///
/// The returned InlineCost combines the cost computed by the call analyzer
/// with a sample-PGO threshold derived from callsite hotness. Decision
/// sources are consulted in priority order: external replay advisor first,
/// then the call analyzer's hard always/never verdicts, then the llvm-profgen
/// preinliner decision (CSSPGO), and finally the cost-vs-threshold check.
InlineCost
SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
  if (std::optional<InlineCost> ReplayCost =
          getExternalInlineAdvisorCost(CB&: *Candidate.CallInstr))
    return *ReplayCost;
  // Adjust threshold based on call site hotness, only do this for callsite
  // prioritized inliner because otherwise cost-benefit check is done earlier.
  int SampleThreshold = SampleColdCallSiteThreshold;
  if (CallsitePrioritizedInline) {
    if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
      SampleThreshold = SampleHotCallSiteThreshold;
    else if (!ProfileSizeInline)
      return InlineCost::getNever(Reason: "cold callsite" );
  }

  Function *Callee = Candidate.CallInstr->getCalledFunction();
  assert(Callee && "Expect a definition for inline candidate of direct call" );

  InlineParams Params = getInlineParams();
  // We will ignore the threshold from inline cost, so always get full cost.
  Params.ComputeFullInlineCost = true;
  Params.AllowRecursiveCall = AllowRecursiveInline;
  // Checks if there is anything in the reachable portion of the callee at
  // this callsite that makes this inlining potentially illegal. Need to
  // set ComputeFullInlineCost, otherwise getInlineCost may return early
  // when cost exceeds threshold without checking all IRs in the callee.
  // The actual cost does not matter because we only check isNever() to
  // see if it is legal to inline the callsite.
  InlineCost Cost = getInlineCost(Call&: *Candidate.CallInstr, Callee, Params,
                                  CalleeTTI&: GetTTI(*Callee), GetAssumptionCache: GetAC, GetTLI);

  // Honor always inline and never inline from call analyzer
  if (Cost.isNever() || Cost.isAlways())
    return Cost;

  // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
  // decisions based on hotness as well as accurate function byte sizes for
  // given context using function/inlinee sizes from previous build. It
  // stores the decision in profile, and also adjust/merge context profile
  // aiming at better context-sensitive post-inline profile quality, assuming
  // all inline decision estimates are going to be honored by compiler. Here
  // we replay that inline decision under `sample-profile-use-preinliner`.
  // Note that we don't need to handle negative decision from preinliner as
  // context profile for not inlined calls are merged by preinliner already.
  if (UsePreInlinerDecision && Candidate.CalleeSamples) {
    // Once two node are merged due to promotion, we're losing some context
    // so the original context-sensitive preinliner decision should be ignored
    // for SyntheticContext.
    SampleContext &Context = Candidate.CalleeSamples->getContext();
    if (!Context.hasState(S: SyntheticContext) &&
        Context.hasAttribute(A: ContextShouldBeInlined))
      return InlineCost::getAlways(Reason: "preinliner" );
  }

  // For old FDO inliner, we inline the call site as long as cost is not
  // "Never". The cost-benefit check is done earlier.
  if (!CallsitePrioritizedInline) {
    return InlineCost::get(Cost: Cost.getCost(), INT_MAX);
  }

  // Otherwise only use the cost from call analyzer, but overwrite threshold
  // with Sample PGO threshold.
  return InlineCost::get(Cost: Cost.getCost(), Threshold: SampleThreshold);
}
1454 | |
/// Iteratively inline hot call sites of \p F in hotness-priority order.
///
/// Seeds a priority queue with all viable call sites in \p F, then repeatedly
/// takes the hottest candidate, inlining it (after indirect-call promotion
/// for indirect sites) and pushing any newly exposed call sites back into the
/// queue, until the queue drains or the size budget is exhausted. During
/// ThinLTO prelink, hot external targets are recorded in \p InlinedGUIDs for
/// cross-module import instead of being inlined here.
///
/// \param F The caller function being processed.
/// \param InlinedGUIDs Output set of GUIDs of functions to import for late
///        inlining (ThinLTO prelink only).
/// \returns true if any call site was promoted and/or inlined.
bool SampleProfileLoader::inlineHotFunctionsWithPriority(
    Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
  // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
  // Profile symbol list is ignored when profile-sample-accurate is on.
  assert((!ProfAccForSymsInList ||
          (!ProfileSampleAccurate &&
           !F.hasFnAttribute("profile-sample-accurate" ))) &&
         "ProfAccForSymsInList should be false when profile-sample-accurate "
         "is enabled" );

  // Populating worklist with initial call sites from root inliner, along
  // with call site weights.
  CandidateQueue CQueue;
  InlineCandidate NewCandidate;
  for (auto &BB : F) {
    for (auto &I : BB) {
      auto *CB = dyn_cast<CallBase>(Val: &I);
      if (!CB)
        continue;
      if (getInlineCandidate(NewCandidate: &NewCandidate, CB))
        CQueue.push(x: NewCandidate);
    }
  }

  // Cap the size growth from profile guided inlining. This is needed even
  // though cost of each inline candidate already accounts for callee size,
  // because with top-down inlining, we can grow inliner size significantly
  // with large number of smaller inlinees each passing the cost check.
  assert(ProfileInlineLimitMax >= ProfileInlineLimitMin &&
         "Max inline size limit should not be smaller than min inline size "
         "limit." );
  unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
  SizeLimit = std::min(a: SizeLimit, b: (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(a: SizeLimit, b: (unsigned)ProfileInlineLimitMin);
  // When replaying a previous inlining, do not restrict the size at all.
  if (ExternalInlineAdvisor)
    SizeLimit = std::numeric_limits<unsigned>::max();

  MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;

  // Perform iterative BFS call site prioritized inlining
  bool Changed = false;
  while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
    InlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    CallBase *I = Candidate.CallInstr;
    Function *CalledFunction = I->getCalledFunction();

    // Never directly self-inline.
    if (CalledFunction == &F)
      continue;
    if (I->isIndirectCall()) {
      uint64_t Sum = 0;
      auto CalleeSamples = findIndirectCallFunctionSamples(Inst: *I, Sum);
      uint64_t SumOrigin = Sum;
      Sum *= Candidate.CallsiteDistribution;
      unsigned ICPCount = 0;
      for (const auto *FS : CalleeSamples) {
        // TODO: Consider disable pre-LTO ICP for MonoLTO as well
        if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
          findExternalInlineCandidate(CB: I, Samples: FS, InlinedGUIDs,
                                      Threshold: PSI->getOrCompHotCountThreshold());
          continue;
        }
        uint64_t EntryCountDistributed =
            FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
        // In addition to regular inline cost check, we also need to make sure
        // ICP isn't introducing excessive speculative checks even if individual
        // target looks beneficial to promote and inline. That means we should
        // only do ICP when there's a small number dominant targets.
        if (ICPCount >= ProfileICPRelativeHotnessSkip &&
            EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)
          break;
        // TODO: Fix CallAnalyzer to handle all indirect calls.
        // For indirect call, we don't run CallAnalyzer to get InlineCost
        // before actual inlining. This is because we could see two different
        // types from the same definition, which makes CallAnalyzer choke as
        // it's expecting matching parameter type on both caller and callee
        // side. See example from PR18962 for the triggering cases (the bug was
        // fixed, but we generate different types).
        if (!PSI->isHotCount(C: EntryCountDistributed))
          break;
        SmallVector<CallBase *, 8> InlinedCallSites;
        // Attach function profile for promoted indirect callee, and update
        // call site count for the promoted inline candidate too.
        Candidate = {.CallInstr: I, .CalleeSamples: FS, .CallsiteCount: EntryCountDistributed,
                     .CallsiteDistribution: Candidate.CallsiteDistribution};
        if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
                                         InlinedCallSite: &InlinedCallSites)) {
          for (auto *CB : InlinedCallSites) {
            if (getInlineCandidate(NewCandidate: &NewCandidate, CB))
              CQueue.emplace(args&: NewCandidate);
          }
          ICPCount++;
          Changed = true;
        } else if (!ContextTracker) {
          LocalNotInlinedCallSites.insert(KV: {I, FS});
        }
      }
    } else if (CalledFunction && CalledFunction->getSubprogram() &&
               !CalledFunction->isDeclaration()) {
      SmallVector<CallBase *, 8> InlinedCallSites;
      if (tryInlineCandidate(Candidate, InlinedCallSites: &InlinedCallSites)) {
        for (auto *CB : InlinedCallSites) {
          if (getInlineCandidate(NewCandidate: &NewCandidate, CB))
            CQueue.emplace(args&: NewCandidate);
        }
        Changed = true;
      } else if (!ContextTracker) {
        LocalNotInlinedCallSites.insert(KV: {I, Candidate.CalleeSamples});
      }
    } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
      findExternalInlineCandidate(CB: I, Samples: findCalleeFunctionSamples(Inst: *I),
                                  InlinedGUIDs,
                                  Threshold: PSI->getOrCompHotCountThreshold());
    }
  }

  // Record which of the three size limits stopped the inlining, if any.
  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++NumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++NumCSInlinedHitMinLimit;
    else
      ++NumCSInlinedHitGrowthLimit;
  }

  // For CS profile, profile for not inlined context will be merged when
  // base profile is being retrieved.
  if (!FunctionSamples::ProfileIsCS)
    promoteMergeNotInlinedContextSamples(NonInlinedCallSites: LocalNotInlinedCallSites, F);
  return Changed;
}
1586 | |
/// Account for the profiles of call sites in \p F that were not inlined.
///
/// For each not-inlined call site with a known defined callee, emits a
/// "NotInline" remark and then either merges the nested inlinee profile into
/// the callee's outline profile (under ProfileMergeInlinee) or accumulates
/// the estimated entry count into notInlinedCallInfo for later entry-count
/// fixup of the outline copy.
void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
    MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites,
    const Function &F) {
  // Accumulate not inlined callsite information into notInlinedSamples
  for (const auto &Pair : NonInlinedCallSites) {
    CallBase *I = Pair.first;
    Function *Callee = I->getCalledFunction();
    if (!Callee || Callee->isDeclaration())
      continue;

    ORE->emit(
        OptDiag&: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline" ,
                                  I->getDebugLoc(), I->getParent())
        << "previous inlining not repeated: '" << ore::NV("Callee" , Callee)
        << "' into '" << ore::NV("Caller" , &F) << "'" );

    ++NumCSNotInlined;
    const FunctionSamples *FS = Pair.second;
    // Nothing to merge or accumulate for an empty profile.
    if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
      continue;
    }

    // Do not merge a context that is already duplicated into the base profile.
    if (FS->getContext().hasAttribute(A: sampleprof::ContextDuplicatedIntoBase))
      continue;

    if (ProfileMergeInlinee) {
      // A function call can be replicated by optimizations like callsite
      // splitting or jump threading and the replicates end up sharing the
      // sample nested callee profile instead of slicing the original
      // inlinee's profile. We want to do merge exactly once by filtering out
      // callee profiles with a non-zero head sample count.
      if (FS->getHeadSamples() == 0) {
        // Use entry samples as head samples during the merge, as inlinees
        // don't have head samples.
        const_cast<FunctionSamples *>(FS)->addHeadSamples(
            Num: FS->getHeadSamplesEstimate());

        // Note that we have to do the merge right after processing function.
        // This allows OutlineFS's profile to be used for annotation during
        // top-down processing of functions' annotation.
        FunctionSamples *OutlineFS = Reader->getSamplesFor(F: *Callee);
        // If outlined function does not exist in the profile, add it to a
        // separate map so that it does not rehash the original profile.
        if (!OutlineFS)
          OutlineFS = &OutlineFunctionSamples[
              FunctionId(FunctionSamples::getCanonicalFnName(FnName: Callee->getName()))];
        OutlineFS->merge(Other: *FS, Weight: 1);
        // Set outlined profile to be synthetic to not bias the inliner.
        OutlineFS->SetContextSynthetic();
      }
    } else {
      auto pair =
          notInlinedCallInfo.try_emplace(Key: Callee, Args: NotInlinedProfileInfo{.entryCount: 0});
      pair.first->second.entryCount += FS->getHeadSamplesEstimate();
    }
  }
}
1645 | |
1646 | /// Returns the sorted CallTargetMap \p M by count in descending order. |
1647 | static SmallVector<InstrProfValueData, 2> |
1648 | GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) { |
1649 | SmallVector<InstrProfValueData, 2> R; |
1650 | for (const auto &I : SampleRecord::SortCallTargets(Targets: M)) { |
1651 | R.emplace_back( |
1652 | Args: InstrProfValueData{.Value: I.first.getHashCode(), .Count: I.second}); |
1653 | } |
1654 | return R; |
1655 | } |
1656 | |
/// Generate MD_prof metadata for every branch instruction in \p F using the
/// edge weights computed during propagation, and indirect-call value-profile
/// metadata for indirect call sites from the profile's call target maps.
void SampleProfileLoader::generateMDProfMetadata(Function &F) {
  // Generate MD_prof metadata for every branch instruction using the
  // edge weights computed during propagation.
  LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n" );
  LLVMContext &Ctx = F.getContext();
  MDBuilder MDB(Ctx);
  for (auto &BI : F) {
    BasicBlock *BB = &BI;

    if (BlockWeights[BB]) {
      // Hot block: annotate calls/invokes it contains.
      for (auto &I : *BB) {
        if (!isa<CallInst>(Val: I) && !isa<InvokeInst>(Val: I))
          continue;
        if (!cast<CallBase>(Val&: I).getCalledFunction()) {
          // Indirect call: attach value-profile (target, count) metadata.
          const DebugLoc &DLoc = I.getDebugLoc();
          if (!DLoc)
            continue;
          const DILocation *DIL = DLoc;
          const FunctionSamples *FS = findFunctionSamples(Inst: I);
          if (!FS)
            continue;
          auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
          ErrorOr<SampleRecord::CallTargetMap> T =
              FS->findCallTargetMapAt(CallSite);
          if (!T || T.get().empty())
            continue;
          if (FunctionSamples::ProfileIsProbeBased) {
            // Prorate the callsite counts based on the pre-ICP distribution
            // factor to reflect what is already done to the callsite before
            // ICP, such as callsite cloning.
            if (std::optional<PseudoProbe> Probe = extractProbe(Inst: I)) {
              if (Probe->Factor < 1)
                T = SampleRecord::adjustCallTargets(Targets: T.get(), DistributionFactor: Probe->Factor);
            }
          }
          SmallVector<InstrProfValueData, 2> SortedCallTargets =
              GetSortedValueDataFromCallTargets(M: T.get());
          uint64_t Sum = 0;
          for (const auto &C : T.get())
            Sum += C.second;
          // With CSSPGO all indirect call targets are counted towards the
          // original indirect call site in the profile, including both
          // inlined and non-inlined targets.
          if (!FunctionSamples::ProfileIsCS) {
            if (const FunctionSamplesMap *M =
                    FS->findFunctionSamplesMapAt(Loc: CallSite)) {
              for (const auto &NameFS : *M)
                Sum += NameFS.second.getHeadSamplesEstimate();
            }
          }
          if (Sum)
            updateIDTMetaData(Inst&: I, CallTargets: SortedCallTargets, Sum);
          else if (OverwriteExistingWeights)
            I.setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr);
        } else if (!isa<IntrinsicInst>(Val: &I)) {
          // Direct call: the block weight is the call's weight.
          setBranchWeights(I, Weights: {static_cast<uint32_t>(BlockWeights[BB])});
        }
      }
    } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) {
      // Set profile metadata (possibly annotated by LTO prelink) to zero or
      // clear it for cold code.
      for (auto &I : *BB) {
        if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
          if (cast<CallBase>(Val&: I).isIndirectCall()) {
            I.setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr);
          } else {
            setBranchWeights(I, Weights: {uint32_t(0)});
          }
        }
      }
    }

    // Annotate the block's terminator with branch weights when it is a
    // multi-way branch of a supported kind.
    Instruction *TI = BB->getTerminator();
    if (TI->getNumSuccessors() == 1)
      continue;
    if (!isa<BranchInst>(Val: TI) && !isa<SwitchInst>(Val: TI) &&
        !isa<IndirectBrInst>(Val: TI))
      continue;

    DebugLoc BranchLoc = TI->getDebugLoc();
    LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
                      << ((BranchLoc) ? Twine(BranchLoc.getLine())
                                      : Twine("<UNKNOWN LOCATION>" ))
                      << ".\n" );
    SmallVector<uint32_t, 4> Weights;
    uint32_t MaxWeight = 0;
    // Set by the loop below for every non-zero edge weight; read only when
    // MaxWeight > 0, so it is always initialized before use.
    Instruction *MaxDestInst;
    // Since profi treats multiple edges (multiway branches) as a single edge,
    // we need to distribute the computed weight among the branches. We do
    // this by evenly splitting the edge weight among destinations.
    DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity;
    std::vector<uint64_t> EdgeIndex;
    if (SampleProfileUseProfi) {
      EdgeIndex.resize(new_size: TI->getNumSuccessors());
      for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
        const BasicBlock *Succ = TI->getSuccessor(Idx: I);
        EdgeIndex[I] = EdgeMultiplicity[Succ];
        EdgeMultiplicity[Succ]++;
      }
    }
    for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
      BasicBlock *Succ = TI->getSuccessor(Idx: I);
      Edge E = std::make_pair(x&: BB, y&: Succ);
      uint64_t Weight = EdgeWeights[E];
      LLVM_DEBUG(dbgs() << "\t" ; printEdgeWeight(dbgs(), E));
      // Use uint32_t saturated arithmetic to adjust the incoming weights,
      // if needed. Sample counts in profiles are 64-bit unsigned values,
      // but internally branch weights are expressed as 32-bit values.
      if (Weight > std::numeric_limits<uint32_t>::max()) {
        LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)" );
        Weight = std::numeric_limits<uint32_t>::max();
      }
      if (!SampleProfileUseProfi) {
        // Weight is added by one to avoid propagation errors introduced by
        // 0 weights.
        Weights.push_back(Elt: static_cast<uint32_t>(Weight + 1));
      } else {
        // Profi creates proper weights that do not require "+1" adjustments but
        // we evenly split the weight among branches with the same destination.
        uint64_t W = Weight / EdgeMultiplicity[Succ];
        // Rounding up, if needed, so that first branches are hotter.
        if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
          W++;
        Weights.push_back(Elt: static_cast<uint32_t>(W));
      }
      if (Weight != 0) {
        if (Weight > MaxWeight) {
          MaxWeight = Weight;
          MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
        }
      }
    }

    misexpect::checkExpectAnnotations(I&: *TI, ExistingWeights: Weights, /*IsFrontend=*/false);

    uint64_t TempWeight;
    // Only set weights if there is at least one non-zero weight.
    // In any other case, let the analyzer set weights.
    // Do not set weights if the weights are present unless under
    // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
    // twice. If the first annotation already set the weights, the second pass
    // does not need to set it. With OverwriteExistingWeights, Blocks with zero
    // weight should have their existing metadata (possibly annotated by LTO
    // prelink) cleared.
    if (MaxWeight > 0 &&
        (!TI->extractProfTotalWeight(TotalVal&: TempWeight) || OverwriteExistingWeights)) {
      LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n" );
      setBranchWeights(I&: *TI, Weights);
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemark(DEBUG_TYPE, "PopularDest" , MaxDestInst)
               << "most popular destination for conditional branches at "
               << ore::NV("CondBranchesLoc" , BranchLoc);
      });
    } else {
      if (OverwriteExistingWeights) {
        TI->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr);
        LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n" );
      } else {
        LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n" );
      }
    }
  }
}
1822 | |
1823 | /// Once all the branch weights are computed, we emit the MD_prof |
1824 | /// metadata on BB using the computed values for each of its branches. |
1825 | /// |
1826 | /// \param F The function to query. |
1827 | /// |
1828 | /// \returns true if \p F was modified. Returns false, otherwise. |
1829 | bool SampleProfileLoader::emitAnnotations(Function &F) { |
1830 | bool Changed = false; |
1831 | |
1832 | if (FunctionSamples::ProfileIsProbeBased) { |
1833 | if (!ProbeManager->profileIsValid(F, Samples: *Samples)) { |
1834 | LLVM_DEBUG( |
1835 | dbgs() << "Profile is invalid due to CFG mismatch for Function " |
1836 | << F.getName() << "\n" ); |
1837 | ++NumMismatchedProfile; |
1838 | if (!SalvageStaleProfile) |
1839 | return false; |
1840 | } |
1841 | ++NumMatchedProfile; |
1842 | } else { |
1843 | if (getFunctionLoc(F) == 0) |
1844 | return false; |
1845 | |
1846 | LLVM_DEBUG(dbgs() << "Line number for the first instruction in " |
1847 | << F.getName() << ": " << getFunctionLoc(F) << "\n" ); |
1848 | } |
1849 | |
1850 | DenseSet<GlobalValue::GUID> InlinedGUIDs; |
1851 | if (CallsitePrioritizedInline) |
1852 | Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs); |
1853 | else |
1854 | Changed |= inlineHotFunctions(F, InlinedGUIDs); |
1855 | |
1856 | Changed |= computeAndPropagateWeights(F, InlinedGUIDs); |
1857 | |
1858 | if (Changed) |
1859 | generateMDProfMetadata(F); |
1860 | |
1861 | emitCoverageRemarks(F); |
1862 | return Changed; |
1863 | } |
1864 | |
1865 | std::unique_ptr<ProfiledCallGraph> |
1866 | SampleProfileLoader::buildProfiledCallGraph(Module &M) { |
1867 | std::unique_ptr<ProfiledCallGraph> ProfiledCG; |
1868 | if (FunctionSamples::ProfileIsCS) |
1869 | ProfiledCG = std::make_unique<ProfiledCallGraph>(args&: *ContextTracker); |
1870 | else |
1871 | ProfiledCG = std::make_unique<ProfiledCallGraph>(args&: Reader->getProfiles()); |
1872 | |
1873 | // Add all functions into the profiled call graph even if they are not in |
1874 | // the profile. This makes sure functions missing from the profile still |
1875 | // gets a chance to be processed. |
1876 | for (Function &F : M) { |
1877 | if (F.isDeclaration() || !F.hasFnAttribute(Kind: "use-sample-profile" )) |
1878 | continue; |
1879 | ProfiledCG->addProfiledFunction( |
1880 | Name: getRepInFormat(Name: FunctionSamples::getCanonicalFnName(F))); |
1881 | } |
1882 | |
1883 | return ProfiledCG; |
1884 | } |
1885 | |
/// Compute the order in which functions of \p M are processed for profile
/// annotation and top-down inlining.
///
/// Without -sample-profile-top-down-load, returns module order. Otherwise
/// returns a top-down (reverse post-order) traversal of either the profiled
/// call graph (CS profile or -use-profiled-call-graph) or the static
/// LazyCallGraph \p CG. Only definitions carrying the "use-sample-profile"
/// attribute are included.
std::vector<Function *>
SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) {
  std::vector<Function *> FunctionOrderList;
  FunctionOrderList.reserve(n: M.size());

  if (!ProfileTopDownLoad && UseProfiledCallGraph)
    errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
              "together with -sample-profile-top-down-load.\n" ;

  if (!ProfileTopDownLoad) {
    if (ProfileMergeInlinee) {
      // Disable ProfileMergeInlinee if profile is not loaded in top down order,
      // because the profile for a function may be used for the profile
      // annotation of its outline copy before the profile merging of its
      // non-inlined inline instances, and that is not the way how
      // ProfileMergeInlinee is supposed to work.
      ProfileMergeInlinee = false;
    }

    for (Function &F : M)
      if (!F.isDeclaration() && F.hasFnAttribute(Kind: "use-sample-profile" ))
        FunctionOrderList.push_back(x: &F);
    return FunctionOrderList;
  }

  if (UseProfiledCallGraph || (FunctionSamples::ProfileIsCS &&
                               !UseProfiledCallGraph.getNumOccurrences())) {
    // Use profiled call edges to augment the top-down order. There are cases
    // that the top-down order computed based on the static call graph doesn't
    // reflect real execution order. For example
    //
    // 1. Incomplete static call graph due to unknown indirect call targets.
    //    Adjusting the order by considering indirect call edges from the
    //    profile can enable the inlining of indirect call targets by allowing
    //    the caller processed before them.
    // 2. Mutual call edges in an SCC. The static processing order computed for
    //    an SCC may not reflect the call contexts in the context-sensitive
    //    profile, thus may cause potential inlining to be overlooked. The
    //    function order in one SCC is being adjusted to a top-down order based
    //    on the profile to favor more inlining. This is only a problem with CS
    //    profile.
    // 3. Transitive indirect call edges due to inlining. When a callee function
    //    (say B) is inlined into a caller function (say A) in LTO prelink,
    //    every call edge originated from the callee B will be transferred to
    //    the caller A. If any transferred edge (say A->C) is indirect, the
    //    original profiled indirect edge B->C, even if considered, would not
    //    enforce a top-down order from the caller A to the potential indirect
    //    call target C in LTO postlink since the inlined callee B is gone from
    //    the static call graph.
    // 4. #3 can happen even for direct call targets, due to functions defined
    //    in header files. A header function (say A), when included into source
    //    files, is defined multiple times but only one definition survives due
    //    to ODR. Therefore, the LTO prelink inlining done on those dropped
    //    definitions can be useless based on a local file scope. More
    //    importantly, the inlinee (say B), once fully inlined to a
    //    to-be-dropped A, will have no profile to consume when its outlined
    //    version is compiled. This can lead to a profile-less prelink
    //    compilation for the outlined version of B which may be called from
    //    external modules. While this isn't easy to fix, we rely on the
    //    postlink AutoFDO pipeline to optimize B. Since the survived copy of
    //    the A can be inlined in its local scope in prelink, it may not exist
    //    in the merged IR in postlink, and we'll need the profiled call edges
    //    to enforce a top-down order for the rest of the functions.
    //
    // Considering those cases, a profiled call graph completely independent of
    // the static call graph is constructed based on profile data, where
    // function objects are not even needed to handle case #3 and case 4.
    //
    // Note that static callgraph edges are completely ignored since they
    // can be conflicting with profiled edges for cyclic SCCs and may result in
    // an SCC order incompatible with profile-defined one. Using strictly
    // profile order ensures a maximum inlining experience. On the other hand,
    // static call edges are not so important when they don't correspond to a
    // context in the profile.

    std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M);
    scc_iterator<ProfiledCallGraph *> CGI = scc_begin(G: ProfiledCG.get());
    while (!CGI.isAtEnd()) {
      auto Range = *CGI;
      if (SortProfiledSCC) {
        // Sort nodes in one SCC based on callsite hotness.
        scc_member_iterator<ProfiledCallGraph *> SI(*CGI);
        Range = *SI;
      }
      for (auto *Node : Range) {
        Function *F = SymbolMap.lookup(Key: Node->Name);
        if (F && !F->isDeclaration() && F->hasFnAttribute(Kind: "use-sample-profile" ))
          FunctionOrderList.push_back(x: F);
      }
      ++CGI;
    }
  } else {
    // Fall back to a post-order walk of the static call graph.
    CG.buildRefSCCs();
    for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
      for (LazyCallGraph::SCC &C : RC) {
        for (LazyCallGraph::Node &N : C) {
          Function &F = N.getFunction();
          if (!F.isDeclaration() && F.hasFnAttribute(Kind: "use-sample-profile" ))
            FunctionOrderList.push_back(x: &F);
        }
      }
    }
  }

  // SCC iteration yields callees first; reverse to get the top-down order.
  std::reverse(first: FunctionOrderList.begin(), last: FunctionOrderList.end());

  LLVM_DEBUG({
    dbgs() << "Function processing order:\n" ;
    for (auto F : FunctionOrderList) {
      dbgs() << F->getName() << "\n" ;
    }
  });

  return FunctionOrderList;
}
2001 | |
2002 | bool SampleProfileLoader::doInitialization(Module &M, |
2003 | FunctionAnalysisManager *FAM) { |
2004 | auto &Ctx = M.getContext(); |
2005 | |
2006 | auto ReaderOrErr = SampleProfileReader::create( |
2007 | Filename, C&: Ctx, FS&: *FS, P: FSDiscriminatorPass::Base, RemapFilename: RemappingFilename); |
2008 | if (std::error_code EC = ReaderOrErr.getError()) { |
2009 | std::string Msg = "Could not open profile: " + EC.message(); |
2010 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(Filename, Msg)); |
2011 | return false; |
2012 | } |
2013 | Reader = std::move(ReaderOrErr.get()); |
2014 | Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink); |
2015 | // set module before reading the profile so reader may be able to only |
2016 | // read the function profiles which are used by the current module. |
2017 | Reader->setModule(&M); |
2018 | if (std::error_code EC = Reader->read()) { |
2019 | std::string Msg = "profile reading failed: " + EC.message(); |
2020 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(Filename, Msg)); |
2021 | return false; |
2022 | } |
2023 | |
2024 | PSL = Reader->getProfileSymbolList(); |
2025 | |
2026 | // While profile-sample-accurate is on, ignore symbol list. |
2027 | ProfAccForSymsInList = |
2028 | ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate; |
2029 | if (ProfAccForSymsInList) { |
2030 | NamesInProfile.clear(); |
2031 | GUIDsInProfile.clear(); |
2032 | if (auto NameTable = Reader->getNameTable()) { |
2033 | if (FunctionSamples::UseMD5) { |
2034 | for (auto Name : *NameTable) |
2035 | GUIDsInProfile.insert(V: Name.getHashCode()); |
2036 | } else { |
2037 | for (auto Name : *NameTable) |
2038 | NamesInProfile.insert(key: Name.stringRef()); |
2039 | } |
2040 | } |
2041 | CoverageTracker.setProfAccForSymsInList(true); |
2042 | } |
2043 | |
2044 | if (FAM && !ProfileInlineReplayFile.empty()) { |
2045 | ExternalInlineAdvisor = getReplayInlineAdvisor( |
2046 | M, FAM&: *FAM, Context&: Ctx, /*OriginalAdvisor=*/nullptr, |
2047 | ReplaySettings: ReplayInlinerSettings{.ReplayFile: ProfileInlineReplayFile, |
2048 | .ReplayScope: ProfileInlineReplayScope, |
2049 | .ReplayFallback: ProfileInlineReplayFallback, |
2050 | .ReplayFormat: {.OutputFormat: ProfileInlineReplayFormat}}, |
2051 | /*EmitRemarks=*/false, IC: InlineContext{.LTOPhase: LTOPhase, .Pass: InlinePass::ReplaySampleProfileInliner}); |
2052 | } |
2053 | |
2054 | // Apply tweaks if context-sensitive or probe-based profile is available. |
2055 | if (Reader->profileIsCS() || Reader->profileIsPreInlined() || |
2056 | Reader->profileIsProbeBased()) { |
2057 | if (!UseIterativeBFIInference.getNumOccurrences()) |
2058 | UseIterativeBFIInference = true; |
2059 | if (!SampleProfileUseProfi.getNumOccurrences()) |
2060 | SampleProfileUseProfi = true; |
2061 | if (!EnableExtTspBlockPlacement.getNumOccurrences()) |
2062 | EnableExtTspBlockPlacement = true; |
2063 | // Enable priority-base inliner and size inline by default for CSSPGO. |
2064 | if (!ProfileSizeInline.getNumOccurrences()) |
2065 | ProfileSizeInline = true; |
2066 | if (!CallsitePrioritizedInline.getNumOccurrences()) |
2067 | CallsitePrioritizedInline = true; |
2068 | // For CSSPGO, we also allow recursive inline to best use context profile. |
2069 | if (!AllowRecursiveInline.getNumOccurrences()) |
2070 | AllowRecursiveInline = true; |
2071 | |
2072 | if (Reader->profileIsPreInlined()) { |
2073 | if (!UsePreInlinerDecision.getNumOccurrences()) |
2074 | UsePreInlinerDecision = true; |
2075 | } |
2076 | |
2077 | // Enable stale profile matching by default for probe-based profile. |
2078 | // Currently the matching relies on if the checksum mismatch is detected, |
2079 | // which is currently only available for pseudo-probe mode. Removing the |
2080 | // checksum check could cause regressions for some cases, so further tuning |
2081 | // might be needed if we want to enable it for all cases. |
2082 | if (Reader->profileIsProbeBased() && |
2083 | !SalvageStaleProfile.getNumOccurrences()) { |
2084 | SalvageStaleProfile = true; |
2085 | } |
2086 | |
2087 | if (!Reader->profileIsCS()) { |
2088 | // Non-CS profile should be fine without a function size budget for the |
2089 | // inliner since the contexts in the profile are either all from inlining |
2090 | // in the prevoius build or pre-computed by the preinliner with a size |
2091 | // cap, thus they are bounded. |
2092 | if (!ProfileInlineLimitMin.getNumOccurrences()) |
2093 | ProfileInlineLimitMin = std::numeric_limits<unsigned>::max(); |
2094 | if (!ProfileInlineLimitMax.getNumOccurrences()) |
2095 | ProfileInlineLimitMax = std::numeric_limits<unsigned>::max(); |
2096 | } |
2097 | } |
2098 | |
2099 | if (Reader->profileIsCS()) { |
2100 | // Tracker for profiles under different context |
2101 | ContextTracker = std::make_unique<SampleContextTracker>( |
2102 | args&: Reader->getProfiles(), args: &GUIDToFuncNameMap); |
2103 | } |
2104 | |
2105 | // Load pseudo probe descriptors for probe-based function samples. |
2106 | if (Reader->profileIsProbeBased()) { |
2107 | ProbeManager = std::make_unique<PseudoProbeManager>(args&: M); |
2108 | if (!ProbeManager->moduleIsProbed(M)) { |
2109 | const char *Msg = |
2110 | "Pseudo-probe-based profile requires SampleProfileProbePass" ; |
2111 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg, |
2112 | DS_Warning)); |
2113 | return false; |
2114 | } |
2115 | } |
2116 | |
2117 | if (ReportProfileStaleness || PersistProfileStaleness || |
2118 | SalvageStaleProfile) { |
2119 | MatchingManager = |
2120 | std::make_unique<SampleProfileMatcher>(args&: M, args&: *Reader, args: ProbeManager.get()); |
2121 | } |
2122 | |
2123 | return true; |
2124 | } |
2125 | |
// Extract profile-matching anchors from the IR of \p F into \p IRAnchors.
// Each anchor maps an IR location to a callee name: the canonical name of the
// direct callee for call instructions, the dummy UnknownIndirectCallee name
// for indirect calls, and an empty StringRef for non-call pseudo-probe
// locations.
void SampleProfileMatcher::findIRAnchors(
    const Function &F, std::map<LineLocation, StringRef> &IRAnchors) {
  // For inlined code, recover the original callsite and callee by finding the
  // top-level inline frame. e.g. For frame stack "main:1 @ foo:2 @ bar:3", the
  // top-level frame is "main:1", the callsite is "1" and the callee is "foo".
  auto FindTopLevelInlinedCallsite = [](const DILocation *DIL) {
    assert((DIL && DIL->getInlinedAt()) && "No inlined callsite" );
    // Walk up the inlined-at chain; PrevDIL ends up one frame below the
    // top-level one, i.e. the outermost inlined callee.
    const DILocation *PrevDIL = nullptr;
    do {
      PrevDIL = DIL;
      DIL = DIL->getInlinedAt();
    } while (DIL->getInlinedAt());

    LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(DIL);
    StringRef CalleeName = PrevDIL->getSubprogramLinkageName();
    return std::make_pair(x&: Callsite, y&: CalleeName);
  };

  // Returns the canonical name of CB's direct callee, or the dummy
  // UnknownIndirectCallee name when there is no known callee (indirect call).
  auto GetCanonicalCalleeName = [](const CallBase *CB) {
    StringRef CalleeName = UnknownIndirectCallee;
    if (Function *Callee = CB->getCalledFunction())
      CalleeName = FunctionSamples::getCanonicalFnName(FnName: Callee->getName());
    return CalleeName;
  };

  // Extract profile matching anchors in the IR.
  for (auto &BB : F) {
    for (auto &I : BB) {
      DILocation *DIL = I.getDebugLoc();
      // Instructions without debug locations cannot be matched to a profile
      // location; skip them.
      if (!DIL)
        continue;

      if (FunctionSamples::ProfileIsProbeBased) {
        if (auto Probe = extractProbe(Inst: I)) {
          // Flatten inlined IR for the matching.
          if (DIL->getInlinedAt()) {
            IRAnchors.emplace(args: FindTopLevelInlinedCallsite(DIL));
          } else {
            // Use empty StringRef for basic block probe.
            StringRef CalleeName;
            if (const auto *CB = dyn_cast<CallBase>(Val: &I)) {
              // Skip the probe inst whose callee name is "llvm.pseudoprobe".
              if (!isa<IntrinsicInst>(Val: &I))
                CalleeName = GetCanonicalCalleeName(CB);
            }
            IRAnchors.emplace(args: LineLocation(Probe->Id, 0), args&: CalleeName);
          }
        }
      } else {
        // TODO: For line-number based profile(AutoFDO), currently only support
        // find callsite anchors. In future, we need to parse all the non-call
        // instructions to extract the line locations for profile matching.
        if (!isa<CallBase>(Val: &I) || isa<IntrinsicInst>(Val: &I))
          continue;

        if (DIL->getInlinedAt()) {
          IRAnchors.emplace(args: FindTopLevelInlinedCallsite(DIL));
        } else {
          LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(DIL);
          StringRef CalleeName = GetCanonicalCalleeName(dyn_cast<CallBase>(Val: &I));
          IRAnchors.emplace(args&: Callsite, args&: CalleeName);
        }
      }
    }
  }
}
2192 | |
2193 | void SampleProfileMatcher::countMismatchedSamples(const FunctionSamples &FS) { |
2194 | const auto *FuncDesc = ProbeManager->getDesc(GUID: FS.getGUID()); |
2195 | // Skip the function that is external or renamed. |
2196 | if (!FuncDesc) |
2197 | return; |
2198 | |
2199 | if (ProbeManager->profileIsHashMismatched(FuncDesc: *FuncDesc, Samples: FS)) { |
2200 | MismatchedFuncHashSamples += FS.getTotalSamples(); |
2201 | return; |
2202 | } |
2203 | for (const auto &I : FS.getCallsiteSamples()) |
2204 | for (const auto &CS : I.second) |
2205 | countMismatchedSamples(FS: CS.second); |
2206 | } |
2207 | |
// Collect staleness statistics for function \p F against its profile \p FS:
// function-level checksum mismatches (probe-based profiles only) and callsite
// location mismatches. Results are accumulated into the matcher's counters
// for later reporting/persisting.
void SampleProfileMatcher::countProfileMismatches(
    const Function &F, const FunctionSamples &FS,
    const std::map<LineLocation, StringRef> &IRAnchors,
    const std::map<LineLocation, std::unordered_set<FunctionId>>
        &ProfileAnchors) {
  [[maybe_unused]] bool IsFuncHashMismatch = false;
  if (FunctionSamples::ProfileIsProbeBased) {
    TotalFuncHashSamples += FS.getTotalSamples();
    TotalProfiledFunc++;
    const auto *FuncDesc = ProbeManager->getDesc(F);
    // A missing descriptor means the function is external or renamed; the
    // checksum cannot be checked in that case.
    if (FuncDesc) {
      if (ProbeManager->profileIsHashMismatched(FuncDesc: *FuncDesc, Samples: FS)) {
        NumMismatchedFuncHash++;
        IsFuncHashMismatch = true;
      }
      // Count the mismatched samples in this profile's whole subtree.
      countMismatchedSamples(FS);
    }
  }

  uint64_t FuncMismatchedCallsites = 0;
  uint64_t FuncProfiledCallsites = 0;
  countProfileCallsiteMismatches(FS, IRAnchors, ProfileAnchors,
                                 FuncMismatchedCallsites,
                                 FuncProfiledCallsites);
  TotalProfiledCallsites += FuncProfiledCallsites;
  NumMismatchedCallsites += FuncMismatchedCallsites;
  LLVM_DEBUG({
    if (FunctionSamples::ProfileIsProbeBased && !IsFuncHashMismatch &&
        FuncMismatchedCallsites)
      dbgs() << "Function checksum is matched but there are "
             << FuncMismatchedCallsites << "/" << FuncProfiledCallsites
             << " mismatched callsites.\n" ;
  });
}
2242 | |
// For each callsite anchor in the profile, decide whether it still matches a
// callsite in the IR; samples of unmatched callsites are counted as
// discarded. Per-function counts are returned through
// \p FuncMismatchedCallsites and \p FuncProfiledCallsites, and the
// module-wide sample counters are updated along the way.
void SampleProfileMatcher::countProfileCallsiteMismatches(
    const FunctionSamples &FS,
    const std::map<LineLocation, StringRef> &IRAnchors,
    const std::map<LineLocation, std::unordered_set<FunctionId>>
        &ProfileAnchors,
    uint64_t &FuncMismatchedCallsites, uint64_t &FuncProfiledCallsites) {

  // Check if there are any callsites in the profile that does not match to any
  // IR callsites, those callsite samples will be discarded.
  for (const auto &I : ProfileAnchors) {
    const auto &Loc = I.first;
    const auto &Callees = I.second;
    assert(!Callees.empty() && "Callees should not be empty" );

    // Callee name at the same location in the IR, empty if none.
    StringRef IRCalleeName;
    const auto &IR = IRAnchors.find(x: Loc);
    if (IR != IRAnchors.end())
      IRCalleeName = IR->second;

    // Compute number of samples in the original profile.
    uint64_t CallsiteSamples = 0;
    // Body samples attributed to call targets at this location.
    if (auto CTM = FS.findCallTargetMapAt(CallSite: Loc)) {
      for (const auto &I : *CTM)
        CallsiteSamples += I.second;
    }
    // Plus the total samples of any inlinee profiles at this location.
    const auto *FSMap = FS.findFunctionSamplesMapAt(Loc);
    if (FSMap) {
      for (const auto &I : *FSMap)
        CallsiteSamples += I.second.getTotalSamples();
    }

    bool CallsiteIsMatched = false;
    // Since indirect call does not have CalleeName, check conservatively if
    // callsite in the profile is a callsite location. This is to reduce num of
    // false positive since otherwise all the indirect call samples will be
    // reported as mismatching.
    if (IRCalleeName == UnknownIndirectCallee)
      CallsiteIsMatched = true;
    else if (Callees.size() == 1 && Callees.count(x: getRepInFormat(Name: IRCalleeName)))
      CallsiteIsMatched = true;

    FuncProfiledCallsites++;
    TotalCallsiteSamples += CallsiteSamples;
    if (!CallsiteIsMatched) {
      FuncMismatchedCallsites++;
      MismatchedCallsiteSamples += CallsiteSamples;
    }
  }
}
2292 | |
2293 | void SampleProfileMatcher::findProfileAnchors(const FunctionSamples &FS, |
2294 | std::map<LineLocation, std::unordered_set<FunctionId>> &ProfileAnchors) { |
2295 | auto isInvalidLineOffset = [](uint32_t LineOffset) { |
2296 | return LineOffset & 0x8000; |
2297 | }; |
2298 | |
2299 | for (const auto &I : FS.getBodySamples()) { |
2300 | const LineLocation &Loc = I.first; |
2301 | if (isInvalidLineOffset(Loc.LineOffset)) |
2302 | continue; |
2303 | for (const auto &I : I.second.getCallTargets()) { |
2304 | auto Ret = ProfileAnchors.try_emplace(k: Loc, |
2305 | args: std::unordered_set<FunctionId>()); |
2306 | Ret.first->second.insert(x: I.first); |
2307 | } |
2308 | } |
2309 | |
2310 | for (const auto &I : FS.getCallsiteSamples()) { |
2311 | const LineLocation &Loc = I.first; |
2312 | if (isInvalidLineOffset(Loc.LineOffset)) |
2313 | continue; |
2314 | const auto &CalleeMap = I.second; |
2315 | for (const auto &I : CalleeMap) { |
2316 | auto Ret = ProfileAnchors.try_emplace(k: Loc, |
2317 | args: std::unordered_set<FunctionId>()); |
2318 | Ret.first->second.insert(x: I.first); |
2319 | } |
2320 | } |
2321 | } |
2322 | |
2323 | // Call target name anchor based profile fuzzy matching. |
2324 | // Input: |
2325 | // For IR locations, the anchor is the callee name of direct callsite; For |
2326 | // profile locations, it's the call target name for BodySamples or inlinee's |
2327 | // profile name for CallsiteSamples. |
2328 | // Matching heuristic: |
2329 | // First match all the anchors in lexical order, then split the non-anchor |
2330 | // locations between the two anchors evenly, first half are matched based on the |
2331 | // start anchor, second half are matched based on the end anchor. |
2332 | // For example, given: |
2333 | // IR locations: [1, 2(foo), 3, 5, 6(bar), 7] |
2334 | // Profile locations: [1, 2, 3(foo), 4, 7, 8(bar), 9] |
2335 | // The matching gives: |
2336 | // [1, 2(foo), 3, 5, 6(bar), 7] |
2337 | // | | | | | | |
2338 | // [1, 2, 3(foo), 4, 7, 8(bar), 9] |
2339 | // The output mapping: [2->3, 3->4, 5->7, 6->8, 7->9]. |
void SampleProfileMatcher::runStaleProfileMatching(
    const Function &F,
    const std::map<LineLocation, StringRef> &IRAnchors,
    const std::map<LineLocation, std::unordered_set<FunctionId>>
        &ProfileAnchors,
    LocToLocMap &IRToProfileLocationMap) {
  LLVM_DEBUG(dbgs() << "Run stale profile matching for " << F.getName()
                    << "\n" );
  assert(IRToProfileLocationMap.empty() &&
         "Run stale profile matching only once per function" );

  // Index the profile-side anchors by callee name so that, while scanning the
  // IR anchors in order, each IR callsite can claim the lexically-first
  // unclaimed profile location with the same callee.
  std::unordered_map<FunctionId, std::set<LineLocation>>
      CalleeToCallsitesMap;
  for (const auto &I : ProfileAnchors) {
    const auto &Loc = I.first;
    const auto &Callees = I.second;
    // Filter out possible indirect calls, use direct callee name as anchor.
    if (Callees.size() == 1) {
      FunctionId CalleeName = *Callees.begin();
      const auto &Candidates = CalleeToCallsitesMap.try_emplace(
          k: CalleeName, args: std::set<LineLocation>());
      Candidates.first->second.insert(x: Loc);
    }
  }

  auto InsertMatching = [&](const LineLocation &From, const LineLocation &To) {
    // Skip the unchanged location mapping to save memory.
    if (From != To)
      IRToProfileLocationMap.insert(x: {From, To});
  };

  // Use function's beginning location as the initial anchor.
  int32_t LocationDelta = 0;
  SmallVector<LineLocation> LastMatchedNonAnchors;

  for (const auto &IR : IRAnchors) {
    const auto &Loc = IR.first;
    auto CalleeName = IR.second;
    bool IsMatchedAnchor = false;
    // Match the anchor location in lexical order.
    if (!CalleeName.empty()) {
      auto CandidateAnchors = CalleeToCallsitesMap.find(
          x: getRepInFormat(Name: CalleeName));
      if (CandidateAnchors != CalleeToCallsitesMap.end() &&
          !CandidateAnchors->second.empty()) {
        // Claim the first remaining profile location for this callee and
        // record the line-offset delta for matching the locations in between.
        auto CI = CandidateAnchors->second.begin();
        const auto Candidate = *CI;
        CandidateAnchors->second.erase(position: CI);
        InsertMatching(Loc, Candidate);
        LLVM_DEBUG(dbgs() << "Callsite with callee:" << CalleeName
                          << " is matched from " << Loc << " to " << Candidate
                          << "\n" );
        LocationDelta = Candidate.LineOffset - Loc.LineOffset;

        // Match backwards for non-anchor locations.
        // The locations in LastMatchedNonAnchors have been matched forwards
        // based on the previous anchor, split it evenly and overwrite the
        // second half based on the current anchor.
        for (size_t I = (LastMatchedNonAnchors.size() + 1) / 2;
             I < LastMatchedNonAnchors.size(); I++) {
          const auto &L = LastMatchedNonAnchors[I];
          uint32_t CandidateLineOffset = L.LineOffset + LocationDelta;
          LineLocation Candidate(CandidateLineOffset, L.Discriminator);
          InsertMatching(L, Candidate);
          LLVM_DEBUG(dbgs() << "Location is rematched backwards from " << L
                            << " to " << Candidate << "\n" );
        }

        IsMatchedAnchor = true;
        LastMatchedNonAnchors.clear();
      }
    }

    // Match forwards for non-anchor locations.
    if (!IsMatchedAnchor) {
      // Shift the location by the delta established by the last anchor.
      uint32_t CandidateLineOffset = Loc.LineOffset + LocationDelta;
      LineLocation Candidate(CandidateLineOffset, Loc.Discriminator);
      InsertMatching(Loc, Candidate);
      LLVM_DEBUG(dbgs() << "Location is matched from " << Loc << " to "
                        << Candidate << "\n" );
      LastMatchedNonAnchors.emplace_back(Args: Loc);
    }
  }
}
2424 | |
// Run staleness detection for one function and, for probe-based profiles with
// an invalid (checksum-mismatched) profile, run the stale profile matching.
void SampleProfileMatcher::runOnFunction(const Function &F) {
  // We need to use flattened function samples for matching.
  // Unlike IR, which includes all callsites from the source code, the callsites
  // in profile only show up when they are hit by samples, i.e. the profile
  // callsites in one context may differ from those in another context. To get
  // the maximum number of callsites, we merge the function profiles from all
  // contexts, aka, the flattened profile to find profile anchors.
  const auto *FSFlattened = getFlattenedSamplesFor(F);
  if (!FSFlattened)
    return;

  // Anchors for IR. It's a map from IR location to callee name, callee name is
  // empty for non-call instruction and use a dummy name(UnknownIndirectCallee)
  // for unknown indirect callee name.
  std::map<LineLocation, StringRef> IRAnchors;
  findIRAnchors(F, IRAnchors);
  // Anchors for profile. It's a map from callsite location to a set of callee
  // name.
  std::map<LineLocation, std::unordered_set<FunctionId>> ProfileAnchors;
  findProfileAnchors(FS: *FSFlattened, ProfileAnchors);

  // Detect profile mismatch for profile staleness metrics report.
  // Skip reporting the metrics for imported functions.
  if (!GlobalValue::isAvailableExternallyLinkage(Linkage: F.getLinkage()) &&
      (ReportProfileStaleness || PersistProfileStaleness)) {
    // Use top-level nested FS for counting profile mismatch metrics since
    // currently once a callsite is mismatched, all its children profiles are
    // dropped.
    if (const auto *FS = Reader.getSamplesFor(F))
      countProfileMismatches(F, FS: *FS, IRAnchors, ProfileAnchors);
  }

  // Run profile matching for checksum mismatched profile, currently only
  // support for pseudo-probe.
  if (SalvageStaleProfile && FunctionSamples::ProfileIsProbeBased &&
      !ProbeManager->profileIsValid(F, Samples: *FSFlattened)) {
    // The matching result will be saved to IRToProfileLocationMap, create a new
    // map for each function.
    runStaleProfileMatching(F, IRAnchors, ProfileAnchors,
                            IRToProfileLocationMap&: getIRToProfileLocationMap(F));
  }
}
2467 | |
// Module entry point for the matcher: flatten the profiles, run staleness
// detection/matching on each profiled function, distribute the matching
// results, then report and/or persist the staleness statistics as requested.
void SampleProfileMatcher::runOnModule() {
  // Matching needs the union of callsites across contexts, so merge all
  // context profiles into flattened per-function profiles first.
  ProfileConverter::flattenProfile(InputProfiles: Reader.getProfiles(), OutputProfiles&: FlattenedProfiles,
                                   ProfileIsCS: FunctionSamples::ProfileIsCS);
  for (auto &F : M) {
    // Only definitions annotated for sample profiling are relevant.
    if (F.isDeclaration() || !F.hasFnAttribute(Kind: "use-sample-profile" ))
      continue;
    runOnFunction(F);
  }
  if (SalvageStaleProfile)
    distributeIRToProfileLocationMap();

  // Print the aggregated staleness metrics to stderr.
  if (ReportProfileStaleness) {
    if (FunctionSamples::ProfileIsProbeBased) {
      errs() << "(" << NumMismatchedFuncHash << "/" << TotalProfiledFunc << ")"
             << " of functions' profile are invalid and "
             << " (" << MismatchedFuncHashSamples << "/" << TotalFuncHashSamples
             << ")"
             << " of samples are discarded due to function hash mismatch.\n" ;
    }
    errs() << "(" << NumMismatchedCallsites << "/" << TotalProfiledCallsites
           << ")"
           << " of callsites' profile are invalid and "
           << "(" << MismatchedCallsiteSamples << "/" << TotalCallsiteSamples
           << ")"
           << " of samples are discarded due to callsite location mismatch.\n" ;
  }

  // Record the same metrics as module-level "llvm.stats" named metadata so
  // they are persisted in the output.
  if (PersistProfileStaleness) {
    LLVMContext &Ctx = M.getContext();
    MDBuilder MDB(Ctx);

    SmallVector<std::pair<StringRef, uint64_t>> ProfStatsVec;
    if (FunctionSamples::ProfileIsProbeBased) {
      ProfStatsVec.emplace_back(Args: "NumMismatchedFuncHash" , Args&: NumMismatchedFuncHash);
      ProfStatsVec.emplace_back(Args: "TotalProfiledFunc" , Args&: TotalProfiledFunc);
      ProfStatsVec.emplace_back(Args: "MismatchedFuncHashSamples" ,
                                Args&: MismatchedFuncHashSamples);
      ProfStatsVec.emplace_back(Args: "TotalFuncHashSamples" , Args&: TotalFuncHashSamples);
    }

    ProfStatsVec.emplace_back(Args: "NumMismatchedCallsites" , Args&: NumMismatchedCallsites);
    ProfStatsVec.emplace_back(Args: "TotalProfiledCallsites" , Args&: TotalProfiledCallsites);
    ProfStatsVec.emplace_back(Args: "MismatchedCallsiteSamples" ,
                              Args&: MismatchedCallsiteSamples);
    ProfStatsVec.emplace_back(Args: "TotalCallsiteSamples" , Args&: TotalCallsiteSamples);

    auto *MD = MDB.createLLVMStats(LLVMStatsVec: ProfStatsVec);
    auto *NMD = M.getOrInsertNamedMetadata(Name: "llvm.stats" );
    NMD->addOperand(M: MD);
  }
}
2519 | |
2520 | void SampleProfileMatcher::distributeIRToProfileLocationMap( |
2521 | FunctionSamples &FS) { |
2522 | const auto ProfileMappings = FuncMappings.find(Key: FS.getFuncName()); |
2523 | if (ProfileMappings != FuncMappings.end()) { |
2524 | FS.setIRToProfileLocationMap(&(ProfileMappings->second)); |
2525 | } |
2526 | |
2527 | for (auto &Inlinees : FS.getCallsiteSamples()) { |
2528 | for (auto FS : Inlinees.second) { |
2529 | distributeIRToProfileLocationMap(FS&: FS.second); |
2530 | } |
2531 | } |
2532 | } |
2533 | |
2534 | // Use a central place to distribute the matching results. Outlined and inlined |
2535 | // profile with the function name will be set to the same pointer. |
2536 | void SampleProfileMatcher::distributeIRToProfileLocationMap() { |
2537 | for (auto &I : Reader.getProfiles()) { |
2538 | distributeIRToProfileLocationMap(FS&: I.second); |
2539 | } |
2540 | } |
2541 | |
// Annotate every function in \p M with its sample profile. Returns true if
// any function was modified.
bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
                                      ProfileSummaryInfo *_PSI,
                                      LazyCallGraph &CG) {
  // Keep the GUID-to-name map populated for the duration of this run so MD5
  // profiles can be related back to function names.
  GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);

  PSI = _PSI;
  // Install the reader's profile summary if the module has none yet.
  if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
    M.setProfileSummary(M: Reader->getSummary().getMD(Context&: M.getContext()),
                        Kind: ProfileSummary::PSK_Sample);
    PSI->refresh();
  }
  // Compute the total number of samples collected in this profile.
  for (const auto &I : Reader->getProfiles())
    TotalCollectedSamples += I.second.getTotalSamples();

  auto Remapper = Reader->getRemapper();
  // Populate the symbol map.
  for (const auto &N_F : M.getValueSymbolTable()) {
    StringRef OrigName = N_F.getKey();
    Function *F = dyn_cast<Function>(Val: N_F.getValue());
    if (F == nullptr || OrigName.empty())
      continue;
    SymbolMap[FunctionId(OrigName)] = F;
    // Also map the canonical (suffix-stripped) name to the function.
    StringRef NewName = FunctionSamples::getCanonicalFnName(F: *F);
    if (OrigName != NewName && !NewName.empty()) {
      auto r = SymbolMap.emplace(Args: FunctionId(NewName), Args&: F);
      // Failing to insert means there is already an entry in SymbolMap,
      // thus there are multiple functions that are mapped to the same
      // stripped name. In this case of name conflicting, set the value
      // to nullptr to avoid confusion.
      if (!r.second)
        r.first->second = nullptr;
      OrigName = NewName;
    }
    // Insert the remapped names into SymbolMap.
    if (Remapper) {
      if (auto MapName = Remapper->lookUpNameInProfile(FunctionName: OrigName)) {
        if (*MapName != OrigName && !MapName->empty())
          SymbolMap.emplace(Args: FunctionId(*MapName), Args&: F);
      }
    }
  }
  assert(SymbolMap.count(FunctionId()) == 0 &&
         "No empty StringRef should be added in SymbolMap" );

  // Run profile staleness detection/matching before annotation when any of
  // the staleness features is enabled.
  if (ReportProfileStaleness || PersistProfileStaleness ||
      SalvageStaleProfile) {
    MatchingManager->runOnModule();
  }

  bool retval = false;
  // Process functions in the order computed from the call graph.
  for (auto *F : buildFunctionOrder(M, CG)) {
    assert(!F->isDeclaration());
    clearFunctionData();
    retval |= runOnFunction(F&: *F, AM);
  }

  // Account for cold calls not inlined....
  if (!FunctionSamples::ProfileIsCS)
    for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
         notInlinedCallInfo)
      updateProfileCallee(Callee: pair.first, EntryDelta: pair.second.entryCount);

  return retval;
}
2607 | |
// Annotate a single function \p F with its sample profile, if one can be
// found. Returns true when annotations were emitted (the function changed).
bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
  LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n" );
  DILocation2SampleMap.clear();
  // By default the entry count is initialized to -1, which will be treated
  // conservatively by getEntryCount as the same as unknown (None). This is
  // to avoid newly added code to be treated as cold. If we have samples
  // this will be overwritten in emitAnnotations.
  uint64_t initialEntryCount = -1;

  ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
  if (ProfileSampleAccurate || F.hasFnAttribute(Kind: "profile-sample-accurate" )) {
    // initialize all the function entry counts to 0. It means all the
    // functions without profile will be regarded as cold.
    initialEntryCount = 0;
    // profile-sample-accurate is a user assertion which has a higher precedence
    // than symbol list. When profile-sample-accurate is on, ignore symbol list.
    ProfAccForSymsInList = false;
  }
  CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);

  // PSL -- profile symbol list include all the symbols in sampled binary.
  // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
  // old functions without samples being cold, without having to worry
  // about new and hot functions being mistakenly treated as cold.
  if (ProfAccForSymsInList) {
    // Initialize the entry count to 0 for functions in the list.
    if (PSL->contains(Name: F.getName()))
      initialEntryCount = 0;

    // Function in the symbol list but without sample will be regarded as
    // cold. To minimize the potential negative performance impact it could
    // have, we want to be a little conservative here saying if a function
    // shows up in the profile, no matter as outline function, inline instance
    // or call targets, treat the function as not being cold. This will handle
    // the cases such as most callsites of a function are inlined in sampled
    // binary but not inlined in current build (because of source code drift,
    // imprecise debug information, or the callsites are all cold individually
    // but not cold accumulatively...), so the outline function showing up as
    // cold in sampled binary will actually not be cold after current build.
    StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
    if ((FunctionSamples::UseMD5 &&
         GUIDsInProfile.count(V: Function::getGUID(GlobalName: CanonName))) ||
        (!FunctionSamples::UseMD5 && NamesInProfile.count(Key: CanonName)))
      initialEntryCount = -1;
  }

  // Initialize entry count when the function has no existing entry
  // count value.
  if (!F.getEntryCount())
    F.setEntryCount(Count: ProfileCount(initialEntryCount, Function::PCT_Real));
  // Get an optimization remark emitter: from the analysis manager when run
  // under the new pass manager, otherwise a locally-owned instance.
  std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
  if (AM) {
    auto &FAM =
        AM->getResult<FunctionAnalysisManagerModuleProxy>(IR&: *F.getParent())
            .getManager();
    ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
  } else {
    OwnedORE = std::make_unique<OptimizationRemarkEmitter>(args: &F);
    ORE = OwnedORE.get();
  }

  // Look up the samples: base samples from the context tracker for CS
  // profiles, otherwise directly from the reader.
  if (FunctionSamples::ProfileIsCS)
    Samples = ContextTracker->getBaseSamplesFor(Func: F);
  else {
    Samples = Reader->getSamplesFor(F);
    // Try search in previously inlined functions that were split or duplicated
    // into base.
    if (!Samples) {
      StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
      auto It = OutlineFunctionSamples.find(x: FunctionId(CanonName));
      if (It != OutlineFunctionSamples.end()) {
        Samples = &It->second;
      } else if (auto Remapper = Reader->getRemapper()) {
        // Fall back to the remapped name when the canonical name has no entry.
        if (auto RemppedName = Remapper->lookUpNameInProfile(FunctionName: CanonName)) {
          It = OutlineFunctionSamples.find(x: FunctionId(*RemppedName));
          if (It != OutlineFunctionSamples.end())
            Samples = &It->second;
        }
      }
    }
  }

  if (Samples && !Samples->empty())
    return emitAnnotations(F);
  return false;
}
// Construct the pass with the profile/remapping file names, the LTO phase it
// runs in, and the (virtual) file system used to read the profile.
SampleProfileLoaderPass::SampleProfileLoaderPass(
    std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase,
    IntrusiveRefCntPtr<vfs::FileSystem> FS)
    : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
      LTOPhase(LTOPhase), FS(std::move(FS)) {}
2699 | |
// New pass manager entry point: construct the loader, initialize it from the
// profile file, and run it over the module. Preserves everything when the
// profile cannot be loaded or no function was changed.
PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
                                               ModuleAnalysisManager &AM) {
  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();

  // Callbacks to fetch per-function analyses lazily for the loader/inliner.
  auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(IR&: F);
  };
  auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(IR&: F);
  };
  auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
    return FAM.getResult<TargetLibraryAnalysis>(IR&: F);
  };

  if (!FS)
    FS = vfs::getRealFileSystem();

  // Command-line file names take effect only when the pass was constructed
  // without explicit ones.
  SampleProfileLoader SampleLoader(
      ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
      ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
                                       : ProfileRemappingFileName,
      LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI);

  if (!SampleLoader.doInitialization(M, FAM: &FAM))
    return PreservedAnalyses::all();

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(IR&: M);
  LazyCallGraph &CG = AM.getResult<LazyCallGraphAnalysis>(IR&: M);
  if (!SampleLoader.runOnModule(M, AM: &AM, PSI: PSI, CG))
    return PreservedAnalyses::all();

  return PreservedAnalyses::none();
}
2734 | |