1 | //===- bolt/Profile/DataAggregator.h - Perf data aggregator -----*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This family of functions reads profile data written by perf record, |
10 | // aggregates it and then writes it back to an output file. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef BOLT_PROFILE_DATA_AGGREGATOR_H |
15 | #define BOLT_PROFILE_DATA_AGGREGATOR_H |
16 | |
17 | #include "bolt/Profile/DataReader.h" |
18 | #include "llvm/ADT/StringRef.h" |
19 | #include "llvm/Support/Error.h" |
20 | #include "llvm/Support/Program.h" |
21 | #include <unordered_map> |
22 | |
23 | namespace llvm { |
24 | namespace bolt { |
25 | |
26 | class BinaryFunction; |
27 | class BinaryContext; |
28 | class BoltAddressTranslation; |
29 | |
30 | /// DataAggregator inherits all parsing logic from DataReader as well as |
31 | /// its data structures used to represent aggregated profile data in memory. |
32 | /// |
33 | /// The aggregator works by dispatching two separate perf-script jobs that |
34 | /// read perf samples and perf task annotations. Later, we read the output |
35 | /// files to extract information about which PID was used for this binary. |
36 | /// With the PID, we filter the samples and extract all LBR entries. |
37 | /// |
38 | /// To aggregate LBR entries, we rely on a BinaryFunction map to locate the |
39 | /// original function where the event happened. Then, we convert a raw address |
40 | /// to an offset relative to the start of this function and aggregate branch |
41 | /// information for each function. |
42 | /// |
43 | /// This must be coordinated with RewriteInstance so we have BinaryFunctions in |
44 | /// State::Disassembled. After this state, BinaryFunction will drop the |
45 | /// instruction map with original addresses we rely on to validate the traces |
46 | /// found in the LBR. |
47 | /// |
48 | /// The last step is to write the aggregated data to disk in the output file |
49 | /// specified by the user. |
50 | class DataAggregator : public DataReader { |
51 | public: |
52 | explicit DataAggregator(StringRef Filename) : DataReader(Filename) { |
53 | start(); |
54 | } |
55 | |
56 | ~DataAggregator(); |
57 | |
58 | StringRef getReaderName() const override { return "perf data aggregator" ; } |
59 | |
60 | bool isTrustedSource() const override { return true; } |
61 | |
62 | Error preprocessProfile(BinaryContext &BC) override; |
63 | |
64 | Error readProfilePreCFG(BinaryContext &BC) override { |
65 | return Error::success(); |
66 | } |
67 | |
68 | Error readProfile(BinaryContext &BC) override; |
69 | |
70 | bool mayHaveProfileData(const BinaryFunction &BF) override; |
71 | |
72 | /// Set Bolt Address Translation Table when processing samples collected in |
73 | /// bolted binaries |
74 | void setBAT(BoltAddressTranslation *B) override { BAT = B; } |
75 | |
76 | /// Check whether \p FileName is a perf.data file |
77 | static bool checkPerfDataMagic(StringRef FileName); |
78 | |
79 | private: |
80 | struct PerfBranchSample { |
81 | SmallVector<LBREntry, 32> LBR; |
82 | uint64_t PC; |
83 | }; |
84 | |
85 | struct PerfBasicSample { |
86 | StringRef EventName; |
87 | uint64_t PC; |
88 | }; |
89 | |
/// Result type of parseMemSample(): an IP paired with the address that came
/// with it in the sample.
struct PerfMemSample {
  /// Instruction pointer of the sample.
  uint64_t PC;
  /// Address associated with the sample.
  uint64_t Addr;
};
94 | |
95 | /// Used for parsing specific pre-aggregated input files. |
96 | struct AggregatedLBREntry { |
97 | enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN }; |
98 | Location From; |
99 | Location To; |
100 | uint64_t Count; |
101 | uint64_t Mispreds; |
102 | Type EntryType; |
103 | }; |
104 | |
/// A (From, To) pair of 64-bit values used as the key of the LBR aggregation
/// maps.
struct Trace {
  uint64_t From;
  uint64_t To;

  Trace(uint64_t From, uint64_t To) : From(From), To(To) {}

  /// Two traces compare equal when both endpoints match.
  bool operator==(const Trace &Other) const {
    return From == Other.From && To == Other.To;
  }
};

/// Hash functor for Trace keys.
///
/// Both endpoints contribute all of their bits to the result. Mixing them as
/// `From << 32 | To` discards the upper 32 bits of From and ORs its lower
/// half onto the upper half of To, so distinct traces (for example
/// {1 << 32, 0} and {0, 0}) end up with the same hash value.
struct TraceHash {
  size_t operator()(const Trace &L) const {
    std::hash<uint64_t> Hasher;
    size_t Seed = Hasher(L.From);
    // hash_combine-style mixing step: fold the second endpoint into the seed
    // so that swapping or shifting bits between From and To changes the
    // result.
    Seed ^= Hasher(L.To) + static_cast<size_t>(0x9e3779b97f4a7c15ULL) +
            (Seed << 6) + (Seed >> 2);
    return Seed;
  }
};
119 | |
/// Pair of counters stored per Trace in FallthroughLBRs; both start at zero.
struct FTInfo {
  /// NOTE(review): presumably counts fall-throughs with an internal origin -
  /// confirm against the aggregation code.
  uint64_t InternCount = 0;
  /// NOTE(review): presumably counts fall-throughs with an external origin -
  /// confirm against the aggregation code.
  uint64_t ExternCount = 0;
};
124 | |
/// Pair of counters stored per Trace in BranchLBRs; both start at zero.
struct BranchInfo {
  /// Taken counter.
  uint64_t TakenCount = 0;
  /// Misprediction counter.
  uint64_t MispredCount = 0;
};
129 | |
130 | /// Intermediate storage for profile data. We save the results of parsing |
131 | /// and use them later for processing and assigning profile. |
132 | std::unordered_map<Trace, BranchInfo, TraceHash> BranchLBRs; |
133 | std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs; |
134 | std::vector<AggregatedLBREntry> AggregatedLBRs; |
135 | std::unordered_map<uint64_t, uint64_t> BasicSamples; |
136 | std::vector<PerfMemSample> MemSamples; |
137 | |
/// Replace the contents of \p Container with those of a default-constructed
/// T. The previous contents move into a temporary via swap() and are
/// destroyed together with it.
template <typename T> void clear(T &Container) {
  T().swap(Container);
}
142 | |
143 | /// Perf utility full path name |
144 | std::string PerfPath; |
145 | |
146 | /// Perf process spawning bookkeeping |
147 | struct PerfProcessInfo { |
148 | bool IsFinished{false}; |
149 | sys::ProcessInfo PI; |
150 | SmallVector<char, 256> StdoutPath; |
151 | SmallVector<char, 256> StderrPath; |
152 | }; |
153 | |
154 | /// Process info for spawned processes |
155 | PerfProcessInfo MainEventsPPI; |
156 | PerfProcessInfo MemEventsPPI; |
157 | PerfProcessInfo MMapEventsPPI; |
158 | PerfProcessInfo TaskEventsPPI; |
159 | |
160 | /// Kernel VM starts at a fixed base address |
161 | /// https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt |
162 | static constexpr uint64_t KernelBaseAddr = 0xffff800000000000; |
163 | |
164 | /// Current list of created temporary files |
165 | std::vector<std::string> TempFiles; |
166 | |
167 | /// Name of the binary with matching build-id from perf.data if different |
168 | /// from the file name in BC. |
169 | std::string BuildIDBinaryName; |
170 | |
171 | /// Memory map info for a single file as recorded in perf.data |
struct MMapInfo {
  /// Base address of the mapped binary.
  uint64_t BaseAddress = 0;
  /// Address of the executable segment.
  uint64_t MMapAddress = 0;
  /// Size of the mapping.
  uint64_t Size = 0;
  /// File offset of the mapped segment.
  uint64_t Offset = 0;
  /// Process ID; -1 until one is assigned.
  int32_t PID = -1;
  /// Was the process forked?
  bool Forked = false;
  /// Time in microseconds.
  uint64_t Time = 0ULL;
};
181 | |
182 | /// Per-PID map info for the binary |
183 | std::unordered_map<uint64_t, MMapInfo> BinaryMMapInfo; |
184 | |
185 | /// Fork event info |
struct ForkInfo {
  /// PID of the parent process; -1 until parsed. Initialized so that a
  /// default-constructed ForkInfo never carries an indeterminate PID
  /// (same sentinel as MMapInfo::PID).
  int32_t ParentPID{-1};
  /// PID of the child process; -1 until parsed.
  int32_t ChildPID{-1};
  /// Timestamp of the event; 0 until parsed.
  uint64_t Time{0ULL};
};
191 | |
192 | /// References to core BOLT data structures |
193 | BinaryContext *BC{nullptr}; |
194 | |
195 | BoltAddressTranslation *BAT{nullptr}; |
196 | |
197 | /// Update function execution profile with a recorded trace. |
198 | /// A trace is a region of code executed between two LBR entries supplied in |
199 | /// execution order. |
200 | /// |
201 | /// Return true if the trace is valid, false otherwise. |
202 | bool |
203 | recordTrace(BinaryFunction &BF, const LBREntry &First, const LBREntry &Second, |
204 | uint64_t Count, |
205 | SmallVector<std::pair<uint64_t, uint64_t>, 16> &Branches) const; |
206 | |
207 | /// Return a vector of offsets corresponding to a trace in a function |
208 | /// (see recordTrace() above). |
209 | std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>> |
210 | getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First, |
211 | const LBREntry &Second, uint64_t Count = 1) const; |
212 | |
213 | /// Record external entry into the function \p BF. |
214 | /// |
215 | /// Return true if the entry is valid, false otherwise. |
216 | bool recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred, |
217 | uint64_t Count = 1) const; |
218 | |
219 | /// Record exit from the function \p BF via a call or return. |
220 | /// |
221 | /// Return true if the exit point is valid, false otherwise. |
222 | bool recordExit(BinaryFunction &BF, uint64_t From, bool Mispred, |
223 | uint64_t Count = 1) const; |
224 | |
225 | /// Aggregation statistics |
226 | uint64_t NumInvalidTraces{0}; |
227 | uint64_t NumLongRangeTraces{0}; |
228 | /// Specifies how many samples were recorded in cold areas if we are dealing |
229 | /// with profiling data collected in a bolted binary. For LBRs, incremented |
230 | /// for the source of the branch to avoid counting cold activity twice (one |
231 | /// for source and another for destination). |
232 | uint64_t NumColdSamples{0}; |
233 | |
234 | /// Look in the system PATH for Linux Perf and set up the aggregator to use it |
235 | void findPerfExecutable(); |
236 | |
237 | /// Launch a perf subprocess with given args and save output for later |
238 | /// parsing. |
239 | void launchPerfProcess(StringRef Name, PerfProcessInfo &PPI, |
240 | const char *ArgsString, bool Wait); |
241 | |
242 | /// Delete all temporary files created to hold the output generated by spawned |
243 | /// subprocesses during the aggregation job |
244 | void deleteTempFiles(); |
245 | |
246 | // Semantic pass helpers |
247 | |
248 | /// Look up which function contains an address by using our map of |
249 | /// disassembled BinaryFunctions |
250 | BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) const; |
251 | |
252 | /// Perform BAT translation for a given \p Func and return the parent |
253 | /// BinaryFunction or nullptr. |
254 | BinaryFunction *getBATParentFunction(const BinaryFunction &Func) const; |
255 | |
256 | /// Retrieve the location name to be used for samples recorded in \p Func. |
257 | StringRef getLocationName(const BinaryFunction &Func) const; |
258 | |
259 | /// Semantic actions - parser hooks to interpret parsed perf samples |
260 | /// Register a sample (non-LBR mode), i.e. a new hit at \p Address |
261 | bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count); |
262 | |
263 | /// Register an intraprocedural branch from \p From to \p To within \p Func. |
264 | bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To, |
265 | uint64_t Count, uint64_t Mispreds); |
266 | |
267 | /// Register an interprocedural branch from \p FromFunc to \p ToFunc with |
268 | /// offsets \p From and \p To, respectively. |
269 | bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc, |
270 | uint64_t From, uint64_t To, uint64_t Count, |
271 | uint64_t Mispreds); |
272 | |
273 | /// Register a branch from \p From to \p To. |
274 | bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds); |
275 | |
276 | /// Register a trace between two LBR entries supplied in execution order. |
277 | bool doTrace(const LBREntry &First, const LBREntry &Second, |
278 | uint64_t Count = 1); |
279 | |
280 | /// Parser helpers |
281 | /// Return false if we exhausted our parser buffer and finished parsing |
282 | /// everything |
283 | bool hasData() const { return !ParsingBuf.empty(); } |
284 | |
285 | /// Print heat map based on LBR samples. |
286 | std::error_code printLBRHeatMap(); |
287 | |
288 | /// Parse a single perf sample containing a PID associated with a sequence of |
289 | /// LBR entries. If the PID does not correspond to the binary we are looking |
290 | /// for, return std::errc::no_such_process. If other parsing errors occur, |
291 | /// return the error. Otherwise, return the parsed sample. |
292 | ErrorOr<PerfBranchSample> parseBranchSample(); |
293 | |
294 | /// Parse a single perf sample containing a PID associated with an event name |
295 | /// and a PC |
296 | ErrorOr<PerfBasicSample> parseBasicSample(); |
297 | |
298 | /// Parse a single perf sample containing a PID associated with an IP and |
299 | /// address. |
300 | ErrorOr<PerfMemSample> parseMemSample(); |
301 | |
302 | /// Parse pre-aggregated LBR samples created by an external tool |
303 | ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry(); |
304 | |
305 | /// Parse either buildid:offset or just offset, representing a location in the |
306 | /// binary. Used exclusively for pre-aggregated LBR samples. |
307 | ErrorOr<Location> parseLocationOrOffset(); |
308 | |
309 | /// Check if a field separator is the next char to parse and, if yes, consume |
310 | /// it and return true |
311 | bool checkAndConsumeFS(); |
312 | |
313 | /// Consume the entire line |
314 | void consumeRestOfLine(); |
315 | |
316 | /// True if the next token in the parsing buffer is a new line, but don't |
317 | /// consume it (peek only). |
318 | bool checkNewLine(); |
319 | |
320 | using PerfProcessErrorCallbackTy = std::function<void(int, StringRef)>; |
321 | /// Prepare to parse data from a given perf script invocation. |
322 | /// Returns an invocation exit code. |
323 | int prepareToParse(StringRef Name, PerfProcessInfo &Process, |
324 | PerfProcessErrorCallbackTy Callback); |
325 | |
326 | /// Parse a single LBR entry as output by perf script -Fbrstack |
327 | ErrorOr<LBREntry> parseLBREntry(); |
328 | |
329 | /// Parse LBR sample, returns the number of traces. |
330 | uint64_t parseLBRSample(const PerfBranchSample &Sample, bool NeedsSkylakeFix); |
331 | |
332 | /// Parse and pre-aggregate branch events. |
333 | std::error_code parseBranchEvents(); |
334 | |
335 | /// Process all branch events. |
336 | void processBranchEvents(); |
337 | |
338 | /// This member function supports generating data for AutoFDO LLVM tools. |
339 | std::error_code writeAutoFDOData(StringRef OutputFilename); |
340 | |
341 | /// Parse the full output generated by perf script to report non-LBR samples. |
342 | std::error_code parseBasicEvents(); |
343 | |
344 | /// Process non-LBR events. |
345 | void processBasicEvents(); |
346 | |
347 | /// Parse the full output generated by perf script to report memory events. |
348 | std::error_code parseMemEvents(); |
349 | |
350 | /// Process parsed memory events profile. |
351 | void processMemEvents(); |
352 | |
353 | /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a mapping |
354 | /// between the binary name and its memory layout in a process with a given |
355 | /// PID. |
356 | /// On success return a <FileName, MMapInfo> pair. |
357 | ErrorOr<std::pair<StringRef, MMapInfo>> parseMMapEvent(); |
358 | |
359 | /// Parse PERF_RECORD_FORK event. |
360 | std::optional<ForkInfo> parseForkEvent(); |
361 | |
362 | /// Parse 'PERF_RECORD_COMM exec'. Don't consume the string. |
363 | std::optional<int32_t> parseCommExecEvent(); |
364 | |
365 | /// Parse the full output generated by `perf script --show-mmap-events` |
366 | /// to generate mapping between binary files and their memory mappings for |
367 | /// all PIDs. |
368 | std::error_code parseMMapEvents(); |
369 | |
370 | /// Parse output of `perf script --show-task-events`, and add forked processes |
371 | /// to the set of tracked PIDs. |
372 | std::error_code parseTaskEvents(); |
373 | |
374 | /// Parse a single pair of binary full path and associated build-id |
375 | std::optional<std::pair<StringRef, StringRef>> parseNameBuildIDPair(); |
376 | |
377 | /// Coordinate reading and parsing of pre-aggregated file |
378 | /// |
379 | /// The regular perf2bolt aggregation job is to read perf output directly. |
380 | /// However, if the data is coming from a database instead of perf, one could |
381 | /// write a query to produce a pre-aggregated file. This function deals with |
382 | /// this case. |
383 | /// |
384 | /// The pre-aggregated file contains aggregated LBR data, but without binary |
385 | /// knowledge. BOLT will parse it and, using information from the disassembled |
386 | /// binary, augment it with fall-through edge frequency information. After |
387 | /// this step is finished, this data can be either written to disk to be |
388 | /// consumed by BOLT later, or can be used by BOLT immediately if kept in |
389 | /// memory. |
390 | /// |
391 | /// File format syntax: |
392 | /// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count> |
393 | /// [<mispred_count>] |
394 | /// |
395 | /// B - indicates an aggregated branch |
396 | /// F - an aggregated fall-through |
397 | /// f - an aggregated fall-through with external origin - used to disambiguate |
398 | /// between a return hitting a basic block head and a regular internal |
399 | /// jump to the block |
400 | /// |
401 | /// <start_id> - build id of the object containing the start address. We can |
402 | /// skip it for the main binary and use "X" for an unknown object. This will |
403 | /// save some space and facilitate human parsing. |
404 | /// |
405 | /// <start_offset> - hex offset from the object base load address (0 for the |
406 | /// main executable unless it's PIE) to the start address. |
407 | /// |
408 | /// <end_id>, <end_offset> - same for the end address. |
409 | /// |
410 | /// <count> - total aggregated count of the branch or a fall-through. |
411 | /// |
412 | /// <mispred_count> - the number of times the branch was mispredicted. |
413 | /// Omitted for fall-throughs. |
414 | /// |
415 | /// Example: |
416 | /// F 41be50 41be50 3 |
417 | /// F 41be90 41be90 4 |
418 | /// B 4b1942 39b57f0 3 0 |
419 | /// B 4b196f 4b19e0 2 0 |
420 | void parsePreAggregated(); |
421 | |
422 | /// Parse the full output of pre-aggregated LBR samples generated by |
423 | /// an external tool. |
424 | std::error_code parsePreAggregatedLBRSamples(); |
425 | |
426 | /// Process parsed pre-aggregated data. |
427 | void processPreAggregated(); |
428 | |
429 | /// If \p Address falls into the binary address space based on memory |
430 | /// mapping info \p MMI, then adjust it for further processing by subtracting |
431 | /// the base load address. External addresses, i.e. addresses that do not |
432 | /// correspond to the binary allocated address space, are adjusted to avoid |
433 | /// conflicts. |
434 | void adjustAddress(uint64_t &Address, const MMapInfo &MMI) const { |
435 | if (Address >= MMI.MMapAddress && Address < MMI.MMapAddress + MMI.Size) { |
436 | Address -= MMI.BaseAddress; |
437 | } else if (Address < MMI.Size) { |
438 | // Make sure the address is not treated as belonging to the binary. |
439 | Address = (-1ULL); |
440 | } |
441 | } |
442 | |
443 | /// Adjust addresses in \p LBR entry. |
444 | void adjustLBR(LBREntry &LBR, const MMapInfo &MMI) const { |
445 | adjustAddress(Address&: LBR.From, MMI); |
446 | adjustAddress(Address&: LBR.To, MMI); |
447 | } |
448 | |
449 | /// Ignore kernel/user transition LBR if requested |
450 | bool ignoreKernelInterrupt(LBREntry &LBR) const; |
451 | |
452 | /// Populate functions in \p BC with profile. |
453 | void processProfile(BinaryContext &BC); |
454 | |
455 | /// Start an aggregation job asynchronously. |
456 | void start(); |
457 | |
458 | /// Returns true if this aggregation job is using a translation table to |
459 | /// remap samples collected on binaries already processed by BOLT. |
460 | bool usesBAT() const { return BAT; } |
461 | |
462 | /// Force all subprocesses to stop and cancel aggregation |
463 | void abort(); |
464 | |
465 | /// Dump data structures into a file readable by llvm-bolt |
466 | std::error_code writeAggregatedFile(StringRef OutputFilename) const; |
467 | |
468 | /// Dump translated data structures into YAML |
469 | std::error_code writeBATYAML(BinaryContext &BC, |
470 | StringRef OutputFilename) const; |
471 | |
472 | /// Filter out binaries based on PID |
473 | void filterBinaryMMapInfo(); |
474 | |
475 | /// If we have a build-id available for the input file, use it to assist |
476 | /// matching profile to a binary. |
477 | /// |
478 | /// If the binary name changed after profile collection, use build-id |
479 | /// to get the proper name in perf data when build-ids are available. |
480 | /// If \p FileBuildID has no match, then issue an error and exit. |
481 | void processFileBuildID(StringRef FileBuildID); |
482 | |
483 | /// Debugging dump methods |
484 | void dump() const; |
485 | void dump(const LBREntry &LBR) const; |
486 | void dump(const PerfBranchSample &Sample) const; |
487 | void dump(const PerfMemSample &Sample) const; |
488 | |
489 | public: |
490 | /// If perf.data was collected without build ids, the buildid-list may contain |
491 | /// incomplete entries. Return true if the buffer containing |
492 | /// "perf buildid-list" output has only valid entries and is non-empty. |
493 | /// Return false otherwise. |
494 | bool hasAllBuildIDs(); |
495 | |
496 | /// Parse the output generated by "perf buildid-list" to extract build-ids |
497 | /// and return a file name matching a given \p FileBuildID. |
498 | std::optional<StringRef> getFileNameForBuildID(StringRef FileBuildID); |
499 | }; |
500 | } // namespace bolt |
501 | } // namespace llvm |
502 | |
503 | #endif |
504 | |