1//===- bolt/Profile/DataAggregator.h - Perf data aggregator -----*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This family of functions reads profile data written by perf record,
10// aggregates it and then writes it back to an output file.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef BOLT_PROFILE_DATA_AGGREGATOR_H
15#define BOLT_PROFILE_DATA_AGGREGATOR_H
16
17#include "bolt/Profile/DataReader.h"
18#include "llvm/ADT/StringRef.h"
19#include "llvm/Support/Error.h"
20#include "llvm/Support/Program.h"
21#include <unordered_map>
22
23namespace llvm {
24namespace bolt {
25
26class BinaryFunction;
27class BinaryContext;
28class BoltAddressTranslation;
29
30/// DataAggregator inherits all parsing logic from DataReader as well as
31/// its data structures used to represent aggregated profile data in memory.
32///
33/// The aggregator works by dispatching two separate perf-script jobs that
34/// read perf samples and perf task annotations. Later, we read the output
35/// files to extract information about which PID was used for this binary.
36/// With the PID, we filter the samples and extract all LBR entries.
37///
38/// To aggregate LBR entries, we rely on a BinaryFunction map to locate the
39/// original function where the event happened. Then, we convert a raw address
40/// to an offset relative to the start of this function and aggregate branch
41/// information for each function.
42///
43/// This must be coordinated with RewriteInstance so we have BinaryFunctions in
44/// State::Disassembled. After this state, BinaryFunction will drop the
45/// instruction map with original addresses we rely on to validate the traces
46/// found in the LBR.
47///
48/// The last step is to write the aggregated data to disk in the output file
49/// specified by the user.
50class DataAggregator : public DataReader {
51public:
52 explicit DataAggregator(StringRef Filename) : DataReader(Filename) {
53 start();
54 }
55
56 ~DataAggregator();
57
58 StringRef getReaderName() const override { return "perf data aggregator"; }
59
60 bool isTrustedSource() const override { return true; }
61
62 Error preprocessProfile(BinaryContext &BC) override;
63
64 Error readProfilePreCFG(BinaryContext &BC) override {
65 return Error::success();
66 }
67
68 Error readProfile(BinaryContext &BC) override;
69
70 bool mayHaveProfileData(const BinaryFunction &BF) override;
71
72 /// Set Bolt Address Translation Table when processing samples collected in
73 /// bolted binaries
74 void setBAT(BoltAddressTranslation *B) override { BAT = B; }
75
76 /// Check whether \p FileName is a perf.data file
77 static bool checkPerfDataMagic(StringRef FileName);
78
79private:
80 struct PerfBranchSample {
81 SmallVector<LBREntry, 32> LBR;
82 uint64_t PC;
83 };
84
85 struct PerfBasicSample {
86 StringRef EventName;
87 uint64_t PC;
88 };
89
90 struct PerfMemSample {
91 uint64_t PC;
92 uint64_t Addr;
93 };
94
95 /// Used for parsing specific pre-aggregated input files.
96 struct AggregatedLBREntry {
97 enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN };
98 Location From;
99 Location To;
100 uint64_t Count;
101 uint64_t Mispreds;
102 Type EntryType;
103 };
104
105 struct Trace {
106 uint64_t From;
107 uint64_t To;
108 Trace(uint64_t From, uint64_t To) : From(From), To(To) {}
109 bool operator==(const Trace &Other) const {
110 return From == Other.From && To == Other.To;
111 }
112 };
113
114 struct TraceHash {
115 size_t operator()(const Trace &L) const {
116 return std::hash<uint64_t>()(L.From << 32 | L.To);
117 }
118 };
119
120 struct FTInfo {
121 uint64_t InternCount{0};
122 uint64_t ExternCount{0};
123 };
124
125 struct BranchInfo {
126 uint64_t TakenCount{0};
127 uint64_t MispredCount{0};
128 };
129
130 /// Intermediate storage for profile data. We save the results of parsing
131 /// and use them later for processing and assigning profile.
132 std::unordered_map<Trace, BranchInfo, TraceHash> BranchLBRs;
133 std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs;
134 std::vector<AggregatedLBREntry> AggregatedLBRs;
135 std::unordered_map<uint64_t, uint64_t> BasicSamples;
136 std::vector<PerfMemSample> MemSamples;
137
138 template <typename T> void clear(T &Container) {
139 T TempContainer;
140 TempContainer.swap(Container);
141 }
142
143 /// Perf utility full path name
144 std::string PerfPath;
145
146 /// Perf process spawning bookkeeping
147 struct PerfProcessInfo {
148 bool IsFinished{false};
149 sys::ProcessInfo PI;
150 SmallVector<char, 256> StdoutPath;
151 SmallVector<char, 256> StderrPath;
152 };
153
154 /// Process info for spawned processes
155 PerfProcessInfo MainEventsPPI;
156 PerfProcessInfo MemEventsPPI;
157 PerfProcessInfo MMapEventsPPI;
158 PerfProcessInfo TaskEventsPPI;
159
160 /// Kernel VM starts at fixed based address
161 /// https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt
162 static constexpr uint64_t KernelBaseAddr = 0xffff800000000000;
163
164 /// Current list of created temporary files
165 std::vector<std::string> TempFiles;
166
167 /// Name of the binary with matching build-id from perf.data if different
168 /// from the file name in BC.
169 std::string BuildIDBinaryName;
170
171 /// Memory map info for a single file as recorded in perf.data
172 struct MMapInfo {
173 uint64_t BaseAddress{0}; /// Base address of the mapped binary.
174 uint64_t MMapAddress{0}; /// Address of the executable segment.
175 uint64_t Size{0}; /// Size of the mapping.
176 uint64_t Offset{0}; /// File offset of the mapped segment.
177 int32_t PID{-1}; /// Process ID.
178 bool Forked{false}; /// Was the process forked?
179 uint64_t Time{0ULL}; /// Time in micro seconds.
180 };
181
182 /// Per-PID map info for the binary
183 std::unordered_map<uint64_t, MMapInfo> BinaryMMapInfo;
184
185 /// Fork event info
186 struct ForkInfo {
187 int32_t ParentPID;
188 int32_t ChildPID;
189 uint64_t Time{0ULL};
190 };
191
192 /// References to core BOLT data structures
193 BinaryContext *BC{nullptr};
194
195 BoltAddressTranslation *BAT{nullptr};
196
197 /// Update function execution profile with a recorded trace.
198 /// A trace is region of code executed between two LBR entries supplied in
199 /// execution order.
200 ///
201 /// Return true if the trace is valid, false otherwise.
202 bool
203 recordTrace(BinaryFunction &BF, const LBREntry &First, const LBREntry &Second,
204 uint64_t Count,
205 SmallVector<std::pair<uint64_t, uint64_t>, 16> &Branches) const;
206
207 /// Return a vector of offsets corresponding to a trace in a function
208 /// (see recordTrace() above).
209 std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
210 getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First,
211 const LBREntry &Second, uint64_t Count = 1) const;
212
213 /// Record external entry into the function \p BF.
214 ///
215 /// Return true if the entry is valid, false otherwise.
216 bool recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred,
217 uint64_t Count = 1) const;
218
219 /// Record exit from the function \p BF via a call or return.
220 ///
221 /// Return true if the exit point is valid, false otherwise.
222 bool recordExit(BinaryFunction &BF, uint64_t From, bool Mispred,
223 uint64_t Count = 1) const;
224
225 /// Aggregation statistics
226 uint64_t NumInvalidTraces{0};
227 uint64_t NumLongRangeTraces{0};
228 /// Specifies how many samples were recorded in cold areas if we are dealing
229 /// with profiling data collected in a bolted binary. For LBRs, incremented
230 /// for the source of the branch to avoid counting cold activity twice (one
231 /// for source and another for destination).
232 uint64_t NumColdSamples{0};
233
234 /// Looks into system PATH for Linux Perf and set up the aggregator to use it
235 void findPerfExecutable();
236
237 /// Launch a perf subprocess with given args and save output for later
238 /// parsing.
239 void launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
240 const char *ArgsString, bool Wait);
241
242 /// Delete all temporary files created to hold the output generated by spawned
243 /// subprocesses during the aggregation job
244 void deleteTempFiles();
245
246 // Semantic pass helpers
247
248 /// Look up which function contains an address by using out map of
249 /// disassembled BinaryFunctions
250 BinaryFunction *getBinaryFunctionContainingAddress(uint64_t Address) const;
251
252 /// Perform BAT translation for a given \p Func and return the parent
253 /// BinaryFunction or nullptr.
254 BinaryFunction *getBATParentFunction(const BinaryFunction &Func) const;
255
256 /// Retrieve the location name to be used for samples recorded in \p Func.
257 StringRef getLocationName(const BinaryFunction &Func) const;
258
259 /// Semantic actions - parser hooks to interpret parsed perf samples
260 /// Register a sample (non-LBR mode), i.e. a new hit at \p Address
261 bool doSample(BinaryFunction &Func, const uint64_t Address, uint64_t Count);
262
263 /// Register an intraprocedural branch \p Branch.
264 bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To,
265 uint64_t Count, uint64_t Mispreds);
266
267 /// Register an interprocedural branch from \p FromFunc to \p ToFunc with
268 /// offsets \p From and \p To, respectively.
269 bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc,
270 uint64_t From, uint64_t To, uint64_t Count,
271 uint64_t Mispreds);
272
273 /// Register a \p Branch.
274 bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
275
276 /// Register a trace between two LBR entries supplied in execution order.
277 bool doTrace(const LBREntry &First, const LBREntry &Second,
278 uint64_t Count = 1);
279
280 /// Parser helpers
281 /// Return false if we exhausted our parser buffer and finished parsing
282 /// everything
283 bool hasData() const { return !ParsingBuf.empty(); }
284
285 /// Print heat map based on LBR samples.
286 std::error_code printLBRHeatMap();
287
288 /// Parse a single perf sample containing a PID associated with a sequence of
289 /// LBR entries. If the PID does not correspond to the binary we are looking
290 /// for, return std::errc::no_such_process. If other parsing errors occur,
291 /// return the error. Otherwise, return the parsed sample.
292 ErrorOr<PerfBranchSample> parseBranchSample();
293
294 /// Parse a single perf sample containing a PID associated with an event name
295 /// and a PC
296 ErrorOr<PerfBasicSample> parseBasicSample();
297
298 /// Parse a single perf sample containing a PID associated with an IP and
299 /// address.
300 ErrorOr<PerfMemSample> parseMemSample();
301
302 /// Parse pre-aggregated LBR samples created by an external tool
303 ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry();
304
305 /// Parse either buildid:offset or just offset, representing a location in the
306 /// binary. Used exclusively for pre-aggregated LBR samples.
307 ErrorOr<Location> parseLocationOrOffset();
308
309 /// Check if a field separator is the next char to parse and, if yes, consume
310 /// it and return true
311 bool checkAndConsumeFS();
312
313 /// Consume the entire line
314 void consumeRestOfLine();
315
316 /// True if the next token in the parsing buffer is a new line, but don't
317 /// consume it (peek only).
318 bool checkNewLine();
319
320 using PerfProcessErrorCallbackTy = std::function<void(int, StringRef)>;
321 /// Prepare to parse data from a given perf script invocation.
322 /// Returns an invocation exit code.
323 int prepareToParse(StringRef Name, PerfProcessInfo &Process,
324 PerfProcessErrorCallbackTy Callback);
325
326 /// Parse a single LBR entry as output by perf script -Fbrstack
327 ErrorOr<LBREntry> parseLBREntry();
328
329 /// Parse LBR sample, returns the number of traces.
330 uint64_t parseLBRSample(const PerfBranchSample &Sample, bool NeedsSkylakeFix);
331
332 /// Parse and pre-aggregate branch events.
333 std::error_code parseBranchEvents();
334
335 /// Process all branch events.
336 void processBranchEvents();
337
338 /// This member function supports generating data for AutoFDO LLVM tools.
339 std::error_code writeAutoFDOData(StringRef OutputFilename);
340
341 /// Parse the full output generated by perf script to report non-LBR samples.
342 std::error_code parseBasicEvents();
343
344 /// Process non-LBR events.
345 void processBasicEvents();
346
347 /// Parse the full output generated by perf script to report memory events.
348 std::error_code parseMemEvents();
349
350 /// Process parsed memory events profile.
351 void processMemEvents();
352
353 /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a mapping
354 /// between the binary name and its memory layout in a process with a given
355 /// PID.
356 /// On success return a <FileName, MMapInfo> pair.
357 ErrorOr<std::pair<StringRef, MMapInfo>> parseMMapEvent();
358
359 /// Parse PERF_RECORD_FORK event.
360 std::optional<ForkInfo> parseForkEvent();
361
362 /// Parse 'PERF_RECORD_COMM exec'. Don't consume the string.
363 std::optional<int32_t> parseCommExecEvent();
364
365 /// Parse the full output generated by `perf script --show-mmap-events`
366 /// to generate mapping between binary files and their memory mappings for
367 /// all PIDs.
368 std::error_code parseMMapEvents();
369
370 /// Parse output of `perf script --show-task-events`, and forked processes
371 /// to the set of tracked PIDs.
372 std::error_code parseTaskEvents();
373
374 /// Parse a single pair of binary full path and associated build-id
375 std::optional<std::pair<StringRef, StringRef>> parseNameBuildIDPair();
376
377 /// Coordinate reading and parsing of pre-aggregated file
378 ///
379 /// The regular perf2bolt aggregation job is to read perf output directly.
380 /// However, if the data is coming from a database instead of perf, one could
381 /// write a query to produce a pre-aggregated file. This function deals with
382 /// this case.
383 ///
384 /// The pre-aggregated file contains aggregated LBR data, but without binary
385 /// knowledge. BOLT will parse it and, using information from the disassembled
386 /// binary, augment it with fall-through edge frequency information. After
387 /// this step is finished, this data can be either written to disk to be
388 /// consumed by BOLT later, or can be used by BOLT immediately if kept in
389 /// memory.
390 ///
391 /// File format syntax:
392 /// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
393 /// [<mispred_count>]
394 ///
395 /// B - indicates an aggregated branch
396 /// F - an aggregated fall-through
397 /// f - an aggregated fall-through with external origin - used to disambiguate
398 /// between a return hitting a basic block head and a regular internal
399 /// jump to the block
400 ///
401 /// <start_id> - build id of the object containing the start address. We can
402 /// skip it for the main binary and use "X" for an unknown object. This will
403 /// save some space and facilitate human parsing.
404 ///
405 /// <start_offset> - hex offset from the object base load address (0 for the
406 /// main executable unless it's PIE) to the start address.
407 ///
408 /// <end_id>, <end_offset> - same for the end address.
409 ///
410 /// <count> - total aggregated count of the branch or a fall-through.
411 ///
412 /// <mispred_count> - the number of times the branch was mispredicted.
413 /// Omitted for fall-throughs.
414 ///
415 /// Example:
416 /// F 41be50 41be50 3
417 /// F 41be90 41be90 4
418 /// B 4b1942 39b57f0 3 0
419 /// B 4b196f 4b19e0 2 0
420 void parsePreAggregated();
421
422 /// Parse the full output of pre-aggregated LBR samples generated by
423 /// an external tool.
424 std::error_code parsePreAggregatedLBRSamples();
425
426 /// Process parsed pre-aggregated data.
427 void processPreAggregated();
428
429 /// If \p Address falls into the binary address space based on memory
430 /// mapping info \p MMI, then adjust it for further processing by subtracting
431 /// the base load address. External addresses, i.e. addresses that do not
432 /// correspond to the binary allocated address space, are adjusted to avoid
433 /// conflicts.
434 void adjustAddress(uint64_t &Address, const MMapInfo &MMI) const {
435 if (Address >= MMI.MMapAddress && Address < MMI.MMapAddress + MMI.Size) {
436 Address -= MMI.BaseAddress;
437 } else if (Address < MMI.Size) {
438 // Make sure the address is not treated as belonging to the binary.
439 Address = (-1ULL);
440 }
441 }
442
443 /// Adjust addresses in \p LBR entry.
444 void adjustLBR(LBREntry &LBR, const MMapInfo &MMI) const {
445 adjustAddress(Address&: LBR.From, MMI);
446 adjustAddress(Address&: LBR.To, MMI);
447 }
448
449 /// Ignore kernel/user transition LBR if requested
450 bool ignoreKernelInterrupt(LBREntry &LBR) const;
451
452 /// Populate functions in \p BC with profile.
453 void processProfile(BinaryContext &BC);
454
455 /// Start an aggregation job asynchronously.
456 void start();
457
458 /// Returns true if this aggregation job is using a translation table to
459 /// remap samples collected on binaries already processed by BOLT.
460 bool usesBAT() const { return BAT; }
461
462 /// Force all subprocesses to stop and cancel aggregation
463 void abort();
464
465 /// Dump data structures into a file readable by llvm-bolt
466 std::error_code writeAggregatedFile(StringRef OutputFilename) const;
467
468 /// Dump translated data structures into YAML
469 std::error_code writeBATYAML(BinaryContext &BC,
470 StringRef OutputFilename) const;
471
472 /// Filter out binaries based on PID
473 void filterBinaryMMapInfo();
474
475 /// If we have a build-id available for the input file, use it to assist
476 /// matching profile to a binary.
477 ///
478 /// If the binary name changed after profile collection, use build-id
479 /// to get the proper name in perf data when build-ids are available.
480 /// If \p FileBuildID has no match, then issue an error and exit.
481 void processFileBuildID(StringRef FileBuildID);
482
483 /// Debugging dump methods
484 void dump() const;
485 void dump(const LBREntry &LBR) const;
486 void dump(const PerfBranchSample &Sample) const;
487 void dump(const PerfMemSample &Sample) const;
488
489public:
490 /// If perf.data was collected without build ids, the buildid-list may contain
491 /// incomplete entries. Return true if the buffer containing
492 /// "perf buildid-list" output has only valid entries and is non- empty.
493 /// Return false otherwise.
494 bool hasAllBuildIDs();
495
496 /// Parse the output generated by "perf buildid-list" to extract build-ids
497 /// and return a file name matching a given \p FileBuildID.
498 std::optional<StringRef> getFileNameForBuildID(StringRef FileBuildID);
499};
500} // namespace bolt
501} // namespace llvm
502
503#endif
504

// source code of bolt/include/bolt/Profile/DataAggregator.h