1 | //===- GsymCreator.h --------------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_DEBUGINFO_GSYM_GSYMCREATOR_H |
10 | #define LLVM_DEBUGINFO_GSYM_GSYMCREATOR_H |
11 | |
12 | #include <functional> |
13 | #include <memory> |
14 | #include <mutex> |
15 | #include <thread> |
16 | |
17 | #include "llvm/ADT/AddressRanges.h" |
18 | #include "llvm/ADT/ArrayRef.h" |
19 | #include "llvm/ADT/StringSet.h" |
20 | #include "llvm/DebugInfo/GSYM/FileEntry.h" |
21 | #include "llvm/DebugInfo/GSYM/FunctionInfo.h" |
22 | #include "llvm/MC/StringTableBuilder.h" |
23 | #include "llvm/Support/Endian.h" |
24 | #include "llvm/Support/Error.h" |
25 | #include "llvm/Support/Path.h" |
26 | |
27 | namespace llvm { |
28 | |
29 | namespace gsym { |
30 | class FileWriter; |
31 | class OutputAggregator; |
32 | |
33 | /// GsymCreator is used to emit GSYM data to a stand alone file or section |
34 | /// within a file. |
35 | /// |
36 | /// The GsymCreator is designed to be used in 3 stages: |
37 | /// - Create FunctionInfo objects and add them |
38 | /// - Finalize the GsymCreator object |
39 | /// - Save to file or section |
40 | /// |
41 | /// The first stage involves creating FunctionInfo objects from another source |
42 | /// of information like compiler debug info metadata, DWARF or Breakpad files. |
43 | /// Any strings in the FunctionInfo or contained information, like InlineInfo |
44 | /// or LineTable objects, should get the string table offsets by calling |
45 | /// GsymCreator::insertString(...). Any file indexes that are needed should be |
46 | /// obtained by calling GsymCreator::insertFile(...). All of the function calls |
47 | /// in GsymCreator are thread safe. This allows multiple threads to create and |
48 | /// add FunctionInfo objects while parsing debug information. |
49 | /// |
50 | /// Once all of the FunctionInfo objects have been added, the |
51 | /// GsymCreator::finalize(...) must be called prior to saving. This function |
52 | /// will sort the FunctionInfo objects, finalize the string table, and do any |
53 | /// other passes on the information needed to prepare the information to be |
54 | /// saved. |
55 | /// |
56 | /// Once the object has been finalized, it can be saved to a file or section. |
57 | /// |
58 | /// ENCODING |
59 | /// |
60 | /// GSYM files are designed to be memory mapped into a process as shared, read |
61 | /// only data, and used as is. |
62 | /// |
63 | /// The GSYM file format when in a stand alone file consists of: |
64 | /// - Header |
65 | /// - Address Table |
66 | /// - Function Info Offsets |
67 | /// - File Table |
68 | /// - String Table |
69 | /// - Function Info Data |
70 | /// |
71 | /// HEADER |
72 | /// |
73 | /// The header is fully described in "llvm/DebugInfo/GSYM/Header.h". |
74 | /// |
75 | /// ADDRESS TABLE |
76 | /// |
77 | /// The address table immediately follows the header in the file and consists |
78 | /// of Header.NumAddresses address offsets. These offsets are sorted and can be |
79 | /// binary searched for efficient lookups. Addresses in the address table are |
80 | /// stored as offsets from a 64 bit base address found in Header.BaseAddress. |
/// This allows the address table to contain 8, 16, or 32 bit offsets, so the
/// address table does not require full 64 bit addresses for each address.
83 | /// The resulting GSYM size is smaller and causes fewer pages to be touched |
84 | /// during address lookups when the address table is smaller. The size of the |
85 | /// address offsets in the address table is specified in the header in |
86 | /// Header.AddrOffSize. The first offset in the address table is aligned to |
87 | /// Header.AddrOffSize alignment to ensure efficient access when loaded into |
88 | /// memory. |
89 | /// |
90 | /// FUNCTION INFO OFFSETS TABLE |
91 | /// |
92 | /// The function info offsets table immediately follows the address table and |
93 | /// consists of Header.NumAddresses 32 bit file offsets: one for each address |
94 | /// in the address table. This data is aligned to a 4 byte boundary. The |
95 | /// offsets in this table are the relative offsets from the start offset of the |
96 | /// GSYM header and point to the function info data for each address in the |
97 | /// address table. Keeping this data separate from the address table helps to |
98 | /// reduce the number of pages that are touched when address lookups occur on a |
99 | /// GSYM file. |
100 | /// |
101 | /// FILE TABLE |
102 | /// |
103 | /// The file table immediately follows the function info offsets table. The |
104 | /// encoding of the FileTable is: |
105 | /// |
106 | /// struct FileTable { |
107 | /// uint32_t Count; |
108 | /// FileEntry Files[]; |
109 | /// }; |
110 | /// |
111 | /// The file table starts with a 32 bit count of the number of files that are |
112 | /// used in all of the function info, followed by that number of FileEntry |
/// structures. The file table is aligned to a 4 byte boundary. Each file in
114 | /// the file table is represented with a FileEntry structure. |
115 | /// See "llvm/DebugInfo/GSYM/FileEntry.h" for details. |
116 | /// |
117 | /// STRING TABLE |
118 | /// |
119 | /// The string table follows the file table in stand alone GSYM files and |
120 | /// contains all strings for everything contained in the GSYM file. Any string |
121 | /// data should be added to the string table and any references to strings |
122 | /// inside GSYM information must be stored as 32 bit string table offsets into |
123 | /// this string table. The string table always starts with an empty string at |
124 | /// offset zero and is followed by any strings needed by the GSYM information. |
125 | /// The start of the string table is not aligned to any boundary. |
126 | /// |
127 | /// FUNCTION INFO DATA |
128 | /// |
129 | /// The function info data is the payload that contains information about the |
130 | /// address that is being looked up. It contains all of the encoded |
131 | /// FunctionInfo objects. Each encoded FunctionInfo's data is pointed to by an |
132 | /// entry in the Function Info Offsets Table. For details on the exact encoding |
133 | /// of FunctionInfo objects, see "llvm/DebugInfo/GSYM/FunctionInfo.h". |
134 | class GsymCreator { |
135 | // Private member variables require Mutex protections |
136 | mutable std::mutex Mutex; |
137 | std::vector<FunctionInfo> Funcs; |
138 | StringTableBuilder StrTab; |
139 | StringSet<> StringStorage; |
140 | DenseMap<llvm::gsym::FileEntry, uint32_t> FileEntryToIndex; |
141 | // Needed for mapping string offsets back to the string stored in \a StrTab. |
142 | DenseMap<uint64_t, CachedHashStringRef> StringOffsetMap; |
143 | std::vector<llvm::gsym::FileEntry> Files; |
144 | std::vector<uint8_t> UUID; |
145 | std::optional<AddressRanges> ; |
146 | std::optional<uint64_t> BaseAddress; |
147 | bool IsSegment = false; |
148 | bool Finalized = false; |
149 | bool Quiet; |
150 | |
151 | |
152 | /// Get the first function start address. |
153 | /// |
154 | /// \returns The start address of the first FunctionInfo or std::nullopt if |
155 | /// there are no function infos. |
156 | std::optional<uint64_t> getFirstFunctionAddress() const; |
157 | |
158 | /// Get the last function address. |
159 | /// |
160 | /// \returns The start address of the last FunctionInfo or std::nullopt if |
161 | /// there are no function infos. |
162 | std::optional<uint64_t> getLastFunctionAddress() const; |
163 | |
164 | /// Get the base address to use for this GSYM file. |
165 | /// |
166 | /// \returns The base address to put into the header and to use when creating |
167 | /// the address offset table or std::nullpt if there are no valid |
168 | /// function infos or if the base address wasn't specified. |
169 | std::optional<uint64_t> getBaseAddress() const; |
170 | |
171 | /// Get the size of an address offset in the address offset table. |
172 | /// |
173 | /// GSYM files store offsets from the base address in the address offset table |
174 | /// and we store the size of the address offsets in the GSYM header. This |
175 | /// function will calculate the size in bytes of these address offsets based |
176 | /// on the current contents of the GSYM file. |
177 | /// |
178 | /// \returns The size in byets of the address offsets. |
179 | uint8_t getAddressOffsetSize() const; |
180 | |
181 | /// Get the maximum address offset for the current address offset size. |
182 | /// |
183 | /// This is used when creating the address offset table to ensure we have |
184 | /// values that are in range so we don't end up truncating address offsets |
185 | /// when creating GSYM files as the code evolves. |
186 | /// |
187 | /// \returns The maximum address offset value that will be encoded into a GSYM |
188 | /// file. |
189 | uint64_t getMaxAddressOffset() const; |
190 | |
191 | /// Calculate the byte size of the GSYM header and tables sizes. |
192 | /// |
193 | /// This function will calculate the exact size in bytes of the encocded GSYM |
194 | /// for the following items: |
195 | /// - The GSYM header |
196 | /// - The Address offset table |
197 | /// - The Address info offset table |
198 | /// - The file table |
199 | /// - The string table |
200 | /// |
201 | /// This is used to help split GSYM files into segments. |
202 | /// |
203 | /// \returns Size in bytes the GSYM header and tables. |
204 | uint64_t calculateHeaderAndTableSize() const; |
205 | |
206 | /// Copy a FunctionInfo from the \a SrcGC GSYM creator into this creator. |
207 | /// |
208 | /// Copy the function info and only the needed files and strings and add a |
209 | /// converted FunctionInfo into this object. This is used to segment GSYM |
210 | /// files into separate files while only transferring the files and strings |
211 | /// that are needed from \a SrcGC. |
212 | /// |
213 | /// \param SrcGC The source gsym creator to copy from. |
214 | /// \param FuncInfoIdx The function info index within \a SrcGC to copy. |
215 | /// \returns The number of bytes it will take to encode the function info in |
216 | /// this GsymCreator. This helps calculate the size of the current GSYM |
217 | /// segment file. |
218 | uint64_t copyFunctionInfo(const GsymCreator &SrcGC, size_t FuncInfoIdx); |
219 | |
220 | /// Copy a string from \a SrcGC into this object. |
221 | /// |
222 | /// Copy a string from \a SrcGC by string table offset into this GSYM creator. |
223 | /// If a string has already been copied, the uniqued string table offset will |
224 | /// be returned, otherwise the string will be copied and a unique offset will |
225 | /// be returned. |
226 | /// |
227 | /// \param SrcGC The source gsym creator to copy from. |
228 | /// \param StrOff The string table offset from \a SrcGC to copy. |
229 | /// \returns The new string table offset of the string within this object. |
230 | uint32_t copyString(const GsymCreator &SrcGC, uint32_t StrOff); |
231 | |
232 | /// Copy a file from \a SrcGC into this object. |
233 | /// |
234 | /// Copy a file from \a SrcGC by file index into this GSYM creator. Files |
235 | /// consist of two string table entries, one for the directory and one for the |
236 | /// filename, this function will copy any needed strings ensure the file is |
237 | /// uniqued within this object. If a file already exists in this GSYM creator |
238 | /// the uniqued index will be returned, else the stirngs will be copied and |
239 | /// the new file index will be returned. |
240 | /// |
241 | /// \param SrcGC The source gsym creator to copy from. |
242 | /// \param FileIdx The 1 based file table index within \a SrcGC to copy. A |
243 | /// file index of zero will always return zero as the zero is a reserved file |
244 | /// index that means no file. |
245 | /// \returns The new file index of the file within this object. |
246 | uint32_t copyFile(const GsymCreator &SrcGC, uint32_t FileIdx); |
247 | |
248 | /// Inserts a FileEntry into the file table. |
249 | /// |
250 | /// This is used to insert a file entry in a thread safe way into this object. |
251 | /// |
252 | /// \param FE A file entry object that contains valid string table offsets |
253 | /// from this object already. |
254 | uint32_t insertFileEntry(FileEntry FE); |
255 | |
256 | /// Fixup any string and file references by updating any file indexes and |
257 | /// strings offsets in the InlineInfo parameter. |
258 | /// |
259 | /// When copying InlineInfo entries, we can simply make a copy of the object |
260 | /// and then fixup the files and strings for efficiency. |
261 | /// |
262 | /// \param SrcGC The source gsym creator to copy from. |
263 | /// \param II The inline info that contains file indexes and string offsets |
264 | /// that come from \a SrcGC. The entries will be updated by coping any files |
265 | /// and strings over into this object. |
266 | void fixupInlineInfo(const GsymCreator &SrcGC, InlineInfo &II); |
267 | |
268 | /// Save this GSYM file into segments that are roughly \a SegmentSize in size. |
269 | /// |
270 | /// When segemented GSYM files are saved to disk, they will use \a Path as a |
271 | /// prefix and then have the first function info address appended to the path |
272 | /// when each segment is saved. Each segmented GSYM file has a only the |
273 | /// strings and files that are needed to save the function infos that are in |
274 | /// each segment. These smaller files are easy to compress and download |
275 | /// separately and allow for efficient lookups with very large GSYM files and |
276 | /// segmenting them allows servers to download only the segments that are |
277 | /// needed. |
278 | /// |
279 | /// \param Path The path prefix to use when saving the GSYM files. |
280 | /// \param ByteOrder The endianness to use when saving the file. |
281 | /// \param SegmentSize The size in bytes to segment the GSYM file into. |
282 | llvm::Error saveSegments(StringRef Path, llvm::endianness ByteOrder, |
283 | uint64_t SegmentSize) const; |
284 | |
285 | /// Let this creator know that this is a segment of another GsymCreator. |
286 | /// |
287 | /// When we have a segment, we know that function infos will be added in |
288 | /// ascending address range order without having to be finalized. We also |
289 | /// don't need to sort and unique entries during the finalize function call. |
290 | void setIsSegment() { |
291 | IsSegment = true; |
292 | } |
293 | |
294 | public: |
295 | GsymCreator(bool Quiet = false); |
296 | |
297 | /// Save a GSYM file to a stand alone file. |
298 | /// |
299 | /// \param Path The file path to save the GSYM file to. |
300 | /// \param ByteOrder The endianness to use when saving the file. |
301 | /// \param SegmentSize The size in bytes to segment the GSYM file into. If |
302 | /// this option is set this function will create N segments |
303 | /// that are all around \a SegmentSize bytes in size. This |
304 | /// allows a very large GSYM file to be broken up into |
305 | /// shards. Each GSYM file will have its own file table, |
306 | /// and string table that only have the files and strings |
307 | /// needed for the shared. If this argument has no value, |
308 | /// a single GSYM file that contains all function |
309 | /// information will be created. |
310 | /// \returns An error object that indicates success or failure of the save. |
311 | llvm::Error save(StringRef Path, llvm::endianness ByteOrder, |
312 | std::optional<uint64_t> SegmentSize = std::nullopt) const; |
313 | |
314 | /// Encode a GSYM into the file writer stream at the current position. |
315 | /// |
316 | /// \param O The stream to save the binary data to |
317 | /// \returns An error object that indicates success or failure of the save. |
318 | llvm::Error encode(FileWriter &O) const; |
319 | |
320 | /// Insert a string into the GSYM string table. |
321 | /// |
322 | /// All strings used by GSYM files must be uniqued by adding them to this |
323 | /// string pool and using the returned offset for any string values. |
324 | /// |
325 | /// \param S The string to insert into the string table. |
326 | /// \param Copy If true, then make a backing copy of the string. If false, |
327 | /// the string is owned by another object that will stay around |
328 | /// long enough for the GsymCreator to save the GSYM file. |
329 | /// \returns The unique 32 bit offset into the string table. |
330 | uint32_t insertString(StringRef S, bool Copy = true); |
331 | |
332 | /// Insert a file into this GSYM creator. |
333 | /// |
334 | /// Inserts a file by adding a FileEntry into the "Files" member variable if |
335 | /// the file has not already been added. The file path is split into |
336 | /// directory and filename which are both added to the string table. This |
337 | /// allows paths to be stored efficiently by reusing the directories that are |
338 | /// common between multiple files. |
339 | /// |
340 | /// \param Path The path to the file to insert. |
341 | /// \param Style The path style for the "Path" parameter. |
342 | /// \returns The unique file index for the inserted file. |
343 | uint32_t insertFile(StringRef Path, |
344 | sys::path::Style Style = sys::path::Style::native); |
345 | |
346 | /// Add a function info to this GSYM creator. |
347 | /// |
348 | /// All information in the FunctionInfo object must use the |
349 | /// GsymCreator::insertString(...) function when creating string table |
350 | /// offsets for names and other strings. |
351 | /// |
352 | /// \param FI The function info object to emplace into our functions list. |
353 | void addFunctionInfo(FunctionInfo &&FI); |
354 | |
355 | /// Finalize the data in the GSYM creator prior to saving the data out. |
356 | /// |
357 | /// Finalize must be called after all FunctionInfo objects have been added |
358 | /// and before GsymCreator::save() is called. |
359 | /// |
360 | /// \param OS Output stream to report duplicate function infos, overlapping |
361 | /// function infos, and function infos that were merged or removed. |
362 | /// \returns An error object that indicates success or failure of the |
363 | /// finalize. |
364 | llvm::Error finalize(OutputAggregator &OS); |
365 | |
366 | /// Set the UUID value. |
367 | /// |
368 | /// \param UUIDBytes The new UUID bytes. |
369 | void setUUID(llvm::ArrayRef<uint8_t> UUIDBytes) { |
370 | UUID.assign(first: UUIDBytes.begin(), last: UUIDBytes.end()); |
371 | } |
372 | |
373 | /// Thread safe iteration over all function infos. |
374 | /// |
375 | /// \param Callback A callback function that will get called with each |
376 | /// FunctionInfo. If the callback returns false, stop iterating. |
377 | void forEachFunctionInfo( |
378 | std::function<bool(FunctionInfo &)> const &Callback); |
379 | |
380 | /// Thread safe const iteration over all function infos. |
381 | /// |
382 | /// \param Callback A callback function that will get called with each |
383 | /// FunctionInfo. If the callback returns false, stop iterating. |
384 | void forEachFunctionInfo( |
385 | std::function<bool(const FunctionInfo &)> const &Callback) const; |
386 | |
387 | /// Get the current number of FunctionInfo objects contained in this |
388 | /// object. |
389 | size_t getNumFunctionInfos() const; |
390 | |
391 | /// Set valid .text address ranges that all functions must be contained in. |
392 | void (AddressRanges &) { |
393 | ValidTextRanges = TextRanges; |
394 | } |
395 | |
396 | /// Get the valid text ranges. |
397 | const std::optional<AddressRanges> () const { |
398 | return ValidTextRanges; |
399 | } |
400 | |
401 | /// Check if an address is a valid code address. |
402 | /// |
403 | /// Any functions whose addresses do not exist within these function bounds |
404 | /// will not be converted into the final GSYM. This allows the object file |
405 | /// to figure out the valid file address ranges of all the code sections |
406 | /// and ensure we don't add invalid functions to the final output. Many |
407 | /// linkers have issues when dead stripping functions from DWARF debug info |
408 | /// where they set the DW_AT_low_pc to zero, but newer DWARF has the |
409 | /// DW_AT_high_pc as an offset from the DW_AT_low_pc and these size |
410 | /// attributes have no relocations that can be applied. This results in DWARF |
411 | /// where many functions have an DW_AT_low_pc of zero and a valid offset size |
412 | /// for DW_AT_high_pc. If we extract all valid ranges from an object file |
413 | /// that are marked with executable permissions, we can properly ensure that |
414 | /// these functions are removed. |
415 | /// |
416 | /// \param Addr An address to check. |
417 | /// |
418 | /// \returns True if the address is in the valid text ranges or if no valid |
419 | /// text ranges have been set, false otherwise. |
420 | bool IsValidTextAddress(uint64_t Addr) const; |
421 | |
422 | /// Set the base address to use for the GSYM file. |
423 | /// |
424 | /// Setting the base address to use for the GSYM file. Object files typically |
425 | /// get loaded from a base address when the OS loads them into memory. Using |
426 | /// GSYM files for symbolication becomes easier if the base address in the |
427 | /// GSYM header is the same address as it allows addresses to be easily slid |
428 | /// and allows symbolication without needing to find the original base |
429 | /// address in the original object file. |
430 | /// |
431 | /// \param Addr The address to use as the base address of the GSYM file |
432 | /// when it is saved to disk. |
433 | void setBaseAddress(uint64_t Addr) { |
434 | BaseAddress = Addr; |
435 | } |
436 | |
437 | /// Whether the transformation should be quiet, i.e. not output warnings. |
438 | bool isQuiet() const { return Quiet; } |
439 | |
440 | |
441 | /// Create a segmented GSYM creator starting with function info index |
442 | /// \a FuncIdx. |
443 | /// |
444 | /// This function will create a GsymCreator object that will encode into |
445 | /// roughly \a SegmentSize bytes and return it. It is used by the private |
446 | /// saveSegments(...) function and also is used by the GSYM unit tests to test |
447 | /// segmenting of GSYM files. The returned GsymCreator can be finalized and |
448 | /// encoded. |
449 | /// |
450 | /// \param [in] SegmentSize The size in bytes to roughly segment the GSYM file |
451 | /// into. |
452 | /// \param [in,out] FuncIdx The index of the first function info to encode |
453 | /// into the returned GsymCreator. This index will be updated so it can be |
454 | /// used in subsequent calls to this function to allow more segments to be |
455 | /// created. |
456 | /// \returns An expected unique pointer to a GsymCreator or an error. The |
457 | /// returned unique pointer can be NULL if there are no more functions to |
458 | /// encode. |
459 | llvm::Expected<std::unique_ptr<GsymCreator>> |
460 | createSegment(uint64_t SegmentSize, size_t &FuncIdx) const; |
461 | }; |
462 | |
463 | } // namespace gsym |
464 | } // namespace llvm |
465 | |
466 | #endif // LLVM_DEBUGINFO_GSYM_GSYMCREATOR_H |
467 | |