1 | //===-- llvm/MC/MCSchedule.h - Scheduling -----------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines the classes used to describe a subtarget's machine model |
10 | // for scheduling and other instruction cost heuristics. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef LLVM_MC_MCSCHEDULE_H |
15 | #define LLVM_MC_MCSCHEDULE_H |
16 | |
17 | #include "llvm/Config/llvm-config.h" |
18 | #include "llvm/Support/DataTypes.h" |
19 | #include <cassert> |
20 | |
21 | namespace llvm { |
22 | |
23 | template <typename T> class ArrayRef; |
24 | struct InstrItinerary; |
25 | class MCSubtargetInfo; |
26 | class MCInstrInfo; |
27 | class MCInst; |
28 | class InstrItineraryData; |
29 | |
30 | /// Define a kind of processor resource that will be modeled by the scheduler. |
31 | struct MCProcResourceDesc { |
32 | const char *Name; |
33 | unsigned NumUnits; // Number of resource of this kind |
34 | unsigned SuperIdx; // Index of the resources kind that contains this kind. |
35 | |
36 | // Number of resources that may be buffered. |
37 | // |
38 | // Buffered resources (BufferSize != 0) may be consumed at some indeterminate |
39 | // cycle after dispatch. This should be used for out-of-order cpus when |
40 | // instructions that use this resource can be buffered in a reservaton |
41 | // station. |
42 | // |
43 | // Unbuffered resources (BufferSize == 0) always consume their resource some |
44 | // fixed number of cycles after dispatch. If a resource is unbuffered, then |
45 | // the scheduler will avoid scheduling instructions with conflicting resources |
46 | // in the same cycle. This is for in-order cpus, or the in-order portion of |
47 | // an out-of-order cpus. |
48 | int BufferSize; |
49 | |
50 | // If the resource has sub-units, a pointer to the first element of an array |
51 | // of `NumUnits` elements containing the ProcResourceIdx of the sub units. |
52 | // nullptr if the resource does not have sub-units. |
53 | const unsigned *SubUnitsIdxBegin; |
54 | |
55 | bool operator==(const MCProcResourceDesc &Other) const { |
56 | return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx |
57 | && BufferSize == Other.BufferSize; |
58 | } |
59 | }; |
60 | |
61 | /// Identify one of the processor resource kinds consumed by a |
62 | /// particular scheduling class for the specified number of cycles. |
63 | struct MCWriteProcResEntry { |
64 | uint16_t ProcResourceIdx; |
65 | /// Cycle at which the resource will be released by an instruction, |
66 | /// relatively to the cycle in which the instruction is issued |
67 | /// (assuming no stalls inbetween). |
68 | uint16_t ReleaseAtCycle; |
69 | /// Cycle at which the resource will be aquired by an instruction, |
70 | /// relatively to the cycle in which the instruction is issued |
71 | /// (assuming no stalls inbetween). |
72 | uint16_t AcquireAtCycle; |
73 | |
74 | bool operator==(const MCWriteProcResEntry &Other) const { |
75 | return ProcResourceIdx == Other.ProcResourceIdx && |
76 | ReleaseAtCycle == Other.ReleaseAtCycle && |
77 | AcquireAtCycle == Other.AcquireAtCycle; |
78 | } |
79 | }; |
80 | |
81 | /// Specify the latency in cpu cycles for a particular scheduling class and def |
82 | /// index. -1 indicates an invalid latency. Heuristics would typically consider |
83 | /// an instruction with invalid latency to have infinite latency. Also identify |
84 | /// the WriteResources of this def. When the operand expands to a sequence of |
85 | /// writes, this ID is the last write in the sequence. |
86 | struct MCWriteLatencyEntry { |
87 | int16_t Cycles; |
88 | uint16_t WriteResourceID; |
89 | |
90 | bool operator==(const MCWriteLatencyEntry &Other) const { |
91 | return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID; |
92 | } |
93 | }; |
94 | |
95 | /// Specify the number of cycles allowed after instruction issue before a |
96 | /// particular use operand reads its registers. This effectively reduces the |
97 | /// write's latency. Here we allow negative cycles for corner cases where |
98 | /// latency increases. This rule only applies when the entry's WriteResource |
99 | /// matches the write's WriteResource. |
100 | /// |
101 | /// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by |
102 | /// WriteResourceIdx. |
103 | struct MCReadAdvanceEntry { |
104 | unsigned UseIdx; |
105 | unsigned WriteResourceID; |
106 | int Cycles; |
107 | |
108 | bool operator==(const MCReadAdvanceEntry &Other) const { |
109 | return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID |
110 | && Cycles == Other.Cycles; |
111 | } |
112 | }; |
113 | |
114 | /// Summarize the scheduling resources required for an instruction of a |
115 | /// particular scheduling class. |
116 | /// |
117 | /// Defined as an aggregate struct for creating tables with initializer lists. |
118 | struct MCSchedClassDesc { |
119 | static const unsigned short InvalidNumMicroOps = (1U << 13) - 1; |
120 | static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1; |
121 | |
122 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
123 | const char* Name; |
124 | #endif |
125 | uint16_t NumMicroOps : 13; |
126 | uint16_t BeginGroup : 1; |
127 | uint16_t EndGroup : 1; |
128 | uint16_t RetireOOO : 1; |
129 | uint16_t WriteProcResIdx; // First index into WriteProcResTable. |
130 | uint16_t NumWriteProcResEntries; |
131 | uint16_t WriteLatencyIdx; // First index into WriteLatencyTable. |
132 | uint16_t NumWriteLatencyEntries; |
133 | uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable. |
134 | uint16_t NumReadAdvanceEntries; |
135 | |
136 | bool isValid() const { |
137 | return NumMicroOps != InvalidNumMicroOps; |
138 | } |
139 | bool isVariant() const { |
140 | return NumMicroOps == VariantNumMicroOps; |
141 | } |
142 | }; |
143 | |
144 | /// Specify the cost of a register definition in terms of number of physical |
145 | /// register allocated at register renaming stage. For example, AMD Jaguar. |
146 | /// natively supports 128-bit data types, and operations on 256-bit registers |
147 | /// (i.e. YMM registers) are internally split into two COPs (complex operations) |
148 | /// and each COP updates a physical register. Basically, on Jaguar, a YMM |
149 | /// register write effectively consumes two physical registers. That means, |
150 | /// the cost of a YMM write in the BtVer2 model is 2. |
151 | struct MCRegisterCostEntry { |
152 | unsigned RegisterClassID; |
153 | unsigned Cost; |
154 | bool AllowMoveElimination; |
155 | }; |
156 | |
157 | /// A register file descriptor. |
158 | /// |
159 | /// This struct allows to describe processor register files. In particular, it |
160 | /// helps describing the size of the register file, as well as the cost of |
161 | /// allocating a register file at register renaming stage. |
162 | /// FIXME: this struct can be extended to provide information about the number |
163 | /// of read/write ports to the register file. A value of zero for field |
164 | /// 'NumPhysRegs' means: this register file has an unbounded number of physical |
165 | /// registers. |
166 | struct MCRegisterFileDesc { |
167 | const char *Name; |
168 | uint16_t NumPhysRegs; |
169 | uint16_t NumRegisterCostEntries; |
170 | // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable. |
171 | uint16_t RegisterCostEntryIdx; |
172 | // A value of zero means: there is no limit in the number of moves that can be |
173 | // eliminated every cycle. |
174 | uint16_t MaxMovesEliminatedPerCycle; |
175 | // Ture if this register file only knows how to optimize register moves from |
176 | // known zero registers. |
177 | bool AllowZeroMoveEliminationOnly; |
178 | }; |
179 | |
180 | /// Provide extra details about the machine processor. |
181 | /// |
182 | /// This is a collection of "optional" processor information that is not |
183 | /// normally used by the LLVM machine schedulers, but that can be consumed by |
184 | /// external tools like llvm-mca to improve the quality of the peformance |
185 | /// analysis. |
186 | struct { |
187 | // Actual size of the reorder buffer in hardware. |
188 | unsigned ; |
189 | // Number of instructions retired per cycle. |
190 | unsigned ; |
191 | const MCRegisterFileDesc *; |
192 | unsigned ; |
193 | const MCRegisterCostEntry *; |
194 | unsigned ; |
195 | unsigned ; |
196 | unsigned ; |
197 | }; |
198 | |
199 | /// Machine model for scheduling, bundling, and heuristics. |
200 | /// |
201 | /// The machine model directly provides basic information about the |
202 | /// microarchitecture to the scheduler in the form of properties. It also |
203 | /// optionally refers to scheduler resource tables and itinerary |
204 | /// tables. Scheduler resource tables model the latency and cost for each |
205 | /// instruction type. Itinerary tables are an independent mechanism that |
206 | /// provides a detailed reservation table describing each cycle of instruction |
207 | /// execution. Subtargets may define any or all of the above categories of data |
208 | /// depending on the type of CPU and selected scheduler. |
209 | /// |
210 | /// The machine independent properties defined here are used by the scheduler as |
211 | /// an abstract machine model. A real micro-architecture has a number of |
212 | /// buffers, queues, and stages. Declaring that a given machine-independent |
213 | /// abstract property corresponds to a specific physical property across all |
214 | /// subtargets can't be done. Nonetheless, the abstract model is |
215 | /// useful. Futhermore, subtargets typically extend this model with processor |
216 | /// specific resources to model any hardware features that can be exploited by |
217 | /// scheduling heuristics and aren't sufficiently represented in the abstract. |
218 | /// |
219 | /// The abstract pipeline is built around the notion of an "issue point". This |
220 | /// is merely a reference point for counting machine cycles. The physical |
221 | /// machine will have pipeline stages that delay execution. The scheduler does |
222 | /// not model those delays because they are irrelevant as long as they are |
223 | /// consistent. Inaccuracies arise when instructions have different execution |
224 | /// delays relative to each other, in addition to their intrinsic latency. Those |
225 | /// special cases can be handled by TableGen constructs such as, ReadAdvance, |
226 | /// which reduces latency when reading data, and ReleaseAtCycles, which consumes |
227 | /// a processor resource when writing data for a number of abstract |
228 | /// cycles. |
229 | /// |
230 | /// TODO: One tool currently missing is the ability to add a delay to |
231 | /// ReleaseAtCycles. That would be easy to add and would likely cover all cases |
232 | /// currently handled by the legacy itinerary tables. |
233 | /// |
234 | /// A note on out-of-order execution and, more generally, instruction |
235 | /// buffers. Part of the CPU pipeline is always in-order. The issue point, which |
236 | /// is the point of reference for counting cycles, only makes sense as an |
237 | /// in-order part of the pipeline. Other parts of the pipeline are sometimes |
238 | /// falling behind and sometimes catching up. It's only interesting to model |
239 | /// those other, decoupled parts of the pipeline if they may be predictably |
240 | /// resource constrained in a way that the scheduler can exploit. |
241 | /// |
242 | /// The LLVM machine model distinguishes between in-order constraints and |
243 | /// out-of-order constraints so that the target's scheduling strategy can apply |
244 | /// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order |
245 | /// resources would not typically be treated as a hard scheduling |
246 | /// constraint. For example, in the GenericScheduler, a delay caused by limited |
247 | /// out-of-order resources is not directly reflected in the number of cycles |
248 | /// that the scheduler sees between issuing an instruction and its dependent |
249 | /// instructions. In other words, out-of-order resources don't directly increase |
250 | /// the latency between pairs of instructions. However, they can still be used |
251 | /// to detect potential bottlenecks across a sequence of instructions and bias |
252 | /// the scheduling heuristics appropriately. |
253 | struct MCSchedModel { |
254 | // IssueWidth is the maximum number of instructions that may be scheduled in |
255 | // the same per-cycle group. This is meant to be a hard in-order constraint |
256 | // (a.k.a. "hazard"). In the GenericScheduler strategy, no more than |
257 | // IssueWidth micro-ops can ever be scheduled in a particular cycle. |
258 | // |
259 | // In practice, IssueWidth is useful to model any bottleneck between the |
260 | // decoder (after micro-op expansion) and the out-of-order reservation |
261 | // stations or the decoder bandwidth itself. If the total number of |
262 | // reservation stations is also a bottleneck, or if any other pipeline stage |
263 | // has a bandwidth limitation, then that can be naturally modeled by adding an |
264 | // out-of-order processor resource. |
265 | unsigned IssueWidth; |
266 | static const unsigned DefaultIssueWidth = 1; |
267 | |
268 | // MicroOpBufferSize is the number of micro-ops that the processor may buffer |
269 | // for out-of-order execution. |
270 | // |
271 | // "0" means operations that are not ready in this cycle are not considered |
272 | // for scheduling (they go in the pending queue). Latency is paramount. This |
273 | // may be more efficient if many instructions are pending in a schedule. |
274 | // |
275 | // "1" means all instructions are considered for scheduling regardless of |
276 | // whether they are ready in this cycle. Latency still causes issue stalls, |
277 | // but we balance those stalls against other heuristics. |
278 | // |
279 | // "> 1" means the processor is out-of-order. This is a machine independent |
280 | // estimate of highly machine specific characteristics such as the register |
281 | // renaming pool and reorder buffer. |
282 | unsigned MicroOpBufferSize; |
283 | static const unsigned DefaultMicroOpBufferSize = 0; |
284 | |
285 | // LoopMicroOpBufferSize is the number of micro-ops that the processor may |
286 | // buffer for optimized loop execution. More generally, this represents the |
287 | // optimal number of micro-ops in a loop body. A loop may be partially |
288 | // unrolled to bring the count of micro-ops in the loop body closer to this |
289 | // number. |
290 | unsigned LoopMicroOpBufferSize; |
291 | static const unsigned DefaultLoopMicroOpBufferSize = 0; |
292 | |
293 | // LoadLatency is the expected latency of load instructions. |
294 | unsigned LoadLatency; |
295 | static const unsigned DefaultLoadLatency = 4; |
296 | |
297 | // HighLatency is the expected latency of "very high latency" operations. |
298 | // See TargetInstrInfo::isHighLatencyDef(). |
299 | // By default, this is set to an arbitrarily high number of cycles |
300 | // likely to have some impact on scheduling heuristics. |
301 | unsigned HighLatency; |
302 | static const unsigned DefaultHighLatency = 10; |
303 | |
304 | // MispredictPenalty is the typical number of extra cycles the processor |
305 | // takes to recover from a branch misprediction. |
306 | unsigned MispredictPenalty; |
307 | static const unsigned DefaultMispredictPenalty = 10; |
308 | |
309 | bool PostRAScheduler; // default value is false |
310 | |
311 | bool CompleteModel; |
312 | |
313 | // Tells the MachineScheduler whether or not to track resource usage |
314 | // using intervals via ResourceSegments (see |
315 | // llvm/include/llvm/CodeGen/MachineScheduler.h). |
316 | bool EnableIntervals; |
317 | |
318 | unsigned ProcID; |
319 | const MCProcResourceDesc *ProcResourceTable; |
320 | const MCSchedClassDesc *SchedClassTable; |
321 | unsigned NumProcResourceKinds; |
322 | unsigned NumSchedClasses; |
323 | // Instruction itinerary tables used by InstrItineraryData. |
324 | friend class InstrItineraryData; |
325 | const InstrItinerary *InstrItineraries; |
326 | |
327 | const MCExtraProcessorInfo *; |
328 | |
329 | bool () const { return ExtraProcessorInfo; } |
330 | |
331 | unsigned getProcessorID() const { return ProcID; } |
332 | |
333 | /// Does this machine model include instruction-level scheduling. |
334 | bool hasInstrSchedModel() const { return SchedClassTable; } |
335 | |
336 | const MCExtraProcessorInfo &() const { |
337 | assert(hasExtraProcessorInfo() && |
338 | "No extra information available for this model" ); |
339 | return *ExtraProcessorInfo; |
340 | } |
341 | |
342 | /// Return true if this machine model data for all instructions with a |
343 | /// scheduling class (itinerary class or SchedRW list). |
344 | bool isComplete() const { return CompleteModel; } |
345 | |
346 | /// Return true if machine supports out of order execution. |
347 | bool isOutOfOrder() const { return MicroOpBufferSize > 1; } |
348 | |
349 | unsigned getNumProcResourceKinds() const { |
350 | return NumProcResourceKinds; |
351 | } |
352 | |
353 | const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const { |
354 | assert(hasInstrSchedModel() && "No scheduling machine model" ); |
355 | |
356 | assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx" ); |
357 | return &ProcResourceTable[ProcResourceIdx]; |
358 | } |
359 | |
360 | const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const { |
361 | assert(hasInstrSchedModel() && "No scheduling machine model" ); |
362 | |
363 | assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx" ); |
364 | return &SchedClassTable[SchedClassIdx]; |
365 | } |
366 | |
367 | /// Returns the latency value for the scheduling class. |
368 | static int computeInstrLatency(const MCSubtargetInfo &STI, |
369 | const MCSchedClassDesc &SCDesc); |
370 | |
371 | int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const; |
372 | int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, |
373 | const MCInst &Inst) const; |
374 | |
375 | // Returns the reciprocal throughput information from a MCSchedClassDesc. |
376 | static double |
377 | getReciprocalThroughput(const MCSubtargetInfo &STI, |
378 | const MCSchedClassDesc &SCDesc); |
379 | |
380 | static double |
381 | getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID); |
382 | |
383 | double |
384 | getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, |
385 | const MCInst &Inst) const; |
386 | |
387 | /// Returns the maximum forwarding delay for register reads dependent on |
388 | /// writes of scheduling class WriteResourceIdx. |
389 | static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries, |
390 | unsigned WriteResourceIdx = 0); |
391 | |
392 | /// Returns the default initialized model. |
393 | static const MCSchedModel Default; |
394 | }; |
395 | |
396 | } // namespace llvm |
397 | |
398 | #endif |
399 | |