//===------------------------- LSUnit.h ------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A MemoryGroup identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
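///
/// A minimal sketch of the dispatch-time mapping described above (`LSU` and
/// the setter for field Instruction::LSUTokenID are assumed for illustration):
/// \code
///   unsigned GroupID = LSU.dispatch(IR);         // Map IR to a MemoryGroup.
///   IR.getInstruction()->setLSUTokenID(GroupID); // Remember the token.
/// \endcode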
class MemoryGroup {
  unsigned NumPredecessors = 0;
  unsigned NumExecutingPredecessors = 0;
  unsigned NumExecutedPredecessors = 0;

  unsigned NumInstructions = 0;
  unsigned NumExecuting = 0;
  unsigned NumExecuted = 0;
  // Successors that are in an order dependency with this group.
  SmallVector<MemoryGroup *, 4> OrderSucc;
  // Successors that are in a data dependency with this group.
  SmallVector<MemoryGroup *, 4> DataSucc;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup() = default;
  MemoryGroup(MemoryGroup &&) = default;

  size_t getNumSuccessors() const {
    return OrderSucc.size() + DataSucc.size();
  }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
    // There is no need to add an order dependency if all instructions from
    // this group have already been issued.
    if (!IsDataDependent && isExecuting())
      return;

    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

    if (IsDataDependent)
      DataSucc.emplace_back(Group);
    else
      OrderSucc.emplace_back(Group);
  }

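  // A MemoryGroup conceptually moves through the following states, which the
  // predicates below query:
  //   Waiting   -> at least one predecessor group has not issued yet.
  //   Pending   -> every predecessor has at least issued; some still execute.
  //   Ready     -> every predecessor has fully executed.
  //   Executing -> all not-yet-executed instructions of the group are issued.
  //   Executed  -> every instruction of this group has finished executing.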
  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    if (!ShouldUpdateCriticalDep)
      return;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory instruction for this group.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : OrderSucc) {
      MG->onGroupIssued(CriticalMemoryInstruction, false);
      // Release the order dependency with this group.
      MG->onGroupExecuted();
    }

    for (MemoryGroup *MG : DataSucc)
      MG->onGroupIssued(CriticalMemoryInstruction, true);
  }

  void onInstructionExecuted(const InstRef &IR) {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (CriticalMemoryInstruction &&
        CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
      CriticalMemoryInstruction.invalidate();
    }

    if (!isExecuted())
      return;

    // Notify data dependent successors that this group has finished execution.
    for (MemoryGroup *MG : DataSucc)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with an LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;
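
  // A minimal sketch of the intended calling protocol (`LSU` here is any
  // concrete LSUnitBase implementation; the token setter mirrors the
  // LSUTokenID field documented above and is illustrative):
  //   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  //     unsigned TokenID = LSU.dispatch(IR);
  //     IR.getInstruction()->setLSUTokenID(TokenID);
  //   }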

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && Groups.contains(Index);
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }
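
  // For illustration only: a derived unit could chain two fresh groups with
  // an order dependency as follows (group B then waits on group A):
  //   unsigned A = createMemoryGroup();
  //   unsigned B = createMemoryGroup();
  //   getGroup(A).addSuccessor(&getGroup(B), /* IsDataDependent */ false);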

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but this is the best that we can do
/// at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means all the older loads/stores have
/// already been executed.
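///
/// For example, consider this hypothetical program-order sequence:
/// \code
///   1: load  A
///   2: store B
///   3: load  C
/// \endcode
/// With no-alias enabled, load 3 may pass store 2 (rule 4); otherwise load 3
/// must conservatively wait until store 2 has executed, since this class
/// performs no alias analysis.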
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}
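
  // Example (queue sizes are illustrative): model a 32-entry load queue and a
  // 16-entry store queue, optimistically assuming that loads never alias
  // stores:
  //   LSUnit LSU(SM, /* LQ */ 32, /* SQ */ 16, /* AssumeNoAlias */ true);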

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H