//===------------------------- LSUnit.h ------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A MemoryGroup identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
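///
/// A minimal sketch of the dispatch-time mapping described above (`LSU` and
/// the setter for field Instruction::LSUTokenID are assumed for illustration):
/// \code
///   unsigned GroupID = LSU.dispatch(IR);         // Map IR to a MemoryGroup.
///   IR.getInstruction()->setLSUTokenID(GroupID); // Remember the token.
/// \endcode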
class MemoryGroup {
  unsigned NumPredecessors = 0;
  unsigned NumExecutingPredecessors = 0;
  unsigned NumExecutedPredecessors = 0;

  unsigned NumInstructions = 0;
  unsigned NumExecuting = 0;
  unsigned NumExecuted = 0;
  // Successors that are in an order dependency with this group.
  SmallVector<MemoryGroup *, 4> OrderSucc;
  // Successors that are in a data dependency with this group.
  SmallVector<MemoryGroup *, 4> DataSucc;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup() = default;
  MemoryGroup(MemoryGroup &&) = default;

  size_t getNumSuccessors() const {
    return OrderSucc.size() + DataSucc.size();
  }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
    // There is no need to add an order dependency if all instructions from
    // this group have already been issued.
    if (!IsDataDependent && isExecuting())
      return;

    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

    if (IsDataDependent)
      DataSucc.emplace_back(Group);
    else
      OrderSucc.emplace_back(Group);
  }

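  // A MemoryGroup conceptually moves through the following states, which the
  // predicates below query:
  //   Waiting   -> at least one predecessor group has not issued yet.
  //   Pending   -> every predecessor has at least issued; some still execute.
  //   Ready     -> every predecessor has fully executed.
  //   Executing -> all not-yet-executed instructions of the group are issued.
  //   Executed  -> every instruction of this group has finished executing.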
  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    if (!ShouldUpdateCriticalDep)
      return;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory instruction for this group.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : OrderSucc) {
      MG->onGroupIssued(CriticalMemoryInstruction, false);
      // Release the order dependency with this group.
      MG->onGroupExecuted();
    }

    for (MemoryGroup *MG : DataSucc)
      MG->onGroupIssued(CriticalMemoryInstruction, true);
  }

  void onInstructionExecuted(const InstRef &IR) {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (CriticalMemoryInstruction &&
        CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
      CriticalMemoryInstruction.invalidate();
    }

    if (!isExecuted())
      return;

    // Notify data dependent successors that this group has finished execution.
    for (MemoryGroup *MG : DataSucc)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with an LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;
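
  // A minimal sketch of the intended calling protocol (`LSU` here is any
  // concrete LSUnitBase implementation; the token setter mirrors the
  // LSUTokenID field documented above and is illustrative):
  //   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  //     unsigned TokenID = LSU.dispatch(IR);
  //     IR.getInstruction()->setLSUTokenID(TokenID);
  //   }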

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && Groups.contains(Index);
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }
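
  // For illustration only: a derived unit could chain two fresh groups with
  // an order dependency as follows (group B then waits on group A):
  //   unsigned A = createMemoryGroup();
  //   unsigned B = createMemoryGroup();
  //   getGroup(A).addSuccessor(&getGroup(B), /* IsDataDependent */ false);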

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but this is the best that we can do
/// at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means all the older loads/stores have
/// already been executed.
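///
/// For example, consider this hypothetical program-order sequence:
/// \code
///   1: load  A
///   2: store B
///   3: load  C
/// \endcode
/// With no-alias enabled, load 3 may pass store 2 (rule 4); otherwise load 3
/// must conservatively wait until store 2 has executed, since this class
/// performs no alias analysis.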
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}
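
  // Example (queue sizes are illustrative): model a 32-entry load queue and a
  // 16-entry store queue, optimistically assuming that loads never alias
  // stores:
  //   LSUnit LSU(SM, /* LQ */ 32, /* SQ */ 16, /* AssumeNoAlias */ true);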

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H