//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"
STATISTIC(NumInstCombined, "Number of machine instrs combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions than this threshold."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));

#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI = nullptr;
  MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
  MachineTraceMetrics *Traces = nullptr;
  MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
  MachineBlockFrequencyInfo *MBFI = nullptr;
  ProfileSummaryInfo *PSI = nullptr;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize = false;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  bool isTransientMI(const MachineInstr *MI);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace,
                    const MachineBasicBlock &MBB);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<MachineCombinerPattern, 16> &Patterns);
};
} // end anonymous namespace

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *
MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && MO.getReg().isVirtual())
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Return true if MI is unlikely to generate an actual target instruction.
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
  if (!MI->isCopy())
    return MI->isTransient();

  // If MI is a COPY, check if its src and dst registers can be coalesced.
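  // A coalescable COPY is expected to be removed by the register allocator,
  // so it should not be charged any latency on the critical path.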
  Register Dst = MI->getOperand(0).getReg();
  Register Src = MI->getOperand(1).getReg();

  if (!MI->isFullCopy()) {
    // If src RC contains super registers of dst RC, it can also be coalesced.
    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
      return false;

    auto SrcSub = MI->getOperand(1).getSubReg();
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
  }

  if (Src.isPhysical() && Dst.isPhysical())
    return Src == Dst;

  if (Src.isVirtual() && Dst.isVirtual()) {
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
  }

  if (Src.isVirtual())
    std::swap(Src, Dst);

  // Now Src is a physical register and Dst is a virtual register.
  auto DstRC = MRI->getRegClass(Dst);
  return DstRC->contains(Src);
}

/// Computes the depth of instructions in the vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map from virtual register to the index
/// of its defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of the last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace,
                          const MachineBasicBlock &MBB) {
  SmallVector<unsigned, 16> InstrDepth;
  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
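  // Illustrative example (hypothetical vregs): for InsInstrs
  // {%t1 = MUL %a, %b; %t2 = ADD %t1, %c}, the depth of %t2 is
  // max(depth(%t1) + latency(MUL -> ADD), depth of %c's def + its latency),
  // where %t1's depth comes from InstrDepth and %c's data from the trace.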
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->all_uses()) {
      // Check for virtual register operand.
      if (!MO.getReg().isVirtual())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register that is not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
                             MachineTraceStrategy::TS_Local ||
                         DefInstr->getParent() == &MBB)) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          if (!isTransientMI(DefInstr))
            LatencyOp = TSchedModel.computeOperandLatency(
                DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
                InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
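      // The instruction's depth is the maximum over its use operands of
      // (depth of the operand's def + def->use latency).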
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes the latency of an instruction as the maximum latency over its
/// defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->all_defs()) {
    // Check for virtual register operand.
    if (!MO.getReg().isVirtual())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
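    // reg_begin() is expected to yield the defining operand first, so the
    // increment above skips the def of MO to reach its first user, if any.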
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
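      // No dependent user in the trace: conservatively fall back to the
      // instruction latency of NewRoot.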
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth,            // The data dependency chain must be improved.
  MustReduceRegisterPressure, // The register pressure must be reduced.
  Default                     // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
  case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
  case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
  case MachineCombinerPattern::SUBADD_OP1:
  case MachineCombinerPattern::SUBADD_OP2:
  case MachineCombinerPattern::FMADD_AX:
  case MachineCombinerPattern::FMADD_XA:
  case MachineCombinerPattern::FMSUB:
  case MachineCombinerPattern::FNMSUB:
    return CombinerObjective::MustReduceDepth;
  case MachineCombinerPattern::REASSOC_XY_BCA:
  case MachineCombinerPattern::REASSOC_XY_BAC:
    return CombinerObjective::MustReduceRegisterPressure;
  default:
    return CombinerObjective::Default;
  }
}

/// Estimate the latency of the new and original instruction sequences by
/// summing up the latencies of the inserted and deleted instructions. This
/// assumes that the inserted and deleted instructions are dependent
/// instruction chains, which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
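  // Note: summing assumes the inserted instructions form a single dependency
  // chain ending in NewRoot (see the function comment above).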

  unsigned RootLatency = 0;
  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}

bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineCombinerPattern Pattern) {
  // FIXME: for now, we don't do any check for the register pressure patterns.
  // We treat them as always profitable. But we can do better if we make the
  // RegPressureTracker class aware of the TIE attribute. Then we can get an
  // accurate comparison of the register pressure with DelInstrs or InsInstrs.
  return true;
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern,
    bool SlackIsAccurate) {
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth =
      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t and it does it\n"
                   : dbgs() << "\t but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions: when
  // the target requests it, sum the latencies of the full sequences;
  // otherwise compare only the latencies of the root instructions.
  unsigned NewRootLatency, RootLatency;
  if (TII->accumulateInstrSeqToRootLatency(*Root)) {
    std::tie(NewRootLatency, RootLatency) =
        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
  } else {
    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
    RootLatency = TSchedModel.computeInstrLatency(Root);
  }

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
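  // Illustrative (hypothetical) numbers: NewRootDepth = 4, NewRootLatency = 3
  // gives NewCycleCount = 7; RootDepth = 3, RootLatency = 3, RootSlack = 2
  // (with SlackIsAccurate) gives OldCycleCount = 8, so the transform is
  // accepted even though the depth got worse.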
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount;);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t  It IMPROVES PathLen because"
                 : dbgs() << "\n\t  It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute the current resource length.
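  // The resource length of a trace is, roughly, the cycle count implied by
  // its most heavily used processor resource; it bounds throughput
  // independently of the critical path.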

  //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
  ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};

  // Compute the new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n";);
  LLVM_DEBUG(
      ResLenAfterCombine <=
              ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t  As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t  As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param TraceEnsemble is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
/// otherwise invalidate the trace
static void insertDeleteInstructions(
    MachineBasicBlock *MBB, MachineInstr &MI,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Ensemble *TraceEnsemble,
    SparseSet<LiveRegUnit> &RegUnits, const TargetInstrInfo *TII,
    MachineCombinerPattern Pattern, bool IncrementalUpdate) {
  // If the target needs to fix up a placeholder, do it now. This is needed
  // because at the point genAlternativeCodeSequence runs, it has not yet been
  // decided whether InsInstrs or DelInstrs is the better pattern, so we do
  // not want to generate side effects in the function there. For example,
  // constant pool entry creation is delayed until here, after InsInstrs has
  // been selected as the better pattern; otherwise the constant pool entry
  // created for InsInstrs would not be deleted even if InsInstrs were not the
  // better pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    TraceEnsemble->invalidate(MBB);

  NumInstCombined++;
}

// Check that the difference between original and new latency is decreasing for
// later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(
    MachineBasicBlock *MBB, MachineInstr &Root,
    SmallVector<MachineCombinerPattern, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!TraceEnsemble)
    TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());

  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaces MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.

    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (const auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (InsInstrs.empty())
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
      });

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold.
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);
          Changed |= true;

          // Go back to the previous instruction as it may have an ILP
          // reassociation opportunity.
          BlockIter--;
          break;
        }
      }

      if (ML && TII->isThroughputPattern(P)) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
                          << InsInstrs.size() << " < "
                          << DelInstrs.size() << ")\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Clean up the instructions of the alternative code sequence. There
        // is no use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary()) ?
      &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
      nullptr;
  TraceEnsemble = nullptr;
  OptSize = MF.getFunction().hasOptSize();
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}