1 | //===- DFAJumpThreading.cpp - Threads a switch statement inside a loop ----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Transform each threading path to effectively jump thread the DFA. For |
10 | // example, the CFG below could be transformed as follows, where the cloned |
11 | // blocks unconditionally branch to the next correct case based on what is |
12 | // identified in the analysis. |
13 | // |
14 | // sw.bb sw.bb |
15 | // / | \ / | \ |
16 | // case1 case2 case3 case1 case2 case3 |
17 | // \ | / | | | |
18 | // determinator det.2 det.3 det.1 |
19 | // br sw.bb / | \ |
20 | // sw.bb.2 sw.bb.3 sw.bb.1 |
21 | // br case2 br case3 br case1ยง |
22 | // |
23 | // Definitions and Terminology: |
24 | // |
25 | // * Threading path: |
26 | // a list of basic blocks, the exit state, and the block that determines |
27 | // the next state, for which the following notation will be used: |
28 | // < path of BBs that form a cycle > [ state, determinator ] |
29 | // |
30 | // * Predictable switch: |
31 | // The switch variable is always a known constant so that all conditional |
32 | // jumps based on switch variable can be converted to unconditional jump. |
33 | // |
34 | // * Determinator: |
35 | // The basic block that determines the next state of the DFA. |
36 | // |
37 | // Representing the optimization in C-like pseudocode: the code pattern on the |
38 | // left could functionally be transformed to the right pattern if the switch |
39 | // condition is predictable. |
40 | // |
41 | // X = A goto A |
42 | // for (...) A: |
43 | // switch (X) ... |
44 | // case A goto B |
45 | // X = B B: |
46 | // case B ... |
47 | // X = C goto C |
48 | // |
49 | // The pass first checks that switch variable X is decided by the control flow |
50 | // path taken in the loop; for example, in case B, the next value of X is |
51 | // decided to be C. It then enumerates through all paths in the loop and labels |
52 | // the basic blocks where the next state is decided. |
53 | // |
54 | // Using this information it creates new paths that unconditionally branch to |
55 | // the next case. This involves cloning code, so it only gets triggered if the |
56 | // amount of code duplicated is below a threshold. |
57 | // |
58 | //===----------------------------------------------------------------------===// |
59 | |
60 | #include "llvm/Transforms/Scalar/DFAJumpThreading.h" |
61 | #include "llvm/ADT/APInt.h" |
62 | #include "llvm/ADT/DenseMap.h" |
63 | #include "llvm/ADT/SmallSet.h" |
64 | #include "llvm/ADT/Statistic.h" |
65 | #include "llvm/Analysis/AssumptionCache.h" |
66 | #include "llvm/Analysis/CodeMetrics.h" |
67 | #include "llvm/Analysis/DomTreeUpdater.h" |
68 | #include "llvm/Analysis/LoopInfo.h" |
69 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
70 | #include "llvm/Analysis/TargetTransformInfo.h" |
71 | #include "llvm/IR/CFG.h" |
72 | #include "llvm/IR/Constants.h" |
73 | #include "llvm/IR/IntrinsicInst.h" |
74 | #include "llvm/Support/CommandLine.h" |
75 | #include "llvm/Support/Debug.h" |
76 | #include "llvm/Transforms/Utils/Cloning.h" |
77 | #include "llvm/Transforms/Utils/SSAUpdaterBulk.h" |
78 | #include "llvm/Transforms/Utils/ValueMapper.h" |
79 | #include <algorithm> |
80 | #include <deque> |
81 | |
82 | #ifdef EXPENSIVE_CHECKS |
83 | #include "llvm/IR/Verifier.h" |
84 | #endif |
85 | |
86 | using namespace llvm; |
87 | |
88 | #define DEBUG_TYPE "dfa-jump-threading" |
89 | |
90 | STATISTIC(NumTransforms, "Number of transformations done" ); |
91 | STATISTIC(NumCloned, "Number of blocks cloned" ); |
92 | STATISTIC(NumPaths, "Number of individual paths threaded" ); |
93 | |
94 | static cl::opt<bool> |
95 | ClViewCfgBefore("dfa-jump-view-cfg-before" , |
96 | cl::desc("View the CFG before DFA Jump Threading" ), |
97 | cl::Hidden, cl::init(Val: false)); |
98 | |
99 | static cl::opt<bool> EarlyExitHeuristic( |
100 | "dfa-early-exit-heuristic" , |
101 | cl::desc("Exit early if an unpredictable value come from the same loop" ), |
102 | cl::Hidden, cl::init(Val: true)); |
103 | |
104 | static cl::opt<unsigned> MaxPathLength( |
105 | "dfa-max-path-length" , |
106 | cl::desc("Max number of blocks searched to find a threading path" ), |
107 | cl::Hidden, cl::init(Val: 20)); |
108 | |
109 | static cl::opt<unsigned> |
110 | MaxNumPaths("dfa-max-num-paths" , |
111 | cl::desc("Max number of paths enumerated around a switch" ), |
112 | cl::Hidden, cl::init(Val: 200)); |
113 | |
114 | static cl::opt<unsigned> |
115 | CostThreshold("dfa-cost-threshold" , |
116 | cl::desc("Maximum cost accepted for the transformation" ), |
117 | cl::Hidden, cl::init(Val: 50)); |
118 | |
119 | namespace { |
120 | |
121 | class SelectInstToUnfold { |
122 | SelectInst *SI; |
123 | PHINode *SIUse; |
124 | |
125 | public: |
126 | SelectInstToUnfold(SelectInst *SI, PHINode *SIUse) : SI(SI), SIUse(SIUse) {} |
127 | |
128 | SelectInst *getInst() { return SI; } |
129 | PHINode *getUse() { return SIUse; } |
130 | |
131 | explicit operator bool() const { return SI && SIUse; } |
132 | }; |
133 | |
134 | void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold, |
135 | std::vector<SelectInstToUnfold> *NewSIsToUnfold, |
136 | std::vector<BasicBlock *> *NewBBs); |
137 | |
138 | class DFAJumpThreading { |
139 | public: |
140 | (AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, |
141 | TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) |
142 | : AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {} |
143 | |
144 | bool run(Function &F); |
145 | |
146 | private: |
147 | void |
148 | unfoldSelectInstrs(DominatorTree *DT, |
149 | const SmallVector<SelectInstToUnfold, 4> &SelectInsts) { |
150 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); |
151 | SmallVector<SelectInstToUnfold, 4> Stack; |
152 | for (SelectInstToUnfold SIToUnfold : SelectInsts) |
153 | Stack.push_back(Elt: SIToUnfold); |
154 | |
155 | while (!Stack.empty()) { |
156 | SelectInstToUnfold SIToUnfold = Stack.pop_back_val(); |
157 | |
158 | std::vector<SelectInstToUnfold> NewSIsToUnfold; |
159 | std::vector<BasicBlock *> NewBBs; |
160 | unfold(DTU: &DTU, SIToUnfold, NewSIsToUnfold: &NewSIsToUnfold, NewBBs: &NewBBs); |
161 | |
162 | // Put newly discovered select instructions into the work list. |
163 | for (const SelectInstToUnfold &NewSIToUnfold : NewSIsToUnfold) |
164 | Stack.push_back(Elt: NewSIToUnfold); |
165 | } |
166 | } |
167 | |
168 | AssumptionCache *AC; |
169 | DominatorTree *DT; |
170 | LoopInfo *LI; |
171 | TargetTransformInfo *TTI; |
172 | OptimizationRemarkEmitter *ORE; |
173 | }; |
174 | |
175 | } // end anonymous namespace |
176 | |
177 | namespace { |
178 | |
179 | /// Create a new basic block and sink \p SIToSink into it. |
180 | void createBasicBlockAndSinkSelectInst( |
181 | DomTreeUpdater *DTU, SelectInst *SI, PHINode *SIUse, SelectInst *SIToSink, |
182 | BasicBlock *EndBlock, StringRef NewBBName, BasicBlock **NewBlock, |
183 | BranchInst **NewBranch, std::vector<SelectInstToUnfold> *NewSIsToUnfold, |
184 | std::vector<BasicBlock *> *NewBBs) { |
185 | assert(SIToSink->hasOneUse()); |
186 | assert(NewBlock); |
187 | assert(NewBranch); |
188 | *NewBlock = BasicBlock::Create(Context&: SI->getContext(), Name: NewBBName, |
189 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
190 | NewBBs->push_back(x: *NewBlock); |
191 | *NewBranch = BranchInst::Create(IfTrue: EndBlock, InsertAtEnd: *NewBlock); |
192 | SIToSink->moveBefore(MovePos: *NewBranch); |
193 | NewSIsToUnfold->push_back(x: SelectInstToUnfold(SIToSink, SIUse)); |
194 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, *NewBlock, EndBlock}}); |
195 | } |
196 | |
197 | /// Unfold the select instruction held in \p SIToUnfold by replacing it with |
198 | /// control flow. |
199 | /// |
200 | /// Put newly discovered select instructions into \p NewSIsToUnfold. Put newly |
201 | /// created basic blocks into \p NewBBs. |
202 | /// |
203 | /// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible. |
204 | void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold, |
205 | std::vector<SelectInstToUnfold> *NewSIsToUnfold, |
206 | std::vector<BasicBlock *> *NewBBs) { |
207 | SelectInst *SI = SIToUnfold.getInst(); |
208 | PHINode *SIUse = SIToUnfold.getUse(); |
209 | BasicBlock *StartBlock = SI->getParent(); |
210 | BasicBlock *EndBlock = SIUse->getParent(); |
211 | BranchInst *StartBlockTerm = |
212 | dyn_cast<BranchInst>(Val: StartBlock->getTerminator()); |
213 | |
214 | assert(StartBlockTerm && StartBlockTerm->isUnconditional()); |
215 | assert(SI->hasOneUse()); |
216 | |
217 | // These are the new basic blocks for the conditional branch. |
218 | // At least one will become an actual new basic block. |
219 | BasicBlock *TrueBlock = nullptr; |
220 | BasicBlock *FalseBlock = nullptr; |
221 | BranchInst *TrueBranch = nullptr; |
222 | BranchInst *FalseBranch = nullptr; |
223 | |
224 | // Sink select instructions to be able to unfold them later. |
225 | if (SelectInst *SIOp = dyn_cast<SelectInst>(Val: SI->getTrueValue())) { |
226 | createBasicBlockAndSinkSelectInst(DTU, SI, SIUse, SIToSink: SIOp, EndBlock, |
227 | NewBBName: "si.unfold.true" , NewBlock: &TrueBlock, NewBranch: &TrueBranch, |
228 | NewSIsToUnfold, NewBBs); |
229 | } |
230 | if (SelectInst *SIOp = dyn_cast<SelectInst>(Val: SI->getFalseValue())) { |
231 | createBasicBlockAndSinkSelectInst(DTU, SI, SIUse, SIToSink: SIOp, EndBlock, |
232 | NewBBName: "si.unfold.false" , NewBlock: &FalseBlock, |
233 | NewBranch: &FalseBranch, NewSIsToUnfold, NewBBs); |
234 | } |
235 | |
236 | // If there was nothing to sink, then arbitrarily choose the 'false' side |
237 | // for a new input value to the PHI. |
238 | if (!TrueBlock && !FalseBlock) { |
239 | FalseBlock = BasicBlock::Create(Context&: SI->getContext(), Name: "si.unfold.false" , |
240 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
241 | NewBBs->push_back(x: FalseBlock); |
242 | BranchInst::Create(IfTrue: EndBlock, InsertAtEnd: FalseBlock); |
243 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, FalseBlock, EndBlock}}); |
244 | } |
245 | |
246 | // Insert the real conditional branch based on the original condition. |
247 | // If we did not create a new block for one of the 'true' or 'false' paths |
248 | // of the condition, it means that side of the branch goes to the end block |
249 | // directly and the path originates from the start block from the point of |
250 | // view of the new PHI. |
251 | BasicBlock *TT = EndBlock; |
252 | BasicBlock *FT = EndBlock; |
253 | if (TrueBlock && FalseBlock) { |
254 | // A diamond. |
255 | TT = TrueBlock; |
256 | FT = FalseBlock; |
257 | |
258 | // Update the phi node of SI. |
259 | SIUse->addIncoming(V: SI->getTrueValue(), BB: TrueBlock); |
260 | SIUse->addIncoming(V: SI->getFalseValue(), BB: FalseBlock); |
261 | |
262 | // Update any other PHI nodes in EndBlock. |
263 | for (PHINode &Phi : EndBlock->phis()) { |
264 | if (&Phi != SIUse) { |
265 | Value *OrigValue = Phi.getIncomingValueForBlock(BB: StartBlock); |
266 | Phi.addIncoming(V: OrigValue, BB: TrueBlock); |
267 | Phi.addIncoming(V: OrigValue, BB: FalseBlock); |
268 | } |
269 | |
270 | // Remove incoming place of original StartBlock, which comes in a indirect |
271 | // way (through TrueBlock and FalseBlock) now. |
272 | Phi.removeIncomingValue(BB: StartBlock, /* DeletePHIIfEmpty = */ false); |
273 | } |
274 | } else { |
275 | BasicBlock *NewBlock = nullptr; |
276 | Value *SIOp1 = SI->getTrueValue(); |
277 | Value *SIOp2 = SI->getFalseValue(); |
278 | |
279 | // A triangle pointing right. |
280 | if (!TrueBlock) { |
281 | NewBlock = FalseBlock; |
282 | FT = FalseBlock; |
283 | } |
284 | // A triangle pointing left. |
285 | else { |
286 | NewBlock = TrueBlock; |
287 | TT = TrueBlock; |
288 | std::swap(a&: SIOp1, b&: SIOp2); |
289 | } |
290 | |
291 | // Update the phi node of SI. |
292 | for (unsigned Idx = 0; Idx < SIUse->getNumIncomingValues(); ++Idx) { |
293 | if (SIUse->getIncomingBlock(i: Idx) == StartBlock) |
294 | SIUse->setIncomingValue(i: Idx, V: SIOp1); |
295 | } |
296 | SIUse->addIncoming(V: SIOp2, BB: NewBlock); |
297 | |
298 | // Update any other PHI nodes in EndBlock. |
299 | for (auto II = EndBlock->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); |
300 | ++II) { |
301 | if (Phi != SIUse) |
302 | Phi->addIncoming(V: Phi->getIncomingValueForBlock(BB: StartBlock), BB: NewBlock); |
303 | } |
304 | } |
305 | StartBlockTerm->eraseFromParent(); |
306 | BranchInst::Create(IfTrue: TT, IfFalse: FT, Cond: SI->getCondition(), InsertAtEnd: StartBlock); |
307 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, StartBlock, TT}, |
308 | {DominatorTree::Insert, StartBlock, FT}}); |
309 | |
310 | // The select is now dead. |
311 | assert(SI->use_empty() && "Select must be dead now" ); |
312 | SI->eraseFromParent(); |
313 | } |
314 | |
315 | struct ClonedBlock { |
316 | BasicBlock *BB; |
317 | APInt State; ///< \p State corresponds to the next value of a switch stmnt. |
318 | }; |
319 | |
320 | typedef std::deque<BasicBlock *> PathType; |
321 | typedef std::vector<PathType> PathsType; |
322 | typedef SmallPtrSet<const BasicBlock *, 8> VisitedBlocks; |
323 | typedef std::vector<ClonedBlock> CloneList; |
324 | |
325 | // This data structure keeps track of all blocks that have been cloned. If two |
326 | // different ThreadingPaths clone the same block for a certain state it should |
327 | // be reused, and it can be looked up in this map. |
328 | typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap; |
329 | |
330 | // This map keeps track of all the new definitions for an instruction. This |
331 | // information is needed when restoring SSA form after cloning blocks. |
332 | typedef MapVector<Instruction *, std::vector<Instruction *>> DefMap; |
333 | |
334 | inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) { |
335 | OS << "< " ; |
336 | for (const BasicBlock *BB : Path) { |
337 | std::string BBName; |
338 | if (BB->hasName()) |
339 | raw_string_ostream(BBName) << BB->getName(); |
340 | else |
341 | raw_string_ostream(BBName) << BB; |
342 | OS << BBName << " " ; |
343 | } |
344 | OS << ">" ; |
345 | return OS; |
346 | } |
347 | |
348 | /// ThreadingPath is a path in the control flow of a loop that can be threaded |
349 | /// by cloning necessary basic blocks and replacing conditional branches with |
350 | /// unconditional ones. A threading path includes a list of basic blocks, the |
351 | /// exit state, and the block that determines the next state. |
352 | struct ThreadingPath { |
353 | /// Exit value is DFA's exit state for the given path. |
354 | APInt getExitValue() const { return ExitVal; } |
355 | void setExitValue(const ConstantInt *V) { |
356 | ExitVal = V->getValue(); |
357 | IsExitValSet = true; |
358 | } |
359 | bool isExitValueSet() const { return IsExitValSet; } |
360 | |
361 | /// Determinator is the basic block that determines the next state of the DFA. |
362 | const BasicBlock *getDeterminatorBB() const { return DBB; } |
363 | void setDeterminator(const BasicBlock *BB) { DBB = BB; } |
364 | |
365 | /// Path is a list of basic blocks. |
366 | const PathType &getPath() const { return Path; } |
367 | void setPath(const PathType &NewPath) { Path = NewPath; } |
368 | |
369 | void print(raw_ostream &OS) const { |
370 | OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]" ; |
371 | } |
372 | |
373 | private: |
374 | PathType Path; |
375 | APInt ExitVal; |
376 | const BasicBlock *DBB = nullptr; |
377 | bool IsExitValSet = false; |
378 | }; |
379 | |
380 | #ifndef NDEBUG |
381 | inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) { |
382 | TPath.print(OS); |
383 | return OS; |
384 | } |
385 | #endif |
386 | |
387 | struct MainSwitch { |
388 | MainSwitch(SwitchInst *SI, LoopInfo *LI, OptimizationRemarkEmitter *ORE) |
389 | : LI(LI) { |
390 | if (isCandidate(SI)) { |
391 | Instr = SI; |
392 | } else { |
393 | ORE->emit(RemarkBuilder: [&]() { |
394 | return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable" , SI) |
395 | << "Switch instruction is not predictable." ; |
396 | }); |
397 | } |
398 | } |
399 | |
400 | virtual ~MainSwitch() = default; |
401 | |
402 | SwitchInst *getInstr() const { return Instr; } |
403 | const SmallVector<SelectInstToUnfold, 4> getSelectInsts() { |
404 | return SelectInsts; |
405 | } |
406 | |
407 | private: |
408 | /// Do a use-def chain traversal starting from the switch condition to see if |
409 | /// \p SI is a potential condidate. |
410 | /// |
411 | /// Also, collect select instructions to unfold. |
412 | bool isCandidate(const SwitchInst *SI) { |
413 | std::deque<std::pair<Value *, BasicBlock *>> Q; |
414 | SmallSet<Value *, 16> SeenValues; |
415 | SelectInsts.clear(); |
416 | |
417 | Value *SICond = SI->getCondition(); |
418 | LLVM_DEBUG(dbgs() << "\tSICond: " << *SICond << "\n" ); |
419 | if (!isa<PHINode>(Val: SICond)) |
420 | return false; |
421 | |
422 | // The switch must be in a loop. |
423 | const Loop *L = LI->getLoopFor(BB: SI->getParent()); |
424 | if (!L) |
425 | return false; |
426 | |
427 | addToQueue(Val: SICond, BB: nullptr, Q, SeenValues); |
428 | |
429 | while (!Q.empty()) { |
430 | Value *Current = Q.front().first; |
431 | BasicBlock *CurrentIncomingBB = Q.front().second; |
432 | Q.pop_front(); |
433 | |
434 | if (auto *Phi = dyn_cast<PHINode>(Val: Current)) { |
435 | for (BasicBlock *IncomingBB : Phi->blocks()) { |
436 | Value *Incoming = Phi->getIncomingValueForBlock(BB: IncomingBB); |
437 | addToQueue(Val: Incoming, BB: IncomingBB, Q, SeenValues); |
438 | } |
439 | LLVM_DEBUG(dbgs() << "\tphi: " << *Phi << "\n" ); |
440 | } else if (SelectInst *SelI = dyn_cast<SelectInst>(Val: Current)) { |
441 | if (!isValidSelectInst(SI: SelI)) |
442 | return false; |
443 | addToQueue(Val: SelI->getTrueValue(), BB: CurrentIncomingBB, Q, SeenValues); |
444 | addToQueue(Val: SelI->getFalseValue(), BB: CurrentIncomingBB, Q, SeenValues); |
445 | LLVM_DEBUG(dbgs() << "\tselect: " << *SelI << "\n" ); |
446 | if (auto *SelIUse = dyn_cast<PHINode>(Val: SelI->user_back())) |
447 | SelectInsts.push_back(Elt: SelectInstToUnfold(SelI, SelIUse)); |
448 | } else if (isa<Constant>(Val: Current)) { |
449 | LLVM_DEBUG(dbgs() << "\tconst: " << *Current << "\n" ); |
450 | continue; |
451 | } else { |
452 | LLVM_DEBUG(dbgs() << "\tother: " << *Current << "\n" ); |
453 | // Allow unpredictable values. The hope is that those will be the |
454 | // initial switch values that can be ignored (they will hit the |
455 | // unthreaded switch) but this assumption will get checked later after |
456 | // paths have been enumerated (in function getStateDefMap). |
457 | |
458 | // If the unpredictable value comes from the same inner loop it is |
459 | // likely that it will also be on the enumerated paths, causing us to |
460 | // exit after we have enumerated all the paths. This heuristic save |
461 | // compile time because a search for all the paths can become expensive. |
462 | if (EarlyExitHeuristic && |
463 | L->contains(L: LI->getLoopFor(BB: CurrentIncomingBB))) { |
464 | LLVM_DEBUG(dbgs() |
465 | << "\tExiting early due to unpredictability heuristic.\n" ); |
466 | return false; |
467 | } |
468 | |
469 | continue; |
470 | } |
471 | } |
472 | |
473 | return true; |
474 | } |
475 | |
476 | void addToQueue(Value *Val, BasicBlock *BB, |
477 | std::deque<std::pair<Value *, BasicBlock *>> &Q, |
478 | SmallSet<Value *, 16> &SeenValues) { |
479 | if (SeenValues.contains(Ptr: Val)) |
480 | return; |
481 | Q.push_back(x: {Val, BB}); |
482 | SeenValues.insert(Ptr: Val); |
483 | } |
484 | |
485 | bool isValidSelectInst(SelectInst *SI) { |
486 | if (!SI->hasOneUse()) |
487 | return false; |
488 | |
489 | Instruction *SIUse = dyn_cast<Instruction>(Val: SI->user_back()); |
490 | // The use of the select inst should be either a phi or another select. |
491 | if (!SIUse && !(isa<PHINode>(Val: SIUse) || isa<SelectInst>(Val: SIUse))) |
492 | return false; |
493 | |
494 | BasicBlock *SIBB = SI->getParent(); |
495 | |
496 | // Currently, we can only expand select instructions in basic blocks with |
497 | // one successor. |
498 | BranchInst *SITerm = dyn_cast<BranchInst>(Val: SIBB->getTerminator()); |
499 | if (!SITerm || !SITerm->isUnconditional()) |
500 | return false; |
501 | |
502 | // Only fold the select coming from directly where it is defined. |
503 | PHINode *PHIUser = dyn_cast<PHINode>(Val: SIUse); |
504 | if (PHIUser && PHIUser->getIncomingBlock(U: *SI->use_begin()) != SIBB) |
505 | return false; |
506 | |
507 | // If select will not be sunk during unfolding, and it is in the same basic |
508 | // block as another state defining select, then cannot unfold both. |
509 | for (SelectInstToUnfold SIToUnfold : SelectInsts) { |
510 | SelectInst *PrevSI = SIToUnfold.getInst(); |
511 | if (PrevSI->getTrueValue() != SI && PrevSI->getFalseValue() != SI && |
512 | PrevSI->getParent() == SI->getParent()) |
513 | return false; |
514 | } |
515 | |
516 | return true; |
517 | } |
518 | |
519 | LoopInfo *LI; |
520 | SwitchInst *Instr = nullptr; |
521 | SmallVector<SelectInstToUnfold, 4> SelectInsts; |
522 | }; |
523 | |
524 | struct AllSwitchPaths { |
525 | AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE) |
526 | : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()), |
527 | ORE(ORE) {} |
528 | |
529 | std::vector<ThreadingPath> &getThreadingPaths() { return TPaths; } |
530 | unsigned getNumThreadingPaths() { return TPaths.size(); } |
531 | SwitchInst *getSwitchInst() { return Switch; } |
532 | BasicBlock *getSwitchBlock() { return SwitchBlock; } |
533 | |
534 | void run() { |
535 | VisitedBlocks Visited; |
536 | PathsType LoopPaths = paths(BB: SwitchBlock, Visited, /* PathDepth = */ 1); |
537 | StateDefMap StateDef = getStateDefMap(LoopPaths); |
538 | |
539 | if (StateDef.empty()) { |
540 | ORE->emit(RemarkBuilder: [&]() { |
541 | return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable" , |
542 | Switch) |
543 | << "Switch instruction is not predictable." ; |
544 | }); |
545 | return; |
546 | } |
547 | |
548 | for (PathType Path : LoopPaths) { |
549 | ThreadingPath TPath; |
550 | |
551 | const BasicBlock *PrevBB = Path.back(); |
552 | for (const BasicBlock *BB : Path) { |
553 | if (StateDef.contains(Val: BB)) { |
554 | const PHINode *Phi = dyn_cast<PHINode>(Val: StateDef[BB]); |
555 | assert(Phi && "Expected a state-defining instr to be a phi node." ); |
556 | |
557 | const Value *V = Phi->getIncomingValueForBlock(BB: PrevBB); |
558 | if (const ConstantInt *C = dyn_cast<const ConstantInt>(Val: V)) { |
559 | TPath.setExitValue(C); |
560 | TPath.setDeterminator(BB); |
561 | TPath.setPath(Path); |
562 | } |
563 | } |
564 | |
565 | // Switch block is the determinator, this is the final exit value. |
566 | if (TPath.isExitValueSet() && BB == Path.front()) |
567 | break; |
568 | |
569 | PrevBB = BB; |
570 | } |
571 | |
572 | if (TPath.isExitValueSet() && isSupported(TPath)) |
573 | TPaths.push_back(x: TPath); |
574 | } |
575 | } |
576 | |
577 | private: |
578 | // Value: an instruction that defines a switch state; |
579 | // Key: the parent basic block of that instruction. |
580 | typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; |
581 | |
582 | PathsType paths(BasicBlock *BB, VisitedBlocks &Visited, |
583 | unsigned PathDepth) const { |
584 | PathsType Res; |
585 | |
586 | // Stop exploring paths after visiting MaxPathLength blocks |
587 | if (PathDepth > MaxPathLength) { |
588 | ORE->emit(RemarkBuilder: [&]() { |
589 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "MaxPathLengthReached" , |
590 | Switch) |
591 | << "Exploration stopped after visiting MaxPathLength=" |
592 | << ore::NV("MaxPathLength" , MaxPathLength) << " blocks." ; |
593 | }); |
594 | return Res; |
595 | } |
596 | |
597 | Visited.insert(Ptr: BB); |
598 | |
599 | // Some blocks have multiple edges to the same successor, and this set |
600 | // is used to prevent a duplicate path from being generated |
601 | SmallSet<BasicBlock *, 4> Successors; |
602 | for (BasicBlock *Succ : successors(BB)) { |
603 | if (!Successors.insert(Ptr: Succ).second) |
604 | continue; |
605 | |
606 | // Found a cycle through the SwitchBlock |
607 | if (Succ == SwitchBlock) { |
608 | Res.push_back(x: {BB}); |
609 | continue; |
610 | } |
611 | |
612 | // We have encountered a cycle, do not get caught in it |
613 | if (Visited.contains(Ptr: Succ)) |
614 | continue; |
615 | |
616 | PathsType SuccPaths = paths(BB: Succ, Visited, PathDepth: PathDepth + 1); |
617 | for (const PathType &Path : SuccPaths) { |
618 | PathType NewPath(Path); |
619 | NewPath.push_front(x: BB); |
620 | Res.push_back(x: NewPath); |
621 | if (Res.size() >= MaxNumPaths) { |
622 | return Res; |
623 | } |
624 | } |
625 | } |
626 | // This block could now be visited again from a different predecessor. Note |
627 | // that this will result in exponential runtime. Subpaths could possibly be |
628 | // cached but it takes a lot of memory to store them. |
629 | Visited.erase(Ptr: BB); |
630 | return Res; |
631 | } |
632 | |
633 | /// Walk the use-def chain and collect all the state-defining instructions. |
634 | /// |
635 | /// Return an empty map if unpredictable values encountered inside the basic |
636 | /// blocks of \p LoopPaths. |
637 | StateDefMap getStateDefMap(const PathsType &LoopPaths) const { |
638 | StateDefMap Res; |
639 | |
640 | // Basic blocks belonging to any of the loops around the switch statement. |
641 | SmallPtrSet<BasicBlock *, 16> LoopBBs; |
642 | for (const PathType &Path : LoopPaths) { |
643 | for (BasicBlock *BB : Path) |
644 | LoopBBs.insert(Ptr: BB); |
645 | } |
646 | |
647 | Value *FirstDef = Switch->getOperand(i_nocapture: 0); |
648 | |
649 | assert(isa<PHINode>(FirstDef) && "The first definition must be a phi." ); |
650 | |
651 | SmallVector<PHINode *, 8> Stack; |
652 | Stack.push_back(Elt: dyn_cast<PHINode>(Val: FirstDef)); |
653 | SmallSet<Value *, 16> SeenValues; |
654 | |
655 | while (!Stack.empty()) { |
656 | PHINode *CurPhi = Stack.pop_back_val(); |
657 | |
658 | Res[CurPhi->getParent()] = CurPhi; |
659 | SeenValues.insert(Ptr: CurPhi); |
660 | |
661 | for (BasicBlock *IncomingBB : CurPhi->blocks()) { |
662 | Value *Incoming = CurPhi->getIncomingValueForBlock(BB: IncomingBB); |
663 | bool IsOutsideLoops = LoopBBs.count(Ptr: IncomingBB) == 0; |
664 | if (Incoming == FirstDef || isa<ConstantInt>(Val: Incoming) || |
665 | SeenValues.contains(Ptr: Incoming) || IsOutsideLoops) { |
666 | continue; |
667 | } |
668 | |
669 | // Any unpredictable value inside the loops means we must bail out. |
670 | if (!isa<PHINode>(Val: Incoming)) |
671 | return StateDefMap(); |
672 | |
673 | Stack.push_back(Elt: cast<PHINode>(Val: Incoming)); |
674 | } |
675 | } |
676 | |
677 | return Res; |
678 | } |
679 | |
680 | /// The determinator BB should precede the switch-defining BB. |
681 | /// |
682 | /// Otherwise, it is possible that the state defined in the determinator block |
683 | /// defines the state for the next iteration of the loop, rather than for the |
684 | /// current one. |
685 | /// |
686 | /// Currently supported paths: |
687 | /// \code |
688 | /// < switch bb1 determ def > [ 42, determ ] |
689 | /// < switch_and_def bb1 determ > [ 42, determ ] |
690 | /// < switch_and_def_and_determ bb1 > [ 42, switch_and_def_and_determ ] |
691 | /// \endcode |
692 | /// |
693 | /// Unsupported paths: |
694 | /// \code |
695 | /// < switch bb1 def determ > [ 43, determ ] |
696 | /// < switch_and_determ bb1 def > [ 43, switch_and_determ ] |
697 | /// \endcode |
698 | bool isSupported(const ThreadingPath &TPath) { |
699 | Instruction *SwitchCondI = dyn_cast<Instruction>(Val: Switch->getCondition()); |
700 | assert(SwitchCondI); |
701 | if (!SwitchCondI) |
702 | return false; |
703 | |
704 | const BasicBlock *SwitchCondDefBB = SwitchCondI->getParent(); |
705 | const BasicBlock *SwitchCondUseBB = Switch->getParent(); |
706 | const BasicBlock *DeterminatorBB = TPath.getDeterminatorBB(); |
707 | |
708 | assert( |
709 | SwitchCondUseBB == TPath.getPath().front() && |
710 | "The first BB in a threading path should have the switch instruction" ); |
711 | if (SwitchCondUseBB != TPath.getPath().front()) |
712 | return false; |
713 | |
714 | // Make DeterminatorBB the first element in Path. |
715 | PathType Path = TPath.getPath(); |
716 | auto ItDet = llvm::find(Range&: Path, Val: DeterminatorBB); |
717 | std::rotate(first: Path.begin(), middle: ItDet, last: Path.end()); |
718 | |
719 | bool IsDetBBSeen = false; |
720 | bool IsDefBBSeen = false; |
721 | bool IsUseBBSeen = false; |
722 | for (BasicBlock *BB : Path) { |
723 | if (BB == DeterminatorBB) |
724 | IsDetBBSeen = true; |
725 | if (BB == SwitchCondDefBB) |
726 | IsDefBBSeen = true; |
727 | if (BB == SwitchCondUseBB) |
728 | IsUseBBSeen = true; |
729 | if (IsDetBBSeen && IsUseBBSeen && !IsDefBBSeen) |
730 | return false; |
731 | } |
732 | |
733 | return true; |
734 | } |
735 | |
736 | SwitchInst *Switch; |
737 | BasicBlock *SwitchBlock; |
738 | OptimizationRemarkEmitter *ORE; |
739 | std::vector<ThreadingPath> TPaths; |
740 | }; |
741 | |
742 | struct TransformDFA { |
743 | TransformDFA(AllSwitchPaths *SwitchPaths, DominatorTree *DT, |
744 | AssumptionCache *AC, TargetTransformInfo *TTI, |
745 | OptimizationRemarkEmitter *ORE, |
746 | SmallPtrSet<const Value *, 32> EphValues) |
747 | : SwitchPaths(SwitchPaths), DT(DT), AC(AC), TTI(TTI), ORE(ORE), |
748 | EphValues(EphValues) {} |
749 | |
750 | void run() { |
751 | if (isLegalAndProfitableToTransform()) { |
752 | createAllExitPaths(); |
753 | NumTransforms++; |
754 | } |
755 | } |
756 | |
757 | private: |
758 | /// This function performs both a legality check and profitability check at |
759 | /// the same time since it is convenient to do so. It iterates through all |
760 | /// blocks that will be cloned, and keeps track of the duplication cost. It |
761 | /// also returns false if it is illegal to clone some required block. |
762 | bool isLegalAndProfitableToTransform() { |
763 | CodeMetrics Metrics; |
764 | SwitchInst *Switch = SwitchPaths->getSwitchInst(); |
765 | |
766 | // Don't thread switch without multiple successors. |
767 | if (Switch->getNumSuccessors() <= 1) |
768 | return false; |
769 | |
770 | // Note that DuplicateBlockMap is not being used as intended here. It is |
771 | // just being used to ensure (BB, State) pairs are only counted once. |
772 | DuplicateBlockMap DuplicateMap; |
773 | |
774 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
775 | PathType PathBBs = TPath.getPath(); |
776 | APInt NextState = TPath.getExitValue(); |
777 | const BasicBlock *Determinator = TPath.getDeterminatorBB(); |
778 | |
779 | // Update Metrics for the Switch block, this is always cloned |
780 | BasicBlock *BB = SwitchPaths->getSwitchBlock(); |
781 | BasicBlock *VisitedBB = getClonedBB(BB, NextState, DuplicateMap); |
782 | if (!VisitedBB) { |
783 | Metrics.analyzeBasicBlock(BB, TTI: *TTI, EphValues); |
784 | DuplicateMap[BB].push_back(x: {.BB: BB, .State: NextState}); |
785 | } |
786 | |
787 | // If the Switch block is the Determinator, then we can continue since |
788 | // this is the only block that is cloned and we already counted for it. |
789 | if (PathBBs.front() == Determinator) |
790 | continue; |
791 | |
792 | // Otherwise update Metrics for all blocks that will be cloned. If any |
793 | // block is already cloned and would be reused, don't double count it. |
794 | auto DetIt = llvm::find(Range&: PathBBs, Val: Determinator); |
795 | for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) { |
796 | BB = *BBIt; |
797 | VisitedBB = getClonedBB(BB, NextState, DuplicateMap); |
798 | if (VisitedBB) |
799 | continue; |
800 | Metrics.analyzeBasicBlock(BB, TTI: *TTI, EphValues); |
801 | DuplicateMap[BB].push_back(x: {.BB: BB, .State: NextState}); |
802 | } |
803 | |
804 | if (Metrics.notDuplicatable) { |
805 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
806 | << "non-duplicatable instructions.\n" ); |
807 | ORE->emit(RemarkBuilder: [&]() { |
808 | return OptimizationRemarkMissed(DEBUG_TYPE, "NonDuplicatableInst" , |
809 | Switch) |
810 | << "Contains non-duplicatable instructions." ; |
811 | }); |
812 | return false; |
813 | } |
814 | |
815 | if (Metrics.convergent) { |
816 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
817 | << "convergent instructions.\n" ); |
818 | ORE->emit(RemarkBuilder: [&]() { |
819 | return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst" , Switch) |
820 | << "Contains convergent instructions." ; |
821 | }); |
822 | return false; |
823 | } |
824 | |
825 | if (!Metrics.NumInsts.isValid()) { |
826 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
827 | << "instructions with invalid cost.\n" ); |
828 | ORE->emit(RemarkBuilder: [&]() { |
829 | return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst" , Switch) |
830 | << "Contains instructions with invalid cost." ; |
831 | }); |
832 | return false; |
833 | } |
834 | } |
835 | |
836 | InstructionCost DuplicationCost = 0; |
837 | |
838 | unsigned JumpTableSize = 0; |
839 | TTI->getEstimatedNumberOfCaseClusters(SI: *Switch, JTSize&: JumpTableSize, PSI: nullptr, |
840 | BFI: nullptr); |
841 | if (JumpTableSize == 0) { |
842 | // Factor in the number of conditional branches reduced from jump |
843 | // threading. Assume that lowering the switch block is implemented by |
844 | // using binary search, hence the LogBase2(). |
845 | unsigned CondBranches = |
846 | APInt(32, Switch->getNumSuccessors()).ceilLogBase2(); |
847 | assert(CondBranches > 0 && |
848 | "The threaded switch must have multiple branches" ); |
849 | DuplicationCost = Metrics.NumInsts / CondBranches; |
850 | } else { |
851 | // Compared with jump tables, the DFA optimizer removes an indirect branch |
852 | // on each loop iteration, thus making branch prediction more precise. The |
853 | // more branch targets there are, the more likely it is for the branch |
854 | // predictor to make a mistake, and the more benefit there is in the DFA |
855 | // optimizer. Thus, the more branch targets there are, the lower is the |
856 | // cost of the DFA opt. |
857 | DuplicationCost = Metrics.NumInsts / JumpTableSize; |
858 | } |
859 | |
860 | LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block " |
861 | << SwitchPaths->getSwitchBlock()->getName() |
862 | << " is: " << DuplicationCost << "\n\n" ); |
863 | |
864 | if (DuplicationCost > CostThreshold) { |
865 | LLVM_DEBUG(dbgs() << "Not jump threading, duplication cost exceeds the " |
866 | << "cost threshold.\n" ); |
867 | ORE->emit(RemarkBuilder: [&]() { |
868 | return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable" , Switch) |
869 | << "Duplication cost exceeds the cost threshold (cost=" |
870 | << ore::NV("Cost" , DuplicationCost) |
871 | << ", threshold=" << ore::NV("Threshold" , CostThreshold) << ")." ; |
872 | }); |
873 | return false; |
874 | } |
875 | |
876 | ORE->emit(RemarkBuilder: [&]() { |
877 | return OptimizationRemark(DEBUG_TYPE, "JumpThreaded" , Switch) |
878 | << "Switch statement jump-threaded." ; |
879 | }); |
880 | |
881 | return true; |
882 | } |
883 | |
884 | /// Transform each threading path to effectively jump thread the DFA. |
885 | void createAllExitPaths() { |
886 | DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Eager); |
887 | |
888 | // Move the switch block to the end of the path, since it will be duplicated |
889 | BasicBlock *SwitchBlock = SwitchPaths->getSwitchBlock(); |
890 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
891 | LLVM_DEBUG(dbgs() << TPath << "\n" ); |
892 | PathType NewPath(TPath.getPath()); |
893 | NewPath.push_back(x: SwitchBlock); |
894 | TPath.setPath(NewPath); |
895 | } |
896 | |
897 | // Transform the ThreadingPaths and keep track of the cloned values |
898 | DuplicateBlockMap DuplicateMap; |
899 | DefMap NewDefs; |
900 | |
901 | SmallSet<BasicBlock *, 16> BlocksToClean; |
902 | for (BasicBlock *BB : successors(BB: SwitchBlock)) |
903 | BlocksToClean.insert(Ptr: BB); |
904 | |
905 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
906 | createExitPath(NewDefs, Path&: TPath, DuplicateMap, BlocksToClean, DTU: &DTU); |
907 | NumPaths++; |
908 | } |
909 | |
910 | // After all paths are cloned, now update the last successor of the cloned |
911 | // path so it skips over the switch statement |
912 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) |
913 | updateLastSuccessor(TPath, DuplicateMap, DTU: &DTU); |
914 | |
915 | // For each instruction that was cloned and used outside, update its uses |
916 | updateSSA(NewDefs); |
917 | |
918 | // Clean PHI Nodes for the newly created blocks |
919 | for (BasicBlock *BB : BlocksToClean) |
920 | cleanPhiNodes(BB); |
921 | } |
922 | |
923 | /// For a specific ThreadingPath \p Path, create an exit path starting from |
924 | /// the determinator block. |
925 | /// |
926 | /// To remember the correct destination, we have to duplicate blocks |
927 | /// corresponding to each state. Also update the terminating instruction of |
928 | /// the predecessors, and phis in the successor blocks. |
929 | void createExitPath(DefMap &NewDefs, ThreadingPath &Path, |
930 | DuplicateBlockMap &DuplicateMap, |
931 | SmallSet<BasicBlock *, 16> &BlocksToClean, |
932 | DomTreeUpdater *DTU) { |
933 | APInt NextState = Path.getExitValue(); |
934 | const BasicBlock *Determinator = Path.getDeterminatorBB(); |
935 | PathType PathBBs = Path.getPath(); |
936 | |
937 | // Don't select the placeholder block in front |
938 | if (PathBBs.front() == Determinator) |
939 | PathBBs.pop_front(); |
940 | |
941 | auto DetIt = llvm::find(Range&: PathBBs, Val: Determinator); |
942 | // When there is only one BB in PathBBs, the determinator takes itself as a |
943 | // direct predecessor. |
944 | BasicBlock *PrevBB = PathBBs.size() == 1 ? *DetIt : *std::prev(x: DetIt); |
945 | for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) { |
946 | BasicBlock *BB = *BBIt; |
947 | BlocksToClean.insert(Ptr: BB); |
948 | |
949 | // We already cloned BB for this NextState, now just update the branch |
950 | // and continue. |
951 | BasicBlock *NextBB = getClonedBB(BB, NextState, DuplicateMap); |
952 | if (NextBB) { |
953 | updatePredecessor(PrevBB, OldBB: BB, NewBB: NextBB, DTU); |
954 | PrevBB = NextBB; |
955 | continue; |
956 | } |
957 | |
958 | // Clone the BB and update the successor of Prev to jump to the new block |
959 | BasicBlock *NewBB = cloneBlockAndUpdatePredecessor( |
960 | BB, PrevBB, NextState, DuplicateMap, NewDefs, DTU); |
961 | DuplicateMap[BB].push_back(x: {.BB: NewBB, .State: NextState}); |
962 | BlocksToClean.insert(Ptr: NewBB); |
963 | PrevBB = NewBB; |
964 | } |
965 | } |
966 | |
967 | /// Restore SSA form after cloning blocks. |
968 | /// |
969 | /// Each cloned block creates new defs for a variable, and the uses need to be |
970 | /// updated to reflect this. The uses may be replaced with a cloned value, or |
971 | /// some derived phi instruction. Note that all uses of a value defined in the |
972 | /// same block were already remapped when cloning the block. |
973 | void updateSSA(DefMap &NewDefs) { |
974 | SSAUpdaterBulk SSAUpdate; |
975 | SmallVector<Use *, 16> UsesToRename; |
976 | |
977 | for (const auto &KV : NewDefs) { |
978 | Instruction *I = KV.first; |
979 | BasicBlock *BB = I->getParent(); |
980 | std::vector<Instruction *> Cloned = KV.second; |
981 | |
982 | // Scan all uses of this instruction to see if it is used outside of its |
983 | // block, and if so, record them in UsesToRename. |
984 | for (Use &U : I->uses()) { |
985 | Instruction *User = cast<Instruction>(Val: U.getUser()); |
986 | if (PHINode *UserPN = dyn_cast<PHINode>(Val: User)) { |
987 | if (UserPN->getIncomingBlock(U) == BB) |
988 | continue; |
989 | } else if (User->getParent() == BB) { |
990 | continue; |
991 | } |
992 | |
993 | UsesToRename.push_back(Elt: &U); |
994 | } |
995 | |
996 | // If there are no uses outside the block, we're done with this |
997 | // instruction. |
998 | if (UsesToRename.empty()) |
999 | continue; |
1000 | LLVM_DEBUG(dbgs() << "DFA-JT: Renaming non-local uses of: " << *I |
1001 | << "\n" ); |
1002 | |
1003 | // We found a use of I outside of BB. Rename all uses of I that are |
1004 | // outside its block to be uses of the appropriate PHI node etc. See |
1005 | // ValuesInBlocks with the values we know. |
1006 | unsigned VarNum = SSAUpdate.AddVariable(Name: I->getName(), Ty: I->getType()); |
1007 | SSAUpdate.AddAvailableValue(Var: VarNum, BB, V: I); |
1008 | for (Instruction *New : Cloned) |
1009 | SSAUpdate.AddAvailableValue(Var: VarNum, BB: New->getParent(), V: New); |
1010 | |
1011 | while (!UsesToRename.empty()) |
1012 | SSAUpdate.AddUse(Var: VarNum, U: UsesToRename.pop_back_val()); |
1013 | |
1014 | LLVM_DEBUG(dbgs() << "\n" ); |
1015 | } |
1016 | // SSAUpdater handles phi placement and renaming uses with the appropriate |
1017 | // value. |
1018 | SSAUpdate.RewriteAllUses(DT); |
1019 | } |
1020 | |
1021 | /// Clones a basic block, and adds it to the CFG. |
1022 | /// |
1023 | /// This function also includes updating phi nodes in the successors of the |
1024 | /// BB, and remapping uses that were defined locally in the cloned BB. |
1025 | BasicBlock *cloneBlockAndUpdatePredecessor(BasicBlock *BB, BasicBlock *PrevBB, |
1026 | const APInt &NextState, |
1027 | DuplicateBlockMap &DuplicateMap, |
1028 | DefMap &NewDefs, |
1029 | DomTreeUpdater *DTU) { |
1030 | ValueToValueMapTy VMap; |
1031 | BasicBlock *NewBB = CloneBasicBlock( |
1032 | BB, VMap, NameSuffix: ".jt" + std::to_string(val: NextState.getLimitedValue()), |
1033 | F: BB->getParent()); |
1034 | NewBB->moveAfter(MovePos: BB); |
1035 | NumCloned++; |
1036 | |
1037 | for (Instruction &I : *NewBB) { |
1038 | // Do not remap operands of PHINode in case a definition in BB is an |
1039 | // incoming value to a phi in the same block. This incoming value will |
1040 | // be renamed later while restoring SSA. |
1041 | if (isa<PHINode>(Val: &I)) |
1042 | continue; |
1043 | RemapInstruction(I: &I, VM&: VMap, |
1044 | Flags: RF_IgnoreMissingLocals | RF_NoModuleLevelChanges); |
1045 | if (AssumeInst *II = dyn_cast<AssumeInst>(Val: &I)) |
1046 | AC->registerAssumption(CI: II); |
1047 | } |
1048 | |
1049 | updateSuccessorPhis(BB, ClonedBB: NewBB, NextState, VMap, DuplicateMap); |
1050 | updatePredecessor(PrevBB, OldBB: BB, NewBB, DTU); |
1051 | updateDefMap(NewDefs, VMap); |
1052 | |
1053 | // Add all successors to the DominatorTree |
1054 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
1055 | for (auto *SuccBB : successors(BB: NewBB)) { |
1056 | if (SuccSet.insert(Ptr: SuccBB).second) |
1057 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, NewBB, SuccBB}}); |
1058 | } |
1059 | SuccSet.clear(); |
1060 | return NewBB; |
1061 | } |
1062 | |
1063 | /// Update the phi nodes in BB's successors. |
1064 | /// |
1065 | /// This means creating a new incoming value from NewBB with the new |
1066 | /// instruction wherever there is an incoming value from BB. |
1067 | void updateSuccessorPhis(BasicBlock *BB, BasicBlock *ClonedBB, |
1068 | const APInt &NextState, ValueToValueMapTy &VMap, |
1069 | DuplicateBlockMap &DuplicateMap) { |
1070 | std::vector<BasicBlock *> BlocksToUpdate; |
1071 | |
1072 | // If BB is the last block in the path, we can simply update the one case |
1073 | // successor that will be reached. |
1074 | if (BB == SwitchPaths->getSwitchBlock()) { |
1075 | SwitchInst *Switch = SwitchPaths->getSwitchInst(); |
1076 | BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState); |
1077 | BlocksToUpdate.push_back(x: NextCase); |
1078 | BasicBlock *ClonedSucc = getClonedBB(BB: NextCase, NextState, DuplicateMap); |
1079 | if (ClonedSucc) |
1080 | BlocksToUpdate.push_back(x: ClonedSucc); |
1081 | } |
1082 | // Otherwise update phis in all successors. |
1083 | else { |
1084 | for (BasicBlock *Succ : successors(BB)) { |
1085 | BlocksToUpdate.push_back(x: Succ); |
1086 | |
1087 | // Check if a successor has already been cloned for the particular exit |
1088 | // value. In this case if a successor was already cloned, the phi nodes |
1089 | // in the cloned block should be updated directly. |
1090 | BasicBlock *ClonedSucc = getClonedBB(BB: Succ, NextState, DuplicateMap); |
1091 | if (ClonedSucc) |
1092 | BlocksToUpdate.push_back(x: ClonedSucc); |
1093 | } |
1094 | } |
1095 | |
1096 | // If there is a phi with an incoming value from BB, create a new incoming |
1097 | // value for the new predecessor ClonedBB. The value will either be the same |
1098 | // value from BB or a cloned value. |
1099 | for (BasicBlock *Succ : BlocksToUpdate) { |
1100 | for (auto II = Succ->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); |
1101 | ++II) { |
1102 | Value *Incoming = Phi->getIncomingValueForBlock(BB); |
1103 | if (Incoming) { |
1104 | if (isa<Constant>(Val: Incoming)) { |
1105 | Phi->addIncoming(V: Incoming, BB: ClonedBB); |
1106 | continue; |
1107 | } |
1108 | Value *ClonedVal = VMap[Incoming]; |
1109 | if (ClonedVal) |
1110 | Phi->addIncoming(V: ClonedVal, BB: ClonedBB); |
1111 | else |
1112 | Phi->addIncoming(V: Incoming, BB: ClonedBB); |
1113 | } |
1114 | } |
1115 | } |
1116 | } |
1117 | |
1118 | /// Sets the successor of PrevBB to be NewBB instead of OldBB. Note that all |
1119 | /// other successors are kept as well. |
1120 | void updatePredecessor(BasicBlock *PrevBB, BasicBlock *OldBB, |
1121 | BasicBlock *NewBB, DomTreeUpdater *DTU) { |
1122 | // When a path is reused, there is a chance that predecessors were already |
1123 | // updated before. Check if the predecessor needs to be updated first. |
1124 | if (!isPredecessor(BB: OldBB, IncomingBB: PrevBB)) |
1125 | return; |
1126 | |
1127 | Instruction *PrevTerm = PrevBB->getTerminator(); |
1128 | for (unsigned Idx = 0; Idx < PrevTerm->getNumSuccessors(); Idx++) { |
1129 | if (PrevTerm->getSuccessor(Idx) == OldBB) { |
1130 | OldBB->removePredecessor(Pred: PrevBB, /* KeepOneInputPHIs = */ true); |
1131 | PrevTerm->setSuccessor(Idx, BB: NewBB); |
1132 | } |
1133 | } |
1134 | DTU->applyUpdates(Updates: {{DominatorTree::Delete, PrevBB, OldBB}, |
1135 | {DominatorTree::Insert, PrevBB, NewBB}}); |
1136 | } |
1137 | |
1138 | /// Add new value mappings to the DefMap to keep track of all new definitions |
1139 | /// for a particular instruction. These will be used while updating SSA form. |
1140 | void updateDefMap(DefMap &NewDefs, ValueToValueMapTy &VMap) { |
1141 | SmallVector<std::pair<Instruction *, Instruction *>> NewDefsVector; |
1142 | NewDefsVector.reserve(N: VMap.size()); |
1143 | |
1144 | for (auto Entry : VMap) { |
1145 | Instruction *Inst = |
1146 | dyn_cast<Instruction>(Val: const_cast<Value *>(Entry.first)); |
1147 | if (!Inst || !Entry.second || isa<BranchInst>(Val: Inst) || |
1148 | isa<SwitchInst>(Val: Inst)) { |
1149 | continue; |
1150 | } |
1151 | |
1152 | Instruction *Cloned = dyn_cast<Instruction>(Val&: Entry.second); |
1153 | if (!Cloned) |
1154 | continue; |
1155 | |
1156 | NewDefsVector.push_back(Elt: {Inst, Cloned}); |
1157 | } |
1158 | |
1159 | // Sort the defs to get deterministic insertion order into NewDefs. |
1160 | sort(C&: NewDefsVector, Comp: [](const auto &LHS, const auto &RHS) { |
1161 | if (LHS.first == RHS.first) |
1162 | return LHS.second->comesBefore(RHS.second); |
1163 | return LHS.first->comesBefore(RHS.first); |
1164 | }); |
1165 | |
1166 | for (const auto &KV : NewDefsVector) |
1167 | NewDefs[KV.first].push_back(x: KV.second); |
1168 | } |
1169 | |
1170 | /// Update the last branch of a particular cloned path to point to the correct |
1171 | /// case successor. |
1172 | /// |
1173 | /// Note that this is an optional step and would have been done in later |
1174 | /// optimizations, but it makes the CFG significantly easier to work with. |
1175 | void updateLastSuccessor(ThreadingPath &TPath, |
1176 | DuplicateBlockMap &DuplicateMap, |
1177 | DomTreeUpdater *DTU) { |
1178 | APInt NextState = TPath.getExitValue(); |
1179 | BasicBlock *BB = TPath.getPath().back(); |
1180 | BasicBlock *LastBlock = getClonedBB(BB, NextState, DuplicateMap); |
1181 | |
1182 | // Note multiple paths can end at the same block so check that it is not |
1183 | // updated yet |
1184 | if (!isa<SwitchInst>(Val: LastBlock->getTerminator())) |
1185 | return; |
1186 | SwitchInst *Switch = cast<SwitchInst>(Val: LastBlock->getTerminator()); |
1187 | BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState); |
1188 | |
1189 | std::vector<DominatorTree::UpdateType> DTUpdates; |
1190 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
1191 | for (BasicBlock *Succ : successors(BB: LastBlock)) { |
1192 | if (Succ != NextCase && SuccSet.insert(Ptr: Succ).second) |
1193 | DTUpdates.push_back(x: {DominatorTree::Delete, LastBlock, Succ}); |
1194 | } |
1195 | |
1196 | Switch->eraseFromParent(); |
1197 | BranchInst::Create(IfTrue: NextCase, InsertAtEnd: LastBlock); |
1198 | |
1199 | DTU->applyUpdates(Updates: DTUpdates); |
1200 | } |
1201 | |
1202 | /// After cloning blocks, some of the phi nodes have extra incoming values |
1203 | /// that are no longer used. This function removes them. |
1204 | void cleanPhiNodes(BasicBlock *BB) { |
1205 | // If BB is no longer reachable, remove any remaining phi nodes |
1206 | if (pred_empty(BB)) { |
1207 | std::vector<PHINode *> PhiToRemove; |
1208 | for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); ++II) { |
1209 | PhiToRemove.push_back(x: Phi); |
1210 | } |
1211 | for (PHINode *PN : PhiToRemove) { |
1212 | PN->replaceAllUsesWith(V: PoisonValue::get(T: PN->getType())); |
1213 | PN->eraseFromParent(); |
1214 | } |
1215 | return; |
1216 | } |
1217 | |
1218 | // Remove any incoming values that come from an invalid predecessor |
1219 | for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); ++II) { |
1220 | std::vector<BasicBlock *> BlocksToRemove; |
1221 | for (BasicBlock *IncomingBB : Phi->blocks()) { |
1222 | if (!isPredecessor(BB, IncomingBB)) |
1223 | BlocksToRemove.push_back(x: IncomingBB); |
1224 | } |
1225 | for (BasicBlock *BB : BlocksToRemove) |
1226 | Phi->removeIncomingValue(BB); |
1227 | } |
1228 | } |
1229 | |
1230 | /// Checks if BB was already cloned for a particular next state value. If it |
1231 | /// was then it returns this cloned block, and otherwise null. |
1232 | BasicBlock *getClonedBB(BasicBlock *BB, const APInt &NextState, |
1233 | DuplicateBlockMap &DuplicateMap) { |
1234 | CloneList ClonedBBs = DuplicateMap[BB]; |
1235 | |
1236 | // Find an entry in the CloneList with this NextState. If it exists then |
1237 | // return the corresponding BB |
1238 | auto It = llvm::find_if(Range&: ClonedBBs, P: [NextState](const ClonedBlock &C) { |
1239 | return C.State == NextState; |
1240 | }); |
1241 | return It != ClonedBBs.end() ? (*It).BB : nullptr; |
1242 | } |
1243 | |
1244 | /// Helper to get the successor corresponding to a particular case value for |
1245 | /// a switch statement. |
1246 | BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, const APInt &NextState) { |
1247 | BasicBlock *NextCase = nullptr; |
1248 | for (auto Case : Switch->cases()) { |
1249 | if (Case.getCaseValue()->getValue() == NextState) { |
1250 | NextCase = Case.getCaseSuccessor(); |
1251 | break; |
1252 | } |
1253 | } |
1254 | if (!NextCase) |
1255 | NextCase = Switch->getDefaultDest(); |
1256 | return NextCase; |
1257 | } |
1258 | |
1259 | /// Returns true if IncomingBB is a predecessor of BB. |
1260 | bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) { |
1261 | return llvm::is_contained(Range: predecessors(BB), Element: IncomingBB); |
1262 | } |
1263 | |
1264 | AllSwitchPaths *SwitchPaths; |
1265 | DominatorTree *DT; |
1266 | AssumptionCache *AC; |
1267 | TargetTransformInfo *TTI; |
1268 | OptimizationRemarkEmitter *ORE; |
1269 | SmallPtrSet<const Value *, 32> EphValues; |
1270 | std::vector<ThreadingPath> TPaths; |
1271 | }; |
1272 | |
1273 | bool DFAJumpThreading::run(Function &F) { |
1274 | LLVM_DEBUG(dbgs() << "\nDFA Jump threading: " << F.getName() << "\n" ); |
1275 | |
1276 | if (F.hasOptSize()) { |
1277 | LLVM_DEBUG(dbgs() << "Skipping due to the 'minsize' attribute\n" ); |
1278 | return false; |
1279 | } |
1280 | |
1281 | if (ClViewCfgBefore) |
1282 | F.viewCFG(); |
1283 | |
1284 | SmallVector<AllSwitchPaths, 2> ThreadableLoops; |
1285 | bool MadeChanges = false; |
1286 | |
1287 | for (BasicBlock &BB : F) { |
1288 | auto *SI = dyn_cast<SwitchInst>(Val: BB.getTerminator()); |
1289 | if (!SI) |
1290 | continue; |
1291 | |
1292 | LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName() |
1293 | << " is a candidate\n" ); |
1294 | MainSwitch Switch(SI, LI, ORE); |
1295 | |
1296 | if (!Switch.getInstr()) |
1297 | continue; |
1298 | |
1299 | LLVM_DEBUG(dbgs() << "\nSwitchInst in BB " << BB.getName() << " is a " |
1300 | << "candidate for jump threading\n" ); |
1301 | LLVM_DEBUG(SI->dump()); |
1302 | |
1303 | unfoldSelectInstrs(DT, SelectInsts: Switch.getSelectInsts()); |
1304 | if (!Switch.getSelectInsts().empty()) |
1305 | MadeChanges = true; |
1306 | |
1307 | AllSwitchPaths SwitchPaths(&Switch, ORE); |
1308 | SwitchPaths.run(); |
1309 | |
1310 | if (SwitchPaths.getNumThreadingPaths() > 0) { |
1311 | ThreadableLoops.push_back(Elt: SwitchPaths); |
1312 | |
1313 | // For the time being limit this optimization to occurring once in a |
1314 | // function since it can change the CFG significantly. This is not a |
1315 | // strict requirement but it can cause buggy behavior if there is an |
1316 | // overlap of blocks in different opportunities. There is a lot of room to |
1317 | // experiment with catching more opportunities here. |
1318 | break; |
1319 | } |
1320 | } |
1321 | |
1322 | SmallPtrSet<const Value *, 32> EphValues; |
1323 | if (ThreadableLoops.size() > 0) |
1324 | CodeMetrics::collectEphemeralValues(L: &F, AC, EphValues); |
1325 | |
1326 | for (AllSwitchPaths SwitchPaths : ThreadableLoops) { |
1327 | TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues); |
1328 | Transform.run(); |
1329 | MadeChanges = true; |
1330 | } |
1331 | |
1332 | #ifdef EXPENSIVE_CHECKS |
1333 | assert(DT->verify(DominatorTree::VerificationLevel::Full)); |
1334 | verifyFunction(F, &dbgs()); |
1335 | #endif |
1336 | |
1337 | return MadeChanges; |
1338 | } |
1339 | |
1340 | } // end anonymous namespace |
1341 | |
1342 | /// Integrate with the new Pass Manager |
1343 | PreservedAnalyses DFAJumpThreadingPass::run(Function &F, |
1344 | FunctionAnalysisManager &AM) { |
1345 | AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
1346 | DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
1347 | LoopInfo &LI = AM.getResult<LoopAnalysis>(IR&: F); |
1348 | TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
1349 | OptimizationRemarkEmitter ORE(&F); |
1350 | |
1351 | if (!DFAJumpThreading(&AC, &DT, &LI, &TTI, &ORE).run(F)) |
1352 | return PreservedAnalyses::all(); |
1353 | |
1354 | PreservedAnalyses PA; |
1355 | PA.preserve<DominatorTreeAnalysis>(); |
1356 | return PA; |
1357 | } |
1358 | |