1 | //===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass performs various transformations related to eliminating memcpy |
10 | // calls, or transforming sets of stores into memset's. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" |
15 | #include "llvm/ADT/DenseSet.h" |
16 | #include "llvm/ADT/STLExtras.h" |
17 | #include "llvm/ADT/SmallVector.h" |
18 | #include "llvm/ADT/Statistic.h" |
19 | #include "llvm/ADT/iterator_range.h" |
20 | #include "llvm/Analysis/AliasAnalysis.h" |
21 | #include "llvm/Analysis/AssumptionCache.h" |
22 | #include "llvm/Analysis/CFG.h" |
23 | #include "llvm/Analysis/CaptureTracking.h" |
24 | #include "llvm/Analysis/GlobalsModRef.h" |
25 | #include "llvm/Analysis/InstructionSimplify.h" |
26 | #include "llvm/Analysis/Loads.h" |
27 | #include "llvm/Analysis/MemoryLocation.h" |
28 | #include "llvm/Analysis/MemorySSA.h" |
29 | #include "llvm/Analysis/MemorySSAUpdater.h" |
30 | #include "llvm/Analysis/PostDominators.h" |
31 | #include "llvm/Analysis/TargetLibraryInfo.h" |
32 | #include "llvm/Analysis/ValueTracking.h" |
33 | #include "llvm/IR/BasicBlock.h" |
34 | #include "llvm/IR/Constants.h" |
35 | #include "llvm/IR/DataLayout.h" |
36 | #include "llvm/IR/DerivedTypes.h" |
37 | #include "llvm/IR/Dominators.h" |
38 | #include "llvm/IR/Function.h" |
39 | #include "llvm/IR/GlobalVariable.h" |
40 | #include "llvm/IR/IRBuilder.h" |
41 | #include "llvm/IR/InstrTypes.h" |
42 | #include "llvm/IR/Instruction.h" |
43 | #include "llvm/IR/Instructions.h" |
44 | #include "llvm/IR/IntrinsicInst.h" |
45 | #include "llvm/IR/Intrinsics.h" |
46 | #include "llvm/IR/LLVMContext.h" |
47 | #include "llvm/IR/Module.h" |
48 | #include "llvm/IR/PassManager.h" |
49 | #include "llvm/IR/Type.h" |
50 | #include "llvm/IR/User.h" |
51 | #include "llvm/IR/Value.h" |
52 | #include "llvm/Support/Casting.h" |
53 | #include "llvm/Support/Debug.h" |
54 | #include "llvm/Support/MathExtras.h" |
55 | #include "llvm/Support/raw_ostream.h" |
56 | #include "llvm/Transforms/Utils/Local.h" |
57 | #include <algorithm> |
58 | #include <cassert> |
59 | #include <cstdint> |
60 | #include <optional> |
61 | |
62 | using namespace llvm; |
63 | |
64 | #define DEBUG_TYPE "memcpyopt" |
65 | |
66 | static cl::opt<bool> EnableMemCpyOptWithoutLibcalls( |
67 | "enable-memcpyopt-without-libcalls" , cl::Hidden, |
68 | cl::desc("Enable memcpyopt even when libcalls are disabled" )); |
69 | |
70 | STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted" ); |
71 | STATISTIC(NumMemSetInfer, "Number of memsets inferred" ); |
72 | STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy" ); |
73 | STATISTIC(NumCpyToSet, "Number of memcpys converted to memset" ); |
74 | STATISTIC(NumCallSlot, "Number of call slot optimizations performed" ); |
75 | STATISTIC(NumStackMove, "Number of stack-move optimizations performed" ); |
76 | |
77 | namespace { |
78 | |
79 | /// Represents a range of memset'd bytes with the ByteVal value. |
80 | /// This allows us to analyze stores like: |
81 | /// store 0 -> P+1 |
82 | /// store 0 -> P+0 |
83 | /// store 0 -> P+3 |
84 | /// store 0 -> P+2 |
85 | /// which sometimes happens with stores to arrays of structs etc. When we see |
86 | /// the first store, we make a range [1, 2). The second store extends the range |
87 | /// to [0, 2). The third makes a new range [2, 3). The fourth store joins the |
88 | /// two ranges into [0, 3) which is memset'able. |
89 | struct MemsetRange { |
90 | // Start/End - A semi range that describes the span that this range covers. |
91 | // The range is closed at the start and open at the end: [Start, End). |
92 | int64_t Start, End; |
93 | |
94 | /// StartPtr - The getelementptr instruction that points to the start of the |
95 | /// range. |
96 | Value *StartPtr; |
97 | |
98 | /// Alignment - The known alignment of the first store. |
99 | MaybeAlign Alignment; |
100 | |
101 | /// TheStores - The actual stores that make up this range. |
102 | SmallVector<Instruction*, 16> TheStores; |
103 | |
104 | bool isProfitableToUseMemset(const DataLayout &DL) const; |
105 | }; |
106 | |
107 | } // end anonymous namespace |
108 | |
109 | bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { |
110 | // If we found more than 4 stores to merge or 16 bytes, use memset. |
111 | if (TheStores.size() >= 4 || End-Start >= 16) return true; |
112 | |
113 | // If there is nothing to merge, don't do anything. |
114 | if (TheStores.size() < 2) return false; |
115 | |
116 | // If any of the stores are a memset, then it is always good to extend the |
117 | // memset. |
118 | for (Instruction *SI : TheStores) |
119 | if (!isa<StoreInst>(Val: SI)) |
120 | return true; |
121 | |
122 | // Assume that the code generator is capable of merging pairs of stores |
123 | // together if it wants to. |
124 | if (TheStores.size() == 2) return false; |
125 | |
126 | // If we have fewer than 8 stores, it can still be worthwhile to do this. |
127 | // For example, merging 4 i8 stores into an i32 store is useful almost always. |
128 | // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the |
129 | // memset will be split into 2 32-bit stores anyway) and doing so can |
130 | // pessimize the llvm optimizer. |
131 | // |
132 | // Since we don't have perfect knowledge here, make some assumptions: assume |
133 | // the maximum GPR width is the same size as the largest legal integer |
134 | // size. If so, check to see whether we will end up actually reducing the |
135 | // number of stores used. |
136 | unsigned Bytes = unsigned(End-Start); |
137 | unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8; |
138 | if (MaxIntSize == 0) |
139 | MaxIntSize = 1; |
140 | unsigned NumPointerStores = Bytes / MaxIntSize; |
141 | |
142 | // Assume the remaining bytes if any are done a byte at a time. |
143 | unsigned NumByteStores = Bytes % MaxIntSize; |
144 | |
145 | // If we will reduce the # stores (according to this heuristic), do the |
146 | // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 |
147 | // etc. |
148 | return TheStores.size() > NumPointerStores+NumByteStores; |
149 | } |
150 | |
151 | namespace { |
152 | |
153 | class MemsetRanges { |
154 | using range_iterator = SmallVectorImpl<MemsetRange>::iterator; |
155 | |
156 | /// A sorted list of the memset ranges. |
157 | SmallVector<MemsetRange, 8> Ranges; |
158 | |
159 | const DataLayout &DL; |
160 | |
161 | public: |
162 | MemsetRanges(const DataLayout &DL) : DL(DL) {} |
163 | |
164 | using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator; |
165 | |
166 | const_iterator begin() const { return Ranges.begin(); } |
167 | const_iterator end() const { return Ranges.end(); } |
168 | bool empty() const { return Ranges.empty(); } |
169 | |
170 | void addInst(int64_t OffsetFromFirst, Instruction *Inst) { |
171 | if (auto *SI = dyn_cast<StoreInst>(Val: Inst)) |
172 | addStore(OffsetFromFirst, SI); |
173 | else |
174 | addMemSet(OffsetFromFirst, MSI: cast<MemSetInst>(Val: Inst)); |
175 | } |
176 | |
177 | void addStore(int64_t OffsetFromFirst, StoreInst *SI) { |
178 | TypeSize StoreSize = DL.getTypeStoreSize(Ty: SI->getOperand(i_nocapture: 0)->getType()); |
179 | assert(!StoreSize.isScalable() && "Can't track scalable-typed stores" ); |
180 | addRange(Start: OffsetFromFirst, Size: StoreSize.getFixedValue(), |
181 | Ptr: SI->getPointerOperand(), Alignment: SI->getAlign(), Inst: SI); |
182 | } |
183 | |
184 | void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) { |
185 | int64_t Size = cast<ConstantInt>(Val: MSI->getLength())->getZExtValue(); |
186 | addRange(Start: OffsetFromFirst, Size, Ptr: MSI->getDest(), Alignment: MSI->getDestAlign(), Inst: MSI); |
187 | } |
188 | |
189 | void addRange(int64_t Start, int64_t Size, Value *Ptr, MaybeAlign Alignment, |
190 | Instruction *Inst); |
191 | }; |
192 | |
193 | } // end anonymous namespace |
194 | |
195 | /// Add a new store to the MemsetRanges data structure. This adds a |
196 | /// new range for the specified store at the specified offset, merging into |
197 | /// existing ranges as appropriate. |
198 | void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, |
199 | MaybeAlign Alignment, Instruction *Inst) { |
200 | int64_t End = Start+Size; |
201 | |
202 | range_iterator I = partition_point( |
203 | Range&: Ranges, P: [=](const MemsetRange &O) { return O.End < Start; }); |
204 | |
205 | // We now know that I == E, in which case we didn't find anything to merge |
206 | // with, or that Start <= I->End. If End < I->Start or I == E, then we need |
207 | // to insert a new range. Handle this now. |
208 | if (I == Ranges.end() || End < I->Start) { |
209 | MemsetRange &R = *Ranges.insert(I, Elt: MemsetRange()); |
210 | R.Start = Start; |
211 | R.End = End; |
212 | R.StartPtr = Ptr; |
213 | R.Alignment = Alignment; |
214 | R.TheStores.push_back(Elt: Inst); |
215 | return; |
216 | } |
217 | |
218 | // This store overlaps with I, add it. |
219 | I->TheStores.push_back(Elt: Inst); |
220 | |
221 | // At this point, we may have an interval that completely contains our store. |
222 | // If so, just add it to the interval and return. |
223 | if (I->Start <= Start && I->End >= End) |
224 | return; |
225 | |
226 | // Now we know that Start <= I->End and End >= I->Start so the range overlaps |
227 | // but is not entirely contained within the range. |
228 | |
229 | // See if the range extends the start of the range. In this case, it couldn't |
230 | // possibly cause it to join the prior range, because otherwise we would have |
231 | // stopped on *it*. |
232 | if (Start < I->Start) { |
233 | I->Start = Start; |
234 | I->StartPtr = Ptr; |
235 | I->Alignment = Alignment; |
236 | } |
237 | |
238 | // Now we know that Start <= I->End and Start >= I->Start (so the startpoint |
239 | // is in or right at the end of I), and that End >= I->Start. Extend I out to |
240 | // End. |
241 | if (End > I->End) { |
242 | I->End = End; |
243 | range_iterator NextI = I; |
244 | while (++NextI != Ranges.end() && End >= NextI->Start) { |
245 | // Merge the range in. |
246 | I->TheStores.append(in_start: NextI->TheStores.begin(), in_end: NextI->TheStores.end()); |
247 | if (NextI->End > I->End) |
248 | I->End = NextI->End; |
249 | Ranges.erase(CI: NextI); |
250 | NextI = I; |
251 | } |
252 | } |
253 | } |
254 | |
255 | //===----------------------------------------------------------------------===// |
256 | // MemCpyOptLegacyPass Pass |
257 | //===----------------------------------------------------------------------===// |
258 | |
259 | // Check that V is either not accessible by the caller, or unwinding cannot |
260 | // occur between Start and End. |
261 | static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start, |
262 | Instruction *End) { |
263 | assert(Start->getParent() == End->getParent() && "Must be in same block" ); |
264 | // Function can't unwind, so it also can't be visible through unwinding. |
265 | if (Start->getFunction()->doesNotThrow()) |
266 | return false; |
267 | |
268 | // Object is not visible on unwind. |
269 | // TODO: Support RequiresNoCaptureBeforeUnwind case. |
270 | bool RequiresNoCaptureBeforeUnwind; |
271 | if (isNotVisibleOnUnwind(Object: getUnderlyingObject(V), |
272 | RequiresNoCaptureBeforeUnwind) && |
273 | !RequiresNoCaptureBeforeUnwind) |
274 | return false; |
275 | |
276 | // Check whether there are any unwinding instructions in the range. |
277 | return any_of(Range: make_range(x: Start->getIterator(), y: End->getIterator()), |
278 | P: [](const Instruction &I) { return I.mayThrow(); }); |
279 | } |
280 | |
281 | void MemCpyOptPass::eraseInstruction(Instruction *I) { |
282 | MSSAU->removeMemoryAccess(I); |
283 | I->eraseFromParent(); |
284 | } |
285 | |
286 | // Check for mod or ref of Loc between Start and End, excluding both boundaries. |
287 | // Start and End must be in the same block. |
288 | // If SkippedLifetimeStart is provided, skip over one clobbering lifetime.start |
289 | // intrinsic and store it inside SkippedLifetimeStart. |
290 | static bool accessedBetween(BatchAAResults &AA, MemoryLocation Loc, |
291 | const MemoryUseOrDef *Start, |
292 | const MemoryUseOrDef *End, |
293 | Instruction **SkippedLifetimeStart = nullptr) { |
294 | assert(Start->getBlock() == End->getBlock() && "Only local supported" ); |
295 | for (const MemoryAccess &MA : |
296 | make_range(x: ++Start->getIterator(), y: End->getIterator())) { |
297 | Instruction *I = cast<MemoryUseOrDef>(Val: MA).getMemoryInst(); |
298 | if (isModOrRefSet(MRI: AA.getModRefInfo(I, OptLoc: Loc))) { |
299 | auto *II = dyn_cast<IntrinsicInst>(Val: I); |
300 | if (II && II->getIntrinsicID() == Intrinsic::lifetime_start && |
301 | SkippedLifetimeStart && !*SkippedLifetimeStart) { |
302 | *SkippedLifetimeStart = I; |
303 | continue; |
304 | } |
305 | |
306 | return true; |
307 | } |
308 | } |
309 | return false; |
310 | } |
311 | |
312 | // Check for mod of Loc between Start and End, excluding both boundaries. |
313 | // Start and End can be in different blocks. |
314 | static bool writtenBetween(MemorySSA *MSSA, BatchAAResults &AA, |
315 | MemoryLocation Loc, const MemoryUseOrDef *Start, |
316 | const MemoryUseOrDef *End) { |
317 | if (isa<MemoryUse>(Val: End)) { |
318 | // For MemoryUses, getClobberingMemoryAccess may skip non-clobbering writes. |
319 | // Manually check read accesses between Start and End, if they are in the |
320 | // same block, for clobbers. Otherwise assume Loc is clobbered. |
321 | return Start->getBlock() != End->getBlock() || |
322 | any_of( |
323 | Range: make_range(x: std::next(x: Start->getIterator()), y: End->getIterator()), |
324 | P: [&AA, Loc](const MemoryAccess &Acc) { |
325 | if (isa<MemoryUse>(Val: &Acc)) |
326 | return false; |
327 | Instruction *AccInst = |
328 | cast<MemoryUseOrDef>(Val: &Acc)->getMemoryInst(); |
329 | return isModSet(MRI: AA.getModRefInfo(I: AccInst, OptLoc: Loc)); |
330 | }); |
331 | } |
332 | |
333 | // TODO: Only walk until we hit Start. |
334 | MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( |
335 | End->getDefiningAccess(), Loc, AA); |
336 | return !MSSA->dominates(A: Clobber, B: Start); |
337 | } |
338 | |
339 | // Update AA metadata |
340 | static void combineAAMetadata(Instruction *ReplInst, Instruction *I) { |
341 | // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be |
342 | // handled here, but combineMetadata doesn't support them yet |
343 | unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, |
344 | LLVMContext::MD_noalias, |
345 | LLVMContext::MD_invariant_group, |
346 | LLVMContext::MD_access_group}; |
347 | combineMetadata(K: ReplInst, J: I, KnownIDs, DoesKMove: true); |
348 | } |
349 | |
350 | /// When scanning forward over instructions, we look for some other patterns to |
351 | /// fold away. In particular, this looks for stores to neighboring locations of |
352 | /// memory. If it sees enough consecutive ones, it attempts to merge them |
353 | /// together into a memcpy/memset. |
354 | Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, |
355 | Value *StartPtr, |
356 | Value *ByteVal) { |
357 | const DataLayout &DL = StartInst->getModule()->getDataLayout(); |
358 | |
359 | // We can't track scalable types |
360 | if (auto *SI = dyn_cast<StoreInst>(Val: StartInst)) |
361 | if (DL.getTypeStoreSize(Ty: SI->getOperand(i_nocapture: 0)->getType()).isScalable()) |
362 | return nullptr; |
363 | |
364 | // Okay, so we now have a single store that can be splatable. Scan to find |
365 | // all subsequent stores of the same value to offset from the same pointer. |
366 | // Join these together into ranges, so we can decide whether contiguous blocks |
367 | // are stored. |
368 | MemsetRanges Ranges(DL); |
369 | |
370 | BasicBlock::iterator BI(StartInst); |
371 | |
372 | // Keeps track of the last memory use or def before the insertion point for |
373 | // the new memset. The new MemoryDef for the inserted memsets will be inserted |
374 | // after MemInsertPoint. |
375 | MemoryUseOrDef *MemInsertPoint = nullptr; |
376 | for (++BI; !BI->isTerminator(); ++BI) { |
377 | auto *CurrentAcc = cast_or_null<MemoryUseOrDef>( |
378 | Val: MSSAU->getMemorySSA()->getMemoryAccess(I: &*BI)); |
379 | if (CurrentAcc) |
380 | MemInsertPoint = CurrentAcc; |
381 | |
382 | // Calls that only access inaccessible memory do not block merging |
383 | // accessible stores. |
384 | if (auto *CB = dyn_cast<CallBase>(Val&: BI)) { |
385 | if (CB->onlyAccessesInaccessibleMemory()) |
386 | continue; |
387 | } |
388 | |
389 | if (!isa<StoreInst>(Val: BI) && !isa<MemSetInst>(Val: BI)) { |
390 | // If the instruction is readnone, ignore it, otherwise bail out. We |
391 | // don't even allow readonly here because we don't want something like: |
392 | // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). |
393 | if (BI->mayWriteToMemory() || BI->mayReadFromMemory()) |
394 | break; |
395 | continue; |
396 | } |
397 | |
398 | if (auto *NextStore = dyn_cast<StoreInst>(Val&: BI)) { |
399 | // If this is a store, see if we can merge it in. |
400 | if (!NextStore->isSimple()) break; |
401 | |
402 | Value *StoredVal = NextStore->getValueOperand(); |
403 | |
404 | // Don't convert stores of non-integral pointer types to memsets (which |
405 | // stores integers). |
406 | if (DL.isNonIntegralPointerType(Ty: StoredVal->getType()->getScalarType())) |
407 | break; |
408 | |
409 | // We can't track ranges involving scalable types. |
410 | if (DL.getTypeStoreSize(Ty: StoredVal->getType()).isScalable()) |
411 | break; |
412 | |
413 | // Check to see if this stored value is of the same byte-splattable value. |
414 | Value *StoredByte = isBytewiseValue(V: StoredVal, DL); |
415 | if (isa<UndefValue>(Val: ByteVal) && StoredByte) |
416 | ByteVal = StoredByte; |
417 | if (ByteVal != StoredByte) |
418 | break; |
419 | |
420 | // Check to see if this store is to a constant offset from the start ptr. |
421 | std::optional<int64_t> Offset = |
422 | NextStore->getPointerOperand()->getPointerOffsetFrom(Other: StartPtr, DL); |
423 | if (!Offset) |
424 | break; |
425 | |
426 | Ranges.addStore(OffsetFromFirst: *Offset, SI: NextStore); |
427 | } else { |
428 | auto *MSI = cast<MemSetInst>(Val&: BI); |
429 | |
430 | if (MSI->isVolatile() || ByteVal != MSI->getValue() || |
431 | !isa<ConstantInt>(Val: MSI->getLength())) |
432 | break; |
433 | |
434 | // Check to see if this store is to a constant offset from the start ptr. |
435 | std::optional<int64_t> Offset = |
436 | MSI->getDest()->getPointerOffsetFrom(Other: StartPtr, DL); |
437 | if (!Offset) |
438 | break; |
439 | |
440 | Ranges.addMemSet(OffsetFromFirst: *Offset, MSI); |
441 | } |
442 | } |
443 | |
444 | // If we have no ranges, then we just had a single store with nothing that |
445 | // could be merged in. This is a very common case of course. |
446 | if (Ranges.empty()) |
447 | return nullptr; |
448 | |
449 | // If we had at least one store that could be merged in, add the starting |
450 | // store as well. We try to avoid this unless there is at least something |
451 | // interesting as a small compile-time optimization. |
452 | Ranges.addInst(OffsetFromFirst: 0, Inst: StartInst); |
453 | |
454 | // If we create any memsets, we put it right before the first instruction that |
455 | // isn't part of the memset block. This ensure that the memset is dominated |
456 | // by any addressing instruction needed by the start of the block. |
457 | IRBuilder<> Builder(&*BI); |
458 | |
459 | // Now that we have full information about ranges, loop over the ranges and |
460 | // emit memset's for anything big enough to be worthwhile. |
461 | Instruction *AMemSet = nullptr; |
462 | for (const MemsetRange &Range : Ranges) { |
463 | if (Range.TheStores.size() == 1) continue; |
464 | |
465 | // If it is profitable to lower this range to memset, do so now. |
466 | if (!Range.isProfitableToUseMemset(DL)) |
467 | continue; |
468 | |
469 | // Otherwise, we do want to transform this! Create a new memset. |
470 | // Get the starting pointer of the block. |
471 | StartPtr = Range.StartPtr; |
472 | |
473 | AMemSet = Builder.CreateMemSet(Ptr: StartPtr, Val: ByteVal, Size: Range.End - Range.Start, |
474 | Align: Range.Alignment); |
475 | AMemSet->mergeDIAssignID(SourceInstructions: Range.TheStores); |
476 | |
477 | LLVM_DEBUG(dbgs() << "Replace stores:\n" ; for (Instruction *SI |
478 | : Range.TheStores) dbgs() |
479 | << *SI << '\n'; |
480 | dbgs() << "With: " << *AMemSet << '\n'); |
481 | if (!Range.TheStores.empty()) |
482 | AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); |
483 | |
484 | auto *NewDef = |
485 | cast<MemoryDef>(Val: MemInsertPoint->getMemoryInst() == &*BI |
486 | ? MSSAU->createMemoryAccessBefore( |
487 | I: AMemSet, Definition: nullptr, InsertPt: MemInsertPoint) |
488 | : MSSAU->createMemoryAccessAfter( |
489 | I: AMemSet, Definition: nullptr, InsertPt: MemInsertPoint)); |
490 | MSSAU->insertDef(Def: NewDef, /*RenameUses=*/true); |
491 | MemInsertPoint = NewDef; |
492 | |
493 | // Zap all the stores. |
494 | for (Instruction *SI : Range.TheStores) |
495 | eraseInstruction(I: SI); |
496 | |
497 | ++NumMemSetInfer; |
498 | } |
499 | |
500 | return AMemSet; |
501 | } |
502 | |
503 | // This method try to lift a store instruction before position P. |
504 | // It will lift the store and its argument + that anything that |
505 | // may alias with these. |
506 | // The method returns true if it was successful. |
507 | bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) { |
508 | // If the store alias this position, early bail out. |
509 | MemoryLocation StoreLoc = MemoryLocation::get(SI); |
510 | if (isModOrRefSet(MRI: AA->getModRefInfo(I: P, OptLoc: StoreLoc))) |
511 | return false; |
512 | |
513 | // Keep track of the arguments of all instruction we plan to lift |
514 | // so we can make sure to lift them as well if appropriate. |
515 | DenseSet<Instruction*> Args; |
516 | auto AddArg = [&](Value *Arg) { |
517 | auto *I = dyn_cast<Instruction>(Val: Arg); |
518 | if (I && I->getParent() == SI->getParent()) { |
519 | // Cannot hoist user of P above P |
520 | if (I == P) return false; |
521 | Args.insert(V: I); |
522 | } |
523 | return true; |
524 | }; |
525 | if (!AddArg(SI->getPointerOperand())) |
526 | return false; |
527 | |
528 | // Instruction to lift before P. |
529 | SmallVector<Instruction *, 8> ToLift{SI}; |
530 | |
531 | // Memory locations of lifted instructions. |
532 | SmallVector<MemoryLocation, 8> MemLocs{StoreLoc}; |
533 | |
534 | // Lifted calls. |
535 | SmallVector<const CallBase *, 8> Calls; |
536 | |
537 | const MemoryLocation LoadLoc = MemoryLocation::get(LI); |
538 | |
539 | for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) { |
540 | auto *C = &*I; |
541 | |
542 | // Make sure hoisting does not perform a store that was not guaranteed to |
543 | // happen. |
544 | if (!isGuaranteedToTransferExecutionToSuccessor(I: C)) |
545 | return false; |
546 | |
547 | bool MayAlias = isModOrRefSet(MRI: AA->getModRefInfo(I: C, OptLoc: std::nullopt)); |
548 | |
549 | bool NeedLift = false; |
550 | if (Args.erase(V: C)) |
551 | NeedLift = true; |
552 | else if (MayAlias) { |
553 | NeedLift = llvm::any_of(Range&: MemLocs, P: [C, this](const MemoryLocation &ML) { |
554 | return isModOrRefSet(MRI: AA->getModRefInfo(I: C, OptLoc: ML)); |
555 | }); |
556 | |
557 | if (!NeedLift) |
558 | NeedLift = llvm::any_of(Range&: Calls, P: [C, this](const CallBase *Call) { |
559 | return isModOrRefSet(MRI: AA->getModRefInfo(I: C, Call)); |
560 | }); |
561 | } |
562 | |
563 | if (!NeedLift) |
564 | continue; |
565 | |
566 | if (MayAlias) { |
567 | // Since LI is implicitly moved downwards past the lifted instructions, |
568 | // none of them may modify its source. |
569 | if (isModSet(MRI: AA->getModRefInfo(I: C, OptLoc: LoadLoc))) |
570 | return false; |
571 | else if (const auto *Call = dyn_cast<CallBase>(Val: C)) { |
572 | // If we can't lift this before P, it's game over. |
573 | if (isModOrRefSet(MRI: AA->getModRefInfo(I: P, Call))) |
574 | return false; |
575 | |
576 | Calls.push_back(Elt: Call); |
577 | } else if (isa<LoadInst>(Val: C) || isa<StoreInst>(Val: C) || isa<VAArgInst>(Val: C)) { |
578 | // If we can't lift this before P, it's game over. |
579 | auto ML = MemoryLocation::get(Inst: C); |
580 | if (isModOrRefSet(MRI: AA->getModRefInfo(I: P, OptLoc: ML))) |
581 | return false; |
582 | |
583 | MemLocs.push_back(Elt: ML); |
584 | } else |
585 | // We don't know how to lift this instruction. |
586 | return false; |
587 | } |
588 | |
589 | ToLift.push_back(Elt: C); |
590 | for (Value *Op : C->operands()) |
591 | if (!AddArg(Op)) |
592 | return false; |
593 | } |
594 | |
595 | // Find MSSA insertion point. Normally P will always have a corresponding |
596 | // memory access before which we can insert. However, with non-standard AA |
597 | // pipelines, there may be a mismatch between AA and MSSA, in which case we |
598 | // will scan for a memory access before P. In either case, we know for sure |
599 | // that at least the load will have a memory access. |
600 | // TODO: Simplify this once P will be determined by MSSA, in which case the |
601 | // discrepancy can no longer occur. |
602 | MemoryUseOrDef *MemInsertPoint = nullptr; |
603 | if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I: P)) { |
604 | MemInsertPoint = cast<MemoryUseOrDef>(Val&: --MA->getIterator()); |
605 | } else { |
606 | const Instruction *ConstP = P; |
607 | for (const Instruction &I : make_range(x: ++ConstP->getReverseIterator(), |
608 | y: ++LI->getReverseIterator())) { |
609 | if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I: &I)) { |
610 | MemInsertPoint = MA; |
611 | break; |
612 | } |
613 | } |
614 | } |
615 | |
616 | // We made it, we need to lift. |
617 | for (auto *I : llvm::reverse(C&: ToLift)) { |
618 | LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n" ); |
619 | I->moveBefore(MovePos: P); |
620 | assert(MemInsertPoint && "Must have found insert point" ); |
621 | if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I)) { |
622 | MSSAU->moveAfter(What: MA, Where: MemInsertPoint); |
623 | MemInsertPoint = MA; |
624 | } |
625 | } |
626 | |
627 | return true; |
628 | } |
629 | |
630 | bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, |
631 | const DataLayout &DL, |
632 | BasicBlock::iterator &BBI) { |
633 | if (!LI->isSimple() || !LI->hasOneUse() || |
634 | LI->getParent() != SI->getParent()) |
635 | return false; |
636 | |
637 | auto *T = LI->getType(); |
638 | // Don't introduce calls to memcpy/memmove intrinsics out of thin air if |
639 | // the corresponding libcalls are not available. |
640 | // TODO: We should really distinguish between libcall availability and |
641 | // our ability to introduce intrinsics. |
642 | if (T->isAggregateType() && |
643 | (EnableMemCpyOptWithoutLibcalls || |
644 | (TLI->has(F: LibFunc_memcpy) && TLI->has(F: LibFunc_memmove)))) { |
645 | MemoryLocation LoadLoc = MemoryLocation::get(LI); |
646 | |
647 | // We use alias analysis to check if an instruction may store to |
648 | // the memory we load from in between the load and the store. If |
649 | // such an instruction is found, we try to promote there instead |
650 | // of at the store position. |
651 | // TODO: Can use MSSA for this. |
652 | Instruction *P = SI; |
653 | for (auto &I : make_range(x: ++LI->getIterator(), y: SI->getIterator())) { |
654 | if (isModSet(MRI: AA->getModRefInfo(I: &I, OptLoc: LoadLoc))) { |
655 | P = &I; |
656 | break; |
657 | } |
658 | } |
659 | |
660 | // We found an instruction that may write to the loaded memory. |
661 | // We can try to promote at this position instead of the store |
662 | // position if nothing aliases the store memory after this and the store |
663 | // destination is not in the range. |
664 | if (P && P != SI) { |
665 | if (!moveUp(SI, P, LI)) |
666 | P = nullptr; |
667 | } |
668 | |
669 | // If a valid insertion position is found, then we can promote |
670 | // the load/store pair to a memcpy. |
671 | if (P) { |
672 | // If we load from memory that may alias the memory we store to, |
673 | // memmove must be used to preserve semantic. If not, memcpy can |
674 | // be used. Also, if we load from constant memory, memcpy can be used |
675 | // as the constant memory won't be modified. |
676 | bool UseMemMove = false; |
677 | if (isModSet(MRI: AA->getModRefInfo(I: SI, OptLoc: LoadLoc))) |
678 | UseMemMove = true; |
679 | |
680 | IRBuilder<> Builder(P); |
681 | Value *Size = Builder.CreateTypeSize(DstType: Builder.getInt64Ty(), |
682 | Size: DL.getTypeStoreSize(Ty: T)); |
683 | Instruction *M; |
684 | if (UseMemMove) |
685 | M = Builder.CreateMemMove( |
686 | Dst: SI->getPointerOperand(), DstAlign: SI->getAlign(), |
687 | Src: LI->getPointerOperand(), SrcAlign: LI->getAlign(), Size); |
688 | else |
689 | M = Builder.CreateMemCpy( |
690 | Dst: SI->getPointerOperand(), DstAlign: SI->getAlign(), |
691 | Src: LI->getPointerOperand(), SrcAlign: LI->getAlign(), Size); |
692 | M->copyMetadata(SrcInst: *SI, WL: LLVMContext::MD_DIAssignID); |
693 | |
694 | LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " |
695 | << *M << "\n" ); |
696 | |
697 | auto *LastDef = |
698 | cast<MemoryDef>(Val: MSSAU->getMemorySSA()->getMemoryAccess(I: SI)); |
699 | auto *NewAccess = MSSAU->createMemoryAccessAfter(I: M, Definition: nullptr, InsertPt: LastDef); |
700 | MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /*RenameUses=*/true); |
701 | |
702 | eraseInstruction(I: SI); |
703 | eraseInstruction(I: LI); |
704 | ++NumMemCpyInstr; |
705 | |
706 | // Make sure we do not invalidate the iterator. |
707 | BBI = M->getIterator(); |
708 | return true; |
709 | } |
710 | } |
711 | |
712 | // Detect cases where we're performing call slot forwarding, but |
713 | // happen to be using a load-store pair to implement it, rather than |
714 | // a memcpy. |
715 | BatchAAResults BAA(*AA); |
716 | auto GetCall = [&]() -> CallInst * { |
717 | // We defer this expensive clobber walk until the cheap checks |
718 | // have been done on the source inside performCallSlotOptzn. |
719 | if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>( |
720 | Val: MSSA->getWalker()->getClobberingMemoryAccess(I: LI, AA&: BAA))) |
721 | return dyn_cast_or_null<CallInst>(Val: LoadClobber->getMemoryInst()); |
722 | return nullptr; |
723 | }; |
724 | |
725 | bool Changed = performCallSlotOptzn( |
726 | cpyLoad: LI, cpyStore: SI, cpyDst: SI->getPointerOperand()->stripPointerCasts(), |
727 | cpySrc: LI->getPointerOperand()->stripPointerCasts(), |
728 | cpyLen: DL.getTypeStoreSize(Ty: SI->getOperand(i_nocapture: 0)->getType()), |
729 | cpyAlign: std::min(a: SI->getAlign(), b: LI->getAlign()), BAA, GetC: GetCall); |
730 | if (Changed) { |
731 | eraseInstruction(I: SI); |
732 | eraseInstruction(I: LI); |
733 | ++NumMemCpyInstr; |
734 | return true; |
735 | } |
736 | |
737 | // If this is a load-store pair from a stack slot to a stack slot, we |
738 | // might be able to perform the stack-move optimization just as we do for |
739 | // memcpys from an alloca to an alloca. |
740 | if (auto *DestAlloca = dyn_cast<AllocaInst>(Val: SI->getPointerOperand())) { |
741 | if (auto *SrcAlloca = dyn_cast<AllocaInst>(Val: LI->getPointerOperand())) { |
742 | if (performStackMoveOptzn(Load: LI, Store: SI, DestAlloca, SrcAlloca, |
743 | Size: DL.getTypeStoreSize(Ty: T), BAA)) { |
744 | // Avoid invalidating the iterator. |
745 | BBI = SI->getNextNonDebugInstruction()->getIterator(); |
746 | eraseInstruction(I: SI); |
747 | eraseInstruction(I: LI); |
748 | ++NumMemCpyInstr; |
749 | return true; |
750 | } |
751 | } |
752 | } |
753 | |
754 | return false; |
755 | } |
756 | |
757 | bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { |
758 | if (!SI->isSimple()) return false; |
759 | |
760 | // Avoid merging nontemporal stores since the resulting |
761 | // memcpy/memset would not be able to preserve the nontemporal hint. |
762 | // In theory we could teach how to propagate the !nontemporal metadata to |
763 | // memset calls. However, that change would force the backend to |
764 | // conservatively expand !nontemporal memset calls back to sequences of |
765 | // store instructions (effectively undoing the merging). |
766 | if (SI->getMetadata(KindID: LLVMContext::MD_nontemporal)) |
767 | return false; |
768 | |
769 | const DataLayout &DL = SI->getModule()->getDataLayout(); |
770 | |
771 | Value *StoredVal = SI->getValueOperand(); |
772 | |
773 | // Not all the transforms below are correct for non-integral pointers, bail |
774 | // until we've audited the individual pieces. |
775 | if (DL.isNonIntegralPointerType(Ty: StoredVal->getType()->getScalarType())) |
776 | return false; |
777 | |
778 | // Load to store forwarding can be interpreted as memcpy. |
779 | if (auto *LI = dyn_cast<LoadInst>(Val: StoredVal)) |
780 | return processStoreOfLoad(SI, LI, DL, BBI); |
781 | |
782 | // The following code creates memset intrinsics out of thin air. Don't do |
783 | // this if the corresponding libfunc is not available. |
784 | // TODO: We should really distinguish between libcall availability and |
785 | // our ability to introduce intrinsics. |
786 | if (!(TLI->has(F: LibFunc_memset) || EnableMemCpyOptWithoutLibcalls)) |
787 | return false; |
788 | |
789 | // There are two cases that are interesting for this code to handle: memcpy |
790 | // and memset. Right now we only handle memset. |
791 | |
792 | // Ensure that the value being stored is something that can be memset'able a |
793 | // byte at a time like "0" or "-1" or any width, as well as things like |
794 | // 0xA0A0A0A0 and 0.0. |
795 | auto *V = SI->getOperand(i_nocapture: 0); |
796 | if (Value *ByteVal = isBytewiseValue(V, DL)) { |
797 | if (Instruction *I = tryMergingIntoMemset(StartInst: SI, StartPtr: SI->getPointerOperand(), |
798 | ByteVal)) { |
799 | BBI = I->getIterator(); // Don't invalidate iterator. |
800 | return true; |
801 | } |
802 | |
803 | // If we have an aggregate, we try to promote it to memset regardless |
804 | // of opportunity for merging as it can expose optimization opportunities |
805 | // in subsequent passes. |
806 | auto *T = V->getType(); |
807 | if (T->isAggregateType()) { |
808 | uint64_t Size = DL.getTypeStoreSize(Ty: T); |
809 | IRBuilder<> Builder(SI); |
810 | auto *M = Builder.CreateMemSet(Ptr: SI->getPointerOperand(), Val: ByteVal, Size, |
811 | Align: SI->getAlign()); |
812 | M->copyMetadata(SrcInst: *SI, WL: LLVMContext::MD_DIAssignID); |
813 | |
814 | LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n" ); |
815 | |
816 | // The newly inserted memset is immediately overwritten by the original |
817 | // store, so we do not need to rename uses. |
818 | auto *StoreDef = cast<MemoryDef>(Val: MSSA->getMemoryAccess(I: SI)); |
819 | auto *NewAccess = MSSAU->createMemoryAccessBefore( |
820 | I: M, Definition: nullptr, InsertPt: StoreDef); |
821 | MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /*RenameUses=*/false); |
822 | |
823 | eraseInstruction(I: SI); |
824 | NumMemSetInfer++; |
825 | |
826 | // Make sure we do not invalidate the iterator. |
827 | BBI = M->getIterator(); |
828 | return true; |
829 | } |
830 | } |
831 | |
832 | return false; |
833 | } |
834 | |
835 | bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { |
836 | // See if there is another memset or store neighboring this memset which |
837 | // allows us to widen out the memset to do a single larger store. |
838 | if (isa<ConstantInt>(Val: MSI->getLength()) && !MSI->isVolatile()) |
839 | if (Instruction *I = tryMergingIntoMemset(StartInst: MSI, StartPtr: MSI->getDest(), |
840 | ByteVal: MSI->getValue())) { |
841 | BBI = I->getIterator(); // Don't invalidate iterator. |
842 | return true; |
843 | } |
844 | return false; |
845 | } |
846 | |
847 | /// Takes a memcpy and a call that it depends on, |
848 | /// and checks for the possibility of a call slot optimization by having |
849 | /// the call write its result directly into the destination of the memcpy. |
850 | bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, |
851 | Instruction *cpyStore, Value *cpyDest, |
852 | Value *cpySrc, TypeSize cpySize, |
853 | Align cpyDestAlign, BatchAAResults &BAA, |
854 | std::function<CallInst *()> GetC) { |
855 | // The general transformation to keep in mind is |
856 | // |
857 | // call @func(..., src, ...) |
858 | // memcpy(dest, src, ...) |
859 | // |
860 | // -> |
861 | // |
862 | // memcpy(dest, src, ...) |
863 | // call @func(..., dest, ...) |
864 | // |
865 | // Since moving the memcpy is technically awkward, we additionally check that |
866 | // src only holds uninitialized values at the moment of the call, meaning that |
867 | // the memcpy can be discarded rather than moved. |
868 | |
869 | // We can't optimize scalable types. |
870 | if (cpySize.isScalable()) |
871 | return false; |
872 | |
873 | // Require that src be an alloca. This simplifies the reasoning considerably. |
874 | auto *srcAlloca = dyn_cast<AllocaInst>(Val: cpySrc); |
875 | if (!srcAlloca) |
876 | return false; |
877 | |
878 | ConstantInt *srcArraySize = dyn_cast<ConstantInt>(Val: srcAlloca->getArraySize()); |
879 | if (!srcArraySize) |
880 | return false; |
881 | |
882 | const DataLayout &DL = cpyLoad->getModule()->getDataLayout(); |
883 | TypeSize SrcAllocaSize = DL.getTypeAllocSize(Ty: srcAlloca->getAllocatedType()); |
884 | // We can't optimize scalable types. |
885 | if (SrcAllocaSize.isScalable()) |
886 | return false; |
887 | uint64_t srcSize = SrcAllocaSize * srcArraySize->getZExtValue(); |
888 | |
889 | if (cpySize < srcSize) |
890 | return false; |
891 | |
892 | CallInst *C = GetC(); |
893 | if (!C) |
894 | return false; |
895 | |
896 | // Lifetime marks shouldn't be operated on. |
897 | if (Function *F = C->getCalledFunction()) |
898 | if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start) |
899 | return false; |
900 | |
901 | |
902 | if (C->getParent() != cpyStore->getParent()) { |
903 | LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n" ); |
904 | return false; |
905 | } |
906 | |
907 | MemoryLocation DestLoc = isa<StoreInst>(Val: cpyStore) ? |
908 | MemoryLocation::get(Inst: cpyStore) : |
909 | MemoryLocation::getForDest(MI: cast<MemCpyInst>(Val: cpyStore)); |
910 | |
911 | // Check that nothing touches the dest of the copy between |
912 | // the call and the store/memcpy. |
913 | Instruction *SkippedLifetimeStart = nullptr; |
914 | if (accessedBetween(AA&: BAA, Loc: DestLoc, Start: MSSA->getMemoryAccess(I: C), |
915 | End: MSSA->getMemoryAccess(I: cpyStore), SkippedLifetimeStart: &SkippedLifetimeStart)) { |
916 | LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer modified after call\n" ); |
917 | return false; |
918 | } |
919 | |
920 | // If we need to move a lifetime.start above the call, make sure that we can |
921 | // actually do so. If the argument is bitcasted for example, we would have to |
922 | // move the bitcast as well, which we don't handle. |
923 | if (SkippedLifetimeStart) { |
924 | auto *LifetimeArg = |
925 | dyn_cast<Instruction>(Val: SkippedLifetimeStart->getOperand(i: 1)); |
926 | if (LifetimeArg && LifetimeArg->getParent() == C->getParent() && |
927 | C->comesBefore(Other: LifetimeArg)) |
928 | return false; |
929 | } |
930 | |
931 | // Check that storing to the first srcSize bytes of dest will not cause a |
932 | // trap or data race. |
933 | bool ExplicitlyDereferenceableOnly; |
934 | if (!isWritableObject(Object: getUnderlyingObject(V: cpyDest), |
935 | ExplicitlyDereferenceableOnly) || |
936 | !isDereferenceableAndAlignedPointer(V: cpyDest, Alignment: Align(1), Size: APInt(64, cpySize), |
937 | DL, CtxI: C, AC, DT)) { |
938 | LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer not dereferenceable\n" ); |
939 | return false; |
940 | } |
941 | |
942 | // Make sure that nothing can observe cpyDest being written early. There are |
943 | // a number of cases to consider: |
944 | // 1. cpyDest cannot be accessed between C and cpyStore as a precondition of |
945 | // the transform. |
946 | // 2. C itself may not access cpyDest (prior to the transform). This is |
947 | // checked further below. |
948 | // 3. If cpyDest is accessible to the caller of this function (potentially |
949 | // captured and not based on an alloca), we need to ensure that we cannot |
950 | // unwind between C and cpyStore. This is checked here. |
951 | // 4. If cpyDest is potentially captured, there may be accesses to it from |
952 | // another thread. In this case, we need to check that cpyStore is |
953 | // guaranteed to be executed if C is. As it is a non-atomic access, it |
954 | // renders accesses from other threads undefined. |
955 | // TODO: This is currently not checked. |
956 | if (mayBeVisibleThroughUnwinding(V: cpyDest, Start: C, End: cpyStore)) { |
957 | LLVM_DEBUG(dbgs() << "Call Slot: Dest may be visible through unwinding\n" ); |
958 | return false; |
959 | } |
960 | |
961 | // Check that dest points to memory that is at least as aligned as src. |
962 | Align srcAlign = srcAlloca->getAlign(); |
963 | bool isDestSufficientlyAligned = srcAlign <= cpyDestAlign; |
964 | // If dest is not aligned enough and we can't increase its alignment then |
965 | // bail out. |
966 | if (!isDestSufficientlyAligned && !isa<AllocaInst>(Val: cpyDest)) { |
967 | LLVM_DEBUG(dbgs() << "Call Slot: Dest not sufficiently aligned\n" ); |
968 | return false; |
969 | } |
970 | |
971 | // Check that src is not accessed except via the call and the memcpy. This |
972 | // guarantees that it holds only undefined values when passed in (so the final |
973 | // memcpy can be dropped), that it is not read or written between the call and |
974 | // the memcpy, and that writing beyond the end of it is undefined. |
975 | SmallVector<User *, 8> srcUseList(srcAlloca->users()); |
976 | while (!srcUseList.empty()) { |
977 | User *U = srcUseList.pop_back_val(); |
978 | |
979 | if (isa<BitCastInst>(Val: U) || isa<AddrSpaceCastInst>(Val: U)) { |
980 | append_range(C&: srcUseList, R: U->users()); |
981 | continue; |
982 | } |
983 | if (const auto *G = dyn_cast<GetElementPtrInst>(Val: U)) { |
984 | if (!G->hasAllZeroIndices()) |
985 | return false; |
986 | |
987 | append_range(C&: srcUseList, R: U->users()); |
988 | continue; |
989 | } |
990 | if (const auto *IT = dyn_cast<IntrinsicInst>(Val: U)) |
991 | if (IT->isLifetimeStartOrEnd()) |
992 | continue; |
993 | |
994 | if (U != C && U != cpyLoad) |
995 | return false; |
996 | } |
997 | |
998 | // Check whether src is captured by the called function, in which case there |
999 | // may be further indirect uses of src. |
1000 | bool SrcIsCaptured = any_of(Range: C->args(), P: [&](Use &U) { |
1001 | return U->stripPointerCasts() == cpySrc && |
1002 | !C->doesNotCapture(OpNo: C->getArgOperandNo(U: &U)); |
1003 | }); |
1004 | |
1005 | // If src is captured, then check whether there are any potential uses of |
1006 | // src through the captured pointer before the lifetime of src ends, either |
1007 | // due to a lifetime.end or a return from the function. |
1008 | if (SrcIsCaptured) { |
1009 | // Check that dest is not captured before/at the call. We have already |
1010 | // checked that src is not captured before it. If either had been captured, |
1011 | // then the call might be comparing the argument against the captured dest |
1012 | // or src pointer. |
1013 | Value *DestObj = getUnderlyingObject(V: cpyDest); |
1014 | if (!isIdentifiedFunctionLocal(V: DestObj) || |
1015 | PointerMayBeCapturedBefore(V: DestObj, /* ReturnCaptures */ true, |
1016 | /* StoreCaptures */ true, I: C, DT, |
1017 | /* IncludeI */ true)) |
1018 | return false; |
1019 | |
1020 | MemoryLocation SrcLoc = |
1021 | MemoryLocation(srcAlloca, LocationSize::precise(Value: srcSize)); |
1022 | for (Instruction &I : |
1023 | make_range(x: ++C->getIterator(), y: C->getParent()->end())) { |
1024 | // Lifetime of srcAlloca ends at lifetime.end. |
1025 | if (auto *II = dyn_cast<IntrinsicInst>(Val: &I)) { |
1026 | if (II->getIntrinsicID() == Intrinsic::lifetime_end && |
1027 | II->getArgOperand(i: 1)->stripPointerCasts() == srcAlloca && |
1028 | cast<ConstantInt>(Val: II->getArgOperand(i: 0))->uge(Num: srcSize)) |
1029 | break; |
1030 | } |
1031 | |
1032 | // Lifetime of srcAlloca ends at return. |
1033 | if (isa<ReturnInst>(Val: &I)) |
1034 | break; |
1035 | |
1036 | // Ignore the direct read of src in the load. |
1037 | if (&I == cpyLoad) |
1038 | continue; |
1039 | |
1040 | // Check whether this instruction may mod/ref src through the captured |
1041 | // pointer (we have already any direct mod/refs in the loop above). |
1042 | // Also bail if we hit a terminator, as we don't want to scan into other |
1043 | // blocks. |
1044 | if (isModOrRefSet(MRI: BAA.getModRefInfo(I: &I, OptLoc: SrcLoc)) || I.isTerminator()) |
1045 | return false; |
1046 | } |
1047 | } |
1048 | |
1049 | // Since we're changing the parameter to the callsite, we need to make sure |
1050 | // that what would be the new parameter dominates the callsite. |
1051 | bool NeedMoveGEP = false; |
1052 | if (!DT->dominates(Def: cpyDest, User: C)) { |
1053 | // Support moving a constant index GEP before the call. |
1054 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: cpyDest); |
1055 | if (GEP && GEP->hasAllConstantIndices() && |
1056 | DT->dominates(Def: GEP->getPointerOperand(), User: C)) |
1057 | NeedMoveGEP = true; |
1058 | else |
1059 | return false; |
1060 | } |
1061 | |
1062 | // In addition to knowing that the call does not access src in some |
1063 | // unexpected manner, for example via a global, which we deduce from |
1064 | // the use analysis, we also need to know that it does not sneakily |
1065 | // access dest. We rely on AA to figure this out for us. |
1066 | MemoryLocation DestWithSrcSize(cpyDest, LocationSize::precise(Value: srcSize)); |
1067 | ModRefInfo MR = BAA.getModRefInfo(I: C, OptLoc: DestWithSrcSize); |
1068 | // If necessary, perform additional analysis. |
1069 | if (isModOrRefSet(MRI: MR)) |
1070 | MR = BAA.callCapturesBefore(I: C, MemLoc: DestWithSrcSize, DT); |
1071 | if (isModOrRefSet(MRI: MR)) |
1072 | return false; |
1073 | |
1074 | // We can't create address space casts here because we don't know if they're |
1075 | // safe for the target. |
1076 | if (cpySrc->getType() != cpyDest->getType()) |
1077 | return false; |
1078 | for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI) |
1079 | if (C->getArgOperand(i: ArgI)->stripPointerCasts() == cpySrc && |
1080 | cpySrc->getType() != C->getArgOperand(i: ArgI)->getType()) |
1081 | return false; |
1082 | |
1083 | // All the checks have passed, so do the transformation. |
1084 | bool changedArgument = false; |
1085 | for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI) |
1086 | if (C->getArgOperand(i: ArgI)->stripPointerCasts() == cpySrc) { |
1087 | changedArgument = true; |
1088 | C->setArgOperand(i: ArgI, v: cpyDest); |
1089 | } |
1090 | |
1091 | if (!changedArgument) |
1092 | return false; |
1093 | |
1094 | // If the destination wasn't sufficiently aligned then increase its alignment. |
1095 | if (!isDestSufficientlyAligned) { |
1096 | assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!" ); |
1097 | cast<AllocaInst>(Val: cpyDest)->setAlignment(srcAlign); |
1098 | } |
1099 | |
1100 | if (NeedMoveGEP) { |
1101 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: cpyDest); |
1102 | GEP->moveBefore(MovePos: C); |
1103 | } |
1104 | |
1105 | if (SkippedLifetimeStart) { |
1106 | SkippedLifetimeStart->moveBefore(MovePos: C); |
1107 | MSSAU->moveBefore(What: MSSA->getMemoryAccess(I: SkippedLifetimeStart), |
1108 | Where: MSSA->getMemoryAccess(I: C)); |
1109 | } |
1110 | |
1111 | combineAAMetadata(ReplInst: C, I: cpyLoad); |
1112 | if (cpyLoad != cpyStore) |
1113 | combineAAMetadata(ReplInst: C, I: cpyStore); |
1114 | |
1115 | ++NumCallSlot; |
1116 | return true; |
1117 | } |
1118 | |
1119 | /// We've found that the (upward scanning) memory dependence of memcpy 'M' is |
1120 | /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can. |
1121 | bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, |
1122 | MemCpyInst *MDep, |
1123 | BatchAAResults &BAA) { |
1124 | // We can only transforms memcpy's where the dest of one is the source of the |
1125 | // other. |
1126 | if (M->getSource() != MDep->getDest() || MDep->isVolatile()) |
1127 | return false; |
1128 | |
1129 | // If dep instruction is reading from our current input, then it is a noop |
1130 | // transfer and substituting the input won't change this instruction. Just |
1131 | // ignore the input and let someone else zap MDep. This handles cases like: |
1132 | // memcpy(a <- a) |
1133 | // memcpy(b <- a) |
1134 | if (M->getSource() == MDep->getSource()) |
1135 | return false; |
1136 | |
1137 | // Second, the length of the memcpy's must be the same, or the preceding one |
1138 | // must be larger than the following one. |
1139 | if (MDep->getLength() != M->getLength()) { |
1140 | auto *MDepLen = dyn_cast<ConstantInt>(Val: MDep->getLength()); |
1141 | auto *MLen = dyn_cast<ConstantInt>(Val: M->getLength()); |
1142 | if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) |
1143 | return false; |
1144 | } |
1145 | |
1146 | // Verify that the copied-from memory doesn't change in between the two |
1147 | // transfers. For example, in: |
1148 | // memcpy(a <- b) |
1149 | // *b = 42; |
1150 | // memcpy(c <- a) |
1151 | // It would be invalid to transform the second memcpy into memcpy(c <- b). |
1152 | // |
1153 | // TODO: If the code between M and MDep is transparent to the destination "c", |
1154 | // then we could still perform the xform by moving M up to the first memcpy. |
1155 | // TODO: It would be sufficient to check the MDep source up to the memcpy |
1156 | // size of M, rather than MDep. |
1157 | if (writtenBetween(MSSA, AA&: BAA, Loc: MemoryLocation::getForSource(MTI: MDep), |
1158 | Start: MSSA->getMemoryAccess(I: MDep), End: MSSA->getMemoryAccess(I: M))) |
1159 | return false; |
1160 | |
1161 | // If the dest of the second might alias the source of the first, then the |
1162 | // source and dest might overlap. In addition, if the source of the first |
1163 | // points to constant memory, they won't overlap by definition. Otherwise, we |
1164 | // still want to eliminate the intermediate value, but we have to generate a |
1165 | // memmove instead of memcpy. |
1166 | bool UseMemMove = false; |
1167 | if (isModSet(MRI: BAA.getModRefInfo(I: M, OptLoc: MemoryLocation::getForSource(MTI: MDep)))) { |
1168 | // Don't convert llvm.memcpy.inline into memmove because memmove can be |
1169 | // lowered as a call, and that is not allowed for llvm.memcpy.inline (and |
1170 | // there is no inline version of llvm.memmove) |
1171 | if (isa<MemCpyInlineInst>(Val: M)) |
1172 | return false; |
1173 | UseMemMove = true; |
1174 | } |
1175 | |
1176 | // If all checks passed, then we can transform M. |
1177 | LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n" |
1178 | << *MDep << '\n' << *M << '\n'); |
1179 | |
1180 | // TODO: Is this worth it if we're creating a less aligned memcpy? For |
1181 | // example we could be moving from movaps -> movq on x86. |
1182 | IRBuilder<> Builder(M); |
1183 | Instruction *NewM; |
1184 | if (UseMemMove) |
1185 | NewM = Builder.CreateMemMove(Dst: M->getRawDest(), DstAlign: M->getDestAlign(), |
1186 | Src: MDep->getRawSource(), SrcAlign: MDep->getSourceAlign(), |
1187 | Size: M->getLength(), isVolatile: M->isVolatile()); |
1188 | else if (isa<MemCpyInlineInst>(Val: M)) { |
1189 | // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is |
1190 | // never allowed since that would allow the latter to be lowered as a call |
1191 | // to an external function. |
1192 | NewM = Builder.CreateMemCpyInline( |
1193 | Dst: M->getRawDest(), DstAlign: M->getDestAlign(), Src: MDep->getRawSource(), |
1194 | SrcAlign: MDep->getSourceAlign(), Size: M->getLength(), isVolatile: M->isVolatile()); |
1195 | } else |
1196 | NewM = Builder.CreateMemCpy(Dst: M->getRawDest(), DstAlign: M->getDestAlign(), |
1197 | Src: MDep->getRawSource(), SrcAlign: MDep->getSourceAlign(), |
1198 | Size: M->getLength(), isVolatile: M->isVolatile()); |
1199 | NewM->copyMetadata(SrcInst: *M, WL: LLVMContext::MD_DIAssignID); |
1200 | |
1201 | assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M))); |
1202 | auto *LastDef = cast<MemoryDef>(Val: MSSAU->getMemorySSA()->getMemoryAccess(I: M)); |
1203 | auto *NewAccess = MSSAU->createMemoryAccessAfter(I: NewM, Definition: nullptr, InsertPt: LastDef); |
1204 | MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /*RenameUses=*/true); |
1205 | |
1206 | // Remove the instruction we're replacing. |
1207 | eraseInstruction(I: M); |
1208 | ++NumMemCpyInstr; |
1209 | return true; |
1210 | } |
1211 | |
1212 | /// We've found that the (upward scanning) memory dependence of \p MemCpy is |
1213 | /// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that |
1214 | /// weren't copied over by \p MemCpy. |
1215 | /// |
1216 | /// In other words, transform: |
1217 | /// \code |
1218 | /// memset(dst, c, dst_size); |
1219 | /// ... |
1220 | /// memcpy(dst, src, src_size); |
1221 | /// \endcode |
1222 | /// into: |
1223 | /// \code |
1224 | /// ... |
1225 | /// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size); |
1226 | /// memcpy(dst, src, src_size); |
1227 | /// \endcode |
1228 | /// |
1229 | /// The memset is sunk to just before the memcpy to ensure that src_size is |
1230 | /// present when emitting the simplified memset. |
1231 | bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, |
1232 | MemSetInst *MemSet, |
1233 | BatchAAResults &BAA) { |
1234 | // We can only transform memset/memcpy with the same destination. |
1235 | if (!BAA.isMustAlias(V1: MemSet->getDest(), V2: MemCpy->getDest())) |
1236 | return false; |
1237 | |
1238 | // Check that src and dst of the memcpy aren't the same. While memcpy |
1239 | // operands cannot partially overlap, exact equality is allowed. |
1240 | if (isModSet(MRI: BAA.getModRefInfo(I: MemCpy, OptLoc: MemoryLocation::getForSource(MTI: MemCpy)))) |
1241 | return false; |
1242 | |
1243 | // We know that dst up to src_size is not written. We now need to make sure |
1244 | // that dst up to dst_size is not accessed. (If we did not move the memset, |
1245 | // checking for reads would be sufficient.) |
1246 | if (accessedBetween(AA&: BAA, Loc: MemoryLocation::getForDest(MI: MemSet), |
1247 | Start: MSSA->getMemoryAccess(I: MemSet), |
1248 | End: MSSA->getMemoryAccess(I: MemCpy))) |
1249 | return false; |
1250 | |
1251 | // Use the same i8* dest as the memcpy, killing the memset dest if different. |
1252 | Value *Dest = MemCpy->getRawDest(); |
1253 | Value *DestSize = MemSet->getLength(); |
1254 | Value *SrcSize = MemCpy->getLength(); |
1255 | |
1256 | if (mayBeVisibleThroughUnwinding(V: Dest, Start: MemSet, End: MemCpy)) |
1257 | return false; |
1258 | |
1259 | // If the sizes are the same, simply drop the memset instead of generating |
1260 | // a replacement with zero size. |
1261 | if (DestSize == SrcSize) { |
1262 | eraseInstruction(I: MemSet); |
1263 | return true; |
1264 | } |
1265 | |
1266 | // By default, create an unaligned memset. |
1267 | Align Alignment = Align(1); |
1268 | // If Dest is aligned, and SrcSize is constant, use the minimum alignment |
1269 | // of the sum. |
1270 | const Align DestAlign = std::max(a: MemSet->getDestAlign().valueOrOne(), |
1271 | b: MemCpy->getDestAlign().valueOrOne()); |
1272 | if (DestAlign > 1) |
1273 | if (auto *SrcSizeC = dyn_cast<ConstantInt>(Val: SrcSize)) |
1274 | Alignment = commonAlignment(A: DestAlign, Offset: SrcSizeC->getZExtValue()); |
1275 | |
1276 | IRBuilder<> Builder(MemCpy); |
1277 | |
1278 | // Preserve the debug location of the old memset for the code emitted here |
1279 | // related to the new memset. This is correct according to the rules in |
1280 | // https://llvm.org/docs/HowToUpdateDebugInfo.html about "when to preserve an |
1281 | // instruction location", given that we move the memset within the basic |
1282 | // block. |
1283 | assert(MemSet->getParent() == MemCpy->getParent() && |
1284 | "Preserving debug location based on moving memset within BB." ); |
1285 | Builder.SetCurrentDebugLocation(MemSet->getDebugLoc()); |
1286 | |
1287 | // If the sizes have different types, zext the smaller one. |
1288 | if (DestSize->getType() != SrcSize->getType()) { |
1289 | if (DestSize->getType()->getIntegerBitWidth() > |
1290 | SrcSize->getType()->getIntegerBitWidth()) |
1291 | SrcSize = Builder.CreateZExt(V: SrcSize, DestTy: DestSize->getType()); |
1292 | else |
1293 | DestSize = Builder.CreateZExt(V: DestSize, DestTy: SrcSize->getType()); |
1294 | } |
1295 | |
1296 | Value *Ule = Builder.CreateICmpULE(LHS: DestSize, RHS: SrcSize); |
1297 | Value *SizeDiff = Builder.CreateSub(LHS: DestSize, RHS: SrcSize); |
1298 | Value *MemsetLen = Builder.CreateSelect( |
1299 | C: Ule, True: ConstantInt::getNullValue(Ty: DestSize->getType()), False: SizeDiff); |
1300 | Instruction *NewMemSet = |
1301 | Builder.CreateMemSet(Ptr: Builder.CreatePtrAdd(Ptr: Dest, Offset: SrcSize), |
1302 | Val: MemSet->getOperand(i_nocapture: 1), Size: MemsetLen, Align: Alignment); |
1303 | |
1304 | assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) && |
1305 | "MemCpy must be a MemoryDef" ); |
1306 | // The new memset is inserted before the memcpy, and it is known that the |
1307 | // memcpy's defining access is the memset about to be removed. |
1308 | auto *LastDef = |
1309 | cast<MemoryDef>(Val: MSSAU->getMemorySSA()->getMemoryAccess(I: MemCpy)); |
1310 | auto *NewAccess = MSSAU->createMemoryAccessBefore( |
1311 | I: NewMemSet, Definition: nullptr, InsertPt: LastDef); |
1312 | MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /*RenameUses=*/true); |
1313 | |
1314 | eraseInstruction(I: MemSet); |
1315 | return true; |
1316 | } |
1317 | |
1318 | /// Determine whether the instruction has undefined content for the given Size, |
1319 | /// either because it was freshly alloca'd or started its lifetime. |
1320 | static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, |
1321 | MemoryDef *Def, Value *Size) { |
1322 | if (MSSA->isLiveOnEntryDef(MA: Def)) |
1323 | return isa<AllocaInst>(Val: getUnderlyingObject(V)); |
1324 | |
1325 | if (auto *II = dyn_cast_or_null<IntrinsicInst>(Val: Def->getMemoryInst())) { |
1326 | if (II->getIntrinsicID() == Intrinsic::lifetime_start) { |
1327 | auto *LTSize = cast<ConstantInt>(Val: II->getArgOperand(i: 0)); |
1328 | |
1329 | if (auto *CSize = dyn_cast<ConstantInt>(Val: Size)) { |
1330 | if (AA.isMustAlias(V1: V, V2: II->getArgOperand(i: 1)) && |
1331 | LTSize->getZExtValue() >= CSize->getZExtValue()) |
1332 | return true; |
1333 | } |
1334 | |
1335 | // If the lifetime.start covers a whole alloca (as it almost always |
1336 | // does) and we're querying a pointer based on that alloca, then we know |
1337 | // the memory is definitely undef, regardless of how exactly we alias. |
1338 | // The size also doesn't matter, as an out-of-bounds access would be UB. |
1339 | if (auto *Alloca = dyn_cast<AllocaInst>(Val: getUnderlyingObject(V))) { |
1340 | if (getUnderlyingObject(V: II->getArgOperand(i: 1)) == Alloca) { |
1341 | const DataLayout &DL = Alloca->getModule()->getDataLayout(); |
1342 | if (std::optional<TypeSize> AllocaSize = |
1343 | Alloca->getAllocationSize(DL)) |
1344 | if (*AllocaSize == LTSize->getValue()) |
1345 | return true; |
1346 | } |
1347 | } |
1348 | } |
1349 | } |
1350 | |
1351 | return false; |
1352 | } |
1353 | |
1354 | /// Transform memcpy to memset when its source was just memset. |
1355 | /// In other words, turn: |
1356 | /// \code |
1357 | /// memset(dst1, c, dst1_size); |
1358 | /// memcpy(dst2, dst1, dst2_size); |
1359 | /// \endcode |
1360 | /// into: |
1361 | /// \code |
1362 | /// memset(dst1, c, dst1_size); |
1363 | /// memset(dst2, c, dst2_size); |
1364 | /// \endcode |
1365 | /// When dst2_size <= dst1_size. |
1366 | bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, |
1367 | MemSetInst *MemSet, |
1368 | BatchAAResults &BAA) { |
1369 | // Make sure that memcpy(..., memset(...), ...), that is we are memsetting and |
1370 | // memcpying from the same address. Otherwise it is hard to reason about. |
1371 | if (!BAA.isMustAlias(V1: MemSet->getRawDest(), V2: MemCpy->getRawSource())) |
1372 | return false; |
1373 | |
1374 | Value *MemSetSize = MemSet->getLength(); |
1375 | Value *CopySize = MemCpy->getLength(); |
1376 | |
1377 | if (MemSetSize != CopySize) { |
1378 | // Make sure the memcpy doesn't read any more than what the memset wrote. |
1379 | // Don't worry about sizes larger than i64. |
1380 | |
1381 | // A known memset size is required. |
1382 | auto *CMemSetSize = dyn_cast<ConstantInt>(Val: MemSetSize); |
1383 | if (!CMemSetSize) |
1384 | return false; |
1385 | |
1386 | // A known memcpy size is also required. |
1387 | auto *CCopySize = dyn_cast<ConstantInt>(Val: CopySize); |
1388 | if (!CCopySize) |
1389 | return false; |
1390 | if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) { |
1391 | // If the memcpy is larger than the memset, but the memory was undef prior |
1392 | // to the memset, we can just ignore the tail. Technically we're only |
1393 | // interested in the bytes from MemSetSize..CopySize here, but as we can't |
1394 | // easily represent this location, we use the full 0..CopySize range. |
1395 | MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MTI: MemCpy); |
1396 | bool CanReduceSize = false; |
1397 | MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(I: MemSet); |
1398 | MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( |
1399 | MemSetAccess->getDefiningAccess(), MemCpyLoc, AA&: BAA); |
1400 | if (auto *MD = dyn_cast<MemoryDef>(Val: Clobber)) |
1401 | if (hasUndefContents(MSSA, AA&: BAA, V: MemCpy->getSource(), Def: MD, Size: CopySize)) |
1402 | CanReduceSize = true; |
1403 | |
1404 | if (!CanReduceSize) |
1405 | return false; |
1406 | CopySize = MemSetSize; |
1407 | } |
1408 | } |
1409 | |
1410 | IRBuilder<> Builder(MemCpy); |
1411 | Instruction *NewM = |
1412 | Builder.CreateMemSet(Ptr: MemCpy->getRawDest(), Val: MemSet->getOperand(i_nocapture: 1), |
1413 | Size: CopySize, Align: MemCpy->getDestAlign()); |
1414 | auto *LastDef = |
1415 | cast<MemoryDef>(Val: MSSAU->getMemorySSA()->getMemoryAccess(I: MemCpy)); |
1416 | auto *NewAccess = MSSAU->createMemoryAccessAfter(I: NewM, Definition: nullptr, InsertPt: LastDef); |
1417 | MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /*RenameUses=*/true); |
1418 | |
1419 | return true; |
1420 | } |
1421 | |
1422 | // Attempts to optimize the pattern whereby memory is copied from an alloca to |
1423 | // another alloca, where the two allocas don't have conflicting mod/ref. If |
1424 | // successful, the two allocas can be merged into one and the transfer can be |
1425 | // deleted. This pattern is generated frequently in Rust, due to the ubiquity of |
1426 | // move operations in that language. |
1427 | // |
1428 | // Once we determine that the optimization is safe to perform, we replace all |
1429 | // uses of the destination alloca with the source alloca. We also "shrink wrap" |
1430 | // the lifetime markers of the single merged alloca to before the first use |
1431 | // and after the last use. Note that the "shrink wrapping" procedure is a safe |
1432 | // transformation only because we restrict the scope of this optimization to |
1433 | // allocas that aren't captured. |
1434 | bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, |
1435 | AllocaInst *DestAlloca, |
1436 | AllocaInst *SrcAlloca, TypeSize Size, |
1437 | BatchAAResults &BAA) { |
1438 | LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n" |
1439 | << *Store << "\n" ); |
1440 | |
1441 | // Make sure the two allocas are in the same address space. |
1442 | if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) { |
1443 | LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n" ); |
1444 | return false; |
1445 | } |
1446 | |
1447 | // Check that copy is full with static size. |
1448 | const DataLayout &DL = DestAlloca->getModule()->getDataLayout(); |
1449 | std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL); |
1450 | if (!SrcSize || Size != *SrcSize) { |
1451 | LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n" ); |
1452 | return false; |
1453 | } |
1454 | std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL); |
1455 | if (!DestSize || Size != *DestSize) { |
1456 | LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n" ); |
1457 | return false; |
1458 | } |
1459 | |
1460 | if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca()) |
1461 | return false; |
1462 | |
1463 | // Check that src and dest are never captured, unescaped allocas. Also |
1464 | // find the nearest common dominator and postdominator for all users in |
1465 | // order to shrink wrap the lifetimes, and instructions with noalias metadata |
1466 | // to remove them. |
1467 | |
1468 | SmallVector<Instruction *, 4> LifetimeMarkers; |
1469 | SmallSet<Instruction *, 4> NoAliasInstrs; |
1470 | bool SrcNotDom = false; |
1471 | |
1472 | // Recursively track the user and check whether modified alias exist. |
1473 | auto IsDereferenceableOrNull = [](Value *V, const DataLayout &DL) -> bool { |
1474 | bool CanBeNull, CanBeFreed; |
1475 | return V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); |
1476 | }; |
1477 | |
1478 | auto CaptureTrackingWithModRef = |
1479 | [&](Instruction *AI, |
1480 | function_ref<bool(Instruction *)> ModRefCallback) -> bool { |
1481 | SmallVector<Instruction *, 8> Worklist; |
1482 | Worklist.push_back(Elt: AI); |
1483 | unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking(); |
1484 | Worklist.reserve(N: MaxUsesToExplore); |
1485 | SmallSet<const Use *, 20> Visited; |
1486 | while (!Worklist.empty()) { |
1487 | Instruction *I = Worklist.back(); |
1488 | Worklist.pop_back(); |
1489 | for (const Use &U : I->uses()) { |
1490 | auto *UI = cast<Instruction>(Val: U.getUser()); |
1491 | // If any use that isn't dominated by SrcAlloca exists, we move src |
1492 | // alloca to the entry before the transformation. |
1493 | if (!DT->dominates(Def: SrcAlloca, User: UI)) |
1494 | SrcNotDom = true; |
1495 | |
1496 | if (Visited.size() >= MaxUsesToExplore) { |
1497 | LLVM_DEBUG( |
1498 | dbgs() |
1499 | << "Stack Move: Exceeded max uses to see ModRef, bailing\n" ); |
1500 | return false; |
1501 | } |
1502 | if (!Visited.insert(Ptr: &U).second) |
1503 | continue; |
1504 | switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { |
1505 | case UseCaptureKind::MAY_CAPTURE: |
1506 | return false; |
1507 | case UseCaptureKind::PASSTHROUGH: |
1508 | // Instructions cannot have non-instruction users. |
1509 | Worklist.push_back(Elt: UI); |
1510 | continue; |
1511 | case UseCaptureKind::NO_CAPTURE: { |
1512 | if (UI->isLifetimeStartOrEnd()) { |
1513 | // We note the locations of these intrinsic calls so that we can |
1514 | // delete them later if the optimization succeeds, this is safe |
1515 | // since both llvm.lifetime.start and llvm.lifetime.end intrinsics |
1516 | // practically fill all the bytes of the alloca with an undefined |
1517 | // value, although conceptually marked as alive/dead. |
1518 | int64_t Size = cast<ConstantInt>(Val: UI->getOperand(i: 0))->getSExtValue(); |
1519 | if (Size < 0 || Size == DestSize) { |
1520 | LifetimeMarkers.push_back(Elt: UI); |
1521 | continue; |
1522 | } |
1523 | } |
1524 | if (UI->hasMetadata(KindID: LLVMContext::MD_noalias)) |
1525 | NoAliasInstrs.insert(Ptr: UI); |
1526 | if (!ModRefCallback(UI)) |
1527 | return false; |
1528 | } |
1529 | } |
1530 | } |
1531 | } |
1532 | return true; |
1533 | }; |
1534 | |
1535 | // Check that dest has no Mod/Ref, from the alloca to the Store, except full |
1536 | // size lifetime intrinsics. And collect modref inst for the reachability |
1537 | // check. |
1538 | ModRefInfo DestModRef = ModRefInfo::NoModRef; |
1539 | MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Value: Size)); |
1540 | SmallVector<BasicBlock *, 8> ReachabilityWorklist; |
1541 | auto DestModRefCallback = [&](Instruction *UI) -> bool { |
1542 | // We don't care about the store itself. |
1543 | if (UI == Store) |
1544 | return true; |
1545 | ModRefInfo Res = BAA.getModRefInfo(I: UI, OptLoc: DestLoc); |
1546 | DestModRef |= Res; |
1547 | if (isModOrRefSet(MRI: Res)) { |
1548 | // Instructions reachability checks. |
1549 | // FIXME: adding the Instruction version isPotentiallyReachableFromMany on |
1550 | // lib/Analysis/CFG.cpp (currently only for BasicBlocks) might be helpful. |
1551 | if (UI->getParent() == Store->getParent()) { |
1552 | // The same block case is special because it's the only time we're |
1553 | // looking within a single block to see which instruction comes first. |
1554 | // Once we start looking at multiple blocks, the first instruction of |
1555 | // the block is reachable, so we only need to determine reachability |
1556 | // between whole blocks. |
1557 | BasicBlock *BB = UI->getParent(); |
1558 | |
1559 | // If A comes before B, then B is definitively reachable from A. |
1560 | if (UI->comesBefore(Other: Store)) |
1561 | return false; |
1562 | |
1563 | // If the user's parent block is entry, no predecessor exists. |
1564 | if (BB->isEntryBlock()) |
1565 | return true; |
1566 | |
1567 | // Otherwise, continue doing the normal per-BB CFG walk. |
1568 | ReachabilityWorklist.append(in_start: succ_begin(BB), in_end: succ_end(BB)); |
1569 | } else { |
1570 | ReachabilityWorklist.push_back(Elt: UI->getParent()); |
1571 | } |
1572 | } |
1573 | return true; |
1574 | }; |
1575 | |
1576 | if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback)) |
1577 | return false; |
1578 | // Bailout if Dest may have any ModRef before Store. |
1579 | if (!ReachabilityWorklist.empty() && |
1580 | isPotentiallyReachableFromMany(Worklist&: ReachabilityWorklist, StopBB: Store->getParent(), |
1581 | ExclusionSet: nullptr, DT, LI: nullptr)) |
1582 | return false; |
1583 | |
1584 | // Check that, from after the Load to the end of the BB, |
1585 | // - if the dest has any Mod, src has no Ref, and |
1586 | // - if the dest has any Ref, src has no Mod except full-sized lifetimes. |
1587 | MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Value: Size)); |
1588 | |
1589 | auto SrcModRefCallback = [&](Instruction *UI) -> bool { |
1590 | // Any ModRef post-dominated by Load doesn't matter, also Load and Store |
1591 | // themselves can be ignored. |
1592 | if (PDT->dominates(I1: Load, I2: UI) || UI == Load || UI == Store) |
1593 | return true; |
1594 | ModRefInfo Res = BAA.getModRefInfo(I: UI, OptLoc: SrcLoc); |
1595 | if ((isModSet(MRI: DestModRef) && isRefSet(MRI: Res)) || |
1596 | (isRefSet(MRI: DestModRef) && isModSet(MRI: Res))) |
1597 | return false; |
1598 | |
1599 | return true; |
1600 | }; |
1601 | |
1602 | if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback)) |
1603 | return false; |
1604 | |
1605 | // We can do the transformation. First, move the SrcAlloca to the start of the |
1606 | // BB. |
1607 | if (SrcNotDom) |
1608 | SrcAlloca->moveBefore(BB&: *SrcAlloca->getParent(), |
1609 | I: SrcAlloca->getParent()->getFirstInsertionPt()); |
1610 | // Align the allocas appropriately. |
1611 | SrcAlloca->setAlignment( |
1612 | std::max(a: SrcAlloca->getAlign(), b: DestAlloca->getAlign())); |
1613 | |
1614 | // Merge the two allocas. |
1615 | DestAlloca->replaceAllUsesWith(V: SrcAlloca); |
1616 | eraseInstruction(I: DestAlloca); |
1617 | |
1618 | // Drop metadata on the source alloca. |
1619 | SrcAlloca->dropUnknownNonDebugMetadata(); |
1620 | |
1621 | // TODO: Reconstruct merged lifetime markers. |
1622 | // Remove all other lifetime markers. if the original lifetime intrinsics |
1623 | // exists. |
1624 | if (!LifetimeMarkers.empty()) { |
1625 | for (Instruction *I : LifetimeMarkers) |
1626 | eraseInstruction(I); |
1627 | } |
1628 | |
1629 | // As this transformation can cause memory accesses that didn't previously |
1630 | // alias to begin to alias one another, we remove !noalias metadata from any |
1631 | // uses of either alloca. This is conservative, but more precision doesn't |
1632 | // seem worthwhile right now. |
1633 | for (Instruction *I : NoAliasInstrs) |
1634 | I->setMetadata(KindID: LLVMContext::MD_noalias, Node: nullptr); |
1635 | |
1636 | LLVM_DEBUG(dbgs() << "Stack Move: Performed staack-move optimization\n" ); |
1637 | NumStackMove++; |
1638 | return true; |
1639 | } |
1640 | |
1641 | static bool isZeroSize(Value *Size) { |
1642 | if (auto *I = dyn_cast<Instruction>(Val: Size)) |
1643 | if (auto *Res = simplifyInstruction(I, Q: I->getModule()->getDataLayout())) |
1644 | Size = Res; |
1645 | // Treat undef/poison size like zero. |
1646 | if (auto *C = dyn_cast<Constant>(Val: Size)) |
1647 | return isa<UndefValue>(Val: C) || C->isNullValue(); |
1648 | return false; |
1649 | } |
1650 | |
1651 | /// Perform simplification of memcpy's. If we have memcpy A |
1652 | /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite |
1653 | /// B to be a memcpy from X to Z (or potentially a memmove, depending on |
1654 | /// circumstances). This allows later passes to remove the first memcpy |
1655 | /// altogether. |
1656 | bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { |
1657 | // We can only optimize non-volatile memcpy's. |
1658 | if (M->isVolatile()) return false; |
1659 | |
1660 | // If the source and destination of the memcpy are the same, then zap it. |
1661 | if (M->getSource() == M->getDest()) { |
1662 | ++BBI; |
1663 | eraseInstruction(I: M); |
1664 | return true; |
1665 | } |
1666 | |
1667 | // If the size is zero, remove the memcpy. This also prevents infinite loops |
1668 | // in processMemSetMemCpyDependence, which is a no-op for zero-length memcpys. |
1669 | if (isZeroSize(Size: M->getLength())) { |
1670 | ++BBI; |
1671 | eraseInstruction(I: M); |
1672 | return true; |
1673 | } |
1674 | |
1675 | MemoryUseOrDef *MA = MSSA->getMemoryAccess(I: M); |
1676 | if (!MA) |
1677 | // Degenerate case: memcpy marked as not accessing memory. |
1678 | return false; |
1679 | |
1680 | // If copying from a constant, try to turn the memcpy into a memset. |
1681 | if (auto *GV = dyn_cast<GlobalVariable>(Val: M->getSource())) |
1682 | if (GV->isConstant() && GV->hasDefinitiveInitializer()) |
1683 | if (Value *ByteVal = isBytewiseValue(V: GV->getInitializer(), |
1684 | DL: M->getModule()->getDataLayout())) { |
1685 | IRBuilder<> Builder(M); |
1686 | Instruction *NewM = Builder.CreateMemSet( |
1687 | Ptr: M->getRawDest(), Val: ByteVal, Size: M->getLength(), Align: M->getDestAlign(), isVolatile: false); |
1688 | auto *LastDef = cast<MemoryDef>(Val: MA); |
1689 | auto *NewAccess = |
1690 | MSSAU->createMemoryAccessAfter(I: NewM, Definition: nullptr, InsertPt: LastDef); |
1691 | MSSAU->insertDef(Def: cast<MemoryDef>(Val: NewAccess), /*RenameUses=*/true); |
1692 | |
1693 | eraseInstruction(I: M); |
1694 | ++NumCpyToSet; |
1695 | return true; |
1696 | } |
1697 | |
1698 | BatchAAResults BAA(*AA); |
1699 | // FIXME: Not using getClobberingMemoryAccess() here due to PR54682. |
1700 | MemoryAccess *AnyClobber = MA->getDefiningAccess(); |
1701 | MemoryLocation DestLoc = MemoryLocation::getForDest(MI: M); |
1702 | const MemoryAccess *DestClobber = |
1703 | MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc, AA&: BAA); |
1704 | |
1705 | // Try to turn a partially redundant memset + memcpy into |
1706 | // smaller memset + memcpy. We don't need the memcpy size for this. |
1707 | // The memcpy must post-dom the memset, so limit this to the same basic |
1708 | // block. A non-local generalization is likely not worthwhile. |
1709 | if (auto *MD = dyn_cast<MemoryDef>(Val: DestClobber)) |
1710 | if (auto *MDep = dyn_cast_or_null<MemSetInst>(Val: MD->getMemoryInst())) |
1711 | if (DestClobber->getBlock() == M->getParent()) |
1712 | if (processMemSetMemCpyDependence(MemCpy: M, MemSet: MDep, BAA)) |
1713 | return true; |
1714 | |
1715 | MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess( |
1716 | AnyClobber, MemoryLocation::getForSource(MTI: M), AA&: BAA); |
1717 | |
1718 | // There are five possible optimizations we can do for memcpy: |
1719 | // a) memcpy-memcpy xform which exposes redundance for DSE. |
1720 | // b) call-memcpy xform for return slot optimization. |
1721 | // c) memcpy from freshly alloca'd space or space that has just started |
1722 | // its lifetime copies undefined data, and we can therefore eliminate |
1723 | // the memcpy in favor of the data that was already at the destination. |
1724 | // d) memcpy from a just-memset'd source can be turned into memset. |
1725 | // e) elimination of memcpy via stack-move optimization. |
1726 | if (auto *MD = dyn_cast<MemoryDef>(Val: SrcClobber)) { |
1727 | if (Instruction *MI = MD->getMemoryInst()) { |
1728 | if (auto *CopySize = dyn_cast<ConstantInt>(Val: M->getLength())) { |
1729 | if (auto *C = dyn_cast<CallInst>(Val: MI)) { |
1730 | if (performCallSlotOptzn(cpyLoad: M, cpyStore: M, cpyDest: M->getDest(), cpySrc: M->getSource(), |
1731 | cpySize: TypeSize::getFixed(ExactSize: CopySize->getZExtValue()), |
1732 | cpyDestAlign: M->getDestAlign().valueOrOne(), BAA, |
1733 | GetC: [C]() -> CallInst * { return C; })) { |
1734 | LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n" |
1735 | << " call: " << *C << "\n" |
1736 | << " memcpy: " << *M << "\n" ); |
1737 | eraseInstruction(I: M); |
1738 | ++NumMemCpyInstr; |
1739 | return true; |
1740 | } |
1741 | } |
1742 | } |
1743 | if (auto *MDep = dyn_cast<MemCpyInst>(Val: MI)) |
1744 | if (processMemCpyMemCpyDependence(M, MDep, BAA)) |
1745 | return true; |
1746 | if (auto *MDep = dyn_cast<MemSetInst>(Val: MI)) { |
1747 | if (performMemCpyToMemSetOptzn(MemCpy: M, MemSet: MDep, BAA)) { |
1748 | LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n" ); |
1749 | eraseInstruction(I: M); |
1750 | ++NumCpyToSet; |
1751 | return true; |
1752 | } |
1753 | } |
1754 | } |
1755 | |
1756 | if (hasUndefContents(MSSA, AA&: BAA, V: M->getSource(), Def: MD, Size: M->getLength())) { |
1757 | LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n" ); |
1758 | eraseInstruction(I: M); |
1759 | ++NumMemCpyInstr; |
1760 | return true; |
1761 | } |
1762 | } |
1763 | |
1764 | // If the transfer is from a stack slot to a stack slot, then we may be able |
1765 | // to perform the stack-move optimization. See the comments in |
1766 | // performStackMoveOptzn() for more details. |
1767 | auto *DestAlloca = dyn_cast<AllocaInst>(Val: M->getDest()); |
1768 | if (!DestAlloca) |
1769 | return false; |
1770 | auto *SrcAlloca = dyn_cast<AllocaInst>(Val: M->getSource()); |
1771 | if (!SrcAlloca) |
1772 | return false; |
1773 | ConstantInt *Len = dyn_cast<ConstantInt>(Val: M->getLength()); |
1774 | if (Len == nullptr) |
1775 | return false; |
1776 | if (performStackMoveOptzn(Load: M, Store: M, DestAlloca, SrcAlloca, |
1777 | Size: TypeSize::getFixed(ExactSize: Len->getZExtValue()), BAA)) { |
1778 | // Avoid invalidating the iterator. |
1779 | BBI = M->getNextNonDebugInstruction()->getIterator(); |
1780 | eraseInstruction(I: M); |
1781 | ++NumMemCpyInstr; |
1782 | return true; |
1783 | } |
1784 | |
1785 | return false; |
1786 | } |
1787 | |
1788 | /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed |
1789 | /// not to alias. |
1790 | bool MemCpyOptPass::processMemMove(MemMoveInst *M) { |
1791 | // See if the source could be modified by this memmove potentially. |
1792 | if (isModSet(MRI: AA->getModRefInfo(I: M, OptLoc: MemoryLocation::getForSource(MTI: M)))) |
1793 | return false; |
1794 | |
1795 | LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M |
1796 | << "\n" ); |
1797 | |
1798 | // If not, then we know we can transform this. |
1799 | Type *ArgTys[3] = { M->getRawDest()->getType(), |
1800 | M->getRawSource()->getType(), |
1801 | M->getLength()->getType() }; |
1802 | M->setCalledFunction(Intrinsic::getDeclaration(M: M->getModule(), |
1803 | Intrinsic::id: memcpy, Tys: ArgTys)); |
1804 | |
1805 | // For MemorySSA nothing really changes (except that memcpy may imply stricter |
1806 | // aliasing guarantees). |
1807 | |
1808 | ++NumMoveToCpy; |
1809 | return true; |
1810 | } |
1811 | |
1812 | /// This is called on every byval argument in call sites. |
1813 | bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { |
1814 | const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout(); |
1815 | // Find out what feeds this byval argument. |
1816 | Value *ByValArg = CB.getArgOperand(i: ArgNo); |
1817 | Type *ByValTy = CB.getParamByValType(ArgNo); |
1818 | TypeSize ByValSize = DL.getTypeAllocSize(Ty: ByValTy); |
1819 | MemoryLocation Loc(ByValArg, LocationSize::precise(Value: ByValSize)); |
1820 | MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(I: &CB); |
1821 | if (!CallAccess) |
1822 | return false; |
1823 | MemCpyInst *MDep = nullptr; |
1824 | BatchAAResults BAA(*AA); |
1825 | MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( |
1826 | CallAccess->getDefiningAccess(), Loc, AA&: BAA); |
1827 | if (auto *MD = dyn_cast<MemoryDef>(Val: Clobber)) |
1828 | MDep = dyn_cast_or_null<MemCpyInst>(Val: MD->getMemoryInst()); |
1829 | |
1830 | // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by |
1831 | // a memcpy, see if we can byval from the source of the memcpy instead of the |
1832 | // result. |
1833 | if (!MDep || MDep->isVolatile() || |
1834 | ByValArg->stripPointerCasts() != MDep->getDest()) |
1835 | return false; |
1836 | |
1837 | // The length of the memcpy must be larger or equal to the size of the byval. |
1838 | auto *C1 = dyn_cast<ConstantInt>(Val: MDep->getLength()); |
1839 | if (!C1 || !TypeSize::isKnownGE( |
1840 | LHS: TypeSize::getFixed(ExactSize: C1->getValue().getZExtValue()), RHS: ByValSize)) |
1841 | return false; |
1842 | |
1843 | // Get the alignment of the byval. If the call doesn't specify the alignment, |
1844 | // then it is some target specific value that we can't know. |
1845 | MaybeAlign ByValAlign = CB.getParamAlign(ArgNo); |
1846 | if (!ByValAlign) return false; |
1847 | |
1848 | // If it is greater than the memcpy, then we check to see if we can force the |
1849 | // source of the memcpy to the alignment we need. If we fail, we bail out. |
1850 | MaybeAlign MemDepAlign = MDep->getSourceAlign(); |
1851 | if ((!MemDepAlign || *MemDepAlign < *ByValAlign) && |
1852 | getOrEnforceKnownAlignment(V: MDep->getSource(), PrefAlign: ByValAlign, DL, CxtI: &CB, AC, |
1853 | DT) < *ByValAlign) |
1854 | return false; |
1855 | |
1856 | // The type of the memcpy source must match the byval argument |
1857 | if (MDep->getSource()->getType() != ByValArg->getType()) |
1858 | return false; |
1859 | |
1860 | // Verify that the copied-from memory doesn't change in between the memcpy and |
1861 | // the byval call. |
1862 | // memcpy(a <- b) |
1863 | // *b = 42; |
1864 | // foo(*a) |
1865 | // It would be invalid to transform the second memcpy into foo(*b). |
1866 | if (writtenBetween(MSSA, AA&: BAA, Loc: MemoryLocation::getForSource(MTI: MDep), |
1867 | Start: MSSA->getMemoryAccess(I: MDep), End: CallAccess)) |
1868 | return false; |
1869 | |
1870 | LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n" |
1871 | << " " << *MDep << "\n" |
1872 | << " " << CB << "\n" ); |
1873 | |
1874 | // Otherwise we're good! Update the byval argument. |
1875 | combineAAMetadata(ReplInst: &CB, I: MDep); |
1876 | CB.setArgOperand(i: ArgNo, v: MDep->getSource()); |
1877 | ++NumMemCpyInstr; |
1878 | return true; |
1879 | } |
1880 | |
1881 | /// This is called on memcpy dest pointer arguments attributed as immutable |
1882 | /// during call. Try to use memcpy source directly if all of the following |
1883 | /// conditions are satisfied. |
1884 | /// 1. The memcpy dst is neither modified during the call nor captured by the |
1885 | /// call. (if readonly, noalias, nocapture attributes on call-site.) |
1886 | /// 2. The memcpy dst is an alloca with known alignment & size. |
1887 | /// 2-1. The memcpy length == the alloca size which ensures that the new |
1888 | /// pointer is dereferenceable for the required range |
1889 | /// 2-2. The src pointer has alignment >= the alloca alignment or can be |
1890 | /// enforced so. |
1891 | /// 3. The memcpy dst and src is not modified between the memcpy and the call. |
1892 | /// (if MSSA clobber check is safe.) |
1893 | /// 4. The memcpy src is not modified during the call. (ModRef check shows no |
1894 | /// Mod.) |
1895 | bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) { |
1896 | // 1. Ensure passed argument is immutable during call. |
1897 | if (!(CB.paramHasAttr(ArgNo, Attribute::Kind: NoAlias) && |
1898 | CB.paramHasAttr(ArgNo, Attribute::Kind: NoCapture))) |
1899 | return false; |
1900 | const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout(); |
1901 | Value *ImmutArg = CB.getArgOperand(i: ArgNo); |
1902 | |
1903 | // 2. Check that arg is alloca |
1904 | // TODO: Even if the arg gets back to branches, we can remove memcpy if all |
1905 | // the alloca alignments can be enforced to source alignment. |
1906 | auto *AI = dyn_cast<AllocaInst>(Val: ImmutArg->stripPointerCasts()); |
1907 | if (!AI) |
1908 | return false; |
1909 | |
1910 | std::optional<TypeSize> AllocaSize = AI->getAllocationSize(DL); |
1911 | // Can't handle unknown size alloca. |
1912 | // (e.g. Variable Length Array, Scalable Vector) |
1913 | if (!AllocaSize || AllocaSize->isScalable()) |
1914 | return false; |
1915 | MemoryLocation Loc(ImmutArg, LocationSize::precise(Value: *AllocaSize)); |
1916 | MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(I: &CB); |
1917 | if (!CallAccess) |
1918 | return false; |
1919 | |
1920 | MemCpyInst *MDep = nullptr; |
1921 | BatchAAResults BAA(*AA); |
1922 | MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( |
1923 | CallAccess->getDefiningAccess(), Loc, AA&: BAA); |
1924 | if (auto *MD = dyn_cast<MemoryDef>(Val: Clobber)) |
1925 | MDep = dyn_cast_or_null<MemCpyInst>(Val: MD->getMemoryInst()); |
1926 | |
1927 | // If the immut argument isn't fed by a memcpy, ignore it. If it is fed by |
1928 | // a memcpy, check that the arg equals the memcpy dest. |
1929 | if (!MDep || MDep->isVolatile() || AI != MDep->getDest()) |
1930 | return false; |
1931 | |
1932 | // The type of the memcpy source must match the immut argument |
1933 | if (MDep->getSource()->getType() != ImmutArg->getType()) |
1934 | return false; |
1935 | |
1936 | // 2-1. The length of the memcpy must be equal to the size of the alloca. |
1937 | auto *MDepLen = dyn_cast<ConstantInt>(Val: MDep->getLength()); |
1938 | if (!MDepLen || AllocaSize != MDepLen->getValue()) |
1939 | return false; |
1940 | |
1941 | // 2-2. the memcpy source align must be larger than or equal the alloca's |
1942 | // align. If not so, we check to see if we can force the source of the memcpy |
1943 | // to the alignment we need. If we fail, we bail out. |
1944 | Align MemDepAlign = MDep->getSourceAlign().valueOrOne(); |
1945 | Align AllocaAlign = AI->getAlign(); |
1946 | if (MemDepAlign < AllocaAlign && |
1947 | getOrEnforceKnownAlignment(V: MDep->getSource(), PrefAlign: AllocaAlign, DL, CxtI: &CB, AC, |
1948 | DT) < AllocaAlign) |
1949 | return false; |
1950 | |
1951 | // 3. Verify that the source doesn't change in between the memcpy and |
1952 | // the call. |
1953 | // memcpy(a <- b) |
1954 | // *b = 42; |
1955 | // foo(*a) |
1956 | // It would be invalid to transform the second memcpy into foo(*b). |
1957 | if (writtenBetween(MSSA, AA&: BAA, Loc: MemoryLocation::getForSource(MTI: MDep), |
1958 | Start: MSSA->getMemoryAccess(I: MDep), End: CallAccess)) |
1959 | return false; |
1960 | |
1961 | // 4. The memcpy src must not be modified during the call. |
1962 | if (isModSet(MRI: AA->getModRefInfo(I: &CB, OptLoc: MemoryLocation::getForSource(MTI: MDep)))) |
1963 | return false; |
1964 | |
1965 | LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to Immut src:\n" |
1966 | << " " << *MDep << "\n" |
1967 | << " " << CB << "\n" ); |
1968 | |
1969 | // Otherwise we're good! Update the immut argument. |
1970 | combineAAMetadata(ReplInst: &CB, I: MDep); |
1971 | CB.setArgOperand(i: ArgNo, v: MDep->getSource()); |
1972 | ++NumMemCpyInstr; |
1973 | return true; |
1974 | } |
1975 | |
1976 | /// Executes one iteration of MemCpyOptPass. |
1977 | bool MemCpyOptPass::iterateOnFunction(Function &F) { |
1978 | bool MadeChange = false; |
1979 | |
1980 | // Walk all instruction in the function. |
1981 | for (BasicBlock &BB : F) { |
1982 | // Skip unreachable blocks. For example processStore assumes that an |
1983 | // instruction in a BB can't be dominated by a later instruction in the |
1984 | // same BB (which is a scenario that can happen for an unreachable BB that |
1985 | // has itself as a predecessor). |
1986 | if (!DT->isReachableFromEntry(A: &BB)) |
1987 | continue; |
1988 | |
1989 | for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { |
1990 | // Avoid invalidating the iterator. |
1991 | Instruction *I = &*BI++; |
1992 | |
1993 | bool RepeatInstruction = false; |
1994 | |
1995 | if (auto *SI = dyn_cast<StoreInst>(Val: I)) |
1996 | MadeChange |= processStore(SI, BBI&: BI); |
1997 | else if (auto *M = dyn_cast<MemSetInst>(Val: I)) |
1998 | RepeatInstruction = processMemSet(MSI: M, BBI&: BI); |
1999 | else if (auto *M = dyn_cast<MemCpyInst>(Val: I)) |
2000 | RepeatInstruction = processMemCpy(M, BBI&: BI); |
2001 | else if (auto *M = dyn_cast<MemMoveInst>(Val: I)) |
2002 | RepeatInstruction = processMemMove(M); |
2003 | else if (auto *CB = dyn_cast<CallBase>(Val: I)) { |
2004 | for (unsigned i = 0, e = CB->arg_size(); i != e; ++i) { |
2005 | if (CB->isByValArgument(ArgNo: i)) |
2006 | MadeChange |= processByValArgument(CB&: *CB, ArgNo: i); |
2007 | else if (CB->onlyReadsMemory(OpNo: i)) |
2008 | MadeChange |= processImmutArgument(CB&: *CB, ArgNo: i); |
2009 | } |
2010 | } |
2011 | |
2012 | // Reprocess the instruction if desired. |
2013 | if (RepeatInstruction) { |
2014 | if (BI != BB.begin()) |
2015 | --BI; |
2016 | MadeChange = true; |
2017 | } |
2018 | } |
2019 | } |
2020 | |
2021 | return MadeChange; |
2022 | } |
2023 | |
2024 | PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) { |
2025 | auto &TLI = AM.getResult<TargetLibraryAnalysis>(IR&: F); |
2026 | auto *AA = &AM.getResult<AAManager>(IR&: F); |
2027 | auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F); |
2028 | auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F); |
2029 | auto *PDT = &AM.getResult<PostDominatorTreeAnalysis>(IR&: F); |
2030 | auto *MSSA = &AM.getResult<MemorySSAAnalysis>(IR&: F); |
2031 | |
2032 | bool MadeChange = runImpl(F, TLI: &TLI, AA, AC, DT, PDT, MSSA: &MSSA->getMSSA()); |
2033 | if (!MadeChange) |
2034 | return PreservedAnalyses::all(); |
2035 | |
2036 | PreservedAnalyses PA; |
2037 | PA.preserveSet<CFGAnalyses>(); |
2038 | PA.preserve<MemorySSAAnalysis>(); |
2039 | return PA; |
2040 | } |
2041 | |
2042 | bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_, |
2043 | AliasAnalysis *AA_, AssumptionCache *AC_, |
2044 | DominatorTree *DT_, PostDominatorTree *PDT_, |
2045 | MemorySSA *MSSA_) { |
2046 | bool MadeChange = false; |
2047 | TLI = TLI_; |
2048 | AA = AA_; |
2049 | AC = AC_; |
2050 | DT = DT_; |
2051 | PDT = PDT_; |
2052 | MSSA = MSSA_; |
2053 | MemorySSAUpdater MSSAU_(MSSA_); |
2054 | MSSAU = &MSSAU_; |
2055 | |
2056 | while (true) { |
2057 | if (!iterateOnFunction(F)) |
2058 | break; |
2059 | MadeChange = true; |
2060 | } |
2061 | |
2062 | if (VerifyMemorySSA) |
2063 | MSSA_->verifyMemorySSA(); |
2064 | |
2065 | return MadeChange; |
2066 | } |
2067 | |