1//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file defines classes for searching and analyzing source code clones.
11///
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
15#define LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
16
17#include "clang/AST/StmtVisitor.h"
18#include "llvm/Support/Regex.h"
19#include <vector>
20
21namespace clang {
22
23class Stmt;
24class Decl;
25class VarDecl;
26class ASTContext;
27class CompoundStmt;
28
29/// Identifies a list of statements.
30///
31/// Can either identify a single arbitrary Stmt object, a continuous sequence of
32/// child statements inside a CompoundStmt or no statements at all.
33class StmtSequence {
34 /// If this object identifies a sequence of statements inside a CompoundStmt,
35 /// S points to this CompoundStmt. If this object only identifies a single
36 /// Stmt, then S is a pointer to this Stmt.
37 const Stmt *S;
38
39 /// The declaration that contains the statements.
40 const Decl *D;
41
42 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
43 /// instance is representing the CompoundStmt children inside the array
44 /// [StartIndex, EndIndex).
45 unsigned StartIndex;
46 unsigned EndIndex;
47
48public:
49 /// Constructs a StmtSequence holding multiple statements.
50 ///
51 /// The resulting StmtSequence identifies a continuous sequence of statements
52 /// in the body of the given CompoundStmt. Which statements of the body should
53 /// be identified needs to be specified by providing a start and end index
54 /// that describe a non-empty sub-array in the body of the given CompoundStmt.
55 ///
56 /// \param Stmt A CompoundStmt that contains all statements in its body.
57 /// \param D The Decl containing this Stmt.
58 /// \param StartIndex The inclusive start index in the children array of
59 /// \p Stmt
60 /// \param EndIndex The exclusive end index in the children array of \p Stmt.
61 StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
62 unsigned EndIndex);
63
64 /// Constructs a StmtSequence holding a single statement.
65 ///
66 /// \param Stmt An arbitrary Stmt.
67 /// \param D The Decl containing this Stmt.
68 StmtSequence(const Stmt *Stmt, const Decl *D);
69
70 /// Constructs an empty StmtSequence.
71 StmtSequence();
72
73 typedef const Stmt *const *iterator;
74
75 /// Returns an iterator pointing to the first statement in this sequence.
76 iterator begin() const;
77
78 /// Returns an iterator pointing behind the last statement in this sequence.
79 iterator end() const;
80
81 /// Returns the first statement in this sequence.
82 ///
83 /// This method should only be called on a non-empty StmtSequence object.
84 const Stmt *front() const {
85 assert(!empty());
86 return begin()[0];
87 }
88
89 /// Returns the last statement in this sequence.
90 ///
91 /// This method should only be called on a non-empty StmtSequence object.
92 const Stmt *back() const {
93 assert(!empty());
94 return begin()[size() - 1];
95 }
96
97 /// Returns the number of statements this object holds.
98 unsigned size() const {
99 if (holdsSequence())
100 return EndIndex - StartIndex;
101 if (S == nullptr)
102 return 0;
103 return 1;
104 }
105
106 /// Returns true if and only if this StmtSequence contains no statements.
107 bool empty() const { return size() == 0; }
108
109 /// Returns the related ASTContext for the stored Stmts.
110 ASTContext &getASTContext() const;
111
112 /// Returns the declaration that contains the stored Stmts.
113 const Decl *getContainingDecl() const {
114 assert(D);
115 return D;
116 }
117
118 /// Returns true if this objects holds a list of statements.
119 bool holdsSequence() const { return EndIndex != 0; }
120
121 /// Returns the start sourcelocation of the first statement in this sequence.
122 ///
123 /// This method should only be called on a non-empty StmtSequence object.
124 SourceLocation getBeginLoc() const;
125
126 /// Returns the end sourcelocation of the last statement in this sequence.
127 ///
128 /// This method should only be called on a non-empty StmtSequence object.
129 SourceLocation getEndLoc() const;
130
131 /// Returns the source range of the whole sequence - from the beginning
132 /// of the first statement to the end of the last statement.
133 SourceRange getSourceRange() const;
134
135 bool operator==(const StmtSequence &Other) const {
136 return std::tie(args: S, args: StartIndex, args: EndIndex) ==
137 std::tie(args: Other.S, args: Other.StartIndex, args: Other.EndIndex);
138 }
139
140 bool operator!=(const StmtSequence &Other) const {
141 return std::tie(args: S, args: StartIndex, args: EndIndex) !=
142 std::tie(args: Other.S, args: Other.StartIndex, args: Other.EndIndex);
143 }
144
145 /// Returns true if and only if this sequence covers a source range that
146 /// contains the source range of the given sequence \p Other.
147 ///
148 /// This method should only be called on a non-empty StmtSequence object
149 /// and passed a non-empty StmtSequence object.
150 bool contains(const StmtSequence &Other) const;
151};
152
153/// Searches for similar subtrees in the AST.
154///
155/// First, this class needs several declarations with statement bodies which
156/// can be passed via analyzeCodeBody. Afterwards all statements can be
157/// searched for clones by calling findClones with a given list of constraints
158/// that should specify the wanted properties of the clones.
159///
160/// The result of findClones can be further constrained with the constrainClones
161/// method.
162///
163/// This class only searches for clones in executable source code
164/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
165/// are not supported.
166class CloneDetector {
167
168public:
169 /// A collection of StmtSequences that share an arbitrary property.
170 typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
171
172 /// Generates and stores search data for all statements in the body of
173 /// the given Decl.
174 void analyzeCodeBody(const Decl *D);
175
176 /// Constrains the given list of clone groups with the given constraint.
177 ///
178 /// The constraint is expected to have a method with the signature
179 /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
180 /// as this is the interface that the CloneDetector uses for applying the
181 /// constraint. The constraint is supposed to directly modify the passed list
182 /// so that all clones in the list fulfill the specific property this
183 /// constraint ensures.
184 template <typename T>
185 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
186 C.constrain(CloneGroups);
187 }
188
189 /// Constrains the given list of clone groups with the given list of
190 /// constraints.
191 ///
192 /// The constraints are applied in sequence in the order in which they are
193 /// passed to this function.
194 template <typename T1, typename... Ts>
195 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
196 Ts... ConstraintList) {
197 constrainClones(CloneGroups, C);
198 constrainClones(CloneGroups, ConstraintList...);
199 }
200
201 /// Searches for clones in all previously passed statements.
202 /// \param Result Output parameter to which all created clone groups are
203 /// added.
204 /// \param ConstraintList The constraints that should be applied to the
205 // result.
206 template <typename... Ts>
207 void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
208 // The initial assumption is that there is only one clone group and every
209 // statement is a clone of the others. This clone group will then be
210 // split up with the help of the constraints.
211 Result.push_back(x: Sequences);
212
213 constrainClones(Result, ConstraintList...);
214 }
215
216private:
217 CloneGroup Sequences;
218};
219
220/// This class is a utility class that contains utility functions for building
221/// custom constraints.
222class CloneConstraint {
223public:
224 /// Removes all groups by using a filter function.
225 /// \param CloneGroups The list of CloneGroups that is supposed to be
226 /// filtered.
227 /// \param Filter The filter function that should return true for all groups
228 /// that should be removed from the list.
229 static void filterGroups(
230 std::vector<CloneDetector::CloneGroup> &CloneGroups,
231 llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
232 llvm::erase_if(C&: CloneGroups, P: Filter);
233 }
234
235 /// Splits the given CloneGroups until the given Compare function returns true
236 /// for all clones in a single group.
237 /// \param CloneGroups A list of CloneGroups that should be modified.
238 /// \param Compare The comparison function that all clones are supposed to
239 /// pass. Should return true if and only if two clones belong
240 /// to the same CloneGroup.
241 static void splitCloneGroups(
242 std::vector<CloneDetector::CloneGroup> &CloneGroups,
243 llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
244 Compare);
245};
246
247/// This constraint moves clones into clone groups of type II via hashing.
248///
249/// Clones with different hash values are moved into separate clone groups.
250/// Collisions are possible, and this constraint does nothing to address this
251/// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
252/// constraint chain, not necessarily immediately, to eliminate hash collisions
253/// through a more detailed analysis.
254class RecursiveCloneTypeIIHashConstraint {
255public:
256 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
257};
258
259/// This constraint moves clones into clone groups of type II by comparing them.
260///
261/// Clones that aren't type II clones are moved into separate clone groups.
262/// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
263/// group are guaranteed to be type II clones of each other, but it is too
264/// slow to efficiently handle large amounts of clones.
265class RecursiveCloneTypeIIVerifyConstraint {
266public:
267 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
268};
269
270/// Ensures that every clone has at least the given complexity.
271///
272/// Complexity is here defined as the total amount of children of a statement.
273/// This constraint assumes the first statement in the group is representative
274/// for all other statements in the group in terms of complexity.
275class MinComplexityConstraint {
276 unsigned MinComplexity;
277
278public:
279 MinComplexityConstraint(unsigned MinComplexity)
280 : MinComplexity(MinComplexity) {}
281
282 /// Calculates the complexity of the given StmtSequence.
283 /// \param Limit The limit of complexity we probe for. After reaching
284 /// this limit during calculation, this method is exiting
285 /// early to improve performance and returns this limit.
286 size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
287 const std::string &ParentMacroStack = "");
288
289 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
290 CloneConstraint::filterGroups(
291 CloneGroups, Filter: [this](const CloneDetector::CloneGroup &A) {
292 if (!A.empty())
293 return calculateStmtComplexity(Seq: A.front(), Limit: MinComplexity) <
294 MinComplexity;
295 else
296 return false;
297 });
298 }
299};
300
301/// Ensures that all clone groups contain at least the given amount of clones.
302class MinGroupSizeConstraint {
303 unsigned MinGroupSize;
304
305public:
306 MinGroupSizeConstraint(unsigned MinGroupSize = 2)
307 : MinGroupSize(MinGroupSize) {}
308
309 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
310 CloneConstraint::filterGroups(CloneGroups,
311 Filter: [this](const CloneDetector::CloneGroup &A) {
312 return A.size() < MinGroupSize;
313 });
314 }
315};
316
317/// Ensures that no clone group fully contains another clone group.
318struct OnlyLargestCloneConstraint {
319 void constrain(std::vector<CloneDetector::CloneGroup> &Result);
320};
321
322struct FilenamePatternConstraint {
323 StringRef IgnoredFilesPattern;
324 std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
325
326 FilenamePatternConstraint(StringRef IgnoredFilesPattern)
327 : IgnoredFilesPattern(IgnoredFilesPattern) {
328 IgnoredFilesRegex = std::make_shared<llvm::Regex>(args: "^(" +
329 IgnoredFilesPattern.str() + "$)");
330 }
331
332 bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
333
334 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
335 CloneConstraint::filterGroups(
336 CloneGroups, Filter: [this](const CloneDetector::CloneGroup &Group) {
337 return isAutoGenerated(Group);
338 });
339 }
340};
341
342/// Analyzes the pattern of the referenced variables in a statement.
343class VariablePattern {
344
345 /// Describes an occurrence of a variable reference in a statement.
346 struct VariableOccurence {
347 /// The index of the associated VarDecl in the Variables vector.
348 size_t KindID;
349 /// The statement in the code where the variable was referenced.
350 const Stmt *Mention;
351
352 VariableOccurence(size_t KindID, const Stmt *Mention)
353 : KindID(KindID), Mention(Mention) {}
354 };
355
356 /// All occurrences of referenced variables in the order of appearance.
357 std::vector<VariableOccurence> Occurences;
358 /// List of referenced variables in the order of appearance.
359 /// Every item in this list is unique.
360 std::vector<const VarDecl *> Variables;
361
362 /// Adds a new variable referenced to this pattern.
363 /// \param VarDecl The declaration of the variable that is referenced.
364 /// \param Mention The SourceRange where this variable is referenced.
365 void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
366
367 /// Adds each referenced variable from the given statement.
368 void addVariables(const Stmt *S);
369
370public:
371 /// Creates an VariablePattern object with information about the given
372 /// StmtSequence.
373 VariablePattern(const StmtSequence &Sequence) {
374 for (const Stmt *S : Sequence)
375 addVariables(S);
376 }
377
378 /// Describes two clones that reference their variables in a different pattern
379 /// which could indicate a programming error.
380 struct SuspiciousClonePair {
381 /// Utility class holding the relevant information about a single
382 /// clone in this pair.
383 struct SuspiciousCloneInfo {
384 /// The variable which referencing in this clone was against the pattern.
385 const VarDecl *Variable;
386 /// Where the variable was referenced.
387 const Stmt *Mention;
388 /// The variable that should have been referenced to follow the pattern.
389 /// If Suggestion is a nullptr then it's not possible to fix the pattern
390 /// by referencing a different variable in this clone.
391 const VarDecl *Suggestion;
392 SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
393 const VarDecl *Suggestion)
394 : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
395 SuspiciousCloneInfo() {}
396 };
397 /// The first clone in the pair which always has a suggested variable.
398 SuspiciousCloneInfo FirstCloneInfo;
399 /// This other clone in the pair which can have a suggested variable.
400 SuspiciousCloneInfo SecondCloneInfo;
401 };
402
403 /// Counts the differences between this pattern and the given one.
404 /// \param Other The given VariablePattern to compare with.
405 /// \param FirstMismatch Output parameter that will be filled with information
406 /// about the first difference between the two patterns. This parameter
407 /// can be a nullptr, in which case it will be ignored.
408 /// \return Returns the number of differences between the pattern this object
409 /// is following and the given VariablePattern.
410 ///
411 /// For example, the following statements all have the same pattern and this
412 /// function would return zero:
413 ///
414 /// if (a < b) return a; return b;
415 /// if (x < y) return x; return y;
416 /// if (u2 < u1) return u2; return u1;
417 ///
418 /// But the following statement has a different pattern (note the changed
419 /// variables in the return statements) and would have two differences when
420 /// compared with one of the statements above.
421 ///
422 /// if (a < b) return b; return a;
423 ///
424 /// This function should only be called if the related statements of the given
425 /// pattern and the statements of this objects are clones of each other.
426 unsigned countPatternDifferences(
427 const VariablePattern &Other,
428 VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
429};
430
431/// Ensures that all clones reference variables in the same pattern.
432struct MatchingVariablePatternConstraint {
433 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
434};
435
436} // end namespace clang
437
438#endif // LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
439

source code of clang/include/clang/Analysis/CloneDetection.h