clang 20.0.0git
CloneDetection.h
Go to the documentation of this file.
1//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file defines classes for searching and analyzing source code clones.
11///
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
15#define LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
16
18#include "llvm/Support/Regex.h"
19#include <vector>
20
21namespace clang {
22
23class Stmt;
24class Decl;
25class VarDecl;
26class ASTContext;
27class CompoundStmt;
28
29/// Identifies a list of statements.
30///
31/// Can either identify a single arbitrary Stmt object, a continuous sequence of
32/// child statements inside a CompoundStmt or no statements at all.
34 /// If this object identifies a sequence of statements inside a CompoundStmt,
35 /// S points to this CompoundStmt. If this object only identifies a single
36 /// Stmt, then S is a pointer to this Stmt.
37 const Stmt *S;
38
39 /// The declaration that contains the statements.
40 const Decl *D;
41
42 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
43 /// instance is representing the CompoundStmt children inside the array
44 /// [StartIndex, EndIndex).
45 unsigned StartIndex;
46 unsigned EndIndex;
47
48public:
49 /// Constructs a StmtSequence holding multiple statements.
50 ///
51 /// The resulting StmtSequence identifies a continuous sequence of statements
52 /// in the body of the given CompoundStmt. Which statements of the body should
53 /// be identified needs to be specified by providing a start and end index
54 /// that describe a non-empty sub-array in the body of the given CompoundStmt.
55 ///
56 /// \param Stmt A CompoundStmt that contains all statements in its body.
57 /// \param D The Decl containing this Stmt.
58 /// \param StartIndex The inclusive start index in the children array of
59 /// \p Stmt
60 /// \param EndIndex The exclusive end index in the children array of \p Stmt.
61 StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
62 unsigned EndIndex);
63
64 /// Constructs a StmtSequence holding a single statement.
65 ///
66 /// \param Stmt An arbitrary Stmt.
67 /// \param D The Decl containing this Stmt.
68 StmtSequence(const Stmt *Stmt, const Decl *D);
69
70 /// Constructs an empty StmtSequence.
72
73 typedef const Stmt *const *iterator;
74
75 /// Returns an iterator pointing to the first statement in this sequence.
76 iterator begin() const;
77
78 /// Returns an iterator pointing behind the last statement in this sequence.
79 iterator end() const;
80
81 /// Returns the first statement in this sequence.
82 ///
83 /// This method should only be called on a non-empty StmtSequence object.
84 const Stmt *front() const {
85 assert(!empty());
86 return begin()[0];
87 }
88
89 /// Returns the last statement in this sequence.
90 ///
91 /// This method should only be called on a non-empty StmtSequence object.
92 const Stmt *back() const {
93 assert(!empty());
94 return begin()[size() - 1];
95 }
96
97 /// Returns the number of statements this object holds.
98 unsigned size() const {
99 if (holdsSequence())
100 return EndIndex - StartIndex;
101 if (S == nullptr)
102 return 0;
103 return 1;
104 }
105
106 /// Returns true if and only if this StmtSequence contains no statements.
107 bool empty() const { return size() == 0; }
108
109 /// Returns the related ASTContext for the stored Stmts.
110 ASTContext &getASTContext() const;
111
112 /// Returns the declaration that contains the stored Stmts.
113 const Decl *getContainingDecl() const {
114 assert(D);
115 return D;
116 }
117
118 /// Returns true if this objects holds a list of statements.
119 bool holdsSequence() const { return EndIndex != 0; }
120
121 /// Returns the start sourcelocation of the first statement in this sequence.
122 ///
123 /// This method should only be called on a non-empty StmtSequence object.
125
126 /// Returns the end sourcelocation of the last statement in this sequence.
127 ///
128 /// This method should only be called on a non-empty StmtSequence object.
130
131 /// Returns the source range of the whole sequence - from the beginning
132 /// of the first statement to the end of the last statement.
134
135 bool operator==(const StmtSequence &Other) const {
136 return std::tie(S, StartIndex, EndIndex) ==
137 std::tie(Other.S, Other.StartIndex, Other.EndIndex);
138 }
139
140 bool operator!=(const StmtSequence &Other) const {
141 return std::tie(S, StartIndex, EndIndex) !=
142 std::tie(Other.S, Other.StartIndex, Other.EndIndex);
143 }
144
145 /// Returns true if and only if this sequence covers a source range that
146 /// contains the source range of the given sequence \p Other.
147 ///
148 /// This method should only be called on a non-empty StmtSequence object
149 /// and passed a non-empty StmtSequence object.
150 bool contains(const StmtSequence &Other) const;
151};
152
153/// Searches for similar subtrees in the AST.
154///
155/// First, this class needs several declarations with statement bodies which
156/// can be passed via analyzeCodeBody. Afterwards all statements can be
157/// searched for clones by calling findClones with a given list of constraints
158/// that should specify the wanted properties of the clones.
159///
160/// The result of findClones can be further constrained with the constrainClones
161/// method.
162///
163/// This class only searches for clones in executable source code
164/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
165/// are not supported.
167
168public:
169 /// A collection of StmtSequences that share an arbitrary property.
171
172 /// Generates and stores search data for all statements in the body of
173 /// the given Decl.
174 void analyzeCodeBody(const Decl *D);
175
176 /// Constrains the given list of clone groups with the given constraint.
177 ///
178 /// The constraint is expected to have a method with the signature
179 /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
180 /// as this is the interface that the CloneDetector uses for applying the
181 /// constraint. The constraint is supposed to directly modify the passed list
182 /// so that all clones in the list fulfill the specific property this
183 /// constraint ensures.
184 template <typename T>
185 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
186 C.constrain(CloneGroups);
187 }
188
189 /// Constrains the given list of clone groups with the given list of
190 /// constraints.
191 ///
192 /// The constraints are applied in sequence in the order in which they are
193 /// passed to this function.
194 template <typename T1, typename... Ts>
195 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
196 Ts... ConstraintList) {
197 constrainClones(CloneGroups, C);
198 constrainClones(CloneGroups, ConstraintList...);
199 }
200
201 /// Searches for clones in all previously passed statements.
202 /// \param Result Output parameter to which all created clone groups are
203 /// added.
204 /// \param ConstraintList The constraints that should be applied to the
205 // result.
206 template <typename... Ts>
207 void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
208 // The initial assumption is that there is only one clone group and every
209 // statement is a clone of the others. This clone group will then be
210 // split up with the help of the constraints.
211 Result.push_back(Sequences);
212
213 constrainClones(Result, ConstraintList...);
214 }
215
216private:
217 CloneGroup Sequences;
218};
219
220/// This class is a utility class that contains utility functions for building
221/// custom constraints.
223public:
224 /// Removes all groups by using a filter function.
225 /// \param CloneGroups The list of CloneGroups that is supposed to be
226 /// filtered.
227 /// \param Filter The filter function that should return true for all groups
228 /// that should be removed from the list.
229 static void filterGroups(
230 std::vector<CloneDetector::CloneGroup> &CloneGroups,
231 llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
232 llvm::erase_if(CloneGroups, Filter);
233 }
234
235 /// Splits the given CloneGroups until the given Compare function returns true
236 /// for all clones in a single group.
237 /// \param CloneGroups A list of CloneGroups that should be modified.
238 /// \param Compare The comparison function that all clones are supposed to
239 /// pass. Should return true if and only if two clones belong
240 /// to the same CloneGroup.
241 static void splitCloneGroups(
242 std::vector<CloneDetector::CloneGroup> &CloneGroups,
243 llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
244 Compare);
245};
246
247/// This constraint moves clones into clone groups of type II via hashing.
248///
249/// Clones with different hash values are moved into separate clone groups.
250/// Collisions are possible, and this constraint does nothing to address this
251/// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
252/// constraint chain, not necessarily immediately, to eliminate hash collisions
253/// through a more detailed analysis.
255public:
256 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
257};
258
259/// This constraint moves clones into clone groups of type II by comparing them.
260///
261/// Clones that aren't type II clones are moved into separate clone groups.
262/// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
263/// group are guaranteed to be type II clones of each other, but it is too
264/// slow to efficiently handle large amounts of clones.
266public:
267 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
268};
269
270/// Ensures that every clone has at least the given complexity.
271///
272/// Complexity is here defined as the total amount of children of a statement.
273/// This constraint assumes the first statement in the group is representative
274/// for all other statements in the group in terms of complexity.
276 unsigned MinComplexity;
277
278public:
279 MinComplexityConstraint(unsigned MinComplexity)
280 : MinComplexity(MinComplexity) {}
281
282 /// Calculates the complexity of the given StmtSequence.
283 /// \param Limit The limit of complexity we probe for. After reaching
284 /// this limit during calculation, this method is exiting
285 /// early to improve performance and returns this limit.
286 size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
287 const std::string &ParentMacroStack = "");
288
289 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
291 CloneGroups, [this](const CloneDetector::CloneGroup &A) {
292 if (!A.empty())
293 return calculateStmtComplexity(A.front(), MinComplexity) <
294 MinComplexity;
295 else
296 return false;
297 });
298 }
299};
300
301/// Ensures that all clone groups contain at least the given amount of clones.
303 unsigned MinGroupSize;
304
305public:
306 MinGroupSizeConstraint(unsigned MinGroupSize = 2)
307 : MinGroupSize(MinGroupSize) {}
308
309 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
311 [this](const CloneDetector::CloneGroup &A) {
312 return A.size() < MinGroupSize;
313 });
314 }
315};
316
317/// Ensures that no clone group fully contains another clone group.
319 void constrain(std::vector<CloneDetector::CloneGroup> &Result);
320};
321
324 std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
325
328 IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
329 IgnoredFilesPattern.str() + "$)");
330 }
331
333
334 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
336 CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
337 return isAutoGenerated(Group);
338 });
339 }
340};
341
342/// Analyzes the pattern of the referenced variables in a statement.
344
345 /// Describes an occurrence of a variable reference in a statement.
346 struct VariableOccurence {
347 /// The index of the associated VarDecl in the Variables vector.
348 size_t KindID;
349 /// The statement in the code where the variable was referenced.
350 const Stmt *Mention;
351
352 VariableOccurence(size_t KindID, const Stmt *Mention)
353 : KindID(KindID), Mention(Mention) {}
354 };
355
356 /// All occurrences of referenced variables in the order of appearance.
357 std::vector<VariableOccurence> Occurences;
358 /// List of referenced variables in the order of appearance.
359 /// Every item in this list is unique.
360 std::vector<const VarDecl *> Variables;
361
362 /// Adds a new variable referenced to this pattern.
363 /// \param VarDecl The declaration of the variable that is referenced.
364 /// \param Mention The SourceRange where this variable is referenced.
365 void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
366
367 /// Adds each referenced variable from the given statement.
368 void addVariables(const Stmt *S);
369
370public:
371 /// Creates an VariablePattern object with information about the given
372 /// StmtSequence.
373 VariablePattern(const StmtSequence &Sequence) {
374 for (const Stmt *S : Sequence)
375 addVariables(S);
376 }
377
378 /// Describes two clones that reference their variables in a different pattern
379 /// which could indicate a programming error.
381 /// Utility class holding the relevant information about a single
382 /// clone in this pair.
384 /// The variable which referencing in this clone was against the pattern.
386 /// Where the variable was referenced.
387 const Stmt *Mention;
388 /// The variable that should have been referenced to follow the pattern.
389 /// If Suggestion is a nullptr then it's not possible to fix the pattern
390 /// by referencing a different variable in this clone.
393 const VarDecl *Suggestion)
396 };
397 /// The first clone in the pair which always has a suggested variable.
399 /// This other clone in the pair which can have a suggested variable.
401 };
402
403 /// Counts the differences between this pattern and the given one.
404 /// \param Other The given VariablePattern to compare with.
405 /// \param FirstMismatch Output parameter that will be filled with information
406 /// about the first difference between the two patterns. This parameter
407 /// can be a nullptr, in which case it will be ignored.
408 /// \return Returns the number of differences between the pattern this object
409 /// is following and the given VariablePattern.
410 ///
411 /// For example, the following statements all have the same pattern and this
412 /// function would return zero:
413 ///
414 /// if (a < b) return a; return b;
415 /// if (x < y) return x; return y;
416 /// if (u2 < u1) return u2; return u1;
417 ///
418 /// But the following statement has a different pattern (note the changed
419 /// variables in the return statements) and would have two differences when
420 /// compared with one of the statements above.
421 ///
422 /// if (a < b) return b; return a;
423 ///
424 /// This function should only be called if the related statements of the given
425 /// pattern and the statements of this objects are clones of each other.
427 const VariablePattern &Other,
428 VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
429};
430
431/// Ensures that all clones reference variables in the same pattern.
433 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
434};
435
436} // end namespace clang
437
438#endif // LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
const Decl * D
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:188
This class is a utility class that contains utility functions for building custom constraints.
static void filterGroups(std::vector< CloneDetector::CloneGroup > &CloneGroups, llvm::function_ref< bool(const CloneDetector::CloneGroup &)> Filter)
Removes all groups by using a filter function.
static void splitCloneGroups(std::vector< CloneDetector::CloneGroup > &CloneGroups, llvm::function_ref< bool(const StmtSequence &, const StmtSequence &)> Compare)
Splits the given CloneGroups until the given Compare function returns true for all clones in a single...
Searches for similar subtrees in the AST.
void findClones(std::vector< CloneGroup > &Result, Ts... ConstraintList)
Searches for clones in all previously passed statements.
static void constrainClones(std::vector< CloneGroup > &CloneGroups, T C)
Constrains the given list of clone groups with the given constraint.
static void constrainClones(std::vector< CloneGroup > &CloneGroups, T1 C, Ts... ConstraintList)
Constrains the given list of clone groups with the given list of constraints.
void analyzeCodeBody(const Decl *D)
Generates and stores search data for all statements in the body of the given Decl.
llvm::SmallVector< StmtSequence, 8 > CloneGroup
A collection of StmtSequences that share an arbitrary property.
CompoundStmt - This represents a group of statements like { stmt stmt }.
Definition: Stmt.h:1628
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86
Ensures that every clone has at least the given complexity.
size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit, const std::string &ParentMacroStack="")
Calculates the complexity of the given StmtSequence.
MinComplexityConstraint(unsigned MinComplexity)
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
Ensures that all clone groups contain at least the given amount of clones.
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
MinGroupSizeConstraint(unsigned MinGroupSize=2)
This constraint moves clones into clone groups of type II via hashing.
void constrain(std::vector< CloneDetector::CloneGroup > &Sequences)
This constraint moves clones into clone groups of type II by comparing them.
void constrain(std::vector< CloneDetector::CloneGroup > &Sequences)
Encodes a location in the source.
A trivial tuple used to represent a source range.
Identifies a list of statements.
bool operator==(const StmtSequence &Other) const
bool contains(const StmtSequence &Other) const
Returns true if and only if this sequence covers a source range that contains the source range of the...
iterator begin() const
Returns an iterator pointing to the first statement in this sequence.
const Stmt *const * iterator
const Decl * getContainingDecl() const
Returns the declaration that contains the stored Stmts.
StmtSequence()
Constructs an empty StmtSequence.
ASTContext & getASTContext() const
Returns the related ASTContext for the stored Stmts.
unsigned size() const
Returns the number of statements this object holds.
iterator end() const
Returns an iterator pointing behind the last statement in this sequence.
SourceLocation getEndLoc() const
Returns the end sourcelocation of the last statement in this sequence.
bool operator!=(const StmtSequence &Other) const
bool empty() const
Returns true if and only if this StmtSequence contains no statements.
const Stmt * front() const
Returns the first statement in this sequence.
bool holdsSequence() const
Returns true if this objects holds a list of statements.
SourceLocation getBeginLoc() const
Returns the start sourcelocation of the first statement in this sequence.
SourceRange getSourceRange() const
Returns the source range of the whole sequence - from the beginning of the first statement to the end...
const Stmt * back() const
Returns the last statement in this sequence.
Stmt - This represents one statement.
Definition: Stmt.h:84
Represents a variable declaration or definition.
Definition: Decl.h:882
Analyzes the pattern of the referenced variables in a statement.
VariablePattern(const StmtSequence &Sequence)
Creates an VariablePattern object with information about the given StmtSequence.
unsigned countPatternDifferences(const VariablePattern &Other, VariablePattern::SuspiciousClonePair *FirstMismatch=nullptr)
Counts the differences between this pattern and the given one.
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
The JSON file list parser is used to communicate input to InstallAPI.
@ Seq
'seq' clause, allowed on 'loop' and 'routine' directives.
@ Result
The result type of a method or function.
const FunctionProtoType * T
@ Other
Other implicit parameter.
std::shared_ptr< llvm::Regex > IgnoredFilesRegex
FilenamePatternConstraint(StringRef IgnoredFilesPattern)
bool isAutoGenerated(const CloneDetector::CloneGroup &Group)
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
Ensures that all clones reference variables in the same pattern.
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
Ensures that no clone group fully contains another clone group.
void constrain(std::vector< CloneDetector::CloneGroup > &Result)
Utility class holding the relevant information about a single clone in this pair.
const Stmt * Mention
Where the variable was referenced.
const VarDecl * Suggestion
The variable that should have been referenced to follow the pattern.
const VarDecl * Variable
The variable which referencing in this clone was against the pattern.
SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, const VarDecl *Suggestion)
Describes two clones that reference their variables in a different pattern which could indicate a pro...
SuspiciousCloneInfo SecondCloneInfo
This other clone in the pair which can have a suggested variable.
SuspiciousCloneInfo FirstCloneInfo
The first clone in the pair which always has a suggested variable.