clang 20.0.0git
ArrayBoundCheckerV2.cpp
Go to the documentation of this file.
1//== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines ArrayBoundCheckerV2, which is a path-sensitive check
10// which looks for an out-of-bound array element access.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/AST/CharUnits.h"
25#include "llvm/ADT/APSInt.h"
26#include "llvm/ADT/SmallString.h"
27#include "llvm/Support/FormatVariadic.h"
28#include "llvm/Support/raw_ostream.h"
29#include <optional>
30
31using namespace clang;
32using namespace ento;
33using namespace taint;
34using llvm::formatv;
35
36namespace {
37/// If `E` is a "clean" array subscript expression, return the type of the
38/// accessed element. If the base of the subscript expression is modified by
39/// pointer arithmetic (and not the beginning of a "full" memory region), this
40/// always returns nullopt because that's the right (or the least bad) thing to
41/// do for the diagnostic output that's relying on this.
42static std::optional<QualType> determineElementType(const Expr *E,
43 const CheckerContext &C) {
44 const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
45 if (!ASE)
46 return std::nullopt;
47
48 const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
49 if (!SubscriptBaseReg)
50 return std::nullopt;
51
52 // The base of the subscript expression is affected by pointer arithmetics,
53 // so we want to report byte offsets instead of indices.
54 if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
55 return std::nullopt;
56
57 return ASE->getType();
58}
59
60static std::optional<int64_t>
61determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
62 if (!T)
63 return std::nullopt;
64 return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
65}
66
67class StateUpdateReporter {
68 const SubRegion *Reg;
69 const NonLoc ByteOffsetVal;
70 const std::optional<QualType> ElementType;
71 const std::optional<int64_t> ElementSize;
72 bool AssumedNonNegative = false;
73 std::optional<NonLoc> AssumedUpperBound = std::nullopt;
74
75public:
76 StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
78 : Reg(R), ByteOffsetVal(ByteOffsVal),
79 ElementType(determineElementType(E, C)),
80 ElementSize(determineElementSize(ElementType, C)) {}
81
82 void recordNonNegativeAssumption() { AssumedNonNegative = true; }
83 void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
84 AssumedUpperBound = UpperBoundVal;
85 }
86
87 bool assumedNonNegative() { return AssumedNonNegative; }
88
89 const NoteTag *createNoteTag(CheckerContext &C) const;
90
91private:
92 std::string getMessage(PathSensitiveBugReport &BR) const;
93
94 /// Return true if information about the value of `Sym` can put constraints
95 /// on some symbol which is interesting within the bug report `BR`.
96 /// In particular, this returns true when `Sym` is interesting within `BR`;
97 /// but it also returns true if `Sym` is an expression that contains integer
98 /// constants and a single symbolic operand which is interesting (in `BR`).
99 /// We need to use this instead of plain `BR.isInteresting()` because if we
100 /// are analyzing code like
101 /// int array[10];
102 /// int f(int arg) {
103 /// return array[arg] && array[arg + 10];
104 /// }
105 /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
106 /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
107 /// to detect this out of bounds access).
108 static bool providesInformationAboutInteresting(SymbolRef Sym,
110 static bool providesInformationAboutInteresting(SVal SV,
112 return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
113 }
114};
115
116struct Messages {
117 std::string Short, Full;
118};
119
120// NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
121// instead of `PreStmt` because the current implementation passes the whole
122// expression to `CheckerContext::getSVal()` which only works after the
123// symbolic evaluation of the expression. (To turn them into `PreStmt`
124// callbacks, we'd need to duplicate the logic that evaluates these
125// expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
126// defined as `PostStmt` for the sake of consistency with the other callbacks.
127class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
128 check::PostStmt<UnaryOperator>,
129 check::PostStmt<MemberExpr>> {
130 BugType BT{this, "Out-of-bound access"};
131 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
132
133 void performCheck(const Expr *E, CheckerContext &C) const;
134
135 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
136 NonLoc Offset, std::optional<NonLoc> Extent,
137 bool IsTaintBug = false) const;
138
139 static void markPartsInteresting(PathSensitiveBugReport &BR,
140 ProgramStateRef ErrorState, NonLoc Val,
141 bool MarkTaint);
142
143 static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
144
145 static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
146 NonLoc Offset, NonLoc Limit,
148 static bool isInAddressOf(const Stmt *S, ASTContext &AC);
149
150public:
151 void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
152 performCheck(E, C);
153 }
154 void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
155 if (E->getOpcode() == UO_Deref)
156 performCheck(E, C);
157 }
158 void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
159 if (E->isArrow())
160 performCheck(E->getBase(), C);
161 }
162};
163
164} // anonymous namespace
165
166/// For a given Location that can be represented as a symbolic expression
167/// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
168/// Arr and the distance of Location from the beginning of Arr (expressed in a
169/// NonLoc that specifies the number of CharUnits). Returns nullopt when these
170/// cannot be determined.
171static std::optional<std::pair<const SubRegion *, NonLoc>>
174 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
175 // We will use this utility to add and multiply values.
176 return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
177 };
178
179 const SubRegion *OwnerRegion = nullptr;
180 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
181
182 const ElementRegion *CurRegion =
183 dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
184
185 while (CurRegion) {
186 const auto Index = CurRegion->getIndex().getAs<NonLoc>();
187 if (!Index)
188 return std::nullopt;
189
190 QualType ElemType = CurRegion->getElementType();
191
192 // FIXME: The following early return was presumably added to safeguard the
193 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
194 // it seems that `ElemType` cannot be incomplete at this point.
195 if (ElemType->isIncompleteType())
196 return std::nullopt;
197
198 // Calculate Delta = Index * sizeof(ElemType).
199 NonLoc Size = SVB.makeArrayIndex(
200 SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
201 auto Delta = EvalBinOp(BO_Mul, *Index, Size);
202 if (!Delta)
203 return std::nullopt;
204
205 // Perform Offset += Delta.
206 Offset = EvalBinOp(BO_Add, *Offset, *Delta);
207 if (!Offset)
208 return std::nullopt;
209
210 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
211 // When this is just another ElementRegion layer, we need to continue the
212 // offset calculations:
213 CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
214 }
215
216 if (OwnerRegion)
217 return std::make_pair(OwnerRegion, *Offset);
218
219 return std::nullopt;
220}
221
222// NOTE: This function is the "heart" of this checker. It simplifies
223// inequalities with transformations that are valid (and very elementary) in
224// pure mathematics, but become invalid if we use them in C++ number model
225// where the calculations may overflow.
226// Due to the overflow issues I think it's impossible (or at least not
227// practical) to integrate this kind of simplification into the resolution of
228// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
229// produces valid results when the calculations are handling memory offsets
230// and every value is well below SIZE_MAX.
231// TODO: This algorithm should be moved to a central location where it's
232// available for other checkers that need to compare memory offsets.
233// NOTE: the simplification preserves the order of the two operands in a
234// mathematical sense, but it may change the result produced by a C++
235// comparison operator (and the automatic type conversions).
236// For example, consider a comparison "X+1 < 0", where the LHS is stored as a
237// size_t and the RHS is stored in an int. (As size_t is unsigned, this
238// comparison is false for all values of "X".) However, the simplification may
239// turn it into "X < -1", which is still always false in a mathematical sense,
240// but can produce a true result when evaluated by `evalBinOp` (which follows
241// the rules of C++ and casts -1 to SIZE_MAX).
242static std::pair<NonLoc, nonloc::ConcreteInt>
244 SValBuilder &svalBuilder) {
245 const llvm::APSInt &extentVal = extent.getValue();
246 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
247 if (SymVal && SymVal->isExpression()) {
248 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
249 llvm::APSInt constant = APSIntType(extentVal).convert(SIE->getRHS());
250 switch (SIE->getOpcode()) {
251 case BO_Mul:
252 // The constant should never be 0 here, becasue multiplication by zero
253 // is simplified by the engine.
254 if ((extentVal % constant) != 0)
255 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
256 else
258 nonloc::SymbolVal(SIE->getLHS()),
259 svalBuilder.makeIntVal(extentVal / constant), svalBuilder);
260 case BO_Add:
262 nonloc::SymbolVal(SIE->getLHS()),
263 svalBuilder.makeIntVal(extentVal - constant), svalBuilder);
264 default:
265 break;
266 }
267 }
268 }
269
270 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
271}
272
274 const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value);
275 return MaxV && MaxV->isNegative();
276}
277
278static bool isUnsigned(SValBuilder &SVB, NonLoc Value) {
280 return T->isUnsignedIntegerType();
281}
282
283// Evaluate the comparison Value < Threshold with the help of the custom
284// simplification algorithm defined for this checker. Return a pair of states,
285// where the first one corresponds to "value below threshold" and the second
286// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
287// the case when the evaluation fails.
288// If the optional argument CheckEquality is true, then use BO_EQ instead of
289// the default BO_LT after consistently applying the same simplification steps.
290static std::pair<ProgramStateRef, ProgramStateRef>
292 SValBuilder &SVB, bool CheckEquality = false) {
293 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
294 std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
295 }
296
297 // We want to perform a _mathematical_ comparison between the numbers `Value`
298 // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may
299 // perform automatic conversions. For example the number -1 is less than the
300 // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int`
301 // -1 is converted to ULONGLONG_MAX.
302 // To avoid automatic conversions, we evaluate the "obvious" cases without
303 // calling `evalBinOpNN`:
304 if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) {
305 if (CheckEquality) {
306 // negative_value == unsigned_threshold is always false
307 return {nullptr, State};
308 }
309 // negative_value < unsigned_threshold is always true
310 return {State, nullptr};
311 }
312 if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) {
313 // unsigned_value == negative_threshold and
314 // unsigned_value < negative_threshold are both always false
315 return {nullptr, State};
316 }
317 // FIXME: These special cases are sufficient for handling real-world
318 // comparisons, but in theory there could be contrived situations where
319 // automatic conversion of a symbolic value (which can be negative and can be
320 // positive) leads to incorrect results.
321 // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because
322 // we want to ensure that assumptions coming from this precondition and
323 // assumptions coming from regular C/C++ operator calls are represented by
324 // constraints on the same symbolic expression. A solution that would
325 // evaluate these "mathematical" compariosns through a separate pathway would
326 // be a step backwards in this sense.
327
328 const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
329 auto BelowThreshold =
330 SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
331 .getAs<NonLoc>();
332
333 if (BelowThreshold)
334 return State->assume(*BelowThreshold);
335
336 return {nullptr, nullptr};
337}
338
339static std::string getRegionName(const SubRegion *Region) {
340 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
341 return RegName;
342
343 // Field regions only have descriptive names when their parent has a
344 // descriptive name; so we provide a fallback representation for them:
345 if (const auto *FR = Region->getAs<FieldRegion>()) {
346 if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
347 return formatv("the field '{0}'", Name);
348 return "the unnamed field";
349 }
350
351 if (isa<AllocaRegion>(Region))
352 return "the memory returned by 'alloca'";
353
354 if (isa<SymbolicRegion>(Region) &&
355 isa<HeapSpaceRegion>(Region->getMemorySpace()))
356 return "the heap area";
357
358 if (isa<StringRegion>(Region))
359 return "the string literal";
360
361 return "the region";
362}
363
364static std::optional<int64_t> getConcreteValue(NonLoc SV) {
365 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
366 return ConcreteVal->getValue()->tryExtValue();
367 }
368 return std::nullopt;
369}
370
371static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
372 return SV ? getConcreteValue(*SV) : std::nullopt;
373}
374
375static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
376 std::string RegName = getRegionName(Region), OffsetStr = "";
377
378 if (auto ConcreteOffset = getConcreteValue(Offset))
379 OffsetStr = formatv(" {0}", ConcreteOffset);
380
381 return {
382 formatv("Out of bound access to memory preceding {0}", RegName),
383 formatv("Access of {0} at negative byte offset{1}", RegName, OffsetStr)};
384}
385
386/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
387/// it can be performed (`Divisor` is nonzero and there is no remainder). The
388/// values `Val1` and `Val2` may be nullopt and in that case the corresponding
389/// division is considered to be successful.
390static bool tryDividePair(std::optional<int64_t> &Val1,
391 std::optional<int64_t> &Val2, int64_t Divisor) {
392 if (!Divisor)
393 return false;
394 const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
395 const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
396 if (!Val1HasRemainder && !Val2HasRemainder) {
397 if (Val1)
398 *Val1 /= Divisor;
399 if (Val2)
400 *Val2 /= Divisor;
401 return true;
402 }
403 return false;
404}
405
406static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
407 NonLoc Offset, NonLoc Extent, SVal Location,
408 bool AlsoMentionUnderflow) {
409 std::string RegName = getRegionName(Region);
410 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
411 assert(EReg && "this checker only handles element access");
412 QualType ElemType = EReg->getElementType();
413
414 std::optional<int64_t> OffsetN = getConcreteValue(Offset);
415 std::optional<int64_t> ExtentN = getConcreteValue(Extent);
416
417 int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
418
419 bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
420 const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
421
423 llvm::raw_svector_ostream Out(Buf);
424 Out << "Access of ";
425 if (!ExtentN && !UseByteOffsets)
426 Out << "'" << ElemType.getAsString() << "' element in ";
427 Out << RegName << " at ";
428 if (AlsoMentionUnderflow) {
429 Out << "a negative or overflowing " << OffsetOrIndex;
430 } else if (OffsetN) {
431 Out << OffsetOrIndex << " " << *OffsetN;
432 } else {
433 Out << "an overflowing " << OffsetOrIndex;
434 }
435 if (ExtentN) {
436 Out << ", while it holds only ";
437 if (*ExtentN != 1)
438 Out << *ExtentN;
439 else
440 Out << "a single";
441 if (UseByteOffsets)
442 Out << " byte";
443 else
444 Out << " '" << ElemType.getAsString() << "' element";
445
446 if (*ExtentN > 1)
447 Out << "s";
448 }
449
450 return {formatv("Out of bound access to memory {0} {1}",
451 AlsoMentionUnderflow ? "around" : "after the end of",
452 RegName),
453 std::string(Buf)};
454}
455
456static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName,
457 bool AlsoMentionUnderflow) {
458 std::string RegName = getRegionName(Region);
459 return {formatv("Potential out of bound access to {0} with tainted {1}",
460 RegName, OffsetName),
461 formatv("Access of {0} with a tainted {1} that may be {2}too large",
462 RegName, OffsetName,
463 AlsoMentionUnderflow ? "negative or " : "")};
464}
465
466const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
467 // Don't create a note tag if we didn't assume anything:
468 if (!AssumedNonNegative && !AssumedUpperBound)
469 return nullptr;
470
471 return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
472 return getMessage(BR);
473 });
474}
475
476std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
477 bool ShouldReportNonNegative = AssumedNonNegative;
478 if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
479 if (AssumedUpperBound &&
480 providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
481 // Even if the byte offset isn't interesting (e.g. it's a constant value),
482 // the assumption can still be interesting if it provides information
483 // about an interesting symbolic upper bound.
484 ShouldReportNonNegative = false;
485 } else {
486 // We don't have anything interesting, don't report the assumption.
487 return "";
488 }
489 }
490
491 std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
492 std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
493
494 const bool UseIndex =
495 ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
496
498 llvm::raw_svector_ostream Out(Buf);
499 Out << "Assuming ";
500 if (UseIndex) {
501 Out << "index ";
502 if (OffsetN)
503 Out << "'" << OffsetN << "' ";
504 } else if (AssumedUpperBound) {
505 Out << "byte offset ";
506 if (OffsetN)
507 Out << "'" << OffsetN << "' ";
508 } else {
509 Out << "offset ";
510 }
511
512 Out << "is";
513 if (ShouldReportNonNegative) {
514 Out << " non-negative";
515 }
516 if (AssumedUpperBound) {
517 if (ShouldReportNonNegative)
518 Out << " and";
519 Out << " less than ";
520 if (ExtentN)
521 Out << *ExtentN << ", ";
522 if (UseIndex && ElementType)
523 Out << "the number of '" << ElementType->getAsString()
524 << "' elements in ";
525 else
526 Out << "the extent of ";
527 Out << getRegionName(Reg);
528 }
529 return std::string(Out.str());
530}
531
532bool StateUpdateReporter::providesInformationAboutInteresting(
534 if (!Sym)
535 return false;
536 for (SymbolRef PartSym : Sym->symbols()) {
537 // The interestingess mark may appear on any layer as we're stripping off
538 // the SymIntExpr, UnarySymExpr etc. layers...
539 if (BR.isInteresting(PartSym))
540 return true;
541 // ...but if both sides of the expression are symbolic, then there is no
542 // practical algorithm to produce separate constraints for the two
543 // operands (from the single combined result).
544 if (isa<SymSymExpr>(PartSym))
545 return false;
546 }
547 return false;
548}
549
550void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
551 const SVal Location = C.getSVal(E);
552
553 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
554 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
555 // and incomplete analysis of these leads to false positives. As even
556 // accurate reports would be confusing for the users, just disable reports
557 // from these macros:
558 if (isFromCtypeMacro(E, C.getASTContext()))
559 return;
560
561 ProgramStateRef State = C.getState();
562 SValBuilder &SVB = C.getSValBuilder();
563
564 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
565 computeOffset(State, SVB, Location);
566
567 if (!RawOffset)
568 return;
569
570 auto [Reg, ByteOffset] = *RawOffset;
571
572 // The state updates will be reported as a single note tag, which will be
573 // composed by this helper class.
574 StateUpdateReporter SUR(Reg, ByteOffset, E, C);
575
576 // CHECK LOWER BOUND
577 const MemSpaceRegion *Space = Reg->getMemorySpace();
578 if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
579 // A symbolic region in unknown space represents an unknown pointer that
580 // may point into the middle of an array, so we don't look for underflows.
581 // Both conditions are significant because we want to check underflows in
582 // symbolic regions on the heap (which may be introduced by checkers like
583 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
584 // non-symbolic regions (e.g. a field subregion of a symbolic region) in
585 // unknown space.
586 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
587 State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
588
589 if (PrecedesLowerBound) {
590 // The offset may be invalid (negative)...
591 if (!WithinLowerBound) {
592 // ...and it cannot be valid (>= 0), so report an error.
593 Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
594 reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
595 return;
596 }
597 // ...but it can be valid as well, so the checker will (optimistically)
598 // assume that it's valid and mention this in the note tag.
599 SUR.recordNonNegativeAssumption();
600 }
601
602 // Actually update the state. The "if" only fails in the extremely unlikely
603 // case when compareValueToThreshold returns {nullptr, nullptr} becasue
604 // evalBinOpNN fails to evaluate the less-than operator.
605 if (WithinLowerBound)
606 State = WithinLowerBound;
607 }
608
609 // CHECK UPPER BOUND
610 DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
611 if (auto KnownSize = Size.getAs<NonLoc>()) {
612 // In a situation where both underflow and overflow are possible (but the
613 // index is either tainted or known to be invalid), the logic of this
614 // checker will first assume that the offset is non-negative, and then
615 // (with this additional assumption) it will detect an overflow error.
616 // In this situation the warning message should mention both possibilities.
617 bool AlsoMentionUnderflow = SUR.assumedNonNegative();
618
619 auto [WithinUpperBound, ExceedsUpperBound] =
620 compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
621
622 if (ExceedsUpperBound) {
623 // The offset may be invalid (>= Size)...
624 if (!WithinUpperBound) {
625 // ...and it cannot be within bounds, so report an error, unless we can
626 // definitely determine that this is an idiomatic `&array[size]`
627 // expression that calculates the past-the-end pointer.
628 if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
629 *KnownSize, C)) {
630 C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
631 return;
632 }
633
634 Messages Msgs =
635 getExceedsMsgs(C.getASTContext(), Reg, ByteOffset, *KnownSize,
636 Location, AlsoMentionUnderflow);
637 reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
638 return;
639 }
640 // ...and it can be valid as well...
641 if (isTainted(State, ByteOffset)) {
642 // ...but it's tainted, so report an error.
643
644 // Diagnostic detail: saying "tainted offset" is always correct, but
645 // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
646 // nicer to say "tainted index".
647 const char *OffsetName = "offset";
648 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
649 if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
650 OffsetName = "index";
651
652 Messages Msgs = getTaintMsgs(Reg, OffsetName, AlsoMentionUnderflow);
653 reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
654 /*IsTaintBug=*/true);
655 return;
656 }
657 // ...and it isn't tainted, so the checker will (optimistically) assume
658 // that the offset is in bounds and mention this in the note tag.
659 SUR.recordUpperBoundAssumption(*KnownSize);
660 }
661
662 // Actually update the state. The "if" only fails in the extremely unlikely
663 // case when compareValueToThreshold returns {nullptr, nullptr} becasue
664 // evalBinOpNN fails to evaluate the less-than operator.
665 if (WithinUpperBound)
666 State = WithinUpperBound;
667 }
668
669 // Add a transition, reporting the state updates that we accumulated.
670 C.addTransition(State, SUR.createNoteTag(C));
671}
672
673void ArrayBoundCheckerV2::markPartsInteresting(PathSensitiveBugReport &BR,
674 ProgramStateRef ErrorState,
675 NonLoc Val, bool MarkTaint) {
676 if (SymbolRef Sym = Val.getAsSymbol()) {
677 // If the offset is a symbolic value, iterate over its "parts" with
678 // `SymExpr::symbols()` and mark each of them as interesting.
679 // For example, if the offset is `x*4 + y` then we put interestingness onto
680 // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
681 // `x` and `y`.
682 for (SymbolRef PartSym : Sym->symbols())
683 BR.markInteresting(PartSym);
684 }
685
686 if (MarkTaint) {
687 // If the issue that we're reporting depends on the taintedness of the
688 // offset, then put interestingness onto symbols that could be the origin
689 // of the taint. Note that this may find symbols that did not appear in
690 // `Sym->symbols()` (because they're only loosely connected to `Val`).
691 for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val))
692 BR.markInteresting(Sym);
693 }
694}
695
696void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
697 ProgramStateRef ErrorState, Messages Msgs,
698 NonLoc Offset, std::optional<NonLoc> Extent,
699 bool IsTaintBug /*=false*/) const {
700
701 ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
702 if (!ErrorNode)
703 return;
704
705 auto BR = std::make_unique<PathSensitiveBugReport>(
706 IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
707
708 // FIXME: ideally we would just call trackExpressionValue() and that would
709 // "do the right thing": mark the relevant symbols as interesting, track the
710 // control dependencies and statements storing the relevant values and add
711 // helpful diagnostic pieces. However, right now trackExpressionValue() is
712 // a heap of unreliable heuristics, so it would cause several issues:
713 // - Interestingness is not applied consistently, e.g. if `array[x+10]`
714 // causes an overflow, then `x` is not marked as interesting.
715 // - We get irrelevant diagnostic pieces, e.g. in the code
716 // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
717 // it places a "Storing uninitialized value" note on the `malloc` call
718 // (which is technically true, but irrelevant).
719 // If trackExpressionValue() becomes reliable, it should be applied instead
720 // of this custom markPartsInteresting().
721 markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug);
722 if (Extent)
723 markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
724
725 C.emitReport(std::move(BR));
726}
727
728bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
729 SourceLocation Loc = S->getBeginLoc();
730 if (!Loc.isMacroID())
731 return false;
732
733 StringRef MacroName = Lexer::getImmediateMacroName(
734 Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
735
736 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
737 return false;
738
739 return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
740 (MacroName == "isblank") || (MacroName == "isdigit") ||
741 (MacroName == "isgraph") || (MacroName == "islower") ||
742 (MacroName == "isnctrl") || (MacroName == "isprint") ||
743 (MacroName == "ispunct") || (MacroName == "isspace") ||
744 (MacroName == "isupper") || (MacroName == "isxdigit"));
745}
746
747bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
748 ParentMapContext &ParentCtx = ACtx.getParentMapContext();
749 do {
750 const DynTypedNodeList Parents = ParentCtx.getParents(*S);
751 if (Parents.empty())
752 return false;
753 S = Parents[0].get<Stmt>();
754 } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
755 const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
756 return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
757}
758
759bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E,
760 ProgramStateRef State,
761 NonLoc Offset, NonLoc Limit,
762 CheckerContext &C) {
763 if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
764 auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
765 State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
766 return EqualsToThreshold && !NotEqualToThreshold;
767 }
768 return false;
769}
770
771void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
772 mgr.registerChecker<ArrayBoundCheckerV2>();
773}
774
775bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) {
776 return true;
777}
static std::pair< ProgramStateRef, ProgramStateRef > compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, SValBuilder &SVB, bool CheckEquality=false)
static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, NonLoc Offset, NonLoc Extent, SVal Location, bool AlsoMentionUnderflow)
static std::optional< std::pair< const SubRegion *, NonLoc > > computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location)
For a given Location that can be represented as a symbolic expression Arr[Idx] (or perhaps Arr[Idx1][...
static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName, bool AlsoMentionUnderflow)
static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value)
static std::string getRegionName(const SubRegion *Region)
static std::optional< int64_t > getConcreteValue(NonLoc SV)
static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset)
static bool isUnsigned(SValBuilder &SVB, NonLoc Value)
static std::pair< NonLoc, nonloc::ConcreteInt > getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, SValBuilder &svalBuilder)
static bool tryDividePair(std::optional< int64_t > &Val1, std::optional< int64_t > &Val2, int64_t Divisor)
Try to divide Val1 and Val2 (in place) by Divisor and return true if it can be performed (Divisor is ...
Expr * E
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:188
SourceManager & getSourceManager()
Definition: ASTContext.h:741
ParentMapContext & getParentMapContext()
Returns the dynamic AST node parent map context.
Definition: ASTContext.cpp:894
const LangOptions & getLangOpts() const
Definition: ASTContext.h:834
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
ArraySubscriptExpr - [C99 6.5.2.1] Array Subscripting.
Definition: Expr.h:2718
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
Container for either a single DynTypedNode or for an ArrayRef to DynTypedNode.
This represents one expression.
Definition: Expr.h:110
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1059
MemberExpr - [C99 6.5.2.3] Structure and Union Members.
Definition: Expr.h:3236
DynTypedNodeList getParents(const NodeT &Node)
Returns the parents of the given node (within the traversal scope).
A (possibly-)qualified type.
Definition: Type.h:929
static std::string getAsString(SplitQualType split, const PrintingPolicy &Policy)
Definition: Type.h:1327
Encodes a location in the source.
Stmt - This represents one statement.
Definition: Stmt.h:84
bool isIncompleteType(NamedDecl **Def=nullptr) const
Types are partitioned into 3 broad categories (C99 6.2.5p1): object types, function types,...
Definition: Type.cpp:2396
bool isUnsignedIntegerType() const
Return true if this is an integer type that is unsigned, according to C99 6.2.5p6 [which returns true...
Definition: Type.cpp:2230
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2232
QualType getType() const
Definition: Value.cpp:234
A record of the "type" of an APSInt, used for conversions.
Definition: APSIntType.h:19
llvm::APSInt convert(const llvm::APSInt &Value) const LLVM_READONLY
Convert and return a new APSInt with the given value, but this type's bit width and signedness.
Definition: APSIntType.h:48
Template implementation for all binary symbolic expressions.
CHECKER * registerChecker(AT &&... Args)
Used to register checkers.
ElementRegion is used to represent both array elements and casts.
Definition: MemRegion.h:1199
QualType getElementType() const
Definition: MemRegion.h:1223
NonLoc getIndex() const
Definition: MemRegion.h:1219
MemRegion - The root abstract class for all memory regions.
Definition: MemRegion.h:97
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemSpaceRegion * getMemorySpace() const
Definition: MemRegion.cpp:1351
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * StripCasts(bool StripBaseAndDerivedCasts=true) const
Definition: MemRegion.cpp:1412
std::string getDescriptiveName(bool UseQuotes=true) const
Get descriptive name for memory region.
Definition: MemRegion.cpp:726
const RegionTy * getAs() const
Definition: MemRegion.h:1388
MemSpaceRegion - A memory region that represents a "memory space"; for example, the set of global var...
Definition: MemRegion.h:208
The tag upon which the TagVisitor reacts.
Definition: BugReporter.h:779
void markInteresting(SymbolRef sym, bugreporter::TrackingKind TKind=bugreporter::TrackingKind::Thorough)
Marks a symbol as interesting.
bool isInteresting(SymbolRef sym) const
NonLoc makeArrayIndex(uint64_t idx)
Definition: SValBuilder.h:282
ASTContext & getContext()
Definition: SValBuilder.h:148
nonloc::ConcreteInt makeIntVal(const IntegerLiteral *integer)
Definition: SValBuilder.h:288
QualType getArrayIndexType() const
Definition: SValBuilder.h:157
virtual SVal evalBinOpNN(ProgramStateRef state, BinaryOperator::Opcode op, NonLoc lhs, NonLoc rhs, QualType resultTy)=0
Create a new value which represents a binary expression with two non- location operands.
QualType getConditionType() const
Definition: SValBuilder.h:153
virtual const llvm::APSInt * getMaxValue(ProgramStateRef state, SVal val)=0
Tries to get the maximal possible (integer) value of a given SVal.
SVal - This represents a symbolic expression, which can be either an L-value or an R-value.
Definition: SVals.h:56
SymbolRef getAsSymbol(bool IncludeBaseRegions=false) const
If this SVal wraps a symbol return that SymbolRef.
Definition: SVals.cpp:104
std::optional< T > getAs() const
Convert to the specified SVal type, returning std::nullopt if this SVal is not of the desired type.
Definition: SVals.h:87
const MemRegion * getAsRegion() const
Definition: SVals.cpp:120
SubRegion - A region that subsets another larger region.
Definition: MemRegion.h:446
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * getSuperRegion() const
Definition: MemRegion.h:459
Symbolic value.
Definition: SymExpr.h:30
llvm::iterator_range< symbol_iterator > symbols() const
Definition: SymExpr.h:87
Value representing integer constant.
Definition: SVals.h:300
APSIntPtr getValue() const
Definition: SVals.h:304
Represents symbolic expression that isn't a location.
Definition: SVals.h:279
std::vector< SymbolRef > getTaintedSymbols(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Returns the tainted Symbols for a given Statement and state.
Definition: Taint.cpp:170
bool isTainted(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Check if the statement has a tainted value in the given state.
Definition: Taint.cpp:148
DefinedOrUnknownSVal getDynamicExtent(ProgramStateRef State, const MemRegion *MR, SValBuilder &SVB)
@ Full
This outputs the full clang module dependency graph suitable for use for explicitly building modules.
The JSON file list parser is used to communicate input to InstallAPI.
BinaryOperatorKind
const FunctionProtoType * T