clang 20.0.0git
Lexer.cpp
Go to the documentation of this file.
1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
18#include "clang/Basic/LLVM.h"
28#include "clang/Lex/Token.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MemoryBufferRef.h"
36#include "llvm/Support/NativeFormatting.h"
37#include "llvm/Support/Unicode.h"
38#include "llvm/Support/UnicodeCharRanges.h"
39#include <algorithm>
40#include <cassert>
41#include <cstddef>
42#include <cstdint>
43#include <cstring>
44#include <optional>
45#include <string>
46#include <tuple>
47#include <utility>
48
49#ifdef __SSE4_2__
50#include <nmmintrin.h>
51#endif
52
53using namespace clang;
54
55//===----------------------------------------------------------------------===//
56// Token Class Implementation
57//===----------------------------------------------------------------------===//
58
/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
// NOTE(review): the definition line was lost in this extraction; the body
// below reads like Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey)
// const — confirm against the original file.
  // Annotation tokens carry no identifier, so they can never be a keyword.
  if (isAnnotation())
    return false;
  if (const IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}
67
/// getObjCKeywordID - Return the ObjC keyword kind.
// NOTE(review): the definition line was lost in this extraction; the body
// below reads like tok::ObjCKeywordKind Token::getObjCKeywordID() const —
// confirm against the original file.
  if (isAnnotation())
    return tok::objc_not_keyword;
  // Tokens without an identifier (literals, punctuation) are not keywords.
  const IdentifierInfo *specId = getIdentifierInfo();
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}
75
/// Determine whether the token kind starts a simple-type-specifier.
bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
  switch (getKind()) {
  // Annotation tokens produced by the parser already denote a resolved type,
  // so no language-mode check is needed.
  case tok::annot_typename:
  case tok::annot_decltype:
  case tok::annot_pack_indexing_type:
    return true;

  case tok::kw_short:
  case tok::kw_long:
  case tok::kw___int64:
  case tok::kw___int128:
  case tok::kw_signed:
  case tok::kw_unsigned:
  case tok::kw_void:
  case tok::kw_char:
  case tok::kw_int:
  case tok::kw_half:
  case tok::kw_float:
  case tok::kw_double:
  case tok::kw___bf16:
  case tok::kw__Float16:
  case tok::kw___float128:
  case tok::kw___ibm128:
  case tok::kw_wchar_t:
  case tok::kw_bool:
  case tok::kw__Bool:
  case tok::kw__Accum:
  case tok::kw__Fract:
  case tok::kw__Sat:
// Expands to one case label per type-transformation trait (__underlying_type
// and friends).
#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
#include "clang/Basic/TransformTypeTraits.def"
  case tok::kw___auto_type:
  case tok::kw_char16_t:
  case tok::kw_char32_t:
  case tok::kw_typeof:
  case tok::kw_decltype:
  case tok::kw_char8_t:
    // These are keywords only in some language modes (e.g. char8_t in C++20),
    // so defer the final answer to the identifier table.
    return getIdentifierInfo()->isKeyword(LangOpts);

  default:
    return false;
  }
}
120
121//===----------------------------------------------------------------------===//
122// Lexer Class Implementation
123//===----------------------------------------------------------------------===//
124
// Out-of-line virtual method anchor: pins the Lexer vtable/type-info emission
// to this translation unit.
void Lexer::anchor() {}
126
// Shared initialization for all Lexer constructors: record the buffer bounds,
// skip a UTF-8 BOM when lexing from the physical start of the buffer, and
// reset all per-lex state flags to their defaults.
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
      .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;
  IsAtPhysicalStartOfLine = true;

  HasLeadingSpace = false;
  HasLeadingEmptyMacro = false;

  // We are not after parsing a #.
  // NOTE(review): a statement appears to have been dropped by the extraction
  // here (presumably ParsingPreprocessorDirective = false;) — confirm.

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode. Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;

  NewLinePtr = nullptr;
}
178
/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process. This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
             Preprocessor &PP, bool IsFirstIncludeOfFile)
    : PreprocessorLexer(&PP, FID),
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
      LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  // Lex the whole buffer from its physical start.
  InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
            InputFile.getBufferEnd());

  // NOTE(review): a trailing statement was dropped by the extraction here
  // (presumably resetExtendedTokenMode();) — confirm against the original.
}
194
/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  // Note: BufPtr may point into the middle of [BufStart, BufEnd) to begin
  // lexing at an arbitrary offset.
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}
208
/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
// Convenience overload: derives the start location and buffer bounds from the
// SourceManager and delegates to the pointer-based raw-lexer constructor.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
             const SourceManager &SM, const LangOptions &langOpts,
             bool IsFirstIncludeOfFile)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
            FromFile.getBufferStart(), FromFile.getBufferEnd(),
            IsFirstIncludeOfFile) {}
218
// NOTE(review): this extraction dropped the declaration line (presumably
// void Lexer::resetExtendedTokenMode()) and both branch bodies of the
// if/else below — shown as-is; confirm against the original file.
  assert(PP && "Cannot reset token mode without a preprocessor");
  if (LangOpts.TraditionalCPP)
    // (branch body lost in extraction)
  else
    // (branch body lost in extraction)
}
226
/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion. This has a variety of magic semantics that this method
/// sets up. It returns a new'd Lexer that must be delete'd when done.
///
/// On entrance to this routine, TokStartLoc is a macro location which has a
/// spelling loc that indicates the bytes to be lexed for the token and an
/// expansion location that indicates where all lexed tokens should be
/// "expanded from".
///
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
/// normal lexer that remaps tokens as they fly by. This would require making
/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
/// interface that could handle this stuff. This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
// NOTE(review): the declaration line was lost in this extraction (presumably
// Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, ...)) —
// confirm against the original file.
                                 SourceLocation ExpansionLocStart,
                                 SourceLocation ExpansionLocEnd,
                                 unsigned TokLen, Preprocessor &PP) {
  // NOTE(review): a statement appears missing here (presumably
  // SourceManager &SM = PP.getSourceManager();) — confirm.

  // Create the lexer as if we were going to lex the file normally.
  FileID SpellingFID = SM.getFileID(SpellingLoc);
  llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);

  // Now that the lexer is created, change the start/end locations so that we
  // just lex the subsection of the file that we want. This is lexing from a
  // scratch buffer.
  const char *StrData = SM.getCharacterData(SpellingLoc);

  L->BufferPtr = StrData;
  L->BufferEnd = StrData+TokLen;
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

  // Set the SourceLocation with the remapping information. This ensures that
  // GetMappedTokenLoc will remap the tokens as they are lexed.
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
                                     ExpansionLocStart,
                                     ExpansionLocEnd, TokLen);

  // Ensure that the lexer thinks it is inside a directive, so that end \n will
  // return an EOD token.
  // NOTE(review): the statement implementing the comment above (presumably
  // L->ParsingPreprocessorDirective = true;) was lost in extraction — confirm.

  // This lexer really is for _Pragma.
  L->Is_PragmaLexer = true;
  return L;
}
276
277void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
278 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
279 this->IsAtStartOfLine = IsAtStartOfLine;
280 assert((BufferStart + Offset) <= BufferEnd);
281 BufferPtr = BufferStart + Offset;
282}
283
/// Escape the contents of Str in place so it can be re-lexed as the body of a
/// quoted literal: backslashes and the active quote character are prefixed
/// with a backslash, and newline sequences become the two characters '\','n'.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type Idx = 0, Len = Str.size();
  while (Idx < Len) {
    const char C = Str[Idx];
    if (C == '\\' || C == Quote) {
      // Insert an escaping backslash and step over both characters.
      Str.insert(Str.begin() + Idx, '\\');
      Idx += 2;
      ++Len;
    } else if (C == '\n' || C == '\r') {
      // A mixed two-character line ending ("\r\n" or "\n\r") collapses to
      // exactly '\' 'n' without growing the string.
      const bool MixedPair = (Idx + 1 < Len) &&
                             (Str[Idx + 1] == '\n' || Str[Idx + 1] == '\r') &&
                             Str[Idx + 1] != C;
      if (MixedPair) {
        Str[Idx] = '\\';
        Str[Idx + 1] = 'n';
      } else {
        // A lone newline becomes '\' 'n', growing the string by one.
        Str[Idx] = '\\';
        Str.insert(Str.begin() + Idx + 1, 'n');
        ++Len;
      }
      Idx += 2;
    } else {
      ++Idx;
    }
  }
}
308
309std::string Lexer::Stringify(StringRef Str, bool Charify) {
310 std::string Result = std::string(Str);
311 char Quote = Charify ? '\'' : '"';
312 StringifyImpl(Result, Quote);
313 return Result;
314}
315
317
318//===----------------------------------------------------------------------===//
319// Token Spelling
320//===----------------------------------------------------------------------===//
321
/// Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer.
// Writes the cleaned characters into Spelling (caller-allocated, at least
// Tok.getLength() bytes) and returns the number of bytes produced, which is
// always strictly less than Tok.getLength().
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (tok::isStringLiteral(Tok.getKind())) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
      Spelling[Length++] = CharAndSize.Char;
      BufPtr += CharAndSize.Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching closing
      // quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  // Copy the remaining bytes, folding away trigraphs and escaped newlines.
  while (BufPtr < BufEnd) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
    Spelling[Length++] = CharAndSize.Char;
    BufPtr += CharAndSize.Size;
  }

  // Cleaning must shrink the token; an equal length means the flag was bogus.
  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}
372
/// getSpelling() - Return the 'spelling' of this token. The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding. In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
// NOTE(review): the declaration's first line was lost in this extraction
// (presumably StringRef Lexer::getSpelling(SourceLocation loc, ...)) —
// confirm against the original file.
                               SmallVectorImpl<char> &buffer,
                               const SourceManager &SM,
                               const LangOptions &options,
                               bool *invalid) {
  // Break down the source location.
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (invalid) *invalid = true;
    return {};
  }

  const char *tokenBegin = file.data() + locInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
              file.begin(), tokenBegin, file.end());
  Token token;
  lexer.LexFromRawLexer(token);

  unsigned length = token.getLength();

  // Common case: no need for cleaning.
  if (!token.needsCleaning())
    return StringRef(tokenBegin, length);

  // Hard case, we need to relex the characters into the string.
  buffer.resize(length);
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
  return StringRef(buffer.data(), buffer.size());
}
413
414/// getSpelling() - Return the 'spelling' of this token. The spelling of a
415/// token are the characters used to represent the token in the source file
416/// after trigraph expansion and escaped-newline folding. In particular, this
417/// wants to get the true, uncanonicalized, spelling of things like digraphs
418/// UCNs, etc.
419std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
420 const LangOptions &LangOpts, bool *Invalid) {
421 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
422
423 bool CharDataInvalid = false;
424 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
425 &CharDataInvalid);
426 if (Invalid)
427 *Invalid = CharDataInvalid;
428 if (CharDataInvalid)
429 return {};
430
431 // If this token contains nothing interesting, return it directly.
432 if (!Tok.needsCleaning())
433 return std::string(TokStart, TokStart + Tok.getLength());
434
435 std::string Result;
436 Result.resize(Tok.getLength());
437 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
438 return Result;
439}
440
/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string. The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long. The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy). The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = nullptr;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifier().data();
  else if (!Tok.hasUCN()) {
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
      // Just return the string from the identifier table, which is very quick.
      Buffer = II->getNameStart();
      return II->getLength();
    }
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (!TokStart) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning()) {
    // Point the caller at the internal buffer instead of copying.
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.
  // Here Buffer must already point at caller-provided writable storage.
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
}
493
/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
// NOTE(review): the declaration's first line was lost in this extraction
// (presumably unsigned Lexer::MeasureTokenLength(SourceLocation Loc,) —
// confirm against the original file.
                                   const SourceManager &SM,
                                   const LangOptions &LangOpts) {
  // Returns 0 when the location cannot be re-lexed.
  Token TheTok;
  if (getRawToken(Loc, TheTok, SM, LangOpts))
    return 0;
  return TheTok.getLength();
}
506
/// Relex the token at the specified location.
/// \returns true if there was a failure, false on success.
// NOTE(review): the declaration's first line was lost in this extraction
// (presumably bool Lexer::getRawToken(SourceLocation Loc, Token &Result,) —
// confirm against the original file.
                        const SourceManager &SM,
                        const LangOptions &LangOpts,
                        bool IgnoreWhiteSpace) {
  // TODO: this could be special cased for common tokens like identifiers, ')',
  // etc to make this faster, if it mattered. Just look at StrData[0] to handle
  // all obviously single-char tokens. This could use
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  // something.

  // If this comes from a macro expansion, we really do want the macro name, not
  // the token this macro expanded to.
  Loc = SM.getExpansionLoc(Loc);
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return true;

  const char *StrData = Buffer.data()+LocInfo.second;

  // A location pointing at whitespace is a failure unless the caller opted in.
  if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
    return true;

  // Create a lexer starting at the beginning of this token.
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
                 Buffer.begin(), StrData, Buffer.end());
  TheLexer.SetCommentRetentionState(true);
  TheLexer.LexFromRawLexer(Result);
  return false;
}
540
541/// Returns the pointer that points to the beginning of line that contains
542/// the given offset, or null if the offset if invalid.
543static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
544 const char *BufStart = Buffer.data();
545 if (Offset >= Buffer.size())
546 return nullptr;
547
548 const char *LexStart = BufStart + Offset;
549 for (; LexStart != BufStart; --LexStart) {
550 if (isVerticalWhitespace(LexStart[0]) &&
551 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
552 // LexStart should point at first character of logical line.
553 ++LexStart;
554 break;
555 }
556 }
557 return LexStart;
558}
559
// NOTE(review): the declaration's first line was lost in this extraction
// (presumably static SourceLocation getBeginningOfFileToken(SourceLocation
// Loc,) — confirm against the original file.
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer). We'll relex from that point.
  const char *StrData = Buffer.data() + LocInfo.second;
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
  if (!LexStart || LexStart == StrData)
    return Loc;

  // Create a lexer starting at the beginning of this token.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for. If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source location.
  return Loc;
}
607
// NOTE(review): the declaration's first line was lost in this extraction
// (presumably SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,)
// — confirm against the original file.
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  // Plain file locations can be handled directly.
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  // Only macro-argument expansions can be mapped back; other macro locations
  // are returned unchanged.
  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  // Find the token start in the spelling file, then translate the backward
  // offset into the macro-expansion location space.
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  std::pair<FileID, unsigned> BeginFileLocInfo =
      SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}
626
namespace {

// Classification of a preprocessor directive keyword encountered while
// scanning for the end of a file's preamble.
enum PreambleDirectiveKind {
  PDK_Skipped,  // Directive is benign in a preamble; keep scanning.
  PDK_Unknown   // Unrecognized directive; the preamble ends at its '#'.
};

} // namespace
635
// Scans raw tokens from the start of Buffer to find where the "preamble"
// (leading comments and preprocessor directives, optionally capped at
// MaxLines) ends.
// NOTE(review): the declaration's first line was lost in this extraction
// (presumably PreambleBounds Lexer::ComputePreamble(StringRef Buffer,) —
// confirm against the original file.
                                     const LangOptions &LangOpts,
                                     unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const SourceLocation::UIntTy StartOffset = 1;
  // NOTE(review): a statement constructing FileLoc from StartOffset appears
  // to have been dropped by the extraction here — confirm.
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  bool InPreprocessorDirective = false;
  Token TheTok;
  SourceLocation ActiveCommentLoc;

  // Translate MaxLines into a byte-offset limit so the loop below can compare
  // raw token offsets instead of recomputing line numbers.
  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is. Since we're lexing raw tokens,
      // we don't have an identifier table available. Instead, just look at
      // the raw identifier to recognize and categorize preprocessor directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_Skipped)
              .Case("ifdef", PDK_Skipped)
              .Case("ifndef", PDK_Skipped)
              .Case("elif", PDK_Skipped)
              .Case("elifdef", PDK_Skipped)
              .Case("elifndef", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_Skipped)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point. Roll back the current token to the location of the '#'.
      TheTok = HashTok;
    } else if (TheTok.isAtStartOfLine() &&
               TheTok.getKind() == tok::raw_identifier &&
               TheTok.getRawIdentifier() == "module" &&
               LangOpts.CPlusPlusModules) {
      // The initial global module fragment introducer "module;" is part of
      // the preamble, which runs up to the module declaration "module foo;".
      Token ModuleTok = TheTok;
      do {
        TheLexer.LexFromRawLexer(TheTok);
      } while (TheTok.getKind() == tok::comment);
      if (TheTok.getKind() != tok::semi) {
        // Not global module fragment, roll back.
        TheTok = ModuleTok;
        break;
      }
      continue;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  SourceLocation End;
  if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                        TheTok.isAtStartOfLine());
}
789
790unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
791 const SourceManager &SM,
792 const LangOptions &LangOpts) {
793 // Figure out how many physical characters away the specified expansion
794 // character is. This needs to take into consideration newlines and
795 // trigraphs.
796 bool Invalid = false;
797 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
798
799 // If they request the first char of the token, we're trivially done.
800 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
801 return 0;
802
803 unsigned PhysOffset = 0;
804
805 // The usual case is that tokens don't contain anything interesting. Skip
806 // over the uninteresting characters. If a token only consists of simple
807 // chars, this method is extremely fast.
808 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
809 if (CharNo == 0)
810 return PhysOffset;
811 ++TokPtr;
812 --CharNo;
813 ++PhysOffset;
814 }
815
816 // If we have a character that may be a trigraph or escaped newline, use a
817 // lexer to parse it correctly.
818 for (; CharNo; --CharNo) {
819 auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
820 TokPtr += CharAndSize.Size;
821 PhysOffset += CharAndSize.Size;
822 }
823
824 // Final detail: if we end up on an escaped newline, we want to return the
825 // location of the actual byte of the token. For example foo<newline>bar
826 // advanced by 3 should return the location of b, not of \\. One compounding
827 // detail of this is that the escape may be made by a trigraph.
828 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
829 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
830
831 return PhysOffset;
832}
833
/// Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different that it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
// NOTE(review): the declaration's first line was lost in this extraction
// (presumably SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc,
// unsigned Offset,) — confirm against the original file.
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isInvalid())
    return {};

  if (Loc.isMacroID()) {
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return {}; // Points inside the macro expansion.
  }

  // An Offset larger than the token would point before its start; return the
  // token location itself rather than an earlier position.
  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;

  return Loc.getLocWithOffset(Len);
}
868
/// Returns true if the given MacroID location points at the first
/// token of the macro expansion.
// NOTE(review): the declaration's first line was lost in this extraction
// (presumably bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,) —
// confirm against the original file.
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroBegin) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation expansionLoc;
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions, this is the first.
    if (MacroBegin)
      *MacroBegin = expansionLoc;
    return true;
  }

  // Still inside a nested expansion; recurse until a file location is reached.
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
}
890
/// Returns true if the given MacroID location points at the last
/// token of the macro expansion.
// NOTE(review): the declaration's first line was lost in this extraction
// (presumably bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,) —
// confirm against the original file.
                                    const SourceManager &SM,
                                    const LangOptions &LangOpts,
                                    SourceLocation *MacroEnd) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  // Measure the token in its spelling file so we can step past it.
  SourceLocation spellLoc = SM.getSpellingLoc(loc);
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
  if (tokLen == 0)
    return false;

  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
  SourceLocation expansionLoc;
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions.
    if (MacroEnd)
      *MacroEnd = expansionLoc;
    return true;
  }

  // Still inside a nested expansion; recurse until a file location is reached.
  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
}
918
// NOTE(review): this extraction dropped the declaration line (presumably
// static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, ...)),
// the local Begin/End initializations from Range, and the final statement
// returning a char range built from Begin/End — shown as-is; confirm against
// the original file.
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  assert(Begin.isFileID() && End.isFileID());
  // For a token range, extend End past the last token so the result is a
  // half-open character range.
  if (Range.isTokenRange()) {
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
    if (End.isInvalid())
      return {};
  }

  // Break down the source locations.
  FileID FID;
  unsigned BeginOffs;
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return {};

  // Both ends must land in the same file, in order.
  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return {};

}
945
946// Assumes that `Loc` is in an expansion.
// Returns whether the expansion containing Loc covers a token range (as
// opposed to a character range).
 948 const SourceManager &SM) {
 949 return SM.getSLocEntry(SM.getFileID(Loc))
 950 .getExpansion()
 951 .isExpansionTokenRange();
 952}
953
 // Maps a range whose endpoints may be macro locations onto an equivalent
 // file character range, returning an invalid range when no single-file
 // equivalent exists.
 955 const SourceManager &SM,
 956 const LangOptions &LangOpts) {
 959 if (Begin.isInvalid() || End.isInvalid())
 960 return {};
 961
 962 if (Begin.isFileID() && End.isFileID())
 963 return makeRangeFromFileLocs(Range, SM, LangOpts);
 964
 // Mixed macro/file endpoints: widen the macro side to the whole expansion.
 965 if (Begin.isMacroID() && End.isFileID()) {
 966 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
 967 return {};
 969 return makeRangeFromFileLocs(Range, SM, LangOpts);
 970 }
 971
 972 if (Begin.isFileID() && End.isMacroID()) {
 973 if (Range.isTokenRange()) {
 974 if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
 975 return {};
 976 // Use the *original* end, not the expanded one in `End`.
 977 Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
 978 } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
 979 return {};
 980 Range.setEnd(End);
 981 return makeRangeFromFileLocs(Range, SM, LangOpts);
 982 }
 983
 // Both endpoints are macro locations: accept only when both sides expand
 // out to the same spelled range.
 984 assert(Begin.isMacroID() && End.isMacroID());
 985 SourceLocation MacroBegin, MacroEnd;
 986 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
 987 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
 988 &MacroEnd)) ||
 989 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
 990 &MacroEnd)))) {
 991 Range.setBegin(MacroBegin);
 992 Range.setEnd(MacroEnd);
 993 // Use the *original* `End`, not the expanded one in `MacroEnd`.
 994 if (Range.isTokenRange())
 995 Range.setTokenRange(isInExpansionTokenRange(End, SM));
 996 return makeRangeFromFileLocs(Range, SM, LangOpts);
 997 }
 998
 // Otherwise, both endpoints may still lie inside the *same* macro argument
 // expansion; retry on their immediate spelling locations.
 999 bool Invalid = false;
 1000 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
 1001 &Invalid);
 1002 if (Invalid)
 1003 return {};
 1004
 1005 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
 1006 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
 1007 &Invalid);
 1008 if (Invalid)
 1009 return {};
 1010
 1011 if (EndEntry.getExpansion().isMacroArgExpansion() &&
 1012 BeginEntry.getExpansion().getExpansionLocStart() ==
 1013 EndEntry.getExpansion().getExpansionLocStart()) {
 1014 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
 1015 Range.setEnd(SM.getImmediateSpellingLoc(End));
 1016 return makeFileCharRange(Range, SM, LangOpts);
 1017 }
 1018 }
 1019
 1020 return {};
 1021}
1022
 // Returns the source text covered by Range after mapping it to a file
 // character range; on failure returns an empty StringRef and sets *Invalid
 // when the out-parameter is provided.
 1024 const SourceManager &SM,
 1025 const LangOptions &LangOpts,
 1026 bool *Invalid) {
 1027 Range = makeFileCharRange(Range, SM, LangOpts);
 1028 if (Range.isInvalid()) {
 1029 if (Invalid) *Invalid = true;
 1030 return {};
 1031 }
 1032
 1033 // Break down the source location.
 1034 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
 1035 if (beginInfo.first.isInvalid()) {
 1036 if (Invalid) *Invalid = true;
 1037 return {};
 1038 }
 1039
 // The end must be in the same file and not precede the begin offset.
 1040 unsigned EndOffs;
 1041 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
 1042 beginInfo.second > EndOffs) {
 1043 if (Invalid) *Invalid = true;
 1044 return {};
 1045 }
 1046
 1047 // Try to load the file buffer.
 1048 bool invalidTemp = false;
 1049 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
 1050 if (invalidTemp) {
 1051 if (Invalid) *Invalid = true;
 1052 return {};
 1053 }
 1054
 1055 if (Invalid) *Invalid = false;
 1056 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
 1057}
1058
 // Returns the spelled name of the macro whose expansion immediately
 // produced Loc, walking through macro-argument expansions first.
 1060 const SourceManager &SM,
 1061 const LangOptions &LangOpts) {
 1062 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
 1063
 1064 // Find the location of the immediate macro expansion.
 1065 while (true) {
 1066 FileID FID = SM.getFileID(Loc);
 1067 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
 1068 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
 1069 Loc = Expansion.getExpansionLocStart();
 1070 if (!Expansion.isMacroArgExpansion())
 1071 break;
 1072
 1073 // For macro arguments we need to check that the argument did not come
 1074 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
 1075
 1076 // Loc points to the argument id of the macro definition, move to the
 1077 // macro expansion.
 1078 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
 1079 SourceLocation SpellLoc = Expansion.getSpellingLoc();
 1080 if (SpellLoc.isFileID())
 1081 break; // No inner macro.
 1082
 1083 // If spelling location resides in the same FileID as macro expansion
 1084 // location, it means there is no inner macro.
 1085 FileID MacroFID = SM.getFileID(Loc);
 1086 if (SM.isInFileID(SpellLoc, MacroFID))
 1087 break;
 1088
 1089 // Argument came from inner macro.
 1090 Loc = SpellLoc;
 1091 }
 1092
 1093 // Find the spelling location of the start of the non-argument expansion
 1094 // range. This is where the macro name was spelled in order to begin
 1095 // expanding this macro.
 1096 Loc = SM.getSpellingLoc(Loc);
 1097
 1098 // Dig out the buffer where the macro name was spelled and the extents of the
 1099 // name so that we can render it into the expansion note.
 1100 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
 1101 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
 1102 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
 1103 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
 1104}
1105
 // Like getImmediateMacroName, but returns an empty StringRef for locations
 // produced by token pasting/stringization rather than a real macro name.
 1107 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
 1108 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
 1109 // Walk past macro argument expansions.
 1110 while (SM.isMacroArgExpansion(Loc))
 1111 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
 1112
 1113 // If the macro's spelling isn't FileID or from scratch space, then it's
 1114 // actually a token paste or stringization (or similar) and not a macro at
 1115 // all.
 1116 SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
 1117 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
 1118 return {};
 1119
 1120 // Find the spelling location of the start of the non-argument expansion
 1121 // range. This is where the macro name was spelled in order to begin
 1122 // expanding this macro.
 1123 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
 1124
 1125 // Dig out the buffer where the macro name was spelled and the extents of the
 1126 // name so that we can render it into the expansion note.
 1127 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
 1128 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
 1129 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
 1130 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
 1131}
1132
 // Whether c may appear in the body of an identifier; '$' is accepted only
 // when LangOpts.DollarIdents is enabled.
 1134 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
 1135}
1136
1137bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1138 assert(isVerticalWhitespace(Str[0]));
1139 if (Str - 1 < BufferStart)
1140 return false;
1141
1142 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1143 (Str[0] == '\r' && Str[-1] == '\n')) {
1144 if (Str - 2 < BufferStart)
1145 return false;
1146 --Str;
1147 }
1148 --Str;
1149
1150 // Rewind to first non-space character:
1151 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1152 --Str;
1153
1154 return *Str == '\\';
1155}
1156
 // Returns the leading spaces/tabs of the line containing Loc; empty for
 // invalid or macro locations, or when the buffer cannot be loaded.
 1158 const SourceManager &SM) {
 1159 if (Loc.isInvalid() || Loc.isMacroID())
 1160 return {};
 1161 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
 1162 if (LocInfo.first.isInvalid())
 1163 return {};
 1164 bool Invalid = false;
 1165 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
 1166 if (Invalid)
 1167 return {};
 1168 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
 1169 if (!Line)
 1170 return {};
 1171 StringRef Rest = Buffer.substr(Line - Buffer.data());
 1172 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
 // npos means the whole remaining line is whitespace; report no indentation.
 1173 return NumWhitespaceChars == StringRef::npos
 1174 ? ""
 1175 : Rest.take_front(NumWhitespaceChars);
 1176}
1177
1178//===----------------------------------------------------------------------===//
1179// Diagnostics forwarding code.
1180//===----------------------------------------------------------------------===//
1181
1182/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1183/// lexer buffer was all expanded at a single point, perform the mapping.
1184/// This is currently only used for _Pragma implementation, so it is the slow
1185/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1186static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1187 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
 1189 SourceLocation FileLoc,
 1190 unsigned CharNo, unsigned TokLen) {
 1191 assert(FileLoc.isMacroID() && "Must be a macro expansion");
 1192
 1193 // Otherwise, we're lexing "mapped tokens". This is used for things like
 1194 // _Pragma handling. Combine the expansion location of FileLoc with the
 1195 // spelling location.
 1197
 1198 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
 1199 // characters come from spelling(FileLoc)+Offset.
 1200 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
 1201 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
 1202
 1203 // Figure out the expansion loc range, which is the range covered by the
 1204 // original _Pragma(...) sequence.
 1205 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
 1206
 1207 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
 1208}
1209
1210/// getSourceLocation - Return a source location identifier for the specified
1211/// offset in the current file.
/// TokLen is only consulted on the mapped-token (_Pragma) path below.
 1213 unsigned TokLen) const {
 1214 assert(Loc >= BufferStart && Loc <= BufferEnd &&
 1215 "Location out of range for this buffer!");
 1216
 1217 // In the normal case, we're just lexing from a simple file buffer, return
 1218 // the file id from FileLoc with the offset specified.
 1219 unsigned CharNo = Loc-BufferStart;
 1220 if (FileLoc.isFileID())
 1221 return FileLoc.getLocWithOffset(CharNo);
 1222
 1223 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
 1224 // tokens are lexed from where the _Pragma was defined.
 1225 assert(PP && "This doesn't work on raw lexers");
 1226 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
 1227}
1228
1229/// Diag - Forwarding function for diagnostics. This translate a source
1230/// position in the current buffer into a SourceLocation object for rendering.
1231DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1232 return PP->Diag(getSourceLocation(Loc), DiagID);
1233}
1234
1235//===----------------------------------------------------------------------===//
1236// Trigraph and Escaped Newline Handling Code.
1237//===----------------------------------------------------------------------===//
1238
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  // Third character of each trigraph paired with the character it decodes to.
  static const char TrigraphMap[][2] = {
      {'=', '#'}, {')', ']'},  {'(', '['}, {'!', '|'}, {'\'', '^'},
      {'>', '}'}, {'/', '\\'}, {'<', '{'}, {'-', '~'}};
  for (const auto &Entry : TrigraphMap)
    if (Entry[0] == Letter)
      return Entry[1];
  return 0;
}
1255
1256/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1257/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1258/// return the result character. Finally, emit a warning about trigraph use
1259/// whether trigraphs are enabled or not.
1260static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1261 char Res = GetTrigraphCharForLetter(*CP);
1262 if (!Res)
1263 return Res;
1264
1265 if (!Trigraphs) {
1266 if (L && !L->isLexingRawMode())
1267 L->Diag(CP-2, diag::trigraph_ignored);
1268 return 0;
1269 }
1270
1271 if (L && !L->isLexingRawMode())
1272 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1273 return Res;
1274}
1275
1276/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1277/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1278/// trigraph equivalent on entry to this function.
1279unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1280 unsigned Size = 0;
1281 while (isWhitespace(Ptr[Size])) {
1282 ++Size;
1283
1284 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1285 continue;
1286
1287 // If this is a \r\n or \n\r, skip the other half.
1288 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1289 Ptr[Size-1] != Ptr[Size])
1290 ++Size;
1291
1292 return Size;
1293 }
1294
1295 // Not an escaped newline, must be a \t or something else.
1296 return 0;
1297}
1298
1299/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1300/// them), skip over them and return the first non-escaped-newline found,
1301/// otherwise return P.
1302const char *Lexer::SkipEscapedNewLines(const char *P) {
1303 while (true) {
1304 const char *AfterEscape;
1305 if (*P == '\\') {
1306 AfterEscape = P+1;
1307 } else if (*P == '?') {
1308 // If not a trigraph for escape, bail out.
1309 if (P[1] != '?' || P[2] != '/')
1310 return P;
1311 // FIXME: Take LangOpts into account; the language might not
1312 // support trigraphs.
1313 AfterEscape = P+3;
1314 } else {
1315 return P;
1316 }
1317
1318 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1319 if (NewLineSize == 0) return P;
1320 P = AfterEscape+NewLineSize;
1321 }
1322}
1323
 // Raw-lexes and returns the token that begins after the token at Loc, or
 // std::nullopt when Loc is inside an unfinished macro expansion or the
 // buffer cannot be loaded.
 1325 const SourceManager &SM,
 1326 const LangOptions &LangOpts,
 1327 bool IncludeComments) {
 1328 if (Loc.isMacroID()) {
 1329 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
 1330 return std::nullopt;
 1331 }
 1332 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
 1333
 1334 // Break down the source location.
 1335 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
 1336
 1337 // Try to load the file buffer.
 1338 bool InvalidTemp = false;
 1339 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
 1340 if (InvalidTemp)
 1341 return std::nullopt;
 1342
 1343 const char *TokenBegin = File.data() + LocInfo.second;
 1344
 1345 // Lex from the start of the given location.
 1346 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
 1347 TokenBegin, File.end());
 1348 lexer.SetCommentRetentionState(IncludeComments);
 1349 // Find the token.
 1350 Token Tok;
 1351 lexer.LexFromRawLexer(Tok);
 1352 return Tok;
 1353}
1354
1355/// Checks that the given token is the first token that occurs after the
1356/// given location (this excludes comments and whitespace). Returns the location
1357/// immediately after the specified token. If the token is not found or the
1358/// location is inside a macro, the returned source location will be invalid.
 1361 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
 1362 std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
 1363 if (!Tok || Tok->isNot(TKind))
 1364 return {};
 1365 SourceLocation TokenLoc = Tok->getLocation();
 1366
 1367 // Calculate how much whitespace needs to be skipped if any.
 1368 unsigned NumWhitespaceChars = 0;
 1369 if (SkipTrailingWhitespaceAndNewLine) {
 1370 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
 1371 unsigned char C = *TokenEnd;
 1372 while (isHorizontalWhitespace(C)) {
 1373 C = *(++TokenEnd);
 1374 NumWhitespaceChars++;
 1375 }
 1376
 // Only a single newline (or one \r\n / \n\r pair) is consumed.
 1377 // Skip \r, \n, \r\n, or \n\r
 1378 if (C == '\n' || C == '\r') {
 1379 char PrevC = C;
 1380 C = *(++TokenEnd);
 1381 NumWhitespaceChars++;
 1382 if ((C == '\n' || C == '\r') && C != PrevC)
 1383 NumWhitespaceChars++;
 1384 }
 1385 }
 1386
 1387 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
 1388}
1389
1390/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1391/// get its size, and return it. This is tricky in several cases:
1392/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1393/// then either return the trigraph (skipping 3 chars) or the '?',
1394/// depending on whether trigraphs are enabled or not.
1395/// 2. If this is an escaped newline (potentially with whitespace between
1396/// the backslash and newline), implicitly skip the newline and return
1397/// the char after it.
1398///
1399/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1400/// know that we can accumulate into Size, and that we have already incremented
1401/// Ptr by Size bytes.
1402///
1403/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1404/// be updated to match.
1405Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
 // A null Tok suppresses diagnostics and NeedsCleaning bookkeeping.
 1406 unsigned Size = 0;
 1407 // If we have a slash, look for an escaped newline.
 1408 if (Ptr[0] == '\\') {
 1409 ++Size;
 1410 ++Ptr;
1411Slash:
 1412 // Common case, backslash-char where the char is not whitespace.
 1413 if (!isWhitespace(Ptr[0]))
 1414 return {'\\', Size};
 1415
 1416 // See if we have optional whitespace characters between the slash and
 1417 // newline.
 1418 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
 1419 // Remember that this token needs to be cleaned.
 1420 if (Tok) Tok->setFlag(Token::NeedsCleaning);
 1421
 1422 // Warn if there was whitespace between the backslash and newline.
 1423 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
 1424 Diag(Ptr, diag::backslash_newline_space);
 1425
 1426 // Found backslash<whitespace><newline>. Parse the char after it.
 1427 Size += EscapedNewLineSize;
 1428 Ptr += EscapedNewLineSize;
 1429
 // Recursion handles any further splices/trigraphs after the newline.
 1430 // Use slow version to accumulate a correct size field.
 1431 auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
 1432 CharAndSize.Size += Size;
 1433 return CharAndSize;
 1434 }
 1435
 1436 // Otherwise, this is not an escaped newline, just return the slash.
 1437 return {'\\', Size};
 1438 }
 1439
 1440 // If this is a trigraph, process it.
 1441 if (Ptr[0] == '?' && Ptr[1] == '?') {
 1442 // If this is actually a legal trigraph (not something like "??x"), emit
 1443 // a trigraph warning. If so, and if trigraphs are enabled, return it.
 1444 if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
 1445 LangOpts.Trigraphs)) {
 1446 // Remember that this token needs to be cleaned.
 1447 if (Tok) Tok->setFlag(Token::NeedsCleaning);
 1448
 1449 Ptr += 3;
 1450 Size += 3;
 // A trigraph that decodes to '\' may itself begin an escaped newline.
 1451 if (C == '\\') goto Slash;
 1452 return {C, Size};
 1453 }
 1454 }
 1455
 1456 // If this is neither, return a single character.
 1457 return {*Ptr, Size + 1u};
 1458}
1459
1460/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1461/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1462/// and that we have already incremented Ptr by Size bytes.
1463///
1464/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1465/// be updated to match.
/// Static variant: no Lexer state, so no diagnostics or token flags are
/// produced; trigraphs are honored purely from LangOpts.
1466Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1467 const LangOptions &LangOpts) {
1468
 1469 unsigned Size = 0;
 1470 // If we have a slash, look for an escaped newline.
 1471 if (Ptr[0] == '\\') {
 1472 ++Size;
 1473 ++Ptr;
1474Slash:
 1475 // Common case, backslash-char where the char is not whitespace.
 1476 if (!isWhitespace(Ptr[0]))
 1477 return {'\\', Size};
 1478
 1479 // See if we have optional whitespace characters followed by a newline.
 1480 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
 1481 // Found backslash<whitespace><newline>. Parse the char after it.
 1482 Size += EscapedNewLineSize;
 1483 Ptr += EscapedNewLineSize;
 1484
 1485 // Use slow version to accumulate a correct size field.
 1486 auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
 1487 CharAndSize.Size += Size;
 1488 return CharAndSize;
 1489 }
 1490
 1491 // Otherwise, this is not an escaped newline, just return the slash.
 1492 return {'\\', Size};
 1493 }
 1494
 1495 // If this is a trigraph, process it.
 1496 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
 1497 // If this is actually a legal trigraph (not something like "??x"), return
 1498 // it.
 1499 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
 1500 Ptr += 3;
 1501 Size += 3;
 // A trigraph decoding to '\' may itself begin an escaped newline.
 1502 if (C == '\\') goto Slash;
 1503 return {C, Size};
 1504 }
 1505 }
 1506
 1507 // If this is neither, return a single character.
 1508 return {*Ptr, Size + 1u};
 1509}
1510
1511//===----------------------------------------------------------------------===//
1512// Helper methods for lexing.
1513//===----------------------------------------------------------------------===//
1514
1515/// Routine that indiscriminately sets the offset into the source file.
1516void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1517 BufferPtr = BufferStart + Offset;
1518 if (BufferPtr > BufferEnd)
1519 BufferPtr = BufferEnd;
1520 // FIXME: What exactly does the StartOfLine bit mean? There are two
1521 // possible meanings for the "start" of the line: the first token on the
1522 // unexpanded line, or the first token on the expanded line.
1523 IsAtStartOfLine = StartOfLine;
1524 IsAtPhysicalStartOfLine = StartOfLine;
1525}
1526
// True when Codepoint is in the Unicode whitespace table (the char set is
// built once and cached in a function-local static).
1527static bool isUnicodeWhitespace(uint32_t Codepoint) {
 1528 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
 1530 return UnicodeWhitespaceChars.contains(Codepoint);
 1531}
1532
 // Renders a codepoint as an uppercase hex string (at least 4 digits) for
 // use in diagnostics.
 1534 llvm::SmallString<5> CharBuf;
 1535 llvm::raw_svector_ostream CharOS(CharBuf);
 1536 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
 1537 return CharBuf;
 1538}
1539
1540// To mitigate https://github.com/llvm/llvm-project/issues/54732,
1541// we allow "Mathematical Notation Characters" in identifiers.
1542// This is a proposed profile that extends the XID_Start/XID_continue
1543// with mathematical symbols, superscripts and subscripts digits
1544// found in some production software.
1545// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
// On success, IsExtension is set to true so callers can emit an extension
// diagnostic; it is left untouched on failure.
1546static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1547 bool IsStart, bool &IsExtension) {
 1548 static const llvm::sys::UnicodeCharSet MathStartChars(
 1550 static const llvm::sys::UnicodeCharSet MathContinueChars(
 1552 if (MathStartChars.contains(C) ||
 1553 (!IsStart && MathContinueChars.contains(C))) {
 1554 IsExtension = true;
 1555 return true;
 1556 }
 1557 return false;
 1558}
1559
// Whether codepoint C may appear in a non-leading identifier position under
// the active language mode; IsExtension is set when acceptance comes from
// the mathematical-notation profile above.
1560static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1561 bool &IsExtension) {
 1562 if (LangOpts.AsmPreprocessor) {
 1563 return false;
 1564 } else if (LangOpts.DollarIdents && '$' == C) {
 1565 return true;
 1566 } else if (LangOpts.CPlusPlus || LangOpts.C23) {
 1567 // A non-leading codepoint must have the XID_Continue property.
 1568 // XIDContinueRanges doesn't contain characters also in XIDStartRanges,
 1569 // so we need to check both tables.
 1570 // '_' doesn't have the XID_Continue property but is allowed in C and C++.
 1571 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
 1572 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
 1573 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
 1574 return true;
 1575 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
 1576 IsExtension);
 1577 } else if (LangOpts.C11) {
 1578 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
 1580 return C11AllowedIDChars.contains(C);
 1581 } else {
 // Pre-C11 modes fall back to the C99 Annex D table.
 1582 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
 1584 return C99AllowedIDChars.contains(C);
 1585 }
 1586}
1587
// Whether codepoint C may begin an identifier under the active language
// mode. Only called for non-ASCII codepoints (asserted below).
1588static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1589 bool &IsExtension) {
 1590 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
 1591 IsExtension = false;
 1592 if (LangOpts.AsmPreprocessor) {
 1593 return false;
 1594 }
 1595 if (LangOpts.CPlusPlus || LangOpts.C23) {
 1596 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
 1597 if (XIDStartChars.contains(C))
 1598 return true;
 1599 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
 1600 IsExtension);
 1601 }
 // C99/C11 modes: the char must be a valid continue char that is not in the
 // mode's disallowed-initial table.
 1602 if (!isAllowedIDChar(C, LangOpts, IsExtension))
 1603 return false;
 1604 if (LangOpts.C11) {
 1605 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
 1607 return !C11DisallowedInitialIDChars.contains(C);
 1608 }
 1609 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
 1611 return !C99DisallowedInitialIDChars.contains(C);
 1612}
1613
 // Emits the ext_mathematical_notation extension diagnostic for a codepoint
 // that was accepted via the mathematical-notation identifier profile.
 // NOTE(review): the (void) casts keep release builds (where the assert
 // compiles away) free of unused-variable warnings.
 1616
 1617 static const llvm::sys::UnicodeCharSet MathStartChars(
 1619 static const llvm::sys::UnicodeCharSet MathContinueChars(
 1621
 1622 (void)MathStartChars;
 1623 (void)MathContinueChars;
 1624 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
 1625 "Unexpected mathematical notation codepoint");
 1626 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
 1628}
1629
// Builds a character range over [Begin, End) in the current buffer, mapping
// both raw pointers through the lexer's getSourceLocation.
1630static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1631 const char *End) {
 1633 L.getSourceLocation(End));
 1634}
1635
// Warns (warn_c99_compat_unicode_id) when an identifier codepoint accepted
// in the current mode would not have been valid in C99.
1636static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1637 CharSourceRange Range, bool IsFirst) {
 1638 // Check C99 compatibility.
 // Skip the table lookups entirely when the warning is disabled here.
 1639 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
 1640 enum {
 1641 CannotAppearInIdentifier = 0,
 1642 CannotStartIdentifier
 1643 };
 1644
 1645 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
 1647 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
 1649 if (!C99AllowedIDChars.contains(C)) {
 1650 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
 1651 << Range
 1652 << CannotAppearInIdentifier;
 1653 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
 1654 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
 1655 << Range
 1656 << CannotStartIdentifier;
 1657 }
 1658 }
 1659}
1660
1661/// After encountering UTF-8 character C and interpreting it as an identifier
1662/// character, check whether it's a homoglyph for a common non-identifier
1663/// source character that is unlikely to be an intentional identifier
1664/// character and warn if so.
 1667 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
 1668 struct HomoglyphPair {
 1669 uint32_t Character;
 1670 char LooksLike;
 1671 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
 1672 };
 // Table must stay sorted by Character: it is binary-searched below.
 // LooksLike == 0 marks invisible (zero-width) characters.
 1673 static constexpr HomoglyphPair SortedHomoglyphs[] = {
 1674 {U'\u00ad', 0}, // SOFT HYPHEN
 1675 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
 1676 {U'\u037e', ';'}, // GREEK QUESTION MARK
 1677 {U'\u200b', 0}, // ZERO WIDTH SPACE
 1678 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
 1679 {U'\u200d', 0}, // ZERO WIDTH JOINER
 1680 {U'\u2060', 0}, // WORD JOINER
 1681 {U'\u2061', 0}, // FUNCTION APPLICATION
 1682 {U'\u2062', 0}, // INVISIBLE TIMES
 1683 {U'\u2063', 0}, // INVISIBLE SEPARATOR
 1684 {U'\u2064', 0}, // INVISIBLE PLUS
 1685 {U'\u2212', '-'}, // MINUS SIGN
 1686 {U'\u2215', '/'}, // DIVISION SLASH
 1687 {U'\u2216', '\\'}, // SET MINUS
 1688 {U'\u2217', '*'}, // ASTERISK OPERATOR
 1689 {U'\u2223', '|'}, // DIVIDES
 1690 {U'\u2227', '^'}, // LOGICAL AND
 1691 {U'\u2236', ':'}, // RATIO
 1692 {U'\u223c', '~'}, // TILDE OPERATOR
 1693 {U'\ua789', ':'}, // MODIFIER LETTER COLON
 1694 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
 1695 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
 1696 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
 1697 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
 1698 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
 1699 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
 1700 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
 1701 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
 1702 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
 1703 {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
 1704 {U'\uff0c', ','}, // FULLWIDTH COMMA
 1705 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
 1706 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
 1707 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
 1708 {U'\uff1a', ':'}, // FULLWIDTH COLON
 1709 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
 1710 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
 1711 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
 1712 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
 1713 {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
 1714 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
 1715 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
 1716 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
 1717 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
 1718 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
 1719 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
 1720 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
 1721 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
 1722 {U'\uff5e', '~'}, // FULLWIDTH TILDE
 1723 {0, 0}
 1724 };
 // The trailing {0,0} sentinel is excluded from the search range.
 1725 auto Homoglyph =
 1726 std::lower_bound(std::begin(SortedHomoglyphs),
 1727 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
 1728 if (Homoglyph->Character == C) {
 1729 if (Homoglyph->LooksLike) {
 1730 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
 1731 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
 1732 << Range << codepointAsHexString(C) << LooksLikeStr;
 1733 } else {
 1734 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
 1736 }
 1737 }
 1738}
1739
 // Emits an error for a non-ASCII codepoint used where identifiers do not
 // allow it, distinguishing "not allowed at all" from "not allowed as the
 // first character".
 1741 DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
 1742 CharSourceRange Range, bool IsFirst) {
 1743 if (isASCII(CodePoint))
 1744 return;
 1745
 1746 bool IsExtension;
 1747 bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
 1748 bool IsIDContinue =
 1749 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
 1750
 // Nothing to report: the codepoint is valid in this position.
 1751 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
 1752 return;
 1753
 1754 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
 1755
 1756 if (!IsFirst || InvalidOnlyAtStart) {
 1757 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
 1758 << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
 1760 } else {
 1761 Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
 1762 << Range << codepointAsHexString(CodePoint)
 1764 }
 1765}
1766
// Attempts to consume a UCN (\uXXXX or \UXXXXXXXX) as an identifier
// continuation character; advances CurPtr and flags the token on success.
1767bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1768 Token &Result) {
 1769 const char *UCNPtr = CurPtr + Size;
 1770 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
 1771 if (CodePoint == 0) {
 1772 return false;
 1773 }
 1774 bool IsExtension = false;
 1775 if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
 1776 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
 1777 return false;
 1781 PP->getDiagnostics(), LangOpts, CodePoint,
 1782 makeCharRange(*this, CurPtr, UCNPtr),
 1783 /*IsFirst=*/false);
 1784
 1785 // We got a unicode codepoint that is neither a space nor a
 1786 // valid identifier part.
 1787 // Carry on as if the codepoint was valid for recovery purposes.
 1788 } else if (!isLexingRawMode()) {
 1789 if (IsExtension)
 1791 makeCharRange(*this, CurPtr, UCNPtr));
 1792
 1794 makeCharRange(*this, CurPtr, UCNPtr),
 1795 /*IsFirst=*/false);
 1796 }
 1797
 1798 Result.setFlag(Token::HasUCN);
 // A cleanly spelled \uXXXX (6 chars) or \UXXXXXXXX (10 chars) can be
 // skipped wholesale; otherwise advance char-by-char so escapes/trigraphs
 // inside the UCN spelling are accounted for.
 1799 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
 1800 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
 1801 CurPtr = UCNPtr;
 1802 else
 1803 while (CurPtr != UCNPtr)
 1804 (void)getAndAdvanceChar(CurPtr, Result);
 1805 return true;
 1806}
1807
// Attempts to consume a raw UTF-8 encoded codepoint as an identifier
// continuation character; advances CurPtr past it on success.
1808bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
 1809 llvm::UTF32 CodePoint;
 1810
 1811 // If a UTF-8 codepoint appears immediately after an escaped new line,
 1812 // CurPtr may point to the splicing \ on the preceding line,
 1813 // so we need to skip it.
 1814 unsigned FirstCodeUnitSize;
 1815 getCharAndSize(CurPtr, FirstCodeUnitSize);
 1816 const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
 1817 const char *UnicodePtr = CharStart;
 1818
 // Strict conversion rejects overlong/invalid UTF-8 sequences.
 1819 llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
 1820 (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
 1821 &CodePoint, llvm::strictConversion);
 1822 if (ConvResult != llvm::conversionOK)
 1823 return false;
 1824
 1825 bool IsExtension = false;
 1826 if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
 1827 IsExtension)) {
 1828 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
 1829 return false;
 1830
 1834 PP->getDiagnostics(), LangOpts, CodePoint,
 1835 makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
 1836 // We got a unicode codepoint that is neither a space nor a
 1837 // valid identifier part. Carry on as if the codepoint was
 1838 // valid for recovery purposes.
 1839 } else if (!isLexingRawMode()) {
 1840 if (IsExtension)
 1842 PP->getDiagnostics(), CodePoint,
 1843 makeCharRange(*this, CharStart, UnicodePtr));
 1845 makeCharRange(*this, CharStart, UnicodePtr),
 1846 /*IsFirst=*/false);
 1848 makeCharRange(*this, CharStart, UnicodePtr));
 1849 }
 1850
 1851 // Once we successfully parsed some UTF-8,
 1852 // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
 1853 // being lexed, and that warnings about trailing spaces are emitted.
 1854 ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
 1855 CurPtr = UnicodePtr;
 1856 return true;
 1857}
1858
/// Lex a token that begins with the already-read Unicode codepoint C: either
/// the start of an identifier, a silently-dropped stray character, or an
/// unknown token.
///
/// \param Result the token being formed.
/// \param C the codepoint already consumed starting at BufferPtr.
/// \param CurPtr points just past the codepoint's spelling.
/// \returns true if a token was produced, false if the character was dropped.
bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
                                      const char *CurPtr) {
  bool IsExtension = false;
  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
    if (IsExtension)
                                makeCharRange(*this, BufferPtr, CurPtr));
                            makeCharRange(*this, BufferPtr, CurPtr),
                            /*IsFirst=*/true);
                              makeCharRange(*this, BufferPtr, CurPtr));
    }

    // Valid identifier start: consume the rest of the identifier.
    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);
  }

      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
        PP->getDiagnostics(), LangOpts, C,
        makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}
1904
/// Advance over a run of ASCII identifier-continue characters
/// ([_A-Za-z0-9]) and return a pointer to the first byte past the run.
/// When SSE4.2 is available, scans 16 bytes per iteration; the scalar loop
/// handles the remainder (or the whole scan without SSE4.2).
static const char *
fastParseASCIIIdentifier(const char *CurPtr,
                         [[maybe_unused]] const char *BufferEnd) {
#ifdef __SSE4_2__
  // Four inclusive character ranges ('_'-'_', 'A'-'Z', 'a'-'z', '0'-'9')
  // used by the ranges-mode packed string compare below.
  alignas(16) static constexpr char AsciiIdentifierRange[16] = {
      '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
  };
  constexpr ssize_t BytesPerRegister = 16;

  __m128i AsciiIdentifierRangeV =
      _mm_load_si128((const __m128i *)AsciiIdentifierRange);

  while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
    __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));

    // Consumed counts leading identifier bytes in this register; a full
    // register (16) means the run continues into the next 16 bytes.
    int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
    CurPtr += Consumed;
    if (Consumed == BytesPerRegister)
      continue;
    return CurPtr;
  }
#endif

  // Scalar tail: advance one byte at a time.
  unsigned char C = *CurPtr;
    C = *++CurPtr;
  return CurPtr;
}
1935
/// Lex the remainder of an identifier whose first character has already been
/// consumed: ASCII identifier characters, '$' (as an extension), UCNs, and
/// UTF-8 codepoints. Forms a raw_identifier token and, outside raw mode,
/// fills in its IdentifierInfo via the preprocessor.
bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched an identifier start.

  while (true) {

    // Fast path: consume plain ASCII identifier characters in bulk.
    CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);

    unsigned Size;
    // Slow path: handle trigraph, unicode codepoints, UCNs.
    unsigned char C = getCharAndSize(CurPtr, Size);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents)
        break;
      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    // A '\' may introduce a UCN that continues the identifier.
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      continue;
    // A non-ASCII byte may begin a UTF-8 codepoint that continues it.
    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      continue;
    // Neither an expected Unicode codepoint nor a UCN.
    break;
  }

  // Form a raw_identifier token covering everything consumed so far.
  const char *IdStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
  Result.setRawIdentifierData(IdStart);

  // If we are in raw mode, return this identifier raw. There is no need to
  // look up identifier information or attempt to macro expand it.
  if (LexingRawMode)
    return true;

  // Fill in Result.IdentifierInfo and update the token kind,
  // looking up the identifier in the identifier table.
  // Note that we have to call PP->LookUpIdentifierInfo() even for code
  // completion, it writes IdentifierInfo into Result, and callers rely on it.

  // If the completion point is at the end of an identifier, we want to treat
  // the identifier as incomplete even if it resolves to a macro or a keyword.
  // This allows e.g. 'class^' to complete to 'classifier'.
  if (isCodeCompletionPoint(CurPtr)) {
    // Return the code-completion token.
    Result.setKind(tok::code_completion);
    // Skip the code-completion char and all immediate identifier characters.
    // This ensures we get consistent behavior when completing at any point in
    // an identifier (i.e. at the start, in the middle, at the end). Note that
    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
    // simpler.
    assert(*CurPtr == 0 && "Completion character must be 0");
    ++CurPtr;
    // Note that code completion token is not added as a separate character
    // when the completion point is at the end of the buffer. Therefore, we need
    // to check if the buffer has ended.
    if (CurPtr < BufferEnd) {
      while (isAsciiIdentifierContinue(*CurPtr))
        ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // Finally, now that we know we have an identifier, pass this off to the
  // preprocessor, which may macro expand it or something.
  if (II->isHandleIdentifierCase())
    return PP->HandleIdentifier(Result);

  return true;
}
2014
2015/// isHexaLiteral - Return true if Start points to a hex constant.
2016/// in microsoft mode (where this is supposed to be several different tokens).
2017bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
2018 auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
2019 char C1 = CharAndSize1.Char;
2020 if (C1 != '0')
2021 return false;
2022
2023 auto CharAndSize2 =
2024 Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
2025 char C2 = CharAndSize2.Char;
2026 return (C2 == 'x' || C2 == 'X');
2027}
2028
2029/// LexNumericConstant - Lex the remainder of a integer or floating point
2030/// constant. From[-1] is the first character lexed. Return the end of the
2031/// constant.
2032bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2033 unsigned Size;
2034 char C = getCharAndSize(CurPtr, Size);
2035 char PrevCh = 0;
2036 while (isPreprocessingNumberBody(C)) {
2037 CurPtr = ConsumeChar(CurPtr, Size, Result);
2038 PrevCh = C;
2039 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
2040 CurPtr -= Size;
2041 break;
2042 }
2043 C = getCharAndSize(CurPtr, Size);
2044 }
2045
2046 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
2047 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
2048 // If we are in Microsoft mode, don't continue if the constant is hex.
2049 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
2050 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
2051 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2052 }
2053
2054 // If we have a hex FP constant, continue.
2055 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
2056 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
2057 // not-quite-conforming extension. Only do so if this looks like it's
2058 // actually meant to be a hexfloat, and not if it has a ud-suffix.
2059 bool IsHexFloat = true;
2060 if (!LangOpts.C99) {
2061 if (!isHexaLiteral(BufferPtr, LangOpts))
2062 IsHexFloat = false;
2063 else if (!LangOpts.CPlusPlus17 &&
2064 std::find(BufferPtr, CurPtr, '_') != CurPtr)
2065 IsHexFloat = false;
2066 }
2067 if (IsHexFloat)
2068 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2069 }
2070
2071 // If we have a digit separator, continue.
2072 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2073 auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
2074 if (isAsciiIdentifierContinue(Next)) {
2075 if (!isLexingRawMode())
2076 Diag(CurPtr, LangOpts.CPlusPlus
2077 ? diag::warn_cxx11_compat_digit_separator
2078 : diag::warn_c23_compat_digit_separator);
2079 CurPtr = ConsumeChar(CurPtr, Size, Result);
2080 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
2081 return LexNumericConstant(Result, CurPtr);
2082 }
2083 }
2084
2085 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
2086 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2087 return LexNumericConstant(Result, CurPtr);
2088 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2089 return LexNumericConstant(Result, CurPtr);
2090
2091 // Update the location of token as well as BufferPtr.
2092 const char *TokStart = BufferPtr;
2093 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2094 Result.setLiteralData(TokStart);
2095 return true;
2096}
2097
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
///
/// \param Result the literal token the suffix attaches to; gets
///        Token::HasUDSuffix set when a suffix is consumed.
/// \param CurPtr points just past the closing quote of the literal.
/// \param IsStringLiteral true for string literals, false for char literals.
/// \returns a pointer past the consumed ud-suffix (CurPtr if none).
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(LangOpts.CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isAsciiIdentifierStart(C)) {
    // The suffix may begin with a UCN or a UTF-8 encoded codepoint.
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      Consumed = true;
    else
      return CurPtr;
  }

  // Before C++11 a ud-suffix is not valid; warn and leave the identifier to
  // be lexed as a separate token.
  if (!LangOpts.CPlusPlus11) {
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && LangOpts.CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        auto [Next, NextSize] =
            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
        if (!isAsciiIdentifierContinue(Next)) {
          // End of suffix. Check whether this is on the allowed list.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix =
              StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  // Consume the rest of the suffix: ASCII identifier characters, UCNs, and
  // UTF-8 codepoints.
  Result.setFlag(Token::HasUDSuffix);
  while (true) {
    C = getCharAndSize(CurPtr, Size);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
    } else
      break;
  }

  return CurPtr;
}
2189
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
///
/// \param Result the token being formed.
/// \param CurPtr points just past the opening '"'.
/// \param Kind the specific string-literal token kind (encodes the prefix).
/// \returns true (a token is always formed).
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  // Warn on unicode string literals under pre-unicode-literal standards.
  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
                                       : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A NUL may be a code-completion point; otherwise remember it so we
      // can warn about it once the literal has been fully lexed.
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2250
2251/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2252/// having lexed R", LR", u8R", uR", or UR".
2253bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2254 tok::TokenKind Kind) {
2255 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2256 // Between the initial and final double quote characters of the raw string,
2257 // any transformations performed in phases 1 and 2 (trigraphs,
2258 // universal-character-names, and line splicing) are reverted.
2259
2260 if (!isLexingRawMode())
2261 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2262
2263 unsigned PrefixLen = 0;
2264
2265 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
2266 if (!isLexingRawMode() &&
2267 llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
2268 const char *Pos = &CurPtr[PrefixLen];
2269 Diag(Pos, LangOpts.CPlusPlus26
2270 ? diag::warn_cxx26_compat_raw_string_literal_character_set
2271 : diag::ext_cxx26_raw_string_literal_character_set)
2272 << StringRef(Pos, 1);
2273 }
2274 ++PrefixLen;
2275 }
2276
2277 // If the last character was not a '(', then we didn't lex a valid delimiter.
2278 if (CurPtr[PrefixLen] != '(') {
2279 if (!isLexingRawMode()) {
2280 const char *PrefixEnd = &CurPtr[PrefixLen];
2281 if (PrefixLen == 16) {
2282 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2283 } else if (*PrefixEnd == '\n') {
2284 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
2285 } else {
2286 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2287 << StringRef(PrefixEnd, 1);
2288 }
2289 }
2290
2291 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2292 // it's possible the '"' was intended to be part of the raw string, but
2293 // there's not much we can do about that.
2294 while (true) {
2295 char C = *CurPtr++;
2296
2297 if (C == '"')
2298 break;
2299 if (C == 0 && CurPtr-1 == BufferEnd) {
2300 --CurPtr;
2301 break;
2302 }
2303 }
2304
2305 FormTokenWithChars(Result, CurPtr, tok::unknown);
2306 return true;
2307 }
2308
2309 // Save prefix and move CurPtr past it
2310 const char *Prefix = CurPtr;
2311 CurPtr += PrefixLen + 1; // skip over prefix and '('
2312
2313 while (true) {
2314 char C = *CurPtr++;
2315
2316 if (C == ')') {
2317 // Check for prefix match and closing quote.
2318 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2319 CurPtr += PrefixLen + 1; // skip over prefix and '"'
2320 break;
2321 }
2322 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2323 if (!isLexingRawMode())
2324 Diag(BufferPtr, diag::err_unterminated_raw_string)
2325 << StringRef(Prefix, PrefixLen);
2326 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2327 return true;
2328 }
2329 }
2330
2331 // If we are in C++11, lex the optional ud-suffix.
2332 if (LangOpts.CPlusPlus)
2333 CurPtr = LexUDSuffix(Result, CurPtr, true);
2334
2335 // Update the location of token as well as BufferPtr.
2336 const char *TokStart = BufferPtr;
2337 FormTokenWithChars(Result, CurPtr, Kind);
2338 Result.setLiteralData(TokStart);
2339 return true;
2340}
2341
2342/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2343/// after having lexed the '<' character. This is used for #include filenames.
2344bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2345 // Does this string contain the \0 character?
2346 const char *NulCharacter = nullptr;
2347 const char *AfterLessPos = CurPtr;
2348 char C = getAndAdvanceChar(CurPtr, Result);
2349 while (C != '>') {
2350 // Skip escaped characters. Escaped newlines will already be processed by
2351 // getAndAdvanceChar.
2352 if (C == '\\')
2353 C = getAndAdvanceChar(CurPtr, Result);
2354
2355 if (isVerticalWhitespace(C) || // Newline.
2356 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2357 // If the filename is unterminated, then it must just be a lone <
2358 // character. Return this as such.
2359 FormTokenWithChars(Result, AfterLessPos, tok::less);
2360 return true;
2361 }
2362
2363 if (C == 0) {
2364 if (isCodeCompletionPoint(CurPtr - 1)) {
2365 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2366 cutOffLexing();
2367 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2368 return true;
2369 }
2370 NulCharacter = CurPtr-1;
2371 }
2372 C = getAndAdvanceChar(CurPtr, Result);
2373 }
2374
2375 // If a nul character existed in the string, warn about it.
2376 if (NulCharacter && !isLexingRawMode())
2377 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2378
2379 // Update the location of token as well as BufferPtr.
2380 const char *TokStart = BufferPtr;
2381 FormTokenWithChars(Result, CurPtr, tok::header_name);
2382 Result.setLiteralData(TokStart);
2383 return true;
2384}
2385
/// Code-complete a partially written #include filename.
///
/// \param PathStart start of the spelled path (just past the quote/angle).
/// \param CompletionPoint position of the code-completion marker.
/// \param IsAngled true for <...> includes, false for "..." includes.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
  auto Slash = PartialPath.find_last_of(SlashChars);
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote or closest slash,
  // if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    // Stop at end of line / buffer, at the closing delimiter, or at a path
    // separator; the separator and delimiter stay outside the replaced range.
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
    if (SlashChars.contains(Next))
      break;
  }

      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}
2418
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
///
/// \param Result the token being formed.
/// \param CurPtr points just past the opening '\''.
/// \param Kind the specific character-constant token kind.
/// \returns true (a token is always formed).
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  // Warn about unicode / u8 character literals under older standards.
  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx14_compat_u8_character_literal
                          : diag::warn_c17_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2485
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Track the last newline seen; the first one is recorded in NewLinePtr.
  // The pair is used below to report runs of empty lines.
  const char *lastNewLine = nullptr;
  auto setLastNewLine = [&](const char *Ptr) {
    lastNewLine = Ptr;
    if (!NewLinePtr)
      NewLinePtr = Ptr;
  };
  if (SawNewline)
    setLastNewLine(CurPtr - 1);

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    if (*CurPtr == '\n')
      setLastNewLine(CurPtr);
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;

    // Multiple distinct newlines were skipped: report the empty-line range
    // to the preprocessor's EmptylineHandler, if one is installed.
    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
      if (auto *Handler = PP->getEmptylineHandler())
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
                                             getSourceLocation(lastNewLine)));
    }
  }

  BufferPtr = CurPtr;
  return false;
}
2559
/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment. The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them. As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    // Validate and step over a non-ASCII UTF-8 sequence in the comment body.
    if (!isASCII(C)) {
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
    // properly decode the character. Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs. If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment. Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline. Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
                         getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character. We don't care if this is a \n\r or
  // \r\n sequence. This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness. Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  NewLinePtr = CurPtr++;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}
2729
2730/// If in save-comment mode, package up this Line comment in an appropriate
2731/// way and return it.
2732bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2733 // If we're not in a preprocessor directive, just return the // comment
2734 // directly.
2735 FormTokenWithChars(Result, CurPtr, tok::comment);
2736
2738 return true;
2739
2740 // If this Line-style comment is in a macro definition, transmogrify it into
2741 // a C-style block comment.
2742 bool Invalid = false;
2743 std::string Spelling = PP->getSpelling(Result, &Invalid);
2744 if (Invalid)
2745 return true;
2746
2747 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2748 Spelling[1] = '*'; // Change prefix to "/*".
2749 Spelling += "*/"; // add suffix.
2750
2751 Result.setKind(tok::comment);
2752 PP->CreateString(Spelling, Result,
2753 Result.getLocation(), Result.getLocation());
2754 return true;
2755}
2756
2757/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2758/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2759/// a diagnostic if so. We know that the newline is inside of a block comment.
2760static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
2761 bool Trigraphs) {
2762 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2763
2764 // Position of the first trigraph in the ending sequence.
2765 const char *TrigraphPos = nullptr;
2766 // Position of the first whitespace after a '\' in the ending sequence.
2767 const char *SpacePos = nullptr;
2768
2769 while (true) {
2770 // Back up off the newline.
2771 --CurPtr;
2772
2773 // If this is a two-character newline sequence, skip the other character.
2774 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2775 // \n\n or \r\r -> not escaped newline.
2776 if (CurPtr[0] == CurPtr[1])
2777 return false;
2778 // \n\r or \r\n -> skip the newline.
2779 --CurPtr;
2780 }
2781
2782 // If we have horizontal whitespace, skip over it. We allow whitespace
2783 // between the slash and newline.
2784 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2785 SpacePos = CurPtr;
2786 --CurPtr;
2787 }
2788
2789 // If we have a slash, this is an escaped newline.
2790 if (*CurPtr == '\\') {
2791 --CurPtr;
2792 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2793 // This is a trigraph encoding of a slash.
2794 TrigraphPos = CurPtr - 2;
2795 CurPtr -= 3;
2796 } else {
2797 return false;
2798 }
2799
2800 // If the character preceding the escaped newline is a '*', then after line
2801 // splicing we have a '*/' ending the comment.
2802 if (*CurPtr == '*')
2803 break;
2804
2805 if (*CurPtr != '\n' && *CurPtr != '\r')
2806 return false;
2807 }
2808
2809 if (TrigraphPos) {
2810 // If no trigraphs are enabled, warn that we ignored this trigraph and
2811 // ignore this * character.
2812 if (!Trigraphs) {
2813 if (!L->isLexingRawMode())
2814 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2815 return false;
2816 }
2817 if (!L->isLexingRawMode())
2818 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2819 }
2820
2821 // Warn about having an escaped newline between the */ characters.
2822 if (!L->isLexingRawMode())
2823 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2824
2825 // If there was space between the backslash and newline, warn about it.
2826 if (SpacePos && !L->isLexingRawMode())
2827 L->Diag(SpacePos, diag::backslash_newline_space);
2828
2829 return true;
2830}
2831
2832#ifdef __SSE2__
2833#include <emmintrin.h>
2834#elif __ALTIVEC__
2835#include <altivec.h>
2836#undef bool
2837#endif
2838
2839/// We have just read from input the / and * characters that started a comment.
2840/// Read until we find the * and / characters that terminate the comment.
2841/// Note that we don't bother decoding trigraphs or escaped newlines in block
2842/// comments, because they cannot cause the comment to end. The only thing
2843/// that can happen is the comment could end with an escaped newline between
2844/// the terminating * and /.
2845///
2846/// If we're in KeepCommentMode or any CommentHandler has inserted
2847/// some tokens, this will store the first token and return true.
2848bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2849 bool &TokAtPhysicalStartOfLine) {
2850 // Scan one character past where we should, looking for a '/' character. Once
2851 // we find it, check to see if it was preceded by a *. This common
2852 // optimization helps people who like to put a lot of * characters in their
2853 // comments.
2854
2855 // The first character we get with newlines and trigraphs skipped to handle
2856 // the degenerate /*/ case below correctly if the * has an escaped newline
2857 // after it.
2858 unsigned CharSize;
2859 unsigned char C = getCharAndSize(CurPtr, CharSize);
2860 CurPtr += CharSize;
2861 if (C == 0 && CurPtr == BufferEnd+1) {
2862 if (!isLexingRawMode())
2863 Diag(BufferPtr, diag::err_unterminated_block_comment);
2864 --CurPtr;
2865
2866 // KeepWhitespaceMode should return this broken comment as a token. Since
2867 // it isn't a well formed comment, just return it as an 'unknown' token.
2868 if (isKeepWhitespaceMode()) {
2869 FormTokenWithChars(Result, CurPtr, tok::unknown);
2870 return true;
2871 }
2872
2873 BufferPtr = CurPtr;
2874 return false;
2875 }
2876
2877 // Check to see if the first character after the '/*' is another /. If so,
2878 // then this slash does not end the block comment, it is part of it.
2879 if (C == '/')
2880 C = *CurPtr++;
2881
2882 // C++23 [lex.phases] p1
2883 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2884 // diagnostic only once per entire ill-formed subsequence to avoid
2885 // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2886 bool UnicodeDecodingAlreadyDiagnosed = false;
2887
2888 while (true) {
2889 // Skip over all non-interesting characters until we find end of buffer or a
2890 // (probably ending) '/' character.
2891 if (CurPtr + 24 < BufferEnd &&
2892 // If there is a code-completion point avoid the fast scan because it
2893 // doesn't check for '\0'.
2894 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2895 // While not aligned to a 16-byte boundary.
2896 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2897 if (!isASCII(C))
2898 goto MultiByteUTF8;
2899 C = *CurPtr++;
2900 }
2901 if (C == '/') goto FoundSlash;
2902
2903#ifdef __SSE2__
2904 __m128i Slashes = _mm_set1_epi8('/');
2905 while (CurPtr + 16 < BufferEnd) {
2906 int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2907 if (LLVM_UNLIKELY(Mask != 0)) {
2908 goto MultiByteUTF8;
2909 }
2910 // look for slashes
2911 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2912 Slashes));
2913 if (cmp != 0) {
2914 // Adjust the pointer to point directly after the first slash. It's
2915 // not necessary to set C here, it will be overwritten at the end of
2916 // the outer loop.
2917 CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
2918 goto FoundSlash;
2919 }
2920 CurPtr += 16;
2921 }
2922#elif __ALTIVEC__
2923 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2924 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2925 0x80, 0x80, 0x80, 0x80};
2926 __vector unsigned char Slashes = {
2927 '/', '/', '/', '/', '/', '/', '/', '/',
2928 '/', '/', '/', '/', '/', '/', '/', '/'
2929 };
2930 while (CurPtr + 16 < BufferEnd) {
2931 if (LLVM_UNLIKELY(
2932 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2933 goto MultiByteUTF8;
2934 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2935 break;
2936 }
2937 CurPtr += 16;
2938 }
2939
2940#else
2941 while (CurPtr + 16 < BufferEnd) {
2942 bool HasNonASCII = false;
2943 for (unsigned I = 0; I < 16; ++I)
2944 HasNonASCII |= !isASCII(CurPtr[I]);
2945
2946 if (LLVM_UNLIKELY(HasNonASCII))
2947 goto MultiByteUTF8;
2948
2949 bool HasSlash = false;
2950 for (unsigned I = 0; I < 16; ++I)
2951 HasSlash |= CurPtr[I] == '/';
2952 if (HasSlash)
2953 break;
2954 CurPtr += 16;
2955 }
2956#endif
2957
2958 // It has to be one of the bytes scanned, increment to it and read one.
2959 C = *CurPtr++;
2960 }
2961
2962 // Loop to scan the remainder, warning on invalid UTF-8
2963 // if the corresponding warning is enabled, emitting a diagnostic only once
2964 // per sequence that cannot be decoded.
2965 while (C != '/' && C != '\0') {
2966 if (isASCII(C)) {
2967 UnicodeDecodingAlreadyDiagnosed = false;
2968 C = *CurPtr++;
2969 continue;
2970 }
2971 MultiByteUTF8:
2972 // CurPtr is 1 code unit past C, so to decode
2973 // the codepoint, we need to read from the previous position.
2974 unsigned Length = llvm::getUTF8SequenceSize(
2975 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2976 if (Length == 0) {
2977 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2978 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2979 UnicodeDecodingAlreadyDiagnosed = true;
2980 } else {
2981 UnicodeDecodingAlreadyDiagnosed = false;
2982 CurPtr += Length - 1;
2983 }
2984 C = *CurPtr++;
2985 }
2986
2987 if (C == '/') {
2988 FoundSlash:
2989 if (CurPtr[-2] == '*') // We found the final */. We're done!
2990 break;
2991
2992 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2993 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
2994 LangOpts.Trigraphs)) {
2995 // We found the final */, though it had an escaped newline between the
2996 // * and /. We're done!
2997 break;
2998 }
2999 }
3000 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
3001 // If this is a /* inside of the comment, emit a warning. Don't do this
3002 // if this is a /*/, which will end the comment. This misses cases with
3003 // embedded escaped newlines, but oh well.
3004 if (!isLexingRawMode())
3005 Diag(CurPtr-1, diag::warn_nested_block_comment);
3006 }
3007 } else if (C == 0 && CurPtr == BufferEnd+1) {
3008 if (!isLexingRawMode())
3009 Diag(BufferPtr, diag::err_unterminated_block_comment);
3010 // Note: the user probably forgot a */. We could continue immediately
3011 // after the /*, but this would involve lexing a lot of what really is the
3012 // comment, which surely would confuse the parser.
3013 --CurPtr;
3014
3015 // KeepWhitespaceMode should return this broken comment as a token. Since
3016 // it isn't a well formed comment, just return it as an 'unknown' token.
3017 if (isKeepWhitespaceMode()) {
3018 FormTokenWithChars(Result, CurPtr, tok::unknown);
3019 return true;
3020 }
3021
3022 BufferPtr = CurPtr;
3023 return false;
3024 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
3026 cutOffLexing();
3027 return false;
3028 }
3029
3030 C = *CurPtr++;
3031 }
3032
3033 // Notify comment handlers about the comment unless we're in a #if 0 block.
3034 if (PP && !isLexingRawMode() &&
3036 getSourceLocation(CurPtr)))) {
3037 BufferPtr = CurPtr;
3038 return true; // A token has to be returned.
3039 }
3040
3041 // If we are returning comments as tokens, return this comment as a token.
3042 if (inKeepCommentMode()) {
3043 FormTokenWithChars(Result, CurPtr, tok::comment);
3044 return true;
3045 }
3046
3047 // It is common for the tokens immediately after a /**/ comment to be
3048 // whitespace. Instead of going through the big switch, handle it
3049 // efficiently now. This is safe even in KeepWhitespaceMode because we would
3050 // have already returned above with the comment as a token.
3051 if (isHorizontalWhitespace(*CurPtr)) {
3052 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
3053 return false;
3054 }
3055
3056 // Otherwise, just return so that the next character will be lexed as a token.
3057 BufferPtr = CurPtr;
3058 Result.setFlag(Token::LeadingSpace);
3059 return false;
3060}
3061
3062//===----------------------------------------------------------------------===//
3063// Primary Lexing Entry Points
3064//===----------------------------------------------------------------------===//
3065
3066/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
3067/// uninterpreted string. This switches the lexer out of directive mode.
3069 assert(ParsingPreprocessorDirective && ParsingFilename == false &&
3070 "Must be in a preprocessing directive!");
3071 Token Tmp;
3072 Tmp.startToken();
3073
3074 // CurPtr - Cache BufferPtr in an automatic variable.
3075 const char *CurPtr = BufferPtr;
3076 while (true) {
3077 char Char = getAndAdvanceChar(CurPtr, Tmp);
3078 switch (Char) {
3079 default:
3080 if (Result)
3081 Result->push_back(Char);
3082 break;
3083 case 0: // Null.
3084 // Found end of file?
3085 if (CurPtr-1 != BufferEnd) {
3086 if (isCodeCompletionPoint(CurPtr-1)) {
3088 cutOffLexing();
3089 return;
3090 }
3091
3092 // Nope, normal character, continue.
3093 if (Result)
3094 Result->push_back(Char);
3095 break;
3096 }
3097 // FALL THROUGH.
3098 [[fallthrough]];
3099 case '\r':
3100 case '\n':
3101 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
3102 assert(CurPtr[-1] == Char && "Trigraphs for newline?");
3103 BufferPtr = CurPtr-1;
3104
3105 // Next, lex the character, which should handle the EOD transition.
3106 Lex(Tmp);
3107 if (Tmp.is(tok::code_completion)) {
3108 if (PP)
3110 Lex(Tmp);
3111 }
3112 assert(Tmp.is(tok::eod) && "Unexpected token!");
3113
3114 // Finally, we're done;
3115 return;
3116 }
3117 }
3118}
3119
3120/// LexEndOfFile - CurPtr points to the end of this file. Handle this
3121/// condition, reporting diagnostics and handling other edge cases as required.
3122/// This returns true if Result contains a token, false if PP.Lex should be
3123/// called again.
3124bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3125 // If we hit the end of the file while parsing a preprocessor directive,
3126 // end the preprocessor directive first. The next token returned will
3127 // then be the end of file.
3129 // Done parsing the "line".
3131 // Update the location of token as well as BufferPtr.
3132 FormTokenWithChars(Result, CurPtr, tok::eod);
3133
3134 // Restore comment saving mode, in case it was disabled for directive.
3135 if (PP)
3137 return true; // Have a token.
3138 }
3139
3140 // If we are in raw mode, return this event as an EOF token. Let the caller
3141 // that put us in raw mode handle the event.
3142 if (isLexingRawMode()) {
3143 Result.startToken();
3144 BufferPtr = BufferEnd;
3145 FormTokenWithChars(Result, BufferEnd, tok::eof);
3146 return true;
3147 }
3148
3151 // If the preamble cuts off the end of a header guard, consider it guarded.
3152 // The guard is valid for the preamble content itself, and for tools the
3153 // most useful answer is "yes, this file has a header guard".
3154 if (!ConditionalStack.empty())
3156 ConditionalStack.clear();
3157 }
3158
3159 // Issue diagnostics for unterminated #if and missing newline.
3160
3161 // If we are in a #if directive, emit an error.
3162 while (!ConditionalStack.empty()) {
3163 if (PP->getCodeCompletionFileLoc() != FileLoc)
3164 PP->Diag(ConditionalStack.back().IfLoc,
3165 diag::err_pp_unterminated_conditional);
3166 ConditionalStack.pop_back();
3167 }
3168
3169 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
3170 // a pedwarn.
3171 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
3173 SourceLocation EndLoc = getSourceLocation(BufferEnd);
3174 unsigned DiagID;
3175
3176 if (LangOpts.CPlusPlus11) {
3177 // C++11 [lex.phases] 2.2 p2
3178 // Prefer the C++98 pedantic compatibility warning over the generic,
3179 // non-extension, user-requested "missing newline at EOF" warning.
3180 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
3181 DiagID = diag::warn_cxx98_compat_no_newline_eof;
3182 } else {
3183 DiagID = diag::warn_no_newline_eof;
3184 }
3185 } else {
3186 DiagID = diag::ext_no_newline_eof;
3187 }
3188
3189 Diag(BufferEnd, DiagID)
3190 << FixItHint::CreateInsertion(EndLoc, "\n");
3191 }
3192
3193 BufferPtr = CurPtr;
3194
3195 // Finally, let the preprocessor handle this.
3197}
3198
3199/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3200/// the specified lexer will return a tok::l_paren token, 0 if it is something
3201/// else and 2 if there are no more tokens in the buffer controlled by the
3202/// lexer.
3203unsigned Lexer::isNextPPTokenLParen() {
3204 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3205
3206 if (isDependencyDirectivesLexer()) {
3207 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3208 return 2;
3209 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3210 tok::l_paren);
3211 }
3212
3213 // Switch to 'skipping' mode. This will ensure that we can lex a token
3214 // without emitting diagnostics, disables macro expansion, and will cause EOF
3215 // to return an EOF token instead of popping the include stack.
3216 LexingRawMode = true;
3217
3218 // Save state that can be changed while lexing so that we can restore it.
3219 const char *TmpBufferPtr = BufferPtr;
3220 bool inPPDirectiveMode = ParsingPreprocessorDirective;
3221 bool atStartOfLine = IsAtStartOfLine;
3222 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3223 bool leadingSpace = HasLeadingSpace;
3224
3225 Token Tok;
3226 Lex(Tok);
3227
3228 // Restore state that may have changed.
3229 BufferPtr = TmpBufferPtr;
3230 ParsingPreprocessorDirective = inPPDirectiveMode;
3231 HasLeadingSpace = leadingSpace;
3232 IsAtStartOfLine = atStartOfLine;
3233 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3234
3235 // Restore the lexer back to non-skipping mode.
3236 LexingRawMode = false;
3237
3238 if (Tok.is(tok::eof))
3239 return 2;
3240 return Tok.is(tok::l_paren);
3241}
3242
3243/// Find the end of a version control conflict marker.
3244static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3245 ConflictMarkerKind CMK) {
3246 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3247 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3248 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3249 size_t Pos = RestOfBuffer.find(Terminator);
3250 while (Pos != StringRef::npos) {
3251 // Must occur at start of line.
3252 if (Pos == 0 ||
3253 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3254 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3255 Pos = RestOfBuffer.find(Terminator);
3256 continue;
3257 }
3258 return RestOfBuffer.data()+Pos;
3259 }
3260 return nullptr;
3261}
3262
3263/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3264/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3265/// and recover nicely. This returns true if it is a conflict marker and false
3266/// if not.
3267bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3268 // Only a conflict marker if it starts at the beginning of a line.
3269 if (CurPtr != BufferStart &&
3270 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3271 return false;
3272
3273 // Check to see if we have <<<<<<< or >>>>.
3274 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
3275 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
3276 return false;
3277
3278 // If we have a situation where we don't care about conflict markers, ignore
3279 // it.
3280 if (CurrentConflictMarkerState || isLexingRawMode())
3281 return false;
3282
3283 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3284
3285 // Check to see if there is an ending marker somewhere in the buffer at the
3286 // start of a line to terminate this conflict marker.
3287 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3288 // We found a match. We are really in a conflict marker.
3289 // Diagnose this, and ignore to the end of line.
3290 Diag(CurPtr, diag::err_conflict_marker);
3291 CurrentConflictMarkerState = Kind;
3292
3293 // Skip ahead to the end of line. We know this exists because the
3294 // end-of-conflict marker starts with \r or \n.
3295 while (*CurPtr != '\r' && *CurPtr != '\n') {
3296 assert(CurPtr != BufferEnd && "Didn't find end of line");
3297 ++CurPtr;
3298 }
3299 BufferPtr = CurPtr;
3300 return true;
3301 }
3302
3303 // No end of conflict marker found.
3304 return false;
3305}
3306
3307/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3308/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3309/// is the end of a conflict marker. Handle it by ignoring up until the end of
3310/// the line. This returns true if it is a conflict marker and false if not.
3311bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3312 // Only a conflict marker if it starts at the beginning of a line.
3313 if (CurPtr != BufferStart &&
3314 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3315 return false;
3316
3317 // If we have a situation where we don't care about conflict markers, ignore
3318 // it.
3319 if (!CurrentConflictMarkerState || isLexingRawMode())
3320 return false;
3321
3322 // Check to see if we have the marker (4 characters in a row).
3323 for (unsigned i = 1; i != 4; ++i)
3324 if (CurPtr[i] != CurPtr[0])
3325 return false;
3326
3327 // If we do have it, search for the end of the conflict marker. This could
3328 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3329 // be the end of conflict marker.
3330 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3331 CurrentConflictMarkerState)) {
3332 CurPtr = End;
3333
3334 // Skip ahead to the end of line.
3335 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3336 ++CurPtr;
3337
3338 BufferPtr = CurPtr;
3339
3340 // No longer in the conflict marker.
3341 CurrentConflictMarkerState = CMK_None;
3342 return true;
3343 }
3344
3345 return false;
3346}
3347
/// Scan forward for the "#>" that closes an editor placeholder and return the
/// pointer one past it, or null if no terminator exists before BufferEnd.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  // Stop one character short of the end so reading P[1] stays in bounds.
  for (const char *P = CurPtr, *E = BufferEnd - 1; P != E; ++P)
    if (P[0] == '#' && P[1] == '>')
      return P + 2;
  return nullptr;
}
3359
3360bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3361 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
3363 return false;
3364 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3365 if (!End)
3366 return false;
3367 const char *Start = CurPtr - 1;
3368 if (!LangOpts.AllowEditorPlaceholders)
3369 Diag(Start, diag::err_placeholder_in_source);
3370 Result.startToken();
3371 FormTokenWithChars(Result, End, tok::raw_identifier);
3372 Result.setRawIdentifierData(Start);
3375 BufferPtr = End;
3376 return true;
3377}
3378
3379bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3380 if (PP && PP->isCodeCompletionEnabled()) {
3381 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3382 return Loc == PP->getCodeCompletionLoc();
3383 }
3384
3385 return false;
3386}
3387
/// Try to lex a numeric universal-character-name (\uXXXX, \UXXXXXXXX, or the
/// delimited form \u{...}) starting just after the backslash.
///
/// \param StartPtr on entry points at the 'u'/'U'; on success it is advanced
///        past the whole escape (via getAndAdvanceChar when the escape
///        contained trigraphs/line splices so flags land on \p Result).
/// \param SlashLoc location of the introducing backslash, used for
///        diagnostics.
/// \param Result if non-null, the token being formed; diagnostics are only
///        emitted when a token is being produced and we are not in raw mode.
/// \returns the decoded code point, or std::nullopt on any lexical error.
std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
                                                 const char *SlashLoc,
                                                 Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

  // \u consumes exactly 4 hex digits, \U exactly 8 (unless delimited).
  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;

  bool Delimited = false;
  bool FoundEndDelimiter = false;
  unsigned Count = 0;
  bool Diagnose = Result && !isLexingRawMode();

  // UCNs are not part of C89; reject them entirely in that mode.
  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return std::nullopt;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  while (Count != NumHexDigits || Delimited) {
    char C = getCharAndSize(CurPtr, CharSize);
    // A '{' immediately after the 'u' switches to the delimited \u{...} form.
    if (!Delimited && Count == 0 && C == '{') {
      Delimited = true;
      CurPtr += CharSize;
      continue;
    }

    if (Delimited && C == '}') {
      CurPtr += CharSize;
      FoundEndDelimiter = true;
      break;
    }

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      // Non-hex-digit: in the fixed-width form just stop; in the delimited
      // form the escape is malformed (missing '}').
      if (!Delimited)
        break;
      if (Diagnose)
        Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
            << StringRef(KindLoc, 1);
      return std::nullopt;
    }

    // Shifting in another nibble would overflow 32 bits.
    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
      return std::nullopt;
    }

    CodePoint <<= 4;
    CodePoint |= Value;
    CurPtr += CharSize;
    Count++;
  }

  if (Count == 0) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // The delimited form is only valid with \u, not \U.
  if (Delimited && Kind == 'U') {
    if (Diagnose)
      Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (!Delimited && Count != NumHexDigits) {
    if (Diagnose) {
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
      // If the user wrote \U1234, suggest a fixit to \u.
      if (Count == 4 && NumHexDigits == 8) {
        CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
        Diag(KindLoc, diag::note_ucn_four_not_eight)
            << FixItHint::CreateReplacement(URange, "u");
      }
    }
    return std::nullopt;
  }

  // Delimited escapes are a C++23 feature / extension; note that.
  if (Delimited && PP) {
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return CodePoint;
}
3501
3502std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3503 const char *SlashLoc,
3504 Token *Result) {
3505 unsigned CharSize;
3506 bool Diagnose = Result && !isLexingRawMode();
3507
3508 char C = getCharAndSize(StartPtr, CharSize);
3509 assert(C == 'N' && "expected \\N{...}");
3510
3511 const char *CurPtr = StartPtr + CharSize;
3512 const char *KindLoc = &CurPtr[-1];
3513
3514 C = getCharAndSize(CurPtr, CharSize);
3515 if (C != '{') {
3516 if (Diagnose)
3517 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3518 return std::nullopt;
3519 }
3520 CurPtr += CharSize;
3521 const char *StartName = CurPtr;
3522 bool FoundEndDelimiter = false;
3524 while (C) {
3525 C = getCharAndSize(CurPtr, CharSize);
3526 CurPtr += CharSize;
3527 if (C == '}') {
3528 FoundEndDelimiter = true;
3529 break;
3530 }
3531
3533 break;
3534 Buffer.push_back(C);
3535 }
3536
3537 if (!FoundEndDelimiter || Buffer.empty()) {
3538 if (Diagnose)
3539 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3540 : diag::warn_delimited_ucn_incomplete)
3541 << StringRef(KindLoc, 1);
3542 return std::nullopt;
3543 }
3544
3545 StringRef Name(Buffer.data(), Buffer.size());
3546 std::optional<char32_t> Match =
3547 llvm::sys::unicode::nameToCodepointStrict(Name);
3548 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3549 if (!Match) {
3550 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3551 if (Diagnose) {
3552 Diag(StartName, diag::err_invalid_ucn_name)
3553 << StringRef(Buffer.data(), Buffer.size())
3554 << makeCharRange(*this, StartName, CurPtr - CharSize);
3555 if (LooseMatch) {
3556 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
3558 makeCharRange(*this, StartName, CurPtr - CharSize),
3559 LooseMatch->Name);
3560 }
3561 }
3562 // We do not offer misspelled character names suggestions here
3563 // as the set of what would be a valid suggestion depends on context,
3564 // and we should not make invalid suggestions.
3565 }
3566
3567 if (Diagnose && Match)
3568 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3569 ? diag::warn_cxx23_delimited_escape_sequence
3570 : diag::ext_delimited_escape_sequence)
3571 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3572
3573 // If no diagnostic has been emitted yet, likely because we are doing a
3574 // tentative lexing, we do not want to recover here to make sure the token
3575 // will not be incorrectly considered valid. This function will be called
3576 // again and a diagnostic emitted then.
3577 if (LooseMatch && Diagnose)
3578 Match = LooseMatch->CodePoint;
3579
3580 if (Result) {
3581 Result->setFlag(Token::HasUCN);
3582 // If the UCN contains either a trigraph or a line splicing,
3583 // we need to call getAndAdvanceChar again to set the appropriate flags
3584 // on Result.
3585 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3586 StartPtr = CurPtr;
3587 else
3588 while (StartPtr != CurPtr)
3589 (void)getAndAdvanceChar(StartPtr, *Result);
3590 } else {
3591 StartPtr = CurPtr;
3592 }
3593 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3594}
3595
3596uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3597 Token *Result) {
3598
3599 unsigned CharSize;
3600 std::optional<uint32_t> CodePointOpt;
3601 char Kind = getCharAndSize(StartPtr, CharSize);
3602 if (Kind == 'u' || Kind == 'U')
3603 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3604 else if (Kind == 'N')
3605 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3606
3607 if (!CodePointOpt)
3608 return 0;
3609
3610 uint32_t CodePoint = *CodePointOpt;
3611
3612 // Don't apply C family restrictions to UCNs in assembly mode
3613 if (LangOpts.AsmPreprocessor)
3614 return CodePoint;
3615
3616 // C23 6.4.3p2: A universal character name shall not designate a code point
3617 // where the hexadecimal value is:
3618 // - in the range D800 through DFFF inclusive; or
3619 // - greater than 10FFFF.
3620 // A universal-character-name outside the c-char-sequence of a character
3621 // constant, or the s-char-sequence of a string-literal shall not designate
3622 // a control character or a character in the basic character set.
3623
3624 // C++11 [lex.charset]p2: If the hexadecimal value for a
3625 // universal-character-name corresponds to a surrogate code point (in the
3626 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3627 // if the hexadecimal value for a universal-character-name outside the
3628 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3629 // string literal corresponds to a control character (in either of the
3630 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3631 // basic source character set, the program is ill-formed.
3632 if (CodePoint < 0xA0) {
3633 // We don't use isLexingRawMode() here because we need to warn about bad
3634 // UCNs even when skipping preprocessing tokens in a #if block.
3635 if (Result && PP) {
3636 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3637 Diag(BufferPtr, diag::err_ucn_control_character);
3638 else {
3639 char C = static_cast<char>(CodePoint);
3640 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3641 }
3642 }
3643
3644 return 0;
3645 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3646 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3647 // We don't use isLexingRawMode() here because we need to diagnose bad
3648 // UCNs even when skipping preprocessing tokens in a #if block.
3649 if (Result && PP) {
3650 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3651 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3652 else
3653 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3654 }
3655 return 0;
3656 }
3657
3658 return CodePoint;
3659}
3660
3661bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3662 const char *CurPtr) {
3663 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3665 Diag(BufferPtr, diag::ext_unicode_whitespace)
3666 << makeCharRange(*this, BufferPtr, CurPtr);
3667
3668 Result.setFlag(Token::LeadingSpace);
3669 return true;
3670 }
3671 return false;
3672}
3673
3674void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3675 IsAtStartOfLine = Result.isAtStartOfLine();
3676 HasLeadingSpace = Result.hasLeadingSpace();
3677 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3678 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3679}
3680
3682 assert(!isDependencyDirectivesLexer());
3683
3684 // Start a new token.
3685 Result.startToken();
3686
3687 // Set up misc whitespace flags for LexTokenInternal.
3688 if (IsAtStartOfLine) {
3689 Result.setFlag(Token::StartOfLine);
3690 IsAtStartOfLine = false;
3691 }
3692
3693 if (HasLeadingSpace) {
3694 Result.setFlag(Token::LeadingSpace);
3695 HasLeadingSpace = false;
3696 }
3697
3698 if (HasLeadingEmptyMacro) {
3700 HasLeadingEmptyMacro = false;
3701 }
3702
3703 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3704 IsAtPhysicalStartOfLine = false;
3705 bool isRawLex = isLexingRawMode();
3706 (void) isRawLex;
3707 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3708 // (After the LexTokenInternal call, the lexer might be destroyed.)
3709 assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3710 return returnedToken;
3711}
3712
/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface.  It assumes
/// that the Flags of result have been cleared before calling this.
3718bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3719LexStart:
3720 assert(!Result.needsCleaning() && "Result needs cleaning");
3721 assert(!Result.hasPtrData() && "Result has not been reset");
3722
3723 // CurPtr - Cache BufferPtr in an automatic variable.
3724 const char *CurPtr = BufferPtr;
3725
3726 // Small amounts of horizontal whitespace is very common between tokens.
3727 if (isHorizontalWhitespace(*CurPtr)) {
3728 do {
3729 ++CurPtr;
3730 } while (isHorizontalWhitespace(*CurPtr));
3731
3732 // If we are keeping whitespace and other tokens, just return what we just
3733 // skipped. The next lexer invocation will return the token after the
3734 // whitespace.
3735 if (isKeepWhitespaceMode()) {
3736 FormTokenWithChars(Result, CurPtr, tok::unknown);
3737 // FIXME: The next token will not have LeadingSpace set.
3738 return true;
3739 }
3740
3741 BufferPtr = CurPtr;
3742 Result.setFlag(Token::LeadingSpace);
3743 }
3744
3745 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3746
3747 // Read a character, advancing over it.
3748 char Char = getAndAdvanceChar(CurPtr, Result);
3750
3751 if (!isVerticalWhitespace(Char))
3752 NewLinePtr = nullptr;
3753
3754 switch (Char) {
3755 case 0: // Null.
3756 // Found end of file?
3757 if (CurPtr-1 == BufferEnd)
3758 return LexEndOfFile(Result, CurPtr-1);
3759
3760 // Check if we are performing code completion.
3761 if (isCodeCompletionPoint(CurPtr-1)) {
3762 // Return the code-completion token.
3763 Result.startToken();
3764 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3765 return true;
3766 }
3767
3768 if (!isLexingRawMode())
3769 Diag(CurPtr-1, diag::null_in_file);
3770 Result.setFlag(Token::LeadingSpace);
3771 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3772 return true; // KeepWhitespaceMode
3773
3774 // We know the lexer hasn't changed, so just try again with this lexer.
3775 // (We manually eliminate the tail call to avoid recursion.)
3776 goto LexNextToken;
3777
3778 case 26: // DOS & CP/M EOF: "^Z".
3779 // If we're in Microsoft extensions mode, treat this as end of file.
3780 if (LangOpts.MicrosoftExt) {
3781 if (!isLexingRawMode())
3782 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3783 return LexEndOfFile(Result, CurPtr-1);
3784 }
3785
3786 // If Microsoft extensions are disabled, this is just random garbage.
3787 Kind = tok::unknown;
3788 break;
3789
3790 case '\r':
3791 if (CurPtr[0] == '\n')
3792 (void)getAndAdvanceChar(CurPtr, Result);
3793 [[fallthrough]];
3794 case '\n':
3795 // If we are inside a preprocessor directive and we see the end of line,
3796 // we know we are done with the directive, so return an EOD token.
3798 // Done parsing the "line".
3800
3801 // Restore comment saving mode, in case it was disabled for directive.
3802 if (PP)
3804
3805 // Since we consumed a newline, we are back at the start of a line.
3806 IsAtStartOfLine = true;
3807 IsAtPhysicalStartOfLine = true;
3808 NewLinePtr = CurPtr - 1;
3809
3810 Kind = tok::eod;
3811 break;
3812 }
3813
3814 // No leading whitespace seen so far.
3815 Result.clearFlag(Token::LeadingSpace);
3816
3817 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3818 return true; // KeepWhitespaceMode
3819
3820 // We only saw whitespace, so just try again with this lexer.
3821 // (We manually eliminate the tail call to avoid recursion.)
3822 goto LexNextToken;
3823 case ' ':
3824 case '\t':
3825 case '\f':
3826 case '\v':
3827 SkipHorizontalWhitespace:
3828 Result.setFlag(Token::LeadingSpace);
3829 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3830 return true; // KeepWhitespaceMode
3831
3832 SkipIgnoredUnits:
3833 CurPtr = BufferPtr;
3834
3835 // If the next token is obviously a // or /* */ comment, skip it efficiently
3836 // too (without going through the big switch stmt).
3837 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3838 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3839 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3840 return true; // There is a token to return.
3841 goto SkipIgnoredUnits;
3842 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3843 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3844 return true; // There is a token to return.
3845 goto SkipIgnoredUnits;
3846 } else if (isHorizontalWhitespace(*CurPtr)) {
3847 goto SkipHorizontalWhitespace;
3848 }
3849 // We only saw whitespace, so just try again with this lexer.
3850 // (We manually eliminate the tail call to avoid recursion.)
3851 goto LexNextToken;
3852
3853 // C99 6.4.4.1: Integer Constants.
3854 // C99 6.4.4.2: Floating Constants.
3855 case '0': case '1': case '2': case '3': case '4':
3856 case '5': case '6': case '7': case '8': case '9':
3857 // Notify MIOpt that we read a non-whitespace/non-comment token.
3858 MIOpt.ReadToken();
3859 return LexNumericConstant(Result, CurPtr);
3860
3861 // Identifier (e.g., uber), or
3862 // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
3863 // UTF-8 or UTF-16 string literal (C11/C++11).
3864 case 'u':
3865 // Notify MIOpt that we read a non-whitespace/non-comment token.
3866 MIOpt.ReadToken();
3867
3868 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3869 Char = getCharAndSize(CurPtr, SizeTmp);
3870
3871 // UTF-16 string literal
3872 if (Char == '"')
3873 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3874 tok::utf16_string_literal);
3875
3876 // UTF-16 character constant
3877 if (Char == '\'')
3878 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3879 tok::utf16_char_constant);
3880
3881 // UTF-16 raw string literal
3882 if (Char == 'R' && LangOpts.RawStringLiterals &&
3883 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3884 return LexRawStringLiteral(Result,
3885 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3886 SizeTmp2, Result),
3887 tok::utf16_string_literal);
3888
3889 if (Char == '8') {
3890 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3891
3892 // UTF-8 string literal
3893 if (Char2 == '"')
3894 return LexStringLiteral(Result,
3895 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3896 SizeTmp2, Result),
3897 tok::utf8_string_literal);
3898 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
3899 return LexCharConstant(
3900 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3901 SizeTmp2, Result),
3902 tok::utf8_char_constant);
3903
3904 if (Char2 == 'R' && LangOpts.RawStringLiterals) {
3905 unsigned SizeTmp3;
3906 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3907 // UTF-8 raw string literal
3908 if (Char3 == '"') {
3909 return LexRawStringLiteral(Result,
3910 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3911 SizeTmp2, Result),
3912 SizeTmp3, Result),
3913 tok::utf8_string_literal);
3914 }
3915 }
3916 }
3917 }
3918
3919 // treat u like the start of an identifier.
3920 return LexIdentifierContinue(Result, CurPtr);
3921
3922 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3923 // Notify MIOpt that we read a non-whitespace/non-comment token.
3924 MIOpt.ReadToken();
3925
3926 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3927 Char = getCharAndSize(CurPtr, SizeTmp);
3928
3929 // UTF-32 string literal
3930 if (Char == '"')
3931 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3932 tok::utf32_string_literal);
3933
3934 // UTF-32 character constant
3935 if (Char == '\'')
3936 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3937 tok::utf32_char_constant);
3938
3939 // UTF-32 raw string literal
3940 if (Char == 'R' && LangOpts.RawStringLiterals &&
3941 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3942 return LexRawStringLiteral(Result,
3943 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3944 SizeTmp2, Result),
3945 tok::utf32_string_literal);
3946 }
3947
3948 // treat U like the start of an identifier.
3949 return LexIdentifierContinue(Result, CurPtr);
3950
3951 case 'R': // Identifier or C++0x raw string literal
3952 // Notify MIOpt that we read a non-whitespace/non-comment token.
3953 MIOpt.ReadToken();
3954
3955 if (LangOpts.RawStringLiterals) {
3956 Char = getCharAndSize(CurPtr, SizeTmp);
3957
3958 if (Char == '"')
3959 return LexRawStringLiteral(Result,
3960 ConsumeChar(CurPtr, SizeTmp, Result),
3961 tok::string_literal);
3962 }
3963
3964 // treat R like the start of an identifier.
3965 return LexIdentifierContinue(Result, CurPtr);
3966
3967 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3968 // Notify MIOpt that we read a non-whitespace/non-comment token.
3969 MIOpt.ReadToken();
3970 Char = getCharAndSize(CurPtr, SizeTmp);
3971
3972 // Wide string literal.
3973 if (Char == '"')
3974 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3975 tok::wide_string_literal);
3976
3977 // Wide raw string literal.
3978 if (LangOpts.RawStringLiterals && Char == 'R' &&
3979 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3980 return LexRawStringLiteral(Result,
3981 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3982 SizeTmp2, Result),
3983 tok::wide_string_literal);
3984
3985 // Wide character constant.
3986 if (Char == '\'')
3987 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3988 tok::wide_char_constant);
3989 // FALL THROUGH, treating L like the start of an identifier.
3990 [[fallthrough]];
3991
3992 // C99 6.4.2: Identifiers.
3993 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3994 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3995 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3996 case 'V': case 'W': case 'X': case 'Y': case 'Z':
3997 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3998 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3999 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
4000 case 'v': case 'w': case 'x': case 'y': case 'z':
4001 case '_':
4002 // Notify MIOpt that we read a non-whitespace/non-comment token.
4003 MIOpt.ReadToken();
4004 return LexIdentifierContinue(Result, CurPtr);
4005
4006 case '$': // $ in identifiers.
4007 if (LangOpts.DollarIdents) {
4008 if (!isLexingRawMode())
4009 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
4010 // Notify MIOpt that we read a non-whitespace/non-comment token.
4011 MIOpt.ReadToken();
4012 return LexIdentifierContinue(Result, CurPtr);
4013 }
4014
4015 Kind = tok::unknown;
4016 break;
4017
4018 // C99 6.4.4: Character Constants.
4019 case '\'':
4020 // Notify MIOpt that we read a non-whitespace/non-comment token.
4021 MIOpt.ReadToken();
4022 return LexCharConstant(Result, CurPtr, tok::char_constant);
4023
4024 // C99 6.4.5: String Literals.
4025 case '"':
4026 // Notify MIOpt that we read a non-whitespace/non-comment token.
4027 MIOpt.ReadToken();
4028 return LexStringLiteral(Result, CurPtr,
4029 ParsingFilename ? tok::header_name
4030 : tok::string_literal);
4031
4032 // C99 6.4.6: Punctuators.
4033 case '?':
4034 Kind = tok::question;
4035 break;
4036 case '[':
4037 Kind = tok::l_square;
4038 break;
4039 case ']':
4040 Kind = tok::r_square;
4041 break;
4042 case '(':
4043 Kind = tok::l_paren;
4044 break;
4045 case ')':
4046 Kind = tok::r_paren;
4047 break;
4048 case '{':
4049 Kind = tok::l_brace;
4050 break;
4051 case '}':
4052 Kind = tok::r_brace;
4053 break;
4054 case '.':
4055 Char = getCharAndSize(CurPtr, SizeTmp);
4056 if (Char >= '0' && Char <= '9') {
4057 // Notify MIOpt that we read a non-whitespace/non-comment token.
4058 MIOpt.ReadToken();
4059
4060 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
4061 } else if (LangOpts.CPlusPlus && Char == '*') {
4062 Kind = tok::periodstar;
4063 CurPtr += SizeTmp;
4064 } else if (Char == '.' &&
4065 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
4066 Kind = tok::ellipsis;
4067 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4068 SizeTmp2, Result);
4069 } else {
4070 Kind = tok::period;
4071 }
4072 break;
4073 case '&':
4074 Char = getCharAndSize(CurPtr, SizeTmp);
4075 if (Char == '&') {
4076 Kind = tok::ampamp;
4077 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4078 } else if (Char == '=') {
4079 Kind = tok::ampequal;
4080 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4081 } else {
4082 Kind = tok::amp;
4083 }
4084 break;
4085 case '*':
4086 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4087 Kind = tok::starequal;
4088 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4089 } else {
4090 Kind = tok::star;
4091 }
4092 break;
4093 case '+':
4094 Char = getCharAndSize(CurPtr, SizeTmp);
4095 if (Char == '+') {
4096 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4097 Kind = tok::plusplus;
4098 } else if (Char == '=') {
4099 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4100 Kind = tok::plusequal;
4101 } else {
4102 Kind = tok::plus;
4103 }
4104 break;
4105 case '-':
4106 Char = getCharAndSize(CurPtr, SizeTmp);
4107 if (Char == '-') { // --
4108 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4109 Kind = tok::minusminus;
4110 } else if (Char == '>' && LangOpts.CPlusPlus &&
4111 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
4112 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4113 SizeTmp2, Result);
4114 Kind = tok::arrowstar;
4115 } else if (Char == '>') { // ->
4116 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4117 Kind = tok::arrow;
4118 } else if (Char == '=') { // -=
4119 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4120 Kind = tok::minusequal;
4121 } else {
4122 Kind = tok::minus;
4123 }
4124 break;
4125 case '~':
4126 Kind = tok::tilde;
4127 break;
4128 case '!':
4129 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4130 Kind = tok::exclaimequal;
4131 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4132 } else {
4133 Kind = tok::exclaim;
4134 }
4135 break;
4136 case '/':
4137 // 6.4.9: Comments
4138 Char = getCharAndSize(CurPtr, SizeTmp);
4139 if (Char == '/') { // Line comment.
4140 // Even if Line comments are disabled (e.g. in C89 mode), we generally
4141 // want to lex this as a comment. There is one problem with this though,
4142 // that in one particular corner case, this can change the behavior of the
4143 // resultant program. For example, In "foo //**/ bar", C89 would lex
4144 // this as "foo / bar" and languages with Line comments would lex it as
4145 // "foo". Check to see if the character after the second slash is a '*'.
4146 // If so, we will lex that as a "/" instead of the start of a comment.
4147 // However, we never do this if we are just preprocessing.
4148 bool TreatAsComment =
4149 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4150 if (!TreatAsComment)
4151 if (!(PP && PP->isPreprocessedOutput()))
4152 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4153
4154 if (TreatAsComment) {
4155 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4156 TokAtPhysicalStartOfLine))
4157 return true; // There is a token to return.
4158
4159 // It is common for the tokens immediately after a // comment to be
4160 // whitespace (indentation for the next line). Instead of going through
4161 // the big switch, handle it efficiently now.
4162 goto SkipIgnoredUnits;
4163 }
4164 }
4165
4166 if (Char == '*') { // /**/ comment.
4167 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4168 TokAtPhysicalStartOfLine))
4169 return true; // There is a token to return.
4170
4171 // We only saw whitespace, so just try again with this lexer.
4172 // (We manually eliminate the tail call to avoid recursion.)
4173 goto LexNextToken;
4174 }
4175
4176 if (Char == '=') {
4177 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4178 Kind = tok::slashequal;
4179 } else {
4180 Kind = tok::slash;
4181 }
4182 break;
4183 case '%':
4184 Char = getCharAndSize(CurPtr, SizeTmp);
4185 if (Char == '=') {
4186 Kind = tok::percentequal;
4187 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4188 } else if (LangOpts.Digraphs && Char == '>') {
4189 Kind = tok::r_brace; // '%>' -> '}'
4190 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4191 } else if (LangOpts.Digraphs && Char == ':') {
4192 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4193 Char = getCharAndSize(CurPtr, SizeTmp);
4194 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4195 Kind = tok::hashhash; // '%:%:' -> '##'
4196 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4197 SizeTmp2, Result);
4198 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4199 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4200 if (!isLexingRawMode())
4201 Diag(BufferPtr, diag::ext_charize_microsoft);
4202 Kind = tok::hashat;
4203 } else { // '%:' -> '#'
4204 // We parsed a # character. If this occurs at the start of the line,
4205 // it's actually the start of a preprocessing directive. Callback to
4206 // the preprocessor to handle it.
4207 // TODO: -fpreprocessed mode??
4208 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4209 goto HandleDirective;
4210
4211 Kind = tok::hash;
4212 }
4213 } else {
4214 Kind = tok::percent;
4215 }
4216 break;
4217 case '<':
4218 Char = getCharAndSize(CurPtr, SizeTmp);
4219 if (ParsingFilename) {
4220 return LexAngledStringLiteral(Result, CurPtr);
4221 } else if (Char == '<') {
4222 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4223 if (After == '=') {
4224 Kind = tok::lesslessequal;
4225 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4226 SizeTmp2, Result);
4227 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4228 // If this is actually a '<<<<<<<' version control conflict marker,
4229 // recognize it as such and recover nicely.
4230 goto LexNextToken;
4231 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4232 // If this is '<<<<' and we're in a Perforce-style conflict marker,
4233 // ignore it.
4234 goto LexNextToken;
4235 } else if (LangOpts.CUDA && After == '<') {
4236 Kind = tok::lesslessless;
4237 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4238 SizeTmp2, Result);
4239 } else {
4240 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4241 Kind = tok::lessless;
4242 }
4243 } else if (Char == '=') {
4244 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4245 if (After == '>') {
4246 if (LangOpts.CPlusPlus20) {
4247 if (!isLexingRawMode())
4248 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4249 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4250 SizeTmp2, Result);
4251 Kind = tok::spaceship;
4252 break;
4253 }
4254 // Suggest adding a space between the '<=' and the '>' to avoid a
4255 // change in semantics if this turns up in C++ <=17 mode.
4256 if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4257 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4259 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4260 }
4261 }
4262 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4263 Kind = tok::lessequal;
4264 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
4265 if (LangOpts.CPlusPlus11 &&
4266 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4267 // C++0x [lex.pptoken]p3:
4268 // Otherwise, if the next three characters are <:: and the subsequent
4269 // character is neither : nor >, the < is treated as a preprocessor
4270 // token by itself and not as the first character of the alternative
4271 // token <:.
4272 unsigned SizeTmp3;
4273 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4274 if (After != ':' && After != '>') {
4275 Kind = tok::less;
4276 if (!isLexingRawMode())
4277 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4278 break;
4279 }
4280 }
4281
4282 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4283 Kind = tok::l_square;
4284 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
4285 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4286 Kind = tok::l_brace;
4287 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4288 lexEditorPlaceholder(Result, CurPtr)) {
4289 return true;
4290 } else {
4291 Kind = tok::less;
4292 }
4293 break;
4294 case '>':
4295 Char = getCharAndSize(CurPtr, SizeTmp);
4296 if (Char == '=') {
4297 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4298 Kind = tok::greaterequal;
4299 } else if (Char == '>') {
4300 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4301 if (After == '=') {
4302 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4303 SizeTmp2, Result);
4304 Kind = tok::greatergreaterequal;
4305 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4306 // If this is actually a '>>>>' conflict marker, recognize it as such
4307 // and recover nicely.
4308 goto LexNextToken;
4309 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4310 // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4311 goto LexNextToken;
4312 } else if (LangOpts.CUDA && After == '>') {
4313 Kind = tok::greatergreatergreater;
4314 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4315 SizeTmp2, Result);
4316 } else {
4317 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4318 Kind = tok::greatergreater;
4319 }
4320 } else {
4321 Kind = tok::greater;
4322 }
4323 break;
4324 case '^':
4325 Char = getCharAndSize(CurPtr, SizeTmp);
4326 if (Char == '=') {
4327 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4328 Kind = tok::caretequal;
4329 } else {
4330 if (LangOpts.OpenCL && Char == '^')
4331 Diag(CurPtr, diag::err_opencl_logical_exclusive_or);
4332 Kind = tok::caret;
4333 }
4334 break;
4335 case '|':
4336 Char = getCharAndSize(CurPtr, SizeTmp);
4337 if (Char == '=') {
4338 Kind = tok::pipeequal;
4339 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4340 } else if (Char == '|') {
4341 // If this is '|||||||' and we're in a conflict marker, ignore it.
4342 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4343 goto LexNextToken;
4344 Kind = tok::pipepipe;
4345 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4346 } else {
4347 Kind = tok::pipe;
4348 }
4349 break;
4350 case ':':
4351 Char = getCharAndSize(CurPtr, SizeTmp);
4352 if (LangOpts.Digraphs && Char == '>') {
4353 Kind = tok::r_square; // ':>' -> ']'
4354 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4355 } else if (Char == ':') {
4356 Kind = tok::coloncolon;
4357 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4358 } else {
4359 Kind = tok::colon;
4360 }
4361 break;
4362 case ';':
4363 Kind = tok::semi;
4364 break;
4365 case '=':
4366 Char = getCharAndSize(CurPtr, SizeTmp);
4367 if (Char == '=') {
4368 // If this is '====' and we're in a conflict marker, ignore it.
4369 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4370 goto LexNextToken;
4371
4372 Kind = tok::equalequal;
4373 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4374 } else {
4375 Kind = tok::equal;
4376 }
4377 break;
4378 case ',':
4379 Kind = tok::comma;
4380 break;
4381 case '#':
4382 Char = getCharAndSize(CurPtr, SizeTmp);
4383 if (Char == '#') {
4384 Kind = tok::hashhash;
4385 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4386 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
4387 Kind = tok::hashat;
4388 if (!isLexingRawMode())
4389 Diag(BufferPtr, diag::ext_charize_microsoft);
4390 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4391 } else {
4392 // We parsed a # character. If this occurs at the start of the line,
4393 // it's actually the start of a preprocessing directive. Callback to
4394 // the preprocessor to handle it.
4395 // TODO: -fpreprocessed mode??
4396 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4397 goto HandleDirective;
4398
4399 Kind = tok::hash;
4400 }
4401 break;
4402
4403 case '@':
4404 // Objective C support.
4405 if (CurPtr[-1] == '@' && LangOpts.ObjC)
4406 Kind = tok::at;
4407 else
4408 Kind = tok::unknown;
4409 break;
4410
4411 // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4412 case '\\':
4413 if (!LangOpts.AsmPreprocessor) {
4414 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4415 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4416 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4417 return true; // KeepWhitespaceMode
4418
4419 // We only saw whitespace, so just try again with this lexer.
4420 // (We manually eliminate the tail call to avoid recursion.)
4421 goto LexNextToken;
4422 }
4423
4424 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4425 }
4426 }
4427
4428 Kind = tok::unknown;
4429 break;
4430
4431 default: {
4432 if (isASCII(Char)) {
4433 Kind = tok::unknown;
4434 break;
4435 }
4436
4437 llvm::UTF32 CodePoint;
4438
4439 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4440 // an escaped newline.
4441 --CurPtr;
4442 llvm::ConversionResult Status =
4443 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4444 (const llvm::UTF8 *)BufferEnd,
4445 &CodePoint,
4446 llvm::strictConversion);
4447 if (Status == llvm::conversionOK) {
4448 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4449 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4450 return true; // KeepWhitespaceMode
4451
4452 // We only saw whitespace, so just try again with this lexer.
4453 // (We manually eliminate the tail call to avoid recursion.)
4454 goto LexNextToken;
4455 }
4456 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4457 }
4458
4461 ++CurPtr;
4462 Kind = tok::unknown;
4463 break;
4464 }
4465
4466 // Non-ASCII characters tend to creep into source code unintentionally.
4467 // Instead of letting the parser complain about the unknown token,
4468 // just diagnose the invalid UTF-8, then drop the character.
4469 Diag(CurPtr, diag::err_invalid_utf8);
4470
4471 BufferPtr = CurPtr+1;
4472 // We're pretending the character didn't exist, so just try again with
4473 // this lexer.
4474 // (We manually eliminate the tail call to avoid recursion.)
4475 goto LexNextToken;
4476 }
4477 }
4478
4479 // Notify MIOpt that we read a non-whitespace/non-comment token.
4480 MIOpt.ReadToken();
4481
4482 // Update the location of token as well as BufferPtr.
4483 FormTokenWithChars(Result, CurPtr, Kind);
4484 return true;
4485
4486HandleDirective:
4487 // We parsed a # character and it's the start of a preprocessing directive.
4488
4489 FormTokenWithChars(Result, CurPtr, tok::hash);
4491
4493 // With a fatal failure in the module loader, we abort parsing.
4494 return true;
4495
4496 // We parsed the directive; lex a token with the new state.
4497 return false;
4498
4499LexNextToken:
4500 Result.clearFlag(Token::NeedsCleaning);
4501 goto LexStart;
4502}
4503
4504const char *Lexer::convertDependencyDirectiveToken(
4506 const char *TokPtr = BufferStart + DDTok.Offset;
4507 Result.startToken();
4508 Result.setLocation(getSourceLocation(TokPtr));
4509 Result.setKind(DDTok.Kind);
4510 Result.setFlag((Token::TokenFlags)DDTok.Flags);
4511 Result.setLength(DDTok.Length);
4512 BufferPtr = TokPtr + DDTok.Length;
4513 return TokPtr;
4514}
4515
4516bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4517 assert(isDependencyDirectivesLexer());
4518
4519 using namespace dependency_directives_scan;
4520
4521 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4522 if (DepDirectives.front().Kind == pp_eof)
4523 return LexEndOfFile(Result, BufferEnd);
4524 if (DepDirectives.front().Kind == tokens_present_before_eof)
4525 MIOpt.ReadToken();
4526 NextDepDirectiveTokenIndex = 0;
4527 DepDirectives = DepDirectives.drop_front();
4528 }
4529
4531 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4532 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4533 // Read something other than a preprocessor directive hash.
4534 MIOpt.ReadToken();
4535 }
4536
4537 if (ParsingFilename && DDTok.is(tok::less)) {
4538 BufferPtr = BufferStart + DDTok.Offset;
4539 LexAngledStringLiteral(Result, BufferPtr + 1);
4540 if (Result.isNot(tok::header_name))
4541 return true;
4542 // Advance the index of lexed tokens.
4543 while (true) {
4544 const dependency_directives_scan::Token &NextTok =
4545 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4546 if (BufferStart + NextTok.Offset >= BufferPtr)
4547 break;
4548 ++NextDepDirectiveTokenIndex;
4549 }
4550 return true;
4551 }
4552
4553 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4554
4555 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4557 return false;
4558 }
4559 if (Result.is(tok::raw_identifier)) {
4560 Result.setRawIdentifierData(TokPtr);
4561 if (!isLexingRawMode()) {
4563 if (II->isHandleIdentifierCase())
4564 return PP->HandleIdentifier(Result);
4565 }
4566 return true;
4567 }
4568 if (Result.isLiteral()) {
4569 Result.setLiteralData(TokPtr);
4570 return true;
4571 }
4572 if (Result.is(tok::colon)) {
4573 // Convert consecutive colons to 'tok::coloncolon'.
4574 if (*BufferPtr == ':') {
4575 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4576 tok::colon));
4577 ++NextDepDirectiveTokenIndex;
4578 Result.setKind(tok::coloncolon);
4579 }
4580 return true;
4581 }
4582 if (Result.is(tok::eod))
4584
4585 return true;
4586}
4587
4588bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4589 assert(isDependencyDirectivesLexer());
4590
4591 using namespace dependency_directives_scan;
4592
4593 bool Stop = false;
4594 unsigned NestedIfs = 0;
4595 do {
4596 DepDirectives = DepDirectives.drop_front();
4597 switch (DepDirectives.front().Kind) {
4598 case pp_none:
4599 llvm_unreachable("unexpected 'pp_none'");
4600 case pp_include:
4602 case pp_define:
4603 case pp_undef:
4604 case pp_import:
4605 case pp_pragma_import:
4606 case pp_pragma_once:
4611 case pp_include_next:
4612 case decl_at_import:
4613 case cxx_module_decl:
4614 case cxx_import_decl:
4618 break;
4619 case pp_if:
4620 case pp_ifdef:
4621 case pp_ifndef:
4622 ++NestedIfs;
4623 break;
4624 case pp_elif:
4625 case pp_elifdef:
4626 case pp_elifndef:
4627 case pp_else:
4628 if (!NestedIfs) {
4629 Stop = true;
4630 }
4631 break;
4632 case pp_endif:
4633 if (!NestedIfs) {
4634 Stop = true;
4635 } else {
4636 --NestedIfs;
4637 }
4638 break;
4639 case pp_eof:
4640 NextDepDirectiveTokenIndex = 0;
4641 return LexEndOfFile(Result, BufferEnd);
4642 }
4643 } while (!Stop);
4644
4646 DepDirectives.front().Tokens.front();
4647 assert(DDTok.is(tok::hash));
4648 NextDepDirectiveTokenIndex = 1;
4649
4650 convertDependencyDirectiveToken(DDTok, Result);
4651 return false;
4652}
StringRef P
#define SM(sm)
Definition: Cuda.cpp:84
Defines the Diagnostic-related interfaces.
Expr * E
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition: Lexer.cpp:947
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
Definition: Lexer.cpp:1546
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1740
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
Definition: Lexer.cpp:1260
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition: Lexer.cpp:324
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition: Lexer.cpp:3244
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition: Lexer.cpp:1665
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:560
static void StringifyImpl(T &Str, char Quote)
Definition: Lexer.cpp:284
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition: Lexer.cpp:1188
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1560
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition: Lexer.cpp:1630
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition: Lexer.cpp:1527
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
Definition: Lexer.cpp:1614
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:3348
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
Definition: Lexer.cpp:1533
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:919
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isBlockCommentEndOfEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition: Lexer.cpp:2760
static const char * fastParseASCIIIdentifier(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:1906
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
Definition: Lexer.cpp:1241
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1588
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1636
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Definition: Lexer.cpp:543
Defines the MultipleIncludeOpt interface.
Defines the clang::Preprocessor interface.
SourceRange Range
Definition: SemaObjC.cpp:758
SourceLocation Loc
Definition: SemaObjC.cpp:759
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
SourceLocation Begin
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ int
__device__ __2f16 float c
__PTRDIFF_TYPE__ ptrdiff_t
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
Definition: altivec.h:16260
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Definition: altivec.h:16052
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1220
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:231
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1493
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
Definition: Diagnostic.h:939
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
bool isInvalid() const
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:138
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
Definition: Diagnostic.h:127
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
Definition: Diagnostic.h:101
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
bool isKeyword(const LangOptions &LangOpts) const
Return true if this token is a keyword in the specified language.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:499
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:1023
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition: Lexer.cpp:1359
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
Definition: Lexer.h:262
void SetCommentRetentionState(bool Mode)
SetCommentRetentionMode - Change the comment retention mode of the lexer to the specified mode.
Definition: Lexer.h:269
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition: Lexer.cpp:277
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1059
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition: Lexer.cpp:3068
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition: Lexer.cpp:871
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition: Lexer.cpp:1231
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:308
bool Lex(Token &Result)
Lex - Return the next token in the file.
Definition: Lexer.cpp:3681
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:225
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition: Lexer.cpp:790
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition: Lexer.cpp:183
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:893
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
Definition: Lexer.h:303
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:954
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether new line pointed by Str is preceded by escape sequence.
Definition: Lexer.cpp:1137
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition: Lexer.cpp:1212
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for line that corresponds to the given location Loc.
Definition: Lexer.cpp:1157
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition: Lexer.cpp:451
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
Definition: Lexer.h:248
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1133
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments=false)
Finds the token that comes right after the given location.
Definition: Lexer.cpp:1324
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition: Lexer.cpp:498
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...
Definition: Lexer.cpp:608
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition: Lexer.cpp:219
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1106
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition: Lexer.cpp:242
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:636
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:509
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:849
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition: Lexer.cpp:309
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:587
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:138
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
bool isRecordingPreamble() const
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the tokens IdentifierInfo membe...
SourceManager & getSourceManager() const
EmptylineHandler * getEmptylineHandler() const
bool getCommentRetentionState() const
bool hadModuleLoaderFatalFailure() const
PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
bool HandleComment(Token &result, SourceRange Comment)
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
IdentifierTable & getIdentifierTable()
const LangOptions & getLangOpts() const
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
DiagnosticsEngine & getDiagnostics() const
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
A trivial tuple used to represent a source range.
void setBegin(SourceLocation b)
bool isInvalid() const
SourceLocation getEnd() const
SourceLocation getBegin() const
void setEnd(SourceLocation e)
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
Definition: Token.h:36
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:187
bool hasUCN() const
Returns true if this token contains a universal character name.
Definition: Token.h:306
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
Definition: Token.h:116
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:132
unsigned getLength() const
Definition: Token.h:135
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition: Lexer.cpp:69
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
Definition: Token.h:99
tok::TokenKind getKind() const
Definition: Token.h:94
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
Definition: Token.h:276
@ HasUCN
Definition: Token.h:83
@ IsEditorPlaceholder
Definition: Token.h:88
@ LeadingEmptyMacro
Definition: Token.h:81
@ LeadingSpace
Definition: Token.h:77
@ StartOfLine
Definition: Token.h:75
@ HasUDSuffix
Definition: Token.h:82
@ NeedsCleaning
Definition: Token.h:80
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
Definition: Token.h:121
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition: Lexer.cpp:60
bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const
Determine whether the token kind starts a simple-type-specifier.
Definition: Lexer.cpp:77
void startToken()
Reset all flags to cleared.
Definition: Token.h:177
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
Definition: Token.h:295
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
Definition: Token.h:213
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
Definition: Token.h:225
void setFlag(TokenFlags Flag)
Set the specified flag.
Definition: Token.h:244
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4285
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3092
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3458
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3443
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3746
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:89
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
Definition: TokenKinds.h:41
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
Definition: CharInfo.h:41
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:99
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
Definition: Lexer.h:44
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
Definition: Lexer.h:54
@ CMK_None
Not within a conflict marker.
Definition: Lexer.h:46
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
Definition: Lexer.h:50
@ LineComment
Definition: LangStandard.h:49
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
Definition: CharInfo.h:61
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:91
@ Result
The result type of a method or function.
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
Definition: CharInfo.h:175
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:108
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.
Definition: CharInfo.h:168
const FunctionProtoType * T
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
Definition: CharInfo.h:53
unsigned int uint32_t
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x^2 + p.y^2 + ...)
#define _SIDD_UBYTE_OPS
Definition: smmintrin.h:1532
#define _mm_cmpistri(A, B, M)
Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...
Definition: smmintrin.h:1664
#define _SIDD_LEAST_SIGNIFICANT
Definition: smmintrin.h:1550
#define _SIDD_NEGATIVE_POLARITY
Definition: smmintrin.h:1545
#define _SIDD_CMP_RANGES
Definition: smmintrin.h:1539
Represents a char and the number of bytes parsed to produce it.
Definition: Lexer.h:580
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Definition: Lexer.h:60
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.