clang 20.0.0git
Tokens.h
Go to the documentation of this file.
1//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// Record tokens that a preprocessor emits and define operations to map between
9// the tokens written in a file and tokens produced by the preprocessor.
10//
11// When running the compiler, there are two token streams we are interested in:
12// - "spelled" tokens directly correspond to a substring written in some
13// source file.
14// - "expanded" tokens represent the result of preprocessing; the parser
15// consumes this token stream to produce the AST.
16//
17// Expanded tokens correspond directly to locations found in the AST, which
18// allows finding subranges of the token stream covered by various AST nodes.
19// Spelled tokens correspond directly to the source code written by the user.
20//
21// To allow composing these two use-cases, we also define operations that map
22// between expanded and spelled tokens that produced them (macro calls,
23// directives, etc).
24//
25//===----------------------------------------------------------------------===//
26
27#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
28#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
29
34#include "clang/Lex/Token.h"
35#include "llvm/ADT/ArrayRef.h"
36#include "llvm/ADT/DenseMap.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/Support/Compiler.h"
39#include "llvm/Support/raw_ostream.h"
40#include <cstdint>
41#include <tuple>
42
43namespace clang {
44class Preprocessor;
45
46namespace syntax {
47
48/// A half-open character range inside a particular file, the start offset is
49/// included and the end offset is excluded from the range.
50struct FileRange {
51 /// EXPECTS: File.isValid() && Begin <= End.
52 FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
53 /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
54 FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
55 /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
56 /// are the same.
58 SourceLocation EndLoc);
59
60 FileID file() const { return File; }
61 /// Start offset (inclusive) in the corresponding file.
62 unsigned beginOffset() const { return Begin; }
63 /// End offset (exclusive) in the corresponding file.
64 unsigned endOffset() const { return End; }
65
66 unsigned length() const { return End - Begin; }
67
68 /// Check if \p Offset is inside the range.
69 bool contains(unsigned Offset) const {
70 return Begin <= Offset && Offset < End;
71 }
72 /// Check if \p Offset is inside the range or equal to its endpoint.
73 bool touches(unsigned Offset) const {
74 return Begin <= Offset && Offset <= End;
75 }
76
77 /// Gets the substring that this FileRange refers to.
78 llvm::StringRef text(const SourceManager &SM) const;
79
80 /// Convert to the clang range. The returned range is always a char range,
81 /// never a token range.
83
84 friend bool operator==(const FileRange &L, const FileRange &R) {
85 return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
86 }
87 friend bool operator!=(const FileRange &L, const FileRange &R) {
88 return !(L == R);
89 }
90
91private:
93 unsigned Begin;
94 unsigned End;
95};
96
97/// For debugging purposes.
98llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
99
100/// A token coming directly from a file or from a macro invocation. Has just
101/// enough information to locate the token in the source code.
102/// Can represent both expanded and spelled tokens.
103class Token {
104public:
105 Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
106 /// EXPECTS: clang::Token is not an annotation token.
107 explicit Token(const clang::Token &T);
108
109 tok::TokenKind kind() const { return Kind; }
110 /// Location of the first character of a token.
111 SourceLocation location() const { return Location; }
112 /// Location right after the last character of a token.
114 return Location.getLocWithOffset(Length);
115 }
116 unsigned length() const { return Length; }
117
118 /// Get the substring covered by the token. Note that this will include all
119 /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
120 /// in\
121 /// t
122 /// both have the same kind tok::kw_int, but results of text() are different.
123 llvm::StringRef text(const SourceManager &SM) const;
124
125 /// Gets a range of this token.
126 /// EXPECTS: token comes from a file, not from a macro expansion.
127 FileRange range(const SourceManager &SM) const;
128
129 /// Given two tokens inside the same file, returns a file range that starts at
130 /// \p First and ends at \p Last.
131 /// EXPECTS: First and Last are file tokens from the same file, Last starts
132 /// after First.
133 static FileRange range(const SourceManager &SM, const syntax::Token &First,
134 const syntax::Token &Last);
135
136 std::string dumpForTests(const SourceManager &SM) const;
137 /// For debugging purposes.
138 std::string str() const;
139
140private:
141 SourceLocation Location;
142 unsigned Length;
143 tok::TokenKind Kind;
144};
145/// For debugging purposes. Equivalent to a call to Token::str().
146llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
147
148/// A list of tokens obtained by preprocessing a text buffer and operations to
149/// map between the expanded and spelled tokens, i.e. TokenBuffer has
150/// information about two token streams:
151/// 1. Expanded tokens: tokens produced by the preprocessor after all macro
152/// replacements,
153/// 2. Spelled tokens: corresponding directly to the source code of a file
154/// before any macro replacements occurred.
155/// Here's an example to illustrate a difference between those two:
156/// #define FOO 10
157/// int a = FOO;
158///
159/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
160/// Expanded tokens are {'int','a','=','10',';','eof'}.
161///
162/// Note that the expanded token stream has a tok::eof token at the end, the
163/// spelled tokens never store an 'eof' token.
164///
165/// The full list of expanded tokens can be obtained with expandedTokens().
166/// Spelled tokens for each file can be obtained via spelledTokens(FileID).
167///
168/// To map between them use spelledForExpanded() and expandedForSpelled().
169///
170/// To build a token buffer use the TokenCollector class. You can also compute
171/// the spelled tokens of a file using the tokenize() helper.
172///
173/// FIXME: allow mappings into macro arguments.
175public:
176 TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
177
179 TokenBuffer(const TokenBuffer &) = delete;
182
183 /// All tokens produced by the preprocessor after all macro replacements,
184 /// directives, etc. Source locations found in the clang AST will always
185 /// point to one of these tokens.
186 /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
187 /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
188 /// into two '>' tokens by the parser. However, TokenBuffer currently
189 /// keeps it as a single '>>' token.
191 return ExpandedTokens;
192 }
193
194 /// Builds a cache to make future calls to expandedTokens(SourceRange) faster.
195 /// Creates an index only once. Further calls to it will be no-op.
196 void indexExpandedTokens();
197
198 /// Returns the subrange of expandedTokens() corresponding to the closed
199 /// token range R.
200 /// Consider calling indexExpandedTokens() before for faster lookups.
202
203 /// Returns the subrange of spelled tokens corresponding to AST node spanning
204 /// \p Expanded. This is the text that should be replaced if a refactoring
205 /// were to rewrite the node. If \p Expanded is empty, the returned value is
206 /// std::nullopt.
207 ///
208 /// Will fail if the expanded tokens do not correspond to a sequence of
209 /// spelled tokens. E.g. for the following example:
210 ///
211 /// #define FIRST f1 f2 f3
212 /// #define SECOND s1 s2 s3
213 /// #define ID2(X, Y) X Y
214 ///
215 /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
216 /// d ID2(e f g, h) i // expanded tokens are: d e f g h i
217 ///
218 /// the results would be:
219 /// expanded => spelled
220 /// ------------------------
221 /// a => a
222 /// s1 s2 s3 => SECOND
223 /// a f1 f2 f3 => a FIRST
224 /// a f1 => can't map
225 /// s1 s2 => can't map
226 /// e f => e f
227 /// g h => can't map
228 ///
229 /// EXPECTS: \p Expanded is a subrange of expandedTokens().
230 /// Complexity is logarithmic.
231 std::optional<llvm::ArrayRef<syntax::Token>>
233
234 /// Find the subranges of expanded tokens, corresponding to \p Spelled.
235 ///
236 /// Some spelled tokens may not be present in the expanded token stream, so
237 /// this function can return an empty vector, e.g. for tokens of macro
238 /// directives or disabled preprocessor branches.
239 ///
240 /// Some spelled tokens can be duplicated in the expanded token stream
241 /// multiple times and this function will return multiple results in those
242 /// cases. This happens when \p Spelled is inside a macro argument.
243 ///
244 /// FIXME: return correct results on macro arguments. For now, we return an
245 /// empty list.
246 ///
247 /// (!) will return empty vector on tokens from #define body:
248 /// E.g. for the following example:
249 ///
250 /// #define FIRST(A) f1 A = A f2
251 /// #define SECOND s
252 ///
253 /// a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s c
254 /// The results would be
255 /// spelled => expanded
256 /// ------------------------
257 /// #define FIRST => {}
258 /// a FIRST(arg) => {a f1 arg = arg f2}
259 /// arg => {arg, arg} // arg #1 is before `=` and arg #2 is
260 /// // after `=` in the expanded tokens.
263
264 /// An expansion produced by the preprocessor, includes macro expansions and
265 /// preprocessor directives. Preprocessor always maps a non-empty range of
266 /// spelled tokens to a (possibly empty) range of expanded tokens. Here are a
267 /// few examples of expansions:
268 /// #pragma once // Expands to an empty range.
269 /// #define FOO 1 2 3 // Expands to an empty range.
270 /// FOO // Expands to "1 2 3".
271 /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
272 /// #include <vector> // Expands to tokens produced by the include.
273 struct Expansion {
276 };
277 /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
278 /// a preprocessor directive) return the subrange of expanded tokens that the
279 /// macro expands to.
280 std::optional<Expansion>
281 expansionStartingAt(const syntax::Token *Spelled) const;
282 /// Returns all expansions (partially) expanded from the specified tokens.
283 /// These are the expansions whose Spelled range intersects \p Spelled.
284 std::vector<Expansion>
286
287 /// Lexed tokens of a file before preprocessing. E.g. for the following input
288 /// #define DECL(name) int name = 10
289 /// DECL(a);
290 /// spelledTokens() returns
291 /// {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
292 /// "DECL", "(", "a", ")", ";"}
294
295 /// Returns the spelled Token containing the Loc, if there are no such tokens
296 /// returns nullptr.
298
299 /// Get all tokens that expand a macro in \p FID. For the following input
300 /// #define FOO B
301 /// #define FOO2(X) int X
302 /// FOO2(XY)
303 /// int B;
304 /// FOO;
305 /// macroExpansions() returns {"FOO2", "FOO"} (from lines 3 and 5
306 /// respectively).
307 std::vector<const syntax::Token *> macroExpansions(FileID FID) const;
308
309 const SourceManager &sourceManager() const { return *SourceMgr; }
310
311 std::string dumpForTests() const;
312
313private:
314 /// Describes a mapping between a contiguous subrange of spelled tokens and
315 /// a contiguous subrange of expanded tokens. Represents macro expansions,
316 /// preprocessor directives, conditionally disabled pp regions, etc.
317 /// #define FOO 1+2
318 /// #define BAR(a) a + 1
319 /// FOO // invocation #1, expanded = {'1','+','2'}, spelled = {'FOO'}.
320 /// BAR(1) // invocation #2, expanded = {'1', '+', '1'},
321 /// spelled = {'BAR', '(', '1', ')'}.
322 struct Mapping {
323 // Positions in the corresponding spelled token stream. The corresponding
324 // range is never empty.
325 unsigned BeginSpelled = 0; // NOTE(review): presumably inclusive — confirm.
326 unsigned EndSpelled = 0; // NOTE(review): presumably exclusive — confirm.
327 // Positions in the expanded token stream. The corresponding range can be
328 // empty (e.g. a directive that produces no expanded tokens).
329 unsigned BeginExpanded = 0;
330 unsigned EndExpanded = 0;
331
332 /// For debugging purposes. Returns a textual form of this mapping.
333 std::string str() const;
334 };
335 /// Spelled tokens of a file together with the mappings that relate
336 struct MarkedFile { // subranges of those tokens to the expanded token stream.
337 /// Lexed, but not preprocessed, tokens of the file. These map directly to
338 /// text in the corresponding files and include tokens of all preprocessor
339 /// directives.
340 /// FIXME: spelled tokens don't change across FileIDs that map to the same
341 /// FileEntry. We could consider deduplicating them to save memory.
342 std::vector<syntax::Token> SpelledTokens;
343 /// A sorted list to convert between the spelled and expanded token streams.
344 std::vector<Mapping> Mappings;
345 /// Position of the first expanded token produced for this FileID.
346 unsigned BeginExpanded = 0;
347 unsigned EndExpanded = 0; // NOTE(review): presumably one past the last expanded token of this FileID — confirm.
348 };
349
350 friend class TokenCollector;
351
352 /// Maps a single expanded token to its spelled counterpart or a mapping that
353 /// produced it.
354 std::pair<const syntax::Token *, const Mapping *>
355 spelledForExpandedToken(const syntax::Token *Expanded) const;
356
357 /// Returns a mapping starting before \p Spelled token, or nullptr if no
358 /// such mapping exists.
359 static const Mapping *
360 mappingStartingBeforeSpelled(const MarkedFile &F,
361 const syntax::Token *Spelled);
362
363 /// Convert a private Mapping to a public Expansion.
364 Expansion makeExpansion(const MarkedFile &, const Mapping &) const;
365 /// Returns the file that the Spelled tokens are taken from.
366 /// Asserts that they are non-empty, from a tracked file, and in-bounds.
367 const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;
368
369 /// Token stream produced after preprocessing, conceptually this captures the
370 /// same stream as 'clang -E' (excluding the preprocessor directives like
371 /// #file, etc.).
372 std::vector<syntax::Token> ExpandedTokens;
373 // Index of ExpandedTokens for faster lookups by SourceLocation.
374 llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex;
375 llvm::DenseMap<FileID, MarkedFile> Files;
376 // The value is never null, pointer instead of reference to avoid disabling
377 // implicit assignment operator.
378 const SourceManager *SourceMgr;
379};
380
381/// The spelled tokens that overlap or touch a spelling location Loc.
382/// This always returns 0-2 tokens.
387
388/// The identifier token that overlaps or touches a spelling location Loc.
389/// If there is none, returns nullptr.
390const syntax::Token *
393const syntax::Token *
395 const syntax::TokenBuffer &Tokens);
396
397/// Lex the text buffer, corresponding to \p FID, in raw mode and record the
398/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
399/// setting the appropriate token kind (instead of the raw_identifier reported
400/// by lexer in raw mode). This is a very low-level function, most users should
401/// prefer to use TokenCollector. Lexing in raw mode produces wildly different
402/// results from what one might expect when running a C++ frontend, e.g.
403/// preprocessor does not run at all.
404/// The result will *not* have an 'eof' token at the end.
405std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
406 const LangOptions &LO);
407/// Similar to the one above, but tokenizes only a part of the file. Note that
408/// the first token might be incomplete if FR.beginOffset() is not at the
409/// beginning of a token, and the last token returned will start before
410/// FR.endOffset() but might end after it.
411std::vector<syntax::Token>
412tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
413
414/// Collects tokens for the main file while running the frontend action. An
415/// instance of this object should be created on
416/// FrontendAction::BeginSourceFile() and the results should be consumed after
417/// FrontendAction::Execute() finishes.
419public:
420 /// Adds the hooks to collect the tokens. Should be called before the
421 /// preprocessing starts, i.e. as a part of BeginSourceFile() or
422 /// CreateASTConsumer().
424
425 /// Finalizes token collection. Should be called after preprocessing is
426 /// finished, i.e. after running Execute().
427 [[nodiscard]] TokenBuffer consume() &&;
428
429private:
430 /// Maps from a start to an end spelling location of transformations
431 /// performed by the preprocessor. These include:
432 /// 1. range from '#' to the last token in the line for PP directives,
433 /// 2. macro name and arguments for macro expansions.
434 /// Note that we record only top-level macro expansions, intermediate
435 /// expansions (e.g. inside macro arguments) are ignored.
436 ///
437 /// Used to find correct boundaries of macro calls and directives when
438 /// building mappings from spelled to expanded tokens.
439 ///
440 /// Logically, at each point of the preprocessor execution there is a stack of
441 /// macro expansions being processed and we could use it to recover the
442 /// location information we need. However, the public preprocessor API only
443 /// exposes the points when macro expansions start (when we push a macro onto
444 /// the stack) and not when they end (when we pop a macro from the stack).
445 /// To work around this limitation, we rely on source location information
446 /// stored in this map.
447 using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>;
448 class Builder;
449 class CollectPPExpansions;
450
451 std::vector<syntax::Token> Expanded;
452 // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
453 PPExpansions Expansions;
454 Preprocessor &PP;
455 CollectPPExpansions *Collector;
456};
457
458} // namespace syntax
459} // namespace clang
460
461#endif
StringRef P
#define SM(sm)
Definition: Cuda.cpp:83
Defines the clang::LangOptions interface.
SourceLocation Loc
Definition: SemaObjC.cpp:759
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
Represents a character-granular source range.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:476
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:137
Encodes a location in the source.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
A trivial tuple used to represent a source range.
Token - This structure provides full information about a lexed token.
Definition: Token.h:36
A list of tokens obtained by preprocessing a text buffer and operations to map between the expanded a...
Definition: Tokens.h:174
TokenBuffer(const SourceManager &SourceMgr)
Definition: Tokens.h:176
const syntax::Token * spelledTokenContaining(SourceLocation Loc) const
Returns the spelled Token containing the Loc, if there are no such tokens returns nullptr.
Definition: Tokens.cpp:387
const SourceManager & sourceManager() const
Definition: Tokens.h:309
void indexExpandedTokens()
Builds a cache to make future calls to expandedTokens(SourceRange) faster.
Definition: Tokens.cpp:228
llvm::SmallVector< llvm::ArrayRef< syntax::Token >, 1 > expandedForSpelled(llvm::ArrayRef< syntax::Token > Spelled) const
Find the subranges of expanded tokens, corresponding to Spelled.
Definition: Tokens.cpp:323
TokenBuffer & operator=(const TokenBuffer &)=delete
TokenBuffer(TokenBuffer &&)=default
llvm::ArrayRef< syntax::Token > expandedTokens() const
All tokens produced by the preprocessor after all macro replacements, directives, etc.
Definition: Tokens.h:190
std::string dumpForTests() const
Definition: Tokens.cpp:913
std::optional< llvm::ArrayRef< syntax::Token > > spelledForExpanded(llvm::ArrayRef< syntax::Token > Expanded) const
Returns the subrange of spelled tokens corresponding to AST node spanning Expanded.
Definition: Tokens.cpp:404
TokenBuffer & operator=(TokenBuffer &&)=default
TokenBuffer(const TokenBuffer &)=delete
std::vector< Expansion > expansionsOverlapping(llvm::ArrayRef< syntax::Token > Spelled) const
Returns all expansions (partially) expanded from the specified tokens.
Definition: Tokens.cpp:504
std::optional< Expansion > expansionStartingAt(const syntax::Token *Spelled) const
If Spelled starts a mapping (e.g.
Definition: Tokens.cpp:491
llvm::ArrayRef< syntax::Token > spelledTokens(FileID FID) const
Lexed tokens of a file before preprocessing.
Definition: Tokens.cpp:380
std::vector< const syntax::Token * > macroExpansions(FileID FID) const
Get all tokens that expand a macro in FID.
Definition: Tokens.cpp:561
Collects tokens for the main file while running the frontend action.
Definition: Tokens.h:418
TokenBuffer consume() &&
Finalizes token collection.
Definition: Tokens.cpp:895
A token coming directly from a file or from a macro invocation.
Definition: Tokens.h:103
std::string str() const
For debugging purposes.
Definition: Tokens.cpp:903
llvm::StringRef text(const SourceManager &SM) const
Get the substring covered by the token.
Definition: Tokens.cpp:154
tok::TokenKind kind() const
Definition: Tokens.h:109
SourceLocation endLocation() const
Location right after the last character of a token.
Definition: Tokens.h:113
FileRange range(const SourceManager &SM) const
Gets a range of this token.
Definition: Tokens.cpp:161
unsigned length() const
Definition: Tokens.h:116
std::string dumpForTests(const SourceManager &SM) const
Definition: Tokens.cpp:908
SourceLocation location() const
Location of the first character of a token.
Definition: Tokens.h:111
const syntax::Token * spelledIdentifierTouching(SourceLocation Loc, llvm::ArrayRef< syntax::Token > Tokens)
The identifier token that overlaps or touches a spelling location Loc.
Definition: Tokens.cpp:544
std::vector< syntax::Token > tokenize(FileID FID, const SourceManager &SM, const LangOptions &LO)
Lex the text buffer, corresponding to FID, in raw mode and record the resulting spelled tokens.
Definition: Tokens.cpp:608
raw_ostream & operator<<(raw_ostream &OS, NodeKind K)
For debugging purposes.
Definition: Nodes.cpp:13
llvm::ArrayRef< syntax::Token > spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens)
The spelled tokens that overlap or touch a spelling location Loc.
Definition: Tokens.cpp:537
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
const FunctionProtoType * T
A half-open character range inside a particular file, the start offset is included and the end offset...
Definition: Tokens.h:50
CharSourceRange toCharRange(const SourceManager &SM) const
Convert to the clang range.
Definition: Tokens.cpp:264
unsigned length() const
Definition: Tokens.h:66
bool contains(unsigned Offset) const
Check if Offset is inside the range.
Definition: Tokens.h:69
friend bool operator==(const FileRange &L, const FileRange &R)
Definition: Tokens.h:84
friend bool operator!=(const FileRange &L, const FileRange &R)
Definition: Tokens.h:87
unsigned beginOffset() const
Start is a start offset (inclusive) in the corresponding file.
Definition: Tokens.h:62
FileID file() const
Definition: Tokens.h:60
llvm::StringRef text(const SourceManager &SM) const
Gets the substring that this FileRange refers to.
Definition: Tokens.cpp:218
bool touches(unsigned Offset) const
Check Offset is inside the range or equal to its endpoint.
Definition: Tokens.h:73
unsigned endOffset() const
End offset (exclusive) in the corresponding file.
Definition: Tokens.h:64
An expansion produced by the preprocessor, includes macro expansions and preprocessor directives.
Definition: Tokens.h:273
llvm::ArrayRef< syntax::Token > Expanded
Definition: Tokens.h:275
llvm::ArrayRef< syntax::Token > Spelled
Definition: Tokens.h:274