clang  3.7.0
Lexer.h
Go to the documentation of this file.
1 //===--- Lexer.h - C Language Family Lexer ----------------------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the Lexer interface.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_CLANG_LEX_LEXER_H
15 #define LLVM_CLANG_LEX_LEXER_H
16 
19 #include "llvm/ADT/SmallVector.h"
20 #include <cassert>
21 #include <string>
22 
23 namespace clang {
24 class DiagnosticsEngine;
25 class SourceManager;
26 class Preprocessor;
27 class DiagnosticBuilder;
28 
29 /// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
30 /// recovering from.
32  /// Not within a conflict marker.
34  /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
35  /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
37  /// A Perforce-style conflict marker, initiated by 4 ">"s,
38  /// separated by 4 "="s, and terminated by 4 "<"s.
40 };
41 
42 /// Lexer - This provides a simple interface that turns a text buffer into a
43 /// stream of tokens. This provides no support for file reading or buffering,
44 /// or buffering/seeking of tokens, only forward lexing is supported. It relies
45 /// on the specified Preprocessor object to handle preprocessor directives, etc.
46 class Lexer : public PreprocessorLexer {
47  void anchor() override;
48 
49  //===--------------------------------------------------------------------===//
50  // Constant configuration values for this lexer.
51  const char *BufferStart; // Start of the buffer.
52  const char *BufferEnd; // End of the buffer.
53  SourceLocation FileLoc; // Location for start of file.
54  LangOptions LangOpts; // LangOpts enabled by this language (cache).
55  bool Is_PragmaLexer; // True if lexer for _Pragma handling.
56 
57  //===--------------------------------------------------------------------===//
58  // Context-specific lexing flags set by the preprocessor.
59  //
60 
61  /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
62  /// and return them as tokens. This is used for -C and -CC modes, and
63  /// whitespace preservation can be useful for some clients that want to lex
64  /// the file in raw mode and get every character from the file.
65  ///
66  /// When this is set to 2 it returns comments and whitespace. When set to 1
67  /// it returns comments, when it is set to 0 it returns normal tokens only.
68  unsigned char ExtendedTokenMode;
69 
70  //===--------------------------------------------------------------------===//
71  // Context that changes as the file is lexed.
72  // NOTE: any state that mutates when in raw mode must have save/restore code
73  // in Lexer::isNextPPTokenLParen.
74 
75  // BufferPtr - Current pointer into the buffer. This is the next character
76  // to be lexed.
77  const char *BufferPtr;
78 
79  // IsAtStartOfLine - True if the next lexed token should get the "start of
80  // line" flag set on it.
81  bool IsAtStartOfLine;
82 
83  bool IsAtPhysicalStartOfLine;
84 
85  bool HasLeadingSpace;
86 
87  bool HasLeadingEmptyMacro;
88 
89  // CurrentConflictMarkerState - The kind of conflict marker we are handling.
90  ConflictMarkerKind CurrentConflictMarkerState;
91 
92  Lexer(const Lexer &) = delete;
93  void operator=(const Lexer &) = delete;
94  friend class Preprocessor;
95 
96  void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
97 public:
98 
99  /// Lexer constructor - Create a new lexer object for the specified buffer
100  /// with the specified preprocessor managing the lexing process. This lexer
101  /// assumes that the associated file buffer and Preprocessor objects will
102  /// outlive it, so it doesn't take ownership of either of them.
103  Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer, Preprocessor &PP);
104 
105  /// Lexer constructor - Create a new raw lexer object. This object is only
106  /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
107  /// text range will outlive it, so it doesn't take ownership of it.
108  Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
109  const char *BufStart, const char *BufPtr, const char *BufEnd);
110 
111  /// Lexer constructor - Create a new raw lexer object. This object is only
112  /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
113  /// text range will outlive it, so it doesn't take ownership of it.
114  Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer,
115  const SourceManager &SM, const LangOptions &LangOpts);
116 
117  /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
118  /// _Pragma expansion. This has a variety of magic semantics that this method
119  /// sets up. It returns a new'd Lexer that must be delete'd when done.
120  static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
121  SourceLocation ExpansionLocStart,
122  SourceLocation ExpansionLocEnd,
123  unsigned TokLen, Preprocessor &PP);
124 
125 
126  /// getLangOpts - Return (by const reference) the lexer's own LangOpts member,
127  /// i.e. the language features currently enabled. NOTE: this lexer modifies features as a file is parsed!
128  const LangOptions &getLangOpts() const { return LangOpts; }
129 
130  /// getFileLoc - Return the FileLoc member: the File Location for the file we
131  /// are lexing out of. The physical location encodes where the characters come from;
132  /// the virtual location encodes where we should *claim* the characters came
133  /// from. Currently this is only used by _Pragma handling.
134  SourceLocation getFileLoc() const { return FileLoc; }
135 
136 private:
137  /// Lex - Return the next token in the file. If this is the end of file, it
138  /// returns the tok::eof token. This implicitly involves the preprocessor.
139  bool Lex(Token &Result);
140 
141 public:
142  /// isPragmaLexer - Returns the Is_PragmaLexer flag: true if this Lexer is being used for _Pragma handling.
143  bool isPragmaLexer() const { return Is_PragmaLexer; }
144 
145 private:
146  /// IndirectLex - Override that forwards to Lex(Result) so that lexing can be
147  /// invoked via the PreprocessorLexer interface. The bool returned by Lex is discarded.
148  void IndirectLex(Token &Result) override { Lex(Result); }
149 
150 public:
151  /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
152  /// associated preprocessor object). Return true if the 'next character to
153  /// read' pointer points at the end of the lexer buffer, false otherwise.
155  assert(LexingRawMode && "Not already in raw mode!");
156  Lex(Result);
157  // Note that lexing to the end of the buffer doesn't implicitly delete the
158  // lexer when in raw mode.
159  return BufferPtr == BufferEnd;
160  }
161 
162  /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
163  /// every character in the file, including whitespace and comments, i.e. when
164  /// ExtendedTokenMode is greater than 1. This should only be used in raw mode,
165  /// as the preprocessor is not prepared to deal with the excess tokens.
166  bool isKeepWhitespaceMode() const {
167  return ExtendedTokenMode > 1;
168  }
169 
170  /// SetKeepWhitespaceMode - Enable (Val == true) or disable whitespace retention.
171  /// Enabling is asserted to happen only in raw mode or with LangOpts.TraditionalCPP set.
172  void SetKeepWhitespaceMode(bool Val) {
173  assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
174  "Can only retain whitespace in raw mode or -traditional-cpp");
175  ExtendedTokenMode = Val ? 2 : 0; // 2: comments and whitespace; 0: normal tokens only, so comment retention is cleared too
176  }
177 
178  /// inKeepCommentMode - Return true if the lexer should return comments as
179  /// tokens. True for any non-zero ExtendedTokenMode, so it also holds in keep-whitespace mode.
180  bool inKeepCommentMode() const {
181  return ExtendedTokenMode > 0;
182  }
183 
184  /// SetCommentRetentionState - Change the comment retention mode of the lexer
185  /// to the specified mode (true: ExtendedTokenMode 1, false: 0). This is really only useful when lexing in raw
186  /// mode, because otherwise the lexer needs to manage this. Asserts that whitespace is not being retained.
187  void SetCommentRetentionState(bool Mode) {
188  assert(!isKeepWhitespaceMode() &&
189  "Can't play with comment retention state when retaining whitespace");
190  ExtendedTokenMode = Mode ? 1 : 0;
191  }
192 
193  /// Sets the extended token mode back to its initial value, according to the
194  /// language options and preprocessor. This controls whether the lexer
195  /// produces comment and whitespace tokens.
196  ///
197  /// This requires the lexer to have an associated preprocessor. A standalone
198  /// lexer has nothing to reset to.
199  void resetExtendedTokenMode();
200 
201  /// Gets the source code buffer as a StringRef that starts at BufferStart and is BufferEnd - BufferStart bytes long.
202  StringRef getBuffer() const {
203  return StringRef(BufferStart, BufferEnd - BufferStart);
204  }
205 
206  /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
207  /// uninterpreted string. This switches the lexer out of directive mode.
209 
210 
211  /// Diag - Forwarding function for diagnostics. This translates a source
212  /// position in the current buffer into a SourceLocation object for rendering.
213  DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
214 
215  /// getSourceLocation - Return a source location identifier for the specified
216  /// offset in the current file.
217  SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
218 
219  /// getSourceLocation - Return a source location for the next character in
220  /// the current file.
222  return getSourceLocation(BufferPtr);
223  }
224 
225  /// \brief Return the current location in the buffer (BufferPtr, the next character to be lexed).
226  const char *getBufferLocation() const { return BufferPtr; }
227 
228  /// Stringify - Convert the specified string into a C string by escaping '\'
229  /// and " characters. This does not add surrounding ""'s to the string.
230  /// If Charify is true, this escapes the ' character instead of ".
231  static std::string Stringify(StringRef Str, bool Charify = false);
232 
233  /// Stringify - Convert the specified string into a C string by escaping '\'
234  /// and " characters. This does not add surrounding ""'s to the string.
235  static void Stringify(SmallVectorImpl<char> &Str);
236 
237 
238  /// getSpelling - This method is used to get the spelling of a token into a
239  /// preallocated buffer, instead of as an std::string. The caller is required
240  /// to allocate enough space for the token, which is guaranteed to be at least
241  /// Tok.getLength() bytes long. The length of the actual result is returned.
242  ///
243  /// Note that this method may do two possible things: it may either fill in
244  /// the buffer specified with characters, or it may *change the input pointer*
245  /// to point to a constant buffer with the data already in it (avoiding a
246  /// copy). The caller is not allowed to modify the returned buffer pointer
247  /// if an internal buffer is returned.
248  static unsigned getSpelling(const Token &Tok, const char *&Buffer,
249  const SourceManager &SourceMgr,
250  const LangOptions &LangOpts,
251  bool *Invalid = nullptr);
252 
253  /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a
254  /// token is the characters used to represent the token in the source file
255  /// after trigraph expansion and escaped-newline folding. In particular, this
256  /// wants to get the true, uncanonicalized, spelling of things like digraphs,
257  /// UCNs, etc.
258  static std::string getSpelling(const Token &Tok,
259  const SourceManager &SourceMgr,
260  const LangOptions &LangOpts,
261  bool *Invalid = nullptr);
262 
263  /// getSpelling - This method is used to get the spelling of the
264  /// token at the given source location. If, as is usually true, it
265  /// is not necessary to copy any data, then the returned string may
266  /// not point into the provided buffer.
267  ///
268  /// This method lexes at the expansion depth of the given
269  /// location and does not jump to the expansion or spelling
270  /// location.
271  static StringRef getSpelling(SourceLocation loc,
272  SmallVectorImpl<char> &buffer,
273  const SourceManager &SourceMgr,
274  const LangOptions &LangOpts,
275  bool *invalid = nullptr);
276 
277  /// MeasureTokenLength - Relex the token at the specified location and return
278  /// its length in bytes in the input file. If the token needs cleaning (e.g.
279  /// includes a trigraph or an escaped newline) then this count includes bytes
280  /// that are part of that.
281  static unsigned MeasureTokenLength(SourceLocation Loc,
282  const SourceManager &SM,
283  const LangOptions &LangOpts);
284 
285  /// \brief Relex the token at the specified location.
286  /// \returns true if there was a failure, false on success.
287  static bool getRawToken(SourceLocation Loc, Token &Result,
288  const SourceManager &SM,
289  const LangOptions &LangOpts,
290  bool IgnoreWhiteSpace = false);
291 
292  /// \brief Given a location anywhere in a source buffer, find the location
293  /// that corresponds to the beginning of the token in which the original
294  /// source location lands.
296  const SourceManager &SM,
297  const LangOptions &LangOpts);
298 
299  /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
300  /// location at the start of a token, return a new location that specifies a
301  /// character within the token. This handles trigraphs and escaped newlines.
303  unsigned Character,
304  const SourceManager &SM,
305  const LangOptions &LangOpts);
306 
307  /// \brief Computes the source location just past the end of the
308  /// token at this source location.
309  ///
310  /// This routine can be used to produce a source location that
311  /// points just past the end of the token referenced by \p Loc, and
312  /// is generally used when a diagnostic needs to point just after a
313  /// token where it expected something different than it received. If
314  /// the returned source location would not be meaningful (e.g., if
315  /// it points into a macro), this routine returns an invalid
316  /// source location.
317  ///
318  /// \param Offset an offset from the end of the token, where the source
319  /// location should refer to. The default offset (0) produces a source
320  /// location pointing just past the end of the token; an offset of 1 produces
321  /// a source location pointing to the last character in the token, etc.
323  const SourceManager &SM,
324  const LangOptions &LangOpts);
325 
326  /// \brief Given a token range, produce a corresponding CharSourceRange that
327  /// is not a token range. This allows the source range to be used by
328  /// components that don't have access to the lexer and thus can't find the
329  /// end of the range for themselves.
331  const SourceManager &SM,
332  const LangOptions &LangOpts) {
333  SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
334  return End.isInvalid() ? CharSourceRange()
336  Range.getBegin(), End.getLocWithOffset(-1));
337  }
339  const SourceManager &SM,
340  const LangOptions &LangOpts) {
341  return Range.isTokenRange()
342  ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
343  : Range;
344  }
345 
346  /// \brief Returns true if the given MacroID location points at the first
347  /// token of the macro expansion.
348  ///
349  /// \param MacroBegin If non-null and function returns true, it is set to
350  /// begin location of the macro.
352  const SourceManager &SM,
353  const LangOptions &LangOpts,
354  SourceLocation *MacroBegin = nullptr);
355 
356  /// \brief Returns true if the given MacroID location points at the last
357  /// token of the macro expansion.
358  ///
359  /// \param MacroEnd If non-null and function returns true, it is set to
360  /// end location of the macro.
361  static bool isAtEndOfMacroExpansion(SourceLocation loc,
362  const SourceManager &SM,
363  const LangOptions &LangOpts,
364  SourceLocation *MacroEnd = nullptr);
365 
366  /// \brief Accepts a range and returns a character range with file locations.
367  ///
368  /// Returns a null range if a part of the range resides inside a macro
369  /// expansion or the range does not reside on the same FileID.
370  ///
371  /// This function is trying to deal with macros and return a range based on
372  /// file locations. The cases where it can successfully handle macros are:
373  ///
374  /// -begin or end range lies at the start or end of a macro expansion, in
375  /// which case the location will be set to the expansion point, e.g:
376  /// \#define M 1 2
377  /// a M
378  /// If you have a range [a, 2] (where 2 came from the macro), the function
379  /// will return a range for "a M"
380  /// if you have range [a, 1], the function will fail because the range
381  /// overlaps with only a part of the macro
382  ///
383  /// -The macro is a function macro and the range can be mapped to the macro
384  /// arguments, e.g:
385  /// \#define M 1 2
386  /// \#define FM(x) x
387  /// FM(a b M)
388  /// if you have range [b, 2], the function will return the file range "b M"
389  /// inside the macro arguments.
390  /// if you have range [a, 2], the function will return the file range
391  /// "FM(a b M)" since the range includes all of the macro expansion.
393  const SourceManager &SM,
394  const LangOptions &LangOpts);
395 
396  /// \brief Returns a string for the source that the range encompasses.
397  static StringRef getSourceText(CharSourceRange Range,
398  const SourceManager &SM,
399  const LangOptions &LangOpts,
400  bool *Invalid = nullptr);
401 
402  /// \brief Retrieve the name of the immediate macro expansion.
403  ///
404  /// This routine starts from a source location, and finds the name of the macro
405  /// responsible for its immediate expansion. It looks through any intervening
406  /// macro argument expansions to compute this. It returns a StringRef which
407  /// refers to the SourceManager-owned buffer of the source where that macro
408  /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
409  static StringRef getImmediateMacroName(SourceLocation Loc,
410  const SourceManager &SM,
411  const LangOptions &LangOpts);
412 
413  /// \brief Compute the preamble of the given file.
414  ///
415  /// The preamble of a file contains the initial comments, include directives,
416  /// and other preprocessor directives that occur before the code in this
417  /// particular file actually begins. The preamble of the main source file is
418  /// a potential prefix header.
419  ///
420  /// \param Buffer The memory buffer containing the file's contents.
421  ///
422  /// \param MaxLines If non-zero, restrict the length of the preamble
423  /// to fewer than this number of lines.
424  ///
425  /// \returns The offset into the file where the preamble ends and the rest
426  /// of the file begins along with a boolean value indicating whether
427  /// the preamble ends at the beginning of a new line.
428  static std::pair<unsigned, bool> ComputePreamble(StringRef Buffer,
429  const LangOptions &LangOpts,
430  unsigned MaxLines = 0);
431 
432  /// \brief Checks that the given token is the first token that occurs after
433  /// the given location (this excludes comments and whitespace). Returns the
434  /// location immediately after the specified token. If the token is not found
435  /// or the location is inside a macro, the returned source location will be
436  /// invalid.
438  tok::TokenKind TKind,
439  const SourceManager &SM,
440  const LangOptions &LangOpts,
441  bool SkipTrailingWhitespaceAndNewLine);
442 
443  /// \brief Returns true if the given character could appear in an identifier.
444  static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
445 
446  /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
447  /// emit a warning. It is static and takes LangOpts explicitly; Size receives the character's size.
448  static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
449  const LangOptions &LangOpts) {
450  // Fast path: if this is not a trigraph and not a UCN or escaped newline
451  // (the byte is neither '?' nor '\'), report a size of 1 and return the byte itself.
452  if (isObviouslySimpleCharacter(Ptr[0])) {
453  Size = 1;
454  return *Ptr;
455  }
456 
457  Size = 0; // reset before handing off to the out-of-line slow path
458  return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
459  }
460 
461  //===--------------------------------------------------------------------===//
462  // Internal implementation interfaces.
463 private:
464 
465  /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
466  /// by Lex.
467  ///
468  bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
469 
470  bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
471 
472  /// Given that a token begins with the Unicode character \p C, figure out
473  /// what kind of token it is and dispatch to the appropriate lexing helper
474  /// function.
475  bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
476 
477  /// FormTokenWithChars - When we lex a token, we have identified a span
478  /// starting at BufferPtr, going to TokEnd that forms the token. This method
479  /// takes that range and assigns it to the token as its location and size. In
480  /// addition, since tokens cannot overlap, this also updates BufferPtr to be
481  /// TokEnd.
482  void FormTokenWithChars(Token &Result, const char *TokEnd,
484  unsigned TokLen = TokEnd-BufferPtr;
485  Result.setLength(TokLen);
486  Result.setLocation(getSourceLocation(BufferPtr, TokLen));
487  Result.setKind(Kind);
488  BufferPtr = TokEnd;
489  }
490 
491  /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
492  /// tok::l_paren token, 0 if it is something else and 2 if there are no more
493  /// tokens in the buffer controlled by this lexer.
494  unsigned isNextPPTokenLParen();
495 
496  //===--------------------------------------------------------------------===//
497  // Lexer character reading interfaces.
498 
499  // This lexer is built on two interfaces for reading characters, both of which
500  // automatically provide phase 1/2 translation. getAndAdvanceChar is used
501  // when we know that we will be reading a character from the input buffer and
502  // that this character will be part of the result token. This occurs in (e.g.)
503  // string processing, because we know we need to read until we find the
504  // closing '"' character.
505  //
506  // The second interface is the combination of getCharAndSize with
507  // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
508  // returning it and its size. If the lexer decides that this character is
509  // part of the current token, it calls ConsumeChar on it. This two stage
510  // approach allows us to emit diagnostics for characters (e.g. warnings about
511  // trigraphs), knowing that they only are emitted if the character is
512  // consumed.
513 
514  /// isObviouslySimpleCharacter - Return true if the specified character is
515  /// obviously the same in translation phase 1 and translation phase 3. This
516  /// can return false for characters that end up being the same, but it will
517  /// never return true for something that needs to be mapped. Only '?' and '\' yield false.
518  static bool isObviouslySimpleCharacter(char C) {
519  return C != '?' && C != '\\';
520  }
521 
522  /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
523  /// advance Ptr (taken by reference) over it, and return it. Only the trivial
524  /// case is handled inline; otherwise this falls back to the non-inlined
525  /// getCharAndSizeSlow method, passing Tok along, to handle the hard case.
526  inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
527  // Fast path: if this is not a trigraph and not a UCN or escaped newline
528  // (the byte is neither '?' nor '\'), return it and step one byte forward.
529  if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
530 
531  unsigned Size = 0;
532  char C = getCharAndSizeSlow(Ptr, Size, &Tok);
533  Ptr += Size; // skip however many bytes the slow path reported
534  return C;
535  }
536 
537  /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
538  /// and added to a given token, check to see if there are diagnostics that
539  /// need to be emitted or flags that need to be set on the token. If so, do
540  /// it. Returns the pointer just past the consumed character.
541  const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
542  // Normal case, the character occupied exactly one byte. Just step past it.
543  if (Size == 1)
544  return Ptr+Size;
545 
546  // Otherwise, re-lex the character with a current token, allowing
547  // diagnostics to be emitted and flags to be set. Size is recomputed from zero.
548  Size = 0;
549  getCharAndSizeSlow(Ptr, Size, &Tok);
550  return Ptr+Size;
551  }
552 
553  /// getCharAndSize - Peek a single 'character' from the specified buffer,
554  /// store its size in Size, and return it; Ptr itself is not advanced. Here we
555  /// just handle the trivial case and fall back to the non-inlined
556  /// getCharAndSizeSlow method (called without a Token) to handle the hard case.
557  inline char getCharAndSize(const char *Ptr, unsigned &Size) {
558  // Fast path: if this is not a trigraph and not a UCN or escaped newline
559  // (the byte is neither '?' nor '\'), report a size of 1 and return the byte itself.
560  if (isObviouslySimpleCharacter(Ptr[0])) {
561  Size = 1;
562  return *Ptr;
563  }
564 
565  Size = 0; // reset before handing off to the out-of-line slow path
566  return getCharAndSizeSlow(Ptr, Size);
567  }
568 
569  /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
570  /// method.
571  char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
572  Token *Tok = nullptr);
573 
574  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
575  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
576  /// to this function.
577  static unsigned getEscapedNewLineSize(const char *P);
578 
579  /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
580  /// them), skip over them and return the first non-escaped-newline found,
581  /// otherwise return P.
582  static const char *SkipEscapedNewLines(const char *P);
583 
584  /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
585  /// diagnostic.
586  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
587  const LangOptions &LangOpts);
588 
589  //===--------------------------------------------------------------------===//
590  // Other lexer functions.
591 
592  void SkipBytes(unsigned Bytes, bool StartOfLine);
593 
594  void PropagateLineStartLeadingSpaceInfo(Token &Result);
595 
596  const char *LexUDSuffix(Token &Result, const char *CurPtr,
597  bool IsStringLiteral);
598 
599  // Helper functions to lex the remainder of a token of the specific type.
600  bool LexIdentifier (Token &Result, const char *CurPtr);
601  bool LexNumericConstant (Token &Result, const char *CurPtr);
602  bool LexStringLiteral (Token &Result, const char *CurPtr,
603  tok::TokenKind Kind);
604  bool LexRawStringLiteral (Token &Result, const char *CurPtr,
605  tok::TokenKind Kind);
606  bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
607  bool LexCharConstant (Token &Result, const char *CurPtr,
608  tok::TokenKind Kind);
609  bool LexEndOfFile (Token &Result, const char *CurPtr);
610  bool SkipWhitespace (Token &Result, const char *CurPtr,
611  bool &TokAtPhysicalStartOfLine);
612  bool SkipLineComment (Token &Result, const char *CurPtr,
613  bool &TokAtPhysicalStartOfLine);
614  bool SkipBlockComment (Token &Result, const char *CurPtr,
615  bool &TokAtPhysicalStartOfLine);
616  bool SaveLineComment (Token &Result, const char *CurPtr);
617 
618  bool IsStartOfConflictMarker(const char *CurPtr);
619  bool HandleEndOfConflictMarker(const char *CurPtr);
620 
621  bool isCodeCompletionPoint(const char *CurPtr) const;
622  void cutOffLexing() { BufferPtr = BufferEnd; } // moves the current position straight to the end of the buffer
623 
624  bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
625 
626 
627  /// Read a universal character name.
628  ///
629  /// \param CurPtr The position in the source buffer after the initial '\'.
630  /// If the UCN is syntactically well-formed (but not necessarily
631  /// valid), this parameter will be updated to point to the
632  /// character after the UCN.
633  /// \param SlashLoc The position in the source buffer of the '\'.
634  /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics
635  /// and handle token formation in the caller.
636  ///
637  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
638  /// invalid.
639  uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
640 
641  /// \brief Try to consume a UCN as part of an identifier at the current
642  /// location.
643  /// \param CurPtr Initially points to the range of characters in the source
644  /// buffer containing the '\'. Updated to point past the end of
645  /// the UCN on success.
646  /// \param Size The number of characters occupied by the '\' (including
647  /// trigraphs and escaped newlines).
648  /// \param Result The token being produced. Marked as containing a UCN on
649  /// success.
650  /// \return \c true if a UCN was lexed and it produced an acceptable
651  /// identifier character, \c false otherwise.
652  bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
653  Token &Result);
654 
655  /// \brief Try to consume an identifier character encoded in UTF-8.
656  /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
657  /// sequence. On success, updated to point past the end of it.
658  /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
659  /// character was lexed, \c false otherwise.
660  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
661 };
662 
663 } // end namespace clang
664 
665 #endif
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
Definition: Lexer.cpp:358
SourceLocation getEnd() const
static std::pair< unsigned, bool > ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:537
bool LexFromRawLexer(Token &Result)
Definition: Lexer.h:154
std::unique_ptr< Lexer > Lex
Definition: Format.cpp:1204
StringRef getBuffer() const
Gets source code buffer.
Definition: Lexer.h:202
static char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size, const LangOptions &LangOpts)
Definition: Lexer.h:448
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion...
Definition: Lexer.cpp:781
ConflictMarkerKind
Definition: Lexer.h:31
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Definition: Lexer.cpp:164
static CharSourceRange getAsCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.h:338
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:143
void setKind(tok::TokenKind K)
Definition: Token.h:91
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:48
void resetExtendedTokenMode()
Definition: Lexer.cpp:120
SourceLocation getSourceLocation() override
Definition: Lexer.h:221
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
uint32_t Offset
Definition: CacheTokens.cpp:43
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:417
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart, unsigned Character, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:700
bool isInvalid() const
AnnotatingParser & P
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:866
const SmallVectorImpl< AnnotatedLine * >::const_iterator End
SourceManager & SM
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:920
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:803
Defines the clang::LangOptions interface.
bool LexingRawMode
True if in raw mode.
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:759
Represents a character-granular source range.
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:406
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes comments and whitespace).
Definition: Lexer.cpp:1156
SourceManager & SourceMgr
Definition: Format.cpp:1205
FormatToken * Token
The result type of a method or function.
static CharSourceRange getCharRange(SourceRange R)
const SourceRange & getAsRange() const
Kind
bool isTokenRange() const
Return true if the end of this range specifies the start of the last token. Return false if the end of this range specifies the last character in the range.
Encodes a location in the source. The SourceManager can decode this to get at the full include stack, line and column information.
void setLength(unsigned Len)
Definition: Token.h:133
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Definition: Lexer.cpp:1062
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location anywhere in a source buffer, find the location that corresponds to the beginning of the token in which the original source location lands.
Definition: Lexer.cpp:509
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
SourceLocation getBegin() const
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:956
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with its #include path and #line data.
bool inKeepCommentMode() const
Definition: Lexer.h:180
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:856
static CharSourceRange getAsCharRange(SourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Given a token range, produce a corresponding CharSourceRange that is not a token range. This allows the source range to be used by components that don't have access to the lexer and thus can't find the end of the range for themselves.
Definition: Lexer.h:330
static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1003
const LangOptions & getLangOpts() const
Definition: Lexer.h:128
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
Definition: Lexer.cpp:2420
Not within a conflict marker.
Definition: Lexer.h:33
void SetCommentRetentionState(bool Mode)
Definition: Lexer.h:187
bool isKeepWhitespaceMode() const
Definition: Lexer.h:166
void setLocation(SourceLocation L)
Definition: Token.h:132
A trivial tuple used to represent a source range.
Defines the PreprocessorLexer interface.
void SetKeepWhitespaceMode(bool Val)
Definition: Lexer.h:172
This class handles loading and caching of source files into memory.
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:226
SourceLocation getFileLoc() const
Definition: Lexer.h:134
static std::string Stringify(StringRef Str, bool Charify=false)
Definition: Lexer.cpp:202
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:96