22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/StringExtras.h"
24 #include "llvm/ADT/StringSwitch.h"
25 #include "llvm/Support/Compiler.h"
26 #include "llvm/Support/ConvertUTF.h"
27 #include "llvm/Support/MemoryBuffer.h"
29 using namespace clang;
38 return II->getObjCKeywordID() == objcKey;
53 void Lexer::anchor() { }
55 void Lexer::InitLexer(
const char *BufStart,
const char *BufPtr,
57 BufferStart = BufStart;
61 assert(BufEnd[0] == 0 &&
62 "We assume that the input buffer has a null character at the end"
63 " to simplify lexing!");
68 if (BufferStart == BufferPtr) {
70 StringRef Buf(BufferStart, BufferEnd - BufferStart);
71 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
72 .StartsWith(
"\xEF\xBB\xBF", 3)
76 BufferPtr += BOMLength;
79 Is_PragmaLexer =
false;
80 CurrentConflictMarkerState =
CMK_None;
83 IsAtStartOfLine =
true;
84 IsAtPhysicalStartOfLine =
true;
86 HasLeadingSpace =
false;
87 HasLeadingEmptyMacro =
false;
102 ExtendedTokenMode = 0;
111 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
112 LangOpts(PP.getLangOpts()) {
114 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
115 InputFile->getBufferEnd());
121 assert(
PP &&
"Cannot reset token mode without a preprocessor");
122 if (LangOpts.TraditionalCPP)
132 const char *BufStart,
const char *BufPtr,
const char *BufEnd)
133 : FileLoc(fileloc), LangOpts(langOpts) {
135 InitLexer(BufStart, BufPtr, BufEnd);
144 Lexer::Lexer(
FileID FID,
const llvm::MemoryBuffer *FromFile,
146 :
Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
147 FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
172 const llvm::MemoryBuffer *InputFile = SM.
getBuffer(SpellingFID);
173 Lexer *L =
new Lexer(SpellingFID, InputFile, PP);
180 L->BufferPtr = StrData;
181 L->BufferEnd = StrData+TokLen;
182 assert(L->BufferEnd[0] == 0 &&
"Buffer is not nul terminated!");
188 ExpansionLocEnd, TokLen);
195 L->Is_PragmaLexer =
true;
204 char Quote = Charify ?
'\'' :
'"';
205 for (
unsigned i = 0, e = Result.size(); i != e; ++i) {
206 if (Result[i] ==
'\\' || Result[i] == Quote) {
207 Result.insert(Result.begin()+i,
'\\');
217 for (
unsigned i = 0, e = Str.size(); i != e; ++i) {
218 if (Str[i] ==
'\\' || Str[i] ==
'"') {
219 Str.insert(Str.begin()+i,
'\\');
233 assert(Tok.
needsCleaning() &&
"getSpellingSlow called on simple token");
236 const char *BufEnd = BufPtr + Tok.
getLength();
238 if (Tok.
is(tok::string_literal)) {
240 while (BufPtr < BufEnd) {
245 if (Spelling[Length - 1] ==
'"')
253 Spelling[Length - 2] ==
'R' && Spelling[Length - 1] ==
'"') {
256 const char *RawEnd = BufEnd;
257 do --RawEnd;
while (*RawEnd !=
'"');
258 size_t RawLength = RawEnd - BufPtr + 1;
261 memcpy(Spelling + Length, BufPtr, RawLength);
269 while (BufPtr < BufEnd) {
276 "NeedsCleaning flag set on token that didn't need cleaning!");
294 bool invalidTemp =
false;
295 StringRef file = SM.
getBufferData(locInfo.first, &invalidTemp);
297 if (invalid) *invalid =
true;
301 const char *tokenBegin = file.data() + locInfo.second;
305 file.begin(), tokenBegin, file.end());
307 lexer.LexFromRawLexer(token);
312 if (!token.needsCleaning())
313 return StringRef(tokenBegin, length);
316 buffer.resize(length);
317 buffer.resize(
getSpellingSlow(token, tokenBegin, options, buffer.data()));
318 return StringRef(buffer.data(), buffer.size());
328 assert((
int)Tok.
getLength() >= 0 &&
"Token character range is bogus!");
330 bool CharDataInvalid =
false;
334 *Invalid = CharDataInvalid;
336 return std::string();
340 return std::string(TokStart, TokStart + Tok.
getLength());
344 Result.resize(
getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
361 assert((
int)Tok.
getLength() >= 0 &&
"Token character range is bogus!");
363 const char *TokStart =
nullptr;
365 if (Tok.
is(tok::raw_identifier))
370 Buffer = II->getNameStart();
371 return II->getLength();
381 bool CharDataInvalid =
false;
384 *Invalid = CharDataInvalid;
385 if (CharDataInvalid) {
398 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
420 bool IgnoreWhiteSpace) {
431 bool Invalid =
false;
432 StringRef Buffer = SM.
getBufferData(LocInfo.first, &Invalid);
436 const char *StrData = Buffer.data()+LocInfo.second;
443 Buffer.begin(), StrData, Buffer.end());
445 TheLexer.LexFromRawLexer(Result);
454 if (LocInfo.first.isInvalid())
457 bool Invalid =
false;
458 StringRef Buffer = SM.
getBufferData(LocInfo.first, &Invalid);
464 const char *BufStart = Buffer.data();
465 if (LocInfo.second >= Buffer.size())
468 const char *StrData = BufStart+LocInfo.second;
469 if (StrData[0] ==
'\n' || StrData[0] ==
'\r')
472 const char *LexStart = StrData;
473 while (LexStart != BufStart) {
474 if (LexStart[0] ==
'\n' || LexStart[0] ==
'\r') {
484 Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
490 TheLexer.LexFromRawLexer(TheTok);
492 if (TheLexer.getBufferLocation() > StrData) {
496 if (TheLexer.getBufferLocation() - TheTok.
getLength() <= StrData)
521 std::pair<FileID, unsigned> BeginFileLocInfo
523 assert(FileLocInfo.first == BeginFileLocInfo.first &&
524 FileLocInfo.second >= BeginFileLocInfo.second);
543 const unsigned StartOffset = 1;
545 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
552 bool InPreprocessorDirective =
false;
555 unsigned IfCount = 0;
558 unsigned MaxLineOffset = 0;
560 const char *CurPtr = Buffer.begin();
561 unsigned CurLine = 0;
562 while (CurPtr != Buffer.end()) {
566 if (CurLine == MaxLines)
570 if (CurPtr != Buffer.end())
571 MaxLineOffset = CurPtr - Buffer.begin();
575 TheLexer.LexFromRawLexer(TheTok);
577 if (InPreprocessorDirective) {
590 InPreprocessorDirective =
false;
599 if (MaxLineOffset && TokOffset >= MaxLineOffset)
604 if (TheTok.
getKind() == tok::comment) {
612 Token HashTok = TheTok;
613 InPreprocessorDirective =
true;
619 TheLexer.LexFromRawLexer(TheTok);
623 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
624 .Case(
"include", PDK_Skipped)
625 .Case(
"__include_macros", PDK_Skipped)
626 .Case(
"define", PDK_Skipped)
627 .Case(
"undef", PDK_Skipped)
628 .Case(
"line", PDK_Skipped)
629 .Case(
"error", PDK_Skipped)
630 .Case(
"pragma", PDK_Skipped)
631 .Case(
"import", PDK_Skipped)
632 .Case(
"include_next", PDK_Skipped)
633 .Case(
"warning", PDK_Skipped)
634 .Case(
"ident", PDK_Skipped)
635 .Case(
"sccs", PDK_Skipped)
636 .Case(
"assert", PDK_Skipped)
637 .Case(
"unassert", PDK_Skipped)
638 .Case(
"if", PDK_StartIf)
639 .Case(
"ifdef", PDK_StartIf)
640 .Case(
"ifndef", PDK_StartIf)
641 .Case(
"elif", PDK_Skipped)
642 .Case(
"else", PDK_Skipped)
643 .Case(
"endif", PDK_EndIf)
644 .Default(PDK_Unknown);
652 IfStartTok = HashTok;
674 InPreprocessorDirective =
false;
687 else if (ActiveCommentLoc.
isValid())
688 End = ActiveCommentLoc;
707 bool Invalid =
false;
711 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
714 unsigned PhysOffset = 0;
719 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
722 ++TokPtr, --CharNo, ++PhysOffset;
727 for (; CharNo; --CharNo) {
738 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
739 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
791 if (expansionLoc.isFileID()) {
794 *MacroBegin = expansionLoc;
822 *MacroEnd = expansionLoc;
896 bool Invalid =
false;
926 if (Invalid) *Invalid =
true;
932 if (beginInfo.first.isInvalid()) {
933 if (Invalid) *Invalid =
true;
939 beginInfo.second > EndOffs) {
940 if (Invalid) *Invalid =
true;
945 bool invalidTemp =
false;
946 StringRef file = SM.
getBufferData(beginInfo.first, &invalidTemp);
948 if (Invalid) *Invalid =
true;
952 if (Invalid) *Invalid =
false;
953 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
959 assert(Loc.
isMacroID() &&
"Only reasonble to call this on macros");
999 StringRef ExpansionBuffer = SM.
getBufferData(ExpansionInfo.first);
1000 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1020 unsigned CharNo,
unsigned TokLen) {
1021 assert(FileLoc.
isMacroID() &&
"Must be a macro expansion");
1035 std::pair<SourceLocation,SourceLocation> II =
1036 SM.getImmediateExpansionRange(FileLoc);
1038 return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
1044 unsigned TokLen)
const {
1045 assert(Loc >= BufferStart && Loc <= BufferEnd &&
1046 "Location out of range for this buffer!");
1050 unsigned CharNo = Loc-BufferStart;
1056 assert(
PP &&
"This doesn't work on raw lexers");
1075 case '=':
return '#';
1076 case ')':
return ']';
1077 case '(':
return '[';
1078 case '!':
return '|';
1079 case '\'':
return '^';
1080 case '>':
return '}';
1081 case '/':
return '\\';
1082 case '<':
return '{';
1083 case '-':
return '~';
1093 if (!Res || !L)
return Res;
1097 L->
Diag(CP-2, diag::trigraph_ignored);
1102 L->
Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1109 unsigned Lexer::getEscapedNewLineSize(
const char *Ptr) {
1114 if (Ptr[Size-1] !=
'\n' && Ptr[Size-1] !=
'\r')
1118 if ((Ptr[Size] ==
'\r' || Ptr[Size] ==
'\n') &&
1119 Ptr[Size-1] != Ptr[Size])
1132 const char *Lexer::SkipEscapedNewLines(
const char *
P) {
1134 const char *AfterEscape;
1137 }
else if (*P ==
'?') {
1139 if (P[1] !=
'?' || P[2] !=
'/')
1146 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1147 if (NewLineSize == 0)
return P;
1148 P = AfterEscape+NewLineSize;
1160 bool SkipTrailingWhitespaceAndNewLine) {
1171 bool InvalidTemp =
false;
1172 StringRef File = SM.
getBufferData(LocInfo.first, &InvalidTemp);
1176 const char *TokenBegin = File.data() + LocInfo.second;
1180 TokenBegin, File.end());
1183 lexer.LexFromRawLexer(Tok);
1184 if (Tok.isNot(TKind))
1189 unsigned NumWhitespaceChars = 0;
1190 if (SkipTrailingWhitespaceAndNewLine) {
1193 unsigned char C = *TokenEnd;
1196 NumWhitespaceChars++;
1200 if (C ==
'\n' || C ==
'\r') {
1203 NumWhitespaceChars++;
1204 if ((C ==
'\n' || C ==
'\r') && C != PrevC)
1205 NumWhitespaceChars++;
1228 char Lexer::getCharAndSizeSlow(
const char *Ptr,
unsigned &Size,
1231 if (Ptr[0] ==
'\\') {
1240 if (
unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1246 Diag(Ptr, diag::backslash_newline_space);
1249 Size += EscapedNewLineSize;
1250 Ptr += EscapedNewLineSize;
1255 if (*Ptr ==
'\n' || *Ptr ==
'\r' || *Ptr ==
'\0')
1259 return getCharAndSizeSlow(Ptr, Size, Tok);
1267 if (Ptr[0] ==
'?' && Ptr[1] ==
'?') {
1276 if (
C ==
'\\')
goto Slash;
1293 char Lexer::getCharAndSizeSlowNoWarn(
const char *Ptr,
unsigned &Size,
1296 if (Ptr[0] ==
'\\') {
1304 if (
unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1306 Size += EscapedNewLineSize;
1307 Ptr += EscapedNewLineSize;
1312 if (*Ptr ==
'\n' || *Ptr ==
'\r' || *Ptr ==
'\0')
1316 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1324 if (LangOpts.Trigraphs && Ptr[0] ==
'?' && Ptr[1] ==
'?') {
1330 if (
C ==
'\\')
goto Slash;
1345 void Lexer::SkipBytes(
unsigned Bytes,
bool StartOfLine) {
1347 if (BufferPtr > BufferEnd)
1348 BufferPtr = BufferEnd;
1352 IsAtStartOfLine = StartOfLine;
1353 IsAtPhysicalStartOfLine = StartOfLine;
1357 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1358 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1360 return C11AllowedIDChars.contains(C);
1361 }
else if (LangOpts.CPlusPlus) {
1362 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1364 return CXX03AllowedIDChars.contains(C);
1366 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1368 return C99AllowedIDChars.contains(C);
1374 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
1375 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1377 return !C11DisallowedInitialIDChars.contains(C);
1378 }
else if (LangOpts.CPlusPlus) {
1381 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1383 return !C99DisallowedInitialIDChars.contains(C);
1398 CannotAppearInIdentifier = 0,
1399 CannotStartIdentifier
1402 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1404 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1406 if (!C99AllowedIDChars.contains(C)) {
1409 << CannotAppearInIdentifier;
1410 }
else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1413 << CannotStartIdentifier;
1419 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1421 if (!CXX03AllowedIDChars.contains(C)) {
1422 Diags.
Report(Range.
getBegin(), diag::warn_cxx98_compat_unicode_id)
1428 bool Lexer::tryConsumeIdentifierUCN(
const char *&CurPtr,
unsigned Size,
1430 const char *UCNPtr = CurPtr + Size;
1431 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr,
nullptr);
1441 if ((UCNPtr - CurPtr == 6 && CurPtr[1] ==
'u') ||
1442 (UCNPtr - CurPtr == 10 && CurPtr[1] ==
'U'))
1445 while (CurPtr != UCNPtr)
1446 (void)getAndAdvanceChar(CurPtr, Result);
1450 bool Lexer::tryConsumeIdentifierUTF8Char(
const char *&CurPtr) {
1451 const char *UnicodePtr = CurPtr;
1453 ConversionResult Result =
1454 llvm::convertUTF8Sequence((
const UTF8 **)&UnicodePtr,
1455 (
const UTF8 *)BufferEnd,
1458 if (Result != conversionOK ||
1467 CurPtr = UnicodePtr;
1471 bool Lexer::LexIdentifier(
Token &Result,
const char *CurPtr) {
1474 unsigned char C = *CurPtr++;
1485 if (
isASCII(C) && C !=
'\\' && C !=
'?' &&
1486 (C !=
'$' || !LangOpts.DollarIdents)) {
1488 const char *IdStart = BufferPtr;
1489 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1511 C = getCharAndSize(CurPtr, Size);
1515 if (!LangOpts.DollarIdents)
goto FinishIdentifier;
1519 Diag(CurPtr, diag::ext_dollar_in_identifier);
1520 CurPtr = ConsumeChar(CurPtr, Size, Result);
1521 C = getCharAndSize(CurPtr, Size);
1524 }
else if (C ==
'\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
1525 C = getCharAndSize(CurPtr, Size);
1527 }
else if (!
isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
1528 C = getCharAndSize(CurPtr, Size);
1531 goto FinishIdentifier;
1535 CurPtr = ConsumeChar(CurPtr, Size, Result);
1537 C = getCharAndSize(CurPtr, Size);
1539 CurPtr = ConsumeChar(CurPtr, Size, Result);
1540 C = getCharAndSize(CurPtr, Size);
1547 bool Lexer::isHexaLiteral(
const char *Start,
const LangOptions &LangOpts) {
1553 return (C2 ==
'x' || C2 ==
'X');
1559 bool Lexer::LexNumericConstant(
Token &Result,
const char *CurPtr) {
1561 char C = getCharAndSize(CurPtr, Size);
1564 CurPtr = ConsumeChar(CurPtr, Size, Result);
1566 C = getCharAndSize(CurPtr, Size);
1570 if ((C ==
'-' || C ==
'+') && (PrevCh ==
'E' || PrevCh ==
'e')) {
1573 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
1574 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1578 if ((C ==
'-' || C ==
'+') && (PrevCh ==
'P' || PrevCh ==
'p')) {
1582 bool IsHexFloat =
true;
1583 if (!LangOpts.C99) {
1584 if (!isHexaLiteral(BufferPtr, LangOpts))
1586 else if (std::find(BufferPtr, CurPtr,
'_') != CurPtr)
1590 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1599 Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
1600 CurPtr = ConsumeChar(CurPtr, Size, Result);
1601 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1602 return LexNumericConstant(Result, CurPtr);
1607 if (C ==
'\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1608 return LexNumericConstant(Result, CurPtr);
1609 if (!
isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1610 return LexNumericConstant(Result, CurPtr);
1613 const char *TokStart = BufferPtr;
1614 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1621 const char *Lexer::LexUDSuffix(
Token &Result,
const char *CurPtr,
1622 bool IsStringLiteral) {
1627 char C = getCharAndSize(CurPtr, Size);
1628 bool Consumed =
false;
1631 if (C ==
'\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1633 else if (!
isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1642 C ==
'_' ? diag::warn_cxx11_compat_user_defined_literal
1643 : diag::warn_cxx11_compat_reserved_user_defined_literal)
1654 bool IsUDSuffix =
false;
1661 const unsigned MaxStandardSuffixLength = 3;
1662 char Buffer[MaxStandardSuffixLength] = { C };
1663 unsigned Consumed = Size;
1671 IsUDSuffix = (Chars == 1 && Buffer[0] ==
's') ||
1677 if (Chars == MaxStandardSuffixLength)
1681 Buffer[Chars++] =
Next;
1682 Consumed += NextSize;
1689 ? diag::ext_ms_reserved_user_defined_literal
1690 : diag::ext_reserved_user_defined_literal)
1695 CurPtr = ConsumeChar(CurPtr, Size, Result);
1700 C = getCharAndSize(CurPtr, Size);
1702 else if (C ==
'\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
1703 else if (!
isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
1712 bool Lexer::LexStringLiteral(
Token &Result,
const char *CurPtr,
1715 const char *NulCharacter =
nullptr;
1718 (Kind == tok::utf8_string_literal ||
1719 Kind == tok::utf16_string_literal ||
1720 Kind == tok::utf32_string_literal))
1722 ? diag::warn_cxx98_compat_unicode_literal
1723 : diag::warn_c99_compat_unicode_literal);
1725 char C = getAndAdvanceChar(CurPtr, Result);
1730 C = getAndAdvanceChar(CurPtr, Result);
1732 if (C ==
'\n' || C ==
'\r' ||
1733 (C == 0 && CurPtr-1 == BufferEnd)) {
1735 Diag(BufferPtr, diag::ext_unterminated_string);
1736 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1741 if (isCodeCompletionPoint(CurPtr-1)) {
1743 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1748 NulCharacter = CurPtr-1;
1750 C = getAndAdvanceChar(CurPtr, Result);
1755 CurPtr = LexUDSuffix(Result, CurPtr,
true);
1759 Diag(NulCharacter, diag::null_in_string);
1762 const char *TokStart = BufferPtr;
1763 FormTokenWithChars(Result, CurPtr, Kind);
1770 bool Lexer::LexRawStringLiteral(
Token &Result,
const char *CurPtr,
1778 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
1780 unsigned PrefixLen = 0;
1786 if (CurPtr[PrefixLen] !=
'(') {
1788 const char *PrefixEnd = &CurPtr[PrefixLen];
1789 if (PrefixLen == 16) {
1790 Diag(PrefixEnd, diag::err_raw_delim_too_long);
1792 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
1793 << StringRef(PrefixEnd, 1);
1805 if (C == 0 && CurPtr-1 == BufferEnd) {
1811 FormTokenWithChars(Result, CurPtr, tok::unknown);
1816 const char *Prefix = CurPtr;
1817 CurPtr += PrefixLen + 1;
1824 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] ==
'"') {
1825 CurPtr += PrefixLen + 1;
1828 }
else if (C == 0 && CurPtr-1 == BufferEnd) {
1830 Diag(BufferPtr, diag::err_unterminated_raw_string)
1831 << StringRef(Prefix, PrefixLen);
1832 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1839 CurPtr = LexUDSuffix(Result, CurPtr,
true);
1842 const char *TokStart = BufferPtr;
1843 FormTokenWithChars(Result, CurPtr, Kind);
1850 bool Lexer::LexAngledStringLiteral(
Token &Result,
const char *CurPtr) {
1852 const char *NulCharacter =
nullptr;
1853 const char *AfterLessPos = CurPtr;
1854 char C = getAndAdvanceChar(CurPtr, Result);
1857 if (C ==
'\\' && CurPtr < BufferEnd) {
1859 getAndAdvanceChar(CurPtr, Result);
1860 }
else if (C ==
'\n' || C ==
'\r' ||
1861 (C == 0 && (CurPtr-1 == BufferEnd ||
1862 isCodeCompletionPoint(CurPtr-1)))) {
1865 FormTokenWithChars(Result, AfterLessPos, tok::less);
1867 }
else if (C == 0) {
1868 NulCharacter = CurPtr-1;
1870 C = getAndAdvanceChar(CurPtr, Result);
1875 Diag(NulCharacter, diag::null_in_string);
1878 const char *TokStart = BufferPtr;
1879 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
1887 bool Lexer::LexCharConstant(
Token &Result,
const char *CurPtr,
1890 const char *NulCharacter =
nullptr;
1893 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
1895 ? diag::warn_cxx98_compat_unicode_literal
1896 : diag::warn_c99_compat_unicode_literal);
1897 else if (Kind == tok::utf8_char_constant)
1898 Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
1901 char C = getAndAdvanceChar(CurPtr, Result);
1904 Diag(BufferPtr, diag::ext_empty_character);
1905 FormTokenWithChars(Result, CurPtr, tok::unknown);
1912 C = getAndAdvanceChar(CurPtr, Result);
1914 if (C ==
'\n' || C ==
'\r' ||
1915 (C == 0 && CurPtr-1 == BufferEnd)) {
1917 Diag(BufferPtr, diag::ext_unterminated_char);
1918 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1923 if (isCodeCompletionPoint(CurPtr-1)) {
1925 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1930 NulCharacter = CurPtr-1;
1932 C = getAndAdvanceChar(CurPtr, Result);
1937 CurPtr = LexUDSuffix(Result, CurPtr,
false);
1941 Diag(NulCharacter, diag::null_in_char);
1944 const char *TokStart = BufferPtr;
1945 FormTokenWithChars(Result, CurPtr, Kind);
1955 bool Lexer::SkipWhitespace(
Token &Result,
const char *CurPtr,
1956 bool &TokAtPhysicalStartOfLine) {
1960 unsigned char Char = *CurPtr;
1985 FormTokenWithChars(Result, CurPtr, tok::unknown);
1987 IsAtStartOfLine =
true;
1988 IsAtPhysicalStartOfLine =
true;
1995 char PrevChar = CurPtr[-1];
2001 TokAtPhysicalStartOfLine =
true;
2014 bool Lexer::SkipLineComment(
Token &Result,
const char *CurPtr,
2015 bool &TokAtPhysicalStartOfLine) {
2019 Diag(BufferPtr, diag::ext_line_comment);
2023 LangOpts.LineComment =
true;
2034 C !=
'\n' && C !=
'\r')
2037 const char *NextLine = CurPtr;
2040 const char *EscapePtr = CurPtr-1;
2041 bool HasSpace =
false;
2047 if (*EscapePtr ==
'\\')
2049 else if (EscapePtr[0] ==
'/' && EscapePtr[-1] ==
'?' &&
2050 EscapePtr[-2] ==
'?')
2051 CurPtr = EscapePtr-2;
2057 Diag(EscapePtr, diag::backslash_newline_space);
2064 const char *OldPtr = CurPtr;
2067 C = getAndAdvanceChar(CurPtr, Result);
2072 if (C != 0 && CurPtr == OldPtr+1) {
2080 if (CurPtr != OldPtr+1 && C !=
'/' && CurPtr[0] !=
'/') {
2081 for (; OldPtr != CurPtr; ++OldPtr)
2082 if (OldPtr[0] ==
'\n' || OldPtr[0] ==
'\r') {
2086 const char *ForwardPtr = CurPtr;
2089 if (ForwardPtr[0] ==
'/' && ForwardPtr[1] ==
'/')
2094 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2099 if (CurPtr == BufferEnd+1) {
2104 if (C ==
'\0' && isCodeCompletionPoint(CurPtr-1)) {
2110 }
while (C !=
'\n' && C !=
'\r');
2123 return SaveLineComment(Result, CurPtr);
2141 TokAtPhysicalStartOfLine =
true;
2150 bool Lexer::SaveLineComment(
Token &Result,
const char *CurPtr) {
2153 FormTokenWithChars(Result, CurPtr, tok::comment);
2160 bool Invalid =
false;
2165 assert(Spelling[0] ==
'/' && Spelling[1] ==
'/' &&
"Not line comment?");
2180 assert(CurPtr[0] ==
'\n' || CurPtr[0] ==
'\r');
2186 if (CurPtr[0] ==
'\n' || CurPtr[0] ==
'\r') {
2188 if (CurPtr[0] == CurPtr[1])
2196 bool HasSpace =
false;
2203 if (*CurPtr ==
'\\') {
2204 if (CurPtr[-1] !=
'*')
return false;
2207 if (CurPtr[0] !=
'/' || CurPtr[-1] !=
'?' || CurPtr[-2] !=
'?' ||
2218 L->
Diag(CurPtr, diag::trigraph_ignored_block_comment);
2222 L->
Diag(CurPtr, diag::trigraph_ends_block_comment);
2227 L->
Diag(CurPtr, diag::escaped_newline_block_comment_end);
2231 L->
Diag(CurPtr, diag::backslash_newline_space);
2252 bool Lexer::SkipBlockComment(
Token &Result,
const char *CurPtr,
2253 bool &TokAtPhysicalStartOfLine) {
2263 unsigned char C = getCharAndSize(CurPtr, CharSize);
2265 if (C == 0 && CurPtr == BufferEnd+1) {
2267 Diag(BufferPtr, diag::err_unterminated_block_comment);
2273 FormTokenWithChars(Result, CurPtr, tok::unknown);
2289 if (CurPtr + 24 < BufferEnd &&
2294 while (C !=
'/' && ((intptr_t)CurPtr & 0x0F) != 0)
2297 if (C ==
'/')
goto FoundSlash;
2300 __m128i Slashes = _mm_set1_epi8(
'/');
2301 while (CurPtr+16 <= BufferEnd) {
2302 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(
const __m128i*)CurPtr,
2308 CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2314 __vector
unsigned char Slashes = {
2315 '/',
'/',
'/',
'/',
'/',
'/',
'/',
'/',
2316 '/',
'/',
'/',
'/',
'/',
'/',
'/',
'/'
2318 while (CurPtr+16 <= BufferEnd &&
2319 !
vec_any_eq(*(
const vector
unsigned char*)CurPtr, Slashes))
2323 while (CurPtr[0] !=
'/' &&
2327 CurPtr+4 < BufferEnd) {
2337 while (C !=
'/' && C !=
'\0')
2342 if (CurPtr[-2] ==
'*')
2345 if ((CurPtr[-2] ==
'\n' || CurPtr[-2] ==
'\r')) {
2352 if (CurPtr[0] ==
'*' && CurPtr[1] !=
'/') {
2357 Diag(CurPtr-1, diag::warn_nested_block_comment);
2359 }
else if (C == 0 && CurPtr == BufferEnd+1) {
2361 Diag(BufferPtr, diag::err_unterminated_block_comment);
2370 FormTokenWithChars(Result, CurPtr, tok::unknown);
2376 }
else if (C ==
'\0' && isCodeCompletionPoint(CurPtr-1)) {
2395 FormTokenWithChars(Result, CurPtr, tok::comment);
2404 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2422 "Must be in a preprocessing directive!");
2426 const char *CurPtr = BufferPtr;
2428 char Char = getAndAdvanceChar(CurPtr, Tmp);
2432 Result->push_back(Char);
2436 if (CurPtr-1 != BufferEnd) {
2437 if (isCodeCompletionPoint(CurPtr-1)) {
2445 Result->push_back(Char);
2452 assert(CurPtr[-1] == Char &&
"Trigraphs for newline?");
2453 BufferPtr = CurPtr-1;
2457 if (Tmp.
is(tok::code_completion)) {
2462 assert(Tmp.
is(tok::eod) &&
"Unexpected token!");
2474 bool Lexer::LexEndOfFile(
Token &Result,
const char *CurPtr) {
2482 FormTokenWithChars(Result, CurPtr, tok::eod);
2494 BufferPtr = BufferEnd;
2495 FormTokenWithChars(Result, BufferEnd,
tok::eof);
2505 diag::err_pp_unterminated_conditional);
2511 if (CurPtr != BufferStart && (CurPtr[-1] !=
'\n' && CurPtr[-1] !=
'\r')) {
2516 if (LangOpts.CPlusPlus11) {
2520 if (!Diags.
isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2521 DiagID = diag::warn_cxx98_compat_no_newline_eof;
2523 DiagID = diag::warn_no_newline_eof;
2526 DiagID = diag::ext_no_newline_eof;
2529 Diag(BufferEnd, DiagID)
2543 unsigned Lexer::isNextPPTokenLParen() {
2544 assert(!
LexingRawMode &&
"How can we expand a macro from a skipping buffer?");
2552 const char *TmpBufferPtr = BufferPtr;
2554 bool atStartOfLine = IsAtStartOfLine;
2555 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2556 bool leadingSpace = HasLeadingSpace;
2562 BufferPtr = TmpBufferPtr;
2564 HasLeadingSpace = leadingSpace;
2565 IsAtStartOfLine = atStartOfLine;
2566 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2573 return Tok.
is(tok::l_paren);
2579 const char *Terminator = CMK ==
CMK_Perforce ?
"<<<<\n" :
">>>>>>>";
2581 StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen);
2582 size_t Pos = RestOfBuffer.find(Terminator);
2583 while (Pos != StringRef::npos) {
2586 (RestOfBuffer[Pos - 1] !=
'\r' && RestOfBuffer[Pos - 1] !=
'\n')) {
2587 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2588 Pos = RestOfBuffer.find(Terminator);
2591 return RestOfBuffer.data()+Pos;
2600 bool Lexer::IsStartOfConflictMarker(
const char *CurPtr) {
2602 if (CurPtr != BufferStart &&
2603 CurPtr[-1] !=
'\n' && CurPtr[-1] !=
'\r')
2607 if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) !=
"<<<<<<<") &&
2608 (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) !=
">>>> "))
2623 Diag(CurPtr, diag::err_conflict_marker);
2624 CurrentConflictMarkerState =
Kind;
2628 while (*CurPtr !=
'\r' && *CurPtr !=
'\n') {
2629 assert(CurPtr != BufferEnd &&
"Didn't find end of line");
2645 bool Lexer::HandleEndOfConflictMarker(
const char *CurPtr) {
2647 if (CurPtr != BufferStart &&
2648 CurPtr[-1] !=
'\n' && CurPtr[-1] !=
'\r')
2657 for (
unsigned i = 1; i != 4; ++i)
2658 if (CurPtr[i] != CurPtr[0])
2665 CurrentConflictMarkerState)) {
2669 while (CurPtr != BufferEnd && *CurPtr !=
'\r' && *CurPtr !=
'\n')
2675 CurrentConflictMarkerState =
CMK_None;
2682 bool Lexer::isCodeCompletionPoint(
const char *CurPtr)
const {
2691 uint32_t Lexer::tryReadUCN(
const char *&StartPtr,
const char *SlashLoc,
2694 char Kind = getCharAndSize(StartPtr, CharSize);
2696 unsigned NumHexDigits;
2699 else if (Kind ==
'U')
2704 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
2706 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
2710 const char *CurPtr = StartPtr + CharSize;
2711 const char *KindLoc = &CurPtr[-1];
2713 uint32_t CodePoint = 0;
2714 for (
unsigned i = 0; i < NumHexDigits; ++i) {
2715 char C = getCharAndSize(CurPtr, CharSize);
2717 unsigned Value = llvm::hexDigitValue(C);
2721 Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
2722 << StringRef(KindLoc, 1);
2724 Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
2727 if (i == 4 && NumHexDigits == 8) {
2729 Diag(KindLoc, diag::note_ucn_four_not_eight)
2746 if (CurPtr - StartPtr == (
ptrdiff_t)NumHexDigits + 2)
2749 while (StartPtr != CurPtr)
2750 (void)getAndAdvanceChar(StartPtr, *Result);
2756 if (LangOpts.AsmPreprocessor)
2770 if (CodePoint < 0xA0) {
2771 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
2777 if (CodePoint < 0x20 || CodePoint >= 0x7F)
2778 Diag(BufferPtr, diag::err_ucn_control_character);
2780 char C =
static_cast<char>(CodePoint);
2781 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
2787 }
else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
2792 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
2793 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
2795 Diag(BufferPtr, diag::err_ucn_escape_invalid);
2803 bool Lexer::CheckUnicodeWhitespace(
Token &Result, uint32_t C,
2804 const char *CurPtr) {
2805 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
2808 UnicodeWhitespaceChars.contains(C)) {
2809 Diag(BufferPtr, diag::ext_unicode_whitespace)
2818 bool Lexer::LexUnicode(
Token &Result, uint32_t C,
const char *CurPtr) {
2828 return LexIdentifier(Result, CurPtr);
2843 Diag(BufferPtr, diag::err_non_ascii)
2853 FormTokenWithChars(Result, CurPtr, tok::unknown);
2857 void Lexer::PropagateLineStartLeadingSpaceInfo(
Token &Result) {
2864 bool Lexer::Lex(
Token &Result) {
2869 if (IsAtStartOfLine) {
2871 IsAtStartOfLine =
false;
2874 if (HasLeadingSpace) {
2876 HasLeadingSpace =
false;
2879 if (HasLeadingEmptyMacro) {
2881 HasLeadingEmptyMacro =
false;
2884 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2885 IsAtPhysicalStartOfLine =
false;
2888 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
2890 assert((returnedToken || !isRawLex) &&
"Raw lex must succeed");
2891 return returnedToken;
2899 bool Lexer::LexTokenInternal(
Token &Result,
bool TokAtPhysicalStartOfLine) {
2906 const char *CurPtr = BufferPtr;
2909 if ((*CurPtr ==
' ') || (*CurPtr ==
'\t')) {
2911 while ((*CurPtr ==
' ') || (*CurPtr ==
'\t'))
2918 FormTokenWithChars(Result, CurPtr, tok::unknown);
2927 unsigned SizeTmp, SizeTmp2;
2930 char Char = getAndAdvanceChar(CurPtr, Result);
2936 if (CurPtr-1 == BufferEnd)
2937 return LexEndOfFile(Result, CurPtr-1);
2940 if (isCodeCompletionPoint(CurPtr-1)) {
2943 FormTokenWithChars(Result, CurPtr, tok::code_completion);
2948 Diag(CurPtr-1, diag::null_in_file);
2950 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
2959 if (LangOpts.MicrosoftExt)
2960 return LexEndOfFile(Result, CurPtr-1);
2963 Kind = tok::unknown;
2979 IsAtStartOfLine =
true;
2980 IsAtPhysicalStartOfLine =
true;
2989 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
2999 SkipHorizontalWhitespace:
3001 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3010 LangOpts.LineComment &&
3011 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3012 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3014 goto SkipIgnoredUnits;
3016 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3018 goto SkipIgnoredUnits;
3020 goto SkipHorizontalWhitespace;
3028 case '0':
case '1':
case '2':
case '3':
case '4':
3029 case '5':
case '6':
case '7':
case '8':
case '9':
3032 return LexNumericConstant(Result, CurPtr);
3038 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3039 Char = getCharAndSize(CurPtr, SizeTmp);
3043 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3044 tok::utf16_string_literal);
3048 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3049 tok::utf16_char_constant);
3052 if (Char ==
'R' && LangOpts.CPlusPlus11 &&
3053 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) ==
'"')
3054 return LexRawStringLiteral(Result,
3055 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3057 tok::utf16_string_literal);
3060 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3064 return LexStringLiteral(Result,
3065 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3067 tok::utf8_string_literal);
3068 if (Char2 ==
'\'' && LangOpts.CPlusPlus1z)
3069 return LexCharConstant(
3070 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3072 tok::utf8_char_constant);
3074 if (Char2 ==
'R' && LangOpts.CPlusPlus11) {
3076 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3079 return LexRawStringLiteral(Result,
3080 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3083 tok::utf8_string_literal);
3090 return LexIdentifier(Result, CurPtr);
3096 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3097 Char = getCharAndSize(CurPtr, SizeTmp);
3101 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3102 tok::utf32_string_literal);
3106 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3107 tok::utf32_char_constant);
3110 if (Char ==
'R' && LangOpts.CPlusPlus11 &&
3111 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) ==
'"')
3112 return LexRawStringLiteral(Result,
3113 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3115 tok::utf32_string_literal);
3119 return LexIdentifier(Result, CurPtr);
3125 if (LangOpts.CPlusPlus11) {
3126 Char = getCharAndSize(CurPtr, SizeTmp);
3129 return LexRawStringLiteral(Result,
3130 ConsumeChar(CurPtr, SizeTmp, Result),
3131 tok::string_literal);
3135 return LexIdentifier(Result, CurPtr);
3140 Char = getCharAndSize(CurPtr, SizeTmp);
3144 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3145 tok::wide_string_literal);
3148 if (LangOpts.CPlusPlus11 && Char ==
'R' &&
3149 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) ==
'"')
3150 return LexRawStringLiteral(Result,
3151 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3153 tok::wide_string_literal);
3157 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3158 tok::wide_char_constant);
3162 case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
3163 case 'H':
case 'I':
case 'J':
case 'K':
case 'M':
case 'N':
3164 case 'O':
case 'P':
case 'Q':
case 'S':
case 'T':
3165 case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
3166 case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
3167 case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
3168 case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
3169 case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
3173 return LexIdentifier(Result, CurPtr);
3176 if (LangOpts.DollarIdents) {
3178 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3181 return LexIdentifier(Result, CurPtr);
3184 Kind = tok::unknown;
3191 return LexCharConstant(Result, CurPtr, tok::char_constant);
3197 return LexStringLiteral(Result, CurPtr, tok::string_literal);
3201 Kind = tok::question;
3204 Kind = tok::l_square;
3207 Kind = tok::r_square;
3210 Kind = tok::l_paren;
3213 Kind = tok::r_paren;
3216 Kind = tok::l_brace;
3219 Kind = tok::r_brace;
3222 Char = getCharAndSize(CurPtr, SizeTmp);
3223 if (Char >=
'0' && Char <=
'9') {
3227 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3228 }
else if (LangOpts.CPlusPlus && Char ==
'*') {
3229 Kind = tok::periodstar;
3231 }
else if (Char ==
'.' &&
3232 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) ==
'.') {
3233 Kind = tok::ellipsis;
3234 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3241 Char = getCharAndSize(CurPtr, SizeTmp);
3244 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3245 }
else if (Char ==
'=') {
3246 Kind = tok::ampequal;
3247 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3253 if (getCharAndSize(CurPtr, SizeTmp) ==
'=') {
3254 Kind = tok::starequal;
3255 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3261 Char = getCharAndSize(CurPtr, SizeTmp);
3263 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3264 Kind = tok::plusplus;
3265 }
else if (Char ==
'=') {
3266 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3267 Kind = tok::plusequal;
3273 Char = getCharAndSize(CurPtr, SizeTmp);
3275 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3276 Kind = tok::minusminus;
3277 }
else if (Char ==
'>' && LangOpts.CPlusPlus &&
3278 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) ==
'*') {
3279 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3281 Kind = tok::arrowstar;
3282 }
else if (Char ==
'>') {
3283 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3285 }
else if (Char ==
'=') {
3286 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3287 Kind = tok::minusequal;
3296 if (getCharAndSize(CurPtr, SizeTmp) ==
'=') {
3297 Kind = tok::exclaimequal;
3298 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3300 Kind = tok::exclaim;
3305 Char = getCharAndSize(CurPtr, SizeTmp);
3315 bool TreatAsComment = LangOpts.LineComment &&
3316 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
3317 if (!TreatAsComment)
3319 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) !=
'*';
3321 if (TreatAsComment) {
3322 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3323 TokAtPhysicalStartOfLine))
3329 goto SkipIgnoredUnits;
3334 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3335 TokAtPhysicalStartOfLine))
3344 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3345 Kind = tok::slashequal;
3351 Char = getCharAndSize(CurPtr, SizeTmp);
3353 Kind = tok::percentequal;
3354 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3355 }
else if (LangOpts.Digraphs && Char ==
'>') {
3356 Kind = tok::r_brace;
3357 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3358 }
else if (LangOpts.Digraphs && Char ==
':') {
3359 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3360 Char = getCharAndSize(CurPtr, SizeTmp);
3361 if (Char ==
'%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) ==
':') {
3362 Kind = tok::hashhash;
3363 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3365 }
else if (Char ==
'@' && LangOpts.MicrosoftExt) {
3366 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3368 Diag(BufferPtr, diag::ext_charize_microsoft);
3375 if (TokAtPhysicalStartOfLine && !
LexingRawMode && !Is_PragmaLexer)
3376 goto HandleDirective;
3381 Kind = tok::percent;
3385 Char = getCharAndSize(CurPtr, SizeTmp);
3387 return LexAngledStringLiteral(Result, CurPtr);
3388 }
else if (Char ==
'<') {
3389 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3391 Kind = tok::lesslessequal;
3392 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3394 }
else if (After ==
'<' && IsStartOfConflictMarker(CurPtr-1)) {
3398 }
else if (After ==
'<' && HandleEndOfConflictMarker(CurPtr-1)) {
3402 }
else if (LangOpts.CUDA && After ==
'<') {
3403 Kind = tok::lesslessless;
3404 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3407 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3408 Kind = tok::lessless;
3410 }
else if (Char ==
'=') {
3411 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3412 Kind = tok::lessequal;
3413 }
else if (LangOpts.Digraphs && Char ==
':') {
3414 if (LangOpts.CPlusPlus11 &&
3415 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) ==
':') {
3422 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3423 if (After !=
':' && After !=
'>') {
3426 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
3431 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3432 Kind = tok::l_square;
3433 }
else if (LangOpts.Digraphs && Char ==
'%') {
3434 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3435 Kind = tok::l_brace;
3441 Char = getCharAndSize(CurPtr, SizeTmp);
3443 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3444 Kind = tok::greaterequal;
3445 }
else if (Char ==
'>') {
3446 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3448 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3450 Kind = tok::greatergreaterequal;
3451 }
else if (After ==
'>' && IsStartOfConflictMarker(CurPtr-1)) {
3455 }
else if (After ==
'>' && HandleEndOfConflictMarker(CurPtr-1)) {
3458 }
else if (LangOpts.CUDA && After ==
'>') {
3459 Kind = tok::greatergreatergreater;
3460 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3463 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3464 Kind = tok::greatergreater;
3468 Kind = tok::greater;
3472 Char = getCharAndSize(CurPtr, SizeTmp);
3474 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3475 Kind = tok::caretequal;
3481 Char = getCharAndSize(CurPtr, SizeTmp);
3483 Kind = tok::pipeequal;
3484 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3485 }
else if (Char ==
'|') {
3487 if (CurPtr[1] ==
'|' && HandleEndOfConflictMarker(CurPtr-1))
3489 Kind = tok::pipepipe;
3490 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3496 Char = getCharAndSize(CurPtr, SizeTmp);
3497 if (LangOpts.Digraphs && Char ==
'>') {
3498 Kind = tok::r_square;
3499 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3500 }
else if (LangOpts.CPlusPlus && Char ==
':') {
3501 Kind = tok::coloncolon;
3502 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3511 Char = getCharAndSize(CurPtr, SizeTmp);
3514 if (CurPtr[1] ==
'=' && HandleEndOfConflictMarker(CurPtr-1))
3517 Kind = tok::equalequal;
3518 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3527 Char = getCharAndSize(CurPtr, SizeTmp);
3529 Kind = tok::hashhash;
3530 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3531 }
else if (Char ==
'@' && LangOpts.MicrosoftExt) {
3534 Diag(BufferPtr, diag::ext_charize_microsoft);
3535 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3541 if (TokAtPhysicalStartOfLine && !
LexingRawMode && !Is_PragmaLexer)
3542 goto HandleDirective;
3550 if (CurPtr[-1] ==
'@' && LangOpts.ObjC1)
3553 Kind = tok::unknown;
3558 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
3559 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3560 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3568 return LexUnicode(Result, CodePoint, CurPtr);
3571 Kind = tok::unknown;
3576 Kind = tok::unknown;
3585 ConversionResult Status =
3586 llvm::convertUTF8Sequence((
const UTF8 **)&CurPtr,
3587 (
const UTF8 *)BufferEnd,
3590 if (Status == conversionOK) {
3591 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3592 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3599 return LexUnicode(Result, CodePoint, CurPtr);
3605 Kind = tok::unknown;
3612 Diag(CurPtr, diag::err_invalid_utf8);
3614 BufferPtr = CurPtr+1;
3626 FormTokenWithChars(Result, CurPtr, Kind);
3632 FormTokenWithChars(Result, CurPtr, tok::hash);
3637 assert(Result.
is(
tok::eof) &&
"Preprocessor did not set tok:eof");
bool isAtStartOfLine() const
SourceManager & getSourceManager() const
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
This is a discriminated union of FileInfo and ExpansionInfo.
SourceLocation getBegin() const
static std::pair< unsigned, bool > ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
void setFlagValue(TokenFlags Flag, bool Val)
Set a flag to either true or false.
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
SourceLocation getImmediateSpellingLoc(SourceLocation Loc) const
Given a SourceLocation object, return the spelling location referenced by the ID. ...
static LLVM_READONLY bool isWhitespace(unsigned char c)
void setBegin(SourceLocation b)
SourceLocation getSpellingLoc(SourceLocation Loc) const
Given a SourceLocation object, return the spelling location referenced by the ID. ...
Defines the SourceManager interface.
const SrcMgr::SLocEntry & getSLocEntry(FileID FID, bool *Invalid=nullptr) const
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts)
llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
bool hasLeadingSpace() const
Return true if this token has whitespace before it.
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded...
const ExpansionInfo & getExpansion() const
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
bool hasUCN() const
Returns true if this token contains a universal character name.
void setFlag(TokenFlags Flag)
Set the specified flag.
unsigned getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it...
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
static char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size, const LangOptions &LangOpts)
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion...
static LLVM_READNONE bool isASCII(char c)
Returns true if this is an ASCII character.
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
Like System, but searched after the system directories.
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
static LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
void setRawIdentifierData(const char *Ptr)
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
static SourceLocation getFromRawEncoding(unsigned Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
static LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
void setKind(tok::TokenKind K)
Keeps track of the various options that can be enabled, which control the dialect of C or C++ that i...
void resetExtendedTokenMode()
SourceLocation getSourceLocation() override
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
bool getCommentRetentionState() const
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
Concrete class used by the front-end to report problems and issues.
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
bool hadModuleLoaderFatalFailure() const
static LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
tok::TokenKind getKind() const
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart, unsigned Character, const SourceManager &SM, const LangOptions &LangOpts)
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L)
StringRef getRawIdentifier() const
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
A little helper class used to produce diagnostics.
bool ParsingFilename
True after #include; turns <xx> into a tok::angle_string_literal token.
FileID getFileID(SourceLocation SpellingLoc) const
Return the FileID for a SourceLocation.
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
bool isInFileID(SourceLocation Loc, FileID FID, unsigned *RelativeOffset=nullptr) const
Given a specific FileID, returns true if Loc is inside that FileID chunk and sets relative offset (of...
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
static int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
bool LexingRawMode
True if in raw mode.
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Represents a character-granular source range.
SourceLocation getEnd() const
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Defines the clang::Preprocessor interface.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization...
void setEnd(SourceLocation e)
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
SourceLocation createExpansionLoc(SourceLocation Loc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLength, int LoadedID=0, unsigned LoadedOffset=0)
Return a new SourceLocation that encodes the fact that a token from SpellingLoc should actually be re...
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
The result type of a method or function.
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
static CharSourceRange getCharRange(SourceRange R)
const char * getLiteralData() const
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier...
bool isTokenRange() const
Return true if the end of this range specifies the start of the last token. Return false if the end o...
Encodes a location in the source. The SourceManager can decode this to get at the full include stack...
bool isValid() const
Return true if this is a valid SourceLocation object.
bool isAtEndOfImmediateMacroExpansion(SourceLocation Loc, SourceLocation *MacroEnd=nullptr) const
Returns true if the given MacroID location points at the character end of the immediate macro expansi...
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
void setIdentifierInfo(IdentifierInfo *II)
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
bool is(tok::TokenKind K) const
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
DiagnosticsEngine & getDiagnostics() const
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
bool inKeepCommentMode() const
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling. Extract the characters comprising the spelling of this token from the provi...
bool isAtStartOfImmediateMacroExpansion(SourceLocation Loc, SourceLocation *MacroBegin=nullptr) const
Returns true if the given MacroID location points at the beginning of the immediate macro expansion...
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
std::pair< SourceLocation, SourceLocation > getImmediateExpansionRange(SourceLocation Loc) const
Return the start/end of the expansion information for an expansion location.
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
SourceLocation getExpansionLocStart() const
void setLiteralData(const char *Ptr)
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
bool isMacroArgExpansion() const
static const llvm::sys::UnicodeCharRange CXX03AllowedIDCharRanges[]
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the token's IdentifierInfo membe...
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts)
bool hasLeadingEmptyMacro() const
Return true if this token has an empty macro before it.
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
static char GetTrigraphCharForLetter(char Letter)
static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
__PTRDIFF_TYPE__ ptrdiff_t
static LLVM_READONLY bool isIdentifierBody(unsigned char c, bool AllowDollar=false)
bool HandleComment(Token &Token, SourceRange Comment)
const LangOptions & getLangOpts() const
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
Not within a conflict marker.
static LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
static char DecodeTrigraphChar(const char *CP, Lexer *L)
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string...
void SetCommentRetentionState(bool Mode)
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
unsigned getLength() const
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
bool isKeepWhitespaceMode() const
bool isPreprocessedOutput() const
static LLVM_READONLY bool isIdentifierHead(unsigned char c, bool AllowDollar=false)
A trivial tuple used to represent a source range.
void clearFlag(TokenFlags Flag)
Unset the specified flag.
SourceLocation getExpansionLoc(SourceLocation Loc) const
Given a SourceLocation object Loc, return the expansion location referenced by the ID...
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
void SetKeepWhitespaceMode(bool Val)
This class handles loading and caching of source files into memory.
void startToken()
Reset all flags to cleared.
static std::string Stringify(StringRef Str, bool Charify=false)
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
SourceLocation getSpellingLoc() const
bool isMacroArgExpansion(SourceLocation Loc) const
Tests whether the given source location represents a macro argument's expansion into the function-lik...
IdentifierInfo * getIdentifierInfo() const