5 #include "llvm/ADT/StringExtras.h"
6 #include "llvm/ADT/StringSwitch.h"
7 #include "llvm/Support/ConvertUTF.h"
8 #include "llvm/Support/ErrorHandling.h"
14 llvm::errs() <<
"comments::Token Kind=" <<
Kind <<
" ";
16 llvm::errs() <<
" " << Length <<
" \"" << L.
getSpelling(*
this, SM) <<
"\"\n";
34 char *Resolved = Allocator.Allocate<
char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35 char *ResolvedPtr = Resolved;
36 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37 return StringRef(Resolved, ResolvedPtr - Resolved);
44 #include "clang/AST/CommentHTMLTags.inc"
45 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
49 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name)
const {
51 return llvm::StringSwitch<StringRef>(Name)
58 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
61 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name)
const {
62 unsigned CodePoint = 0;
63 for (
unsigned i = 0, e = Name.size(); i != e; ++i) {
66 CodePoint += Name[i] -
'0';
71 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name)
const {
72 unsigned CodePoint = 0;
73 for (
unsigned i = 0, e = Name.size(); i != e; ++i) {
75 const char C = Name[i];
77 CodePoint += llvm::hexDigitValue(C);
82 void Lexer::skipLineStartingDecorations() {
84 assert(CommentState == LCS_InsideCComment);
86 if (BufferPtr == CommentEnd)
94 const char *NewBufferPtr = BufferPtr;
96 if (NewBufferPtr == CommentEnd)
99 char C = *NewBufferPtr;
102 if (NewBufferPtr == CommentEnd)
107 BufferPtr = NewBufferPtr + 1;
118 const char *findNewline(
const char *BufferPtr,
const char *BufferEnd) {
119 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
126 const char *skipNewline(
const char *BufferPtr,
const char *BufferEnd) {
127 if (BufferPtr == BufferEnd)
130 if (*BufferPtr ==
'\n')
133 assert(*BufferPtr ==
'\r');
135 if (BufferPtr != BufferEnd && *BufferPtr ==
'\n')
141 const char *skipNamedCharacterReference(
const char *BufferPtr,
142 const char *BufferEnd) {
143 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
150 const char *skipDecimalCharacterReference(
const char *BufferPtr,
151 const char *BufferEnd) {
152 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
159 const char *skipHexCharacterReference(
const char *BufferPtr,
160 const char *BufferEnd) {
161 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
168 bool isHTMLIdentifierStartingCharacter(
char C) {
172 bool isHTMLIdentifierCharacter(
char C) {
176 const char *skipHTMLIdentifier(
const char *BufferPtr,
const char *BufferEnd) {
177 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
178 if (!isHTMLIdentifierCharacter(*BufferPtr))
188 const char *skipHTMLQuotedString(
const char *BufferPtr,
const char *BufferEnd)
190 const char Quote = *BufferPtr;
191 assert(Quote ==
'\"' || Quote ==
'\'');
194 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
195 const char C = *BufferPtr;
196 if (C == Quote && BufferPtr[-1] !=
'\\')
202 const char *
skipWhitespace(
const char *BufferPtr,
const char *BufferEnd) {
203 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
210 bool isWhitespace(
const char *BufferPtr,
const char *BufferEnd) {
214 bool isCommandNameStartCharacter(
char C) {
218 bool isCommandNameCharacter(
char C) {
222 const char *skipCommandName(
const char *BufferPtr,
const char *BufferEnd) {
223 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
224 if (!isCommandNameCharacter(*BufferPtr))
232 const char *findBCPLCommentEnd(
const char *BufferPtr,
const char *BufferEnd) {
233 const char *CurPtr = BufferPtr;
234 while (CurPtr != BufferEnd) {
237 if (CurPtr == BufferEnd)
241 const char *EscapePtr = CurPtr - 1;
245 if (*EscapePtr ==
'\\' ||
246 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] ==
'/' &&
247 EscapePtr[-1] ==
'?' && EscapePtr[-2] ==
'?')) {
249 CurPtr = skipNewline(CurPtr, BufferEnd);
258 const char *findCCommentEnd(
const char *BufferPtr,
const char *BufferEnd) {
259 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
260 if (*BufferPtr ==
'*') {
261 assert(BufferPtr + 1 != BufferEnd);
262 if (*(BufferPtr + 1) ==
'/')
266 llvm_unreachable(
"buffer end hit before '*/' was seen");
271 void Lexer::formTokenWithChars(
Token &Result,
const char *TokEnd,
273 const unsigned TokLen = TokEnd - BufferPtr;
274 Result.setLocation(getSourceLocation(BufferPtr));
275 Result.setKind(Kind);
276 Result.setLength(TokLen);
278 Result.TextPtr =
"<UNSET>";
284 void Lexer::lexCommentText(
Token &T) {
285 assert(CommentState == LCS_InsideBCPLComment ||
286 CommentState == LCS_InsideCComment);
291 case LS_VerbatimBlockFirstLine:
292 lexVerbatimBlockFirstLine(T);
294 case LS_VerbatimBlockBody:
295 lexVerbatimBlockBody(T);
297 case LS_VerbatimLineText:
298 lexVerbatimLineText(T);
300 case LS_HTMLStartTag:
308 assert(State == LS_Normal);
310 const char *TokenPtr = BufferPtr;
311 assert(TokenPtr < CommentEnd);
312 while (TokenPtr != CommentEnd) {
322 if (TokenPtr == CommentEnd) {
323 formTextToken(T, TokenPtr);
331 case '\\':
case '@':
case '&':
case '$':
332 case '#':
case '<':
case '>':
case '%':
333 case '\"':
case '.':
case ':':
336 if (C ==
':' && TokenPtr != CommentEnd && *TokenPtr ==
':') {
340 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
341 formTokenWithChars(T, TokenPtr,
tok::text);
342 T.setText(UnescapedText);
347 if (!isCommandNameStartCharacter(*TokenPtr)) {
348 formTextToken(T, TokenPtr);
352 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
353 unsigned Length = TokenPtr - (BufferPtr + 1);
357 if (Length == 1 && TokenPtr[-1] ==
'f' && TokenPtr != CommentEnd) {
359 if (C ==
'$' || C ==
'[' || C ==
']' || C ==
'{' || C ==
'}') {
365 StringRef CommandName(BufferPtr + 1, Length);
370 StringRef CorrectedName = Info->
Name;
371 SourceLocation Loc = getSourceLocation(BufferPtr);
372 SourceRange CommandRange(Loc.getLocWithOffset(1),
373 getSourceLocation(TokenPtr));
374 Diag(Loc, diag::warn_correct_comment_command_name)
375 << CommandName << CorrectedName
379 T.setUnknownCommandName(CommandName);
380 Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
384 if (Info->IsVerbatimBlockCommand) {
385 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
388 if (Info->IsVerbatimLineCommand) {
389 setupAndLexVerbatimLine(T, TokenPtr, Info);
392 formTokenWithChars(T, TokenPtr, CommandKind);
393 T.setCommandID(Info->getID());
398 lexHTMLCharacterReference(T);
403 if (TokenPtr == CommentEnd) {
404 formTextToken(T, TokenPtr);
407 const char C = *TokenPtr;
408 if (isHTMLIdentifierStartingCharacter(C))
409 setupAndLexHTMLStartTag(T);
411 setupAndLexHTMLEndTag(T);
413 formTextToken(T, TokenPtr);
420 TokenPtr = skipNewline(TokenPtr, CommentEnd);
423 if (CommentState == LCS_InsideCComment)
424 skipLineStartingDecorations();
428 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
429 find_first_of(
"\n\r\\@&<");
430 if (End != StringRef::npos)
433 TokenPtr = CommentEnd;
434 formTextToken(T, TokenPtr);
441 void Lexer::setupAndLexVerbatimBlock(
Token &T,
442 const char *TextBegin,
443 char Marker,
const CommandInfo *Info) {
444 assert(Info->IsVerbatimBlockCommand);
446 VerbatimBlockEndCommandName.clear();
447 VerbatimBlockEndCommandName.append(Marker ==
'\\' ?
"\\" :
"@");
448 VerbatimBlockEndCommandName.append(Info->EndCommandName);
451 T.setVerbatimBlockID(Info->getID());
456 if (BufferPtr != CommentEnd &&
458 BufferPtr = skipNewline(BufferPtr, CommentEnd);
459 State = LS_VerbatimBlockBody;
463 State = LS_VerbatimBlockFirstLine;
466 void Lexer::lexVerbatimBlockFirstLine(
Token &T) {
468 assert(BufferPtr < CommentEnd);
474 const char *Newline = findNewline(BufferPtr, CommentEnd);
475 StringRef
Line(BufferPtr, Newline - BufferPtr);
478 size_t Pos =
Line.find(VerbatimBlockEndCommandName);
480 const char *NextLine;
481 if (Pos == StringRef::npos) {
484 NextLine = skipNewline(Newline, CommentEnd);
485 }
else if (Pos == 0) {
487 const char *
End = BufferPtr + VerbatimBlockEndCommandName.size();
488 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
495 TextEnd = BufferPtr + Pos;
504 StringRef Text(BufferPtr, TextEnd - BufferPtr);
506 T.setVerbatimBlockText(Text);
508 State = LS_VerbatimBlockBody;
511 void Lexer::lexVerbatimBlockBody(
Token &T) {
512 assert(State == LS_VerbatimBlockBody);
514 if (CommentState == LCS_InsideCComment)
515 skipLineStartingDecorations();
517 if (BufferPtr == CommentEnd) {
519 T.setVerbatimBlockText(
"");
523 lexVerbatimBlockFirstLine(T);
526 void Lexer::setupAndLexVerbatimLine(
Token &T,
const char *TextBegin,
527 const CommandInfo *Info) {
528 assert(Info->IsVerbatimLineCommand);
530 T.setVerbatimLineID(Info->getID());
532 State = LS_VerbatimLineText;
535 void Lexer::lexVerbatimLineText(
Token &T) {
536 assert(State == LS_VerbatimLineText);
539 const char *Newline = findNewline(BufferPtr, CommentEnd);
540 StringRef Text(BufferPtr, Newline - BufferPtr);
542 T.setVerbatimLineText(Text);
547 void Lexer::lexHTMLCharacterReference(
Token &T) {
548 const char *TokenPtr = BufferPtr;
549 assert(*TokenPtr ==
'&');
551 if (TokenPtr == CommentEnd) {
552 formTextToken(T, TokenPtr);
557 bool isDecimal =
false;
561 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
563 }
else if (C ==
'#') {
565 if (TokenPtr == CommentEnd) {
566 formTextToken(T, TokenPtr);
572 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
574 }
else if (C ==
'x' || C ==
'X') {
577 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
579 formTextToken(T, TokenPtr);
583 formTextToken(T, TokenPtr);
586 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
588 formTextToken(T, TokenPtr);
591 StringRef Name(NamePtr, TokenPtr - NamePtr);
595 Resolved = resolveHTMLNamedCharacterReference(Name);
597 Resolved = resolveHTMLDecimalCharacterReference(Name);
599 Resolved = resolveHTMLHexCharacterReference(Name);
601 if (Resolved.empty()) {
602 formTextToken(T, TokenPtr);
605 formTokenWithChars(T, TokenPtr,
tok::text);
610 void Lexer::setupAndLexHTMLStartTag(
Token &T) {
611 assert(BufferPtr[0] ==
'<' &&
612 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
613 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
614 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
615 if (!isHTMLTagName(Name)) {
616 formTextToken(T, TagNameEnd);
621 T.setHTMLTagStartName(Name);
625 const char C = *BufferPtr;
626 if (BufferPtr != CommentEnd &&
627 (C ==
'>' || C ==
'/' || isHTMLIdentifierStartingCharacter(C)))
628 State = LS_HTMLStartTag;
631 void Lexer::lexHTMLStartTag(
Token &T) {
632 assert(State == LS_HTMLStartTag);
634 const char *TokenPtr = BufferPtr;
636 if (isHTMLIdentifierCharacter(C)) {
637 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
638 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
640 T.setHTMLIdent(Ident);
649 const char *OpenQuote = TokenPtr;
650 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
651 const char *ClosingQuote = TokenPtr;
652 if (TokenPtr != CommentEnd)
655 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
656 ClosingQuote - (OpenQuote + 1)));
666 if (TokenPtr != CommentEnd && *TokenPtr ==
'>') {
670 formTextToken(T, TokenPtr);
680 if (BufferPtr == CommentEnd) {
686 if (!isHTMLIdentifierStartingCharacter(C) &&
687 C !=
'=' && C !=
'\"' && C !=
'\'' && C !=
'>') {
693 void Lexer::setupAndLexHTMLEndTag(
Token &T) {
694 assert(BufferPtr[0] ==
'<' && BufferPtr[1] ==
'/');
696 const char *TagNameBegin =
skipWhitespace(BufferPtr + 2, CommentEnd);
697 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
698 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
699 if (!isHTMLTagName(Name)) {
700 formTextToken(T, TagNameEnd);
707 T.setHTMLTagEndName(Name);
709 if (BufferPtr != CommentEnd && *BufferPtr ==
'>')
710 State = LS_HTMLEndTag;
713 void Lexer::lexHTMLEndTag(
Token &T) {
714 assert(BufferPtr != CommentEnd && *BufferPtr ==
'>');
723 const char *BufferStart,
const char *BufferEnd):
724 Allocator(Allocator), Diags(Diags), Traits(Traits),
725 BufferStart(BufferStart), BufferEnd(BufferEnd),
726 FileLoc(FileLoc), BufferPtr(BufferStart),
727 CommentState(LCS_BeforeComment),
State(LS_Normal) {
732 switch (CommentState) {
733 case LCS_BeforeComment:
734 if (BufferPtr == BufferEnd) {
735 formTokenWithChars(T, BufferPtr,
tok::eof);
739 assert(*BufferPtr ==
'/');
745 if (BufferPtr != BufferEnd) {
750 const char C = *BufferPtr;
751 if (C ==
'/' || C ==
'!')
758 if (BufferPtr != BufferEnd && *BufferPtr ==
'<')
761 CommentState = LCS_InsideBCPLComment;
762 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
764 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
771 const char C = *BufferPtr;
772 if ((C ==
'*' && *(BufferPtr + 1) !=
'/') || C ==
'!')
776 if (BufferPtr != BufferEnd && *BufferPtr ==
'<')
779 CommentState = LCS_InsideCComment;
781 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
785 llvm_unreachable(
"second character of comment should be '/' or '*'");
788 case LCS_BetweenComments: {
791 const char *EndWhitespace = BufferPtr;
792 while(EndWhitespace != BufferEnd && *EndWhitespace !=
'/')
801 CommentState = LCS_BeforeComment;
805 case LCS_InsideBCPLComment:
806 case LCS_InsideCComment:
807 if (BufferPtr != CommentEnd) {
812 if (CommentState == LCS_InsideCComment) {
813 assert(BufferPtr[0] ==
'*' && BufferPtr[1] ==
'/');
815 assert(BufferPtr <= BufferEnd);
821 CommentState = LCS_BetweenComments;
825 CommentState = LCS_BetweenComments;
834 bool *Invalid)
const {
838 bool InvalidTemp =
false;
839 StringRef File = SourceMgr.
getBufferData(LocInfo.first, &InvalidTemp);
845 const char *Begin = File.data() + LocInfo.second;
846 return StringRef(Begin, Tok.
getLength());
static LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
static LLVM_READONLY bool isWhitespace(unsigned char c)
static LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
static LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Concrete class used by the front-end to report problems and issues.
void dump(const SourceManager &SM) const
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
Encodes a location in the source. The SourceManager can decode this to get at the full include stack...
static LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
static LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string...
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
This class handles loading and caching of source files into memory.
static LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].