clang  3.7.0
Encoding.h
Go to the documentation of this file.
1 //===--- Encoding.h - Format C++ code -------------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Contains functions for text encoding manipulation. Supports UTF-8,
12 /// 8-bit encodings and escape sequences in C++ string literals.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
17 #define LLVM_CLANG_LIB_FORMAT_ENCODING_H
18 
19 #include "clang/Basic/LLVM.h"
20 #include "llvm/Support/ConvertUTF.h"
21 #include "llvm/Support/Unicode.h"
22 
23 namespace clang {
24 namespace format {
25 namespace encoding {
26 
/// \brief Text encodings distinguished by the functions in this header.
enum Encoding {
  Encoding_UTF8,
  Encoding_Unknown // We treat all other encodings as 8-bit encodings.
};
31 
32 /// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
33 /// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
34 inline Encoding detectEncoding(StringRef Text) {
35  const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin());
36  const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end());
37  if (::isLegalUTF8String(&Ptr, BufEnd))
38  return Encoding_UTF8;
39  return Encoding_Unknown;
40 }
41 
42 inline unsigned getCodePointCountUTF8(StringRef Text) {
43  unsigned CodePoints = 0;
44  for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
45  ++CodePoints;
46  }
47  return CodePoints;
48 }
49 
50 /// \brief Gets the number of code points in the Text using the specified
51 /// Encoding.
52 inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
53  switch (Encoding) {
54  case Encoding_UTF8:
55  return getCodePointCountUTF8(Text);
56  default:
57  return Text.size();
58  }
59 }
60 
61 /// \brief Returns the number of columns required to display the \p Text on a
62 /// generic Unicode-capable terminal. Text is assumed to use the specified
63 /// \p Encoding.
64 inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
65  if (Encoding == Encoding_UTF8) {
66  int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
67  // FIXME: Figure out the correct way to handle this in the presence of both
68  // printable and unprintable multi-byte UTF-8 characters. Falling back to
69  // returning the number of bytes may cause problems, as columnWidth suddenly
70  // becomes non-additive.
71  if (ContentWidth >= 0)
72  return ContentWidth;
73  }
74  return Text.size();
75 }
76 
77 /// \brief Returns the number of columns required to display the \p Text,
78 /// starting from the \p StartColumn on a terminal with the \p TabWidth. The
79 /// text is assumed to use the specified \p Encoding.
80 inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
81  unsigned TabWidth, Encoding Encoding) {
82  unsigned TotalWidth = 0;
83  StringRef Tail = Text;
84  for (;;) {
85  StringRef::size_type TabPos = Tail.find('\t');
86  if (TabPos == StringRef::npos)
87  return TotalWidth + columnWidth(Tail, Encoding);
88  TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
89  TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
90  Tail = Tail.substr(TabPos + 1);
91  }
92 }
93 
94 /// \brief Gets the number of bytes in a sequence representing a single
95 /// codepoint and starting with FirstChar in the specified Encoding.
96 inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
97  switch (Encoding) {
98  case Encoding_UTF8:
99  return getNumBytesForUTF8(FirstChar);
100  default:
101  return 1;
102  }
103 }
104 
/// \brief Returns true if \p c is one of the characters '0' through '7'.
inline bool isOctDigit(char c) {
  return c >= '0' && c <= '7';
}
106 
/// \brief Returns true if \p c is a decimal digit or a letter in 'a'-'f' or
/// 'A'-'F'.
inline bool isHexDigit(char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}
111 
112 /// \brief Gets the length of an escape sequence inside a C++ string literal.
113 /// Text should span from the beginning of the escape sequence (starting with a
114 /// backslash) to the end of the string literal.
115 inline unsigned getEscapeSequenceLength(StringRef Text) {
116  assert(Text[0] == '\\');
117  if (Text.size() < 2)
118  return 1;
119 
120  switch (Text[1]) {
121  case 'u':
122  return 6;
123  case 'U':
124  return 10;
125  case 'x': {
126  unsigned I = 2; // Point after '\x'.
127  while (I < Text.size() && isHexDigit(Text[I]))
128  ++I;
129  return I;
130  }
131  default:
132  if (isOctDigit(Text[1])) {
133  unsigned I = 1;
134  while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
135  ++I;
136  return I;
137  }
138  return 2;
139  }
140 }
141 
142 } // namespace encoding
143 } // namespace format
144 } // namespace clang
145 
146 #endif
bool isOctDigit(char c)
Definition: Encoding.h:105
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a terminal with the TabWidth. The text is assumed to use the specified Encoding.
Definition: Encoding.h:80
unsigned getCodePointCount(StringRef Text, Encoding Encoding)
Gets the number of code points in the Text using the specified Encoding.
Definition: Encoding.h:52
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified...
unsigned getEscapeSequenceLength(StringRef Text)
Gets the length of an escape sequence inside a C++ string literal. Text should span from the beginning of the escape sequence (starting with a backslash) to the end of the string literal.
Definition: Encoding.h:115
unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding)
Gets the number of bytes in a sequence representing a single codepoint and starting with FirstChar in the specified Encoding.
Definition: Encoding.h:96
unsigned columnWidth(StringRef Text, Encoding Encoding)
Returns the number of columns required to display the Text on a generic Unicode-capable terminal. Text is assumed to use the specified Encoding.
Definition: Encoding.h:64
unsigned getCodePointCountUTF8(StringRef Text)
Definition: Encoding.h:42
bool isHexDigit(char c)
Definition: Encoding.h:107
Encoding detectEncoding(StringRef Text)
Detects encoding of the Text. If the Text can be decoded using UTF-8, it is considered UTF8, otherwise we treat it as some 8-bit encoding.
Definition: Encoding.h:34