1 ////////////////////////////////////////////////////////////////////////////////
3 // Copyright 2013 - 2017, Paul Beckingham, Federico Hernandez.
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to deal
7 // in the Software without restriction, including without limitation the rights
8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 // copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
12 // The above copyright notice and this permission notice shall be included
13 // in all copies or substantial portions of the Software.
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
23 // http://www.opensource.org/licenses/mit-license.php
25 ////////////////////////////////////////////////////////////////////////////////
30 ////////////////////////////////////////////////////////////////////////////////
// Converts a single hexadecimal digit character to its numeric value:
//   '0'..'9'  ->  0..9
//   'a'..'f'  -> 10..15
//   'A'..'F'  -> 10..15
// Any other character yields 0.
//
// The conversion lives in a function so that the macro argument is evaluated
// exactly once, which makes XDIGIT safe to use with expressions that have side
// effects. The macro itself is kept so existing uses of XDIGIT still compile.
static inline int utf8_xdigit_value (int c)
{
  if (c >= '0' && c <= '9')
    return c - '0';

  if (c >= 'a' && c <= 'f')
    return c + 10 - 'a';

  if (c >= 'A' && c <= 'F')
    return c + 10 - 'A';

  return 0;
}

#define XDIGIT(x) (utf8_xdigit_value (x))
39 ////////////////////////////////////////////////////////////////////////////////
40 // Note: Assumes 4-digit hex codepoints:
44 unsigned int utf8_codepoint (const std::string& input)
46 unsigned int codepoint = 0;
47 int length = input.length ();
51 ((input[0] == 'U' && input[1] == '+') ||
52 (input[0] == '\\' && input[1] == 'u')))
54 codepoint = XDIGIT (input[2]) << 12 |
55 XDIGIT (input[3]) << 8 |
56 XDIGIT (input[4]) << 4 |
61 codepoint = XDIGIT (input[0]) << 12 |
62 XDIGIT (input[1]) << 8 |
63 XDIGIT (input[2]) << 4 |
70 ////////////////////////////////////////////////////////////////////////////////
71 // Iterates along a UTF8 string.
72 // - argument i counts bytes advanced through the string
73 // - returns the next character
74 unsigned int utf8_next_char (const std::string& input, std::string::size_type& i)
79 // How many bytes in the sequence?
80 int length = utf8_sequence (input[i]);
83 // 0xxxxxxx -> 0xxxxxxx
87 // 110yyyyy 10xxxxxx -> 00000yyy yyxxxxxx
89 return ((input[i - 2] & 0x1F) << 6) +
90 (input[i - 1] & 0x3F);
92 // 1110zzzz 10yyyyyy 10xxxxxx -> zzzzyyyy yyxxxxxx
94 return ((input[i - 3] & 0xF) << 12) +
95 ((input[i - 2] & 0x3F) << 6) +
96 (input[i - 1] & 0x3F);
98 // 11110www 10zzzzzz 10yyyyyy 10xxxxxx -> 000wwwzz zzzzyyyy yyxxxxxx
100 return ((input[i - 4] & 0x7) << 18) +
101 ((input[i - 3] & 0x3F) << 12) +
102 ((input[i - 2] & 0x3F) << 6) +
103 (input[i - 1] & 0x3F);
105 // Default: pretend as though it's a single character.
106 // TODO Or should this throw?
110 ////////////////////////////////////////////////////////////////////////////////
// Encodes a codepoint as a UTF8 byte sequence.
// http://en.wikipedia.org/wiki/UTF-8
//
//   codepoint < 0x80      0xxxxxxx                   -> 0xxxxxxx
//   codepoint < 0x800     00000yyy yyxxxxxx          -> 110yyyyy 10xxxxxx
//   codepoint < 0x10000   zzzzyyyy yyxxxxxx          -> 1110zzzz 10yyyyyy 10xxxxxx
//   codepoint < 0x110000  000wwwzz zzzzyyyy yyxxxxxx -> 11110www 10zzzzzz 10yyyyyy 10xxxxxx
//
// Returns an empty string for codepoints of 0x110000 and above. Codepoint 0
// also yields an empty string, because the result is built from a
// NUL-terminated buffer.
std::string utf8_character (unsigned int codepoint)
{
  // Zero-filled, so the buffer is NUL-terminated whatever is written below.
  char sequence[5] = {0};

  unsigned int lead;   // Marker bits of the first byte.
  int count;           // Total number of bytes in the sequence.

  if (codepoint < 0x80)
  {
    lead  = 0x00;
    count = 1;
  }
  else if (codepoint < 0x800)
  {
    lead  = 0xC0;
    count = 2;
  }
  else if (codepoint < 0x10000)
  {
    lead  = 0xE0;
    count = 3;
  }
  else if (codepoint < 0x110000)
  {
    lead  = 0xF0;
    count = 4;
  }
  else
  {
    return std::string ();
  }

  // Fill the continuation bytes from last to first, six bits at a time.
  for (int k = count - 1; k > 0; --k)
  {
    sequence[k] = static_cast <char> (0x80 | (codepoint & 0x3F));
    codepoint >>= 6;
  }

  // Whatever bits remain belong in the lead byte.
  sequence[0] = static_cast <char> (lead | codepoint);

  return std::string (sequence);
}
149 ////////////////////////////////////////////////////////////////////////////////
// Returns the number of bytes in the UTF8 sequence introduced by the given
// lead byte:
//   110xxxxx -> 2
//   1110xxxx -> 3
//   11110xxx -> 4
// Anything else counts as a single byte. Only the low eight bits of the
// argument are examined.
int utf8_sequence (unsigned int character)
{
  if ((character & 0xE0) == 0xC0)
    return 2;

  if ((character & 0xF0) == 0xE0)
    return 3;

  if ((character & 0xF8) == 0xF0)
    return 4;

  return 1;
}
164 ////////////////////////////////////////////////////////////////////////////////
// Length of a string in characters.
//
// Only the first byte of each UTF8 sequence is counted: every byte matching
// 0b10?????? is a continuation byte and is skipped.
unsigned int utf8_length (const std::string& str)
{
  unsigned int characters = 0;

  for (std::string::size_type pos = 0; pos < str.length (); ++pos)
  {
    // Extract the first two bits and check whether they are 10.
    if ((str[pos] & 0xC0) != 0x80)
      ++characters;
  }

  return characters;
}
184 ////////////////////////////////////////////////////////////////////////////////
185 // Width of a string in character cells.
186 unsigned int utf8_width (const std::string& str)
188 unsigned int length = 0;
189 std::string::size_type i = 0;
191 while ((c = utf8_next_char (str, i)))
193 // Control characters, and more especially newline characters, make
194 // mk_wcwidth() return -1. Ignore that, thereby "adding zero" to length.
195 // Since control characters are not displayed in reports, this is a valid
197 int l = mk_wcwidth (c);
205 ////////////////////////////////////////////////////////////////////////////////
// Length of a string in characters, not counting color escape sequences.
//
// A color sequence runs from an ESC byte (033) through the next 'm'; none of
// its bytes are counted. Outside such a sequence only the first byte of each
// UTF8 sequence is counted: bytes matching 0b10?????? are continuation bytes.
unsigned int utf8_text_length (const std::string& str)
{
  unsigned int characters = 0;
  bool in_color = false;

  for (std::string::size_type pos = 0; pos < str.length (); ++pos)
  {
    const char byte = str[pos];

    if (in_color)
    {
      // The 'm' is the last byte of the sequence, and is itself not counted.
      if (byte == 'm')
        in_color = false;
    }
    else if (byte == 033)
    {
      in_color = true;
    }
    // Extract the first two bits and check whether they are 10.
    else if ((byte & 0xC0) != 0x80)
    {
      ++characters;
    }
  }

  return characters;
}
243 ////////////////////////////////////////////////////////////////////////////////
244 unsigned int utf8_text_width (const std::string& str)
246 bool in_color = false;
248 unsigned int length = 0;
249 std::string::size_type i = 0;
251 while ((c = utf8_next_char (str, i)))
263 length += mk_wcwidth (c);
269 ////////////////////////////////////////////////////////////////////////////////
270 const std::string utf8_substr (
271 const std::string& input,
273 unsigned int length /* = 0 */)
275 // Find the starting index.
276 std::string::size_type index_start = 0;
277 for (unsigned int i = 0; i < start; i++)
278 utf8_next_char (input, index_start);
283 std::string::size_type index_end = index_start;
284 for (unsigned int i = 0; i < length; i++)
285 utf8_next_char (input, index_end);
287 result = input.substr (index_start, index_end - index_start);
290 result = input.substr (index_start);
295 ////////////////////////////////////////////////////////////////////////////////