src/libshared/src/utf8.cpp

   1 ////////////////////////////////////////////////////////////////////////////////
   2 //
   3 // Copyright 2013 - 2017, Paul Beckingham, Federico Hernandez.
   4 //
   5 // Permission is hereby granted, free of charge, to any person obtaining a copy
   6 // of this software and associated documentation files (the "Software"), to deal
   7 // in the Software without restriction, including without limitation the rights
   8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   9 // copies of the Software, and to permit persons to whom the Software is
  10 // furnished to do so, subject to the following conditions:
  11 //
  12 // The above copyright notice and this permission notice shall be included
  13 // in all copies or substantial portions of the Software.
  14 //
  15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  16 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21 // SOFTWARE.
  22 //
  23 // http://www.opensource.org/licenses/mit-license.php
  24 //
  25 ////////////////////////////////////////////////////////////////////////////////
  26
  27 #include <cmake.h>
  28 #include <utf8.h>
  29
  30 ////////////////////////////////////////////////////////////////////////////////
  31 // Converts '0'     -> 0
  32 //          '9'     -> 9
  33 //          'a'/'A' -> 10
  34 //          'f'/'F' -> 15
  35 #define XDIGIT(x) ((x) >= '0' && (x) <= '9' ? ((x) - '0') : \
  36                    (x) >= 'a' && (x) <= 'f' ? ((x) + 10 - 'a') : \
  37                    (x) >= 'A' && (x) <= 'F' ? ((x) + 10 - 'A') : 0)
  38
  39 ////////////////////////////////////////////////////////////////////////////////
  40 // Note: Assumes 4-digit hex codepoints:
  41 //         xxxx
  42 //         \uxxxx
  43 //         U+xxxx
  44 unsigned int utf8_codepoint (const std::string& input)
  45 {
  46   unsigned int codepoint = 0;
  47   int length = input.length ();
  48
  49   // U+xxxx, \uxxxx
  50   if (length >= 6 &&
  51       ((input[0] == 'U'  && input[1] == '+') ||
  52        (input[0] == '\\' && input[1] == 'u')))
  53   {
  54     codepoint = XDIGIT (input[2]) << 12 |
  55                 XDIGIT (input[3]) <<  8 |
  56                 XDIGIT (input[4]) <<  4 |
  57                 XDIGIT (input[5]);
  58   }
  59   else if (length >= 4)
  60   {
  61     codepoint = XDIGIT (input[0]) << 12 |
  62                 XDIGIT (input[1]) <<  8 |
  63                 XDIGIT (input[2]) <<  4 |
  64                 XDIGIT (input[3]);
  65   }
  66
  67   return codepoint;
  68 }
  69
  70 ////////////////////////////////////////////////////////////////////////////////
  71 // Iterates along a UTF8 string.
  72 //   - argument i counts bytes advanced through the string
  73 //   - returns the next character
  74 unsigned int utf8_next_char (const std::string& input, std::string::size_type& i)
  75 {
  76   if (input[i] == '\0')
  77     return 0;
  78
  79   // How many bytes in the sequence?
  80   int length = utf8_sequence (input[i]);
  81   i += length;
  82
  83   // 0xxxxxxx -> 0xxxxxxx
  84   if (length == 1)
  85     return input[i - 1];
  86
  87   // 110yyyyy 10xxxxxx -> 00000yyy yyxxxxxx
  88   if (length == 2)
  89     return ((input[i - 2] & 0x1F) << 6) +
  90             (input[i - 1] & 0x3F);
  91
  92   // 1110zzzz 10yyyyyy 10xxxxxx -> zzzzyyyy yyxxxxxx
  93   if (length == 3)
  94     return ((input[i - 3] & 0xF)  << 12) +
  95            ((input[i - 2] & 0x3F) <<  6) +
  96             (input[i - 1] & 0x3F);
  97
  98   // 11110www 10zzzzzz 10yyyyyy 10xxxxxx -> 000wwwzz zzzzyyyy yyxxxxxx
  99   if (length == 4)
 100     return ((input[i - 4] & 0x7)  << 18) +
 101            ((input[i - 3] & 0x3F) << 12) +
 102            ((input[i - 2] & 0x3F) <<  6) +
 103             (input[i - 1] & 0x3F);
 104
 105   // Default: pretend as though it's a single character.
 106   // TODO Or should this throw?
 107   return input[i - 1];
 108 }
 109
 110 ////////////////////////////////////////////////////////////////////////////////
 111 // http://en.wikipedia.org/wiki/UTF-8
 112 std::string utf8_character (unsigned int codepoint)
 113 {
 114   char sequence[5] {};
 115
 116   // 0xxxxxxx -> 0xxxxxxx
 117   if (codepoint < 0x80)
 118   {
 119     sequence[0] = codepoint;
 120   }
 121
 122   // 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx
 123   else if (codepoint < 0x800)
 124   {
 125     sequence[0] = 0xC0 | (codepoint & 0x7C0) >> 6;
 126     sequence[1] = 0x80 | (codepoint & 0x3F);
 127   }
 128
 129   // zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx
 130   else if (codepoint < 0x10000)
 131   {
 132     sequence[0] = 0xE0 | (codepoint & 0xF000) >> 12;
 133     sequence[1] = 0x80 | (codepoint & 0xFC0)  >> 6;
 134     sequence[2] = 0x80 | (codepoint & 0x3F);
 135   }
 136
 137   // 000wwwzz zzzzyyyy yyxxxxxx -> 11110www 10zzzzzz 10yyyyyy 10xxxxxx
 138   else if (codepoint < 0x110000)
 139   {
 140     sequence[0] = 0xF0 | (codepoint & 0x1C0000) >> 18;
 141     sequence[1] = 0x80 | (codepoint & 0x03F000) >> 12;
 142     sequence[2] = 0x80 | (codepoint & 0x0FC0)   >> 6;
 143     sequence[3] = 0x80 | (codepoint & 0x3F);
 144   }
 145
 146   return std::string (sequence);
 147 }
 148
 149 ////////////////////////////////////////////////////////////////////////////////
 150 int utf8_sequence (unsigned int character)
 151 {
 152   if ((character & 0xE0) == 0xC0)
 153     return 2;
 154
 155   if ((character & 0xF0) == 0xE0)
 156     return 3;
 157
 158   if ((character & 0xF8) == 0xF0)
 159     return 4;
 160
 161   return 1;
 162 }
 163
 164 ////////////////////////////////////////////////////////////////////////////////
 165 // Length of a string in characters.
 166 unsigned int utf8_length (const std::string& str)
 167 {
 168   int byteLength = str.length ();
 169   int charLength = byteLength;
 170   const char* data = str.data ();
 171
 172   // Decrement the number of bytes for each byte that matches 0b10??????
 173   // this way only the first byte of any utf8 sequence is counted.
 174   for (int i = 0; i < byteLength; i++)
 175   {
 176     // Extract the first two bits and check whether they are 10
 177     if ((data[i] & 0xC0) == 0x80)
 178       charLength--;
 179   }
 180
 181   return charLength;
 182 }
 183
 184 ////////////////////////////////////////////////////////////////////////////////
 185 // Width of a string in character cells.
 186 unsigned int utf8_width (const std::string& str)
 187 {
 188   unsigned int length = 0;
 189   std::string::size_type i = 0;
 190   unsigned int c;
 191   while ((c = utf8_next_char (str, i)))
 192   {
 193     // Control characters, and more especially newline characters, make
 194     // mk_wcwidth() return -1.  Ignore that, thereby "adding zero" to length.
 195     // Since control characters are not displayed in reports, this is a valid
 196     // choice.
 197     int l = mk_wcwidth (c);
 198     if (l != -1)
 199       length += l;
 200   }
 201
 202   return length;
 203 }
 204
 205 ////////////////////////////////////////////////////////////////////////////////
 206 unsigned int utf8_text_length (const std::string& str)
 207 {
 208   int byteLength = str.length ();
 209   int charLength = byteLength;
 210   const char* data = str.data ();
 211   bool in_color = false;
 212
 213   // Decrement the number of bytes for each byte that matches 0b10??????
 214   // this way only the first byte of any utf8 sequence is counted.
 215   for (int i = 0; i < byteLength; i++)
 216   {
 217     if (in_color)
 218     {
 219       if (data[i] == 'm')
 220         in_color = false;
 221
 222       --charLength;
 223     }
 224     else
 225     {
 226       if (data[i] == 033)
 227       {
 228         in_color = true;
 229         --charLength;
 230       }
 231       else
 232       {
 233         // Extract the first two bits and check whether they are 10
 234         if ((data[i] & 0xC0) == 0x80)
 235           --charLength;
 236       }
 237     }
 238   }
 239
 240   return charLength;
 241 }
 242
 243 ////////////////////////////////////////////////////////////////////////////////
 244 unsigned int utf8_text_width (const std::string& str)
 245 {
 246   bool in_color = false;
 247
 248   unsigned int length = 0;
 249   std::string::size_type i = 0;
 250   unsigned int c;
 251   while ((c = utf8_next_char (str, i)))
 252   {
 253     if (in_color)
 254     {
 255       if (c == 'm')
 256         in_color = false;
 257     }
 258     else if (c == 033)
 259     {
 260       in_color = true;
 261     }
 262     else
 263       length += mk_wcwidth (c);
 264   }
 265
 266   return length;
 267 }
 268
 269 ////////////////////////////////////////////////////////////////////////////////
 270 const std::string utf8_substr (
 271   const std::string& input,
 272   unsigned int start,
 273   unsigned int length /* = 0 */)
 274 {
 275   // Find the starting index.
 276   std::string::size_type index_start = 0;
 277   for (unsigned int i = 0; i < start; i++)
 278     utf8_next_char (input, index_start);
 279
 280   std::string result;
 281   if (length)
 282   {
 283     std::string::size_type index_end = index_start;
 284     for (unsigned int i = 0; i < length; i++)
 285       utf8_next_char (input, index_end);
 286
 287     result = input.substr (index_start, index_end - index_start);
 288   }
 289   else
 290     result = input.substr (index_start);
 291
 292   return result;
 293 }
 294
 295 ////////////////////////////////////////////////////////////////////////////////