src/libshared/src/Lexer.cpp

   1 ////////////////////////////////////////////////////////////////////////////////
   2 //
   3 // Copyright 2013 - 2017, Paul Beckingham, Federico Hernandez.
   4 //
   5 // Permission is hereby granted, free of charge, to any person obtaining a copy
   6 // of this software and associated documentation files (the "Software"), to deal
   7 // in the Software without restriction, including without limitation the rights
   8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   9 // copies of the Software, and to permit persons to whom the Software is
  10 // furnished to do so, subject to the following conditions:
  11 //
  12 // The above copyright notice and this permission notice shall be included
  13 // in all copies or substantial portions of the Software.
  14 //
  15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  16 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21 // SOFTWARE.
  22 //
  23 // http://www.opensource.org/licenses/mit-license.php
  24 //
  25 ////////////////////////////////////////////////////////////////////////////////
  26
  27 #include <cmake.h>
  28 #include <Lexer.h>
  29 #include <Datetime.h>
  30 #include <Duration.h>
  31 #include <algorithm>
  32 #include <tuple>
  33 #include <ctype.h>
  34 #include <unicode.h>
  35 #include <utf8.h>
  36
// Template that isUUID matches input against, position by position: an 'x'
// stands for any hex digit, every other character must match literally.
static const std::string uuid_pattern = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";
// Fewest leading characters of uuid_pattern that isUUID must match before it
// accepts a (possibly abbreviated) UUID.
static const unsigned int uuid_min_length = 8;

// Format string handed to Datetime::parse by isDate. Empty by default.
std::string Lexer::dateFormat = "";
  41
  42 ////////////////////////////////////////////////////////////////////////////////
// Constructs a Lexer over 'text'. Initializes _text from 'text' and records
// the size of 'text', in bytes, as the end-of-stream position _eos.
// NOTE(review): _cursor is not set here; presumably it is initialized in the
// class declaration - confirm in Lexer.h.
Lexer::Lexer (const std::string& text)
: _text (text)
, _eos (text.size ())
{
}
  48
  49 ////////////////////////////////////////////////////////////////////////////////
  50 // When a Lexer object is constructed with a string, this method walks through
  51 // the stream of low-level tokens.
  52 bool Lexer::token (std::string& token, Lexer::Type& type)
  53 {
  54   // Eat white space.
  55   while (unicodeWhitespace (_text[_cursor]))
  56     utf8_next_char (_text, _cursor);
  57
  58   // Terminate at EOS.
  59   if (isEOS ())
  60     return false;
  61
  62   if (isString    (token, type, "'\"") ||
  63       isUUID      (token, type, true)  ||
  64       isDate      (token, type)        ||
  65       isDuration  (token, type)        ||
  66       isURL       (token, type)        ||
  67       isHexNumber (token, type)        ||
  68       isNumber    (token, type)        ||
  69       isPath      (token, type)        ||
  70       isPattern   (token, type)        ||
  71       isOperator  (token, type)        ||
  72       isWord      (token, type))
  73     return true;
  74
  75   return false;
  76 }
  77
  78 ////////////////////////////////////////////////////////////////////////////////
  79 std::vector <std::tuple <std::string, Lexer::Type>> Lexer::tokenize (const std::string& input)
  80 {
  81   std::vector <std::tuple <std::string, Lexer::Type>> tokens;
  82
  83   std::string token;
  84   Lexer::Type type;
  85   Lexer lexer (input);
  86   while (lexer.token (token, type))
  87     tokens.push_back (std::make_tuple (token, type));
  88
  89   return tokens;
  90 }
  91
  92 ////////////////////////////////////////////////////////////////////////////////
  93 // No L10N - these are for internal purposes.
  94 const std::string Lexer::typeName (const Lexer::Type& type)
  95 {
  96   switch (type)
  97   {
  98   case Lexer::Type::uuid:         return "uuid";
  99   case Lexer::Type::number:       return "number";
 100   case Lexer::Type::hex:          return "hex";
 101   case Lexer::Type::string:       return "string";
 102   case Lexer::Type::url:          return "url";
 103   case Lexer::Type::path:         return "path";
 104   case Lexer::Type::pattern:      return "pattern";
 105   case Lexer::Type::op:           return "op";
 106   case Lexer::Type::word:         return "word";
 107   case Lexer::Type::date:         return "date";
 108   case Lexer::Type::duration:     return "duration";
 109   }
 110
 111   return "unknown";
 112 }
 113
 114 ////////////////////////////////////////////////////////////////////////////////
 115 // Lexer::Type::number
 116 //   \d+
 117 //   [ . \d+ ]
 118 //   [ e|E [ +|- ] \d+ [ . \d+ ] ]
 119 //   not followed by non-operator.
 120 bool Lexer::isNumber (std::string& token, Lexer::Type& type)
 121 {
 122   std::size_t marker = _cursor;
 123
 124   if (unicodeLatinDigit (_text[marker]))
 125   {
 126     ++marker;
 127     while (unicodeLatinDigit (_text[marker]))
 128       utf8_next_char (_text, marker);
 129
 130     if (_text[marker] == '.')
 131     {
 132       ++marker;
 133       if (unicodeLatinDigit (_text[marker]))
 134       {
 135         ++marker;
 136         while (unicodeLatinDigit (_text[marker]))
 137           utf8_next_char (_text, marker);
 138       }
 139     }
 140
 141     if (_text[marker] == 'e' ||
 142         _text[marker] == 'E')
 143     {
 144       ++marker;
 145
 146       if (_text[marker] == '+' ||
 147           _text[marker] == '-')
 148         ++marker;
 149
 150       if (unicodeLatinDigit (_text[marker]))
 151       {
 152         ++marker;
 153         while (unicodeLatinDigit (_text[marker]))
 154           utf8_next_char (_text, marker);
 155
 156         if (_text[marker] == '.')
 157         {
 158           ++marker;
 159           if (unicodeLatinDigit (_text[marker]))
 160           {
 161             ++marker;
 162             while (unicodeLatinDigit (_text[marker]))
 163               utf8_next_char (_text, marker);
 164           }
 165         }
 166       }
 167     }
 168
 169     // Lookahread: !<unicodeWhitespace> | !<isSingleCharOperator>
 170     // If there is an immediately consecutive character, that is not an operator, fail.
 171     if (_eos > marker &&
 172         ! unicodeWhitespace (_text[marker]) &&
 173         ! isSingleCharOperator (_text[marker]))
 174       return false;
 175
 176     token = _text.substr (_cursor, marker - _cursor);
 177     type = Lexer::Type::number;
 178     _cursor = marker;
 179     return true;
 180   }
 181
 182   return false;
 183 }
 184
 185 ////////////////////////////////////////////////////////////////////////////////
 186 // Lexer::Type::number
 187 //   \d+
 188 bool Lexer::isInteger (std::string& token, Lexer::Type& type)
 189 {
 190   std::size_t marker = _cursor;
 191
 192   if (unicodeLatinDigit (_text[marker]))
 193   {
 194     ++marker;
 195     while (unicodeLatinDigit (_text[marker]))
 196       utf8_next_char (_text, marker);
 197
 198     token = _text.substr (_cursor, marker - _cursor);
 199     type = Lexer::Type::number;
 200     _cursor = marker;
 201     return true;
 202   }
 203
 204   return false;
 205 }
 206
 207 ////////////////////////////////////////////////////////////////////////////////
 208 bool Lexer::isSingleCharOperator (int c)
 209 {
 210   return c == '+' ||  // Addition
 211          c == '-' ||  // Subtraction or unary minus = ambiguous
 212          c == '*' ||  // Multiplication
 213          c == '/' ||  // Diviѕion
 214          c == '(' ||  // Precedence open parenthesis
 215          c == ')' ||  // Precedence close parenthesis
 216          c == '<' ||  // Less than
 217          c == '>' ||  // Greater than
 218          c == '^' ||  // Exponent
 219          c == '!' ||  // Unary not
 220          c == '%' ||  // Modulus
 221          c == '=' ||  // Partial match
 222          c == '~';    // Pattern match
 223 }
 224
 225 ////////////////////////////////////////////////////////////////////////////////
 226 bool Lexer::isDoubleCharOperator (int c0, int c1, int c2)
 227 {
 228   return (c0 == '=' && c1 == '=')                        ||
 229          (c0 == '!' && c1 == '=')                        ||
 230          (c0 == '<' && c1 == '=')                        ||
 231          (c0 == '>' && c1 == '=')                        ||
 232          (c0 == 'o' && c1 == 'r' && isBoundary (c1, c2)) ||
 233          (c0 == '|' && c1 == '|')                        ||
 234          (c0 == '&' && c1 == '&')                        ||
 235          (c0 == '!' && c1 == '~');
 236 }
 237
 238 ////////////////////////////////////////////////////////////////////////////////
 239 bool Lexer::isTripleCharOperator (int c0, int c1, int c2, int c3)
 240 {
 241   return (c0 == 'a' && c1 == 'n' && c2 == 'd' && isBoundary (c2, c3)) ||
 242          (c0 == 'x' && c1 == 'o' && c2 == 'r' && isBoundary (c2, c3)) ||
 243          (c0 == '!' && c1 == '=' && c2 == '=');
 244 }
 245
 246 ////////////////////////////////////////////////////////////////////////////////
 247 bool Lexer::isBoundary (int left, int right)
 248 {
 249   // EOS
 250   if (right == '\0')                                         return true;
 251
 252   // XOR
 253   if (unicodeLatinAlpha (left) != unicodeLatinAlpha (right)) return true;
 254   if (unicodeLatinDigit (left) != unicodeLatinDigit (right)) return true;
 255   if (unicodeWhitespace (left) != unicodeWhitespace (right)) return true;
 256
 257   // OR
 258   if (isPunctuation (left) || isPunctuation (right))         return true;
 259
 260   return false;
 261 }
 262
 263 ////////////////////////////////////////////////////////////////////////////////
 264 bool Lexer::isHardBoundary (int left, int right)
 265 {
 266   // EOS
 267   if (right == '\0')
 268     return true;
 269
 270   // FILTER operators that don't need to be surrounded by whitespace.
 271   if (left == '(' ||
 272       left == ')' ||
 273       right == '(' ||
 274       right == ')')
 275     return true;
 276
 277   return false;
 278 }
 279
 280 ////////////////////////////////////////////////////////////////////////////////
 281 bool Lexer::isPunctuation (int c)
 282 {
 283   return isprint (c)   &&
 284          c != ' '      &&
 285          c != '@'      &&
 286          c != '#'      &&
 287          c != '$'      &&
 288          c != '_'      &&
 289          ! unicodeLatinDigit (c) &&
 290          ! unicodeLatinAlpha (c);
 291 }
 292
 293 ////////////////////////////////////////////////////////////////////////////////
 294 // Assumes that quotes is a string containing a non-trivial set of quote
 295 // characters.
 296 std::string Lexer::dequote (const std::string& input, const std::string& quotes)
 297 {
 298   if (input.length () > 1)
 299   {
 300     int quote = input[0];
 301     if (quotes.find (quote) != std::string::npos)
 302     {
 303       size_t len = input.length ();
 304       if (quote == input[len - 1])
 305         return input.substr (1, len - 2);
 306     }
 307   }
 308
 309   return input;
 310 }
 311
 312 ////////////////////////////////////////////////////////////////////////////////
 313 // Detects characters in an input string that indicate quotes were required, or
 314 // escapes, to get them past the shell.
 315 bool Lexer::wasQuoted (const std::string& input)
 316 {
 317   if (input.find_first_of (" \t()<>&~") != std::string::npos)
 318     return true;
 319
 320   return false;
 321 }
 322
 323 ////////////////////////////////////////////////////////////////////////////////
 324 bool Lexer::isEOS () const
 325 {
 326   return _cursor >= _eos;
 327 }
 328
 329 ////////////////////////////////////////////////////////////////////////////////
 330 // Converts '0'     -> 0
 331 //          '9'     -> 9
 332 //          'a'/'A' -> 10
 333 //          'f'/'F' -> 15
 334 int Lexer::hexToInt (int c)
 335 {
 336        if (c >= '0' && c <= '9') return (c - '0');
 337   else if (c >= 'a' && c <= 'f') return (c - 'a' + 10);
 338   else                           return (c - 'A' + 10);
 339 }
 340
 341 ////////////////////////////////////////////////////////////////////////////////
 342 int Lexer::hexToInt (int c0, int c1)
 343 {
 344   return (hexToInt (c0) << 4) + hexToInt (c1);
 345 }
 346
 347 ////////////////////////////////////////////////////////////////////////////////
 348 int Lexer::hexToInt (int c0, int c1, int c2, int c3)
 349 {
 350   return (hexToInt (c0) << 12) +
 351          (hexToInt (c1) << 8)  +
 352          (hexToInt (c2) << 4)  +
 353           hexToInt (c3);
 354 }
 355
 356 ////////////////////////////////////////////////////////////////////////////////
 357 std::string Lexer::trimLeft (const std::string& in, const std::string& t /*= " "*/)
 358 {
 359   std::string::size_type ws = in.find_first_not_of (t);
 360   if (ws > 0)
 361   {
 362     std::string out {in};
 363     return out.erase (0, ws);
 364   }
 365
 366   return in;
 367 }
 368
 369 ////////////////////////////////////////////////////////////////////////////////
 370 std::string Lexer::trimRight (const std::string& in, const std::string& t /*= " "*/)
 371 {
 372   std::string out {in};
 373   return out.erase (in.find_last_not_of (t) + 1);
 374 }
 375
 376 ////////////////////////////////////////////////////////////////////////////////
 377 std::string Lexer::trim (const std::string& in, const std::string& t /*= " "*/)
 378 {
 379   return trimLeft (trimRight (in, t), t);
 380 }
 381
 382 ////////////////////////////////////////////////////////////////////////////////
 383 // Lexer::Type::string
 384 //   '|"
 385 //   [ U+XXXX | \uXXXX | \" | \' | \\ | \/ | \b | \f | \n | \r | \t | . ]
 386 //   '|"
 387 bool Lexer::isString (std::string& token, Lexer::Type& type, const std::string& quotes)
 388 {
 389   if (_enableString)
 390   {
 391     std::size_t marker = _cursor;
 392     if (readWord (_text, quotes, marker, token))
 393     {
 394       type = Lexer::Type::string;
 395       _cursor = marker;
 396       return true;
 397     }
 398   }
 399
 400   return false;
 401 }
 402
 403 ////////////////////////////////////////////////////////////////////////////////
 404 // Lexer::Type::date
 405 //   <Datetime> (followed by eos, WS, operator)
 406 bool Lexer::isDate (std::string& token, Lexer::Type& type)
 407 {
 408   if (_enableDate)
 409   {
 410     // Try an ISO date parse.
 411     std::size_t i = _cursor;
 412     Datetime d;
 413     if (d.parse (_text, i, Lexer::dateFormat) &&
 414         (i >= _eos ||
 415          unicodeWhitespace (_text[i]) ||
 416          isSingleCharOperator (_text[i])))
 417     {
 418       type = Lexer::Type::date;
 419       token = _text.substr (_cursor, i - _cursor);
 420       _cursor = i;
 421       return true;
 422     }
 423   }
 424
 425   return false;
 426 }
 427
 428 ////////////////////////////////////////////////////////////////////////////////
 429 // Lexer::Type::duration
 430 //   <Duration> (followed by eos, WS, operator)
 431 bool Lexer::isDuration (std::string& token, Lexer::Type& type)
 432 {
 433   if (_enableDuration)
 434   {
 435     std::size_t marker = _cursor;
 436
 437     std::string extractedToken;
 438     Lexer::Type extractedType;
 439     if (isOperator(extractedToken, extractedType))
 440     {
 441       _cursor = marker;
 442       return false;
 443     }
 444
 445     marker = _cursor;
 446     Duration dur;
 447     if (dur.parse (_text, marker) &&
 448         (marker >= _eos ||
 449          unicodeWhitespace (_text[marker]) ||
 450          isSingleCharOperator (_text[marker])))
 451     {
 452       type = Lexer::Type::duration;
 453       token = _text.substr (_cursor, marker - _cursor);
 454       _cursor = marker;
 455       return true;
 456     }
 457   }
 458
 459   return false;
 460 }
 461
 462 ////////////////////////////////////////////////////////////////////////////////
 463 // Lexer::Type::uuid
 464 //   XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
 465 //   XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXX
 466 //   XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXX
 467 //   XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXX
 468 //   ...
 469 //   XXXXXXXX-XX
 470 //   XXXXXXXX-X
 471 //   XXXXXXXX-
 472 //   XXXXXXXX
 473 //   Followed only by EOS, whitespace, or single character operator.
 474 bool Lexer::isUUID (std::string& token, Lexer::Type& type, bool endBoundary)
 475 {
 476   if (_enableUUID)
 477   {
 478     std::size_t marker = _cursor;
 479
 480     // Greedy.
 481     std::size_t i = 0;
 482     for (; i < 36 && marker + i < _eos; i++)
 483     {
 484       if (uuid_pattern[i] == 'x')
 485       {
 486         if (! unicodeHexDigit (_text[marker + i]))
 487           break;
 488       }
 489       else if (uuid_pattern[i] != _text[marker + i])
 490         break;
 491     }
 492
 493     if (i >= uuid_min_length                   &&
 494         (! endBoundary                         ||
 495          ! _text[marker + i]                   ||
 496          unicodeWhitespace (_text[marker + i]) ||
 497          isSingleCharOperator (_text[marker + i])))
 498     {
 499       token = _text.substr (_cursor, i);
 500       type = Lexer::Type::uuid;
 501       _cursor += i;
 502       return true;
 503     }
 504
 505   }
 506   return false;
 507 }
 508
 509 ////////////////////////////////////////////////////////////////////////////////
 510 // Lexer::Type::hex
 511 //   0xX+
 512 bool Lexer::isHexNumber (std::string& token, Lexer::Type& type)
 513 {
 514   if (_enableHexNumber)
 515   {
 516     std::size_t marker = _cursor;
 517
 518     if (_eos - marker >= 3 &&
 519         _text[marker + 0] == '0' &&
 520         _text[marker + 1] == 'x')
 521     {
 522       marker += 2;
 523
 524       while (unicodeHexDigit (_text[marker]))
 525         ++marker;
 526
 527       if (marker - _cursor > 2)
 528       {
 529         token = _text.substr (_cursor, marker - _cursor);
 530         type = Lexer::Type::hex;
 531         _cursor = marker;
 532         return true;
 533       }
 534     }
 535   }
 536
 537   return false;
 538 }
 539
 540 ////////////////////////////////////////////////////////////////////////////////
 541 // Lexer::Type::word
 542 //   [^\s]+
 543 bool Lexer::isWord (std::string& token, Lexer::Type& type)
 544 {
 545   if (_enableWord)
 546   {
 547     std::size_t marker = _cursor;
 548
 549     while (_text[marker] &&
 550            ! unicodeWhitespace (_text[marker]) &&
 551            (! _enableOperator || ! isSingleCharOperator (_text[marker])))
 552       utf8_next_char (_text, marker);
 553
 554     if (marker > _cursor)
 555     {
 556       token = _text.substr (_cursor, marker - _cursor);
 557       type = Lexer::Type::word;
 558       _cursor = marker;
 559       return true;
 560     }
 561   }
 562
 563   return false;
 564 }
 565
 566 ////////////////////////////////////////////////////////////////////////////////
 567 // Lexer::Type::url
 568 //   http [s] :// ...
 569 bool Lexer::isURL (std::string& token, Lexer::Type& type)
 570 {
 571   if (_enableURL)
 572   {
 573     std::size_t marker = _cursor;
 574
 575     if (_eos - _cursor > 9 &&    // length 'https://*'
 576         (_text[marker + 0] == 'h' || _text[marker + 0] == 'H') &&
 577         (_text[marker + 1] == 't' || _text[marker + 1] == 'T') &&
 578         (_text[marker + 2] == 't' || _text[marker + 2] == 'T') &&
 579         (_text[marker + 3] == 'p' || _text[marker + 3] == 'P'))
 580     {
 581       marker += 4;
 582       if (_text[marker + 0] == 's' || _text[marker + 0] == 'S')
 583         ++marker;
 584
 585       if (_text[marker + 0] == ':' &&
 586           _text[marker + 1] == '/' &&
 587           _text[marker + 2] == '/')
 588       {
 589         marker += 3;
 590
 591         while (marker < _eos &&
 592                ! unicodeWhitespace (_text[marker]))
 593           utf8_next_char (_text, marker);
 594
 595         token = _text.substr (_cursor, marker - _cursor);
 596         type = Lexer::Type::url;
 597         _cursor = marker;
 598         return true;
 599       }
 600     }
 601   }
 602
 603   return false;
 604 }
 605
 606 ////////////////////////////////////////////////////////////////////////////////
 607 // Lexer::Type::path
 608 //   ( / <non-slash, non-whitespace> )+
 609 bool Lexer::isPath (std::string& token, Lexer::Type& type)
 610 {
 611   if (_enablePath)
 612   {
 613     std::size_t marker = _cursor;
 614     int slashCount = 0;
 615
 616     while (1)
 617     {
 618       if (_text[marker] == '/')
 619       {
 620         ++marker;
 621         ++slashCount;
 622       }
 623       else
 624         break;
 625
 626       if (_text[marker] &&
 627           ! unicodeWhitespace (_text[marker]) &&
 628           _text[marker] != '/')
 629       {
 630         utf8_next_char (_text, marker);
 631         while (_text[marker] &&
 632                ! unicodeWhitespace (_text[marker]) &&
 633                _text[marker] != '/')
 634           utf8_next_char (_text, marker);
 635       }
 636       else
 637         break;
 638     }
 639
 640     if (marker > _cursor &&
 641         slashCount > 3)
 642     {
 643       type = Lexer::Type::path;
 644       token = _text.substr (_cursor, marker - _cursor);
 645       _cursor = marker;
 646       return true;
 647     }
 648   }
 649
 650   return false;
 651 }
 652
 653 ////////////////////////////////////////////////////////////////////////////////
 654 // Lexer::Type::pattern
 655 //   / <unquoted-string> /  <EOS> | <unicodeWhitespace>
 656 bool Lexer::isPattern (std::string& token, Lexer::Type& type)
 657 {
 658   if (_enablePattern)
 659   {
 660     std::size_t marker = _cursor;
 661
 662     std::string word;
 663     if (readWord (_text, "/", _cursor, word) &&
 664         (isEOS () ||
 665          unicodeWhitespace (_text[_cursor])))
 666     {
 667       token = _text.substr (marker, _cursor - marker);
 668       type = Lexer::Type::pattern;
 669       return true;
 670     }
 671
 672     _cursor = marker;
 673   }
 674
 675   return false;
 676 }
 677
 678 ////////////////////////////////////////////////////////////////////////////////
 679 // Lexer::Type::op
 680 //   _hastag_ | _notag | _neg_ | _pos_ |
 681 //   <isTripleCharOperator> |
 682 //   <isDoubleCharOperator> |
 683 //   <isSingleCharOperator> |
 684 bool Lexer::isOperator (std::string& token, Lexer::Type& type)
 685 {
 686   if (_enableOperator)
 687   {
 688     std::size_t marker = _cursor;
 689
 690     if (_eos - marker >= 8 && _text.substr (marker, 8) == "_hastag_")
 691     {
 692       marker += 8;
 693       type = Lexer::Type::op;
 694       token = _text.substr (_cursor, marker - _cursor);
 695       _cursor = marker;
 696       return true;
 697     }
 698
 699     else if (_eos - marker >= 7 && _text.substr (marker, 7) == "_notag_")
 700     {
 701       marker += 7;
 702       type = Lexer::Type::op;
 703       token = _text.substr (_cursor, marker - _cursor);
 704       _cursor = marker;
 705       return true;
 706     }
 707
 708     else if (_eos - marker >= 5 && _text.substr (marker, 5) == "_neg_")
 709     {
 710       marker += 5;
 711       type = Lexer::Type::op;
 712       token = _text.substr (_cursor, marker - _cursor);
 713       _cursor = marker;
 714       return true;
 715     }
 716
 717     else if (_eos - marker >= 5 && _text.substr (marker, 5) == "_pos_")
 718     {
 719       marker += 5;
 720       type = Lexer::Type::op;
 721       token = _text.substr (_cursor, marker - _cursor);
 722       _cursor = marker;
 723       return true;
 724     }
 725
 726     else if (_eos - marker >= 3 &&
 727         isTripleCharOperator (_text[marker], _text[marker + 1], _text[marker + 2], _text[marker + 3]))
 728     {
 729       marker += 3;
 730       type = Lexer::Type::op;
 731       token = _text.substr (_cursor, marker - _cursor);
 732       _cursor = marker;
 733       return true;
 734     }
 735
 736     else if (_eos - marker >= 2 &&
 737         isDoubleCharOperator (_text[marker], _text[marker + 1], _text[marker + 2]))
 738     {
 739       marker += 2;
 740       type = Lexer::Type::op;
 741       token = _text.substr (_cursor, marker - _cursor);
 742       _cursor = marker;
 743       return true;
 744     }
 745
 746     else if (isSingleCharOperator (_text[marker]))
 747     {
 748       token = _text[marker];
 749       type = Lexer::Type::op;
 750       _cursor = ++marker;
 751       return true;
 752     }
 753   }
 754
 755   return false;
 756 }
 757
 758 ////////////////////////////////////////////////////////////////////////////////
 759 // Static
 760 std::string Lexer::typeToString (Lexer::Type type)
 761 {
 762        if (type == Lexer::Type::string)       return std::string ("\033[38;5;7m\033[48;5;3m")    + "string"       + "\033[0m";
 763   else if (type == Lexer::Type::uuid)         return std::string ("\033[38;5;7m\033[48;5;10m")   + "uuid"         + "\033[0m";
 764   else if (type == Lexer::Type::hex)          return std::string ("\033[38;5;7m\033[48;5;14m")   + "hex"          + "\033[0m";
 765   else if (type == Lexer::Type::number)       return std::string ("\033[38;5;7m\033[48;5;6m")    + "number"       + "\033[0m";
 766   else if (type == Lexer::Type::url)          return std::string ("\033[38;5;7m\033[48;5;4m")    + "url"          + "\033[0m";
 767   else if (type == Lexer::Type::path)         return std::string ("\033[37;102m")                + "path"         + "\033[0m";
 768   else if (type == Lexer::Type::pattern)      return std::string ("\033[37;42m")                 + "pattern"      + "\033[0m";
 769   else if (type == Lexer::Type::op)           return std::string ("\033[38;5;7m\033[48;5;203m")  + "op"           + "\033[0m";
 770   else if (type == Lexer::Type::word)         return std::string ("\033[38;5;15m\033[48;5;236m") + "word"         + "\033[0m";
 771   else if (type == Lexer::Type::date)         return std::string ("\033[38;5;15m\033[48;5;34m")  + "date"         + "\033[0m";
 772   else if (type == Lexer::Type::duration)     return std::string ("\033[38;5;15m\033[48;5;34m")  + "duration"     + "\033[0m";
 773   else                                        return std::string ("\033[37;41m")                 + "unknown"      + "\033[0m";
 774 }
 775
 776 ////////////////////////////////////////////////////////////////////////////////
 777 // Full implementation of a quoted word.  Includes:
 778 //   '\''
 779 //   '"'
 780 //   "'"
 781 //   "\""
 782 //   'one two'
 783 // Result includes the quotes.
 784 bool Lexer::readWord (
 785   const std::string& text,
 786   const std::string& quotes,
 787   std::string::size_type& cursor,
 788   std::string& word)
 789 {
 790   if (quotes.find (text[cursor]) == std::string::npos)
 791     return false;
 792
 793   std::string::size_type eos = text.length ();
 794   int quote = text[cursor++];
 795   word = quote;
 796
 797   int c;
 798   while ((c = text[cursor]))
 799   {
 800     // Quoted word ends on a quote.
 801     if (quote && quote == c)
 802     {
 803       word += utf8_character (utf8_next_char (text, cursor));
 804       break;
 805     }
 806
 807     // Unicode U+XXXX or \uXXXX codepoint.
 808     else if (eos - cursor >= 6 &&
 809              ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
 810               (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
 811              unicodeHexDigit (text[cursor + 2]) &&
 812              unicodeHexDigit (text[cursor + 3]) &&
 813              unicodeHexDigit (text[cursor + 4]) &&
 814              unicodeHexDigit (text[cursor + 5]))
 815     {
 816       word += utf8_character (
 817                 hexToInt (
 818                   text[cursor + 2],
 819                   text[cursor + 3],
 820                   text[cursor + 4],
 821                   text[cursor + 5]));
 822       cursor += 6;
 823     }
 824
 825     // An escaped thing.
 826     else if (c == '\\')
 827     {
 828       c = text[++cursor];
 829
 830       switch (c)
 831       {
 832       case '"':  word += (char) 0x22; ++cursor; break;
 833       case '\'': word += (char) 0x27; ++cursor; break;
 834       case '\\': word += (char) 0x5C; ++cursor; break;
 835       case 'b':  word += (char) 0x08; ++cursor; break;
 836       case 'f':  word += (char) 0x0C; ++cursor; break;
 837       case 'n':  word += (char) 0x0A; ++cursor; break;
 838       case 'r':  word += (char) 0x0D; ++cursor; break;
 839       case 't':  word += (char) 0x09; ++cursor; break;
 840       case 'v':  word += (char) 0x0B; ++cursor; break;
 841
 842       // This pass-through default case means that anything can be escaped
 843       // harmlessly. In particular 'quote' is included, if it not one of the
 844       // above characters.
 845       default:   word += (char) c;    ++cursor; break;
 846       }
 847     }
 848
 849     // Ordinary character.
 850     else
 851       word += utf8_character (utf8_next_char (text, cursor));
 852   }
 853
 854   // Verify termination.
 855   return word[0]                  == quote &&
 856          word[word.length () - 1] == quote &&
 857          word.length () >= 2;
 858 }
 859
 860 ////////////////////////////////////////////////////////////////////////////////
 861 // Full implementation of an unquoted word.  Includes:
 862 //   one\ two
 863 //   abcU+0020def
 864 //   abc\u0020def
 865 //   a\tb
 866 //
 867 // Ends at:
 868 //   Lexer::isEOS
 869 //   unicodeWhitespace
 870 //   Lexer::isHardBoundary
 871 bool Lexer::readWord (
 872   const std::string& text,
 873   std::string::size_type& cursor,
 874   std::string& word)
 875 {
 876   std::string::size_type eos = text.length ();
 877
 878   word = "";
 879   int c;
 880   int prev = 0;
 881   while ((c = text[cursor]))  // Handles EOS.
 882   {
 883     // Unquoted word ends on white space.
 884     if (unicodeWhitespace (c))
 885       break;
 886
 887     // Parentheses mostly.
 888     if (prev && Lexer::isHardBoundary (prev, c))
 889       break;
 890
 891     // Unicode U+XXXX or \uXXXX codepoint.
 892     else if (eos - cursor >= 6 &&
 893              ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
 894               (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
 895              unicodeHexDigit (text[cursor + 2]) &&
 896              unicodeHexDigit (text[cursor + 3]) &&
 897              unicodeHexDigit (text[cursor + 4]) &&
 898              unicodeHexDigit (text[cursor + 5]))
 899     {
 900       word += utf8_character (
 901                 hexToInt (
 902                   text[cursor + 2],
 903                   text[cursor + 3],
 904                   text[cursor + 4],
 905                   text[cursor + 5]));
 906       cursor += 6;
 907     }
 908
 909     // An escaped thing.
 910     else if (c == '\\')
 911     {
 912       c = text[++cursor];
 913
 914       switch (c)
 915       {
 916       case '"':  word += (char) 0x22; ++cursor; break;
 917       case '\'': word += (char) 0x27; ++cursor; break;
 918       case '\\': word += (char) 0x5C; ++cursor; break;
 919       case 'b':  word += (char) 0x08; ++cursor; break;
 920       case 'f':  word += (char) 0x0C; ++cursor; break;
 921       case 'n':  word += (char) 0x0A; ++cursor; break;
 922       case 'r':  word += (char) 0x0D; ++cursor; break;
 923       case 't':  word += (char) 0x09; ++cursor; break;
 924       case 'v':  word += (char) 0x0B; ++cursor; break;
 925
 926       // This pass-through default case means that anything can be escaped
 927       // harmlessly. In particular 'quote' is included, if it not one of the
 928       // above characters.
 929       default:   word += (char) c;    ++cursor; break;
 930       }
 931     }
 932
 933     // Ordinary character.
 934     else
 935       word += utf8_character (utf8_next_char (text, cursor));
 936
 937     prev = c;
 938   }
 939
 940   return word.length () > 0 ? true : false;
 941 }
 942
 943 ////////////////////////////////////////////////////////////////////////////////