1 ////////////////////////////////////////////////////////////////////////////////
3 // Copyright 2013 - 2017, Paul Beckingham, Federico Hernandez.
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to deal
7 // in the Software without restriction, including without limitation the rights
8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 // copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
12 // The above copyright notice and this permission notice shall be included
13 // in all copies or substantial portions of the Software.
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 // http://www.opensource.org/licenses/mit-license.php
25 ////////////////////////////////////////////////////////////////////////////////
// Template for a full-length UUID: each 'x' stands for one hex digit and each
// '-' must match literally.
static const std::string uuid_pattern {"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"};

// Fewest leading characters of the template that must match before the text
// is accepted as a (possibly abbreviated) UUID.
static const unsigned int uuid_min_length {8};
40 std::string Lexer::dateFormat = "";
42 ////////////////////////////////////////////////////////////////////////////////
// Constructs a Lexer over the given text.
// NOTE(review): the initializer list and body are not visible in this view;
// confirm which members are set up here before relying on this description.
43 Lexer::Lexer (const std::string& text)
49 ////////////////////////////////////////////////////////////////////////////////
// Extracts the next token found at the cursor.
//   token - receives the text of the recognized token.
//   type  - receives the Lexer::Type of the recognized token.
50 // When a Lexer object is constructed with a string, this method walks through
51 // the stream of low-level tokens.
52 bool Lexer::token (std::string& token, Lexer::Type& type)
// Skip any whitespace in front of the next token.
55 while (unicodeWhitespace (_text[_cursor]))
56 utf8_next_char (_text, _cursor);
// Offer the input to each recognizer in turn. The || chain short-circuits, so
// the order below is a priority order: the first recognizer to accept wins.
// The chain continues past the last alternative shown here.
62 if (isString (token, type, "'\"") ||
63 isUUID (token, type, true) ||
64 isDate (token, type) ||
65 isDuration (token, type) ||
66 isURL (token, type) ||
67 isHexNumber (token, type) ||
68 isNumber (token, type) ||
69 isPath (token, type) ||
70 isPattern (token, type) ||
71 isOperator (token, type) ||
78 ////////////////////////////////////////////////////////////////////////////////
// Collects (text, type) tuples for the tokens produced by repeated calls to
// Lexer::token, in the order they are produced.
// NOTE(review): `lexer`, `token` and `type` are declared outside the visible
// lines; presumably `lexer` is constructed from `input` - confirm.
79 std::vector <std::tuple <std::string, Lexer::Type>> Lexer::tokenize (const std::string& input)
81 std::vector <std::tuple <std::string, Lexer::Type>> tokens;
// Keep pulling tokens until the lexer reports that nothing more matches.
86 while (lexer.token (token, type))
87 tokens.push_back (std::make_tuple (token, type));
92 ////////////////////////////////////////////////////////////////////////////////
// Maps a Lexer::Type to its plain lowercase name, e.g. "uuid" or "op".
93 // No L10N - these are for internal purposes.
94 const std::string Lexer::typeName (const Lexer::Type& type)
// One case per token type; the returned name matches the enumerator name.
98 case Lexer::Type::uuid: return "uuid";
99 case Lexer::Type::number: return "number";
100 case Lexer::Type::hex: return "hex";
101 case Lexer::Type::string: return "string";
102 case Lexer::Type::url: return "url";
103 case Lexer::Type::path: return "path";
104 case Lexer::Type::pattern: return "pattern";
105 case Lexer::Type::op: return "op";
106 case Lexer::Type::word: return "word";
107 case Lexer::Type::date: return "date";
108 case Lexer::Type::duration: return "duration";
114 ////////////////////////////////////////////////////////////////////////////////
// Recognizes a decimal number at the cursor: digits, an optional fraction, and
// an optional exponent that may itself carry a sign and a fraction.
// On success token receives the matched text and type is set to number.
115 // Lexer::Type::number
118 // [ e|E [ +|- ] \d+ [ . \d+ ] ]
119 // not followed by non-operator.
120 bool Lexer::isNumber (std::string& token, Lexer::Type& type)
// Scan with a local marker so the scan itself does not move _cursor.
122 std::size_t marker = _cursor;
// A number must start with a digit.
124 if (unicodeLatinDigit (_text[marker]))
// Integer part.
127 while (unicodeLatinDigit (_text[marker]))
128 utf8_next_char (_text, marker);
// Optional fractional part.
130 if (_text[marker] == '.')
133 if (unicodeLatinDigit (_text[marker]))
136 while (unicodeLatinDigit (_text[marker]))
137 utf8_next_char (_text, marker);
// Optional exponent marker.
141 if (_text[marker] == 'e' ||
142 _text[marker] == 'E')
// Optional exponent sign.
146 if (_text[marker] == '+' ||
147 _text[marker] == '-')
// Exponent digits.
150 if (unicodeLatinDigit (_text[marker]))
153 while (unicodeLatinDigit (_text[marker]))
154 utf8_next_char (_text, marker);
// Optional fractional part of the exponent.
156 if (_text[marker] == '.')
159 if (unicodeLatinDigit (_text[marker]))
162 while (unicodeLatinDigit (_text[marker]))
163 utf8_next_char (_text, marker);
169 // Lookahead: !<unicodeWhitespace> | !<isSingleCharOperator>
170 // If there is an immediately consecutive character, that is not an operator, fail.
172 ! unicodeWhitespace (_text[marker]) &&
173 ! isSingleCharOperator (_text[marker]))
// Accept: the token is everything between the cursor and the marker.
176 token = _text.substr (_cursor, marker - _cursor);
177 type = Lexer::Type::number;
185 ////////////////////////////////////////////////////////////////////////////////
// Recognizes a run of Latin digits at the cursor. On success token receives
// the digits and type is set to number (there is no separate integer type).
186 // Lexer::Type::number
188 bool Lexer::isInteger (std::string& token, Lexer::Type& type)
// Scan with a local marker so the scan itself does not move _cursor.
190 std::size_t marker = _cursor;
// An integer must start with a digit.
192 if (unicodeLatinDigit (_text[marker]))
// Consume the remaining digits.
195 while (unicodeLatinDigit (_text[marker]))
196 utf8_next_char (_text, marker);
198 token = _text.substr (_cursor, marker - _cursor);
199 type = Lexer::Type::number;
207 ////////////////////////////////////////////////////////////////////////////////
208 bool Lexer::isSingleCharOperator (int c)
210 return c == '+' || // Addition
211 c == '-' || // Subtraction or unary minus = ambiguous
212 c == '*' || // Multiplication
213 c == '/' || // Diviѕion
214 c == '(' || // Precedence open parenthesis
215 c == ')' || // Precedence close parenthesis
216 c == '<' || // Less than
217 c == '>' || // Greater than
218 c == '^' || // Exponent
219 c == '!' || // Unary not
220 c == '%' || // Modulus
221 c == '=' || // Partial match
222 c == '~'; // Pattern match
225 ////////////////////////////////////////////////////////////////////////////////
226 bool Lexer::isDoubleCharOperator (int c0, int c1, int c2)
228 return (c0 == '=' && c1 == '=') ||
229 (c0 == '!' && c1 == '=') ||
230 (c0 == '<' && c1 == '=') ||
231 (c0 == '>' && c1 == '=') ||
232 (c0 == 'o' && c1 == 'r' && isBoundary (c1, c2)) ||
233 (c0 == '|' && c1 == '|') ||
234 (c0 == '&' && c1 == '&') ||
235 (c0 == '!' && c1 == '~');
238 ////////////////////////////////////////////////////////////////////////////////
239 bool Lexer::isTripleCharOperator (int c0, int c1, int c2, int c3)
241 return (c0 == 'a' && c1 == 'n' && c2 == 'd' && isBoundary (c2, c3)) ||
242 (c0 == 'x' && c1 == 'o' && c2 == 'r' && isBoundary (c2, c3)) ||
243 (c0 == '!' && c1 == '=' && c2 == '=');
246 ////////////////////////////////////////////////////////////////////////////////
// Decides whether a token boundary lies between two adjacent characters.
//   left  - the character before the candidate boundary.
//   right - the character after it.
247 bool Lexer::isBoundary (int left, int right)
// End of string always terminates a token.
250 if (right == '\0') return true;
// A change of character class (letter, digit, whitespace) is a boundary.
253 if (unicodeLatinAlpha (left) != unicodeLatinAlpha (right)) return true;
254 if (unicodeLatinDigit (left) != unicodeLatinDigit (right)) return true;
255 if (unicodeWhitespace (left) != unicodeWhitespace (right)) return true;
// Punctuation on either side is a boundary.
258 if (isPunctuation (left) || isPunctuation (right)) return true;
263 ////////////////////////////////////////////////////////////////////////////////
// Boundary test used by the unquoted readWord to end a word between the
// previous character (left) and the current one (right).
// NOTE(review): the body is not visible in this view; the exact rules need
// confirming against the full source.
264 bool Lexer::isHardBoundary (int left, int right)
270 // FILTER operators that don't need to be surrounded by whitespace.
280 ////////////////////////////////////////////////////////////////////////////////
// Reports whether c counts as punctuation. The visible conditions require c to
// be printable and to be neither a Latin digit nor a Latin letter.
// NOTE(review): further conditions sit between the first and last visible
// terms of this expression; confirm them against the full source.
281 bool Lexer::isPunctuation (int c)
283 return isprint (c) &&
289 ! unicodeLatinDigit (c) &&
290 ! unicodeLatinAlpha (c);
293 ////////////////////////////////////////////////////////////////////////////////
// Strips one pair of matching quotes from around input.
//   input  - the text that may be quoted.
//   quotes - the set of characters accepted as quote characters.
294 // Assumes that quotes is a string containing a non-trivial set of quote characters.
296 std::string Lexer::dequote (const std::string& input, const std::string& quotes)
// Fewer than two characters cannot hold an opening and a closing quote.
298 if (input.length () > 1)
// The first character decides which quote character must close the text.
300 int quote = input[0];
301 if (quotes.find (quote) != std::string::npos)
303 size_t len = input.length ();
// Strip only when the last character is the same quote as the first.
304 if (quote == input[len - 1])
305 return input.substr (1, len - 2);
312 ////////////////////////////////////////////////////////////////////////////////
313 // Detects characters in an input string that indicate quotes were required, or
314 // escapes, to get them past the shell.
315 bool Lexer::wasQuoted (const std::string& input)
// Any of: space, tab, parentheses, angle brackets, ampersand, tilde.
317 if (input.find_first_of (" \t()<>&~") != std::string::npos)
323 ////////////////////////////////////////////////////////////////////////////////
324 bool Lexer::isEOS () const
326 return _cursor >= _eos;
329 ////////////////////////////////////////////////////////////////////////////////
334 int Lexer::hexToInt (int c)
336 if (c >= '0' && c <= '9') return (c - '0');
337 else if (c >= 'a' && c <= 'f') return (c - 'a' + 10);
338 else return (c - 'A' + 10);
341 ////////////////////////////////////////////////////////////////////////////////
342 int Lexer::hexToInt (int c0, int c1)
344 return (hexToInt (c0) << 4) + hexToInt (c1);
347 ////////////////////////////////////////////////////////////////////////////////
// Converts four hex digit characters to a value, most significant digit first:
// c0 is shifted by 12 bits, c1 by 8 and c2 by 4.
// NOTE(review): the final term of the sum is not visible in this view;
// presumably it adds the value of c3 unshifted - confirm.
348 int Lexer::hexToInt (int c0, int c1, int c2, int c3)
350 return (hexToInt (c0) << 12) +
351 (hexToInt (c1) << 8) +
352 (hexToInt (c2) << 4) +
356 ////////////////////////////////////////////////////////////////////////////////
// Returns a copy of in with leading characters that belong to t removed.
//   in - the text to trim.
//   t  - the set of characters to strip.
357 std::string Lexer::trimLeft (const std::string& in, const std::string& t /*= " "*/)
// Index of the first character that is not in t.
359 std::string::size_type ws = in.find_first_not_of (t);
// Work on a copy; the input is const.
362 std::string out {in};
// Drop the first ws characters.
363 return out.erase (0, ws);
369 ////////////////////////////////////////////////////////////////////////////////
370 std::string Lexer::trimRight (const std::string& in, const std::string& t /*= " "*/)
372 std::string out {in};
373 return out.erase (in.find_last_not_of (t) + 1);
376 ////////////////////////////////////////////////////////////////////////////////
377 std::string Lexer::trim (const std::string& in, const std::string& t /*= " "*/)
379 return trimLeft (trimRight (in, t), t);
382 ////////////////////////////////////////////////////////////////////////////////
// Recognizes a quoted string at the cursor by delegating to the quoted
// readWord. quotes is the set of characters accepted as quote characters.
383 // Lexer::Type::string
385 // [ U+XXXX | \uXXXX | \" | \' | \\ | \/ | \b | \f | \n | \r | \t | . ]
387 bool Lexer::isString (std::string& token, Lexer::Type& type, const std::string& quotes)
// Scan with a local marker so a failed read does not move _cursor.
391 std::size_t marker = _cursor;
// readWord writes the word it read straight into token.
392 if (readWord (_text, quotes, marker, token))
394 type = Lexer::Type::string;
403 ////////////////////////////////////////////////////////////////////////////////
// Recognizes a date at the cursor. Parsing uses Lexer::dateFormat.
// NOTE(review): the parser object `d` is declared outside the visible lines.
405 // <Datetime> (followed by eos, WS, operator)
406 bool Lexer::isDate (std::string& token, Lexer::Type& type)
410 // Try an ISO date parse.
// i is handed to the parser and afterwards used as the end of the date text.
411 std::size_t i = _cursor;
413 if (d.parse (_text, i, Lexer::dateFormat) &&
// The date must be followed by one of the permitted terminators.
415 unicodeWhitespace (_text[i]) ||
416 isSingleCharOperator (_text[i])))
418 type = Lexer::Type::date;
419 token = _text.substr (_cursor, i - _cursor);
428 ////////////////////////////////////////////////////////////////////////////////
// Recognizes a duration at the cursor.
// NOTE(review): the parser object `dur` is declared outside the visible lines.
429 // Lexer::Type::duration
430 // <Duration> (followed by eos, WS, operator)
431 bool Lexer::isDuration (std::string& token, Lexer::Type& type)
435 std::size_t marker = _cursor;
// Probe for an operator first, using throwaway outputs so the caller's
// token and type are not touched. What happens on a match is outside the
// visible lines.
437 std::string extractedToken;
438 Lexer::Type extractedType;
439 if (isOperator(extractedToken, extractedType))
// marker is handed to the parser and afterwards used as the end of the
// duration text.
447 if (dur.parse (_text, marker) &&
// The duration must be followed by one of the permitted terminators.
449 unicodeWhitespace (_text[marker]) ||
450 isSingleCharOperator (_text[marker])))
452 type = Lexer::Type::duration;
453 token = _text.substr (_cursor, marker - _cursor);
462 ////////////////////////////////////////////////////////////////////////////////
// Recognizes a full or abbreviated UUID at the cursor by matching the text
// against uuid_pattern one character at a time.
// NOTE(review): the use of endBoundary falls outside the visible lines;
// confirm how it affects the terminator check below.
464 // XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
465 // XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXX
466 // XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXX
467 // XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXX
473 // Followed only by EOS, whitespace, or single character operator.
474 bool Lexer::isUUID (std::string& token, Lexer::Type& type, bool endBoundary)
478 std::size_t marker = _cursor;
// Walk at most 36 characters (the full pattern length) without running past
// the end of the text; i counts the characters examined.
482 for (; i < 36 && marker + i < _eos; i++)
// An 'x' in the pattern demands a hex digit in the text.
484 if (uuid_pattern[i] == 'x')
486 if (! unicodeHexDigit (_text[marker + i]))
// Any other pattern character (the dashes) must match literally.
489 else if (uuid_pattern[i] != _text[marker + i])
// Accept only when enough characters were examined and a terminator follows.
493 if (i >= uuid_min_length &&
495 ! _text[marker + i] ||
496 unicodeWhitespace (_text[marker + i]) ||
497 isSingleCharOperator (_text[marker + i])))
499 token = _text.substr (_cursor, i);
500 type = Lexer::Type::uuid;
509 ////////////////////////////////////////////////////////////////////////////////
// Recognizes a hexadecimal literal ("0x" followed by hex digits) at the cursor.
// Does nothing unless _enableHexNumber is set.
512 bool Lexer::isHexNumber (std::string& token, Lexer::Type& type)
514 if (_enableHexNumber)
516 std::size_t marker = _cursor;
// Need at least three characters: the "0x" prefix plus one more.
518 if (_eos - marker >= 3 &&
519 _text[marker + 0] == '0' &&
520 _text[marker + 1] == 'x')
// Consume hex digits.
// NOTE(review): the statements that advance marker are outside the visible
// lines.
524 while (unicodeHexDigit (_text[marker]))
// More than two characters consumed means something followed the prefix.
527 if (marker - _cursor > 2)
529 token = _text.substr (_cursor, marker - _cursor);
530 type = Lexer::Type::hex;
540 ////////////////////////////////////////////////////////////////////////////////
// Recognizes a word at the cursor: a run of characters up to the next
// whitespace, the end of the text, or a single-character operator (the
// operator only stops the word while _enableOperator is set).
543 bool Lexer::isWord (std::string& token, Lexer::Type& type)
547 std::size_t marker = _cursor;
// Advance while the current character can still belong to the word.
549 while (_text[marker] &&
550 ! unicodeWhitespace (_text[marker]) &&
551 (! _enableOperator || ! isSingleCharOperator (_text[marker])))
552 utf8_next_char (_text, marker);
// Accept only a non-empty word.
554 if (marker > _cursor)
556 token = _text.substr (_cursor, marker - _cursor);
557 type = Lexer::Type::word;
566 ////////////////////////////////////////////////////////////////////////////////
// Recognizes an http:// or https:// URL at the cursor. The scheme letters are
// matched case-insensitively and the URL runs to the next whitespace or the
// end of the text.
// NOTE(review): the guard below needs more than 9 remaining characters, so an
// input such as "http://a" is never classified as a URL - confirm intent.
// NOTE(review): the statements that advance marker past the scheme and the
// "://" are outside the visible lines.
569 bool Lexer::isURL (std::string& token, Lexer::Type& type)
573 std::size_t marker = _cursor;
575 if (_eos - _cursor > 9 && // length 'https://*'
576 (_text[marker + 0] == 'h' || _text[marker + 0] == 'H') &&
577 (_text[marker + 1] == 't' || _text[marker + 1] == 'T') &&
578 (_text[marker + 2] == 't' || _text[marker + 2] == 'T') &&
579 (_text[marker + 3] == 'p' || _text[marker + 3] == 'P'))
// Optional 's' for https.
582 if (_text[marker + 0] == 's' || _text[marker + 0] == 'S')
// Mandatory "://" separator.
585 if (_text[marker + 0] == ':' &&
586 _text[marker + 1] == '/' &&
587 _text[marker + 2] == '/')
// Everything up to the next whitespace belongs to the URL.
591 while (marker < _eos &&
592 ! unicodeWhitespace (_text[marker]))
593 utf8_next_char (_text, marker);
595 token = _text.substr (_cursor, marker - _cursor);
596 type = Lexer::Type::url;
606 ////////////////////////////////////////////////////////////////////////////////
// Recognizes a path at the cursor: one or more segments, each made of a '/'
// followed by characters that are neither '/' nor whitespace.
// NOTE(review): the loop structure and part of the acceptance condition are
// outside the visible lines.
608 // ( / <non-slash, non-whitespace> )+
609 bool Lexer::isPath (std::string& token, Lexer::Type& type)
613 std::size_t marker = _cursor;
// Each segment starts with a slash.
618 if (_text[marker] == '/')
// The slash must be followed by at least one segment character.
627 ! unicodeWhitespace (_text[marker]) &&
628 _text[marker] != '/')
630 utf8_next_char (_text, marker);
// Consume the rest of the segment.
631 while (_text[marker] &&
632 ! unicodeWhitespace (_text[marker]) &&
633 _text[marker] != '/')
634 utf8_next_char (_text, marker);
// Accept only when something was consumed.
640 if (marker > _cursor &&
643 type = Lexer::Type::path;
644 token = _text.substr (_cursor, marker - _cursor);
653 ////////////////////////////////////////////////////////////////////////////////
// Recognizes a /pattern/ at the cursor by reading a word quoted with '/'.
// Unlike the other recognizers shown here, this one lets readWord advance
// _cursor directly and keeps the starting position in marker.
654 // Lexer::Type::pattern
655 // / <unquoted-string> / <EOS> | <unicodeWhitespace>
656 bool Lexer::isPattern (std::string& token, Lexer::Type& type)
// Remember where the pattern starts.
660 std::size_t marker = _cursor;
663 if (readWord (_text, "/", _cursor, word) &&
665 unicodeWhitespace (_text[_cursor])))
// The token is the raw text between the start and the new cursor position.
667 token = _text.substr (marker, _cursor - marker);
668 type = Lexer::Type::pattern;
678 ////////////////////////////////////////////////////////////////////////////////
// Recognizes an operator at the cursor. Candidates are tried longest first:
// the named operators, then three-, two- and one-character operators.
// NOTE(review): the statements that advance marker in each branch are outside
// the visible lines.
680 // _hastag_ | _notag_ | _neg_ | _pos_ |
681 // <isTripleCharOperator> |
682 // <isDoubleCharOperator> |
683 // <isSingleCharOperator> |
684 bool Lexer::isOperator (std::string& token, Lexer::Type& type)
688 std::size_t marker = _cursor;
// Named operators; each check first makes sure enough text remains.
690 if (_eos - marker >= 8 && _text.substr (marker, 8) == "_hastag_")
693 type = Lexer::Type::op;
694 token = _text.substr (_cursor, marker - _cursor);
699 else if (_eos - marker >= 7 && _text.substr (marker, 7) == "_notag_")
702 type = Lexer::Type::op;
703 token = _text.substr (_cursor, marker - _cursor);
708 else if (_eos - marker >= 5 && _text.substr (marker, 5) == "_neg_")
711 type = Lexer::Type::op;
712 token = _text.substr (_cursor, marker - _cursor);
717 else if (_eos - marker >= 5 && _text.substr (marker, 5) == "_pos_")
720 type = Lexer::Type::op;
721 token = _text.substr (_cursor, marker - _cursor);
// Three-character operators; the fourth character is passed for the
// boundary check.
726 else if (_eos - marker >= 3 &&
727 isTripleCharOperator (_text[marker], _text[marker + 1], _text[marker + 2], _text[marker + 3]))
730 type = Lexer::Type::op;
731 token = _text.substr (_cursor, marker - _cursor);
// Two-character operators; the third character is passed for the boundary
// check.
736 else if (_eos - marker >= 2 &&
737 isDoubleCharOperator (_text[marker], _text[marker + 1], _text[marker + 2]))
740 type = Lexer::Type::op;
741 token = _text.substr (_cursor, marker - _cursor);
// Single-character operators: the token is that one character.
746 else if (isSingleCharOperator (_text[marker]))
748 token = _text[marker];
749 type = Lexer::Type::op;
758 ////////////////////////////////////////////////////////////////////////////////
760 std::string Lexer::typeToString (Lexer::Type type)
762 if (type == Lexer::Type::string) return std::string ("\033[38;5;7m\033[48;5;3m") + "string" + "\033[0m";
763 else if (type == Lexer::Type::uuid) return std::string ("\033[38;5;7m\033[48;5;10m") + "uuid" + "\033[0m";
764 else if (type == Lexer::Type::hex) return std::string ("\033[38;5;7m\033[48;5;14m") + "hex" + "\033[0m";
765 else if (type == Lexer::Type::number) return std::string ("\033[38;5;7m\033[48;5;6m") + "number" + "\033[0m";
766 else if (type == Lexer::Type::url) return std::string ("\033[38;5;7m\033[48;5;4m") + "url" + "\033[0m";
767 else if (type == Lexer::Type::path) return std::string ("\033[37;102m") + "path" + "\033[0m";
768 else if (type == Lexer::Type::pattern) return std::string ("\033[37;42m") + "pattern" + "\033[0m";
769 else if (type == Lexer::Type::op) return std::string ("\033[38;5;7m\033[48;5;203m") + "op" + "\033[0m";
770 else if (type == Lexer::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m";
771 else if (type == Lexer::Type::date) return std::string ("\033[38;5;15m\033[48;5;34m") + "date" + "\033[0m";
772 else if (type == Lexer::Type::duration) return std::string ("\033[38;5;15m\033[48;5;34m") + "duration" + "\033[0m";
773 else return std::string ("\033[37;41m") + "unknown" + "\033[0m";
776 ////////////////////////////////////////////////////////////////////////////////
// Reads a quoted word from text starting at cursor.
//   text   - the input being scanned.
//   quotes - the set of characters accepted as the opening quote.
//   cursor - position of the opening quote; advanced as characters are read.
// NOTE(review): the output parameter `word` and the local `c` are declared
// outside the visible lines.
777 // Full implementation of a quoted word. Includes:
783 // Result includes the quotes.
784 bool Lexer::readWord (
785 const std::string& text,
786 const std::string& quotes,
787 std::string::size_type& cursor,
// The word must begin with one of the accepted quote characters.
790 if (quotes.find (text[cursor]) == std::string::npos)
793 std::string::size_type eos = text.length ();
// Remember which quote opened the word and step past it.
794 int quote = text[cursor++];
// Loop until the terminating NUL of the string is reached.
798 while ((c = text[cursor]))
800 // Quoted word ends on a quote.
801 if (quote && quote == c)
803 word += utf8_character (utf8_next_char (text, cursor));
807 // Unicode U+XXXX or \uXXXX codepoint.
// Needs six characters: the two-character prefix plus four hex digits.
808 else if (eos - cursor >= 6 &&
809 ((text[cursor + 0] == 'U' && text[cursor + 1] == '+') ||
810 (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
811 unicodeHexDigit (text[cursor + 2]) &&
812 unicodeHexDigit (text[cursor + 3]) &&
813 unicodeHexDigit (text[cursor + 4]) &&
814 unicodeHexDigit (text[cursor + 5]))
816 word += utf8_character (
// Single-character escapes: append the byte the escape stands for and step
// past the escape letter.
832 case '"': word += (char) 0x22; ++cursor; break;
833 case '\'': word += (char) 0x27; ++cursor; break;
834 case '\\': word += (char) 0x5C; ++cursor; break;
835 case 'b': word += (char) 0x08; ++cursor; break;
836 case 'f': word += (char) 0x0C; ++cursor; break;
837 case 'n': word += (char) 0x0A; ++cursor; break;
838 case 'r': word += (char) 0x0D; ++cursor; break;
839 case 't': word += (char) 0x09; ++cursor; break;
840 case 'v': word += (char) 0x0B; ++cursor; break;
842 // This pass-through default case means that anything can be escaped
843 // harmlessly. In particular 'quote' is included, if it is not one of the
845 default: word += (char) c; ++cursor; break;
849 // Ordinary character.
851 word += utf8_character (utf8_next_char (text, cursor));
854 // Verify termination.
// The collected word must both start and end with the opening quote.
855 return word[0] == quote &&
856 word[word.length () - 1] == quote &&
860 ////////////////////////////////////////////////////////////////////////////////
// Reads an unquoted word from text starting at cursor.
//   text   - the input being scanned.
//   cursor - start position; advanced as characters are read.
// Returns true when at least one character was collected.
// NOTE(review): the output parameter `word` and the locals `c` and `prev` are
// declared outside the visible lines.
861 // Full implementation of an unquoted word. Includes:
870 // Lexer::isHardBoundary
871 bool Lexer::readWord (
872 const std::string& text,
873 std::string::size_type& cursor,
876 std::string::size_type eos = text.length ();
881 while ((c = text[cursor])) // Handles EOS.
883 // Unquoted word ends on white space.
884 if (unicodeWhitespace (c))
887 // Parentheses mostly.
// Checked only once a previous character exists.
888 if (prev && Lexer::isHardBoundary (prev, c))
891 // Unicode U+XXXX or \uXXXX codepoint.
// Needs six characters: the two-character prefix plus four hex digits.
892 else if (eos - cursor >= 6 &&
893 ((text[cursor + 0] == 'U' && text[cursor + 1] == '+') ||
894 (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
895 unicodeHexDigit (text[cursor + 2]) &&
896 unicodeHexDigit (text[cursor + 3]) &&
897 unicodeHexDigit (text[cursor + 4]) &&
898 unicodeHexDigit (text[cursor + 5]))
900 word += utf8_character (
// Single-character escapes: append the byte the escape stands for and step
// past the escape letter.
916 case '"': word += (char) 0x22; ++cursor; break;
917 case '\'': word += (char) 0x27; ++cursor; break;
918 case '\\': word += (char) 0x5C; ++cursor; break;
919 case 'b': word += (char) 0x08; ++cursor; break;
920 case 'f': word += (char) 0x0C; ++cursor; break;
921 case 'n': word += (char) 0x0A; ++cursor; break;
922 case 'r': word += (char) 0x0D; ++cursor; break;
923 case 't': word += (char) 0x09; ++cursor; break;
924 case 'v': word += (char) 0x0B; ++cursor; break;
926 // This pass-through default case means that anything can be escaped
927 // harmlessly. In particular 'quote' is included, if it is not one of the
929 default: word += (char) c; ++cursor; break;
933 // Ordinary character.
935 word += utf8_character (utf8_next_char (text, cursor));
// An empty result means nothing was read.
940 return word.length () > 0 ? true : false;
943 ////////////////////////////////////////////////////////////////////////////////