1 ////////////////////////////////////////////////////////////////////////////////
3 // Copyright 2015 - 2017, Paul Beckingham, Federico Hernandez.
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to deal
7 // in the Software without restriction, including without limitation the rights
8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 // copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
12 // The above copyright notice and this permission notice shall be included
13 // in all copies or substantial portions of the Software.
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 // http://www.opensource.org/licenses/mit-license.php
25 ////////////////////////////////////////////////////////////////////////////////
35 int Packrat::minimumMatchLength = 3;
37 ////////////////////////////////////////////////////////////////////////////////
38 void Packrat::debug ()
43 ////////////////////////////////////////////////////////////////////////////////
44 // Walk the grammar tree to parse the input text, resulting in a parse tree.
45 void Packrat::parse (const PEG& peg, const std::string& input)
47 // Used to walk the grammar tree.
48 // Note there is only one rule at the top of the syntax tree, which was the
50 _syntax = peg.syntax ();
51 _tree->_name = peg.firstRule ();
53 // The pig that will be sent down the pipe.
56 std::cout << "trace " << pig.dump () << "\n";
58 // Match the first rule. Recursion does the rest.
59 if (! matchRule (_tree->_name, pig, _tree, 0))
60 throw std::string ("Parse failed.");
63 throw format ("Parse failed - extra character at position {1}.", pig.cursor ());
66 ////////////////////////////////////////////////////////////////////////////////
67 void Packrat::entity (const std::string& category, const std::string& name)
69 // Walk the list of entities for category.
70 auto c = _entities.equal_range (category);
71 for (auto e = c.first; e != c.second; ++e)
72 if (e->second == name)
75 // The category/name pair was not found, therefore add it.
76 _entities.insert (std::pair <std::string, std::string> (category, name));
79 ////////////////////////////////////////////////////////////////////////////////
80 void Packrat::external (
81 const std::string& rule,
82 bool (*fn)(Pig&, std::shared_ptr <Tree>))
84 if (_externals.find (rule) != _externals.end ())
85 throw format ("There is already an external parser defined for rule '{1}'.", rule);
87 _externals[rule] = fn;
90 ////////////////////////////////////////////////////////////////////////////////
91 // If there is a match, pig advances further down the pipe.
92 bool Packrat::matchRule (
93 const std::string& rule,
95 std::shared_ptr <Tree> parseTree,
99 std::cout << "trace " << std::string (indent, ' ') << "matchRule " << rule << "\n";
100 auto checkpoint = pig.cursor ();
102 for (const auto& production : _syntax.find (rule)->second)
103 if (matchProduction (production, pig, parseTree, indent + 1))
106 pig.restoreTo (checkpoint);
110 ////////////////////////////////////////////////////////////////////////////////
111 bool Packrat::matchProduction (
112 const PEG::Production& production,
114 std::shared_ptr <Tree> parseTree,
118 std::cout << "trace " << std::string (indent, ' ') << "matchProduction\n";
119 auto checkpoint = pig.cursor ();
121 auto collector = std::make_shared <Tree> ();
122 for (const auto& token : production)
124 auto b = std::make_shared <Tree> ();
125 if (! matchTokenQuant (token, pig, b, indent + 1))
127 pig.restoreTo (checkpoint);
131 // Accumulate branches.
132 collector->addBranch (b);
135 // On success transfer all sub-branches.
136 for (auto& b : collector->_branches)
137 for (auto sub : b->_branches)
138 parseTree->addBranch (sub);
143 ////////////////////////////////////////////////////////////////////////////////
144 // Wraps calls to matchTokenLookahead, while properly handling the quantifier.
145 bool Packrat::matchTokenQuant (
146 const PEG::Token& token,
148 std::shared_ptr <Tree> parseTree,
152 std::cout << "trace " << std::string (indent, ' ') << "matchTokenQuant " << token.dump () << "\n";
154 // Must match exactly once, so run once and return the result.
155 if (token._quantifier == PEG::Token::Quantifier::one)
157 return matchTokenLookahead (token, pig, parseTree, indent + 1);
160 // May match zero or one time. If it matches, the cursor will be advanced.
161 // If it fails, the cursor will not be advanced, but this is still considered
162 // successful. Return true either way, but backtrack the cursor on failure.
163 else if (token._quantifier == PEG::Token::Quantifier::zero_or_one)
165 // Check for a single match, succeed anyway.
166 matchTokenLookahead (token, pig, parseTree, indent + 1);
168 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch ?
\e[0m " << token.dump () << "\n";
170 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
174 // May match 1 or more times. If it matches on the first attempt, continue
175 // to greedily match until it fails. If it fails on the first attempt, then
177 else if (token._quantifier == PEG::Token::Quantifier::one_or_more)
179 if (! matchTokenLookahead (token, pig, parseTree, indent + 1))
182 while (matchTokenLookahead (token, pig, parseTree, indent + 1))
184 // "Forget it, he's rolling."
188 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch +
\e[0m " << token.dump () << "\n";
190 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
194 // May match zero or more times. Keep calling while there are matches, and
195 // return true always. Backtrack the cursor on failure.
196 else if (token._quantifier == PEG::Token::Quantifier::zero_or_more)
198 while (matchTokenLookahead (token, pig, parseTree, indent + 1))
204 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch *
\e[0m " << token.dump () << "\n";
206 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
210 throw std::string ("This should never happen.");
214 ////////////////////////////////////////////////////////////////////////////////
215 // Wraps calls to matchToken, while properly handling lookahead.
216 bool Packrat::matchTokenLookahead (
217 const PEG::Token& token,
219 std::shared_ptr <Tree> parseTree,
223 std::cout << "trace " << std::string (indent, ' ') << "matchTokenLookahead " << token.dump () << "\n";
225 if (token._lookahead == PEG::Token::Lookahead::none)
227 return matchToken (token, pig, parseTree, indent + 1);
229 else if (token._lookahead == PEG::Token::Lookahead::positive)
231 auto checkpoint = pig.cursor ();
232 auto b = std::make_shared <Tree> ();
233 if (matchToken (token, pig, b, indent + 1))
235 pig.restoreTo (checkpoint);
239 else if (token._lookahead == PEG::Token::Lookahead::negative)
241 auto checkpoint = pig.cursor ();
242 auto b = std::make_shared <Tree> ();
243 if (! matchToken (token, pig, b, indent + 1))
248 pig.restoreTo (checkpoint);
254 ////////////////////////////////////////////////////////////////////////////////
255 bool Packrat::matchToken (
256 const PEG::Token& token,
258 std::shared_ptr <Tree> parseTree,
262 std::cout << "trace " << std::string (indent, ' ') << "matchToken " << token.dump () << "\n";
264 auto checkpoint = pig.cursor ();
265 auto b = std::make_shared <Tree> ();
267 if (token.hasTag ("intrinsic") &&
268 matchIntrinsic (token, pig, parseTree, indent + 1))
273 else if (_syntax.find (token._token) != _syntax.end () &&
274 matchRule (token._token, pig, b, indent + 1))
276 // This is the only case that adds a sub-branch.
277 b->_name = token._token;
278 parseTree->addBranch (b);
282 else if (token.hasTag ("literal") &&
283 token.hasTag ("character") &&
284 matchCharLiteral (token, pig, parseTree, indent + 1))
289 else if (token.hasTag ("literal") &&
290 token.hasTag ("string") &&
291 matchStringLiteral (token, pig, parseTree, indent + 1))
296 pig.restoreTo (checkpoint);
300 ////////////////////////////////////////////////////////////////////////////////
301 // Supports the following:
302 // <digit> --> unicodeLatinDigit
303 // <hex> --> unicodeHexDigit
304 // <character> --> anything
305 // <alpha> --> unicodeAlpha
306 // <punct> --> unicodePunctuation
307 // <ws> --> unicodeWhitespace
308 // <sep> --> unicodeHorizontalWhitespace
309 // <eol> --> unicodeVerticalWhitespace
310 // <word> --> <alpha>
311 // <token> --> consecutive non <ws>
312 // <entity:e> --> Any category 'e' token
313 // <external:x> --> Delegate to external function
314 bool Packrat::matchIntrinsic (
315 const PEG::Token& token,
317 std::shared_ptr <Tree> parseTree,
321 std::cout << "trace " << std::string (indent, ' ') << "matchIntrinsic " << token.dump () << "\n";
322 auto checkpoint = pig.cursor ();
324 // There are only 10 digits.
325 if (token._token == "<digit>")
328 if (pig.getDigit (digit))
330 // Create a populated branch.
331 auto b = std::make_shared <Tree> ();
332 b->_name = "intrinsic";
333 b->attribute ("expected", token._token);
334 b->attribute ("value", format ("{1}", digit));
335 parseTree->addBranch (b);
338 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << digit << "\n";
340 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
345 // Upper or lower case hex digit.
346 if (token._token == "<hex>")
349 if (pig.getHexDigit (digit))
351 // Create a populated branch.
352 auto b = std::make_shared <Tree> ();
353 b->_name = "intrinsic";
354 b->attribute ("expected", token._token);
355 b->attribute ("value", format ("{1}", digit));
356 parseTree->addBranch (b);
359 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << digit << "\n";
361 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
366 // Character means anything.
367 else if (token._token == "<character>")
370 if (pig.getCharacter (character))
372 // Create a populated branch.
373 auto b = std::make_shared <Tree> ();
374 b->_name = "intrinsic";
375 b->attribute ("expected", token._token);
376 b->attribute ("value", format ("{1}", character));
377 parseTree->addBranch (b);
380 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << character << "\n";
382 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
388 else if (token._token == "<punct>")
390 int character = pig.peek ();
391 if (unicodePunctuation (character))
393 pig.skip (character);
395 // Create a populated branch.
396 auto b = std::make_shared <Tree> ();
397 b->_name = "intrinsic";
398 b->attribute ("expected", token._token);
399 b->attribute ("value", format ("{1}", character));
400 parseTree->addBranch (b);
403 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << character << "\n";
405 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
411 else if (token._token == "<alpha>")
413 int character = pig.peek ();
414 if (unicodeAlpha (character))
416 pig.skip (character);
418 // Create a populated branch.
419 auto b = std::make_shared <Tree> ();
420 b->_name = "intrinsic";
421 b->attribute ("expected", token._token);
422 b->attribute ("value", format ("{1}", character));
423 parseTree->addBranch (b);
426 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << character << "\n";
428 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
434 else if (token._token == "<ws>")
436 int character = pig.peek ();
437 if (unicodeWhitespace (character))
439 pig.skip (character);
441 // Create a populated branch.
442 auto b = std::make_shared <Tree> ();
443 b->_name = "intrinsic";
444 b->attribute ("expected", token._token);
445 b->attribute ("value", format ("{1}", character));
446 parseTree->addBranch (b);
449 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << character << "\n";
451 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
457 else if (token._token == "<sep>")
459 int character = pig.peek ();
460 if (unicodeHorizontalWhitespace (character))
462 pig.skip (character);
464 // Create a populated branch.
465 auto b = std::make_shared <Tree> ();
466 b->_name = "intrinsic";
467 b->attribute ("expected", token._token);
468 b->attribute ("value", format ("{1}", character));
469 parseTree->addBranch (b);
472 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << character << "\n";
474 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
480 else if (token._token == "<eol>")
482 int character = pig.peek ();
483 if (unicodeVerticalWhitespace (character))
485 pig.skip (character);
487 // Create a populated branch.
488 auto b = std::make_shared <Tree> ();
489 b->_name = "intrinsic";
490 b->attribute ("expected", token._token);
491 b->attribute ("value", format ("{1}", character));
492 parseTree->addBranch (b);
495 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << character << "\n";
497 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
502 // <word> consecutive non-<ws>, non-<punct>.
503 else if (token._token == "<word>")
505 while (auto character = pig.peek ())
508 unicodeWhitespace (character) ||
509 unicodePunctuation (character))
512 pig.skip (character);
515 if (pig.cursor () > checkpoint)
517 auto word = pig.substr (checkpoint, pig.cursor () - checkpoint + 1);
519 // Create a populated branch.
520 auto b = std::make_shared <Tree> ();
521 b->_name = "intrinsic";
522 b->attribute ("expected", token._token);
523 b->attribute ("value", word);
524 parseTree->addBranch (b);
527 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << word << "\n";
529 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
534 // <token> consecutive non-<ws>.
535 else if (token._token == "<token>")
537 while (auto character = pig.peek ())
540 unicodeWhitespace (character))
543 pig.skip (character);
546 if (pig.cursor () > checkpoint)
548 auto word = pig.substr (checkpoint, pig.cursor () - checkpoint);
550 // Create a populated branch.
551 auto b = std::make_shared <Tree> ();
552 b->_name = "intrinsic";
553 b->attribute ("expected", token._token);
554 b->attribute ("value", word);
555 parseTree->addBranch (b);
558 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << word << "\n";
560 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
565 // <entity:category>.
566 else if (token._token.find ("<entity:") == 0)
568 // Extract entity category
569 auto category = token._token.substr (8, token._token.length () - 9);
571 // Match against any one of the entity values in this category.
572 auto values = _entities.equal_range (category);
573 for (auto value = values.first; value != values.second; ++value)
575 if (pig.skipLiteral (value->second))
577 // Create a populated branch.
578 auto b = std::make_shared <Tree> ();
579 b->_name = "intrinsic";
581 b->attribute ("expected", token._token);
582 b->attribute ("value", value->second);
583 parseTree->addBranch (b);
586 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << value->second << "\n";
588 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
596 else if (token._token.find ("<external:") == 0)
598 // Extract entity category
599 auto rule = token._token.substr (10, token._token.length () - 11);
601 // Any rule can be overridden by an external parser.
602 if (_externals.find (rule) != _externals.end ())
604 // Create a pre-populated branch, which is attached on success only.
605 auto newBranch = std::make_shared <Tree> ();
606 newBranch->_name = "intrinsic";
607 newBranch->tag ("external");
608 newBranch->attribute ("expected", token._token);
610 if (_externals[rule] (pig, newBranch))
612 // Determine what was parsed.
613 auto word = pig.substr (checkpoint, pig.cursor () - checkpoint);
615 // Attach the new branch.
616 newBranch->attribute ("value", word);
617 parseTree->addBranch (newBranch);
620 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << word << "\n";
622 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
627 // Note: Branch 'newBranch' goes out of scope here if parsing fails.
632 std::cout << "trace " << std::string (indent, ' ') << "
\e[31mfail
\e[0m " << token._token << "\n";
633 pig.restoreTo (checkpoint);
637 ////////////////////////////////////////////////////////////////////////////////
638 bool Packrat::matchCharLiteral (
639 const PEG::Token& token,
641 std::shared_ptr <Tree> parseTree,
645 std::cout << "trace " << std::string (indent, ' ') << "matchCharLiteral " << token.dump () << "\n";
646 auto checkpoint = pig.cursor ();
648 if (token._token.length () >= 3 &&
649 token._token[0] == '\'' &&
650 token._token[2] == '\'')
652 int literal = token._token[1];
653 if (pig.skip (literal))
655 // Create a populated branch.
656 auto b = std::make_shared <Tree> ();
657 b->_name = "charLiteral";
658 b->attribute ("expected", token._token);
659 b->attribute ("value", utf8_character (literal));
660 parseTree->addBranch (b);
663 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << token._token << "\n";
665 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
671 std::cout << "trace " << std::string (indent, ' ') << "
\e[31mfail
\e[0m " << token._token << "\n";
672 pig.restoreTo (checkpoint);
676 ////////////////////////////////////////////////////////////////////////////////
677 bool Packrat::matchStringLiteral (
678 const PEG::Token& token,
680 std::shared_ptr <Tree> parseTree,
684 std::cout << "trace " << std::string (indent, ' ') << "matchStringLiteral " << token.dump () << "\n";
685 auto checkpoint = pig.cursor ();
687 std::string literal = token._token.substr (1, token._token.length () - 2);
688 if (pig.skipLiteral (literal))
690 // Create a populated branch.
691 auto b = std::make_shared <Tree> ();
692 b->_name = "stringLiteral";
693 b->attribute ("expected", token._token);
694 b->attribute ("value", literal);
695 parseTree->addBranch (b);
698 std::cout << "trace " << std::string (indent, ' ') << "
\e[32mmatch
\e[0m " << literal << "\n";
700 std::cout << "trace " << pig.dump () << ' ' << token.dump () << "\n";
705 std::cout << "trace " << std::string (indent, ' ') << "
\e[31mfail
\e[0m " << token._token << "\n";
706 pig.restoreTo (checkpoint);
710 ////////////////////////////////////////////////////////////////////////////////
711 // Search for 'value' in _entities category, return canonicalized value.
712 bool Packrat::canonicalize (
713 std::string& canonicalized,
714 const std::string& category,
715 const std::string& value) const
717 // Extract a list of entities for category.
718 std::vector <std::string> options;
719 auto c = _entities.equal_range (category);
720 for (auto e = c.first; e != c.second; ++e)
722 // Shortcut: if an exact match is found, success.
723 if (value == e->second)
725 canonicalized = value;
729 options.push_back (e->second);
732 // Match against the options, throw away results.
733 std::vector <std::string> matches;
734 if (autoComplete (value, options, matches, minimumMatchLength) == 1)
736 canonicalized = matches[0];
743 ////////////////////////////////////////////////////////////////////////////////
744 std::string Packrat::dump () const
746 std::stringstream out;
750 out << "Packrat Parse "
753 if (_entities.size ())
755 out << " Entities\n";
756 for (const auto& entity : _entities)
757 out << " " << entity.first << ':' << entity.second << '\n';
760 if (_externals.size ())
762 out << " Externals\n";
763 for (const auto& external : _externals)
764 out << " " << external.first << "\n";
770 ////////////////////////////////////////////////////////////////////////////////