1 ////////////////////////////////////////////////////////////////////////////////
3 // Copyright 2015 - 2017, Paul Beckingham, Federico Hernandez.
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to deal
7 // in the Software without restriction, including without limitation the rights
8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 // copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
12 // The above copyright notice and this permission notice shall be included
13 // in all copies or substantial portions of the Software.
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 // http://www.opensource.org/licenses/mit-license.php
25 ////////////////////////////////////////////////////////////////////////////////
34 ////////////////////////////////////////////////////////////////////////////////
35 std::string PEG::Token::dump () const
37 std::stringstream out;
38 out << (_lookahead == Token::Lookahead::positive ? "
\e[34m&
\e[0m" :
39 _lookahead == Token::Lookahead::negative ? "
\e[34m!
\e[0m" : "")
41 << (_quantifier == Token::Quantifier::zero_or_one ? "
\e[34m?
\e[0m" :
42 _quantifier == Token::Quantifier::one_or_more ? "
\e[34m+
\e[0m" :
43 _quantifier == Token::Quantifier::zero_or_more ? "
\e[34m*
\e[0m" : "");
45 for (const auto& tag : _tags)
46 out << "
\e[34m" << tag << "
\e[0m";
51 ////////////////////////////////////////////////////////////////////////////////
52 void PEG::loadFromFile (File& file)
55 throw format ("PEG file '{1}' not found.", file._data);
60 // TODO Instead of simply reading a file, read a file and allow lines that
61 // match /^include <path>$/ to represent nested files.
63 loadFromString (contents);
66 ////////////////////////////////////////////////////////////////////////////////
67 // Load and parse PEG.
70 // rule-name: alternate1-token1 alternate1-token2
73 // - Rules are aligned at left margin only, followed by colon.
74 // - Productions are indented and never at left margin.
75 // - Blank lines delineate rules.
78 // - String literals are always double-quoted.
79 // - Character literals are alをays single-quoted.
80 // - "*", "+" and "?" suffixes have POSIX wildcard semantics.
82 void PEG::loadFromString (const std::string& input)
84 // This is a state machine. Read each line.
85 std::string rule_name = "";
86 for (auto& line : loadImports (split (input, '\n')))
90 // Eliminate inline comments.
91 auto hash = line.find ('#');
92 if (hash != std::string::npos)
105 // Skip blank lines with no semantics.
106 if (line == "" and rule_name == "")
113 // Instantiate and configure the Lexer.
126 while (l.token (token, type))
130 // Rule definitions end in a colon.
131 if (token.back () == ':')
133 // Capture the Rule_name.
134 rule_name = token.substr (0, token.size () - 1);
136 // If this is the first Rule, capture it as a starting point.
140 _rules[rule_name] = PEG::Rule ();
143 // Production definition.
146 // If no Production was added yet, add one.
147 if (token_count <= 1)
148 _rules[rule_name].push_back (PEG::Production ());
150 // Decorate the token, if necessary.
151 std::string::size_type start = 0;
152 std::string::size_type end = token.length ();
154 auto q = Token::Quantifier::one;
155 auto l = Token::Lookahead::none;
157 if (token.back () == '?')
159 q = Token::Quantifier::zero_or_one;
162 else if (token.back () == '+')
164 q = Token::Quantifier::one_or_more;
167 else if (token.back () == '*')
169 q = Token::Quantifier::zero_or_more;
173 if (token.front () == '&')
175 l = Token::Lookahead::positive;
178 else if (token.front () == '!')
180 l = Token::Lookahead::negative;
184 PEG::Token t (token.substr (start, end - start));
188 if (type == Lexer::Type::string)
191 t.tag (token[0] == '\'' ? "character" : "string");
193 else if (t._token.front () == '<' and
194 t._token.back () == '>')
198 if (t._token.substr (0, 8) == "<entity:")
201 if (t._token.substr (0, 10) == "<external:")
205 // Add the new Token to the most recent Production, of the current
207 _rules[rule_name].back ().push_back (t);
212 // A blank line in the input ends the current rule definition.
218 std::cout << dump ();
220 // Validate the parsed grammar.
224 ////////////////////////////////////////////////////////////////////////////////
225 std::map <std::string, PEG::Rule> PEG::syntax () const
230 ////////////////////////////////////////////////////////////////////////////////
231 std::string PEG::firstRule () const
236 ////////////////////////////////////////////////////////////////////////////////
242 ////////////////////////////////////////////////////////////////////////////////
243 void PEG::strict (bool value)
248 ////////////////////////////////////////////////////////////////////////////////
249 std::string PEG::dump () const
251 std::stringstream out;
254 // Show the import files, if any.
255 if (_imports.size ())
257 for (const auto& import : _imports)
258 out << " import " << import << '\n';
262 // Determine longest rule name, for display alignment.
264 for (const auto& rule : _rules)
265 if (rule.first.length () > longest)
266 longest = rule.first.length ();
268 for (const auto& rule : _rules)
270 // Indicate the start Rule.
272 << (rule.first == _start ? "▶" : " ")
276 << std::string (1 + longest - rule.first.length (), ' ');
279 for (const auto& production : rule.second)
282 out << std::string (6 + longest, ' ');
284 for (const auto& token : production)
285 out << token.dump () << ' ';
297 ////////////////////////////////////////////////////////////////////////////////
298 std::vector <std::string> PEG::loadImports (const std::vector <std::string>& lines)
300 std::vector <std::string> resolved;
302 for (auto& line : lines)
304 auto copy = trim (line);
306 auto hash = copy.find ('#');
307 if (hash != std::string::npos)
313 if (copy.find ("import ") == 0)
315 File file (trim (copy.substr (7)));
316 if (file.exists () &&
319 // Only import files that are not already imported.
320 if (std::find (_imports.begin (), _imports.end (), file._data) == _imports.end ())
322 _imports.push_back (file._data);
324 std::vector <std::string> imported;
325 file.read (imported);
326 imported = loadImports (imported);
328 resolved.insert(std::end(resolved), std::begin(imported), std::end(imported));
332 throw format ("Cannot import '{1}'", file._data);
336 // Store original line.
337 resolved.push_back (line);
344 ////////////////////////////////////////////////////////////////////////////////
345 void PEG::validate () const
348 throw std::string ("There are no rules defined.");
350 std::vector <std::string> allRules;
351 std::vector <std::string> allTokens;
352 std::vector <std::string> allLeftRecursive;
353 std::vector <std::string> intrinsics;
354 std::vector <std::string> externals;
356 for (const auto& rule : _rules)
358 allRules.push_back (rule.first);
360 for (const auto& production : rule.second)
362 for (const auto& token : production)
364 if (token.hasTag ("intrinsic"))
365 intrinsics.push_back (token._token);
367 else if (token.hasTag ("external"))
368 externals.push_back (token._token);
370 else if (! token.hasTag ("literal"))
371 allTokens.push_back (token._token);
373 if (token._token == production[0]._token and
374 rule.first == production[0]._token and
375 production.size () == 1)
376 allLeftRecursive.push_back (token._token);
381 std::vector <std::string> notUsed;
382 std::vector <std::string> notDefined;
383 listDiff (allRules, allTokens, notUsed, notDefined);
385 // Undefined value - these are definitions that appear in token, but are
387 for (const auto& nd : notDefined)
388 if (std::find (externals.begin (), externals.end (), nd) == externals.end ())
389 throw format ("Definition '{1}' referenced, but not defined.", nd);
391 // Circular definitions - these are names in _rules that also appear as
392 // the only token in any of the alternates for that definition.
393 for (const auto& lr : allLeftRecursive)
394 throw format ("Definition '{1}' is left recursive.", lr);
396 for (const auto& r : allRules)
398 throw format ("Definition '{1}' may not redefine an intrinsic.");
400 for (const auto& r : allRules)
403 throw format ("Definition '{1}' may not be a literal.");
405 // Unused definitions - these are names in _rules that are never
406 // referenced as token.
407 for (const auto& nu : notUsed)
409 if (std::find (externals.begin (), externals.end (), nu) == externals.end () &&
413 throw format ("Definition '{1}' is defined, but not referenced.", nu);
415 std::cout << "Warning: Definition '" << nu << "' is defined, but not referenced.\n";
419 // Intrinsics must be recognized.
420 for (auto& intrinsic : intrinsics)
421 if (intrinsic != "<digit>" &&
422 intrinsic != "<hex>" &&
423 intrinsic != "<punct>" &&
424 intrinsic != "<eol>" &&
425 intrinsic != "<sep>" &&
426 intrinsic != "<ws>" &&
427 intrinsic != "<alpha>" &&
428 intrinsic != "<character>" &&
429 intrinsic != "<word>" &&
430 intrinsic != "<token>" &&
431 intrinsic.substr (0, 8) != "<entity:" &&
432 intrinsic.substr (0, 10) != "<external:")
433 throw format ("Specified intrinsic '{1}' is not supported.", intrinsic);
436 ////////////////////////////////////////////////////////////////////////////////