Improved error reporting in Earley

2017-02-12 23:31:09 +02:00 · 2017-02-12 23:31:09 +02:00 · 1602482608
parent 7ba98c46f6
commit 1602482608
4 changed files with 25 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -43,7 +43,7 @@ Notice punctuation doesn't appear in the resulting tree. It's automatically filt

 ## Learn more about using Lark

- - Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark.
+ - **Read the [tutorial](/docs/json_tutorial.md)**, which shows how to write a JSON parser in Lark.
 - Read the [reference](/docs/reference.md)
 - Browse the [examples](/examples), which include a calculator, and a Python-code parser.
 - Check out the [tests](/tests/test_parser.py) for more examples.
@ -72,6 +72,7 @@ These features are planned to be implemented in the near future:
 - Grammar composition (in cases that the tokens can reliably signify a grammar change)
 - Parser generator - create a small parser, independent of Lark, to embed in your project.
 - Optimizations in both the parsers and the lexer
+ - Better handling of ambiguity

 ## Comparison to other parsers

--- a/lark/common.py
+++ b/lark/common.py
@ -6,6 +6,22 @@ class ParseError(Exception):
    pass


+class UnexpectedToken(ParseError):
+    """Parse error carrying the offending token and a readable message.
+
+    Attributes:
+        token: the token passed in by the caller.
+        expected: the ``expected`` argument, stored as given.
+        line: ``token.line``, or ``'?'`` when the token has no such attribute.
+        column: ``token.column``, or ``'?'`` when the token has no such attribute.
+    """
+
+    def __init__(self, token, expected, seq, index):
+        self.token = token
+        self.expected = expected
+        # Fall back to '?' so tokens without position info still format cleanly.
+        self.line = getattr(token, 'line', '?')
+        self.column = getattr(token, 'column', '?')
+
+        # Context is at most five items of `seq`, starting at `index`.
+        context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
+        message = ("Unexpected token %r at line %s, column %s.\n"
+                   "Expected: %s\n"
+                   "Context: %s" % (token.value, self.line, self.column, expected, context))
+
+        # Name this class, not its parent: super(ParseError, self) would start
+        # the lookup after ParseError and skip any ParseError.__init__.
+        super(UnexpectedToken, self).__init__(message)
+
+
+
 def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'

--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@ -1,7 +1,7 @@
 "My name is Earley"

 from ..utils import classify, STRING_TYPE
-from ..common import ParseError
+from ..common import ParseError, UnexpectedToken

 try:
    xrange
@ -14,6 +14,7 @@ class MatchFailed(object):
 class AbortParseMatch(Exception):
    pass

+
 class Rule(object):
    def __init__(self, name, symbols, postprocess):
        self.name = name
@ -34,6 +35,8 @@ class State(object):
            if self.is_literal:
                self.expect_symbol = self.expect_symbol['literal']
            assert isinstance(self.expect_symbol, STRING_TYPE), self.expect_symbol
+        else:
+            self.is_literal = False

    def next_state(self, data):
        return State(self.rule, self.expect+1, self.reference, self.data + [data])
@ -136,7 +139,8 @@ class Parser(object):
            self.advance_to(table, pos + 1, set())

            if not table[-1]:
-                raise ParseError('Error at line {t.line}:{t.column}'.format(t=stream[pos]))
+                expected = {s.expect_symbol for s in table[-2] if s.is_literal}
+                raise UnexpectedToken(stream[pos], expected, stream, pos)

        res = list(self.finish(table))
        if not res:
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@ -1,21 +1,5 @@
 from .lalr_analysis import ACTION_SHIFT
-from ..common import ParseError
-
-class UnexpectedToken(ParseError):
-    def __init__(self, token, expected, seq, index):
-        self.token = token
-        self.expected = expected
-        self.line = getattr(token, 'line', '?')
-        self.column = getattr(token, 'column', '?')
-
-        context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
-        message = ("Unexpected input %r at line %s, column %s.\n"
-                   "Expected: %s\n"
-                   "Context: %s" % (token.value, self.line, self.column, expected, context))
-
-        super(ParseError, self).__init__(message)
-
-
+from ..common import ParseError, UnexpectedToken


 class Parser(object):