Improved grammar validation and refactored the lexers

Erez Shinan 2018-01-07 17:20:07 +02:00
parent 39e58cb8fd
commit 38c5fd244a
5 changed files with 94 additions and 69 deletions

View File

@ -111,12 +111,35 @@ def build_mres(tokens, match_whole=False):
return _build_mres(tokens, len(tokens), match_whole)
class Lexer(object):
class LineCounter:
def __init__(self):
self.newline_char = '\n'
self.char_pos = 0
self.line = 1
self.column = 0
self.line_start_pos = 0
def feed(self, token, test_newline=True):
"""Consume a token and calculat the new line & column.
As an optional optimization, set test_newline=False is token doesn't contain a newline.
"""
if test_newline:
newlines = token.count(self.newline_char)
if newlines:
self.line += newlines
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
self.char_pos += len(token)
self.column = self.char_pos - self.line_start_pos
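# Illustration only (not part of this commit): a sketch of driving LineCounter by hand.
# The expected positions follow directly from feed() as defined above.
def _line_counter_example():
    lc = LineCounter()
    lc.feed("AB\n")                      # crosses a newline -> line 2, column 0
    lc.feed("CD", test_newline=False)    # safe shortcut: this token has no newline
    assert (lc.line, lc.column, lc.char_pos) == (2, 2, 5)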
class Lexer:
def __init__(self, tokens, ignore=()):
assert all(isinstance(t, TokenDef) for t in tokens), tokens
self.ignore = ignore
self.newline_char = '\n'
tokens = list(tokens)
# Sanitization
@ -129,10 +152,7 @@ class Lexer(object):
if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))
token_names = {t.name for t in tokens}
for t in ignore:
if t not in token_names:
raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)
assert set(ignore) <= {t.name for t in tokens}
# Init
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
@ -147,46 +167,8 @@ class Lexer(object):
self.mres = build_mres(tokens)
def lex(self, stream):
lex_pos = 0
line = 1
col_start_pos = 0
newline_types = list(self.newline_types)
ignore_types = list(self.ignore_types)
while True:
for mre, type_from_index in self.mres:
m = mre.match(stream, lex_pos)
if m:
value = m.group(0)
type_ = type_from_index[m.lastindex]
to_yield = type_ not in ignore_types
if to_yield:
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
end_col = t.column + len(value)
if t.type in self.callback:
t = self.callback[t.type](t)
if type_ in newline_types:
newlines = value.count(self.newline_char)
if newlines:
line += newlines
last_newline_index = value.rindex(self.newline_char) + 1
col_start_pos = lex_pos + last_newline_index
end_col = len(value) - last_newline_index
if to_yield:
t.end_line = line
t.end_col = end_col
yield t
lex_pos += len(value)
break
else:
if lex_pos < len(stream):
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
break
return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
class ContextualLexer:
@ -218,33 +200,39 @@ class ContextualLexer:
self.parser_state = state
def lex(self, stream):
lex_pos = 0
line = 1
col_start_pos = 0
newline_types = list(self.root_lexer.newline_types)
ignore_types = list(self.root_lexer.ignore_types)
l = _Lex(self.lexers[self.parser_state])
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
yield x
l.lexer = self.lexers[self.parser_state]
class _Lex:
"Built to serve both Lexer and ContextualLexer"
def __init__(self, lexer):
self.lexer = lexer
def lex(self, stream, newline_types, ignore_types):
newline_types = list(newline_types)
ignore_types = list(ignore_types)
line_ctr = LineCounter()
while True:
lexer = self.lexers[self.parser_state]
lexer = self.lexer
for mre, type_from_index in lexer.mres:
m = mre.match(stream, lex_pos)
m = mre.match(stream, line_ctr.char_pos)
if m:
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
yield t
lexer = yield t
if type_ in newline_types:
newlines = value.count(lexer.newline_char)
if newlines:
line += newlines
col_start_pos = lex_pos + value.rindex(lexer.newline_char)
lex_pos += len(value)
line_ctr.feed(value, type_ in newline_types)
break
else:
if lex_pos < len(stream):
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
if line_ctr.char_pos < len(stream):
raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
break
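For illustration only, and not part of this diff: because _Lex re-reads self.lexer at the top of every match loop, a contextual driver only needs to repoint l.lexer between tokens, which is exactly what ContextualLexer.lex does above. A minimal sketch of that pattern, where lexers, start_state and next_state are hypothetical placeholders:
l = _Lex(lexers[start_state])
for token in l.lex(stream, root_lexer.newline_types, root_lexer.ignore_types):
    state = next_state(token)   # hypothetical: e.g. the LALR parser's state after feeding the token
    l.lexer = lexers[state]     # the next token is matched with that state's lexer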

View File

@ -411,6 +411,7 @@ class Grammar:
terms_to_ignore = {name:'__'+name for name in self.ignore}
if terms_to_ignore:
assert set(terms_to_ignore) <= {name for name, _t in term_defs}
term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
expr = Token('RULE', '__ignore')
for r, tree, _o in rule_defs:
@ -562,6 +563,7 @@ class GrammarLoader:
d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
rules, callback = ParseTreeBuilder(d, T).apply()
lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
parser_conf = ParserConf(rules, callback, 'start')
self.parser = LALR(lexer_conf, parser_conf)
@ -636,7 +638,6 @@ class GrammarLoader:
ignore_names.append(name)
token_defs.append((name, (t, 0)))
# Verify correctness 2
token_names = set()
for name, _ in token_defs:
@ -644,6 +645,9 @@ class GrammarLoader:
raise GrammarError("Token '%s' defined more than once" % name)
token_names.add(name)
if not set(ignore_names) <= token_names:
raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))
# Resolve token references
resolve_token_references(token_defs)

View File

@ -121,7 +121,7 @@ class ParseTreeBuilder:
for expansion, alias in expansions:
if alias and origin.startswith('_'):
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
wrapper_chain = filter(None, [
(expand1 and not alias) and Expand1,

View File

@ -127,7 +127,7 @@ class Parser:
if token == '\n':
text_line += 1
text_column = 1
text_column = 0
else:
text_column += 1

View File

@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase):
r = T().transform(g.parse("x"))
self.assertEqual( r.children, ["<b>"] )
g = Lark("""start: a
?a : b
b : "x"
@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase):
r = T().transform(g.parse("xx"))
self.assertEqual( r.children, ["<c>"] )
g = Lark("""start: a
?a : b b -> c
b : "x"
""", parser='lalr', transformer=T())
r = g.parse("xx")
self.assertEqual( r.children, ["<c>"] )
@ -796,6 +796,39 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(tree.children, ['a', 'A'])
def test_undefined_ignore(self):
g = """!start: "A"
%ignore B
"""
self.assertRaises( GrammarError, _Lark, g)
@unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
def test_line_and_column(self):
g = r"""!start: "A" bc "D"
!bc: "B\nC"
"""
l = _Lark(g)
a, bc, d = l.parse("AB\nCD").children
self.assertEqual(a.line, 1)
self.assertEqual(a.column, 0)
bc ,= bc.children
self.assertEqual(bc.line, 1)
self.assertEqual(bc.column, 1)
self.assertEqual(d.line, 2)
self.assertEqual(d.column, 1)
# self.assertEqual(a.end_line, 1)
# self.assertEqual(a.end_col, 1)
# self.assertEqual(bc.end_line, 2)
# self.assertEqual(bc.end_col, 1)
# self.assertEqual(d.end_line, 2)
# self.assertEqual(d.end_col, 2)
def test_reduce_cycle(self):
"""Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
It seems that the correct solution is to explicitely distinguish finalization in the reduce() function.
@ -969,7 +1002,7 @@ def _make_parser_test(LEXER, PARSER):
parser = _Lark(grammar)
tree = parser.parse("int 1 ! This is a comment\n")
tree = parser.parse("int 1 ! This is a comment\n")
self.assertEqual(tree.children, ['1'])
tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!