mirror of https://github.com/lark-parser/lark.git
Improved grammar validation and refactored the lexers
parent 39e58cb8fd
commit 38c5fd244a
lark/lexer.py (112 changed lines)
@@ -111,12 +111,35 @@ def build_mres(tokens, match_whole=False):
     return _build_mres(tokens, len(tokens), match_whole)
 
 
-class Lexer(object):
+class LineCounter:
+    def __init__(self):
+        self.newline_char = '\n'
+        self.char_pos = 0
+        self.line = 1
+        self.column = 0
+        self.line_start_pos = 0
+
+    def feed(self, token, test_newline=True):
+        """Consume a token and calculate the new line & column.
+
+        As an optional optimization, set test_newline=False if token doesn't contain a newline.
+        """
+        if test_newline:
+            newlines = token.count(self.newline_char)
+            if newlines:
+                self.line += newlines
+                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
+
+        self.char_pos += len(token)
+        self.column = self.char_pos - self.line_start_pos
+
+
+class Lexer:
     def __init__(self, tokens, ignore=()):
         assert all(isinstance(t, TokenDef) for t in tokens), tokens
 
         self.ignore = ignore
         self.newline_char = '\n'
         tokens = list(tokens)
 
         # Sanitization
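
A minimal sketch of how the new LineCounter is meant to be driven, assuming it is importable from lark.lexer as added in the hunk above; the strings fed here are arbitrary examples, not lexer output:

from lark.lexer import LineCounter

lc = LineCounter()
lc.feed("int x = 1")    # no newline: char_pos and column advance, line stays 1
assert (lc.line, lc.column) == (1, 9)
lc.feed("\n  ")         # newline: line increments, column restarts after it
assert (lc.line, lc.column) == (2, 2)
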
@@ -129,10 +152,7 @@ class Lexer(object):
             if t.pattern.min_width == 0:
                 raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))
 
-        token_names = {t.name for t in tokens}
-        for t in ignore:
-            if t not in token_names:
-                raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)
+        assert set(ignore) <= {t.name for t in tokens}
 
         # Init
         self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
@@ -147,46 +167,8 @@ class Lexer(object):
 
         self.mres = build_mres(tokens)
 
     def lex(self, stream):
-        lex_pos = 0
-        line = 1
-        col_start_pos = 0
-        newline_types = list(self.newline_types)
-        ignore_types = list(self.ignore_types)
-        while True:
-            for mre, type_from_index in self.mres:
-                m = mre.match(stream, lex_pos)
-                if m:
-                    value = m.group(0)
-                    type_ = type_from_index[m.lastindex]
-                    to_yield = type_ not in ignore_types
-
-                    if to_yield:
-                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
-                        end_col = t.column + len(value)
-                        if t.type in self.callback:
-                            t = self.callback[t.type](t)
-
-                    if type_ in newline_types:
-                        newlines = value.count(self.newline_char)
-                        if newlines:
-                            line += newlines
-                            last_newline_index = value.rindex(self.newline_char) + 1
-                            col_start_pos = lex_pos + last_newline_index
-                            end_col = len(value) - last_newline_index
-
-                    if to_yield:
-                        t.end_line = line
-                        t.end_col = end_col
-                        yield t
-
-                    lex_pos += len(value)
-                    break
-            else:
-                if lex_pos < len(stream):
-                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
-                break
+        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
 
 
 class ContextualLexer:
@@ -218,33 +200,39 @@ class ContextualLexer:
         self.parser_state = state
 
     def lex(self, stream):
-        lex_pos = 0
-        line = 1
-        col_start_pos = 0
-        newline_types = list(self.root_lexer.newline_types)
-        ignore_types = list(self.root_lexer.ignore_types)
+        l = _Lex(self.lexers[self.parser_state])
+        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
+            yield x
+            l.lexer = self.lexers[self.parser_state]
+
+
+class _Lex:
+    "Built to serve both Lexer and ContextualLexer"
+    def __init__(self, lexer):
+        self.lexer = lexer
+
+    def lex(self, stream, newline_types, ignore_types):
+        newline_types = list(newline_types)
+        ignore_types = list(ignore_types)
+        line_ctr = LineCounter()
+
         while True:
-            lexer = self.lexers[self.parser_state]
+            lexer = self.lexer
             for mre, type_from_index in lexer.mres:
-                m = mre.match(stream, lex_pos)
+                m = mre.match(stream, line_ctr.char_pos)
                 if m:
                     value = m.group(0)
                     type_ = type_from_index[m.lastindex]
                     if type_ not in ignore_types:
-                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
+                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                         if t.type in lexer.callback:
                             t = lexer.callback[t.type](t)
-                        yield t
+                        lexer = yield t
 
-                    if type_ in newline_types:
-                        newlines = value.count(lexer.newline_char)
-                        if newlines:
-                            line += newlines
-                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
-                    lex_pos += len(value)
+                    line_ctr.feed(value, type_ in newline_types)
                     break
            else:
-                if lex_pos < len(stream):
-                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
+                if line_ctr.char_pos < len(stream):
+                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                 break
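
The refactor's key trick is that _Lex holds its lexer as a mutable attribute, so ContextualLexer can swap the active token set after each token it yields (l.lexer = self.lexers[self.parser_state] above). A self-contained toy sketch of that pattern, not using lark's real classes:

class Driver:
    def __init__(self, scanner):
        self.scanner = scanner      # re-read on every iteration, like _Lex.lexer

    def run(self, items):
        for x in items:
            yield self.scanner(x)

d = Driver(str.upper)
out = []
for tok in d.run("abc"):
    out.append(tok)
    d.scanner = str.lower           # the caller swaps scanners between tokens
print(out)                          # -> ['A', 'b', 'c']
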
@@ -411,6 +411,7 @@ class Grammar:
         terms_to_ignore = {name:'__'+name for name in self.ignore}
         if terms_to_ignore:
+            assert set(terms_to_ignore) <= {name for name, _t in term_defs}
 
             term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
             expr = Token('RULE', '__ignore')
             for r, tree, _o in rule_defs:
@@ -562,6 +563,7 @@ class GrammarLoader:
         d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
         rules, callback = ParseTreeBuilder(d, T).apply()
+        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, 'start')
         self.parser = LALR(lexer_conf, parser_conf)
@@ -636,7 +638,6 @@ class GrammarLoader:
                 ignore_names.append(name)
                 token_defs.append((name, (t, 0)))
 
-
         # Verify correctness 2
         token_names = set()
         for name, _ in token_defs:
@@ -644,6 +645,9 @@ class GrammarLoader:
                 raise GrammarError("Token '%s' defined more than once" % name)
             token_names.add(name)
 
+        if set(ignore_names) > token_names:
+            raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))
+
         # Resolve token references
         resolve_token_references(token_defs)
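
The check above is what the new test_undefined_ignore test further down in this diff exercises: a grammar that ignores an undefined token now fails at load time with a GrammarError instead of surfacing later in the lexer. A rough sketch; the exact import path of GrammarError has moved between lark versions, so it is caught generically here:

from lark import Lark

grammar = '''!start: "A"
             %ignore B
             '''
try:
    Lark(grammar)
except Exception as e:   # GrammarError raised by the "Verify correctness 2" pass
    print(type(e).__name__, e)
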
@@ -121,7 +121,7 @@ class ParseTreeBuilder:
 
         for expansion, alias in expansions:
             if alias and origin.startswith('_'):
-                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
+                raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
 
             wrapper_chain = filter(None, [
                 (expand1 and not alias) and Expand1,
@@ -127,7 +127,7 @@ class Parser:
 
             if token == '\n':
                 text_line += 1
-                text_column = 1
+                text_column = 0
             else:
                 text_column += 1
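
This lines the parser's column tracking up with the 0-based columns produced by the new LineCounter (and asserted by test_line_and_column below). A one-line sanity check, again assuming LineCounter is importable from lark.lexer:

from lark.lexer import LineCounter

lc = LineCounter()
lc.feed("ab\n")
assert lc.column == 0   # column restarts at 0 right after a newline, matching text_column = 0
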
@@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase):
         r = T().transform(g.parse("x"))
         self.assertEqual( r.children, ["<b>"] )
 
 
         g = Lark("""start: a
                     ?a : b
                     b : "x"
@@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase):
         r = T().transform(g.parse("xx"))
         self.assertEqual( r.children, ["<c>"] )
 
 
         g = Lark("""start: a
                     ?a : b b -> c
                     b : "x"
                 """, parser='lalr', transformer=T())
         r = g.parse("xx")
         self.assertEqual( r.children, ["<c>"] )
 
@@ -796,6 +796,39 @@ def _make_parser_test(LEXER, PARSER):
         self.assertEqual(tree.children, ['a', 'A'])
 
 
+    def test_undefined_ignore(self):
+        g = """!start: "A"
+
+            %ignore B
+            """
+        self.assertRaises( GrammarError, _Lark, g)
+
+    @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
+    def test_line_and_column(self):
+        g = r"""!start: "A" bc "D"
+            !bc: "B\nC"
+            """
+        l = _Lark(g)
+        a, bc, d = l.parse("AB\nCD").children
+        self.assertEqual(a.line, 1)
+        self.assertEqual(a.column, 0)
+
+        bc ,= bc.children
+        self.assertEqual(bc.line, 1)
+        self.assertEqual(bc.column, 1)
+
+        self.assertEqual(d.line, 2)
+        self.assertEqual(d.column, 1)
+
+        # self.assertEqual(a.end_line, 1)
+        # self.assertEqual(a.end_col, 1)
+        # self.assertEqual(bc.end_line, 2)
+        # self.assertEqual(bc.end_col, 1)
+        # self.assertEqual(d.end_line, 2)
+        # self.assertEqual(d.end_col, 2)
+
+
     def test_reduce_cycle(self):
         """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
         It seems that the correct solution is to explicitely distinguish finalization in the reduce() function.
@@ -969,7 +1002,7 @@ def _make_parser_test(LEXER, PARSER):
 
         parser = _Lark(grammar)
 
-        tree = parser.parse("int 1 ! This is a comment\n")
+        tree = parser.parse("int 1 ! This is a comment\n")
         self.assertEqual(tree.children, ['1'])
 
         tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!