Better support for scanless parsing

Erez Shinan 2017-02-26 11:56:04 +02:00
parent a5a20a423a
commit c9b45823ed
7 changed files with 80 additions and 33 deletions
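
Scanless parsing here means constructing the parser with lexer=None, so the Earley parser works directly on the input characters and the grammar's terminals are compiled into character-level rules instead of being matched by a separate lexer. A minimal usage sketch, based on the test_earley_nolex test added in this commit (illustrative only, not a stable API example):

    from lark import Lark

    # lexer=None selects scanless mode: terminals are folded into the rules
    # and the input is parsed character by character by the Earley parser.
    g = Lark("""start: A "b" c
                A: "a"+
                c: "abc"
                """, parser="earley", lexer=None)

    tree = g.parse('aaaababc')
    print(tree.pretty())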

View File

@@ -19,13 +19,12 @@ parser = Lark(r"""
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE _NL
NAME: /[a-zA-Z_]\w*/
VALUE: /.*/
VALUE: /./*
%import common.CNAME -> NAME
%import common.NEWLINE -> _NL
_NL: /(\r?\n)+/
%ignore /[\t \f]+/
%ignore /\#[^\n]*/
%import common.WS_INLINE
%ignore WS_INLINE
""", parser="lalr", lexer="contextual")

View File

@@ -12,25 +12,21 @@
# See examples/conf.py for an example of that approach.
#
from lark import Lark, Transformer
from lark import Lark
parser = Lark(r"""
start: _nl? section+
section: "[" name "]" _nl item+
item: name "=" value _nl
name: /[a-zA-Z_]/ /\w/*
value: /./+
_nl: (_CR? _LF)+
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE _NL
VALUE: /./*
%import common.CNAME -> NAME
%import common.NEWLINE -> _NL
_CR : /\r/
_LF : /\n/
%import common.WS_INLINE
%ignore WS_INLINE
""", lexer=None)
class RestoreTokens(Transformer):
value = ''.join
name = ''.join
def test():
sample_conf = """
[bla]
@@ -40,7 +36,7 @@ this="that",4
"""
r = parser.parse(sample_conf)
print(RestoreTokens().transform(r).pretty())
print r.pretty()
if __name__ == '__main__':
test()

View File

@@ -39,3 +39,7 @@ CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
WS_INLINE: (" "|/\t/)+
WS: /[ \t\f\r\n]/+
CR : /\r/
LF : /\n/
NEWLINE: (CR? LF)+

View File

@@ -119,21 +119,23 @@ class Lark:
assert not self.options.profile, "Feature temporarily disabled"
self.profiler = Profiler() if self.options.profile else None
lexer = self.options.lexer
if lexer == 'auto':
if self.options.parser == 'lalr':
lexer = 'standard'
elif self.options.parser == 'earley':
lexer = 'standard'
self.options.lexer = lexer
self.grammar = load_grammar(grammar)
tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=True)
tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer))
self.ignore_tokens = self.grammar.extra['ignore']
self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)
if self.options.lexer == 'auto':
if self.options.parser == 'lalr':
self.options.lexer = 'standard'
elif self.options.parser == 'earley':
self.options.lexer = 'standard'
if self.options.parser:
self.parser = self._build_parser()
elif self.options.lexer:
elif lexer:
self.lexer = self._build_lexer()
if self.profiler: self.profiler.enter_section('outside_lark')

View File

@@ -239,6 +239,15 @@ class ExtractAnonTokens(InlineTransformer):
self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
self.i = 0
def range(self, start, end):
assert start.type == end.type == 'STRING'
start = start.value[1:-1]
end = end.value[1:-1]
assert len(start) == len(end) == 1
regexp = '/[%s-%s]/' % (start, end)
t = Token('REGEXP', regexp)
return self.tokenvalue(t)
def tokenvalue(self, token):
value = token.value[1:-1]
if token.type == 'STRING':
@@ -325,8 +334,19 @@ class Grammar:
self.extra = extra
def compile(self, lexer=False):
assert lexer
# assert lexer
if not lexer:
self.rule_defs += self.token_defs
self.token_defs = []
for name, tree in self.rule_defs:
for tokenvalue in tree.find_data('tokenvalue'):
value ,= tokenvalue.children
if value.type == 'STRING':
assert value[0] == value[-1] == '"'
if len(value)>3:
tokenvalue.data = 'expansion'
tokenvalue.children = [T('tokenvalue', [Token('STRING', '"%s"'%ch)]) for ch in value[1:-1]]
tokendefs = list(self.token_defs)
# =================
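
In the scanless branch of Grammar.compile shown above, token definitions are merged into the rule definitions, and every multi-character string literal is rewritten as an expansion of single-character literals so the character-level parser can match it one character at a time. A standalone sketch of that rewrite (illustrative helper, not part of load_grammar):

    def split_string_literal(value):
        # 'value' is a quoted grammar literal such as '"abc"'; in scanless
        # mode it becomes a sequence of one-character literals: "a" "b" "c".
        assert value[0] == value[-1] == '"'
        return ['"%s"' % ch for ch in value[1:-1]]

    assert split_string_literal('"abc"') == ['"a"', '"b"', '"c"']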

View File

@@ -6,6 +6,7 @@ from .lexer import Lexer, ContextualLexer, Token
from .common import is_terminal, GrammarError, ParserConf
from .parsers import lalr_parser, earley, nearley
from .parsers.grammar_analysis import Rule
from .tree import Transformer
class WithLexer:
def __init__(self, lexer_conf):
@@ -121,10 +122,16 @@ class Nearley_NoLex:
class Earley_NoLex:
def __init__(self, lexer_conf, parser_conf):
self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
rules = []
for name, exp, alias in parser_conf.rules:
name = self.tokens_to_convert.get(name, name)
exp = [self.tokens_to_convert.get(x, x) for x in exp]
rules.append((name, exp, alias))
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
rules = [(n, list(self._prepare_expansion(x)), a)
for n,x,a in parser_conf.rules]
rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in rules]
self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
@@ -142,7 +149,16 @@ class Earley_NoLex:
def parse(self, text):
res = self.parser.parse(text)
assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
return res[0]
res = res[0]
class RestoreTokens(Transformer):
pass
for t in self.tokens_to_convert:
setattr(RestoreTokens, t, ''.join)
res = RestoreTokens().transform(res)
return res
def get_frontend(parser, lexer):

View File

@@ -39,9 +39,19 @@ class TestParsers(unittest.TestCase):
l2 = g.parse('(a,b,c,*x)')
assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())
def test_earley_nolex(self):
g = Lark("""start: A "b" c
A: "a"+
c: "abc"
""", parser="earley", lexer=None)
x = g.parse('aaaababc')
class TestEarley(unittest.TestCase):
pass
def _make_parser_test(LEXER, PARSER):
def _Lark(grammar, **kwargs):
return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)