mirror of https://github.com/lark-parser/lark.git
Better support for scanless parsing
This commit is contained in:
parent a5a20a423a
commit c9b45823ed
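This change set adds a scanless mode: passing lexer=None compiles the grammar without a separate tokenizer, token definitions are folded into the rules, and the Earley parser matches the input character by character. A minimal sketch of the resulting API, mirroring the test added at the end of this diff (a sketch, not part of the commit):

from lark import Lark

# Scanless parsing: no lexer step; A: "a"+ and the literal "abc" are matched
# character by character by the Earley parser.
g = Lark("""start: A "b" c
            A: "a"+
            c: "abc"
            """, parser="earley", lexer=None)

tree = g.parse('aaaababc')   # 'aaaa' + 'b' + 'abc'
print(tree.pretty())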
@@ -19,13 +19,12 @@ parser = Lark(r"""
 start: _NL? section+
 section: "[" NAME "]" _NL item+
 item: NAME "=" VALUE _NL
-NAME: /[a-zA-Z_]\w*/
-VALUE: /.*/
+VALUE: /./*
+%import common.CNAME -> NAME
+%import common.NEWLINE -> _NL

-_NL: /(\r?\n)+/
-%ignore /[\t \f]+/
-%ignore /\#[^\n]*/
+%import common.WS_INLINE
+%ignore WS_INLINE
 """, parser="lalr", lexer="contextual")
@@ -12,25 +12,21 @@
 # See examples/conf.py for an example of that approach.
 #

-from lark import Lark, Transformer
+from lark import Lark

 parser = Lark(r"""
-start: _nl? section+
-section: "[" name "]" _nl item+
-item: name "=" value _nl
-name: /[a-zA-Z_]/ /\w/*
-value: /./+
-_nl: (_CR? _LF)+
+start: _NL? section+
+section: "[" NAME "]" _NL item+
+item: NAME "=" VALUE _NL
+VALUE: /./*
+%import common.CNAME -> NAME
+%import common.NEWLINE -> _NL

-_CR : /\r/
-_LF : /\n/
+%import common.WS_INLINE
+%ignore WS_INLINE
 """, lexer=None)

-class RestoreTokens(Transformer):
-    value = ''.join
-    name = ''.join

 def test():
     sample_conf = """
 [bla]
@@ -40,7 +36,7 @@ this="that",4
 """

     r = parser.parse(sample_conf)
-    print(RestoreTokens().transform(r).pretty())
+    print r.pretty()

 if __name__ == '__main__':
     test()
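Assembled from the new side of the hunk above, the scanless example now builds its parser as below; the RestoreTokens transformer disappears from the example because the Earley_NoLex frontend (see the parser-frontend hunks further down) now restores terminal strings itself. Reconstruction of the new state, for reference:

from lark import Lark

parser = Lark(r"""
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE _NL
VALUE: /./*
%import common.CNAME -> NAME
%import common.NEWLINE -> _NL

%import common.WS_INLINE
%ignore WS_INLINE
""", lexer=None)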
@@ -39,3 +39,7 @@ CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
 WS_INLINE: (" "|/\t/)+
 WS: /[ \t\f\r\n]/+

+CR : /\r/
+LF : /\n/
+NEWLINE: (CR? LF)+
lark/lark.py (18 lines changed)
@@ -119,21 +119,23 @@ class Lark:
         assert not self.options.profile, "Feature temporarily disabled"
         self.profiler = Profiler() if self.options.profile else None

+        lexer = self.options.lexer
+        if lexer == 'auto':
+            if self.options.parser == 'lalr':
+                lexer = 'standard'
+            elif self.options.parser == 'earley':
+                lexer = 'standard'
+        self.options.lexer = lexer
+
         self.grammar = load_grammar(grammar)
-        tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=True)
+        tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer))
         self.ignore_tokens = self.grammar.extra['ignore']

         self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)

-        if self.options.lexer == 'auto':
-            if self.options.parser == 'lalr':
-                self.options.lexer = 'standard'
-            elif self.options.parser == 'earley':
-                self.options.lexer = 'standard'
-
         if self.options.parser:
             self.parser = self._build_parser()
-        elif self.options.lexer:
+        elif lexer:
             self.lexer = self._build_lexer()

         if self.profiler: self.profiler.enter_section('outside_lark')
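The net effect of the reshuffling above is that the 'auto' choice is resolved before the grammar is compiled, so compile() can be told whether a lexer will exist at all: an explicit lexer=None stays None and leads to grammar.compile(lexer=False), the scanless path. The same logic restated as a standalone helper (hypothetical, not code from this commit):

def resolve_lexer(lexer, parser):
    # Mirrors the 'auto' handling added above: both 'lalr' and 'earley'
    # currently default to the standard lexer.
    if lexer == 'auto':
        if parser == 'lalr':
            lexer = 'standard'
        elif parser == 'earley':
            lexer = 'standard'
    return lexer

assert resolve_lexer('auto', 'earley') == 'standard'
assert resolve_lexer(None, 'earley') is None   # scanless: compile(lexer=False)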
@@ -239,6 +239,15 @@ class ExtractAnonTokens(InlineTransformer):
         self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
         self.i = 0

+    def range(self, start, end):
+        assert start.type == end.type == 'STRING'
+        start = start.value[1:-1]
+        end = end.value[1:-1]
+        assert len(start) == len(end) == 1
+        regexp = '/[%s-%s]/' % (start, end)
+        t = Token('REGEXP', regexp)
+        return self.tokenvalue(t)
+
     def tokenvalue(self, token):
         value = token.value[1:-1]
         if token.type == 'STRING':
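The new range() handler turns a character range written as two quoted single-character strings into an anonymous regexp terminal, which is then registered through tokenvalue() like any other. The string handling in isolation (a small sketch; the import path is inferred from the frontend hunk below):

from lark.lexer import Token

start, end = Token('STRING', '"a"'), Token('STRING', '"z"')
# Strip the quotes, require single characters, and build the regexp token.
regexp = '/[%s-%s]/' % (start.value[1:-1], end.value[1:-1])
assert regexp == '/[a-z]/'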
@@ -325,8 +334,19 @@ class Grammar:
         self.extra = extra

     def compile(self, lexer=False):
-        assert lexer
+        # assert lexer
+        if not lexer:
+            self.rule_defs += self.token_defs
+            self.token_defs = []
+
+            for name, tree in self.rule_defs:
+                for tokenvalue in tree.find_data('tokenvalue'):
+                    value ,= tokenvalue.children
+                    if value.type == 'STRING':
+                        assert value[0] == value[-1] == '"'
+                        if len(value)>3:
+                            tokenvalue.data = 'expansion'
+                            tokenvalue.children = [T('tokenvalue', [Token('STRING', '"%s"'%ch)]) for ch in value[1:-1]]
         tokendefs = list(self.token_defs)

         # =================
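This is the piece that lets multi-character string literals work without a lexer: when compiling scannerlessly, any quoted literal longer than one character is rewritten into an expansion of single-character literals, which character-level Earley can match directly. The slicing involved, in isolation:

# Mirrors the rewrite above: the STRING tokenvalue '"abc"' (quotes included)
# becomes the one-character literals '"a"', '"b"', '"c"'.
value = '"abc"'
assert value[0] == value[-1] == '"' and len(value) > 3
chars = ['"%s"' % ch for ch in value[1:-1]]
assert chars == ['"a"', '"b"', '"c"']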
@@ -6,6 +6,7 @@ from .lexer import Lexer, ContextualLexer, Token
 from .common import is_terminal, GrammarError, ParserConf
 from .parsers import lalr_parser, earley, nearley
 from .parsers.grammar_analysis import Rule
+from .tree import Transformer

 class WithLexer:
     def __init__(self, lexer_conf):
@@ -121,10 +122,16 @@ class Nearley_NoLex:

 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
+        self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
+        rules = []
+        for name, exp, alias in parser_conf.rules:
+            name = self.tokens_to_convert.get(name, name)
+            exp = [self.tokens_to_convert.get(x, x) for x in exp]
+            rules.append((name, exp, alias))
+
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}

-        rules = [(n, list(self._prepare_expansion(x)), a)
-                 for n,x,a in parser_conf.rules]
+        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in rules]

         self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
@@ -142,7 +149,16 @@ class Earley_NoLex:
     def parse(self, text):
         res = self.parser.parse(text)
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
-        return res[0]
+        res = res[0]
+
+        class RestoreTokens(Transformer):
+            pass
+
+        for t in self.tokens_to_convert:
+            setattr(RestoreTokens, t, ''.join)
+
+        res = RestoreTokens().transform(res)
+        return res


 def get_frontend(parser, lexer):
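Taken together, Earley_NoLex now renames each terminal-like rule to a '__token_'-prefixed rule before parsing, and after parsing installs ''.join on a throwaway RestoreTokens transformer so the per-character matches are glued back into ordinary strings. The joining trick on its own (names illustrative):

class RestoreTokens(object):
    pass

tokens_to_convert = {'VALUE': '__token_VALUE'}   # illustrative; built like the dict in __init__ above
for t in tokens_to_convert:
    setattr(RestoreTokens, t, ''.join)           # same assignment as in parse() above

# ''.join turns the list of matched characters back into one string.
assert RestoreTokens().VALUE(['h', 'e', 'l', 'l', 'o']) == 'hello'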
@@ -39,9 +39,19 @@ class TestParsers(unittest.TestCase):
         l2 = g.parse('(a,b,c,*x)')
         assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

+    def test_earley_nolex(self):
+        g = Lark("""start: A "b" c
+                    A: "a"+
+                    c: "abc"
+                    """, parser="earley", lexer=None)
+        x = g.parse('aaaababc')
+
+
 class TestEarley(unittest.TestCase):
     pass


 def _make_parser_test(LEXER, PARSER):
     def _Lark(grammar, **kwargs):
         return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)