mirror of https://github.com/lark-parser/lark.git
Better support for scanless parsing
parent a5a20a423a
commit c9b45823ed
@@ -19,13 +19,12 @@ parser = Lark(r"""
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE _NL
NAME: /[a-zA-Z_]\w*/
VALUE: /.*/
VALUE: /./*
%import common.CNAME -> NAME
%import common.NEWLINE -> _NL

_NL: /(\r?\n)+/

%ignore /[\t \f]+/
%ignore /\#[^\n]*/
%import common.WS_INLINE
%ignore WS_INLINE
""", parser="lalr", lexer="contextual")
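The hunk above updates the INI example that uses the LALR parser with the contextual lexer so that common terminals such as NAME and _NL come from %import lines backed by the shared common grammar. A minimal usage sketch for the parser object it builds, reusing the sample text that appears in the scanless example further down (the call is illustrative only, not taken from the commit):

sample_conf = """
[bla]
this="that",4
"""
print(parser.parse(sample_conf).pretty())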
@@ -12,25 +12,21 @@
# See examples/conf.py for an example of that approach.
#

from lark import Lark, Transformer

from lark import Lark

parser = Lark(r"""
start: _nl? section+
section: "[" name "]" _nl item+
item: name "=" value _nl
name: /[a-zA-Z_]/ /\w/*
value: /./+
_nl: (_CR? _LF)+
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE _NL
VALUE: /./*
%import common.CNAME -> NAME
%import common.NEWLINE -> _NL

_CR : /\r/
_LF : /\n/
%import common.WS_INLINE
%ignore WS_INLINE
""", lexer=None)

class RestoreTokens(Transformer):
    value = ''.join
    name = ''.join


def test():
    sample_conf = """
[bla]
@@ -40,7 +36,7 @@ this="that",4
"""

    r = parser.parse(sample_conf)
    print(RestoreTokens().transform(r).pretty())
    print r.pretty()

if __name__ == '__main__':
    test()
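In the scanless example above there is no lexer pass, so terminals are matched character by character and each rule ends up holding one child per character; the RestoreTokens transformer simply joins them back into strings. A self-contained sketch of that pattern, with an assumed toy grammar and input (not taken from the commit):

from lark import Lark, Transformer

# With lexer=None the input is consumed character by character, so the
# 'word' rule collects one single-character child per letter.
parser = Lark(r"""
start: word
word: /\w/+
""", parser="earley", lexer=None)

class JoinChars(Transformer):
    # Joining the single-character children restores the original string,
    # mirroring what RestoreTokens does in the example above.
    word = ''.join

tree = parser.parse("hello")
print(JoinChars().transform(tree).pretty())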
@@ -39,3 +39,7 @@ CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*
WS_INLINE: (" "|/\t/)+
WS: /[ \t\f\r\n]/+

CR : /\r/
LF : /\n/
NEWLINE: (CR? LF)+
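The terminals added to the common grammar above (WS, CR, LF, NEWLINE) become available to any grammar through %import, the same way the examples import WS_INLINE and NEWLINE. A minimal sketch with an assumed grammar, using the standard lexer:

from lark import Lark

# Assumed toy grammar: import the new WS terminal and ignore it.
parser = Lark(r"""
start: "a"+
%import common.WS
%ignore WS
""", parser="lalr")

print(parser.parse("a a  a").pretty())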
lark/lark.py
@@ -119,21 +119,23 @@ class Lark:
assert not self.options.profile, "Feature temporarily disabled"
self.profiler = Profiler() if self.options.profile else None

lexer = self.options.lexer
if lexer == 'auto':
    if self.options.parser == 'lalr':
        lexer = 'standard'
    elif self.options.parser == 'earley':
        lexer = 'standard'
self.options.lexer = lexer

self.grammar = load_grammar(grammar)
tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=True)
tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer))
self.ignore_tokens = self.grammar.extra['ignore']

self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)

if self.options.lexer == 'auto':
    if self.options.parser == 'lalr':
        self.options.lexer = 'standard'
    elif self.options.parser == 'earley':
        self.options.lexer = 'standard'

if self.options.parser:
    self.parser = self._build_parser()
elif self.options.lexer:
elif lexer:
    self.lexer = self._build_lexer()

if self.profiler: self.profiler.enter_section('outside_lark')
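The constructor change above resolves the lexer choice up front and passes it through to Grammar.compile, so that lexer=None (scanless) compiles the grammar with lexer=False while the named lexers keep the old behaviour. A rough stand-alone sketch of that resolution logic (the function is illustrative, not part of the lark API):

def resolve_lexer(lexer, parser):
    # Mirrors the 'auto' handling above: both lalr and earley currently
    # fall back to the standard lexer.
    if lexer == 'auto':
        if parser == 'lalr':
            lexer = 'standard'
        elif parser == 'earley':
            lexer = 'standard'
    return lexer

assert resolve_lexer('auto', 'lalr') == 'standard'
assert resolve_lexer(None, 'earley') is None   # scanless: grammar.compile(lexer=bool(None)) -> lexer=False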
@@ -239,6 +239,15 @@ class ExtractAnonTokens(InlineTransformer):
    self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
    self.i = 0

def range(self, start, end):
    assert start.type == end.type == 'STRING'
    start = start.value[1:-1]
    end = end.value[1:-1]
    assert len(start) == len(end) == 1
    regexp = '/[%s-%s]/' % (start, end)
    t = Token('REGEXP', regexp)
    return self.tokenvalue(t)

def tokenvalue(self, token):
    value = token.value[1:-1]
    if token.type == 'STRING':
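The new range method converts a pair of single-character STRING tokens into a character-class regexp and then reuses tokenvalue on the result. A stand-alone illustration of just the string manipulation (the concrete values are assumed for illustration):

start, end = '"a"', '"z"'        # STRING token values, quotes included
regexp = '/[%s-%s]/' % (start[1:-1], end[1:-1])
assert regexp == '/[a-z]/'       # same formatting as ExtractAnonTokens.range above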
@@ -325,8 +334,19 @@ class Grammar:
    self.extra = extra

def compile(self, lexer=False):
    assert lexer
    # assert lexer
    if not lexer:
        self.rule_defs += self.token_defs
        self.token_defs = []

        for name, tree in self.rule_defs:
            for tokenvalue in tree.find_data('tokenvalue'):
                value ,= tokenvalue.children
                if value.type == 'STRING':
                    assert value[0] == value[-1] == '"'
                    if len(value)>3:
                        tokenvalue.data = 'expansion'
                        tokenvalue.children = [T('tokenvalue', [Token('STRING', '"%s"'%ch)]) for ch in value[1:-1]]
    tokendefs = list(self.token_defs)

    # =================
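When compile is called with lexer=False, the token definitions are folded into the rule definitions and every multi-character string literal is rewritten as an expansion of single-character strings, which is what lets the scanless parsers consume the input one character at a time. A stand-alone illustration of that splitting step (the concrete value is assumed for illustration):

value = '"abc"'                              # a STRING tokenvalue, quotes included
assert value[0] == value[-1] == '"'
chars = ['"%s"' % ch for ch in value[1:-1]]  # same rewrite as in compile() above
assert chars == ['"a"', '"b"', '"c"']        # "abc" becomes the expansion "a" "b" "c"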
@@ -6,6 +6,7 @@ from .lexer import Lexer, ContextualLexer, Token
from .common import is_terminal, GrammarError, ParserConf
from .parsers import lalr_parser, earley, nearley
from .parsers.grammar_analysis import Rule
from .tree import Transformer

class WithLexer:
    def __init__(self, lexer_conf):
@@ -121,10 +122,16 @@ class Nearley_NoLex:

class Earley_NoLex:
    def __init__(self, lexer_conf, parser_conf):
        self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
        rules = []
        for name, exp, alias in parser_conf.rules:
            name = self.tokens_to_convert.get(name, name)
            exp = [self.tokens_to_convert.get(x, x) for x in exp]
            rules.append((name, exp, alias))

        self.token_by_name = {t.name:t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a)
                 for n,x,a in parser_conf.rules]
        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in rules]

        self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
@@ -142,7 +149,16 @@ class Earley_NoLex:
    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
        return res[0]
        res = res[0]

        class RestoreTokens(Transformer):
            pass

        for t in self.tokens_to_convert:
            setattr(RestoreTokens, t, ''.join)

        res = RestoreTokens().transform(res)
        return res


def get_frontend(parser, lexer):
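Earley_NoLex renames every terminal it receives as a rule to a '__token_' name, so the scanless Earley parser treats it like any other rule, and after parsing it builds a throwaway RestoreTokens transformer that joins each converted rule's characters back into a single string. A simplified stand-alone sketch of the renaming step (the rule names and the is_terminal stand-in are assumptions for illustration):

def is_terminal(name):
    # Simplified stand-in for lark's helper: terminal names are uppercase.
    return name.isupper()

rule_names = ['start', 'item', 'NAME', 'VALUE']
tokens_to_convert = {n: '__token_' + n for n in rule_names if is_terminal(n)}
assert tokens_to_convert == {'NAME': '__token_NAME', 'VALUE': '__token_VALUE'}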
@@ -39,9 +39,19 @@ class TestParsers(unittest.TestCase):
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())


    def test_earley_nolex(self):
        g = Lark("""start: A "b" c
                    A: "a"+
                    c: "abc"
                    """, parser="earley", lexer=None)
        x = g.parse('aaaababc')


class TestEarley(unittest.TestCase):
    pass


def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)