mirror of https://github.com/lark-parser/lark.git
Refactoring load_grammar
parent 57191c699f
commit 0c5acaab8a
@@ -34,7 +34,7 @@ def is_terminal(sym):
 
 class LexerConf:
-    def __init__(self, tokens, ignore, postlex):
+    def __init__(self, tokens, ignore=(), postlex=None):
         self.tokens = tokens
         self.ignore = ignore
         self.postlex = postlex
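
For reference, a small runnable sketch (not part of the commit) of what the new signature buys callers: ignore and postlex can now be omitted. The token list here is a placeholder, not a real lark token set.

    # Sketch only: the class as defined above, exercised with placeholder data.
    class LexerConf:
        def __init__(self, tokens, ignore=(), postlex=None):
            self.tokens = tokens
            self.ignore = ignore
            self.postlex = postlex

    conf = LexerConf(['NUMBER', 'WS'])          # relies on the new defaults
    assert conf.ignore == () and conf.postlex is None
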
@@ -146,9 +146,7 @@ class Lark:
         self.grammar = load_grammar(grammar, source)
 
         # Compile the EBNF grammar into BNF
-        tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer), start=self.options.start)
-
-        self.ignore_tokens = self.grammar.extra['ignore']
+        tokens, self.rules, self.ignore_tokens = self.grammar.compile(lexer=bool(lexer), start=self.options.start)
 
         self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)
@@ -222,7 +222,7 @@ class RuleTreeToText(Transformer):
         return expansion, alias.value
 
 
-class SimplifyTree(InlineTransformer):
+class CanonizeTree(InlineTransformer):
     def maybe(self, expr):
         return T('expr', [expr, Token('OP', '?', -1)])
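
The rename from SimplifyTree to CanonizeTree presumably matches what the transformer does: it rewrites grammar sugar such as maybe-expressions into the canonical expr/OP form. A standalone sketch with stand-in T and Token classes (assumptions, not lark's real ones):

    class T(object):                      # stand-in for lark's Tree
        def __init__(self, data, children):
            self.data, self.children = data, children
        def __repr__(self):
            return 'T(%r, %r)' % (self.data, self.children)

    class Token(str):                     # stand-in for lark's Token (a str with a .type)
        def __new__(cls, type_, value, pos=None):
            inst = super(Token, cls).__new__(cls, value)
            inst.type = type_
            return inst

    def maybe(expr):                      # same body as CanonizeTree.maybe above
        return T('expr', [expr, Token('OP', '?', -1)])

    print(maybe(Token('RULE', 'item')))   # -> T('expr', ['item', '?'])
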
@@ -354,10 +354,10 @@ def _choice_of_rules(rules):
     return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
 
 class Grammar:
-    def __init__(self, rule_defs, token_defs, extra):
+    def __init__(self, rule_defs, token_defs, ignore):
         self.token_defs = token_defs
         self.rule_defs = rule_defs
-        self.extra = extra
+        self.ignore = ignore
 
     def _prepare_scanless_grammar(self, start):
         # XXX Pretty hacky! There should be a better way to write this method..
@@ -366,7 +366,7 @@ class Grammar:
         term_defs = self.token_defs
 
         # Implement the "%ignore" feature without a lexer..
-        terms_to_ignore = {name:'__'+name for name in self.extra['ignore']}
+        terms_to_ignore = {name:'__'+name for name in self.ignore}
         if terms_to_ignore:
             assert set(terms_to_ignore) <= {name for name, t in term_defs}
             term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
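
A worked sketch of the renaming above, with plain strings standing in for the real term trees (the double-underscore prefix matches the reserved-name convention checked later in this file):

    ignore = ['WS', 'COMMENT']                       # stand-in for self.ignore
    term_defs = [('WS', 'ws_tree'), ('COMMENT', 'comment_tree'), ('NUMBER', 'number_tree')]

    terms_to_ignore = {name: '__' + name for name in ignore}
    assert set(terms_to_ignore) <= {name for name, t in term_defs}
    term_defs = [(terms_to_ignore.get(name, name), t) for name, t in term_defs]

    assert term_defs == [('__WS', 'ws_tree'), ('__COMMENT', 'comment_tree'), ('NUMBER', 'number_tree')]
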
@@ -415,49 +415,28 @@ class Grammar:
         # =================
         # Compile Tokens
         # =================
-        token_tree_to_pattern = TokenTreeToPattern()
 
-        # Convert tokens to strings/regexps
-        tokens = []
-        for name, token_tree in token_defs:
-            token_tree = PrepareLiterals().transform(token_tree)
-            pattern = token_tree_to_pattern.transform(token_tree)
-            tokens.append(TokenDef(name, pattern) )
-
-        # Resolve regexp assignments of the form /..${X}../
-        # XXX This is deprecated, since you can express most regexps with EBNF
-        # XXX Also, since this happens after import, it can be a source of bugs
-        token_dict = {td.name: td.pattern.to_regexp() for td in tokens}
-        while True:
-            changed = False
-            for t in tokens:
-                if isinstance(t.pattern, PatternRE):
-                    sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.pattern.value)
-                    if sp:
-                        value = ''.join(token_dict[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
-                                        for x in sp)
-                        if value != t.pattern.value:
-                            t.pattern.value = value
-                            changed = True
-            if not changed:
-                break
+        # Convert token-trees to strings/regexps
+        transformer = PrepareLiterals() * TokenTreeToPattern()
+        tokens = [TokenDef(name, transformer.transform(token_tree))
+                  for name, token_tree in token_defs]
 
         # =================
         # Compile Rules
         # =================
-        extract_anon = ExtractAnonTokens(tokens)
         ebnf_to_bnf = EBNF_to_BNF()
         simplify_rule = SimplifyRule_Visitor()
-        rule_tree_to_text = RuleTreeToText()
-        rules = {}
 
+        transformer = PrepareLiterals()
+        if not lexer:
+            transformer *= SplitLiterals()
+        transformer *= ExtractAnonTokens(tokens) # Adds to tokens
+
+        rules = {}
         for name, rule_tree, options in rule_defs:
             assert name not in rules, name
-            rule_tree = PrepareLiterals().transform(rule_tree)
-            if not lexer:
-                rule_tree = SplitLiterals().transform(rule_tree)
-            tree = extract_anon.transform(rule_tree) # Adds to tokens
             ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
+            tree = transformer.transform(rule_tree)
             rules[name] = ebnf_to_bnf.transform(tree), options
 
         dict_update_safe(rules, ebnf_to_bnf.new_rules)
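
For context on the deleted block: it resolved /..${X}../ regexp references by repeated substitution until nothing changed. A self-contained sketch of the same fixed-point idea on plain strings (TOKEN_NAME_RE is an assumed stand-in for TOKENS['TOKEN'], and the dict stands in for the real TokenDef patterns):

    import re

    TOKEN_NAME_RE = r'[A-Z_][A-Z0-9_]*'
    token_dict = {'A': 'a', 'B': '${A}b', 'C': '${B}c'}

    changed = True
    while changed:
        changed = False
        for name, value in token_dict.items():
            parts = re.split(r'(\$\{%s})' % TOKEN_NAME_RE, value)
            new_value = ''.join(token_dict[p[2:-1]] if p.startswith('${') and p.endswith('}') else p
                                for p in parts)
            if new_value != value:
                token_dict[name] = new_value
                changed = True

    assert token_dict == {'A': 'a', 'B': 'ab', 'C': 'abc'}

The deleted comments already flag the feature as deprecated in favor of expressing the same thing with EBNF token references, which is consistent with the ${...} test being removed further down.
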
@@ -465,9 +444,10 @@ class Grammar:
         for tree, _o in rules.values():
             simplify_rule.visit(tree)
 
+        rule_tree_to_text = RuleTreeToText()
         rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()}
 
-        return tokens, rules, self.extra
+        return tokens, rules, self.ignore
 
 
@@ -511,6 +491,9 @@ def import_grammar(grammar_path):
 
 
 def resolve_token_references(token_defs):
+    # TODO Cycles detection
+    # TODO Solve with transitive closure (maybe)
+
     token_dict = dict(token_defs)
     assert len(token_dict) == len(token_defs), "Same name defined twice?"
 
@@ -536,15 +519,17 @@ class GrammarLoader:
         rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()]
         d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
         rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
-        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
+        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
         parser_conf = ParserConf(rules, callback, 'start')
         self.parser = LALR(lexer_conf, parser_conf)
 
-        self.simplify_tree = SimplifyTree()
+        self.canonize_tree = CanonizeTree()
 
     def load_grammar(self, grammar_text, name='<?>'):
         "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
 
         try:
-            tree = self.simplify_tree.transform( self.parser.parse(grammar_text+'\n') )
+            tree = self.canonize_tree.transform( self.parser.parse(grammar_text+'\n') )
         except UnexpectedInput as e:
             raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
         except UnexpectedToken as e:
@@ -590,23 +575,9 @@ class GrammarLoader:
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
 
         # Handle ignore tokens
-        ignore_names = []
-        for i, t in enumerate(ignore):
-            if t.data == 'expansions' and len(t.children) == 1:
-                x ,= t.children
-                if x.data == 'expansion' and len(x.children) == 1:
-                    item ,= x.children
-                    if isinstance(item, Token) and item.type == 'TOKEN':
-                        # XXX is this really a wise solution? -- Erez
-                        ignore_names.append(item.value)
-                        continue
-
-            name = '__IGNORE_%d'%i
-            token_defs.append((name, t))
-            ignore_names.append(name)
-
-        # Resolve token references
-        resolve_token_references(token_defs)
+        ignore_defs = [('__IGNORE_%d'%i, t) for i, t in enumerate(ignore)]
+        ignore_names = [name for name,_ in ignore_defs]
+        token_defs += ignore_defs
 
         # Verify correctness 2
         token_names = set()
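
A worked sketch of the new, shorter %ignore handling, with strings standing in for the real expression trees. The observable change: every ignored expression now gets a synthetic __IGNORE_<n> terminal, whereas the old code kept the original name when the ignored expression was a single token reference.

    ignore = ['ws_tree', 'comment_tree']              # stand-ins for the parsed %ignore expressions
    token_defs = [('NUMBER', 'number_tree')]

    ignore_defs = [('__IGNORE_%d' % i, t) for i, t in enumerate(ignore)]
    ignore_names = [name for name, _ in ignore_defs]
    token_defs += ignore_defs

    assert ignore_names == ['__IGNORE_0', '__IGNORE_1']
    assert token_defs == [('NUMBER', 'number_tree'),
                          ('__IGNORE_0', 'ws_tree'),
                          ('__IGNORE_1', 'comment_tree')]
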
@@ -615,6 +586,9 @@ class GrammarLoader:
                 raise GrammarError("Token '%s' defined more than once" % name)
             token_names.add(name)
 
+        # Resolve token references
+        resolve_token_references(token_defs)
+
         rules = [RuleOptions.from_rule(*x) for x in rule_defs]
 
         rule_names = set()
@@ -638,7 +612,7 @@ class GrammarLoader:
 
         # TODO don't include unused tokens, they can only cause trouble!
 
-        return Grammar(rules, token_defs, {'ignore': ignore_names})
+        return Grammar(rules, token_defs, ignore_names)
 
 
lark/tree.py (+17 lines)
@@ -104,6 +104,23 @@ class Transformer(object):
     def __default__(self, data, children):
         return Tree(data, children)
 
+    def __mul__(self, other):
+        return TransformerChain(self, other)
+
+
+class TransformerChain(object):
+    def __init__(self, *transformers):
+        self.transformers = transformers
+
+    def transform(self, tree):
+        for t in self.transformers:
+            tree = t.transform(tree)
+        return tree
+
+    def __mul__(self, other):
+        return TransformerChain(*self.transformers + (other,))
+
+
 class InlineTransformer(Transformer):
     def _get_func(self, name): # use super()._get_func
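
A standalone sketch of the chaining idiom this adds. The stand-in "transformers" below operate on plain integers just to make the order of application visible; lark's real Transformer walks a parse tree and dispatches on node names.

    class TransformerChain(object):                 # same as the code added above
        def __init__(self, *transformers):
            self.transformers = transformers

        def transform(self, tree):
            for t in self.transformers:
                tree = t.transform(tree)
            return tree

        def __mul__(self, other):
            return TransformerChain(*self.transformers + (other,))


    class AddOne(object):                           # stand-in transformer
        def transform(self, value):
            return value + 1
        def __mul__(self, other):
            return TransformerChain(self, other)


    class Double(object):                           # stand-in transformer
        def transform(self, value):
            return value * 2
        def __mul__(self, other):
            return TransformerChain(self, other)


    pipeline = AddOne() * Double()          # TransformerChain(AddOne(), Double())
    assert pipeline.transform(3) == 8       # (3 + 1) * 2
    pipeline = pipeline * AddOne()          # chaining a chain keeps extending it
    assert pipeline.transform(3) == 9       # ((3 + 1) * 2) + 1

This is what lets load_grammar build its pipelines with transformer = PrepareLiterals() * TokenTreeToPattern() and transformer *= SplitLiterals() instead of calling each transformer by hand.
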
@@ -432,23 +432,6 @@ def _make_parser_test(LEXER, PARSER):
             x = g.parse('aaaab')
             x = g.parse('b')
 
-        @unittest.skipIf(LEXER is None, "Regexps >1 not supported with scanless parsing")
-        def test_regex_embed(self):
-            g = _Lark("""start: A B C
-                         A: /a/
-                         B: /${A}b/
-                         C: /${B}c/
-                         """)
-            x = g.parse('aababc')
-
-        def test_token_embed(self):
-            g = _Lark("""start: A B C
-                         A: "a"
-                         B: A "b"
-                         C: B "c"
-                         """)
-            x = g.parse('aababc')
-
        @unittest.skipIf(LEXER in (None, 'dynamic'), "Known bug with scanless parsing") # TODO
        def test_token_not_anon(self):
            """Tests that "a" is matched as A, rather than an anonymous token.
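
The two deleted tests cover the deprecated /${A}b/ regexp embedding and the equivalent plain token-reference style. For reference, the two grammar forms side by side, shown as data only (taken verbatim from the removed tests; whether a given lark version still accepts the ${...} form depends on that version):

    # Deprecated style from the removed test_regex_embed: regexps spliced via ${...}
    REGEX_EMBED_GRAMMAR = """
    start: A B C
    A: /a/
    B: /${A}b/
    C: /${B}c/
    """

    # Equivalent style from the removed test_token_embed: plain token references
    TOKEN_REF_GRAMMAR = """
    start: A B C
    A: "a"
    B: A "b"
    C: B "c"
    """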