diff --git a/examples/indented_tree.py b/examples/indented_tree.py index b633cdd..0a132a1 100644 --- a/examples/indented_tree.py +++ b/examples/indented_tree.py @@ -18,11 +18,10 @@ tree_grammar = r""" %import common.CNAME -> NAME %import common.WS_INLINE + %declare _INDENT _DEDENT %ignore WS_INLINE _NL: /(\r?\n[\t ]*)+/ - _INDENT: "" - _DEDENT: "" """ class TreeIndenter(Indenter): diff --git a/lark/lexer.py b/lark/lexer.py index 51ccf6c..6502535 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -234,7 +234,7 @@ class ContextualLexer: lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(ignore) | set(always_accept) - state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] + state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) lexer_by_tokens[key] = lexer diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 3aa9827..f5a0be8 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -22,7 +22,7 @@ IMPORT_PATHS = [os.path.join(__path__, 'grammars')] _RE_FLAGS = 'imslux' -_TOKEN_NAMES = { +_TERMINAL_NAMES = { '.' : 'DOT', ',' : 'COMMA', ':' : 'COLON', @@ -62,7 +62,7 @@ _TOKEN_NAMES = { } # Grammar Parser -TOKENS = { +TERMINALS = { '_LPAR': r'\(', '_RPAR': r'\)', '_LBRA': r'\[', @@ -73,7 +73,7 @@ TOKENS = { '_DOT': r'\.', 'TILDE': '~', 'RULE': '!?[_?]?[a-z][_a-z0-9]*', - 'TOKEN': '_?[A-Z][_A-Z0-9]*', + 'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', @@ -81,6 +81,7 @@ TOKENS = { 'COMMENT': r'//[^\n]*', '_TO': '->', '_IGNORE': r'%ignore', + '_DECLARE': r'%declare', '_IMPORT': r'%import', 'NUMBER': r'\d+', } @@ -116,22 +117,24 @@ RULES = { 'literal', 'range'], - 'terminal': ['TOKEN'], + 'terminal': ['TERMINAL'], 'nonterminal': ['RULE'], - '?name': ['RULE', 'TOKEN'], + '?name': ['RULE', 'TERMINAL'], 'maybe': ['_LBRA expansions _RBRA'], 'range': ['STRING _DOT _DOT STRING'], - 'token': ['TOKEN _COLON expansions _NL', - 'TOKEN _DOT NUMBER _COLON expansions _NL'], - 'statement': ['ignore', 'import'], + 'token': ['TERMINAL _COLON expansions _NL', + 'TERMINAL _DOT NUMBER _COLON expansions _NL'], + 'statement': ['ignore', 'import', 'declare'], 'ignore': ['_IGNORE expansions _NL'], + 'declare': ['_DECLARE _declare_args _NL'], 'import': ['_IMPORT import_args _NL', - '_IMPORT import_args _TO TOKEN'], + '_IMPORT import_args _TO TERMINAL _NL'], 'import_args': ['_import_args'], '_import_args': ['name', '_import_args _DOT name'], + '_declare_args': ['name', '_declare_args name'], 'literal': ['REGEXP', 'STRING'], } @@ -278,7 +281,7 @@ class PrepareAnonTerminals(InlineTransformer): except KeyError: # Try to assign an indicative anon-token name try: - token_name = _TOKEN_NAMES[value] + token_name = _TERMINAL_NAMES[value] except KeyError: if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set: with suppress(UnicodeEncodeError): @@ -302,7 +305,7 @@ class PrepareAnonTerminals(InlineTransformer): self.token_reverse[p] = tokendef self.tokens.append(tokendef) - return Terminal(Token('TOKEN', token_name, -1), filter_out=isinstance(p, PatternStr)) + return Terminal(Token('TERMINAL', token_name, -1), filter_out=isinstance(p, PatternStr)) def _rfind(s, choices): @@ -376,7 +379,7 @@ class TokenTreeToPattern(Transformer): return items[0] if len({i.flags for i in items}) > 1: raise GrammarError("Lark doesn't support joining tokens with conflicting flags!") - return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags) + return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ()) def expansions(self, exps): if len(exps) == 1: @@ -412,7 +415,7 @@ class PrepareSymbols(Transformer): return v elif v.type == 'RULE': return NonTerminal(v.value) - elif v.type == 'TOKEN': + elif v.type == 'TERMINAL': return Terminal(v.value, filter_out=v.startswith('_')) assert False @@ -435,8 +438,15 @@ class Grammar: # Convert token-trees to strings/regexps transformer = PrepareLiterals() * TokenTreeToPattern() + for name, (token_tree, priority) in token_defs: + if token_tree is None: # Terminal added through %declare + continue + expansions = list(token_tree.find_data('expansion')) + if len(expansions) == 1 and not expansions[0].children: + raise GrammarError("Terminals cannot be empty (%s)" % name) + tokens = [TokenDef(name, transformer.transform(token_tree), priority) - for name, (token_tree, priority) in token_defs] + for name, (token_tree, priority) in token_defs if token_tree] # ================= # Compile Rules @@ -500,12 +510,14 @@ def resolve_token_references(token_defs): while True: changed = False for name, (token_tree, _p) in token_defs: + if token_tree is None: # Terminal added through %declare + continue for exp in token_tree.find_data('value'): item ,= exp.children if isinstance(item, Token): if item.type == 'RULE': - raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name)) - if item.type == 'TOKEN': + raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name)) + if item.type == 'TERMINAL': exp.children[0] = token_dict[item] changed = True if not changed: @@ -539,7 +551,7 @@ class PrepareGrammar(InlineTransformer): class GrammarLoader: def __init__(self): - tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] + tokens = [TokenDef(name, PatternRE(value)) for name, value in TERMINALS.items()] rules = [options_from_rule(name, x) for name, x in RULES.items()] rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), None, o) for r, xs, o in rules for x in xs] @@ -591,6 +603,7 @@ class GrammarLoader: # Execute statements ignore = [] + declared = [] for (stmt,) in statements: if stmt.data == 'ignore': t ,= stmt.children @@ -603,6 +616,9 @@ class GrammarLoader: token_options = dict(g.token_defs)[dotted_path[-1]] assert isinstance(token_options, tuple) and len(token_options)==2 token_defs.append([name.value, token_options]) + elif stmt.data == 'declare': + for t in stmt.children: + token_defs.append([t.value, (None, None)]) else: assert False, stmt @@ -613,7 +629,7 @@ class GrammarLoader: raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name) # Handle ignore tokens - # XXX A slightly hacky solution. Recognition of %ignore TOKEN as separate comes from the lexer's + # XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's # inability to handle duplicate tokens (two names, one value) ignore_names = [] for t in ignore: @@ -623,7 +639,7 @@ class GrammarLoader: item ,= t2.children if item.data == 'value': item ,= item.children - if isinstance(item, Token) and item.type == 'TOKEN': + if isinstance(item, Token) and item.type == 'TERMINAL': ignore_names.append(item.value) continue @@ -656,7 +672,7 @@ class GrammarLoader: for name, expansions, _o in rules: used_symbols = {t for x in expansions.find_data('expansion') - for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))} + for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} for sym in used_symbols: if is_terminal(sym): if sym not in token_names: