Added %declare

Erez Shinan 2018-05-27 00:04:11 +03:00
parent 2b4ef11ebf
commit ac0d49e7ab
3 changed files with 38 additions and 23 deletions

View File

@@ -18,11 +18,10 @@ tree_grammar = r"""
     %import common.CNAME -> NAME
     %import common.WS_INLINE
+    %declare _INDENT _DEDENT
     %ignore WS_INLINE

     _NL: /(\r?\n[\t ]*)+/
-    _INDENT: "<INDENT>"
-    _DEDENT: "<DEDENT>"
 """

 class TreeIndenter(Indenter):
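The removed _INDENT/_DEDENT string definitions are replaced by %declare: the terminals still exist for the parser, but a post-lexer produces them instead of the lexer matching text. A minimal runnable sketch of that wiring (condensed from Lark's indented-tree example; the sample input is illustrative):

# Sketch, not part of the diff: %declare'd terminals are injected by a
# post-lexer rather than matched by the lexer.
from lark import Lark
from lark.indenter import Indenter

tree_grammar = r"""
    ?start: _NL* tree
    tree: NAME _NL [_INDENT tree+ _DEDENT]

    %import common.CNAME -> NAME
    %import common.WS_INLINE
    %declare _INDENT _DEDENT
    %ignore WS_INLINE

    _NL: /(\r?\n[\t ]*)+/
"""

class TreeIndenter(Indenter):
    NL_type = '_NL'             # terminal that carries newline + indentation
    OPEN_PAREN_types = []       # no brackets suppress indentation here
    CLOSE_PAREN_types = []
    INDENT_type = '_INDENT'     # the %declare'd terminals this post-lexer emits
    DEDENT_type = '_DEDENT'
    tab_len = 8

parser = Lark(tree_grammar, parser='lalr', postlex=TreeIndenter())
print(parser.parse("a\n  b\n  c\n").pretty())

Because the declared terminals have no pattern, the lexer never matches them; TreeIndenter emits them based on indentation changes around _NL tokens.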

View File

@@ -234,7 +234,7 @@ class ContextualLexer:
                 lexer = lexer_by_tokens[key]
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
-                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END']
+                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                 lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                 lexer_by_tokens[key] = lexer
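The old filter assumed every accepted terminal had a lexer definition; with %declare that no longer holds. A hypothetical illustration (stand-in values, not library code) of why membership in tokens_by_name is the right test:

# A %declare'd terminal such as '_INDENT' can appear in a parse state's
# accepts set, but it has no pattern and thus no entry in tokens_by_name.
# Testing membership skips declared terminals and $END without a KeyError.
tokens_by_name = {'NAME': object(), '_NL': object()}
accepts = {'NAME', '_NL', '_INDENT', '$END'}   # '_INDENT' comes from %declare
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
assert len(state_tokens) == 2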

View File

@@ -22,7 +22,7 @@ IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
 _RE_FLAGS = 'imslux'

-_TOKEN_NAMES = {
+_TERMINAL_NAMES = {
     '.' : 'DOT',
     ',' : 'COMMA',
     ':' : 'COLON',
@@ -62,7 +62,7 @@ _TOKEN_NAMES = {
 }

 # Grammar Parser
-TOKENS = {
+TERMINALS = {
     '_LPAR': r'\(',
     '_RPAR': r'\)',
     '_LBRA': r'\[',
@@ -73,7 +73,7 @@ TOKENS = {
     '_DOT': r'\.',
     'TILDE': '~',
     'RULE': '!?[_?]?[a-z][_a-z0-9]*',
-    'TOKEN': '_?[A-Z][_A-Z0-9]*',
+    'TERMINAL': '_?[A-Z][_A-Z0-9]*',
     'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
     'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS,
     '_NL': r'(\r?\n)+\s*',
@@ -81,6 +81,7 @@ TOKENS = {
     'COMMENT': r'//[^\n]*',
     '_TO': '->',
     '_IGNORE': r'%ignore',
+    '_DECLARE': r'%declare',
     '_IMPORT': r'%import',
     'NUMBER': r'\d+',
 }
@@ -116,22 +117,24 @@ RULES = {
               'literal',
               'range'],

-    'terminal': ['TOKEN'],
+    'terminal': ['TERMINAL'],
     'nonterminal': ['RULE'],

-    '?name': ['RULE', 'TOKEN'],
+    '?name': ['RULE', 'TERMINAL'],

     'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOT _DOT STRING'],

-    'token': ['TOKEN _COLON expansions _NL',
-              'TOKEN _DOT NUMBER _COLON expansions _NL'],
+    'token': ['TERMINAL _COLON expansions _NL',
+              'TERMINAL _DOT NUMBER _COLON expansions _NL'],

-    'statement': ['ignore', 'import'],
+    'statement': ['ignore', 'import', 'declare'],
     'ignore': ['_IGNORE expansions _NL'],
+    'declare': ['_DECLARE _declare_args _NL'],
     'import': ['_IMPORT import_args _NL',
-               '_IMPORT import_args _TO TOKEN'],
+               '_IMPORT import_args _TO TERMINAL _NL'],

     'import_args': ['_import_args'],
     '_import_args': ['name', '_import_args _DOT name'],
+    '_declare_args': ['name', '_declare_args name'],
     'literal': ['REGEXP', 'STRING'],
 }
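Given these rules, a `%declare _INDENT _DEDENT` line should parse to a 'declare' tree whose children are the TERMINAL tokens ('?name' inlines its single child, and '_declare_args', being underscored, is inlined too). A small sketch constructing that shape by hand:

# Illustrative only: the expected parse shape of a %declare statement,
# built manually rather than by running the grammar parser.
from lark.tree import Tree
from lark.lexer import Token

declared = Tree('declare', [Token('TERMINAL', '_INDENT'),
                            Token('TERMINAL', '_DEDENT')])
print(declared.pretty())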
@@ -278,7 +281,7 @@ class PrepareAnonTerminals(InlineTransformer):
            except KeyError:
                # Try to assign an indicative anon-token name
                try:
-                    token_name = _TOKEN_NAMES[value]
+                    token_name = _TERMINAL_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.token_set:
                        with suppress(UnicodeEncodeError):
@@ -302,7 +305,7 @@ class PrepareAnonTerminals(InlineTransformer):
            self.token_reverse[p] = tokendef
            self.tokens.append(tokendef)

-        return Terminal(Token('TOKEN', token_name, -1), filter_out=isinstance(p, PatternStr))
+        return Terminal(Token('TERMINAL', token_name, -1), filter_out=isinstance(p, PatternStr))


 def _rfind(s, choices):
@@ -376,7 +379,7 @@ class TokenTreeToPattern(Transformer):
             return items[0]
         if len({i.flags for i in items}) > 1:
             raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
-        return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags)
+        return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags if items else ())

     def expansions(self, exps):
         if len(exps) == 1:
@@ -412,7 +415,7 @@ class PrepareSymbols(Transformer):
             return v
         elif v.type == 'RULE':
             return NonTerminal(v.value)
-        elif v.type == 'TOKEN':
+        elif v.type == 'TERMINAL':
             return Terminal(v.value, filter_out=v.startswith('_'))
         assert False
@@ -435,8 +438,15 @@ class Grammar:
         # Convert token-trees to strings/regexps
         transformer = PrepareLiterals() * TokenTreeToPattern()
+        for name, (token_tree, priority) in token_defs:
+            if token_tree is None:  # Terminal added through %declare
+                continue
+            expansions = list(token_tree.find_data('expansion'))
+            if len(expansions) == 1 and not expansions[0].children:
+                raise GrammarError("Terminals cannot be empty (%s)" % name)
+
         tokens = [TokenDef(name, transformer.transform(token_tree), priority)
-                  for name, (token_tree, priority) in token_defs]
+                  for name, (token_tree, priority) in token_defs if token_tree]

         # =================
         #  Compile Rules
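The (None, None) entry that %declare produces (see the statement handling further down) acts as a sentinel: declared terminals have no token-tree to validate or compile. A minimal sketch of the convention, with a placeholder string standing in for a real token-tree:

# Sentinel convention: declared terminals are skipped by both the
# empty-terminal check and the TokenDef list comprehension.
token_defs = [
    ('NAME',    ('<tree>', 1)),    # ordinary terminal definition
    ('_INDENT', (None, None)),     # declared: supplied by a post-lexer
]
compiled = [name for name, (tree, _p) in token_defs if tree]
assert compiled == ['NAME']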
@@ -500,12 +510,14 @@ def resolve_token_references(token_defs):
     while True:
         changed = False
         for name, (token_tree, _p) in token_defs:
+            if token_tree is None:  # Terminal added through %declare
+                continue
             for exp in token_tree.find_data('value'):
                 item ,= exp.children
                 if isinstance(item, Token):
                     if item.type == 'RULE':
-                        raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name))
-                    if item.type == 'TOKEN':
+                        raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name))
+                    if item.type == 'TERMINAL':
                         exp.children[0] = token_dict[item]
                         changed = True
         if not changed:
@@ -539,7 +551,7 @@ class PrepareGrammar(InlineTransformer):
 class GrammarLoader:
     def __init__(self):
-        tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]
+        tokens = [TokenDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

         rules = [options_from_rule(name, x) for name, x in RULES.items()]
         rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), None, o) for r, xs, o in rules for x in xs]
@@ -591,6 +603,7 @@ class GrammarLoader:
         # Execute statements
         ignore = []
+        declared = []
         for (stmt,) in statements:
             if stmt.data == 'ignore':
                 t ,= stmt.children
@@ -603,6 +616,9 @@
                 token_options = dict(g.token_defs)[dotted_path[-1]]
                 assert isinstance(token_options, tuple) and len(token_options)==2
                 token_defs.append([name.value, token_options])
+            elif stmt.data == 'declare':
+                for t in stmt.children:
+                    token_defs.append([t.value, (None, None)])
             else:
                 assert False, stmt
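Executing the statement just registers each declared name with the (None, None) sentinel that resolve_token_references and Grammar.compile skip. Sketch of the effect, with plain strings standing in for the TERMINAL tokens:

# Stand-in values: the real children are TERMINAL tokens, and t.value
# is the declared terminal's name.
stmt_children = ['_INDENT', '_DEDENT']
token_defs = []
for t in stmt_children:
    token_defs.append([t, (None, None)])
assert token_defs == [['_INDENT', (None, None)], ['_DEDENT', (None, None)]]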
@@ -613,7 +629,7 @@
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

         # Handle ignore tokens
-        # XXX A slightly hacky solution. Recognition of %ignore TOKEN as separate comes from the lexer's
+        # XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
         #     inability to handle duplicate tokens (two names, one value)
         ignore_names = []
         for t in ignore:
@@ -623,7 +639,7 @@
                 item ,= t2.children
                 if item.data == 'value':
                     item ,= item.children
-                    if isinstance(item, Token) and item.type == 'TOKEN':
+                    if isinstance(item, Token) and item.type == 'TERMINAL':
                         ignore_names.append(item.value)
                         continue
@@ -656,7 +672,7 @@
         for name, expansions, _o in rules:
             used_symbols = {t for x in expansions.find_data('expansion')
-                              for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))}
+                              for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
             for sym in used_symbols:
                 if is_terminal(sym):
                     if sym not in token_names: