mirror of https://github.com/lark-parser/lark.git
More refactoring towards standalone
parent 07b5469e86
commit da1910f5b6
@@ -4,12 +4,18 @@ import sys

Py36 = (sys.version_info[:2] >= (3, 6))


###{standalone
def is_terminal(sym):
    return sym.isupper()

class GrammarError(Exception):
    pass

class ParseError(Exception):
    pass
###}

class UnexpectedToken(ParseError):
    def __init__(self, token, expected, seq, index):
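The `###{standalone` / `###}` pairs added throughout this commit mark the regions that the standalone generator is meant to copy into a single self-contained parser file. As a rough illustration only (a hypothetical scanner, not the actual tool), such sections could be collected like this:

    def extract_sections(source, name='standalone'):
        # Keep only the lines between '###{<name>' and the next '###}' (sketch).
        out, keep = [], False
        for line in source.splitlines(keepends=True):
            stripped = line.strip()
            if stripped == '###{' + name:
                keep = True
            elif stripped == '###}':
                keep = False
            elif keep:
                out.append(line)
        return ''.join(out)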
@@ -32,9 +38,6 @@ class UnexpectedToken(ParseError):


def is_terminal(sym):
    return sym.isupper()

class LexerConf:
    def __init__(self, tokens, ignore=(), postlex=None):
@@ -166,8 +166,8 @@ class Lark:
    def _build_parser(self):
        self.parser_class = get_frontend(self.options.parser, self.options.lexer)

        self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
        callback = self.parse_tree_builder.apply(self.options.transformer)
        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
        callback = self._parse_tree_builder.create_callback(self.options.transformer)
        if self.profiler:
            for f in dir(callback):
                if not (f.startswith('__') and f.endswith('__')):
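The visible API change in this hunk is the rename of `ParseTreeBuilder.apply` to `ParseTreeBuilder.create_callback` (the attribute also gains a leading underscore). A hedged sketch of the new call pattern, mirroring how the GrammarLoader hunk further down uses it; `rules`, `Tree`, and `ParserConf` stand in for the values built elsewhere:

    builder = ParseTreeBuilder(rules, Tree)
    callback = builder.create_callback()      # previously: builder.apply()
    parser_conf = ParserConf(rules, callback, 'start')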
lark/lexer.py | 113
@@ -5,6 +5,7 @@ import re

from .utils import Str, classify
from .common import is_terminal, PatternStr, PatternRE, TokenDef

###{standalone
class LexError(Exception):
    pass
@@ -48,10 +49,60 @@ class Token(Str):

    __hash__ = Str.__hash__

class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags

class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 0
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos

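As a quick sanity check of the bookkeeping in `LineCounter.feed` (not part of the diff), the counters evolve like this:

    lc = LineCounter()
    lc.feed("foo\nbar")                  # one newline; 'bar' starts line 2
    assert (lc.line, lc.column, lc.char_pos) == (2, 3, 7)
    lc.feed("baz", test_newline=False)   # safe only because "baz" has no newline
    assert (lc.line, lc.column) == (2, 6)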
class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer):
        self.lexer = lexer

    def lex(self, stream, newline_types, ignore_types):
        newline_types = list(newline_types)
        line_ctr = LineCounter()

        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        lexer = yield t

                    line_ctr.feed(value, type_ in newline_types)
                    break
            else:
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                break
###}


def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
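The `_Lex` generator re-reads `self.lexer` at the top of every loop iteration, which is what lets `ContextualLexer` (see the hunk below) swap token sets between tokens by reassigning `l.lexer`. A hypothetical driver sketching that usage; `lexers_by_state` and `get_parser_state` are made-up names, not part of the library:

    def contextual_lex(lexers_by_state, get_parser_state, stream, newline_types, ignore_types):
        # Start with the lexer for the current parser state, then re-point
        # l.lexer after each token so the next match uses the new state's tokens.
        l = _Lex(lexers_by_state[get_parser_state()])
        for token in l.lex(stream, newline_types, ignore_types):
            yield token
            l.lexer = lexers_by_state[get_parser_state()]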
@@ -182,57 +233,3 @@ class ContextualLexer:

            l.lexer = self.lexers[self.parser_state]


###{lexer

class LineCounter:
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 0
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if token doesn't contain a newline.
        """
        if test_newline:
            newlines = token.count(self.newline_char)
            if newlines:
                self.line += newlines
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos

class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer):
        self.lexer = lexer

    def lex(self, stream, newline_types, ignore_types):
        newline_types = list(newline_types)
        line_ctr = LineCounter()

        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        lexer = yield t

                    line_ctr.feed(value, type_ in newline_types)
                    break
            else:
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                break
###}
@@ -128,7 +128,7 @@ RULES = {

class EBNF_to_BNF(InlineTransformer):
    def __init__(self):
        self.new_rules = {}
        self.new_rules = []
        self.rules_by_expr = {}
        self.prefix = 'anon'
        self.i = 0
@@ -141,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer):
        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
        tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        self.new_rules.append((new_name, tree, self.rule_options))
        self.rules_by_expr[expr] = t
        return t
@@ -390,12 +391,6 @@ def _interleave(l, item):

def _choice_of_rules(rules):
    return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])

def dict_update_safe(d1, d2):
    for k, v in d2.items():
        assert k not in d1
        d1[k] = v


class Grammar:
    def __init__(self, rule_defs, token_defs, ignore):
        self.token_defs = token_defs
@@ -468,38 +463,41 @@ class Grammar:
        # =================
        # Compile Rules
        # =================
        ebnf_to_bnf = EBNF_to_BNF()
        simplify_rule = SimplifyRule_Visitor()

        # 1. Pre-process terminals
        transformer = PrepareLiterals()
        if not lexer:
            transformer *= SplitLiterals()
        transformer *= ExtractAnonTokens(tokens)   # Adds to tokens

        rules = {}
        # 2. Convert EBNF to BNF (and apply step 1)
        ebnf_to_bnf = EBNF_to_BNF()
        rules = []
        for name, rule_tree, options in rule_defs:
            assert name not in rules, name
            ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
            tree = transformer.transform(rule_tree)
            rules[name] = ebnf_to_bnf.transform(tree), options
            rules.append((name, ebnf_to_bnf.transform(tree), options))
        rules += ebnf_to_bnf.new_rules

        dict_update_safe(rules, ebnf_to_bnf.new_rules)
        assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"

        # 3. Compile tree to Rule objects
        rule_tree_to_text = RuleTreeToText()

        new_rules = []
        for origin, (tree, options) in rules.items():
        simplify_rule = SimplifyRule_Visitor()
        compiled_rules = []
        for name, tree, options in rules:
            simplify_rule.visit(tree)
            expansions = rule_tree_to_text.transform(tree)

            for expansion, alias in expansions:
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
                if alias and name.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))

                rule = Rule(origin, expansion, alias, options)
                new_rules.append(rule)
                rule = Rule(name, expansion, alias, options)
                compiled_rules.append(rule)

        return tokens, new_rules, self.ignore
        return tokens, compiled_rules, self.ignore
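Taken together with the EBNF_to_BNF change above, rules now travel through compilation as an ordered list of `(name, tree, options)` tuples instead of a name-keyed dict, with the name-collision assert replacing `dict_update_safe`. A toy illustration of the new shape, using placeholder values rather than real grammar trees:

    rules = [('start', '<tree>', None)]
    rules += [('__anon_star_0', '<tree>', None)]    # i.e. ebnf_to_bnf.new_rules
    assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"
    for name, tree, options in rules:
        print(name, tree, options)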
@@ -557,7 +555,7 @@ class GrammarLoader:

        rules = [options_from_rule(name, x) for name, x in RULES.items()]
        rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
        callback = ParseTreeBuilder(rules, T).apply()
        callback = ParseTreeBuilder(rules, T).create_callback()
        lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])

        parser_conf = ParserConf(rules, callback, 'start')
@@ -3,6 +3,8 @@ from .utils import suppress

from .lexer import Token
from .grammar import Rule

###{standalone

class NodeBuilder:
    def __init__(self, tree_class, name):
        self.tree_class = tree_class
@@ -130,7 +132,7 @@ class ParseTreeBuilder:
            yield rule, wrapper_chain


    def apply(self, transformer=None):
    def create_callback(self, transformer=None):
        callback = Callback()

        for rule, wrapper_chain in self.rule_builders:
@@ -152,3 +154,5 @@ class ParseTreeBuilder:
            setattr(callback, internal_callback_name, f)

        return callback

###}
@@ -3,7 +3,7 @@
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

from ..common import ParseError, UnexpectedToken
from ..common import UnexpectedToken

from .lalr_analysis import LALR_Analyzer, Shift
@@ -20,6 +20,8 @@ class Parser:
        self.parser = _Parser(analysis.parse_table, callbacks)
        self.parse = self.parser.parse

###{standalone

class _Parser:
    def __init__(self, parse_table, callbacks):
        self.states = parse_table.states
@@ -90,3 +92,5 @@ class _Parser:
                return val
            else:
                reduce(arg)

###}
@@ -7,6 +7,7 @@ from copy import deepcopy

from .utils import inline_args

###{standalone
class Tree(object):
    def __init__(self, data, children):
        self.data = data
@@ -33,6 +34,7 @@ class Tree(object):

    def pretty(self, indent_str=' '):
        return ''.join(self._pretty(0, indent_str))
###}

    def expand_kids_by_index(self, *indices):
        for i in sorted(indices, reverse=True):   # reverse so that changing tail won't affect indices
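A small usage illustration of `pretty()` (not part of the diff), assuming `Tree` and `Token` live at the module paths laid out in this commit (`lark.tree`, `lark.lexer`):

    from lark.tree import Tree
    from lark.lexer import Token

    t = Tree('start', [Tree('item', [Token('NAME', 'x')]), Token('NUMBER', '3')])
    print(t.pretty())   # indented textual rendering of the tree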
@@ -138,7 +140,7 @@ class TransformerChain(object):

    def __mul__(self, other):
        return TransformerChain(*self.transformers + (other,))




class InlineTransformer(Transformer):