Improved lexer, added profiler option to Lark

This commit is contained in:
Erez Shinan 2017-02-10 11:50:50 +02:00
parent 0d48385721
commit 8b9c5801da
5 changed files with 101 additions and 40 deletions

View File

@ -1,13 +1,13 @@
# Lark - a modern pure-Python parsing library
# Lark - a modern parsing library
Lark is a modern general-purpose Python parsing library, that focuses on simplicity and power.
Lark is a modern general-purpose parsing library for Python.
Lark accepts grammars as EBNF and lets you choose between two parsing algorithms:
Lark focuses on simplicity and power. It lets you choose between two parsing algorithms:
- Earley : Parses all context-free grammars (even ambiguous ones)!
- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
- LALR(1): Only LR grammars. Outperforms PLY and most if not all other pure-python parsing libraries.
Both algorithms are pure-python implementations and can be used interchangably (aside for algorithmic restrictions).
Both algorithms are written in Python and can be used interchangeably with the same grammar (aside from algorithmic restrictions). See "Comparison to other parsers" for more details.
Lark can automagically build an AST from your grammar, without any more code on your part.
@ -41,10 +41,12 @@ Tree(start, [Token(WORD, Hello), Token(WORD, World)])
Notice punctuation doesn't appear in the resulting tree. It's automatically filtered away by Lark.
To learn more about Lark:
- Learn how to parse json at the [tutorial](/docs/json_tutorial.md)
## Learn more about using Lark
## Features
- Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark.
- Browse the [examples](/examples), which include a calculator and a Python-code parser.
## List of Features
- EBNF grammar with a little extra
- Earley & LALR(1)

View File

@ -0,0 +1 @@
from .lark import Lark, Transformer

View File

@ -39,6 +39,7 @@ class LarkOptions(object):
self.parser = o.pop('parser', 'earley')
self.transformer = o.pop('transformer', None)
self.start = o.pop('start', 'start')
self.profile = o.pop('profile', False) # XXX new
assert self.parser in ENGINE_DICT
if self.parser == 'earley' and self.transformer:
@ -50,6 +51,30 @@ class LarkOptions(object):
raise ValueError("Unknown options: %s" % o.keys())
import time
from collections import defaultdict
class Profiler:
    """Accumulate wall-clock time per named section.

    Exactly one section is "current" at any moment. The time elapsed between
    two consecutive `enter_section` calls is credited to whichever section
    was current during that interval.
    """

    def __init__(self):
        # Section name -> total seconds credited to it so far.
        self.total_time = defaultdict(float)
        # The constructor itself counts as the first section.
        self.cur_section = '__init__'
        self.last_enter_time = time.time()

    def enter_section(self, name):
        """Credit the elapsed time to the current section, then switch to `name`."""
        now = time.time()
        elapsed = now - self.last_enter_time
        self.total_time[self.cur_section] += elapsed
        self.last_enter_time = now
        self.cur_section = name

    def make_wrapper(self, name, f):
        """Return a callable that runs `f` while section `name` is current.

        The section that was current when the wrapper is called is restored
        afterwards, even if `f` raises.
        """
        def _f(*args, **kwargs):
            previous = self.cur_section
            self.enter_section(name)
            try:
                result = f(*args, **kwargs)
            finally:
                self.enter_section(previous)
            return result
        return _f
class Lark:
@ -82,6 +107,8 @@ class Lark:
if self.options.cache_grammar:
raise NotImplementedError("Not available yet")
self.profiler = Profiler() if self.options.profile else None
self.tokens, self.rules = load_grammar(grammar)
self.lexer = self._build_lexer()
@ -90,6 +117,9 @@ class Lark:
self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
self.parser = self._build_parser()
if self.profiler: self.profiler.enter_section('outside_lark')
def _create_unless_callback(self, strs):
def f(t):
if t in strs:
@ -105,8 +135,6 @@ class Lark:
for flag in flags:
if flag == 'ignore':
ignore_tokens.append(name)
elif flag == 'newline':
pass # TODO
elif isinstance(flag, tuple) and flag[0] == 'unless':
_, strs = flag
callbacks[name] = self._create_unless_callback(strs)
@ -119,6 +147,10 @@ class Lark:
def _build_parser(self):
rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
if self.profiler:
for f in dir(callback):
if not f.startswith('__'):
setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
return self.parser_engine.build_parser(rules, callback, self.options.start)
@ -133,6 +165,16 @@ class Lark:
def parse(self, text):
assert not self.options.only_lex
l = list(self.lex(text))
return self.parser.parse(l)
if self.profiler:
self.profiler.enter_section('lex')
l = list(self.lex(text))
self.profiler.enter_section('parse')
try:
return self.parser.parse(l)
finally:
self.profiler.enter_section('outside_lark')
else:
l = list(self.lex(text))
return self.parser.parse(l)

View File

@ -1,5 +1,7 @@
## Lexer Implementation
import re
from .utils import Str
class LexError(Exception):
@ -13,13 +15,6 @@ class Token(Str):
inst.value = value
return inst
# class Token(object):
# def __init__(self, type, value, lexpos):
# self.type = type
# self.value = value
# self.lexpos = lexpos
def __repr__(self):
    """Render the token as ``Token(<type>, <value>)``."""
    fields = (self.type, self.value)
    return 'Token(%s, %s)' % fields
@ -29,12 +24,11 @@ class Regex:
self.flags = flags
import re
LIMIT = 50 # Stupid named groups limit in python re
class Lexer(object):
def __init__(self, tokens, callbacks, ignore=()):
self.ignore = ignore
self.newline_char = '\n'
tokens = list(tokens)
# Sanitization
token_names = {t[0] for t in tokens}
@ -49,42 +43,57 @@ class Lexer(object):
self.tokens = tokens
self.callbacks = callbacks
# self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
self.token_types = list(token_names)
self.type_index = {name:i for i,name in enumerate(self.token_types)}
self.mres = []
self.name_from_index = []
x = list(tokens)
while x:
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT]))
self.mres.append(mre)
self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} )
x = x[LIMIT:]
self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
self.ignore_types = [self.type_index[t] for t in ignore]
self.mres = self._build_mres(tokens, len(tokens))
def _build_mres(self, tokens, max_size):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries less and less groups until it's successful.
mres = []
while tokens:
try:
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
except AssertionError: # Yes, this is what Python provides us.. :/
return self._build_mres(tokens, max_size/2)
mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
tokens = tokens[max_size:]
return mres
def lex(self, stream):
lex_pos = 0
line = 1
col_start_pos = 0
newline_types = list(self.newline_types)
ignore_types = list(self.ignore_types)
while True:
i = 0
for mre in self.mres:
for mre, type_from_index in self.mres:
m = mre.match(stream, lex_pos)
if m:
value = m.group(0)
type_ = self.name_from_index[i][m.lastindex]
if type_ not in self.ignore:
t = Token(type_, value, lex_pos)
type_num = type_from_index[m.lastindex]
if type_num not in ignore_types:
t = Token(self.token_types[type_num], value, lex_pos)
t.line = line
t.column = lex_pos - col_start_pos
if t.type in self.callbacks:
t = self.callbacks[t.type](t)
yield t
newlines = value.count(self.newline_char)
if newlines:
line += newlines
col_start_pos = lex_pos + value.rindex(self.newline_char)
if type_num in newline_types:
newlines = value.count(self.newline_char)
if newlines:
line += newlines
col_start_pos = lex_pos + value.rindex(self.newline_char)
lex_pos += len(value)
break
i += 1
else:
if lex_pos < len(stream):
context = stream[lex_pos:lex_pos+5]

View File

@ -334,6 +334,13 @@ def _make_parser_test(PARSER):
x = g.parse('a')
self.assertEqual(x.data, "b")
def test_lexer_token_limit(self):
    "Python's re module limits a regular expression to 100 groups. Build a grammar with 300 tokens to check that we handle this limitation"
    tokens = {}
    for i in range(300):
        tokens['A%d' % i] = '"%d"' % i
    start_rule = ' '.join(tokens)
    token_defs = '\n'.join("%s: %s" % x for x in tokens.items())
    g = _Lark("start: %s\n%s" % (start_rule, token_defs))
_NAME = "Test" + PARSER.capitalize()
_TestParser.__name__ = _NAME
globals()[_NAME] = _TestParser