mirror of https://github.com/lark-parser/lark.git
Improved lexer, added profiler option to Lark
parent 0d48385721
commit 8b9c5801da
README.md (18 lines changed)
@@ -1,13 +1,13 @@
-# Lark - a modern pure-Python parsing library
+# Lark - a modern parsing library
 
-Lark is a modern general-purpose Python parsing library, that focuses on simplicity and power.
+Lark is a modern general-purpose parsing library for Python.
 
-Lark accepts grammars as EBNF and lets you choose between two parsing algorithms:
+Lark focuses on simplicity and power. It lets you choose between two parsing algorithms:
 
-- Earley : Parses all context-free grammars (even ambiguous ones)!
+- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
 - LALR(1): Only LR grammars. Outperforms PLY and most if not all other pure-python parsing libraries.
 
-Both algorithms are pure-python implementations and can be used interchangably (aside for algorithmic restrictions).
+Both algorithms are written in Python and can be used interchangably with the same grammar (aside for algorithmic restrictions). See "Comparison to other parsers" for more details.
 
 Lark can automagically build an AST from your grammar, without any more code on your part.
 
@@ -41,10 +41,12 @@ Tree(start, [Token(WORD, Hello), Token(WORD, World)])
 
 Notice punctuation doesn't appear in the resulting tree. It's automatically filtered away by Lark.
 
-To learn more about Lark:
-- Learn how to parse json at the [tutorial](/docs/json_tutorial.md)
+## Learn more about using Lark
 
-## Features
+- Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark.
+- Browse the [examples](/examples), which include a calculator, and a Python-code parser.
+
+## List of Features
 
 - EBNF grammar with a little extra
 - Earley & LALR(1)
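To make the README's description concrete, here is a hedged end-to-end sketch (not part of the commit). The "from lark import Lark" import works because of the new lark/__init__.py added just below; the grammar borrows the NAME: "literal" token style used by the new test_lexer_token_limit test at the bottom of this diff, and the expected output is modelled on the README's own Tree(start, [...]) example. The exact grammar syntax and tree repr of this early revision are assumptions.

from lark import Lark

# Hedged sketch: grammar style taken from the new test further down;
# it may not match this revision's grammar syntax exactly.
parser = Lark("""start: A B
                 A: "a"
                 B: "b"
              """)
tree = parser.parse("ab")
print(tree)   # expected to resemble: Tree(start, [Token(A, a), Token(B, b)])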
lark/__init__.py (new file, 1 line added)

@@ -0,0 +1 @@
+from .lark import Lark, Transformer
lark/lark.py (50 lines changed)
@@ -39,6 +39,7 @@ class LarkOptions(object):
         self.parser = o.pop('parser', 'earley')
         self.transformer = o.pop('transformer', None)
         self.start = o.pop('start', 'start')
+        self.profile = o.pop('profile', False)    # XXX new
 
         assert self.parser in ENGINE_DICT
         if self.parser == 'earley' and self.transformer:
@@ -50,6 +51,30 @@ class LarkOptions(object):
             raise ValueError("Unknown options: %s" % o.keys())
 
 
+import time
+from collections import defaultdict
+class Profiler:
+    def __init__(self):
+        self.total_time = defaultdict(float)
+        self.cur_section = '__init__'
+        self.last_enter_time = time.time()
+
+    def enter_section(self, name):
+        cur_time = time.time()
+        self.total_time[self.cur_section] += cur_time - self.last_enter_time
+        self.last_enter_time = cur_time
+        self.cur_section = name
+
+    def make_wrapper(self, name, f):
+        def _f(*args, **kwargs):
+            last_section = self.cur_section
+            self.enter_section(name)
+            try:
+                return f(*args, **kwargs)
+            finally:
+                self.enter_section(last_section)
+
+        return _f
 
 
 class Lark:
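For orientation, the Profiler added above charges elapsed wall-clock time to whichever section is currently active: each enter_section() call closes the previous section and opens the named one, and make_wrapper() temporarily switches sections for the duration of a wrapped call. With profile=True (see the following hunks), a Lark instance cycles between the 'lex', 'parse', 'transformer' and 'outside_lark' sections and accumulates the results in profiler.total_time. A minimal standalone sketch, assuming this commit's module layout for the import:

import time
from lark.lark import Profiler    # assumption: importable from lark/lark.py as added above

p = Profiler()                    # time starts accruing to the '__init__' section
p.enter_section('lex')
time.sleep(0.05)
p.enter_section('parse')
time.sleep(0.10)
p.enter_section('outside_lark')   # closes 'parse'
print(dict(p.total_time))         # roughly {'__init__': ~0.0, 'lex': ~0.05, 'parse': ~0.10}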
@@ -82,6 +107,8 @@ class Lark:
         if self.options.cache_grammar:
             raise NotImplementedError("Not available yet")
 
+        self.profiler = Profiler() if self.options.profile else None
+
         self.tokens, self.rules = load_grammar(grammar)
 
         self.lexer = self._build_lexer()
@@ -90,6 +117,9 @@ class Lark:
         self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
         self.parser = self._build_parser()
 
+        if self.profiler: self.profiler.enter_section('outside_lark')
+
+
     def _create_unless_callback(self, strs):
         def f(t):
             if t in strs:
@@ -105,8 +135,6 @@ class Lark:
             for flag in flags:
                 if flag == 'ignore':
                     ignore_tokens.append(name)
-                elif flag == 'newline':
-                    pass    # TODO
                 elif isinstance(flag, tuple) and flag[0] == 'unless':
                     _, strs = flag
                     callbacks[name] = self._create_unless_callback(strs)
@@ -119,6 +147,10 @@ class Lark:
 
     def _build_parser(self):
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
+        if self.profiler:
+            for f in dir(callback):
+                if not f.startswith('__'):
+                    setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
         return self.parser_engine.build_parser(rules, callback, self.options.start)
 
 
@@ -133,6 +165,16 @@ class Lark:
 
     def parse(self, text):
         assert not self.options.only_lex
-        l = list(self.lex(text))
-        return self.parser.parse(l)
+
+        if self.profiler:
+            self.profiler.enter_section('lex')
+            l = list(self.lex(text))
+            self.profiler.enter_section('parse')
+            try:
+                return self.parser.parse(l)
+            finally:
+                self.profiler.enter_section('outside_lark')
+        else:
+            l = list(self.lex(text))
+            return self.parser.parse(l)
 
lark/lexer.py

@@ -1,5 +1,7 @@
 ## Lexer Implementation
 
+import re
+
 from .utils import Str
 
 class LexError(Exception):
@@ -13,13 +15,6 @@ class Token(Str):
         inst.value = value
         return inst
 
-# class Token(object):
-#     def __init__(self, type, value, lexpos):
-#         self.type = type
-#         self.value = value
-#         self.lexpos = lexpos
-
-
     def __repr__(self):
         return 'Token(%s, %s)' % (self.type, self.value)
 
@@ -29,12 +24,11 @@ class Regex:
         self.flags = flags
 
-import re
-LIMIT = 50    # Stupid named groups limit in python re
+
 class Lexer(object):
     def __init__(self, tokens, callbacks, ignore=()):
         self.ignore = ignore
         self.newline_char = '\n'
         tokens = list(tokens)
 
         # Sanitization
         token_names = {t[0] for t in tokens}
@@ -49,42 +43,57 @@ class Lexer(object):
         self.tokens = tokens
         self.callbacks = callbacks
 
         # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
+        self.token_types = list(token_names)
+        self.type_index = {name:i for i,name in enumerate(self.token_types)}
 
-        self.mres = []
-        self.name_from_index = []
-        x = list(tokens)
-        while x:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT]))
-            self.mres.append(mre)
-            self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} )
-            x = x[LIMIT:]
+        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
+        self.ignore_types = [self.type_index[t] for t in ignore]
+
+        self.mres = self._build_mres(tokens, len(tokens))
+
+    def _build_mres(self, tokens, max_size):
+        # Python sets an unreasonable group limit (currently 100) in its re module
+        # Worse, the only way to know we reached it is by catching an AssertionError!
+        # This function recursively tries less and less groups until it's successful.
+        mres = []
+        while tokens:
+            try:
+                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
+            except AssertionError:  # Yes, this is what Python provides us.. :/
+                return self._build_mres(tokens, max_size/2)
+
+            mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
+            tokens = tokens[max_size:]
+        return mres
 
     def lex(self, stream):
         lex_pos = 0
         line = 1
         col_start_pos = 0
+        newline_types = list(self.newline_types)
+        ignore_types = list(self.ignore_types)
         while True:
-            i = 0
-            for mre in self.mres:
+            for mre, type_from_index in self.mres:
                 m = mre.match(stream, lex_pos)
                 if m:
                     value = m.group(0)
-                    type_ = self.name_from_index[i][m.lastindex]
-                    if type_ not in self.ignore:
-                        t = Token(type_, value, lex_pos)
+                    type_num = type_from_index[m.lastindex]
+                    if type_num not in ignore_types:
+                        t = Token(self.token_types[type_num], value, lex_pos)
                         t.line = line
                         t.column = lex_pos - col_start_pos
                         if t.type in self.callbacks:
                             t = self.callbacks[t.type](t)
                         yield t
-                    newlines = value.count(self.newline_char)
-                    if newlines:
-                        line += newlines
-                        col_start_pos = lex_pos + value.rindex(self.newline_char)
+
+                    if type_num in newline_types:
+                        newlines = value.count(self.newline_char)
+                        if newlines:
+                            line += newlines
+                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                     lex_pos += len(value)
                     break
-                i += 1
             else:
                 if lex_pos < len(stream):
                     context = stream[lex_pos:lex_pos+5]
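The comment block in _build_mres above is the core of the lexer change: on the Python versions this commit targeted, re.compile() signalled "too many groups" with a bare AssertionError once a pattern passed roughly 100 groups, so the token patterns are compiled in chunks and the chunk size is halved until compilation succeeds. Below is a minimal standalone sketch of that idea (the function name is mine, not the library's; later Python releases lifted the limit, so the fallback may never trigger there). The new test_lexer_token_limit further down exercises this path with 300 generated tokens.

import re

def build_chunked_regexps(tokens, max_size):
    # tokens: list of (name, pattern) pairs, as in the Lexer above
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % t for t in tokens[:max_size]))
        except AssertionError:                 # how old Pythons reported hitting the group limit
            return build_chunked_regexps(tokens, max_size // 2)
        mres.append(mre)
        tokens = tokens[max_size:]
    return mres

toks = [('T%d' % i, '%d' % i) for i in range(300)]
print(len(build_chunked_regexps(toks, len(toks))))   # 1 chunk on modern Python, several on older ones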
@@ -334,6 +334,13 @@ def _make_parser_test(PARSER):
             x = g.parse('a')
             self.assertEqual(x.data, "b")
 
+        def test_lexer_token_limit(self):
+            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
+            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
+            g = _Lark("""start: %s
+                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
+
+
     _NAME = "Test" + PARSER.capitalize()
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser