mirror of https://github.com/lark-parser/lark.git
Improvements to the Earley parser code
parent cd01700e81
commit 32cbf1eb19
@@ -17,8 +17,8 @@ from functools import cmp_to_key
 from ..utils import compare
 from ..common import ParseError, UnexpectedToken, Terminal
-from .grammar_analysis import GrammarAnalyzer
 from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
 
+from .grammar_analysis import GrammarAnalyzer
 
 
 class EndToken:
@@ -32,6 +32,8 @@ class Derivation(Tree):
 END_TOKEN = EndToken()
 
 class Item(object):
+    "An Earley Item, the atom of the algorithm."
+
     def __init__(self, rule, ptr, start, tree):
         self.rule = rule
         self.ptr = ptr
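The new docstring is apt: an Earley item is a grammar rule plus a "dot" position (the ptr field here) marking how much of the rule's expansion has been matched so far, together with the column where the match began. A minimal sketch of the idea, using hypothetical names rather than lark's actual classes:

class DottedRule:
    "Toy Earley item: a rule with a dot marking how far it has matched."
    def __init__(self, lhs, rhs, ptr=0):
        self.lhs, self.rhs, self.ptr = lhs, rhs, ptr

    @property
    def is_complete(self):
        return self.ptr == len(self.rhs)   # the dot has passed the whole expansion

    @property
    def expect(self):
        return self.rhs[self.ptr]          # the symbol this item needs next

    def advance(self):
        return DottedRule(self.lhs, self.rhs, self.ptr + 1)

    def __repr__(self):
        rhs = self.rhs[:self.ptr] + ['.'] + self.rhs[self.ptr:]
        return '<%s -> %s>' % (self.lhs, ' '.join(rhs))

# <sum -> sum . PLUS num>: already matched "sum", now expecting a PLUS token.
item = DottedRule('sum', ['sum', 'PLUS', 'num'], ptr=1)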
@@ -77,7 +79,7 @@ class NewsList(list):
 
 
 class Column:
-    "An entry in the table, aka Earley Chart"
+    "An entry in the table, aka Earley Chart. Contains lists of items."
     def __init__(self, i):
         self.i = i
         self.to_reduce = NewsList()
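Column keeps its items in NewsList instances. That class sits outside the hunks shown here, but from the way get_news() drives the fixpoint loop in parse() further down, it evidently returns only the items appended since the previous call. A plausible sketch, not the actual implementation:

class NewsList(list):
    "A list that remembers which of its items have already been handed out."
    def __init__(self, initial=()):
        list.__init__(self, initial)
        self.last_size = 0

    def get_news(self):
        news = self[self.last_size:]   # everything appended since the last call
        self.last_size = len(self)
        return news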
@@ -94,7 +96,6 @@ class Column:
         Makes sure only unique items are added.
         """
 
-        added = self.added
         for item in items:
 
             if item.is_complete:
@@ -112,8 +113,8 @@ class Column:
                     self.completed[item] = item
                 self.to_reduce.append(item)
             else:
-                if item not in added:
-                    added.add(item)
+                if item not in self.added:
+                    self.added.add(item)
                     if isinstance(item.expect, Terminal):
                         self.to_scan.append(item)
                     else:
@@ -125,9 +126,9 @@ class Column:
         return bool(self.item_count)
 
 class Parser:
-    def __init__(self, rules, start, callback, resolve_ambiguity=True):
-        self.analysis = GrammarAnalyzer(rules, start)
-        self.start = start
+    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=True):
+        self.analysis = GrammarAnalyzer(rules, start_symbol)
+        self.start_symbol = start_symbol
         self.resolve_ambiguity = resolve_ambiguity
 
         self.postprocess = {}
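For context, users normally reach this engine through the lark.Lark facade rather than by constructing Parser directly. A small end-to-end sketch against lark's public API (as it exists today; details may postdate this commit):

from lark import Lark

parser = Lark('''
    start: sum
    sum: sum "+" num | num
    num: /[0-9]+/
''', parser='earley')   # Earley is also lark's default parser

print(parser.parse('1+2+3').pretty())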
@@ -138,60 +139,57 @@ class Parser:
             self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
             self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
 
-    def parse(self, stream, start=None):
+    def parse(self, stream, start_symbol=None):
         # Define parser functions
-        start = start or self.start
+        start_symbol = start_symbol or self.start_symbol
 
-        def predict(nonterm, i):
+        def predict(nonterm, column):
             assert not isinstance(nonterm, Terminal), nonterm
-            return [Item(rule, 0, i, None) for rule in self.predictions[nonterm]]
+            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
 
         def complete(item):
             name = item.rule.origin
             return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
 
-        def process_column(i, token, cur_set):
-            next_set = Column(i)
-
+        def predict_and_complete(column):
             while True:
-                to_predict = {x.expect for x in cur_set.to_predict.get_news()
+                to_predict = {x.expect for x in column.to_predict.get_news()
                               if x.ptr}  # if not part of an already predicted batch
-                to_reduce = cur_set.to_reduce.get_news()
+                to_reduce = column.to_reduce.get_news()
                 if not (to_predict or to_reduce):
                     break
 
                 for nonterm in to_predict:
-                    cur_set.add( predict(nonterm, cur_set) )
+                    column.add( predict(nonterm, column) )
                 for item in to_reduce:
-                    cur_set.add( complete(item) )
+                    column.add( complete(item) )
 
-            if token is not END_TOKEN:
-                to_scan = cur_set.to_scan.get_news()
-                for item in to_scan:
-                    if item.expect.match(token):
-                        next_set.add([item.advance(token)])
-
-            if not next_set and token is not END_TOKEN:
-                expect = {i.expect for i in cur_set.to_scan}
+        def scan(i, token, column):
+            to_scan = column.to_scan.get_news()
+
+            next_set = Column(i)
+            next_set.add(item.advance(token) for item in to_scan if item.expect.match(token))
+
+            if not next_set:
+                expect = {i.expect for i in column.to_scan}
                 raise UnexpectedToken(token, expect, stream, i)
 
-            return cur_set, next_set
+            return next_set
 
         # Main loop starts
         column0 = Column(0)
-        column0.add(predict(start, column0))
+        column0.add(predict(start_symbol, column0))
 
-        cur_set = column0
-        i = 0
-        for token in stream:
-            _, cur_set = process_column(i, token, cur_set)
-            i += 1
-
-        last_set, _ = process_column(i, END_TOKEN, cur_set)
+        column = column0
+        for i, token in enumerate(stream):
+            predict_and_complete(column)
+            column = scan(i, token, column)
+
+        predict_and_complete(column)
 
         # Parse ended. Now build a parse tree
-        solutions = [n.tree for n in last_set.to_reduce
-                     if n.rule.origin==start and n.start is column0]
+        solutions = [n.tree for n in column.to_reduce
+                     if n.rule.origin==start_symbol and n.start is column0]
 
         if not solutions:
             raise ParseError('Incomplete parse: Could not find a solution to input')
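The net effect of this hunk is to split the old process_column() into the two phases of the classic algorithm: a predict/complete fixpoint over the current column, then a scan that seeds the next column from the current token. The same shape in a self-contained, textbook-style recognizer (illustrative only: tuples instead of Item objects, plain string comparison instead of Terminal.match):

def earley_recognize(grammar, start, tokens):
    "grammar maps each nonterminal to a list of right-hand-side tuples."
    columns = [set()]
    for rhs in grammar[start]:
        columns[0].add((start, rhs, 0, 0))       # item: (lhs, rhs, dot, origin)

    for i, tok in enumerate(list(tokens) + [None]):   # None marks end of input
        column = columns[i]
        while True:                              # predict/complete to a fixpoint
            new = set()
            for lhs, rhs, dot, origin in column:
                if dot < len(rhs) and rhs[dot] in grammar:         # predict
                    for prod in grammar[rhs[dot]]:
                        new.add((rhs[dot], prod, 0, i))
                elif dot == len(rhs):                              # complete
                    for l2, r2, d2, o2 in columns[origin]:
                        if d2 < len(r2) and r2[d2] == lhs:
                            new.add((l2, r2, d2 + 1, o2))
            if new <= column:
                break
            column |= new
        if tok is None:
            break
        # scan: advance every item whose expected terminal matches the token
        columns.append({(lhs, rhs, dot + 1, origin)
                        for lhs, rhs, dot, origin in column
                        if dot < len(rhs) and rhs[dot] == tok})

    return any(lhs == start and dot == len(rhs) and origin == 0
               for lhs, rhs, dot, origin in columns[-1])

grammar = {'sum': [('sum', '+', 'num'), ('num',)], 'num': [('1',), ('2',)]}
print(earley_recognize(grammar, 'sum', ['1', '+', '2']))   # True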