Rewrite example to use Retokenizer (resolves #3681)

Also add helper to filter spans
This commit is contained in:
Ines Montani 2019-05-06 14:51:18 +02:00
parent 955b95cb8b
commit f2a56c1b56
1 changed file with 20 additions and 4 deletions

View File

@ -36,11 +36,27 @@ def main(model="en_core_web_sm"):
print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text)) print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))
def filter_spans(spans, prefer_longest=True):
    """Filter a sequence of spans so that no two kept spans share a token.

    Spans are considered in priority order and a span is kept only if none
    of its tokens belong to a span that was already kept.

    spans: iterable of objects exposing integer ``start`` and ``end``
        attributes, where ``end`` is exclusive.
    prefer_longest: if True (default), longer spans take priority over
        shorter overlapping ones; if False, shorter spans take priority.
    RETURNS: list of the non-overlapping spans, ordered by ``start``.
    """
    # Rank by (length, start); reverse=True puts the longest spans first.
    prioritized = sorted(
        spans,
        key=lambda span: (span.end - span.start, span.start),
        reverse=prefer_longest,
    )
    result = []
    seen_tokens = set()
    # Iterate the *prioritized* spans, not the raw input, so that
    # prefer_longest actually decides which overlapping span wins.
    for span in prioritized:
        token_range = range(span.start, span.end)
        # Check every token, not only the boundaries: when short spans are
        # preferred, a longer span can enclose a kept span while both of
        # its own end points are still unseen.
        if seen_tokens.isdisjoint(token_range):
            result.append(span)
            seen_tokens.update(token_range)
    # Return in document order, independent of the priority order above.
    return sorted(result, key=lambda span: span.start)
def extract_currency_relations(doc):
# Merge entities and noun chunks into one token
seen_tokens = set()
spans = list(doc.ents) + list(doc.noun_chunks)
spans = filter_spans(spans)
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
relations = [] relations = []
for money in filter(lambda w: w.ent_type_ == "MONEY", doc): for money in filter(lambda w: w.ent_type_ == "MONEY", doc):