Rewrite example to use Retokenizer (resolves #3681)

Also add helper to filter spans
2019-05-06 14:51:18 +02:00 · 2019-05-06 14:51:18 +02:00 · f2a56c1b56
parent 955b95cb8b
commit f2a56c1b56
1 changed files with 20 additions and 4 deletions
--- a/examples/information_extraction/entity_relations.py
+++ b/examples/information_extraction/entity_relations.py
@@ -36,11 +36,27 @@ def main(model="en_core_web_sm"):
            print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))
-def extract_currency_relations(doc):
+def filter_spans(spans, prefer_longest=True):
-    # merge entities and noun chunks into one token
+    """Filter a sequence of spans so they don't contain overlaps.

    Spans are considered in order of length (``span.end - span.start``),
    with ties ordered by ``span.start``; a span is kept only if none of
    the token positions it covers belongs to a span that was already kept.

    spans: Iterable of objects exposing integer ``start`` and ``end``
        attributes, where ``end`` is exclusive.
    prefer_longest: If True (default), longer spans are considered first
        and win over the shorter spans they overlap. If False, shorter
        spans are considered first.
    RETURNS (list): The non-overlapping spans, in the order in which they
        were selected.
    """
-    spans = list(doc.ents) + list(doc.noun_chunks)
+    get_sort_key = lambda span: (span.end - span.start, span.start)
    sorted_spans = sorted(spans, key=get_sort_key, reverse=prefer_longest)
    result = []
    seen_tokens = set()
    # Iterate in priority order (not input order) so prefer_longest is
    # actually honoured.
    for span in sorted_spans:
-        span.merge()
        # Check every covered token, not only the two boundary tokens:
        # when shorter spans are selected first, a longer span can enclose
        # an accepted span while both of its own end points are still free.
+        if seen_tokens.isdisjoint(range(span.start, span.end)):
            result.append(span)
            seen_tokens.update(range(span.start, span.end))
    return result
 def extract_currency_relations(doc):
    # Merge entities and noun chunks into one token
    seen_tokens = set()
    spans = list(doc.ents) + list(doc.noun_chunks)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    relations = []
    for money in filter(lambda w: w.ent_type_ == "MONEY", doc):