mirror of https://github.com/explosion/spaCy.git
Rewrite example to use Retokenizer (resolves #3681)
Also add helper to filter spans
parent 955b95cb8b
commit f2a56c1b56
@@ -36,11 +36,27 @@ def main(model="en_core_web_sm"):
         print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))
 
 
-def extract_currency_relations(doc):
-    # merge entities and noun chunks into one token
-    spans = list(doc.ents) + list(doc.noun_chunks)
-    for span in spans:
-        span.merge()
+def filter_spans(spans, prefer_longest=True):
+    # Filter a sequence of spans so they don't contain overlaps
+    get_sort_key = lambda span: (span.end - span.start, span.start)
+    sorted_spans = sorted(spans, key=get_sort_key, reverse=prefer_longest)
+    result = []
+    seen_tokens = set()
+    for span in sorted_spans:
+        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
+            result.append(span)
+            seen_tokens.update(range(span.start, span.end))
+    return result
+
+
+def extract_currency_relations(doc):
+    # Merge entities and noun chunks into one token
+    seen_tokens = set()
+    spans = list(doc.ents) + list(doc.noun_chunks)
+    spans = filter_spans(spans)
+    with doc.retokenize() as retokenizer:
+        for span in spans:
+            retokenizer.merge(span)
 
     relations = []
     for money in filter(lambda w: w.ent_type_ == "MONEY", doc):
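For context, a minimal sketch of the pattern this commit introduces, with the filter_spans helper copied from the diff above so the snippet runs on its own. The pipeline name en_core_web_sm and the sample sentence are illustrative assumptions, not part of the diff:

import spacy


def filter_spans(spans, prefer_longest=True):
    # The helper added in this commit, copied from the diff above.
    get_sort_key = lambda span: (span.end - span.start, span.start)
    sorted_spans = sorted(spans, key=get_sort_key, reverse=prefer_longest)
    result = []
    seen_tokens = set()
    for span in sorted_spans:
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            seen_tokens.update(range(span.start, span.end))
    return result


nlp = spacy.load("en_core_web_sm")
doc = nlp("Net income was $9.4 million compared to the prior year of $2.7 million.")

# doc.ents and doc.noun_chunks can cover the same tokens, and the
# retokenizer rejects overlapping spans, so filter the spans first.
spans = filter_spans(list(doc.ents) + list(doc.noun_chunks))

# Merges are queued inside the context manager and applied in one pass
# when the block exits, replacing the deprecated span.merge() calls.
with doc.retokenize() as retokenizer:
    for span in spans:
        retokenizer.merge(span)

print([(token.text, token.ent_type_) for token in doc])

Filtering before merging matters because retokenizer.merge raises on overlapping spans; sorting longest-first means a full entity wins over a shorter noun chunk covering the same tokens.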