mirror of https://github.com/explosion/spaCy.git
Filter bad retokenizations
This commit is contained in:
parent
42bc3ad73b
commit
4c8730526b
|
@@ -231,9 +231,14 @@ def write_conllu(docs, file_):
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start : end + 1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
|
seen_tokens = set()
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
for span in spans:
|
for span in spans:
|
||||||
|
span_tokens = set(range(span.start, span.end))
|
||||||
|
if not span_tokens.intersection(seen_tokens):
|
||||||
retokenizer.merge(span)
|
retokenizer.merge(span)
|
||||||
|
seen_tokens.update(span_tokens)
|
||||||
|
|
||||||
file_.write("# newdoc id = {i}\n".format(i=i))
|
file_.write("# newdoc id = {i}\n".format(i=i))
|
||||||
for j, sent in enumerate(doc.sents):
|
for j, sent in enumerate(doc.sents):
|
||||||
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
||||||
|
|
Loading…
Reference in New Issue