From 4c8730526bd3d538db350b2f913c2510bf583b10 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 9 Mar 2019 00:41:34 +0000
Subject: [PATCH] Filter bad retokenizations

---
 spacy/cli/ud/ud_train.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py
index d94d05755..fb2003d31 100644
--- a/spacy/cli/ud/ud_train.py
+++ b/spacy/cli/ud/ud_train.py
@@ -231,9 +231,14 @@ def write_conllu(docs, file_):
     for i, doc in enumerate(docs):
         matches = merger(doc)
         spans = [doc[start : end + 1] for _, start, end in matches]
+        seen_tokens = set()
         with doc.retokenize() as retokenizer:
             for span in spans:
-                retokenizer.merge(span)
+                span_tokens = set(range(span.start, span.end))
+                if not span_tokens.intersection(seen_tokens):
+                    retokenizer.merge(span)
+                    seen_tokens.update(span_tokens)
+
         file_.write("# newdoc id = {i}\n".format(i=i))
         for j, sent in enumerate(doc.sents):
             file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))