Filter bad retokenizations

Matthew Honnibal 2019-03-09 00:41:34 +00:00
parent 42bc3ad73b
commit 4c8730526b
1 changed file with 6 additions and 1 deletion

@@ -231,9 +231,14 @@ def write_conllu(docs, file_):
     for i, doc in enumerate(docs):
         matches = merger(doc)
         spans = [doc[start : end + 1] for _, start, end in matches]
+        seen_tokens = set()
         with doc.retokenize() as retokenizer:
             for span in spans:
-                retokenizer.merge(span)
+                span_tokens = set(range(span.start, span.end))
+                if not span_tokens.intersection(seen_tokens):
+                    retokenizer.merge(span)
+                    seen_tokens.update(span_tokens)
         file_.write("# newdoc id = {i}\n".format(i=i))
         for j, sent in enumerate(doc.sents):
             file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))