Provide more info in cycle error message E069 (#4123)

Provide the tokens in the cycle and the first 50 tokens from the document
in the error message so it's easier to track down the location of the
cycle in the data.

Addresses feature request in #3698.
This commit is contained in:
adrianeboyd 2019-08-15 18:08:28 +02:00 committed by Matthew Honnibal
parent 2f3648700c
commit 2f9b28c218
2 changed files with 3 additions and 2 deletions

View File

@ -243,7 +243,8 @@ class Errors(object):
"Tag sequence:\n{tags}") "Tag sequence:\n{tags}")
E068 = ("Invalid BILUO tag: '{tag}'.") E068 = ("Invalid BILUO tag: '{tag}'.")
E069 = ("Invalid gold-standard parse tree. Found cycle between word " E069 = ("Invalid gold-standard parse tree. Found cycle between word "
"IDs: {cycle}") "IDs: {cycle} (tokens: {cycle_tokens}) in the document starting "
"with tokens: {doc_tokens}.")
E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) " E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
"does not align with number of annotations ({n_annots}).") "does not align with number of annotations ({n_annots}).")
E071 = ("Error creating lexeme: specified orth ID ({orth}) does not " E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "

View File

@ -590,7 +590,7 @@ cdef class GoldParse:
cycle = nonproj.contains_cycle(self.heads) cycle = nonproj.contains_cycle(self.heads)
if cycle is not None: if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle)) raise ValueError(Errors.E069.format(cycle=cycle, cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), doc_tokens=" ".join(words[:50])))
def __len__(self): def __len__(self):
"""Get the number of gold-standard tokens. """Get the number of gold-standard tokens.