# Patch summary (free text before the "diff --git" header; not part of the patch).
#
# Hunk 1 (inside `cdef class Vocab`): the bare
#   `assert lex.orth == self.strings[string]`
# is replaced by an explicit comparison that raises
#   `LookupError.mismatched_strings(lex.orth, self.strings[lex.orth], string)`,
# so a cached lexeme whose orth ID disagrees with the string table is reported
# with a descriptive message instead of a message-less AssertionError.
#
# Hunk 2: adds `class LookupError(Exception)` with the classmethod factory
# `mismatched_strings(cls, id_, id_string, original_string)`. The factory
# returns (does not raise) an exception instance whose message lists the query
# string, the string stored for the cached orth ID, and the orth ID itself.
#
# NOTE(review): the new class is named `LookupError`, which shadows the
# built-in `LookupError` inside this module -- confirm nothing else in
# vocab.pyx expects the built-in under that name.
# NOTE(review): line breaks and leading indentation were reconstructed from a
# whitespace-collapsed copy of this patch; verify context-line indentation
# against spacy/vocab.pyx before applying.
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 0e35cdd6d..2cc9094eb 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -96,7 +96,9 @@ cdef class Vocab:
         lex = self._by_hash.get(key)
         cdef size_t addr
         if lex != NULL:
-            assert lex.orth == self.strings[string]
+            if lex.orth != self.strings[string]:
+                raise LookupError.mismatched_strings(
+                    lex.orth, self.strings[lex.orth], string)
             return lex
         else:
             return self._new_lexeme(mem, string)
@@ -352,6 +354,21 @@ def write_binary_vectors(in_loc, out_loc):
             out_file.write_from(vec, vec_len, sizeof(float))
 
 
+class LookupError(Exception):
+    @classmethod
+    def mismatched_strings(cls, id_, id_string, original_string):
+        return cls(
+            "Error fetching a Lexeme from the Vocab. When looking up a string, "
+            "the lexeme returned had an orth ID that did not match the query string. "
+            "This means that the cached lexeme structs are mismatched to the "
+            "string encoding table. The mismatched:\n"
+            "Query string: {query}\n"
+            "Orth cached: {orth_str}\n"
+            "ID of orth: {orth_id}".format(
+                query=original_string, orth_str=id_string, orth_id=id_)
+        )
+
+
 class VectorReadError(Exception):
     @classmethod
     def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):