diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index eb9f8af78..d9600048c 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -324,18 +324,19 @@ def read_training(nlp, training_dir, dev, limit): if 5 < sent_length < 100: ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent else: - skip_articles.add(current_article_id) + skip_articles.add(article_id) current_doc = None except Exception as e: print("Problem parsing article", article_id, e) - skip_articles.add(current_article_id) + skip_articles.add(article_id) + raise e # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): found_ent = ents_by_offset.get(start + "_" + end, None) if found_ent: if found_ent.text != alias: - skip_articles.add(current_article_id) + skip_articles.add(article_id) current_doc = None else: sent = found_ent.sent.as_doc() diff --git a/spacy/errors.py b/spacy/errors.py index fcc3132c6..5684721ae 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -399,6 +399,9 @@ class Errors(object): E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input includes either the " "`text` or `tokens` key. For more info, see the docs:\n" "https://spacy.io/api/cli#pretrain-jsonl") + E139 = ("Knowledge base for component '{name}' not initialized. Did you forget to call set_kb()?") + E140 = ("The list of entities, prior probabilities and entity vectors should be of equal length.") + E141 = ("Entity vectors should be of length {required} instead of the provided {found}.") @add_codes diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 72f66b107..4d9d2b89b 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -106,9 +106,9 @@ cdef class KnowledgeBase: user_warning(Warnings.W018.format(entity=entity)) return + # Raise an error if the provided entity vector is not of the correct length if len(entity_vector) != self.entity_vector_length: - # TODO: proper error - raise ValueError("Entity vector length should have been", self.entity_vector_length) + raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) vector_index = self.c_add_vector(entity_vector=entity_vector) @@ -121,13 +121,8 @@ cdef class KnowledgeBase: return entity_hash cpdef set_entities(self, entity_list, prob_list, vector_list): - if len(entity_list) != len(prob_list): - # TODO: proper error - raise ValueError("Entity list and prob list should have the same length") - - if len(entity_list) != len(vector_list): - # TODO: proper error - raise ValueError("Entity list and vector list should have the same length") + if len(entity_list) != len(prob_list) or len(entity_list) != len(vector_list): + raise ValueError(Errors.E140) nr_entities = len(entity_list) self._entry_index = PreshMap(nr_entities+1) @@ -138,8 +133,7 @@ cdef class KnowledgeBase: while i < nr_entities: entity_vector = vector_list[i] if len(entity_vector) != self.entity_vector_length: - # TODO: proper error - raise ValueError("Entity vector is", len(entity_vector), "length but should have been", self.entity_vector_length) + raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) entity_hash = self.vocab.strings.add(entity_list[i]) entry.entity_hash = entity_hash diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a191a7906..2f7856fe0 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1002,7 +1002,7 @@ cdef class DependencyParser(Parser): @property def postprocesses(self): - return [nonproj.deprojectivize, merge_subtokens] + return [nonproj.deprojectivize] # , merge_subtokens] def add_multitask_objective(self, target): if target == "cloze": @@ -1100,8 +1100,7 @@ class EntityLinker(Pipe): def require_kb(self): # Raise an error if the knowledge base is not initialized. if getattr(self, "kb", None) in (None, True, False): - # TODO: custom error - raise ValueError(Errors.E109.format(name=self.name)) + raise ValueError(Errors.E139.format(name=self.name)) def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): self.require_kb()