From 39aabf50ab23f4cadef5d5b459436a988f9fe677 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 9 Oct 2020 11:54:48 +0200
Subject: [PATCH 1/8] Also rename to include_static_vectors in CharEmbed

---
 spacy/ml/models/tok2vec.py           | 6 +++---
 spacy/pipeline/morphologizer.pyx     | 2 +-
 spacy/tests/pipeline/test_tok2vec.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 23cfe883b..6ef7b2325 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -177,7 +177,7 @@ def CharacterEmbed(
     rows: int,
     nM: int,
     nC: int,
-    also_use_static_vectors: bool,
+    include_static_vectors: bool,
     feature: Union[int, str] = "LOWER",
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedded representation based on character embeddings, using
@@ -204,13 +204,13 @@ def CharacterEmbed(
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
         are between 3 and 8, although it may depend on the length of words in
         the language.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
     feature = intify_attr(feature)
     if feature is None:
         raise ValueError(Errors.E911(feat=feature))
-    if also_use_static_vectors:
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index a456b7a0f..00188a762 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -32,7 +32,7 @@ width = 128
 rows = 7000
 nM = 64
 nC = 8
-also_use_static_vectors = false
+include_static_vectors = false
 
 [model.tok2vec.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 90882ae3f..ec4ed17dd 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     [
         (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
         (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],
 )
 # fmt: on
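Note: the rename above is purely an API rename; behavior is unchanged. As a sketch (not part of the patch), the layer can also be built directly with the new keyword, using the defaults from the morphologizer config; the `width` argument is assumed from the signature context in the tok2vec.py hunk:

```python
# Sketch: building the renamed layer directly. Values mirror the
# morphologizer.pyx defaults shown above; `width` is assumed to be the
# first parameter of CharacterEmbed per the signature in tok2vec.py.
from spacy.ml.models.tok2vec import CharacterEmbed

embed = CharacterEmbed(
    width=128,
    rows=7000,
    nM=64,  # dimensions per character embedding
    nC=8,   # UTF-8 bytes embedded per word
    include_static_vectors=False,  # renamed from also_use_static_vectors
)
```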
From 18dfb279850adb00c3b3efa18bbb6d58c17bc453 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 9 Oct 2020 12:05:33 +0200
Subject: [PATCH 2/8] Add custom error when evaluation throws a KeyError

---
 spacy/errors.py        | 3 +++
 spacy/training/loop.py | 5 ++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 2bc2f3e20..06653edcf 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,6 +456,9 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified "
+            "frozen components, make sure they were already trained and initialized. "
+            "You can also consider moving them to the 'disabled' list instead.")
     E901 = ("Failed to remove existing output directory: {path}. If your "
             "config and the components you train change between runs, a "
             "non-empty output directory can lead to stale pipeline data. To "
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 242113cc6..8e688a27d 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -249,7 +249,10 @@ def create_evaluation_callback(
 
     def evaluate() -> Tuple[float, Dict[str, float]]:
         dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
+        try:
+            scores = nlp.evaluate(dev_examples)
+        except KeyError as e:
+            raise KeyError(Errors.E900) from e
         # Calculate a weighted sum based on score_weights for the main score.
         # We can only consider scores that are ints/floats, not dicts like
         # entity scores per type etc.
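Note: the `from e` in the new except clause chains the original KeyError as `__cause__`, so the low-level traceback stays visible underneath the friendlier E900 message. The same idiom in isolation, as a sketch (the wrapper function is hypothetical, mirroring the change in `create_evaluation_callback`):

```python
# Sketch: re-raise a low-level KeyError with the new E900 hint while
# keeping the original exception chained for debugging.
from spacy.errors import Errors

def evaluate_with_hint(nlp, dev_examples):
    # Hypothetical helper; the real patch does this inside the nested
    # evaluate() callback in spacy/training/loop.py.
    try:
        return nlp.evaluate(dev_examples)
    except KeyError as e:
        raise KeyError(Errors.E900) from e
```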
From 8316bc7d4a6dbd989d53f97a8c7a06758c8d356c Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 9 Oct 2020 12:06:20 +0200
Subject: [PATCH 3/8] bugfix DisabledPipes

---
 spacy/language.py                         |  3 +++
 spacy/tests/pipeline/test_pipe_methods.py | 14 ++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index 1fb559657..24e593043 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1034,6 +1034,9 @@ class Language:
                 )
             )
         disable = to_disable
+        # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
+        # those pipes that were already disabled.
+        disable = [d for d in disable if d not in self._disabled]
         return DisabledPipes(self, disable)
 
     def make_doc(self, text: str) -> Doc:
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index c693a7487..cd18b0159 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -129,6 +129,7 @@ def test_enable_pipes_method(nlp, name):
 
 @pytest.mark.parametrize("name", ["my_component"])
 def test_disable_pipes_context(nlp, name):
+    """Test that an enabled component stays enabled after running the context manager."""
     nlp.add_pipe("new_pipe", name=name)
     assert nlp.has_pipe(name)
     with nlp.select_pipes(disable=name):
@@ -136,6 +137,19 @@ def test_disable_pipes_context(nlp, name):
     assert nlp.has_pipe(name)
 
 
+@pytest.mark.parametrize("name", ["my_component"])
+def test_disable_pipes_context_restore(nlp, name):
+    """Test that a disabled component stays disabled after running the context manager."""
+    nlp.add_pipe("new_pipe", name=name)
+    assert nlp.has_pipe(name)
+    nlp.disable_pipes(name)
+    assert not nlp.has_pipe(name)
+    with nlp.select_pipes(disable=name):
+        assert not nlp.has_pipe(name)
+    assert not nlp.has_pipe(name)
+
+
+
 def test_select_pipes_list_arg(nlp):
     for name in ["c1", "c2", "c3"]:
         nlp.add_pipe("new_pipe", name=name)
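Note: a standalone reproduction of the bug fixed above, using the built-in `sentencizer` factory in place of the test fixture's `new_pipe` (a sketch; `disable_pipes` is the deprecated alias the new test also calls):

```python
# Sketch: before the fix, exiting select_pipes re-enabled pipes that
# were already disabled when the context manager was entered.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
nlp.disable_pipes("sentencizer")  # disabled up front
assert not nlp.has_pipe("sentencizer")
with nlp.select_pipes(disable="sentencizer"):
    assert not nlp.has_pipe("sentencizer")
# The pipe now stays disabled, because it is filtered out of the
# restore list; previously it came back enabled here.
assert not nlp.has_pipe("sentencizer")
```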
To " diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index cd18b0159..b744aed98 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -149,7 +149,6 @@ def test_disable_pipes_context_restore(nlp, name): assert not nlp.has_pipe(name) - def test_select_pipes_list_arg(nlp): for name in ["c1", "c2", "c3"]: nlp.add_pipe("new_pipe", name=name) From 2dd79454af73cb07d07ac1b9ad12644736e96bd5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 14:42:07 +0200 Subject: [PATCH 6/8] Update docs --- website/docs/usage/embeddings-transformers.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 549c3bcc4..942fc4e7b 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -514,7 +514,7 @@ Many neural network models are able to use word vector tables as additional features, which sometimes results in significant improvements in accuracy. spaCy's built-in embedding layer, [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use -word vector tables using the `also_use_static_vectors` flag. This setting is +word vector tables using the `include_static_vectors` flag. This setting is also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) layer, which builds the default token-to-vector encoding architecture. @@ -522,9 +522,9 @@ layer, which builds the default token-to-vector encoding architecture. [tagger.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" width = 128 -rows = 7000 -also_embed_subwords = true -also_use_static_vectors = true +attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = true ``` From 727370c633b37457ddbedc80aecf07e1dc2c967d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 14:42:51 +0200 Subject: [PATCH 7/8] Remove Span._recalculate_indices Remove `Span._recalculate_indices`, which is a remnant from the deprecated `Span.merge`. 
From 727370c633b37457ddbedc80aecf07e1dc2c967d Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 9 Oct 2020 14:42:51 +0200
Subject: [PATCH 7/8] Remove Span._recalculate_indices

Remove `Span._recalculate_indices`, which is a remnant from the
deprecated `Span.merge`.
---
 spacy/tests/doc/test_doc_api.py          |  9 +++------
 spacy/tests/doc/test_retokenize_merge.py |  1 +
 spacy/tokens/span.pxd                    |  1 -
 spacy/tokens/span.pyx                    | 17 -----------------
 4 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index ea832c136..db8a6d1c4 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -608,14 +608,11 @@ def test_doc_init_iob():
         doc = Doc(Vocab(), words=words, ents=ents)
 
 
-@pytest.mark.xfail
-def test_doc_set_ents_spans(en_tokenizer):
+def test_doc_set_ents_invalid_spans(en_tokenizer):
     doc = en_tokenizer("Some text about Colombia and the Czech Republic")
     spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
-    # If this line is uncommented, it works:
-    # print(spans)
-    doc.ents = spans
-    assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"]
+    with pytest.raises(IndexError):
+        doc.ents = spans
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index cb886545a..b483255c8 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -336,6 +336,7 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
         attrs = {"lemma": "none", "ent_type": "none"}
         retokenizer.merge(doc[0:2], attrs=attrs)
         retokenizer.merge(doc[-2:], attrs=attrs)
+    sent1, sent2 = list(doc.sents)
     assert len(sent1) == init_len - 1
     assert len(sent2) == init_len2 - 1
 
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index f6f88a23e..cc6b908bb 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -16,5 +16,4 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm
 
-    cpdef int _recalculate_indices(self) except -1
     cpdef np.ndarray to_array(self, object features)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 64c3c7df0..491ba0266 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -150,7 +150,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#len
         """
-        self._recalculate_indices()
         if self.end < self.start:
             return 0
         return self.end - self.start
@@ -167,7 +166,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#getitem
         """
-        self._recalculate_indices()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
@@ -188,7 +186,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#iter
         """
-        self._recalculate_indices()
         for i in range(self.start, self.end):
             yield self.doc[i]
 
@@ -339,19 +336,6 @@ cdef class Span:
                     output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
         return output
 
-    cpdef int _recalculate_indices(self) except -1:
-        if self.end > self.doc.length \
-                or self.doc.c[self.start].idx != self.start_char \
-                or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
-            start = token_by_start(self.doc.c, self.doc.length, self.start_char)
-            if self.start == -1:
-                raise IndexError(Errors.E036.format(start=self.start_char))
-            end = token_by_end(self.doc.c, self.doc.length, self.end_char)
-            if end == -1:
-                raise IndexError(Errors.E037.format(end=self.end_char))
-            self.start = start
-            self.end = end + 1
-
     @property
     def vocab(self):
         """RETURNS (Vocab): The Span's Doc's vocab."""
@@ -520,7 +504,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#root
         """
-        self._recalculate_indices()
         if "root" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["root"](self)
         # This should probably be called 'head', and the other one called
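Note: the rewritten test pins down the new contract: a `Span` created before retokenization keeps its stale token indices rather than being silently re-aligned, so assigning it to `doc.ents` fails loudly. A standalone version of that test, as a sketch (assuming a blank English pipeline in place of the `en_tokenizer` fixture):

```python
# Sketch: stale spans now raise instead of being re-aligned on access.
import pytest
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Some text about Colombia and the Czech Republic")
spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
with doc.retokenize() as retokenizer:
    for span in spans:
        retokenizer.merge(span)  # "Czech Republic" becomes one token
with pytest.raises(IndexError):
    doc.ents = spans  # indices refer to the pre-merge tokenization
```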
From 97ff090e495208a5944561e210c76ef77e93eab3 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 9 Oct 2020 16:03:57 +0200
Subject: [PATCH 8/8] Fix docs example [ci skip]

---
 website/docs/usage/processing-pipelines.md | 54 +++++++++-------------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index fdae6d3e5..83134962b 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1403,9 +1403,9 @@ especially useful it you want to pass in a string instead of calling
 
 This example shows the implementation of a pipeline component that fetches
 country meta data via the [REST Countries API](https://restcountries.eu), sets
-entity annotations for countries, merges entities into one token and sets custom
-attributes on the `Doc`, `Span` and `Token` – for example, the capital,
-latitude/longitude coordinates and even the country flag.
+entity annotations for countries and sets custom attributes on the `Doc` and
+`Span` – for example, the capital, latitude/longitude coordinates and even the
+country flag.
 
 ```python
 ### {executable="true"}
@@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
         # Set up the PhraseMatcher with Doc patterns for each country name
         self.matcher = PhraseMatcher(nlp.vocab)
         self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
-        # Register attribute on the Token. We'll be overwriting this based on
+        # Register attributes on the Span. We'll be overwriting this based on
         # the matches, so we're only setting a default value, not a getter.
-        Token.set_extension("is_country", default=False)
-        Token.set_extension("country_capital", default=False)
-        Token.set_extension("country_latlng", default=False)
-        Token.set_extension("country_flag", default=False)
-        # Register attributes on Doc and Span via a getter that checks if one of
-        # the contained tokens is set to is_country == True.
+        Span.set_extension("is_country", default=None)
+        Span.set_extension("country_capital", default=None)
+        Span.set_extension("country_latlng", default=None)
+        Span.set_extension("country_flag", default=None)
+        # Register attribute on Doc via a getter that checks if the Doc
+        # contains a country entity
         Doc.set_extension("has_country", getter=self.has_country)
-        Span.set_extension("has_country", getter=self.has_country)
 
     def __call__(self, doc):
         spans = []  # keep the spans for later so we can merge them afterwards
         for _, start, end in self.matcher(doc):
             # Generate Span representing the entity & set label
             entity = Span(doc, start, end, label=self.label)
+            # Set custom attributes on entity. Can be extended with other data
+            # returned by the API, like currencies, country code, calling code etc.
+            entity._.set("is_country", True)
+            entity._.set("country_capital", self.countries[entity.text]["capital"])
+            entity._.set("country_latlng", self.countries[entity.text]["latlng"])
+            entity._.set("country_flag", self.countries[entity.text]["flag"])
             spans.append(entity)
-            # Set custom attribute on each token of the entity
-            # Can be extended with other data returned by the API, like
-            # currencies, country code, flag, calling code etc.
-            for token in entity:
-                token._.set("is_country", True)
-                token._.set("country_capital", self.countries[entity.text]["capital"])
-                token._.set("country_latlng", self.countries[entity.text]["latlng"])
-                token._.set("country_flag", self.countries[entity.text]["flag"])
-        # Iterate over all spans and merge them into one token
-        with doc.retokenize() as retokenizer:
-            for span in spans:
-                retokenizer.merge(span)
         # Overwrite doc.ents and add entity – be careful not to replace!
         doc.ents = list(doc.ents) + spans
         return doc  # don't forget to return the Doc!
 
-    def has_country(self, tokens):
-        """Getter for Doc and Span attributes. Since the getter is only called
-        when we access the attribute, we can refer to the Token's 'is_country'
+    def has_country(self, doc):
+        """Getter for Doc attributes. Since the getter is only called
+        when we access the attribute, we can refer to the Span's 'is_country'
         attribute here, which is already set in the processing step."""
-        return any([t._.get("is_country") for t in tokens])
+        return any([entity._.get("is_country") for entity in doc.ents])
 
 nlp = English()
 nlp.add_pipe("rest_countries", config={"label": "GPE"})
 doc = nlp("Some text about Colombia and the Czech Republic")
 print("Pipeline", nlp.pipe_names)  # pipeline contains component name
 print("Doc has countries", doc._.has_country)  # Doc contains countries
-for token in doc:
-    if token._.is_country:
-        print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag)
-print("Entities", [(e.text, e.label_) for e in doc.ents])
+for ent in doc.ents:
+    if ent._.is_country:
+        print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag)
 ```
 
 In this case, all data can be fetched on initialization in one request. However,
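Note: the fix reduces the docs example to one core pattern: default-valued extensions on `Span` for per-entity data, plus a `Doc`-level getter derived from `doc.ents`. That pattern in isolation, as a sketch with a manually created entity standing in for the `PhraseMatcher` step:

```python
# Sketch: Span extensions with a derived Doc-level getter.
from spacy.lang.en import English
from spacy.tokens import Doc, Span

Span.set_extension("is_country", default=None)
Doc.set_extension(
    "has_country",
    getter=lambda doc: any(ent._.get("is_country") for ent in doc.ents),
)

nlp = English()
doc = nlp("Some text about Colombia")
entity = Span(doc, 3, 4, label="GPE")  # manual stand-in for a match
entity._.set("is_country", True)
doc.ents = list(doc.ents) + [entity]
print(doc._.has_country)  # True
```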