From 39aabf50ab23f4cadef5d5b459436a988f9fe677 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 9 Oct 2020 11:54:48 +0200
Subject: [PATCH 1/8] Also rename to include_static_vectors in CharEmbed

---
 spacy/ml/models/tok2vec.py           | 6 +++---
 spacy/pipeline/morphologizer.pyx     | 2 +-
 spacy/tests/pipeline/test_tok2vec.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 23cfe883b..6ef7b2325 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -177,7 +177,7 @@ def CharacterEmbed(
     rows: int,
     nM: int,
     nC: int,
-    also_use_static_vectors: bool,
+    include_static_vectors: bool,
     feature: Union[int, str] = "LOWER",
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedded representation based on character embeddings, using
@@ -204,13 +204,13 @@ def CharacterEmbed(
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
         are between 3 and 8, although it may depend on the length of words in
         the language.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
     feature = intify_attr(feature)
     if feature is None:
         raise ValueError(Errors.E911(feat=feature))
-    if also_use_static_vectors:
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index a456b7a0f..00188a762 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -32,7 +32,7 @@ width = 128
 rows = 7000
 nM = 64
 nC = 8
-also_use_static_vectors = false
+include_static_vectors = false
 
 [model.tok2vec.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 90882ae3f..ec4ed17dd 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     [
         (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
         (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],
 )
 # fmt: on
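Note: the rename above is purely an API rename; behavior is unchanged. As a sketch (not part of the patch), the layer can also be built directly with the new keyword, using the defaults from the morphologizer config; the `width` argument is assumed from the signature context in the tok2vec.py hunk:

```python
# Sketch: building the renamed layer directly. Values mirror the
# morphologizer.pyx defaults shown above; `width` is assumed to be the
# first parameter of CharacterEmbed per the signature in tok2vec.py.
from spacy.ml.models.tok2vec import CharacterEmbed

embed = CharacterEmbed(
    width=128,
    rows=7000,
    nM=64,  # dimensions per character embedding
    nC=8,   # UTF-8 bytes embedded per word
    include_static_vectors=False,  # renamed from also_use_static_vectors
)
```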
From 18dfb279850adb00c3b3efa18bbb6d58c17bc453 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 9 Oct 2020 12:05:33 +0200
Subject: [PATCH 2/8] Add custom error when evaluation throws a KeyError

---
 spacy/errors.py        | 3 +++
 spacy/training/loop.py | 5 ++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 2bc2f3e20..06653edcf 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,6 +456,9 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified "
+            "frozen components, make sure they were already trained and initialized. "
+            "You can also consider moving them to the 'disabled' list instead.")
     E901 = ("Failed to remove existing output directory: {path}. If your "
             "config and the components you train change between runs, a "
             "non-empty output directory can lead to stale pipeline data. To "
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 242113cc6..8e688a27d 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -249,7 +249,10 @@ def create_evaluation_callback(
 
     def evaluate() -> Tuple[float, Dict[str, float]]:
         dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
+        try:
+            scores = nlp.evaluate(dev_examples)
+        except KeyError as e:
+            raise KeyError(Errors.E900) from e
         # Calculate a weighted sum based on score_weights for the main score.
         # We can only consider scores that are ints/floats, not dicts like
         # entity scores per type etc.
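Note: the `from e` in the new except clause chains the original KeyError as `__cause__`, so the low-level traceback stays visible underneath the friendlier E900 message. The same idiom in isolation, as a sketch (the wrapper function is hypothetical, mirroring the change in `create_evaluation_callback`):

```python
# Sketch: re-raise a low-level KeyError with the new E900 hint while
# keeping the original exception chained for debugging.
from spacy.errors import Errors

def evaluate_with_hint(nlp, dev_examples):
    # Hypothetical helper; the real patch does this inside the nested
    # evaluate() callback in spacy/training/loop.py.
    try:
        return nlp.evaluate(dev_examples)
    except KeyError as e:
        raise KeyError(Errors.E900) from e
```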
From 8316bc7d4a6dbd989d53f97a8c7a06758c8d356c Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 9 Oct 2020 12:06:20 +0200
Subject: [PATCH 3/8] bugfix DisabledPipes

---
 spacy/language.py                         |  3 +++
 spacy/tests/pipeline/test_pipe_methods.py | 14 ++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index 1fb559657..24e593043 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1034,6 +1034,9 @@ class Language:
                 )
             )
         disable = to_disable
+        # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
+        # those pipes that were already disabled.
+        disable = [d for d in disable if d not in self._disabled]
         return DisabledPipes(self, disable)
 
     def make_doc(self, text: str) -> Doc:
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index c693a7487..cd18b0159 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -129,6 +129,7 @@ def test_enable_pipes_method(nlp, name):
 
 @pytest.mark.parametrize("name", ["my_component"])
 def test_disable_pipes_context(nlp, name):
+    """Test that an enabled component stays enabled after running the context manager."""
     nlp.add_pipe("new_pipe", name=name)
     assert nlp.has_pipe(name)
     with nlp.select_pipes(disable=name):
@@ -136,6 +137,19 @@ def test_disable_pipes_context(nlp, name):
     assert nlp.has_pipe(name)
 
 
+@pytest.mark.parametrize("name", ["my_component"])
+def test_disable_pipes_context_restore(nlp, name):
+    """Test that a disabled component stays disabled after running the context manager."""
+    nlp.add_pipe("new_pipe", name=name)
+    assert nlp.has_pipe(name)
+    nlp.disable_pipes(name)
+    assert not nlp.has_pipe(name)
+    with nlp.select_pipes(disable=name):
+        assert not nlp.has_pipe(name)
+    assert not nlp.has_pipe(name)
+
+
+
 def test_select_pipes_list_arg(nlp):
     for name in ["c1", "c2", "c3"]:
         nlp.add_pipe("new_pipe", name=name)
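Note: a standalone reproduction of the bug fixed above, using the built-in `sentencizer` factory in place of the test fixture's `new_pipe` (a sketch; `disable_pipes` is the deprecated alias the new test also calls):

```python
# Sketch: before the fix, exiting select_pipes re-enabled pipes that
# were already disabled when the context manager was entered.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
nlp.disable_pipes("sentencizer")  # disabled up front
assert not nlp.has_pipe("sentencizer")
with nlp.select_pipes(disable="sentencizer"):
    assert not nlp.has_pipe("sentencizer")
# The pipe now stays disabled, because it is filtered out of the
# restore list; previously it came back enabled here.
assert not nlp.has_pipe("sentencizer")
```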
To " diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index cd18b0159..b744aed98 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -149,7 +149,6 @@ def test_disable_pipes_context_restore(nlp, name): assert not nlp.has_pipe(name) - def test_select_pipes_list_arg(nlp): for name in ["c1", "c2", "c3"]: nlp.add_pipe("new_pipe", name=name) From 2dd79454af73cb07d07ac1b9ad12644736e96bd5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 14:42:07 +0200 Subject: [PATCH 6/8] Update docs --- website/docs/usage/embeddings-transformers.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 549c3bcc4..942fc4e7b 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -514,7 +514,7 @@ Many neural network models are able to use word vector tables as additional features, which sometimes results in significant improvements in accuracy. spaCy's built-in embedding layer, [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use -word vector tables using the `also_use_static_vectors` flag. This setting is +word vector tables using the `include_static_vectors` flag. This setting is also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) layer, which builds the default token-to-vector encoding architecture. @@ -522,9 +522,9 @@ layer, which builds the default token-to-vector encoding architecture. [tagger.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" width = 128 -rows = 7000 -also_embed_subwords = true -also_use_static_vectors = true +attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = true ``` From 727370c633b37457ddbedc80aecf07e1dc2c967d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 14:42:51 +0200 Subject: [PATCH 7/8] Remove Span._recalculate_indices Remove `Span._recalculate_indices`, which is a remnant from the deprecated `Span.merge`. 
From 727370c633b37457ddbedc80aecf07e1dc2c967d Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 9 Oct 2020 14:42:51 +0200
Subject: [PATCH 7/8] Remove Span._recalculate_indices

Remove `Span._recalculate_indices`, which is a remnant from the
deprecated `Span.merge`.
---
 spacy/tests/doc/test_doc_api.py          |  9 +++------
 spacy/tests/doc/test_retokenize_merge.py |  1 +
 spacy/tokens/span.pxd                    |  1 -
 spacy/tokens/span.pyx                    | 17 -----------------
 4 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index ea832c136..db8a6d1c4 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -608,14 +608,11 @@ def test_doc_init_iob():
         doc = Doc(Vocab(), words=words, ents=ents)
 
 
-@pytest.mark.xfail
-def test_doc_set_ents_spans(en_tokenizer):
+def test_doc_set_ents_invalid_spans(en_tokenizer):
     doc = en_tokenizer("Some text about Colombia and the Czech Republic")
     spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
-    # If this line is uncommented, it works:
-    # print(spans)
-    doc.ents = spans
-    assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"]
+    with pytest.raises(IndexError):
+        doc.ents = spans
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index cb886545a..b483255c8 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -336,6 +336,7 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
         attrs = {"lemma": "none", "ent_type": "none"}
         retokenizer.merge(doc[0:2], attrs=attrs)
         retokenizer.merge(doc[-2:], attrs=attrs)
+    sent1, sent2 = list(doc.sents)
     assert len(sent1) == init_len - 1
     assert len(sent2) == init_len2 - 1
 
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index f6f88a23e..cc6b908bb 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -16,5 +16,4 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm
 
-    cpdef int _recalculate_indices(self) except -1
     cpdef np.ndarray to_array(self, object features)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 64c3c7df0..491ba0266 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -150,7 +150,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#len
         """
-        self._recalculate_indices()
         if self.end < self.start:
             return 0
         return self.end - self.start
@@ -167,7 +166,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#getitem
         """
-        self._recalculate_indices()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
@@ -188,7 +186,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#iter
         """
-        self._recalculate_indices()
         for i in range(self.start, self.end):
             yield self.doc[i]
 
@@ -339,19 +336,6 @@ cdef class Span:
                     output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
         return output
 
-    cpdef int _recalculate_indices(self) except -1:
-        if self.end > self.doc.length \
-                or self.doc.c[self.start].idx != self.start_char \
-                or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
-            start = token_by_start(self.doc.c, self.doc.length, self.start_char)
-            if self.start == -1:
-                raise IndexError(Errors.E036.format(start=self.start_char))
-            end = token_by_end(self.doc.c, self.doc.length, self.end_char)
-            if end == -1:
-                raise IndexError(Errors.E037.format(end=self.end_char))
-            self.start = start
-            self.end = end + 1
-
     @property
     def vocab(self):
         """RETURNS (Vocab): The Span's Doc's vocab."""
@@ -520,7 +504,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#root
         """
-        self._recalculate_indices()
         if "root" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["root"](self)
         # This should probably be called 'head', and the other one called
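Note: the rewritten test pins down the new contract: a `Span` created before retokenization keeps its stale token indices rather than being silently re-aligned, so assigning it to `doc.ents` fails loudly. A standalone version of that test, as a sketch (assuming a blank English pipeline in place of the `en_tokenizer` fixture):

```python
# Sketch: stale spans now raise instead of being re-aligned on access.
import pytest
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Some text about Colombia and the Czech Republic")
spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
with doc.retokenize() as retokenizer:
    for span in spans:
        retokenizer.merge(span)  # "Czech Republic" becomes one token
with pytest.raises(IndexError):
    doc.ents = spans  # indices refer to the pre-merge tokenization
```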
From 97ff090e495208a5944561e210c76ef77e93eab3 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 9 Oct 2020 16:03:57 +0200
Subject: [PATCH 8/8] Fix docs example [ci skip]

---
 website/docs/usage/processing-pipelines.md | 54 +++++++++-------------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index fdae6d3e5..83134962b 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1403,9 +1403,9 @@ especially useful it you want to pass in a string instead of calling
 
 This example shows the implementation of a pipeline component that fetches
 country meta data via the [REST Countries API](https://restcountries.eu), sets
-entity annotations for countries, merges entities into one token and sets custom
-attributes on the `Doc`, `Span` and `Token` – for example, the capital,
-latitude/longitude coordinates and even the country flag.
+entity annotations for countries and sets custom attributes on the `Doc` and
+`Span` – for example, the capital, latitude/longitude coordinates and even the
+country flag.
 
 ```python
 ### {executable="true"}
@@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
         # Set up the PhraseMatcher with Doc patterns for each country name
         self.matcher = PhraseMatcher(nlp.vocab)
         self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
-        # Register attribute on the Token. We'll be overwriting this based on
+        # Register attributes on the Span. We'll be overwriting this based on
         # the matches, so we're only setting a default value, not a getter.
-        Token.set_extension("is_country", default=False)
-        Token.set_extension("country_capital", default=False)
-        Token.set_extension("country_latlng", default=False)
-        Token.set_extension("country_flag", default=False)
-        # Register attributes on Doc and Span via a getter that checks if one of
-        # the contained tokens is set to is_country == True.
+        Span.set_extension("is_country", default=None)
+        Span.set_extension("country_capital", default=None)
+        Span.set_extension("country_latlng", default=None)
+        Span.set_extension("country_flag", default=None)
+        # Register attribute on Doc via a getter that checks if the Doc
+        # contains a country entity
         Doc.set_extension("has_country", getter=self.has_country)
-        Span.set_extension("has_country", getter=self.has_country)
 
     def __call__(self, doc):
         spans = []  # keep the spans for later so we can merge them afterwards
         for _, start, end in self.matcher(doc):
             # Generate Span representing the entity & set label
             entity = Span(doc, start, end, label=self.label)
+            # Set custom attributes on entity. Can be extended with other data
+            # returned by the API, like currencies, country code, calling code etc.
+            entity._.set("is_country", True)
+            entity._.set("country_capital", self.countries[entity.text]["capital"])
+            entity._.set("country_latlng", self.countries[entity.text]["latlng"])
+            entity._.set("country_flag", self.countries[entity.text]["flag"])
             spans.append(entity)
-            # Set custom attribute on each token of the entity
-            # Can be extended with other data returned by the API, like
-            # currencies, country code, flag, calling code etc.
-            for token in entity:
-                token._.set("is_country", True)
-                token._.set("country_capital", self.countries[entity.text]["capital"])
-                token._.set("country_latlng", self.countries[entity.text]["latlng"])
-                token._.set("country_flag", self.countries[entity.text]["flag"])
-        # Iterate over all spans and merge them into one token
-        with doc.retokenize() as retokenizer:
-            for span in spans:
-                retokenizer.merge(span)
         # Overwrite doc.ents and add entity – be careful not to replace!
         doc.ents = list(doc.ents) + spans
         return doc  # don't forget to return the Doc!
 
-    def has_country(self, tokens):
-        """Getter for Doc and Span attributes. Since the getter is only called
-        when we access the attribute, we can refer to the Token's 'is_country'
+    def has_country(self, doc):
+        """Getter for Doc attributes. Since the getter is only called
+        when we access the attribute, we can refer to the Span's 'is_country'
         attribute here, which is already set in the processing step."""
-        return any([t._.get("is_country") for t in tokens])
+        return any([entity._.get("is_country") for entity in doc.ents])
 
 nlp = English()
 nlp.add_pipe("rest_countries", config={"label": "GPE"})
 doc = nlp("Some text about Colombia and the Czech Republic")
 print("Pipeline", nlp.pipe_names)  # pipeline contains component name
 print("Doc has countries", doc._.has_country)  # Doc contains countries
-for token in doc:
-    if token._.is_country:
-        print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag)
-print("Entities", [(e.text, e.label_) for e in doc.ents])
+for ent in doc.ents:
+    if ent._.is_country:
+        print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag)
 ```
 
 In this case, all data can be fetched on initialization in one request. However,
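Note: the fix reduces the docs example to one core pattern: default-valued extensions on `Span` for per-entity data, plus a `Doc`-level getter derived from `doc.ents`. That pattern in isolation, as a sketch with a manually created entity standing in for the `PhraseMatcher` step:

```python
# Sketch: Span extensions with a derived Doc-level getter.
from spacy.lang.en import English
from spacy.tokens import Doc, Span

Span.set_extension("is_country", default=None)
Doc.set_extension(
    "has_country",
    getter=lambda doc: any(ent._.get("is_country") for ent in doc.ents),
)

nlp = English()
doc = nlp("Some text about Colombia")
entity = Span(doc, 3, 4, label="GPE")  # manual stand-in for a match
entity._.set("is_country", True)
doc.ents = list(doc.ents) + [entity]
print(doc._.has_country)  # True
```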