Merge branch 'develop' into feature/refactor-config-args

2020-07-10 22:50:07 +02:00 · 2020-07-10 22:50:07 +02:00 · 7b5717cac3
parent f2cd982e7b e6a6587a9a
commit 7b5717cac3
4 changed files with 92 additions and 16 deletions
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a3"
+__version__ = "3.0.0a4"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@ -245,6 +245,13 @@ class ParserStepModel(Model):
            for class_ in unseen_classes:
                self._class_mask[class_] = 0.

+    def clear_memory(self):
+        del self.tokvecs
+        del self.bp_tokvecs
+        del self.state2vec
+        del self.backprops
+        del self._class_mask
+
    @property
    def nO(self):
        if self.attrs["has_upper"]:
@ -273,6 +280,19 @@ class ParserStepModel(Model):
            c_ids += ids.shape[1]
        return ids

+    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
+        if isinstance(self.state2vec.ops, CupyOps) \
+        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+            # Move token_ids and d_vector to GPU, asynchronously
+            self.backprops.append((
+                util.get_async(self.cuda_stream, token_ids),
+                util.get_async(self.cuda_stream, d_vector),
+                get_d_tokvecs
+            ))
+        else:
+            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
+
+
    def finish_steps(self, golds):
        # Add a padding vector to the d_tokvecs gradient, so that missing
        # values don't affect the real gradient.
@ -315,16 +335,7 @@ def step_forward(model: ParserStepModel, states, is_train):
        d_vector = get_d_vector(d_scores)
        if mask is not None:
            d_vector *= mask
-        if isinstance(model.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
-            # Move token_ids and d_vector to GPU, asynchronously
-            model.backprops.append((
-                util.get_async(model.cuda_stream, token_ids),
-                util.get_async(model.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            model.backprops.append((token_ids, d_vector, get_d_tokvecs))
+        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
        return None
    return scores, backprop_parser_step

--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -200,6 +200,8 @@ cdef class Parser:
        with nogil:
            self._parseC(&states[0],
                weights, sizes)
+        model.clear_memory()
+        del model
        return batch

    cdef void _parseC(self, StateC** states,
@ -312,6 +314,13 @@ cdef class Parser:
        if set_annotations:
            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, all_states)
+        # Ugh, this is annoying. If we're working on GPU, we want to free the
+        # memory ASAP. It seems that Python doesn't necessarily get around to
+        # removing these in time if we don't explicitly delete? It's confusing.
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        del model
        return losses

    def rehearse(self, examples, sgd=None, losses=None, **cfg):
@ -335,7 +344,7 @@ cdef class Parser:
        set_dropout_rate(self._rehearsal_model, 0.0)
        set_dropout_rate(self.model, 0.0)
        tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, finish_update = self.model.begin_update(docs)
+        model, backprop_tok2vec = self.model.begin_update(docs)
        n_scores = 0.
        loss = 0.
        while states:
@ -351,10 +360,16 @@ cdef class Parser:
            states = [state for state in states if not state.is_final()]
            n_scores += d_scores.size
        # Do the backprop
-        finish_update(docs)
+        backprop_tok2vec(docs)
        if sgd is not None:
            self.model.finish_update(sgd)
        losses[self.name] += loss / n_scores
+        del backprop
+        del backprop_tok2vec
+        model.clear_memory()
+        tutor.clear_memory()
+        del model
+        del tutor
        return losses

    def get_gradients(self):
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@ -488,7 +488,8 @@ data for machine learning models, developed by us. It integrates with spaCy
 out-of-the-box and provides many different
 [annotation recipes](https://prodi.gy/docs/recipes) for a variety of NLP tasks,
 with and without a model in the loop. If Prodigy is installed in your project,
-you can
+you can start the annotation server from your `project.yml` for a tight feedback
+loop between data development and training.

 The following example command starts the Prodigy app using the
 [`ner.correct`](https://prodi.gy/docs/recipes#ner-correct) recipe and streams in
@ -497,6 +498,12 @@ then correct the suggestions manually in the UI. After you save and exit the
 server, the full dataset is exported in spaCy's format and split into a training
 and evaluation set.

+> #### Example usage
+>
+> ```bash
+> $ python -m spacy project run annotate
+> ```
+
 <!-- prettier-ignore -->
 ```yaml
 ### project.yml
@ -509,7 +516,9 @@ commands:
  - name: annotate
  - script:
      - 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}'
-      - 'python -m prodigy data-to-spacy ./corpus/train.spacy ./corpus/eval.spacy --ner {PRODIGY_DATASET}'
+      - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner {PRODIGY_DATASET}'
+      - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
+      - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
  - deps:
      - 'assets/raw_data.jsonl'
  - outputs:
@ -517,6 +526,15 @@ commands:
      - 'corpus/eval.spacy'
 ```

+You can use the same approach for other types of projects and annotation
+workflows, including
+[text classification](https://prodi.gy/docs/recipes#textcat),
+[dependency parsing](https://prodi.gy/docs/recipes#dep),
+[part-of-speech tagging](https://prodi.gy/docs/recipes#pos) or fully
+[custom recipes](https://prodi.gy/docs/custom-recipes) – for instance, an A/B
+evaluation workflow that lets you compare two different models and their
+results.
+
 <Project id="integrations/prodigy">

 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
@ -567,6 +585,12 @@ MODELS = [name.strip() for name in sys.argv[1].split(",")]
 spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"])
 ```

+> #### Example usage
+>
+> ```bash
+> $ python -m spacy project run visualize
+> ```
+
 <!-- prettier-ignore -->
 ```yaml
 ### project.yml
@ -591,7 +615,33 @@ mattis pretium.

 ### FastAPI {#fastapi} <IntegrationLogo name="fastapi" width={100} height="auto" align="right" />

-<!-- TODO: come up with example – there's not much integration needed, but it'd be nice to show an example that addresses some of the main concerns for serving ML (workers etc.) -->
+[FastAPI](https://fastapi.tiangolo.com/) is a modern high-performance framework
+for building REST APIs with Python, based on Python
+[type hints](https://fastapi.tiangolo.com/python-types/). It's become a popular
+library for serving machine learning models behind a REST API.
+
+```python
+# TODO: show an example that addresses some of the main concerns for serving ML (workers etc.)
+```
+
+> #### Example usage
+>
+> ```bash
+> $ python -m spacy project run serve
+> ```
+
+<!-- prettier-ignore -->
+```yaml
+### project.yml
+commands:
+  - name: serve
+    help: "Serve the trained model with FastAPI"
+    script:
+      - 'python ./scripts/serve.py ./training/model-best'
+    deps:
+      - 'training/model-best'
+    no_skip: true
+```

 <Project id="integrations/fastapi">