From 2c4b2ee5e9b29442c119e9c8bb2b5bce761a78aa Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sat, 3 Oct 2020 23:27:05 +0200
Subject: [PATCH] REL intro and get_candidates function

---
 website/docs/usage/layers-architectures.md | 54 ++++++++++++++++++++++
 website/docs/usage/processing-pipelines.md |  2 +-
 2 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index b65c3d903..678f70667 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -486,6 +486,60 @@ with Model.define_operators({">>": chain}):
 
 ## Create new trainable components {#components}
 
+In addition to [swapping out](#swap-architectures) default models in built-in
+components, you can also implement an entirely new,
+[trainable pipeline component](/usage/processing-pipelines#trainable-components)
+from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), 
+and linking it up to your custom model implementation.
+
+### Example: Pipeline component for relation extraction {#component-rel}
+
+This section will run through an example of implementing a novel relation extraction 
+component from scratch. As a first step, we need a method that will generate pairs of
+entities that we want to classify as being related or not. These candidate pairs are 
+typically formed within one document, which means we'll have a function that takes a 
+`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus 
+on binary relation extraction, i.e. the tuple will be of length 2.
+
+We register this function in the `misc` registry so we can easily refer to it from the config,
+and allow swapping it out for any other candidate
+generation function. For instance, a very straightforward implementation would be to just 
+take any two entities from the same document:
+
+```python
+@registry.misc.register("rel_cand_generator.v1")
+def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]:
+    def get_candidate_indices(doc: "Doc"):
+        indices = []
+        for ent1 in doc.ents:
+            for ent2 in doc.ents:
+                indices.append((ent1, ent2))
+        return indices
+    return get_candidate_indices
+```
+
+But we could also refine this further by excluding relations of an entity with itself, 
+and imposing a maximum distance (in number of tokens) between two entities:
+
+```python
+### {highlight="1,2,7,8"}
+@registry.misc.register("rel_cand_generator.v2")
+def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
+    def get_candidate_indices(doc: "Doc"):
+        indices = []
+        for ent1 in doc.ents:
+            for ent2 in doc.ents:
+                if ent1 != ent2:
+                    if max_length and abs(ent2.start - ent1.start) <= max_length:
+                        indices.append((ent1, ent2))
+        return indices
+    return get_candidate_indices
+```
+
+
+
+
+
 <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
 </Infobox>
 
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index c98bd08bc..3619993c5 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1035,7 +1035,7 @@ plug fully custom machine learning components into your pipeline. You'll need
 the following:
 
 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
-   can be a model using implemented in
+   can be a model implemented in
    [Thinc](/usage/layers-architectures#thinc), or a
    [wrapped model](/usage/layers-architectures#frameworks) implemented in
    PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a