From 4b123952aa04fe52b710679ba655cec7e6cc5b2b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 19 Nov 2019 15:03:14 +0100
Subject: [PATCH] Add option for improved NER feature extraction (#4671)

* Support option of three NER features

* Expose nr_feature parser model setting

* Give feature tokens better name

* Test nr_feature=3 for NER

* Format
---
 spacy/syntax/_state.pxd        | 24 ++++++++++++++++++++++--
 spacy/syntax/nn_parser.pyx     |  9 ++++++---
 spacy/tests/parser/test_ner.py | 21 +++++++++++++++++++++
 3 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index 65c0a3b4d..141d796a4 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -100,10 +100,30 @@ cdef cppclass StateC:
         free(this.shifted - PADDING)
 
     void set_context_tokens(int* ids, int n) nogil:
-        if n == 2:
+        if n == 1:
+            if this.B(0) >= 0:
+                ids[0] = this.B(0)
+            else:
+                ids[0] = -1
+        elif n == 2:
             ids[0] = this.B(0)
             ids[1] = this.S(0)
-        if n == 8:
+        elif n == 3:
+            if this.B(0) >= 0:
+                ids[0] = this.B(0)
+            else:
+                ids[0] = -1
+            # First word of entity, if any
+            if this.entity_is_open():
+                ids[1] = this.E(0)
+            else:
+                ids[1] = -1
+            # Last word of the open entity (the token just before B(0)); -1 if B(0) or E(0) is unset
+            if ids[0] == -1 or ids[1] == -1:
+                ids[2] = -1
+            else:
+                ids[2] = ids[0] - 1
+        elif n == 8:
             ids[0] = this.B(0)
             ids[1] = this.B(1)
             ids[2] = this.S(0)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 0ed7e6952..d4489c18d 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -61,6 +61,7 @@ cdef class Parser:
         t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
         bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
         self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
+        nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature)
         if depth != 1:
             raise ValueError(TempErrors.T004.format(value=depth))
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
@@ -80,7 +81,7 @@ cdef class Parser:
         tok2vec = chain(tok2vec, flatten)
         tok2vec.nO = token_vector_width
         lower = PrecomputableAffine(hidden_width,
-                    nF=cls.nr_feature, nI=token_vector_width,
+                    nF=nr_feature_tokens, nI=token_vector_width,
                     nP=parser_maxout_pieces)
         lower.nP = parser_maxout_pieces
 
@@ -90,6 +91,7 @@ cdef class Parser:
 
         cfg = {
             'nr_class': nr_class,
+            'nr_feature_tokens': nr_feature_tokens,
             'hidden_depth': depth,
             'token_vector_width': token_vector_width,
             'hidden_width': hidden_width,
@@ -133,6 +135,7 @@ cdef class Parser:
         if 'beam_update_prob' not in cfg:
             cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0)
         cfg.setdefault('cnn_maxout_pieces', 3)
+        cfg.setdefault("nr_feature_tokens", self.nr_feature)
         self.cfg = cfg
         self.model = model
         self._multitasks = []
@@ -299,7 +302,7 @@ cdef class Parser:
         token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
                                  dtype='i', order='C')
         cdef int* c_ids
-        cdef int nr_feature = self.nr_feature
+        cdef int nr_feature = self.cfg["nr_feature_tokens"]
         cdef int n_states
         model = self.model(docs)
         todo = [beam for beam in beams if not beam.is_done]
@@ -502,7 +505,7 @@ cdef class Parser:
             self.moves.preprocess_gold(gold)
         model, finish_update = self.model.begin_update(docs, drop=drop)
         states_d_scores, backprops, beams = _beam_utils.update_beam(
-            self.moves, self.nr_feature, 10000, states, golds, model.state2vec,
+            self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec,
             model.vec2scores, width, drop=drop, losses=losses,
             beam_density=beam_density)
         for i, d_scores in enumerate(states_d_scores):
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index d05403891..fbad76db5 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -259,6 +259,27 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types
 
 
+def test_change_number_features():
+    # By default, the lower layer's nF should equal the class-level nr_feature
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    nlp.add_pipe(ner)
+    ner.add_label("PERSON")
+    nlp.begin_training()
+    assert ner.model.lower.nF == ner.nr_feature
+    # Passing nr_feature_tokens through component_cfg should override that default
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    nlp.add_pipe(ner)
+    ner.add_label("PERSON")
+    nlp.begin_training(
+        component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}}
+    )
+    assert ner.model.lower.nF == 3
+    # Smoke test: the pipeline should still run with the non-default feature count
+    doc = nlp("hello world")
+
+
 class BlockerComponent1(object):
     name = "my_blocker"