Add option for improved NER feature extraction (#4671)

* Support option of three NER features
* Expose nr_feature parser model setting
* Give feature tokens a better name
* Test nr_feature=3 for NER
* Format
2019-11-19 15:03:14 +01:00 · 2019-11-19 15:03:14 +01:00 · 4b123952aa
parent 5ad5c4b44a
commit 4b123952aa
3 changed files with 49 additions and 5 deletions
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@ -100,10 +100,30 @@ cdef cppclass StateC:
        free(this.shifted - PADDING)

    void set_context_tokens(int* ids, int n) nogil:
-        if n == 2:
+        if n == 1:
+            if this.B(0) >= 0:
+                ids[0] = this.B(0)
+            else:
+                ids[0] = -1
+        elif n == 2:
            ids[0] = this.B(0)
            ids[1] = this.S(0)
-        if n == 8:
+        elif n == 3:
+            if this.B(0) >= 0:
+                ids[0] = this.B(0)
+            else:
+                ids[0] = -1
+            # First word of entity, if any
+            if this.entity_is_open():
+                ids[1] = this.E(0)
+            else:
+                ids[1] = -1
+            # Last word of entity, if within entity
+            if ids[0] == -1 or ids[1] == -1:
+                ids[2] = -1
+            else:
+                ids[2] = ids[0] - 1
+        elif n == 8:
            ids[0] = this.B(0)
            ids[1] = this.B(1)
            ids[2] = this.S(0)
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -61,6 +61,7 @@ cdef class Parser:
        t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
        bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
        self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
+        nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature)
        if depth != 1:
            raise ValueError(TempErrors.T004.format(value=depth))
        parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
@ -80,7 +81,7 @@ cdef class Parser:
        tok2vec = chain(tok2vec, flatten)
        tok2vec.nO = token_vector_width
        lower = PrecomputableAffine(hidden_width,
-                    nF=cls.nr_feature, nI=token_vector_width,
+                    nF=nr_feature_tokens, nI=token_vector_width,
                    nP=parser_maxout_pieces)
        lower.nP = parser_maxout_pieces

@ -90,6 +91,7 @@ cdef class Parser:

        cfg = {
            'nr_class': nr_class,
+            'nr_feature_tokens': nr_feature_tokens,
            'hidden_depth': depth,
            'token_vector_width': token_vector_width,
            'hidden_width': hidden_width,
@ -133,6 +135,7 @@ cdef class Parser:
        if 'beam_update_prob' not in cfg:
            cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0)
        cfg.setdefault('cnn_maxout_pieces', 3)
+        cfg.setdefault("nr_feature_tokens", self.nr_feature)
        self.cfg = cfg
        self.model = model
        self._multitasks = []
@ -299,7 +302,7 @@ cdef class Parser:
        token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
                                 dtype='i', order='C')
        cdef int* c_ids
-        cdef int nr_feature = self.nr_feature
+        cdef int nr_feature = self.cfg["nr_feature_tokens"]
        cdef int n_states
        model = self.model(docs)
        todo = [beam for beam in beams if not beam.is_done]
@ -502,7 +505,7 @@ cdef class Parser:
            self.moves.preprocess_gold(gold)
        model, finish_update = self.model.begin_update(docs, drop=drop)
        states_d_scores, backprops, beams = _beam_utils.update_beam(
-            self.moves, self.nr_feature, 10000, states, golds, model.state2vec,
+            self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec,
            model.vec2scores, width, drop=drop, losses=losses,
            beam_density=beam_density)
        for i, d_scores in enumerate(states_d_scores):
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@ -259,6 +259,27 @@ def test_block_ner():
    assert [token.ent_type_ for token in doc] == expected_types


+def test_change_number_features():
+    # Default case: with no override, the lower layer's nF must equal ner.nr_feature
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    nlp.add_pipe(ner)
+    ner.add_label("PERSON")
+    nlp.begin_training()
+    assert ner.model.lower.nF == ner.nr_feature
+    # Override case: "nr_feature_tokens" passed via component_cfg must set the lower layer's nF
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    nlp.add_pipe(ner)
+    ner.add_label("PERSON")
+    nlp.begin_training(
+        component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}}
+    )
+    assert ner.model.lower.nF == 3
+    # Smoke test: the pipeline must run on a text with the overridden setting (output not checked)
+    doc = nlp("hello world")
+
+
 class BlockerComponent1(object):
    name = "my_blocker"