From 4b123952aa04fe52b710679ba655cec7e6cc5b2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 19 Nov 2019 15:03:14 +0100 Subject: [PATCH] Add option for improved NER feature extraction (#4671) * Support option of three NER features * Expose nr_feature parser model setting * Give feature tokens better name * Test nr_feature=3 for NER * Format --- spacy/syntax/_state.pxd | 24 ++++++++++++++++++++++-- spacy/syntax/nn_parser.pyx | 9 ++++++--- spacy/tests/parser/test_ner.py | 21 +++++++++++++++++++++ 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 65c0a3b4d..141d796a4 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -100,10 +100,30 @@ cdef cppclass StateC: free(this.shifted - PADDING) void set_context_tokens(int* ids, int n) nogil: - if n == 2: + if n == 1: + if this.B(0) >= 0: + ids[0] = this.B(0) + else: + ids[0] = -1 + elif n == 2: ids[0] = this.B(0) ids[1] = this.S(0) - if n == 8: + elif n == 3: + if this.B(0) >= 0: + ids[0] = this.B(0) + else: + ids[0] = -1 + # First word of entity, if any + if this.entity_is_open(): + ids[1] = this.E(0) + else: + ids[1] = -1 + # Last word of entity, if within entity + if ids[0] == -1 or ids[1] == -1: + ids[2] = -1 + else: + ids[2] = ids[0] - 1 + elif n == 8: ids[0] = this.B(0) ids[1] = this.B(1) ids[2] = this.S(0) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0ed7e6952..d4489c18d 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -61,6 +61,7 @@ cdef class Parser: t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3)) bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0)) self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0)) + nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature) if depth != 1: raise ValueError(TempErrors.T004.format(value=depth)) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', @@ -80,7 +81,7 @@ cdef class Parser: tok2vec = chain(tok2vec, flatten) tok2vec.nO = token_vector_width lower = PrecomputableAffine(hidden_width, - nF=cls.nr_feature, nI=token_vector_width, + nF=nr_feature_tokens, nI=token_vector_width, nP=parser_maxout_pieces) lower.nP = parser_maxout_pieces @@ -90,6 +91,7 @@ cdef class Parser: cfg = { 'nr_class': nr_class, + 'nr_feature_tokens': nr_feature_tokens, 'hidden_depth': depth, 'token_vector_width': token_vector_width, 'hidden_width': hidden_width, @@ -133,6 +135,7 @@ cdef class Parser: if 'beam_update_prob' not in cfg: cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0) cfg.setdefault('cnn_maxout_pieces', 3) + cfg.setdefault("nr_feature_tokens", self.nr_feature) self.cfg = cfg self.model = model self._multitasks = [] @@ -299,7 +302,7 @@ cdef class Parser: token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), dtype='i', order='C') cdef int* c_ids - cdef int nr_feature = self.nr_feature + cdef int nr_feature = self.cfg["nr_feature_tokens"] cdef int n_states model = self.model(docs) todo = [beam for beam in beams if not beam.is_done] @@ -502,7 +505,7 @@ cdef class Parser: self.moves.preprocess_gold(gold) model, finish_update = self.model.begin_update(docs, drop=drop) states_d_scores, backprops, beams = _beam_utils.update_beam( - self.moves, self.nr_feature, 10000, states, golds, model.state2vec, + self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec, model.vec2scores, width, drop=drop, losses=losses, beam_density=beam_density) for i, d_scores in enumerate(states_d_scores): diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index d05403891..fbad76db5 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -259,6 +259,27 @@ def test_block_ner(): assert [token.ent_type_ for token in doc] == expected_types +def test_change_number_features(): + # Test the default number features + nlp = English() + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + ner.add_label("PERSON") + nlp.begin_training() + assert ner.model.lower.nF == ner.nr_feature + # Test we can change it + nlp = English() + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + ner.add_label("PERSON") + nlp.begin_training( + component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}} + ) + assert ner.model.lower.nF == 3 + # Test the model runs + doc = nlp("hello world") + + class BlockerComponent1(object): name = "my_blocker"