mirror of https://github.com/explosion/spaCy.git
Add option for improved NER feature extraction (#4671)
* Support option of three NER features
* Expose nr_feature parser model setting
* Give feature tokens better name
* Test nr_feature=3 for NER
* Format
This commit is contained in:
parent
5ad5c4b44a
commit
4b123952aa
|
@@ -100,10 +100,30 @@ cdef cppclass StateC:
|
|||
free(this.shifted - PADDING)
|
||||
|
||||
void set_context_tokens(int* ids, int n) nogil:
|
||||
if n == 2:
|
||||
if n == 1:
|
||||
if this.B(0) >= 0:
|
||||
ids[0] = this.B(0)
|
||||
else:
|
||||
ids[0] = -1
|
||||
elif n == 2:
|
||||
ids[0] = this.B(0)
|
||||
ids[1] = this.S(0)
|
||||
if n == 8:
|
||||
elif n == 3:
|
||||
if this.B(0) >= 0:
|
||||
ids[0] = this.B(0)
|
||||
else:
|
||||
ids[0] = -1
|
||||
# First word of entity, if any
|
||||
if this.entity_is_open():
|
||||
ids[1] = this.E(0)
|
||||
else:
|
||||
ids[1] = -1
|
||||
# Last word of entity, if within entity
|
||||
if ids[0] == -1 or ids[1] == -1:
|
||||
ids[2] = -1
|
||||
else:
|
||||
ids[2] = ids[0] - 1
|
||||
elif n == 8:
|
||||
ids[0] = this.B(0)
|
||||
ids[1] = this.B(1)
|
||||
ids[2] = this.S(0)
|
||||
|
|
|
@@ -61,6 +61,7 @@ cdef class Parser:
|
|||
t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
|
||||
bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
|
||||
self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
|
||||
nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature)
|
||||
if depth != 1:
|
||||
raise ValueError(TempErrors.T004.format(value=depth))
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
|
||||
|
@@ -80,7 +81,7 @@ cdef class Parser:
|
|||
tok2vec = chain(tok2vec, flatten)
|
||||
tok2vec.nO = token_vector_width
|
||||
lower = PrecomputableAffine(hidden_width,
|
||||
nF=cls.nr_feature, nI=token_vector_width,
|
||||
nF=nr_feature_tokens, nI=token_vector_width,
|
||||
nP=parser_maxout_pieces)
|
||||
lower.nP = parser_maxout_pieces
|
||||
|
||||
|
@@ -90,6 +91,7 @@ cdef class Parser:
|
|||
|
||||
cfg = {
|
||||
'nr_class': nr_class,
|
||||
'nr_feature_tokens': nr_feature_tokens,
|
||||
'hidden_depth': depth,
|
||||
'token_vector_width': token_vector_width,
|
||||
'hidden_width': hidden_width,
|
||||
|
@@ -133,6 +135,7 @@ cdef class Parser:
|
|||
if 'beam_update_prob' not in cfg:
|
||||
cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0)
|
||||
cfg.setdefault('cnn_maxout_pieces', 3)
|
||||
cfg.setdefault("nr_feature_tokens", self.nr_feature)
|
||||
self.cfg = cfg
|
||||
self.model = model
|
||||
self._multitasks = []
|
||||
|
@@ -299,7 +302,7 @@ cdef class Parser:
|
|||
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
|
||||
dtype='i', order='C')
|
||||
cdef int* c_ids
|
||||
cdef int nr_feature = self.nr_feature
|
||||
cdef int nr_feature = self.cfg["nr_feature_tokens"]
|
||||
cdef int n_states
|
||||
model = self.model(docs)
|
||||
todo = [beam for beam in beams if not beam.is_done]
|
||||
|
@@ -502,7 +505,7 @@ cdef class Parser:
|
|||
self.moves.preprocess_gold(gold)
|
||||
model, finish_update = self.model.begin_update(docs, drop=drop)
|
||||
states_d_scores, backprops, beams = _beam_utils.update_beam(
|
||||
self.moves, self.nr_feature, 10000, states, golds, model.state2vec,
|
||||
self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec,
|
||||
model.vec2scores, width, drop=drop, losses=losses,
|
||||
beam_density=beam_density)
|
||||
for i, d_scores in enumerate(states_d_scores):
|
||||
|
|
|
@@ -259,6 +259,27 @@ def test_block_ner():
|
|||
assert [token.ent_type_ for token in doc] == expected_types
|
||||
|
||||
|
||||
def test_change_number_features():
    """Check that the NER lower layer's ``nF`` follows ``nr_feature_tokens``.

    First trains with no overrides and compares ``nF`` against the
    component's ``nr_feature``; then trains with ``nr_feature_tokens=3`` and
    checks ``nF == 3``; finally runs the customised pipeline on a short text.
    """

    def _trained_ner(**training_kwargs):
        # Build a fresh English pipeline holding one NER component with a
        # single label, then initialise it with the given training options.
        pipeline = English()
        component = pipeline.create_pipe("ner")
        pipeline.add_pipe(component)
        component.add_label("PERSON")
        pipeline.begin_training(**training_kwargs)
        return pipeline, component

    # Default configuration: nF should equal the component's nr_feature.
    _, default_ner = _trained_ner()
    assert default_ner.model.lower.nF == default_ner.nr_feature

    # Overridden configuration: nF should pick up the requested value.
    overrides = {"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}}
    custom_nlp, custom_ner = _trained_ner(component_cfg=overrides)
    assert custom_ner.model.lower.nF == 3

    # Smoke test: the reconfigured pipeline must process text without error.
    custom_nlp("hello world")
|
||||
|
||||
|
||||
class BlockerComponent1(object):
|
||||
name = "my_blocker"
|
||||
|
||||
|
|
Loading…
Reference in New Issue