try Tok2Vec instead of SpacyVectors

svlandeg 2019-06-25 16:09:22 +02:00
parent 8608685543
commit bee23cd8af
2 changed files with 69 additions and 43 deletions


@@ -61,22 +61,23 @@ def run_pipeline():
     to_create_kb = False
     # read KB back in from file
-    to_read_kb = True
+    to_read_kb = False
     to_test_kb = False
     # create training dataset
     create_wp_training = False
     # train the EL pipe
-    train_pipe = True
-    measure_performance = True
+    train_pipe = False
+    measure_performance = False
     # test the EL pipe on a simple example
-    to_test_pipeline = True
+    to_test_pipeline = False
     # write the NLP object, read back in and test again
     to_write_nlp = False
-    to_read_nlp = False
+    to_read_nlp = True
+    test_from_file = True
 
     # STEP 1 : create prior probabilities from WP (run only once)
     if to_create_prior_probs:
@@ -134,6 +135,7 @@ def run_pipeline():
                                                      training_output=TRAINING_DIR)
 
     # STEP 6: create and train the entity linking pipe
+    if train_pipe:
         el_pipe = nlp_2.create_pipe(name='entity_linker', config={})
         el_pipe.set_kb(kb_2)
         nlp_2.add_pipe(el_pipe, last=True)
@@ -144,11 +146,10 @@ def run_pipeline():
         optimizer.learn_rate = LEARN_RATE
         optimizer.L2 = L2
 
-    if train_pipe:
         print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
         # define the size (nr of entities) of training and dev set
         train_limit = 5000
-        dev_limit = 5000
+        dev_limit = 10000
 
         train_data = training_set_creator.read_training(nlp=nlp_2,
                                                         training_dir=TRAINING_DIR,
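
Note: the hunk above only adjusts the dev-set size and relocates the train_pipe check; the actual update loop sits outside the changed lines. Below is a minimal sketch of the kind of spaCy v2 training loop these settings feed into; the batch size, dropout value and the nlp_2.update() call are illustrative assumptions, not taken from this commit:

    from spacy.util import minibatch

    losses = {}
    for batch in minibatch(train_data, size=128):   # batch size is a guess
        docs, golds = zip(*batch)                   # train_data holds (doc, gold) pairs
        # update only the entity_linker weights, using the optimizer configured above
        nlp_2.update(docs, golds, sgd=optimizer, drop=0.1, losses=losses)
    print("EL losses:", losses)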
@@ -245,24 +246,40 @@ def run_pipeline():
         print("writing to", NLP_2_DIR)
         nlp_2.to_disk(NLP_2_DIR)
         print()
 
-    print("reading from", NLP_2_DIR)
-    nlp_3 = spacy.load(NLP_2_DIR)
-
     # verify that the IO has gone correctly
     if to_read_nlp:
+        print("reading from", NLP_2_DIR)
+        nlp_3 = spacy.load(NLP_2_DIR)
+
+        if test_from_file:
+            dev_limit = 5000
+            dev_data = training_set_creator.read_training(nlp=nlp_3,
+                                                          training_dir=TRAINING_DIR,
+                                                          dev=True,
+                                                          limit=dev_limit)
+
+            print("Dev testing from file on", len(dev_data), "articles")
             print()
-        print("running toy example with NLP 2")
+
+            dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data)
+            print("dev acc combo avg:", round(dev_acc_combo, 3),
+                  [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])
+        else:
+            print("running toy example with NLP 3")
             run_el_toy_example(nlp=nlp_3)
 
     print()
     print("STOP", datetime.datetime.now())
 
 
-def _measure_accuracy(data, el_pipe):
+def _measure_accuracy(data, el_pipe=None):
+    # If the docs in the data require further processing with an entity linker, set el_pipe
     correct_by_label = dict()
     incorrect_by_label = dict()
 
     docs = [d for d, g in data if len(d) > 0]
+    if el_pipe is not None:
         docs = el_pipe.pipe(docs)
     golds = [g for d, g in data if len(d) > 0]
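
With el_pipe now optional, the same helper serves both evaluation paths. A small usage sketch (variable names follow the hunks above; the call sites themselves are illustrative):

    # docs read back from file were already annotated by the saved pipeline,
    # so no entity linker has to be applied:
    dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data)

    # docs that still need entity linking can be run through the pipe first:
    dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe)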


@@ -655,23 +655,32 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
 def build_nel_encoder(in_width, hidden_width, end_width, **cfg):
     conv_depth = cfg.get("conv_depth", 2)
     cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
+    pretrained_vectors = cfg.get("pretrained_vectors")  # self.nlp.vocab.vectors.name
+    tok2vec = Tok2Vec(width=hidden_width, embed_size=in_width, pretrained_vectors=pretrained_vectors,
+                      cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, bilstm_depth=0)
 
     with Model.define_operators({">>": chain, "**": clone}):
-        convolution = Residual((ExtractWindow(nW=1) >>
-                                LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces))))
-
-        encoder = SpacyVectors \
-                  >> with_flatten(Affine(hidden_width, in_width))\
-                  >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \
-                  >> flatten_add_lengths \
-                  >> ParametricAttention(hidden_width) \
-                  >> Pooling(sum_pool) \
+        # convolution = Residual((ExtractWindow(nW=1) >>
+        #                         LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces))))
+
+        # encoder = SpacyVectors \
+        #           >> with_flatten(Affine(hidden_width, in_width)) \
+        #           >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \
+        #           >> flatten_add_lengths \
+        #           >> ParametricAttention(hidden_width) \
+        #           >> Pooling(sum_pool) \
+        #           >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
+        #           >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0))
+
+        encoder = tok2vec >> flatten_add_lengths >> Pooling(mean_pool)\
                   >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
                   >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0))
 
         # TODO: ReLu or LN(Maxout) ?
         # sum_pool or mean_pool ?
 
+        encoder.tok2vec = tok2vec
         encoder.nO = end_width
     return encoder
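
A rough usage sketch of the reworked encoder: the widths, the nlp object and the sample sentence are made-up values, and passing pretrained_vectors through **cfg mirrors the cfg.get() call above.

    # assumes an nlp object whose vocab has pretrained vectors loaded
    encoder = build_nel_encoder(in_width=300, hidden_width=128, end_width=64,
                                pretrained_vectors=nlp.vocab.vectors.name,
                                conv_depth=2, cnn_maxout_pieces=3)
    docs = [nlp("Douglas Adams wrote The Hitchhiker's Guide to the Galaxy.")]
    doc_vectors = encoder(docs)   # forward pass; one end_width-dim vector per doc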