mirror of https://github.com/explosion/spaCy.git

commit 09ed446b20: different architecture / settings
parent 4142e8dd1b
@@ -4,18 +4,17 @@ from __future__ import unicode_literals
 import os
 import datetime
 from os import listdir
-import numpy as np
 from random import shuffle
 
 from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator
 
 from spacy._ml import SpacyVectors, create_default_optimizer, zero_init
 
-from thinc.api import chain, flatten_add_lengths, with_getitem, clone, with_flatten
-from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
+from thinc.api import chain, flatten_add_lengths, with_getitem, clone
+from thinc.v2v import Model, Softmax, Maxout, Affine, ReLu
 from thinc.t2v import Pooling, sum_pool, mean_pool
-from thinc.t2t import ExtractWindow, ParametricAttention
-from thinc.misc import Residual, LayerNorm as LN
+from thinc.t2t import ParametricAttention
+from thinc.misc import Residual
 
 from spacy.tokens import Doc
 
@@ -35,18 +34,20 @@ class EL_Model():
         self.entity_encoder = self._simple_encoder(in_width=300, out_width=96)
         self.article_encoder = self._simple_encoder(in_width=300, out_width=96)
 
-    def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True):
+    def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True):
         Doc.set_extension("entity_id", default=None)
 
         train_instances, train_pos, train_neg, train_doc = self._get_training_data(training_dir,
                                                                                    entity_descr_output,
                                                                                    False,
-                                                                                   limit, to_print)
+                                                                                   trainlimit,
+                                                                                   to_print)
 
         dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir,
                                                                            entity_descr_output,
                                                                            True,
-                                                                           limit / 10, to_print)
+                                                                           devlimit,
+                                                                           to_print)
 
         if to_print:
             print("Training on", len(train_instances.values()), "articles")
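The signature change above decouples the two dataset sizes: the old API derived the dev-set size from the training limit as limit / 10, which is inflexible and, on Python 3, produces a float (500 / 10 == 50.0) where a count is expected. A minimal before/after sketch, using the same call that appears in the driver script further down:

    # Old call: one knob, dev size derived implicitly as limit / 10.
    trainer.train_model(training_dir=TRAINING_DIR,
                        entity_descr_output=ENTITY_DESCR,
                        limit=500)          # dev got 500 / 10 == 50.0, a float

    # New call: each split sized explicitly, as in STEP 6 below.
    trainer.train_model(training_dir=TRAINING_DIR,
                        entity_descr_output=ENTITY_DESCR,
                        trainlimit=50,
                        devlimit=50)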
@@ -78,7 +79,6 @@ class EL_Model():
         if to_print:
             print("Trained on", instance_count, "instance clusters")
 
-
     def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc):
         predictions = list()
         golds = list()
@@ -129,19 +129,19 @@ class EL_Model():
         conv_depth = 1
         cnn_maxout_pieces = 3
         with Model.define_operators({">>": chain, "**": clone}):
-            # encoder = SpacyVectors \
-            # >> flatten_add_lengths \
-            # >> ParametricAttention(in_width)\
-            # >> Pooling(mean_pool) \
-            # >> Residual(zero_init(Maxout(in_width, in_width))) \
-            # >> zero_init(Affine(out_width, in_width, drop_factor=0.0))
             encoder = SpacyVectors \
                       >> flatten_add_lengths \
-                      >> with_getitem(0, Affine(in_width, in_width)) \
-                      >> ParametricAttention(in_width) \
-                      >> Pooling(sum_pool) \
-                      >> Residual(ReLu(in_width, in_width)) ** conv_depth \
+                      >> ParametricAttention(in_width)\
+                      >> Pooling(mean_pool) \
+                      >> Residual(zero_init(Maxout(in_width, in_width))) \
                       >> zero_init(Affine(out_width, in_width, drop_factor=0.0))
+            # encoder = SpacyVectors \
+            # >> flatten_add_lengths \
+            # >> with_getitem(0, Affine(in_width, in_width)) \
+            # >> ParametricAttention(in_width) \
+            # >> Pooling(sum_pool) \
+            # >> Residual(ReLu(in_width, in_width)) ** conv_depth \
+            # >> zero_init(Affine(out_width, in_width, drop_factor=0.0))
 
             # >> zero_init(Affine(nr_class, width, drop_factor=0.0))
             # >> logistic
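This hunk is the "different architecture" of the commit message: the old encoder projected the flattened tokens with with_getitem(0, Affine(...)), sum-pooled them, and stacked conv_depth ReLu residual blocks; the new one drops the projection, lets ParametricAttention weight the tokens before a mean pool, and uses a single zero-initialized Maxout residual. A self-contained sketch of the new wiring, assuming the same thinc (v7-era) combinators imported at the top of the file, with widths mirroring the 300-dim vectors and 96-dim output passed to _simple_encoder (the explanatory comments are mine, not from the source):

    from thinc.api import chain, clone, flatten_add_lengths
    from thinc.v2v import Model, Maxout, Affine
    from thinc.t2v import Pooling, mean_pool
    from thinc.t2t import ParametricAttention
    from thinc.misc import Residual
    from spacy._ml import SpacyVectors, zero_init

    in_width, out_width = 300, 96

    with Model.define_operators({">>": chain, "**": clone}):
        encoder = SpacyVectors \
                  >> flatten_add_lengths \
                  >> ParametricAttention(in_width) \
                  >> Pooling(mean_pool) \
                  >> Residual(zero_init(Maxout(in_width, in_width))) \
                  >> zero_init(Affine(out_width, in_width, drop_factor=0.0))

    # SpacyVectors:        docs -> per-token pretrained vectors
    # flatten_add_lengths: ragged batch -> (flat token array, doc lengths)
    # ParametricAttention: learns a relevance weight per token
    # Pooling(mean_pool):  weighted tokens -> one fixed-size doc vector
    # Residual(Maxout):    one nonlinear refinement; zero_init makes the
    #                      block start out as the identity
    # Affine:              projects the 300-dim vector to the 96-dim output

Swapping sum_pool for mean_pool makes the document encoding insensitive to article length, which plausibly matters here since articles and short entity descriptions are encoded into the same space.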
@@ -178,7 +178,6 @@ class EL_Model():
             # print("encoding dim", len(true_entity_encoding[0]))
 
             consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding)
-            # consensus_encoding_t = consensus_encoding.transpose()
 
             doc_mse, doc_diff = self._calculate_similarity(doc_encoding, consensus_encoding)
 
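The bodies of _calculate_consensus and _calculate_similarity are not part of this diff, so the following is only a hypothetical numpy sketch of one plausible reading of the call sites above: a consensus vector is formed from the document and entity encodings, and similarity comes back as a scalar error plus the raw difference, matching the doc_mse, doc_diff unpacking:

    import numpy as np

    def calculate_consensus(doc_encoding, entity_encoding):
        # Assumed: the consensus is the element-wise mean of both encodings.
        return (doc_encoding + entity_encoding) / 2.0

    def calculate_similarity(vector, consensus):
        # Assumed: returns (scalar MSE, per-dimension difference),
        # matching the `doc_mse, doc_diff` unpacking at the call site.
        diff = vector - consensus
        return float(np.mean(diff ** 2)), diff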
@@ -111,7 +111,7 @@ if __name__ == "__main__":
     print("STEP 6: training ", datetime.datetime.now())
     my_nlp = spacy.load('en_core_web_md')
     trainer = EL_Model(kb=my_kb, nlp=my_nlp)
-    trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=500)
+    trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=50)
     print()
 
     # STEP 7: apply the EL algorithm on the dev dataset