mirror of https://github.com/explosion/spaCy.git
191 lines
5.3 KiB
Python
191 lines
5.3 KiB
Python
|
from __future__ import print_function
|
||
|
from __future__ import unicode_literals
|
||
|
from __future__ import division
|
||
|
|
||
|
import pathlib
|
||
|
import plac
|
||
|
import random
|
||
|
from collections import Counter
|
||
|
import numpy as np
|
||
|
|
||
|
from collections import defaultdict
|
||
|
from itertools import count
|
||
|
|
||
|
import dynet
|
||
|
from dynet import cg
|
||
|
|
||
|
|
||
|
class Vocab(object):
    """Two-way mapping between tokens and integer ids.

    ``w2i`` maps token -> id and ``i2w`` maps id -> token. Both are plain
    dicts, so looking up an unknown token raises ``KeyError`` instead of
    silently allocating a new id.
    """

    def __init__(self, w2i=None):
        """Build a vocabulary from an existing token -> id mapping.

        w2i: Optional mapping of token to integer id. It is copied, so later
            changes to the argument do not affect this vocabulary. When
            omitted, the vocabulary starts empty.
        """
        self.w2i = dict(w2i) if w2i is not None else {}
        # .items() (rather than .iteritems()) works on Python 2 and 3 alike.
        self.i2w = {i: w for w, i in self.w2i.items()}

    @classmethod
    def from_corpus(cls, corpus):
        """Create a vocabulary by numbering tokens in first-seen order.

        corpus: Iterable of sentences, each an iterable of hashable tokens.
        Returns a new instance of ``cls`` whose ids start at 0 and are
        contiguous.
        """
        w2i = {}
        for sent in corpus:
            for word in sent:
                # len(w2i) is the next free id; setdefault leaves the id of
                # an already-seen token untouched.
                w2i.setdefault(word, len(w2i))
        return cls(w2i)

    def size(self):
        """Return the number of distinct tokens in the vocabulary."""
        return len(self.w2i)
|
||
|
|
||
|
|
||
|
def read_data(path):
    """Yield sentences from a column-formatted, one-token-per-line file.

    Every non-blank line is split on whitespace; the second column is taken
    as the word and the fourth column as its POS tag. One or more blank
    lines separate sentences.

    path: Object with an ``open()`` method that returns an iterable of text
        lines usable as a context manager (e.g. a ``pathlib.Path``).
    Yields one list of ``(word, pos)`` tuples per sentence.
    Raises IndexError if a non-blank line has fewer than four columns.
    """
    with path.open() as file_:
        sent = []
        for line in file_:
            pieces = line.split()
            if pieces:
                sent.append((pieces[1], pieces[3]))
            elif sent:
                # Blank line closes the current sentence.
                yield sent
                sent = []
        # A file without a trailing blank line still has a pending sentence;
        # emit it instead of dropping it.
        if sent:
            yield sent
|
||
|
|
||
|
|
||
|
def get_vocab(train, test):
    """Collect words and tags from the data and build their vocabularies.

    train: Iterable of sentences, each an iterable of ``(word, tag)`` pairs.
        Contributes words, tags and word counts.
    test: Iterable of sentences in the same format. Only the words are
        used; they are added to the word list but not counted.
    Returns a tuple ``(words, tags, wc, vw, vt)``:
        words: all training words, then ``"_UNK_"``, then all test words.
        tags: all training tags followed by ``"_START_"``.
        wc: ``Counter`` of training word frequencies.
        vw: ``Vocab`` built from ``words``.
        vt: ``Vocab`` built from ``tags``.
    """
    words = []
    tags = []
    wc = Counter()
    for sent in train:
        for word, tag in sent:
            words.append(word)
            tags.append(tag)
            wc[word] += 1
    # Special symbols are appended after the training items, so their ids
    # follow every training word / tag seen so far.
    words.append("_UNK_")
    tags.append("_START_")

    for sent in test:
        words.extend(word for word, _ in sent)

    vw = Vocab.from_corpus([words])
    vt = Vocab.from_corpus([tags])
    return words, tags, wc, vw, vt
|
||
|
|
||
|
|
||
|
class BiTagger(object):
    """Bidirectional LSTM tagger built on DyNet.

    Each token is embedded, run through a forward and a backward LSTM, and
    the two hidden states are concatenated and passed through a tanh layer
    and an output layer that scores every tag.

    Tokens handed to this class must expose a ``rank`` attribute (used as
    the row index into the word embedding table); ``__call__`` writes the
    predicted tag id to each token's ``tag`` attribute.
    """

    def __init__(self, nwords, ntags):
        """Allocate the model parameters.

        nwords: Number of rows in the word embedding table.
        ntags: Number of output tags.
        """
        self.nwords = nwords
        self.ntags = ntags

        self._model = dynet.Model()
        self._sgd = dynet.SimpleSGDTrainer(self._model)

        self._E = self._model.add_lookup_parameters((self.nwords, 128))
        # Tag embedding table; allocated here but not read by any method of
        # this class.
        self._p_t1 = self._model.add_lookup_parameters((self.ntags, 30))

        # Hidden layer takes the concatenated forward+backward LSTM outputs
        # (50 units each).
        self._pH = self._model.add_parameters((32, 50*2))
        self._pO = self._model.add_parameters((self.ntags, 32))

        self._fwd_lstm = dynet.LSTMBuilder(1, 128, 50, self._model)
        self._bwd_lstm = dynet.LSTMBuilder(1, 128, 50, self._model)

    def _tag_scores(self, doc, noise_std=None):
        """Start a fresh computation graph and score every token of ``doc``.

        doc: Sequence of tokens with a ``rank`` attribute.
        noise_std: When given, ``dynet.noise`` with this standard deviation
            is applied to the word embeddings (used during training).
        Returns a list with one unnormalised tag-score expression per token.
        """
        dynet.renew_cg()

        wembs = [self._E[word.rank] for word in doc]
        if noise_std is not None:
            wembs = [dynet.noise(we, noise_std) for we in wembs]

        f_state = self._fwd_lstm.initial_state()
        b_state = self._bwd_lstm.initial_state()

        fw = [x.output() for x in f_state.add_inputs(wembs)]
        bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]

        H = dynet.parameter(self._pH)
        O = dynet.parameter(self._pO)

        # bw was produced right-to-left; reverse it so position i of both
        # lists refers to the same token.
        return [O * (dynet.tanh(H * dynet.concatenate([f, b])))
                for f, b in zip(fw, reversed(bw))]

    def __call__(self, doc):
        """Predict a tag for every token, storing its id in ``token.tag``."""
        for i, r_t in enumerate(self._tag_scores(doc)):
            out = dynet.softmax(r_t)
            doc[i].tag = np.argmax(out.npvalue())

    def update(self, doc, gold):
        """Run one training step on a single sentence.

        doc: Sequence of tokens with a ``rank`` attribute.
        gold: Sequence of gold tag ids aligned with ``doc``.
        Returns the summed negative log-likelihood of the gold tags as a
        float (0.0 when there is nothing to train on).
        """
        errs = [dynet.pickneglogsoftmax(r_t, t)
                for r_t, t in zip(self._tag_scores(doc, noise_std=0.1), gold)]
        if not errs:
            # Nothing to sum or back-propagate for an empty sentence.
            return 0.0

        sum_errs = dynet.esum(errs)
        loss = sum_errs.scalar_value()
        sum_errs.backward()
        self._sgd.update()
        return loss
|
||
|
|
||
|
|
||
|
class _Token(object):
    """Minimal token holding the attributes BiTagger reads and writes."""

    __slots__ = ("rank", "tag")

    def __init__(self, rank, tag=None):
        self.rank = rank
        self.tag = tag


def main(train_loc, dev_loc, model_dir):
    """Train a BiTagger and periodically print loss and dev accuracy.

    train_loc: Path of the training file, in the format read_data expects.
    dev_loc: Path of the evaluation file, in the same format.
    model_dir: Accepted as a command-line argument; not used by this
        function.
    """
    train_loc = pathlib.Path(train_loc)
    dev_loc = pathlib.Path(dev_loc)

    train = list(read_data(train_loc))
    test = list(read_data(dev_loc))

    _, _, _, vw, vt = get_vocab(train, test)
    UNK = vw.w2i["_UNK_"]
    tagger = BiTagger(vw.size(), vt.size())

    tagged = loss = 0
    for ITER in range(50):
        random.shuffle(train)
        for i, s in enumerate(train, 1):
            if i % 5000 == 0:
                # Report on the trainer that actually performs the updates.
                tagger._sgd.status()
                if tagged:
                    print(loss / tagged)
                loss = 0
                tagged = 0
            if i % 10000 == 0:
                good = bad = 0.0
                for sent in test:
                    doc = [_Token(vw.w2i.get(w, UNK)) for w, t in sent]
                    tagger(doc)
                    # Tags unseen in training map to None and therefore
                    # never match a prediction.
                    golds = [vt.w2i.get(t) for w, t in sent]
                    for go, token in zip(golds, doc):
                        if go == token.tag:
                            good += 1
                        else:
                            bad += 1
                if good + bad:
                    print(good / (good+bad))
            doc = [_Token(vw.w2i.get(w, UNK)) for w, p in s]
            ps = [vt.w2i[p] for w, p in s]
            sent_loss = tagger.update(doc, ps)
            if sent_loss is not None:
                loss += sent_loss
            tagged += len(ps)
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # plac builds the command-line interface from main's signature and
    # passes the parsed arguments to it.
    plac.call(main)
|