# coding: utf8
from __future__ import unicode_literals

from thinc.api import chain, layerize, with_getitem
from thinc.neural import Model, Softmax
import numpy

from .syntax.parser cimport Parser
#from .syntax.beam_parser cimport BeamParser
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
from .tagger import Tagger
from ._ml import build_tok2vec, flatten

# TODO: The disorganization here is pretty embarrassing. At least it's only
# internals.
from .syntax.parser import get_templates as get_feature_templates
from .attrs import DEP, ENT_TYPE


class TokenVectorEncoder(object):
    '''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
    def __init__(self, vocab, **cfg):
        self.vocab = vocab
        self.model = build_tok2vec(vocab.lang, 64, **cfg)
        self.tagger = chain(
            self.model,
            flatten,
            Softmax(self.vocab.morphology.n_tags, 64))

    def __call__(self, doc):
        doc.tensor = self.model([doc])[0]

    def begin_update(self, docs, drop=0.):
        tensors, bp_tensors = self.model.begin_update(docs, drop=drop)
        for i, doc in enumerate(docs):
            doc.tensor = tensors[i]
        return tensors, bp_tensors

    def update(self, docs, golds, drop=0., sgd=None):
        scores, finish_update = self.tagger.begin_update(docs, drop=drop)
        # Gradient of softmax cross-entropy: prediction minus one-hot truth.
        losses = scores.copy()
        idx = 0
        for i, gold in enumerate(golds):
            ids = numpy.zeros((len(gold),), dtype='i')
            start = idx
            for j, tag in enumerate(gold.tags):
                ids[j] = self.vocab.morphology.tag_names.index(tag)
                idx += 1
            # Subtract 1 from the score assigned to each token's gold tag.
            # (Assumes the scores live in a CPU/numpy array.)
            losses[numpy.arange(start, idx), ids] -= 1.0
        finish_update(losses, sgd)
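
# Illustrative usage sketch for TokenVectorEncoder (comment only, not executed).
# `vocab`, `doc`, `docs`, `golds` and `optimizer` are assumed to be supplied by
# the surrounding training code; none of them are defined in this module.
#
#     encoder = TokenVectorEncoder(vocab)
#     encoder.update(docs, golds, sgd=optimizer)  # train the auxiliary tagger
#     encoder(doc)                                # predict and set doc.tensor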


cdef class EntityRecognizer(Parser):
    """
    Annotate named entities on Doc objects.
    """
    TransitionSystem = BiluoPushDown

    feature_templates = get_feature_templates('ner')

    def add_label(self, label):
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        # Set label into serializer. Super hacky :(
        for attr, freqs in self.vocab.serializer_freqs:
            if attr == ENT_TYPE and label not in freqs:
                freqs.append([label, 1])
        self.vocab._serializer = None
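
# Illustrative sketch (comment only): registering a new entity type before
# training. Assumes a loaded pipeline `nlp` that exposes this component as
# `nlp.entity`; neither name is defined in this module.
#
#     nlp.entity.add_label('EVENT')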


#
#cdef class BeamEntityRecognizer(BeamParser):
#    """
#    Annotate named entities on Doc objects.
#    """
#    TransitionSystem = BiluoPushDown
#
#    feature_templates = get_feature_templates('ner')
#
#    def add_label(self, label):
#        Parser.add_label(self, label)
#        if isinstance(label, basestring):
#            label = self.vocab.strings[label]
#        # Set label into serializer. Super hacky :(
#        for attr, freqs in self.vocab.serializer_freqs:
#            if attr == ENT_TYPE and label not in freqs:
#                freqs.append([label, 1])
#        self.vocab._serializer = None
#


cdef class DependencyParser(Parser):
    TransitionSystem = ArcEager

    feature_templates = get_feature_templates('basic')

    def add_label(self, label):
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        for attr, freqs in self.vocab.serializer_freqs:
            if attr == DEP and label not in freqs:
                freqs.append([label, 1])
        # Super hacky :(
        self.vocab._serializer = None
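
# Illustrative sketch (comment only): registering a custom dependency label,
# assuming a loaded pipeline `nlp` that exposes this component as `nlp.parser`.
# Besides updating the transition system, add_label records the new label in
# vocab.serializer_freqs so the serializer can encode it.
#
#     nlp.parser.add_label('my_custom_relation')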


#
#cdef class BeamDependencyParser(BeamParser):
#    TransitionSystem = ArcEager
#
#    feature_templates = get_feature_templates('basic')
#
#    def add_label(self, label):
#        Parser.add_label(self, label)
#        if isinstance(label, basestring):
#            label = self.vocab.strings[label]
#        for attr, freqs in self.vocab.serializer_freqs:
#            if attr == DEP and label not in freqs:
#                freqs.append([label, 1])
#        # Super hacky :(
#        self.vocab._serializer = None
#


#__all__ = [Tagger, DependencyParser, EntityRecognizer, BeamDependencyParser, BeamEntityRecognizer]
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer']