# cython: infer_types=True
# cython: profile=True
# coding: utf8
from __future__ import unicode_literals

import numpy
cimport numpy as np

from thinc.api import add, layerize, chain, clone, concatenate, with_getitem
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.resnet import Residual
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural.util import to_categorical

from .tokens.doc cimport Doc
from .syntax.parser cimport Parser
from .syntax.parser import get_templates as get_feature_templates
from .syntax.beam_parser cimport BeamParser
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
from .tagger import Tagger
from .gold cimport GoldParse
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP
from ._ml import flatten, get_col, doc2feats


class TokenVectorEncoder(object):
    '''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
    def __init__(self, vocab, token_vector_width, **cfg):
        self.vocab = vocab
        self.doc2feats = doc2feats()
        self.model = self.build_model(vocab.lang, token_vector_width, **cfg)
        # Softmax over the tag set, stacked on top of the token-to-vector model.
        self.tagger = chain(
            self.model,
            Softmax(self.vocab.morphology.n_tags, token_vector_width))

    def build_model(self, lang, width, embed_size=5000, **cfg):
        '''Build the tok2vec network: hashed embeddings of several lexical
        attributes, mixed by a Maxout layer and refined by a stack of
        residual convolutional (window) layers.'''
        cols = self.doc2feats.cols
        with Model.define_operators({'>>': chain, '|': concatenate,
                                     '**': clone, '+': add}):
            lower = get_col(cols.index(LOWER)) >> (HashEmbed(width, embed_size)
                                                   + HashEmbed(width, embed_size))
            prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size // 2)
            suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size // 2)
            shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size // 2)

            tok2vec = (
                flatten
                >> (lower | prefix | suffix | shape)
                >> Maxout(width, pieces=3)
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width * 3))
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width * 3))
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width * 3))
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width * 3))
            )
        return tok2vec

    def pipe(self, docs):
        docs = list(docs)
        self.predict_tags(docs)
        for doc in docs:
            yield doc

    def __call__(self, doc):
        self.predict_tags([doc])

    def begin_update(self, feats, drop=0.):
        tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
        return tokvecs, bp_tokvecs

    def predict_tags(self, docs, drop=0.):
        '''Predict a tag for each token and assign it to the Doc in place.'''
        cdef Doc doc
        feats = self.doc2feats(docs)
        scores, _ = self.tagger.begin_update(feats, drop=drop)
        guesses = scores.argmax(axis=1)
        if not isinstance(guesses, numpy.ndarray):
            # Scores may live on GPU; copy the argmax result back to the host.
            guesses = guesses.get()
        idx = 0
        for i, doc in enumerate(docs):
            tag_ids = guesses[idx:idx + len(doc)]
            for j, tag_id in enumerate(tag_ids):
                doc.vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
                idx += 1

    def update(self, docs_feats, golds, drop=0., sgd=None):
        '''Update the tagger from gold tags, backpropagating into the tok2vec model.'''
        cdef int idx
        cdef GoldParse gold
        docs, feats = docs_feats
        scores, finish_update = self.tagger.begin_update(feats, drop=drop)

        tag_index = {tag: i for i, tag
                     in enumerate(docs[0].vocab.morphology.tag_names)}

        # Build the vector of correct tag IDs, one per token across the batch.
        idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype='i')
        for gold in golds:
            for tag in gold.tags:
                correct[idx] = tag_index[tag]
                idx += 1
        correct = self.model.ops.xp.array(correct)
        # Gradient of categorical cross-entropy for softmax outputs.
        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
        finish_update(d_scores, sgd)
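
# --- Hedged usage sketch (illustration only, not part of this module's API) ---
# Shows how a TokenVectorEncoder might be driven to tag a batch of Doc objects.
# The helper name, the `vocab` argument and the default width are assumptions
# made for this example; only TokenVectorEncoder(vocab, width) and
# predict_tags(docs) come from the class above.
def _example_predict_tags(vocab, docs, width=128):
    # Build the encoder + softmax tagger for this vocab, then assign tag IDs
    # to every token in place.
    encoder = TokenVectorEncoder(vocab, width)
    encoder.predict_tags(docs)
    return docs
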
cdef class EntityRecognizer(Parser):
    """
    Annotate named entities on Doc objects.
    """
    TransitionSystem = BiluoPushDown

    feature_templates = get_feature_templates('ner')

    def add_label(self, label):
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]


cdef class BeamEntityRecognizer(BeamParser):
    """
    Annotate named entities on Doc objects, using beam search.
    """
    TransitionSystem = BiluoPushDown

    feature_templates = get_feature_templates('ner')

    def add_label(self, label):
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]


cdef class DependencyParser(Parser):
    """
    Annotate syntactic dependencies on Doc objects.
    """
    TransitionSystem = ArcEager

    feature_templates = get_feature_templates('basic')

    def add_label(self, label):
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]


cdef class BeamDependencyParser(BeamParser):
    """
    Annotate syntactic dependencies on Doc objects, using beam search.
    """
    TransitionSystem = ArcEager

    feature_templates = get_feature_templates('basic')

    def add_label(self, label):
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]


__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer',
           'BeamDependencyParser', 'BeamEntityRecognizer',
           'TokenVectorEncoder']
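
# --- Hedged usage sketch (illustration only, not part of this module's API) ---
# Shows how one of the exported components might be wired up.  The call
# EntityRecognizer(vocab) assumes the Parser base class can be constructed from
# a Vocab alone, which is an assumption for this example; add_label() and the
# call-on-a-Doc invocation follow the class definitions above.
def _example_add_entity_label(vocab, doc, label=u'EVENT'):
    ner = EntityRecognizer(vocab)   # assumed constructor signature
    ner.add_label(label)            # register the new entity label
    ner(doc)                        # annotate entities on the Doc in place
    return doc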