spaCy/spacy/pipeline.pyx

# coding: utf8
from __future__ import unicode_literals

from thinc.api import chain, layerize, with_getitem
from thinc.neural import Model, Softmax
import numpy

from .syntax.parser cimport Parser
#from .syntax.beam_parser cimport BeamParser
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
from .tagger import Tagger
from ._ml import build_tok2vec, flatten

# TODO: The disorganization here is pretty embarrassing. At least it's only
# internals.
from .syntax.parser import get_templates as get_feature_templates
from .attrs import DEP, ENT_TYPE


class TokenVectorEncoder(object):
    """Assign position-sensitive vectors to tokens, using a CNN or RNN.

    `self.model` is whatever `build_tok2vec` returns; its per-document output
    is stored on `doc.tensor`. `self.tagger` chains that model with `flatten`
    and a `Softmax` layer sized by `vocab.morphology.n_tags`, and is the model
    that `update()` trains.
    """
    def __init__(self, vocab, **cfg):
        """Build the encoder model and the tagging model that wraps it.

        vocab: Object providing `lang` and `morphology` (`n_tags` is read
            here, `tag_names` in `update`).
        **cfg: Passed through unchanged to `build_tok2vec`.
        """
        self.vocab = vocab
        # NOTE(review): the same constant (64) is passed to build_tok2vec and
        # to Softmax -- presumably the encoder's output width; confirm.
        self.model = build_tok2vec(vocab.lang, 64, **cfg)
        self.tagger = chain(
                        self.model,
                        flatten,
                        Softmax(self.vocab.morphology.n_tags, 64))

    def __call__(self, doc):
        """Run the encoder on a single document and store the result.

        doc: The document to process. It is wrapped in a one-element batch,
            and the first output is assigned to `doc.tensor`.
        """
        doc.tensor = self.model([doc])[0]

    def begin_update(self, docs, drop=0.):
        """Run the encoder's forward pass on a batch, for training.

        docs: Sequence of documents; `doc.tensor` is set on each one.
        drop: Dropout rate, forwarded to the model's `begin_update`.
        RETURNS: The `(tensors, bp_tensors)` pair returned by the model's
            `begin_update`.
        """
        tensors, bp_tensors = self.model.begin_update(docs, drop=drop)
        for i, doc in enumerate(docs):
            doc.tensor = tensors[i]
        return tensors, bp_tensors

    def update(self, docs, golds, drop=0., sgd=None):
        """Train the encoder through the tagging model on gold tags.

        The values handed to the backward pass are a copy of the predicted
        scores with 1.0 subtracted at each token's gold-tag column.

        docs: Batch of documents, passed to the tagger's `begin_update`.
        golds: One gold-standard object per document, each exposing `.tags`.
            Tags are looked up in `self.vocab.morphology.tag_names`; an
            unknown tag raises ValueError (from `.index`).
        drop: Dropout rate for the forward pass.
        sgd: Optimizer, passed to the backward-pass callback.
        """
        scores, finish_update = self.tagger.begin_update(docs, drop=drop)
        xp = self.tagger.ops.xp
        # CuPy exposes scatter_add(); NumPy spells the same operation add.at().
        scatter_add = getattr(xp, 'scatter_add', None)
        if scatter_add is None:
            scatter_add = xp.add.at
        # Use the vocab that sized the Softmax layer, looked up once.
        tag_names = self.vocab.morphology.tag_names
        losses = scores.copy()
        # Assumes `scores` has one row per token across the whole batch and
        # one column per tag -- `start`/`end` walk the rows document by
        # document.
        start = 0
        for gold in golds:
            tag_ids = [tag_names.index(tag) for tag in gold.tags]
            end = start + len(tag_ids)
            if tag_ids:
                rows = xp.arange(start, end, dtype='i')
                cols = xp.asarray(tag_ids, dtype='i')
                # Index with (row, column) pairs: a bare 1-D index array
                # would select whole rows rather than one cell per token.
                scatter_add(losses, (rows, cols), -1.0)
            start = end
        finish_update(losses, sgd)


cdef class EntityRecognizer(Parser):
    """
    Annotate named entities on Doc objects.
    """
    TransitionSystem = BiluoPushDown

    feature_templates = get_feature_templates('ner')

    def add_label(self, label):
        """
        Register an entity label with the parser and the vocab's serializer.

        label: The label to add. A string is mapped through
            `self.vocab.strings` before the serializer table is updated.
        """
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        # Set label into serializer. Super hacky :(
        for attr, freqs in self.vocab.serializer_freqs:
            if attr != ENT_TYPE:
                continue
            # Entries are [label, freq] pairs, so membership has to be tested
            # against the first element of each pair. Comparing the bare label
            # with the pairs never matches and appends a duplicate every call.
            known_labels = [entry[0] for entry in freqs]
            if label not in known_labels:
                freqs.append([label, 1])
        # Drop the cached serializer -- presumably so it is rebuilt from the
        # updated frequencies; confirm against Vocab.
        self.vocab._serializer = None

#
#cdef class BeamEntityRecognizer(BeamParser):
#    """
#    Annotate named entities on Doc objects.
#    """
#    TransitionSystem = BiluoPushDown
#
#    feature_templates = get_feature_templates('ner')
#
#    def add_label(self, label):
#        Parser.add_label(self, label)
#        if isinstance(label, basestring):
#            label = self.vocab.strings[label]
#        # Set label into serializer. Super hacky :(
#        for attr, freqs in self.vocab.serializer_freqs:
#            if attr == ENT_TYPE and label not in freqs:
#                freqs.append([label, 1])
#        self.vocab._serializer = None
#

cdef class DependencyParser(Parser):
    """
    Parser subclass using the ArcEager transition system and the 'basic'
    feature templates.
    """
    TransitionSystem = ArcEager

    feature_templates = get_feature_templates('basic')

    def add_label(self, label):
        """
        Register a dependency label with the parser and the vocab's serializer.

        label: The label to add. A string is mapped through
            `self.vocab.strings` before the serializer table is updated.
        """
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        for attr, freqs in self.vocab.serializer_freqs:
            if attr != DEP:
                continue
            # Entries are [label, freq] pairs, so membership has to be tested
            # against the first element of each pair. Comparing the bare label
            # with the pairs never matches and appends a duplicate every call.
            known_labels = [entry[0] for entry in freqs]
            if label not in known_labels:
                freqs.append([label, 1])
        # Super hacky :(
        # Drop the cached serializer -- presumably so it is rebuilt from the
        # updated frequencies; confirm against Vocab.
        self.vocab._serializer = None

#
#cdef class BeamDependencyParser(BeamParser):
#    TransitionSystem = ArcEager
#
#    feature_templates = get_feature_templates('basic')
#
#    def add_label(self, label):
#        Parser.add_label(self, label)
#        if isinstance(label, basestring):
#            label = self.vocab.strings[label]
#        for attr, freqs in self.vocab.serializer_freqs:
#            if attr == DEP and label not in freqs:
#                freqs.append([label, 1])
#        # Super hacky :(
#        self.vocab._serializer = None
#

#__all__ = [Tagger, DependencyParser, EntityRecognizer, BeamDependencyParser, BeamEntityRecognizer]
# __all__ must list names, not the objects themselves, or
# `from spacy.pipeline import *` raises TypeError. `__name__` is used instead
# of literals so the entries are native strings even with unicode_literals.
__all__ = [Tagger.__name__, DependencyParser.__name__,
           EntityRecognizer.__name__]
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00			`# coding: utf8`
			`from __future__ import unicode_literals`

Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`from thinc.api import chain, layerize, with_getitem`
			`from thinc.neural import Model, Softmax`
Tmp GPU code 2017-05-07 16:04:24 +00:00			`import numpy`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00
Add a pipeline module, to collect and wrap processes for annotation 2016-10-15 23:47:12 +00:00			`from .syntax.parser cimport Parser`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`#from .syntax.beam_parser cimport BeamParser`
Add a pipeline module, to collect and wrap processes for annotation 2016-10-15 23:47:12 +00:00			`from .syntax.ner cimport BiluoPushDown`
			`from .syntax.arc_eager cimport ArcEager`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 19:34:57 +00:00			`from .tagger import Tagger`
working residual net 2017-05-07 01:57:26 +00:00			`from ._ml import build_tok2vec, flatten`
Add a pipeline module, to collect and wrap processes for annotation 2016-10-15 23:47:12 +00:00
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 19:34:57 +00:00			`# TODO: The disorganization here is pretty embarrassing. At least it's only`
			`# internals.`
			`from .syntax.parser import get_templates as get_feature_templates`
Fix issue #514 -- serializer fails when new entity type has been added. The fix here is quite ugly. It's best to add the entities ASAP after loading the NLP pipeline, to mitigate the brittleness. 2016-10-23 15:45:44 +00:00			`from .attrs import DEP, ENT_TYPE`
Add a pipeline module, to collect and wrap processes for annotation 2016-10-15 23:47:12 +00:00

Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`class TokenVectorEncoder(object):`
			`'''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''`
			`def __init__(self, vocab, **cfg):`
			`self.vocab = vocab`
			`self.model = build_tok2vec(vocab.lang, 64, **cfg)`
			`self.tagger = chain(`
			`self.model,`
working residual net 2017-05-07 01:57:26 +00:00			`flatten,`
			`Softmax(self.vocab.morphology.n_tags, 64))`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00
			`def __call__(self, doc):`
			`doc.tensor = self.model([doc])[0]`

			`def begin_update(self, docs, drop=0.):`
			`tensors, bp_tensors = self.model.begin_update(docs, drop=drop)`
			`for i, doc in enumerate(docs):`
			`doc.tensor = tensors[i]`
			`return tensors, bp_tensors`

			`def update(self, docs, golds, drop=0., sgd=None):`
			`scores, finish_update = self.tagger.begin_update(docs, drop=drop)`
			`losses = scores.copy()`
			`idx = 0`
			`for i, gold in enumerate(golds):`
Tmp GPU code 2017-05-07 16:04:24 +00:00			`ids = numpy.zeros((len(gold),), dtype='i')`
			`start = idx`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`for j, tag in enumerate(gold.tags):`
Tmp GPU code 2017-05-07 16:04:24 +00:00			`ids[j] = docs[0].vocab.morphology.tag_names.index(tag)`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`idx += 1`
Tmp GPU code 2017-05-07 16:04:24 +00:00			`self.tagger.ops.xp.scatter_add(losses[start:idx], ids, -1.0)`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`finish_update(losses, sgd)`
Fix issue #514 -- serializer fails when new entity type has been added. The fix here is quite ugly. It's best to add the entities ASAP after loading the NLP pipeline, to mitigate the brittleness. 2016-10-23 15:45:44 +00:00
Add a pipeline module, to collect and wrap processes for annotation 2016-10-15 23:47:12 +00:00
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`cdef class EntityRecognizer(Parser):`
Use consistent formatting for docstrings 2017-04-15 09:59:21 +00:00			`"""`
			`Annotate named entities on Doc objects.`
			`"""`
Add beam-search classes 2017-03-15 14:27:41 +00:00			`TransitionSystem = BiluoPushDown`

			`feature_templates = get_feature_templates('ner')`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 10:05:47 +00:00
Add beam-search classes 2017-03-15 14:27:41 +00:00			`def add_label(self, label):`
WIP on add_label bug during NER training Currently when a new label is introduced to NER during training, it causes the labels to be read in in an unexpected order. This invalidates the model. 2017-04-14 21:52:17 +00:00			`Parser.add_label(self, label)`
Add beam-search classes 2017-03-15 14:27:41 +00:00			`if isinstance(label, basestring):`
			`label = self.vocab.strings[label]`
WIP on add_label bug during NER training Currently when a new label is introduced to NER during training, it causes the labels to be read in in an unexpected order. This invalidates the model. 2017-04-14 21:52:17 +00:00			`# Set label into serializer. Super hacky :(`
Add beam-search classes 2017-03-15 14:27:41 +00:00			`for attr, freqs in self.vocab.serializer_freqs:`
			`if attr == ENT_TYPE and label not in freqs:`
			`freqs.append([label, 1])`
			`self.vocab._serializer = None`

Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`#`
			`#cdef class BeamEntityRecognizer(BeamParser):`
			`# """`
			`# Annotate named entities on Doc objects.`
			`# """`
			`# TransitionSystem = BiluoPushDown`
			`#`
			`# feature_templates = get_feature_templates('ner')`
			`#`
			`# def add_label(self, label):`
			`# Parser.add_label(self, label)`
			`# if isinstance(label, basestring):`
			`# label = self.vocab.strings[label]`
			`# # Set label into serializer. Super hacky :(`
			`# for attr, freqs in self.vocab.serializer_freqs:`
			`# if attr == ENT_TYPE and label not in freqs:`
			`# freqs.append([label, 1])`
			`# self.vocab._serializer = None`
			`#`
Add beam-search classes 2017-03-15 14:27:41 +00:00
Switch back to greedy parser 2017-03-11 17:11:30 +00:00			`cdef class DependencyParser(Parser):`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 19:34:57 +00:00			`TransitionSystem = ArcEager`
Add a pipeline module, to collect and wrap processes for annotation 2016-10-15 23:47:12 +00:00
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 19:34:57 +00:00			`feature_templates = get_feature_templates('basic')`
Fix issue #514 -- serializer fails when new entity type has been added. The fix here is quite ugly. It's best to add the entities ASAP after loading the NLP pipeline, to mitigate the brittleness. 2016-10-23 15:45:44 +00:00
			`def add_label(self, label):`
WIP on add_label bug during NER training Currently when a new label is introduced to NER during training, it causes the labels to be read in in an unexpected order. This invalidates the model. 2017-04-14 21:52:17 +00:00			`Parser.add_label(self, label)`
Fix issue #514 -- serializer fails when new entity type has been added. The fix here is quite ugly. It's best to add the entities ASAP after loading the NLP pipeline, to mitigate the brittleness. 2016-10-23 15:45:44 +00:00			`if isinstance(label, basestring):`
			`label = self.vocab.strings[label]`
			`for attr, freqs in self.vocab.serializer_freqs:`
			`if attr == DEP and label not in freqs:`
			`freqs.append([label, 1])`
			`# Super hacky :(`
			`self.vocab._serializer = None`

Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`#`
			`#cdef class BeamDependencyParser(BeamParser):`
			`# TransitionSystem = ArcEager`
			`#`
			`# feature_templates = get_feature_templates('basic')`
			`#`
			`# def add_label(self, label):`
			`# Parser.add_label(self, label)`
			`# if isinstance(label, basestring):`
			`# label = self.vocab.strings[label]`
			`# for attr, freqs in self.vocab.serializer_freqs:`
			`# if attr == DEP and label not in freqs:`
			`# freqs.append([label, 1])`
			`# # Super hacky :(`
			`# self.vocab._serializer = None`
			`#`

			`#__all__ = [Tagger, DependencyParser, EntityRecognizer, BeamDependencyParser, BeamEntityRecognizer]`
			`__all__ = [Tagger, DependencyParser, EntityRecognizer]`