Add a pipeline module, to collect and wrap processes for annotation

This commit is contained in:
Matthew Honnibal 2016-10-16 01:47:12 +02:00
parent 0317cea0ad
commit 509b30834f
3 changed files with 54 additions and 0 deletions

View File

@ -47,6 +47,7 @@ MOD_NAMES = [
'spacy.attrs', 'spacy.attrs',
'spacy.morphology', 'spacy.morphology',
'spacy.tagger', 'spacy.tagger',
'spacy.pipeline',
'spacy.syntax.stateclass', 'spacy.syntax.stateclass',
'spacy.syntax._state', 'spacy.syntax._state',
'spacy.tokenizer', 'spacy.tokenizer',

11
spacy/pipeline.pxd Normal file
View File

@ -0,0 +1,11 @@
from .syntax.parser cimport Parser
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
cdef class EntityRecognizer(Parser):
pass
cdef class DependencyParser(Parser):
pass

42
spacy/pipeline.pyx Normal file
View File

@ -0,0 +1,42 @@
from .syntax.parser cimport Parser
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
from .vocab cimport Vocab
from .tagger cimport Tagger
cdef class EntityRecognizer(Parser):
@classmethod
def load(cls, path, Vocab vocab):
return Parser.load(path, vocab, BiluoPushDown)
@classmethod
def blank(cls, Vocab vocab, **cfg):
if 'actions' not in cfg:
cfg['actions'] = {}
entity_types = cfg.get('entity_types', [''])
for action_type in (1, 2, 3, 4):
cfg['actions'][action_type] = {ent_type: True for ent_type in entity_types}
return Parser.blank(vocab, BiluoPushDown, **cfg)
cdef class DependencyParser(Parser):
@classmethod
def load(cls, path, Vocab vocab):
return Parser.load(path, vocab, ArcEager)
@classmethod
def blank(cls, Vocab vocab, **cfg):
if 'actions' not in cfg:
cfg['actions'] = {1: {'': True}, 2: {'': True}, 3: {}, 4: {},
5: {'ROOT': True}}
for label in cfg.get('left_labels', []):
cfg['actions'][3][label] = True
for label in cfg.get('right_labels', []):
cfg['actions'][4][label] = True
for label in cfg.get('break_labels', []):
cfg['actions'][5][label] = True
return Parser.blank(vocab, ArcEager, **cfg)
__all__ = [Tagger, DependencyParser, EntityRecognizer]