From 509b30834f82b5e6cb2b9863750859c1263064e7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 16 Oct 2016 01:47:12 +0200
Subject: [PATCH] Add a pipeline module, to collect and wrap processes for annotation

---
 setup.py           |  1 +
 spacy/pipeline.pxd | 11 ++++++++
 spacy/pipeline.pyx | 63 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)
 create mode 100644 spacy/pipeline.pxd
 create mode 100644 spacy/pipeline.pyx

diff --git a/setup.py b/setup.py
index 27137d82e..da600f235 100644
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,7 @@ MOD_NAMES = [
     'spacy.attrs',
     'spacy.morphology',
     'spacy.tagger',
+    'spacy.pipeline',
     'spacy.syntax.stateclass',
     'spacy.syntax._state',
     'spacy.tokenizer',
diff --git a/spacy/pipeline.pxd b/spacy/pipeline.pxd
new file mode 100644
index 000000000..4323cdb44
--- /dev/null
+++ b/spacy/pipeline.pxd
@@ -0,0 +1,11 @@
+from .syntax.parser cimport Parser
+from .syntax.ner cimport BiluoPushDown
+from .syntax.arc_eager cimport ArcEager
+
+
+cdef class EntityRecognizer(Parser):
+    pass
+
+
+cdef class DependencyParser(Parser):
+    pass
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
new file mode 100644
index 000000000..53efce020
--- /dev/null
+++ b/spacy/pipeline.pyx
@@ -0,0 +1,63 @@
+from .syntax.parser cimport Parser
+from .syntax.ner cimport BiluoPushDown
+from .syntax.arc_eager cimport ArcEager
+from .vocab cimport Vocab
+# Plain import rather than cimport: Tagger is only re-exported from this
+# module, and a name listed in __all__ has to be a module attribute.
+from .tagger import Tagger
+
+
+cdef class EntityRecognizer(Parser):
+    """Parser subclass whose constructors use BiluoPushDown."""
+
+    @classmethod
+    def load(cls, path, Vocab vocab):
+        """Return Parser.load(path, vocab, BiluoPushDown)."""
+        return Parser.load(path, vocab, BiluoPushDown)
+
+    @classmethod
+    def blank(cls, Vocab vocab, **cfg):
+        """Return Parser.blank(vocab, BiluoPushDown, **cfg).
+
+        If cfg has no 'actions' entry, one is built: action types 1-4 each
+        map every name in cfg['entity_types'] (default ['']) to True.
+        """
+        if 'actions' not in cfg:
+            cfg['actions'] = {}
+            entity_types = cfg.get('entity_types', [''])
+            for action_type in (1, 2, 3, 4):
+                cfg['actions'][action_type] = {ent_type: True for ent_type in entity_types}
+        return Parser.blank(vocab, BiluoPushDown, **cfg)
+
+
+cdef class DependencyParser(Parser):
+    """Parser subclass whose constructors use ArcEager."""
+
+    @classmethod
+    def load(cls, path, Vocab vocab):
+        """Return Parser.load(path, vocab, ArcEager)."""
+        return Parser.load(path, vocab, ArcEager)
+
+    @classmethod
+    def blank(cls, Vocab vocab, **cfg):
+        """Return Parser.blank(vocab, ArcEager, **cfg).
+
+        If cfg has no 'actions' entry, one is built from fixed defaults plus
+        cfg['left_labels'] (action 3), cfg['right_labels'] (action 4) and
+        cfg['break_labels'] (action 5).
+        """
+        if 'actions' not in cfg:
+            cfg['actions'] = {1: {'': True}, 2: {'': True}, 3: {}, 4: {},
+                              5: {'ROOT': True}}
+            for label in cfg.get('left_labels', []):
+                cfg['actions'][3][label] = True
+            for label in cfg.get('right_labels', []):
+                cfg['actions'][4][label] = True
+            for label in cfg.get('break_labels', []):
+                cfg['actions'][5][label] = True
+        return Parser.blank(vocab, ArcEager, **cfg)
+
+
+# __all__ must list names as strings; putting the class objects in it makes
+# `from spacy.pipeline import *` raise TypeError.
+__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer']