spaCy/spacy/en/__init__.py

95 lines
3.0 KiB
Python
Raw Normal View History

2014-12-21 20:25:43 +00:00
from __future__ import unicode_literals
from os import path
from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.parser import GreedyParser
from ..tokens import Tokens
from .pos import EnPosTagger
from .pos import POS_TAGS
2014-12-21 20:25:43 +00:00
from .attrs import get_flags
def get_lex_props(string):
    """Build the lexical-property dict for a single string.

    This function is handed to ``Vocab`` as its ``get_lex_props`` callback
    by ``English.__init__``.

    Args:
        string (unicode): The string to compute properties for.

    Returns:
        dict: ``'flags'`` holds the result of ``get_flags(string)``;
            ``'dense'`` is always ``1``.
    """
    flags = get_flags(string)
    # NOTE(review): the meaning of 'dense' is defined by Vocab, not here.
    return {'flags': flags, 'dense': 1}
class English(object):
    """The English NLP pipeline.

    Provides a tokenizer, lexicon, part-of-speech tagger and parser.

    Keyword args:
        data_dir (unicode): A path to a directory, from which to load the
            pipeline. If None, looks for a directory named "data/" in the
            same directory as the present file, i.e.
            path.join(path.dirname(__file__), 'data').

            The tagger and the parser are not loaded by the constructor;
            each is built the first time its attribute is accessed. The
            tagger is given data_dir itself, and the parser is given
            path.join(data_dir, 'deps').

            See Pipeline Directory Structure for details.

    Attributes:
        vocab (spacy.vocab.Vocab): The lexicon.

        strings (spacy.strings.StringStore): Encode/decode strings to/from
            integer IDs. This is the same object as vocab.strings.

        tokenizer (spacy.tokenizer.Tokenizer): The start of the pipeline.

        tagger (spacy.en.pos.EnPosTagger):
            The part-of-speech tagger, which also performs lemmatization and
            morphological analysis. Created lazily.

        parser (spacy.syntax.parser.GreedyParser):
            A greedy shift-reduce dependency parser. Created lazily.

        tags (list): The part-of-speech tag names reported by the tagger.
    """

    def __init__(self, data_dir=None):
        if data_dir is None:
            # Default to the "data" directory that sits next to this module.
            data_dir = path.join(path.dirname(__file__), 'data')
        # Remembered so the lazy tagger/parser properties can find their data.
        self._data_dir = data_dir
        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
        # Sort the tag names so the tokenizer always sees them in the same
        # order, independent of the mapping's iteration order.
        tag_names = sorted(POS_TAGS.keys())
        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir, POS_TAGS,
                                            tag_names)
        self.strings = self.vocab.strings
        # Both components are constructed on first use (see the properties).
        self._tagger = None
        self._parser = None

    @property
    def tagger(self):
        """The part-of-speech tagger, constructed on first access."""
        if self._tagger is None:
            self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
        return self._tagger

    @property
    def parser(self):
        """The dependency parser, constructed on first access.

        Built from the 'deps' sub-directory of the pipeline's data directory.
        """
        if self._parser is None:
            self._parser = GreedyParser(path.join(self._data_dir, 'deps'))
        return self._parser

    def __call__(self, text, tag=True, parse=False):
        """Apply the pipeline to some text.

        Args:
            text (unicode): The text to be processed.

        Keyword args:
            tag (bool): Whether to add part-of-speech tags to the text. This
                will also set morphological analysis and lemmas.

            parse (bool): Whether to add dependency-heads and labels to the
                text.

        Returns:
            tokens (spacy.tokens.Tokens): The object returned by the
                tokenizer. The tagger and parser are called on this same
                object and their return values are not used.
        """
        tokens = self.tokenizer.tokenize(text)
        if tag:
            self.tagger(tokens)
        if parse:
            self.parser.parse(tokens)
        return tokens

    @property
    def tags(self):
        """List of part-of-speech tag names.

        Read from the tagger, so accessing this constructs the tagger if it
        has not been created yet.
        """
        return self.tagger.tag_names