spaCy/spacy/en/__init__.py

99 lines
3.2 KiB
Python
Raw Normal View History

2014-12-21 20:25:43 +00:00
from __future__ import unicode_literals
from os import path
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.parser import GreedyParser
from ..tokens import Tokens
from .pos import EnPosTagger
from .pos import POS_TAGS
2014-12-21 20:25:43 +00:00
from .attrs import get_flags
DATA_DIR = path.join(path.dirname(__file__), 'data')
2014-12-21 20:25:43 +00:00
def get_lex_props(string):
    """Build the lexical property dict for a single word string.

    Passed to Vocab as its ``get_lex_props`` callback.

    Args:
        string (unicode): The word form to compute properties for.

    Returns:
        dict: ``'flags'`` holds the result of ``get_flags(string)`` and
            ``'dense'`` is always 1.
    """
    flags = get_flags(string)
    return {'flags': flags, 'dense': 1}
class English(object):
    """The English NLP pipeline.

    Provides a tokenizer, lexicon, part-of-speech tagger and parser.

    Keyword args:
        data_dir (unicode): A path to a directory, from which to load the
            pipeline. If None, looks for a directory named "data/" in the
            same directory as the present file, i.e.
            path.join(path.dirname(__file__), 'data').

            The vocabulary is loaded from path.join(data_dir, 'vocab') and
            the tokenizer from path.join(data_dir, 'tokenizer') when the
            pipeline is constructed. The tagger (which is handed data_dir
            itself) and the parser (which is handed
            path.join(data_dir, 'deps')) are only constructed the first time
            the corresponding attribute is accessed.

            See Pipeline Directory Structure for details.

    Attributes:
        vocab (spacy.vocab.Vocab): The lexicon.

        strings (spacy.strings.StringStore): Encode/decode strings to/from
            integer IDs. This is the same object as vocab.strings.

        tokenizer (spacy.tokenizer.Tokenizer): The start of the pipeline.

        tagger (spacy.en.pos.EnPosTagger):
            The part-of-speech tagger, which also performs lemmatization and
            morphological analysis. Created lazily on first access.

        parser (spacy.syntax.parser.GreedyParser):
            A greedy shift-reduce dependency parser. Created lazily on first
            access.
    """
    def __init__(self, data_dir=None):
        if data_dir is None:
            data_dir = path.join(path.dirname(__file__), 'data')
        # Remembered so the tagger and parser can be loaded on demand later.
        self._data_dir = data_dir
        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
                           get_lex_props=get_lex_props)
        # The tokenizer is given the tag names in sorted order.
        tag_names = sorted(POS_TAGS.keys())
        self.tokenizer = Tokenizer.from_dir(self.vocab,
                                            path.join(data_dir, 'tokenizer'),
                                            POS_TAGS, tag_names)
        self.strings = self.vocab.strings
        # Both models are built lazily; see the `tagger` / `parser` properties.
        self._tagger = None
        self._parser = None

    @property
    def tagger(self):
        """The part-of-speech tagger, constructed on first access and cached."""
        if self._tagger is None:
            self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
        return self._tagger

    @property
    def parser(self):
        """The dependency parser, constructed on first access and cached."""
        if self._parser is None:
            self._parser = GreedyParser(path.join(self._data_dir, 'deps'))
        return self._parser

    def __call__(self, text, tag=True, parse=False):
        """Apply the pipeline to some text.

        Args:
            text (unicode): The text to be processed.

        Keyword args:
            tag (bool): Whether to add part-of-speech tags to the text. This
                will also set morphological analysis and lemmas.

            parse (bool): Whether to add dependency-heads and labels to the
                text.

        Returns:
            tokens (spacy.tokens.Tokens): The object produced by the
                tokenizer, after the requested annotators have been applied
                to it.
        """
        tokens = self.tokenizer.tokenize(text)
        # Accessing self.tagger / self.parser triggers the lazy load, so a
        # model is only read from disk if the caller actually asks for it.
        if tag:
            self.tagger(tokens)
        if parse:
            self.parser.parse(tokens)
        return tokens

    @property
    def tags(self):
        """List of part-of-speech tag names."""
        return self.tagger.tag_names