mirror of https://github.com/explosion/spaCy.git
45 lines
1.5 KiB
Python
45 lines
1.5 KiB
Python
from __future__ import unicode_literals
|
|
from os import path
|
|
|
|
from .. import orth
|
|
from ..vocab import Vocab
|
|
from ..tokenizer import Tokenizer
|
|
from ..syntax.parser import GreedyParser
|
|
from ..tokens import Tokens
|
|
from ..morphology import Morphologizer
|
|
from .lemmatizer import Lemmatizer
|
|
from .pos import EnPosTagger
|
|
from .attrs import get_flags
|
|
|
|
|
|
def get_lex_props(string):
    """Build the lexical property dict for ``string``.

    The result has two entries: ``'flags'``, the value returned by
    ``get_flags(string)``, and ``'dense'``, which is always ``1``.
    This function is handed to ``Vocab.from_dir`` as its
    ``get_lex_props`` callback by the ``English`` class.
    """
    lex_props = {}
    lex_props['flags'] = get_flags(string)
    lex_props['dense'] = 1
    return lex_props
|
|
|
|
|
|
class English(object):
    """Pipeline object bundling a vocabulary, a tokenizer and, optionally,
    a part-of-speech tagger and a parser.

    Attributes set by ``__init__``:
        vocab: Result of ``Vocab.from_dir``.
        tokenizer: Result of ``Tokenizer.from_dir``.
        pos_tagger: An ``EnPosTagger`` or ``None`` when tagging is disabled.
        parser: A ``GreedyParser`` or ``None`` when parsing is disabled.
    """

    def __init__(self, data_dir=None, pos_tag=True, parse=False):
        """Load the pipeline components from ``data_dir``.

        Args:
            data_dir: Directory the components are loaded from. When
                ``None``, the ``data`` directory next to this module is used.
            pos_tag: When true, construct the POS tagger; otherwise
                ``self.pos_tagger`` is ``None``.
            parse: When true, construct the parser; otherwise
                ``self.parser`` is ``None``.
        """
        if data_dir is None:
            module_dir = path.dirname(__file__)
            data_dir = path.join(module_dir, 'data')

        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)

        if not pos_tag:
            self.pos_tagger = None
        else:
            # The tagger takes a morphologizer, which in turn takes the
            # vocabulary's string store and a lemmatizer rooted at the
            # 'wordnet' sub-directory of data_dir.
            strings = self.vocab.strings
            lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'))
            morphologizer = Morphologizer.from_dir(strings, lemmatizer,
                                                   data_dir)
            self.pos_tagger = EnPosTagger(data_dir, morphologizer)

        self.parser = GreedyParser(data_dir) if parse else None

    def __call__(self, text, pos_tag=True, parse=True):
        """Tokenize ``text`` and run the enabled components over the result.

        A component runs only if it was constructed (is truthy on the
        instance) and the matching keyword argument is true.

        Args:
            text: Input passed to ``self.tokenizer.tokenize``.
            pos_tag: Set to false to skip the POS tagger for this call.
            parse: Set to false to skip the parser for this call.

        Returns:
            The object returned by ``self.tokenizer.tokenize(text)``, after
            the tagger and parser (if applied) have been called on it.
        """
        doc = self.tokenizer.tokenize(text)

        run_tagger = self.pos_tagger and pos_tag
        if run_tagger:
            self.pos_tagger(doc)

        run_parser = self.parser and parse
        if run_parser:
            self.parser.parse(doc)

        return doc
|