2014-12-21 20:25:43 +00:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
from os import path
|
|
|
|
|
|
|
|
from .. import orth
|
|
|
|
from ..vocab import Vocab
|
|
|
|
from ..tokenizer import Tokenizer
|
|
|
|
from ..syntax.parser import GreedyParser
|
|
|
|
from ..tokens import Tokens
|
|
|
|
from ..morphology import Morphologizer
|
|
|
|
from .lemmatizer import Lemmatizer
|
|
|
|
from .pos import EnPosTagger
|
2014-12-21 21:54:47 +00:00
|
|
|
from .pos import POS_TAGS
|
2014-12-21 20:25:43 +00:00
|
|
|
from .attrs import get_flags
|
|
|
|
|
|
|
|
|
|
|
|
def get_lex_props(string):
    """Return the lexical property dict for ``string``.

    The result has two entries: ``'flags'``, whatever ``get_flags(string)``
    returns, and ``'dense'``, which is always ``1``.
    """
    props = {}
    props['flags'] = get_flags(string)
    props['dense'] = 1
    return props
|
|
|
|
|
|
|
|
|
|
|
|
class English(object):
    """English text-processing pipeline.

    Bundles a vocabulary, a tokenizer and, optionally, a part-of-speech
    tagger and a parser, all loaded from one data directory.  Calling the
    instance on a text tokenizes it and then applies whichever of the
    optional components are loaded and requested.
    """

    def __init__(self, data_dir=None, pos_tag=True, parse=False):
        """Load the pipeline components.

        Args:
            data_dir: Directory the components are loaded from.  When None,
                the ``data`` directory next to this module is used.
            pos_tag: If true, build the POS tagger; otherwise
                ``self.pos_tagger`` is set to None.
            parse: If true, build the parser; otherwise ``self.parser`` is
                set to None.
        """
        if data_dir is None:
            module_dir = path.dirname(__file__)
            data_dir = path.join(module_dir, 'data')

        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)

        # Each tag is looked up and the result discarded; presumably the
        # lookup registers the tag in the string store up front -- TODO
        # confirm against the pos_tags implementation.
        for tag_name in POS_TAGS:
            self.vocab.strings.pos_tags[tag_name]

        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)

        if pos_tag:
            string_store = self.vocab.strings
            lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'))
            morphology = Morphologizer(string_store, POS_TAGS, lemmatizer)
            tagger = EnPosTagger(data_dir, morphology)
        else:
            tagger = None
        self.pos_tagger = tagger

        self.parser = GreedyParser(data_dir) if parse else None

    def __call__(self, text, pos_tag=True, parse=True):
        """Tokenize ``text`` and run the optional annotators over the result.

        Args:
            text: The text handed to the tokenizer.
            pos_tag: Set false to skip tagging for this call.
            parse: Set false to skip parsing for this call.

        Returns:
            Whatever ``self.tokenizer.tokenize(text)`` produced.  The same
            object is passed to the tagger and the parser, and their return
            values are ignored.
        """
        doc = self.tokenizer.tokenize(text)

        # A component runs only if it was loaded (truthy) AND requested.
        if self.pos_tagger:
            if pos_tag:
                self.pos_tagger(doc)

        if self.parser:
            if parse:
                self.parser.parse(doc)

        return doc
|