* Fix default model path for English

This commit is contained in:
Matthew Honnibal 2015-01-31 16:38:27 +11:00
parent 543fe5a280
commit a1ed574b7b
1 changed file with 17 additions and 5 deletions

View File

@@ -43,14 +43,22 @@ class English(object):
Keyword args: Keyword args:
data_dir (unicode): A path to a directory, from which to load the pipeline. data_dir (unicode): A path to a directory, from which to load the pipeline.
If None, looks for a directory named "data/" in the same directory as If empty string ('') --- the default --- it looks for a directory
the present file, i.e. path.join(path.dirname(__file__, 'data')). named "data/" in the same directory as the present file, i.e.
>>> data_dir = path.join(path.dirname(__file__), 'data')
If path.join(data_dir, 'pos') exists, the tagger is loaded from there. If path.join(data_dir, 'pos') exists, the tagger is loaded from there.
If path.join(data_dir, 'deps') exists, the parser is loaded from there. If path.join(data_dir, 'deps') exists, the parser is loaded from there.
To prevent any data files from being loaded, pass data_dir=None. This
is useful if you want to construct a lexicon, which you'll then save
for later loading.
""" """
def __init__(self, data_dir=LOCAL_DATA_DIR): def __init__(self, data_dir=''):
if data_dir == '':
data_dir = LOCAL_DATA_DIR
self._data_dir = data_dir self._data_dir = data_dir
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None, self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props) get_lex_props=get_lex_props)
@@ -61,20 +69,24 @@ class English(object):
prefix_re = None prefix_re = None
suffix_re = None suffix_re = None
infix_re = None infix_re = None
self.has_parser_model = False
self.has_tagger_model = False
else: else:
tok_data_dir = path.join(data_dir, 'tokenizer') tok_data_dir = path.join(data_dir, 'tokenizer')
tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir) tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
prefix_re = re.compile(prefix_re) prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re) suffix_re = re.compile(suffix_re)
infix_re = re.compile(infix_re) infix_re = re.compile(infix_re)
self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re, self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
suffix_re, infix_re, suffix_re, infix_re,
POS_TAGS, tag_names) POS_TAGS, tag_names)
# These are lazy-loaded
self._tagger = None self._tagger = None
self._parser = None self._parser = None
self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
@property @property
def tagger(self): def tagger(self):