diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index 02ad266f9..6a25f6572 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -43,14 +43,22 @@ class English(object):
 
     Keyword args:
         data_dir (unicode): A path to a directory, from which to load the pipeline.
-            If None, looks for a directory named "data/" in the same directory as
-            the present file, i.e. path.join(path.dirname(__file__, 'data')).
+            If empty string ('') --- the default --- it looks for a directory
+            named "data/" in the same directory as the present file, i.e.
+
+            >>> data_dir = path.join(path.dirname(__file__, 'data'))
 
            If path.join(data_dir, 'pos') exists, the tagger is loaded from there.
            If path.join(data_dir, 'deps') exists, the parser is loaded from there.
+
+            To prevent any data files from being loaded, pass data_dir=None. This
+            is useful if you want to construct a lexicon, which you'll then save
+            for later loading.
     """
 
-    def __init__(self, data_dir=LOCAL_DATA_DIR):
+    def __init__(self, data_dir=''):
+        if data_dir == '':
+            data_dir = LOCAL_DATA_DIR
         self._data_dir = data_dir
         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                            get_lex_props=get_lex_props)
@@ -61,20 +69,24 @@ class English(object):
             prefix_re = None
             suffix_re = None
             infix_re = None
+            self.has_parser_model = False
+            self.has_tagger_model = False
         else:
             tok_data_dir = path.join(data_dir, 'tokenizer')
             tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
             prefix_re = re.compile(prefix_re)
             suffix_re = re.compile(suffix_re)
             infix_re = re.compile(infix_re)
+            self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
+            self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
+
         self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
                                    suffix_re, infix_re,
                                    POS_TAGS, tag_names)
 
+        # These are lazy-loaded
         self._tagger = None
         self._parser = None
-        self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
-        self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
 
     @property
     def tagger(self):
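
For reference, a minimal usage sketch of the data_dir semantics introduced by this patch, assuming the patched spacy.en module is importable; whether the tagger/parser flags come back True depends on which model directories are actually installed:

    # Sketch only: illustrates the data_dir behaviour introduced by this patch.
    from spacy.en import English

    # data_dir='' (the new default) resolves to LOCAL_DATA_DIR, so the vocab and
    # tokenizer load, and the tagger/parser models load if their directories exist.
    nlp = English()
    print(nlp.has_tagger_model, nlp.has_parser_model)  # depends on installed data

    # data_dir=None skips all data files, e.g. when building a lexicon from
    # scratch that will be saved and loaded later; both model flags are False.
    empty = English(data_dir=None)
    assert empty.has_tagger_model is False
    assert empty.has_parser_model is False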