* Update get_freqs.py script

This commit is contained in:
Matthew Honnibal 2015-10-16 04:33:49 +11:00
parent 5ff4454177
commit 17fffb4c57
1 changed file with 4 additions and 3 deletions

View File

@@ -12,7 +12,7 @@ import codecs
from preshed.counter import PreshCounter from preshed.counter import PreshCounter
from joblib import Parallel, delayed from joblib import Parallel, delayed
import spacy.en from spacy.en import English
from spacy.strings import StringStore from spacy.strings import StringStore
from spacy.attrs import ORTH from spacy.attrs import ORTH
from spacy.tokenizer import Tokenizer from spacy.tokenizer import Tokenizer
@@ -27,8 +27,9 @@ def iter_comments(loc):
def count_freqs(input_loc, output_loc): def count_freqs(input_loc, output_loc):
print(output_loc) print(output_loc)
tokenizer = Tokenizer.from_dir(Vocab(), vocab = English.default_vocab(get_lex_attr=None)
path.join(spacy.en.English.default_data_dir(), 'tokenizer')) tokenizer = Tokenizer.from_dir(vocab,
path.join(English.default_data_dir(), 'tokenizer'))
counts = PreshCounter() counts = PreshCounter()
for json_comment in iter_comments(input_loc): for json_comment in iter_comments(input_loc):