mirror of https://github.com/explosion/spaCy.git
* Update get_freqs.py script
This commit is contained in:
parent
5ff4454177
commit
17fffb4c57
|
@ -12,7 +12,7 @@ import codecs
|
||||||
from preshed.counter import PreshCounter
|
from preshed.counter import PreshCounter
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
|
|
||||||
import spacy.en
|
from spacy.en import English
|
||||||
from spacy.strings import StringStore
|
from spacy.strings import StringStore
|
||||||
from spacy.attrs import ORTH
|
from spacy.attrs import ORTH
|
||||||
from spacy.tokenizer import Tokenizer
|
from spacy.tokenizer import Tokenizer
|
||||||
|
@ -27,8 +27,9 @@ def iter_comments(loc):
|
||||||
|
|
||||||
def count_freqs(input_loc, output_loc):
|
def count_freqs(input_loc, output_loc):
|
||||||
print(output_loc)
|
print(output_loc)
|
||||||
tokenizer = Tokenizer.from_dir(Vocab(),
|
vocab = English.default_vocab(get_lex_attr=None)
|
||||||
path.join(spacy.en.English.default_data_dir(), 'tokenizer'))
|
tokenizer = Tokenizer.from_dir(vocab,
|
||||||
|
path.join(English.default_data_dir(), 'tokenizer'))
|
||||||
|
|
||||||
counts = PreshCounter()
|
counts = PreshCounter()
|
||||||
for json_comment in iter_comments(input_loc):
|
for json_comment in iter_comments(input_loc):
|
||||||
|
|
Loading…
Reference in New Issue