* Update get_freqs.py script

This commit is contained in:
Matthew Honnibal 2015-10-16 04:20:35 +11:00
parent 710e8fb168
commit e08a4b46a2
1 changed file with 10 additions and 11 deletions

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import unicode_literals from __future__ import unicode_literals, print_function
import plac import plac
import joblib import joblib
@@ -14,7 +14,7 @@ from joblib import Parallel, delayed
import spacy.en import spacy.en
from spacy.strings import StringStore from spacy.strings import StringStore
from spacy.en.attrs import ORTH from spacy.attrs import ORTH
def iter_comments(loc): def iter_comments(loc):
@@ -40,20 +40,19 @@ def null_props(string):
def count_freqs(input_loc, output_loc): def count_freqs(input_loc, output_loc):
print output_loc print(output_loc)
nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False) tokenizer = Tokenizer.from_dir(Vocab(), English.default_data_dir())
nlp.vocab.lexeme_props_getter = null_props
counts = PreshCounter() counts = PreshCounter()
tokenizer = nlp.tokenizer
for json_comment in iter_comments(input_loc): for json_comment in iter_comments(input_loc):
doc = tokenizer(json_comment['body']) doc = tokenizer(json_comment['body'])
doc.count_by(ORTH, counts=counts) doc.count_by(ORTH, counts=counts)
with codecs.open(output_loc, 'w', 'utf8') as file_: with codecs.open(output_loc, 'w', 'utf8') as file_:
for orth, freq in counts: for orth, freq in counts:
string = nlp.vocab.strings[orth] string = tokenizer.vocab.strings[orth]
file_.write('%d\t%s\n' % (freq, repr(string))) if not string.isspace():
file_.write('%d\t%s\n' % (freq, string))
def parallelize(func, iterator, n_jobs): def parallelize(func, iterator, n_jobs):
@@ -64,12 +63,12 @@ def merge_counts(locs, out_loc):
string_map = StringStore() string_map = StringStore()
counts = PreshCounter() counts = PreshCounter()
for loc in locs: for loc in locs:
with codecs.open(loc, 'r', 'utf8') as file_: with io.open(loc, 'r', encoding='utf8') as file_:
for line in file_: for line in file_:
freq, word = line.strip().split('\t', 1) freq, word = line.strip().split('\t', 1)
orth = string_map[word] orth = string_map[word]
counts.inc(orth, int(freq)) counts.inc(orth, int(freq))
with codecs.open(out_loc, 'w', 'utf8') as file_: with io.open(out_loc, 'w', encoding='utf8') as file_:
for orth, count in counts: for orth, count in counts:
string = string_map[orth] string = string_map[orth]
file_.write('%d\t%s\n' % (count, string)) file_.write('%d\t%s\n' % (count, string))
@@ -98,7 +97,7 @@ def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
if tasks: if tasks:
parallelize(count_freqs, tasks, n_jobs) parallelize(count_freqs, tasks, n_jobs)
print "Merge" print("Merge")
merge_counts(outputs, output_loc) merge_counts(outputs, output_loc)