"""Collate frequencies from a sorted file.

Input: one record per line, formatted as ``<freq>\t<key>``, with all records
for the same key on adjacent lines (e.g. the file was sorted by key).

Output: one line per distinct key, formatted as
``<total freq>\t<number of input records>\t<key>``.
"""
from itertools import groupby
from operator import itemgetter


def _read_entries(lines):
    """Yield ``(freq, key)`` pairs parsed from an iterable of text lines.

    Blank (whitespace-only) lines are skipped. Only the first tab separates
    the frequency from the key, so keys may themselves contain tabs.

    Raises:
        ValueError: if a non-blank line has no tab or a non-integer frequency.
    """
    for line in lines:
        line = line.strip()
        if not line:
            continue
        freq, key = line.split('\t', 1)
        yield int(freq), key


def main(in_loc, out_loc):
    """Sum the frequencies of each run of identical keys in a sorted file.

    Args:
        in_loc: Path of the input file (``<freq>\t<key>`` per line, records
            for the same key adjacent to each other).
        out_loc: Path of the output file; overwritten if it exists.

    For every run of adjacent records sharing a key, one output line is
    written: the summed frequency, the number of records in the run
    (presumably the key's document frequency -- confirm against the producer
    of the input), and the key. Empty input produces an empty output file.
    """
    # `with` closes both files even if a malformed line raises mid-way.
    with open(in_loc) as in_file, open(out_loc, 'w') as out_file:
        # groupby merges *adjacent* equal keys only, which is exactly the
        # contract of a pre-sorted input and keeps memory use constant.
        entries = _read_entries(in_file)
        for key, group in groupby(entries, key=itemgetter(1)):
            total_freq = 0
            df = 0
            for freq, _ in group:
                total_freq += freq
                df += 1
            out_file.write('%d\t%d\t%s\n' % (total_freq, df, key))


if __name__ == '__main__':
    # plac is only needed for the command-line entry point, so import it
    # here; `main` stays importable without the CLI dependency installed.
    import plac

    plac.call(main)