From 096a80170d23365e1b8ff9d3749bb6caa379abdd Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 27 Oct 2017 01:48:39 +0200 Subject: [PATCH] Remove old example files --- examples/_handler.py | 37 ------------------- examples/parallel_parse.py | 74 -------------------------------------- 2 files changed, 111 deletions(-) delete mode 100644 examples/_handler.py delete mode 100644 examples/parallel_parse.py diff --git a/examples/_handler.py b/examples/_handler.py deleted file mode 100644 index cebfe8968..000000000 --- a/examples/_handler.py +++ /dev/null @@ -1,37 +0,0 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function - -from math import sqrt -from numpy import dot -from numpy.linalg import norm - - -def handle_tweet(spacy, tweet_data, query): - text = tweet_data.get('text', u'') - # Twython returns either bytes or unicode, depending on tweet. - # ಠ_ಠ #APIshaming - try: - match_tweet(spacy, text, query) - except TypeError: - match_tweet(spacy, text.decode('utf8'), query) - - -def match_tweet(spacy, text, query): - def get_vector(word): - return spacy.vocab[word].repvec - - tweet = spacy(text) - tweet = [w.repvec for w in tweet if w.is_alpha and w.lower_ != query] - if tweet: - accept = map(get_vector, 'child classroom teach'.split()) - reject = map(get_vector, 'mouth hands giveaway'.split()) - - y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept) - n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject) - - if (y / (y + n)) >= 0.5 or True: - print(text) - - -def cos(v1, v2): - return dot(v1, v2) / (norm(v1) * norm(v2)) diff --git a/examples/parallel_parse.py b/examples/parallel_parse.py deleted file mode 100644 index 5cdd0778b..000000000 --- a/examples/parallel_parse.py +++ /dev/null @@ -1,74 +0,0 @@ -from __future__ import print_function, unicode_literals, division -import io -import bz2 -import logging -from toolz import partition -from os import path -import re - -import spacy.en -from spacy.tokens import Doc - -from joblib import Parallel, delayed -import plac -import ujson - - -def parallelize(func, iterator, n_jobs, extra, backend='multiprocessing'): - extra = tuple(extra) - return Parallel(n_jobs=n_jobs, backend=backend)(delayed(func)(*(item + extra)) - for item in iterator) - - -def iter_comments(loc): - with bz2.BZ2File(loc) as file_: - for i, line in enumerate(file_): - yield ujson.loads(line)['body'] - - -pre_format_re = re.compile(r'^[\`\*\~]') -post_format_re = re.compile(r'[\`\*\~]$') -url_re = re.compile(r'\[([^]]+)\]\(%%URL\)') -link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)') -def strip_meta(text): - text = link_re.sub(r'\1', text) - text = text.replace('>', '>').replace('<', '<') - text = pre_format_re.sub('', text) - text = post_format_re.sub('', text) - return text.strip() - - -def save_parses(batch_id, input_, out_dir, n_threads, batch_size): - out_loc = path.join(out_dir, '%d.bin' % batch_id) - if path.exists(out_loc): - return None - print('Batch', batch_id) - nlp = spacy.en.English() - nlp.matcher = None - with open(out_loc, 'wb') as file_: - texts = (strip_meta(text) for text in input_) - texts = (text for text in texts if text.strip()) - for doc in nlp.pipe(texts, batch_size=batch_size, n_threads=n_threads): - file_.write(doc.to_bytes()) - -@plac.annotations( - in_loc=("Location of input file"), - out_dir=("Location of input file"), - n_process=("Number of processes", "option", "p", int), - n_thread=("Number of threads per process", "option", "t", int), - batch_size=("Number of texts to accumulate in a buffer", "option", "b", int) -) -def main(in_loc, out_dir, n_process=1, n_thread=4, batch_size=100): - if not path.exists(out_dir): - path.join(out_dir) - if n_process >= 2: - texts = partition(200000, iter_comments(in_loc)) - parallelize(save_parses, enumerate(texts), n_process, [out_dir, n_thread, batch_size], - backend='multiprocessing') - else: - save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size) - - - -if __name__ == '__main__': - plac.call(main)