diff --git a/examples/phrase_matcher.py b/examples/phrase_matcher.py
index ca9b0cc92..2dd2691b9 100644
--- a/examples/phrase_matcher.py
+++ b/examples/phrase_matcher.py
@@ -4,22 +4,24 @@ The idea is to associate each word in the vocabulary with a tag, noting whether
 they begin, end, or are inside at least one pattern. An additional tag is used
 for single-word patterns. Complete patterns are also stored in a hash set.
 
-When we process a document, we look up the words in the vocabulary, to associate
-the words with the tags. We then search for tag-sequences that correspond to
-valid candidates. Finally, we look up the candidates in the hash set.
+When we process a document, we look up the words in the vocabulary, to
+associate the words with the tags. We then search for tag-sequences that
+correspond to valid candidates. Finally, we look up the candidates in the hash
+set.
 
-For instance, to search for the phrases "Barack Hussein Obama" and "Hilary Clinton", we
-would associate "Barack" and "Hilary" with the B tag, Hussein with the I tag,
-and Obama and Clinton with the L tag.
+For instance, to search for the phrases "Barack Hussein Obama" and "Hilary
+Clinton", we would associate "Barack" and "Hilary" with the B tag, Hussein with
+the I tag, and Obama and Clinton with the L tag.
 
 The document "Barack Clinton and Hilary Clinton" would have the tag sequence
-[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second candidate
-is in the phrase dictionary, so only one is returned as a match.
+[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second
+candidate is in the phrase dictionary, so only one is returned as a match.
 
-The algorithm is O(n) at run-time for document of length n because we're only ever
-matching over the tag patterns. So no matter how many phrases we're looking for,
-our pattern set stays very small (exact size depends on the maximum length we're
-looking for, as the query language currently has no quantifiers)
+The algorithm is O(n) at run-time for document of length n because we're only
+ever matching over the tag patterns. So no matter how many phrases we're
+looking for, our pattern set stays very small (exact size depends on the
+maximum length we're looking for, as the query language currently has no
+quantifiers).
 
 The example expects a .bz2 file from the Reddit corpus, and a patterns file,
 formatted in jsonl as a sequence of entries like this:
@@ -32,11 +34,9 @@ formatted in jsonl as a sequence of entries like this:
 {"text":"Argentina"}
 """
 from __future__ import print_function, unicode_literals, division
+
 from bz2 import BZ2File
 import time
-import math
-import codecs
-
 import plac
 import ujson
 
@@ -44,6 +44,24 @@ from spacy.matcher import PhraseMatcher
 import spacy
 
 
+@plac.annotations(
+    patterns_loc=("Path to gazetteer", "positional", None, str),
+    text_loc=("Path to Reddit corpus file", "positional", None, str),
+    n=("Number of texts to read", "option", "n", int),
+    lang=("Language class to initialise", "option", "l", str))
+def main(patterns_loc, text_loc, n=10000, lang='en'):
+    nlp = spacy.blank(lang)
+    nlp.vocab.lex_attr_getters = {}
+    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
+    count = 0
+    t1 = time.time()
+    for ent_id, text in get_matches(nlp.tokenizer, phrases,
+                                    read_text(text_loc, n=n)):
+        count += 1
+    t2 = time.time()
+    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
+
+
 def read_gazetteer(tokenizer, loc, n=-1):
     for i, line in enumerate(open(loc)):
         data = ujson.loads(line.strip())
@@ -75,18 +93,6 @@ def get_matches(tokenizer, phrases, texts, max_length=6):
             yield (ent_id, doc[start:end].text)
 
 
-def main(patterns_loc, text_loc, n=10000):
-    nlp = spacy.blank('en')
-    nlp.vocab.lex_attr_getters = {}
-    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
-    count = 0
-    t1 = time.time()
-    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
-        count += 1
-    t2 = time.time()
-    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
-
-
 if __name__ == '__main__':
     if False:
         import cProfile
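
For context on the example above: the script builds on spaCy's PhraseMatcher, which matches over token sequences rather than raw strings. Below is a minimal sketch of the same matching behaviour using the v2-era PhraseMatcher.add(key, on_match, *docs) signature; the toy phrase list, text, and the 'PHRASE' key are illustrative and not part of the diff.

# Minimal PhraseMatcher sketch (spaCy v2.x API); phrases and text are toy data.
from __future__ import unicode_literals

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
matcher = PhraseMatcher(nlp.vocab)

# Patterns are Doc objects, so matching happens over token sequences.
phrases = ['Barack Hussein Obama', 'Hilary Clinton']
matcher.add('PHRASE', None, *[nlp(text) for text in phrases])

doc = nlp('Barack Clinton and Hilary Clinton')
for match_id, start, end in matcher(doc):
    # Only "Hilary Clinton" appears in the text as a complete pattern.
    print(doc[start:end].text)
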
%d matches" % (n, (t2 - t1), count)) + + def read_gazetteer(tokenizer, loc, n=-1): for i, line in enumerate(open(loc)): data = ujson.loads(line.strip()) @@ -75,18 +93,6 @@ def get_matches(tokenizer, phrases, texts, max_length=6): yield (ent_id, doc[start:end].text) -def main(patterns_loc, text_loc, n=10000): - nlp = spacy.blank('en') - nlp.vocab.lex_attr_getters = {} - phrases = read_gazetteer(nlp.tokenizer, patterns_loc) - count = 0 - t1 = time.time() - for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)): - count += 1 - t2 = time.time() - print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count)) - - if __name__ == '__main__': if False: import cProfile diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py index 9aa9fda56..323d5803f 100644 --- a/examples/vectors_fast_text.py +++ b/examples/vectors_fast_text.py @@ -1,16 +1,18 @@ -'''Load vectors for a language trained using FastText - +#!/usr/bin/env python +# coding: utf8 +"""Load vectors for a language trained using FastText https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md -''' +""" from __future__ import unicode_literals import plac import numpy -import spacy.language +import from spacy.language import Language +@plac.annotations(vectors_loc=("Path to vectors", "positional", None, str)) def main(vectors_loc): - nlp = spacy.language.Language() + nlp = Language() with open(vectors_loc, 'rb') as file_: header = file_.readline() @@ -18,7 +20,7 @@ def main(vectors_loc): nlp.vocab.clear_vectors(int(nr_dim)) for line in file_: line = line.decode('utf8') - pieces = line.split() + pieces = line.split() word = pieces[0] vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') nlp.vocab.set_vector(word, vector)