Clean up examples

ines 2017-10-26 17:32:59 +02:00
parent f57043e6fe
commit bca5372fb1
2 changed files with 41 additions and 33 deletions

View File

@@ -4,22 +4,24 @@ The idea is to associate each word in the vocabulary with a tag, noting whether
they begin, end, or are inside at least one pattern. An additional tag is used
for single-word patterns. Complete patterns are also stored in a hash set.

When we process a document, we look up the words in the vocabulary, to
associate the words with the tags. We then search for tag-sequences that
correspond to valid candidates. Finally, we look up the candidates in the hash
set.

For instance, to search for the phrases "Barack Hussein Obama" and "Hilary
Clinton", we would associate "Barack" and "Hilary" with the B tag, Hussein with
the I tag, and Obama and Clinton with the L tag.

The document "Barack Clinton and Hilary Clinton" would have the tag sequence
[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second
candidate is in the phrase dictionary, so only one is returned as a match.

The algorithm is O(n) at run-time for document of length n because we're only
ever matching over the tag patterns. So no matter how many phrases we're
looking for, our pattern set stays very small (exact size depends on the
maximum length we're looking for, as the query language currently has no
quantifiers).
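To make the tagging scheme concrete, here is a minimal sketch of the idea in plain Python. It is illustrative only, separate from the PhraseMatcher-based code this example actually uses, and all names in it are invented for the illustration:

phrases = {("Barack", "Hussein", "Obama"), ("Hilary", "Clinton")}

# Tag each word with the positions it can occupy in a pattern.
tags = {}
for phrase in phrases:
    if len(phrase) == 1:
        tags.setdefault(phrase[0], set()).add("U")   # single-word pattern
    else:
        tags.setdefault(phrase[0], set()).add("B")   # begins a pattern
        for word in phrase[1:-1]:
            tags.setdefault(word, set()).add("I")    # inside a pattern
        tags.setdefault(phrase[-1], set()).add("L")  # ends a pattern

def candidates(words, max_length=6):
    # Scan for the tag sequences B I* L (or a lone U); O(n) in document length.
    for start, word in enumerate(words):
        if "U" in tags.get(word, ()):
            yield (start, start + 1)
        if "B" not in tags.get(word, ()):
            continue
        for end in range(start + 2, min(start + max_length, len(words)) + 1):
            inner, last = words[start + 1:end - 1], words[end - 1]
            if "L" in tags.get(last, ()) and all("I" in tags.get(w, ()) for w in inner):
                yield (start, end)

words = "Barack Clinton and Hilary Clinton".split()
# Two spans match the tag patterns, but only one is in the phrase set.
print([words[s:e] for s, e in candidates(words) if tuple(words[s:e]) in phrases])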

The example expects a .bz2 file from the Reddit corpus, and a patterns file,
formatted in jsonl as a sequence of entries like this:
@@ -32,11 +34,9 @@ formatted in jsonl as a sequence of entries like this:
{"text":"Argentina"}
"""
from __future__ import print_function, unicode_literals, division
from bz2 import BZ2File
import time
-import math
-import codecs
import plac
import ujson
@@ -44,6 +44,24 @@ from spacy.matcher import PhraseMatcher
import spacy


+@plac.annotations(
+    patterns_loc=("Path to gazetteer", "positional", None, str),
+    text_loc=("Path to Reddit corpus file", "positional", None, str),
+    n=("Number of texts to read", "option", "n", int),
+    lang=("Language class to initialise", "option", "l", str))
+def main(patterns_loc, text_loc, n=10000, lang='en'):
+    nlp = spacy.blank('en')
+    nlp.vocab.lex_attr_getters = {}
+    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
+    count = 0
+    t1 = time.time()
+    for ent_id, text in get_matches(nlp.tokenizer, phrases,
+                                    read_text(text_loc, n=n)):
+        count += 1
+    t2 = time.time()
+    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
+
+
def read_gazetteer(tokenizer, loc, n=-1):
    for i, line in enumerate(open(loc)):
        data = ujson.loads(line.strip())
@@ -75,18 +93,6 @@ def get_matches(tokenizer, phrases, texts, max_length=6):
            yield (ent_id, doc[start:end].text)

-def main(patterns_loc, text_loc, n=10000):
-    nlp = spacy.blank('en')
-    nlp.vocab.lex_attr_getters = {}
-    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
-    count = 0
-    t1 = time.time()
-    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
-        count += 1
-    t2 = time.time()
-    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))

if __name__ == '__main__':
    if False:
        import cProfile
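
For context on what get_matches builds on: phrases are added to the PhraseMatcher imported above as pre-tokenized Docs, and calling the matcher on a document returns (match_id, start, end) spans. A minimal sketch, assuming the spaCy 2.x API that these examples target (the phrase strings and the 'GAZETTEER' key are placeholders):

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')                    # tokenizer-only pipeline, as in main() above
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp(text) for text in ('Barack Hussein Obama', 'Hilary Clinton')]
matcher.add('GAZETTEER', None, *patterns)  # spaCy 2.x signature: add(key, on_match, *docs)

doc = nlp('Barack Clinton and Hilary Clinton')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)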

View File

@@ -1,16 +1,18 @@
-'''Load vectors for a language trained using FastText
+#!/usr/bin/env python
+# coding: utf8
+"""Load vectors for a language trained using FastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
-'''
+"""
from __future__ import unicode_literals
import plac
import numpy
-import spacy.language
+from spacy.language import Language


+@plac.annotations(vectors_loc=("Path to vectors", "positional", None, str))
def main(vectors_loc):
-    nlp = spacy.language.Language()
+    nlp = Language()
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
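
The diff stops right after the header is read. For orientation, a loader for the plain-text .vec format (first line "n_vectors n_dims", then one word followed by its floats per line) typically continues roughly as below, using spaCy 2.x's Vocab.set_vector. This is an illustrative sketch, not the rest of the actual file, and the load_vectors name is invented:

import numpy
from spacy.language import Language

def load_vectors(vectors_loc):
    nlp = Language()
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()        # .vec header: "<n_vectors> <n_dims>"
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the word with its vector
    return nlp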