mirror of https://github.com/explosion/spaCy.git
Clean up examples
This commit is contained in:
parent
f57043e6fe
commit
bca5372fb1
|
@ -4,22 +4,24 @@ The idea is to associate each word in the vocabulary with a tag, noting whether
|
||||||
they begin, end, or are inside at least one pattern. An additional tag is used
|
they begin, end, or are inside at least one pattern. An additional tag is used
|
||||||
for single-word patterns. Complete patterns are also stored in a hash set.
|
for single-word patterns. Complete patterns are also stored in a hash set.
|
||||||
|
|
||||||
When we process a document, we look up the words in the vocabulary, to associate
|
When we process a document, we look up the words in the vocabulary, to
|
||||||
the words with the tags. We then search for tag-sequences that correspond to
|
associate the words with the tags. We then search for tag-sequences that
|
||||||
valid candidates. Finally, we look up the candidates in the hash set.
|
correspond to valid candidates. Finally, we look up the candidates in the hash
|
||||||
|
set.
|
||||||
|
|
||||||
For instance, to search for the phrases "Barack Hussein Obama" and "Hilary Clinton", we
|
For instance, to search for the phrases "Barack Hussein Obama" and "Hilary
|
||||||
would associate "Barack" and "Hilary" with the B tag, Hussein with the I tag,
|
Clinton", we would associate "Barack" and "Hilary" with the B tag, Hussein with
|
||||||
and Obama and Clinton with the L tag.
|
the I tag, and Obama and Clinton with the L tag.
|
||||||
|
|
||||||
The document "Barack Clinton and Hilary Clinton" would have the tag sequence
|
The document "Barack Clinton and Hilary Clinton" would have the tag sequence
|
||||||
[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second candidate
|
[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second
|
||||||
is in the phrase dictionary, so only one is returned as a match.
|
candidate is in the phrase dictionary, so only one is returned as a match.
|
||||||
|
|
||||||
The algorithm is O(n) at run-time for a document of length n because we're only ever
|
The algorithm is O(n) at run-time for a document of length n because we're only
|
||||||
matching over the tag patterns. So no matter how many phrases we're looking for,
|
ever matching over the tag patterns. So no matter how many phrases we're
|
||||||
our pattern set stays very small (exact size depends on the maximum length we're
|
looking for, our pattern set stays very small (exact size depends on the
|
||||||
looking for, as the query language currently has no quantifiers)
|
maximum length we're looking for, as the query language currently has no
|
||||||
|
quantifiers).
|
||||||
|
|
||||||
The example expects a .bz2 file from the Reddit corpus, and a patterns file,
|
The example expects a .bz2 file from the Reddit corpus, and a patterns file,
|
||||||
formatted in jsonl as a sequence of entries like this:
|
formatted in jsonl as a sequence of entries like this:
|
||||||
|
@ -32,11 +34,9 @@ formatted in jsonl as a sequence of entries like this:
|
||||||
{"text":"Argentina"}
|
{"text":"Argentina"}
|
||||||
"""
|
"""
|
||||||
from __future__ import print_function, unicode_literals, division
|
from __future__ import print_function, unicode_literals, division
|
||||||
|
|
||||||
from bz2 import BZ2File
|
from bz2 import BZ2File
|
||||||
import time
|
import time
|
||||||
import math
|
|
||||||
import codecs
|
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import ujson
|
import ujson
|
||||||
|
|
||||||
|
@ -44,6 +44,24 @@ from spacy.matcher import PhraseMatcher
|
||||||
import spacy
|
import spacy
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
|
||||||
|
patterns_loc=("Path to gazetteer", "positional", None, str),
|
||||||
|
text_loc=("Path to Reddit corpus file", "positional", None, str),
|
||||||
|
n=("Number of texts to read", "option", "n", int),
|
||||||
|
lang=("Language class to initialise", "option", "l", str))
|
||||||
|
def main(patterns_loc, text_loc, n=10000, lang='en'):
|
||||||
|
nlp = spacy.blank('en')
|
||||||
|
nlp.vocab.lex_attr_getters = {}
|
||||||
|
phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
|
||||||
|
count = 0
|
||||||
|
t1 = time.time()
|
||||||
|
for ent_id, text in get_matches(nlp.tokenizer, phrases,
|
||||||
|
read_text(text_loc, n=n)):
|
||||||
|
count += 1
|
||||||
|
t2 = time.time()
|
||||||
|
print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
|
||||||
|
|
||||||
|
|
||||||
def read_gazetteer(tokenizer, loc, n=-1):
|
def read_gazetteer(tokenizer, loc, n=-1):
|
||||||
for i, line in enumerate(open(loc)):
|
for i, line in enumerate(open(loc)):
|
||||||
data = ujson.loads(line.strip())
|
data = ujson.loads(line.strip())
|
||||||
|
@ -75,18 +93,6 @@ def get_matches(tokenizer, phrases, texts, max_length=6):
|
||||||
yield (ent_id, doc[start:end].text)
|
yield (ent_id, doc[start:end].text)
|
||||||
|
|
||||||
|
|
||||||
def main(patterns_loc, text_loc, n=10000):
|
|
||||||
nlp = spacy.blank('en')
|
|
||||||
nlp.vocab.lex_attr_getters = {}
|
|
||||||
phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
|
|
||||||
count = 0
|
|
||||||
t1 = time.time()
|
|
||||||
for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
|
|
||||||
count += 1
|
|
||||||
t2 = time.time()
|
|
||||||
print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if False:
|
if False:
|
||||||
import cProfile
|
import cProfile
|
||||||
|
|
|
@ -1,16 +1,18 @@
|
||||||
'''Load vectors for a language trained using FastText
|
#!/usr/bin/env python
|
||||||
|
# coding: utf8
|
||||||
|
"""Load vectors for a language trained using FastText
|
||||||
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
|
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
|
||||||
'''
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
import plac
|
import plac
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
import spacy.language
|
from spacy.language import Language
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(vectors_loc=("Path to vectors", "positional", None, str))
|
||||||
def main(vectors_loc):
|
def main(vectors_loc):
|
||||||
nlp = spacy.language.Language()
|
nlp = Language()
|
||||||
|
|
||||||
with open(vectors_loc, 'rb') as file_:
|
with open(vectors_loc, 'rb') as file_:
|
||||||
header = file_.readline()
|
header = file_.readline()
|
||||||
|
|
Loading…
Reference in New Issue