From 828cc91545458613dff701e804eaec442423e739 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 20 Sep 2017 21:54:31 +0200
Subject: [PATCH 01/10] Fix PhraseMatcher for spaCy 2

---
 spacy/matcher.pyx | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index c75d23957..d321218b8 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -426,7 +426,7 @@ cdef class PhraseMatcher:
         self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
         self.max_length = max_length
         self.vocab = vocab
-        self.matcher = Matcher(self.vocab, {})
+        self.matcher = Matcher(self.vocab)
         self.phrase_ids = PreshMap()
         for phrase in phrases:
             if len(phrase) < max_length:
@@ -435,7 +435,7 @@ cdef class PhraseMatcher:
         abstract_patterns = []
         for length in range(1, max_length):
             abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
-        self.matcher.add('Candidate', 'MWE', {}, abstract_patterns, acceptor=self.accept_match)
+        self.matcher.add('Candidate', None, *abstract_patterns)
 
     def add(self, Doc tokens):
         cdef int length = tokens.length
@@ -454,22 +454,19 @@ cdef class PhraseMatcher:
         self.phrase_ids[key] = True
 
     def __call__(self, Doc doc):
-        matches = []
-        for ent_id, label, start, end in self.matcher(doc):
-            cand = doc[start : end]
-            start = cand[0].idx
-            end = cand[-1].idx + len(cand[-1])
-            matches.append((start, end, cand.root.tag_, cand.text, 'MWE'))
-        for match in matches:
-            doc.merge(*match)
-        return matches
+        matches = self.matcher(doc)
+        accepted = []
+        for ent_id, start, end in matches:
+            if self.accept_match(doc, ent_id, start, end):
+                accepted.append((ent_id, start, end))
+        return accepted
 
     def pipe(self, stream, batch_size=1000, n_threads=2):
         for doc in stream:
             self(doc)
             yield doc
 
-    def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
+    def accept_match(self, Doc doc, attr_t ent_id, int start, int end):
         assert (end - start) < self.max_length
         cdef int i, j
         for i in range(self.max_length):
@@ -478,6 +475,6 @@ cdef class PhraseMatcher:
             self._phrase_key[i] = doc.c[j].lex.orth
         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
         if self.phrase_ids.get(key):
-            return (ent_id, label, start, end)
+            return True
         else:
             return False

From 43ad250dd5c4a9731acf648a40b8218fc677df81 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 20 Sep 2017 21:54:49 +0200
Subject: [PATCH 02/10] Update matcher tests

---
 spacy/tests/test_matcher.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py
index 388aab03e..651707019 100644
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@@ -34,7 +34,6 @@ def test_matcher_from_api_docs(en_vocab):
     assert len(patterns[0])
 
 
-@pytest.mark.xfail
 def test_matcher_from_usage_docs(en_vocab):
     text = "Wow 😀 This is really cool! 😂 😂"
     doc = get_doc(en_vocab, words=text.split(' '))
@@ -46,7 +45,8 @@ def test_matcher_from_usage_docs(en_vocab):
         if doc.vocab.strings[match_id] == 'HAPPY':
             doc.sentiment += 0.1
         span = doc[start : end]
-        token = span.merge(norm='happy emoji')
+        token = span.merge()
+        token.vocab[token.text].norm_ = 'happy emoji'
 
     matcher = Matcher(en_vocab)
     matcher.add('HAPPY', label_sentiment, *pos_patterns)
@@ -98,7 +98,6 @@ def test_matcher_match_multi(matcher):
             (doc.vocab.strings['Java'], 5, 6)]
 
 
-@pytest.mark.xfail
 def test_matcher_phrase_matcher(en_vocab):
     words = ["Google", "Now"]
     doc = get_doc(en_vocab, words)

From cc408fc1898b7693a3130483e51119e9d78d0693 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 20 Sep 2017 22:20:35 +0200
Subject: [PATCH 03/10] Make PhraseMatcher API like Matcher API

---
 spacy/matcher.pyx           | 72 ++++++++++++++++++++++---------------
 spacy/tests/test_matcher.py |  3 +-
 2 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index d321218b8..ba3559966 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -421,52 +421,67 @@ cdef class PhraseMatcher:
     cdef int max_length
     cdef attr_t* _phrase_key
 
-    def __init__(self, Vocab vocab, phrases, max_length=10):
+    cdef public object _callbacks
+
+    def __init__(self, Vocab vocab, max_length=10):
         self.mem = Pool()
         self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
         self.max_length = max_length
         self.vocab = vocab
         self.matcher = Matcher(self.vocab)
         self.phrase_ids = PreshMap()
-        for phrase in phrases:
-            if len(phrase) < max_length:
-                self.add(phrase)
-
         abstract_patterns = []
         for length in range(1, max_length):
             abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
         self.matcher.add('Candidate', None, *abstract_patterns)
+        self._callbacks = {}
 
-    def add(self, Doc tokens):
-        cdef int length = tokens.length
-        assert length < self.max_length
-        tags = get_bilou(length)
-        assert len(tags) == length, length
+    def add(self, key, on_match, *docs):
+        cdef Doc doc
+        for doc in docs:
+            if len(doc) >= self.max_length:
+                msg = (
+                    "Pattern length (%d) >= phrase_matcher.max_length (%d). "
+                    "Length can be set on initialization, up to 10."
+                )
+                raise ValueError(msg % (len(doc), self.max_length))
+        cdef hash_t ent_id = self.matcher._normalize_key(key)
+        self._callbacks[ent_id] = on_match
+        cdef int length
         cdef int i
-        for i in range(self.max_length):
-            self._phrase_key[i] = 0
-        for i, tag in enumerate(tags):
-            lexeme = self.vocab[tokens.c[i].lex.orth]
-            lexeme.set_flag(tag, True)
-            self._phrase_key[i] = lexeme.orth
-        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
-        self.phrase_ids[key] = True
+        cdef hash_t phrase_hash
+        for doc in docs:
+            length = doc.length
+            tags = get_bilou(length)
+            for i in range(self.max_length):
+                self._phrase_key[i] = 0
+            for i, tag in enumerate(tags):
+                lexeme = self.vocab[doc.c[i].lex.orth]
+                lexeme.set_flag(tag, True)
+                self._phrase_key[i] = lexeme.orth
+            phrase_hash = hash64(self._phrase_key,
+                                 self.max_length * sizeof(attr_t), 0)
+            self.phrase_ids[phrase_hash] = ent_id
 
     def __call__(self, Doc doc):
-        matches = self.matcher(doc)
-        accepted = []
-        for ent_id, start, end in matches:
-            if self.accept_match(doc, ent_id, start, end):
-                accepted.append((ent_id, start, end))
-        return accepted
+        matches = []
+        for _, start, end in self.matcher(doc):
+            ent_id = self.accept_match(doc, start, end)
+            if ent_id is not None:
+                matches.append((ent_id, start, end))
+        for i, (ent_id, start, end) in enumerate(matches):
+            on_match = self._callbacks.get(ent_id)
+            if on_match is not None:
+                on_match(self, doc, i, matches)
+        return matches
 
     def pipe(self, stream, batch_size=1000, n_threads=2):
         for doc in stream:
             self(doc)
             yield doc
 
-    def accept_match(self, Doc doc, attr_t ent_id, int start, int end):
+    def accept_match(self, Doc doc, int start, int end):
        assert (end - start) < self.max_length
        cdef int i, j
        for i in range(self.max_length):
@@ -474,7 +489,8 @@ cdef class PhraseMatcher:
         for i, j in enumerate(range(start, end)):
             self._phrase_key[i] = doc.c[j].lex.orth
         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
-        if self.phrase_ids.get(key):
-            return True
+        ent_id = <hash_t>self.phrase_ids.get(key)
+        if ent_id == 0:
+            return None
         else:
-            return False
+            return ent_id
diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py
index 651707019..1b9f92519 100644
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@@ -101,7 +101,8 @@ def test_matcher_match_multi(matcher):
 def test_matcher_phrase_matcher(en_vocab):
     words = ["Google", "Now"]
     doc = get_doc(en_vocab, words)
-    matcher = PhraseMatcher(en_vocab, [doc])
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add('COMPANY', None, doc)
     words = ["I", "like", "Google", "Now", "best"]
     doc = get_doc(en_vocab, words)
     assert len(matcher(doc)) == 1
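
Taken together, patches 01-03 give PhraseMatcher the same surface as Matcher: patterns are registered with add(key, on_match, *docs), and calling the matcher on a Doc returns (ent_id, start, end) tuples. A minimal usage sketch of the API as it stands after patch 03 -- the blank pipeline, entity key and example texts here are illustrative, not taken from the patches:

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank('en')             # only the tokenizer and vocab are needed
    matcher = PhraseMatcher(nlp.vocab)  # phrases are no longer passed to __init__

    def on_match(matcher, doc, i, matches):
        # optional callback, invoked once per match with the Matcher signature
        ent_id, start, end = matches[i]
        print('matched:', doc[start:end].text)

    # patterns are pre-tokenized Doc objects, each shorter than max_length (default 10)
    matcher.add('COMPANY', on_match, nlp('Google Now'), nlp('Apple Music'))

    doc = nlp('I like Google Now better than Apple Music')
    matches = matcher(doc)              # [(ent_id, start, end), ...]
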
From 0c93c73e496f9c57da523393e33a6f88aa3eac25 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 20 Sep 2017 22:26:40 +0200
Subject: [PATCH 04/10] Add __reduce__ method for PhraseMatcher

---
 spacy/matcher.pyx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index ba3559966..ef4044d21 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -422,6 +422,7 @@ cdef class PhraseMatcher:
     cdef attr_t* _phrase_key
 
     cdef public object _callbacks
+    cdef public object _patterns
 
     def __init__(self, Vocab vocab, max_length=10):
         self.mem = Pool()
@@ -436,6 +437,9 @@ cdef class PhraseMatcher:
         self.matcher.add('Candidate', None, *abstract_patterns)
         self._callbacks = {}
 
+    def __reduce__(self):
+        return (self.__class__, (self.vocab,), None, None)
+
     def add(self, key, on_match, *docs):
         cdef Doc doc
         for doc in docs:

From 01858e9b5972a8c1dec86f88eef3f17fea63cdc6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 20 Sep 2017 22:51:41 +0200
Subject: [PATCH 05/10] Fix PhraseMatcher example

---
 examples/multi_word_matches.py | 74 +++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/examples/multi_word_matches.py b/examples/multi_word_matches.py
index 73f48bf42..ca9b0cc92 100644
--- a/examples/multi_word_matches.py
+++ b/examples/multi_word_matches.py
@@ -20,72 +20,72 @@ The algorithm is O(n) at run-time for document of length n because we're only ever
 matching over the tag patterns. So no matter how many phrases we're looking for,
 our pattern set stays very small (exact size depends on the maximum length we're
 looking for, as the query language currently has no quantifiers)
+
+The example expects a .bz2 file from the Reddit corpus, and a patterns file,
+formatted in jsonl as a sequence of entries like this:
+
+{"text":"Anchorage"}
+{"text":"Angola"}
+{"text":"Ann Arbor"}
+{"text":"Annapolis"}
+{"text":"Appalachia"}
+{"text":"Argentina"}
 """
 from __future__ import print_function, unicode_literals, division
-from ast import literal_eval
 from bz2 import BZ2File
 import time
 import math
 import codecs
 import plac
+import ujson
 
-from preshed.maps import PreshMap
-from preshed.counter import PreshCounter
-from spacy.strings import hash_string
-from spacy.en import English
 from spacy.matcher import PhraseMatcher
+import spacy
 
 
 def read_gazetteer(tokenizer, loc, n=-1):
     for i, line in enumerate(open(loc)):
-        phrase = literal_eval('u' + line.strip())
-        if ' (' in phrase and phrase.endswith(')'):
-            phrase = phrase.split(' (', 1)[0]
-        if i >= n:
-            break
-        phrase = tokenizer(phrase)
-        if all((t.is_lower and t.prob >= -10) for t in phrase):
-            continue
+        data = ujson.loads(line.strip())
+        phrase = tokenizer(data['text'])
+        for w in phrase:
+            _ = tokenizer.vocab[w.text]
         if len(phrase) >= 2:
             yield phrase
 
 
-def read_text(bz2_loc):
+def read_text(bz2_loc, n=10000):
     with BZ2File(bz2_loc) as file_:
-        for line in file_:
-            yield line.decode('utf8')
+        for i, line in enumerate(file_):
+            data = ujson.loads(line)
+            yield data['body']
+            if i >= n:
+                break
 
 
 def get_matches(tokenizer, phrases, texts, max_length=6):
-    matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
-    print("Match")
+    matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
+    matcher.add('Phrase', None, *phrases)
     for text in texts:
         doc = tokenizer(text)
+        for w in doc:
+            _ = doc.vocab[w.text]
         matches = matcher(doc)
-        for mwe in doc.ents:
-            yield mwe
+        for ent_id, start, end in matches:
+            yield (ent_id, doc[start:end].text)
 
 
-def main(patterns_loc, text_loc, counts_loc, n=10000000):
-    nlp = English(parser=False, tagger=False, entity=False)
-    print("Make matcher")
-    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
-    counts = PreshCounter()
+def main(patterns_loc, text_loc, n=10000):
+    nlp = spacy.blank('en')
+    nlp.vocab.lex_attr_getters = {}
+    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
+    count = 0
     t1 = time.time()
-    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
-        counts.inc(hash_string(mwe.text), 1)
+    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
+        count += 1
     t2 = time.time()
-    print("10m tokens in %d s" % (t2 - t1))
-
-    with codecs.open(counts_loc, 'w', 'utf8') as file_:
-        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
-            text = phrase.string
-            key = hash_string(text)
-            count = counts[key]
-            if count != 0:
-                file_.write('%d\t%s\n' % (count, text))
-
+    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
+
 
 if __name__ == '__main__':
     if False:
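
The rewritten example reads its gazetteer from a jsonl patterns file of {"text": ...} entries, as the new docstring documents. A quick sketch of producing such a file -- the file name and phrase list are made up for illustration, and since read_gazetteer only yields phrases of two or more tokens, multi-word entries are the useful input:

    import ujson  # the example imports ujson; the stdlib json module behaves the same here

    phrases = ['Ann Arbor', 'New York City', 'San Francisco']
    with open('patterns.jsonl', 'w') as f:
        for text in phrases:
            f.write(ujson.dumps({'text': text}) + '\n')
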
From f92ab03dc87711ca03cd4a29a886bc1c827b0934 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 20 Sep 2017 22:51:58 +0200
Subject: [PATCH 06/10] Rename phrase matcher example

---
 examples/{multi_word_matches.py => phrase_matcher.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/{multi_word_matches.py => phrase_matcher.py} (100%)

diff --git a/examples/multi_word_matches.py b/examples/phrase_matcher.py
similarity index 100%
rename from examples/multi_word_matches.py
rename to examples/phrase_matcher.py

From 842e21de9f54c3e37e43f698c75a246d69d4551c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 20 Sep 2017 23:55:30 +0200
Subject: [PATCH 07/10] Fix int type error for Python 2

---
 spacy/matcher.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index ef4044d21..5106161a0 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -466,7 +466,7 @@ cdef class PhraseMatcher:
             self._phrase_key[i] = lexeme.orth
             phrase_hash = hash64(self._phrase_key,
                                  self.max_length * sizeof(attr_t), 0)
-            self.phrase_ids[phrase_hash] = ent_id
+            self.phrase_ids.set(phrase_hash, <void*>ent_id)
 
     def __call__(self, Doc doc):
         matches = []

From 50ad50f96acdd110cdbd4662ca9878dc33e74cea Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 26 Sep 2017 13:11:17 +0200
Subject: [PATCH 08/10] Update matcher.pyx

---
 spacy/matcher.pyx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 5106161a0..84414c255 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -436,6 +436,9 @@ cdef class PhraseMatcher:
         abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
         self.matcher.add('Candidate', None, *abstract_patterns)
         self._callbacks = {}
+    
+    def __len__(self):
+        raise NotImplementedError
 
     def __reduce__(self):
         return (self.__class__, (self.vocab,), None, None)

From 7123139b2bce61f21bcab3f10f179ec235d9ae67 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 26 Sep 2017 13:13:27 +0200
Subject: [PATCH 09/10] Add __contains__ to PhraseMatcher

---
 spacy/matcher.pyx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 84414c255..9d7e66835 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -439,6 +439,9 @@ cdef class PhraseMatcher:
 
     def __len__(self):
         raise NotImplementedError
+    
+    def __contains__(self):
+        raise NotImplementedError
 
     def __reduce__(self):
         return (self.__class__, (self.vocab,), None, None)

From 19c7c09bf735c74274cbf6d75a3ca89b248d3865 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 26 Sep 2017 08:35:53 -0500
Subject: [PATCH 10/10] Fix PhraseMatcher.__contains__

---
 spacy/matcher.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 9d7e66835..3bc6f859c 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -436,11 +436,11 @@ cdef class PhraseMatcher:
         abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
         self.matcher.add('Candidate', None, *abstract_patterns)
         self._callbacks = {}
-    
+
     def __len__(self):
         raise NotImplementedError
-    
-    def __contains__(self):
+
+    def __contains__(self, key):
         raise NotImplementedError
 
     def __reduce__(self):
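
The final four patches pin down PhraseMatcher's auxiliary protocol: __reduce__ makes instances picklable by rebuilding from the vocab alone (so added patterns and callbacks are not carried across a pickle round-trip), while __len__ and __contains__ are reserved but still raise. A short illustrative sketch of that behaviour, not part of the series itself:

    import pickle
    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank('en')
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('TECH', None, nlp('machine learning'))

    # __reduce__ reconstructs PhraseMatcher(vocab): the clone starts out empty
    clone = pickle.loads(pickle.dumps(matcher))

    # the container protocol is declared but unimplemented at this point
    try:
        len(matcher)
    except NotImplementedError:
        pass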