Merge branch 'master' of ssh://github.com/explosion/spaCy

2016-10-23 14:33:54 +02:00 · 2016-10-23 14:33:54 +02:00 · 7638f439e5
parent 2989072aac 30ebb84e73
commit 7638f439e5
2 changed files with 19 additions and 1 deletions
--- a/README.rst
+++ b/README.rst
@ -179,6 +179,11 @@ Install a version of Visual Studio Express or higher that matches the version
 that was used to compile your Python interpreter. For official distributions 
 these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and VS 2015 (Python 3.5).

+If you don't want to install the full Visual Studio, you can install a
+stand-alone compiler instead. Make sure that you install the version that
+matches your version of Python. See
+https://wiki.python.org/moin/WindowsCompilers for download links.
+
 Run tests
 =========

--- a/website/docs/tutorials/rule-based-matcher.jade
+++ b/website/docs/tutorials/rule-based-matcher.jade
@ -9,6 +9,18 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens.

    nlp = spacy.load('en', parser=False, entity=False)

+    def merge_phrases(matcher, doc, i, matches):
+        '''
+        Merge a phrase. We have to be careful here, because merging changes the token indices.
+        To avoid problems, we merge all the phrases at once, when we're called on the last match.
+        '''
+        if i != len(matches)-1:
+            return None
+        # Get Span objects
+        spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
+        for ent_id, label, span in spans:
+            span.merge(label=label, tag='NNP' if label else span.root.tag_)
+
    matcher = Matcher(nlp.vocab)

    matcher.add_entity(
@ -17,6 +29,7 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens.
        acceptor=None, # Accept or modify the match
        on_match=merge_phrases # Callback to act on the matches
    )
+
    matcher.add_pattern(
        "GoogleNow", # Entity ID -- Created if doesn't exist.
        [ # The pattern is a list of *Token Specifiers*.
@ -32,7 +45,7 @@ p.u-text-large spaCy features a rule-matching engine that operates over tokens.
    doc = nlp(u"I prefer Siri to Google Now.")
    matches = matcher(doc)
    for ent_id, label, start, end in matches:
-        print(nlp.strings[ent_id], nlp.strings[label], doc[start : end].text)
+        print(nlp.vocab.strings[ent_id], nlp.vocab.strings[label], doc[start : end].text)
        entity = matcher.get_entity(ent_id)
        print(entity)