Update information extraction examples

2017-10-26 18:46:11 +02:00 · 2017-10-26 18:46:11 +02:00 · daed7ff8fe
parent bca5372fb1
commit daed7ff8fe
7 changed files with 159 additions and 139 deletions
--- a/examples/get_parse_subregions.py
+++ b/examples/get_parse_subregions.py
@ -1,59 +0,0 @@
-"""Issue #252
-
-Question:
-
-In the documents and tutorials the main thing I haven't found is examples on how to break sentences down into small sub thoughts/chunks. The noun_chunks is handy, but having examples on using the token.head to find small (near-complete) sentence chunks would be neat.
-
-Lets take the example sentence on https://displacy.spacy.io/displacy/index.html
-
-displaCy uses CSS and JavaScript to show you how computers understand language
-This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:
-
-[displaCy] uses CSS and Javascript [to + show]
-&
-show you how computers understand [language]
-I'm assuming that we can use the token.head to build these groups. In one of your examples you had the following function.
-
-def dependency_labels_to_root(token):
-    '''Walk up the syntactic tree, collecting the arc labels.'''
-    dep_labels = []
-    while token.head is not token:
-        dep_labels.append(token.dep)
-        token = token.head
-    return dep_labels
-"""
-from __future__ import print_function, unicode_literals
-
-# Answer:
-# The easiest way is to find the head of the subtree you want, and then use the
-# `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` is the
-# one that does what you're asking for most directly:
-
-from spacy.en import English
-nlp = English()
-
-doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
-for word in doc:
-    if word.dep_ in ('xcomp', 'ccomp'):
-        print(''.join(w.text_with_ws for w in word.subtree))
-
-# It'd probably be better for `word.subtree` to return a `Span` object instead 
-# of a generator over the tokens. If you want the `Span` you can get it via the 
-# `.right_edge` and `.left_edge` properties. The `Span` object is nice because 
-# you can easily get a vector, merge it, etc.
-
-doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
-for word in doc:
-    if word.dep_ in ('xcomp', 'ccomp'):
-        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
-        print(subtree_span.text, '|', subtree_span.root.text)
-        print(subtree_span.similarity(doc))
-        print(subtree_span.similarity(subtree_span.root))
-
-
-# You might also want to select a head, and then select a start and end position by
-# walking along its children. You could then take the `.left_edge` and `.right_edge`
-# of those tokens, and use it to calculate a span.
-
-
-
--- a/examples/information_extraction.py
+++ b/examples/information_extraction.py
@ -1,59 +0,0 @@
-import plac
-
-from spacy.en import English
-from spacy.parts_of_speech import NOUN
-from spacy.parts_of_speech import ADP as PREP
-
-
-def _span_to_tuple(span):
-    start = span[0].idx
-    end = span[-1].idx + len(span[-1])
-    tag = span.root.tag_
-    text = span.text
-    label = span.label_
-    return (start, end, tag, text, label)
-
-def merge_spans(spans, doc):
-    # This is a bit awkward atm. What we're doing here is merging the entities,
-    # so that each only takes up a single token. But an entity is a Span, and
-    # each Span is a view into the doc. When we merge a span, we invalidate
-    # the other spans. This will get fixed --- but for now the solution
-    # is to gather the information first, before merging.
-    tuples = [_span_to_tuple(span) for span in spans]
-    for span_tuple in tuples:
-        doc.merge(*span_tuple)
-
-
-def extract_currency_relations(doc):
-    merge_spans(doc.ents, doc)
-    merge_spans(doc.noun_chunks, doc)
-
-    relations = []
-    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
-        if money.dep_ in ('attr', 'dobj'):
-            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
-            if subject:
-                subject = subject[0]
-                relations.append((subject, money))
-        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
-            relations.append((money.head.head, money))
- 
-    return relations
-
-
-def main():
-    nlp = English()
-    texts = [
-        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
-        u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
-    ]
-               
-    for text in texts:
-        doc = nlp(text)
-        relations = extract_currency_relations(doc)
-        for r1, r2 in relations:
-            print(r1.text, r2.ent_type_, r2.text)
-
-
-if __name__ == '__main__':
-    plac.call(main)
--- a/examples/information_extraction/entity_relations.py
+++ b/examples/information_extraction/entity_relations.py
@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+A simple example of extracting relations between phrases and entities using
+spaCy's named entity recognizer and the dependency parse. Here, we extract
+money and currency values (entities labelled as MONEY) and then check the
+dependency tree to find the noun phrase they are referring to – for example:
+$9.4 million --> Net income.
+
+Last updated for: spaCy 2.0.0a18
+"""
+from __future__ import unicode_literals, print_function
+
+import plac
+import spacy
+
+
+TEXTS = [
+    'Net income was $9.4 million compared to the prior year of $2.7 million.',
+    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
+]
+
+
+@plac.annotations(
+    model=("Model to load (needs parser and NER)", "positional", None, str))
+def main(model='en_core_web_sm'):
+    nlp = spacy.load(model)
+    print("Loaded model '%s'" % model)
+    print("Processing %d texts" % len(TEXTS))
+
+    for text in TEXTS:
+        doc = nlp(text)
+        relations = extract_currency_relations(doc)
+        for r1, r2 in relations:
+            print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text))
+
+
+def extract_currency_relations(doc):
+    # merge entities and noun chunks into one token (concat works on py2 too)
+    for span in list(doc.ents) + list(doc.noun_chunks):
+        span.merge()
+
+    relations = []
+    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
+        if money.dep_ in ('attr', 'dobj'):
+            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
+            if subject:
+                subject = subject[0]
+                relations.append((subject, money))
+        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
+            relations.append((money.head.head, money))
+    return relations
+
+
+if __name__ == '__main__':
+    plac.call(main)
+
+    # Expected output:
+    # Net income      MONEY   $9.4 million
+    # the prior year  MONEY   $2.7 million
+    # Revenue         MONEY   twelve billion dollars
+    # a loss          MONEY   1b
--- a/examples/information_extraction/parse_subtrees.py
+++ b/examples/information_extraction/parse_subtrees.py
@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+This example shows how to navigate the parse tree including subtrees attached
+to a word.
+
+Based on issue #252:
+"In the documents and tutorials the main thing I haven't found is
+examples on how to break sentences down into small sub thoughts/chunks. The
+noun_chunks is handy, but having examples on using the token.head to find small
+(near-complete) sentence chunks would be neat. Lets take the example sentence:
+"displaCy uses CSS and JavaScript to show you how computers understand language"
+
+This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:
+[displaCy] uses CSS and Javascript [to + show]
+show you how computers understand [language]
+
+I'm assuming that we can use the token.head to build these groups."
+
+Last updated for: spaCy 2.0.0a18
+"""
+from __future__ import unicode_literals, print_function
+
+import plac
+import spacy
+
+
+@plac.annotations(
+    model=("Model to load", "positional", None, str))
+def main(model='en_core_web_sm'):
+    nlp = spacy.load(model)
+    print("Loaded model '%s'" % model)
+
+    doc = nlp("displaCy uses CSS and JavaScript to show you how computers "
+               "understand language")
+
+    # The easiest way is to find the head of the subtree you want, and then use
+    # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
+    # is the one that does what you're asking for most directly:
+    for word in doc:
+        if word.dep_ in ('xcomp', 'ccomp'):
+            print(''.join(w.text_with_ws for w in word.subtree))
+
+    # It'd probably be better for `word.subtree` to return a `Span` object
+    # instead of a generator over the tokens. If you want the `Span` you can
+    # get it via the `.right_edge` and `.left_edge` properties. The `Span`
+    # object is nice because you can easily get a vector, merge it, etc.
+    for word in doc:
+        if word.dep_ in ('xcomp', 'ccomp'):
+            subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
+            print(subtree_span.text, '|', subtree_span.root.text)
+
+    # You might also want to select a head, and then select a start and end
+    # position by walking along its children. You could then take the
+    # `.left_edge` and `.right_edge` of those tokens, and use it to calculate
+    # a span.
+
+if __name__ == '__main__':
+    plac.call(main)
+
+    # Expected output:
+    # to show you how computers understand language
+    # how computers understand language
+    # to show you how computers understand language | show
+    # how computers understand language | understand
--- a/examples/information_extraction/phrase_matcher.py
+++ b/examples/information_extraction/phrase_matcher.py
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@ -196,8 +196,8 @@
        "teaser": "Full code examples you can modify and run.",
        "next": "resources",
        "menu": {
+            "Information Extraction": "information-extraction",
            "Pipeline": "pipeline",
-            "Matching": "matching",
            "Training": "training",
            "Deep Learning": "deep-learning"
        }
--- a/website/usage/examples.jade
+++ b/website/usage/examples.jade
@ -2,6 +2,37 @@

 include ../_includes/_mixins

++section("information-extraction")
+    +h(3, "phrase-matcher") Using spaCy's phrase matcher
+        +tag-new(2)
+
+    p
+        |  This example shows how to use the new
+        |  #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
+        |  entities from a large terminology list.
+
+    +github("spacy", "examples/information_extraction/phrase_matcher.py")
+
+    +h(3, "entity-relations") Extracting entity relations
+
+    p
+        |  A simple example of extracting relations between phrases and
+        |  entities using spaCy's named entity recognizer and the dependency
+        |  parse. Here, we extract money and currency values (entities labelled
+        |  as #[code MONEY]) and then check the dependency tree to find the
+        |  noun phrase they are referring to – for example: "$9.4 million"
+        |  &rarr; "Net income".
+
+    +github("spacy", "examples/information_extraction/entity_relations.py")
+
+    +h(3, "subtrees") Navigating the parse tree and subtrees
+
+    p
+        |  This example shows how to navigate the parse tree including subtrees
+        |  attached to a word.
+
+    +github("spacy", "examples/information_extraction/parse_subtrees.py")
+
 +section("pipeline")
    +h(3, "custom-components-entities") Custom pipeline components and attribute extensions
        +tag-new(2)
@ -40,26 +71,6 @@ include ../_includes/_mixins

    +github("spacy", "examples/pipeline/custom_attr_methods.py")

-+section("matching")
-    +h(3, "matcher") Using spaCy's rule-based matcher
-
-    p
-        |  This example shows how to use spaCy's rule-based
-        |  #[+api("matcher") #[code Matcher]] to find and label entities across
-        |  documents.
-
-    +github("spacy", "examples/matcher_example.py")
-
-    +h(3, "phrase-matcher") Using spaCy's phrase matcher
-        +tag-new(2)
-
-    p
-        |  This example shows how to use the new
-        |  #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
-        |  entities from a large terminology list.
-
-    +github("spacy", "examples/phrase_matcher.py")
-
 +section("training")
    +h(3, "training-ner") Training spaCy's Named Entity Recognizer