From daed7ff8fedf8d7bc202ec706eed5d53e70cef77 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 26 Oct 2017 18:46:11 +0200 Subject: [PATCH] Update information extraction examples --- examples/get_parse_subregions.py | 59 ----------------- examples/information_extraction.py | 59 ----------------- .../entity_relations.py | 62 ++++++++++++++++++ .../information_extraction/parse_subtrees.py | 65 +++++++++++++++++++ .../phrase_matcher.py | 0 website/usage/_data.json | 2 +- website/usage/examples.jade | 51 +++++++++------ 7 files changed, 159 insertions(+), 139 deletions(-) delete mode 100644 examples/get_parse_subregions.py delete mode 100644 examples/information_extraction.py create mode 100644 examples/information_extraction/entity_relations.py create mode 100644 examples/information_extraction/parse_subtrees.py rename examples/{ => information_extraction}/phrase_matcher.py (100%) diff --git a/examples/get_parse_subregions.py b/examples/get_parse_subregions.py deleted file mode 100644 index 5eb4f2c77..000000000 --- a/examples/get_parse_subregions.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Issue #252 - -Question: - -In the documents and tutorials the main thing I haven't found is examples on how to break sentences down into small sub thoughts/chunks. The noun_chunks is handy, but having examples on using the token.head to find small (near-complete) sentence chunks would be neat. - -Lets take the example sentence on https://displacy.spacy.io/displacy/index.html - -displaCy uses CSS and JavaScript to show you how computers understand language -This sentence has two main parts (XCOMP & CCOMP) according to the breakdown: - -[displaCy] uses CSS and Javascript [to + show] -& -show you how computers understand [language] -I'm assuming that we can use the token.head to build these groups. In one of your examples you had the following function. 
- -def dependency_labels_to_root(token): - '''Walk up the syntactic tree, collecting the arc labels.''' - dep_labels = [] - while token.head is not token: - dep_labels.append(token.dep) - token = token.head - return dep_labels -""" -from __future__ import print_function, unicode_literals - -# Answer: -# The easiest way is to find the head of the subtree you want, and then use the -# `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` is the -# one that does what you're asking for most directly: - -from spacy.en import English -nlp = English() - -doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language') -for word in doc: - if word.dep_ in ('xcomp', 'ccomp'): - print(''.join(w.text_with_ws for w in word.subtree)) - -# It'd probably be better for `word.subtree` to return a `Span` object instead -# of a generator over the tokens. If you want the `Span` you can get it via the -# `.right_edge` and `.left_edge` properties. The `Span` object is nice because -# you can easily get a vector, merge it, etc. - -doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language') -for word in doc: - if word.dep_ in ('xcomp', 'ccomp'): - subtree_span = doc[word.left_edge.i : word.right_edge.i + 1] - print(subtree_span.text, '|', subtree_span.root.text) - print(subtree_span.similarity(doc)) - print(subtree_span.similarity(subtree_span.root)) - - -# You might also want to select a head, and then select a start and end position by -# walking along its children. You could then take the `.left_edge` and `.right_edge` -# of those tokens, and use it to calculate a span. 
- - - diff --git a/examples/information_extraction.py b/examples/information_extraction.py deleted file mode 100644 index 19e93b499..000000000 --- a/examples/information_extraction.py +++ /dev/null @@ -1,59 +0,0 @@ -import plac - -from spacy.en import English -from spacy.parts_of_speech import NOUN -from spacy.parts_of_speech import ADP as PREP - - -def _span_to_tuple(span): - start = span[0].idx - end = span[-1].idx + len(span[-1]) - tag = span.root.tag_ - text = span.text - label = span.label_ - return (start, end, tag, text, label) - -def merge_spans(spans, doc): - # This is a bit awkward atm. What we're doing here is merging the entities, - # so that each only takes up a single token. But an entity is a Span, and - # each Span is a view into the doc. When we merge a span, we invalidate - # the other spans. This will get fixed --- but for now the solution - # is to gather the information first, before merging. - tuples = [_span_to_tuple(span) for span in spans] - for span_tuple in tuples: - doc.merge(*span_tuple) - - -def extract_currency_relations(doc): - merge_spans(doc.ents, doc) - merge_spans(doc.noun_chunks, doc) - - relations = [] - for money in filter(lambda w: w.ent_type_ == 'MONEY', doc): - if money.dep_ in ('attr', 'dobj'): - subject = [w for w in money.head.lefts if w.dep_ == 'nsubj'] - if subject: - subject = subject[0] - relations.append((subject, money)) - elif money.dep_ == 'pobj' and money.head.dep_ == 'prep': - relations.append((money.head.head, money)) - - return relations - - -def main(): - nlp = English() - texts = [ - u'Net income was $9.4 million compared to the prior year of $2.7 million.', - u'Revenue exceeded twelve billion dollars, with a loss of $1b.', - ] - - for text in texts: - doc = nlp(text) - relations = extract_currency_relations(doc) - for r1, r2 in relations: - print(r1.text, r2.ent_type_, r2.text) - - -if __name__ == '__main__': - plac.call(main) diff --git a/examples/information_extraction/entity_relations.py 
b/examples/information_extraction/entity_relations.py
new file mode 100644
index 000000000..b73dcbf3b
--- /dev/null
+++ b/examples/information_extraction/entity_relations.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+A simple example of extracting relations between phrases and entities using
+spaCy's named entity recognizer and the dependency parse. Here, we extract
+money and currency values (entities labelled as MONEY) and then check the
+dependency tree to find the noun phrase they are referring to – for example:
+$9.4 million --> Net income.
+
+Last updated for: spaCy 2.0.0a18
+"""
+from __future__ import unicode_literals, print_function
+
+import plac
+import spacy
+
+
+TEXTS = [
+    'Net income was $9.4 million compared to the prior year of $2.7 million.',
+    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
+]
+
+
+@plac.annotations(
+    model=("Model to load (needs parser and NER)", "positional", None, str))
+def main(model='en_core_web_sm'):
+    nlp = spacy.load(model)
+    print("Loaded model '%s'" % model)
+    print("Processing %d texts" % len(TEXTS))
+
+    for text in TEXTS:
+        doc = nlp(text)
+        relations = extract_currency_relations(doc)
+        for r1, r2 in relations:
+            print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text))
+
+
+def extract_currency_relations(doc):
+    # merge each entity and noun chunk into one token (py2-compatible concat)
+    for span in list(doc.ents) + list(doc.noun_chunks):
+        span.merge()
+
+    relations = []
+    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
+        if money.dep_ in ('attr', 'dobj'):
+            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
+            if subject:
+                subject = subject[0]
+                relations.append((subject, money))
+        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
+            relations.append((money.head.head, money))
+    return relations
+
+
+if __name__ == '__main__':
+    plac.call(main)
+
+    # Expected output:
+    # Net income MONEY $9.4 million
+    # the prior year MONEY $2.7 million
+    # Revenue MONEY twelve billion
dollars + # a loss MONEY 1b diff --git a/examples/information_extraction/parse_subtrees.py b/examples/information_extraction/parse_subtrees.py new file mode 100644 index 000000000..5963d014c --- /dev/null +++ b/examples/information_extraction/parse_subtrees.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# coding: utf8 +""" +This example shows how to navigate the parse tree including subtrees attached +to a word. + +Based on issue #252: +"In the documents and tutorials the main thing I haven't found is +examples on how to break sentences down into small sub thoughts/chunks. The +noun_chunks is handy, but having examples on using the token.head to find small +(near-complete) sentence chunks would be neat. Lets take the example sentence: +"displaCy uses CSS and JavaScript to show you how computers understand language" + +This sentence has two main parts (XCOMP & CCOMP) according to the breakdown: +[displaCy] uses CSS and Javascript [to + show] +show you how computers understand [language] + +I'm assuming that we can use the token.head to build these groups." + +Last updated for: spaCy 2.0.0a18 +""" +from __future__ import unicode_literals, print_function + +import plac +import spacy + + +@plac.annotations( + model=("Model to load", "positional", None, str)) +def main(model='en_core_web_sm'): + nlp = spacy.load(model) + print("Loaded model '%s'" % model) + + doc = nlp("displaCy uses CSS and JavaScript to show you how computers " + "understand language") + + # The easiest way is to find the head of the subtree you want, and then use + # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` + # is the one that does what you're asking for most directly: + for word in doc: + if word.dep_ in ('xcomp', 'ccomp'): + print(''.join(w.text_with_ws for w in word.subtree)) + + # It'd probably be better for `word.subtree` to return a `Span` object + # instead of a generator over the tokens. 
If you want the `Span` you can + # get it via the `.right_edge` and `.left_edge` properties. The `Span` + # object is nice because you can easily get a vector, merge it, etc. + for word in doc: + if word.dep_ in ('xcomp', 'ccomp'): + subtree_span = doc[word.left_edge.i : word.right_edge.i + 1] + print(subtree_span.text, '|', subtree_span.root.text) + + # You might also want to select a head, and then select a start and end + # position by walking along its children. You could then take the + # `.left_edge` and `.right_edge` of those tokens, and use it to calculate + # a span. + +if __name__ == '__main__': + plac.call(main) + + # Expected output: + # to show you how computers understand language + # how computers understand language + # to show you how computers understand language | show + # how computers understand language | understand diff --git a/examples/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py similarity index 100% rename from examples/phrase_matcher.py rename to examples/information_extraction/phrase_matcher.py diff --git a/website/usage/_data.json b/website/usage/_data.json index cc9918631..c34b5f2b0 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -196,8 +196,8 @@ "teaser": "Full code examples you can modify and run.", "next": "resources", "menu": { + "Information Extraction": "information-extraction", "Pipeline": "pipeline", - "Matching": "matching", "Training": "training", "Deep Learning": "deep-learning" } diff --git a/website/usage/examples.jade b/website/usage/examples.jade index 6641a83c6..74d562e27 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -2,6 +2,37 @@ include ../_includes/_mixins ++section("information-extraction") + +h(3, "phrase-matcher") Using spaCy's phrase matcher + +tag-new(2) + + p + | This example shows how to use the new + | #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find + | entities from a large terminology list. 
+ + +github("spacy", "examples/information_extraction/phrase_matcher.py") + + +h(3, "entity-relations") Extracting entity relations + + p + | A simple example of extracting relations between phrases and + | entities using spaCy's named entity recognizer and the dependency + | parse. Here, we extract money and currency values (entities labelled + | as #[code MONEY]) and then check the dependency tree to find the + | noun phrase they are referring to – for example: "$9.4 million" + | → "Net income". + + +github("spacy", "examples/information_extraction/entity_relations.py") + + +h(3, "subtrees") Navigating the parse tree and subtrees + + p + | This example shows how to navigate the parse tree including subtrees + | attached to a word. + + +github("spacy", "examples/information_extraction/parse_subtrees.py") + +section("pipeline") +h(3, "custom-components-entities") Custom pipeline components and attribute extensions +tag-new(2) @@ -40,26 +71,6 @@ include ../_includes/_mixins +github("spacy", "examples/pipeline/custom_attr_methods.py") -+section("matching") - +h(3, "matcher") Using spaCy's rule-based matcher - - p - | This example shows how to use spaCy's rule-based - | #[+api("matcher") #[code Matcher]] to find and label entities across - | documents. - - +github("spacy", "examples/matcher_example.py") - - +h(3, "phrase-matcher") Using spaCy's phrase matcher - +tag-new(2) - - p - | This example shows how to use the new - | #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find - | entities from a large terminology list. - - +github("spacy", "examples/phrase_matcher.py") - +section("training") +h(3, "training-ner") Training spaCy's Named Entity Recognizer