examples/information_extraction.py

* Add very simple information extraction snippet.
2015-10-01 22:27:57 +10:00 · 2015-10-01 22:27:57 +10:00 · 262c215b55
parent fd72b8b282
commit 262c215b55
1 changed files with 59 additions and 0 deletions
--- a/examples/information_extraction.py
+++ b/examples/information_extraction.py
@ -0,0 +1,59 @@
+import plac
+
+from spacy.en import English
+from spacy.parts_of_speech import NOUN
+from spacy.parts_of_speech import ADP as PREP
+
+
+def _span_to_tuple(span):
+    """Flatten a span into a ``(start, end, tag, text, label)`` tuple.
+
+    ``start`` is the ``idx`` of the span's first token and ``end`` is the
+    ``idx`` of its last token plus that token's length. ``tag`` is taken from
+    the span's root token; ``text`` and ``label`` come from the span itself.
+    """
+    first_token = span[0]
+    last_token = span[-1]
+    return (
+        first_token.idx,
+        last_token.idx + len(last_token),
+        span.root.tag_,
+        span.text,
+        span.label_,
+    )
+
+def merge_spans(spans, doc):
+    """Merge each span in ``spans`` into ``doc`` via ``doc.merge``.
+
+    The aim is for every span (e.g. an entity) to end up occupying a single
+    token. A span is a view into the doc, so merging one span invalidates the
+    others. Until that is fixed, the workaround is to collect the merge
+    arguments for all spans up front and only then start merging.
+    """
+    merge_args = list(map(_span_to_tuple, spans))
+    for args in merge_args:
+        doc.merge(*args)
+
+
+def extract_currency_relations(doc):
+    """Collect ``(token, money)`` pairs for the MONEY tokens in ``doc``.
+
+    The doc's entities and noun chunks are merged first, which modifies
+    ``doc``. Each token whose ``ent_type_`` is ``'MONEY'`` is then paired
+    according to its dependency label:
+
+    * ``attr`` or ``dobj``: with the first ``nsubj`` among the lefts of its
+      head, when there is one;
+    * ``pobj`` whose head is a ``prep``: with the head of that preposition.
+
+    Returns the list of pairs, in the order the tokens were visited.
+    """
+    merge_spans(doc.ents, doc)
+    merge_spans(doc.noun_chunks, doc)
+
+    pairs = []
+    for token in doc:
+        if token.ent_type_ != 'MONEY':
+            continue
+        role = token.dep_
+        if role in ('attr', 'dobj'):
+            # Only the first matching subject is used; any others are ignored.
+            subjects = [w for w in token.head.lefts if w.dep_ == 'nsubj']
+            if subjects:
+                pairs.append((subjects[0], token))
+        elif role == 'pobj' and token.head.dep_ == 'prep':
+            pairs.append((token.head.head, token))
+
+    return pairs
+
+
+def main():
+    """Run the currency-relation extractor over two sample sentences.
+
+    For every extracted relation, prints the related token's text together
+    with the entity type of the money token.
+    """
+    pipeline = English()
+    samples = (
+        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
+        u'Revenue exceeded twelve billion dollars, with a loss of $1b',
+    )
+
+    for sample in samples:
+        for related, money in extract_currency_relations(pipeline(sample)):
+            print(related.text, money.ent_type_)
+
+
+# Script entry point: when this file is run directly (not imported), hand
+# main to plac.call, which invokes it.
+if __name__ == '__main__':
+    plac.call(main)