"""Example: extract (subject, money) relations from text with spaCy.

Entities and noun chunks are first merged into single tokens.  Every token
whose entity type is ``MONEY`` is then linked, via the dependency parse, to
the token it is about.
"""
import plac

from spacy.en import English
from spacy.parts_of_speech import NOUN
from spacy.parts_of_speech import ADP as PREP


def _span_to_tuple(span):
    """Snapshot the values of *span* that ``merge_spans`` hands to ``doc.merge``.

    Returns a ``(start, end, tag, text, label)`` tuple, where ``start`` is
    the ``idx`` of the span's first token and ``end`` is the ``idx`` of its
    last token plus that token's length.
    """
    first, last = span[0], span[-1]
    start = first.idx
    end = last.idx + len(last)
    return (start, end, span.root.tag_, span.text, span.label_)


def merge_spans(spans, doc):
    """Merge every span in *spans* so that each takes up a single token.

    *spans* may be any iterable of spans over *doc*; it is consumed fully
    before the first merge happens.
    """
    # This is a bit awkward at the moment.  An entity is a Span, and each
    # Span is a view into the doc.  When we merge a span, we invalidate the
    # other spans.  This will get fixed --- but for now the solution is to
    # gather the information first, before merging.
    tuples = [_span_to_tuple(span) for span in spans]
    for span_tuple in tuples:
        doc.merge(*span_tuple)


def extract_currency_relations(doc):
    """Return ``(token, money_token)`` pairs found in *doc*.

    *doc* is modified in place: its entities and then its noun chunks are
    merged into single tokens before the relations are collected.

    Two dependency patterns are recognised for a ``MONEY`` token:

    * it is an ``attr`` or ``dobj``: it is paired with the first ``nsubj``
      among the left children of its head (skipped if there is none);
    * it is a ``pobj`` whose head is a ``prep``: it is paired with the head
      of that preposition.
    """
    merge_spans(doc.ents, doc)
    merge_spans(doc.noun_chunks, doc)

    relations = []
    for money in doc:
        if money.ent_type_ != 'MONEY':
            continue
        if money.dep_ in ('attr', 'dobj'):
            subject = next(
                (w for w in money.head.lefts if w.dep_ == 'nsubj'), None)
            if subject is not None:
                relations.append((subject, money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))

    return relations


def main():
    """Run the extractor over two sample sentences and print the results."""
    nlp = English()
    texts = [
        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
        u'Revenue exceeded twelve billion dollars, with a loss of $1b',
    ]

    for text in texts:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            # Print one pre-formatted string: without print_function,
            # ``print(a, b)`` on Python 2 prints the repr of a tuple.
            print(u'%s %s' % (r1.text, r2.ent_type_))


if __name__ == '__main__':
    plac.call(main)