2015-10-01 12:27:57 +00:00
|
|
|
import plac
|
|
|
|
|
|
|
|
from spacy.en import English
|
|
|
|
from spacy.parts_of_speech import NOUN
|
|
|
|
from spacy.parts_of_speech import ADP as PREP
|
|
|
|
|
|
|
|
|
|
|
|
def _span_to_tuple(span):
|
|
|
|
start = span[0].idx
|
|
|
|
end = span[-1].idx + len(span[-1])
|
|
|
|
tag = span.root.tag_
|
|
|
|
text = span.text
|
|
|
|
label = span.label_
|
|
|
|
return (start, end, tag, text, label)
|
|
|
|
|
|
|
|
def merge_spans(spans, doc):
|
|
|
|
# This is a bit awkward atm. What we're doing here is merging the entities,
|
|
|
|
# so that each only takes up a single token. But an entity is a Span, and
|
|
|
|
# each Span is a view into the doc. When we merge a span, we invalidate
|
|
|
|
# the other spans. This will get fixed --- but for now the solution
|
|
|
|
# is to gather the information first, before merging.
|
|
|
|
tuples = [_span_to_tuple(span) for span in spans]
|
|
|
|
for span_tuple in tuples:
|
|
|
|
doc.merge(*span_tuple)
|
|
|
|
|
|
|
|
|
|
|
|
def extract_currency_relations(doc):
|
|
|
|
merge_spans(doc.ents, doc)
|
|
|
|
merge_spans(doc.noun_chunks, doc)
|
|
|
|
|
|
|
|
relations = []
|
|
|
|
for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
|
|
|
|
if money.dep_ in ('attr', 'dobj'):
|
|
|
|
subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
|
|
|
|
if subject:
|
|
|
|
subject = subject[0]
|
|
|
|
relations.append((subject, money))
|
|
|
|
elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
|
|
|
|
relations.append((money.head.head, money))
|
|
|
|
|
|
|
|
return relations
|
|
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
nlp = English()
|
|
|
|
texts = [
|
|
|
|
u'Net income was $9.4 million compared to the prior year of $2.7 million.',
|
2015-10-05 23:35:49 +00:00
|
|
|
u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
|
2015-10-01 12:27:57 +00:00
|
|
|
]
|
|
|
|
|
|
|
|
for text in texts:
|
|
|
|
doc = nlp(text)
|
|
|
|
relations = extract_currency_relations(doc)
|
|
|
|
for r1, r2 in relations:
|
2015-10-05 23:35:49 +00:00
|
|
|
print(r1.text, r2.ent_type_, r2.text)
|
2015-10-01 12:27:57 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
plac.call(main)
|