spaCy/spacy/tokens/printers.py

76 lines
2.7 KiB
Python

# coding: utf8
from __future__ import unicode_literals
from .doc import Doc
from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
def merge_ents(doc):
"""Helper: merge adjacent entities into single tokens; modifies the doc."""
for ent in doc.ents:
ent.merge(tag=ent.root.tag_, lemma=ent.text, ent_type=ent.label_)
return doc
def format_POS(token, light, flat):
"""Helper: form the POS output for a token."""
subtree = dict([
("word", token.text),
("lemma", token.lemma_), # trigger
("NE", token.ent_type_), # trigger
("POS_fine", token.tag_),
("POS_coarse", token.pos_),
("arc", token.dep_),
("modifiers", [])
])
if light:
subtree.pop("lemma")
subtree.pop("NE")
if flat:
subtree.pop("arc")
subtree.pop("modifiers")
return subtree
def POS_tree(root, light=False, flat=False):
"""Helper: generate a POS tree for a root token. The doc must have
`merge_ents(doc)` ran on it.
"""
subtree = format_POS(root, light=light, flat=flat)
if not flat:
for c in root.children:
subtree["modifiers"].append(POS_tree(c))
return subtree
def parse_tree(doc, light=False, flat=False):
"""Make a copy of the doc and construct a syntactic parse tree similar to
displaCy. Generates the POS tree for all sentences in a doc.
doc (Doc): The doc for parsing.
RETURNS (dict): The parse tree.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
merge_ents(doc_clone) # merge the entities into single tokens first
return [POS_tree(sent.root, light=light, flat=flat)
for sent in doc_clone.sents]