mirror of https://github.com/explosion/spaCy.git
86 lines
3.1 KiB
Python
86 lines
3.1 KiB
Python
from typing import Iterator, Tuple, Union
|
|
|
|
from ...errors import Errors
|
|
from ...symbols import NOUN, PRON, PROPN
|
|
from ...tokens import Doc, Span
|
|
|
|
|
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
|
"""
|
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
|
"""
|
|
labels = [
|
|
"nsubj",
|
|
"nsubj:pass",
|
|
"obj",
|
|
"obl",
|
|
"obl:agent",
|
|
"obl:arg",
|
|
"obl:mod",
|
|
"nmod",
|
|
"pcomp",
|
|
"appos",
|
|
"ROOT",
|
|
]
|
|
post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
|
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
|
if not doc.has_annotation("DEP"):
|
|
raise ValueError(Errors.E029)
|
|
np_deps = {doc.vocab.strings.add(label) for label in labels}
|
|
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
|
|
np_label = doc.vocab.strings.add("NP")
|
|
adj_label = doc.vocab.strings.add("amod")
|
|
det_label = doc.vocab.strings.add("det")
|
|
det_pos = doc.vocab.strings.add("DET")
|
|
adp_pos = doc.vocab.strings.add("ADP")
|
|
conj_label = doc.vocab.strings.add("conj")
|
|
conj_pos = doc.vocab.strings.add("CCONJ")
|
|
prev_end = -1
|
|
for i, word in enumerate(doclike):
|
|
if word.pos not in (NOUN, PROPN, PRON):
|
|
continue
|
|
# Prevent nested chunks from being produced
|
|
if word.left_edge.i <= prev_end:
|
|
continue
|
|
if word.dep in np_deps:
|
|
right_childs = list(word.rights)
|
|
right_child = right_childs[0] if right_childs else None
|
|
|
|
if right_child:
|
|
if (
|
|
right_child.dep == adj_label
|
|
): # allow chain of adjectives by expanding to right
|
|
right_end = right_child.right_edge
|
|
elif (
|
|
right_child.dep == det_label and right_child.pos == det_pos
|
|
): # cut relative pronouns here
|
|
right_end = right_child
|
|
elif right_child.dep in np_modifs: # Check if we can expand to right
|
|
right_end = word.right_edge
|
|
else:
|
|
right_end = word
|
|
else:
|
|
right_end = word
|
|
prev_end = right_end.i
|
|
|
|
left_index = word.left_edge.i
|
|
left_index = left_index + 1 if word.left_edge.pos == adp_pos else left_index
|
|
|
|
yield left_index, right_end.i + 1, np_label
|
|
elif word.dep == conj_label:
|
|
head = word.head
|
|
while head.dep == conj_label and head.head.i < head.i:
|
|
head = head.head
|
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
|
if head.dep in np_deps:
|
|
prev_end = word.i
|
|
|
|
left_index = word.left_edge.i # eliminate left attached conjunction
|
|
left_index = (
|
|
left_index + 1 if word.left_edge.pos == conj_pos else left_index
|
|
)
|
|
yield left_index, word.i + 1, np_label
|
|
|
|
|
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|