spaCy/spacy/lang/nl/syntax_iterators.py

76 lines
2.8 KiB
Python

from typing import Iterator, Tuple, Union
from ...errors import Errors
from ...symbols import NOUN, PRON
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on Doc and Span.
The definition is inspired by https://www.nltk.org/book/ch07.html
Consider : [Noun + determinant / adjective] and also [Pronoun]
"""
# fmt: off
# labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
# Check for dependencies: POS, DEP
if not doc.has_annotation("POS"):
raise ValueError(Errors.E1019)
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
# See UD tags: https://universaldependencies.org/u/dep/index.html
# amod = adjectival modifier
# nmod:poss = possessive nominal modifier
# nummod = numeric modifier
# det = determiner
# det:poss = possessive determiner
noun_deps = [
doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"]
]
# nsubj = nominal subject
# nsubj:pass = passive nominal subject
pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]]
# Label NP for the Span to identify it as Noun-Phrase
span_label = doc.vocab.strings.add("NP")
# Only NOUNS and PRONOUNS matter
end_span = -1
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
# For NOUNS
# Pick children from syntactic parse (only those with certain dependencies)
if word.pos == NOUN:
# Some debugging. It happens that VERBS are POS-TAGGED as NOUNS
# We check if the word has a "nsubj", if it's the case, we eliminate it
nsubjs = filter(
lambda x: x.dep == doc.vocab.strings["nsubj"], word.children
)
next_word = next(nsubjs, None)
if next_word is not None:
# We found some nsubj, so we skip this word. Otherwise, consider it a normal NOUN
continue
children = filter(lambda x: x.dep in noun_deps, word.children)
children_i = [c.i for c in children] + [word.i]
start_span = min(children_i)
if start_span >= end_span:
end_span = max(children_i) + 1
yield start_span, end_span, span_label
# PRONOUNS only if it is the subject of a verb
elif word.pos == PRON:
if word.dep in pronoun_deps:
start_span = word.i
if start_span >= end_span:
end_span = word.i + 1
yield start_span, end_span, span_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}