spaCy/spacy/lang/nl/syntax_iterators.py

from typing import Union, Iterator, Tuple

from ...symbols import NOUN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse. Works on Doc and Span.
    The definition is inspired by https://www.nltk.org/book/ch07.html
    Consider : [Noun + determinant / adjective] and also [Pronoun]
    """
    # fmt: off
    # labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.

    # Check for dependencies: POS, DEP
    if not doc.has_annotation("POS"):
        raise ValueError(Errors.E1019)
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    # See UD tags: https://universaldependencies.org/u/dep/index.html
    # amod = adjectival modifier
    # nmod:poss = possessive nominal modifier
    # nummod = numeric modifier
    # det = determiner
    # det:poss = possessive determiner
    noun_deps = [
        doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"]
    ]

    # nsubj = nominal subject
    # nsubj:pass = passive nominal subject
    pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]]

    # Label NP for the Span to identify it as Noun-Phrase
    span_label = doc.vocab.strings.add("NP")

    # Only NOUNS and PRONOUNS matter
    end_span = -1
    for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
        # For NOUNS
        # Pick children from syntactic parse (only those with certain dependencies)
        if word.pos == NOUN:
            # Some debugging. It happens that VERBS are POS-TAGGED as NOUNS
            # We check if the word has a "nsubj", if it's the case, we eliminate it
            nsubjs = filter(
                lambda x: x.dep == doc.vocab.strings["nsubj"], word.children
            )
            next_word = next(nsubjs, None)
            if next_word is not None:
                # We found some nsubj, so we skip this word. Otherwise, consider it a normal NOUN
                continue

            children = filter(lambda x: x.dep in noun_deps, word.children)
            children_i = [c.i for c in children] + [word.i]

            start_span = min(children_i)
            if start_span >= end_span:
                end_span = max(children_i) + 1
                yield start_span, end_span, span_label

        # PRONOUNS only if it is the subject of a verb
        elif word.pos == PRON:
            if word.dep in pronoun_deps:
                start_span = word.i
                if start_span >= end_span:
                    end_span = word.i + 1
                    yield start_span, end_span, span_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}