Fix overlapping German noun chunks (#6112)

Add a fix similar to the one in #5470 to prevent the German noun chunks iterator from producing overlapping spans.
2020-09-22 21:52:42 +02:00 · 2020-09-22 21:52:42 +02:00 · 9b4979407d
parent 4625029370
commit 9b4979407d
1 changed file with 5 additions and 0 deletions
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -38,9 +38,13 @@ def noun_chunks(doclike):
    close_app = doc.vocab.strings.add("nk")
    rbracket = 0
    prev_end = -1
    for i, word in enumerate(doclike):
        if i < rbracket:
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
            rbracket = word.i + 1
            # try to extend the span to the right
@@ -48,6 +52,7 @@ def noun_chunks(doclike):
            for rdep in doc[word.i].rights:
                if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
                    rbracket = rdep.i + 1
            prev_end = rbracket - 1
            yield word.left_edge.i, rbracket, np_label