Prevent overlapping noun chunks for Spanish (#6712)

* Prevent overlapping noun chunks in Spanish noun chunk iterator
* Clean up similar code in Danish noun chunk iterator
2021-01-14 07:33:31 +01:00 · 2021-01-14 07:33:31 +01:00 · e649242927
parent 9957ed7897
commit e649242927
2 changed files with 5 additions and 16 deletions
--- a/spacy/lang/da/syntax_iterators.py
+++ b/spacy/lang/da/syntax_iterators.py
@@ -9,12 +9,6 @@ def noun_chunks(doclike):
    def is_verb_token(tok):
        return tok.pos in [VERB, AUX]

-    def next_token(tok):
-        try:
-            return tok.nbor()
-        except IndexError:
-            return None
-
    def get_left_bound(doc, root):
        left_bound = root
        for tok in reversed(list(root.lefts)):
@@ -67,7 +61,6 @@ def noun_chunks(doclike):
    np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
    stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]

-    chunks = []
    prev_right = -1
    for token in doclike:
        if token.pos in [PROPN, NOUN, PRON]:
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -20,27 +20,23 @@ def noun_chunks(doclike):
    np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
    np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
    stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
+
+    prev_right = -1
    for token in doclike:
        if token.pos in [PROPN, NOUN, PRON]:
            left, right = noun_bounds(
                doc, token, np_left_deps, np_right_deps, stop_deps
            )
+            if left.i <= prev_right:
+                continue
            yield left.i, right.i + 1, np_label
-            token = right
-        token = next_token(token)
+            prev_right = right.i


 def is_verb_token(token):
    return token.pos in [VERB, AUX]


-def next_token(token):
-    try:
-        return token.nbor()
-    except IndexError:
-        return None
-
-
 def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
    left_bound = root
    for token in reversed(list(root.lefts)):