From e649242927b7284d435844ca4ee84479f55ba705 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 14 Jan 2021 07:33:31 +0100 Subject: [PATCH] Prevent overlapping noun chunks for Spanish (#6712) * Prevent overlapping noun chunks in Spanish noun chunk iterator * Clean up similar code in Danish noun chunk iterator --- spacy/lang/da/syntax_iterators.py | 7 ------- spacy/lang/es/syntax_iterators.py | 14 +++++--------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/spacy/lang/da/syntax_iterators.py b/spacy/lang/da/syntax_iterators.py index c6b944193..f2bc3ee9b 100644 --- a/spacy/lang/da/syntax_iterators.py +++ b/spacy/lang/da/syntax_iterators.py @@ -9,12 +9,6 @@ def noun_chunks(doclike): def is_verb_token(tok): return tok.pos in [VERB, AUX] - def next_token(tok): - try: - return tok.nbor() - except IndexError: - return None - def get_left_bound(doc, root): left_bound = root for tok in reversed(list(root.lefts)): @@ -67,7 +61,6 @@ def noun_chunks(doclike): np_right_deps = [doc.vocab.strings.add(label) for label in right_labels] stop_deps = [doc.vocab.strings.add(label) for label in stop_labels] - chunks = [] prev_right = -1 for token in doclike: if token.pos in [PROPN, NOUN, PRON]: diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index d4572b682..d67eef2d6 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -20,27 +20,23 @@ def noun_chunks(doclike): np_left_deps = [doc.vocab.strings.add(label) for label in left_labels] np_right_deps = [doc.vocab.strings.add(label) for label in right_labels] stop_deps = [doc.vocab.strings.add(label) for label in stop_labels] + + prev_right = -1 for token in doclike: if token.pos in [PROPN, NOUN, PRON]: left, right = noun_bounds( doc, token, np_left_deps, np_right_deps, stop_deps ) + if left.i <= prev_right: + continue yield left.i, right.i + 1, np_label - token = right - token = next_token(token) + prev_right = right.i def is_verb_token(token): return token.pos in [VERB, AUX] -def next_token(token): - try: - return token.nbor() - except IndexError: - return None - - def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps): left_bound = root for token in reversed(list(root.lefts)):