From 686225eaddd56fac86cb18a3e172d84371ea8be1 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 18 Apr 2018 18:44:01 -0400 Subject: [PATCH] Fix Spanish noun_chunks (resolves #2210) Make sure 'NP' label is added to StringStore and move noun_bounds helper into a closure to allow reusing label sets --- spacy/lang/es/syntax_iterators.py | 38 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index c414897a0..d38bff2a3 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -6,13 +6,30 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX def noun_chunks(obj): doc = obj.doc - np_label = doc.vocab.strings['NP'] - left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed'] + np_label = doc.vocab.strings.add('NP') + left_labels = ['det', 'fixed', 'neg'] # ['nunmod', 'det', 'appos', 'fixed'] right_labels = ['flat', 'fixed', 'compound', 'neg'] stop_labels = ['punct'] np_left_deps = [doc.vocab.strings[label] for label in left_labels] np_right_deps = [doc.vocab.strings[label] for label in right_labels] stop_deps = [doc.vocab.strings[label] for label in stop_labels] + + def noun_bounds(root): + left_bound = root + for token in reversed(list(root.lefts)): + if token.dep in np_left_deps: + left_bound = token + right_bound = root + for token in root.rights: + if (token.dep in np_right_deps): + left, right = noun_bounds(token) + if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps, + doc[left_bound.i: right.i])): + break + else: + right_bound = right + return left_bound, right_bound + token = doc[0] while token and token.i < len(doc): if token.pos in [PROPN, NOUN, PRON]: @@ -33,23 +50,6 @@ def next_token(token): return None -def noun_bounds(root): - left_bound = root - for token in reversed(list(root.lefts)): - if token.dep in np_left_deps: - left_bound = token - right_bound = root - for token in root.rights: - if (token.dep in np_right_deps): - left, right = noun_bounds(token) - if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps, - doc[left_bound.i: right.i])): - break - else: - right_bound = right - return left_bound, right_bound - - SYNTAX_ITERATORS = { 'noun_chunks': noun_chunks }