Prevent overlapping noun chunks for Spanish (#6712)

* Prevent overlapping noun chunks in Spanish noun chunk iterator
* Clean up similar code in Danish noun chunk iterator
This commit is contained in:
Adriane Boyd 2021-01-14 07:33:31 +01:00 committed by GitHub
parent 9957ed7897
commit e649242927
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 16 deletions

View File

@@ -9,12 +9,6 @@ def noun_chunks(doclike):
def is_verb_token(tok): def is_verb_token(tok):
return tok.pos in [VERB, AUX] return tok.pos in [VERB, AUX]
def next_token(tok):
try:
return tok.nbor()
except IndexError:
return None
def get_left_bound(doc, root): def get_left_bound(doc, root):
left_bound = root left_bound = root
for tok in reversed(list(root.lefts)): for tok in reversed(list(root.lefts)):
@@ -67,7 +61,6 @@ def noun_chunks(doclike):
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels] np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels] stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
chunks = []
prev_right = -1 prev_right = -1
for token in doclike: for token in doclike:
if token.pos in [PROPN, NOUN, PRON]: if token.pos in [PROPN, NOUN, PRON]:

View File

@@ -20,27 +20,23 @@ def noun_chunks(doclike):
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels] np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels] np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels] stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
prev_right = -1
for token in doclike: for token in doclike:
if token.pos in [PROPN, NOUN, PRON]: if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds( left, right = noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps doc, token, np_left_deps, np_right_deps, stop_deps
) )
if left.i <= prev_right:
continue
yield left.i, right.i + 1, np_label yield left.i, right.i + 1, np_label
token = right prev_right = right.i
token = next_token(token)
def is_verb_token(token): def is_verb_token(token):
return token.pos in [VERB, AUX] return token.pos in [VERB, AUX]
def next_token(token):
try:
return token.nbor()
except IndexError:
return None
def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps): def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
left_bound = root left_bound = root
for token in reversed(list(root.lefts)): for token in reversed(list(root.lefts)):