Fix overlapping German noun chunks (#6112)

Add a fix similar to the one in #5470 to prevent the German noun chunks
iterator from producing overlapping spans.
This commit is contained in:
Adriane Boyd 2020-09-22 21:52:42 +02:00 committed by GitHub
parent 4625029370
commit 9b4979407d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 5 additions and 0 deletions

View File

@ -38,9 +38,13 @@ def noun_chunks(doclike):
close_app = doc.vocab.strings.add("nk") close_app = doc.vocab.strings.add("nk")
rbracket = 0 rbracket = 0
prev_end = -1
for i, word in enumerate(doclike): for i, word in enumerate(doclike):
if i < rbracket: if i < rbracket:
continue continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
rbracket = word.i + 1 rbracket = word.i + 1
# try to extend the span to the right # try to extend the span to the right
@ -48,6 +52,7 @@ def noun_chunks(doclike):
for rdep in doc[word.i].rights: for rdep in doc[word.i].rights:
if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app: if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
rbracket = rdep.i + 1 rbracket = rdep.i + 1
prev_end = rbracket - 1
yield word.left_edge.i, rbracket, np_label yield word.left_edge.i, rbracket, np_label