Allow German noun chunks to work on Span

Update the German noun chunks iterator so that it also works on Span objects.
This commit is contained in:
Matthew Honnibal 2016-11-24 23:30:15 +11:00 committed by GitHub
parent 3e3bda142d
commit b8c4f5ea76
1 changed file with 6 additions and 3 deletions

View File

@ -2,9 +2,11 @@ from spacy.parts_of_speech cimport NOUN, PROPN, PRON
def english_noun_chunks(obj): def english_noun_chunks(obj):
'''Detect base noun phrases from a dependency parse.
Works on both Doc and Span.'''
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT', 'root'] 'attr', 'ROOT', 'root']
doc = obj.doc doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings['conj'] conj = doc.vocab.strings['conj']
np_label = doc.vocab.strings['NP'] np_label = doc.vocab.strings['NP']
@ -26,14 +28,15 @@ def english_noun_chunks(obj):
# extended to the right of the NOUN # extended to the right of the NOUN
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
# just "eine Tasse", same for "das Thema Familie" # just "eine Tasse", same for "das Thema Familie"
def german_noun_chunks(doc): def german_noun_chunks(obj):
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app'] labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings['NP'] np_label = doc.vocab.strings['NP']
np_deps = set(doc.vocab.strings[label] for label in labels) np_deps = set(doc.vocab.strings[label] for label in labels)
close_app = doc.vocab.strings['nk'] close_app = doc.vocab.strings['nk']
rbracket = 0 rbracket = 0
for i, word in enumerate(doc): for i, word in enumerate(obj):
if i < rbracket: if i < rbracket:
continue continue
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: