From 1a05078c79d90e93333df6ccc72e0235b57fe159 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 17 May 2017 11:37:48 +0200
Subject: [PATCH] Add language-specific syntax iterators to en and de

---
 spacy/lang/de/__init__.py         |  2 ++
 spacy/lang/de/syntax_iterators.py | 38 +++++++++++++++++++++++++++
 spacy/lang/en/__init__.py         |  2 ++
 spacy/lang/en/syntax_iterators.py | 43 +++++++++++++++++++++++++++++++
 4 files changed, 85 insertions(+)
 create mode 100644 spacy/lang/de/syntax_iterators.py
 create mode 100644 spacy/lang/en/syntax_iterators.py

diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 2da572500..7a44b7485 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
+from .syntax_iterators import SYNTAX_ITERATORS
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
@@ -23,6 +24,7 @@ class German(Language):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
+    syntax_iterators = dict(SYNTAX_ITERATORS)
 
     @classmethod
     def create_lemmatizer(cls, nlp=None):
diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
new file mode 100644
index 000000000..ab750989e
--- /dev/null
+++ b/spacy/lang/de/syntax_iterators.py
@@ -0,0 +1,38 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    # This iterator extracts spans headed by NOUNs, starting from the left-most
+    # syntactic dependent until the NOUN itself. For close apposition and
+    # measurement constructions, the span is sometimes extended to the right of
+    # the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
+    # and not just "eine Tasse", same for "das Thema Familie".
+    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_label = doc.vocab.strings['NP']
+    np_deps = set(doc.vocab.strings[label] for label in labels)
+    close_app = doc.vocab.strings['nk']
+
+    rbracket = 0
+    for i, word in enumerate(obj):
+        if i < rbracket:
+            continue
+        if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
+            rbracket = word.i+1
+            # try to extend the span to the right
+            # to capture close apposition/measurement constructions
+            for rdep in doc[word.i].rights:
+                if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
+                    rbracket = rdep.i+1
+            yield word.left_edge.i, rbracket, np_label
+
+
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 6430445a3..2d5314991 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -7,6 +7,7 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .morph_rules import MORPH_RULES
 from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
+from .syntax_iterators import SYNTAX_ITERATORS
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
@@ -29,6 +30,7 @@ class English(Language):
     lemma_rules = dict(LEMMA_RULES)
     lemma_index = dict(LEMMA_INDEX)
     lemma_exc = dict(LEMMA_EXC)
+    syntax_iterators = dict(SYNTAX_ITERATORS)
 
 
 __all__ = ['English']
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
new file mode 100644
index 000000000..dec240669
--- /dev/null
+++ b/spacy/lang/en/syntax_iterators.py
@@ -0,0 +1,43 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
+              'attr', 'ROOT']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings['conj']
+    np_label = doc.vocab.strings['NP']
+    seen = set()
+    for i, word in enumerate(obj):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.i+1))
+            yield word.left_edge.i, word.i+1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.i+1))
+                yield word.left_edge.i, word.i+1, np_label
+
+
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
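
Editor's note: the German iterator's right-extension is the part that differs from the English one, and it is easiest to see on the example from its own comment. A minimal sketch, assuming a German model with a dependency parser is installed and loadable via the 'de' shortcut (the shortcut and example sentence are assumptions, not part of this patch):

    import spacy

    nlp = spacy.load('de')
    doc = nlp(u'Ich trinke eine Tasse Tee.')
    # Per the comment in de/syntax_iterators.py, the close-apposition
    # ('nk') right-dependent extends the chunk's right bracket, so this
    # should print 'eine Tasse Tee' rather than stopping at 'eine Tasse'.
    for chunk in doc.noun_chunks:
        print(chunk.text)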
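As for how the hook surfaces to users: Doc.noun_chunks is driven by the noun_chunks generator registered under SYNTAX_ITERATORS, and the English iterator's conj branch walks a conjunct chain back to a head with an NP dependency, so coordinated nouns come out as separate chunks. Another minimal sketch, again assuming a parser-enabled model under the 'en' shortcut:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'The cat and the dog sleep on the sofa.')
    # 'the dog' attaches to 'the cat' as conj; the conj branch follows
    # the chain back to the nsubj head, so both 'The cat' and 'the dog'
    # should be yielded as chunks, alongside 'the sofa' (pobj).
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.root.dep_)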