diff --git a/setup.py b/setup.py
index 176434151..de7d95d22 100644
--- a/setup.py
+++ b/setup.py
@@ -56,6 +56,7 @@ MOD_NAMES = [
     'spacy.tokens.doc',
     'spacy.tokens.span',
     'spacy.tokens.token',
+    'spacy.tokens.npchunks',
     'spacy.serialize.packer',
     'spacy.serialize.huffman',
     'spacy.serialize.bits',
@@ -184,3 +185,4 @@ def setup_package():

 if __name__ == '__main__':
     setup_package()
+
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 26088be0c..fa45c8b3e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -23,6 +23,7 @@ from .token cimport Token
 from ..serialize.bits cimport BitArray

 from ..util import normalize_slice
+from . import npchunks

 DEF PADDING = 5
@@ -239,24 +240,15 @@ cdef class Doc:
                 "requires data to be installed. If you haven't done so, run: "
                 "\npython -m spacy.en.download all\n"
                 "to install the data")
-
-        cdef const TokenC* word
-        labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
-                  'attr', 'root']
-        np_deps = [self.vocab.strings[label] for label in labels]
-        conj = self.vocab.strings['conj']
-        np_label = self.vocab.strings['NP']
-        for i in range(self.length):
-            word = &self.c[i]
-            if word.pos == NOUN and word.dep in np_deps:
-                yield Span(self, word.l_edge, i+1, label=np_label)
-            elif word.pos == NOUN and word.dep == conj:
-                head = word + word.head
-                while head.dep == conj and head.head < 0:
-                    head += head.head
-                # If the head is an NP, and we're coordinated to it, we're an NP
-                if head.dep in np_deps:
-                    yield Span(self, word.l_edge, i+1, label=np_label)
+
+        chunk_rules = {'en': npchunks.english, 'de': npchunks.german}
+
+        for sent in self.sents:
+            lang = 'en'  # TODO: make dependent on the language of the root token
+            for chunk in chunk_rules.get(lang)(sent):
+                yield chunk
+
+
     @property
     def sents(self):
diff --git a/spacy/tokens/npchunks.pxd b/spacy/tokens/npchunks.pxd
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tokens/npchunks.pyx b/spacy/tokens/npchunks.pyx
new file mode 100644
index 000000000..0c5ca32a5
--- /dev/null
+++ b/spacy/tokens/npchunks.pyx
@@ -0,0 +1,54 @@
+
+from ..structs cimport TokenC
+from .doc cimport Doc
+from .span cimport Span
+
+from ..parts_of_speech cimport NOUN, PROPN, PRON
+
+def english(Span sent):
+    cdef const TokenC* word
+    strings = sent.doc.vocab.strings
+    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+    np_deps = [strings[label] for label in labels]
+    conj = strings['conj']
+    np_label = strings['NP']
+    for i in range(sent.start, sent.end):
+        word = &sent.doc.c[i]
+        if word.pos == NOUN and word.dep in np_deps:
+            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+        elif word.pos == NOUN and word.dep == conj:
+            head = word + word.head
+            while head.dep == conj and head.head < 0:
+                head += head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+
+
+def german(Span sent):
+    # This function extracts spans headed by a NOUN, from the left-most
+    # syntactic dependent up to the NOUN itself. For close apposition and
+    # measurement constructions, the span is sometimes extended to the
+    # right of the NOUN.
+    # Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee",
+    # not just "eine Tasse"; the same holds for "das Thema Familie".
+    cdef const TokenC* word
+    strings = sent.doc.vocab.strings
+    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app']
+    close_app = strings['nk']
+    np_deps = [strings[label] for label in labels]
+    np_label = strings['NP']
+    for i in range(sent.start, sent.end):
+        word = &sent.doc.c[i]
+        if word.pos == NOUN and word.dep in np_deps:
+            rbracket = i+1
+            # try to extend the span to the right
+            # to capture close apposition/measurement constructions
+            for rdep in sent.doc[i].rights:
+                if rdep.pos == NOUN and rdep.dep == close_app:
+                    rbracket = rdep.i+1
+            yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
+
+
+
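
---

Usage sketch for trying the patch (not part of the diff): iterating
Doc.noun_chunks after rebuilding. This assumes the pre-1.0 English
pipeline referenced by the error message in doc.pyx
(python -m spacy.en.download all); the sentence is made up, and the
dispatch still runs the English rules only, per the TODO above.

    from spacy.en import English

    nlp = English()
    doc = nlp(u'The quick brown fox jumped over the lazy dog.')
    for chunk in doc.noun_chunks:
        # each chunk is a Span labelled 'NP', yielded by npchunks.english
        print(chunk.text, chunk.label_)
    # expected: 'The quick brown fox' (nsubj) and 'the lazy dog' (pobj)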
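The close-apposition extension in german() is easiest to see on the example
from its own comments. A hypothetical session, assuming some German pipeline
nlp_de (none ships with this patch, and the 'de' branch is unreachable until
the TODO in doc.pyx is resolved):

    doc = nlp_de(u'Ich trinke eine Tasse Tee.')
    for chunk in doc.noun_chunks:
        print(chunk.text)
    # expected: 'eine Tasse Tee' -- 'Tee' attaches to 'Tasse' as a close
    # apposition ('nk'), so rbracket is extended past the head NOUN;
    # 'Ich' is PRON, and only NOUN-headed spans are yielded.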