From 3e3bda142d28d2b7b983869f839a770b8d48877c Mon Sep 17 00:00:00 2001 From: Pokey Rule Date: Thu, 24 Nov 2016 10:47:20 +0000 Subject: [PATCH 1/2] Add noun_chunks to Span --- spacy/syntax/iterators.pyx | 5 +++-- spacy/tokens/doc.pyx | 4 ++++ spacy/tokens/span.pyx | 25 +++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index aeb4e635c..f8951d039 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -1,13 +1,14 @@ from spacy.parts_of_speech cimport NOUN, PROPN, PRON -def english_noun_chunks(doc): +def english_noun_chunks(obj): labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'ROOT', 'root'] + doc = obj.doc np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings['conj'] np_label = doc.vocab.strings['NP'] - for i, word in enumerate(doc): + for i, word in enumerate(obj): if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: yield word.left_edge.i, word.i+1, np_label elif word.pos == NOUN and word.dep == conj: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3d09b7ad0..8ce2c7fe4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -223,6 +223,10 @@ cdef class Doc: def __repr__(self): return self.__str__() + @property + def doc(self): + return self + def similarity(self, other): '''Make a semantic similarity estimate. The default estimate is cosine similarity using an average of word vectors. diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index e645c1a6f..a4f49555a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -190,6 +190,31 @@ cdef class Span: def __get__(self): return u''.join([t.text_with_ws for t in self]) + property noun_chunks: + ''' + Yields base noun-phrase #[code Span] objects, if the document + has been syntactically parsed. 
A base noun phrase, or + 'NP chunk', is a noun phrase that does not permit other NPs to + be nested within it – so no NP-level coordination, no prepositional + phrases, and no relative clauses. For example: + ''' + def __get__(self): + if not self.doc.is_parsed: + raise ValueError( + "noun_chunks requires the dependency parse, which " + "requires data to be installed. If you haven't done so, run: " + "\npython -m spacy.%s.download all\n" + "to install the data" % self.vocab.lang) + # Accumulate the result before beginning to iterate over it. This prevents + # the tokenisation from being changed out from under us during the iteration. + # The tricky thing here is that Span accepts its tokenisation changing, + # so it's okay once we have the Span objects. See Issue #375 + spans = [] + for start, end, label in self.doc.noun_chunks_iterator(self): + spans.append(Span(self, start, end, label=label)) + for span in spans: + yield span + property root: """The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered. From b8c4f5ea768126e46b138cc8d3e0c930fb6a5aba Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 24 Nov 2016 23:30:15 +1100 Subject: [PATCH 2/2] Allow German noun chunks to work on Span Update the German noun chunks iterator, so that it also works on Span objects. --- spacy/syntax/iterators.pyx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index f8951d039..ee5e818c1 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -2,9 +2,11 @@ from spacy.parts_of_speech cimport NOUN, PROPN, PRON def english_noun_chunks(obj): + '''Detect base noun phrases from a dependency parse. + Works on both Doc and Span.''' labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'ROOT', 'root'] - doc = obj.doc + doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings['conj'] np_label = doc.vocab.strings['NP'] @@ -26,14 +28,15 @@ def english_noun_chunks(obj): # extended to the right of the NOUN # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not # just "eine Tasse", same for "das Thema Familie" -def german_noun_chunks(doc): +def german_noun_chunks(obj): labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app'] + doc = obj.doc # Ensure works on both Doc and Span. np_label = doc.vocab.strings['NP'] np_deps = set(doc.vocab.strings[label] for label in labels) close_app = doc.vocab.strings['nk'] rbracket = 0 - for i, word in enumerate(doc): + for i, word in enumerate(obj): if i < rbracket: continue if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: