diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 27d99a045..4e81b8c24 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -11,6 +11,7 @@ from .typedefs cimport LEMMA
 from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA, TAG, DEP
 from .parts_of_speech import UNIV_POS_NAMES
+from .parts_of_speech cimport CONJ, PUNCT
 from .lexeme cimport check_flag
 from .spans import Span
 from .structs cimport UniStr
@@ -538,6 +539,41 @@ cdef class Token:
             return Token.cinit(self.vocab, self._string,
                                self.c + self.c.head, self.i + self.c.head, self.array_len,
                                self._seq)
+
+    property conjuncts:
+        def __get__(self):
+            """Get a list of conjoined words.
+
+            Walks ``self.lefts`` in reverse. Once a CONJ-tagged word has
+            been passed, each further word whose part-of-speech equals this
+            token's is collected. If a CONJ was seen, returns the collected
+            words in ``self.lefts`` order, followed by this token.
+            Otherwise, if this token is listed in its head's conjuncts,
+            returns the head's list. Returns an empty list in every other
+            case, including when this token is itself CONJ or PUNCT.
+            """
+            cdef Token word
+            # CONJ and PUNCT tokens never appear in any conjunct list, so
+            # answer early; this also keeps seen_conj from being read unbound.
+            if self.c.pos == CONJ or self.c.pos == PUNCT:
+                return []
+            conjs = []
+            seen_conj = False
+            for word in reversed(list(self.lefts)):
+                if word.c.pos == CONJ:
+                    seen_conj = True
+                elif seen_conj and word.c.pos == self.c.pos:
+                    conjs.append(word)
+            if seen_conj:
+                conjs.reverse()
+                conjs.append(self)
+                return conjs
+            if self is not self.head:
+                # The property recurses up the tree, so evaluate it only once.
+                head_conjs = self.head.conjuncts
+                if self in head_conjs:
+                    return head_conjs
+            return []
 
     property ent_type:
         def __get__(self):
diff --git a/tests/test_conjuncts.py b/tests/test_conjuncts.py
new file mode 100644
index 000000000..34643183a
--- /dev/null
+++ b/tests/test_conjuncts.py
@@ -0,0 +1,41 @@
+"""Test the Token.conjuncts property"""
+from __future__ import unicode_literals
+
+from spacy.en import English
+import pytest
+
+NLU = English()
+
+def orths(tokens):
+    return [t.orth_ for t in tokens]
+
+
+def test_simple_two():
+    tokens = NLU('I lost money and pride.')
+    pride = tokens[4]
+    assert orths(pride.conjuncts) == ['money', 'pride']
+    money = tokens[2]
+    assert orths(money.conjuncts) == ['money', 'pride']
+
+
+def test_comma_three():
+    tokens = NLU('I found my wallet, phone and keys.')
+    keys = tokens[-2]
+    assert orths(keys.conjuncts) == ['wallet', 'phone', 'keys']
+    wallet = tokens[3]
+    assert orths(wallet.conjuncts) == ['wallet', 'phone', 'keys']
+
+
+def test_and_three():
+    tokens = NLU('I found my wallet and phone and keys.')
+    keys = tokens[-2]
+    assert orths(keys.conjuncts) == ['wallet', 'phone', 'keys']
+    wallet = tokens[3]
+    assert orths(wallet.conjuncts) == ['wallet', 'phone', 'keys']
+
+
+def test_conj_and_punct_tokens():
+    tokens = NLU('I lost money and pride.')
+    # Tokens that are themselves CONJ or PUNCT have no conjuncts.
+    assert tokens[3].conjuncts == []
+    assert tokens[-1].conjuncts == []