From 89a91ad726fbe87dd264150cd89be2f2f3ed148c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 9 Jul 2015 13:30:41 +0200 Subject: [PATCH] * Add SPACE part-of-speech tag, and train tagger to assign it. Also train tagger not to make whitespace an entity --- spacy/en/pos.pyx | 5 +++-- spacy/gold.pyx | 6 +++++- spacy/parts_of_speech.pxd | 1 + spacy/parts_of_speech.pyx | 1 + 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 97d0613cf..de795c1f3 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -11,7 +11,7 @@ from thinc.typedefs cimport atom_t, weight_t from ..parts_of_speech cimport univ_pos_t from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON -from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL +from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE from ..typedefs cimport id_t from ..structs cimport TokenC, Morphology, LexemeC from ..tokens cimport Doc @@ -180,7 +180,8 @@ POS_TAGS = { "HYPH": (PUNCT, {}), "XX": (X, {}), "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}), - "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}) + "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}), + "SP": (SPACE, {}) } diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 9a2e51d84..21ccc94b1 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -218,8 +218,12 @@ cdef class GoldParse: self.orig_annot = zip(*annot_tuples) for i, gold_i in enumerate(self.cand_to_gold): + if self.words[i].isspace(): + self.tags[i] = 'SP' + self.heads[i] = None + self.labels[i] = None + self.ner[i] = 'O' if gold_i is None: - # TODO: What do we do for missing values again? pass else: self.tags[i] = annot_tuples[2][gold_i] diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index 4b9c343d1..b915b9dde 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -14,4 +14,5 @@ cpdef enum univ_pos_t: X PUNCT EOL + SPACE N_UNIV_TAGS diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 7316752c3..994a48eba 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -15,5 +15,6 @@ UNIV_POS_NAMES = { "VERB": VERB, "X": X, "PUNCT": PUNCT, + "SPACE": SPACE, "EOL": EOL }