* Add SPACE part-of-speech tag, and train tagger to assign it. Also train tagger not to mark whitespace as an entity.

This commit is contained in:
Matthew Honnibal 2015-07-09 13:30:41 +02:00
parent f95da0bd52
commit 89a91ad726
4 changed files with 10 additions and 3 deletions

View File

@ -11,7 +11,7 @@ from thinc.typedefs cimport atom_t, weight_t
from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
from ..typedefs cimport id_t
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens cimport Doc
@ -180,7 +180,8 @@ POS_TAGS = {
"HYPH": (PUNCT, {}),
"XX": (X, {}),
"BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
"HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD})
"HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
"SP": (SPACE, {})
}

View File

@ -218,8 +218,12 @@ cdef class GoldParse:
self.orig_annot = zip(*annot_tuples)
for i, gold_i in enumerate(self.cand_to_gold):
if self.words[i].isspace():
self.tags[i] = 'SP'
self.heads[i] = None
self.labels[i] = None
self.ner[i] = 'O'
if gold_i is None:
# TODO: What do we do for missing values again?
pass
else:
self.tags[i] = annot_tuples[2][gold_i]

View File

@ -14,4 +14,5 @@ cpdef enum univ_pos_t:
X
PUNCT
EOL
SPACE
N_UNIV_TAGS

View File

@ -15,5 +15,6 @@ UNIV_POS_NAMES = {
"VERB": VERB,
"X": X,
"PUNCT": PUNCT,
"SPACE": SPACE,
"EOL": EOL
}