From 12b034e3efeaa3d21919d1642eb7ce35844ee08f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 25 Jan 2015 16:31:07 +1100 Subject: [PATCH] * Move POS tag definitions to parts_of_speech.pxd --- spacy/en/pos.pxd | 4 ++-- spacy/en/pos.pyx | 8 ++++---- spacy/structs.pxd | 7 ++++--- spacy/tokens.pxd | 5 +++-- spacy/tokens.pyx | 4 ++-- spacy/typedefs.pxd | 19 ------------------- spacy/typedefs.pyx | 18 ------------------ 7 files changed, 15 insertions(+), 50 deletions(-) diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index d3697b97e..7ec88a7d5 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -4,7 +4,7 @@ from cymem.cymem cimport Pool from .._ml cimport Model from ..strings cimport StringStore from ..structs cimport TokenC, LexemeC, Morphology, PosTag -from ..typedefs cimport univ_tag_t +from ..parts_of_speech cimport univ_pos_t from .lemmatizer import Lemmatizer @@ -21,5 +21,5 @@ cdef class EnPosTagger: cdef readonly int n_tags cdef int set_morph(self, const int i, TokenC* tokens) except -1 - cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1 + cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index f71b82f20..1e19b9b82 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -8,9 +8,9 @@ from libc.string cimport memset from cymem.cymem cimport Address from thinc.typedefs cimport atom_t, weight_t -from ..typedefs cimport univ_tag_t -from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB -from ..typedefs cimport X, PUNCT, EOL +from ..parts_of_speech cimport univ_pos_t +from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB +from ..parts_of_speech cimport X, PUNCT, EOL from ..typedefs cimport id_t from ..structs cimport TokenC, Morphology, LexemeC from ..tokens cimport Tokens @@ -282,7 +282,7 @@ cdef class EnPosTagger: tokens[i].lemma = cached.lemma tokens[i].morph = cached.morph - cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1: + cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: if self.lemmatizer is None: return lex.orth cdef unicode py_string = self.strings[lex.orth] diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 1d6de506c..f7f8b23d3 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -1,6 +1,7 @@ from libc.stdint cimport uint8_t, uint32_t -from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t +from .typedefs cimport flags_t, attr_t, id_t, hash_t +from .parts_of_speech cimport univ_pos_t cdef struct LexemeC: @@ -37,13 +38,13 @@ cdef struct Morphology: cdef struct PosTag: Morphology morph int id - univ_tag_t pos + univ_pos_t pos cdef struct TokenC: const LexemeC* lex Morphology morph - univ_tag_t pos + univ_pos_t pos int tag int idx int lemma diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 772f3e10c..9202b7c64 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -6,7 +6,8 @@ cimport numpy from cymem.cymem cimport Pool from thinc.typedefs cimport atom_t -from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t +from .typedefs cimport flags_t, attr_id_t, attr_t +from .parts_of_speech cimport univ_pos_t from .structs cimport Morphology, TokenC, LexemeC from .vocab cimport Vocab from .strings cimport StringStore @@ -66,7 +67,7 @@ cdef class Token: cdef readonly float sentiment cdef readonly attr_t flags cdef readonly attr_t lemma - cdef readonly univ_tag_t pos + cdef readonly univ_pos_t pos cdef readonly attr_t tag cdef readonly attr_t dep cdef readonly ndarray repvec diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 41b409989..0042dd608 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -8,7 +8,7 @@ from .typedefs cimport attr_id_t, attr_t from .typedefs cimport LEMMA from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from .typedefs cimport POS, LEMMA -from .typedefs import UNIV_TAG_NAMES +from .parts_of_speech import UNIV_POS_NAMES from unidecode import unidecode @@ -325,7 +325,7 @@ cdef class Token: property pos_: def __get__(self): - id_to_string = {id_: string for string, id_ in UNIV_TAG_NAMES.items()} + id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} return id_to_string[self.pos] property tag_: diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 9d086827a..3c2ee234e 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -2,25 +2,6 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t from libc.stdint cimport uint8_t -# Google universal tag set -cpdef enum univ_tag_t: - NO_TAG - ADJ - ADV - ADP - CONJ - DET - NOUN - NUM - PRON - PRT - VERB - X - PUNCT - EOL - N_UNIV_TAGS - - # Reserve 64 values for flag features cpdef enum attr_id_t: FLAG0 diff --git a/spacy/typedefs.pyx b/spacy/typedefs.pyx index 020660f0c..8b1378917 100644 --- a/spacy/typedefs.pyx +++ b/spacy/typedefs.pyx @@ -1,19 +1 @@ -from __future__ import unicode_literals - -UNIV_TAG_NAMES = { - "NO_TAG": NO_TAG, - "ADJ": ADJ, - "ADV": ADV, - "ADP": ADP, - "CONJ": CONJ, - "DET": DET, - "NOUN": NOUN, - "NUM": NUM, - "PRON": PRON, - "PRT": PRT, - "VERB": VERB, - "X": X, - "PUNCT": PUNCT, - "EOL": EOL -}