* Gut pos.pyx module, since functionality moved to spacy/tagger.pyx

2015-08-26 19:15:42 +02:00 · 2015-08-26 19:15:42 +02:00 · e2ef78b29c
parent c4d8754385
commit e2ef78b29c
1 changed file with 2 additions and 259 deletions
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -1,268 +1,11 @@
 from os import path
-import json
-import os
-import shutil

-from libc.string cimport memset
+from ..parts_of_speech cimport NOUN, VERB, ADJ

-from cymem.cymem cimport Address
-from thinc.typedefs cimport atom_t, weight_t
-from collections import defaultdict
-
-from ..parts_of_speech cimport univ_pos_t
-from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
-
-from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
-from ..structs cimport TokenC, Morphology, LexemeC
-from ..tokens.doc cimport Doc
-from ..morphology cimport set_morph_from_dict
-from .._ml cimport arg_max
-
-from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
-from ..typedefs cimport attr_t
-
-from .lemmatizer import Lemmatizer
-
-
-cpdef enum en_person_t:
-    NO_PERSON
-    FIRST
-    SECOND
-    THIRD
-    NON_THIRD
-
-
-cpdef enum en_number_t:
-    NO_NUMBER
-    SINGULAR
-    PLURAL
-    MASS
-
-
-cpdef enum en_gender_t:
-    NO_GENDER
-    MASCULINE
-    FEMININE
-    NEUTER
-
-
-cpdef enum en_case_t:
-    NO_CASE
-    NOMINATIVE
-    GENITIVE
-    ACCUSATIVE
-    REFLEXIVE
-    DEMONYM
-
-
-cpdef enum en_tenspect_t:
-    NO_TENSE
-    BASE_VERB
-    PRESENT
-    PAST
-    PASSIVE
-    ING
-    MODAL
-
-
-cpdef enum misc_t:
-    NO_MISC
-    COMPARATIVE
-    SUPERLATIVE
-    RELATIVE
-    NAME
-
-
-cpdef enum:
-    P2_orth
-    P2_cluster
-    P2_shape
-    P2_prefix
-    P2_suffix
-    P2_pos
-    P2_lemma
-    P2_flags
-
-    P1_orth
-    P1_cluster
-    P1_shape
-    P1_prefix
-    P1_suffix
-    P1_pos
-    P1_lemma
-    P1_flags
-
-    W_orth
-    W_cluster
-    W_shape
-    W_prefix
-    W_suffix
-    W_pos
-    W_lemma
-    W_flags
-
-    N1_orth
-    N1_cluster
-    N1_shape
-    N1_prefix
-    N1_suffix
-    N1_pos
-    N1_lemma
-    N1_flags
-
-    N2_orth
-    N2_cluster
-    N2_shape
-    N2_prefix
-    N2_suffix
-    N2_pos
-    N2_lemma
-    N2_flags
-
-    N_CONTEXT_FIELDS
-
-
-POS_TAGS = {
-    'NULL': (NO_TAG, {}),
-    'EOL': (EOL, {}),
-    'CC': (CONJ, {}),
-    'CD': (NUM, {}),
-    'DT': (DET, {}),
-    'EX': (DET, {}),
-    'FW': (X, {}),
-    'IN': (ADP, {}),
-    'JJ': (ADJ, {}),
-    'JJR': (ADJ, {'misc': COMPARATIVE}),
-    'JJS': (ADJ, {'misc': SUPERLATIVE}),
-    'LS': (X, {}),
-    'MD': (VERB, {'tenspect': MODAL}),
-    'NN': (NOUN, {}),
-    'NNS': (NOUN, {'number': PLURAL}),
-    'NNP': (NOUN, {'misc': NAME}),
-    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
-    'PDT': (DET, {}),
-    'POS': (PRT, {'case': GENITIVE}),
-    'PRP': (PRON, {}),
-    'PRP$': (PRON, {'case': GENITIVE}),
-    'RB': (ADV, {}),
-    'RBR': (ADV, {'misc': COMPARATIVE}),
-    'RBS': (ADV, {'misc': SUPERLATIVE}),
-    'RP': (PRT, {}),
-    'SYM': (X, {}),
-    'TO': (PRT, {}),
-    'UH': (X, {}),
-    'VB': (VERB, {}),
-    'VBD': (VERB, {'tenspect': PAST}),
-    'VBG': (VERB, {'tenspect': ING}),
-    'VBN': (VERB, {'tenspect': PASSIVE}),
-    'VBP': (VERB, {'tenspect': PRESENT}),
-    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
-    'WDT': (DET, {'misc': RELATIVE}),
-    'WP': (PRON, {'misc': RELATIVE}),
-    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
-    'WRB': (ADV, {'misc': RELATIVE}),
-    '!': (PUNCT, {}),
-    '#': (PUNCT, {}),
-    '$': (PUNCT, {}),
-    "''": (PUNCT, {}),
-    "(": (PUNCT, {}),
-    ")": (PUNCT, {}),
-    "-LRB-": (PUNCT, {}),
-    "-RRB-": (PUNCT, {}),
-    ".": (PUNCT, {}),
-    ",": (PUNCT, {}),
-    "``": (PUNCT, {}),
-    ":": (PUNCT, {}),
-    "?": (PUNCT, {}),
-    "ADD": (X, {}),
-    "NFP": (PUNCT, {}),
-    "GW": (X, {}),
-    "AFX": (X, {}),
-    "HYPH": (PUNCT, {}),
-    "XX": (X, {}),
-    "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
-    "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
-    "SP": (SPACE, {})
-}
-
-
-POS_TEMPLATES = (
-    (W_orth,),
-    (P1_lemma, P1_pos),
-    (P2_lemma, P2_pos),
-    (N1_orth,),
-    (N2_orth,),
-
-    (W_suffix,),
-    (W_prefix,),
-
-    (P1_pos,),
-    (P2_pos,),
-    (P1_pos, P2_pos),
-    (P1_pos, W_orth),
-    (P1_suffix,),
-    (N1_suffix,),
-
-    (W_shape,),
-    (W_cluster,),
-    (N1_cluster,),
-    (N2_cluster,),
-    (P1_cluster,),
-    (P2_cluster,),
-
-    (W_flags,),
-    (N1_flags,),
-    (N2_flags,),
-    (P1_flags,),
-    (P2_flags,),
-)
+from ..lemmatizer import Lemmatizer


 cdef class EnPosTagger(Tagger):
    """A part-of-speech tagger for English"""
    def make_lemmatizer(self, data_dir):
        return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
-    
-    cdef int predict(self, int i, const TokenC* tokens) except -1:
-        cdef atom_t[N_CONTEXT_FIELDS] context
-        _fill_from_token(&context[P2_orth], &tokens[i-2])
-        _fill_from_token(&context[P1_orth], &tokens[i-1])
-        _fill_from_token(&context[W_orth], &tokens[i])
-        _fill_from_token(&context[N1_orth], &tokens[i+1])
-        _fill_from_token(&context[N2_orth], &tokens[i+2])
-        scores = self.model.score(context)
-        return arg_max(scores, self.model.n_classes)
-
-    cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
-        cdef atom_t[N_CONTEXT_FIELDS] context
-        _fill_from_token(&context[P2_orth], &tokens[i-2])
-        _fill_from_token(&context[P1_orth], &tokens[i-1])
-        _fill_from_token(&context[W_orth], &tokens[i])
-        _fill_from_token(&context[N1_orth], &tokens[i+1])
-        _fill_from_token(&context[N2_orth], &tokens[i+2])
-        scores = self.model.score(context)
-        guess = arg_max(scores, self.model.n_classes)
-        loss = guess != gold if gold != -1 else 0
-        self.model.update(context, guess, gold, loss)
-        return guess
- 
-
-
-cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
-    context[0] = t.lex.lower
-    context[1] = t.lex.cluster
-    context[2] = t.lex.shape
-    context[3] = t.lex.prefix
-    context[4] = t.lex.suffix
-    context[5] = t.tag
-    context[6] = t.lemma
-    if t.lex.flags & (1 << IS_ALPHA):
-        context[7] = 1
-    elif t.lex.flags & (1 << IS_PUNCT):
-        context[7] = 2
-    elif t.lex.flags & (1 << LIKE_URL):
-        context[7] = 3
-    elif t.lex.flags & (1 << LIKE_NUM):
-        context[7] = 4
-    else:
-        context[7] = 0