spaCy/spacy/morphology.pyx

# cython: infer_types
# coding: utf8
from __future__ import unicode_literals

from libc.string cimport memset
import ujson as json

from . import symbols
from .attrs cimport POS, IS_SPACE
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .errors import Errors


def _normalize_props(props):
    """Transform deprecated string keys to correct names."""
    out = {}
    morph_keys = [
        'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
        'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
        'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
        'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
        'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
        'NumValue', 'PartType', 'Polite', 'StyleVariant',
        'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
        'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
        'Polarity', 'PrepCase', 'Animacy' # U20
    ]
    props = dict(props)
    for key in morph_keys:
        if key in props:
            attr = '%s_%s' % (key, props[key])
            if attr in IDS:
                props.pop(key)
                props[attr] = True
    for key, value in props.items():
        if key == POS:
            if hasattr(value, 'upper'):
                value = value.upper()
            if value in POS_IDS:
                value = POS_IDS[value]
            out[key] = value
        elif isinstance(key, int):
            out[key] = value
        elif key.lower() == 'pos':
            out[POS] = POS_IDS[value.upper()]
        else:
            out[key] = value
    return out


def parse_feature(feature):
    if not hasattr(feature, 'split'):
        feature = NAMES[feature]
    key, value = feature.split('_')
    begin = 'begin_%s' % key
    offset = IDS[feature] - IDS[begin]
    field_id = FIELDS[key]
    return (field_id, offset)


def get_field_size(field):
    begin = 'begin_%s' % field
    end = 'end_%s' % field
    return (IDS[end] - IDS[begin]) - 1


cdef class Morphology:
    '''Store the possible morphological analyses for a language, and index them
    by hash.
    
    To save space on each token, tokens only know the hash of their morphological
    analysis, so queries of morphological attributes are delegated
    to this class.
    '''
    def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
        self.mem = Pool()
        self.strings = string_store
        self.tags = PreshMap()
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        space_attrs = tag_map.get('SP', {POS: SPACE})
        if '_SP' not in tag_map:
            self.strings.add('_SP')
            tag_map = dict(tag_map)
            tag_map['_SP'] = space_attrs
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.tag_map = {}
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map)
        self.reverse_index = {}
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            attrs = _normalize_props(attrs)
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i

        self._cache = PreshMapArray(self.n_tags)
        self.exc = {}
        if exc is not None:
            for (tag, orth), attrs in exc.items():
                self.add_special_case(
                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)

    def __reduce__(self):
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
                self.exc), None, None)

    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not already
        present. Returns the hash of the new analysis.
        """
        features = intify_features(features)
        cdef RichTagC tag = create_rich_tag(features)
        cdef hash_t key = self.insert(tag)
        return key
    
    cpdef update(self, hash_t morph, features):
        """Update a morphological analysis with new feature values."""
        tag = (<RichTagC*>self.tags.get(morph))[0]
        features = intify_features(features)
        cdef univ_morph_t feature
        for feature in features:
            set_feature(&tag, feature, 1)
        morph = self.insert(tag)
        return morph


    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
        if orth not in self.strings:
            return orth
        cdef unicode py_string = self.strings[orth]
        if self.lemmatizer is None:
            return self.strings.add(py_string.lower())
        cdef list lemma_strings
        cdef unicode lemma_string
        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
        lemma_string = lemma_strings[0]
        lemma = self.strings.add(lemma_string)
        return lemma

    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                         force=False):
        """Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

        tag (unicode): The part-of-speech tag to key the exception.
        orth (unicode): The word-form to key the exception.
        """
        attrs = dict(attrs)
        attrs = _normalize_props(attrs)
        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
        self.exc[(tag_str, self.strings.add(orth_str))] = attrs
 
    cdef hash_t insert(self, RichTagC tag) except 0:
        cdef hash_t key = hash_tag(tag)
        if self.tags.get(key) == NULL:
            tag_ptr = <RichTagC*>self.mem.alloc(1, sizeof(RichTagC))
            tag_ptr[0] = tag
            self.tags.set(key, <void*>tag_ptr)
        return key
    
    cdef int assign_untagged(self, TokenC* token) except -1:
        """Set morphological attributes on a token without a POS tag. Uses
        the lemmatizer's lookup() method, which looks up the string in the
        table provided by the language data as lemma_lookup (if available).
        """
        if token.lemma == 0:
            orth_str = self.strings[token.lex.orth]
            lemma = self.lemmatizer.lookup(orth_str)
            token.lemma = self.strings.add(lemma)

    cdef int assign_tag(self, TokenC* token, tag_str) except -1:
        cdef attr_t tag = self.strings.as_int(tag_str)
        if tag in self.reverse_index:
            tag_id = self.reverse_index[tag]
            self.assign_tag_id(token, tag_id)
        else:
            token.tag = tag

    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
        if tag_id > self.n_tags:
            raise ValueError(Errors.E014.format(tag=tag_id))
        # Ensure spaces get tagged as space.
        # It seems pretty arbitrary to put this logic here, but there's really
        # nowhere better. I guess the justification is that this is where the
        # specific word and the tag interact. Still, we should have a better
        # way to enforce this rule, or figure out why the statistical model fails.
        # Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
            tag_id = self.reverse_index[self.strings.add('_SP')]
        tag_str = self.tag_names[tag_id]
        features = dict(self.tag_map.get(tag_str, {}))
        if features:
            pos = self.strings.as_int(features.pop(POS))
        else:
            pos = 0
        cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
        if lemma == 0:
            lemma = self.lemmatize(pos, token.lex.orth, features)
            self._cache.set(tag_id, token.lex.orth, <void*>lemma)
        token.lemma = lemma
        token.pos = <univ_pos_t>pos
        token.tag = self.strings[tag_str]
        token.morph = self.add(features)
        if (self.tag_names[tag_id], token.lex.orth) in self.exc:
            self._assign_tag_from_exceptions(token, tag_id)

    cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
        key = (self.tag_names[tag_id], token.lex.orth)
        cdef dict attrs
        attrs = self.exc[key]
        token.pos = attrs.get(POS, token.pos)
        token.lemma = attrs.get(LEMMA, token.lemma)

    def load_morph_exceptions(self, dict exc):
        # Map (form, pos) to attributes
        for tag_str, entries in exc.items():
            for form_str, attrs in entries.items():
                self.add_special_case(tag_str, form_str, attrs)

    def to_bytes(self):
        json_tags = []
        for key in self.tags:
            tag_ptr = <RichTagC*>self.tags.get(key)
            if tag_ptr != NULL:
                json_tags.append(tag_to_json(tag_ptr[0]))
        raise json.dumps(json_tags)

    def from_bytes(self, byte_string):
        raise NotImplementedError

    def to_disk(self, path):
        raise NotImplementedError

    def from_disk(self, path):
        raise NotImplementedError


cpdef univ_pos_t get_int_tag(pos_):
    return <univ_pos_t>0

cpdef intify_features(features):
    return {IDS.get(feature, feature) for feature in features}

cdef hash_t hash_tag(RichTagC tag) nogil:
    return mrmr.hash64(&tag, sizeof(tag), 0)

cdef RichTagC create_rich_tag(features):
    cdef RichTagC tag
    cdef univ_morph_t feature
    memset(&tag, 0, sizeof(tag))
    for feature in features:
        set_feature(&tag, feature, 1)
    return tag

cdef tag_to_json(RichTagC tag):
    return {}

cdef RichTagC tag_from_json(json_tag):
    cdef RichTagC tag
    return tag
 
cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil:
    if value == True:
        value_ = feature
    else:
        value_ = NIL
    if feature == NIL:
        pass
    if is_abbr_feature(feature):
        tag.abbr = value_
    elif is_adp_type_feature(feature):
        tag.adp_type = value_
    elif is_adv_type_feature(feature):
        tag.adv_type = value_
    elif is_animacy_feature(feature):
        tag.animacy = value_
    elif is_aspect_feature(feature):
        tag.aspect = value_
    elif is_case_feature(feature):
        tag.case = value_
    elif is_conj_type_feature(feature):
        tag.conj_type = value_
    elif is_connegative_feature(feature):
        tag.connegative = value_
    elif is_definite_feature(feature):
        tag.definite = value_
    elif is_degree_feature(feature):
        tag.degree = value_
    elif is_derivation_feature(feature):
        tag.derivation = value_
    elif is_echo_feature(feature):
        tag.echo = value_
    elif is_foreign_feature(feature):
        tag.foreign = value_
    elif is_gender_feature(feature):
        tag.gender = value_
    elif is_hyph_feature(feature):
        tag.hyph = value_
    elif is_inf_form_feature(feature):
        tag.inf_form = value_
    elif is_mood_feature(feature):
        tag.mood = value_
    elif is_negative_feature(feature):
        tag.negative = value_
    elif is_number_feature(feature):
        tag.number = value_
    elif is_name_type_feature(feature):
        tag.name_type = value_
    elif is_num_form_feature(feature):
        tag.num_form = value_
    elif is_num_value_feature(feature):
        tag.num_value = value_
    elif is_part_form_feature(feature):
        tag.part_form = value_
    elif is_part_type_feature(feature):
        tag.part_type = value_
    elif is_person_feature(feature):
        tag.person = value_
    elif is_polite_feature(feature):
        tag.polite = value_
    elif is_polarity_feature(feature):
        tag.polarity = value_
    elif is_poss_feature(feature):
        tag.poss = value_
    elif is_prefix_feature(feature):
        tag.prefix = value_
    elif is_prep_case_feature(feature):
        tag.prep_case = value_
    elif is_pron_type_feature(feature):
        tag.pron_type = value_
    elif is_punct_side_feature(feature):
        tag.punct_type = value_
    elif is_reflex_feature(feature):
        tag.reflex = value_
    elif is_style_feature(feature):
        tag.style = value_
    elif is_style_variant_feature(feature):
        tag.style_variant = value_
    elif is_tense_feature(feature):
        tag.tense = value_
    elif is_verb_form_feature(feature):
        tag.verb_form = value_
    elif is_voice_feature(feature):
        tag.voice = value_
    elif is_verb_type_feature(feature):
        tag.verb_type = value_
    else:
        with gil:
            raise ValueError("Unknown feature: %d" % feature)

cdef int is_abbr_feature(univ_morph_t feature) nogil:
    return feature > begin_Abbr  and feature < end_Abbr

cdef int is_adp_type_feature(univ_morph_t feature) nogil:
    return feature > begin_AdpType and feature < end_AdpType

cdef int is_adv_type_feature(univ_morph_t feature) nogil:
    return feature > begin_AdvType and feature < end_AdvType

cdef int is_animacy_feature(univ_morph_t feature) nogil:
    return feature > begin_Animacy and feature < end_Animacy

cdef int is_aspect_feature(univ_morph_t feature) nogil:
    return feature > begin_Aspect and feature < end_Aspect

cdef int is_case_feature(univ_morph_t feature) nogil:
    return feature > begin_Case and feature < end_Case

cdef int is_conj_type_feature(univ_morph_t feature) nogil:
    return feature > begin_ConjType and feature < end_ConjType

cdef int is_connegative_feature(univ_morph_t feature) nogil:
    return feature > begin_Connegative and feature < end_Connegative

cdef int is_definite_feature(univ_morph_t feature) nogil:
    return feature > begin_Definite and feature < end_Definite

cdef int is_degree_feature(univ_morph_t feature) nogil:
    return feature > begin_Degree and feature < end_Degree

cdef int is_derivation_feature(univ_morph_t feature) nogil:
    return feature > begin_Derivation and feature < end_Derivation

cdef int is_echo_feature(univ_morph_t feature) nogil:
    return feature > begin_Echo and feature < end_Echo

cdef int is_foreign_feature(univ_morph_t feature) nogil:
    return feature > begin_Foreign and feature < end_Foreign

cdef int is_gender_feature(univ_morph_t feature) nogil:
    return feature > begin_Gender and feature < end_Gender

cdef int is_hyph_feature(univ_morph_t feature) nogil:
    return feature > begin_Hyph and feature < begin_Hyph

cdef int is_inf_form_feature(univ_morph_t feature) nogil:
    return feature > begin_InfForm and feature < end_InfForm

cdef int is_mood_feature(univ_morph_t feature) nogil:
    return feature > begin_Mood and feature < end_Mood

cdef int is_negative_feature(univ_morph_t feature) nogil:
    return feature > begin_Negative and feature < end_Negative

cdef int is_number_feature(univ_morph_t feature) nogil:
    return feature > begin_Number and feature < end_Number

cdef int is_name_type_feature(univ_morph_t feature) nogil:
    return feature > begin_NameType and feature < end_NameType

cdef int is_num_form_feature(univ_morph_t feature) nogil:
    return feature > begin_NumForm and feature < end_NumForm

cdef int is_num_type_feature(univ_morph_t feature) nogil:
    return feature > begin_NumType and feature < end_NumType

cdef int is_num_value_feature(univ_morph_t feature) nogil:
    return feature > begin_NumValue and feature < end_NumValue

cdef int is_part_form_feature(univ_morph_t feature) nogil:
    return feature > begin_PartForm and feature < end_PartForm

cdef int is_part_type_feature(univ_morph_t feature) nogil:
    return feature > begin_PartType and feature < end_PartType

cdef int is_person_feature(univ_morph_t feature) nogil:
    return feature > begin_Person and feature < end_Person

cdef int is_polite_feature(univ_morph_t feature) nogil:
    return feature > begin_Polite and feature < end_Polite

cdef int is_polarity_feature(univ_morph_t feature) nogil:
    return feature > begin_Polarity and feature < end_Polarity

cdef int is_poss_feature(univ_morph_t feature) nogil:
    return feature > begin_Poss and feature < end_Poss

cdef int is_prefix_feature(univ_morph_t feature) nogil:
    return feature > begin_Prefix and feature < end_Prefix

cdef int is_prep_case_feature(univ_morph_t feature) nogil:
    return feature > begin_PrepCase and feature < end_PrepCase

cdef int is_pron_type_feature(univ_morph_t feature) nogil:
    return feature > begin_PronType and feature < end_PronType

cdef int is_punct_side_feature(univ_morph_t feature) nogil:
    return feature > begin_PunctSide and feature < end_PunctSide

cdef int is_punct_type_feature(univ_morph_t feature) nogil:
    return feature > begin_PunctType and feature < end_PunctType

cdef int is_reflex_feature(univ_morph_t feature) nogil:
    return feature > begin_Reflex and feature < end_Reflex

cdef int is_style_feature(univ_morph_t feature) nogil:
    return feature > begin_Style and feature < end_Style

cdef int is_style_variant_feature(univ_morph_t feature) nogil:
    return feature > begin_StyleVariant and feature < end_StyleVariant

cdef int is_tense_feature(univ_morph_t feature) nogil:
    return feature > begin_Tense and feature < end_Tense

cdef int is_verb_form_feature(univ_morph_t feature) nogil:
    return feature > begin_VerbForm and feature < end_VerbForm

cdef int is_voice_feature(univ_morph_t feature) nogil:
    return feature > begin_Voice and feature < end_Voice

cdef int is_verb_type_feature(univ_morph_t feature) nogil:
    return feature > begin_VerbType and feature < end_VerbType


FIELDS = {
    'Abbr': 0,
    'AdpType': 1,
    'AdvType': 2,
    'Animacy': 3,
    'Aspect': 4,
    'Case': 5,
    'ConjType': 6,
    'Connegative': 7,
    'Definite': 8,
    'Degree': 9,
    'Derivation': 10,
    'Echo': 11,
    'Foreign': 12,
    'Gender': 13,
    'Hyph': 14,
    'InfForm': 15,
    'Mood': 16,
    'Negative': 17,
    'Number': 18,
    'NameType': 19,
    'NumForm': 20,
    'NumType': 21,
    'NumValue': 22,
    'PartForm': 23,
    'PartType': 24,
    'Person': 25,
    'Polite': 26,
    'Polarity': 27,
    'Poss': 28,
    'Prefix': 29,
    'PrepCase': 30,
    'PronType': 31,
    'PunctSide': 32,
    'PunctType': 33,
    'Reflex': 34,
    'Style': 35,
    'StyleVariant': 36,
    'Tense': 37,
    'VerbForm': 38,
    'Voice': 39,
    'VerbType': 40
}

IDS = {
   "begin_Abbr": begin_Abbr,
   "Abbr_yes ": Abbr_yes ,
   "end_Abbr": end_Abbr,
   "begin_AdpType": begin_AdpType,
   "AdpType_circ": AdpType_circ,
   "AdpType_comprep": AdpType_comprep,
   "AdpType_prep ": AdpType_prep ,
   "AdpType_post": AdpType_post,
   "AdpType_voc": AdpType_voc,
   "end_AdpType": end_AdpType,
   "begin_AdvType": begin_AdvType,
   "AdvType_adadj": AdvType_adadj,
   "AdvType_cau": AdvType_cau,
   "AdvType_deg": AdvType_deg,
   "AdvType_ex": AdvType_ex,
   "AdvType_loc": AdvType_loc,
   "AdvType_man": AdvType_man,
   "AdvType_mod": AdvType_mod,
   "AdvType_sta": AdvType_sta,
   "AdvType_tim": AdvType_tim,
   "end_AdvType": end_AdvType,
   "begin_Animacy": begin_Animacy,
   "Animacy_anim": Animacy_anim,
   "Animacy_hum": Animacy_hum,
   "Animacy_inan": Animacy_inan,
   "Animacy_nhum": Animacy_nhum,
   "end_Animacy": end_Animacy,
   "begin_Aspect": begin_Aspect,
   "Aspect_freq": Aspect_freq,
   "Aspect_imp": Aspect_imp,
   "Aspect_mod": Aspect_mod,
   "Aspect_none": Aspect_none,
   "Aspect_perf": Aspect_perf,
   "end_Aspect": end_Aspect,
   "begin_Case": begin_Case,
   "Case_abe": Case_abe,
   "Case_abl": Case_abl,
   "Case_abs": Case_abs,
   "Case_acc": Case_acc,
   "Case_ade": Case_ade,
   "Case_all": Case_all,
   "Case_cau": Case_cau,
   "Case_com": Case_com,
   "Case_dat": Case_dat,
   "Case_del": Case_del,
   "Case_dis": Case_dis,
   "Case_ela": Case_ela,
   "Case_ess": Case_ess,
   "Case_gen": Case_gen,
   "Case_ill": Case_ill,
   "Case_ine": Case_ine,
   "Case_ins": Case_ins,
   "Case_loc": Case_loc,
   "Case_lat": Case_lat,
   "Case_nom": Case_nom,
   "Case_par": Case_par,
   "Case_sub": Case_sub,
   "Case_sup": Case_sup,
   "Case_tem": Case_tem,
   "Case_ter": Case_ter,
   "Case_tra": Case_tra,
   "Case_voc": Case_voc,
   "end_Case": end_Case,
   "begin_ConjType": begin_ConjType,
   "ConjType_comp ": ConjType_comp ,
   "ConjType_oper": ConjType_oper,
   "end_ConjType": end_ConjType,
   "begin_Connegative": begin_Connegative,
   "Connegative_yes": Connegative_yes,
   "end_Connegative": end_Connegative,
   "begin_Definite": begin_Definite,
   "Definite_cons": Definite_cons,
   "Definite_def": Definite_def,
   "Definite_ind": Definite_ind,
   "Definite_red": Definite_red,
   "Definite_two": Definite_two,
   "end_Definite": end_Definite,
   "begin_Degree": begin_Degree,
   "Degree_abs": Degree_abs,
   "Degree_cmp": Degree_cmp,
   "Degree_comp": Degree_comp,
   "Degree_none": Degree_none,
   "Degree_pos": Degree_pos,
   "Degree_sup": Degree_sup,
   "Degree_com": Degree_com,
   "Degree_dim": Degree_dim,
   "end_Degree": end_Degree,
   "begin_Gender": begin_Gender,
   "Gender_com": Gender_com,
   "Gender_fem": Gender_fem,
   "Gender_masc": Gender_masc,
   "Gender_neut": Gender_neut,
   "Gender_dat_masc": Gender_dat_masc,
   "Gender_dat_fem": Gender_dat_fem,
   "Gender_erg_masc": Gender_erg_masc,
   "Gender_erg_fem": Gender_erg_fem,
   "Gender_psor_masc": Gender_psor_masc,
   "Gender_psor_fem": Gender_psor_fem,
   "Gender_psor_neut": Gender_psor_neut,
   "end_Gender": end_Gender,
   "begin_Mood": begin_Mood,
   "Mood_cnd": Mood_cnd,
   "Mood_imp": Mood_imp,
   "Mood_ind": Mood_ind,
   "Mood_n": Mood_n,
   "Mood_pot": Mood_pot,
   "Mood_sub": Mood_sub,
   "Mood_opt": Mood_opt,
   "end_Mood": end_Mood,
   "begin_Negative": begin_Negative,
   "Negative_neg": Negative_neg,
   "Negative_pos": Negative_pos,
   "Negative_yes": Negative_yes,
   "end_Negative": end_Negative,
   "begin_Polarity": begin_Polarity,
   "Polarity_neg": Polarity_neg,
   "Polarity_pos": Polarity_pos,
   "end_Polarity": end_Polarity,
   "begin_Number": begin_Number,
   "Number_com": Number_com,
   "Number_dual": Number_dual,
   "Number_none": Number_none,
   "Number_plur": Number_plur,
   "Number_sing": Number_sing,
   "Number_ptan": Number_ptan,
   "Number_count": Number_count,
   "Number_abs_sing": Number_abs_sing,
   "Number_abs_plur": Number_abs_plur,
   "Number_dat_sing": Number_dat_sing,
   "Number_dat_plur": Number_dat_plur,
   "Number_erg_sing": Number_erg_sing,
   "Number_erg_plur": Number_erg_plur,
   "Number_psee_sing": Number_psee_sing,
   "Number_psee_plur": Number_psee_plur,
   "Number_psor_sing": Number_psor_sing,
   "Number_psor_plur": Number_psor_plur,
   "end_Number": end_Number,
   "begin_NumType": begin_NumType,
   "NumType_card": NumType_card,
   "NumType_dist": NumType_dist,
   "NumType_frac": NumType_frac,
   "NumType_gen": NumType_gen,
   "NumType_mult": NumType_mult,
   "NumType_none": NumType_none,
   "NumType_ord": NumType_ord,
   "NumType_sets": NumType_sets,
   "end_NumType": end_NumType,
   "begin_Person": begin_Person,
   "Person_one": Person_one,
   "Person_two": Person_two,
   "Person_three": Person_three,
   "Person_none": Person_none,
   "Person_abs_one": Person_abs_one,
   "Person_abs_two": Person_abs_two,
   "Person_abs_three": Person_abs_three,
   "Person_dat_one": Person_dat_one,
   "Person_dat_two": Person_dat_two,
   "Person_dat_three": Person_dat_three,
   "Person_erg_one": Person_erg_one,
   "Person_erg_two": Person_erg_two,
   "Person_erg_three": Person_erg_three,
   "Person_psor_one": Person_psor_one,
   "Person_psor_two": Person_psor_two,
   "Person_psor_three": Person_psor_three,
   "end_Person": end_Person,
   "begin_Poss": begin_Poss,
   "Poss_yes": Poss_yes,
   "end_Poss": end_Poss,
   "begin_PronType": begin_PronType,
   "PronType_advPart": PronType_advPart,
   "PronType_art": PronType_art,
   "PronType_default": PronType_default,
   "PronType_dem": PronType_dem,
   "PronType_ind": PronType_ind,
   "PronType_int": PronType_int,
   "PronType_neg": PronType_neg,
   "PronType_prs": PronType_prs,
   "PronType_rcp": PronType_rcp,
   "PronType_rel": PronType_rel,
   "PronType_tot": PronType_tot,
   "PronType_clit": PronType_clit,
   "PronType_exc": PronType_exc,
   "end_PronType": end_PronType,
   "begin_Reflex": begin_Reflex,
   "Reflex_yes": Reflex_yes,
   "end_Reflex": end_Reflex,
   "begin_Tense": begin_Tense,
   "Tense_fut": Tense_fut,
   "Tense_imp": Tense_imp,
   "Tense_past": Tense_past,
   "Tense_pres": Tense_pres,
   "end_Tense": end_Tense,
   "begin_VerbForm": begin_VerbForm,
   "VerbForm_fin": VerbForm_fin,
   "VerbForm_ger": VerbForm_ger,
   "VerbForm_inf": VerbForm_inf,
   "VerbForm_none": VerbForm_none,
   "VerbForm_part": VerbForm_part,
   "VerbForm_partFut": VerbForm_partFut,
   "VerbForm_partPast": VerbForm_partPast,
   "VerbForm_partPres": VerbForm_partPres,
   "VerbForm_sup": VerbForm_sup,
   "VerbForm_trans": VerbForm_trans,
   "VerbForm_conv": VerbForm_conv,
   "VerbForm_gdv": VerbForm_gdv,
   "end_VerbForm": end_VerbForm,
   "begin_Voice": begin_Voice,
   "Voice_act": Voice_act,
   "Voice_cau": Voice_cau,
   "Voice_pass": Voice_pass,
   "Voice_mid": Voice_mid,
   "Voice_int": Voice_int,
   "end_Voice": end_Voice,
   "begin_Derivation": begin_Derivation,
   "Derivation_minen": Derivation_minen,
   "Derivation_sti": Derivation_sti,
   "Derivation_inen": Derivation_inen,
   "Derivation_lainen": Derivation_lainen,
   "Derivation_ja": Derivation_ja,
   "Derivation_ton": Derivation_ton,
   "Derivation_vs": Derivation_vs,
   "Derivation_ttain": Derivation_ttain,
   "Derivation_ttaa": Derivation_ttaa,
   "end_Derivation": end_Derivation,
   "begin_Echo": begin_Echo,
   "Echo_rdp": Echo_rdp,
   "Echo_ech": Echo_ech,
   "end_Echo": end_Echo,
   "begin_Foreign": begin_Foreign,
   "Foreign_foreign": Foreign_foreign,
   "Foreign_fscript": Foreign_fscript,
   "Foreign_tscript": Foreign_tscript,
   "Foreign_yes": Foreign_yes,
   "end_Foreign": end_Foreign,
   "begin_Hyph": begin_Hyph,
   "Hyph_yes": Hyph_yes,
   "end_Hyph": end_Hyph,
   "begin_InfForm": begin_InfForm,
   "InfForm_one": InfForm_one,
   "InfForm_two": InfForm_two,
   "InfForm_three": InfForm_three,
   "end_InfForm": end_InfForm,
   "begin_NameType": begin_NameType,
   "NameType_geo": NameType_geo,
   "NameType_prs": NameType_prs,
   "NameType_giv": NameType_giv,
   "NameType_sur": NameType_sur,
   "NameType_nat": NameType_nat,
   "NameType_com": NameType_com,
   "NameType_pro": NameType_pro,
   "NameType_oth": NameType_oth,
   "end_NameType": end_NameType,
   "begin_NounType": begin_NounType,
   "NounType_com": NounType_com,
   "NounType_prop": NounType_prop,
   "NounType_class": NounType_class,
   "end_NounType": end_NounType,
   "begin_NumForm": begin_NumForm,
   "NumForm_digit": NumForm_digit,
   "NumForm_roman": NumForm_roman,
   "NumForm_word": NumForm_word,
   "end_NumForm": end_NumForm,
   "begin_NumValue": begin_NumValue,
   "NumValue_one": NumValue_one,
   "NumValue_two": NumValue_two,
   "NumValue_three": NumValue_three,
   "end_NumValue": end_NumValue,
   "begin_PartForm": begin_PartForm,
   "PartForm_pres": PartForm_pres,
   "PartForm_past": PartForm_past,
   "PartForm_agt": PartForm_agt,
   "PartForm_neg": PartForm_neg,
   "end_PartForm": end_PartForm,
   "begin_PartType": begin_PartType,
   "PartType_mod": PartType_mod,
   "PartType_emp": PartType_emp,
   "PartType_res": PartType_res,
   "PartType_inf": PartType_inf,
   "PartType_vbp": PartType_vbp,
   "end_PartType": end_PartType,
   "begin_Polite": begin_Polite,
   "Polite_inf": Polite_inf,
   "Polite_pol": Polite_pol,
   "Polite_abs_inf": Polite_abs_inf,
   "Polite_abs_pol": Polite_abs_pol,
   "Polite_erg_inf": Polite_erg_inf,
   "Polite_erg_pol": Polite_erg_pol,
   "Polite_dat_inf": Polite_dat_inf,
   "Polite_dat_pol": Polite_dat_pol,
   "end_Polite": end_Polite,
   "begin_Prefix": begin_Prefix,
   "Prefix_yes": Prefix_yes,
   "end_Prefix": end_Prefix,
   "begin_PrepCase": begin_PrepCase,
   "PrepCase_npr": PrepCase_npr,
   "PrepCase_pre": PrepCase_pre,
   "end_PrepCase": end_PrepCase,
   "begin_PunctSide": begin_PunctSide,
   "PunctSide_ini": PunctSide_ini,
   "PunctSide_fin": PunctSide_fin,
   "end_PunctSide": end_PunctSide,
   "begin_PunctType": begin_PunctType,
   "PunctType_peri": PunctType_peri,
   "PunctType_qest": PunctType_qest,
   "PunctType_excl": PunctType_excl,
   "PunctType_quot": PunctType_quot,
   "PunctType_brck": PunctType_brck,
   "PunctType_comm": PunctType_comm,
   "PunctType_colo": PunctType_colo,
   "PunctType_semi": PunctType_semi,
   "PunctType_dash": PunctType_dash,
   "end_PunctType": end_PunctType,
   "begin_Style": begin_Style,
   "Style_arch": Style_arch,
   "Style_rare": Style_rare,
   "Style_poet": Style_poet,
   "Style_norm": Style_norm,
   "Style_coll": Style_coll,
   "Style_vrnc": Style_vrnc,
   "Style_sing": Style_sing,
   "Style_expr": Style_expr,
   "Style_derg": Style_derg,
   "Style_vulg": Style_vulg,
   "Style_yes": Style_yes,
   "end_Style": end_Style,
   "begin_StyleVariant": begin_StyleVariant,
   "StyleVariant_styleShort": StyleVariant_styleShort,
   "StyleVariant_styleBound": StyleVariant_styleBound,
   "end_StyleVariant": end_StyleVariant,
   "begin_VerbType": begin_VerbType,
   "VerbType_aux": VerbType_aux,
   "VerbType_cop": VerbType_cop,
   "VerbType_mod": VerbType_mod,
   "VerbType_light": VerbType_light,
   "end_VerbType": end_VerbType,
}


FIELD_SIZES = [get_field_size(field) for field in FIELDS]

NAMES = {value: key for key, value in IDS.items()}
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
locals().update(IDS)