# cython: infer_types
# coding: utf8
from __future__ import unicode_literals

from libc.string cimport memset

import ujson as json

from . import symbols
from .attrs cimport POS, IS_SPACE
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .errors import Errors


def _normalize_props(props):
    """Transform deprecated string keys to correct names.

    Two rewrites are applied to a copy of `props`:

    * A morphological entry such as ``{'Number': 'sing'}`` becomes
      ``{'Number_sing': True}`` when the combined name is a key of `IDS`.
    * The part-of-speech entry (keyed by the `POS` attribute ID or by any
      casing of the string ``'pos'``) is stored under `POS`, with its value
      mapped through `POS_IDS`.

    props (dict): The properties to normalize. Not modified.
    RETURNS (dict): A new dict with the normalized keys and values.
    RAISES (KeyError): If the key is a string spelling of 'pos' and the
        upper-cased value is not in `POS_IDS`.
    """
    out = {}
    morph_keys = [
        'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
        'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', 'Hyph',
        'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', 'Gender',
        'Mood', 'Negative', 'Voice', 'Abbr', 'Derivation', 'Echo', 'NameType',
        'NumForm', 'NumValue', 'Polite', 'StyleVariant', 'AdjType', 'Person',
        'Variant', 'AdpType', 'Reflex', 'Case', 'Polarity', 'PrepCase',
        'Animacy'  # U20
    ]
    props = dict(props)
    for key in morph_keys:
        if key in props:
            attr = '%s_%s' % (key, props[key])
            if attr in IDS:
                props.pop(key)
                props[attr] = True
    for key, value in props.items():
        if key == POS:
            if hasattr(value, 'upper'):
                value = value.upper()
            if value in POS_IDS:
                value = POS_IDS[value]
            out[key] = value
        elif isinstance(key, int):
            out[key] = value
        elif key.lower() == 'pos':
            out[POS] = POS_IDS[value.upper()]
        else:
            out[key] = value
    return out


cdef class Morphology:
    '''Store the possible morphological analyses for a language, and index them
    by hash.

    To save space on each token, tokens only know the hash of their
    morphological analysis, so queries of morphological attributes are
    delegated to this class.
    '''
    def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
        """Build the tag tables and register any special cases.

        string_store (StringStore): Store used to intern tag and lemma strings.
        tag_map (dict): Maps tag strings to dicts of properties. Not modified;
            a copy is made before the '_SP' entry is added.
        lemmatizer: Callable used by `lemmatize`, or None.
        exc (dict): Optional special cases keyed by `(tag, orth)`; each entry
            is passed to `add_special_case`.
        """
        self.mem = Pool()
        self.strings = string_store
        self.tags = PreshMap()
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        space_attrs = tag_map.get('SP', {POS: SPACE})
        if '_SP' not in tag_map:
            self.strings.add('_SP')
            tag_map = dict(tag_map)
            tag_map['_SP'] = space_attrs
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.tag_map = {}
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map)
        self.reverse_index = {}
        # Tag IDs are positions in the sorted tag list, so `tag_names[i]` and
        # `reverse_index` stay consistent with each other.
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            attrs = _normalize_props(attrs)
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i

        self._cache = PreshMapArray(self.n_tags)
        self.exc = {}
        if exc is not None:
            for (tag, orth), attrs in exc.items():
                self.add_special_case(
                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)

    def __reduce__(self):
        """Support pickling by re-running `__init__` with the current state."""
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
                self.exc), None, None)

    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not already
        present. Returns the hash of the new analysis.
        """
        features = intify_features(features)
        cdef RichTagC tag = create_rich_tag(features)
        cdef hash_t key = self.insert(tag)
        return key

    cpdef update(self, hash_t morph, features):
        """Update a morphological analysis with new feature values.

        morph (hash_t): Hash of an analysis previously stored in the table.
        features: Iterable of feature names or IDs to switch on.
        RETURNS: The hash of the updated analysis.
        RAISES (KeyError): If `morph` is not in the table.
        """
        cdef RichTagC tag
        cdef univ_morph_t feature
        # The table stores untyped pointers, so cast back before reading.
        cdef RichTagC* tag_ptr = <RichTagC*>self.tags.get(morph)
        if tag_ptr == NULL:
            # Dereferencing a missing entry would crash the process.
            raise KeyError(morph)
        tag = tag_ptr[0]
        features = intify_features(features)
        for feature in features:
            set_feature(&tag, feature, 1)
        morph = self.insert(tag)
        return morph

    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
        """Return the string ID of the lemma for a word form.

        univ_pos (univ_pos_t): Part-of-speech passed to the lemmatizer.
        orth (attr_t): String ID of the word form.
        morphology: Morphological features passed to the lemmatizer.
        RETURNS: `orth` itself if it is not in the string store; the ID of the
            lower-cased form if no lemmatizer is set; otherwise the ID of the
            first lemma the lemmatizer returns.
        """
        cdef unicode py_string
        cdef list lemma_strings
        cdef unicode lemma_string
        if orth not in self.strings:
            return orth
        py_string = self.strings[orth]
        if self.lemmatizer is None:
            return self.strings.add(py_string.lower())
        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
        lemma_string = lemma_strings[0]
        lemma = self.strings.add(lemma_string)
        return lemma

    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                         force=False):
        """Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

        tag (unicode): The part-of-speech tag to key the exception.
        orth (unicode): The word-form to key the exception.
        attrs (dict): Properties to assign; normalized and converted to
            integer keys before being stored.
        force (bool): Currently unused by this method.
        """
        attrs = dict(attrs)
        attrs = _normalize_props(attrs)
        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
        self.exc[(tag_str, self.strings.add(orth_str))] = attrs

    cdef hash_t insert(self, RichTagC tag) except 0:
        """Store `tag` in the table under its hash (if absent); return the hash."""
        cdef hash_t key = hash_tag(tag)
        if self.tags.get(key) == NULL:
            tag_ptr = <RichTagC*>self.mem.alloc(1, sizeof(RichTagC))
            tag_ptr[0] = tag
            self.tags.set(key, <void*>tag_ptr)
        return key

    cdef int assign_untagged(self, TokenC* token) except -1:
        """Set morphological attributes on a token without a POS tag. Uses
        the lemmatizer's lookup() method, which looks up the string in the
        table provided by the language data as lemma_lookup (if available).
        """
        if token.lemma == 0:
            orth_str = self.strings[token.lex.orth]
            lemma = self.lemmatizer.lookup(orth_str)
            token.lemma = self.strings.add(lemma)

    cdef int assign_tag(self, TokenC* token, tag_str) except -1:
        """Assign a tag given as a string or string ID.

        Known tags go through `assign_tag_id`, which also sets pos, lemma and
        morph. Unknown tags are only written to `token.tag`.
        """
        cdef attr_t tag = self.strings.as_int(tag_str)
        if tag in self.reverse_index:
            tag_id = self.reverse_index[tag]
            self.assign_tag_id(token, tag_id)
        else:
            token.tag = tag

    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
        """Set tag, pos, lemma and morph on `token` from an index into the
        sorted tag list, then apply any matching special case.

        RAISES (ValueError): If `tag_id` is not below `self.n_tags`.
        """
        cdef attr_t lemma
        # Valid IDs are 0..n_tags-1 (positions in `self.tag_names`).
        if tag_id >= self.n_tags:
            raise ValueError(Errors.E014.format(tag=tag_id))
        # Ensure spaces get tagged as space.
        # It seems pretty arbitrary to put this logic here, but there's really
        # nowhere better. I guess the justification is that this is where the
        # specific word and the tag interact. Still, we should have a better
        # way to enforce this rule, or figure out why the statistical model fails.
        # Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
            tag_id = self.reverse_index[self.strings.add('_SP')]
        tag_str = self.tag_names[tag_id]
        features = dict(self.tag_map.get(tag_str, {}))
        if features:
            pos = self.strings.as_int(features.pop(POS))
        else:
            pos = 0
        # Lemmas are cached per (tag, word form); 0 means "not cached yet".
        lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
        if lemma == 0:
            lemma = self.lemmatize(pos, token.lex.orth, features)
            self._cache.set(tag_id, token.lex.orth, <void*>lemma)
        token.lemma = lemma
        token.pos = pos
        token.tag = self.strings[tag_str]
        token.morph = self.add(features)
        if (self.tag_names[tag_id], token.lex.orth) in self.exc:
            self._assign_tag_from_exceptions(token, tag_id)

    cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
        """Overwrite pos and lemma from the special case for this tag and
        word form. The caller must have checked that the entry exists.
        """
        key = (self.tag_names[tag_id], token.lex.orth)
        cdef dict attrs
        attrs = self.exc[key]
        token.pos = attrs.get(POS, token.pos)
        token.lemma = attrs.get(LEMMA, token.lemma)

    def load_morph_exceptions(self, dict exc):
        """Register special cases from a nested dict.

        exc (dict): Maps tag string -> {word form -> attrs}.
        """
        # Map (form, pos) to attributes
        for tag_str, entries in exc.items():
            for form_str, attrs in entries.items():
                self.add_special_case(tag_str, form_str, attrs)

    def to_bytes(self):
        """Serialize the stored analyses to a JSON string.

        RETURNS: `json.dumps` of one `tag_to_json` entry per stored analysis.
        """
        json_tags = []
        for key in self.tags:
            tag_ptr = <RichTagC*>self.tags.get(key)
            if tag_ptr != NULL:
                json_tags.append(tag_to_json(tag_ptr[0]))
        return json.dumps(json_tags)

    def from_bytes(self, byte_string):
        """Not implemented."""
        raise NotImplementedError

    def to_disk(self, path):
        """Not implemented."""
        raise NotImplementedError

    def from_disk(self, path):
        """Not implemented."""
        raise NotImplementedError


cpdef univ_pos_t get_int_tag(pos_):
    """Placeholder: always returns 0."""
    return <univ_pos_t>0


cpdef intify_features(features):
    """Return the set of `features` with names in `IDS` replaced by their IDs.
    Entries that are not keys of `IDS` are passed through unchanged.
    """
    return {IDS.get(feature, feature) for feature in features}


cdef hash_t hash_tag(RichTagC tag) nogil:
    """Hash the raw bytes of the struct."""
    return mrmr.hash64(&tag, sizeof(tag), 0)


cdef RichTagC create_rich_tag(features):
    """Build a zero-initialised RichTagC with each of `features` switched on."""
    cdef RichTagC tag
    cdef univ_morph_t feature
    memset(&tag, 0, sizeof(tag))
    for feature in features:
        set_feature(&tag, feature, 1)
    return tag


cdef tag_to_json(RichTagC tag):
    """Placeholder: always returns an empty dict."""
    return {}


cdef RichTagC tag_from_json(json_tag):
    """Placeholder: ignores `json_tag` and returns a zeroed struct."""
    cdef RichTagC tag
    # Zero the struct so the result is deterministic rather than stack garbage.
    memset(&tag, 0, sizeof(tag))
    return tag


cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil:
    """Set or clear the field of `tag` that `feature` belongs to.

    A true `value` stores `feature` in the field; otherwise NIL is stored.
    A NIL `feature` is a no-op. A feature that falls in none of the ranges
    checked below raises ValueError.
    """
    if value == True:
        value_ = feature
    else:
        value_ = NIL
    if feature == NIL:
        pass
    elif is_abbr_feature(feature):
        tag.abbr = value_
    elif is_adp_type_feature(feature):
        tag.adp_type = value_
    elif is_adv_type_feature(feature):
        tag.adv_type = value_
    elif is_animacy_feature(feature):
        tag.animacy = value_
    elif is_aspect_feature(feature):
        tag.aspect = value_
    elif is_case_feature(feature):
        tag.case = value_
    elif is_conj_type_feature(feature):
        tag.conj_type = value_
    elif is_connegative_feature(feature):
        tag.connegative = value_
    elif is_definite_feature(feature):
        tag.definite = value_
    elif is_degree_feature(feature):
        tag.degree = value_
    elif is_derivation_feature(feature):
        tag.derivation = value_
    elif is_echo_feature(feature):
        tag.echo = value_
    elif is_foreign_feature(feature):
        tag.foreign = value_
    elif is_gender_feature(feature):
        tag.gender = value_
    elif is_hyph_feature(feature):
        tag.hyph = value_
    elif is_inf_form_feature(feature):
        tag.inf_form = value_
    elif is_mood_feature(feature):
        tag.mood = value_
    elif is_negative_feature(feature):
        tag.negative = value_
    elif is_number_feature(feature):
        tag.number = value_
    elif is_name_type_feature(feature):
        tag.name_type = value_
    elif is_num_form_feature(feature):
        tag.num_form = value_
    # NOTE(review): is_num_type_feature() is defined below but not used here,
    # and there is no NounType predicate, so NumType_*/NounType_* features
    # reach the ValueError branch. Confirm whether RichTagC has fields for them.
    elif is_num_value_feature(feature):
        tag.num_value = value_
    elif is_part_form_feature(feature):
        tag.part_form = value_
    elif is_part_type_feature(feature):
        tag.part_type = value_
    elif is_person_feature(feature):
        tag.person = value_
    elif is_polite_feature(feature):
        tag.polite = value_
    elif is_polarity_feature(feature):
        tag.polarity = value_
    elif is_poss_feature(feature):
        tag.poss = value_
    elif is_prefix_feature(feature):
        tag.prefix = value_
    elif is_prep_case_feature(feature):
        tag.prep_case = value_
    elif is_pron_type_feature(feature):
        tag.pron_type = value_
    elif is_punct_side_feature(feature):
        # NOTE(review): assumes RichTagC declares `punct_side` -- confirm
        # against the struct definition.
        tag.punct_side = value_
    elif is_punct_type_feature(feature):
        tag.punct_type = value_
    elif is_reflex_feature(feature):
        tag.reflex = value_
    elif is_style_feature(feature):
        tag.style = value_
    elif is_style_variant_feature(feature):
        tag.style_variant = value_
    elif is_tense_feature(feature):
        tag.tense = value_
    elif is_verb_form_feature(feature):
        tag.verb_form = value_
    elif is_voice_feature(feature):
        tag.voice = value_
    elif is_verb_type_feature(feature):
        tag.verb_type = value_
    else:
        with gil:
            raise ValueError("Unknown feature: %d" % feature)


# Each predicate below tests whether `feature` lies strictly between the
# begin_X and end_X markers of its category.

cdef int is_abbr_feature(univ_morph_t feature) nogil:
    return feature > begin_Abbr and feature < end_Abbr

cdef int is_adp_type_feature(univ_morph_t feature) nogil:
    return feature > begin_AdpType and feature < end_AdpType

cdef int is_adv_type_feature(univ_morph_t feature) nogil:
    return feature > begin_AdvType and feature < end_AdvType

cdef int is_animacy_feature(univ_morph_t feature) nogil:
    return feature > begin_Animacy and feature < end_Animacy

cdef int is_aspect_feature(univ_morph_t feature) nogil:
    return feature > begin_Aspect and feature < end_Aspect

cdef int is_case_feature(univ_morph_t feature) nogil:
    return feature > begin_Case and feature < end_Case

cdef int is_conj_type_feature(univ_morph_t feature) nogil:
    return feature > begin_ConjType and feature < end_ConjType

cdef int is_connegative_feature(univ_morph_t feature) nogil:
    return feature > begin_Connegative and feature < end_Connegative

cdef int is_definite_feature(univ_morph_t feature) nogil:
    return feature > begin_Definite and feature < end_Definite

cdef int is_degree_feature(univ_morph_t feature) nogil:
    return feature > begin_Degree and feature < end_Degree

cdef int is_derivation_feature(univ_morph_t feature) nogil:
    return feature > begin_Derivation and feature < end_Derivation

cdef int is_echo_feature(univ_morph_t feature) nogil:
    return feature > begin_Echo and feature < end_Echo

cdef int is_foreign_feature(univ_morph_t feature) nogil:
    return feature > begin_Foreign and feature < end_Foreign

cdef int is_gender_feature(univ_morph_t feature) nogil:
    return feature > begin_Gender and feature < end_Gender

cdef int is_hyph_feature(univ_morph_t feature) nogil:
    return feature > begin_Hyph and feature < end_Hyph

cdef int is_inf_form_feature(univ_morph_t feature) nogil:
    return feature > begin_InfForm and feature < end_InfForm

cdef int is_mood_feature(univ_morph_t feature) nogil:
    return feature > begin_Mood and feature < end_Mood

cdef int is_negative_feature(univ_morph_t feature) nogil:
    return feature > begin_Negative and feature < end_Negative

cdef int is_number_feature(univ_morph_t feature) nogil:
    return feature > begin_Number and feature < end_Number

cdef int is_name_type_feature(univ_morph_t feature) nogil:
    return feature > begin_NameType and feature < end_NameType

cdef int is_num_form_feature(univ_morph_t feature) nogil:
    return feature > begin_NumForm and feature < end_NumForm

cdef int is_num_type_feature(univ_morph_t feature) nogil:
    return feature > begin_NumType and feature < end_NumType

cdef int is_num_value_feature(univ_morph_t feature) nogil:
    return feature > begin_NumValue and feature < end_NumValue

cdef int is_part_form_feature(univ_morph_t feature) nogil:
    return feature > begin_PartForm and feature < end_PartForm

cdef int is_part_type_feature(univ_morph_t feature) nogil:
    return feature > begin_PartType and feature < end_PartType

cdef int is_person_feature(univ_morph_t feature) nogil:
    return feature > begin_Person and feature < end_Person

cdef int is_polite_feature(univ_morph_t feature) nogil:
    return feature > begin_Polite and feature < end_Polite

cdef int is_polarity_feature(univ_morph_t feature) nogil:
    return feature > begin_Polarity and feature < end_Polarity

cdef int is_poss_feature(univ_morph_t feature) nogil:
    return feature > begin_Poss and feature < end_Poss

cdef int is_prefix_feature(univ_morph_t feature) nogil:
    return feature > begin_Prefix and feature < end_Prefix

cdef int is_prep_case_feature(univ_morph_t feature) nogil:
    return feature > begin_PrepCase and feature < end_PrepCase

cdef int is_pron_type_feature(univ_morph_t feature) nogil:
    return feature > begin_PronType and feature < end_PronType

cdef int is_punct_side_feature(univ_morph_t feature) nogil:
    return feature > begin_PunctSide and feature < end_PunctSide

cdef int is_punct_type_feature(univ_morph_t feature) nogil:
    return feature > begin_PunctType and feature < end_PunctType

cdef int is_reflex_feature(univ_morph_t feature) nogil:
    return feature > begin_Reflex and feature < end_Reflex

cdef int is_style_feature(univ_morph_t feature) nogil:
    return feature > begin_Style and feature < end_Style

cdef int is_style_variant_feature(univ_morph_t feature) nogil:
    return feature > begin_StyleVariant and feature < end_StyleVariant

cdef int is_tense_feature(univ_morph_t feature) nogil:
    return feature > begin_Tense and feature < end_Tense

cdef int is_verb_form_feature(univ_morph_t feature) nogil:
    return feature > begin_VerbForm and feature < end_VerbForm

cdef int is_voice_feature(univ_morph_t feature) nogil:
    return feature > begin_Voice and feature < end_Voice

cdef int is_verb_type_feature(univ_morph_t feature) nogil:
    return feature > begin_VerbType and feature < end_VerbType


# Maps feature names of the form "<Category>_<value>" to their enum values.
# Keys must not carry stray whitespace: _normalize_props() builds lookup names
# with '%s_%s', and the names are exported to the module namespace below.
IDS = {
    "Animacy_anim": Animacy_anim,
    "Animacy_inan": Animacy_inan,
    "Animacy_hum": Animacy_hum,  # U20
    "Animacy_nhum": Animacy_nhum,
    "Aspect_freq": Aspect_freq,
    "Aspect_imp": Aspect_imp,
    "Aspect_mod": Aspect_mod,
    "Aspect_none": Aspect_none,
    "Aspect_perf": Aspect_perf,
    "Case_abe": Case_abe,
    "Case_abl": Case_abl,
    "Case_abs": Case_abs,
    "Case_acc": Case_acc,
    "Case_ade": Case_ade,
    "Case_all": Case_all,
    "Case_cau": Case_cau,
    "Case_com": Case_com,
    "Case_dat": Case_dat,
    "Case_del": Case_del,
    "Case_dis": Case_dis,
    "Case_ela": Case_ela,
    "Case_ess": Case_ess,
    "Case_gen": Case_gen,
    "Case_ill": Case_ill,
    "Case_ine": Case_ine,
    "Case_ins": Case_ins,
    "Case_loc": Case_loc,
    "Case_lat": Case_lat,
    "Case_nom": Case_nom,
    "Case_par": Case_par,
    "Case_sub": Case_sub,
    "Case_sup": Case_sup,
    "Case_tem": Case_tem,
    "Case_ter": Case_ter,
    "Case_tra": Case_tra,
    "Case_voc": Case_voc,
    "Definite_two": Definite_two,
    "Definite_def": Definite_def,
    "Definite_red": Definite_red,
    "Definite_cons": Definite_cons,  # U20
    "Definite_ind": Definite_ind,
    "Degree_cmp": Degree_cmp,
    "Degree_comp": Degree_comp,
    "Degree_none": Degree_none,
    "Degree_pos": Degree_pos,
    "Degree_sup": Degree_sup,
    "Degree_abs": Degree_abs,
    "Degree_com": Degree_com,
    "Degree_dim": Degree_dim,  # du
    "Gender_com": Gender_com,
    "Gender_fem": Gender_fem,
    "Gender_masc": Gender_masc,
    "Gender_neut": Gender_neut,
    "Mood_cnd": Mood_cnd,
    "Mood_imp": Mood_imp,
    "Mood_ind": Mood_ind,
    "Mood_n": Mood_n,
    "Mood_pot": Mood_pot,
    "Mood_sub": Mood_sub,
    "Mood_opt": Mood_opt,
    "Negative_neg": Negative_neg,
    "Negative_pos": Negative_pos,
    "Negative_yes": Negative_yes,
    "Polarity_neg": Polarity_neg,  # U20
    "Polarity_pos": Polarity_pos,  # U20
    "Number_com": Number_com,
    "Number_dual": Number_dual,
    "Number_none": Number_none,
    "Number_plur": Number_plur,
    "Number_sing": Number_sing,
    "Number_ptan": Number_ptan,  # bg
    "Number_count": Number_count,  # bg
    "NumType_card": NumType_card,
    "NumType_dist": NumType_dist,
    "NumType_frac": NumType_frac,
    "NumType_gen": NumType_gen,
    "NumType_mult": NumType_mult,
    "NumType_none": NumType_none,
    "NumType_ord": NumType_ord,
    "NumType_sets": NumType_sets,
    "Person_one": Person_one,
    "Person_two": Person_two,
    "Person_three": Person_three,
    "Person_none": Person_none,
    "Poss_yes": Poss_yes,
    "PronType_advPart": PronType_advPart,
    "PronType_art": PronType_art,
    "PronType_default": PronType_default,
    "PronType_dem": PronType_dem,
    "PronType_ind": PronType_ind,
    "PronType_int": PronType_int,
    "PronType_neg": PronType_neg,
    "PronType_prs": PronType_prs,
    "PronType_rcp": PronType_rcp,
    "PronType_rel": PronType_rel,
    "PronType_tot": PronType_tot,
    "PronType_clit": PronType_clit,
    "PronType_exc": PronType_exc,  # es, ca, it, fa,
    "Reflex_yes": Reflex_yes,
    "Tense_fut": Tense_fut,
    "Tense_imp": Tense_imp,
    "Tense_past": Tense_past,
    "Tense_pres": Tense_pres,
    "VerbForm_fin": VerbForm_fin,
    "VerbForm_ger": VerbForm_ger,
    "VerbForm_inf": VerbForm_inf,
    "VerbForm_none": VerbForm_none,
    "VerbForm_part": VerbForm_part,
    "VerbForm_partFut": VerbForm_partFut,
    "VerbForm_partPast": VerbForm_partPast,
    "VerbForm_partPres": VerbForm_partPres,
    "VerbForm_sup": VerbForm_sup,
    "VerbForm_trans": VerbForm_trans,
    "VerbForm_conv": VerbForm_conv,  # U20
    "VerbForm_gdv": VerbForm_gdv,  # la,
    "Voice_act": Voice_act,
    "Voice_cau": Voice_cau,
    "Voice_pass": Voice_pass,
    "Voice_mid": Voice_mid,  # gkc,
    "Voice_int": Voice_int,  # hb,
    "Abbr_yes": Abbr_yes,  # cz, fi, sl, U,
    "AdpType_prep": AdpType_prep,  # cz, U,
    "AdpType_post": AdpType_post,  # U,
    "AdpType_voc": AdpType_voc,  # cz,
    "AdpType_comprep": AdpType_comprep,  # cz,
    "AdpType_circ": AdpType_circ,  # U,
    "AdvType_man": AdvType_man,
    "AdvType_loc": AdvType_loc,
    "AdvType_tim": AdvType_tim,
    "AdvType_deg": AdvType_deg,
    "AdvType_cau": AdvType_cau,
    "AdvType_mod": AdvType_mod,
    "AdvType_sta": AdvType_sta,
    "AdvType_ex": AdvType_ex,
    "AdvType_adadj": AdvType_adadj,
    "ConjType_oper": ConjType_oper,  # cz, U,
    "ConjType_comp": ConjType_comp,  # cz, U,
    "Connegative_yes": Connegative_yes,  # fi,
    "Derivation_minen": Derivation_minen,  # fi,
    "Derivation_sti": Derivation_sti,  # fi,
    "Derivation_inen": Derivation_inen,  # fi,
    "Derivation_lainen": Derivation_lainen,  # fi,
    "Derivation_ja": Derivation_ja,  # fi,
    "Derivation_ton": Derivation_ton,  # fi,
    "Derivation_vs": Derivation_vs,  # fi,
    "Derivation_ttain": Derivation_ttain,  # fi,
    "Derivation_ttaa": Derivation_ttaa,  # fi,
    "Echo_rdp": Echo_rdp,  # U,
    "Echo_ech": Echo_ech,  # U,
    "Foreign_foreign": Foreign_foreign,  # cz, fi, U,
    "Foreign_fscript": Foreign_fscript,  # cz, fi, U,
    "Foreign_tscript": Foreign_tscript,  # cz, U,
    "Foreign_yes": Foreign_yes,  # sl,
    "Gender_dat_masc": Gender_dat_masc,  # bq, U,
    "Gender_dat_fem": Gender_dat_fem,  # bq, U,
    "Gender_erg_masc": Gender_erg_masc,  # bq,
    "Gender_erg_fem": Gender_erg_fem,  # bq,
    "Gender_psor_masc": Gender_psor_masc,  # cz, sl, U,
    "Gender_psor_fem": Gender_psor_fem,  # cz, sl, U,
    "Gender_psor_neut": Gender_psor_neut,  # sl,
    "Hyph_yes": Hyph_yes,  # cz, U,
    "InfForm_one": InfForm_one,  # fi,
    "InfForm_two": InfForm_two,  # fi,
    "InfForm_three": InfForm_three,  # fi,
    "NameType_geo": NameType_geo,  # U, cz,
    "NameType_prs": NameType_prs,  # U, cz,
    "NameType_giv": NameType_giv,  # U, cz,
    "NameType_sur": NameType_sur,  # U, cz,
    "NameType_nat": NameType_nat,  # U, cz,
    "NameType_com": NameType_com,  # U, cz,
    "NameType_pro": NameType_pro,  # U, cz,
    "NameType_oth": NameType_oth,  # U, cz,
    "NounType_com": NounType_com,  # U,
    "NounType_prop": NounType_prop,  # U,
    "NounType_class": NounType_class,  # U,
    "Number_abs_sing": Number_abs_sing,  # bq, U,
    "Number_abs_plur": Number_abs_plur,  # bq, U,
    "Number_dat_sing": Number_dat_sing,  # bq, U,
    "Number_dat_plur": Number_dat_plur,  # bq, U,
    "Number_erg_sing": Number_erg_sing,  # bq, U,
    "Number_erg_plur": Number_erg_plur,  # bq, U,
    "Number_psee_sing": Number_psee_sing,  # U,
    "Number_psee_plur": Number_psee_plur,  # U,
    "Number_psor_sing": Number_psor_sing,  # cz, fi, sl, U,
    "Number_psor_plur": Number_psor_plur,  # cz, fi, sl, U,
    "NumForm_digit": NumForm_digit,  # cz, sl, U,
    "NumForm_roman": NumForm_roman,  # cz, sl, U,
    "NumForm_word": NumForm_word,  # cz, sl, U,
    "NumValue_one": NumValue_one,  # cz, U,
    "NumValue_two": NumValue_two,  # cz, U,
    "NumValue_three": NumValue_three,  # cz, U,
    "PartForm_pres": PartForm_pres,  # fi,
    "PartForm_past": PartForm_past,  # fi,
    "PartForm_agt": PartForm_agt,  # fi,
    "PartForm_neg": PartForm_neg,  # fi,
    "PartType_mod": PartType_mod,  # U,
    "PartType_emp": PartType_emp,  # U,
    "PartType_res": PartType_res,  # U,
    "PartType_inf": PartType_inf,  # U,
    "PartType_vbp": PartType_vbp,  # U,
    "Person_abs_one": Person_abs_one,  # bq, U,
    "Person_abs_two": Person_abs_two,  # bq, U,
    "Person_abs_three": Person_abs_three,  # bq, U,
    "Person_dat_one": Person_dat_one,  # bq, U,
    "Person_dat_two": Person_dat_two,  # bq, U,
    "Person_dat_three": Person_dat_three,  # bq, U,
    "Person_erg_one": Person_erg_one,  # bq, U,
    "Person_erg_two": Person_erg_two,  # bq, U,
    "Person_erg_three": Person_erg_three,  # bq, U,
    "Person_psor_one": Person_psor_one,  # fi, U,
    "Person_psor_two": Person_psor_two,  # fi, U,
    "Person_psor_three": Person_psor_three,  # fi, U,
    "Polite_inf": Polite_inf,  # bq, U,
    "Polite_pol": Polite_pol,  # bq, U,
    "Polite_abs_inf": Polite_abs_inf,  # bq, U,
    "Polite_abs_pol": Polite_abs_pol,  # bq, U,
    "Polite_erg_inf": Polite_erg_inf,  # bq, U,
    "Polite_erg_pol": Polite_erg_pol,  # bq, U,
    "Polite_dat_inf": Polite_dat_inf,  # bq, U,
    "Polite_dat_pol": Polite_dat_pol,  # bq, U,
    "Prefix_yes": Prefix_yes,  # U,
    "PrepCase_npr": PrepCase_npr,  # cz,
    "PrepCase_pre": PrepCase_pre,  # U,
    "PunctSide_ini": PunctSide_ini,  # U,
    "PunctSide_fin": PunctSide_fin,  # U,
    "PunctType_peri": PunctType_peri,  # U,
    "PunctType_qest": PunctType_qest,  # U,
    "PunctType_excl": PunctType_excl,  # U,
    "PunctType_quot": PunctType_quot,  # U,
    "PunctType_brck": PunctType_brck,  # U,
    "PunctType_comm": PunctType_comm,  # U,
    "PunctType_colo": PunctType_colo,  # U,
    "PunctType_semi": PunctType_semi,  # U,
    "PunctType_dash": PunctType_dash,  # U,
    "Style_arch": Style_arch,  # cz, fi, U,
    "Style_rare": Style_rare,  # cz, fi, U,
    "Style_poet": Style_poet,  # cz, U,
    "Style_norm": Style_norm,  # cz, U,
    "Style_coll": Style_coll,  # cz, U,
    "Style_vrnc": Style_vrnc,  # cz, U,
    "Style_sing": Style_sing,  # cz, U,
    "Style_expr": Style_expr,  # cz, U,
    "Style_derg": Style_derg,  # cz, U,
    "Style_vulg": Style_vulg,  # cz, U,
    "Style_yes": Style_yes,  # fi, U,
    "StyleVariant_styleShort": StyleVariant_styleShort,  # cz,
    "StyleVariant_styleBound": StyleVariant_styleBound,  # cz, sl,
    "VerbType_aux": VerbType_aux,  # U,
    "VerbType_cop": VerbType_cop,  # U,
    "VerbType_mod": VerbType_mod,  # U,
    "VerbType_light": VerbType_light,  # U,
}


# Feature names ordered by their enum value.
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
locals().update(IDS)