2015-08-28 00:02:33 +00:00
|
|
|
from cymem.cymem cimport Pool
|
|
|
|
from preshed.maps cimport PreshMapArray
|
|
|
|
from libc.stdint cimport uint64_t
|
|
|
|
|
2015-08-26 17:17:35 +00:00
|
|
|
from .structs cimport TokenC
|
2015-08-27 07:16:11 +00:00
|
|
|
from .strings cimport StringStore
|
2015-08-28 00:02:33 +00:00
|
|
|
from .typedefs cimport attr_t
|
|
|
|
from .parts_of_speech cimport univ_pos_t
|
|
|
|
|
2015-10-10 11:10:58 +00:00
|
|
|
from . cimport symbols
|
2015-08-28 00:02:33 +00:00
|
|
|
|
|
|
|
# C-level record describing a single tag.
#
# Field order and types are part of the struct's binary layout; do not
# reorder them.
cdef struct RichTagC:
    # 64-bit morphology value.  Morphology.assign_feature() operates on a
    # `uint64_t*` of this kind.
    # NOTE(review): presumably a bit field of univ_morph_t features --
    # confirm against the implementation file.
    uint64_t morph
    # Integer identifier of the tag.
    # NOTE(review): presumably the tag's index into Morphology.rich_tags --
    # confirm against the implementation file.
    int id
    # Part-of-speech value for the tag (type from .parts_of_speech).
    univ_pos_t pos
    # Attribute value naming the tag.
    # NOTE(review): presumably a StringStore ID -- confirm.
    attr_t name
|
|
|
|
|
|
|
|
|
|
|
|
# C-level record pairing a tag with a lemma.
#
# NOTE(review): presumably the per-token result that Morphology stores in
# its cache -- confirm against the implementation file.
cdef struct MorphAnalysisC:
    # The tag, embedded by value (not a pointer).
    RichTagC tag
    # Attribute value for the lemma.
    # NOTE(review): presumably a StringStore ID -- confirm.
    attr_t lemma
|
2015-08-26 17:17:35 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Declaration of the Morphology extension type.
#
# Only the attribute layout and C-level method signatures are declared
# here; the implementation lives in the matching implementation file.
cdef class Morphology:
    # Memory pool (cymem).  Read-only from Python.
    # NOTE(review): presumably owns the `rich_tags` allocation -- confirm.
    cdef readonly Pool mem
    # String store.  Read-only from Python.
    cdef readonly StringStore strings
    # Lemmatizer object; readable and writable from Python.
    cdef public object lemmatizer
    # Tag map object.  Read-only from Python.
    cdef readonly object tag_map
    # The following three are untyped Python objects, readable and writable
    # from Python.
    # NOTE(review): presumably the tag count, a tag-name -> index mapping,
    # and the ordered tag names respectively -- confirm.
    cdef public object n_tags
    cdef public object reverse_index
    cdef public object tag_names

    # C-only state (no Python-level access is declared for these).
    # Pointer to RichTagC records.
    cdef RichTagC* rich_tags
    # preshed map array.
    # NOTE(review): presumably caches analyses per tag -- confirm.
    cdef PreshMapArray _cache

    # Both methods return a C int and use `except -1`: a return value of -1
    # tells Cython that a Python exception was raised and must propagate.

    # Assign `tag` (an untyped Python object) to the token that `token`
    # points at.
    cdef int assign_tag(self, TokenC* token, tag) except -1

    # Update the 64-bit morphology value that `morph` points at for the
    # given `feature` / `value` (both untyped Python objects).
    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
|
|
|
|
|
2015-08-26 17:17:35 +00:00
|
|
|
|
2015-10-10 11:10:58 +00:00
|
|
|
# Enumeration of morphological feature values, named `<Feature>_<value>`.
#
# Numbering: NIL is 0 and Animacy_anim is pinned to symbols.Animacy_anim.
# Every later member has no explicit value, so each one is the previous
# member plus one.  Member ORDER is therefore significant: inserting,
# removing or reordering a member renumbers everything after it.
# NOTE(review): presumably this order must mirror the corresponding run of
# IDs in the `symbols` module -- confirm before editing.
#
# Trailing comments such as `# cz, fi, U` are kept verbatim from the
# original.
# NOTE(review): they look like codes for the languages / tag sets that use
# the value -- confirm.
cpdef enum univ_morph_t:
    NIL = 0
    Animacy_anim = symbols.Animacy_anim
    Animacy_inam
    Aspect_freq
    Aspect_imp
    Aspect_mod
    Aspect_none
    Aspect_perf
    Case_abe
    Case_abl
    Case_abs
    Case_acc
    Case_ade
    Case_all
    Case_cau
    Case_com
    Case_dat
    Case_del
    Case_dis
    Case_ela
    Case_ess
    Case_gen
    Case_ill
    Case_ine
    Case_ins
    Case_loc
    Case_lat
    Case_nom
    Case_par
    Case_sub
    Case_sup
    Case_tem
    Case_ter
    Case_tra
    Case_voc
    Definite_two
    Definite_def
    Definite_red
    Definite_ind
    Degree_cmp
    Degree_comp
    Degree_none
    Degree_pos
    Degree_sup
    Degree_abs
    Degree_com
    Degree_dim # du
    Gender_com
    Gender_fem
    Gender_masc
    Gender_neut
    Mood_cnd
    Mood_imp
    Mood_ind
    Mood_n
    Mood_pot
    Mood_sub
    Mood_opt
    Negative_neg
    Negative_pos
    Negative_yes
    Number_com
    Number_dual
    Number_none
    Number_plur
    Number_sing
    Number_ptan # bg
    Number_count # bg
    NumType_card
    NumType_dist
    NumType_frac
    NumType_gen
    NumType_mult
    NumType_none
    NumType_ord
    NumType_sets
    Person_one
    Person_two
    Person_three
    Person_none
    Poss_yes
    PronType_advPart
    PronType_art
    PronType_default
    PronType_dem
    PronType_ind
    PronType_int
    PronType_neg
    PronType_prs
    PronType_rcp
    PronType_rel
    PronType_tot
    PronType_clit
    PronType_exc # es, ca, it, fa
    Reflex_yes
    Tense_fut
    Tense_imp
    Tense_past
    Tense_pres
    VerbForm_fin
    VerbForm_ger
    VerbForm_inf
    VerbForm_none
    VerbForm_part
    VerbForm_partFut
    VerbForm_partPast
    VerbForm_partPres
    VerbForm_sup
    VerbForm_trans
    VerbForm_gdv # la
    Voice_act
    Voice_cau
    Voice_pass
    Voice_mid # gkc
    Voice_int # hb
    Abbr_yes # cz, fi, sl, U
    AdpType_prep # cz, U
    AdpType_post # U
    AdpType_voc # cz
    AdpType_comprep # cz
    AdpType_circ # U
    AdvType_man
    AdvType_loc
    AdvType_tim
    AdvType_deg
    AdvType_cau
    AdvType_mod
    AdvType_sta
    AdvType_ex
    AdvType_adadj
    ConjType_oper # cz, U
    ConjType_comp # cz, U
    Connegative_yes # fi
    Derivation_minen # fi
    Derivation_sti # fi
    Derivation_inen # fi
    Derivation_lainen # fi
    Derivation_ja # fi
    Derivation_ton # fi
    Derivation_vs # fi
    Derivation_ttain # fi
    Derivation_ttaa # fi
    Echo_rdp # U
    Echo_ech # U
    Foreign_foreign # cz, fi, U
    Foreign_fscript # cz, fi, U
    Foreign_tscript # cz, U
    Foreign_yes # sl
    Gender_dat_masc # bq, U
    Gender_dat_fem # bq, U
    Gender_erg_masc # bq
    Gender_erg_fem # bq
    Gender_psor_masc # cz, sl, U
    Gender_psor_fem # cz, sl, U
    Gender_psor_neut # sl
    Hyph_yes # cz, U
    InfForm_one # fi
    InfForm_two # fi
    InfForm_three # fi
    NameType_geo # U, cz
    NameType_prs # U, cz
    NameType_giv # U, cz
    NameType_sur # U, cz
    NameType_nat # U, cz
    NameType_com # U, cz
    NameType_pro # U, cz
    NameType_oth # U, cz
    NounType_com # U
    NounType_prop # U
    NounType_class # U
    Number_abs_sing # bq, U
    Number_abs_plur # bq, U
    Number_dat_sing # bq, U
    Number_dat_plur # bq, U
    Number_erg_sing # bq, U
    Number_erg_plur # bq, U
    Number_psee_sing # U
    Number_psee_plur # U
    Number_psor_sing # cz, fi, sl, U
    Number_psor_plur # cz, fi, sl, U
    NumForm_digit # cz, sl, U
    NumForm_roman # cz, sl, U
    NumForm_word # cz, sl, U
    NumValue_one # cz, U
    NumValue_two # cz, U
    NumValue_three # cz, U
    PartForm_pres # fi
    PartForm_past # fi
    PartForm_agt # fi
    PartForm_neg # fi
    PartType_mod # U
    PartType_emp # U
    PartType_res # U
    PartType_inf # U
    PartType_vbp # U
    Person_abs_one # bq, U
    Person_abs_two # bq, U
    Person_abs_three # bq, U
    Person_dat_one # bq, U
    Person_dat_two # bq, U
    Person_dat_three # bq, U
    Person_erg_one # bq, U
    Person_erg_two # bq, U
    Person_erg_three # bq, U
    Person_psor_one # fi, U
    Person_psor_two # fi, U
    Person_psor_three # fi, U
    Polite_inf # bq, U
    Polite_pol # bq, U
    Polite_abs_inf # bq, U
    Polite_abs_pol # bq, U
    Polite_erg_inf # bq, U
    Polite_erg_pol # bq, U
    Polite_dat_inf # bq, U
    Polite_dat_pol # bq, U
    Prefix_yes # U
    PrepCase_npr # cz
    PrepCase_pre # U
    PunctSide_ini # U
    PunctSide_fin # U
    PunctType_peri # U
    PunctType_qest # U
    PunctType_excl # U
    PunctType_quot # U
    PunctType_brck # U
    PunctType_comm # U
    PunctType_colo # U
    PunctType_semi # U
    PunctType_dash # U
    Style_arch # cz, fi, U
    Style_rare # cz, fi, U
    Style_poet # cz, U
    Style_norm # cz, U
    Style_coll # cz, U
    Style_vrnc # cz, U
    Style_sing # cz, U
    Style_expr # cz, U
    Style_derg # cz, U
    Style_vulg # cz, U
    Style_yes # fi, U
    StyleVariant_styleShort # cz
    StyleVariant_styleBound # cz, sl
    VerbType_aux # U
    VerbType_cop # U
    VerbType_mod # U
    VerbType_light # U
|
2015-08-26 17:17:35 +00:00
|
|
|
|
|
|
|
|