From 85ce36ab114ec155148dd3878a41dbe3c198b291 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 7 Oct 2015 00:39:50 +1100 Subject: [PATCH 01/22] * Refactor symbols, so that frequency rank can be derived from the orth id of a word. --- bin/init_model.py | 5 +++ setup.py | 3 +- spacy/attrs.pxd | 4 +- spacy/attrs.pyx | 90 +++++++++++++++++++++++++++++++++++++++ spacy/matcher.pyx | 2 +- spacy/parts_of_speech.pxd | 45 ++++++++++---------- spacy/vocab.pyx | 15 +++++++ 7 files changed, 138 insertions(+), 26 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index 72d7a3aae..6e44fd444 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): probs[word] = oov_prob lexicon = [] + for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): + # First encode the strings into the StringStore. This way, we can map + # the orth IDs to frequency ranks + orth = vocab.strings[word] + # Now actually load the vocab for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): lexeme = vocab[word] lexeme.prob = prob diff --git a/setup.py b/setup.py index 3036db94c..fb6a5b718 100644 --- a/setup.py +++ b/setup.py @@ -165,7 +165,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', 'spacy.cfile', 'spacy.matcher', - 'spacy.syntax.ner'] + 'spacy.syntax.ner', + 'spacy.symbols'] if __name__ == '__main__': diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index c810762ef..d0f476dcd 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -1,5 +1,6 @@ # Reserve 64 values for flag features cpdef enum attr_id_t: + NULL_ATTR IS_ALPHA IS_ASCII IS_DIGIT @@ -14,8 +15,7 @@ cpdef enum attr_id_t: IS_STOP IS_OOV - FLAG13 = 13 - FLAG14 + FLAG14 = 14 FLAG15 FLAG16 FLAG17 diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 
e69de29bb..8ce0f7a17 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -0,0 +1,90 @@ +ATTR_IDS = { + "NULL_ATTR": NULL_ATTR, + "IS_ALPHA": IS_ALPHA, + "IS_ASCII": IS_ASCII, + "IS_DIGIT": IS_DIGIT, + "IS_LOWER": IS_LOWER, + "IS_PUNCT": IS_PUNCT, + "IS_SPACE": IS_SPACE, + "IS_TITLE": IS_TITLE, + "IS_UPPER": IS_UPPER, + "LIKE_URL": LIKE_URL, + "LIKE_NUM": LIKE_NUM, + "LIKE_EMAIL": LIKE_EMAIL, + "IS_STOP": IS_STOP, + "IS_OOV": IS_OOV, + + "FLAG14": FLAG14, + "FLAG15": FLAG15, + "FLAG16": FLAG16, + "FLAG17": FLAG17, + "FLAG18": FLAG18, + "FLAG19": FLAG19, + "FLAG20": FLAG20, + "FLAG21": FLAG21, + "FLAG22": FLAG22, + "FLAG23": FLAG23, + "FLAG24": FLAG24, + "FLAG25": FLAG25, + "FLAG26": FLAG26, + "FLAG27": FLAG27, + "FLAG28": FLAG28, + "FLAG29": FLAG29, + "FLAG30": FLAG30, + "FLAG31": FLAG31, + "FLAG32": FLAG32, + "FLAG33": FLAG33, + "FLAG34": FLAG34, + "FLAG35": FLAG35, + "FLAG36": FLAG36, + "FLAG37": FLAG37, + "FLAG38": FLAG38, + "FLAG39": FLAG39, + "FLAG40": FLAG40, + "FLAG41": FLAG41, + "FLAG42": FLAG42, + "FLAG43": FLAG43, + "FLAG44": FLAG44, + "FLAG45": FLAG45, + "FLAG46": FLAG46, + "FLAG47": FLAG47, + "FLAG48": FLAG48, + "FLAG49": FLAG49, + "FLAG50": FLAG50, + "FLAG51": FLAG51, + "FLAG52": FLAG52, + "FLAG53": FLAG53, + "FLAG54": FLAG54, + "FLAG55": FLAG55, + "FLAG56": FLAG56, + "FLAG57": FLAG57, + "FLAG58": FLAG58, + "FLAG59": FLAG59, + "FLAG60": FLAG60, + "FLAG61": FLAG61, + "FLAG62": FLAG62, + "FLAG63": FLAG63, + + "ID": ID, + "ORTH": ORTH, + "LOWER": LOWER, + "NORM": NORM, + "SHAPE": SHAPE, + "PREFIX": PREFIX, + "SUFFIX": SUFFIX, + + "LENGTH": LENGTH, + "CLUSTER": CLUSTER, + "LEMMA": LEMMA, + "POS": POS, + "TAG": TAG, + "DEP": DEP, + "ENT_IOB": ENT_IOB, + "ENT_TYPE": ENT_TYPE, + "HEAD": HEAD, + "SPACY": SPACY, + "PROB": PROB, +} + +# ATTR IDs, in order of the symbol +ATTR_NAMES = [key for key, value in sorted(ATTR_IDS.items(), key=lambda item: item[1])] diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index afafd3ddb..3ee825932 100644 --- 
a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -15,7 +15,7 @@ from libcpp.vector cimport vector from murmurhash.mrmr cimport hash64 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE -from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 +from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index e410c6971..17e349435 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -1,23 +1,24 @@ -# Google universal tag set +from .symbols cimport * + + cpdef enum univ_pos_t: - NO_TAG - ADJ - ADP - ADV - AUX - CONJ - DET - INTJ - NOUN - NUM - PART - PRON - PROPN - PUNCT - SCONJ - SYM - VERB - X - EOL - SPACE - N_UNIV_TAGS + NO_TAG = EMPTY_VALUE + ADJ = POS_adj + ADP = POS_adp + ADV = POS_adv + AUX = POS_aux + CONJ = POS_conj + DET = POS_det + INTJ = POS_intj + NOUN = POS_noun + NUM = POS_num + PART = POS_part + PRON = POS_pron + PROPN = POS_propn + PUNCT = POS_punct + SCONJ = POS_sconj + SYM = POS_sym + VERB = POS_verb + X = POS_x + EOL = POS_eol + SPACE = POS_space diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d79da8a79..caf3045f5 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -67,6 +67,21 @@ cdef class Vocab: self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() + # Load strings in a special order, so that we have an onset number for + # the vocabulary. This way, when words are added in order, the orth ID + # is the frequency rank of the word, plus a certain offset. The structural + # strings are loaded first, because the vocab is open-class, and these + # symbols are closed class. 
+ #for attr_name in sorted(ATTR_NAMES.keys()): + # _ = self.strings[attr_name] + #for univ_pos_name in sorted(UNIV_POS_NAMES.keys()): + # _ = self.strings[pos_name] + #for morph_name in sorted(UNIV_MORPH_NAMES.keys()): + # _ = self.strings[morph_name] + #for entity_type_name in sorted(ENTITY_TYPES.keys()): + # _ = self.strings[entity_type_name] + #for tag_name in sorted(TAG_MAP.keys()): + # _ = self.strings[tag_name] self.get_lex_attr = get_lex_attr self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.serializer_freqs = serializer_freqs From 5c24ad3f5c8751eefa4a5acca51f524bf00d7a24 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 7 Oct 2015 00:40:22 +1100 Subject: [PATCH 02/22] * Whitespace --- lang_data/en/morphs.json | 1 - 1 file changed, 1 deletion(-) diff --git a/lang_data/en/morphs.json b/lang_data/en/morphs.json index 917cbc759..059381b27 100644 --- a/lang_data/en/morphs.json +++ b/lang_data/en/morphs.json @@ -56,5 +56,4 @@ "was": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"}, "were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"} } - } From 10a4a843eac652a05a3bd8fe3215dea2d0824343 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 7 Oct 2015 00:41:17 +1100 Subject: [PATCH 03/22] * Enumerate all symbols in one file --- spacy/symbols.pxd | 421 +++++++++++++++++++++++++++++++++++++++++++++ spacy/symbols.pyx | 424 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 845 insertions(+) create mode 100644 spacy/symbols.pxd create mode 100644 spacy/symbols.pyx diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd new file mode 100644 index 000000000..e8ddeaa8f --- /dev/null +++ b/spacy/symbols.pxd @@ -0,0 +1,421 @@ +cpdef enum symbol_t: + EMPTY_VALUE + Attr_is_alpha + Attr_is_ascii + Attr_is_digit + Attr_is_lower + Attr_is_punct + Attr_is_space + Attr_is_title + Attr_is_upper + Attr_like_url + Attr_like_num + Attr_like_email + Attr_is_stop + Attr_is_oov + + Attr_flag14 + Attr_flag15 + 
Attr_flag16 + Attr_flag17 + Attr_flag18 + Attr_flag19 + Attr_flag20 + Attr_flag21 + Attr_flag22 + Attr_flag23 + Attr_flag24 + Attr_flag25 + Attr_flag26 + Attr_flag27 + Attr_flag28 + Attr_flag29 + Attr_flag30 + Attr_flag31 + Attr_flag32 + Attr_flag33 + Attr_flag34 + Attr_flag35 + Attr_flag36 + Attr_flag37 + Attr_flag38 + Attr_flag39 + Attr_flag40 + Attr_flag41 + Attr_flag42 + Attr_flag43 + Attr_flag44 + Attr_flag45 + Attr_flag46 + Attr_flag47 + Attr_flag48 + Attr_flag49 + Attr_flag50 + Attr_flag51 + Attr_flag52 + Attr_flag53 + Attr_flag54 + Attr_flag55 + Attr_flag56 + Attr_flag57 + Attr_flag58 + Attr_flag59 + Attr_flag60 + Attr_flag61 + Attr_flag62 + Attr_flag63 + + Attr_id + Attr_orth + Attr_lower + Attr_norm + Attr_shape + Attr_prefix + Attr_suffix + + Attr_length + Attr_cluster + Attr_lemma + Attr_pos + Attr_tag + Attr_dep + Attr_ent_iob + Attr_ent_type + Attr_head + Attr_spacy + Attr_prob + + POS_adj + POS_adp + POS_adv + POS_aux + POS_conj + POS_det + POS_intj + POS_noun + POS_num + POS_part + POS_pron + POS_propn + POS_punct + POS_sconj + POS_sym + POS_verb + POS_x + POS_eol + POS_space + + Animacy_anim + Animacy_inam + Aspect_freq + Aspect_imp + Aspect_mod + Aspect_none + Aspect_perf + Case_abe + Case_abl + Case_abs + Case_acc + Case_ade + Case_all + Case_cau + Case_com + Case_dat + Case_del + Case_dis + Case_ela + Case_ess + Case_gen + Case_ill + Case_ine + Case_ins + Case_loc + Case_lat + Case_nom + Case_par + Case_sub + Case_sup + Case_tem + Case_ter + Case_tra + Case_voc + Definite_two + Definite_def + Definite_red + Definite_ind + Degree_cmp + Degree_comp + Degree_none + Degree_pos + Degree_sup + Degree_abs + Degree_com + Degree_dim # du + Gender_com + Gender_fem + Gender_masc + Gender_neut + Mood_cnd + Mood_imp + Mood_ind + Mood_n + Mood_pot + Mood_sub + Mood_opt + Negative_neg + Negative_pos + Negative_yes + Number_com + Number_dual + Number_none + Number_plur + Number_sing + Number_ptan # bg + Number_count # bg + NumType_card + NumType_dist + 
NumType_frac + NumType_gen + NumType_mult + NumType_none + NumType_ord + NumType_sets + Person_one + Person_two + Person_three + Person_none + Poss_yes + PronType_advPart + PronType_art + PronType_default + PronType_dem + PronType_ind + PronType_int + PronType_neg + PronType_prs + PronType_rcp + PronType_rel + PronType_tot + PronType_clit + PronType_exc # es, ca, it, fa + Reflex_yes + Tense_fut + Tense_imp + Tense_past + Tense_pres + VerbForm_fin + VerbForm_ger + VerbForm_inf + VerbForm_none + VerbForm_part + VerbForm_partFut + VerbForm_partPast + VerbForm_partPres + VerbForm_sup + VerbForm_trans + VerbForm_gdv # la + Voice_act + Voice_cau + Voice_pass + Voice_mid # gkc + Voice_int # hb + Abbr_yes # cz, fi, sl, U + AdpType_prep # cz, U + AdpType_post # U + AdpType_voc # cz + AdpType_comprep # cz + AdpType_circ # U + AdvType_man + AdvType_loc + AdvType_tim + AdvType_deg + AdvType_cau + AdvType_mod + AdvType_sta + AdvType_ex + AdvType_adadj + ConjType_oper # cz, U + ConjType_comp # cz, U + Connegative_yes # fi + Derivation_minen # fi + Derivation_sti # fi + Derivation_inen # fi + Derivation_lainen # fi + Derivation_ja # fi + Derivation_ton # fi + Derivation_vs # fi + Derivation_ttain # fi + Derivation_ttaa # fi + Echo_rdp # U + Echo_ech # U + Foreign_foreign # cz, fi, U + Foreign_fscript # cz, fi, U + Foreign_tscript # cz, U + Foreign_yes # sl + Gender_dat_masc # bq, U + Gender_dat_fem # bq, U + Gender_erg_masc # bq + Gender_erg_fem # bq + Gender_psor_masc # cz, sl, U + Gender_psor_fem # cz, sl, U + Gender_psor_neut # sl + Hyph_yes # cz, U + InfForm_one # fi + InfForm_two # fi + InfForm_three # fi + NameType_geo # U, cz + NameType_prs # U, cz + NameType_giv # U, cz + NameType_sur # U, cz + NameType_nat # U, cz + NameType_com # U, cz + NameType_pro # U, cz + NameType_oth # U, cz + NounType_com # U + NounType_prop # U + NounType_class # U + Number_abs_sing # bq, U + Number_abs_plur # bq, U + Number_dat_sing # bq, U + Number_dat_plur # bq, U + Number_erg_sing # bq, U + 
Number_erg_plur # bq, U + Number_psee_sing # U + Number_psee_plur # U + Number_psor_sing # cz, fi, sl, U + Number_psor_plur # cz, fi, sl, U + NumForm_digit # cz, sl, U + NumForm_roman # cz, sl, U + NumForm_word # cz, sl, U + NumValue_one # cz, U + NumValue_two # cz, U + NumValue_three # cz, U + PartForm_pres # fi + PartForm_past # fi + PartForm_agt # fi + PartForm_neg # fi + PartType_mod # U + PartType_emp # U + PartType_res # U + PartType_inf # U + PartType_vbp # U + Person_abs_one # bq, U + Person_abs_two # bq, U + Person_abs_three # bq, U + Person_dat_one # bq, U + Person_dat_two # bq, U + Person_dat_three # bq, U + Person_erg_one # bq, U + Person_erg_two # bq, U + Person_erg_three # bq, U + Person_psor_one # fi, U + Person_psor_two # fi, U + Person_psor_three # fi, U + Polite_inf # bq, U + Polite_pol # bq, U + Polite_abs_inf # bq, U + Polite_abs_pol # bq, U + Polite_erg_inf # bq, U + Polite_erg_pol # bq, U + Polite_dat_inf # bq, U + Polite_dat_pol # bq, U + Prefix_yes # U + PrepCase_npr # cz + PrepCase_pre # U + PunctSide_ini # U + PunctSide_fin # U + PunctType_peri # U + PunctType_qest # U + PunctType_excl # U + PunctType_quot # U + PunctType_brck # U + PunctType_comm # U + PunctType_colo # U + PunctType_semi # U + PunctType_dash # U + Style_arch # cz, fi, U + Style_rare # cz, fi, U + Style_poet # cz, U + Style_norm # cz, U + Style_coll # cz, U + Style_vrnc # cz, U + Style_sing # cz, U + Style_expr # cz, U + Style_derg # cz, U + Style_vulg # cz, U + Style_yes # fi, U + StyleVariant_styleShort # cz + StyleVariant_styleBound # cz, sl + VerbType_aux # U + VerbType_cop # U + VerbType_mod # U + VerbType_light # U + + Name_person + Name_norp + Name_facility + Name_org + Name_gpe + Name_loc + Name_product + Name_event + Name_work_of_art + Name_language + + Unit_date + Unit_time + Unit_percent + Unit_money + Unit_quantity + Unit_ordinal + Unit_cardinal + + Dep_acomp + Dep_advcl + Dep_advmod + Dep_agent + Dep_amod + Dep_appos + Dep_attr + Dep_aux + Dep_auxpass + Dep_cc 
+ Dep_ccomp + Dep_complm + Dep_conj + Dep_csubj + Dep_csubjpass + Dep_dep + Dep_det + Dep_dobj + Dep_expl + Dep_hmod + Dep_hyph + Dep_infmod + Dep_intj + Dep_iobj + Dep_mark + Dep_meta + Dep_neg + Dep_nmod + Dep_nn + Dep_npadvmod + Dep_nsubj + Dep_nsubjpass + Dep_num + Dep_number + Dep_oprd + Dep_parataxis + Dep_partmod + Dep_pcomp + Dep_pobj + Dep_poss + Dep_possessive + Dep_preconj + Dep_prep + Dep_prt + Dep_punct + Dep_quantmod + Dep_rcmod + Dep_root + Dep_xcomp diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx new file mode 100644 index 000000000..4251fb4ec --- /dev/null +++ b/spacy/symbols.pyx @@ -0,0 +1,424 @@ +SYMBOL_IDS = { + "EMPTY_VALUE": EMPTY_VALUE, + "Attr_is_alpha": Attr_is_alpha, + "Attr_is_ascii": Attr_is_ascii, + "Attr_is_digit": Attr_is_digit, + "Attr_is_lower": Attr_is_lower, + "Attr_is_punct": Attr_is_punct, + "Attr_is_space": Attr_is_space, + "Attr_is_title": Attr_is_title, + "Attr_is_upper": Attr_is_upper, + "Attr_like_url": Attr_like_url, + "Attr_like_num": Attr_like_num, + "Attr_like_email": Attr_like_email, + "Attr_is_stop": Attr_is_stop, + "Attr_is_oov": Attr_is_oov, + + "Attr_flag14": Attr_flag14, + "Attr_flag15": Attr_flag15, + "Attr_flag16": Attr_flag16, + "Attr_flag17": Attr_flag17, + "Attr_flag18": Attr_flag18, + "Attr_flag19": Attr_flag19, + "Attr_flag20": Attr_flag20, + "Attr_flag21": Attr_flag21, + "Attr_flag22": Attr_flag22, + "Attr_flag23": Attr_flag23, + "Attr_flag24": Attr_flag24, + "Attr_flag25": Attr_flag25, + "Attr_flag26": Attr_flag26, + "Attr_flag27": Attr_flag27, + "Attr_flag28": Attr_flag28, + "Attr_flag29": Attr_flag29, + "Attr_flag30": Attr_flag30, + "Attr_flag31": Attr_flag31, + "Attr_flag32": Attr_flag32, + "Attr_flag33": Attr_flag33, + "Attr_flag34": Attr_flag34, + "Attr_flag35": Attr_flag35, + "Attr_flag36": Attr_flag36, + "Attr_flag37": Attr_flag37, + "Attr_flag38": Attr_flag38, + "Attr_flag39": Attr_flag39, + "Attr_flag40": Attr_flag40, + "Attr_flag41": Attr_flag41, + "Attr_flag42": Attr_flag42, + "Attr_flag43": 
Attr_flag43, + "Attr_flag44": Attr_flag44, + "Attr_flag45": Attr_flag45, + "Attr_flag46": Attr_flag46, + "Attr_flag47": Attr_flag47, + "Attr_flag48": Attr_flag48, + "Attr_flag49": Attr_flag49, + "Attr_flag50": Attr_flag50, + "Attr_flag51": Attr_flag51, + "Attr_flag52": Attr_flag52, + "Attr_flag53": Attr_flag53, + "Attr_flag54": Attr_flag54, + "Attr_flag55": Attr_flag55, + "Attr_flag56": Attr_flag56, + "Attr_flag57": Attr_flag57, + "Attr_flag58": Attr_flag58, + "Attr_flag59": Attr_flag59, + "Attr_flag60": Attr_flag60, + "Attr_flag61": Attr_flag61, + "Attr_flag62": Attr_flag62, + "Attr_flag63": Attr_flag63, + + "Attr_id": Attr_id, + "Attr_orth": Attr_orth, + "Attr_lower": Attr_lower, + "Attr_norm": Attr_norm, + "Attr_shape": Attr_shape, + "Attr_prefix": Attr_prefix, + "Attr_suffix": Attr_suffix, + + "Attr_length": Attr_length, + "Attr_cluster": Attr_cluster, + "Attr_lemma": Attr_lemma, + "Attr_pos": Attr_pos, + "Attr_tag": Attr_tag, + "Attr_dep": Attr_dep, + "Attr_ent_iob": Attr_ent_iob, + "Attr_ent_type": Attr_ent_type, + "Attr_head": Attr_head, + "Attr_spacy": Attr_spacy, + "Attr_prob": Attr_prob, + + "POS_adj": POS_adj, + "POS_adp": POS_adp, + "POS_adv": POS_adv, + "POS_aux": POS_aux, + "POS_conj": POS_conj, + "POS_det": POS_det, + "POS_intj": POS_intj, + "POS_noun": POS_noun, + "POS_num": POS_num, + "POS_part": POS_part, + "POS_pron": POS_pron, + "POS_propn": POS_propn, + "POS_punct": POS_punct, + "POS_sconj": POS_sconj, + "POS_sym": POS_sym, + "POS_verb": POS_verb, + "POS_x": POS_x, + "POS_eol": POS_eol, + "POS_space": POS_space, + + "Animacy_anim": Animacy_anim, + "Animacy_inam": Animacy_inam, + "Aspect_freq": Aspect_freq, + "Aspect_imp": Aspect_imp, + "Aspect_mod": Aspect_mod, + "Aspect_none": Aspect_none, + "Aspect_perf": Aspect_perf, + "Case_abe": Case_abe, + "Case_abl": Case_abl, + "Case_abs": Case_abs, + "Case_acc": Case_acc, + "Case_ade": Case_ade, + "Case_all": Case_all, + "Case_cau": Case_cau, + "Case_com": Case_com, + "Case_dat": Case_dat, + 
"Case_del": Case_del, + "Case_dis": Case_dis, + "Case_ela": Case_ela, + "Case_ess": Case_ess, + "Case_gen": Case_gen, + "Case_ill": Case_ill, + "Case_ine": Case_ine, + "Case_ins": Case_ins, + "Case_loc": Case_loc, + "Case_lat": Case_lat, + "Case_nom": Case_nom, + "Case_par": Case_par, + "Case_sub": Case_sub, + "Case_sup": Case_sup, + "Case_tem": Case_tem, + "Case_ter": Case_ter, + "Case_tra": Case_tra, + "Case_voc": Case_voc, + "Definite_two": Definite_two, + "Definite_def": Definite_def, + "Definite_red": Definite_red, + "Definite_ind": Definite_ind, + "Degree_cmp": Degree_cmp, + "Degree_comp": Degree_comp, + "Degree_none": Degree_none, + "Degree_pos": Degree_pos, + "Degree_sup": Degree_sup, + "Degree_abs": Degree_abs, + "Degree_com": Degree_com, + "Degree_dim ": Degree_dim, # du + "Gender_com": Gender_com, + "Gender_fem": Gender_fem, + "Gender_masc": Gender_masc, + "Gender_neut": Gender_neut, + "Mood_cnd": Mood_cnd, + "Mood_imp": Mood_imp, + "Mood_ind": Mood_ind, + "Mood_n": Mood_n, + "Mood_pot": Mood_pot, + "Mood_sub": Mood_sub, + "Mood_opt": Mood_opt, + "Negative_neg": Negative_neg, + "Negative_pos": Negative_pos, + "Negative_yes": Negative_yes, + "Number_com": Number_com, + "Number_dual": Number_dual, + "Number_none": Number_none, + "Number_plur": Number_plur, + "Number_sing": Number_sing, + "Number_ptan ": Number_ptan, # bg + "Number_count ": Number_count, # bg + "NumType_card": NumType_card, + "NumType_dist": NumType_dist, + "NumType_frac": NumType_frac, + "NumType_gen": NumType_gen, + "NumType_mult": NumType_mult, + "NumType_none": NumType_none, + "NumType_ord": NumType_ord, + "NumType_sets": NumType_sets, + "Person_one": Person_one, + "Person_two": Person_two, + "Person_three": Person_three, + "Person_none": Person_none, + "Poss_yes": Poss_yes, + "PronType_advPart": PronType_advPart, + "PronType_art": PronType_art, + "PronType_default": PronType_default, + "PronType_dem": PronType_dem, + "PronType_ind": PronType_ind, + "PronType_int": PronType_int, + 
"PronType_neg": PronType_neg, + "PronType_prs": PronType_prs, + "PronType_rcp": PronType_rcp, + "PronType_rel": PronType_rel, + "PronType_tot": PronType_tot, + "PronType_clit": PronType_clit, + "PronType_exc ": PronType_exc, # es, ca, it, fa, + "Reflex_yes": Reflex_yes, + "Tense_fut": Tense_fut, + "Tense_imp": Tense_imp, + "Tense_past": Tense_past, + "Tense_pres": Tense_pres, + "VerbForm_fin": VerbForm_fin, + "VerbForm_ger": VerbForm_ger, + "VerbForm_inf": VerbForm_inf, + "VerbForm_none": VerbForm_none, + "VerbForm_part": VerbForm_part, + "VerbForm_partFut": VerbForm_partFut, + "VerbForm_partPast": VerbForm_partPast, + "VerbForm_partPres": VerbForm_partPres, + "VerbForm_sup": VerbForm_sup, + "VerbForm_trans": VerbForm_trans, + "VerbForm_gdv ": VerbForm_gdv, # la, + "Voice_act": Voice_act, + "Voice_cau": Voice_cau, + "Voice_pass": Voice_pass, + "Voice_mid ": Voice_mid, # gkc, + "Voice_int ": Voice_int, # hb, + "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, + "AdpType_prep ": AdpType_prep, # cz, U, + "AdpType_post ": AdpType_post, # U, + "AdpType_voc ": AdpType_voc, # cz, + "AdpType_comprep ": AdpType_comprep, # cz, + "AdpType_circ ": AdpType_circ, # U, + "AdvType_man": AdvType_man, + "AdvType_loc": AdvType_loc, + "AdvType_tim": AdvType_tim, + "AdvType_deg": AdvType_deg, + "AdvType_cau": AdvType_cau, + "AdvType_mod": AdvType_mod, + "AdvType_sta": AdvType_sta, + "AdvType_ex": AdvType_ex, + "AdvType_adadj": AdvType_adadj, + "ConjType_oper ": ConjType_oper, # cz, U, + "ConjType_comp ": ConjType_comp, # cz, U, + "Connegative_yes ": Connegative_yes, # fi, + "Derivation_minen ": Derivation_minen, # fi, + "Derivation_sti ": Derivation_sti, # fi, + "Derivation_inen ": Derivation_inen, # fi, + "Derivation_lainen ": Derivation_lainen, # fi, + "Derivation_ja ": Derivation_ja, # fi, + "Derivation_ton ": Derivation_ton, # fi, + "Derivation_vs ": Derivation_vs, # fi, + "Derivation_ttain ": Derivation_ttain, # fi, + "Derivation_ttaa ": Derivation_ttaa, # fi, + "Echo_rdp ": Echo_rdp, # U, 
+ "Echo_ech ": Echo_ech, # U, + "Foreign_foreign ": Foreign_foreign, # cz, fi, U, + "Foreign_fscript ": Foreign_fscript, # cz, fi, U, + "Foreign_tscript ": Foreign_tscript, # cz, U, + "Foreign_yes ": Foreign_yes, # sl, + "Gender_dat_masc ": Gender_dat_masc, # bq, U, + "Gender_dat_fem ": Gender_dat_fem, # bq, U, + "Gender_erg_masc ": Gender_erg_masc, # bq, + "Gender_erg_fem ": Gender_erg_fem, # bq, + "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U, + "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U, + "Gender_psor_neut ": Gender_psor_neut, # sl, + "Hyph_yes ": Hyph_yes, # cz, U, + "InfForm_one ": InfForm_one, # fi, + "InfForm_two ": InfForm_two, # fi, + "InfForm_three ": InfForm_three, # fi, + "NameType_geo ": NameType_geo, # U, cz, + "NameType_prs ": NameType_prs, # U, cz, + "NameType_giv ": NameType_giv, # U, cz, + "NameType_sur ": NameType_sur, # U, cz, + "NameType_nat ": NameType_nat, # U, cz, + "NameType_com ": NameType_com, # U, cz, + "NameType_pro ": NameType_pro, # U, cz, + "NameType_oth ": NameType_oth, # U, cz, + "NounType_com ": NounType_com, # U, + "NounType_prop ": NounType_prop, # U, + "NounType_class ": NounType_class, # U, + "Number_abs_sing ": Number_abs_sing, # bq, U, + "Number_abs_plur ": Number_abs_plur, # bq, U, + "Number_dat_sing ": Number_dat_sing, # bq, U, + "Number_dat_plur ": Number_dat_plur, # bq, U, + "Number_erg_sing ": Number_erg_sing, # bq, U, + "Number_erg_plur ": Number_erg_plur, # bq, U, + "Number_psee_sing ": Number_psee_sing, # U, + "Number_psee_plur ": Number_psee_plur, # U, + "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, + "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, + "NumForm_digit ": NumForm_digit, # cz, sl, U, + "NumForm_roman ": NumForm_roman, # cz, sl, U, + "NumForm_word ": NumForm_word, # cz, sl, U, + "NumValue_one ": NumValue_one, # cz, U, + "NumValue_two ": NumValue_two, # cz, U, + "NumValue_three ": NumValue_three, # cz, U, + "PartForm_pres ": PartForm_pres, # fi, + "PartForm_past ": 
PartForm_past, # fi, + "PartForm_agt ": PartForm_agt, # fi, + "PartForm_neg ": PartForm_neg, # fi, + "PartType_mod ": PartType_mod, # U, + "PartType_emp ": PartType_emp, # U, + "PartType_res ": PartType_res, # U, + "PartType_inf ": PartType_inf, # U, + "PartType_vbp ": PartType_vbp, # U, + "Person_abs_one ": Person_abs_one, # bq, U, + "Person_abs_two ": Person_abs_two, # bq, U, + "Person_abs_three ": Person_abs_three, # bq, U, + "Person_dat_one ": Person_dat_one, # bq, U, + "Person_dat_two ": Person_dat_two, # bq, U, + "Person_dat_three ": Person_dat_three, # bq, U, + "Person_erg_one ": Person_erg_one, # bq, U, + "Person_erg_two ": Person_erg_two, # bq, U, + "Person_erg_three ": Person_erg_three, # bq, U, + "Person_psor_one ": Person_psor_one, # fi, U, + "Person_psor_two ": Person_psor_two, # fi, U, + "Person_psor_three ": Person_psor_three, # fi, U, + "Polite_inf ": Polite_inf, # bq, U, + "Polite_pol ": Polite_pol, # bq, U, + "Polite_abs_inf ": Polite_abs_inf, # bq, U, + "Polite_abs_pol ": Polite_abs_pol, # bq, U, + "Polite_erg_inf ": Polite_erg_inf, # bq, U, + "Polite_erg_pol ": Polite_erg_pol, # bq, U, + "Polite_dat_inf ": Polite_dat_inf, # bq, U, + "Polite_dat_pol ": Polite_dat_pol, # bq, U, + "Prefix_yes ": Prefix_yes, # U, + "PrepCase_npr ": PrepCase_npr, # cz, + "PrepCase_pre ": PrepCase_pre, # U, + "PunctSide_ini ": PunctSide_ini, # U, + "PunctSide_fin ": PunctSide_fin, # U, + "PunctType_peri ": PunctType_peri, # U, + "PunctType_qest ": PunctType_qest, # U, + "PunctType_excl ": PunctType_excl, # U, + "PunctType_quot ": PunctType_quot, # U, + "PunctType_brck ": PunctType_brck, # U, + "PunctType_comm ": PunctType_comm, # U, + "PunctType_colo ": PunctType_colo, # U, + "PunctType_semi ": PunctType_semi, # U, + "PunctType_dash ": PunctType_dash, # U, + "Style_arch ": Style_arch, # cz, fi, U, + "Style_rare ": Style_rare, # cz, fi, U, + "Style_poet ": Style_poet, # cz, U, + "Style_norm ": Style_norm, # cz, U, + "Style_coll ": Style_coll, # cz, U, + "Style_vrnc ": 
Style_vrnc, # cz, U, + "Style_sing ": Style_sing, # cz, U, + "Style_expr ": Style_expr, # cz, U, + "Style_derg ": Style_derg, # cz, U, + "Style_vulg ": Style_vulg, # cz, U, + "Style_yes ": Style_yes, # fi, U, + "StyleVariant_styleShort ": StyleVariant_styleShort, # cz, + "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl, + "VerbType_aux ": VerbType_aux, # U, + "VerbType_cop ": VerbType_cop, # U, + "VerbType_mod ": VerbType_mod, # U, + "VerbType_light ": VerbType_light, # U, + + "Name_person": Name_person, + "Name_norp": Name_norp, + "Name_facility": Name_facility, + "Name_org": Name_org, + "Name_gpe": Name_gpe, + "Name_loc": Name_loc, + "Name_product": Name_product, + "Name_event": Name_event, + "Name_work_of_art": Name_work_of_art, + "Name_language": Name_language, + + "Unit_date": Unit_date, + "Unit_time": Unit_time, + "Unit_percent": Unit_percent, + "Unit_money": Unit_money, + "Unit_quantity": Unit_quantity, + "Unit_ordinal": Unit_ordinal, + "Unit_cardinal": Unit_cardinal, + + "Dep_acomp": Dep_acomp, + "Dep_advcl": Dep_advcl, + "Dep_advmod": Dep_advmod, + "Dep_agent": Dep_agent, + "Dep_amod": Dep_amod, + "Dep_appos": Dep_appos, + "Dep_attr": Dep_attr, + "Dep_aux": Dep_aux, + "Dep_auxpass": Dep_auxpass, + "Dep_cc": Dep_cc, + "Dep_ccomp": Dep_ccomp, + "Dep_complm": Dep_complm, + "Dep_conj": Dep_conj, + "Dep_csubj": Dep_csubj, + "Dep_csubjpass": Dep_csubjpass, + "Dep_dep": Dep_dep, + "Dep_det": Dep_det, + "Dep_dobj": Dep_dobj, + "Dep_expl": Dep_expl, + "Dep_hmod": Dep_hmod, + "Dep_hyph": Dep_hyph, + "Dep_infmod": Dep_infmod, + "Dep_intj": Dep_intj, + "Dep_iobj": Dep_iobj, + "Dep_mark": Dep_mark, + "Dep_meta": Dep_meta, + "Dep_neg": Dep_neg, + "Dep_nmod": Dep_nmod, + "Dep_nn": Dep_nn, + "Dep_npadvmod": Dep_npadvmod, + "Dep_nsubj": Dep_nsubj, + "Dep_nsubjpass": Dep_nsubjpass, + "Dep_num": Dep_num, + "Dep_number": Dep_number, + "Dep_oprd": Dep_oprd, + "Dep_parataxis": Dep_parataxis, + "Dep_partmod": Dep_partmod, + "Dep_pcomp": Dep_pcomp, + "Dep_pobj": 
Dep_pobj, + "Dep_poss": Dep_poss, + "Dep_possessive": Dep_possessive, + "Dep_preconj": Dep_preconj, + "Dep_prep": Dep_prep, + "Dep_prt": Dep_prt, + "Dep_punct": Dep_punct, + "Dep_quantmod": Dep_quantmod, + "Dep_rcmod": Dep_rcmod, + "Dep_root": Dep_root, + "Dep_xcomp": Dep_xcomp +} + +SYMBOL_NAMES = [it[0] for it in sorted(SYMBOL_IDS.items(), key=lambda it: it[1])] From 74c0853471ed4115473142b542b0c9c917475a13 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 17:55:55 +1100 Subject: [PATCH 04/22] * Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS --- spacy/attrs.pyx | 4 ++-- spacy/morphology.pyx | 4 ++-- spacy/parts_of_speech.pyx | 5 ++++- spacy/tokens/doc.pyx | 1 - spacy/tokens/token.pyx | 7 ++----- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 8ce0f7a17..8d76160f4 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,4 +1,4 @@ -ATTR_IDS = { +IDS = { "NULL_ATTR": NULL_ATTR, "IS_ALPHA": IS_ALPHA, "IS_ASCII": IS_ASCII, @@ -87,4 +87,4 @@ ATTR_IDS = { } # ATTR IDs, in order of the symbol -ATTR_NAMES = [key for key, value in sorted(ATTR_IDS.items(), key=lambda item: item[1])] +NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 534f64a59..8d2a73608 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -6,7 +6,7 @@ try: except ImportError: import json -from .parts_of_speech import UNIV_POS_NAMES +from .parts_of_speech import IDS as POS_IDS from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT @@ -24,7 +24,7 @@ cdef class Morphology: self.rich_tags[i].id = i self.rich_tags[i].name = self.strings[tag_str] self.rich_tags[i].morph = 0 - self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()] + self.rich_tags[i].pos = POS_IDS[props['pos'].upper()] self.reverse_index[self.rich_tags[i].name] = i self._cache = PreshMapArray(self.n_tags) 
diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 8c2348a47..57d9c801b 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -1,7 +1,7 @@ from __future__ import unicode_literals -UNIV_POS_NAMES = { +IDS = { "NO_TAG": NO_TAG, "ADJ": ADJ, "ADP": ADP, @@ -23,3 +23,6 @@ UNIV_POS_NAMES = { "EOL": EOL, "SPACE": SPACE } + + +NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index eab6c044e..50b19d4c1 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -14,7 +14,6 @@ from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE -from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t from ..lexeme cimport Lexeme diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 25db3f47e..af80b5359 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -9,7 +9,7 @@ import numpy from ..lexeme cimport Lexeme -from ..parts_of_speech import UNIV_POS_NAMES +from .. 
import parts_of_speech from ..attrs cimport LEMMA from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER @@ -318,7 +318,7 @@ cdef class Token: property pos_: def __get__(self): - return _pos_id_to_string[self.c.pos] + return parts_of_speech.NAMES[self.c.pos] property tag_: def __get__(self): @@ -363,6 +363,3 @@ cdef class Token: property like_email: def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) - - -_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} From a29c8ee23d5d1b373327013150531b835b27088a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 17:58:29 +1100 Subject: [PATCH 05/22] * Add symbols to the vocab before reading the strings, so that they line up correctly --- spacy/vocab.pyx | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index caf3045f5..1a787e7ac 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -19,6 +19,9 @@ from .typedefs cimport attr_t from .cfile cimport CFile from .lemmatizer import Lemmatizer +from . import attrs +from . import parts_of_speech + from cymem.cymem cimport Address from . import util from .serialize.packer cimport Packer @@ -72,15 +75,15 @@ cdef class Vocab: # is the frequency rank of the word, plus a certain offset. The structural # strings are loaded first, because the vocab is open-class, and these # symbols are closed class. 
- #for attr_name in sorted(ATTR_NAMES.keys()): - # _ = self.strings[attr_name] - #for univ_pos_name in sorted(UNIV_POS_NAMES.keys()): - # _ = self.strings[pos_name] - #for morph_name in sorted(UNIV_MORPH_NAMES.keys()): + for name in attrs.NAMES: + _ = self.strings[name] + for name in parts_of_speech.NAMES: + _ = self.strings[name] + #for morph_name in UNIV_MORPH_NAMES: # _ = self.strings[morph_name] - #for entity_type_name in sorted(ENTITY_TYPES.keys()): + #for entity_type_name in entity_types.NAMES: # _ = self.strings[entity_type_name] - #for tag_name in sorted(TAG_MAP.keys()): + #for tag_name in sorted(tag_map.keys()): # _ = self.strings[tag_name] self.get_lex_attr = get_lex_attr self.morphology = Morphology(self.strings, tag_map, lemmatizer) From ce3e3063764a1b9dc002765450659b8adba1b1d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 17:58:57 +1100 Subject: [PATCH 06/22] * Allow SPACY_DATA environment variable in website tests --- tests/website/conftest.py | 6 ++++-- tests/website/test_home.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/website/conftest.py b/tests/website/conftest.py index ade1bae2a..35c38d845 100644 --- a/tests/website/conftest.py +++ b/tests/website/conftest.py @@ -1,11 +1,13 @@ from __future__ import unicode_literals import pytest +import os @pytest.fixture(scope='session') def nlp(): - from spacy.en import English - return English() + from spacy.en import English, LOCAL_DATA_DIR + data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR) + return English(data_dir=data_dir) @pytest.fixture() diff --git a/tests/website/test_home.py b/tests/website/test_home.py index 4da61becf..3f7f7ea4c 100644 --- a/tests/website/test_home.py +++ b/tests/website/test_home.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import pytest import spacy +import os @pytest.fixture() @@ -9,8 +10,9 @@ def token(doc): def test_load_resources_and_process_text(): - from spacy.en import English - nlp = 
English() + from spacy.en import English, LOCAL_DATA_DIR + data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR) + nlp = English(data_dir=data_dir) doc = nlp('Hello, world. Here are two sentences.') From d70e8cac2c4302249720cfded3de836302bddb1c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 18:27:03 +1100 Subject: [PATCH 07/22] * Fix empty values in attributes and parts of speech, so symbols align correctly with the StringStore --- spacy/parts_of_speech.pyx | 2 +- spacy/symbols.pyx | 1 - spacy/vocab.pyx | 6 ++++-- tests/vocab/test_vocab.py | 13 +++++++++++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 57d9c801b..14933480c 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -2,7 +2,7 @@ from __future__ import unicode_literals IDS = { - "NO_TAG": NO_TAG, + "": NO_TAG, "ADJ": ADJ, "ADP": ADP, "ADV": ADV, diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 4251fb4ec..a0a39f2ff 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,5 +1,4 @@ SYMBOL_IDS = { - "EMPTY_VALUE": EMPTY_VALUE, "Attr_is_alpha": Attr_is_alpha, "Attr_is_ascii": Attr_is_ascii, "Attr_is_digit": Attr_is_digit, diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 1a787e7ac..6cf829344 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -76,9 +76,11 @@ cdef class Vocab: # strings are loaded first, because the vocab is open-class, and these # symbols are closed class. 
for name in attrs.NAMES: - _ = self.strings[name] + if name: + _ = self.strings[name] for name in parts_of_speech.NAMES: - _ = self.strings[name] + if name: + _ = self.strings[name] #for morph_name in UNIV_MORPH_NAMES: # _ = self.strings[morph_name] #for entity_type_name in entity_types.NAMES: diff --git a/tests/vocab/test_vocab.py b/tests/vocab/test_vocab.py index 7ad911626..153e0d546 100644 --- a/tests/vocab/test_vocab.py +++ b/tests/vocab/test_vocab.py @@ -1,6 +1,9 @@ from __future__ import unicode_literals import pytest +from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA +from spacy.parts_of_speech import NOUN, VERB + def test_neq(en_vocab): addr = en_vocab['Hello'] @@ -25,3 +28,13 @@ def test_punct_neq(en_vocab): def test_shape_attr(en_vocab): example = en_vocab['example'] assert example.orth != example.shape + + +def test_symbols(en_vocab): + assert en_vocab.strings['IS_ALPHA'] == IS_ALPHA + assert en_vocab.strings['NOUN'] == NOUN + assert en_vocab.strings['VERB'] == VERB + assert en_vocab.strings['LEMMA'] == LEMMA + assert en_vocab.strings['ORTH'] == ORTH + assert en_vocab.strings['PROB'] == PROB + From fd204d3cd5e9aa459b5df03e38545ff3e0c444d0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 22:09:50 +1100 Subject: [PATCH 08/22] * Map NIL to empty string in tag map --- lang_data/en/tag_map.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json index de3e2eb58..a38411bcf 100644 --- a/lang_data/en/tag_map.json +++ b/lang_data/en/tag_map.json @@ -22,7 +22,7 @@ "JJS": {"pos": "adj", "degree": "sup"}, "LS": {"pos": "punct", "numtype": "ord"}, "MD": {"pos": "verb", "verbtype": "mod"}, -"NIL": {"pos": "no_tag"}, +"NIL": {"pos": ""}, "NN": {"pos": "noun", "number": "sing"}, "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"}, "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"}, From d80067eda1001753645494df14eebe03ac206b3c Mon Sep 17 00:00:00 2001 From: 
Matthew Honnibal Date: Sat, 10 Oct 2015 22:10:19 +1100 Subject: [PATCH 09/22] * Map empty string to NULL_ATTR in attrs --- spacy/attrs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 8d76160f4..3595fbf22 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,5 +1,5 @@ IDS = { - "NULL_ATTR": NULL_ATTR, + "": NULL_ATTR, "IS_ALPHA": IS_ALPHA, "IS_ASCII": IS_ASCII, "IS_DIGIT": IS_DIGIT, From 278e12f7e848bae4b54ba8c4b1ffddfe39591cb6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 22:10:58 +1100 Subject: [PATCH 10/22] * Add morphology symbols to morphology. May need to remove these as an enum. --- spacy/morphology.pxd | 963 +++++++++++-------------------------------- spacy/morphology.pyx | 251 +++++++++++ 2 files changed, 499 insertions(+), 715 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 2229da0ad..62d3fccc1 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -7,6 +7,7 @@ from .strings cimport StringStore from .typedefs cimport attr_t from .parts_of_speech cimport univ_pos_t +from .
cimport symbols cdef struct RichTagC: uint64_t morph @@ -36,720 +37,252 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, feature, value) except -1 +cpdef enum univ_morph_t: + NIL = 0 + Animacy_anim = symbols.Animacy_anim + Animacy_inam + Aspect_freq + Aspect_imp + Aspect_mod + Aspect_none + Aspect_perf + Case_abe + Case_abl + Case_abs + Case_acc + Case_ade + Case_all + Case_cau + Case_com + Case_dat + Case_del + Case_dis + Case_ela + Case_ess + Case_gen + Case_ill + Case_ine + Case_ins + Case_loc + Case_lat + Case_nom + Case_par + Case_sub + Case_sup + Case_tem + Case_ter + Case_tra + Case_voc + Definite_two + Definite_def + Definite_red + Definite_ind + Degree_cmp + Degree_comp + Degree_none + Degree_pos + Degree_sup + Degree_abs + Degree_com + Degree_dim # du + Gender_com + Gender_fem + Gender_masc + Gender_neut + Mood_cnd + Mood_imp + Mood_ind + Mood_n + Mood_pot + Mood_sub + Mood_opt + Negative_neg + Negative_pos + Negative_yes + Number_com + Number_dual + Number_none + Number_plur + Number_sing + Number_ptan # bg + Number_count # bg + NumType_card + NumType_dist + NumType_frac + NumType_gen + NumType_mult + NumType_none + NumType_ord + NumType_sets + Person_one + Person_two + Person_three + Person_none + Poss_yes + PronType_advPart + PronType_art + PronType_default + PronType_dem + PronType_ind + PronType_int + PronType_neg + PronType_prs + PronType_rcp + PronType_rel + PronType_tot + PronType_clit + PronType_exc # es, ca, it, fa + Reflex_yes + Tense_fut + Tense_imp + Tense_past + Tense_pres + VerbForm_fin + VerbForm_ger + VerbForm_inf + VerbForm_none + VerbForm_part + VerbForm_partFut + VerbForm_partPast + VerbForm_partPres + VerbForm_sup + VerbForm_trans + VerbForm_gdv # la + Voice_act + Voice_cau + Voice_pass + Voice_mid # gkc + Voice_int # hb + Abbr_yes # cz, fi, sl, U + AdpType_prep # cz, U + AdpType_post # U + AdpType_voc # cz + AdpType_comprep # cz + AdpType_circ # U + AdvType_man + AdvType_loc + AdvType_tim + AdvType_deg + 
AdvType_cau + AdvType_mod + AdvType_sta + AdvType_ex + AdvType_adadj + ConjType_oper # cz, U + ConjType_comp # cz, U + Connegative_yes # fi + Derivation_minen # fi + Derivation_sti # fi + Derivation_inen # fi + Derivation_lainen # fi + Derivation_ja # fi + Derivation_ton # fi + Derivation_vs # fi + Derivation_ttain # fi + Derivation_ttaa # fi + Echo_rdp # U + Echo_ech # U + Foreign_foreign # cz, fi, U + Foreign_fscript # cz, fi, U + Foreign_tscript # cz, U + Foreign_yes # sl + Gender_dat_masc # bq, U + Gender_dat_fem # bq, U + Gender_erg_masc # bq + Gender_erg_fem # bq + Gender_psor_masc # cz, sl, U + Gender_psor_fem # cz, sl, U + Gender_psor_neut # sl + Hyph_yes # cz, U + InfForm_one # fi + InfForm_two # fi + InfForm_three # fi + NameType_geo # U, cz + NameType_prs # U, cz + NameType_giv # U, cz + NameType_sur # U, cz + NameType_nat # U, cz + NameType_com # U, cz + NameType_pro # U, cz + NameType_oth # U, cz + NounType_com # U + NounType_prop # U + NounType_class # U + Number_abs_sing # bq, U + Number_abs_plur # bq, U + Number_dat_sing # bq, U + Number_dat_plur # bq, U + Number_erg_sing # bq, U + Number_erg_plur # bq, U + Number_psee_sing # U + Number_psee_plur # U + Number_psor_sing # cz, fi, sl, U + Number_psor_plur # cz, fi, sl, U + NumForm_digit # cz, sl, U + NumForm_roman # cz, sl, U + NumForm_word # cz, sl, U + NumValue_one # cz, U + NumValue_two # cz, U + NumValue_three # cz, U + PartForm_pres # fi + PartForm_past # fi + PartForm_agt # fi + PartForm_neg # fi + PartType_mod # U + PartType_emp # U + PartType_res # U + PartType_inf # U + PartType_vbp # U + Person_abs_one # bq, U + Person_abs_two # bq, U + Person_abs_three # bq, U + Person_dat_one # bq, U + Person_dat_two # bq, U + Person_dat_three # bq, U + Person_erg_one # bq, U + Person_erg_two # bq, U + Person_erg_three # bq, U + Person_psor_one # fi, U + Person_psor_two # fi, U + Person_psor_three # fi, U + Polite_inf # bq, U + Polite_pol # bq, U + Polite_abs_inf # bq, U + Polite_abs_pol # bq, U + 
Polite_erg_inf # bq, U + Polite_erg_pol # bq, U + Polite_dat_inf # bq, U + Polite_dat_pol # bq, U + Prefix_yes # U + PrepCase_npr # cz + PrepCase_pre # U + PunctSide_ini # U + PunctSide_fin # U + PunctType_peri # U + PunctType_qest # U + PunctType_excl # U + PunctType_quot # U + PunctType_brck # U + PunctType_comm # U + PunctType_colo # U + PunctType_semi # U + PunctType_dash # U + Style_arch # cz, fi, U + Style_rare # cz, fi, U + Style_poet # cz, U + Style_norm # cz, U + Style_coll # cz, U + Style_vrnc # cz, U + Style_sing # cz, U + Style_expr # cz, U + Style_derg # cz, U + Style_vulg # cz, U + Style_yes # fi, U + StyleVariant_styleShort # cz + StyleVariant_styleBound # cz, sl + VerbType_aux # U + VerbType_cop # U + VerbType_mod # U + VerbType_light # U -# -#cpdef enum Feature_t: -# Abbr -# AdpType -# AdvType -# ConjType -# Connegative -# Derivation -# Echo -# Foreign -# Gender_dat -# Gender_erg -# Gender_psor -# Hyph -# InfForm -# NameType -# NounType -# NumberAbs -# NumberDat -# NumberErg -# NumberPsee -# NumberPsor -# NumForm -# NumValue -# PartForm -# PartType -# Person_abs -# Person_dat -# Person_psor -# Polite -# Polite_abs -# Polite_dat -# Prefix -# PrepCase -# PunctSide -# PunctType -# Style -# Typo -# Variant -# VerbType -# -# -#cpdef enum Animacy: -# Anim -# Inam -# -# -#cpdef enum Aspect: -# Freq -# Imp -# Mod -# None_ -# Perf -# -# -#cpdef enum Case1: -# Nom -# Gen -# Acc -# Dat -# Voc -# Abl -# -#cdef enum Case2: -# Abe -# Abs -# Ade -# All -# Cau -# Com -# Del -# Dis -# -#cdef enum Case3: -# Ela -# Ess -# Ill -# Ine -# Ins -# Loc -# Lat -# Par -# -#cdef enum Case4: -# Sub -# Sup -# Tem -# Ter -# Tra -# -# -#cpdef enum Definite: -# Two -# Def -# Red -# Ind -# -# -#cpdef enum Degree: -# Cmp -# Comp -# None_ -# Pos -# Sup -# Abs -# Com -# Degree # du -# -# -#cpdef enum Gender: -# Com -# Fem -# Masc -# Neut -# -# -#cpdef enum Mood: -# Cnd -# Imp -# Ind -# N -# Pot -# Sub -# Opt -# -# -#cpdef enum Negative: -# Neg -# Pos -# Yes -# -# -#cpdef enum Number: 
-# Com -# Dual -# None_ -# Plur -# Sing -# Ptan # bg -# Count # bg -# -# -#cpdef enum NumType: -# Card -# Dist -# Frac -# Gen -# Mult -# None_ -# Ord -# Sets -# -# -#cpdef enum Person: -# One -# Two -# Three -# None_ -# -# -#cpdef enum Poss: -# Yes -# -# -#cpdef enum PronType1: -# AdvPart -# Art -# Default -# Dem -# Ind -# Int -# Neg -# -#cpdef enum PronType2: -# Prs -# Rcp -# Rel -# Tot -# Clit -# Exc # es, ca, it, fa -# Clit # it -# -# -#cpdef enum Reflex: -# Yes -# -# -#cpdef enum Tense: -# Fut -# Imp -# Past -# Pres -# -#cpdef enum VerbForm1: -# Fin -# Ger -# Inf -# None_ -# Part -# PartFut -# PartPast -# -#cpdef enum VerbForm2: -# PartPres -# Sup -# Trans -# Gdv # la -# -# -#cpdef enum Voice: -# Act -# Cau -# Pass -# Mid # gkc -# Int # hb -# -# -#cpdef enum Abbr: -# Yes # cz, fi, sl, U -# -#cpdef enum AdpType: -# Prep # cz, U -# Post # U -# Voc # cz -# Comprep # cz -# Circ # U -# Voc # U -# -# -#cpdef enum AdvType1: -# # U -# Man -# Loc -# Tim -# Deg -# Cau -# Mod -# Sta -# Ex -# -#cpdef enum AdvType2: -# Adadj -# -#cpdef enum ConjType: -# Oper # cz, U -# Comp # cz, U -# -#cpdef enum Connegative: -# Yes # fi -# -# -#cpdef enum Derivation1: -# Minen # fi -# Sti # fi -# Inen # fi -# Lainen # fi -# Ja # fi -# Ton # fi -# Vs # fi -# Ttain # fi -# -#cpdef enum Derivation2: -# Ttaa -# -# -#cpdef enum Echo: -# Rdp # U -# Ech # U -# -# -#cpdef enum Foreign: -# Foreign # cz, fi, U -# Fscript # cz, fi, U -# Tscript # cz, U -# Yes # sl -# -# -#cpdef enum Gender_dat: -# Masc # bq, U -# Fem # bq, U -# -# -#cpdef enum Gender_erg: -# Masc # bq -# Fem # bq -# -# -#cpdef enum Gender_psor: -# Masc # cz, sl, U -# Fem # cz, sl, U -# Neut # sl -# -# -#cpdef enum Hyph: -# Yes # cz, U -# -# -#cpdef enum InfForm: -# One # fi -# Two # fi -# Three # fi -# -# -#cpdef enum NameType: -# Geo # U, cz -# Prs # U, cz -# Giv # U, cz -# Sur # U, cz -# Nat # U, cz -# Com # U, cz -# Pro # U, cz -# Oth # U, cz -# -# -#cpdef enum NounType: -# Com # U -# Prop # U -# Class # U -# -#cpdef enum 
Number_abs: -# Sing # bq, U -# Plur # bq, U -# -#cpdef enum Number_dat: -# Sing # bq, U -# Plur # bq, U -# -#cpdef enum Number_erg: -# Sing # bq, U -# Plur # bq, U -# -#cpdef enum Number_psee: -# Sing # U -# Plur # U -# -# -#cpdef enum Number_psor: -# Sing # cz, fi, sl, U -# Plur # cz, fi, sl, U -# -# -#cpdef enum NumForm: -# Digit # cz, sl, U -# Roman # cz, sl, U -# Word # cz, sl, U -# -# -#cpdef enum NumValue: -# One # cz, U -# Two # cz, U -# Three # cz, U -# -# -#cpdef enum PartForm: -# Pres # fi -# Past # fi -# Agt # fi -# Neg # fi -# -# -#cpdef enum PartType: -# Mod # U -# Emp # U -# Res # U -# Inf # U -# Vbp # U -# -#cpdef enum Person_abs: -# One # bq, U -# Two # bq, U -# Three # bq, U -# -# -#cpdef enum Person_dat: -# One # bq, U -# Two # bq, U -# Three # bq, U -# -# -#cpdef enum Person_erg: -# One # bq, U -# Two # bq, U -# Three # bq, U -# -# -#cpdef enum Person_psor: -# One # fi, U -# Two # fi, U -# Three # fi, U -# -# -#cpdef enum Polite: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Polite_abs: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Polite_erg: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Polite_dat: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Prefix: -# Yes # U -# -# -#cpdef enum PrepCase: -# Npr # cz -# Pre # U -# -# -#cpdef enum PunctSide: -# Ini # U -# Fin # U -# -#cpdef enum PunctType1: -# Peri # U -# Qest # U -# Excl # U -# Quot # U -# Brck # U -# Comm # U -# Colo # U -# Semi # U -# -#cpdef enum PunctType2: -# Dash # U -# -# -#cpdef enum Style1: -# Arch # cz, fi, U -# Rare # cz, fi, U -# Poet # cz, U -# Norm # cz, U -# Coll # cz, U -# Vrnc # cz, U -# Sing # cz, U -# Expr # cz, U -# -# -#cpdef enum Style2: -# Derg # cz, U -# Vulg # cz, U -# -# -#cpdef enum Typo: -# Yes # fi, U -# -# -#cpdef enum Variant: -# Short # cz -# Bound # cz, sl -# -# -#cpdef enum VerbType: -# Aux # U -# Cop # U -# Mod # U -# Light # U -# -cpdef enum Value_t: - Animacy_Anim - Animacy_Inam - Aspect_Freq - Aspect_Imp - Aspect_Mod - Aspect_None_ - 
Aspect_Perf - Case_Abe - Case_Abl - Case_Abs - Case_Acc - Case_Ade - Case_All - Case_Cau - Case_Com - Case_Dat - Case_Del - Case_Dis - Case_Ela - Case_Ess - Case_Gen - Case_Ill - Case_Ine - Case_Ins - Case_Loc - Case_Lat - Case_Nom - Case_Par - Case_Sub - Case_Sup - Case_Tem - Case_Ter - Case_Tra - Case_Voc - Definite_Two - Definite_Def - Definite_Red - Definite_Ind - Degree_Cmp - Degree_Comp - Degree_None - Degree_Pos - Degree_Sup - Degree_Abs - Degree_Com - Degree_Dim # du - Gender_Com - Gender_Fem - Gender_Masc - Gender_Neut - Mood_Cnd - Mood_Imp - Mood_Ind - Mood_N - Mood_Pot - Mood_Sub - Mood_Opt - Negative_Neg - Negative_Pos - Negative_Yes - Number_Com - Number_Dual - Number_None - Number_Plur - Number_Sing - Number_Ptan # bg - Number_Count # bg - NumType_Card - NumType_Dist - NumType_Frac - NumType_Gen - NumType_Mult - NumType_None - NumType_Ord - NumType_Sets - Person_One - Person_Two - Person_Three - Person_None - Poss_Yes - PronType_AdvPart - PronType_Art - PronType_Default - PronType_Dem - PronType_Ind - PronType_Int - PronType_Neg - PronType_Prs - PronType_Rcp - PronType_Rel - PronType_Tot - PronType_Clit - PronType_Exc # es, ca, it, fa - Reflex_Yes - Tense_Fut - Tense_Imp - Tense_Past - Tense_Pres - VerbForm_Fin - VerbForm_Ger - VerbForm_Inf - VerbForm_None - VerbForm_Part - VerbForm_PartFut - VerbForm_PartPast - VerbForm_PartPres - VerbForm_Sup - VerbForm_Trans - VerbForm_Gdv # la - Voice_Act - Voice_Cau - Voice_Pass - Voice_Mid # gkc - Voice_Int # hb - Abbr_Yes # cz, fi, sl, U - AdpType_Prep # cz, U - AdpType_Post # U - AdpType_Voc # cz - AdpType_Comprep # cz - AdpType_Circ # U - AdvType_Man - AdvType_Loc - AdvType_Tim - AdvType_Deg - AdvType_Cau - AdvType_Mod - AdvType_Sta - AdvType_Ex - AdvType_Adadj - ConjType_Oper # cz, U - ConjType_Comp # cz, U - Connegative_Yes # fi - Derivation_Minen # fi - Derivation_Sti # fi - Derivation_Inen # fi - Derivation_Lainen # fi - Derivation_Ja # fi - Derivation_Ton # fi - Derivation_Vs # fi - Derivation_Ttain # fi 
- Derivation_Ttaa # fi - Echo_Rdp # U - Echo_Ech # U - Foreign_Foreign # cz, fi, U - Foreign_Fscript # cz, fi, U - Foreign_Tscript # cz, U - Foreign_Yes # sl - Gender_dat_Masc # bq, U - Gender_dat_Fem # bq, U - Gender_erg_Masc # bq - Gender_erg_Fem # bq - Gender_psor_Masc # cz, sl, U - Gender_psor_Fem # cz, sl, U - Gender_psor_Neut # sl - Hyph_Yes # cz, U - InfForm_One # fi - InfForm_Two # fi - InfForm_Three # fi - NameType_Geo # U, cz - NameType_Prs # U, cz - NameType_Giv # U, cz - NameType_Sur # U, cz - NameType_Nat # U, cz - NameType_Com # U, cz - NameType_Pro # U, cz - NameType_Oth # U, cz - NounType_Com # U - NounType_Prop # U - NounType_Class # U - Number_abs_Sing # bq, U - Number_abs_Plur # bq, U - Number_dat_Sing # bq, U - Number_dat_Plur # bq, U - Number_erg_Sing # bq, U - Number_erg_Plur # bq, U - Number_psee_Sing # U - Number_psee_Plur # U - Number_psor_Sing # cz, fi, sl, U - Number_psor_Plur # cz, fi, sl, U - NumForm_Digit # cz, sl, U - NumForm_Roman # cz, sl, U - NumForm_Word # cz, sl, U - NumValue_One # cz, U - NumValue_Two # cz, U - NumValue_Three # cz, U - PartForm_Pres # fi - PartForm_Past # fi - PartForm_Agt # fi - PartForm_Neg # fi - PartType_Mod # U - PartType_Emp # U - PartType_Res # U - PartType_Inf # U - PartType_Vbp # U - Person_abs_One # bq, U - Person_abs_Two # bq, U - Person_abs_Three # bq, U - Person_dat_One # bq, U - Person_dat_Two # bq, U - Person_dat_Three # bq, U - Person_erg_One # bq, U - Person_erg_Two # bq, U - Person_erg_Three # bq, U - Person_psor_One # fi, U - Person_psor_Two # fi, U - Person_psor_Three # fi, U - Polite_Inf # bq, U - Polite_Pol # bq, U - Polite_abs_Inf # bq, U - Polite_abs_Pol # bq, U - Polite_erg_Inf # bq, U - Polite_erg_Pol # bq, U - Polite_dat_Inf # bq, U - Polite_dat_Pol # bq, U - Prefix_Yes # U - PrepCase_Npr # cz - PrepCase_Pre # U - PunctSide_Ini # U - PunctSide_Fin # U - PunctType_Peri # U - PunctType_Qest # U - PunctType_Excl # U - PunctType_Quot # U - PunctType_Brck # U - PunctType_Comm # U - 
PunctType_Colo # U - PunctType_Semi # U - PunctType_Dash # U - Style_Arch # cz, fi, U - Style_Rare # cz, fi, U - Style_Poet # cz, U - Style_Norm # cz, U - Style_Coll # cz, U - Style_Vrnc # cz, U - Style_Sing # cz, U - Style_Expr # cz, U - Style_Derg # cz, U - Style_Vulg # cz, U - Style_Yes # fi, U - StyleVariant_StyleShort # cz - StyleVariant_StyleBound # cz, sl - VerbType_Aux # U - VerbType_Cop # U - VerbType_Mod # U - VerbType_Light # U diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 8d2a73608..c53e5f478 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -89,3 +89,254 @@ cdef class Morphology: lemma_string = sorted(lemma_strings)[0] lemma = self.strings[lemma_string] return lemma + +IDS = { + "Animacy_anim": Animacy_anim, + "Animacy_inam": Animacy_inam, + "Aspect_freq": Aspect_freq, + "Aspect_imp": Aspect_imp, + "Aspect_mod": Aspect_mod, + "Aspect_none": Aspect_none, + "Aspect_perf": Aspect_perf, + "Case_abe": Case_abe, + "Case_abl": Case_abl, + "Case_abs": Case_abs, + "Case_acc": Case_acc, + "Case_ade": Case_ade, + "Case_all": Case_all, + "Case_cau": Case_cau, + "Case_com": Case_com, + "Case_dat": Case_dat, + "Case_del": Case_del, + "Case_dis": Case_dis, + "Case_ela": Case_ela, + "Case_ess": Case_ess, + "Case_gen": Case_gen, + "Case_ill": Case_ill, + "Case_ine": Case_ine, + "Case_ins": Case_ins, + "Case_loc": Case_loc, + "Case_lat": Case_lat, + "Case_nom": Case_nom, + "Case_par": Case_par, + "Case_sub": Case_sub, + "Case_sup": Case_sup, + "Case_tem": Case_tem, + "Case_ter": Case_ter, + "Case_tra": Case_tra, + "Case_voc": Case_voc, + "Definite_two": Definite_two, + "Definite_def": Definite_def, + "Definite_red": Definite_red, + "Definite_ind": Definite_ind, + "Degree_cmp": Degree_cmp, + "Degree_comp": Degree_comp, + "Degree_none": Degree_none, + "Degree_pos": Degree_pos, + "Degree_sup": Degree_sup, + "Degree_abs": Degree_abs, + "Degree_com": Degree_com, + "Degree_dim": Degree_dim, # du + "Gender_com": Gender_com, + "Gender_fem":
Gender_fem, + "Gender_masc": Gender_masc, + "Gender_neut": Gender_neut, + "Mood_cnd": Mood_cnd, + "Mood_imp": Mood_imp, + "Mood_ind": Mood_ind, + "Mood_n": Mood_n, + "Mood_pot": Mood_pot, + "Mood_sub": Mood_sub, + "Mood_opt": Mood_opt, + "Negative_neg": Negative_neg, + "Negative_pos": Negative_pos, + "Negative_yes": Negative_yes, + "Number_com": Number_com, + "Number_dual": Number_dual, + "Number_none": Number_none, + "Number_plur": Number_plur, + "Number_sing": Number_sing, + "Number_ptan": Number_ptan, # bg + "Number_count": Number_count, # bg + "NumType_card": NumType_card, + "NumType_dist": NumType_dist, + "NumType_frac": NumType_frac, + "NumType_gen": NumType_gen, + "NumType_mult": NumType_mult, + "NumType_none": NumType_none, + "NumType_ord": NumType_ord, + "NumType_sets": NumType_sets, + "Person_one": Person_one, + "Person_two": Person_two, + "Person_three": Person_three, + "Person_none": Person_none, + "Poss_yes": Poss_yes, + "PronType_advPart": PronType_advPart, + "PronType_art": PronType_art, + "PronType_default": PronType_default, + "PronType_dem": PronType_dem, + "PronType_ind": PronType_ind, + "PronType_int": PronType_int, + "PronType_neg": PronType_neg, + "PronType_prs": PronType_prs, + "PronType_rcp": PronType_rcp, + "PronType_rel": PronType_rel, + "PronType_tot": PronType_tot, + "PronType_clit": PronType_clit, + "PronType_exc": PronType_exc, # es, ca, it, fa, + "Reflex_yes": Reflex_yes, + "Tense_fut": Tense_fut, + "Tense_imp": Tense_imp, + "Tense_past": Tense_past, + "Tense_pres": Tense_pres, + "VerbForm_fin": VerbForm_fin, + "VerbForm_ger": VerbForm_ger, + "VerbForm_inf": VerbForm_inf, + "VerbForm_none": VerbForm_none, + "VerbForm_part": VerbForm_part, + "VerbForm_partFut": VerbForm_partFut, + "VerbForm_partPast": VerbForm_partPast, + "VerbForm_partPres": VerbForm_partPres, + "VerbForm_sup": VerbForm_sup, + "VerbForm_trans": VerbForm_trans, + "VerbForm_gdv": VerbForm_gdv, # la, + "Voice_act": Voice_act, + "Voice_cau": Voice_cau, + "Voice_pass":
Voice_pass, + "Voice_mid": Voice_mid, # gkc, + "Voice_int": Voice_int, # hb, + "Abbr_yes": Abbr_yes, # cz, fi, sl, U, + "AdpType_prep": AdpType_prep, # cz, U, + "AdpType_post": AdpType_post, # U, + "AdpType_voc": AdpType_voc, # cz, + "AdpType_comprep": AdpType_comprep, # cz, + "AdpType_circ": AdpType_circ, # U, + "AdvType_man": AdvType_man, + "AdvType_loc": AdvType_loc, + "AdvType_tim": AdvType_tim, + "AdvType_deg": AdvType_deg, + "AdvType_cau": AdvType_cau, + "AdvType_mod": AdvType_mod, + "AdvType_sta": AdvType_sta, + "AdvType_ex": AdvType_ex, + "AdvType_adadj": AdvType_adadj, + "ConjType_oper": ConjType_oper, # cz, U, + "ConjType_comp": ConjType_comp, # cz, U, + "Connegative_yes": Connegative_yes, # fi, + "Derivation_minen": Derivation_minen, # fi, + "Derivation_sti": Derivation_sti, # fi, + "Derivation_inen": Derivation_inen, # fi, + "Derivation_lainen": Derivation_lainen, # fi, + "Derivation_ja": Derivation_ja, # fi, + "Derivation_ton": Derivation_ton, # fi, + "Derivation_vs": Derivation_vs, # fi, + "Derivation_ttain": Derivation_ttain, # fi, + "Derivation_ttaa": Derivation_ttaa, # fi, + "Echo_rdp": Echo_rdp, # U, + "Echo_ech": Echo_ech, # U, + "Foreign_foreign": Foreign_foreign, # cz, fi, U, + "Foreign_fscript": Foreign_fscript, # cz, fi, U, + "Foreign_tscript": Foreign_tscript, # cz, U, + "Foreign_yes": Foreign_yes, # sl, + "Gender_dat_masc": Gender_dat_masc, # bq, U, + "Gender_dat_fem": Gender_dat_fem, # bq, U, + "Gender_erg_masc": Gender_erg_masc, # bq, + "Gender_erg_fem": Gender_erg_fem, # bq, + "Gender_psor_masc": Gender_psor_masc, # cz, sl, U, + "Gender_psor_fem": Gender_psor_fem, # cz, sl, U, + "Gender_psor_neut": Gender_psor_neut, # sl, + "Hyph_yes": Hyph_yes, # cz, U, + "InfForm_one": InfForm_one, # fi, + "InfForm_two": InfForm_two, # fi, + "InfForm_three": InfForm_three, # fi, + "NameType_geo": NameType_geo, # U, cz, + "NameType_prs": NameType_prs, # U, cz, + "NameType_giv": NameType_giv, # U, cz, +
"NameType_sur": NameType_sur, # U, cz, + "NameType_nat": NameType_nat, # U, cz, + "NameType_com": NameType_com, # U, cz, + "NameType_pro": NameType_pro, # U, cz, + "NameType_oth": NameType_oth, # U, cz, + "NounType_com": NounType_com, # U, + "NounType_prop": NounType_prop, # U, + "NounType_class": NounType_class, # U, + "Number_abs_sing": Number_abs_sing, # bq, U, + "Number_abs_plur": Number_abs_plur, # bq, U, + "Number_dat_sing": Number_dat_sing, # bq, U, + "Number_dat_plur": Number_dat_plur, # bq, U, + "Number_erg_sing": Number_erg_sing, # bq, U, + "Number_erg_plur": Number_erg_plur, # bq, U, + "Number_psee_sing": Number_psee_sing, # U, + "Number_psee_plur": Number_psee_plur, # U, + "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U, + "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U, + "NumForm_digit": NumForm_digit, # cz, sl, U, + "NumForm_roman": NumForm_roman, # cz, sl, U, + "NumForm_word": NumForm_word, # cz, sl, U, + "NumValue_one": NumValue_one, # cz, U, + "NumValue_two": NumValue_two, # cz, U, + "NumValue_three": NumValue_three, # cz, U, + "PartForm_pres": PartForm_pres, # fi, + "PartForm_past": PartForm_past, # fi, + "PartForm_agt": PartForm_agt, # fi, + "PartForm_neg": PartForm_neg, # fi, + "PartType_mod": PartType_mod, # U, + "PartType_emp": PartType_emp, # U, + "PartType_res": PartType_res, # U, + "PartType_inf": PartType_inf, # U, + "PartType_vbp": PartType_vbp, # U, + "Person_abs_one": Person_abs_one, # bq, U, + "Person_abs_two": Person_abs_two, # bq, U, + "Person_abs_three": Person_abs_three, # bq, U, + "Person_dat_one": Person_dat_one, # bq, U, + "Person_dat_two": Person_dat_two, # bq, U, + "Person_dat_three": Person_dat_three, # bq, U, + "Person_erg_one": Person_erg_one, # bq, U, + "Person_erg_two": Person_erg_two, # bq, U, + "Person_erg_three": Person_erg_three, # bq, U, + "Person_psor_one": Person_psor_one, # fi, U, + "Person_psor_two": Person_psor_two, # fi, U, + "Person_psor_three":
Person_psor_three, # fi, U, + "Polite_inf": Polite_inf, # bq, U, + "Polite_pol": Polite_pol, # bq, U, + "Polite_abs_inf": Polite_abs_inf, # bq, U, + "Polite_abs_pol": Polite_abs_pol, # bq, U, + "Polite_erg_inf": Polite_erg_inf, # bq, U, + "Polite_erg_pol": Polite_erg_pol, # bq, U, + "Polite_dat_inf": Polite_dat_inf, # bq, U, + "Polite_dat_pol": Polite_dat_pol, # bq, U, + "Prefix_yes": Prefix_yes, # U, + "PrepCase_npr": PrepCase_npr, # cz, + "PrepCase_pre": PrepCase_pre, # U, + "PunctSide_ini": PunctSide_ini, # U, + "PunctSide_fin": PunctSide_fin, # U, + "PunctType_peri": PunctType_peri, # U, + "PunctType_qest": PunctType_qest, # U, + "PunctType_excl": PunctType_excl, # U, + "PunctType_quot": PunctType_quot, # U, + "PunctType_brck": PunctType_brck, # U, + "PunctType_comm": PunctType_comm, # U, + "PunctType_colo": PunctType_colo, # U, + "PunctType_semi": PunctType_semi, # U, + "PunctType_dash": PunctType_dash, # U, + "Style_arch": Style_arch, # cz, fi, U, + "Style_rare": Style_rare, # cz, fi, U, + "Style_poet": Style_poet, # cz, U, + "Style_norm": Style_norm, # cz, U, + "Style_coll": Style_coll, # cz, U, + "Style_vrnc": Style_vrnc, # cz, U, + "Style_sing": Style_sing, # cz, U, + "Style_expr": Style_expr, # cz, U, + "Style_derg": Style_derg, # cz, U, + "Style_vulg": Style_vulg, # cz, U, + "Style_yes": Style_yes, # fi, U, + "StyleVariant_styleShort": StyleVariant_styleShort, # cz, + "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl, + "VerbType_aux": VerbType_aux, # U, + "VerbType_cop": VerbType_cop, # U, + "VerbType_mod": VerbType_mod, # U, + "VerbType_light": VerbType_light, # U, +} + + +NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] From 9f4be0adcdf5a29ad2cfc49cf9b005debdc49387 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 22:11:20 +1100 Subject: [PATCH 11/22] * Map NO_TAG to NIL in parts_of_speech.pxd --- spacy/parts_of_speech.pxd | 2 +- 1 file changed, 1
insertion(+), 1 deletion(-) diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index 17e349435..9fbdbd71f 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -2,7 +2,7 @@ from .symbols cimport * cpdef enum univ_pos_t: - NO_TAG = EMPTY_VALUE + NO_TAG = NIL ADJ = POS_adj ADP = POS_adp ADV = POS_adv From ce65ec698c42be5146f350146b9dc94904dcf7e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 22:11:38 +1100 Subject: [PATCH 12/22] * Remove qualified naming in symbols --- spacy/symbols.pxd | 202 ++++++++++++++++++++++----------------------- spacy/symbols.pyx | 205 +++++++++++++++++++++++----------------------- 2 files changed, 204 insertions(+), 203 deletions(-) diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index e8ddeaa8f..590a2d41d 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -1,109 +1,109 @@ cpdef enum symbol_t: - EMPTY_VALUE - Attr_is_alpha - Attr_is_ascii - Attr_is_digit - Attr_is_lower - Attr_is_punct - Attr_is_space - Attr_is_title - Attr_is_upper - Attr_like_url - Attr_like_num - Attr_like_email - Attr_is_stop - Attr_is_oov + NIL + IS_ALPHA + IS_ASCII + IS_DIGIT + IS_LOWER + IS_PUNCT + IS_SPACE + IS_TITLE + IS_UPPER + LIKE_URL + LIKE_NUM + LIKE_EMAIL + IS_STOP + IS_OOV - Attr_flag14 - Attr_flag15 - Attr_flag16 - Attr_flag17 - Attr_flag18 - Attr_flag19 - Attr_flag20 - Attr_flag21 - Attr_flag22 - Attr_flag23 - Attr_flag24 - Attr_flag25 - Attr_flag26 - Attr_flag27 - Attr_flag28 - Attr_flag29 - Attr_flag30 - Attr_flag31 - Attr_flag32 - Attr_flag33 - Attr_flag34 - Attr_flag35 - Attr_flag36 - Attr_flag37 - Attr_flag38 - Attr_flag39 - Attr_flag40 - Attr_flag41 - Attr_flag42 - Attr_flag43 - Attr_flag44 - Attr_flag45 - Attr_flag46 - Attr_flag47 - Attr_flag48 - Attr_flag49 - Attr_flag50 - Attr_flag51 - Attr_flag52 - Attr_flag53 - Attr_flag54 - Attr_flag55 - Attr_flag56 - Attr_flag57 - Attr_flag58 - Attr_flag59 - Attr_flag60 - Attr_flag61 - Attr_flag62 - Attr_flag63 + FLAG14 + FLAG15 + FLAG16 + 
FLAG17 + FLAG18 + FLAG19 + FLAG20 + FLAG21 + FLAG22 + FLAG23 + FLAG24 + FLAG25 + FLAG26 + FLAG27 + FLAG28 + FLAG29 + FLAG30 + FLAG31 + FLAG32 + FLAG33 + FLAG34 + FLAG35 + FLAG36 + FLAG37 + FLAG38 + FLAG39 + FLAG40 + FLAG41 + FLAG42 + FLAG43 + FLAG44 + FLAG45 + FLAG46 + FLAG47 + FLAG48 + FLAG49 + FLAG50 + FLAG51 + FLAG52 + FLAG53 + FLAG54 + FLAG55 + FLAG56 + FLAG57 + FLAG58 + FLAG59 + FLAG60 + FLAG61 + FLAG62 + FLAG63 - Attr_id - Attr_orth - Attr_lower - Attr_norm - Attr_shape - Attr_prefix - Attr_suffix + ID + ORTH + LOWER + NORM + SHAPE + PREFIX + SUFFIX - Attr_length - Attr_cluster - Attr_lemma - Attr_pos - Attr_tag - Attr_dep - Attr_ent_iob - Attr_ent_type - Attr_head - Attr_spacy - Attr_prob + LENGTH + CLUSTER + LEMMA + POS + TAG + DEP + ENT_IOB + ENT_TYPE + HEAD + SPACY + PROB - POS_adj - POS_adp - POS_adv - POS_aux - POS_conj - POS_det - POS_intj - POS_noun - POS_num - POS_part - POS_pron - POS_propn - POS_punct - POS_sconj - POS_sym - POS_verb - POS_x - POS_eol - POS_space + ADJ + ADP + ADV + AUX + CONJ + DET + INTJ + NOUN + NUM + PART + PRON + PROPN + PUNCT + SCONJ + SYM + VERB + X + EOL + SPACE Animacy_anim Animacy_inam diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index a0a39f2ff..9a3d219d5 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,108 +1,109 @@ -SYMBOL_IDS = { - "Attr_is_alpha": Attr_is_alpha, - "Attr_is_ascii": Attr_is_ascii, - "Attr_is_digit": Attr_is_digit, - "Attr_is_lower": Attr_is_lower, - "Attr_is_punct": Attr_is_punct, - "Attr_is_space": Attr_is_space, - "Attr_is_title": Attr_is_title, - "Attr_is_upper": Attr_is_upper, - "Attr_like_url": Attr_like_url, - "Attr_like_num": Attr_like_num, - "Attr_like_email": Attr_like_email, - "Attr_is_stop": Attr_is_stop, - "Attr_is_oov": Attr_is_oov, +IDS = { + "": NIL, + "IS_ALPHA": IS_ALPHA, + "IS_ASCII": IS_ASCII, + "IS_DIGIT": IS_DIGIT, + "IS_LOWER": IS_LOWER, + "IS_PUNCT": IS_PUNCT, + "IS_SPACE": IS_SPACE, + "IS_TITLE": IS_TITLE, + "IS_UPPER": IS_UPPER, + "LIKE_URL": LIKE_URL, + 
"LIKE_NUM": LIKE_NUM, + "LIKE_EMAIL": LIKE_EMAIL, + "IS_STOP": IS_STOP, + "IS_OOV": IS_OOV, - "Attr_flag14": Attr_flag14, - "Attr_flag15": Attr_flag15, - "Attr_flag16": Attr_flag16, - "Attr_flag17": Attr_flag17, - "Attr_flag18": Attr_flag18, - "Attr_flag19": Attr_flag19, - "Attr_flag20": Attr_flag20, - "Attr_flag21": Attr_flag21, - "Attr_flag22": Attr_flag22, - "Attr_flag23": Attr_flag23, - "Attr_flag24": Attr_flag24, - "Attr_flag25": Attr_flag25, - "Attr_flag26": Attr_flag26, - "Attr_flag27": Attr_flag27, - "Attr_flag28": Attr_flag28, - "Attr_flag29": Attr_flag29, - "Attr_flag30": Attr_flag30, - "Attr_flag31": Attr_flag31, - "Attr_flag32": Attr_flag32, - "Attr_flag33": Attr_flag33, - "Attr_flag34": Attr_flag34, - "Attr_flag35": Attr_flag35, - "Attr_flag36": Attr_flag36, - "Attr_flag37": Attr_flag37, - "Attr_flag38": Attr_flag38, - "Attr_flag39": Attr_flag39, - "Attr_flag40": Attr_flag40, - "Attr_flag41": Attr_flag41, - "Attr_flag42": Attr_flag42, - "Attr_flag43": Attr_flag43, - "Attr_flag44": Attr_flag44, - "Attr_flag45": Attr_flag45, - "Attr_flag46": Attr_flag46, - "Attr_flag47": Attr_flag47, - "Attr_flag48": Attr_flag48, - "Attr_flag49": Attr_flag49, - "Attr_flag50": Attr_flag50, - "Attr_flag51": Attr_flag51, - "Attr_flag52": Attr_flag52, - "Attr_flag53": Attr_flag53, - "Attr_flag54": Attr_flag54, - "Attr_flag55": Attr_flag55, - "Attr_flag56": Attr_flag56, - "Attr_flag57": Attr_flag57, - "Attr_flag58": Attr_flag58, - "Attr_flag59": Attr_flag59, - "Attr_flag60": Attr_flag60, - "Attr_flag61": Attr_flag61, - "Attr_flag62": Attr_flag62, - "Attr_flag63": Attr_flag63, + "FLAG14": FLAG14, + "FLAG15": FLAG15, + "FLAG16": FLAG16, + "FLAG17": FLAG17, + "FLAG18": FLAG18, + "FLAG19": FLAG19, + "FLAG20": FLAG20, + "FLAG21": FLAG21, + "FLAG22": FLAG22, + "FLAG23": FLAG23, + "FLAG24": FLAG24, + "FLAG25": FLAG25, + "FLAG26": FLAG26, + "FLAG27": FLAG27, + "FLAG28": FLAG28, + "FLAG29": FLAG29, + "FLAG30": FLAG30, + "FLAG31": FLAG31, + "FLAG32": FLAG32, + "FLAG33": FLAG33, + 
"FLAG34": FLAG34, + "FLAG35": FLAG35, + "FLAG36": FLAG36, + "FLAG37": FLAG37, + "FLAG38": FLAG38, + "FLAG39": FLAG39, + "FLAG40": FLAG40, + "FLAG41": FLAG41, + "FLAG42": FLAG42, + "FLAG43": FLAG43, + "FLAG44": FLAG44, + "FLAG45": FLAG45, + "FLAG46": FLAG46, + "FLAG47": FLAG47, + "FLAG48": FLAG48, + "FLAG49": FLAG49, + "FLAG50": FLAG50, + "FLAG51": FLAG51, + "FLAG52": FLAG52, + "FLAG53": FLAG53, + "FLAG54": FLAG54, + "FLAG55": FLAG55, + "FLAG56": FLAG56, + "FLAG57": FLAG57, + "FLAG58": FLAG58, + "FLAG59": FLAG59, + "FLAG60": FLAG60, + "FLAG61": FLAG61, + "FLAG62": FLAG62, + "FLAG63": FLAG63, - "Attr_id": Attr_id, - "Attr_orth": Attr_orth, - "Attr_lower": Attr_lower, - "Attr_norm": Attr_norm, - "Attr_shape": Attr_shape, - "Attr_prefix": Attr_prefix, - "Attr_suffix": Attr_suffix, + "ID": ID, + "ORTH": ORTH, + "LOWER": LOWER, + "NORM": NORM, + "SHAPE": SHAPE, + "PREFIX": PREFIX, + "SUFFIX": SUFFIX, - "Attr_length": Attr_length, - "Attr_cluster": Attr_cluster, - "Attr_lemma": Attr_lemma, - "Attr_pos": Attr_pos, - "Attr_tag": Attr_tag, - "Attr_dep": Attr_dep, - "Attr_ent_iob": Attr_ent_iob, - "Attr_ent_type": Attr_ent_type, - "Attr_head": Attr_head, - "Attr_spacy": Attr_spacy, - "Attr_prob": Attr_prob, + "LENGTH": LENGTH, + "CLUSTER": CLUSTER, + "LEMMA": LEMMA, + "POS": POS, + "TAG": TAG, + "DEP": DEP, + "ENT_IOB": ENT_IOB, + "ENT_TYPE": ENT_TYPE, + "HEAD": HEAD, + "SPACY": SPACY, + "PROB": PROB, - "POS_adj": POS_adj, - "POS_adp": POS_adp, - "POS_adv": POS_adv, - "POS_aux": POS_aux, - "POS_conj": POS_conj, - "POS_det": POS_det, - "POS_intj": POS_intj, - "POS_noun": POS_noun, - "POS_num": POS_num, - "POS_part": POS_part, - "POS_pron": POS_pron, - "POS_propn": POS_propn, - "POS_punct": POS_punct, - "POS_sconj": POS_sconj, - "POS_sym": POS_sym, - "POS_verb": POS_verb, - "POS_x": POS_x, - "POS_eol": POS_eol, - "POS_space": POS_space, + "ADJ": ADJ, + "ADP": ADP, + "ADV": ADV, + "AUX": AUX, + "CONJ": CONJ, + "DET": DET, + "INTJ": INTJ, + "NOUN": NOUN, + "NUM": NUM, + "PART": 
PART, + "PRON": PRON, + "PROPN": PROPN, + "PUNCT": PUNCT, + "SCONJ": SCONJ, + "SYM": SYM, + "VERB": VERB, + "X": X, + "EOL": EOL, + "SPACE": SPACE, "Animacy_anim": Animacy_anim, "Animacy_inam": Animacy_inam, @@ -420,4 +421,4 @@ SYMBOL_IDS = { "Dep_xcomp": Dep_xcomp } -SYMBOL_NAMES = [it[0] for it in sorted(SYMBOL_IDS.items(), key=lambda it: it[1])] +NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])] From 37b909b6b65a50affbe89578d76546386069059c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 22:12:06 +1100 Subject: [PATCH 13/22] * Use the symbols file in vocab instead of the symbols subfiles like attrs.pxd --- spacy/vocab.pyx | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 6cf829344..0f43967bb 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -20,7 +20,7 @@ from .cfile cimport CFile from .lemmatizer import Lemmatizer from . import attrs -from . import parts_of_speech +from . import symbols from cymem.cymem cimport Address from . import util @@ -75,18 +75,9 @@ cdef class Vocab: # is the frequency rank of the word, plus a certain offset. The structural # strings are loaded first, because the vocab is open-class, and these # symbols are closed class. 
- for name in attrs.NAMES: + for name in symbols.NAMES + list(sorted(tag_map.keys())): if name: _ = self.strings[name] - for name in parts_of_speech.NAMES: - if name: - _ = self.strings[name] - #for morph_name in UNIV_MORPH_NAMES: - # _ = self.strings[morph_name] - #for entity_type_name in entity_types.NAMES: - # _ = self.strings[entity_type_name] - #for tag_name in sorted(tag_map.keys()): - # _ = self.strings[tag_name] self.get_lex_attr = get_lex_attr self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.serializer_freqs = serializer_freqs From 7b4af3d1e77104f37a0683c3dc4f950be179fb72 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 22:58:34 +1100 Subject: [PATCH 14/22] * Fix parts_of_speech now that symbols list has been reformed --- spacy/parts_of_speech.pxd | 43 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index 9fbdbd71f..c97673a69 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -1,24 +1,23 @@ -from .symbols cimport * - +from . 
cimport symbols cpdef enum univ_pos_t: - NO_TAG = NIL - ADJ = POS_adj - ADP = POS_adp - ADV = POS_adv - AUX = POS_aux - CONJ = POS_conj - DET = POS_det - INTJ = POS_intj - NOUN = POS_noun - NUM = POS_num - PART = POS_part - PRON = POS_pron - PROPN = POS_propn - PUNCT = POS_punct - SCONJ = POS_sconj - SYM = POS_sym - VERB = POS_verb - X = POS_x - EOL = POS_eol - SPACE = POS_space + NO_TAG = 0 + ADJ = symbols.ADJ + ADP + ADV + AUX + CONJ + DET + INTJ + NOUN + NUM + PART + PRON + PROPN + PUNCT + SCONJ + SYM + VERB + X + EOL + SPACE From e70368d15719fd269f52b8d76d9bbeb8d851c307 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 22:59:14 +1100 Subject: [PATCH 15/22] * Use lower case strings for dependency label names in symbols enum --- spacy/symbols.pxd | 132 +++++++++++++++++++++++----------------------- spacy/symbols.pyx | 132 +++++++++++++++++++++++----------------------- 2 files changed, 132 insertions(+), 132 deletions(-) diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 590a2d41d..0c60f6f67 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -351,71 +351,71 @@ cpdef enum symbol_t: VerbType_mod # U VerbType_light # U - Name_person - Name_norp - Name_facility - Name_org - Name_gpe - Name_loc - Name_product - Name_event - Name_work_of_art - Name_language + PERSON + NORP + FACILITY + ORG + GPE + LOC + PRODUCT + EVENT + WORK_OF_ART + LANGUAGE - Unit_date - Unit_time - Unit_percent - Unit_money - Unit_quantity - Unit_ordinal - Unit_cardinal + DATE + TIME + PERCENT + MONEY + QUANTITY + ORDINAL + CARDINAL - Dep_acomp - Dep_advcl - Dep_advmod - Dep_agent - Dep_amod - Dep_appos - Dep_attr - Dep_aux - Dep_auxpass - Dep_cc - Dep_ccomp - Dep_complm - Dep_conj - Dep_csubj - Dep_csubjpass - Dep_dep - Dep_det - Dep_dobj - Dep_expl - Dep_hmod - Dep_hyph - Dep_infmod - Dep_intj - Dep_iobj - Dep_mark - Dep_meta - Dep_neg - Dep_nmod - Dep_nn - Dep_npadvmod - Dep_nsubj - Dep_nsubjpass - Dep_num - Dep_number - Dep_oprd - Dep_parataxis - Dep_partmod - 
Dep_pcomp - Dep_pobj - Dep_poss - Dep_possessive - Dep_preconj - Dep_prep - Dep_prt - Dep_punct - Dep_quantmod - Dep_rcmod - Dep_root - Dep_xcomp + acomp + advcl + advmod + agent + amod + appos + attr + aux + auxpass + cc + ccomp + complm + conj + csubj + csubjpass + dep + det + dobj + expl + hmod + hyph + infmod + intj + iobj + mark + meta + neg + nmod + nn + npadvmod + nsubj + nsubjpass + num + number + oprd + parataxis + partmod + pcomp + pobj + poss + possessive + preconj + prep + prt + punct + quantmod + rcmod + root + xcomp diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 9a3d219d5..31b01db98 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -351,74 +351,74 @@ IDS = { "VerbType_mod ": VerbType_mod, # U, "VerbType_light ": VerbType_light, # U, - "Name_person": Name_person, - "Name_norp": Name_norp, - "Name_facility": Name_facility, - "Name_org": Name_org, - "Name_gpe": Name_gpe, - "Name_loc": Name_loc, - "Name_product": Name_product, - "Name_event": Name_event, - "Name_work_of_art": Name_work_of_art, - "Name_language": Name_language, + "PERSON": PERSON, + "NORP": NORP, + "FACILITY": FACILITY, + "ORG": ORG, + "GPE": GPE, + "LOC": LOC, + "PRODUCT": PRODUCT, + "EVENT": EVENT, + "WORK_OF_ART": WORK_OF_ART, + "LANGUAGE": LANGUAGE, - "Unit_date": Unit_date, - "Unit_time": Unit_time, - "Unit_percent": Unit_percent, - "Unit_money": Unit_money, - "Unit_quantity": Unit_quantity, - "Unit_ordinal": Unit_ordinal, - "Unit_cardinal": Unit_cardinal, + "DATE": DATE, + "TIME": TIME, + "PERCENT": PERCENT, + "MONEY": MONEY, + "QUANTITY": QUANTITY, + "ORDINAL": ORDINAL, + "CARDINAL": CARDINAL, - "Dep_acomp": Dep_acomp, - "Dep_advcl": Dep_advcl, - "Dep_advmod": Dep_advmod, - "Dep_agent": Dep_agent, - "Dep_amod": Dep_amod, - "Dep_appos": Dep_appos, - "Dep_attr": Dep_attr, - "Dep_aux": Dep_aux, - "Dep_auxpass": Dep_auxpass, - "Dep_cc": Dep_cc, - "Dep_ccomp": Dep_ccomp, - "Dep_complm": Dep_complm, - "Dep_conj": Dep_conj, - "Dep_csubj": Dep_csubj, - "Dep_csubjpass": 
Dep_csubjpass, - "Dep_dep": Dep_dep, - "Dep_det": Dep_det, - "Dep_dobj": Dep_dobj, - "Dep_expl": Dep_expl, - "Dep_hmod": Dep_hmod, - "Dep_hyph": Dep_hyph, - "Dep_infmod": Dep_infmod, - "Dep_intj": Dep_intj, - "Dep_iobj": Dep_iobj, - "Dep_mark": Dep_mark, - "Dep_meta": Dep_meta, - "Dep_neg": Dep_neg, - "Dep_nmod": Dep_nmod, - "Dep_nn": Dep_nn, - "Dep_npadvmod": Dep_npadvmod, - "Dep_nsubj": Dep_nsubj, - "Dep_nsubjpass": Dep_nsubjpass, - "Dep_num": Dep_num, - "Dep_number": Dep_number, - "Dep_oprd": Dep_oprd, - "Dep_parataxis": Dep_parataxis, - "Dep_partmod": Dep_partmod, - "Dep_pcomp": Dep_pcomp, - "Dep_pobj": Dep_pobj, - "Dep_poss": Dep_poss, - "Dep_possessive": Dep_possessive, - "Dep_preconj": Dep_preconj, - "Dep_prep": Dep_prep, - "Dep_prt": Dep_prt, - "Dep_punct": Dep_punct, - "Dep_quantmod": Dep_quantmod, - "Dep_rcmod": Dep_rcmod, - "Dep_root": Dep_root, - "Dep_xcomp": Dep_xcomp + "acomp": acomp, + "advcl": advcl, + "advmod": advmod, + "agent": agent, + "amod": amod, + "appos": appos, + "attr": attr, + "aux": aux, + "auxpass": auxpass, + "cc": cc, + "ccomp": ccomp, + "complm": complm, + "conj": conj, + "csubj": csubj, + "csubjpass": csubjpass, + "dep": dep, + "det": det, + "dobj": dobj, + "expl": expl, + "hmod": hmod, + "hyph": hyph, + "infmod": infmod, + "intj": intj, + "iobj": iobj, + "mark": mark, + "meta": meta, + "neg": neg, + "nmod": nmod, + "nn": nn, + "npadvmod": npadvmod, + "nsubj": nsubj, + "nsubjpass": nsubjpass, + "num": num, + "number": number, + "oprd": oprd, + "parataxis": parataxis, + "partmod": partmod, + "pcomp": pcomp, + "pobj": pobj, + "poss": poss, + "possessive": possessive, + "preconj": preconj, + "prep": prep, + "prt": prt, + "punct": punct, + "quantmod": quantmod, + "rcmod": rcmod, + "root": root, + "xcomp": xcomp } NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])] From 41012907a83a0684cc6044d9e5b7dcf2cbc704db Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Oct 2015 00:51:43 +1100 Subject: [PATCH 16/22] * 
Fix variable name --- spacy/vocab.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 0f43967bb..af9161d6b 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -271,17 +271,17 @@ cdef class Vocab: i += 1 fp.close() - def load_vectors(self, loc_or_file): + def load_vectors(self, file_): cdef LexemeC* lexeme cdef attr_t orth cdef int32_t vec_len = -1 - for line_num, line in enumerate(loc_or_file): + for line_num, line in enumerate(file_): pieces = line.split() word_str = pieces.pop(0) if vec_len == -1: vec_len = len(pieces) elif vec_len != len(pieces): - raise VectorReadError.mismatched_sizes(loc_or_file, line_num, + raise VectorReadError.mismatched_sizes(file_, line_num, vec_len, len(pieces)) orth = self.strings[word_str] lexeme = self.get_by_orth(self.mem, orth) From 0cee928467abf01007fd73433cc5a3e387f24883 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Oct 2015 15:12:32 +1100 Subject: [PATCH 17/22] * Allow StringStore to be pickled, to start addressing Issue #125 --- spacy/strings.pyx | 13 ++++++++++++- tests/vocab/test_intern.py | 17 +++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index a4a470158..2208d3bdf 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -69,12 +69,15 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except cdef class StringStore: '''Map strings to and from integer IDs.''' - def __init__(self): + def __init__(self, strings=None): self.mem = Pool() self._map = PreshMap() self._resize_at = 10000 self.c = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.size = 1 + if strings is not None: + for string in strings: + _ = self[string] property size: def __get__(self): @@ -113,6 +116,14 @@ cdef class StringStore: for i in range(self.size): yield self[i] + def __reduce__(self): + strings = [""] + for i in range(1, self.size): + string = &self.c[i] + py_string = 
_decode(string) + strings.append(py_string) + return (StringStore, (strings,), None, None, None) + cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL: # 0 means missing, but we don't bother offsetting the index. key = hash64(chars, length * sizeof(char), 0) diff --git a/tests/vocab/test_intern.py b/tests/vocab/test_intern.py index 6e007c645..256706c6f 100644 --- a/tests/vocab/test_intern.py +++ b/tests/vocab/test_intern.py @@ -1,5 +1,7 @@ # -*- coding: utf8 -*- from __future__ import unicode_literals +import pickle +import StringIO from spacy.strings import StringStore @@ -76,3 +78,18 @@ def test_massive_strings(sstore): s513 = '1' * 513 orth = sstore[s513] assert sstore[orth] == s513 + + +def test_pickle_string_store(sstore): + hello_id = sstore[u'Hi'] + string_file = StringIO.StringIO() + pickle.dump(sstore, string_file) + + string_file.seek(0) + + loaded = pickle.load(string_file) + + assert loaded[hello_id] == u'Hi' + + + From dfe0ad51ffcc587e91b58d6475bb66066b1dfa01 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Oct 2015 15:16:59 +1100 Subject: [PATCH 18/22] * Add pickle test for lemmatizer --- tests/tagger/test_lemmatizer.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py index ff10b6573..5dfdaabb1 100644 --- a/tests/tagger/test_lemmatizer.py +++ b/tests/tagger/test_lemmatizer.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +import StringIO +import pickle from spacy.lemmatizer import Lemmatizer, read_index, read_exc from spacy.en import LOCAL_DATA_DIR @@ -41,3 +43,12 @@ def test_smart_quotes(lemmatizer): do = lemmatizer.punct assert do('“') == set(['"']) assert do('“') == set(['"']) + + +def test_pickle_lemmatizer(lemmatizer): + file_ = StringIO.StringIO() + pickle.dump(lemmatizer, file_) + + file_.seek(0) + + loaded = pickle.load(file_) From 5ca57bd859a0ea57108baab454ac5bb073e62afb Mon Sep 17 00:00:00 
2001 From: Matthew Honnibal Date: Mon, 12 Oct 2015 15:27:47 +1100 Subject: [PATCH 19/22] * Ensure Morphology can be pickled, to address Issue #125. --- spacy/morphology.pxd | 1 + spacy/morphology.pyx | 4 ++++ tests/morphology/test_pickle.py | 17 +++++++++++++++++ 3 files changed, 22 insertions(+) create mode 100644 tests/morphology/test_pickle.py diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 62d3fccc1..847626158 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -25,6 +25,7 @@ cdef class Morphology: cdef readonly Pool mem cdef readonly StringStore strings cdef public object lemmatizer + cdef readonly object tag_map cdef public object n_tags cdef public object reverse_index cdef public object tag_names diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index c53e5f478..e8b1f3520 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -14,6 +14,7 @@ cdef class Morphology: def __init__(self, StringStore string_store, tag_map, lemmatizer): self.mem = Pool() self.strings = string_store + self.tag_map = tag_map self.lemmatizer = lemmatizer self.n_tags = len(tag_map) + 1 self.tag_names = tuple(sorted(tag_map.keys())) @@ -28,6 +29,9 @@ cdef class Morphology: self.reverse_index[self.rich_tags[i].name] = i self._cache = PreshMapArray(self.n_tags) + def __reduce__(self): + return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None) + cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int tag_id if isinstance(tag, basestring): diff --git a/tests/morphology/test_pickle.py b/tests/morphology/test_pickle.py new file mode 100644 index 000000000..f1b5bcd4c --- /dev/null +++ b/tests/morphology/test_pickle.py @@ -0,0 +1,17 @@ +import pytest + +import pickle +import StringIO + + +from spacy.morphology import Morphology +from spacy.lemmatizer import Lemmatizer +from spacy.strings import StringStore + + +def test_pickle(): + morphology = Morphology(StringStore(), {}, Lemmatizer({}, {}, {})) + + file_ = 
StringIO.StringIO() + pickle.dump(morphology, file_) + From 85e7944572f047b2fb4d26986c0b416dffa1e3d9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Oct 2015 16:41:31 +1100 Subject: [PATCH 20/22] * Start trying to pickle Vocab --- spacy/vocab.pxd | 2 -- spacy/vocab.pyx | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 929c7b345..d850bf929 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -25,7 +25,6 @@ cdef struct _Cached: cdef class Vocab: - cpdef public lexeme_props_getter cdef Pool mem cpdef readonly StringStore strings cpdef readonly Morphology morphology @@ -33,7 +32,6 @@ cdef class Vocab: cdef public object _serializer cdef public object data_dir cdef public object get_lex_attr - cdef public object pos_tags cdef public object serializer_freqs cdef const LexemeC* get(self, Pool mem, unicode string) except NULL diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index af9161d6b..7f07a64ba 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -10,6 +10,8 @@ from os import path import io import math import json +import tempfile +import copy_reg from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme @@ -96,6 +98,18 @@ cdef class Vocab: """The current number of lexemes stored.""" return self.length + def __reduce__(self): + tmp_dir = tempfile.mkdtmp() + lex_loc = path.join(tmp_dir, 'lexemes.bin') + str_loc = path.join(tmp_dir, 'strings.txt') + map_loc = path.join(tmp_dir, 'tag_map.json') + + self.dump(lex_loc) + self.strings.dump(str_loc) + json.dump(self.morphology.tag_map, open(map_loc, 'w')) + + return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None) + cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. 
If the pool @@ -339,6 +353,9 @@ cdef class Vocab: return vec_len +copy_reg.constructor(Vocab.from_dir) + + def write_binary_vectors(in_loc, out_loc): cdef CFile out_file = CFile(out_loc, 'wb') cdef Address mem From f8de403483f587e39ddd7148807d305d762b7736 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Oct 2015 17:00:01 +1100 Subject: [PATCH 21/22] * Work on pickling Vocab instances. The current implementation is not correct, but it may serve to see whether this approach is workable. Pickling is necessary to address Issue #125 --- spacy/vocab.pyx | 12 +++++++++--- tests/vocab/test_vocab.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 7f07a64ba..dd6792104 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -99,7 +99,7 @@ cdef class Vocab: return self.length def __reduce__(self): - tmp_dir = tempfile.mkdtmp() + tmp_dir = tempfile.mkdtemp() lex_loc = path.join(tmp_dir, 'lexemes.bin') str_loc = path.join(tmp_dir, 'strings.txt') map_loc = path.join(tmp_dir, 'tag_map.json') @@ -108,7 +108,7 @@ cdef class Vocab: self.strings.dump(str_loc) json.dump(self.morphology.tag_map, open(map_loc, 'w')) - return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None) + return (unpickle_vocab, (tmp_dir,), None, None) cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme @@ -353,7 +353,13 @@ cdef class Vocab: return vec_len -copy_reg.constructor(Vocab.from_dir) +def unpickle_vocab(data_dir): + # TODO: This needs fixing --- the trouble is, we can't pickle staticmethods, + # so we need to fiddle with the design of Language a little bit. 
+ from .language import Language + return Vocab.from_dir(data_dir, Language.default_lex_attrs()) + +copy_reg.constructor(unpickle_vocab) def write_binary_vectors(in_loc, out_loc): diff --git a/tests/vocab/test_vocab.py b/tests/vocab/test_vocab.py index 153e0d546..1ab3746f3 100644 --- a/tests/vocab/test_vocab.py +++ b/tests/vocab/test_vocab.py @@ -1,10 +1,13 @@ from __future__ import unicode_literals import pytest +import StringIO +import pickle from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA from spacy.parts_of_speech import NOUN, VERB + def test_neq(en_vocab): addr = en_vocab['Hello'] assert en_vocab['bye'].orth != addr.orth @@ -38,3 +41,11 @@ def test_symbols(en_vocab): assert en_vocab.strings['ORTH'] == ORTH assert en_vocab.strings['PROB'] == PROB + +def test_pickle_vocab(en_vocab): + file_ = StringIO.StringIO() + pickle.dump(en_vocab, file_) + + file_.seek(0) + + loaded = pickle.load(file_) From 20fd36a0f785fd447b66808480b7722d62ceb11e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Oct 2015 19:33:11 +1100 Subject: [PATCH 22/22] * Very scrappy, likely buggy first-cut pickle implementation, to work on Issue #125: allow pickle for Apache Spark. The current implementation sends stuff to temp files, and does almost nothing to ensure all modifiable state is actually preserved. The Language() instance is a deep tree of extension objects, and if pickling during training, some of the C-data state is hard to preserve. 
--- spacy/_ml.pxd | 1 + spacy/_ml.pyx | 14 ++++++++++++++ spacy/language.py | 6 ++++++ spacy/matcher.pyx | 20 +++++++++++++------- spacy/syntax/parser.pyx | 4 +++- spacy/syntax/transition_system.pxd | 2 ++ spacy/syntax/transition_system.pyx | 10 ++++++++-- spacy/tagger.pyx | 3 +++ spacy/vocab.pyx | 30 +++++++++++++++++++++--------- tests/parser/test_pickle.py | 16 ++++++++++++++++ tests/test_pickle.py | 15 +++++++++++++++ tests/vocab/test_vocab.py | 4 ++-- 12 files changed, 104 insertions(+), 21 deletions(-) create mode 100644 tests/parser/test_pickle.py create mode 100644 tests/test_pickle.py diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index c2c7ffded..b9a190b67 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -29,5 +29,6 @@ cdef class Model: cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1 cdef object model_loc + cdef object _templates cdef Extractor _extractor cdef LinearModel _model diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index 56c080fa6..bc789e7d6 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -3,6 +3,7 @@ from __future__ import unicode_literals from __future__ import division from os import path +import tempfile import os import shutil import json @@ -52,6 +53,7 @@ cdef class Model: def __init__(self, n_classes, templates, model_loc=None): if model_loc is not None and path.isdir(model_loc): model_loc = path.join(model_loc, 'model') + self._templates = templates self.n_classes = n_classes self._extractor = Extractor(templates) self.n_feats = self._extractor.n_templ @@ -60,6 +62,18 @@ cdef class Model: if self.model_loc and path.exists(self.model_loc): self._model.load(self.model_loc, freq_thresh=0) + def __reduce__(self): + model_loc = tempfile.mkstemp() + # TODO: This is a potentially buggy implementation. We're not really + # given a good guarantee that all internal state is saved correctly here, + # since there are learning parameters for e.g. 
the model averaging in + # averaged perceptron, the gradient calculations in AdaGrad, etc + # that aren't necessarily saved. So, if we're part way through training + # the model, and then we pickle it, we won't recover the state correctly. + self._model.dump(model_loc) + return (Model, (self.n_classes, self.templates, model_loc), + None, None) + def predict(self, Example eg): self.set_scores(eg.c.scores, eg.c.atoms) eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes) diff --git a/spacy/language.py b/spacy/language.py index ba4c048d7..65425bc45 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -207,6 +207,12 @@ class Language(object): self.entity = entity self.matcher = matcher + def __reduce__(self): + return (self.__class__, + (None, self.vocab, self.tokenizer, self.tagger, self.parser, + self.entity, self.matcher, None), + None, None) + def __call__(self, text, tag=True, parse=True, entity=True): """Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. 
Alignment into the original string diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 3ee825932..2bf8370b5 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -168,13 +168,7 @@ cdef class Matcher: cdef Pool mem cdef vector[Pattern*] patterns cdef readonly Vocab vocab - - def __init__(self, vocab, patterns): - self.vocab = vocab - self.mem = Pool() - self.vocab = vocab - for entity_key, (etype, attrs, specs) in sorted(patterns.items()): - self.add(entity_key, etype, attrs, specs) + cdef object _patterns @classmethod def from_dir(cls, data_dir, Vocab vocab): @@ -186,10 +180,22 @@ cdef class Matcher: else: return cls(vocab, {}) + def __init__(self, vocab, patterns): + self.vocab = vocab + self.mem = Pool() + self.vocab = vocab + self._patterns = dict(patterns) + for entity_key, (etype, attrs, specs) in sorted(patterns.items()): + self.add(entity_key, etype, attrs, specs) + + def __reduce__(self): + return (self.__class__, (self.vocab, self._patterns), None, None) + property n_patterns: def __get__(self): return self.patterns.size() def add(self, entity_key, etype, attrs, specs): + self._patterns[entity_key] = (etype, dict(attrs), list(specs)) if isinstance(entity_key, basestring): entity_key = self.vocab.strings[entity_key] if isinstance(etype, basestring): diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index cf61647b9..25932a0a4 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -83,7 +83,6 @@ cdef class Parser: model = Model(moves.n_moves, templates, model_dir) return cls(strings, moves, model) - def __call__(self, Doc tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) @@ -93,6 +92,9 @@ cdef class Parser: self.parse(stcls, eg.c) tokens.set_parse(stcls._sent) + def __reduce__(self): + return (Parser, (self.moves.strings, self.moves, self.model), None, None) + cdef void predict(self, StateClass stcls, ExampleC* eg) nogil: memset(eg.scores, 0, eg.nr_class * 
sizeof(weight_t)) self.moves.set_valid(eg.is_valid, stcls) diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 4cf9aae7e..38bc91605 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -37,6 +37,8 @@ cdef class TransitionSystem: cdef public int root_label cdef public freqs + cdef object _labels_by_action + cdef int initialize_state(self, StateClass state) except -1 cdef int finalize_state(self, StateClass state) nogil diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 86aef1fbc..5de3513e0 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -15,7 +15,8 @@ class OracleError(Exception): cdef class TransitionSystem: - def __init__(self, StringStore string_table, dict labels_by_action): + def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None): + self._labels_by_action = labels_by_action self.mem = Pool() self.n_moves = sum(len(labels) for labels in labels_by_action.values()) self._is_valid = self.mem.alloc(self.n_moves, sizeof(bint)) @@ -30,7 +31,7 @@ cdef class TransitionSystem: i += 1 self.c = moves self.root_label = self.strings['ROOT'] - self.freqs = {} + self.freqs = {} if _freqs is None else _freqs for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB): self.freqs[attr] = defaultdict(int) self.freqs[attr][0] = 1 @@ -39,6 +40,11 @@ cdef class TransitionSystem: self.freqs[HEAD][i] = 1 self.freqs[HEAD][-i] = 1 + def __reduce__(self): + return (self.__class__, + (self.strings, self._labels_by_action, self.freqs), + None, None) + cdef int initialize_state(self, StateClass state) except -1: pass diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 756bb7ea4..69925ff89 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -148,6 +148,9 @@ cdef class Tagger: tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length + def __reduce__(self): + return (self.__class__, (self.vocab, 
self.model), None, None) + def tag_from_strings(self, Doc tokens, object tag_strs): cdef int i for i in range(tokens.length): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index dd6792104..023d0bd89 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -99,16 +99,18 @@ cdef class Vocab: return self.length def __reduce__(self): + # TODO: Dump vectors tmp_dir = tempfile.mkdtemp() lex_loc = path.join(tmp_dir, 'lexemes.bin') str_loc = path.join(tmp_dir, 'strings.txt') - map_loc = path.join(tmp_dir, 'tag_map.json') + vec_loc = path.join(self.data_dir, 'vec.bin') if self.data_dir is not None else None self.dump(lex_loc) self.strings.dump(str_loc) - json.dump(self.morphology.tag_map, open(map_loc, 'w')) - - return (unpickle_vocab, (tmp_dir,), None, None) + + state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr, + self.serializer_freqs, self.data_dir) + return (unpickle_vocab, state, None, None) cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme @@ -353,11 +355,21 @@ cdef class Vocab: return vec_len -def unpickle_vocab(data_dir): - # TODO: This needs fixing --- the trouble is, we can't pickle staticmethods, - # so we need to fiddle with the design of Language a little bit. 
- from .language import Language - return Vocab.from_dir(data_dir, Language.default_lex_attrs()) +def unpickle_vocab(strings_loc, lex_loc, vec_loc, morphology, get_lex_attr, + serializer_freqs, data_dir): + cdef Vocab vocab = Vocab() + + vocab.get_lex_attr = get_lex_attr + vocab.morphology = morphology + vocab.strings = morphology.strings + vocab.data_dir = data_dir + vocab.serializer_freqs = serializer_freqs + + vocab.load_lexemes(strings_loc, lex_loc) + if vec_loc is not None: + vocab.load_vectors_from_bin_loc(vec_loc) + return vocab + copy_reg.constructor(unpickle_vocab) diff --git a/tests/parser/test_pickle.py b/tests/parser/test_pickle.py new file mode 100644 index 000000000..b1b768650 --- /dev/null +++ b/tests/parser/test_pickle.py @@ -0,0 +1,16 @@ +import pytest + +import pickle +import cloudpickle +import StringIO + + +@pytest.mark.models +def test_pickle(EN): + file_ = StringIO.StringIO() + cloudpickle.dump(EN.parser, file_) + + file_.seek(0) + + loaded = pickle.load(file_) + diff --git a/tests/test_pickle.py b/tests/test_pickle.py new file mode 100644 index 000000000..02d908b0d --- /dev/null +++ b/tests/test_pickle.py @@ -0,0 +1,15 @@ +import pytest +import StringIO +import cloudpickle +import pickle + + +@pytest.mark.models +def test_pickle_english(EN): + file_ = StringIO.StringIO() + cloudpickle.dump(EN, file_) + + file_.seek(0) + + loaded = pickle.load(file_) + diff --git a/tests/vocab/test_vocab.py b/tests/vocab/test_vocab.py index 1ab3746f3..76e8d27dd 100644 --- a/tests/vocab/test_vocab.py +++ b/tests/vocab/test_vocab.py @@ -1,13 +1,13 @@ from __future__ import unicode_literals import pytest import StringIO +import cloudpickle import pickle from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA from spacy.parts_of_speech import NOUN, VERB - def test_neq(en_vocab): addr = en_vocab['Hello'] assert en_vocab['bye'].orth != addr.orth @@ -44,7 +44,7 @@ def test_symbols(en_vocab): def test_pickle_vocab(en_vocab): file_ = StringIO.StringIO() - 
pickle.dump(en_vocab, file_) + cloudpickle.dump(en_vocab, file_) file_.seek(0)