From 85ce36ab114ec155148dd3878a41dbe3c198b291 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Wed, 7 Oct 2015 00:39:50 +1100
Subject: [PATCH 01/22] * Refactor symbols, so that frequency rank can be
 derived from the orth id of a word.

---
 bin/init_model.py         |  5 +++
 setup.py                  |  3 +-
 spacy/attrs.pxd           |  4 +-
 spacy/attrs.pyx           | 90 +++++++++++++++++++++++++++++++++++++++
 spacy/matcher.pyx         |  2 +-
 spacy/parts_of_speech.pxd | 45 ++++++++++----------
 spacy/vocab.pyx           | 15 +++++++
 7 files changed, 138 insertions(+), 26 deletions(-)

diff --git a/bin/init_model.py b/bin/init_model.py
index 72d7a3aae..6e44fd444 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
             probs[word] = oov_prob
 
     lexicon = []
+    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
+        # First encode the strings into the StringStore. This way, we can map
+        # the orth IDs to frequency ranks
+        orth = vocab.strings[word]
+    # Now actually load the vocab
     for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
         lexeme = vocab[word]
         lexeme.prob = prob
diff --git a/setup.py b/setup.py
index 3036db94c..fb6a5b718 100644
--- a/setup.py
+++ b/setup.py
@@ -165,7 +165,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
              'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
              'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
              'spacy.cfile', 'spacy.matcher',
-             'spacy.syntax.ner']
+             'spacy.syntax.ner',
+             'spacy.symbols']
 
 
 if __name__ == '__main__':
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index c810762ef..d0f476dcd 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -1,5 +1,6 @@
 # Reserve 64 values for flag features
 cpdef enum attr_id_t:
+    NULL_ATTR
     IS_ALPHA
     IS_ASCII
     IS_DIGIT
@@ -14,8 +15,7 @@ cpdef enum attr_id_t:
     IS_STOP
     IS_OOV
     
-    FLAG13 = 13
-    FLAG14
+    FLAG14 = 14
     FLAG15
     FLAG16
     FLAG17
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index e69de29bb..8ce0f7a17 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -0,0 +1,90 @@
+ATTR_IDS = {
+    "NULL_ATTR": NULL_ATTR,
+    "IS_ALPHA": IS_ALPHA,
+    "IS_ASCII": IS_ASCII,
+    "IS_DIGIT": IS_DIGIT,
+    "IS_LOWER": IS_LOWER,
+    "IS_PUNCT": IS_PUNCT,
+    "IS_SPACE": IS_SPACE,
+    "IS_TITLE": IS_TITLE,
+    "IS_UPPER": IS_UPPER,
+    "LIKE_URL": LIKE_URL,
+    "LIKE_NUM": LIKE_NUM,
+    "LIKE_EMAIL": LIKE_EMAIL,
+    "IS_STOP": IS_STOP,
+    "IS_OOV": IS_OOV,
+
+    "FLAG14": FLAG14,
+    "FLAG15": FLAG15,
+    "FLAG16": FLAG16,
+    "FLAG17": FLAG17,
+    "FLAG18": FLAG18,
+    "FLAG19": FLAG19,
+    "FLAG20": FLAG20,
+    "FLAG21": FLAG21,
+    "FLAG22": FLAG22,
+    "FLAG23": FLAG23,
+    "FLAG24": FLAG24,
+    "FLAG25": FLAG25,
+    "FLAG26": FLAG26,
+    "FLAG27": FLAG27,
+    "FLAG28": FLAG28,
+    "FLAG29": FLAG29,
+    "FLAG30": FLAG30,
+    "FLAG31": FLAG31,
+    "FLAG32": FLAG32,
+    "FLAG33": FLAG33,
+    "FLAG34": FLAG34,
+    "FLAG35": FLAG35,
+    "FLAG36": FLAG36,
+    "FLAG37": FLAG37,
+    "FLAG38": FLAG38,
+    "FLAG39": FLAG39,
+    "FLAG40": FLAG40,
+    "FLAG41": FLAG41,
+    "FLAG42": FLAG42,
+    "FLAG43": FLAG43,
+    "FLAG44": FLAG44,
+    "FLAG45": FLAG45,
+    "FLAG46": FLAG46,
+    "FLAG47": FLAG47,
+    "FLAG48": FLAG48,
+    "FLAG49": FLAG49,
+    "FLAG50": FLAG50,
+    "FLAG51": FLAG51,
+    "FLAG52": FLAG52,
+    "FLAG53": FLAG53,
+    "FLAG54": FLAG54,
+    "FLAG55": FLAG55,
+    "FLAG56": FLAG56,
+    "FLAG57": FLAG57,
+    "FLAG58": FLAG58,
+    "FLAG59": FLAG59,
+    "FLAG60": FLAG60,
+    "FLAG61": FLAG61,
+    "FLAG62": FLAG62,
+    "FLAG63": FLAG63,
+
+    "ID": ID,
+    "ORTH": ORTH,
+    "LOWER": LOWER,
+    "NORM": NORM,
+    "SHAPE": SHAPE,
+    "PREFIX": PREFIX,
+    "SUFFIX": SUFFIX,
+
+    "LENGTH": LENGTH,
+    "CLUSTER": CLUSTER,
+    "LEMMA": LEMMA,
+    "POS": POS,
+    "TAG": TAG,
+    "DEP": DEP,
+    "ENT_IOB": ENT_IOB,
+    "ENT_TYPE": ENT_TYPE,
+    "HEAD": HEAD,
+    "SPACY": SPACY,
+    "PROB": PROB,
+}
+
+# ATTR IDs, in order of the symbol
+ATTR_NAMES = [key for key, value in sorted(ATTR_IDS.items(), key=lambda item: item[1])]
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index afafd3ddb..3ee825932 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -15,7 +15,7 @@ from libcpp.vector cimport vector
 from murmurhash.mrmr cimport hash64
 
 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
-from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
+from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index e410c6971..17e349435 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -1,23 +1,24 @@
-# Google universal tag set
+from .symbols cimport *
+
+
 cpdef enum univ_pos_t:
-    NO_TAG
-    ADJ
-    ADP
-    ADV
-    AUX
-    CONJ
-    DET
-    INTJ
-    NOUN
-    NUM
-    PART
-    PRON
-    PROPN
-    PUNCT
-    SCONJ
-    SYM
-    VERB
-    X
-    EOL
-    SPACE
-    N_UNIV_TAGS
+    NO_TAG = EMPTY_VALUE
+    ADJ = POS_adj
+    ADP = POS_adp
+    ADV = POS_adv
+    AUX = POS_aux
+    CONJ = POS_conj
+    DET = POS_det
+    INTJ = POS_intj
+    NOUN = POS_noun
+    NUM = POS_num
+    PART = POS_part
+    PRON = POS_pron
+    PROPN = POS_propn
+    PUNCT = POS_punct
+    SCONJ = POS_sconj
+    SYM = POS_sym
+    VERB = POS_verb
+    X = POS_x
+    EOL = POS_eol
+    SPACE = POS_space
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index d79da8a79..caf3045f5 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -67,6 +67,21 @@ cdef class Vocab:
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
+        # Load strings in a special order, so that we have an onset number for
+        # the vocabulary. This way, when words are added in order, the orth ID
+        # is the frequency rank of the word, plus a certain offset. The structural
+        # strings are loaded first, because the vocab is open-class, and these
+        # symbols are closed class.
+        #for attr_name in sorted(ATTR_NAMES.keys()):
+        #    _ = self.strings[attr_name]
+        #for univ_pos_name in sorted(UNIV_POS_NAMES.keys()):
+        #    _ = self.strings[pos_name]
+        #for morph_name in sorted(UNIV_MORPH_NAMES.keys()):
+        #    _ = self.strings[morph_name]
+        #for entity_type_name in sorted(ENTITY_TYPES.keys()):
+        #    _ = self.strings[entity_type_name]
+        #for tag_name in sorted(TAG_MAP.keys()):
+        #    _ = self.strings[tag_name]
         self.get_lex_attr = get_lex_attr
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
         self.serializer_freqs = serializer_freqs

From 5c24ad3f5c8751eefa4a5acca51f524bf00d7a24 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Wed, 7 Oct 2015 00:40:22 +1100
Subject: [PATCH 02/22] * Whitespace

---
 lang_data/en/morphs.json | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lang_data/en/morphs.json b/lang_data/en/morphs.json
index 917cbc759..059381b27 100644
--- a/lang_data/en/morphs.json
+++ b/lang_data/en/morphs.json
@@ -56,5 +56,4 @@
         "was":  {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
         "were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
     }
-
 }

From 10a4a843eac652a05a3bd8fe3215dea2d0824343 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Wed, 7 Oct 2015 00:41:17 +1100
Subject: [PATCH 03/22] * Enumerate all symbols in one file

---
 spacy/symbols.pxd | 421 +++++++++++++++++++++++++++++++++++++++++++++
 spacy/symbols.pyx | 424 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 845 insertions(+)
 create mode 100644 spacy/symbols.pxd
 create mode 100644 spacy/symbols.pyx

diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
new file mode 100644
index 000000000..e8ddeaa8f
--- /dev/null
+++ b/spacy/symbols.pxd
@@ -0,0 +1,421 @@
+cpdef enum symbol_t:
+    EMPTY_VALUE
+    Attr_is_alpha
+    Attr_is_ascii
+    Attr_is_digit
+    Attr_is_lower
+    Attr_is_punct
+    Attr_is_space
+    Attr_is_title
+    Attr_is_upper
+    Attr_like_url
+    Attr_like_num
+    Attr_like_email
+    Attr_is_stop
+    Attr_is_oov
+    
+    Attr_flag14
+    Attr_flag15
+    Attr_flag16
+    Attr_flag17
+    Attr_flag18
+    Attr_flag19
+    Attr_flag20
+    Attr_flag21
+    Attr_flag22
+    Attr_flag23
+    Attr_flag24
+    Attr_flag25
+    Attr_flag26
+    Attr_flag27
+    Attr_flag28
+    Attr_flag29
+    Attr_flag30
+    Attr_flag31
+    Attr_flag32
+    Attr_flag33
+    Attr_flag34
+    Attr_flag35
+    Attr_flag36
+    Attr_flag37
+    Attr_flag38
+    Attr_flag39
+    Attr_flag40
+    Attr_flag41
+    Attr_flag42
+    Attr_flag43
+    Attr_flag44
+    Attr_flag45
+    Attr_flag46
+    Attr_flag47
+    Attr_flag48
+    Attr_flag49
+    Attr_flag50
+    Attr_flag51
+    Attr_flag52
+    Attr_flag53
+    Attr_flag54
+    Attr_flag55
+    Attr_flag56
+    Attr_flag57
+    Attr_flag58
+    Attr_flag59
+    Attr_flag60
+    Attr_flag61
+    Attr_flag62
+    Attr_flag63
+
+    Attr_id
+    Attr_orth
+    Attr_lower
+    Attr_norm
+    Attr_shape
+    Attr_prefix
+    Attr_suffix
+
+    Attr_length
+    Attr_cluster
+    Attr_lemma
+    Attr_pos
+    Attr_tag
+    Attr_dep
+    Attr_ent_iob
+    Attr_ent_type
+    Attr_head
+    Attr_spacy
+    Attr_prob
+
+    POS_adj
+    POS_adp
+    POS_adv
+    POS_aux
+    POS_conj
+    POS_det
+    POS_intj
+    POS_noun
+    POS_num
+    POS_part
+    POS_pron
+    POS_propn
+    POS_punct
+    POS_sconj
+    POS_sym
+    POS_verb
+    POS_x
+    POS_eol
+    POS_space
+
+    Animacy_anim
+    Animacy_inam
+    Aspect_freq
+    Aspect_imp
+    Aspect_mod
+    Aspect_none
+    Aspect_perf
+    Case_abe
+    Case_abl
+    Case_abs
+    Case_acc
+    Case_ade
+    Case_all
+    Case_cau
+    Case_com
+    Case_dat
+    Case_del
+    Case_dis
+    Case_ela
+    Case_ess
+    Case_gen
+    Case_ill
+    Case_ine
+    Case_ins
+    Case_loc
+    Case_lat
+    Case_nom
+    Case_par
+    Case_sub
+    Case_sup
+    Case_tem
+    Case_ter
+    Case_tra
+    Case_voc
+    Definite_two
+    Definite_def
+    Definite_red
+    Definite_ind
+    Degree_cmp
+    Degree_comp
+    Degree_none
+    Degree_pos
+    Degree_sup
+    Degree_abs
+    Degree_com
+    Degree_dim # du
+    Gender_com
+    Gender_fem
+    Gender_masc
+    Gender_neut
+    Mood_cnd
+    Mood_imp
+    Mood_ind
+    Mood_n
+    Mood_pot
+    Mood_sub
+    Mood_opt
+    Negative_neg
+    Negative_pos
+    Negative_yes
+    Number_com
+    Number_dual
+    Number_none
+    Number_plur
+    Number_sing
+    Number_ptan # bg
+    Number_count # bg
+    NumType_card
+    NumType_dist
+    NumType_frac
+    NumType_gen
+    NumType_mult
+    NumType_none
+    NumType_ord
+    NumType_sets
+    Person_one
+    Person_two
+    Person_three
+    Person_none
+    Poss_yes
+    PronType_advPart
+    PronType_art
+    PronType_default
+    PronType_dem
+    PronType_ind
+    PronType_int
+    PronType_neg
+    PronType_prs
+    PronType_rcp
+    PronType_rel
+    PronType_tot
+    PronType_clit
+    PronType_exc # es, ca, it, fa
+    Reflex_yes
+    Tense_fut
+    Tense_imp
+    Tense_past
+    Tense_pres
+    VerbForm_fin
+    VerbForm_ger
+    VerbForm_inf
+    VerbForm_none
+    VerbForm_part
+    VerbForm_partFut
+    VerbForm_partPast
+    VerbForm_partPres
+    VerbForm_sup
+    VerbForm_trans
+    VerbForm_gdv # la
+    Voice_act
+    Voice_cau
+    Voice_pass
+    Voice_mid # gkc
+    Voice_int # hb
+    Abbr_yes # cz, fi, sl, U
+    AdpType_prep # cz, U
+    AdpType_post # U
+    AdpType_voc # cz
+    AdpType_comprep # cz
+    AdpType_circ # U
+    AdvType_man
+    AdvType_loc
+    AdvType_tim
+    AdvType_deg
+    AdvType_cau
+    AdvType_mod
+    AdvType_sta
+    AdvType_ex
+    AdvType_adadj
+    ConjType_oper # cz, U
+    ConjType_comp # cz, U
+    Connegative_yes # fi
+    Derivation_minen # fi
+    Derivation_sti # fi
+    Derivation_inen # fi
+    Derivation_lainen # fi
+    Derivation_ja # fi
+    Derivation_ton # fi
+    Derivation_vs # fi
+    Derivation_ttain # fi
+    Derivation_ttaa # fi
+    Echo_rdp # U
+    Echo_ech # U
+    Foreign_foreign # cz, fi, U
+    Foreign_fscript # cz, fi, U
+    Foreign_tscript # cz, U
+    Foreign_yes # sl
+    Gender_dat_masc # bq, U
+    Gender_dat_fem # bq, U
+    Gender_erg_masc # bq
+    Gender_erg_fem # bq
+    Gender_psor_masc # cz, sl, U
+    Gender_psor_fem # cz, sl, U
+    Gender_psor_neut # sl
+    Hyph_yes # cz, U
+    InfForm_one # fi
+    InfForm_two # fi
+    InfForm_three # fi
+    NameType_geo # U, cz
+    NameType_prs # U, cz
+    NameType_giv # U, cz
+    NameType_sur # U, cz
+    NameType_nat # U, cz
+    NameType_com # U, cz
+    NameType_pro # U, cz
+    NameType_oth # U, cz
+    NounType_com # U
+    NounType_prop # U
+    NounType_class # U
+    Number_abs_sing # bq, U
+    Number_abs_plur # bq, U
+    Number_dat_sing # bq, U
+    Number_dat_plur # bq, U
+    Number_erg_sing # bq, U
+    Number_erg_plur # bq, U
+    Number_psee_sing # U
+    Number_psee_plur # U
+    Number_psor_sing # cz, fi, sl, U
+    Number_psor_plur # cz, fi, sl, U
+    NumForm_digit # cz, sl, U
+    NumForm_roman # cz, sl, U
+    NumForm_word # cz, sl, U
+    NumValue_one # cz, U
+    NumValue_two # cz, U
+    NumValue_three # cz, U
+    PartForm_pres # fi
+    PartForm_past # fi
+    PartForm_agt # fi
+    PartForm_neg # fi
+    PartType_mod # U
+    PartType_emp # U
+    PartType_res # U
+    PartType_inf # U
+    PartType_vbp # U
+    Person_abs_one # bq, U
+    Person_abs_two # bq, U
+    Person_abs_three # bq, U
+    Person_dat_one # bq, U
+    Person_dat_two # bq, U
+    Person_dat_three # bq, U
+    Person_erg_one # bq, U
+    Person_erg_two # bq, U
+    Person_erg_three # bq, U
+    Person_psor_one # fi, U
+    Person_psor_two # fi, U
+    Person_psor_three # fi, U
+    Polite_inf # bq, U
+    Polite_pol # bq, U
+    Polite_abs_inf # bq, U
+    Polite_abs_pol # bq, U
+    Polite_erg_inf # bq, U
+    Polite_erg_pol # bq, U
+    Polite_dat_inf # bq, U
+    Polite_dat_pol # bq, U
+    Prefix_yes # U
+    PrepCase_npr # cz
+    PrepCase_pre # U
+    PunctSide_ini # U
+    PunctSide_fin # U
+    PunctType_peri # U
+    PunctType_qest # U
+    PunctType_excl # U
+    PunctType_quot # U
+    PunctType_brck # U
+    PunctType_comm # U
+    PunctType_colo # U
+    PunctType_semi # U
+    PunctType_dash # U
+    Style_arch # cz, fi, U
+    Style_rare # cz, fi, U
+    Style_poet # cz, U
+    Style_norm # cz, U
+    Style_coll # cz, U
+    Style_vrnc # cz, U
+    Style_sing # cz, U
+    Style_expr # cz, U
+    Style_derg # cz, U
+    Style_vulg # cz, U
+    Style_yes # fi, U
+    StyleVariant_styleShort # cz
+    StyleVariant_styleBound # cz, sl
+    VerbType_aux # U
+    VerbType_cop # U
+    VerbType_mod # U
+    VerbType_light # U
+
+    Name_person
+    Name_norp
+    Name_facility
+    Name_org
+    Name_gpe
+    Name_loc
+    Name_product
+    Name_event
+    Name_work_of_art
+    Name_language
+
+    Unit_date
+    Unit_time
+    Unit_percent
+    Unit_money
+    Unit_quantity
+    Unit_ordinal
+    Unit_cardinal
+
+    Dep_acomp
+    Dep_advcl
+    Dep_advmod
+    Dep_agent
+    Dep_amod
+    Dep_appos
+    Dep_attr
+    Dep_aux
+    Dep_auxpass
+    Dep_cc
+    Dep_ccomp
+    Dep_complm
+    Dep_conj
+    Dep_csubj
+    Dep_csubjpass
+    Dep_dep
+    Dep_det
+    Dep_dobj
+    Dep_expl
+    Dep_hmod
+    Dep_hyph
+    Dep_infmod
+    Dep_intj
+    Dep_iobj
+    Dep_mark
+    Dep_meta
+    Dep_neg
+    Dep_nmod
+    Dep_nn
+    Dep_npadvmod
+    Dep_nsubj
+    Dep_nsubjpass
+    Dep_num
+    Dep_number
+    Dep_oprd
+    Dep_parataxis
+    Dep_partmod
+    Dep_pcomp
+    Dep_pobj
+    Dep_poss
+    Dep_possessive
+    Dep_preconj
+    Dep_prep
+    Dep_prt
+    Dep_punct
+    Dep_quantmod
+    Dep_rcmod
+    Dep_root
+    Dep_xcomp
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
new file mode 100644
index 000000000..4251fb4ec
--- /dev/null
+++ b/spacy/symbols.pyx
@@ -0,0 +1,424 @@
+SYMBOL_IDS = {
+    "EMPTY_VALUE": EMPTY_VALUE,
+    "Attr_is_alpha": Attr_is_alpha,
+    "Attr_is_ascii": Attr_is_ascii,
+    "Attr_is_digit": Attr_is_digit,
+    "Attr_is_lower": Attr_is_lower,
+    "Attr_is_punct": Attr_is_punct,
+    "Attr_is_space": Attr_is_space,
+    "Attr_is_title": Attr_is_title,
+    "Attr_is_upper": Attr_is_upper,
+    "Attr_like_url": Attr_like_url,
+    "Attr_like_num": Attr_like_num,
+    "Attr_like_email": Attr_like_email,
+    "Attr_is_stop": Attr_is_stop,
+    "Attr_is_oov": Attr_is_oov,
+    
+    "Attr_flag14": Attr_flag14,
+    "Attr_flag15": Attr_flag15,
+    "Attr_flag16": Attr_flag16,
+    "Attr_flag17": Attr_flag17,
+    "Attr_flag18": Attr_flag18,
+    "Attr_flag19": Attr_flag19,
+    "Attr_flag20": Attr_flag20,
+    "Attr_flag21": Attr_flag21,
+    "Attr_flag22": Attr_flag22,
+    "Attr_flag23": Attr_flag23,
+    "Attr_flag24": Attr_flag24,
+    "Attr_flag25": Attr_flag25,
+    "Attr_flag26": Attr_flag26,
+    "Attr_flag27": Attr_flag27,
+    "Attr_flag28": Attr_flag28,
+    "Attr_flag29": Attr_flag29,
+    "Attr_flag30": Attr_flag30,
+    "Attr_flag31": Attr_flag31,
+    "Attr_flag32": Attr_flag32,
+    "Attr_flag33": Attr_flag33,
+    "Attr_flag34": Attr_flag34,
+    "Attr_flag35": Attr_flag35,
+    "Attr_flag36": Attr_flag36,
+    "Attr_flag37": Attr_flag37,
+    "Attr_flag38": Attr_flag38,
+    "Attr_flag39": Attr_flag39,
+    "Attr_flag40": Attr_flag40,
+    "Attr_flag41": Attr_flag41,
+    "Attr_flag42": Attr_flag42,
+    "Attr_flag43": Attr_flag43,
+    "Attr_flag44": Attr_flag44,
+    "Attr_flag45": Attr_flag45,
+    "Attr_flag46": Attr_flag46,
+    "Attr_flag47": Attr_flag47,
+    "Attr_flag48": Attr_flag48,
+    "Attr_flag49": Attr_flag49,
+    "Attr_flag50": Attr_flag50,
+    "Attr_flag51": Attr_flag51,
+    "Attr_flag52": Attr_flag52,
+    "Attr_flag53": Attr_flag53,
+    "Attr_flag54": Attr_flag54,
+    "Attr_flag55": Attr_flag55,
+    "Attr_flag56": Attr_flag56,
+    "Attr_flag57": Attr_flag57,
+    "Attr_flag58": Attr_flag58,
+    "Attr_flag59": Attr_flag59,
+    "Attr_flag60": Attr_flag60,
+    "Attr_flag61": Attr_flag61,
+    "Attr_flag62": Attr_flag62,
+    "Attr_flag63": Attr_flag63,
+
+    "Attr_id": Attr_id,
+    "Attr_orth": Attr_orth,
+    "Attr_lower": Attr_lower,
+    "Attr_norm": Attr_norm,
+    "Attr_shape": Attr_shape,
+    "Attr_prefix": Attr_prefix,
+    "Attr_suffix": Attr_suffix,
+
+    "Attr_length": Attr_length,
+    "Attr_cluster": Attr_cluster,
+    "Attr_lemma": Attr_lemma,
+    "Attr_pos": Attr_pos,
+    "Attr_tag": Attr_tag,
+    "Attr_dep": Attr_dep,
+    "Attr_ent_iob": Attr_ent_iob,
+    "Attr_ent_type": Attr_ent_type,
+    "Attr_head": Attr_head,
+    "Attr_spacy": Attr_spacy,
+    "Attr_prob": Attr_prob,
+
+    "POS_adj": POS_adj,
+    "POS_adp": POS_adp,
+    "POS_adv": POS_adv,
+    "POS_aux": POS_aux,
+    "POS_conj": POS_conj,
+    "POS_det": POS_det,
+    "POS_intj": POS_intj,
+    "POS_noun": POS_noun,
+    "POS_num": POS_num,
+    "POS_part": POS_part,
+    "POS_pron": POS_pron,
+    "POS_propn": POS_propn,
+    "POS_punct": POS_punct,
+    "POS_sconj": POS_sconj,
+    "POS_sym": POS_sym,
+    "POS_verb": POS_verb,
+    "POS_x": POS_x,
+    "POS_eol": POS_eol,
+    "POS_space": POS_space,
+
+    "Animacy_anim": Animacy_anim,
+    "Animacy_inam": Animacy_inam,
+    "Aspect_freq": Aspect_freq,
+    "Aspect_imp": Aspect_imp,
+    "Aspect_mod": Aspect_mod,
+    "Aspect_none": Aspect_none,
+    "Aspect_perf": Aspect_perf,
+    "Case_abe": Case_abe,
+    "Case_abl": Case_abl,
+    "Case_abs": Case_abs,
+    "Case_acc": Case_acc,
+    "Case_ade": Case_ade,
+    "Case_all": Case_all,
+    "Case_cau": Case_cau,
+    "Case_com": Case_com,
+    "Case_dat": Case_dat,
+    "Case_del": Case_del,
+    "Case_dis": Case_dis,
+    "Case_ela": Case_ela,
+    "Case_ess": Case_ess,
+    "Case_gen": Case_gen,
+    "Case_ill": Case_ill,
+    "Case_ine": Case_ine,
+    "Case_ins": Case_ins,
+    "Case_loc": Case_loc,
+    "Case_lat": Case_lat,
+    "Case_nom": Case_nom,
+    "Case_par": Case_par,
+    "Case_sub": Case_sub,
+    "Case_sup": Case_sup,
+    "Case_tem": Case_tem,
+    "Case_ter": Case_ter,
+    "Case_tra": Case_tra,
+    "Case_voc": Case_voc,
+    "Definite_two": Definite_two,
+    "Definite_def": Definite_def,
+    "Definite_red": Definite_red,
+    "Definite_ind": Definite_ind,
+    "Degree_cmp": Degree_cmp,
+    "Degree_comp": Degree_comp,
+    "Degree_none": Degree_none,
+    "Degree_pos": Degree_pos,
+    "Degree_sup": Degree_sup,
+    "Degree_abs": Degree_abs,
+    "Degree_com": Degree_com,
+    "Degree_dim": Degree_dim, # du
+    "Gender_com": Gender_com,
+    "Gender_fem": Gender_fem,
+    "Gender_masc": Gender_masc,
+    "Gender_neut": Gender_neut,
+    "Mood_cnd": Mood_cnd,
+    "Mood_imp": Mood_imp,
+    "Mood_ind": Mood_ind,
+    "Mood_n": Mood_n,
+    "Mood_pot": Mood_pot,
+    "Mood_sub": Mood_sub,
+    "Mood_opt": Mood_opt,
+    "Negative_neg": Negative_neg,
+    "Negative_pos": Negative_pos,
+    "Negative_yes": Negative_yes,
+    "Number_com": Number_com,
+    "Number_dual": Number_dual,
+    "Number_none": Number_none,
+    "Number_plur": Number_plur,
+    "Number_sing": Number_sing,
+    "Number_ptan": Number_ptan, # bg
+    "Number_count": Number_count, # bg
+    "NumType_card": NumType_card,
+    "NumType_dist": NumType_dist,
+    "NumType_frac": NumType_frac,
+    "NumType_gen": NumType_gen,
+    "NumType_mult": NumType_mult,
+    "NumType_none": NumType_none,
+    "NumType_ord": NumType_ord,
+    "NumType_sets": NumType_sets,
+    "Person_one": Person_one,
+    "Person_two": Person_two,
+    "Person_three": Person_three,
+    "Person_none": Person_none,
+    "Poss_yes": Poss_yes,
+    "PronType_advPart": PronType_advPart,
+    "PronType_art": PronType_art,
+    "PronType_default": PronType_default,
+    "PronType_dem": PronType_dem,
+    "PronType_ind": PronType_ind,
+    "PronType_int": PronType_int,
+    "PronType_neg": PronType_neg,
+    "PronType_prs": PronType_prs,
+    "PronType_rcp": PronType_rcp,
+    "PronType_rel": PronType_rel,
+    "PronType_tot": PronType_tot,
+    "PronType_clit": PronType_clit,
+    "PronType_exc": PronType_exc, # es, ca, it, fa
+    "Reflex_yes": Reflex_yes,
+    "Tense_fut": Tense_fut,
+    "Tense_imp": Tense_imp,
+    "Tense_past": Tense_past,
+    "Tense_pres": Tense_pres,
+    "VerbForm_fin": VerbForm_fin,
+    "VerbForm_ger": VerbForm_ger,
+    "VerbForm_inf": VerbForm_inf,
+    "VerbForm_none": VerbForm_none,
+    "VerbForm_part": VerbForm_part,
+    "VerbForm_partFut": VerbForm_partFut,
+    "VerbForm_partPast": VerbForm_partPast,
+    "VerbForm_partPres": VerbForm_partPres,
+    "VerbForm_sup": VerbForm_sup,
+    "VerbForm_trans": VerbForm_trans,
+    "VerbForm_gdv": VerbForm_gdv, # la
+    "Voice_act": Voice_act,
+    "Voice_cau": Voice_cau,
+    "Voice_pass": Voice_pass,
+    "Voice_mid": Voice_mid, # gkc
+    "Voice_int": Voice_int, # hb
+    "Abbr_yes": Abbr_yes, # cz, fi, sl, U
+    "AdpType_prep": AdpType_prep, # cz, U
+    "AdpType_post": AdpType_post, # U
+    "AdpType_voc": AdpType_voc, # cz
+    "AdpType_comprep": AdpType_comprep, # cz
+    "AdpType_circ": AdpType_circ, # U
+    "AdvType_man": AdvType_man,
+    "AdvType_loc": AdvType_loc,
+    "AdvType_tim": AdvType_tim,
+    "AdvType_deg": AdvType_deg,
+    "AdvType_cau": AdvType_cau,
+    "AdvType_mod": AdvType_mod,
+    "AdvType_sta": AdvType_sta,
+    "AdvType_ex": AdvType_ex,
+    "AdvType_adadj": AdvType_adadj,
+    "ConjType_oper": ConjType_oper, # cz, U
+    "ConjType_comp": ConjType_comp, # cz, U
+    "Connegative_yes": Connegative_yes, # fi
+    "Derivation_minen": Derivation_minen, # fi
+    "Derivation_sti": Derivation_sti, # fi
+    "Derivation_inen": Derivation_inen, # fi
+    "Derivation_lainen": Derivation_lainen, # fi
+    "Derivation_ja": Derivation_ja, # fi
+    "Derivation_ton": Derivation_ton, # fi
+    "Derivation_vs": Derivation_vs, # fi
+    "Derivation_ttain": Derivation_ttain, # fi
+    "Derivation_ttaa": Derivation_ttaa, # fi
+    "Echo_rdp": Echo_rdp, # U
+    "Echo_ech": Echo_ech, # U
+    "Foreign_foreign": Foreign_foreign, # cz, fi, U
+    "Foreign_fscript": Foreign_fscript, # cz, fi, U
+    "Foreign_tscript": Foreign_tscript, # cz, U
+    "Foreign_yes": Foreign_yes, # sl
+    "Gender_dat_masc": Gender_dat_masc, # bq, U
+    "Gender_dat_fem": Gender_dat_fem, # bq, U
+    "Gender_erg_masc": Gender_erg_masc, # bq
+    "Gender_erg_fem": Gender_erg_fem, # bq
+    "Gender_psor_masc": Gender_psor_masc, # cz, sl, U
+    "Gender_psor_fem": Gender_psor_fem, # cz, sl, U
+    "Gender_psor_neut": Gender_psor_neut, # sl
+    "Hyph_yes": Hyph_yes, # cz, U
+    "InfForm_one": InfForm_one, # fi
+    "InfForm_two": InfForm_two, # fi
+    "InfForm_three": InfForm_three, # fi
+    "NameType_geo": NameType_geo, # U, cz
+    "NameType_prs": NameType_prs, # U, cz
+    "NameType_giv": NameType_giv, # U, cz
+    "NameType_sur": NameType_sur, # U, cz
+    "NameType_nat": NameType_nat, # U, cz
+    "NameType_com": NameType_com, # U, cz
+    "NameType_pro": NameType_pro, # U, cz
+    "NameType_oth": NameType_oth, # U, cz
+    "NounType_com": NounType_com, # U
+    "NounType_prop": NounType_prop, # U
+    "NounType_class": NounType_class, # U
+    "Number_abs_sing": Number_abs_sing, # bq, U
+    "Number_abs_plur": Number_abs_plur, # bq, U
+    "Number_dat_sing": Number_dat_sing, # bq, U
+    "Number_dat_plur": Number_dat_plur, # bq, U
+    "Number_erg_sing": Number_erg_sing, # bq, U
+    "Number_erg_plur": Number_erg_plur, # bq, U
+    "Number_psee_sing": Number_psee_sing, # U
+    "Number_psee_plur": Number_psee_plur, # U
+    "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U
+    "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U
+    "NumForm_digit": NumForm_digit, # cz, sl, U
+    "NumForm_roman": NumForm_roman, # cz, sl, U
+    "NumForm_word": NumForm_word, # cz, sl, U
+    "NumValue_one": NumValue_one, # cz, U
+    "NumValue_two": NumValue_two, # cz, U
+    "NumValue_three": NumValue_three, # cz, U
+    "PartForm_pres": PartForm_pres, # fi
+    "PartForm_past": PartForm_past, # fi
+    "PartForm_agt": PartForm_agt, # fi
+    "PartForm_neg": PartForm_neg, # fi
+    "PartType_mod": PartType_mod, # U
+    "PartType_emp": PartType_emp, # U
+    "PartType_res": PartType_res, # U
+    "PartType_inf": PartType_inf, # U
+    "PartType_vbp": PartType_vbp, # U
+    "Person_abs_one": Person_abs_one, # bq, U
+    "Person_abs_two": Person_abs_two, # bq, U
+    "Person_abs_three": Person_abs_three, # bq, U
+    "Person_dat_one": Person_dat_one, # bq, U
+    "Person_dat_two": Person_dat_two, # bq, U
+    "Person_dat_three": Person_dat_three, # bq, U
+    "Person_erg_one": Person_erg_one, # bq, U
+    "Person_erg_two": Person_erg_two, # bq, U
+    "Person_erg_three": Person_erg_three, # bq, U
+    "Person_psor_one": Person_psor_one, # fi, U
+    "Person_psor_two": Person_psor_two, # fi, U
+    "Person_psor_three": Person_psor_three, # fi, U
+    "Polite_inf": Polite_inf, # bq, U
+    "Polite_pol": Polite_pol, # bq, U
+    "Polite_abs_inf": Polite_abs_inf, # bq, U
+    "Polite_abs_pol": Polite_abs_pol, # bq, U
+    "Polite_erg_inf": Polite_erg_inf, # bq, U
+    "Polite_erg_pol": Polite_erg_pol, # bq, U
+    "Polite_dat_inf": Polite_dat_inf, # bq, U
+    "Polite_dat_pol": Polite_dat_pol, # bq, U
+    "Prefix_yes": Prefix_yes, # U
+    "PrepCase_npr": PrepCase_npr, # cz
+    "PrepCase_pre": PrepCase_pre, # U
+    "PunctSide_ini": PunctSide_ini, # U
+    "PunctSide_fin": PunctSide_fin, # U
+    "PunctType_peri": PunctType_peri, # U
+    "PunctType_qest": PunctType_qest, # U
+    "PunctType_excl": PunctType_excl, # U
+    "PunctType_quot": PunctType_quot, # U
+    "PunctType_brck": PunctType_brck, # U
+    "PunctType_comm": PunctType_comm, # U
+    "PunctType_colo": PunctType_colo, # U
+    "PunctType_semi": PunctType_semi, # U
+    "PunctType_dash": PunctType_dash, # U
+    "Style_arch": Style_arch, # cz, fi, U
+    "Style_rare": Style_rare, # cz, fi, U
+    "Style_poet": Style_poet, # cz, U
+    "Style_norm": Style_norm, # cz, U
+    "Style_coll": Style_coll, # cz, U
+    "Style_vrnc": Style_vrnc, # cz, U
+    "Style_sing": Style_sing, # cz, U
+    "Style_expr": Style_expr, # cz, U
+    "Style_derg": Style_derg, # cz, U
+    "Style_vulg": Style_vulg, # cz, U
+    "Style_yes": Style_yes, # fi, U
+    "StyleVariant_styleShort": StyleVariant_styleShort, # cz
+    "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl
+    "VerbType_aux": VerbType_aux, # U
+    "VerbType_cop": VerbType_cop, # U
+    "VerbType_mod": VerbType_mod, # U
+    "VerbType_light": VerbType_light, # U
+
+    "Name_person": Name_person,
+    "Name_norp": Name_norp,
+    "Name_facility": Name_facility,
+    "Name_org": Name_org,
+    "Name_gpe": Name_gpe,
+    "Name_loc": Name_loc,
+    "Name_product": Name_product,
+    "Name_event": Name_event,
+    "Name_work_of_art": Name_work_of_art,
+    "Name_language": Name_language,
+
+    "Unit_date": Unit_date,
+    "Unit_time": Unit_time,
+    "Unit_percent": Unit_percent,
+    "Unit_money": Unit_money,
+    "Unit_quantity": Unit_quantity,
+    "Unit_ordinal": Unit_ordinal,
+    "Unit_cardinal": Unit_cardinal,
+
+    "Dep_acomp": Dep_acomp,
+    "Dep_advcl": Dep_advcl,
+    "Dep_advmod": Dep_advmod,
+    "Dep_agent": Dep_agent,
+    "Dep_amod": Dep_amod,
+    "Dep_appos": Dep_appos,
+    "Dep_attr": Dep_attr,
+    "Dep_aux": Dep_aux,
+    "Dep_auxpass": Dep_auxpass,
+    "Dep_cc": Dep_cc,
+    "Dep_ccomp": Dep_ccomp,
+    "Dep_complm": Dep_complm,
+    "Dep_conj": Dep_conj,
+    "Dep_csubj": Dep_csubj,
+    "Dep_csubjpass": Dep_csubjpass,
+    "Dep_dep": Dep_dep,
+    "Dep_det": Dep_det,
+    "Dep_dobj": Dep_dobj,
+    "Dep_expl": Dep_expl,
+    "Dep_hmod": Dep_hmod,
+    "Dep_hyph": Dep_hyph,
+    "Dep_infmod": Dep_infmod,
+    "Dep_intj": Dep_intj,
+    "Dep_iobj": Dep_iobj,
+    "Dep_mark": Dep_mark,
+    "Dep_meta": Dep_meta,
+    "Dep_neg": Dep_neg,
+    "Dep_nmod": Dep_nmod,
+    "Dep_nn": Dep_nn,
+    "Dep_npadvmod": Dep_npadvmod,
+    "Dep_nsubj": Dep_nsubj,
+    "Dep_nsubjpass": Dep_nsubjpass,
+    "Dep_num": Dep_num,
+    "Dep_number": Dep_number,
+    "Dep_oprd": Dep_oprd,
+    "Dep_parataxis": Dep_parataxis,
+    "Dep_partmod": Dep_partmod,
+    "Dep_pcomp": Dep_pcomp,
+    "Dep_pobj": Dep_pobj,
+    "Dep_poss": Dep_poss,
+    "Dep_possessive": Dep_possessive,
+    "Dep_preconj": Dep_preconj,
+    "Dep_prep": Dep_prep,
+    "Dep_prt": Dep_prt,
+    "Dep_punct": Dep_punct,
+    "Dep_quantmod": Dep_quantmod,
+    "Dep_rcmod": Dep_rcmod,
+    "Dep_root": Dep_root,
+    "Dep_xcomp": Dep_xcomp
+}
+
+SYMBOL_NAMES = [it[0] for it in sorted(SYMBOL_IDS.items(), key=lambda it: it[1])]

From 74c0853471ed4115473142b542b0c9c917475a13 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 17:55:55 +1100
Subject: [PATCH 04/22] * Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to
 attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS

---
 spacy/attrs.pyx           | 4 ++--
 spacy/morphology.pyx      | 4 ++--
 spacy/parts_of_speech.pyx | 5 ++++-
 spacy/tokens/doc.pyx      | 1 -
 spacy/tokens/token.pyx    | 7 ++-----
 5 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 8ce0f7a17..8d76160f4 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -1,4 +1,4 @@
-ATTR_IDS = {
+IDS = {
     "NULL_ATTR": NULL_ATTR,
     "IS_ALPHA": IS_ALPHA,
     "IS_ASCII": IS_ASCII,
@@ -87,4 +87,4 @@ ATTR_IDS = {
 }
 
 # ATTR IDs, in order of the symbol
-ATTR_NAMES = [key for key, value in sorted(ATTR_IDS.items(), key=lambda item: item[1])]
+NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 534f64a59..8d2a73608 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -6,7 +6,7 @@ try:
 except ImportError:
     import json
 
-from .parts_of_speech import UNIV_POS_NAMES
+from .parts_of_speech import IDS as POS_IDS
 from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
 
 
@@ -24,7 +24,7 @@ cdef class Morphology:
             self.rich_tags[i].id = i
             self.rich_tags[i].name = self.strings[tag_str]
             self.rich_tags[i].morph = 0
-            self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()]
+            self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
             self.reverse_index[self.rich_tags[i].name] = i
         self._cache = PreshMapArray(self.n_tags)
 
diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx
index 8c2348a47..57d9c801b 100644
--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals
 
 
-UNIV_POS_NAMES = {
+IDS = {
     "NO_TAG": NO_TAG,
     "ADJ": ADJ,
     "ADP": ADP,
@@ -23,3 +23,6 @@ UNIV_POS_NAMES = {
     "EOL": EOL,
     "SPACE": SPACE
 }
+
+
+NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index eab6c044e..50b19d4c1 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -14,7 +14,6 @@ from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..parts_of_speech import UNIV_POS_NAMES
 from ..parts_of_speech cimport CONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 25db3f47e..af80b5359 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -9,7 +9,7 @@ import numpy
 
 
 from ..lexeme cimport Lexeme
-from ..parts_of_speech import UNIV_POS_NAMES
+from .. import parts_of_speech
 
 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
@@ -318,7 +318,7 @@ cdef class Token:
 
     property pos_:
         def __get__(self):
-            return _pos_id_to_string[self.c.pos]
+            return parts_of_speech.NAMES[self.c.pos]
 
     property tag_:
         def __get__(self):
@@ -363,6 +363,3 @@ cdef class Token:
 
     property like_email:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
-
-
-_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}

From a29c8ee23d5d1b373327013150531b835b27088a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 17:58:29 +1100
Subject: [PATCH 05/22] * Add symbols to the vocab before reading the strings,
 so that they line up correctly

---
 spacy/vocab.pyx | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index caf3045f5..1a787e7ac 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -19,6 +19,9 @@ from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
 
+from . import attrs
+from . import parts_of_speech
+
 from cymem.cymem cimport Address
 from . import util
 from .serialize.packer cimport Packer
@@ -72,15 +75,15 @@ cdef class Vocab:
         # is the frequency rank of the word, plus a certain offset. The structural
         # strings are loaded first, because the vocab is open-class, and these
         # symbols are closed class.
-        #for attr_name in sorted(ATTR_NAMES.keys()):
-        #    _ = self.strings[attr_name]
-        #for univ_pos_name in sorted(UNIV_POS_NAMES.keys()):
-        #    _ = self.strings[pos_name]
-        #for morph_name in sorted(UNIV_MORPH_NAMES.keys()):
+        for name in attrs.NAMES:
+            _ = self.strings[name]
+        for name in parts_of_speech.NAMES:
+            _ = self.strings[name]
+        #for morph_name in UNIV_MORPH_NAMES:
         #    _ = self.strings[morph_name]
-        #for entity_type_name in sorted(ENTITY_TYPES.keys()):
+        #for entity_type_name in entity_types.NAMES:
         #    _ = self.strings[entity_type_name]
-        #for tag_name in sorted(TAG_MAP.keys()):
+        #for tag_name in sorted(tag_map.keys()):
         #    _ = self.strings[tag_name]
         self.get_lex_attr = get_lex_attr
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)

From ce3e3063764a1b9dc002765450659b8adba1b1d7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 17:58:57 +1100
Subject: [PATCH 06/22] * Allow SPACY_DATA environment variable in website
 tests

---
 tests/website/conftest.py  | 6 ++++--
 tests/website/test_home.py | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/website/conftest.py b/tests/website/conftest.py
index ade1bae2a..35c38d845 100644
--- a/tests/website/conftest.py
+++ b/tests/website/conftest.py
@@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest
+import os
 
 
 @pytest.fixture(scope='session')
 def nlp():
-    from spacy.en import English
-    return English()
+    from spacy.en import English, LOCAL_DATA_DIR
+    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
+    return English(data_dir=data_dir)
 
 
 @pytest.fixture()
diff --git a/tests/website/test_home.py b/tests/website/test_home.py
index 4da61becf..3f7f7ea4c 100644
--- a/tests/website/test_home.py
+++ b/tests/website/test_home.py
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 import pytest
 import spacy
+import os
 
 
 @pytest.fixture()
@@ -9,8 +10,9 @@ def token(doc):
 
 
 def test_load_resources_and_process_text():
-    from spacy.en import English
-    nlp = English()
+    from spacy.en import English, LOCAL_DATA_DIR
+    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
+    nlp = English(data_dir=data_dir)
     doc = nlp('Hello, world. Here are two sentences.')
 
 

From d70e8cac2c4302249720cfded3de836302bddb1c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 18:27:03 +1100
Subject: [PATCH 07/22] * Fix empty values in attributes and parts of speech,
 so symbols align correctly with the StringStore

---
 spacy/parts_of_speech.pyx |  2 +-
 spacy/symbols.pyx         |  1 -
 spacy/vocab.pyx           |  6 ++++--
 tests/vocab/test_vocab.py | 13 +++++++++++++
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx
index 57d9c801b..14933480c 100644
--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@@ -2,7 +2,7 @@ from __future__ import unicode_literals
 
 
 IDS = {
-    "NO_TAG": NO_TAG,
+    "": NO_TAG,
     "ADJ": ADJ,
     "ADP": ADP,
     "ADV": ADV,
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index 4251fb4ec..a0a39f2ff 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -1,5 +1,4 @@
 SYMBOL_IDS = {
-    "EMPTY_VALUE": EMPTY_VALUE,
     "Attr_is_alpha": Attr_is_alpha,
     "Attr_is_ascii": Attr_is_ascii,
     "Attr_is_digit": Attr_is_digit,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 1a787e7ac..6cf829344 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -76,9 +76,11 @@ cdef class Vocab:
         # strings are loaded first, because the vocab is open-class, and these
         # symbols are closed class.
         for name in attrs.NAMES:
-            _ = self.strings[name]
+            if name:
+                _ = self.strings[name]
         for name in parts_of_speech.NAMES:
-            _ = self.strings[name]
+            if name:
+                _ = self.strings[name]
         #for morph_name in UNIV_MORPH_NAMES:
         #    _ = self.strings[morph_name]
         #for entity_type_name in entity_types.NAMES:
diff --git a/tests/vocab/test_vocab.py b/tests/vocab/test_vocab.py
index 7ad911626..153e0d546 100644
--- a/tests/vocab/test_vocab.py
+++ b/tests/vocab/test_vocab.py
@@ -1,6 +1,9 @@
 from __future__ import unicode_literals
 import pytest
 
+from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
+from spacy.parts_of_speech import NOUN, VERB
+
 
 def test_neq(en_vocab):
     addr = en_vocab['Hello']
@@ -25,3 +28,13 @@ def test_punct_neq(en_vocab):
 def test_shape_attr(en_vocab):
     example = en_vocab['example']
     assert example.orth != example.shape
+
+
+def test_symbols(en_vocab):
+    assert en_vocab.strings['IS_ALPHA'] == IS_ALPHA
+    assert en_vocab.strings['NOUN'] == NOUN
+    assert en_vocab.strings['VERB'] == VERB
+    assert en_vocab.strings['LEMMA'] == LEMMA
+    assert en_vocab.strings['ORTH'] == ORTH
+    assert en_vocab.strings['PROB'] == PROB
+    

From fd204d3cd5e9aa459b5df03e38545ff3e0c444d0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 22:09:50 +1100
Subject: [PATCH 08/22] * Map NIL to empty string in tag map

---
 lang_data/en/tag_map.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json
index de3e2eb58..a38411bcf 100644
--- a/lang_data/en/tag_map.json
+++ b/lang_data/en/tag_map.json
@@ -22,7 +22,7 @@
 "JJS": {"pos": "adj", "degree": "sup"},
 "LS": {"pos": "punct", "numtype": "ord"},
 "MD": {"pos": "verb", "verbtype": "mod"},
-"NIL": {"pos": "no_tag"},
+"NIL": {"pos": ""},
 "NN": {"pos": "noun", "number": "sing"},
 "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
 "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},

From d80067eda1001753645494df14eebe03ac206b3c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 22:10:19 +1100
Subject: [PATCH 09/22] * Map empty string to NULL_ATTR in attrs

---
 spacy/attrs.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 8d76160f4..3595fbf22 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -1,5 +1,5 @@
 IDS = {
-    "NULL_ATTR": NULL_ATTR,
+    "": NULL_ATTR,
     "IS_ALPHA": IS_ALPHA,
     "IS_ASCII": IS_ASCII,
     "IS_DIGIT": IS_DIGIT,

From 278e12f7e848bae4b54ba8c4b1ffddfe39591cb6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 22:10:58 +1100
Subject: [PATCH 10/22] * Add morphology symbols to morphology. May need to
 remove these as an enum.

---
 spacy/morphology.pxd | 963 +++++++++++--------------------------------
 spacy/morphology.pyx | 251 +++++++++++
 2 files changed, 499 insertions(+), 715 deletions(-)

diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 2229da0ad..62d3fccc1 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -7,6 +7,7 @@ from .strings cimport StringStore
 from .typedefs cimport attr_t
 from .parts_of_speech cimport univ_pos_t
 
+from . cimport symbols
 
 cdef struct RichTagC:
     uint64_t morph
@@ -36,720 +37,252 @@ cdef class Morphology:
     cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
 
 
+cpdef enum univ_morph_t:
+    NIL = 0
+    Animacy_anim = symbols.Animacy_anim
+    Animacy_inam
+    Aspect_freq
+    Aspect_imp
+    Aspect_mod
+    Aspect_none
+    Aspect_perf
+    Case_abe
+    Case_abl
+    Case_abs
+    Case_acc
+    Case_ade
+    Case_all
+    Case_cau
+    Case_com
+    Case_dat
+    Case_del
+    Case_dis
+    Case_ela
+    Case_ess
+    Case_gen
+    Case_ill
+    Case_ine
+    Case_ins
+    Case_loc
+    Case_lat
+    Case_nom
+    Case_par
+    Case_sub
+    Case_sup
+    Case_tem
+    Case_ter
+    Case_tra
+    Case_voc
+    Definite_two
+    Definite_def
+    Definite_red
+    Definite_ind
+    Degree_cmp
+    Degree_comp
+    Degree_none
+    Degree_pos
+    Degree_sup
+    Degree_abs
+    Degree_com
+    Degree_dim # du
+    Gender_com
+    Gender_fem
+    Gender_masc
+    Gender_neut
+    Mood_cnd
+    Mood_imp
+    Mood_ind
+    Mood_n
+    Mood_pot
+    Mood_sub
+    Mood_opt
+    Negative_neg
+    Negative_pos
+    Negative_yes
+    Number_com
+    Number_dual
+    Number_none
+    Number_plur
+    Number_sing
+    Number_ptan # bg
+    Number_count # bg
+    NumType_card
+    NumType_dist
+    NumType_frac
+    NumType_gen
+    NumType_mult
+    NumType_none
+    NumType_ord
+    NumType_sets
+    Person_one
+    Person_two
+    Person_three
+    Person_none
+    Poss_yes
+    PronType_advPart
+    PronType_art
+    PronType_default
+    PronType_dem
+    PronType_ind
+    PronType_int
+    PronType_neg
+    PronType_prs
+    PronType_rcp
+    PronType_rel
+    PronType_tot
+    PronType_clit
+    PronType_exc # es, ca, it, fa
+    Reflex_yes
+    Tense_fut
+    Tense_imp
+    Tense_past
+    Tense_pres
+    VerbForm_fin
+    VerbForm_ger
+    VerbForm_inf
+    VerbForm_none
+    VerbForm_part
+    VerbForm_partFut
+    VerbForm_partPast
+    VerbForm_partPres
+    VerbForm_sup
+    VerbForm_trans
+    VerbForm_gdv # la
+    Voice_act
+    Voice_cau
+    Voice_pass
+    Voice_mid # gkc
+    Voice_int # hb
+    Abbr_yes # cz, fi, sl, U
+    AdpType_prep # cz, U
+    AdpType_post # U
+    AdpType_voc # cz
+    AdpType_comprep # cz
+    AdpType_circ # U
+    AdvType_man
+    AdvType_loc
+    AdvType_tim
+    AdvType_deg
+    AdvType_cau
+    AdvType_mod
+    AdvType_sta
+    AdvType_ex
+    AdvType_adadj
+    ConjType_oper # cz, U
+    ConjType_comp # cz, U
+    Connegative_yes # fi
+    Derivation_minen # fi
+    Derivation_sti # fi
+    Derivation_inen # fi
+    Derivation_lainen # fi
+    Derivation_ja # fi
+    Derivation_ton # fi
+    Derivation_vs # fi
+    Derivation_ttain # fi
+    Derivation_ttaa # fi
+    Echo_rdp # U
+    Echo_ech # U
+    Foreign_foreign # cz, fi, U
+    Foreign_fscript # cz, fi, U
+    Foreign_tscript # cz, U
+    Foreign_yes # sl
+    Gender_dat_masc # bq, U
+    Gender_dat_fem # bq, U
+    Gender_erg_masc # bq
+    Gender_erg_fem # bq
+    Gender_psor_masc # cz, sl, U
+    Gender_psor_fem # cz, sl, U
+    Gender_psor_neut # sl
+    Hyph_yes # cz, U
+    InfForm_one # fi
+    InfForm_two # fi
+    InfForm_three # fi
+    NameType_geo # U, cz
+    NameType_prs # U, cz
+    NameType_giv # U, cz
+    NameType_sur # U, cz
+    NameType_nat # U, cz
+    NameType_com # U, cz
+    NameType_pro # U, cz
+    NameType_oth # U, cz
+    NounType_com # U
+    NounType_prop # U
+    NounType_class # U
+    Number_abs_sing # bq, U
+    Number_abs_plur # bq, U
+    Number_dat_sing # bq, U
+    Number_dat_plur # bq, U
+    Number_erg_sing # bq, U
+    Number_erg_plur # bq, U
+    Number_psee_sing # U
+    Number_psee_plur # U
+    Number_psor_sing # cz, fi, sl, U
+    Number_psor_plur # cz, fi, sl, U
+    NumForm_digit # cz, sl, U
+    NumForm_roman # cz, sl, U
+    NumForm_word # cz, sl, U
+    NumValue_one # cz, U
+    NumValue_two # cz, U
+    NumValue_three # cz, U
+    PartForm_pres # fi
+    PartForm_past # fi
+    PartForm_agt # fi
+    PartForm_neg # fi
+    PartType_mod # U
+    PartType_emp # U
+    PartType_res # U
+    PartType_inf # U
+    PartType_vbp # U
+    Person_abs_one # bq, U
+    Person_abs_two # bq, U
+    Person_abs_three # bq, U
+    Person_dat_one # bq, U
+    Person_dat_two # bq, U
+    Person_dat_three # bq, U
+    Person_erg_one # bq, U
+    Person_erg_two # bq, U
+    Person_erg_three # bq, U
+    Person_psor_one # fi, U
+    Person_psor_two # fi, U
+    Person_psor_three # fi, U
+    Polite_inf # bq, U
+    Polite_pol # bq, U
+    Polite_abs_inf # bq, U
+    Polite_abs_pol # bq, U
+    Polite_erg_inf # bq, U
+    Polite_erg_pol # bq, U
+    Polite_dat_inf # bq, U
+    Polite_dat_pol # bq, U
+    Prefix_yes # U
+    PrepCase_npr # cz
+    PrepCase_pre # U
+    PunctSide_ini # U
+    PunctSide_fin # U
+    PunctType_peri # U
+    PunctType_qest # U
+    PunctType_excl # U
+    PunctType_quot # U
+    PunctType_brck # U
+    PunctType_comm # U
+    PunctType_colo # U
+    PunctType_semi # U
+    PunctType_dash # U
+    Style_arch # cz, fi, U
+    Style_rare # cz, fi, U
+    Style_poet # cz, U
+    Style_norm # cz, U
+    Style_coll # cz, U
+    Style_vrnc # cz, U
+    Style_sing # cz, U
+    Style_expr # cz, U
+    Style_derg # cz, U
+    Style_vulg # cz, U
+    Style_yes # fi, U
+    StyleVariant_styleShort # cz
+    StyleVariant_styleBound # cz, sl
+    VerbType_aux # U
+    VerbType_cop # U
+    VerbType_mod # U
+    VerbType_light # U
 
-#
-#cpdef enum Feature_t:
-#    Abbr
-#    AdpType
-#    AdvType
-#    ConjType
-#    Connegative
-#    Derivation
-#    Echo
-#    Foreign
-#    Gender_dat
-#    Gender_erg
-#    Gender_psor
-#    Hyph
-#    InfForm
-#    NameType
-#    NounType
-#    NumberAbs
-#    NumberDat
-#    NumberErg
-#    NumberPsee
-#    NumberPsor
-#    NumForm
-#    NumValue
-#    PartForm
-#    PartType
-#    Person_abs
-#    Person_dat
-#    Person_psor
-#    Polite
-#    Polite_abs
-#    Polite_dat
-#    Prefix
-#    PrepCase
-#    PunctSide
-#    PunctType
-#    Style
-#    Typo
-#    Variant
-#    VerbType
-#
-#
-#cpdef enum Animacy:
-#    Anim
-#    Inam
-#
-#
-#cpdef enum Aspect:
-#    Freq
-#    Imp
-#    Mod
-#    None_
-#    Perf
-#
-#
-#cpdef enum Case1:
-#    Nom
-#    Gen
-#    Acc
-#    Dat
-#    Voc
-#    Abl
-#    
-#cdef enum Case2:
-#    Abe
-#    Abs
-#    Ade
-#    All
-#    Cau
-#    Com
-#    Del
-#    Dis
-#
-#cdef enum Case3:
-#    Ela
-#    Ess
-#    Ill
-#    Ine
-#    Ins
-#    Loc
-#    Lat
-#    Par
-#
-#cdef enum Case4:
-#    Sub
-#    Sup
-#    Tem
-#    Ter
-#    Tra
-#
-#
-#cpdef enum Definite:
-#    Two
-#    Def
-#    Red
-#    Ind
-#
-#
-#cpdef enum Degree:
-#    Cmp
-#    Comp
-#    None_
-#    Pos
-#    Sup
-#    Abs
-#    Com
-#    Degree # du
-#
-#
-#cpdef enum Gender:
-#    Com
-#    Fem
-#    Masc
-#    Neut
-#
-#
-#cpdef enum Mood:
-#    Cnd
-#    Imp
-#    Ind
-#    N
-#    Pot
-#    Sub
-#    Opt
-#
-#
-#cpdef enum Negative:
-#    Neg
-#    Pos
-#    Yes
-#
-#
-#cpdef enum Number:
-#    Com
-#    Dual
-#    None_
-#    Plur
-#    Sing
-#    Ptan # bg
-#    Count # bg
-#
-#
-#cpdef enum NumType:
-#    Card
-#    Dist
-#    Frac
-#    Gen
-#    Mult
-#    None_
-#    Ord
-#    Sets
-#
-#
-#cpdef enum Person:
-#    One
-#    Two
-#    Three
-#    None_
-#
-#
-#cpdef enum Poss:
-#    Yes
-#
-#
-#cpdef enum PronType1:
-#    AdvPart
-#    Art
-#    Default
-#    Dem
-#    Ind
-#    Int
-#    Neg
-#
-#cpdef enum PronType2:
-#    Prs
-#    Rcp
-#    Rel
-#    Tot
-#    Clit
-#    Exc # es, ca, it, fa
-#    Clit # it
-#
-#
-#cpdef enum Reflex:
-#    Yes
-#
-#
-#cpdef enum Tense:
-#    Fut
-#    Imp
-#    Past
-#    Pres
-#
-#cpdef enum VerbForm1:
-#    Fin
-#    Ger
-#    Inf
-#    None_
-#    Part
-#    PartFut
-#    PartPast
-#
-#cpdef enum VerbForm2:
-#    PartPres
-#    Sup
-#    Trans
-#    Gdv # la
-#
-#
-#cpdef enum Voice:
-#    Act
-#    Cau
-#    Pass
-#    Mid # gkc
-#    Int # hb
-#
-#
-#cpdef enum Abbr:
-#    Yes # cz, fi, sl, U
-#
-#cpdef enum AdpType:
-#    Prep # cz, U
-#    Post # U
-#    Voc # cz
-#    Comprep # cz
-#    Circ # U
-#    Voc # U
-#
-#
-#cpdef enum AdvType1:
-#    # U
-#    Man
-#    Loc
-#    Tim
-#    Deg
-#    Cau
-#    Mod
-#    Sta
-#    Ex
-#
-#cpdef enum AdvType2:
-#    Adadj
-#
-#cpdef enum ConjType:
-#    Oper # cz, U
-#    Comp # cz, U
-#
-#cpdef enum Connegative:
-#    Yes # fi
-#
-#
-#cpdef enum Derivation1:
-#    Minen # fi
-#    Sti # fi
-#    Inen # fi
-#    Lainen # fi
-#    Ja # fi
-#    Ton # fi
-#    Vs # fi
-#    Ttain # fi
-#
-#cpdef enum Derivation2:
-#    Ttaa
-#
-#
-#cpdef enum Echo:
-#    Rdp # U
-#    Ech # U
-#
-#
-#cpdef enum Foreign:
-#    Foreign # cz, fi, U
-#    Fscript # cz, fi, U
-#    Tscript # cz, U
-#    Yes # sl
-#
-#
-#cpdef enum Gender_dat:
-#    Masc # bq, U
-#    Fem # bq, U
-#
-#
-#cpdef enum Gender_erg:
-#    Masc # bq
-#    Fem # bq
-#
-#
-#cpdef enum Gender_psor:
-#    Masc # cz, sl, U
-#    Fem # cz, sl, U
-#    Neut # sl
-#
-#
-#cpdef enum Hyph:
-#    Yes # cz, U
-#
-#
-#cpdef enum InfForm:
-#    One # fi
-#    Two # fi
-#    Three # fi
-#
-#
-#cpdef enum NameType:
-#    Geo # U, cz
-#    Prs # U, cz
-#    Giv # U, cz
-#    Sur # U, cz
-#    Nat # U, cz
-#    Com # U, cz
-#    Pro # U, cz
-#    Oth # U, cz
-#
-#
-#cpdef enum NounType:
-#    Com # U
-#    Prop # U
-#    Class # U
-#
-#cpdef enum Number_abs:
-#    Sing # bq, U
-#    Plur # bq, U
-#
-#cpdef enum Number_dat:
-#    Sing # bq, U
-#    Plur # bq, U
-#
-#cpdef enum Number_erg:
-#    Sing # bq, U
-#    Plur # bq, U
-#
-#cpdef enum Number_psee:
-#    Sing # U
-#    Plur # U
-#
-#
-#cpdef enum Number_psor:
-#    Sing # cz, fi, sl, U
-#    Plur # cz, fi, sl, U
-#
-#
-#cpdef enum NumForm:
-#    Digit # cz, sl, U
-#    Roman # cz, sl, U
-#    Word # cz, sl, U
-#
-#
-#cpdef enum NumValue:
-#    One # cz, U
-#    Two # cz, U
-#    Three # cz, U
-#
-#
-#cpdef enum PartForm:
-#    Pres # fi
-#    Past # fi
-#    Agt # fi
-#    Neg # fi
-#
-#
-#cpdef enum PartType:
-#    Mod # U
-#    Emp # U
-#    Res # U
-#    Inf # U
-#    Vbp # U
-#
-#cpdef enum Person_abs:
-#    One # bq, U
-#    Two # bq, U
-#    Three # bq, U
-#
-#
-#cpdef enum Person_dat:
-#    One # bq, U
-#    Two # bq, U
-#    Three # bq, U
-#
-#
-#cpdef enum Person_erg:
-#    One # bq, U
-#    Two # bq, U
-#    Three # bq, U
-#
-#
-#cpdef enum Person_psor:
-#    One # fi, U
-#    Two # fi, U
-#    Three # fi, U
-#
-#
-#cpdef enum Polite:
-#    Inf # bq, U
-#    Pol # bq, U
-#
-#
-#cpdef enum Polite_abs:
-#    Inf # bq, U
-#    Pol # bq, U
-#
-#
-#cpdef enum Polite_erg:
-#    Inf # bq, U
-#    Pol # bq, U
-#
-#
-#cpdef enum Polite_dat:
-#    Inf # bq, U
-#    Pol # bq, U
-#
-#
-#cpdef enum Prefix:
-#    Yes # U
-#
-#
-#cpdef enum PrepCase:
-#    Npr # cz
-#    Pre # U
-#
-#
-#cpdef enum PunctSide:
-#    Ini # U
-#    Fin # U
-#
-#cpdef enum PunctType1:
-#    Peri # U
-#    Qest # U
-#    Excl # U
-#    Quot # U
-#    Brck # U
-#    Comm # U
-#    Colo # U
-#    Semi # U
-#
-#cpdef enum PunctType2:
-#    Dash # U
-#
-#
-#cpdef enum Style1:
-#    Arch # cz, fi, U
-#    Rare # cz, fi, U
-#    Poet # cz, U
-#    Norm # cz, U
-#    Coll # cz, U
-#    Vrnc # cz, U
-#    Sing # cz, U
-#    Expr # cz, U
-#
-#
-#cpdef enum Style2:
-#    Derg # cz, U
-#    Vulg # cz, U
-#
-#
-#cpdef enum Typo:
-#    Yes # fi, U
-#
-#
-#cpdef enum Variant:
-#    Short # cz
-#    Bound # cz, sl
-#
-#
-#cpdef enum VerbType:
-#    Aux # U
-#    Cop # U
-#    Mod # U
-#    Light # U
-#
 
-cpdef enum Value_t:
-    Animacy_Anim
-    Animacy_Inam
-    Aspect_Freq
-    Aspect_Imp
-    Aspect_Mod
-    Aspect_None_
-    Aspect_Perf
-    Case_Abe
-    Case_Abl
-    Case_Abs
-    Case_Acc
-    Case_Ade
-    Case_All
-    Case_Cau
-    Case_Com
-    Case_Dat
-    Case_Del
-    Case_Dis
-    Case_Ela
-    Case_Ess
-    Case_Gen
-    Case_Ill
-    Case_Ine
-    Case_Ins
-    Case_Loc
-    Case_Lat
-    Case_Nom
-    Case_Par
-    Case_Sub
-    Case_Sup
-    Case_Tem
-    Case_Ter
-    Case_Tra
-    Case_Voc
-    Definite_Two
-    Definite_Def
-    Definite_Red
-    Definite_Ind
-    Degree_Cmp
-    Degree_Comp
-    Degree_None
-    Degree_Pos
-    Degree_Sup
-    Degree_Abs
-    Degree_Com
-    Degree_Dim # du
-    Gender_Com
-    Gender_Fem
-    Gender_Masc
-    Gender_Neut
-    Mood_Cnd
-    Mood_Imp
-    Mood_Ind
-    Mood_N
-    Mood_Pot
-    Mood_Sub
-    Mood_Opt
-    Negative_Neg
-    Negative_Pos
-    Negative_Yes
-    Number_Com
-    Number_Dual
-    Number_None
-    Number_Plur
-    Number_Sing
-    Number_Ptan # bg
-    Number_Count # bg
-    NumType_Card
-    NumType_Dist
-    NumType_Frac
-    NumType_Gen
-    NumType_Mult
-    NumType_None
-    NumType_Ord
-    NumType_Sets
-    Person_One
-    Person_Two
-    Person_Three
-    Person_None
-    Poss_Yes
-    PronType_AdvPart
-    PronType_Art
-    PronType_Default
-    PronType_Dem
-    PronType_Ind
-    PronType_Int
-    PronType_Neg
-    PronType_Prs
-    PronType_Rcp
-    PronType_Rel
-    PronType_Tot
-    PronType_Clit
-    PronType_Exc # es, ca, it, fa
-    Reflex_Yes
-    Tense_Fut
-    Tense_Imp
-    Tense_Past
-    Tense_Pres
-    VerbForm_Fin
-    VerbForm_Ger
-    VerbForm_Inf
-    VerbForm_None
-    VerbForm_Part
-    VerbForm_PartFut
-    VerbForm_PartPast
-    VerbForm_PartPres
-    VerbForm_Sup
-    VerbForm_Trans
-    VerbForm_Gdv # la
-    Voice_Act
-    Voice_Cau
-    Voice_Pass
-    Voice_Mid # gkc
-    Voice_Int # hb
-    Abbr_Yes # cz, fi, sl, U
-    AdpType_Prep # cz, U
-    AdpType_Post # U
-    AdpType_Voc # cz
-    AdpType_Comprep # cz
-    AdpType_Circ # U
-    AdvType_Man
-    AdvType_Loc
-    AdvType_Tim
-    AdvType_Deg
-    AdvType_Cau
-    AdvType_Mod
-    AdvType_Sta
-    AdvType_Ex
-    AdvType_Adadj
-    ConjType_Oper # cz, U
-    ConjType_Comp # cz, U
-    Connegative_Yes # fi
-    Derivation_Minen # fi
-    Derivation_Sti # fi
-    Derivation_Inen # fi
-    Derivation_Lainen # fi
-    Derivation_Ja # fi
-    Derivation_Ton # fi
-    Derivation_Vs # fi
-    Derivation_Ttain # fi
-    Derivation_Ttaa # fi
-    Echo_Rdp # U
-    Echo_Ech # U
-    Foreign_Foreign # cz, fi, U
-    Foreign_Fscript # cz, fi, U
-    Foreign_Tscript # cz, U
-    Foreign_Yes # sl
-    Gender_dat_Masc # bq, U
-    Gender_dat_Fem # bq, U
-    Gender_erg_Masc # bq
-    Gender_erg_Fem # bq
-    Gender_psor_Masc # cz, sl, U
-    Gender_psor_Fem # cz, sl, U
-    Gender_psor_Neut # sl
-    Hyph_Yes # cz, U
-    InfForm_One # fi
-    InfForm_Two # fi
-    InfForm_Three # fi
-    NameType_Geo # U, cz
-    NameType_Prs # U, cz
-    NameType_Giv # U, cz
-    NameType_Sur # U, cz
-    NameType_Nat # U, cz
-    NameType_Com # U, cz
-    NameType_Pro # U, cz
-    NameType_Oth # U, cz
-    NounType_Com # U
-    NounType_Prop # U
-    NounType_Class # U
-    Number_abs_Sing # bq, U
-    Number_abs_Plur # bq, U
-    Number_dat_Sing # bq, U
-    Number_dat_Plur # bq, U
-    Number_erg_Sing # bq, U
-    Number_erg_Plur # bq, U
-    Number_psee_Sing # U
-    Number_psee_Plur # U
-    Number_psor_Sing # cz, fi, sl, U
-    Number_psor_Plur # cz, fi, sl, U
-    NumForm_Digit # cz, sl, U
-    NumForm_Roman # cz, sl, U
-    NumForm_Word # cz, sl, U
-    NumValue_One # cz, U
-    NumValue_Two # cz, U
-    NumValue_Three # cz, U
-    PartForm_Pres # fi
-    PartForm_Past # fi
-    PartForm_Agt # fi
-    PartForm_Neg # fi
-    PartType_Mod # U
-    PartType_Emp # U
-    PartType_Res # U
-    PartType_Inf # U
-    PartType_Vbp # U
-    Person_abs_One # bq, U
-    Person_abs_Two # bq, U
-    Person_abs_Three # bq, U
-    Person_dat_One # bq, U
-    Person_dat_Two # bq, U
-    Person_dat_Three # bq, U
-    Person_erg_One # bq, U
-    Person_erg_Two # bq, U
-    Person_erg_Three # bq, U
-    Person_psor_One # fi, U
-    Person_psor_Two # fi, U
-    Person_psor_Three # fi, U
-    Polite_Inf # bq, U
-    Polite_Pol # bq, U
-    Polite_abs_Inf # bq, U
-    Polite_abs_Pol # bq, U
-    Polite_erg_Inf # bq, U
-    Polite_erg_Pol # bq, U
-    Polite_dat_Inf # bq, U
-    Polite_dat_Pol # bq, U
-    Prefix_Yes # U
-    PrepCase_Npr # cz
-    PrepCase_Pre # U
-    PunctSide_Ini # U
-    PunctSide_Fin # U
-    PunctType_Peri # U
-    PunctType_Qest # U
-    PunctType_Excl # U
-    PunctType_Quot # U
-    PunctType_Brck # U
-    PunctType_Comm # U
-    PunctType_Colo # U
-    PunctType_Semi # U
-    PunctType_Dash # U
-    Style_Arch # cz, fi, U
-    Style_Rare # cz, fi, U
-    Style_Poet # cz, U
-    Style_Norm # cz, U
-    Style_Coll # cz, U
-    Style_Vrnc # cz, U
-    Style_Sing # cz, U
-    Style_Expr # cz, U
-    Style_Derg # cz, U
-    Style_Vulg # cz, U
-    Style_Yes # fi, U
-    StyleVariant_StyleShort # cz
-    StyleVariant_StyleBound # cz, sl
-    VerbType_Aux # U
-    VerbType_Cop # U
-    VerbType_Mod # U
-    VerbType_Light # U
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 8d2a73608..c53e5f478 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -89,3 +89,254 @@ cdef class Morphology:
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings[lemma_string]
         return lemma
+
+IDS = {  # feature-name string -> the identically named symbol constant
+    "Animacy_anim": Animacy_anim,
+    "Animacy_inam": Animacy_inam,
+    "Aspect_freq": Aspect_freq,
+    "Aspect_imp": Aspect_imp,
+    "Aspect_mod": Aspect_mod,
+    "Aspect_none": Aspect_none,
+    "Aspect_perf": Aspect_perf,
+    "Case_abe": Case_abe,
+    "Case_abl": Case_abl,
+    "Case_abs": Case_abs,
+    "Case_acc": Case_acc,
+    "Case_ade": Case_ade,
+    "Case_all": Case_all,
+    "Case_cau": Case_cau,
+    "Case_com": Case_com,
+    "Case_dat": Case_dat,
+    "Case_del": Case_del,
+    "Case_dis": Case_dis,
+    "Case_ela": Case_ela,
+    "Case_ess": Case_ess,
+    "Case_gen": Case_gen,
+    "Case_ill": Case_ill,
+    "Case_ine": Case_ine,
+    "Case_ins": Case_ins,
+    "Case_loc": Case_loc,
+    "Case_lat": Case_lat,
+    "Case_nom": Case_nom,
+    "Case_par": Case_par,
+    "Case_sub": Case_sub,
+    "Case_sup": Case_sup,
+    "Case_tem": Case_tem,
+    "Case_ter": Case_ter,
+    "Case_tra": Case_tra,
+    "Case_voc": Case_voc,
+    "Definite_two": Definite_two,
+    "Definite_def": Definite_def,
+    "Definite_red": Definite_red,
+    "Definite_ind": Definite_ind,
+    "Degree_cmp": Degree_cmp,
+    "Degree_comp": Degree_comp,
+    "Degree_none": Degree_none,
+    "Degree_pos": Degree_pos,
+    "Degree_sup": Degree_sup,
+    "Degree_abs": Degree_abs,
+    "Degree_com": Degree_com,
+    "Degree_dim": Degree_dim, # du
+    "Gender_com": Gender_com,
+    "Gender_fem": Gender_fem,
+    "Gender_masc": Gender_masc,
+    "Gender_neut": Gender_neut,
+    "Mood_cnd": Mood_cnd,
+    "Mood_imp": Mood_imp,
+    "Mood_ind": Mood_ind,
+    "Mood_n": Mood_n,
+    "Mood_pot": Mood_pot,
+    "Mood_sub": Mood_sub,
+    "Mood_opt": Mood_opt,
+    "Negative_neg": Negative_neg,
+    "Negative_pos": Negative_pos,
+    "Negative_yes": Negative_yes,
+    "Number_com": Number_com,
+    "Number_dual": Number_dual,
+    "Number_none": Number_none,
+    "Number_plur": Number_plur,
+    "Number_sing": Number_sing,
+    "Number_ptan": Number_ptan, # bg
+    "Number_count": Number_count, # bg
+    "NumType_card": NumType_card,
+    "NumType_dist": NumType_dist,
+    "NumType_frac": NumType_frac,
+    "NumType_gen": NumType_gen,
+    "NumType_mult": NumType_mult,
+    "NumType_none": NumType_none,
+    "NumType_ord": NumType_ord,
+    "NumType_sets": NumType_sets,
+    "Person_one": Person_one,
+    "Person_two": Person_two,
+    "Person_three": Person_three,
+    "Person_none": Person_none,
+    "Poss_yes": Poss_yes,
+    "PronType_advPart": PronType_advPart,
+    "PronType_art": PronType_art,
+    "PronType_default": PronType_default,
+    "PronType_dem": PronType_dem,
+    "PronType_ind": PronType_ind,
+    "PronType_int": PronType_int,
+    "PronType_neg": PronType_neg,
+    "PronType_prs": PronType_prs,
+    "PronType_rcp": PronType_rcp,
+    "PronType_rel": PronType_rel,
+    "PronType_tot": PronType_tot,
+    "PronType_clit": PronType_clit,
+    "PronType_exc": PronType_exc, # es, ca, it, fa,
+    "Reflex_yes": Reflex_yes,
+    "Tense_fut": Tense_fut,
+    "Tense_imp": Tense_imp,
+    "Tense_past": Tense_past,
+    "Tense_pres": Tense_pres,
+    "VerbForm_fin": VerbForm_fin,
+    "VerbForm_ger": VerbForm_ger,
+    "VerbForm_inf": VerbForm_inf,
+    "VerbForm_none": VerbForm_none,
+    "VerbForm_part": VerbForm_part,
+    "VerbForm_partFut": VerbForm_partFut,
+    "VerbForm_partPast": VerbForm_partPast,
+    "VerbForm_partPres": VerbForm_partPres,
+    "VerbForm_sup": VerbForm_sup,
+    "VerbForm_trans": VerbForm_trans,
+    "VerbForm_gdv": VerbForm_gdv, # la,
+    "Voice_act": Voice_act,
+    "Voice_cau": Voice_cau,
+    "Voice_pass": Voice_pass,
+    "Voice_mid": Voice_mid, # gkc,
+    "Voice_int": Voice_int, # hb,
+    "Abbr_yes": Abbr_yes, # cz, fi, sl, U,
+    "AdpType_prep": AdpType_prep, # cz, U,
+    "AdpType_post": AdpType_post, # U,
+    "AdpType_voc": AdpType_voc, # cz,
+    "AdpType_comprep": AdpType_comprep, # cz,
+    "AdpType_circ": AdpType_circ, # U,
+    "AdvType_man": AdvType_man,
+    "AdvType_loc": AdvType_loc,
+    "AdvType_tim": AdvType_tim,
+    "AdvType_deg": AdvType_deg,
+    "AdvType_cau": AdvType_cau,
+    "AdvType_mod": AdvType_mod,
+    "AdvType_sta": AdvType_sta,
+    "AdvType_ex": AdvType_ex,
+    "AdvType_adadj": AdvType_adadj,
+    "ConjType_oper": ConjType_oper, # cz, U,
+    "ConjType_comp": ConjType_comp, # cz, U,
+    "Connegative_yes": Connegative_yes, # fi,
+    "Derivation_minen": Derivation_minen, # fi,
+    "Derivation_sti": Derivation_sti, # fi,
+    "Derivation_inen": Derivation_inen, # fi,
+    "Derivation_lainen": Derivation_lainen, # fi,
+    "Derivation_ja": Derivation_ja, # fi,
+    "Derivation_ton": Derivation_ton, # fi,
+    "Derivation_vs": Derivation_vs, # fi,
+    "Derivation_ttain": Derivation_ttain, # fi,
+    "Derivation_ttaa": Derivation_ttaa, # fi,
+    "Echo_rdp": Echo_rdp, # U,
+    "Echo_ech": Echo_ech, # U,
+    "Foreign_foreign": Foreign_foreign, # cz, fi, U,
+    "Foreign_fscript": Foreign_fscript, # cz, fi, U,
+    "Foreign_tscript": Foreign_tscript, # cz, U,
+    "Foreign_yes": Foreign_yes, # sl,
+    "Gender_dat_masc": Gender_dat_masc, # bq, U,
+    "Gender_dat_fem": Gender_dat_fem, # bq, U,
+    "Gender_erg_masc": Gender_erg_masc, # bq,
+    "Gender_erg_fem": Gender_erg_fem, # bq,
+    "Gender_psor_masc": Gender_psor_masc, # cz, sl, U,
+    "Gender_psor_fem": Gender_psor_fem, # cz, sl, U,
+    "Gender_psor_neut": Gender_psor_neut, # sl,
+    "Hyph_yes": Hyph_yes, # cz, U,
+    "InfForm_one": InfForm_one, # fi,
+    "InfForm_two": InfForm_two, # fi,
+    "InfForm_three": InfForm_three, # fi,
+    "NameType_geo": NameType_geo, # U, cz,
+    "NameType_prs": NameType_prs, # U, cz,
+    "NameType_giv": NameType_giv, # U, cz,
+    "NameType_sur": NameType_sur, # U, cz,
+    "NameType_nat": NameType_nat, # U, cz,
+    "NameType_com": NameType_com, # U, cz,
+    "NameType_pro": NameType_pro, # U, cz,
+    "NameType_oth": NameType_oth, # U, cz,
+    "NounType_com": NounType_com, # U,
+    "NounType_prop": NounType_prop, # U,
+    "NounType_class": NounType_class, # U,
+    "Number_abs_sing": Number_abs_sing, # bq, U,
+    "Number_abs_plur": Number_abs_plur, # bq, U,
+    "Number_dat_sing": Number_dat_sing, # bq, U,
+    "Number_dat_plur": Number_dat_plur, # bq, U,
+    "Number_erg_sing": Number_erg_sing, # bq, U,
+    "Number_erg_plur": Number_erg_plur, # bq, U,
+    "Number_psee_sing": Number_psee_sing, # U,
+    "Number_psee_plur": Number_psee_plur, # U,
+    "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U,
+    "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U,
+    "NumForm_digit": NumForm_digit, # cz, sl, U,
+    "NumForm_roman": NumForm_roman, # cz, sl, U,
+    "NumForm_word": NumForm_word, # cz, sl, U,
+    "NumValue_one": NumValue_one, # cz, U,
+    "NumValue_two": NumValue_two, # cz, U,
+    "NumValue_three": NumValue_three, # cz, U,
+    "PartForm_pres": PartForm_pres, # fi,
+    "PartForm_past": PartForm_past, # fi,
+    "PartForm_agt": PartForm_agt, # fi,
+    "PartForm_neg": PartForm_neg, # fi,
+    "PartType_mod": PartType_mod, # U,
+    "PartType_emp": PartType_emp, # U,
+    "PartType_res": PartType_res, # U,
+    "PartType_inf": PartType_inf, # U,
+    "PartType_vbp": PartType_vbp, # U,
+    "Person_abs_one": Person_abs_one, # bq, U,
+    "Person_abs_two": Person_abs_two, # bq, U,
+    "Person_abs_three": Person_abs_three, # bq, U,
+    "Person_dat_one": Person_dat_one, # bq, U,
+    "Person_dat_two": Person_dat_two, # bq, U,
+    "Person_dat_three": Person_dat_three, # bq, U,
+    "Person_erg_one": Person_erg_one, # bq, U,
+    "Person_erg_two": Person_erg_two, # bq, U,
+    "Person_erg_three": Person_erg_three, # bq, U,
+    "Person_psor_one": Person_psor_one, # fi, U,
+    "Person_psor_two": Person_psor_two, # fi, U,
+    "Person_psor_three": Person_psor_three, # fi, U,
+    "Polite_inf": Polite_inf, # bq, U,
+    "Polite_pol": Polite_pol, # bq, U,
+    "Polite_abs_inf": Polite_abs_inf, # bq, U,
+    "Polite_abs_pol": Polite_abs_pol, # bq, U,
+    "Polite_erg_inf": Polite_erg_inf, # bq, U,
+    "Polite_erg_pol": Polite_erg_pol, # bq, U,
+    "Polite_dat_inf": Polite_dat_inf, # bq, U,
+    "Polite_dat_pol": Polite_dat_pol, # bq, U,
+    "Prefix_yes": Prefix_yes, # U,
+    "PrepCase_npr": PrepCase_npr, # cz,
+    "PrepCase_pre": PrepCase_pre, # U,
+    "PunctSide_ini": PunctSide_ini, # U,
+    "PunctSide_fin": PunctSide_fin, # U,
+    "PunctType_peri": PunctType_peri, # U,
+    "PunctType_qest": PunctType_qest, # U,
+    "PunctType_excl": PunctType_excl, # U,
+    "PunctType_quot": PunctType_quot, # U,
+    "PunctType_brck": PunctType_brck, # U,
+    "PunctType_comm": PunctType_comm, # U,
+    "PunctType_colo": PunctType_colo, # U,
+    "PunctType_semi": PunctType_semi, # U,
+    "PunctType_dash": PunctType_dash, # U,
+    "Style_arch": Style_arch, # cz, fi, U,
+    "Style_rare": Style_rare, # cz, fi, U,
+    "Style_poet": Style_poet, # cz, U,
+    "Style_norm": Style_norm, # cz, U,
+    "Style_coll": Style_coll, # cz, U,
+    "Style_vrnc": Style_vrnc, # cz, U,
+    "Style_sing": Style_sing, # cz, U,
+    "Style_expr": Style_expr, # cz, U,
+    "Style_derg": Style_derg, # cz, U,
+    "Style_vulg": Style_vulg, # cz, U,
+    "Style_yes": Style_yes, # fi, U,
+    "StyleVariant_styleShort": StyleVariant_styleShort, # cz,
+    "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl,
+    "VerbType_aux": VerbType_aux, # U,
+    "VerbType_cop": VerbType_cop, # U,
+    "VerbType_mod": VerbType_mod, # U,
+    "VerbType_light": VerbType_light, # U,
+}
+
+# Feature names listed in ascending order of their ID.
+NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]

From 9f4be0adcdf5a29ad2cfc49cf9b005debdc49387 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 22:11:20 +1100
Subject: [PATCH 11/22] * Map NO_TAG to NIL in parts_of_speech.pxd

---
 spacy/parts_of_speech.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index 17e349435..9fbdbd71f 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -2,7 +2,7 @@ from .symbols cimport *
 
 
 cpdef enum univ_pos_t:
-    NO_TAG = EMPTY_VALUE
+    NO_TAG = NIL
     ADJ = POS_adj
     ADP = POS_adp
     ADV = POS_adv

From ce65ec698c42be5146f350146b9dc94904dcf7e3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 22:11:38 +1100
Subject: [PATCH 12/22] * Remove qualified naming in symbols

---
 spacy/symbols.pxd | 202 ++++++++++++++++++++++-----------------------
 spacy/symbols.pyx | 205 +++++++++++++++++++++++-----------------------
 2 files changed, 204 insertions(+), 203 deletions(-)

diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index e8ddeaa8f..590a2d41d 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -1,109 +1,109 @@
 cpdef enum symbol_t:
-    EMPTY_VALUE
-    Attr_is_alpha
-    Attr_is_ascii
-    Attr_is_digit
-    Attr_is_lower
-    Attr_is_punct
-    Attr_is_space
-    Attr_is_title
-    Attr_is_upper
-    Attr_like_url
-    Attr_like_num
-    Attr_like_email
-    Attr_is_stop
-    Attr_is_oov
+    NIL
+    IS_ALPHA
+    IS_ASCII
+    IS_DIGIT
+    IS_LOWER
+    IS_PUNCT
+    IS_SPACE
+    IS_TITLE
+    IS_UPPER
+    LIKE_URL
+    LIKE_NUM
+    LIKE_EMAIL
+    IS_STOP
+    IS_OOV
     
-    Attr_flag14
-    Attr_flag15
-    Attr_flag16
-    Attr_flag17
-    Attr_flag18
-    Attr_flag19
-    Attr_flag20
-    Attr_flag21
-    Attr_flag22
-    Attr_flag23
-    Attr_flag24
-    Attr_flag25
-    Attr_flag26
-    Attr_flag27
-    Attr_flag28
-    Attr_flag29
-    Attr_flag30
-    Attr_flag31
-    Attr_flag32
-    Attr_flag33
-    Attr_flag34
-    Attr_flag35
-    Attr_flag36
-    Attr_flag37
-    Attr_flag38
-    Attr_flag39
-    Attr_flag40
-    Attr_flag41
-    Attr_flag42
-    Attr_flag43
-    Attr_flag44
-    Attr_flag45
-    Attr_flag46
-    Attr_flag47
-    Attr_flag48
-    Attr_flag49
-    Attr_flag50
-    Attr_flag51
-    Attr_flag52
-    Attr_flag53
-    Attr_flag54
-    Attr_flag55
-    Attr_flag56
-    Attr_flag57
-    Attr_flag58
-    Attr_flag59
-    Attr_flag60
-    Attr_flag61
-    Attr_flag62
-    Attr_flag63
+    FLAG14
+    FLAG15
+    FLAG16
+    FLAG17
+    FLAG18
+    FLAG19
+    FLAG20
+    FLAG21
+    FLAG22
+    FLAG23
+    FLAG24
+    FLAG25
+    FLAG26
+    FLAG27
+    FLAG28
+    FLAG29
+    FLAG30
+    FLAG31
+    FLAG32
+    FLAG33
+    FLAG34
+    FLAG35
+    FLAG36
+    FLAG37
+    FLAG38
+    FLAG39
+    FLAG40
+    FLAG41
+    FLAG42
+    FLAG43
+    FLAG44
+    FLAG45
+    FLAG46
+    FLAG47
+    FLAG48
+    FLAG49
+    FLAG50
+    FLAG51
+    FLAG52
+    FLAG53
+    FLAG54
+    FLAG55
+    FLAG56
+    FLAG57
+    FLAG58
+    FLAG59
+    FLAG60
+    FLAG61
+    FLAG62
+    FLAG63
 
-    Attr_id
-    Attr_orth
-    Attr_lower
-    Attr_norm
-    Attr_shape
-    Attr_prefix
-    Attr_suffix
+    ID
+    ORTH
+    LOWER
+    NORM
+    SHAPE
+    PREFIX
+    SUFFIX
 
-    Attr_length
-    Attr_cluster
-    Attr_lemma
-    Attr_pos
-    Attr_tag
-    Attr_dep
-    Attr_ent_iob
-    Attr_ent_type
-    Attr_head
-    Attr_spacy
-    Attr_prob
+    LENGTH
+    CLUSTER
+    LEMMA
+    POS
+    TAG
+    DEP
+    ENT_IOB
+    ENT_TYPE
+    HEAD
+    SPACY
+    PROB
 
-    POS_adj
-    POS_adp
-    POS_adv
-    POS_aux
-    POS_conj
-    POS_det
-    POS_intj
-    POS_noun
-    POS_num
-    POS_part
-    POS_pron
-    POS_propn
-    POS_punct
-    POS_sconj
-    POS_sym
-    POS_verb
-    POS_x
-    POS_eol
-    POS_space
+    ADJ
+    ADP
+    ADV
+    AUX
+    CONJ
+    DET
+    INTJ
+    NOUN
+    NUM
+    PART
+    PRON
+    PROPN
+    PUNCT
+    SCONJ
+    SYM
+    VERB
+    X
+    EOL
+    SPACE
 
     Animacy_anim
     Animacy_inam
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index a0a39f2ff..9a3d219d5 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -1,108 +1,109 @@
-SYMBOL_IDS = {
-    "Attr_is_alpha": Attr_is_alpha,
-    "Attr_is_ascii": Attr_is_ascii,
-    "Attr_is_digit": Attr_is_digit,
-    "Attr_is_lower": Attr_is_lower,
-    "Attr_is_punct": Attr_is_punct,
-    "Attr_is_space": Attr_is_space,
-    "Attr_is_title": Attr_is_title,
-    "Attr_is_upper": Attr_is_upper,
-    "Attr_like_url": Attr_like_url,
-    "Attr_like_num": Attr_like_num,
-    "Attr_like_email": Attr_like_email,
-    "Attr_is_stop": Attr_is_stop,
-    "Attr_is_oov": Attr_is_oov,
+IDS = {
+    "": NIL,
+    "IS_ALPHA": IS_ALPHA,
+    "IS_ASCII": IS_ASCII,
+    "IS_DIGIT": IS_DIGIT,
+    "IS_LOWER": IS_LOWER,
+    "IS_PUNCT": IS_PUNCT,
+    "IS_SPACE": IS_SPACE,
+    "IS_TITLE": IS_TITLE,
+    "IS_UPPER": IS_UPPER,
+    "LIKE_URL": LIKE_URL,
+    "LIKE_NUM": LIKE_NUM,
+    "LIKE_EMAIL": LIKE_EMAIL,
+    "IS_STOP": IS_STOP,
+    "IS_OOV": IS_OOV,
     
-    "Attr_flag14": Attr_flag14,
-    "Attr_flag15": Attr_flag15,
-    "Attr_flag16": Attr_flag16,
-    "Attr_flag17": Attr_flag17,
-    "Attr_flag18": Attr_flag18,
-    "Attr_flag19": Attr_flag19,
-    "Attr_flag20": Attr_flag20,
-    "Attr_flag21": Attr_flag21,
-    "Attr_flag22": Attr_flag22,
-    "Attr_flag23": Attr_flag23,
-    "Attr_flag24": Attr_flag24,
-    "Attr_flag25": Attr_flag25,
-    "Attr_flag26": Attr_flag26,
-    "Attr_flag27": Attr_flag27,
-    "Attr_flag28": Attr_flag28,
-    "Attr_flag29": Attr_flag29,
-    "Attr_flag30": Attr_flag30,
-    "Attr_flag31": Attr_flag31,
-    "Attr_flag32": Attr_flag32,
-    "Attr_flag33": Attr_flag33,
-    "Attr_flag34": Attr_flag34,
-    "Attr_flag35": Attr_flag35,
-    "Attr_flag36": Attr_flag36,
-    "Attr_flag37": Attr_flag37,
-    "Attr_flag38": Attr_flag38,
-    "Attr_flag39": Attr_flag39,
-    "Attr_flag40": Attr_flag40,
-    "Attr_flag41": Attr_flag41,
-    "Attr_flag42": Attr_flag42,
-    "Attr_flag43": Attr_flag43,
-    "Attr_flag44": Attr_flag44,
-    "Attr_flag45": Attr_flag45,
-    "Attr_flag46": Attr_flag46,
-    "Attr_flag47": Attr_flag47,
-    "Attr_flag48": Attr_flag48,
-    "Attr_flag49": Attr_flag49,
-    "Attr_flag50": Attr_flag50,
-    "Attr_flag51": Attr_flag51,
-    "Attr_flag52": Attr_flag52,
-    "Attr_flag53": Attr_flag53,
-    "Attr_flag54": Attr_flag54,
-    "Attr_flag55": Attr_flag55,
-    "Attr_flag56": Attr_flag56,
-    "Attr_flag57": Attr_flag57,
-    "Attr_flag58": Attr_flag58,
-    "Attr_flag59": Attr_flag59,
-    "Attr_flag60": Attr_flag60,
-    "Attr_flag61": Attr_flag61,
-    "Attr_flag62": Attr_flag62,
-    "Attr_flag63": Attr_flag63,
+    "FLAG14": FLAG14,
+    "FLAG15": FLAG15,
+    "FLAG16": FLAG16,
+    "FLAG17": FLAG17,
+    "FLAG18": FLAG18,
+    "FLAG19": FLAG19,
+    "FLAG20": FLAG20,
+    "FLAG21": FLAG21,
+    "FLAG22": FLAG22,
+    "FLAG23": FLAG23,
+    "FLAG24": FLAG24,
+    "FLAG25": FLAG25,
+    "FLAG26": FLAG26,
+    "FLAG27": FLAG27,
+    "FLAG28": FLAG28,
+    "FLAG29": FLAG29,
+    "FLAG30": FLAG30,
+    "FLAG31": FLAG31,
+    "FLAG32": FLAG32,
+    "FLAG33": FLAG33,
+    "FLAG34": FLAG34,
+    "FLAG35": FLAG35,
+    "FLAG36": FLAG36,
+    "FLAG37": FLAG37,
+    "FLAG38": FLAG38,
+    "FLAG39": FLAG39,
+    "FLAG40": FLAG40,
+    "FLAG41": FLAG41,
+    "FLAG42": FLAG42,
+    "FLAG43": FLAG43,
+    "FLAG44": FLAG44,
+    "FLAG45": FLAG45,
+    "FLAG46": FLAG46,
+    "FLAG47": FLAG47,
+    "FLAG48": FLAG48,
+    "FLAG49": FLAG49,
+    "FLAG50": FLAG50,
+    "FLAG51": FLAG51,
+    "FLAG52": FLAG52,
+    "FLAG53": FLAG53,
+    "FLAG54": FLAG54,
+    "FLAG55": FLAG55,
+    "FLAG56": FLAG56,
+    "FLAG57": FLAG57,
+    "FLAG58": FLAG58,
+    "FLAG59": FLAG59,
+    "FLAG60": FLAG60,
+    "FLAG61": FLAG61,
+    "FLAG62": FLAG62,
+    "FLAG63": FLAG63,
 
-    "Attr_id": Attr_id,
-    "Attr_orth": Attr_orth,
-    "Attr_lower": Attr_lower,
-    "Attr_norm": Attr_norm,
-    "Attr_shape": Attr_shape,
-    "Attr_prefix": Attr_prefix,
-    "Attr_suffix": Attr_suffix,
+    "ID": ID,
+    "ORTH": ORTH,
+    "LOWER": LOWER,
+    "NORM": NORM,
+    "SHAPE": SHAPE,
+    "PREFIX": PREFIX,
+    "SUFFIX": SUFFIX,
 
-    "Attr_length": Attr_length,
-    "Attr_cluster": Attr_cluster,
-    "Attr_lemma": Attr_lemma,
-    "Attr_pos": Attr_pos,
-    "Attr_tag": Attr_tag,
-    "Attr_dep": Attr_dep,
-    "Attr_ent_iob": Attr_ent_iob,
-    "Attr_ent_type": Attr_ent_type,
-    "Attr_head": Attr_head,
-    "Attr_spacy": Attr_spacy,
-    "Attr_prob": Attr_prob,
+    "LENGTH": LENGTH,
+    "CLUSTER": CLUSTER,
+    "LEMMA": LEMMA,
+    "POS": POS,
+    "TAG": TAG,
+    "DEP": DEP,
+    "ENT_IOB": ENT_IOB,
+    "ENT_TYPE": ENT_TYPE,
+    "HEAD": HEAD,
+    "SPACY": SPACY,
+    "PROB": PROB,
 
-    "POS_adj": POS_adj,
-    "POS_adp": POS_adp,
-    "POS_adv": POS_adv,
-    "POS_aux": POS_aux,
-    "POS_conj": POS_conj,
-    "POS_det": POS_det,
-    "POS_intj": POS_intj,
-    "POS_noun": POS_noun,
-    "POS_num": POS_num,
-    "POS_part": POS_part,
-    "POS_pron": POS_pron,
-    "POS_propn": POS_propn,
-    "POS_punct": POS_punct,
-    "POS_sconj": POS_sconj,
-    "POS_sym": POS_sym,
-    "POS_verb": POS_verb,
-    "POS_x": POS_x,
-    "POS_eol": POS_eol,
-    "POS_space": POS_space,
+    "ADJ": ADJ,
+    "ADP": ADP,
+    "ADV": ADV,
+    "AUX": AUX,
+    "CONJ": CONJ,
+    "DET": DET,
+    "INTJ": INTJ,
+    "NOUN": NOUN,
+    "NUM": NUM,
+    "PART": PART,
+    "PRON": PRON,
+    "PROPN": PROPN,
+    "PUNCT": PUNCT,
+    "SCONJ": SCONJ,
+    "SYM": SYM,
+    "VERB": VERB,
+    "X": X,
+    "EOL": EOL,
+    "SPACE": SPACE,
 
     "Animacy_anim": Animacy_anim,
     "Animacy_inam": Animacy_inam,
@@ -420,4 +421,4 @@ SYMBOL_IDS = {
     "Dep_xcomp": Dep_xcomp
 }
 
-SYMBOL_NAMES = [it[0] for it in sorted(SYMBOL_IDS.items(), key=lambda it: it[1])]
+NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])]

From 37b909b6b65a50affbe89578d76546386069059c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 22:12:06 +1100
Subject: [PATCH 13/22] * Use the symbols file in vocab instead of the symbols
 subfiles like attrs.pxd

---
 spacy/vocab.pyx | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 6cf829344..0f43967bb 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -20,7 +20,7 @@ from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
 
 from . import attrs
-from . import parts_of_speech
+from . import symbols
 
 from cymem.cymem cimport Address
 from . import util
@@ -75,18 +75,9 @@ cdef class Vocab:
         # is the frequency rank of the word, plus a certain offset. The structural
         # strings are loaded first, because the vocab is open-class, and these
         # symbols are closed class.
-        for name in attrs.NAMES:
+        for name in symbols.NAMES + list(sorted(tag_map.keys())):
             if name:
                 _ = self.strings[name]
-        for name in parts_of_speech.NAMES:
-            if name:
-                _ = self.strings[name]
-        #for morph_name in UNIV_MORPH_NAMES:
-        #    _ = self.strings[morph_name]
-        #for entity_type_name in entity_types.NAMES:
-        #    _ = self.strings[entity_type_name]
-        #for tag_name in sorted(tag_map.keys()):
-        #    _ = self.strings[tag_name]
         self.get_lex_attr = get_lex_attr
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
         self.serializer_freqs = serializer_freqs

From 7b4af3d1e77104f37a0683c3dc4f950be179fb72 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 22:58:34 +1100
Subject: [PATCH 14/22] * Fix parts_of_speech now that symbols list has been
 reformed

---
 spacy/parts_of_speech.pxd | 43 +++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index 9fbdbd71f..c97673a69 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -1,24 +1,23 @@
-from .symbols cimport *
-
+from . cimport symbols
 
 cpdef enum univ_pos_t:
-    NO_TAG = NIL
-    ADJ = POS_adj
-    ADP = POS_adp
-    ADV = POS_adv
-    AUX = POS_aux
-    CONJ = POS_conj
-    DET = POS_det
-    INTJ = POS_intj
-    NOUN = POS_noun
-    NUM = POS_num
-    PART = POS_part
-    PRON = POS_pron
-    PROPN = POS_propn
-    PUNCT = POS_punct
-    SCONJ = POS_sconj
-    SYM = POS_sym
-    VERB = POS_verb
-    X = POS_x
-    EOL = POS_eol
-    SPACE = POS_space
+    NO_TAG = 0
+    ADJ = symbols.ADJ
+    ADP
+    ADV
+    AUX
+    CONJ
+    DET
+    INTJ
+    NOUN
+    NUM
+    PART
+    PRON
+    PROPN
+    PUNCT
+    SCONJ
+    SYM
+    VERB
+    X
+    EOL
+    SPACE

From e70368d15719fd269f52b8d76d9bbeb8d851c307 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 22:59:14 +1100
Subject: [PATCH 15/22] * Use lower case strings for dependency label names in
 symbols enum

---
 spacy/symbols.pxd | 132 +++++++++++++++++++++++-----------------------
 spacy/symbols.pyx | 132 +++++++++++++++++++++++-----------------------
 2 files changed, 132 insertions(+), 132 deletions(-)

diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index 590a2d41d..0c60f6f67 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -351,71 +351,71 @@ cpdef enum symbol_t:
     VerbType_mod # U
     VerbType_light # U
 
-    Name_person
-    Name_norp
-    Name_facility
-    Name_org
-    Name_gpe
-    Name_loc
-    Name_product
-    Name_event
-    Name_work_of_art
-    Name_language
+    PERSON
+    NORP
+    FACILITY
+    ORG
+    GPE
+    LOC
+    PRODUCT
+    EVENT
+    WORK_OF_ART
+    LANGUAGE
 
-    Unit_date
-    Unit_time
-    Unit_percent
-    Unit_money
-    Unit_quantity
-    Unit_ordinal
-    Unit_cardinal
+    DATE
+    TIME
+    PERCENT
+    MONEY
+    QUANTITY
+    ORDINAL
+    CARDINAL
 
-    Dep_acomp
-    Dep_advcl
-    Dep_advmod
-    Dep_agent
-    Dep_amod
-    Dep_appos
-    Dep_attr
-    Dep_aux
-    Dep_auxpass
-    Dep_cc
-    Dep_ccomp
-    Dep_complm
-    Dep_conj
-    Dep_csubj
-    Dep_csubjpass
-    Dep_dep
-    Dep_det
-    Dep_dobj
-    Dep_expl
-    Dep_hmod
-    Dep_hyph
-    Dep_infmod
-    Dep_intj
-    Dep_iobj
-    Dep_mark
-    Dep_meta
-    Dep_neg
-    Dep_nmod
-    Dep_nn
-    Dep_npadvmod
-    Dep_nsubj
-    Dep_nsubjpass
-    Dep_num
-    Dep_number
-    Dep_oprd
-    Dep_parataxis
-    Dep_partmod
-    Dep_pcomp
-    Dep_pobj
-    Dep_poss
-    Dep_possessive
-    Dep_preconj
-    Dep_prep
-    Dep_prt
-    Dep_punct
-    Dep_quantmod
-    Dep_rcmod
-    Dep_root
-    Dep_xcomp
+    acomp
+    advcl
+    advmod
+    agent
+    amod
+    appos
+    attr
+    aux
+    auxpass
+    cc
+    ccomp
+    complm
+    conj
+    csubj
+    csubjpass
+    dep
+    det
+    dobj
+    expl
+    hmod
+    hyph
+    infmod
+    intj
+    iobj
+    mark
+    meta
+    neg
+    nmod
+    nn
+    npadvmod
+    nsubj
+    nsubjpass
+    num
+    number
+    oprd
+    parataxis
+    partmod
+    pcomp
+    pobj
+    poss
+    possessive
+    preconj
+    prep
+    prt
+    punct
+    quantmod
+    rcmod
+    root
+    xcomp
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index 9a3d219d5..31b01db98 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -351,74 +351,74 @@ IDS = {
     "VerbType_mod ": VerbType_mod, # U,
     "VerbType_light ": VerbType_light, # U,
 
-    "Name_person": Name_person,
-    "Name_norp": Name_norp,
-    "Name_facility": Name_facility,
-    "Name_org": Name_org,
-    "Name_gpe": Name_gpe,
-    "Name_loc": Name_loc,
-    "Name_product": Name_product,
-    "Name_event": Name_event,
-    "Name_work_of_art": Name_work_of_art,
-    "Name_language": Name_language,
+    "PERSON": PERSON,
+    "NORP": NORP,
+    "FACILITY": FACILITY,
+    "ORG": ORG,
+    "GPE": GPE,
+    "LOC": LOC,
+    "PRODUCT": PRODUCT,
+    "EVENT": EVENT,
+    "WORK_OF_ART": WORK_OF_ART,
+    "LANGUAGE": LANGUAGE,
 
-    "Unit_date": Unit_date,
-    "Unit_time": Unit_time,
-    "Unit_percent": Unit_percent,
-    "Unit_money": Unit_money,
-    "Unit_quantity": Unit_quantity,
-    "Unit_ordinal": Unit_ordinal,
-    "Unit_cardinal": Unit_cardinal,
+    "DATE": DATE,
+    "TIME": TIME,
+    "PERCENT": PERCENT,
+    "MONEY": MONEY,
+    "QUANTITY": QUANTITY,
+    "ORDINAL": ORDINAL,
+    "CARDINAL": CARDINAL,
 
-    "Dep_acomp": Dep_acomp,
-    "Dep_advcl": Dep_advcl,
-    "Dep_advmod": Dep_advmod,
-    "Dep_agent": Dep_agent,
-    "Dep_amod": Dep_amod,
-    "Dep_appos": Dep_appos,
-    "Dep_attr": Dep_attr,
-    "Dep_aux": Dep_aux,
-    "Dep_auxpass": Dep_auxpass,
-    "Dep_cc": Dep_cc,
-    "Dep_ccomp": Dep_ccomp,
-    "Dep_complm": Dep_complm,
-    "Dep_conj": Dep_conj,
-    "Dep_csubj": Dep_csubj,
-    "Dep_csubjpass": Dep_csubjpass,
-    "Dep_dep": Dep_dep,
-    "Dep_det": Dep_det,
-    "Dep_dobj": Dep_dobj,
-    "Dep_expl": Dep_expl,
-    "Dep_hmod": Dep_hmod,
-    "Dep_hyph": Dep_hyph,
-    "Dep_infmod": Dep_infmod,
-    "Dep_intj": Dep_intj,
-    "Dep_iobj": Dep_iobj,
-    "Dep_mark": Dep_mark,
-    "Dep_meta": Dep_meta,
-    "Dep_neg": Dep_neg,
-    "Dep_nmod": Dep_nmod,
-    "Dep_nn": Dep_nn,
-    "Dep_npadvmod": Dep_npadvmod,
-    "Dep_nsubj": Dep_nsubj,
-    "Dep_nsubjpass": Dep_nsubjpass,
-    "Dep_num": Dep_num,
-    "Dep_number": Dep_number,
-    "Dep_oprd": Dep_oprd,
-    "Dep_parataxis": Dep_parataxis,
-    "Dep_partmod": Dep_partmod,
-    "Dep_pcomp": Dep_pcomp,
-    "Dep_pobj": Dep_pobj,
-    "Dep_poss": Dep_poss,
-    "Dep_possessive": Dep_possessive,
-    "Dep_preconj": Dep_preconj,
-    "Dep_prep": Dep_prep,
-    "Dep_prt": Dep_prt,
-    "Dep_punct": Dep_punct,
-    "Dep_quantmod": Dep_quantmod,
-    "Dep_rcmod": Dep_rcmod,
-    "Dep_root": Dep_root,
-    "Dep_xcomp": Dep_xcomp
+    "acomp": acomp,
+    "advcl": advcl,
+    "advmod": advmod,
+    "agent": agent,
+    "amod": amod,
+    "appos": appos,
+    "attr": attr,
+    "aux": aux,
+    "auxpass": auxpass,
+    "cc": cc,
+    "ccomp": ccomp,
+    "complm": complm,
+    "conj": conj,
+    "csubj": csubj,
+    "csubjpass": csubjpass,
+    "dep": dep,
+    "det": det,
+    "dobj": dobj,
+    "expl": expl,
+    "hmod": hmod,
+    "hyph": hyph,
+    "infmod": infmod,
+    "intj": intj,
+    "iobj": iobj,
+    "mark": mark,
+    "meta": meta,
+    "neg": neg,
+    "nmod": nmod,
+    "nn": nn,
+    "npadvmod": npadvmod,
+    "nsubj": nsubj,
+    "nsubjpass": nsubjpass,
+    "num": num,
+    "number": number,
+    "oprd": oprd,
+    "parataxis": parataxis,
+    "partmod": partmod,
+    "pcomp": pcomp,
+    "pobj": pobj,
+    "poss": poss,
+    "possessive": possessive,
+    "preconj": preconj,
+    "prep": prep,
+    "prt": prt,
+    "punct": punct,
+    "quantmod": quantmod,
+    "rcmod": rcmod,
+    "root": root,
+    "xcomp": xcomp
 }
 
 NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])]

From 41012907a83a0684cc6044d9e5b7dcf2cbc704db Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 12 Oct 2015 00:51:43 +1100
Subject: [PATCH 16/22] * Fix variable name

---
 spacy/vocab.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 0f43967bb..af9161d6b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -271,17 +271,17 @@ cdef class Vocab:
             i += 1
         fp.close()
 
-    def load_vectors(self, loc_or_file):
+    def load_vectors(self, file_):
         cdef LexemeC* lexeme
         cdef attr_t orth
         cdef int32_t vec_len = -1
-        for line_num, line in enumerate(loc_or_file):
+        for line_num, line in enumerate(file_):
             pieces = line.split()
             word_str = pieces.pop(0)
             if vec_len == -1:
                 vec_len = len(pieces)
             elif vec_len != len(pieces):
-                raise VectorReadError.mismatched_sizes(loc_or_file, line_num,
+                raise VectorReadError.mismatched_sizes(file_, line_num,
                                                         vec_len, len(pieces))
             orth = self.strings[word_str]
             lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)

From 0cee928467abf01007fd73433cc5a3e387f24883 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 12 Oct 2015 15:12:32 +1100
Subject: [PATCH 17/22] * Allow StringStore to be pickled, to start addressing
 Issue #125

---
 spacy/strings.pyx          | 13 ++++++++++++-
 tests/vocab/test_intern.py | 17 +++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index a4a470158..2208d3bdf 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -69,12 +69,15 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
 
 cdef class StringStore:
     '''Map strings to and from integer IDs.'''
-    def __init__(self):
+    def __init__(self, strings=None):
         self.mem = Pool()
         self._map = PreshMap()
         self._resize_at = 10000
         self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
         self.size = 1
+        if strings is not None:
+            for string in strings:
+                _ = self[string]
 
     property size:
         def __get__(self):
@@ -113,6 +116,14 @@ cdef class StringStore:
         for i in range(self.size):
             yield self[i]
 
+    def __reduce__(self):
+        strings = [""]
+        for i in range(1, self.size):
+            string = &self.c[i]
+            py_string = _decode(string)
+            strings.append(py_string)
+        return (StringStore, (strings,), None, None, None)
+
     cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
         # 0 means missing, but we don't bother offsetting the index.
         key = hash64(chars, length * sizeof(char), 0)
diff --git a/tests/vocab/test_intern.py b/tests/vocab/test_intern.py
index 6e007c645..256706c6f 100644
--- a/tests/vocab/test_intern.py
+++ b/tests/vocab/test_intern.py
@@ -1,5 +1,7 @@
 # -*- coding: utf8 -*-
 from __future__ import unicode_literals
+import pickle
+import StringIO
 
 from spacy.strings import StringStore
 
@@ -76,3 +78,18 @@ def test_massive_strings(sstore):
     s513 = '1' * 513
     orth = sstore[s513]
     assert sstore[orth] == s513
+
+
+def test_pickle_string_store(sstore):  # round-trip a StringStore through pickle
+    hello_id = sstore[u'Hi']  # integer ID assigned to 'Hi' before pickling
+    string_file = StringIO.StringIO()  # in-memory buffer (Python 2 StringIO module)
+    pickle.dump(sstore, string_file)
+
+    string_file.seek(0)  # rewind so load() reads from the start
+    
+    loaded = pickle.load(string_file)
+
+    assert loaded[hello_id] == u'Hi'  # the same ID must map back to the same string
+
+
+

From dfe0ad51ffcc587e91b58d6475bb66066b1dfa01 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 12 Oct 2015 15:16:59 +1100
Subject: [PATCH 18/22] * Add pickle test for lemmatizer

---
 tests/tagger/test_lemmatizer.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py
index ff10b6573..5dfdaabb1 100644
--- a/tests/tagger/test_lemmatizer.py
+++ b/tests/tagger/test_lemmatizer.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
+import StringIO
+import pickle
 
 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
 from spacy.en import LOCAL_DATA_DIR
@@ -41,3 +43,12 @@ def test_smart_quotes(lemmatizer):
     do = lemmatizer.punct
     assert do('“') == set(['"'])
     assert do('“') == set(['"'])
+
+
+def test_pickle_lemmatizer(lemmatizer):  # smoke test: passes if dump/load do not raise
+    file_ = StringIO.StringIO()
+    pickle.dump(lemmatizer, file_)
+
+    file_.seek(0)  # rewind so load() reads from the start
+    
+    loaded = pickle.load(file_)  # NOTE(review): `loaded` is never inspected or compared

From 5ca57bd859a0ea57108baab454ac5bb073e62afb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 12 Oct 2015 15:27:47 +1100
Subject: [PATCH 19/22] * Ensure Morphology can be pickled, to address Issue
 #125.

---
 spacy/morphology.pxd            |  1 +
 spacy/morphology.pyx            |  4 ++++
 tests/morphology/test_pickle.py | 17 +++++++++++++++++
 3 files changed, 22 insertions(+)
 create mode 100644 tests/morphology/test_pickle.py

diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 62d3fccc1..847626158 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -25,6 +25,7 @@ cdef class Morphology:
     cdef readonly Pool mem
     cdef readonly StringStore strings
     cdef public object lemmatizer
+    cdef readonly object tag_map
     cdef public object n_tags
     cdef public object reverse_index
     cdef public object tag_names
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index c53e5f478..e8b1f3520 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -14,6 +14,7 @@ cdef class Morphology:
     def __init__(self, StringStore string_store, tag_map, lemmatizer):
         self.mem = Pool()
         self.strings = string_store
+        self.tag_map = tag_map
         self.lemmatizer = lemmatizer
         self.n_tags = len(tag_map) + 1
         self.tag_names = tuple(sorted(tag_map.keys()))
@@ -28,6 +29,9 @@ cdef class Morphology:
             self.reverse_index[self.rich_tags[i].name] = i
         self._cache = PreshMapArray(self.n_tags)
 
+    def __reduce__(self):  # pickle support: re-run __init__ with the original constructor arguments
+        return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)  # tag_map is the object stored by __init__
+
     cdef int assign_tag(self, TokenC* token, tag) except -1:
         cdef int tag_id
         if isinstance(tag, basestring):
diff --git a/tests/morphology/test_pickle.py b/tests/morphology/test_pickle.py
new file mode 100644
index 000000000..f1b5bcd4c
--- /dev/null
+++ b/tests/morphology/test_pickle.py
@@ -0,0 +1,17 @@
+import pytest
+
+import pickle
+import StringIO
+
+
+from spacy.morphology import Morphology
+from spacy.lemmatizer import Lemmatizer
+from spacy.strings import StringStore
+
+
+def test_pickle():  # smoke test: only checks that dumping an empty Morphology does not raise
+    morphology = Morphology(StringStore(), {}, Lemmatizer({}, {}, {})) 
+
+    file_ = StringIO.StringIO()
+    pickle.dump(morphology, file_)  # NOTE(review): the pickle is never loaded back
+

From 85e7944572f047b2fb4d26986c0b416dffa1e3d9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 12 Oct 2015 16:41:31 +1100
Subject: [PATCH 20/22] * Start trying to pickle Vocab

---
 spacy/vocab.pxd |  2 --
 spacy/vocab.pyx | 17 +++++++++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 929c7b345..d850bf929 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -25,7 +25,6 @@ cdef struct _Cached:
 
 
 cdef class Vocab:
-    cpdef public lexeme_props_getter
     cdef Pool mem
     cpdef readonly StringStore strings
     cpdef readonly Morphology morphology
@@ -33,7 +32,6 @@ cdef class Vocab:
     cdef public object _serializer
     cdef public object data_dir
     cdef public object get_lex_attr
-    cdef public object pos_tags
     cdef public object serializer_freqs
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index af9161d6b..7f07a64ba 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -10,6 +10,8 @@ from os import path
 import io
 import math
 import json
+import tempfile
+import copy_reg
 
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
@@ -96,6 +98,18 @@ cdef class Vocab:
         """The current number of lexemes stored."""
         return self.length
 
+    def __reduce__(self):
+        tmp_dir = tempfile.mkdtmp()
+        lex_loc = path.join(tmp_dir, 'lexemes.bin')
+        str_loc = path.join(tmp_dir, 'strings.txt')
+        map_loc = path.join(tmp_dir, 'tag_map.json')
+
+        self.dump(lex_loc)
+        self.strings.dump(str_loc)
+        json.dump(self.morphology.tag_map, open(map_loc, 'w'))
+
+        return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None)
+
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool.  If the pool
@@ -339,6 +353,9 @@ cdef class Vocab:
         return vec_len
 
 
+copy_reg.constructor(Vocab.from_dir)
+
+
 def write_binary_vectors(in_loc, out_loc):
     cdef CFile out_file = CFile(out_loc, 'wb')
     cdef Address mem

From f8de403483f587e39ddd7148807d305d762b7736 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 12 Oct 2015 17:00:01 +1100
Subject: [PATCH 21/22] * Work on pickling Vocab instances. The current
 implementation is not correct, but it may serve to see whether this approach
 is workable. Pickling is necessary to address Issue #125

---
 spacy/vocab.pyx           | 12 +++++++++---
 tests/vocab/test_vocab.py | 11 +++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 7f07a64ba..dd6792104 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -99,7 +99,7 @@ cdef class Vocab:
         return self.length
 
     def __reduce__(self):
-        tmp_dir = tempfile.mkdtmp()
+        tmp_dir = tempfile.mkdtemp()
         lex_loc = path.join(tmp_dir, 'lexemes.bin')
         str_loc = path.join(tmp_dir, 'strings.txt')
         map_loc = path.join(tmp_dir, 'tag_map.json')
@@ -108,7 +108,7 @@ cdef class Vocab:
         self.strings.dump(str_loc)
         json.dump(self.morphology.tag_map, open(map_loc, 'w'))
 
-        return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None)
+        return (unpickle_vocab, (tmp_dir,), None, None)
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
@@ -353,7 +353,13 @@ cdef class Vocab:
         return vec_len
 
 
-copy_reg.constructor(Vocab.from_dir)
+def unpickle_vocab(data_dir):
+    # TODO: This needs fixing --- the trouble is, we can't pickle staticmethods,
+    # so we need to fiddle with the design of Language a little bit.
+    from .language import Language
+    return Vocab.from_dir(data_dir, Language.default_lex_attrs())
+
+copy_reg.constructor(unpickle_vocab)
 
 
 def write_binary_vectors(in_loc, out_loc):
diff --git a/tests/vocab/test_vocab.py b/tests/vocab/test_vocab.py
index 153e0d546..1ab3746f3 100644
--- a/tests/vocab/test_vocab.py
+++ b/tests/vocab/test_vocab.py
@@ -1,10 +1,13 @@
 from __future__ import unicode_literals
 import pytest
+import StringIO
+import pickle
 
 from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
 from spacy.parts_of_speech import NOUN, VERB
 
 
+
 def test_neq(en_vocab):
     addr = en_vocab['Hello']
     assert en_vocab['bye'].orth != addr.orth
@@ -38,3 +41,11 @@ def test_symbols(en_vocab):
     assert en_vocab.strings['ORTH'] == ORTH
     assert en_vocab.strings['PROB'] == PROB
     
+
+def test_pickle_vocab(en_vocab):
+    file_ = StringIO.StringIO()
+    pickle.dump(en_vocab, file_)
+
+    file_.seek(0)
+
+    loaded = pickle.load(file_)

From 20fd36a0f785fd447b66808480b7722d62ceb11e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 12 Oct 2015 19:33:11 +1100
Subject: [PATCH 22/22] * Very scrappy, likely buggy first-cut pickle
 implementation, to work on Issue #125: allow pickle for Apache Spark. The
 current implementation sends stuff to temp files, and does almost nothing to
 ensure all modifiable state is actually preserved. The Language() instance is
 a deep tree of extension objects, and if pickling during training, some of
 the C-data state is hard to preserve.

---
 spacy/_ml.pxd                      |  1 +
 spacy/_ml.pyx                      | 14 ++++++++++++++
 spacy/language.py                  |  6 ++++++
 spacy/matcher.pyx                  | 20 +++++++++++++-------
 spacy/syntax/parser.pyx            |  4 +++-
 spacy/syntax/transition_system.pxd |  2 ++
 spacy/syntax/transition_system.pyx | 10 ++++++++--
 spacy/tagger.pyx                   |  3 +++
 spacy/vocab.pyx                    | 30 +++++++++++++++++++++---------
 tests/parser/test_pickle.py        | 16 ++++++++++++++++
 tests/test_pickle.py               | 15 +++++++++++++++
 tests/vocab/test_vocab.py          |  4 ++--
 12 files changed, 104 insertions(+), 21 deletions(-)
 create mode 100644 tests/parser/test_pickle.py
 create mode 100644 tests/test_pickle.py

diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd
index c2c7ffded..b9a190b67 100644
--- a/spacy/_ml.pxd
+++ b/spacy/_ml.pxd
@@ -29,5 +29,6 @@ cdef class Model:
     cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
     
     cdef object model_loc
+    cdef object _templates
     cdef Extractor _extractor
     cdef LinearModel _model
diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx
index 56c080fa6..bc789e7d6 100644
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 from __future__ import division
 
 from os import path
+import tempfile
 import os
 import shutil
 import json
@@ -52,6 +53,7 @@ cdef class Model:
     def __init__(self, n_classes, templates, model_loc=None):
         if model_loc is not None and path.isdir(model_loc):
             model_loc = path.join(model_loc, 'model')
+        self._templates = templates
         self.n_classes = n_classes
         self._extractor = Extractor(templates)
         self.n_feats = self._extractor.n_templ
@@ -60,6 +62,18 @@ cdef class Model:
         if self.model_loc and path.exists(self.model_loc):
             self._model.load(self.model_loc, freq_thresh=0)
 
+    def __reduce__(self):
+        """Pickle by dumping the weights to a temp file passed back as model_loc."""
+        # TODO: This is a potentially buggy implementation. Learning state
+        # (e.g. perceptron averages, AdaGrad gradients) isn't necessarily
+        # saved, so pickling part way through training won't recover it.
+        # mkstemp() returns (fd, path): close the descriptor, keep the path.
+        fd, model_loc = tempfile.mkstemp()
+        os.close(fd)
+        self._model.dump(model_loc)
+        return (Model, (self.n_classes, self._templates, model_loc),
+                None, None)
+
     def predict(self, Example eg):
         self.set_scores(eg.c.scores, eg.c.atoms)
         eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
diff --git a/spacy/language.py b/spacy/language.py
index ba4c048d7..65425bc45 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -207,6 +207,12 @@ class Language(object):
         self.entity = entity
         self.matcher = matcher
 
+    def __reduce__(self):  # pickle support: rebuild by calling the class with the existing components
+        return (self.__class__,
+                  (None, self.vocab, self.tokenizer, self.tagger, self.parser,  # NOTE(review): leading None is positional; confirm it matches __init__'s first parameter
+                   self.entity, self.matcher, None),  # NOTE(review): trailing None likewise; confirm against __init__'s parameter order
+                None, None)  # no extra state or list items
+
     def __call__(self, text, tag=True, parse=True, entity=True):
         """Apply the pipeline to some text.  The text can span multiple sentences,
         and can contain arbtrary whitespace.  Alignment into the original string
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 3ee825932..2bf8370b5 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -168,13 +168,7 @@ cdef class Matcher:
     cdef Pool mem
     cdef vector[Pattern*] patterns
     cdef readonly Vocab vocab
-
-    def __init__(self, vocab, patterns):
-        self.vocab = vocab
-        self.mem = Pool()
-        self.vocab = vocab
-        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
-            self.add(entity_key, etype, attrs, specs)
+    cdef object _patterns
 
     @classmethod
     def from_dir(cls, data_dir, Vocab vocab):
@@ -186,10 +180,22 @@ cdef class Matcher:
         else:
             return cls(vocab, {})
 
+    def __init__(self, vocab, patterns):
+        self.vocab = vocab
+        self.mem = Pool()
+        # Keep the raw pattern specs so __reduce__ can rebuild the matcher.
+        self._patterns = dict(patterns)
+        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
+            self.add(entity_key, etype, attrs, specs)
+
+    def __reduce__(self):  # pickle support: rebuild from the vocab and the recorded pattern specs
+        return (self.__class__, (self.vocab, self._patterns), None, None)  # same (vocab, patterns) order as __init__
+    
     property n_patterns:
         def __get__(self): return self.patterns.size()
 
     def add(self, entity_key, etype, attrs, specs):
+        self._patterns[entity_key] = (etype, dict(attrs), list(specs))
         if isinstance(entity_key, basestring):
             entity_key = self.vocab.strings[entity_key]
         if isinstance(etype, basestring):
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index cf61647b9..25932a0a4 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -83,7 +83,6 @@ cdef class Parser:
         model = Model(moves.n_moves, templates, model_dir)
         return cls(strings, moves, model)
 
-
     def __call__(self, Doc tokens):
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
         self.moves.initialize_state(stcls)
@@ -93,6 +92,9 @@ cdef class Parser:
         self.parse(stcls, eg.c)
         tokens.set_parse(stcls._sent)
 
+    def __reduce__(self):  # pickle support: rebuild from the transition system and the model
+        return (Parser, (self.moves.strings, self.moves, self.model), None, None)  # same order as the cls(strings, moves, model) call above
+
     cdef void predict(self, StateClass stcls, ExampleC* eg) nogil:
         memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
         self.moves.set_valid(eg.is_valid, stcls)
diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd
index 4cf9aae7e..38bc91605 100644
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@@ -37,6 +37,8 @@ cdef class TransitionSystem:
     cdef public int root_label
     cdef public freqs
 
+    cdef object _labels_by_action
+
     cdef int initialize_state(self, StateClass state) except -1
     cdef int finalize_state(self, StateClass state) nogil
 
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 86aef1fbc..5de3513e0 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -15,7 +15,8 @@ class OracleError(Exception):
 
 
 cdef class TransitionSystem:
-    def __init__(self, StringStore string_table, dict labels_by_action):
+    def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None):
+        self._labels_by_action = labels_by_action
         self.mem = Pool()
         self.n_moves = sum(len(labels) for labels in labels_by_action.values())
         self._is_valid = <bint*>self.mem.alloc(self.n_moves, sizeof(bint))
@@ -30,7 +31,7 @@ cdef class TransitionSystem:
                 i += 1
         self.c = moves
         self.root_label = self.strings['ROOT']
-        self.freqs = {}
+        self.freqs = {} if _freqs is None else _freqs
         for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
             self.freqs[attr] = defaultdict(int)
             self.freqs[attr][0] = 1
@@ -39,6 +40,11 @@ cdef class TransitionSystem:
             self.freqs[HEAD][i] = 1
             self.freqs[HEAD][-i] = 1
 
+    def __reduce__(self):  # pickle support: rebuild via __init__(string_table, labels_by_action, _freqs)
+        return (self.__class__,
+                (self.strings, self._labels_by_action, self.freqs),
+                None, None)  # NOTE(review): __init__ overwrites freqs[attr] for TAG/HEAD/DEP/ENT_TYPE/ENT_IOB, so those counts are reset on unpickle
+
     cdef int initialize_state(self, StateClass state) except -1:
         pass
 
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 756bb7ea4..69925ff89 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -148,6 +148,9 @@ cdef class Tagger:
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
+    def __reduce__(self):  # pickle support: rebuild from the vocab and the model
+        return (self.__class__, (self.vocab, self.model), None, None)  # assumes __init__ accepts (vocab, model) positionally; TODO confirm
+
     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index dd6792104..023d0bd89 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -99,16 +99,18 @@ cdef class Vocab:
         return self.length
 
     def __reduce__(self):
+        # TODO: Dump vectors
         tmp_dir = tempfile.mkdtemp()
         lex_loc = path.join(tmp_dir, 'lexemes.bin')
         str_loc = path.join(tmp_dir, 'strings.txt')
-        map_loc = path.join(tmp_dir, 'tag_map.json')
+        vec_loc = path.join(self.data_dir, 'vec.bin') if self.data_dir is not None else None
 
         self.dump(lex_loc)
         self.strings.dump(str_loc)
-        json.dump(self.morphology.tag_map, open(map_loc, 'w'))
-
-        return (unpickle_vocab, (tmp_dir,), None, None)
+        
+        state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr,
+                 self.serializer_freqs, self.data_dir)
+        return (unpickle_vocab, state, None, None)
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
@@ -353,11 +355,21 @@ cdef class Vocab:
         return vec_len
 
 
-def unpickle_vocab(data_dir):
-    # TODO: This needs fixing --- the trouble is, we can't pickle staticmethods,
-    # so we need to fiddle with the design of Language a little bit.
-    from .language import Language
-    return Vocab.from_dir(data_dir, Language.default_lex_attrs())
+def unpickle_vocab(strings_loc, lex_loc, vec_loc, morphology, get_lex_attr,
+                   serializer_freqs, data_dir):  # parameters mirror the state tuple built in Vocab.__reduce__
+    cdef Vocab vocab = Vocab()  # start from a default-constructed Vocab, then overwrite its fields
+
+    vocab.get_lex_attr = get_lex_attr
+    vocab.morphology = morphology
+    vocab.strings = morphology.strings  # reuse the morphology's StringStore
+    vocab.data_dir = data_dir
+    vocab.serializer_freqs = serializer_freqs
+
+    vocab.load_lexemes(strings_loc, lex_loc)  # reads the temp files written by Vocab.__reduce__
+    if vec_loc is not None:  # vec_loc is None when data_dir was None at pickling time
+        vocab.load_vectors_from_bin_loc(vec_loc)
+    return vocab
+ 
 
 copy_reg.constructor(unpickle_vocab)
 
diff --git a/tests/parser/test_pickle.py b/tests/parser/test_pickle.py
new file mode 100644
index 000000000..b1b768650
--- /dev/null
+++ b/tests/parser/test_pickle.py
@@ -0,0 +1,16 @@
+import pytest
+
+import pickle
+import cloudpickle
+import StringIO
+
+
+@pytest.mark.models
+def test_pickle(EN):  # smoke test: passes if the parser can be dumped and loaded without raising
+    file_ = StringIO.StringIO()
+    cloudpickle.dump(EN.parser, file_)  # serialised with cloudpickle ...
+
+    file_.seek(0)  # rewind so load() reads from the start
+
+    loaded = pickle.load(file_)  # ... and deserialised with the stdlib pickle; `loaded` is not inspected
+
diff --git a/tests/test_pickle.py b/tests/test_pickle.py
new file mode 100644
index 000000000..02d908b0d
--- /dev/null
+++ b/tests/test_pickle.py
@@ -0,0 +1,15 @@
+import pytest
+import StringIO
+import cloudpickle
+import pickle
+
+
+@pytest.mark.models
+def test_pickle_english(EN):  # smoke test: passes if the whole EN object can be dumped and loaded without raising
+    file_ = StringIO.StringIO()
+    cloudpickle.dump(EN, file_)  # serialised with cloudpickle ...
+
+    file_.seek(0)  # rewind so load() reads from the start
+
+    loaded = pickle.load(file_)  # ... and deserialised with the stdlib pickle; `loaded` is not inspected
+
diff --git a/tests/vocab/test_vocab.py b/tests/vocab/test_vocab.py
index 1ab3746f3..76e8d27dd 100644
--- a/tests/vocab/test_vocab.py
+++ b/tests/vocab/test_vocab.py
@@ -1,13 +1,13 @@
 from __future__ import unicode_literals
 import pytest
 import StringIO
+import cloudpickle
 import pickle
 
 from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
 from spacy.parts_of_speech import NOUN, VERB
 
 
-
 def test_neq(en_vocab):
     addr = en_vocab['Hello']
     assert en_vocab['bye'].orth != addr.orth
@@ -44,7 +44,7 @@ def test_symbols(en_vocab):
 
 def test_pickle_vocab(en_vocab):
     file_ = StringIO.StringIO()
-    pickle.dump(en_vocab, file_)
+    cloudpickle.dump(en_vocab, file_)
 
     file_.seek(0)