Merge branch 'attrs'

2015-10-13 14:03:41 +11:00 · 2015-10-13 14:03:41 +11:00 · c1fdc487bc
parent 38109dd912 20fd36a0f7
commit c1fdc487bc
33 changed files with 1682 additions and 755 deletions
--- a/bin/init_model.py
+++ b/bin/init_model.py
@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
            probs[word] = oov_prob

    lexicon = []
+    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
+        # First encode the strings into the StringStore. This way, we can map
+        # the orth IDs to frequency ranks
+        orth = vocab.strings[word]
+    # Now actually load the vocab
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        lexeme = vocab[word]
        lexeme.prob = prob
--- a/lang_data/en/morphs.json
+++ b/lang_data/en/morphs.json
@ -56,5 +56,4 @@
        "was":  {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
        "were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
    }
-
 }
--- a/lang_data/en/tag_map.json
+++ b/lang_data/en/tag_map.json
@ -22,7 +22,7 @@
 "JJS": {"pos": "adj", "degree": "sup"},
 "LS": {"pos": "punct", "numtype": "ord"},
 "MD": {"pos": "verb", "verbtype": "mod"},
-"NIL": {"pos": "no_tag"},
+"NIL": {"pos": ""},
 "NN": {"pos": "noun", "number": "sing"},
 "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
 "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
--- a/setup.py
+++ b/setup.py
@ -166,7 +166,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
             'spacy.cfile', 'spacy.matcher',
-             'spacy.syntax.ner']
+             'spacy.syntax.ner',
+             'spacy.symbols']


 if __name__ == '__main__':
--- a/spacy/_ml.pxd
+++ b/spacy/_ml.pxd
@ -29,5 +29,6 @@ cdef class Model:
    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
    
    cdef object model_loc
+    cdef object _templates
    cdef Extractor _extractor
    cdef LinearModel _model
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@ -3,6 +3,7 @@ from __future__ import unicode_literals
 from __future__ import division

 from os import path
+import tempfile
 import os
 import shutil
 import json
@ -52,6 +53,7 @@ cdef class Model:
    def __init__(self, n_classes, templates, model_loc=None):
        if model_loc is not None and path.isdir(model_loc):
            model_loc = path.join(model_loc, 'model')
+        self._templates = templates
        self.n_classes = n_classes
        self._extractor = Extractor(templates)
        self.n_feats = self._extractor.n_templ
@ -60,6 +62,18 @@ cdef class Model:
        if self.model_loc and path.exists(self.model_loc):
            self._model.load(self.model_loc, freq_thresh=0)

+    def __reduce__(self):
+        """Pickle support: dump the weights to a temp file and rebuild from it.
+
+        Returns the (callable, args, state, listitems) tuple that pickle
+        expects: the Model class, to be called again with
+        (n_classes, templates, model_loc). The rebuilt Model loads its
+        weights from model_loc in __init__, so that file must still exist
+        when the pickle is loaded.
+        """
+        # mkstemp() returns an (OS-level file descriptor, path) pair, not a
+        # bare path. Only the path is needed, so close the descriptor right
+        # away instead of leaking one per pickling.
+        fd, model_loc = tempfile.mkstemp()
+        os.close(fd)
+        # TODO: This is a potentially buggy implementation. We're not really
+        # given a good guarantee that all internal state is saved correctly here,
+        # since there are learning parameters for e.g. the model averaging in
+        # averaged perceptron, the gradient calculations in AdaGrad, etc
+        # that aren't necessarily saved. So, if we're part way through training
+        # the model, and then we pickle it, we won't recover the state correctly.
+        # NOTE(review): the temporary file is never removed by this class.
+        self._model.dump(model_loc)
+        # __init__ stores the templates on `_templates`; that is the attribute
+        # to hand back to the constructor.
+        return (Model, (self.n_classes, self._templates, model_loc),
+                None, None)
+
    def predict(self, Example eg):
        self.set_scores(eg.c.scores, eg.c.atoms)
        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -1,5 +1,6 @@
 # Reserve 64 values for flag features
 cpdef enum attr_id_t:
+    NULL_ATTR
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
@ -14,8 +15,7 @@ cpdef enum attr_id_t:
    IS_STOP
    IS_OOV
    
-    FLAG13 = 13
-    FLAG14
+    FLAG14 = 14
    FLAG15
    FLAG16
    FLAG17
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -0,0 +1,90 @@
+# Lookup table from an attribute's string name to its attr_id_t enum value.
+# The empty string maps to NULL_ATTR. Grouped as: boolean lexical flags,
+# the reserved FLAG14..FLAG63 slots, then the non-flag attributes.
+IDS = {
+    "": NULL_ATTR,
+    "IS_ALPHA": IS_ALPHA,
+    "IS_ASCII": IS_ASCII,
+    "IS_DIGIT": IS_DIGIT,
+    "IS_LOWER": IS_LOWER,
+    "IS_PUNCT": IS_PUNCT,
+    "IS_SPACE": IS_SPACE,
+    "IS_TITLE": IS_TITLE,
+    "IS_UPPER": IS_UPPER,
+    "LIKE_URL": LIKE_URL,
+    "LIKE_NUM": LIKE_NUM,
+    "LIKE_EMAIL": LIKE_EMAIL,
+    "IS_STOP": IS_STOP,
+    "IS_OOV": IS_OOV,
+
+    "FLAG14": FLAG14,
+    "FLAG15": FLAG15,
+    "FLAG16": FLAG16,
+    "FLAG17": FLAG17,
+    "FLAG18": FLAG18,
+    "FLAG19": FLAG19,
+    "FLAG20": FLAG20,
+    "FLAG21": FLAG21,
+    "FLAG22": FLAG22,
+    "FLAG23": FLAG23,
+    "FLAG24": FLAG24,
+    "FLAG25": FLAG25,
+    "FLAG26": FLAG26,
+    "FLAG27": FLAG27,
+    "FLAG28": FLAG28,
+    "FLAG29": FLAG29,
+    "FLAG30": FLAG30,
+    "FLAG31": FLAG31,
+    "FLAG32": FLAG32,
+    "FLAG33": FLAG33,
+    "FLAG34": FLAG34,
+    "FLAG35": FLAG35,
+    "FLAG36": FLAG36,
+    "FLAG37": FLAG37,
+    "FLAG38": FLAG38,
+    "FLAG39": FLAG39,
+    "FLAG40": FLAG40,
+    "FLAG41": FLAG41,
+    "FLAG42": FLAG42,
+    "FLAG43": FLAG43,
+    "FLAG44": FLAG44,
+    "FLAG45": FLAG45,
+    "FLAG46": FLAG46,
+    "FLAG47": FLAG47,
+    "FLAG48": FLAG48,
+    "FLAG49": FLAG49,
+    "FLAG50": FLAG50,
+    "FLAG51": FLAG51,
+    "FLAG52": FLAG52,
+    "FLAG53": FLAG53,
+    "FLAG54": FLAG54,
+    "FLAG55": FLAG55,
+    "FLAG56": FLAG56,
+    "FLAG57": FLAG57,
+    "FLAG58": FLAG58,
+    "FLAG59": FLAG59,
+    "FLAG60": FLAG60,
+    "FLAG61": FLAG61,
+    "FLAG62": FLAG62,
+    "FLAG63": FLAG63,
+
+    "ID": ID,
+    "ORTH": ORTH,
+    "LOWER": LOWER,
+    "NORM": NORM,
+    "SHAPE": SHAPE,
+    "PREFIX": PREFIX,
+    "SUFFIX": SUFFIX,
+
+    "LENGTH": LENGTH,
+    "CLUSTER": CLUSTER,
+    "LEMMA": LEMMA,
+    "POS": POS,
+    "TAG": TAG,
+    "DEP": DEP,
+    "ENT_IOB": ENT_IOB,
+    "ENT_TYPE": ENT_TYPE,
+    "HEAD": HEAD,
+    "SPACY": SPACY,
+    "PROB": PROB,
+}
+
+# Attribute names (the keys of IDS), sorted by ascending attribute ID.
+# NOTE(review): NAMES[attr_id] only yields the matching name if the IDs are
+# contiguous starting at 0 -- confirm against attrs.pxd.
+NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
--- a/spacy/language.py
+++ b/spacy/language.py
@ -207,6 +207,12 @@ class Language(object):
        self.entity = entity
        self.matcher = matcher

+    def __reduce__(self):
+        """Pickle support: rebuild the pipeline through its own constructor.
+
+        The already-built vocab, tokenizer, tagger, parser, entity recogniser
+        and matcher are handed back positionally; the first and last
+        constructor arguments are passed as None.
+        """
+        init_args = (None, self.vocab, self.tokenizer, self.tagger,
+                     self.parser, self.entity, self.matcher, None)
+        return (self.__class__, init_args, None, None)
+
    def __call__(self, text, tag=True, parse=True, entity=True):
        """Apply the pipeline to some text.  The text can span multiple sentences,
        and can contain arbitrary whitespace.  Alignment into the original string
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -15,7 +15,7 @@ from libcpp.vector cimport vector
 from murmurhash.mrmr cimport hash64

 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
-from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
+from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab
@ -168,13 +168,7 @@ cdef class Matcher:
    cdef Pool mem
    cdef vector[Pattern*] patterns
    cdef readonly Vocab vocab
-
-    def __init__(self, vocab, patterns):
-        self.vocab = vocab
-        self.mem = Pool()
-        self.vocab = vocab
-        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
-            self.add(entity_key, etype, attrs, specs)
+    cdef object _patterns

    @classmethod
    def from_dir(cls, data_dir, Vocab vocab):
@ -186,10 +180,22 @@ cdef class Matcher:
        else:
            return cls(vocab, {})

+    def __init__(self, vocab, patterns):
+        """Create a matcher and register the initial patterns.
+
+        vocab: stored on the instance; `add` uses `vocab.strings` to convert
+            string keys.
+        patterns: mapping of entity_key -> (etype, attrs, specs); each entry
+            is passed to `add`.
+        """
+        self.vocab = vocab
+        self.mem = Pool()
+        # Keep a Python-level copy of the patterns so __reduce__ can hand
+        # them back to the constructor when the matcher is pickled.
+        self._patterns = dict(patterns)
+        # Register in sorted key order so the order does not depend on how
+        # the mapping happens to iterate.
+        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
+            self.add(entity_key, etype, attrs, specs)
+
+    def __reduce__(self):
+        """Pickle support: rebuild from the vocab and the recorded patterns."""
+        init_args = (self.vocab, self._patterns)
+        return (self.__class__, init_args, None, None)
+    
    property n_patterns:
        def __get__(self):
            # Number of entries currently held in the `patterns` vector.
            return self.patterns.size()

    def add(self, entity_key, etype, attrs, specs):
+        self._patterns[entity_key] = (etype, dict(attrs), list(specs))
        if isinstance(entity_key, basestring):
            entity_key = self.vocab.strings[entity_key]
        if isinstance(etype, basestring):
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -7,6 +7,7 @@ from .strings cimport StringStore
 from .typedefs cimport attr_t
 from .parts_of_speech cimport univ_pos_t

+from . cimport symbols

 cdef struct RichTagC:
    uint64_t morph
@ -24,6 +25,7 @@ cdef class Morphology:
    cdef readonly Pool mem
    cdef readonly StringStore strings
    cdef public object lemmatizer
+    cdef readonly object tag_map
    cdef public object n_tags
    cdef public object reverse_index
    cdef public object tag_names
@ -36,720 +38,252 @@ cdef class Morphology:
    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1


+# Morphological feature values, one member per Feature_value pair.
+# NIL is 0. Animacy_anim is pinned to symbols.Animacy_anim, and every member
+# after it takes the next consecutive integer, so the declaration order below
+# determines the numeric values.
+# NOTE(review): presumably the order has to mirror the one in spacy/symbols so
+# the values line up -- confirm against symbols.pxd before reordering.
+cpdef enum univ_morph_t:
+    NIL = 0
+    Animacy_anim = symbols.Animacy_anim
+    Animacy_inam
+    Aspect_freq
+    Aspect_imp
+    Aspect_mod
+    Aspect_none
+    Aspect_perf
+    Case_abe
+    Case_abl
+    Case_abs
+    Case_acc
+    Case_ade
+    Case_all
+    Case_cau
+    Case_com
+    Case_dat
+    Case_del
+    Case_dis
+    Case_ela
+    Case_ess
+    Case_gen
+    Case_ill
+    Case_ine
+    Case_ins
+    Case_loc
+    Case_lat
+    Case_nom
+    Case_par
+    Case_sub
+    Case_sup
+    Case_tem
+    Case_ter
+    Case_tra
+    Case_voc
+    Definite_two
+    Definite_def
+    Definite_red
+    Definite_ind
+    Degree_cmp
+    Degree_comp
+    Degree_none
+    Degree_pos
+    Degree_sup
+    Degree_abs
+    Degree_com
+    Degree_dim # du
+    Gender_com
+    Gender_fem
+    Gender_masc
+    Gender_neut
+    Mood_cnd
+    Mood_imp
+    Mood_ind
+    Mood_n
+    Mood_pot
+    Mood_sub
+    Mood_opt
+    Negative_neg
+    Negative_pos
+    Negative_yes
+    Number_com
+    Number_dual
+    Number_none
+    Number_plur
+    Number_sing
+    Number_ptan # bg
+    Number_count # bg
+    NumType_card
+    NumType_dist
+    NumType_frac
+    NumType_gen
+    NumType_mult
+    NumType_none
+    NumType_ord
+    NumType_sets
+    Person_one
+    Person_two
+    Person_three
+    Person_none
+    Poss_yes
+    PronType_advPart
+    PronType_art
+    PronType_default
+    PronType_dem
+    PronType_ind
+    PronType_int
+    PronType_neg
+    PronType_prs
+    PronType_rcp
+    PronType_rel
+    PronType_tot
+    PronType_clit
+    PronType_exc # es, ca, it, fa
+    Reflex_yes
+    Tense_fut
+    Tense_imp
+    Tense_past
+    Tense_pres
+    VerbForm_fin
+    VerbForm_ger
+    VerbForm_inf
+    VerbForm_none
+    VerbForm_part
+    VerbForm_partFut
+    VerbForm_partPast
+    VerbForm_partPres
+    VerbForm_sup
+    VerbForm_trans
+    VerbForm_gdv # la
+    Voice_act
+    Voice_cau
+    Voice_pass
+    Voice_mid # gkc
+    Voice_int # hb
+    Abbr_yes # cz, fi, sl, U
+    AdpType_prep # cz, U
+    AdpType_post # U
+    AdpType_voc # cz
+    AdpType_comprep # cz
+    AdpType_circ # U
+    AdvType_man
+    AdvType_loc
+    AdvType_tim
+    AdvType_deg
+    AdvType_cau
+    AdvType_mod
+    AdvType_sta
+    AdvType_ex
+    AdvType_adadj
+    ConjType_oper # cz, U
+    ConjType_comp # cz, U
+    Connegative_yes # fi
+    Derivation_minen # fi
+    Derivation_sti # fi
+    Derivation_inen # fi
+    Derivation_lainen # fi
+    Derivation_ja # fi
+    Derivation_ton # fi
+    Derivation_vs # fi
+    Derivation_ttain # fi
+    Derivation_ttaa # fi
+    Echo_rdp # U
+    Echo_ech # U
+    Foreign_foreign # cz, fi, U
+    Foreign_fscript # cz, fi, U
+    Foreign_tscript # cz, U
+    Foreign_yes # sl
+    Gender_dat_masc # bq, U
+    Gender_dat_fem # bq, U
+    Gender_erg_masc # bq
+    Gender_erg_fem # bq
+    Gender_psor_masc # cz, sl, U
+    Gender_psor_fem # cz, sl, U
+    Gender_psor_neut # sl
+    Hyph_yes # cz, U
+    InfForm_one # fi
+    InfForm_two # fi
+    InfForm_three # fi
+    NameType_geo # U, cz
+    NameType_prs # U, cz
+    NameType_giv # U, cz
+    NameType_sur # U, cz
+    NameType_nat # U, cz
+    NameType_com # U, cz
+    NameType_pro # U, cz
+    NameType_oth # U, cz
+    NounType_com # U
+    NounType_prop # U
+    NounType_class # U
+    Number_abs_sing # bq, U
+    Number_abs_plur # bq, U
+    Number_dat_sing # bq, U
+    Number_dat_plur # bq, U
+    Number_erg_sing # bq, U
+    Number_erg_plur # bq, U
+    Number_psee_sing # U
+    Number_psee_plur # U
+    Number_psor_sing # cz, fi, sl, U
+    Number_psor_plur # cz, fi, sl, U
+    NumForm_digit # cz, sl, U
+    NumForm_roman # cz, sl, U
+    NumForm_word # cz, sl, U
+    NumValue_one # cz, U
+    NumValue_two # cz, U
+    NumValue_three # cz, U
+    PartForm_pres # fi
+    PartForm_past # fi
+    PartForm_agt # fi
+    PartForm_neg # fi
+    PartType_mod # U
+    PartType_emp # U
+    PartType_res # U
+    PartType_inf # U
+    PartType_vbp # U
+    Person_abs_one # bq, U
+    Person_abs_two # bq, U
+    Person_abs_three # bq, U
+    Person_dat_one # bq, U
+    Person_dat_two # bq, U
+    Person_dat_three # bq, U
+    Person_erg_one # bq, U
+    Person_erg_two # bq, U
+    Person_erg_three # bq, U
+    Person_psor_one # fi, U
+    Person_psor_two # fi, U
+    Person_psor_three # fi, U
+    Polite_inf # bq, U
+    Polite_pol # bq, U
+    Polite_abs_inf # bq, U
+    Polite_abs_pol # bq, U
+    Polite_erg_inf # bq, U
+    Polite_erg_pol # bq, U
+    Polite_dat_inf # bq, U
+    Polite_dat_pol # bq, U
+    Prefix_yes # U
+    PrepCase_npr # cz
+    PrepCase_pre # U
+    PunctSide_ini # U
+    PunctSide_fin # U
+    PunctType_peri # U
+    PunctType_qest # U
+    PunctType_excl # U
+    PunctType_quot # U
+    PunctType_brck # U
+    PunctType_comm # U
+    PunctType_colo # U
+    PunctType_semi # U
+    PunctType_dash # U
+    Style_arch # cz, fi, U
+    Style_rare # cz, fi, U
+    Style_poet # cz, U
+    Style_norm # cz, U
+    Style_coll # cz, U
+    Style_vrnc # cz, U
+    Style_sing # cz, U
+    Style_expr # cz, U
+    Style_derg # cz, U
+    Style_vulg # cz, U
+    Style_yes # fi, U
+    StyleVariant_styleShort # cz
+    StyleVariant_styleBound # cz, sl
+    VerbType_aux # U
+    VerbType_cop # U
+    VerbType_mod # U
+    VerbType_light # U

-#
-#cpdef enum Feature_t:
-#    Abbr
-#    AdpType
-#    AdvType
-#    ConjType
-#    Connegative
-#    Derivation
-#    Echo
-#    Foreign
-#    Gender_dat
-#    Gender_erg
-#    Gender_psor
-#    Hyph
-#    InfForm
-#    NameType
-#    NounType
-#    NumberAbs
-#    NumberDat
-#    NumberErg
-#    NumberPsee
-#    NumberPsor
-#    NumForm
-#    NumValue
-#    PartForm
-#    PartType
-#    Person_abs
-#    Person_dat
-#    Person_psor
-#    Polite
-#    Polite_abs
-#    Polite_dat
-#    Prefix
-#    PrepCase
-#    PunctSide
-#    PunctType
-#    Style
-#    Typo
-#    Variant
-#    VerbType
-#
-#
-#cpdef enum Animacy:
-#    Anim
-#    Inam
-#
-#
-#cpdef enum Aspect:
-#    Freq
-#    Imp
-#    Mod
-#    None_
-#    Perf
-#
-#
-#cpdef enum Case1:
-#    Nom
-#    Gen
-#    Acc
-#    Dat
-#    Voc
-#    Abl
-#    
-#cdef enum Case2:
-#    Abe
-#    Abs
-#    Ade
-#    All
-#    Cau
-#    Com
-#    Del
-#    Dis
-#
-#cdef enum Case3:
-#    Ela
-#    Ess
-#    Ill
-#    Ine
-#    Ins
-#    Loc
-#    Lat
-#    Par
-#
-#cdef enum Case4:
-#    Sub
-#    Sup
-#    Tem
-#    Ter
-#    Tra
-#
-#
-#cpdef enum Definite:
-#    Two
-#    Def
-#    Red
-#    Ind
-#
-#
-#cpdef enum Degree:
-#    Cmp
-#    Comp
-#    None_
-#    Pos
-#    Sup
-#    Abs
-#    Com
-#    Degree # du
-#
-#
-#cpdef enum Gender:
-#    Com
-#    Fem
-#    Masc
-#    Neut
-#
-#
-#cpdef enum Mood:
-#    Cnd
-#    Imp
-#    Ind
-#    N
-#    Pot
-#    Sub
-#    Opt
-#
-#
-#cpdef enum Negative:
-#    Neg
-#    Pos
-#    Yes
-#
-#
-#cpdef enum Number:
-#    Com
-#    Dual
-#    None_
-#    Plur
-#    Sing
-#    Ptan # bg
-#    Count # bg
-#
-#
-#cpdef enum NumType:
-#    Card
-#    Dist
-#    Frac
-#    Gen
-#    Mult
-#    None_
-#    Ord
-#    Sets
-#
-#
-#cpdef enum Person:
-#    One
-#    Two
-#    Three
-#    None_
-#
-#
-#cpdef enum Poss:
-#    Yes
-#
-#
-#cpdef enum PronType1:
-#    AdvPart
-#    Art
-#    Default
-#    Dem
-#    Ind
-#    Int
-#    Neg
-#
-#cpdef enum PronType2:
-#    Prs
-#    Rcp
-#    Rel
-#    Tot
-#    Clit
-#    Exc # es, ca, it, fa
-#    Clit # it
-#
-#
-#cpdef enum Reflex:
-#    Yes
-#
-#
-#cpdef enum Tense:
-#    Fut
-#    Imp
-#    Past
-#    Pres
-#
-#cpdef enum VerbForm1:
-#    Fin
-#    Ger
-#    Inf
-#    None_
-#    Part
-#    PartFut
-#    PartPast
-#
-#cpdef enum VerbForm2:
-#    PartPres
-#    Sup
-#    Trans
-#    Gdv # la
-#
-#
-#cpdef enum Voice:
-#    Act
-#    Cau
-#    Pass
-#    Mid # gkc
-#    Int # hb
-#
-#
-#cpdef enum Abbr:
-#    Yes # cz, fi, sl, U
-#
-#cpdef enum AdpType:
-#    Prep # cz, U
-#    Post # U
-#    Voc # cz
-#    Comprep # cz
-#    Circ # U
-#    Voc # U
-#
-#
-#cpdef enum AdvType1:
-#    # U
-#    Man
-#    Loc
-#    Tim
-#    Deg
-#    Cau
-#    Mod
-#    Sta
-#    Ex
-#
-#cpdef enum AdvType2:
-#    Adadj
-#
-#cpdef enum ConjType:
-#    Oper # cz, U
-#    Comp # cz, U
-#
-#cpdef enum Connegative:
-#    Yes # fi
-#
-#
-#cpdef enum Derivation1:
-#    Minen # fi
-#    Sti # fi
-#    Inen # fi
-#    Lainen # fi
-#    Ja # fi
-#    Ton # fi
-#    Vs # fi
-#    Ttain # fi
-#
-#cpdef enum Derivation2:
-#    Ttaa
-#
-#
-#cpdef enum Echo:
-#    Rdp # U
-#    Ech # U
-#
-#
-#cpdef enum Foreign:
-#    Foreign # cz, fi, U
-#    Fscript # cz, fi, U
-#    Tscript # cz, U
-#    Yes # sl
-#
-#
-#cpdef enum Gender_dat:
-#    Masc # bq, U
-#    Fem # bq, U
-#
-#
-#cpdef enum Gender_erg:
-#    Masc # bq
-#    Fem # bq
-#
-#
-#cpdef enum Gender_psor:
-#    Masc # cz, sl, U
-#    Fem # cz, sl, U
-#    Neut # sl
-#
-#
-#cpdef enum Hyph:
-#    Yes # cz, U
-#
-#
-#cpdef enum InfForm:
-#    One # fi
-#    Two # fi
-#    Three # fi
-#
-#
-#cpdef enum NameType:
-#    Geo # U, cz
-#    Prs # U, cz
-#    Giv # U, cz
-#    Sur # U, cz
-#    Nat # U, cz
-#    Com # U, cz
-#    Pro # U, cz
-#    Oth # U, cz
-#
-#
-#cpdef enum NounType:
-#    Com # U
-#    Prop # U
-#    Class # U
-#
-#cpdef enum Number_abs:
-#    Sing # bq, U
-#    Plur # bq, U
-#
-#cpdef enum Number_dat:
-#    Sing # bq, U
-#    Plur # bq, U
-#
-#cpdef enum Number_erg:
-#    Sing # bq, U
-#    Plur # bq, U
-#
-#cpdef enum Number_psee:
-#    Sing # U
-#    Plur # U
-#
-#
-#cpdef enum Number_psor:
-#    Sing # cz, fi, sl, U
-#    Plur # cz, fi, sl, U
-#
-#
-#cpdef enum NumForm:
-#    Digit # cz, sl, U
-#    Roman # cz, sl, U
-#    Word # cz, sl, U
-#
-#
-#cpdef enum NumValue:
-#    One # cz, U
-#    Two # cz, U
-#    Three # cz, U
-#
-#
-#cpdef enum PartForm:
-#    Pres # fi
-#    Past # fi
-#    Agt # fi
-#    Neg # fi
-#
-#
-#cpdef enum PartType:
-#    Mod # U
-#    Emp # U
-#    Res # U
-#    Inf # U
-#    Vbp # U
-#
-#cpdef enum Person_abs:
-#    One # bq, U
-#    Two # bq, U
-#    Three # bq, U
-#
-#
-#cpdef enum Person_dat:
-#    One # bq, U
-#    Two # bq, U
-#    Three # bq, U
-#
-#
-#cpdef enum Person_erg:
-#    One # bq, U
-#    Two # bq, U
-#    Three # bq, U
-#
-#
-#cpdef enum Person_psor:
-#    One # fi, U
-#    Two # fi, U
-#    Three # fi, U
-#
-#
-#cpdef enum Polite:
-#    Inf # bq, U
-#    Pol # bq, U
-#
-#
-#cpdef enum Polite_abs:
-#    Inf # bq, U
-#    Pol # bq, U
-#
-#
-#cpdef enum Polite_erg:
-#    Inf # bq, U
-#    Pol # bq, U
-#
-#
-#cpdef enum Polite_dat:
-#    Inf # bq, U
-#    Pol # bq, U
-#
-#
-#cpdef enum Prefix:
-#    Yes # U
-#
-#
-#cpdef enum PrepCase:
-#    Npr # cz
-#    Pre # U
-#
-#
-#cpdef enum PunctSide:
-#    Ini # U
-#    Fin # U
-#
-#cpdef enum PunctType1:
-#    Peri # U
-#    Qest # U
-#    Excl # U
-#    Quot # U
-#    Brck # U
-#    Comm # U
-#    Colo # U
-#    Semi # U
-#
-#cpdef enum PunctType2:
-#    Dash # U
-#
-#
-#cpdef enum Style1:
-#    Arch # cz, fi, U
-#    Rare # cz, fi, U
-#    Poet # cz, U
-#    Norm # cz, U
-#    Coll # cz, U
-#    Vrnc # cz, U
-#    Sing # cz, U
-#    Expr # cz, U
-#
-#
-#cpdef enum Style2:
-#    Derg # cz, U
-#    Vulg # cz, U
-#
-#
-#cpdef enum Typo:
-#    Yes # fi, U
-#
-#
-#cpdef enum Variant:
-#    Short # cz
-#    Bound # cz, sl
-#
-#
-#cpdef enum VerbType:
-#    Aux # U
-#    Cop # U
-#    Mod # U
-#    Light # U
-#

-cpdef enum Value_t:
-    Animacy_Anim
-    Animacy_Inam
-    Aspect_Freq
-    Aspect_Imp
-    Aspect_Mod
-    Aspect_None_
-    Aspect_Perf
-    Case_Abe
-    Case_Abl
-    Case_Abs
-    Case_Acc
-    Case_Ade
-    Case_All
-    Case_Cau
-    Case_Com
-    Case_Dat
-    Case_Del
-    Case_Dis
-    Case_Ela
-    Case_Ess
-    Case_Gen
-    Case_Ill
-    Case_Ine
-    Case_Ins
-    Case_Loc
-    Case_Lat
-    Case_Nom
-    Case_Par
-    Case_Sub
-    Case_Sup
-    Case_Tem
-    Case_Ter
-    Case_Tra
-    Case_Voc
-    Definite_Two
-    Definite_Def
-    Definite_Red
-    Definite_Ind
-    Degree_Cmp
-    Degree_Comp
-    Degree_None
-    Degree_Pos
-    Degree_Sup
-    Degree_Abs
-    Degree_Com
-    Degree_Dim # du
-    Gender_Com
-    Gender_Fem
-    Gender_Masc
-    Gender_Neut
-    Mood_Cnd
-    Mood_Imp
-    Mood_Ind
-    Mood_N
-    Mood_Pot
-    Mood_Sub
-    Mood_Opt
-    Negative_Neg
-    Negative_Pos
-    Negative_Yes
-    Number_Com
-    Number_Dual
-    Number_None
-    Number_Plur
-    Number_Sing
-    Number_Ptan # bg
-    Number_Count # bg
-    NumType_Card
-    NumType_Dist
-    NumType_Frac
-    NumType_Gen
-    NumType_Mult
-    NumType_None
-    NumType_Ord
-    NumType_Sets
-    Person_One
-    Person_Two
-    Person_Three
-    Person_None
-    Poss_Yes
-    PronType_AdvPart
-    PronType_Art
-    PronType_Default
-    PronType_Dem
-    PronType_Ind
-    PronType_Int
-    PronType_Neg
-    PronType_Prs
-    PronType_Rcp
-    PronType_Rel
-    PronType_Tot
-    PronType_Clit
-    PronType_Exc # es, ca, it, fa
-    Reflex_Yes
-    Tense_Fut
-    Tense_Imp
-    Tense_Past
-    Tense_Pres
-    VerbForm_Fin
-    VerbForm_Ger
-    VerbForm_Inf
-    VerbForm_None
-    VerbForm_Part
-    VerbForm_PartFut
-    VerbForm_PartPast
-    VerbForm_PartPres
-    VerbForm_Sup
-    VerbForm_Trans
-    VerbForm_Gdv # la
-    Voice_Act
-    Voice_Cau
-    Voice_Pass
-    Voice_Mid # gkc
-    Voice_Int # hb
-    Abbr_Yes # cz, fi, sl, U
-    AdpType_Prep # cz, U
-    AdpType_Post # U
-    AdpType_Voc # cz
-    AdpType_Comprep # cz
-    AdpType_Circ # U
-    AdvType_Man
-    AdvType_Loc
-    AdvType_Tim
-    AdvType_Deg
-    AdvType_Cau
-    AdvType_Mod
-    AdvType_Sta
-    AdvType_Ex
-    AdvType_Adadj
-    ConjType_Oper # cz, U
-    ConjType_Comp # cz, U
-    Connegative_Yes # fi
-    Derivation_Minen # fi
-    Derivation_Sti # fi
-    Derivation_Inen # fi
-    Derivation_Lainen # fi
-    Derivation_Ja # fi
-    Derivation_Ton # fi
-    Derivation_Vs # fi
-    Derivation_Ttain # fi
-    Derivation_Ttaa # fi
-    Echo_Rdp # U
-    Echo_Ech # U
-    Foreign_Foreign # cz, fi, U
-    Foreign_Fscript # cz, fi, U
-    Foreign_Tscript # cz, U
-    Foreign_Yes # sl
-    Gender_dat_Masc # bq, U
-    Gender_dat_Fem # bq, U
-    Gender_erg_Masc # bq
-    Gender_erg_Fem # bq
-    Gender_psor_Masc # cz, sl, U
-    Gender_psor_Fem # cz, sl, U
-    Gender_psor_Neut # sl
-    Hyph_Yes # cz, U
-    InfForm_One # fi
-    InfForm_Two # fi
-    InfForm_Three # fi
-    NameType_Geo # U, cz
-    NameType_Prs # U, cz
-    NameType_Giv # U, cz
-    NameType_Sur # U, cz
-    NameType_Nat # U, cz
-    NameType_Com # U, cz
-    NameType_Pro # U, cz
-    NameType_Oth # U, cz
-    NounType_Com # U
-    NounType_Prop # U
-    NounType_Class # U
-    Number_abs_Sing # bq, U
-    Number_abs_Plur # bq, U
-    Number_dat_Sing # bq, U
-    Number_dat_Plur # bq, U
-    Number_erg_Sing # bq, U
-    Number_erg_Plur # bq, U
-    Number_psee_Sing # U
-    Number_psee_Plur # U
-    Number_psor_Sing # cz, fi, sl, U
-    Number_psor_Plur # cz, fi, sl, U
-    NumForm_Digit # cz, sl, U
-    NumForm_Roman # cz, sl, U
-    NumForm_Word # cz, sl, U
-    NumValue_One # cz, U
-    NumValue_Two # cz, U
-    NumValue_Three # cz, U
-    PartForm_Pres # fi
-    PartForm_Past # fi
-    PartForm_Agt # fi
-    PartForm_Neg # fi
-    PartType_Mod # U
-    PartType_Emp # U
-    PartType_Res # U
-    PartType_Inf # U
-    PartType_Vbp # U
-    Person_abs_One # bq, U
-    Person_abs_Two # bq, U
-    Person_abs_Three # bq, U
-    Person_dat_One # bq, U
-    Person_dat_Two # bq, U
-    Person_dat_Three # bq, U
-    Person_erg_One # bq, U
-    Person_erg_Two # bq, U
-    Person_erg_Three # bq, U
-    Person_psor_One # fi, U
-    Person_psor_Two # fi, U
-    Person_psor_Three # fi, U
-    Polite_Inf # bq, U
-    Polite_Pol # bq, U
-    Polite_abs_Inf # bq, U
-    Polite_abs_Pol # bq, U
-    Polite_erg_Inf # bq, U
-    Polite_erg_Pol # bq, U
-    Polite_dat_Inf # bq, U
-    Polite_dat_Pol # bq, U
-    Prefix_Yes # U
-    PrepCase_Npr # cz
-    PrepCase_Pre # U
-    PunctSide_Ini # U
-    PunctSide_Fin # U
-    PunctType_Peri # U
-    PunctType_Qest # U
-    PunctType_Excl # U
-    PunctType_Quot # U
-    PunctType_Brck # U
-    PunctType_Comm # U
-    PunctType_Colo # U
-    PunctType_Semi # U
-    PunctType_Dash # U
-    Style_Arch # cz, fi, U
-    Style_Rare # cz, fi, U
-    Style_Poet # cz, U
-    Style_Norm # cz, U
-    Style_Coll # cz, U
-    Style_Vrnc # cz, U
-    Style_Sing # cz, U
-    Style_Expr # cz, U
-    Style_Derg # cz, U
-    Style_Vulg # cz, U
-    Style_Yes # fi, U
-    StyleVariant_StyleShort # cz
-    StyleVariant_StyleBound # cz, sl
-    VerbType_Aux # U
-    VerbType_Cop # U
-    VerbType_Mod # U
-    VerbType_Light # U
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -6,7 +6,7 @@ try:
 except ImportError:
    import json

-from .parts_of_speech import UNIV_POS_NAMES
+from .parts_of_speech import IDS as POS_IDS
 from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT


@ -14,6 +14,7 @@ cdef class Morphology:
    def __init__(self, StringStore string_store, tag_map, lemmatizer):
        self.mem = Pool()
        self.strings = string_store
+        self.tag_map = tag_map
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map) + 1
        self.tag_names = tuple(sorted(tag_map.keys()))
@ -24,10 +25,13 @@ cdef class Morphology:
            self.rich_tags[i].id = i
            self.rich_tags[i].name = self.strings[tag_str]
            self.rich_tags[i].morph = 0
-            self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()]
+            self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
            self.reverse_index[self.rich_tags[i].name] = i
        self._cache = PreshMapArray(self.n_tags)

+    def __reduce__(self):
+        """Pickle support: rebuild from the same constructor arguments."""
+        init_args = (self.strings, self.tag_map, self.lemmatizer)
+        return (Morphology, init_args, None, None)
+
    cdef int assign_tag(self, TokenC* token, tag) except -1:
        cdef int tag_id
        if isinstance(tag, basestring):
@ -89,3 +93,254 @@ cdef class Morphology:
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.strings[lemma_string]
        return lemma
+
+IDS = {
+    "Animacy_anim": Animacy_anim,
+    "Animacy_inam": Animacy_inam,
+    "Aspect_freq": Aspect_freq,
+    "Aspect_imp": Aspect_imp,
+    "Aspect_mod": Aspect_mod,
+    "Aspect_none": Aspect_none,
+    "Aspect_perf": Aspect_perf,
+    "Case_abe": Case_abe,
+    "Case_abl": Case_abl,
+    "Case_abs": Case_abs,
+    "Case_acc": Case_acc,
+    "Case_ade": Case_ade,
+    "Case_all": Case_all,
+    "Case_cau": Case_cau,
+    "Case_com": Case_com,
+    "Case_dat": Case_dat,
+    "Case_del": Case_del,
+    "Case_dis": Case_dis,
+    "Case_ela": Case_ela,
+    "Case_ess": Case_ess,
+    "Case_gen": Case_gen,
+    "Case_ill": Case_ill,
+    "Case_ine": Case_ine,
+    "Case_ins": Case_ins,
+    "Case_loc": Case_loc,
+    "Case_lat": Case_lat,
+    "Case_nom": Case_nom,
+    "Case_par": Case_par,
+    "Case_sub": Case_sub,
+    "Case_sup": Case_sup,
+    "Case_tem": Case_tem,
+    "Case_ter": Case_ter,
+    "Case_tra": Case_tra,
+    "Case_voc": Case_voc,
+    "Definite_two": Definite_two,
+    "Definite_def": Definite_def,
+    "Definite_red": Definite_red,
+    "Definite_ind": Definite_ind,
+    "Degree_cmp": Degree_cmp,
+    "Degree_comp": Degree_comp,
+    "Degree_none": Degree_none,
+    "Degree_pos": Degree_pos,
+    "Degree_sup": Degree_sup,
+    "Degree_abs": Degree_abs,
+    "Degree_com": Degree_com,
+    "Degree_dim ": Degree_dim, # du
+    "Gender_com": Gender_com,
+    "Gender_fem": Gender_fem,
+    "Gender_masc": Gender_masc,
+    "Gender_neut": Gender_neut,
+    "Mood_cnd": Mood_cnd,
+    "Mood_imp": Mood_imp,
+    "Mood_ind": Mood_ind,
+    "Mood_n": Mood_n,
+    "Mood_pot": Mood_pot,
+    "Mood_sub": Mood_sub,
+    "Mood_opt": Mood_opt,
+    "Negative_neg": Negative_neg,
+    "Negative_pos": Negative_pos,
+    "Negative_yes": Negative_yes,
+    "Number_com": Number_com,
+    "Number_dual": Number_dual,
+    "Number_none": Number_none,
+    "Number_plur": Number_plur,
+    "Number_sing": Number_sing,
+    "Number_ptan ": Number_ptan, # bg
+    "Number_count ": Number_count, # bg
+    "NumType_card": NumType_card,
+    "NumType_dist": NumType_dist,
+    "NumType_frac": NumType_frac,
+    "NumType_gen": NumType_gen,
+    "NumType_mult": NumType_mult,
+    "NumType_none": NumType_none,
+    "NumType_ord": NumType_ord,
+    "NumType_sets": NumType_sets,
+    "Person_one": Person_one,
+    "Person_two": Person_two,
+    "Person_three": Person_three,
+    "Person_none": Person_none,
+    "Poss_yes": Poss_yes,
+    "PronType_advPart": PronType_advPart,
+    "PronType_art": PronType_art,
+    "PronType_default": PronType_default,
+    "PronType_dem": PronType_dem,
+    "PronType_ind": PronType_ind,
+    "PronType_int": PronType_int,
+    "PronType_neg": PronType_neg,
+    "PronType_prs": PronType_prs,
+    "PronType_rcp": PronType_rcp,
+    "PronType_rel": PronType_rel,
+    "PronType_tot": PronType_tot,
+    "PronType_clit": PronType_clit,
+    "PronType_exc ": PronType_exc, # es, ca, it, fa,
+    "Reflex_yes": Reflex_yes,
+    "Tense_fut": Tense_fut,
+    "Tense_imp": Tense_imp,
+    "Tense_past": Tense_past,
+    "Tense_pres": Tense_pres,
+    "VerbForm_fin": VerbForm_fin,
+    "VerbForm_ger": VerbForm_ger,
+    "VerbForm_inf": VerbForm_inf,
+    "VerbForm_none": VerbForm_none,
+    "VerbForm_part": VerbForm_part,
+    "VerbForm_partFut": VerbForm_partFut,
+    "VerbForm_partPast": VerbForm_partPast,
+    "VerbForm_partPres": VerbForm_partPres,
+    "VerbForm_sup": VerbForm_sup,
+    "VerbForm_trans": VerbForm_trans,
+    "VerbForm_gdv ": VerbForm_gdv, # la,
+    "Voice_act": Voice_act,
+    "Voice_cau": Voice_cau,
+    "Voice_pass": Voice_pass,
+    "Voice_mid ": Voice_mid, # gkc,
+    "Voice_int ": Voice_int, # hb,
+    "Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
+    "AdpType_prep ": AdpType_prep, # cz, U,
+    "AdpType_post ": AdpType_post, # U,
+    "AdpType_voc ": AdpType_voc, # cz,
+    "AdpType_comprep ": AdpType_comprep, # cz,
+    "AdpType_circ ": AdpType_circ, # U,
+    "AdvType_man": AdvType_man,
+    "AdvType_loc": AdvType_loc,
+    "AdvType_tim": AdvType_tim,
+    "AdvType_deg": AdvType_deg,
+    "AdvType_cau": AdvType_cau,
+    "AdvType_mod": AdvType_mod,
+    "AdvType_sta": AdvType_sta,
+    "AdvType_ex": AdvType_ex,
+    "AdvType_adadj": AdvType_adadj,
+    "ConjType_oper ": ConjType_oper, # cz, U,
+    "ConjType_comp ": ConjType_comp, # cz, U,
+    "Connegative_yes ": Connegative_yes, # fi,
+    "Derivation_minen ": Derivation_minen, # fi,
+    "Derivation_sti ": Derivation_sti, # fi,
+    "Derivation_inen ": Derivation_inen, # fi,
+    "Derivation_lainen ": Derivation_lainen, # fi,
+    "Derivation_ja ": Derivation_ja, # fi,
+    "Derivation_ton ": Derivation_ton, # fi,
+    "Derivation_vs ": Derivation_vs, # fi,
+    "Derivation_ttain ": Derivation_ttain, # fi,
+    "Derivation_ttaa ": Derivation_ttaa, # fi,
+    "Echo_rdp ": Echo_rdp, # U,
+    "Echo_ech ": Echo_ech, # U,
+    "Foreign_foreign ": Foreign_foreign, # cz, fi, U,
+    "Foreign_fscript ": Foreign_fscript, # cz, fi, U,
+    "Foreign_tscript ": Foreign_tscript, # cz, U,
+    "Foreign_yes ": Foreign_yes, # sl,
+    "Gender_dat_masc ": Gender_dat_masc, # bq, U,
+    "Gender_dat_fem ": Gender_dat_fem, # bq, U,
+    "Gender_erg_masc ": Gender_erg_masc, # bq,
+    "Gender_erg_fem ": Gender_erg_fem, # bq,
+    "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
+    "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
+    "Gender_psor_neut ": Gender_psor_neut, # sl,
+    "Hyph_yes ": Hyph_yes, # cz, U,
+    "InfForm_one ": InfForm_one, # fi,
+    "InfForm_two ": InfForm_two, # fi,
+    "InfForm_three ": InfForm_three, # fi,
+    "NameType_geo ": NameType_geo, # U, cz,
+    "NameType_prs ": NameType_prs, # U, cz,
+    "NameType_giv ": NameType_giv, # U, cz,
+    "NameType_sur ": NameType_sur, # U, cz,
+    "NameType_nat ": NameType_nat, # U, cz,
+    "NameType_com ": NameType_com, # U, cz,
+    "NameType_pro ": NameType_pro, # U, cz,
+    "NameType_oth ": NameType_oth, # U, cz,
+    "NounType_com ": NounType_com, # U,
+    "NounType_prop ": NounType_prop, # U,
+    "NounType_class ": NounType_class, # U,
+    "Number_abs_sing ": Number_abs_sing, # bq, U,
+    "Number_abs_plur ": Number_abs_plur, # bq, U,
+    "Number_dat_sing ": Number_dat_sing, # bq, U,
+    "Number_dat_plur ": Number_dat_plur, # bq, U,
+    "Number_erg_sing ": Number_erg_sing, # bq, U,
+    "Number_erg_plur ": Number_erg_plur, # bq, U,
+    "Number_psee_sing ": Number_psee_sing, # U,
+    "Number_psee_plur ": Number_psee_plur, # U,
+    "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
+    "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
+    "NumForm_digit ": NumForm_digit, # cz, sl, U,
+    "NumForm_roman ": NumForm_roman, # cz, sl, U,
+    "NumForm_word ": NumForm_word, # cz, sl, U,
+    "NumValue_one ": NumValue_one, # cz, U,
+    "NumValue_two ": NumValue_two, # cz, U,
+    "NumValue_three ": NumValue_three, # cz, U,
+    "PartForm_pres ": PartForm_pres, # fi,
+    "PartForm_past ": PartForm_past, # fi,
+    "PartForm_agt ": PartForm_agt, # fi,
+    "PartForm_neg ": PartForm_neg, # fi,
+    "PartType_mod ": PartType_mod, # U,
+    "PartType_emp ": PartType_emp, # U,
+    "PartType_res ": PartType_res, # U,
+    "PartType_inf ": PartType_inf, # U,
+    "PartType_vbp ": PartType_vbp, # U,
+    "Person_abs_one ": Person_abs_one, # bq, U,
+    "Person_abs_two ": Person_abs_two, # bq, U,
+    "Person_abs_three ": Person_abs_three, # bq, U,
+    "Person_dat_one ": Person_dat_one, # bq, U,
+    "Person_dat_two ": Person_dat_two, # bq, U,
+    "Person_dat_three ": Person_dat_three, # bq, U,
+    "Person_erg_one ": Person_erg_one, # bq, U,
+    "Person_erg_two ": Person_erg_two, # bq, U,
+    "Person_erg_three ": Person_erg_three, # bq, U,
+    "Person_psor_one ": Person_psor_one, # fi, U,
+    "Person_psor_two ": Person_psor_two, # fi, U,
+    "Person_psor_three ": Person_psor_three, # fi, U,
+    "Polite_inf ": Polite_inf, # bq, U,
+    "Polite_pol ": Polite_pol, # bq, U,
+    "Polite_abs_inf ": Polite_abs_inf, # bq, U,
+    "Polite_abs_pol ": Polite_abs_pol, # bq, U,
+    "Polite_erg_inf ": Polite_erg_inf, # bq, U,
+    "Polite_erg_pol ": Polite_erg_pol, # bq, U,
+    "Polite_dat_inf ": Polite_dat_inf, # bq, U,
+    "Polite_dat_pol ": Polite_dat_pol, # bq, U,
+    "Prefix_yes ": Prefix_yes, # U,
+    "PrepCase_npr ": PrepCase_npr, # cz,
+    "PrepCase_pre ": PrepCase_pre, # U,
+    "PunctSide_ini ": PunctSide_ini, # U,
+    "PunctSide_fin ": PunctSide_fin, # U,
+    "PunctType_peri ": PunctType_peri, # U,
+    "PunctType_qest ": PunctType_qest, # U,
+    "PunctType_excl ": PunctType_excl, # U,
+    "PunctType_quot ": PunctType_quot, # U,
+    "PunctType_brck ": PunctType_brck, # U,
+    "PunctType_comm ": PunctType_comm, # U,
+    "PunctType_colo ": PunctType_colo, # U,
+    "PunctType_semi ": PunctType_semi, # U,
+    "PunctType_dash ": PunctType_dash, # U,
+    "Style_arch ": Style_arch, # cz, fi, U,
+    "Style_rare ": Style_rare, # cz, fi, U,
+    "Style_poet ": Style_poet, # cz, U,
+    "Style_norm ": Style_norm, # cz, U,
+    "Style_coll ": Style_coll, # cz, U,
+    "Style_vrnc ": Style_vrnc, # cz, U,
+    "Style_sing ": Style_sing, # cz, U,
+    "Style_expr ": Style_expr, # cz, U,
+    "Style_derg ": Style_derg, # cz, U,
+    "Style_vulg ": Style_vulg, # cz, U,
+    "Style_yes ": Style_yes, # fi, U,
+    "StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
+    "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
+    "VerbType_aux ": VerbType_aux, # U,
+    "VerbType_cop ": VerbType_cop, # U,
+    "VerbType_mod ": VerbType_mod, # U,
+    "VerbType_light ": VerbType_light, # U,
+}
+
+
+NAMES = sorted(IDS, key=IDS.get)  # names ordered by ascending ID value
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@ -1,7 +1,8 @@
-# Google universal tag set
+from . cimport symbols
+
 cpdef enum univ_pos_t:
-    NO_TAG
-    ADJ
+    NO_TAG = 0
+    ADJ = symbols.ADJ
    ADP
    ADV
    AUX
@ -20,4 +21,3 @@ cpdef enum univ_pos_t:
    X
    EOL
    SPACE
-    N_UNIV_TAGS
--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@ -1,8 +1,8 @@
 from __future__ import unicode_literals


-UNIV_POS_NAMES = {
-    "NO_TAG": NO_TAG,
+IDS = {
+    "": NO_TAG,
    "ADJ": ADJ,
    "ADP": ADP,
    "ADV": ADV,
@ -23,3 +23,6 @@ UNIV_POS_NAMES = {
    "EOL": EOL,
    "SPACE": SPACE
 }
+
+
+NAMES = {value: key for key, value in IDS.items()}  # id -> name; a dict because univ_pos_t ids start at symbols.ADJ, so they are not list positions
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -69,12 +69,15 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except

 cdef class StringStore:
    '''Map strings to and from integer IDs.'''
-    def __init__(self):
+    def __init__(self, strings=None):  # strings: optional iterable of strings to add, in order
        self.mem = Pool()
        self._map = PreshMap()
        self._resize_at = 10000
        self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        self.size = 1
+        if strings is not None:
+            for string in strings:
+                _ = self[string]  # result discarded; presumably the lookup interns the string - confirm in __getitem__

    property size:
        def __get__(self):
@ -113,6 +116,14 @@ cdef class StringStore:
        for i in range(self.size):
            yield self[i]

+    def __reduce__(self):  # pickle support: the store is rebuilt as StringStore(strings)
+        strings = [""]  # stands in for slot 0, which the loop below skips
+        for i in range(1, self.size):
+            string = &self.c[i]
+            py_string = _decode(string)
+            strings.append(py_string)
+        return (StringStore, (strings,), None, None, None)
+
    cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
        # 0 means missing, but we don't bother offsetting the index.
        key = hash64(chars, length * sizeof(char), 0)
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@ -0,0 +1,421 @@
+cpdef enum symbol_t:  # members take implicit sequential values in declaration order, so the order below is significant
+    NIL  # first member; paired with the empty string in symbols.pyx IDS
+    IS_ALPHA  # IS_ALPHA..IS_OOV: presumably boolean lexeme flags (cf. Lexeme.c_check_flag(..., LIKE_EMAIL)) - confirm
+    IS_ASCII
+    IS_DIGIT
+    IS_LOWER
+    IS_PUNCT
+    IS_SPACE
+    IS_TITLE
+    IS_UPPER
+    LIKE_URL
+    LIKE_NUM
+    LIKE_EMAIL
+    IS_STOP
+    IS_OOV
+    
+    FLAG14  # FLAG14..FLAG63: unnamed flag slots; presumably reserved for custom flags - confirm
+    FLAG15
+    FLAG16
+    FLAG17
+    FLAG18
+    FLAG19
+    FLAG20
+    FLAG21
+    FLAG22
+    FLAG23
+    FLAG24
+    FLAG25
+    FLAG26
+    FLAG27
+    FLAG28
+    FLAG29
+    FLAG30
+    FLAG31
+    FLAG32
+    FLAG33
+    FLAG34
+    FLAG35
+    FLAG36
+    FLAG37
+    FLAG38
+    FLAG39
+    FLAG40
+    FLAG41
+    FLAG42
+    FLAG43
+    FLAG44
+    FLAG45
+    FLAG46
+    FLAG47
+    FLAG48
+    FLAG49
+    FLAG50
+    FLAG51
+    FLAG52
+    FLAG53
+    FLAG54
+    FLAG55
+    FLAG56
+    FLAG57
+    FLAG58
+    FLAG59
+    FLAG60
+    FLAG61
+    FLAG62
+    FLAG63
+
+    ID  # ID..PROB: attribute names; the same identifiers are cimported from ..attrs elsewhere - presumably kept in sync, confirm
+    ORTH
+    LOWER
+    NORM
+    SHAPE
+    PREFIX
+    SUFFIX
+
+    LENGTH
+    CLUSTER
+    LEMMA
+    POS
+    TAG
+    DEP
+    ENT_IOB
+    ENT_TYPE
+    HEAD
+    SPACY
+    PROB
+
+    ADJ  # ADJ..SPACE: coarse POS tags; univ_pos_t sets ADJ = symbols.ADJ and counts on from there, so this run must keep that enum's order
+    ADP
+    ADV
+    AUX
+    CONJ
+    DET
+    INTJ
+    NOUN
+    NUM
+    PART
+    PRON
+    PROPN
+    PUNCT
+    SCONJ
+    SYM
+    VERB
+    X
+    EOL
+    SPACE
+
+    Animacy_anim  # Animacy_anim..VerbType_light: morphological features, named <Feature>_<value>
+    Animacy_inam
+    Aspect_freq
+    Aspect_imp
+    Aspect_mod
+    Aspect_none
+    Aspect_perf
+    Case_abe
+    Case_abl
+    Case_abs
+    Case_acc
+    Case_ade
+    Case_all
+    Case_cau
+    Case_com
+    Case_dat
+    Case_del
+    Case_dis
+    Case_ela
+    Case_ess
+    Case_gen
+    Case_ill
+    Case_ine
+    Case_ins
+    Case_loc
+    Case_lat
+    Case_nom
+    Case_par
+    Case_sub
+    Case_sup
+    Case_tem
+    Case_ter
+    Case_tra
+    Case_voc
+    Definite_two
+    Definite_def
+    Definite_red
+    Definite_ind
+    Degree_cmp
+    Degree_comp
+    Degree_none
+    Degree_pos
+    Degree_sup
+    Degree_abs
+    Degree_com
+    Degree_dim # du
+    Gender_com
+    Gender_fem
+    Gender_masc
+    Gender_neut
+    Mood_cnd
+    Mood_imp
+    Mood_ind
+    Mood_n
+    Mood_pot
+    Mood_sub
+    Mood_opt
+    Negative_neg
+    Negative_pos
+    Negative_yes
+    Number_com
+    Number_dual
+    Number_none
+    Number_plur
+    Number_sing
+    Number_ptan # bg
+    Number_count # bg
+    NumType_card
+    NumType_dist
+    NumType_frac
+    NumType_gen
+    NumType_mult
+    NumType_none
+    NumType_ord
+    NumType_sets
+    Person_one
+    Person_two
+    Person_three
+    Person_none
+    Poss_yes
+    PronType_advPart
+    PronType_art
+    PronType_default
+    PronType_dem
+    PronType_ind
+    PronType_int
+    PronType_neg
+    PronType_prs
+    PronType_rcp
+    PronType_rel
+    PronType_tot
+    PronType_clit
+    PronType_exc # es, ca, it, fa
+    Reflex_yes
+    Tense_fut
+    Tense_imp
+    Tense_past
+    Tense_pres
+    VerbForm_fin
+    VerbForm_ger
+    VerbForm_inf
+    VerbForm_none
+    VerbForm_part
+    VerbForm_partFut
+    VerbForm_partPast
+    VerbForm_partPres
+    VerbForm_sup
+    VerbForm_trans
+    VerbForm_gdv # la
+    Voice_act
+    Voice_cau
+    Voice_pass
+    Voice_mid # gkc
+    Voice_int # hb
+    Abbr_yes # cz, fi, sl, U
+    AdpType_prep # cz, U
+    AdpType_post # U
+    AdpType_voc # cz
+    AdpType_comprep # cz
+    AdpType_circ # U
+    AdvType_man
+    AdvType_loc
+    AdvType_tim
+    AdvType_deg
+    AdvType_cau
+    AdvType_mod
+    AdvType_sta
+    AdvType_ex
+    AdvType_adadj
+    ConjType_oper # cz, U
+    ConjType_comp # cz, U
+    Connegative_yes # fi
+    Derivation_minen # fi
+    Derivation_sti # fi
+    Derivation_inen # fi
+    Derivation_lainen # fi
+    Derivation_ja # fi
+    Derivation_ton # fi
+    Derivation_vs # fi
+    Derivation_ttain # fi
+    Derivation_ttaa # fi
+    Echo_rdp # U
+    Echo_ech # U
+    Foreign_foreign # cz, fi, U
+    Foreign_fscript # cz, fi, U
+    Foreign_tscript # cz, U
+    Foreign_yes # sl
+    Gender_dat_masc # bq, U
+    Gender_dat_fem # bq, U
+    Gender_erg_masc # bq
+    Gender_erg_fem # bq
+    Gender_psor_masc # cz, sl, U
+    Gender_psor_fem # cz, sl, U
+    Gender_psor_neut # sl
+    Hyph_yes # cz, U
+    InfForm_one # fi
+    InfForm_two # fi
+    InfForm_three # fi
+    NameType_geo # U, cz
+    NameType_prs # U, cz
+    NameType_giv # U, cz
+    NameType_sur # U, cz
+    NameType_nat # U, cz
+    NameType_com # U, cz
+    NameType_pro # U, cz
+    NameType_oth # U, cz
+    NounType_com # U
+    NounType_prop # U
+    NounType_class # U
+    Number_abs_sing # bq, U
+    Number_abs_plur # bq, U
+    Number_dat_sing # bq, U
+    Number_dat_plur # bq, U
+    Number_erg_sing # bq, U
+    Number_erg_plur # bq, U
+    Number_psee_sing # U
+    Number_psee_plur # U
+    Number_psor_sing # cz, fi, sl, U
+    Number_psor_plur # cz, fi, sl, U
+    NumForm_digit # cz, sl, U
+    NumForm_roman # cz, sl, U
+    NumForm_word # cz, sl, U
+    NumValue_one # cz, U
+    NumValue_two # cz, U
+    NumValue_three # cz, U
+    PartForm_pres # fi
+    PartForm_past # fi
+    PartForm_agt # fi
+    PartForm_neg # fi
+    PartType_mod # U
+    PartType_emp # U
+    PartType_res # U
+    PartType_inf # U
+    PartType_vbp # U
+    Person_abs_one # bq, U
+    Person_abs_two # bq, U
+    Person_abs_three # bq, U
+    Person_dat_one # bq, U
+    Person_dat_two # bq, U
+    Person_dat_three # bq, U
+    Person_erg_one # bq, U
+    Person_erg_two # bq, U
+    Person_erg_three # bq, U
+    Person_psor_one # fi, U
+    Person_psor_two # fi, U
+    Person_psor_three # fi, U
+    Polite_inf # bq, U
+    Polite_pol # bq, U
+    Polite_abs_inf # bq, U
+    Polite_abs_pol # bq, U
+    Polite_erg_inf # bq, U
+    Polite_erg_pol # bq, U
+    Polite_dat_inf # bq, U
+    Polite_dat_pol # bq, U
+    Prefix_yes # U
+    PrepCase_npr # cz
+    PrepCase_pre # U
+    PunctSide_ini # U
+    PunctSide_fin # U
+    PunctType_peri # U
+    PunctType_qest # U
+    PunctType_excl # U
+    PunctType_quot # U
+    PunctType_brck # U
+    PunctType_comm # U
+    PunctType_colo # U
+    PunctType_semi # U
+    PunctType_dash # U
+    Style_arch # cz, fi, U
+    Style_rare # cz, fi, U
+    Style_poet # cz, U
+    Style_norm # cz, U
+    Style_coll # cz, U
+    Style_vrnc # cz, U
+    Style_sing # cz, U
+    Style_expr # cz, U
+    Style_derg # cz, U
+    Style_vulg # cz, U
+    Style_yes # fi, U
+    StyleVariant_styleShort # cz
+    StyleVariant_styleBound # cz, sl
+    VerbType_aux # U
+    VerbType_cop # U
+    VerbType_mod # U
+    VerbType_light # U
+
+    PERSON  # PERSON..CARDINAL: presumably named-entity type labels - confirm
+    NORP
+    FACILITY
+    ORG
+    GPE
+    LOC
+    PRODUCT
+    EVENT
+    WORK_OF_ART
+    LANGUAGE
+
+    DATE
+    TIME
+    PERCENT
+    MONEY
+    QUANTITY
+    ORDINAL
+    CARDINAL
+
+    acomp  # acomp..xcomp: presumably dependency relation labels - confirm
+    advcl
+    advmod
+    agent
+    amod
+    appos
+    attr
+    aux
+    auxpass
+    cc
+    ccomp
+    complm
+    conj
+    csubj
+    csubjpass
+    dep
+    det
+    dobj
+    expl
+    hmod
+    hyph
+    infmod
+    intj
+    iobj
+    mark
+    meta
+    neg
+    nmod
+    nn
+    npadvmod
+    nsubj
+    nsubjpass
+    num
+    number
+    oprd
+    parataxis
+    partmod
+    pcomp
+    pobj
+    poss
+    possessive
+    preconj
+    prep
+    prt
+    punct
+    quantmod
+    rcmod
+    root
+    xcomp
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@ -0,0 +1,424 @@
+IDS = {  # symbol name -> symbol_t value; every key must equal its enum member name exactly (no padding)
+    "": NIL,
+    "IS_ALPHA": IS_ALPHA,
+    "IS_ASCII": IS_ASCII,
+    "IS_DIGIT": IS_DIGIT,
+    "IS_LOWER": IS_LOWER,
+    "IS_PUNCT": IS_PUNCT,
+    "IS_SPACE": IS_SPACE,
+    "IS_TITLE": IS_TITLE,
+    "IS_UPPER": IS_UPPER,
+    "LIKE_URL": LIKE_URL,
+    "LIKE_NUM": LIKE_NUM,
+    "LIKE_EMAIL": LIKE_EMAIL,
+    "IS_STOP": IS_STOP,
+    "IS_OOV": IS_OOV,
+    
+    "FLAG14": FLAG14,
+    "FLAG15": FLAG15,
+    "FLAG16": FLAG16,
+    "FLAG17": FLAG17,
+    "FLAG18": FLAG18,
+    "FLAG19": FLAG19,
+    "FLAG20": FLAG20,
+    "FLAG21": FLAG21,
+    "FLAG22": FLAG22,
+    "FLAG23": FLAG23,
+    "FLAG24": FLAG24,
+    "FLAG25": FLAG25,
+    "FLAG26": FLAG26,
+    "FLAG27": FLAG27,
+    "FLAG28": FLAG28,
+    "FLAG29": FLAG29,
+    "FLAG30": FLAG30,
+    "FLAG31": FLAG31,
+    "FLAG32": FLAG32,
+    "FLAG33": FLAG33,
+    "FLAG34": FLAG34,
+    "FLAG35": FLAG35,
+    "FLAG36": FLAG36,
+    "FLAG37": FLAG37,
+    "FLAG38": FLAG38,
+    "FLAG39": FLAG39,
+    "FLAG40": FLAG40,
+    "FLAG41": FLAG41,
+    "FLAG42": FLAG42,
+    "FLAG43": FLAG43,
+    "FLAG44": FLAG44,
+    "FLAG45": FLAG45,
+    "FLAG46": FLAG46,
+    "FLAG47": FLAG47,
+    "FLAG48": FLAG48,
+    "FLAG49": FLAG49,
+    "FLAG50": FLAG50,
+    "FLAG51": FLAG51,
+    "FLAG52": FLAG52,
+    "FLAG53": FLAG53,
+    "FLAG54": FLAG54,
+    "FLAG55": FLAG55,
+    "FLAG56": FLAG56,
+    "FLAG57": FLAG57,
+    "FLAG58": FLAG58,
+    "FLAG59": FLAG59,
+    "FLAG60": FLAG60,
+    "FLAG61": FLAG61,
+    "FLAG62": FLAG62,
+    "FLAG63": FLAG63,
+
+    "ID": ID,
+    "ORTH": ORTH,
+    "LOWER": LOWER,
+    "NORM": NORM,
+    "SHAPE": SHAPE,
+    "PREFIX": PREFIX,
+    "SUFFIX": SUFFIX,
+
+    "LENGTH": LENGTH,
+    "CLUSTER": CLUSTER,
+    "LEMMA": LEMMA,
+    "POS": POS,
+    "TAG": TAG,
+    "DEP": DEP,
+    "ENT_IOB": ENT_IOB,
+    "ENT_TYPE": ENT_TYPE,
+    "HEAD": HEAD,
+    "SPACY": SPACY,
+    "PROB": PROB,
+
+    "ADJ": ADJ,
+    "ADP": ADP,
+    "ADV": ADV,
+    "AUX": AUX,
+    "CONJ": CONJ,
+    "DET": DET,
+    "INTJ": INTJ,
+    "NOUN": NOUN,
+    "NUM": NUM,
+    "PART": PART,
+    "PRON": PRON,
+    "PROPN": PROPN,
+    "PUNCT": PUNCT,
+    "SCONJ": SCONJ,
+    "SYM": SYM,
+    "VERB": VERB,
+    "X": X,
+    "EOL": EOL,
+    "SPACE": SPACE,
+
+    "Animacy_anim": Animacy_anim,
+    "Animacy_inam": Animacy_inam,
+    "Aspect_freq": Aspect_freq,
+    "Aspect_imp": Aspect_imp,
+    "Aspect_mod": Aspect_mod,
+    "Aspect_none": Aspect_none,
+    "Aspect_perf": Aspect_perf,
+    "Case_abe": Case_abe,
+    "Case_abl": Case_abl,
+    "Case_abs": Case_abs,
+    "Case_acc": Case_acc,
+    "Case_ade": Case_ade,
+    "Case_all": Case_all,
+    "Case_cau": Case_cau,
+    "Case_com": Case_com,
+    "Case_dat": Case_dat,
+    "Case_del": Case_del,
+    "Case_dis": Case_dis,
+    "Case_ela": Case_ela,
+    "Case_ess": Case_ess,
+    "Case_gen": Case_gen,
+    "Case_ill": Case_ill,
+    "Case_ine": Case_ine,
+    "Case_ins": Case_ins,
+    "Case_loc": Case_loc,
+    "Case_lat": Case_lat,
+    "Case_nom": Case_nom,
+    "Case_par": Case_par,
+    "Case_sub": Case_sub,
+    "Case_sup": Case_sup,
+    "Case_tem": Case_tem,
+    "Case_ter": Case_ter,
+    "Case_tra": Case_tra,
+    "Case_voc": Case_voc,
+    "Definite_two": Definite_two,
+    "Definite_def": Definite_def,
+    "Definite_red": Definite_red,
+    "Definite_ind": Definite_ind,
+    "Degree_cmp": Degree_cmp,
+    "Degree_comp": Degree_comp,
+    "Degree_none": Degree_none,
+    "Degree_pos": Degree_pos,
+    "Degree_sup": Degree_sup,
+    "Degree_abs": Degree_abs,
+    "Degree_com": Degree_com,
+    "Degree_dim": Degree_dim, # du
+    "Gender_com": Gender_com,
+    "Gender_fem": Gender_fem,
+    "Gender_masc": Gender_masc,
+    "Gender_neut": Gender_neut,
+    "Mood_cnd": Mood_cnd,
+    "Mood_imp": Mood_imp,
+    "Mood_ind": Mood_ind,
+    "Mood_n": Mood_n,
+    "Mood_pot": Mood_pot,
+    "Mood_sub": Mood_sub,
+    "Mood_opt": Mood_opt,
+    "Negative_neg": Negative_neg,
+    "Negative_pos": Negative_pos,
+    "Negative_yes": Negative_yes,
+    "Number_com": Number_com,
+    "Number_dual": Number_dual,
+    "Number_none": Number_none,
+    "Number_plur": Number_plur,
+    "Number_sing": Number_sing,
+    "Number_ptan": Number_ptan, # bg
+    "Number_count": Number_count, # bg
+    "NumType_card": NumType_card,
+    "NumType_dist": NumType_dist,
+    "NumType_frac": NumType_frac,
+    "NumType_gen": NumType_gen,
+    "NumType_mult": NumType_mult,
+    "NumType_none": NumType_none,
+    "NumType_ord": NumType_ord,
+    "NumType_sets": NumType_sets,
+    "Person_one": Person_one,
+    "Person_two": Person_two,
+    "Person_three": Person_three,
+    "Person_none": Person_none,
+    "Poss_yes": Poss_yes,
+    "PronType_advPart": PronType_advPart,
+    "PronType_art": PronType_art,
+    "PronType_default": PronType_default,
+    "PronType_dem": PronType_dem,
+    "PronType_ind": PronType_ind,
+    "PronType_int": PronType_int,
+    "PronType_neg": PronType_neg,
+    "PronType_prs": PronType_prs,
+    "PronType_rcp": PronType_rcp,
+    "PronType_rel": PronType_rel,
+    "PronType_tot": PronType_tot,
+    "PronType_clit": PronType_clit,
+    "PronType_exc": PronType_exc, # es, ca, it, fa,
+    "Reflex_yes": Reflex_yes,
+    "Tense_fut": Tense_fut,
+    "Tense_imp": Tense_imp,
+    "Tense_past": Tense_past,
+    "Tense_pres": Tense_pres,
+    "VerbForm_fin": VerbForm_fin,
+    "VerbForm_ger": VerbForm_ger,
+    "VerbForm_inf": VerbForm_inf,
+    "VerbForm_none": VerbForm_none,
+    "VerbForm_part": VerbForm_part,
+    "VerbForm_partFut": VerbForm_partFut,
+    "VerbForm_partPast": VerbForm_partPast,
+    "VerbForm_partPres": VerbForm_partPres,
+    "VerbForm_sup": VerbForm_sup,
+    "VerbForm_trans": VerbForm_trans,
+    "VerbForm_gdv": VerbForm_gdv, # la,
+    "Voice_act": Voice_act,
+    "Voice_cau": Voice_cau,
+    "Voice_pass": Voice_pass,
+    "Voice_mid": Voice_mid, # gkc,
+    "Voice_int": Voice_int, # hb,
+    "Abbr_yes": Abbr_yes, # cz, fi, sl, U,
+    "AdpType_prep": AdpType_prep, # cz, U,
+    "AdpType_post": AdpType_post, # U,
+    "AdpType_voc": AdpType_voc, # cz,
+    "AdpType_comprep": AdpType_comprep, # cz,
+    "AdpType_circ": AdpType_circ, # U,
+    "AdvType_man": AdvType_man,
+    "AdvType_loc": AdvType_loc,
+    "AdvType_tim": AdvType_tim,
+    "AdvType_deg": AdvType_deg,
+    "AdvType_cau": AdvType_cau,
+    "AdvType_mod": AdvType_mod,
+    "AdvType_sta": AdvType_sta,
+    "AdvType_ex": AdvType_ex,
+    "AdvType_adadj": AdvType_adadj,
+    "ConjType_oper": ConjType_oper, # cz, U,
+    "ConjType_comp": ConjType_comp, # cz, U,
+    "Connegative_yes": Connegative_yes, # fi,
+    "Derivation_minen": Derivation_minen, # fi,
+    "Derivation_sti": Derivation_sti, # fi,
+    "Derivation_inen": Derivation_inen, # fi,
+    "Derivation_lainen": Derivation_lainen, # fi,
+    "Derivation_ja": Derivation_ja, # fi,
+    "Derivation_ton": Derivation_ton, # fi,
+    "Derivation_vs": Derivation_vs, # fi,
+    "Derivation_ttain": Derivation_ttain, # fi,
+    "Derivation_ttaa": Derivation_ttaa, # fi,
+    "Echo_rdp": Echo_rdp, # U,
+    "Echo_ech": Echo_ech, # U,
+    "Foreign_foreign": Foreign_foreign, # cz, fi, U,
+    "Foreign_fscript": Foreign_fscript, # cz, fi, U,
+    "Foreign_tscript": Foreign_tscript, # cz, U,
+    "Foreign_yes": Foreign_yes, # sl,
+    "Gender_dat_masc": Gender_dat_masc, # bq, U,
+    "Gender_dat_fem": Gender_dat_fem, # bq, U,
+    "Gender_erg_masc": Gender_erg_masc, # bq,
+    "Gender_erg_fem": Gender_erg_fem, # bq,
+    "Gender_psor_masc": Gender_psor_masc, # cz, sl, U,
+    "Gender_psor_fem": Gender_psor_fem, # cz, sl, U,
+    "Gender_psor_neut": Gender_psor_neut, # sl,
+    "Hyph_yes": Hyph_yes, # cz, U,
+    "InfForm_one": InfForm_one, # fi,
+    "InfForm_two": InfForm_two, # fi,
+    "InfForm_three": InfForm_three, # fi,
+    "NameType_geo": NameType_geo, # U, cz,
+    "NameType_prs": NameType_prs, # U, cz,
+    "NameType_giv": NameType_giv, # U, cz,
+    "NameType_sur": NameType_sur, # U, cz,
+    "NameType_nat": NameType_nat, # U, cz,
+    "NameType_com": NameType_com, # U, cz,
+    "NameType_pro": NameType_pro, # U, cz,
+    "NameType_oth": NameType_oth, # U, cz,
+    "NounType_com": NounType_com, # U,
+    "NounType_prop": NounType_prop, # U,
+    "NounType_class": NounType_class, # U,
+    "Number_abs_sing": Number_abs_sing, # bq, U,
+    "Number_abs_plur": Number_abs_plur, # bq, U,
+    "Number_dat_sing": Number_dat_sing, # bq, U,
+    "Number_dat_plur": Number_dat_plur, # bq, U,
+    "Number_erg_sing": Number_erg_sing, # bq, U,
+    "Number_erg_plur": Number_erg_plur, # bq, U,
+    "Number_psee_sing": Number_psee_sing, # U,
+    "Number_psee_plur": Number_psee_plur, # U,
+    "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U,
+    "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U,
+    "NumForm_digit": NumForm_digit, # cz, sl, U,
+    "NumForm_roman": NumForm_roman, # cz, sl, U,
+    "NumForm_word": NumForm_word, # cz, sl, U,
+    "NumValue_one": NumValue_one, # cz, U,
+    "NumValue_two": NumValue_two, # cz, U,
+    "NumValue_three": NumValue_three, # cz, U,
+    "PartForm_pres": PartForm_pres, # fi,
+    "PartForm_past": PartForm_past, # fi,
+    "PartForm_agt": PartForm_agt, # fi,
+    "PartForm_neg": PartForm_neg, # fi,
+    "PartType_mod": PartType_mod, # U,
+    "PartType_emp": PartType_emp, # U,
+    "PartType_res": PartType_res, # U,
+    "PartType_inf": PartType_inf, # U,
+    "PartType_vbp": PartType_vbp, # U,
+    "Person_abs_one": Person_abs_one, # bq, U,
+    "Person_abs_two": Person_abs_two, # bq, U,
+    "Person_abs_three": Person_abs_three, # bq, U,
+    "Person_dat_one": Person_dat_one, # bq, U,
+    "Person_dat_two": Person_dat_two, # bq, U,
+    "Person_dat_three": Person_dat_three, # bq, U,
+    "Person_erg_one": Person_erg_one, # bq, U,
+    "Person_erg_two": Person_erg_two, # bq, U,
+    "Person_erg_three": Person_erg_three, # bq, U,
+    "Person_psor_one": Person_psor_one, # fi, U,
+    "Person_psor_two": Person_psor_two, # fi, U,
+    "Person_psor_three": Person_psor_three, # fi, U,
+    "Polite_inf": Polite_inf, # bq, U,
+    "Polite_pol": Polite_pol, # bq, U,
+    "Polite_abs_inf": Polite_abs_inf, # bq, U,
+    "Polite_abs_pol": Polite_abs_pol, # bq, U,
+    "Polite_erg_inf": Polite_erg_inf, # bq, U,
+    "Polite_erg_pol": Polite_erg_pol, # bq, U,
+    "Polite_dat_inf": Polite_dat_inf, # bq, U,
+    "Polite_dat_pol": Polite_dat_pol, # bq, U,
+    "Prefix_yes": Prefix_yes, # U,
+    "PrepCase_npr": PrepCase_npr, # cz,
+    "PrepCase_pre": PrepCase_pre, # U,
+    "PunctSide_ini": PunctSide_ini, # U,
+    "PunctSide_fin": PunctSide_fin, # U,
+    "PunctType_peri": PunctType_peri, # U,
+    "PunctType_qest": PunctType_qest, # U,
+    "PunctType_excl": PunctType_excl, # U,
+    "PunctType_quot": PunctType_quot, # U,
+    "PunctType_brck": PunctType_brck, # U,
+    "PunctType_comm": PunctType_comm, # U,
+    "PunctType_colo": PunctType_colo, # U,
+    "PunctType_semi": PunctType_semi, # U,
+    "PunctType_dash": PunctType_dash, # U,
+    "Style_arch": Style_arch, # cz, fi, U,
+    "Style_rare": Style_rare, # cz, fi, U,
+    "Style_poet": Style_poet, # cz, U,
+    "Style_norm": Style_norm, # cz, U,
+    "Style_coll": Style_coll, # cz, U,
+    "Style_vrnc": Style_vrnc, # cz, U,
+    "Style_sing": Style_sing, # cz, U,
+    "Style_expr": Style_expr, # cz, U,
+    "Style_derg": Style_derg, # cz, U,
+    "Style_vulg": Style_vulg, # cz, U,
+    "Style_yes": Style_yes, # fi, U,
+    "StyleVariant_styleShort": StyleVariant_styleShort, # cz,
+    "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl,
+    "VerbType_aux": VerbType_aux, # U,
+    "VerbType_cop": VerbType_cop, # U,
+    "VerbType_mod": VerbType_mod, # U,
+    "VerbType_light": VerbType_light, # U,
+
+    "PERSON": PERSON,
+    "NORP": NORP,
+    "FACILITY": FACILITY,
+    "ORG": ORG,
+    "GPE": GPE,
+    "LOC": LOC,
+    "PRODUCT": PRODUCT,
+    "EVENT": EVENT,
+    "WORK_OF_ART": WORK_OF_ART,
+    "LANGUAGE": LANGUAGE,
+
+    "DATE": DATE,
+    "TIME": TIME,
+    "PERCENT": PERCENT,
+    "MONEY": MONEY,
+    "QUANTITY": QUANTITY,
+    "ORDINAL": ORDINAL,
+    "CARDINAL": CARDINAL,
+
+    "acomp": acomp,
+    "advcl": advcl,
+    "advmod": advmod,
+    "agent": agent,
+    "amod": amod,
+    "appos": appos,
+    "attr": attr,
+    "aux": aux,
+    "auxpass": auxpass,
+    "cc": cc,
+    "ccomp": ccomp,
+    "complm": complm,
+    "conj": conj,
+    "csubj": csubj,
+    "csubjpass": csubjpass,
+    "dep": dep,
+    "det": det,
+    "dobj": dobj,
+    "expl": expl,
+    "hmod": hmod,
+    "hyph": hyph,
+    "infmod": infmod,
+    "intj": intj,
+    "iobj": iobj,
+    "mark": mark,
+    "meta": meta,
+    "neg": neg,
+    "nmod": nmod,
+    "nn": nn,
+    "npadvmod": npadvmod,
+    "nsubj": nsubj,
+    "nsubjpass": nsubjpass,
+    "num": num,
+    "number": number,
+    "oprd": oprd,
+    "parataxis": parataxis,
+    "partmod": partmod,
+    "pcomp": pcomp,
+    "pobj": pobj,
+    "poss": poss,
+    "possessive": possessive,
+    "preconj": preconj,
+    "prep": prep,
+    "prt": prt,
+    "punct": punct,
+    "quantmod": quantmod,
+    "rcmod": rcmod,
+    "root": root,
+    "xcomp": xcomp
+}
+
+NAMES = [name for name, _ in sorted(IDS.items(), key=lambda entry: entry[1])]  # names ordered by ascending symbol ID
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -83,7 +83,6 @@ cdef class Parser:
        model = Model(moves.n_moves, templates, model_dir)
        return cls(strings, moves, model)

-
    def __call__(self, Doc tokens):
        cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
        self.moves.initialize_state(stcls)
@ -93,6 +92,9 @@ cdef class Parser:
        self.parse(stcls, eg.c)
        tokens.set_parse(stcls._sent)

+    def __reduce__(self):  # pickle support; self.__class__ (not a hard-coded Parser) lets subclasses round-trip as themselves
+        return (self.__class__, (self.moves.strings, self.moves, self.model), None, None)
+
    cdef void predict(self, StateClass stcls, ExampleC* eg) nogil:
        memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
        self.moves.set_valid(eg.is_valid, stcls)
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@ -37,6 +37,8 @@ cdef class TransitionSystem:
    cdef public int root_label
    cdef public freqs

+    cdef object _labels_by_action
+
    cdef int initialize_state(self, StateClass state) except -1
    cdef int finalize_state(self, StateClass state) nogil

--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@ -15,7 +15,8 @@ class OracleError(Exception):


 cdef class TransitionSystem:
-    def __init__(self, StringStore string_table, dict labels_by_action):
+    def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None):
+        self._labels_by_action = labels_by_action
        self.mem = Pool()
        self.n_moves = sum(len(labels) for labels in labels_by_action.values())
        self._is_valid = <bint*>self.mem.alloc(self.n_moves, sizeof(bint))
@ -30,7 +31,7 @@ cdef class TransitionSystem:
                i += 1
        self.c = moves
        self.root_label = self.strings['ROOT']
-        self.freqs = {}
+        self.freqs = {} if _freqs is None else _freqs
        for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
            self.freqs[attr] = defaultdict(int)
            self.freqs[attr][0] = 1
@ -39,6 +40,11 @@ cdef class TransitionSystem:
            self.freqs[HEAD][i] = 1
            self.freqs[HEAD][-i] = 1

+    def __reduce__(self):  # pickle support: re-invoke the constructor with its original arguments
+        # NOTE(review): __init__ appears to re-create freqs[attr] for each attribute; confirm these freqs survive.
+        ctor_args = (self.strings, self._labels_by_action, self.freqs)
+        return (self.__class__, ctor_args, None, None)
+
    cdef int initialize_state(self, StateClass state) except -1:
        pass

--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@ -148,6 +148,9 @@ cdef class Tagger:
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

+    def __reduce__(self):  # pickle support: reconstruct by calling the class with (vocab, model)
+        return (type(self), (self.vocab, self.model), None, None)
+
    def tag_from_strings(self, Doc tokens, object tag_strs):
        cdef int i
        for i in range(tokens.length):
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -14,7 +14,6 @@ from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..parts_of_speech import UNIV_POS_NAMES
 from ..parts_of_speech cimport CONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -9,7 +9,7 @@ import numpy


 from ..lexeme cimport Lexeme
-from ..parts_of_speech import UNIV_POS_NAMES
+from .. import parts_of_speech

 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
@ -318,7 +318,7 @@ cdef class Token:

    property pos_:
        def __get__(self):
-            return _pos_id_to_string[self.c.pos]
+            return parts_of_speech.NAMES[self.c.pos]  # string name looked up by the token's coarse POS id

    property tag_:
        def __get__(self):
@ -363,6 +363,3 @@ cdef class Token:

    property like_email:
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
-
-
-_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@ -25,7 +25,6 @@ cdef struct _Cached:


 cdef class Vocab:
-    cpdef public lexeme_props_getter
    cdef Pool mem
    cpdef readonly StringStore strings
    cpdef readonly Morphology morphology
@ -33,7 +32,6 @@ cdef class Vocab:
    cdef public object _serializer
    cdef public object data_dir
    cdef public object get_lex_attr
-    cdef public object pos_tags
    cdef public object serializer_freqs

    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -10,6 +10,8 @@ from os import path
 import io
 import math
 import json
+import tempfile
+import copy_reg

 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
@ -19,6 +21,9 @@ from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer

+from . import attrs
+from . import symbols
+
 from cymem.cymem cimport Address
 from . import util
 from .serialize.packer cimport Packer
@ -67,6 +72,14 @@ cdef class Vocab:
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
        self.strings = StringStore()
+        # Load strings in a special order, so that the vocabulary starts at a
+        # known offset. This way, when words are added in order, the orth ID
+        # is the frequency rank of the word, plus that offset. The structural
+        # strings are loaded first, because the vocab is open-class, and these
+        # symbols are closed-class.
+        for name in symbols.NAMES + list(sorted(tag_map.keys())):
+            if name:
+                _ = self.strings[name]
        self.get_lex_attr = get_lex_attr
        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
        self.serializer_freqs = serializer_freqs
@ -85,6 +98,20 @@ cdef class Vocab:
        """The current number of lexemes stored."""
        return self.length

+    def __reduce__(self):
+        # TODO: Dump vectors. For now only the path to data_dir/vec.bin is pickled.
+        tmp_dir = tempfile.mkdtemp()
+        lex_loc = path.join(tmp_dir, 'lexemes.bin')
+        str_loc = path.join(tmp_dir, 'strings.txt')
+        vec_loc = path.join(self.data_dir, 'vec.bin') if self.data_dir is not None else None
+
+        self.dump(lex_loc)
+        self.strings.dump(str_loc)
+        
+        state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr,
+                 self.serializer_freqs, self.data_dir)
+        return (unpickle_vocab, state, None, None)
+
    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool.  If the pool
@ -260,17 +287,17 @@ cdef class Vocab:
            i += 1
        fp.close()

-    def load_vectors(self, loc_or_file):
+    def load_vectors(self, file_):
        cdef LexemeC* lexeme
        cdef attr_t orth
        cdef int32_t vec_len = -1
-        for line_num, line in enumerate(loc_or_file):
+        for line_num, line in enumerate(file_):
            pieces = line.split()
            word_str = pieces.pop(0)
            if vec_len == -1:
                vec_len = len(pieces)
            elif vec_len != len(pieces):
-                raise VectorReadError.mismatched_sizes(loc_or_file, line_num,
+                raise VectorReadError.mismatched_sizes(file_, line_num,
                                                        vec_len, len(pieces))
            orth = self.strings[word_str]
            lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
@ -328,6 +355,25 @@ cdef class Vocab:
        return vec_len


+def unpickle_vocab(strings_loc, lex_loc, vec_loc, morphology, get_lex_attr,
+                   serializer_freqs, data_dir):
+    cdef Vocab vocab = Vocab()
+
+    vocab.get_lex_attr = get_lex_attr
+    vocab.morphology = morphology
+    vocab.strings = morphology.strings
+    vocab.data_dir = data_dir
+    vocab.serializer_freqs = serializer_freqs
+
+    vocab.load_lexemes(strings_loc, lex_loc)
+    if vec_loc is not None:
+        vocab.load_vectors_from_bin_loc(vec_loc)
+    return vocab
+ 
+
+copy_reg.constructor(unpickle_vocab)
+
+
 def write_binary_vectors(in_loc, out_loc):
    cdef CFile out_file = CFile(out_loc, 'wb')
    cdef Address mem
--- a/tests/morphology/test_pickle.py
+++ b/tests/morphology/test_pickle.py
@ -0,0 +1,17 @@
+import pytest
+
+import pickle
+import StringIO
+
+
+from spacy.morphology import Morphology
+from spacy.lemmatizer import Lemmatizer
+from spacy.strings import StringStore
+
+
+def test_pickle():
+    morphology = Morphology(StringStore(), {}, Lemmatizer({}, {}, {})) 
+
+    file_ = StringIO.StringIO()
+    pickle.dump(morphology, file_)
+
--- a/tests/parser/test_pickle.py
+++ b/tests/parser/test_pickle.py
@ -0,0 +1,16 @@
+import pytest
+
+import pickle
+import cloudpickle
+import StringIO
+
+
+@pytest.mark.models
+def test_pickle(EN):
+    file_ = StringIO.StringIO()
+    cloudpickle.dump(EN.parser, file_)
+
+    file_.seek(0)
+
+    loaded = pickle.load(file_)
+
--- a/tests/tagger/test_lemmatizer.py
+++ b/tests/tagger/test_lemmatizer.py
@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
+import StringIO
+import pickle

 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
 from spacy.en import LOCAL_DATA_DIR
@ -41,3 +43,12 @@ def test_smart_quotes(lemmatizer):
    do = lemmatizer.punct
    assert do('“') == set(['"'])
    assert do('“') == set(['"'])
+
+
+def test_pickle_lemmatizer(lemmatizer):
+    file_ = StringIO.StringIO()
+    pickle.dump(lemmatizer, file_)
+
+    file_.seek(0)
+    
+    loaded = pickle.load(file_)
--- a/tests/test_pickle.py
+++ b/tests/test_pickle.py
@ -0,0 +1,15 @@
+import pytest
+import StringIO
+import cloudpickle
+import pickle
+
+
+@pytest.mark.models
+def test_pickle_english(EN):
+    file_ = StringIO.StringIO()
+    cloudpickle.dump(EN, file_)
+
+    file_.seek(0)
+
+    loaded = pickle.load(file_)
+
--- a/tests/vocab/test_intern.py
+++ b/tests/vocab/test_intern.py
@ -1,5 +1,7 @@
 # -*- coding: utf8 -*-
 from __future__ import unicode_literals
+import pickle
+import StringIO

 from spacy.strings import StringStore

@ -76,3 +78,18 @@ def test_massive_strings(sstore):
    s513 = '1' * 513
    orth = sstore[s513]
    assert sstore[orth] == s513
+
+
+def test_pickle_string_store(sstore):
+    hello_id = sstore[u'Hi']
+    string_file = StringIO.StringIO()
+    pickle.dump(sstore, string_file)
+
+    string_file.seek(0)
+    
+    loaded = pickle.load(string_file)
+
+    assert loaded[hello_id] == u'Hi'
+
+
+
--- a/tests/vocab/test_vocab.py
+++ b/tests/vocab/test_vocab.py
@ -1,5 +1,11 @@
 from __future__ import unicode_literals
 import pytest
+import StringIO
+import cloudpickle
+import pickle
+
+from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
+from spacy.parts_of_speech import NOUN, VERB


 def test_neq(en_vocab):
@ -25,3 +31,21 @@ def test_punct_neq(en_vocab):
 def test_shape_attr(en_vocab):
    example = en_vocab['example']
    assert example.orth != example.shape
+
+
+def test_symbols(en_vocab):
+    assert en_vocab.strings['IS_ALPHA'] == IS_ALPHA
+    assert en_vocab.strings['NOUN'] == NOUN
+    assert en_vocab.strings['VERB'] == VERB
+    assert en_vocab.strings['LEMMA'] == LEMMA
+    assert en_vocab.strings['ORTH'] == ORTH
+    assert en_vocab.strings['PROB'] == PROB
+    
+
+def test_pickle_vocab(en_vocab):
+    file_ = StringIO.StringIO()
+    cloudpickle.dump(en_vocab, file_)
+
+    file_.seek(0)
+
+    loaded = pickle.load(file_)
--- a/tests/website/conftest.py
+++ b/tests/website/conftest.py
@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest
+import os


@pytest.fixture(scope='session')
 def nlp():
-    from spacy.en import English
-    return English()
+    from spacy.en import English, LOCAL_DATA_DIR
+    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
+    return English(data_dir=data_dir)


@pytest.fixture()
--- a/tests/website/test_home.py
+++ b/tests/website/test_home.py
@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 import pytest
 import spacy
+import os


@pytest.fixture()
@ -9,8 +10,9 @@ def token(doc):


 def test_load_resources_and_process_text():
-    from spacy.en import English
-    nlp = English()
+    from spacy.en import English, LOCAL_DATA_DIR
+    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
+    nlp = English(data_dir=data_dir)
    doc = nlp('Hello, world. Here are two sentences.')