Merge branch 'attrs'

Matthew Honnibal 2015-10-13 14:03:41 +11:00
commit c1fdc487bc
33 changed files with 1682 additions and 755 deletions

View File

@@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
probs[word] = oov_prob
lexicon = []
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
# First encode the strings into the StringStore. This way, we can map
# the orth IDs to frequency ranks
orth = vocab.strings[word]
# Now actually load the vocab
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob
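
The two passes above are the point of this hunk: every word is interned into the StringStore in descending frequency order before any lexeme is created, so the orth ID a word receives doubles as its frequency rank, plus whatever offset the already-interned symbol strings occupy. A minimal sketch of the idea, using a hypothetical first-seen interner rather than the real StringStore API:

    class Interner(object):
        """Hypothetical stand-in for StringStore: IDs in first-seen order."""
        def __init__(self):
            self._ids = {}

        def __getitem__(self, string):
            if string not in self._ids:
                self._ids[string] = len(self._ids) + 1  # slot 0 stays reserved
            return self._ids[string]

    probs = {'the': -1.0, 'of': -1.5, 'aardvark': -12.0}
    strings = Interner()
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        orth = strings[word]
    assert strings['the'] == 1 and strings['of'] == 2  # ID == frequency rank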

View File

@@ -56,5 +56,4 @@
"was": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
"were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
}
}

View File

@@ -22,7 +22,7 @@
"JJS": {"pos": "adj", "degree": "sup"},
"LS": {"pos": "punct", "numtype": "ord"},
"MD": {"pos": "verb", "verbtype": "mod"},
"NIL": {"pos": "no_tag"},
"NIL": {"pos": ""},
"NN": {"pos": "noun", "number": "sing"},
"NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
"NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},

View File

@@ -166,7 +166,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
'spacy.cfile', 'spacy.matcher',
'spacy.syntax.ner']
'spacy.syntax.ner',
'spacy.symbols']
if __name__ == '__main__':

View File

@@ -29,5 +29,6 @@ cdef class Model:
cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
cdef object model_loc
cdef object _templates
cdef Extractor _extractor
cdef LinearModel _model

View File

@@ -3,6 +3,7 @@ from __future__ import unicode_literals
from __future__ import division
from os import path
import tempfile
import os
import shutil
import json
@@ -52,6 +53,7 @@ cdef class Model:
def __init__(self, n_classes, templates, model_loc=None):
if model_loc is not None and path.isdir(model_loc):
model_loc = path.join(model_loc, 'model')
self._templates = templates
self.n_classes = n_classes
self._extractor = Extractor(templates)
self.n_feats = self._extractor.n_templ
@@ -60,6 +62,18 @@
if self.model_loc and path.exists(self.model_loc):
self._model.load(self.model_loc, freq_thresh=0)
def __reduce__(self):
_, model_loc = tempfile.mkstemp()  # mkstemp() returns (fd, path); keep the path
# TODO: This is a potentially buggy implementation. We're not really
# given a good guarantee that all internal state is saved correctly here,
# since there are learning parameters for e.g. the model averaging in
# the averaged perceptron, the gradient calculations in AdaGrad, etc.
# that aren't necessarily saved. So if we pickle a model part-way through
# training, we won't recover its state correctly.
self._model.dump(model_loc)
return (Model, (self.n_classes, self._templates, model_loc),
None, None)
def predict(self, Example eg):
self.set_scores(eg.c.scores, eg.c.atoms)
eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
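
The __reduce__ hook added here (and on Language, Matcher, Parser, Tagger, and TransitionSystem below) is the standard protocol for making extension types picklable: return a callable plus the arguments needed to rebuild the object. A toy sketch of the protocol, not spaCy's actual classes:

    import pickle

    class Stub(object):
        # Toy stand-in for a Cython extension type that pickle cannot
        # introspect attribute-by-attribute.
        def __init__(self, n_classes, model_loc=None):
            self.n_classes = n_classes
            self.model_loc = model_loc

        def __reduce__(self):
            # pickle records the callable and arguments, then calls
            # Stub(n_classes, model_loc) again at load time.
            return (Stub, (self.n_classes, self.model_loc), None, None)

    loaded = pickle.loads(pickle.dumps(Stub(10, '/tmp/model')))
    assert loaded.n_classes == 10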

View File

@@ -1,5 +1,6 @@
# Reserve 64 values for flag features
cpdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
IS_ASCII
IS_DIGIT
@@ -14,8 +15,7 @@ cpdef enum attr_id_t:
IS_STOP
IS_OOV
FLAG13 = 13
FLAG14
FLAG14 = 14
FLAG15
FLAG16
FLAG17
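
The numbering here is worth a second look: inserting NULL_ATTR at 0 shifts the implicit IS_*/LIKE_* values up by one, so IS_OOV now lands on 13 and deliberately aliases FLAG13 (cpdef enums follow C rules, which allow duplicate values); making FLAG14 = 14 explicit documents where the free flag block resumes. A plain-Python sketch of the shift:

    # Inserting a member at the front shifts every implicit value by one.
    before = {name: i for i, name in enumerate(['IS_ALPHA', 'IS_ASCII'])}
    after = {name: i for i, name in enumerate(['NULL_ATTR', 'IS_ALPHA', 'IS_ASCII'])}
    assert before['IS_ALPHA'] == 0 and after['IS_ALPHA'] == 1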

View File

@@ -0,0 +1,90 @@
IDS = {
"": NULL_ATTR,
"IS_ALPHA": IS_ALPHA,
"IS_ASCII": IS_ASCII,
"IS_DIGIT": IS_DIGIT,
"IS_LOWER": IS_LOWER,
"IS_PUNCT": IS_PUNCT,
"IS_SPACE": IS_SPACE,
"IS_TITLE": IS_TITLE,
"IS_UPPER": IS_UPPER,
"LIKE_URL": LIKE_URL,
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV": IS_OOV,
"FLAG14": FLAG14,
"FLAG15": FLAG15,
"FLAG16": FLAG16,
"FLAG17": FLAG17,
"FLAG18": FLAG18,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,
"FLAG22": FLAG22,
"FLAG23": FLAG23,
"FLAG24": FLAG24,
"FLAG25": FLAG25,
"FLAG26": FLAG26,
"FLAG27": FLAG27,
"FLAG28": FLAG28,
"FLAG29": FLAG29,
"FLAG30": FLAG30,
"FLAG31": FLAG31,
"FLAG32": FLAG32,
"FLAG33": FLAG33,
"FLAG34": FLAG34,
"FLAG35": FLAG35,
"FLAG36": FLAG36,
"FLAG37": FLAG37,
"FLAG38": FLAG38,
"FLAG39": FLAG39,
"FLAG40": FLAG40,
"FLAG41": FLAG41,
"FLAG42": FLAG42,
"FLAG43": FLAG43,
"FLAG44": FLAG44,
"FLAG45": FLAG45,
"FLAG46": FLAG46,
"FLAG47": FLAG47,
"FLAG48": FLAG48,
"FLAG49": FLAG49,
"FLAG50": FLAG50,
"FLAG51": FLAG51,
"FLAG52": FLAG52,
"FLAG53": FLAG53,
"FLAG54": FLAG54,
"FLAG55": FLAG55,
"FLAG56": FLAG56,
"FLAG57": FLAG57,
"FLAG58": FLAG58,
"FLAG59": FLAG59,
"FLAG60": FLAG60,
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
"NORM": NORM,
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"CLUSTER": CLUSTER,
"LEMMA": LEMMA,
"POS": POS,
"TAG": TAG,
"DEP": DEP,
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SPACY": SPACY,
"PROB": PROB,
}
# Attribute IDs, listed in order of their symbol values
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
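
IDS and NAMES give a two-way mapping; the list works as a reverse index because the values are dense from 0 (FLAG13 needs no entry above, since it aliases IS_OOV at value 13). The intended round-trip, sketched on a toy subset:

    IDS = {'': 0, 'IS_ALPHA': 1, 'IS_ASCII': 2}  # toy subset, dense from 0
    NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
    assert all(NAMES[value] == key for key, value in IDS.items())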

View File

@@ -207,6 +207,12 @@ class Language(object):
self.entity = entity
self.matcher = matcher
def __reduce__(self):
return (self.__class__,
(None, self.vocab, self.tokenizer, self.tagger, self.parser,
self.entity, self.matcher, None),
None, None)
def __call__(self, text, tag=True, parse=True, entity=True):
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string

View File

@@ -15,7 +15,7 @@ from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab
@@ -168,13 +168,7 @@ cdef class Matcher:
cdef Pool mem
cdef vector[Pattern*] patterns
cdef readonly Vocab vocab
def __init__(self, vocab, patterns):
self.vocab = vocab
self.mem = Pool()
self.vocab = vocab
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add(entity_key, etype, attrs, specs)
cdef object _patterns
@classmethod
def from_dir(cls, data_dir, Vocab vocab):
@@ -186,10 +180,22 @@
else:
return cls(vocab, {})
def __init__(self, vocab, patterns):
self.vocab = vocab
self.mem = Pool()
self.vocab = vocab
self._patterns = dict(patterns)
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add(entity_key, etype, attrs, specs)
def __reduce__(self):
return (self.__class__, (self.vocab, self._patterns), None, None)
property n_patterns:
def __get__(self): return self.patterns.size()
def add(self, entity_key, etype, attrs, specs):
self._patterns[entity_key] = (etype, dict(attrs), list(specs))
if isinstance(entity_key, basestring):
entity_key = self.vocab.strings[entity_key]
if isinstance(etype, basestring):
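
Keeping a plain-Python mirror of every pattern in _patterns is what makes the __reduce__ above workable: the compiled vector[Pattern*] cannot be pickled, but the original (etype, attrs, specs) inputs can be replayed through add() on load. The shape of that idiom, on a toy class:

    import pickle

    class ToyMatcher(object):
        # Toy sketch: keep picklable inputs alongside unpicklable compiled
        # state, and rebuild by replaying add() at construction time.
        def __init__(self, patterns):
            self._patterns = {}
            for key, spec in sorted(patterns.items()):
                self.add(key, spec)

        def add(self, key, spec):
            self._patterns[key] = spec
            # ... compile spec into C-level structures here ...

        def __reduce__(self):
            return (self.__class__, (self._patterns,), None, None)

    m = pickle.loads(pickle.dumps(ToyMatcher({'GOOGLE': ['Google', 'Inc.']})))
    assert m._patterns == {'GOOGLE': ['Google', 'Inc.']}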

View File

@@ -7,6 +7,7 @@ from .strings cimport StringStore
from .typedefs cimport attr_t
from .parts_of_speech cimport univ_pos_t
from . cimport symbols
cdef struct RichTagC:
uint64_t morph
@@ -24,6 +25,7 @@ cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef public object lemmatizer
cdef readonly object tag_map
cdef public object n_tags
cdef public object reverse_index
cdef public object tag_names
@@ -36,720 +38,252 @@ cdef class Morphology:
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
cpdef enum univ_morph_t:
NIL = 0
Animacy_anim = symbols.Animacy_anim
Animacy_inam
Aspect_freq
Aspect_imp
Aspect_mod
Aspect_none
Aspect_perf
Case_abe
Case_abl
Case_abs
Case_acc
Case_ade
Case_all
Case_cau
Case_com
Case_dat
Case_del
Case_dis
Case_ela
Case_ess
Case_gen
Case_ill
Case_ine
Case_ins
Case_loc
Case_lat
Case_nom
Case_par
Case_sub
Case_sup
Case_tem
Case_ter
Case_tra
Case_voc
Definite_two
Definite_def
Definite_red
Definite_ind
Degree_cmp
Degree_comp
Degree_none
Degree_pos
Degree_sup
Degree_abs
Degree_com
Degree_dim # du
Gender_com
Gender_fem
Gender_masc
Gender_neut
Mood_cnd
Mood_imp
Mood_ind
Mood_n
Mood_pot
Mood_sub
Mood_opt
Negative_neg
Negative_pos
Negative_yes
Number_com
Number_dual
Number_none
Number_plur
Number_sing
Number_ptan # bg
Number_count # bg
NumType_card
NumType_dist
NumType_frac
NumType_gen
NumType_mult
NumType_none
NumType_ord
NumType_sets
Person_one
Person_two
Person_three
Person_none
Poss_yes
PronType_advPart
PronType_art
PronType_default
PronType_dem
PronType_ind
PronType_int
PronType_neg
PronType_prs
PronType_rcp
PronType_rel
PronType_tot
PronType_clit
PronType_exc # es, ca, it, fa
Reflex_yes
Tense_fut
Tense_imp
Tense_past
Tense_pres
VerbForm_fin
VerbForm_ger
VerbForm_inf
VerbForm_none
VerbForm_part
VerbForm_partFut
VerbForm_partPast
VerbForm_partPres
VerbForm_sup
VerbForm_trans
VerbForm_gdv # la
Voice_act
Voice_cau
Voice_pass
Voice_mid # gkc
Voice_int # hb
Abbr_yes # cz, fi, sl, U
AdpType_prep # cz, U
AdpType_post # U
AdpType_voc # cz
AdpType_comprep # cz
AdpType_circ # U
AdvType_man
AdvType_loc
AdvType_tim
AdvType_deg
AdvType_cau
AdvType_mod
AdvType_sta
AdvType_ex
AdvType_adadj
ConjType_oper # cz, U
ConjType_comp # cz, U
Connegative_yes # fi
Derivation_minen # fi
Derivation_sti # fi
Derivation_inen # fi
Derivation_lainen # fi
Derivation_ja # fi
Derivation_ton # fi
Derivation_vs # fi
Derivation_ttain # fi
Derivation_ttaa # fi
Echo_rdp # U
Echo_ech # U
Foreign_foreign # cz, fi, U
Foreign_fscript # cz, fi, U
Foreign_tscript # cz, U
Foreign_yes # sl
Gender_dat_masc # bq, U
Gender_dat_fem # bq, U
Gender_erg_masc # bq
Gender_erg_fem # bq
Gender_psor_masc # cz, sl, U
Gender_psor_fem # cz, sl, U
Gender_psor_neut # sl
Hyph_yes # cz, U
InfForm_one # fi
InfForm_two # fi
InfForm_three # fi
NameType_geo # U, cz
NameType_prs # U, cz
NameType_giv # U, cz
NameType_sur # U, cz
NameType_nat # U, cz
NameType_com # U, cz
NameType_pro # U, cz
NameType_oth # U, cz
NounType_com # U
NounType_prop # U
NounType_class # U
Number_abs_sing # bq, U
Number_abs_plur # bq, U
Number_dat_sing # bq, U
Number_dat_plur # bq, U
Number_erg_sing # bq, U
Number_erg_plur # bq, U
Number_psee_sing # U
Number_psee_plur # U
Number_psor_sing # cz, fi, sl, U
Number_psor_plur # cz, fi, sl, U
NumForm_digit # cz, sl, U
NumForm_roman # cz, sl, U
NumForm_word # cz, sl, U
NumValue_one # cz, U
NumValue_two # cz, U
NumValue_three # cz, U
PartForm_pres # fi
PartForm_past # fi
PartForm_agt # fi
PartForm_neg # fi
PartType_mod # U
PartType_emp # U
PartType_res # U
PartType_inf # U
PartType_vbp # U
Person_abs_one # bq, U
Person_abs_two # bq, U
Person_abs_three # bq, U
Person_dat_one # bq, U
Person_dat_two # bq, U
Person_dat_three # bq, U
Person_erg_one # bq, U
Person_erg_two # bq, U
Person_erg_three # bq, U
Person_psor_one # fi, U
Person_psor_two # fi, U
Person_psor_three # fi, U
Polite_inf # bq, U
Polite_pol # bq, U
Polite_abs_inf # bq, U
Polite_abs_pol # bq, U
Polite_erg_inf # bq, U
Polite_erg_pol # bq, U
Polite_dat_inf # bq, U
Polite_dat_pol # bq, U
Prefix_yes # U
PrepCase_npr # cz
PrepCase_pre # U
PunctSide_ini # U
PunctSide_fin # U
PunctType_peri # U
PunctType_qest # U
PunctType_excl # U
PunctType_quot # U
PunctType_brck # U
PunctType_comm # U
PunctType_colo # U
PunctType_semi # U
PunctType_dash # U
Style_arch # cz, fi, U
Style_rare # cz, fi, U
Style_poet # cz, U
Style_norm # cz, U
Style_coll # cz, U
Style_vrnc # cz, U
Style_sing # cz, U
Style_expr # cz, U
Style_derg # cz, U
Style_vulg # cz, U
Style_yes # fi, U
StyleVariant_styleShort # cz
StyleVariant_styleBound # cz, sl
VerbType_aux # U
VerbType_cop # U
VerbType_mod # U
VerbType_light # U
#
#cpdef enum Feature_t:
# Abbr
# AdpType
# AdvType
# ConjType
# Connegative
# Derivation
# Echo
# Foreign
# Gender_dat
# Gender_erg
# Gender_psor
# Hyph
# InfForm
# NameType
# NounType
# NumberAbs
# NumberDat
# NumberErg
# NumberPsee
# NumberPsor
# NumForm
# NumValue
# PartForm
# PartType
# Person_abs
# Person_dat
# Person_psor
# Polite
# Polite_abs
# Polite_dat
# Prefix
# PrepCase
# PunctSide
# PunctType
# Style
# Typo
# Variant
# VerbType
#
#
#cpdef enum Animacy:
# Anim
# Inam
#
#
#cpdef enum Aspect:
# Freq
# Imp
# Mod
# None_
# Perf
#
#
#cpdef enum Case1:
# Nom
# Gen
# Acc
# Dat
# Voc
# Abl
#
#cdef enum Case2:
# Abe
# Abs
# Ade
# All
# Cau
# Com
# Del
# Dis
#
#cdef enum Case3:
# Ela
# Ess
# Ill
# Ine
# Ins
# Loc
# Lat
# Par
#
#cdef enum Case4:
# Sub
# Sup
# Tem
# Ter
# Tra
#
#
#cpdef enum Definite:
# Two
# Def
# Red
# Ind
#
#
#cpdef enum Degree:
# Cmp
# Comp
# None_
# Pos
# Sup
# Abs
# Com
# Degree # du
#
#
#cpdef enum Gender:
# Com
# Fem
# Masc
# Neut
#
#
#cpdef enum Mood:
# Cnd
# Imp
# Ind
# N
# Pot
# Sub
# Opt
#
#
#cpdef enum Negative:
# Neg
# Pos
# Yes
#
#
#cpdef enum Number:
# Com
# Dual
# None_
# Plur
# Sing
# Ptan # bg
# Count # bg
#
#
#cpdef enum NumType:
# Card
# Dist
# Frac
# Gen
# Mult
# None_
# Ord
# Sets
#
#
#cpdef enum Person:
# One
# Two
# Three
# None_
#
#
#cpdef enum Poss:
# Yes
#
#
#cpdef enum PronType1:
# AdvPart
# Art
# Default
# Dem
# Ind
# Int
# Neg
#
#cpdef enum PronType2:
# Prs
# Rcp
# Rel
# Tot
# Clit
# Exc # es, ca, it, fa
# Clit # it
#
#
#cpdef enum Reflex:
# Yes
#
#
#cpdef enum Tense:
# Fut
# Imp
# Past
# Pres
#
#cpdef enum VerbForm1:
# Fin
# Ger
# Inf
# None_
# Part
# PartFut
# PartPast
#
#cpdef enum VerbForm2:
# PartPres
# Sup
# Trans
# Gdv # la
#
#
#cpdef enum Voice:
# Act
# Cau
# Pass
# Mid # gkc
# Int # hb
#
#
#cpdef enum Abbr:
# Yes # cz, fi, sl, U
#
#cpdef enum AdpType:
# Prep # cz, U
# Post # U
# Voc # cz
# Comprep # cz
# Circ # U
# Voc # U
#
#
#cpdef enum AdvType1:
# # U
# Man
# Loc
# Tim
# Deg
# Cau
# Mod
# Sta
# Ex
#
#cpdef enum AdvType2:
# Adadj
#
#cpdef enum ConjType:
# Oper # cz, U
# Comp # cz, U
#
#cpdef enum Connegative:
# Yes # fi
#
#
#cpdef enum Derivation1:
# Minen # fi
# Sti # fi
# Inen # fi
# Lainen # fi
# Ja # fi
# Ton # fi
# Vs # fi
# Ttain # fi
#
#cpdef enum Derivation2:
# Ttaa
#
#
#cpdef enum Echo:
# Rdp # U
# Ech # U
#
#
#cpdef enum Foreign:
# Foreign # cz, fi, U
# Fscript # cz, fi, U
# Tscript # cz, U
# Yes # sl
#
#
#cpdef enum Gender_dat:
# Masc # bq, U
# Fem # bq, U
#
#
#cpdef enum Gender_erg:
# Masc # bq
# Fem # bq
#
#
#cpdef enum Gender_psor:
# Masc # cz, sl, U
# Fem # cz, sl, U
# Neut # sl
#
#
#cpdef enum Hyph:
# Yes # cz, U
#
#
#cpdef enum InfForm:
# One # fi
# Two # fi
# Three # fi
#
#
#cpdef enum NameType:
# Geo # U, cz
# Prs # U, cz
# Giv # U, cz
# Sur # U, cz
# Nat # U, cz
# Com # U, cz
# Pro # U, cz
# Oth # U, cz
#
#
#cpdef enum NounType:
# Com # U
# Prop # U
# Class # U
#
#cpdef enum Number_abs:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_dat:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_erg:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_psee:
# Sing # U
# Plur # U
#
#
#cpdef enum Number_psor:
# Sing # cz, fi, sl, U
# Plur # cz, fi, sl, U
#
#
#cpdef enum NumForm:
# Digit # cz, sl, U
# Roman # cz, sl, U
# Word # cz, sl, U
#
#
#cpdef enum NumValue:
# One # cz, U
# Two # cz, U
# Three # cz, U
#
#
#cpdef enum PartForm:
# Pres # fi
# Past # fi
# Agt # fi
# Neg # fi
#
#
#cpdef enum PartType:
# Mod # U
# Emp # U
# Res # U
# Inf # U
# Vbp # U
#
#cpdef enum Person_abs:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_dat:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_erg:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_psor:
# One # fi, U
# Two # fi, U
# Three # fi, U
#
#
#cpdef enum Polite:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_abs:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_erg:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_dat:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Prefix:
# Yes # U
#
#
#cpdef enum PrepCase:
# Npr # cz
# Pre # U
#
#
#cpdef enum PunctSide:
# Ini # U
# Fin # U
#
#cpdef enum PunctType1:
# Peri # U
# Qest # U
# Excl # U
# Quot # U
# Brck # U
# Comm # U
# Colo # U
# Semi # U
#
#cpdef enum PunctType2:
# Dash # U
#
#
#cpdef enum Style1:
# Arch # cz, fi, U
# Rare # cz, fi, U
# Poet # cz, U
# Norm # cz, U
# Coll # cz, U
# Vrnc # cz, U
# Sing # cz, U
# Expr # cz, U
#
#
#cpdef enum Style2:
# Derg # cz, U
# Vulg # cz, U
#
#
#cpdef enum Typo:
# Yes # fi, U
#
#
#cpdef enum Variant:
# Short # cz
# Bound # cz, sl
#
#
#cpdef enum VerbType:
# Aux # U
# Cop # U
# Mod # U
# Light # U
#
cpdef enum Value_t:
Animacy_Anim
Animacy_Inam
Aspect_Freq
Aspect_Imp
Aspect_Mod
Aspect_None_
Aspect_Perf
Case_Abe
Case_Abl
Case_Abs
Case_Acc
Case_Ade
Case_All
Case_Cau
Case_Com
Case_Dat
Case_Del
Case_Dis
Case_Ela
Case_Ess
Case_Gen
Case_Ill
Case_Ine
Case_Ins
Case_Loc
Case_Lat
Case_Nom
Case_Par
Case_Sub
Case_Sup
Case_Tem
Case_Ter
Case_Tra
Case_Voc
Definite_Two
Definite_Def
Definite_Red
Definite_Ind
Degree_Cmp
Degree_Comp
Degree_None
Degree_Pos
Degree_Sup
Degree_Abs
Degree_Com
Degree_Dim # du
Gender_Com
Gender_Fem
Gender_Masc
Gender_Neut
Mood_Cnd
Mood_Imp
Mood_Ind
Mood_N
Mood_Pot
Mood_Sub
Mood_Opt
Negative_Neg
Negative_Pos
Negative_Yes
Number_Com
Number_Dual
Number_None
Number_Plur
Number_Sing
Number_Ptan # bg
Number_Count # bg
NumType_Card
NumType_Dist
NumType_Frac
NumType_Gen
NumType_Mult
NumType_None
NumType_Ord
NumType_Sets
Person_One
Person_Two
Person_Three
Person_None
Poss_Yes
PronType_AdvPart
PronType_Art
PronType_Default
PronType_Dem
PronType_Ind
PronType_Int
PronType_Neg
PronType_Prs
PronType_Rcp
PronType_Rel
PronType_Tot
PronType_Clit
PronType_Exc # es, ca, it, fa
Reflex_Yes
Tense_Fut
Tense_Imp
Tense_Past
Tense_Pres
VerbForm_Fin
VerbForm_Ger
VerbForm_Inf
VerbForm_None
VerbForm_Part
VerbForm_PartFut
VerbForm_PartPast
VerbForm_PartPres
VerbForm_Sup
VerbForm_Trans
VerbForm_Gdv # la
Voice_Act
Voice_Cau
Voice_Pass
Voice_Mid # gkc
Voice_Int # hb
Abbr_Yes # cz, fi, sl, U
AdpType_Prep # cz, U
AdpType_Post # U
AdpType_Voc # cz
AdpType_Comprep # cz
AdpType_Circ # U
AdvType_Man
AdvType_Loc
AdvType_Tim
AdvType_Deg
AdvType_Cau
AdvType_Mod
AdvType_Sta
AdvType_Ex
AdvType_Adadj
ConjType_Oper # cz, U
ConjType_Comp # cz, U
Connegative_Yes # fi
Derivation_Minen # fi
Derivation_Sti # fi
Derivation_Inen # fi
Derivation_Lainen # fi
Derivation_Ja # fi
Derivation_Ton # fi
Derivation_Vs # fi
Derivation_Ttain # fi
Derivation_Ttaa # fi
Echo_Rdp # U
Echo_Ech # U
Foreign_Foreign # cz, fi, U
Foreign_Fscript # cz, fi, U
Foreign_Tscript # cz, U
Foreign_Yes # sl
Gender_dat_Masc # bq, U
Gender_dat_Fem # bq, U
Gender_erg_Masc # bq
Gender_erg_Fem # bq
Gender_psor_Masc # cz, sl, U
Gender_psor_Fem # cz, sl, U
Gender_psor_Neut # sl
Hyph_Yes # cz, U
InfForm_One # fi
InfForm_Two # fi
InfForm_Three # fi
NameType_Geo # U, cz
NameType_Prs # U, cz
NameType_Giv # U, cz
NameType_Sur # U, cz
NameType_Nat # U, cz
NameType_Com # U, cz
NameType_Pro # U, cz
NameType_Oth # U, cz
NounType_Com # U
NounType_Prop # U
NounType_Class # U
Number_abs_Sing # bq, U
Number_abs_Plur # bq, U
Number_dat_Sing # bq, U
Number_dat_Plur # bq, U
Number_erg_Sing # bq, U
Number_erg_Plur # bq, U
Number_psee_Sing # U
Number_psee_Plur # U
Number_psor_Sing # cz, fi, sl, U
Number_psor_Plur # cz, fi, sl, U
NumForm_Digit # cz, sl, U
NumForm_Roman # cz, sl, U
NumForm_Word # cz, sl, U
NumValue_One # cz, U
NumValue_Two # cz, U
NumValue_Three # cz, U
PartForm_Pres # fi
PartForm_Past # fi
PartForm_Agt # fi
PartForm_Neg # fi
PartType_Mod # U
PartType_Emp # U
PartType_Res # U
PartType_Inf # U
PartType_Vbp # U
Person_abs_One # bq, U
Person_abs_Two # bq, U
Person_abs_Three # bq, U
Person_dat_One # bq, U
Person_dat_Two # bq, U
Person_dat_Three # bq, U
Person_erg_One # bq, U
Person_erg_Two # bq, U
Person_erg_Three # bq, U
Person_psor_One # fi, U
Person_psor_Two # fi, U
Person_psor_Three # fi, U
Polite_Inf # bq, U
Polite_Pol # bq, U
Polite_abs_Inf # bq, U
Polite_abs_Pol # bq, U
Polite_erg_Inf # bq, U
Polite_erg_Pol # bq, U
Polite_dat_Inf # bq, U
Polite_dat_Pol # bq, U
Prefix_Yes # U
PrepCase_Npr # cz
PrepCase_Pre # U
PunctSide_Ini # U
PunctSide_Fin # U
PunctType_Peri # U
PunctType_Qest # U
PunctType_Excl # U
PunctType_Quot # U
PunctType_Brck # U
PunctType_Comm # U
PunctType_Colo # U
PunctType_Semi # U
PunctType_Dash # U
Style_Arch # cz, fi, U
Style_Rare # cz, fi, U
Style_Poet # cz, U
Style_Norm # cz, U
Style_Coll # cz, U
Style_Vrnc # cz, U
Style_Sing # cz, U
Style_Expr # cz, U
Style_Derg # cz, U
Style_Vulg # cz, U
Style_Yes # fi, U
StyleVariant_StyleShort # cz
StyleVariant_StyleBound # cz, sl
VerbType_Aux # U
VerbType_Cop # U
VerbType_Mod # U
VerbType_Light # U

View File

@@ -6,7 +6,7 @@ try:
except ImportError:
import json
from .parts_of_speech import UNIV_POS_NAMES
from .parts_of_speech import IDS as POS_IDS
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
@@ -14,6 +14,7 @@ cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer):
self.mem = Pool()
self.strings = string_store
self.tag_map = tag_map
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) + 1
self.tag_names = tuple(sorted(tag_map.keys()))
@@ -24,10 +25,13 @@
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
self.rich_tags[i].morph = 0
self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()]
self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
self.reverse_index[self.rich_tags[i].name] = i
self._cache = PreshMapArray(self.n_tags)
def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
cdef int assign_tag(self, TokenC* token, tag) except -1:
cdef int tag_id
if isinstance(tag, basestring):
@@ -89,3 +93,254 @@ cdef class Morphology:
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string]
return lemma
IDS = {
"Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam,
"Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod,
"Aspect_none": Aspect_none,
"Aspect_perf": Aspect_perf,
"Case_abe": Case_abe,
"Case_abl": Case_abl,
"Case_abs": Case_abs,
"Case_acc": Case_acc,
"Case_ade": Case_ade,
"Case_all": Case_all,
"Case_cau": Case_cau,
"Case_com": Case_com,
"Case_dat": Case_dat,
"Case_del": Case_del,
"Case_dis": Case_dis,
"Case_ela": Case_ela,
"Case_ess": Case_ess,
"Case_gen": Case_gen,
"Case_ill": Case_ill,
"Case_ine": Case_ine,
"Case_ins": Case_ins,
"Case_loc": Case_loc,
"Case_lat": Case_lat,
"Case_nom": Case_nom,
"Case_par": Case_par,
"Case_sub": Case_sub,
"Case_sup": Case_sup,
"Case_tem": Case_tem,
"Case_ter": Case_ter,
"Case_tra": Case_tra,
"Case_voc": Case_voc,
"Definite_two": Definite_two,
"Definite_def": Definite_def,
"Definite_red": Definite_red,
"Definite_ind": Definite_ind,
"Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp,
"Degree_none": Degree_none,
"Degree_pos": Degree_pos,
"Degree_sup": Degree_sup,
"Degree_abs": Degree_abs,
"Degree_com": Degree_com,
"Degree_dim ": Degree_dim, # du
"Gender_com": Gender_com,
"Gender_fem": Gender_fem,
"Gender_masc": Gender_masc,
"Gender_neut": Gender_neut,
"Mood_cnd": Mood_cnd,
"Mood_imp": Mood_imp,
"Mood_ind": Mood_ind,
"Mood_n": Mood_n,
"Mood_pot": Mood_pot,
"Mood_sub": Mood_sub,
"Mood_opt": Mood_opt,
"Negative_neg": Negative_neg,
"Negative_pos": Negative_pos,
"Negative_yes": Negative_yes,
"Number_com": Number_com,
"Number_dual": Number_dual,
"Number_none": Number_none,
"Number_plur": Number_plur,
"Number_sing": Number_sing,
"Number_ptan ": Number_ptan, # bg
"Number_count ": Number_count, # bg
"NumType_card": NumType_card,
"NumType_dist": NumType_dist,
"NumType_frac": NumType_frac,
"NumType_gen": NumType_gen,
"NumType_mult": NumType_mult,
"NumType_none": NumType_none,
"NumType_ord": NumType_ord,
"NumType_sets": NumType_sets,
"Person_one": Person_one,
"Person_two": Person_two,
"Person_three": Person_three,
"Person_none": Person_none,
"Poss_yes": Poss_yes,
"PronType_advPart": PronType_advPart,
"PronType_art": PronType_art,
"PronType_default": PronType_default,
"PronType_dem": PronType_dem,
"PronType_ind": PronType_ind,
"PronType_int": PronType_int,
"PronType_neg": PronType_neg,
"PronType_prs": PronType_prs,
"PronType_rcp": PronType_rcp,
"PronType_rel": PronType_rel,
"PronType_tot": PronType_tot,
"PronType_clit": PronType_clit,
"PronType_exc ": PronType_exc, # es, ca, it, fa,
"Reflex_yes": Reflex_yes,
"Tense_fut": Tense_fut,
"Tense_imp": Tense_imp,
"Tense_past": Tense_past,
"Tense_pres": Tense_pres,
"VerbForm_fin": VerbForm_fin,
"VerbForm_ger": VerbForm_ger,
"VerbForm_inf": VerbForm_inf,
"VerbForm_none": VerbForm_none,
"VerbForm_part": VerbForm_part,
"VerbForm_partFut": VerbForm_partFut,
"VerbForm_partPast": VerbForm_partPast,
"VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans,
"VerbForm_gdv ": VerbForm_gdv, # la,
"Voice_act": Voice_act,
"Voice_cau": Voice_cau,
"Voice_pass": Voice_pass,
"Voice_mid ": Voice_mid, # gkc,
"Voice_int ": Voice_int, # hb,
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
"AdpType_prep ": AdpType_prep, # cz, U,
"AdpType_post ": AdpType_post, # U,
"AdpType_voc ": AdpType_voc, # cz,
"AdpType_comprep ": AdpType_comprep, # cz,
"AdpType_circ ": AdpType_circ, # U,
"AdvType_man": AdvType_man,
"AdvType_loc": AdvType_loc,
"AdvType_tim": AdvType_tim,
"AdvType_deg": AdvType_deg,
"AdvType_cau": AdvType_cau,
"AdvType_mod": AdvType_mod,
"AdvType_sta": AdvType_sta,
"AdvType_ex": AdvType_ex,
"AdvType_adadj": AdvType_adadj,
"ConjType_oper ": ConjType_oper, # cz, U,
"ConjType_comp ": ConjType_comp, # cz, U,
"Connegative_yes ": Connegative_yes, # fi,
"Derivation_minen ": Derivation_minen, # fi,
"Derivation_sti ": Derivation_sti, # fi,
"Derivation_inen ": Derivation_inen, # fi,
"Derivation_lainen ": Derivation_lainen, # fi,
"Derivation_ja ": Derivation_ja, # fi,
"Derivation_ton ": Derivation_ton, # fi,
"Derivation_vs ": Derivation_vs, # fi,
"Derivation_ttain ": Derivation_ttain, # fi,
"Derivation_ttaa ": Derivation_ttaa, # fi,
"Echo_rdp ": Echo_rdp, # U,
"Echo_ech ": Echo_ech, # U,
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
"Foreign_tscript ": Foreign_tscript, # cz, U,
"Foreign_yes ": Foreign_yes, # sl,
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
"Gender_erg_masc ": Gender_erg_masc, # bq,
"Gender_erg_fem ": Gender_erg_fem, # bq,
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
"Gender_psor_neut ": Gender_psor_neut, # sl,
"Hyph_yes ": Hyph_yes, # cz, U,
"InfForm_one ": InfForm_one, # fi,
"InfForm_two ": InfForm_two, # fi,
"InfForm_three ": InfForm_three, # fi,
"NameType_geo ": NameType_geo, # U, cz,
"NameType_prs ": NameType_prs, # U, cz,
"NameType_giv ": NameType_giv, # U, cz,
"NameType_sur ": NameType_sur, # U, cz,
"NameType_nat ": NameType_nat, # U, cz,
"NameType_com ": NameType_com, # U, cz,
"NameType_pro ": NameType_pro, # U, cz,
"NameType_oth ": NameType_oth, # U, cz,
"NounType_com ": NounType_com, # U,
"NounType_prop ": NounType_prop, # U,
"NounType_class ": NounType_class, # U,
"Number_abs_sing ": Number_abs_sing, # bq, U,
"Number_abs_plur ": Number_abs_plur, # bq, U,
"Number_dat_sing ": Number_dat_sing, # bq, U,
"Number_dat_plur ": Number_dat_plur, # bq, U,
"Number_erg_sing ": Number_erg_sing, # bq, U,
"Number_erg_plur ": Number_erg_plur, # bq, U,
"Number_psee_sing ": Number_psee_sing, # U,
"Number_psee_plur ": Number_psee_plur, # U,
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
"NumForm_digit ": NumForm_digit, # cz, sl, U,
"NumForm_roman ": NumForm_roman, # cz, sl, U,
"NumForm_word ": NumForm_word, # cz, sl, U,
"NumValue_one ": NumValue_one, # cz, U,
"NumValue_two ": NumValue_two, # cz, U,
"NumValue_three ": NumValue_three, # cz, U,
"PartForm_pres ": PartForm_pres, # fi,
"PartForm_past ": PartForm_past, # fi,
"PartForm_agt ": PartForm_agt, # fi,
"PartForm_neg ": PartForm_neg, # fi,
"PartType_mod ": PartType_mod, # U,
"PartType_emp ": PartType_emp, # U,
"PartType_res ": PartType_res, # U,
"PartType_inf ": PartType_inf, # U,
"PartType_vbp ": PartType_vbp, # U,
"Person_abs_one ": Person_abs_one, # bq, U,
"Person_abs_two ": Person_abs_two, # bq, U,
"Person_abs_three ": Person_abs_three, # bq, U,
"Person_dat_one ": Person_dat_one, # bq, U,
"Person_dat_two ": Person_dat_two, # bq, U,
"Person_dat_three ": Person_dat_three, # bq, U,
"Person_erg_one ": Person_erg_one, # bq, U,
"Person_erg_two ": Person_erg_two, # bq, U,
"Person_erg_three ": Person_erg_three, # bq, U,
"Person_psor_one ": Person_psor_one, # fi, U,
"Person_psor_two ": Person_psor_two, # fi, U,
"Person_psor_three ": Person_psor_three, # fi, U,
"Polite_inf ": Polite_inf, # bq, U,
"Polite_pol ": Polite_pol, # bq, U,
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
"Prefix_yes ": Prefix_yes, # U,
"PrepCase_npr ": PrepCase_npr, # cz,
"PrepCase_pre ": PrepCase_pre, # U,
"PunctSide_ini ": PunctSide_ini, # U,
"PunctSide_fin ": PunctSide_fin, # U,
"PunctType_peri ": PunctType_peri, # U,
"PunctType_qest ": PunctType_qest, # U,
"PunctType_excl ": PunctType_excl, # U,
"PunctType_quot ": PunctType_quot, # U,
"PunctType_brck ": PunctType_brck, # U,
"PunctType_comm ": PunctType_comm, # U,
"PunctType_colo ": PunctType_colo, # U,
"PunctType_semi ": PunctType_semi, # U,
"PunctType_dash ": PunctType_dash, # U,
"Style_arch ": Style_arch, # cz, fi, U,
"Style_rare ": Style_rare, # cz, fi, U,
"Style_poet ": Style_poet, # cz, U,
"Style_norm ": Style_norm, # cz, U,
"Style_coll ": Style_coll, # cz, U,
"Style_vrnc ": Style_vrnc, # cz, U,
"Style_sing ": Style_sing, # cz, U,
"Style_expr ": Style_expr, # cz, U,
"Style_derg ": Style_derg, # cz, U,
"Style_vulg ": Style_vulg, # cz, U,
"Style_yes ": Style_yes, # fi, U,
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
"VerbType_aux ": VerbType_aux, # U,
"VerbType_cop ": VerbType_cop, # U,
"VerbType_mod ": VerbType_mod, # U,
"VerbType_light ": VerbType_light, # U,
}
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]

View File

@@ -1,7 +1,8 @@
# Google universal tag set
from . cimport symbols
cpdef enum univ_pos_t:
NO_TAG
ADJ
NO_TAG = 0
ADJ = symbols.ADJ
ADP
ADV
AUX
@@ -20,4 +21,3 @@ cpdef enum univ_pos_t:
X
EOL
SPACE
N_UNIV_TAGS

View File

@@ -1,8 +1,8 @@
from __future__ import unicode_literals
UNIV_POS_NAMES = {
"NO_TAG": NO_TAG,
IDS = {
"": NO_TAG,
"ADJ": ADJ,
"ADP": ADP,
"ADV": ADV,
@@ -23,3 +23,6 @@ UNIV_POS_NAMES = {
"EOL": EOL,
"SPACE": SPACE
}
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]

View File

@@ -69,12 +69,15 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
cdef class StringStore:
'''Map strings to and from integer IDs.'''
def __init__(self):
def __init__(self, strings=None):
self.mem = Pool()
self._map = PreshMap()
self._resize_at = 10000
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
self.size = 1
if strings is not None:
for string in strings:
_ = self[string]
property size:
def __get__(self):
@@ -113,6 +116,14 @@ cdef class StringStore:
for i in range(self.size):
yield self[i]
def __reduce__(self):
strings = [""]
for i in range(1, self.size):
string = &self.c[i]
py_string = _decode(string)
strings.append(py_string)
return (StringStore, (strings,), None, None, None)
cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
# 0 means missing, but we don't bother offsetting the index.
key = hash64(chars, length * sizeof(char), 0)
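
StringStore pickles itself as the ordered list of its contents, and the new strings= argument to __init__ re-interns them in the same order, so every string keeps its original ID across a pickle round-trip. A toy dict-based model of that invariant:

    # Re-adding strings in their saved order reproduces the original IDs.
    saved = ['', 'the', 'of', 'aardvark']  # index == original ID; 0 reserved
    rebuilt = {}
    for string in saved[1:]:
        rebuilt.setdefault(string, len(rebuilt) + 1)
    assert all(rebuilt[s] == i for i, s in enumerate(saved) if i)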

spacy/symbols.pxd (new file, 421 lines)
View File

@@ -0,0 +1,421 @@
cpdef enum symbol_t:
NIL
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUM
LIKE_EMAIL
IS_STOP
IS_OOV
FLAG14
FLAG15
FLAG16
FLAG17
FLAG18
FLAG19
FLAG20
FLAG21
FLAG22
FLAG23
FLAG24
FLAG25
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
ID
ORTH
LOWER
NORM
SHAPE
PREFIX
SUFFIX
LENGTH
CLUSTER
LEMMA
POS
TAG
DEP
ENT_IOB
ENT_TYPE
HEAD
SPACY
PROB
ADJ
ADP
ADV
AUX
CONJ
DET
INTJ
NOUN
NUM
PART
PRON
PROPN
PUNCT
SCONJ
SYM
VERB
X
EOL
SPACE
Animacy_anim
Animacy_inam
Aspect_freq
Aspect_imp
Aspect_mod
Aspect_none
Aspect_perf
Case_abe
Case_abl
Case_abs
Case_acc
Case_ade
Case_all
Case_cau
Case_com
Case_dat
Case_del
Case_dis
Case_ela
Case_ess
Case_gen
Case_ill
Case_ine
Case_ins
Case_loc
Case_lat
Case_nom
Case_par
Case_sub
Case_sup
Case_tem
Case_ter
Case_tra
Case_voc
Definite_two
Definite_def
Definite_red
Definite_ind
Degree_cmp
Degree_comp
Degree_none
Degree_pos
Degree_sup
Degree_abs
Degree_com
Degree_dim # du
Gender_com
Gender_fem
Gender_masc
Gender_neut
Mood_cnd
Mood_imp
Mood_ind
Mood_n
Mood_pot
Mood_sub
Mood_opt
Negative_neg
Negative_pos
Negative_yes
Number_com
Number_dual
Number_none
Number_plur
Number_sing
Number_ptan # bg
Number_count # bg
NumType_card
NumType_dist
NumType_frac
NumType_gen
NumType_mult
NumType_none
NumType_ord
NumType_sets
Person_one
Person_two
Person_three
Person_none
Poss_yes
PronType_advPart
PronType_art
PronType_default
PronType_dem
PronType_ind
PronType_int
PronType_neg
PronType_prs
PronType_rcp
PronType_rel
PronType_tot
PronType_clit
PronType_exc # es, ca, it, fa
Reflex_yes
Tense_fut
Tense_imp
Tense_past
Tense_pres
VerbForm_fin
VerbForm_ger
VerbForm_inf
VerbForm_none
VerbForm_part
VerbForm_partFut
VerbForm_partPast
VerbForm_partPres
VerbForm_sup
VerbForm_trans
VerbForm_gdv # la
Voice_act
Voice_cau
Voice_pass
Voice_mid # gkc
Voice_int # hb
Abbr_yes # cz, fi, sl, U
AdpType_prep # cz, U
AdpType_post # U
AdpType_voc # cz
AdpType_comprep # cz
AdpType_circ # U
AdvType_man
AdvType_loc
AdvType_tim
AdvType_deg
AdvType_cau
AdvType_mod
AdvType_sta
AdvType_ex
AdvType_adadj
ConjType_oper # cz, U
ConjType_comp # cz, U
Connegative_yes # fi
Derivation_minen # fi
Derivation_sti # fi
Derivation_inen # fi
Derivation_lainen # fi
Derivation_ja # fi
Derivation_ton # fi
Derivation_vs # fi
Derivation_ttain # fi
Derivation_ttaa # fi
Echo_rdp # U
Echo_ech # U
Foreign_foreign # cz, fi, U
Foreign_fscript # cz, fi, U
Foreign_tscript # cz, U
Foreign_yes # sl
Gender_dat_masc # bq, U
Gender_dat_fem # bq, U
Gender_erg_masc # bq
Gender_erg_fem # bq
Gender_psor_masc # cz, sl, U
Gender_psor_fem # cz, sl, U
Gender_psor_neut # sl
Hyph_yes # cz, U
InfForm_one # fi
InfForm_two # fi
InfForm_three # fi
NameType_geo # U, cz
NameType_prs # U, cz
NameType_giv # U, cz
NameType_sur # U, cz
NameType_nat # U, cz
NameType_com # U, cz
NameType_pro # U, cz
NameType_oth # U, cz
NounType_com # U
NounType_prop # U
NounType_class # U
Number_abs_sing # bq, U
Number_abs_plur # bq, U
Number_dat_sing # bq, U
Number_dat_plur # bq, U
Number_erg_sing # bq, U
Number_erg_plur # bq, U
Number_psee_sing # U
Number_psee_plur # U
Number_psor_sing # cz, fi, sl, U
Number_psor_plur # cz, fi, sl, U
NumForm_digit # cz, sl, U
NumForm_roman # cz, sl, U
NumForm_word # cz, sl, U
NumValue_one # cz, U
NumValue_two # cz, U
NumValue_three # cz, U
PartForm_pres # fi
PartForm_past # fi
PartForm_agt # fi
PartForm_neg # fi
PartType_mod # U
PartType_emp # U
PartType_res # U
PartType_inf # U
PartType_vbp # U
Person_abs_one # bq, U
Person_abs_two # bq, U
Person_abs_three # bq, U
Person_dat_one # bq, U
Person_dat_two # bq, U
Person_dat_three # bq, U
Person_erg_one # bq, U
Person_erg_two # bq, U
Person_erg_three # bq, U
Person_psor_one # fi, U
Person_psor_two # fi, U
Person_psor_three # fi, U
Polite_inf # bq, U
Polite_pol # bq, U
Polite_abs_inf # bq, U
Polite_abs_pol # bq, U
Polite_erg_inf # bq, U
Polite_erg_pol # bq, U
Polite_dat_inf # bq, U
Polite_dat_pol # bq, U
Prefix_yes # U
PrepCase_npr # cz
PrepCase_pre # U
PunctSide_ini # U
PunctSide_fin # U
PunctType_peri # U
PunctType_qest # U
PunctType_excl # U
PunctType_quot # U
PunctType_brck # U
PunctType_comm # U
PunctType_colo # U
PunctType_semi # U
PunctType_dash # U
Style_arch # cz, fi, U
Style_rare # cz, fi, U
Style_poet # cz, U
Style_norm # cz, U
Style_coll # cz, U
Style_vrnc # cz, U
Style_sing # cz, U
Style_expr # cz, U
Style_derg # cz, U
Style_vulg # cz, U
Style_yes # fi, U
StyleVariant_styleShort # cz
StyleVariant_styleBound # cz, sl
VerbType_aux # U
VerbType_cop # U
VerbType_mod # U
VerbType_light # U
PERSON
NORP
FACILITY
ORG
GPE
LOC
PRODUCT
EVENT
WORK_OF_ART
LANGUAGE
DATE
TIME
PERCENT
MONEY
QUANTITY
ORDINAL
CARDINAL
acomp
advcl
advmod
agent
amod
appos
attr
aux
auxpass
cc
ccomp
complm
conj
csubj
csubjpass
dep
det
dobj
expl
hmod
hyph
infmod
intj
iobj
mark
meta
neg
nmod
nn
npadvmod
nsubj
nsubjpass
num
number
oprd
parataxis
partmod
pcomp
pobj
poss
possessive
preconj
prep
prt
punct
quantmod
rcmod
root
xcomp

spacy/symbols.pyx (new file, 424 lines)
View File

@@ -0,0 +1,424 @@
IDS = {
"": NIL,
"IS_ALPHA": IS_ALPHA,
"IS_ASCII": IS_ASCII,
"IS_DIGIT": IS_DIGIT,
"IS_LOWER": IS_LOWER,
"IS_PUNCT": IS_PUNCT,
"IS_SPACE": IS_SPACE,
"IS_TITLE": IS_TITLE,
"IS_UPPER": IS_UPPER,
"LIKE_URL": LIKE_URL,
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV": IS_OOV,
"FLAG14": FLAG14,
"FLAG15": FLAG15,
"FLAG16": FLAG16,
"FLAG17": FLAG17,
"FLAG18": FLAG18,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,
"FLAG22": FLAG22,
"FLAG23": FLAG23,
"FLAG24": FLAG24,
"FLAG25": FLAG25,
"FLAG26": FLAG26,
"FLAG27": FLAG27,
"FLAG28": FLAG28,
"FLAG29": FLAG29,
"FLAG30": FLAG30,
"FLAG31": FLAG31,
"FLAG32": FLAG32,
"FLAG33": FLAG33,
"FLAG34": FLAG34,
"FLAG35": FLAG35,
"FLAG36": FLAG36,
"FLAG37": FLAG37,
"FLAG38": FLAG38,
"FLAG39": FLAG39,
"FLAG40": FLAG40,
"FLAG41": FLAG41,
"FLAG42": FLAG42,
"FLAG43": FLAG43,
"FLAG44": FLAG44,
"FLAG45": FLAG45,
"FLAG46": FLAG46,
"FLAG47": FLAG47,
"FLAG48": FLAG48,
"FLAG49": FLAG49,
"FLAG50": FLAG50,
"FLAG51": FLAG51,
"FLAG52": FLAG52,
"FLAG53": FLAG53,
"FLAG54": FLAG54,
"FLAG55": FLAG55,
"FLAG56": FLAG56,
"FLAG57": FLAG57,
"FLAG58": FLAG58,
"FLAG59": FLAG59,
"FLAG60": FLAG60,
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
"NORM": NORM,
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"CLUSTER": CLUSTER,
"LEMMA": LEMMA,
"POS": POS,
"TAG": TAG,
"DEP": DEP,
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SPACY": SPACY,
"PROB": PROB,
"ADJ": ADJ,
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
"NUM": NUM,
"PART": PART,
"PRON": PRON,
"PROPN": PROPN,
"PUNCT": PUNCT,
"SCONJ": SCONJ,
"SYM": SYM,
"VERB": VERB,
"X": X,
"EOL": EOL,
"SPACE": SPACE,
"Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam,
"Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod,
"Aspect_none": Aspect_none,
"Aspect_perf": Aspect_perf,
"Case_abe": Case_abe,
"Case_abl": Case_abl,
"Case_abs": Case_abs,
"Case_acc": Case_acc,
"Case_ade": Case_ade,
"Case_all": Case_all,
"Case_cau": Case_cau,
"Case_com": Case_com,
"Case_dat": Case_dat,
"Case_del": Case_del,
"Case_dis": Case_dis,
"Case_ela": Case_ela,
"Case_ess": Case_ess,
"Case_gen": Case_gen,
"Case_ill": Case_ill,
"Case_ine": Case_ine,
"Case_ins": Case_ins,
"Case_loc": Case_loc,
"Case_lat": Case_lat,
"Case_nom": Case_nom,
"Case_par": Case_par,
"Case_sub": Case_sub,
"Case_sup": Case_sup,
"Case_tem": Case_tem,
"Case_ter": Case_ter,
"Case_tra": Case_tra,
"Case_voc": Case_voc,
"Definite_two": Definite_two,
"Definite_def": Definite_def,
"Definite_red": Definite_red,
"Definite_ind": Definite_ind,
"Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp,
"Degree_none": Degree_none,
"Degree_pos": Degree_pos,
"Degree_sup": Degree_sup,
"Degree_abs": Degree_abs,
"Degree_com": Degree_com,
"Degree_dim ": Degree_dim, # du
"Gender_com": Gender_com,
"Gender_fem": Gender_fem,
"Gender_masc": Gender_masc,
"Gender_neut": Gender_neut,
"Mood_cnd": Mood_cnd,
"Mood_imp": Mood_imp,
"Mood_ind": Mood_ind,
"Mood_n": Mood_n,
"Mood_pot": Mood_pot,
"Mood_sub": Mood_sub,
"Mood_opt": Mood_opt,
"Negative_neg": Negative_neg,
"Negative_pos": Negative_pos,
"Negative_yes": Negative_yes,
"Number_com": Number_com,
"Number_dual": Number_dual,
"Number_none": Number_none,
"Number_plur": Number_plur,
"Number_sing": Number_sing,
"Number_ptan ": Number_ptan, # bg
"Number_count ": Number_count, # bg
"NumType_card": NumType_card,
"NumType_dist": NumType_dist,
"NumType_frac": NumType_frac,
"NumType_gen": NumType_gen,
"NumType_mult": NumType_mult,
"NumType_none": NumType_none,
"NumType_ord": NumType_ord,
"NumType_sets": NumType_sets,
"Person_one": Person_one,
"Person_two": Person_two,
"Person_three": Person_three,
"Person_none": Person_none,
"Poss_yes": Poss_yes,
"PronType_advPart": PronType_advPart,
"PronType_art": PronType_art,
"PronType_default": PronType_default,
"PronType_dem": PronType_dem,
"PronType_ind": PronType_ind,
"PronType_int": PronType_int,
"PronType_neg": PronType_neg,
"PronType_prs": PronType_prs,
"PronType_rcp": PronType_rcp,
"PronType_rel": PronType_rel,
"PronType_tot": PronType_tot,
"PronType_clit": PronType_clit,
"PronType_exc ": PronType_exc, # es, ca, it, fa,
"Reflex_yes": Reflex_yes,
"Tense_fut": Tense_fut,
"Tense_imp": Tense_imp,
"Tense_past": Tense_past,
"Tense_pres": Tense_pres,
"VerbForm_fin": VerbForm_fin,
"VerbForm_ger": VerbForm_ger,
"VerbForm_inf": VerbForm_inf,
"VerbForm_none": VerbForm_none,
"VerbForm_part": VerbForm_part,
"VerbForm_partFut": VerbForm_partFut,
"VerbForm_partPast": VerbForm_partPast,
"VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans,
"VerbForm_gdv ": VerbForm_gdv, # la,
"Voice_act": Voice_act,
"Voice_cau": Voice_cau,
"Voice_pass": Voice_pass,
"Voice_mid ": Voice_mid, # gkc,
"Voice_int ": Voice_int, # hb,
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
"AdpType_prep ": AdpType_prep, # cz, U,
"AdpType_post ": AdpType_post, # U,
"AdpType_voc ": AdpType_voc, # cz,
"AdpType_comprep ": AdpType_comprep, # cz,
"AdpType_circ ": AdpType_circ, # U,
"AdvType_man": AdvType_man,
"AdvType_loc": AdvType_loc,
"AdvType_tim": AdvType_tim,
"AdvType_deg": AdvType_deg,
"AdvType_cau": AdvType_cau,
"AdvType_mod": AdvType_mod,
"AdvType_sta": AdvType_sta,
"AdvType_ex": AdvType_ex,
"AdvType_adadj": AdvType_adadj,
"ConjType_oper ": ConjType_oper, # cz, U,
"ConjType_comp ": ConjType_comp, # cz, U,
"Connegative_yes ": Connegative_yes, # fi,
"Derivation_minen ": Derivation_minen, # fi,
"Derivation_sti ": Derivation_sti, # fi,
"Derivation_inen ": Derivation_inen, # fi,
"Derivation_lainen ": Derivation_lainen, # fi,
"Derivation_ja ": Derivation_ja, # fi,
"Derivation_ton ": Derivation_ton, # fi,
"Derivation_vs ": Derivation_vs, # fi,
"Derivation_ttain ": Derivation_ttain, # fi,
"Derivation_ttaa ": Derivation_ttaa, # fi,
"Echo_rdp ": Echo_rdp, # U,
"Echo_ech ": Echo_ech, # U,
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
"Foreign_tscript ": Foreign_tscript, # cz, U,
"Foreign_yes ": Foreign_yes, # sl,
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
"Gender_erg_masc ": Gender_erg_masc, # bq,
"Gender_erg_fem ": Gender_erg_fem, # bq,
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
"Gender_psor_neut ": Gender_psor_neut, # sl,
"Hyph_yes ": Hyph_yes, # cz, U,
"InfForm_one ": InfForm_one, # fi,
"InfForm_two ": InfForm_two, # fi,
"InfForm_three ": InfForm_three, # fi,
"NameType_geo ": NameType_geo, # U, cz,
"NameType_prs ": NameType_prs, # U, cz,
"NameType_giv ": NameType_giv, # U, cz,
"NameType_sur ": NameType_sur, # U, cz,
"NameType_nat ": NameType_nat, # U, cz,
"NameType_com ": NameType_com, # U, cz,
"NameType_pro ": NameType_pro, # U, cz,
"NameType_oth ": NameType_oth, # U, cz,
"NounType_com ": NounType_com, # U,
"NounType_prop ": NounType_prop, # U,
"NounType_class ": NounType_class, # U,
"Number_abs_sing ": Number_abs_sing, # bq, U,
"Number_abs_plur ": Number_abs_plur, # bq, U,
"Number_dat_sing ": Number_dat_sing, # bq, U,
"Number_dat_plur ": Number_dat_plur, # bq, U,
"Number_erg_sing ": Number_erg_sing, # bq, U,
"Number_erg_plur ": Number_erg_plur, # bq, U,
"Number_psee_sing ": Number_psee_sing, # U,
"Number_psee_plur ": Number_psee_plur, # U,
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
"NumForm_digit ": NumForm_digit, # cz, sl, U,
"NumForm_roman ": NumForm_roman, # cz, sl, U,
"NumForm_word ": NumForm_word, # cz, sl, U,
"NumValue_one ": NumValue_one, # cz, U,
"NumValue_two ": NumValue_two, # cz, U,
"NumValue_three ": NumValue_three, # cz, U,
"PartForm_pres ": PartForm_pres, # fi,
"PartForm_past ": PartForm_past, # fi,
"PartForm_agt ": PartForm_agt, # fi,
"PartForm_neg ": PartForm_neg, # fi,
"PartType_mod ": PartType_mod, # U,
"PartType_emp ": PartType_emp, # U,
"PartType_res ": PartType_res, # U,
"PartType_inf ": PartType_inf, # U,
"PartType_vbp ": PartType_vbp, # U,
"Person_abs_one ": Person_abs_one, # bq, U,
"Person_abs_two ": Person_abs_two, # bq, U,
"Person_abs_three ": Person_abs_three, # bq, U,
"Person_dat_one ": Person_dat_one, # bq, U,
"Person_dat_two ": Person_dat_two, # bq, U,
"Person_dat_three ": Person_dat_three, # bq, U,
"Person_erg_one ": Person_erg_one, # bq, U,
"Person_erg_two ": Person_erg_two, # bq, U,
"Person_erg_three ": Person_erg_three, # bq, U,
"Person_psor_one ": Person_psor_one, # fi, U,
"Person_psor_two ": Person_psor_two, # fi, U,
"Person_psor_three ": Person_psor_three, # fi, U,
"Polite_inf ": Polite_inf, # bq, U,
"Polite_pol ": Polite_pol, # bq, U,
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
"Prefix_yes ": Prefix_yes, # U,
"PrepCase_npr ": PrepCase_npr, # cz,
"PrepCase_pre ": PrepCase_pre, # U,
"PunctSide_ini ": PunctSide_ini, # U,
"PunctSide_fin ": PunctSide_fin, # U,
"PunctType_peri ": PunctType_peri, # U,
"PunctType_qest ": PunctType_qest, # U,
"PunctType_excl ": PunctType_excl, # U,
"PunctType_quot ": PunctType_quot, # U,
"PunctType_brck ": PunctType_brck, # U,
"PunctType_comm ": PunctType_comm, # U,
"PunctType_colo ": PunctType_colo, # U,
"PunctType_semi ": PunctType_semi, # U,
"PunctType_dash ": PunctType_dash, # U,
"Style_arch ": Style_arch, # cz, fi, U,
"Style_rare ": Style_rare, # cz, fi, U,
"Style_poet ": Style_poet, # cz, U,
"Style_norm ": Style_norm, # cz, U,
"Style_coll ": Style_coll, # cz, U,
"Style_vrnc ": Style_vrnc, # cz, U,
"Style_sing ": Style_sing, # cz, U,
"Style_expr ": Style_expr, # cz, U,
"Style_derg ": Style_derg, # cz, U,
"Style_vulg ": Style_vulg, # cz, U,
"Style_yes ": Style_yes, # fi, U,
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
"VerbType_aux ": VerbType_aux, # U,
"VerbType_cop ": VerbType_cop, # U,
"VerbType_mod ": VerbType_mod, # U,
"VerbType_light ": VerbType_light, # U,
"PERSON": PERSON,
"NORP": NORP,
"FACILITY": FACILITY,
"ORG": ORG,
"GPE": GPE,
"LOC": LOC,
"PRODUCT": PRODUCT,
"EVENT": EVENT,
"WORK_OF_ART": WORK_OF_ART,
"LANGUAGE": LANGUAGE,
"DATE": DATE,
"TIME": TIME,
"PERCENT": PERCENT,
"MONEY": MONEY,
"QUANTITY": QUANTITY,
"ORDINAL": ORDINAL,
"CARDINAL": CARDINAL,
"acomp": acomp,
"advcl": advcl,
"advmod": advmod,
"agent": agent,
"amod": amod,
"appos": appos,
"attr": attr,
"aux": aux,
"auxpass": auxpass,
"cc": cc,
"ccomp": ccomp,
"complm": complm,
"conj": conj,
"csubj": csubj,
"csubjpass": csubjpass,
"dep": dep,
"det": det,
"dobj": dobj,
"expl": expl,
"hmod": hmod,
"hyph": hyph,
"infmod": infmod,
"intj": intj,
"iobj": iobj,
"mark": mark,
"meta": meta,
"neg": neg,
"nmod": nmod,
"nn": nn,
"npadvmod": npadvmod,
"nsubj": nsubj,
"nsubjpass": nsubjpass,
"num": num,
"number": number,
"oprd": oprd,
"parataxis": parataxis,
"partmod": partmod,
"pcomp": pcomp,
"pobj": pobj,
"poss": poss,
"possessive": possessive,
"preconj": preconj,
"prep": prep,
"prt": prt,
"punct": punct,
"quantmod": quantmod,
"rcmod": rcmod,
"root": root,
"xcomp": xcomp
}
NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])]

View File

@@ -83,7 +83,6 @@ cdef class Parser:
model = Model(moves.n_moves, templates, model_dir)
return cls(strings, moves, model)
def __call__(self, Doc tokens):
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
self.moves.initialize_state(stcls)
@@ -93,6 +92,9 @@
self.parse(stcls, eg.c)
tokens.set_parse(stcls._sent)
def __reduce__(self):
return (Parser, (self.moves.strings, self.moves, self.model), None, None)
cdef void predict(self, StateClass stcls, ExampleC* eg) nogil:
memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
self.moves.set_valid(eg.is_valid, stcls)

View File

@@ -37,6 +37,8 @@ cdef class TransitionSystem:
cdef public int root_label
cdef public freqs
cdef object _labels_by_action
cdef int initialize_state(self, StateClass state) except -1
cdef int finalize_state(self, StateClass state) nogil

View File

@@ -15,7 +15,8 @@ class OracleError(Exception):
cdef class TransitionSystem:
def __init__(self, StringStore string_table, dict labels_by_action):
def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None):
self._labels_by_action = labels_by_action
self.mem = Pool()
self.n_moves = sum(len(labels) for labels in labels_by_action.values())
self._is_valid = <bint*>self.mem.alloc(self.n_moves, sizeof(bint))
@@ -30,7 +31,7 @@ cdef class TransitionSystem:
i += 1
self.c = moves
self.root_label = self.strings['ROOT']
self.freqs = {}
self.freqs = {} if _freqs is None else _freqs
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
self.freqs[attr] = defaultdict(int)
self.freqs[attr][0] = 1
@@ -39,6 +40,11 @@
self.freqs[HEAD][i] = 1
self.freqs[HEAD][-i] = 1
def __reduce__(self):
return (self.__class__,
(self.strings, self._labels_by_action, self.freqs),
None, None)
cdef int initialize_state(self, StateClass state) except -1:
pass

View File

@@ -148,6 +148,9 @@ cdef class Tagger:
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def __reduce__(self):
return (self.__class__, (self.vocab, self.model), None, None)
def tag_from_strings(self, Doc tokens, object tag_strs):
cdef int i
for i in range(tokens.length):

View File

@@ -14,7 +14,6 @@ from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme

View File

@@ -9,7 +9,7 @@ import numpy
from ..lexeme cimport Lexeme
from ..parts_of_speech import UNIV_POS_NAMES
from .. import parts_of_speech
from ..attrs cimport LEMMA
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
@@ -318,7 +318,7 @@ cdef class Token:
property pos_:
def __get__(self):
return _pos_id_to_string[self.c.pos]
return parts_of_speech.NAMES[self.c.pos]
property tag_:
def __get__(self):
@@ -363,6 +363,3 @@ cdef class Token:
property like_email:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}

View File

@@ -25,7 +25,6 @@ cdef struct _Cached:
cdef class Vocab:
cpdef public lexeme_props_getter
cdef Pool mem
cpdef readonly StringStore strings
cpdef readonly Morphology morphology
@@ -33,7 +32,6 @@ cdef class Vocab:
cdef public object _serializer
cdef public object data_dir
cdef public object get_lex_attr
cdef public object pos_tags
cdef public object serializer_freqs
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL

View File

@@ -10,6 +10,8 @@ from os import path
import io
import math
import json
import tempfile
import copy_reg
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
@@ -19,6 +21,9 @@ from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer
from . import attrs
from . import symbols
from cymem.cymem cimport Address
from . import util
from .serialize.packer cimport Packer
@@ -67,6 +72,14 @@ cdef class Vocab:
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
# Load strings in a special order, so that we have a fixed offset before
# the vocabulary proper. This way, when words are added in frequency order,
# a word's orth ID is its frequency rank plus that offset. The structural
# strings are loaded first because they are a closed class, while the
# vocabulary itself is open-class.
for name in symbols.NAMES + list(sorted(tag_map.keys())):
if name:
_ = self.strings[name]
self.get_lex_attr = get_lex_attr
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.serializer_freqs = serializer_freqs
@@ -85,6 +98,20 @@
"""The current number of lexemes stored."""
return self.length
def __reduce__(self):
# TODO: Dump vectors
tmp_dir = tempfile.mkdtemp()
lex_loc = path.join(tmp_dir, 'lexemes.bin')
str_loc = path.join(tmp_dir, 'strings.txt')
vec_loc = path.join(self.data_dir, 'vec.bin') if self.data_dir is not None else None
self.dump(lex_loc)
self.strings.dump(str_loc)
state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr,
self.serializer_freqs, self.data_dir)
return (unpickle_vocab, state, None, None)
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
@@ -260,17 +287,17 @@ cdef class Vocab:
i += 1
fp.close()
def load_vectors(self, loc_or_file):
def load_vectors(self, file_):
cdef LexemeC* lexeme
cdef attr_t orth
cdef int32_t vec_len = -1
for line_num, line in enumerate(loc_or_file):
for line_num, line in enumerate(file_):
pieces = line.split()
word_str = pieces.pop(0)
if vec_len == -1:
vec_len = len(pieces)
elif vec_len != len(pieces):
raise VectorReadError.mismatched_sizes(loc_or_file, line_num,
raise VectorReadError.mismatched_sizes(file_, line_num,
vec_len, len(pieces))
orth = self.strings[word_str]
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
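
The rewritten load_vectors takes an open file-like object rather than a path, and expects the common plain-text vector layout: one token per line followed by a fixed number of floats, with the dimensionality inferred from the first row. A toy illustration of the consistency check:

    rows = ['the 0.1 0.2 0.3', 'of 0.4 0.5 0.6']  # "word v1 v2 ... vd"
    vec_len = -1
    for line_num, line in enumerate(rows):
        pieces = line.split()
        word_str = pieces.pop(0)
        if vec_len == -1:
            vec_len = len(pieces)
        assert len(pieces) == vec_len  # every row must share one dimensionality
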
@@ -328,6 +355,25 @@
return vec_len
def unpickle_vocab(strings_loc, lex_loc, vec_loc, morphology, get_lex_attr,
serializer_freqs, data_dir):
cdef Vocab vocab = Vocab()
vocab.get_lex_attr = get_lex_attr
vocab.morphology = morphology
vocab.strings = morphology.strings
vocab.data_dir = data_dir
vocab.serializer_freqs = serializer_freqs
vocab.load_lexemes(strings_loc, lex_loc)
if vec_loc is not None:
vocab.load_vectors_from_bin_loc(vec_loc)
return vocab
copy_reg.constructor(unpickle_vocab)
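
Because Vocab owns C-level data, it pickles itself by dumping lexemes and strings to temporary files and handing the paths to the module-level rebuild function above; copy_reg.constructor then registers unpickle_vocab as a valid reconstructor (copy_reg is the Python 2 name; Python 3 spells it copyreg). The same pattern on a toy class:

    import copy_reg  # 'copyreg' on Python 3
    import pickle

    class Thing(object):
        def __init__(self, value):
            self.value = value

        def __reduce__(self):
            return (unpickle_thing, (self.value,))

    def unpickle_thing(value):
        return Thing(value)

    copy_reg.constructor(unpickle_thing)
    assert pickle.loads(pickle.dumps(Thing(3))).value == 3
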
def write_binary_vectors(in_loc, out_loc):
cdef CFile out_file = CFile(out_loc, 'wb')
cdef Address mem

View File

@@ -0,0 +1,17 @@
import pytest
import pickle
import StringIO
from spacy.morphology import Morphology
from spacy.lemmatizer import Lemmatizer
from spacy.strings import StringStore
def test_pickle():
morphology = Morphology(StringStore(), {}, Lemmatizer({}, {}, {}))
file_ = StringIO.StringIO()
pickle.dump(morphology, file_)

View File

@@ -0,0 +1,16 @@
import pytest
import pickle
import cloudpickle
import StringIO
@pytest.mark.models
def test_pickle(EN):
file_ = StringIO.StringIO()
cloudpickle.dump(EN.parser, file_)
file_.seek(0)
loaded = pickle.load(file_)

View File

@@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import StringIO
import pickle
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.en import LOCAL_DATA_DIR
@@ -41,3 +43,12 @@ def test_smart_quotes(lemmatizer):
do = lemmatizer.punct
assert do('“') == set(['"'])
assert do('”') == set(['"'])
def test_pickle_lemmatizer(lemmatizer):
file_ = StringIO.StringIO()
pickle.dump(lemmatizer, file_)
file_.seek(0)
loaded = pickle.load(file_)

tests/test_pickle.py (new file, 15 lines)
View File

@@ -0,0 +1,15 @@
import pytest
import StringIO
import cloudpickle
import pickle
@pytest.mark.models
def test_pickle_english(EN):
file_ = StringIO.StringIO()
cloudpickle.dump(EN, file_)
file_.seek(0)
loaded = pickle.load(file_)

View File

@@ -1,5 +1,7 @@
# -*- coding: utf8 -*-
from __future__ import unicode_literals
import pickle
import StringIO
from spacy.strings import StringStore
@@ -76,3 +78,18 @@ def test_massive_strings(sstore):
s513 = '1' * 513
orth = sstore[s513]
assert sstore[orth] == s513
def test_pickle_string_store(sstore):
hello_id = sstore[u'Hi']
string_file = StringIO.StringIO()
pickle.dump(sstore, string_file)
string_file.seek(0)
loaded = pickle.load(string_file)
assert loaded[hello_id] == u'Hi'

View File

@@ -1,5 +1,11 @@
from __future__ import unicode_literals
import pytest
import StringIO
import cloudpickle
import pickle
from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
from spacy.parts_of_speech import NOUN, VERB
def test_neq(en_vocab):
@@ -25,3 +31,21 @@ def test_punct_neq(en_vocab):
def test_shape_attr(en_vocab):
example = en_vocab['example']
assert example.orth != example.shape
def test_symbols(en_vocab):
assert en_vocab.strings['IS_ALPHA'] == IS_ALPHA
assert en_vocab.strings['NOUN'] == NOUN
assert en_vocab.strings['VERB'] == VERB
assert en_vocab.strings['LEMMA'] == LEMMA
assert en_vocab.strings['ORTH'] == ORTH
assert en_vocab.strings['PROB'] == PROB
def test_pickle_vocab(en_vocab):
file_ = StringIO.StringIO()
cloudpickle.dump(en_vocab, file_)
file_.seek(0)
loaded = pickle.load(file_)

View File

@@ -1,11 +1,13 @@
from __future__ import unicode_literals
import pytest
import os
@pytest.fixture(scope='session')
def nlp():
from spacy.en import English
return English()
from spacy.en import English, LOCAL_DATA_DIR
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
return English(data_dir=data_dir)
@pytest.fixture()

View File

@@ -1,6 +1,7 @@
from __future__ import unicode_literals
import pytest
import spacy
import os
@pytest.fixture()
@@ -9,8 +10,9 @@ def token(doc):
def test_load_resources_and_process_text():
from spacy.en import English
nlp = English()
from spacy.en import English, LOCAL_DATA_DIR
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
nlp = English(data_dir=data_dir)
doc = nlp('Hello, world. Here are two sentences.')