spaCy/spacy/morphology.pyx

1112 lines
30 KiB
Cython

# cython: infer_types
# coding: utf8
from __future__ import unicode_literals
from libc.string cimport memset
import srsly
from collections import Counter
from .compat import basestring_
from .strings import get_string_id
from . import symbols
from .attrs cimport POS, IS_SPACE
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .errors import Errors
# Identifiers for the morphological fields (feature categories). The members
# are used as the values of the FIELDS / LOWER_FIELDS tables and as the
# dispatch key in get_field() and set_feature() in this module.
# NOTE: members get consecutive integer values in declaration order, so the
# order below must not be changed casually.
cdef enum univ_field_t:
    Field_POS
    Field_Abbr
    Field_AdpType
    Field_AdvType
    Field_Animacy
    Field_Aspect
    Field_Case
    Field_ConjType
    Field_Connegative
    Field_Definite
    Field_Degree
    Field_Derivation
    Field_Echo
    Field_Foreign
    Field_Gender
    Field_Hyph
    Field_InfForm
    Field_Mood
    Field_NameType
    Field_Negative
    Field_NounType
    Field_Number
    Field_NumForm
    Field_NumType
    Field_NumValue
    Field_PartForm
    Field_PartType
    Field_Person
    Field_Polite
    Field_Polarity
    Field_Poss
    Field_Prefix
    Field_PrepCase
    Field_PronType
    Field_PunctSide
    Field_PunctType
    Field_Reflex
    Field_Style
    Field_StyleVariant
    Field_Tense
    Field_Typo
    Field_VerbForm
    Field_Voice
    Field_VerbType
def _normalize_props(props):
    """Transform deprecated string keys to correct names.

    Works on a copy of `props`. Entries of the form {field: value} (for
    example {'Case': 'nom'}) are rewritten to {'Case_nom': True} whenever the
    combined, value-lowercased name is listed in FEATURES. A POS entry (keyed
    by the POS attribute ID or by a string spelling 'pos' in any case) is
    mapped through POS_IDS, and a 'morph' string key is dropped.

    props (dict): The attributes to normalize.
    RETURNS (dict): A new dict holding the normalized attributes.
    """
    remaining = dict(props)
    # Pass 1: fold {field: value} pairs into single boolean feature flags.
    for field_name in FIELDS:
        if field_name not in remaining:
            continue
        flag = '%s_%s' % (field_name, str(remaining[field_name]).lower())
        if flag in FEATURES:
            remaining.pop(field_name)
            remaining[flag] = True
    # Pass 2: resolve part-of-speech values and copy everything else over.
    normalized = {}
    for key, value in remaining.items():
        if key == POS:
            if hasattr(value, 'upper'):
                value = value.upper()
            if value in POS_IDS:
                value = POS_IDS[value]
            normalized[key] = value
            continue
        if isinstance(key, int) or value is True:
            # Integer attribute IDs and boolean feature flags pass through.
            normalized[key] = value
            continue
        lowered = key.lower()
        if lowered == 'pos':
            normalized[POS] = POS_IDS[value.upper()]
        elif lowered != 'morph':
            normalized[key] = value
    return normalized
def parse_feature(feature):
    """Look up the field and the within-field offset of a feature.

    feature: Key into FEATURE_FIELDS and FEATURE_OFFSETS. FEATURE_OFFSETS is
        filled from the names in FEATURES, so this is presumably a feature
        name such as 'Case_nom'.
    RETURNS (tuple): A (field, offset) pair.
    """
    return FEATURE_FIELDS[feature], FEATURE_OFFSETS[feature]
cdef int attribute_to_field(unicode attribute_name):
    """Return the field id registered in LOWER_FIELDS for a lower-case,
    underscore-separated attribute name (e.g. 'verb_form').
    """
    field_id = LOWER_FIELDS[attribute_name]
    return field_id
def get_field_id(feature):
    """Return the field that a feature belongs to.

    feature: A key of FEATURE_FIELDS, which is populated at module level with
        both feature names and their integer string IDs.
    RETURNS: The field id stored for the feature.
    """
    field_id = FEATURE_FIELDS[feature]
    return field_id
def get_field_size(field):
    """Return the size recorded in FIELD_SIZES for a field.

    field: A key of FIELDS (a field name such as 'Case').
    RETURNS: The FIELD_SIZES entry for the field's id.
    """
    field_id = FIELDS[field]
    return FIELD_SIZES[field_id]
def get_field_offset(field):
    """Return the offset recorded in FIELD_OFFSETS for a field.

    field: A key of FIELDS (a field name such as 'Case').
    RETURNS: The FIELD_OFFSETS entry for the field's id.
    """
    field_id = FIELDS[field]
    return FIELD_OFFSETS[field_id]
cdef class Morphology:
    '''Store the possible morphological analyses for a language, and index them
    by hash.

    To save space on each token, tokens only know the hash of their morphological
    analysis, so queries of morphological attributes are delegated
    to this class.
    '''
    def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
        """Create the morphology table.

        string_store (StringStore): The string store used to intern tags,
            word forms, lemmas and feature names.
        tag_map (dict): Maps tag strings to attribute dicts. A '_SP' entry is
            added (on a copy) if it is missing.
        lemmatizer: Called as `lemmatizer(string, univ_pos, features)`, and
            its `lookup()` method is used for untagged tokens. `lemmatize()`
            also accepts None here.
        exc (dict): Optional special cases, keyed by (tag, orth) and mapping
            to attribute dicts.
        """
        self.mem = Pool()
        self.strings = string_store
        self.tags = PreshMap()
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        space_attrs = tag_map.get('SP', {POS: SPACE})
        if '_SP' not in tag_map:
            self.strings.add('_SP')
            # Copy before modifying, so the caller's dict is left untouched.
            tag_map = dict(tag_map)
            tag_map['_SP'] = space_attrs
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.tag_map = {}
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map)
        self.reverse_index = {}
        # Tag ids are positions in the sorted tag list, which is the same
        # order as self.tag_names.
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            attrs = _normalize_props(attrs)
            self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i

        # Lemma cache, addressed by (tag id, orth).
        self._cache = PreshMapArray(self.n_tags)
        self.exc = {}
        if exc is not None:
            for (tag, orth), attrs in exc.items():
                attrs = _normalize_props(attrs)
                self.add_special_case(
                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)

    def __reduce__(self):
        """Support pickling by re-running the constructor with the current
        string store, tag map, lemmatizer and exceptions.
        """
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
                self.exc), None, None)

    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not already
        present. Returns the hash of the new analysis.

        features (iterable): Feature names or integer feature IDs.
        RETURNS: The hash key of the analysis.
        RAISES (KeyError): If a non-zero feature is not in FEATURE_NAMES.
        """
        cdef attr_t feature
        cdef MorphAnalysisC tag
        # Intern the names first, so the error message below can resolve the ID.
        for f in features:
            if isinstance(f, basestring_):
                self.strings.add(f)
        features = intify_features(features)
        for feature in features:
            if feature != 0 and feature not in FEATURE_NAMES:
                raise KeyError("Unknown feature: %s" % self.strings[feature])
        tag = create_rich_tag(features)
        cdef hash_t key = self.insert(tag)
        return key

    def get(self, hash_t morph):
        """Return the feature names of the analysis stored under a hash.

        morph (hash_t): Hash key of the analysis.
        RETURNS (list): The feature names, or an empty list if the hash is
            not in the table.
        """
        tag = <MorphAnalysisC*>self.tags.get(morph)
        if tag == NULL:
            return []
        else:
            return tag_to_json(tag)

    cpdef update(self, hash_t morph, features):
        """Update a morphological analysis with new feature values.

        The stored analysis is copied, the features are set on the copy, and
        the copy is inserted under its own hash. If `morph` is not in the
        table, the update starts from an empty analysis instead of
        dereferencing a NULL pointer.

        morph (hash_t): Hash key of the analysis to start from.
        features (iterable): Feature names or integer feature IDs to set.
        RETURNS: The hash key of the updated analysis.
        """
        cdef MorphAnalysisC tag
        cdef MorphAnalysisC* tag_ptr
        cdef attr_t feature
        tag_ptr = <MorphAnalysisC*>self.tags.get(morph)
        if tag_ptr == NULL:
            memset(&tag, 0, sizeof(tag))
        else:
            tag = tag_ptr[0]
        features = intify_features(features)
        for feature in features:
            field = get_field_id(feature)
            set_feature(&tag, field, feature, 1)
        morph = self.insert(tag)
        return morph

    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
        """Compute the lemma ID for a word form.

        univ_pos (univ_pos_t): The part-of-speech passed to the lemmatizer.
        orth (attr_t): String ID of the word form.
        morphology (dict): Morphological attributes of the token. Keys mapped
            to True are treated as 'Field_value' feature names.
        RETURNS: The string ID of the lemma. `orth` is returned unchanged if
            it is not in the string store, and the lower-cased form is used
            if no lemmatizer is set.
        """
        if orth not in self.strings:
            return orth
        cdef unicode py_string = self.strings[orth]
        if self.lemmatizer is None:
            return self.strings.add(py_string.lower())
        cdef list lemma_strings
        cdef unicode lemma_string
        # Normalize features into a dict keyed by the field, to make life easier
        # for the lemmatizer. Handles string-to-int conversion too.
        string_feats = {}
        for key, value in morphology.items():
            if value is True:
                name, value = self.strings.as_string(key).split('_', 1)
                string_feats[name] = value
            else:
                string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
        lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
        # Only the first candidate returned by the lemmatizer is used.
        lemma_string = lemma_strings[0]
        lemma = self.strings.add(lemma_string)
        return lemma

    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                         force=False):
        """Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

        tag_str (unicode): The part-of-speech tag to key the exception.
        orth_str (unicode): The word-form to key the exception.
        attrs (dict): The attributes stored for the matching tokens.
        force (bool): Not used by this method.
        """
        attrs = dict(attrs)
        attrs = _normalize_props(attrs)
        self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
        self.exc[(tag_str, self.strings.add(orth_str))] = attrs

    cdef hash_t insert(self, MorphAnalysisC tag) except 0:
        """Store a copy of the analysis under its hash, unless that hash is
        already present, and return the hash.
        """
        cdef hash_t key = hash_tag(tag)
        if self.tags.get(key) == NULL:
            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            tag_ptr[0] = tag
            self.tags.set(key, <void*>tag_ptr)
        return key

    cdef int assign_untagged(self, TokenC* token) except -1:
        """Set morphological attributes on a token without a POS tag. Uses
        the lemmatizer's lookup() method, which looks up the string in the
        table provided by the language data as lemma_lookup (if available).
        """
        # A lemma that is already set is left alone.
        if token.lemma == 0:
            orth_str = self.strings[token.lex.orth]
            lemma = self.lemmatizer.lookup(orth_str)
            token.lemma = self.strings.add(lemma)

    cdef int assign_tag(self, TokenC* token, tag_str) except -1:
        """Assign a tag to a token. Tags found in the tag map go through
        assign_tag_id(); any other tag is only written to `token.tag`.
        """
        cdef attr_t tag = self.strings.as_int(tag_str)
        if tag in self.reverse_index:
            tag_id = self.reverse_index[tag]
            self.assign_tag_id(token, tag_id)
        else:
            token.tag = tag

    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
        """Set tag, POS, lemma and morphology on a token from a tag id.

        token (TokenC*): The token to modify.
        tag_id (int): Index into `self.tag_names`.
        RAISES (ValueError): If `tag_id` is `self.n_tags` or larger.
        """
        # Valid ids are 0..n_tags-1: tag_names has n_tags entries and the
        # lemma cache is created with n_tags slots.
        if tag_id >= self.n_tags:
            raise ValueError(Errors.E014.format(tag=tag_id))
        # Ensure spaces get tagged as space.
        # It seems pretty arbitrary to put this logic here, but there's really
        # nowhere better. I guess the justification is that this is where the
        # specific word and the tag interact. Still, we should have a better
        # way to enforce this rule, or figure out why the statistical model fails.
        # Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
            tag_id = self.reverse_index[self.strings.add('_SP')]
        tag_str = self.tag_names[tag_id]
        features = dict(self.tag_map.get(tag_str, {}))
        if features:
            pos = self.strings.as_int(features.pop(POS))
        else:
            pos = 0
        cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
        if lemma == 0:
            # Ugh, self.lemmatize has opposite arg order from self.lemmatizer :(
            lemma = self.lemmatize(pos, token.lex.orth, features)
            self._cache.set(tag_id, token.lex.orth, <void*>lemma)
        token.lemma = lemma
        token.pos = <univ_pos_t>pos
        token.tag = self.strings[tag_str]
        token.morph = self.add(features)
        # Special cases are applied last, so they override the values above.
        if (self.tag_names[tag_id], token.lex.orth) in self.exc:
            self._assign_tag_from_exceptions(token, tag_id)

    cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
        """Overwrite the token's POS and lemma with the values of the special
        case registered for its (tag, orth) pair, where the rule provides them.
        """
        key = (self.tag_names[tag_id], token.lex.orth)
        cdef dict attrs
        attrs = self.exc[key]
        token.pos = attrs.get(POS, token.pos)
        token.lemma = attrs.get(LEMMA, token.lemma)

    def load_morph_exceptions(self, dict exc):
        """Register special cases from a nested dict.

        exc (dict): Maps tag strings to dicts of {word form: attributes}.
        """
        # Map (form, pos) to attributes
        for tag_str, entries in exc.items():
            for form_str, attrs in entries.items():
                self.add_special_case(tag_str, form_str, attrs)

    def to_bytes(self):
        """Serialize the stored analyses as JSON: one list of feature names
        per analysis.
        """
        json_tags = []
        for key in self.tags:
            tag_ptr = <MorphAnalysisC*>self.tags.get(key)
            if tag_ptr != NULL:
                json_tags.append(tag_to_json(tag_ptr))
        return srsly.json_dumps(json_tags)

    def from_bytes(self, byte_string):
        """Not implemented."""
        raise NotImplementedError

    def to_disk(self, path):
        """Not implemented."""
        raise NotImplementedError

    def from_disk(self, path):
        """Not implemented."""
        raise NotImplementedError
cpdef univ_pos_t get_int_tag(pos_):
    """Placeholder: ignores `pos_` and always returns the zero POS value."""
    return <univ_pos_t>0
cpdef intify_features(features):
    """Convert an iterable of features into a set of integer string IDs.

    features (iterable): Values accepted by get_string_id().
    RETURNS (set): The ID of each feature, with duplicates collapsed.
    """
    feature_ids = set()
    for item in features:
        feature_ids.add(get_string_id(item))
    return feature_ids
cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
    # Hash the in-memory bytes of the analysis struct. The final argument is
    # presumably the hash seed (see the murmurhash `mrmr` API).
    cdef hash_t digest = mrmr.hash64(&tag, sizeof(tag), 0)
    return digest
def get_feature_field(feature):
    """Return the field that a feature belongs to.

    The feature is first normalized to its integer string ID, and the lookup
    goes through that ID. FEATURE_FIELDS holds an entry for the ID of every
    name in FEATURES, so both names and IDs resolve the same way.

    feature: A feature name or integer string ID.
    RETURNS: The field id stored for the feature.
    RAISES (KeyError): If the feature is unknown.
    """
    cdef attr_t key = get_string_id(feature)
    return FEATURE_FIELDS[key]
cdef MorphAnalysisC create_rich_tag(features) except *:
    """Build an analysis struct from an iterable of integer feature IDs.

    The struct is zeroed first, then each feature is switched on in the field
    that get_field_id() reports for it.
    """
    cdef MorphAnalysisC analysis
    cdef attr_t feat_id
    memset(&analysis, 0, sizeof(analysis))
    for feat_id in features:
        set_feature(&analysis, get_field_id(feat_id), feat_id, 1)
    return analysis
cdef tag_to_json(const MorphAnalysisC* tag):
    """Return the names of the features reported by list_features() for the
    analysis, as a list of strings.
    """
    names = []
    for feature_id in list_features(tag):
        names.append(FEATURE_NAMES[feature_id])
    return names
cdef MorphAnalysisC tag_from_json(json_tag):
    """Placeholder for the inverse of tag_to_json(). Not implemented."""
    raise NotImplementedError()
cdef list list_features(const MorphAnalysisC* tag):
    """Collect the feature IDs that are set on an analysis.

    Every field handled by check_feature() is included. The num_form,
    num_type, num_value, punct_side and tense fields used to be skipped here,
    so those features were silently dropped. The POS field is not listed,
    matching check_feature().

    tag (const MorphAnalysisC*): The analysis to read.
    RETURNS (list): The non-zero feature IDs, in field order.
    """
    slots = [
        tag.abbr,
        tag.adp_type,
        tag.adv_type,
        tag.animacy,
        tag.aspect,
        tag.case,
        tag.conj_type,
        tag.connegative,
        tag.definite,
        tag.degree,
        tag.derivation,
        tag.echo,
        tag.foreign,
        tag.gender,
        tag.hyph,
        tag.inf_form,
        tag.mood,
        tag.negative,
        tag.number,
        tag.name_type,
        tag.noun_type,
        tag.num_form,
        tag.num_type,
        tag.num_value,
        tag.part_form,
        tag.part_type,
        tag.person,
        tag.polite,
        tag.polarity,
        tag.poss,
        tag.prefix,
        tag.prep_case,
        tag.pron_type,
        tag.punct_side,
        tag.punct_type,
        tag.reflex,
        tag.style,
        tag.style_variant,
        tag.tense,
        tag.typo,
        tag.verb_form,
        tag.voice,
        tag.verb_type,
    ]
    # A zero slot means that no feature is set for that field.
    return [feature_id for feature_id in slots if feature_id != 0]
cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil:
    # Return the value stored on the analysis for the given field, where
    # `field_id` is a univ_field_t value. An unknown field id raises
    # ValueError.
    #
    # NOTE: the error path used to format an undefined local `feature`; it
    # now reports the offending `field_id`. Building the exception needs
    # Python objects, so the GIL is re-acquired for it. The function has no
    # exception return value, so callers cannot catch the error -- TODO
    # confirm how Cython reports it.
    field = <univ_field_t>field_id
    if field == Field_POS:
        return tag.pos
    elif field == Field_Abbr:
        return tag.abbr
    elif field == Field_AdpType:
        return tag.adp_type
    elif field == Field_AdvType:
        return tag.adv_type
    elif field == Field_Animacy:
        return tag.animacy
    elif field == Field_Aspect:
        return tag.aspect
    elif field == Field_Case:
        return tag.case
    elif field == Field_ConjType:
        return tag.conj_type
    elif field == Field_Connegative:
        return tag.connegative
    elif field == Field_Definite:
        return tag.definite
    elif field == Field_Degree:
        return tag.degree
    elif field == Field_Derivation:
        return tag.derivation
    elif field == Field_Echo:
        return tag.echo
    elif field == Field_Foreign:
        return tag.foreign
    elif field == Field_Gender:
        return tag.gender
    elif field == Field_Hyph:
        return tag.hyph
    elif field == Field_InfForm:
        return tag.inf_form
    elif field == Field_Mood:
        return tag.mood
    elif field == Field_Negative:
        return tag.negative
    elif field == Field_Number:
        return tag.number
    elif field == Field_NameType:
        return tag.name_type
    elif field == Field_NounType:
        return tag.noun_type
    elif field == Field_NumForm:
        return tag.num_form
    elif field == Field_NumType:
        return tag.num_type
    elif field == Field_NumValue:
        return tag.num_value
    elif field == Field_PartForm:
        return tag.part_form
    elif field == Field_PartType:
        return tag.part_type
    elif field == Field_Person:
        return tag.person
    elif field == Field_Polite:
        return tag.polite
    elif field == Field_Polarity:
        return tag.polarity
    elif field == Field_Poss:
        return tag.poss
    elif field == Field_Prefix:
        return tag.prefix
    elif field == Field_PrepCase:
        return tag.prep_case
    elif field == Field_PronType:
        return tag.pron_type
    elif field == Field_PunctSide:
        return tag.punct_side
    elif field == Field_PunctType:
        return tag.punct_type
    elif field == Field_Reflex:
        return tag.reflex
    elif field == Field_Style:
        return tag.style
    elif field == Field_StyleVariant:
        return tag.style_variant
    elif field == Field_Tense:
        return tag.tense
    elif field == Field_Typo:
        return tag.typo
    elif field == Field_VerbForm:
        return tag.verb_form
    elif field == Field_Voice:
        return tag.voice
    elif field == Field_VerbType:
        return tag.verb_type
    else:
        with gil:
            raise ValueError("Unknown field: (%d)" % field_id)
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
    # Return 1 if any field of the analysis other than POS currently holds
    # `feature`, and 0 otherwise.
    if (
        tag.abbr == feature
        or tag.adp_type == feature
        or tag.adv_type == feature
        or tag.animacy == feature
        or tag.aspect == feature
        or tag.case == feature
        or tag.conj_type == feature
        or tag.connegative == feature
        or tag.definite == feature
        or tag.degree == feature
        or tag.derivation == feature
        or tag.echo == feature
        or tag.foreign == feature
        or tag.gender == feature
        or tag.hyph == feature
        or tag.inf_form == feature
        or tag.mood == feature
        or tag.negative == feature
        or tag.number == feature
        or tag.name_type == feature
        or tag.noun_type == feature
        or tag.num_form == feature
        or tag.num_type == feature
        or tag.num_value == feature
        or tag.part_form == feature
        or tag.part_type == feature
        or tag.person == feature
        or tag.polite == feature
        or tag.polarity == feature
        or tag.poss == feature
        or tag.prefix == feature
        or tag.prep_case == feature
        or tag.pron_type == feature
        or tag.punct_side == feature
        or tag.punct_type == feature
        or tag.reflex == feature
        or tag.style == feature
        or tag.style_variant == feature
        or tag.tense == feature
        or tag.typo == feature
        or tag.verb_form == feature
        or tag.voice == feature
        or tag.verb_type == feature
    ):
        return 1
    return 0
cdef int set_feature(MorphAnalysisC* tag,
        univ_field_t field, attr_t feature, int value) except -1:
    """Set or clear one field of a morphological analysis.

    tag (MorphAnalysisC*): The analysis to modify in place.
    field (univ_field_t): The field to write.
    feature (attr_t): The feature ID to store. A zero feature leaves the
        fields untouched.
    value (int): A value equal to True stores `feature`; anything else
        stores 0 in the field.
    RAISES (ValueError): If `field` is not a known field.
    """
    new_value = feature if value == True else 0
    old_value = get_field(tag, field)
    # `length` counts the populated fields; the POS field is not counted.
    if field != Field_POS:
        if old_value != 0 and new_value == 0:
            tag.length -= 1
        elif old_value == 0 and new_value != 0:
            tag.length += 1
    if feature == 0:
        return 0
    if field == Field_POS:
        # POS stores the ID of the bare tag name, e.g. 'NOUN' for 'POS_NOUN'.
        tag.pos = get_string_id(FEATURE_NAMES[new_value].split('_')[1])
    elif field == Field_Abbr:
        tag.abbr = new_value
    elif field == Field_AdpType:
        tag.adp_type = new_value
    elif field == Field_AdvType:
        tag.adv_type = new_value
    elif field == Field_Animacy:
        tag.animacy = new_value
    elif field == Field_Aspect:
        tag.aspect = new_value
    elif field == Field_Case:
        tag.case = new_value
    elif field == Field_ConjType:
        tag.conj_type = new_value
    elif field == Field_Connegative:
        tag.connegative = new_value
    elif field == Field_Definite:
        tag.definite = new_value
    elif field == Field_Degree:
        tag.degree = new_value
    elif field == Field_Derivation:
        tag.derivation = new_value
    elif field == Field_Echo:
        tag.echo = new_value
    elif field == Field_Foreign:
        tag.foreign = new_value
    elif field == Field_Gender:
        tag.gender = new_value
    elif field == Field_Hyph:
        tag.hyph = new_value
    elif field == Field_InfForm:
        tag.inf_form = new_value
    elif field == Field_Mood:
        tag.mood = new_value
    elif field == Field_Negative:
        tag.negative = new_value
    elif field == Field_Number:
        tag.number = new_value
    elif field == Field_NameType:
        tag.name_type = new_value
    elif field == Field_NounType:
        tag.noun_type = new_value
    elif field == Field_NumForm:
        tag.num_form = new_value
    elif field == Field_NumType:
        tag.num_type = new_value
    elif field == Field_NumValue:
        tag.num_value = new_value
    elif field == Field_PartForm:
        tag.part_form = new_value
    elif field == Field_PartType:
        tag.part_type = new_value
    elif field == Field_Person:
        tag.person = new_value
    elif field == Field_Polite:
        tag.polite = new_value
    elif field == Field_Polarity:
        tag.polarity = new_value
    elif field == Field_Poss:
        tag.poss = new_value
    elif field == Field_Prefix:
        tag.prefix = new_value
    elif field == Field_PrepCase:
        tag.prep_case = new_value
    elif field == Field_PronType:
        tag.pron_type = new_value
    elif field == Field_PunctSide:
        tag.punct_side = new_value
    elif field == Field_PunctType:
        tag.punct_type = new_value
    elif field == Field_Reflex:
        tag.reflex = new_value
    elif field == Field_Style:
        tag.style = new_value
    elif field == Field_StyleVariant:
        tag.style_variant = new_value
    elif field == Field_Tense:
        tag.tense = new_value
    elif field == Field_Typo:
        tag.typo = new_value
    elif field == Field_VerbForm:
        tag.verb_form = new_value
    elif field == Field_Voice:
        tag.voice = new_value
    elif field == Field_VerbType:
        tag.verb_type = new_value
    else:
        raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature))
    return 0
# Maps each field name, spelled as it appears before the underscore in the
# FEATURES entries (e.g. 'Case' in 'Case_nom'), to its univ_field_t id.
# _normalize_props() iterates over this dict, so the insertion order is
# observable.
FIELDS = {
    'POS': Field_POS,
    'Abbr': Field_Abbr,
    'AdpType': Field_AdpType,
    'AdvType': Field_AdvType,
    'Animacy': Field_Animacy,
    'Aspect': Field_Aspect,
    'Case': Field_Case,
    'ConjType': Field_ConjType,
    'Connegative': Field_Connegative,
    'Definite': Field_Definite,
    'Degree': Field_Degree,
    'Derivation': Field_Derivation,
    'Echo': Field_Echo,
    'Foreign': Field_Foreign,
    'Gender': Field_Gender,
    'Hyph': Field_Hyph,
    'InfForm': Field_InfForm,
    'Mood': Field_Mood,
    'NameType': Field_NameType,
    'Negative': Field_Negative,
    'NounType': Field_NounType,
    'Number': Field_Number,
    'NumForm': Field_NumForm,
    'NumType': Field_NumType,
    'NumValue': Field_NumValue,
    'PartForm': Field_PartForm,
    'PartType': Field_PartType,
    'Person': Field_Person,
    'Polite': Field_Polite,
    'Polarity': Field_Polarity,
    'Poss': Field_Poss,
    'Prefix': Field_Prefix,
    'PrepCase': Field_PrepCase,
    'PronType': Field_PronType,
    'PunctSide': Field_PunctSide,
    'PunctType': Field_PunctType,
    'Reflex': Field_Reflex,
    'Style': Field_Style,
    'StyleVariant': Field_StyleVariant,
    'Tense': Field_Tense,
    'Typo': Field_Typo,
    'VerbForm': Field_VerbForm,
    'Voice': Field_Voice,
    'VerbType': Field_VerbType
}
# Maps lower-case, underscore-separated attribute names (the spelling used
# for the MorphAnalysisC struct members read in get_field()) to their
# univ_field_t id. Used by attribute_to_field().
LOWER_FIELDS = {
    'pos': Field_POS,
    'abbr': Field_Abbr,
    'adp_type': Field_AdpType,
    'adv_type': Field_AdvType,
    'animacy': Field_Animacy,
    'aspect': Field_Aspect,
    'case': Field_Case,
    'conj_type': Field_ConjType,
    'connegative': Field_Connegative,
    'definite': Field_Definite,
    'degree': Field_Degree,
    'derivation': Field_Derivation,
    'echo': Field_Echo,
    'foreign': Field_Foreign,
    'gender': Field_Gender,
    'hyph': Field_Hyph,
    'inf_form': Field_InfForm,
    'mood': Field_Mood,
    'name_type': Field_NameType,
    'negative': Field_Negative,
    'noun_type': Field_NounType,
    'number': Field_Number,
    'num_form': Field_NumForm,
    'num_type': Field_NumType,
    'num_value': Field_NumValue,
    'part_form': Field_PartForm,
    'part_type': Field_PartType,
    'person': Field_Person,
    'polite': Field_Polite,
    'polarity': Field_Polarity,
    'poss': Field_Poss,
    'prefix': Field_Prefix,
    'prep_case': Field_PrepCase,
    'pron_type': Field_PronType,
    'punct_side': Field_PunctSide,
    'punct_type': Field_PunctType,
    'reflex': Field_Reflex,
    'style': Field_Style,
    'style_variant': Field_StyleVariant,
    'tense': Field_Tense,
    'typo': Field_Typo,
    'verb_form': Field_VerbForm,
    'voice': Field_Voice,
    'verb_type': Field_VerbType
}
# All known feature names, written as '<Field>_<value>', where <Field> is a
# key of FIELDS. The lookup tables that follow are derived from this list and
# use each entry's position, so entries of the same field are kept together.
#
# NOTE: every entry must be its own string literal followed by a comma. A
# comma placed inside the quotes makes Python concatenate the literal with
# the next one, silently merging two features into one bogus name.
FEATURES = [
    "POS_ADJ",
    "POS_ADP",
    "POS_ADV",
    "POS_AUX",
    "POS_CONJ",
    "POS_CCONJ",
    "POS_DET",
    "POS_INTJ",
    "POS_NOUN",
    "POS_NUM",
    "POS_PART",
    "POS_PRON",
    "POS_PROPN",
    "POS_PUNCT",
    "POS_SCONJ",
    "POS_SYM",
    "POS_VERB",
    "POS_X",
    "POS_EOL",
    "POS_SPACE",
    "Abbr_yes",
    "AdpType_circ",
    "AdpType_comprep",
    "AdpType_prep",
    "AdpType_post",
    "AdpType_voc",
    "AdvType_adadj",
    "AdvType_cau",
    "AdvType_deg",
    "AdvType_ex",
    "AdvType_loc",
    "AdvType_man",
    "AdvType_mod",
    "AdvType_sta",
    "AdvType_tim",
    "Animacy_anim",
    "Animacy_hum",
    "Animacy_inan",
    "Animacy_nhum",
    "Aspect_freq",
    "Aspect_imp",
    "Aspect_mod",
    "Aspect_none",
    "Aspect_perf",
    "Aspect_prof",
    "Aspect_prosp",
    "Case_abe",
    "Case_abl",
    "Case_abs",
    "Case_acc",
    "Case_ade",
    "Case_all",
    "Case_cau",
    "Case_com",
    "Case_dat",
    "Case_del",
    "Case_dis",
    "Case_ela",
    "Case_ess",
    "Case_gen",
    "Case_ill",
    "Case_ine",
    "Case_ins",
    "Case_loc",
    "Case_lat",
    "Case_nom",
    "Case_par",
    "Case_sub",
    "Case_sup",
    "Case_tem",
    "Case_ter",
    "Case_tra",
    "Case_voc",
    "ConjType_comp",
    "ConjType_oper",
    "Connegative_yes",
    "Definite_cons",
    "Definite_def",
    "Definite_ind",
    "Definite_red",
    "Definite_two",
    "Degree_abs",
    "Degree_cmp",
    "Degree_comp",
    "Degree_none",
    "Degree_pos",
    "Degree_sup",
    "Degree_com",
    "Degree_dim",
    "Derivation_minen",
    "Derivation_sti",
    "Derivation_inen",
    "Derivation_lainen",
    "Derivation_ja",
    "Derivation_ton",
    "Derivation_vs",
    "Derivation_ttain",
    "Derivation_ttaa",
    "Echo_rdp",
    "Echo_ech",
    "Foreign_foreign",
    "Foreign_fscript",
    "Foreign_tscript",
    "Foreign_yes",
    "Gender_com",
    "Gender_fem",
    "Gender_masc",
    "Gender_neut",
    "Gender_dat_masc",
    "Gender_dat_fem",
    "Gender_erg_masc",
    "Gender_erg_fem",
    "Gender_psor_masc",
    "Gender_psor_fem",
    "Gender_psor_neut",
    "Hyph_yes",
    "InfForm_one",
    "InfForm_two",
    "InfForm_three",
    "Mood_cnd",
    "Mood_imp",
    "Mood_ind",
    "Mood_n",
    "Mood_pot",
    "Mood_sub",
    "Mood_opt",
    "NameType_geo",
    "NameType_prs",
    "NameType_giv",
    "NameType_sur",
    "NameType_nat",
    "NameType_com",
    "NameType_pro",
    "NameType_oth",
    "Negative_neg",
    "Negative_pos",
    "Negative_yes",
    "NounType_com",
    "NounType_prop",
    "NounType_class",
    "Number_com",
    "Number_dual",
    "Number_none",
    "Number_plur",
    "Number_sing",
    "Number_ptan",
    "Number_count",
    "Number_abs_sing",
    "Number_abs_plur",
    "Number_dat_sing",
    "Number_dat_plur",
    "Number_erg_sing",
    "Number_erg_plur",
    "Number_psee_sing",
    "Number_psee_plur",
    "Number_psor_sing",
    "Number_psor_plur",
    "NumForm_digit",
    "NumForm_roman",
    "NumForm_word",
    "NumType_card",
    "NumType_dist",
    "NumType_frac",
    "NumType_gen",
    "NumType_mult",
    "NumType_none",
    "NumType_ord",
    "NumType_sets",
    "NumValue_one",
    "NumValue_two",
    "NumValue_three",
    "PartForm_pres",
    "PartForm_past",
    "PartForm_agt",
    "PartForm_neg",
    "PartType_mod",
    "PartType_emp",
    "PartType_res",
    "PartType_inf",
    "PartType_vbp",
    "Person_one",
    "Person_two",
    "Person_three",
    "Person_none",
    "Person_abs_one",
    "Person_abs_two",
    "Person_abs_three",
    "Person_dat_one",
    "Person_dat_two",
    "Person_dat_three",
    "Person_erg_one",
    "Person_erg_two",
    "Person_erg_three",
    "Person_psor_one",
    "Person_psor_two",
    "Person_psor_three",
    "Polarity_neg",
    "Polarity_pos",
    "Polite_inf",
    "Polite_pol",
    "Polite_abs_inf",
    "Polite_abs_pol",
    "Polite_erg_inf",
    "Polite_erg_pol",
    "Polite_dat_inf",
    "Polite_dat_pol",
    "Poss_yes",
    "Prefix_yes",
    "PrepCase_npr",
    "PrepCase_pre",
    "PronType_advPart",
    "PronType_art",
    "PronType_default",
    "PronType_dem",
    "PronType_ind",
    "PronType_int",
    "PronType_neg",
    "PronType_prs",
    "PronType_rcp",
    "PronType_rel",
    "PronType_tot",
    "PronType_clit",
    "PronType_exc",
    "PunctSide_ini",
    "PunctSide_fin",
    "PunctType_peri",
    "PunctType_qest",
    "PunctType_excl",
    "PunctType_quot",
    "PunctType_brck",
    "PunctType_comm",
    "PunctType_colo",
    "PunctType_semi",
    "PunctType_dash",
    "Reflex_yes",
    "Style_arch",
    "Style_rare",
    "Style_poet",
    "Style_norm",
    "Style_coll",
    "Style_vrnc",
    "Style_sing",
    "Style_expr",
    "Style_derg",
    "Style_vulg",
    "Style_yes",
    "StyleVariant_styleShort",
    "StyleVariant_styleBound",
    "Tense_fut",
    "Tense_imp",
    "Tense_past",
    "Tense_pres",
    "Typo_yes",
    "VerbForm_fin",
    "VerbForm_ger",
    "VerbForm_inf",
    "VerbForm_none",
    "VerbForm_part",
    "VerbForm_partFut",
    "VerbForm_partPast",
    "VerbForm_partPres",
    "VerbForm_sup",
    "VerbForm_trans",
    "VerbForm_conv",
    "VerbForm_gdv",
    "VerbType_aux",
    "VerbType_cop",
    "VerbType_mod",
    "VerbType_light",
    "Voice_act",
    "Voice_cau",
    "Voice_pass",
    "Voice_mid",
    "Voice_int",
]
# Lookup tables derived from FEATURES and FIELDS. The statements below depend
# on each other and must stay in this order.
# NOTE: the loop variables used here (field, feat_id, name, i, feature) stay
# behind as module-level names after import.

# Maps the integer string ID of each feature to its name.
FEATURE_NAMES = {get_string_id(name): name for name in FEATURES}
# Maps each feature name to the id of its field (the part of the name before
# the first underscore, looked up in FIELDS).
FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES}
# Number of features per field id. This is counted before the integer-ID
# aliases are added to FEATURE_FIELDS below, so each feature counts once.
FIELD_SIZES = Counter(FEATURE_FIELDS.values())
# Add 1 to every field's size (for the NIL class -- see the offsets below).
for field in FIELD_SIZES:
    FIELD_SIZES[field] += 1
# Make FEATURE_FIELDS usable with integer feature IDs as well as names.
for feat_id, name in FEATURE_NAMES.items():
    FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name]
# Mapping of feature names to their 1-based position within their field
# (position 0 is left for the NIL class).
FEATURE_OFFSETS = {}
# Mapping of field ids to the index in FEATURES of the field's first feature.
FIELD_OFFSETS = {}
# Number of features seen so far for each field id.
_seen_fields = Counter()
for i, feature in enumerate(FEATURES):
    field = FEATURE_FIELDS[feature]
    # Add 1 for the NIL class, on each field
    FEATURE_OFFSETS[feature] = _seen_fields[field] + 1
    if _seen_fields[field] == 0:
        FIELD_OFFSETS[field] = i
    _seen_fields[field] += 1