mirror of https://github.com/explosion/spaCy.git
936 lines
26 KiB
Cython
936 lines
26 KiB
Cython
# cython: infer_types
|
|
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from libc.string cimport memset
|
|
import srsly
|
|
from collections import Counter
|
|
|
|
from .compat import basestring_
|
|
from .strings import get_string_id
|
|
from . import symbols
|
|
from .attrs cimport POS, IS_SPACE
|
|
from .attrs import LEMMA, intify_attrs
|
|
from .parts_of_speech cimport SPACE
|
|
from .parts_of_speech import IDS as POS_IDS
|
|
from .lexeme cimport Lexeme
|
|
from .errors import Errors
|
|
|
|
|
|
# Identifiers for the morphological fields (feature categories). Each member
# names one slot of a morphological analysis; `set_feature` dispatches on
# these values to pick the struct member to write, and the module-level
# `FIELDS` table exposes them by their string name.
# No explicit values are given, so the declaration order determines the
# numbering -- do not reorder members.
cdef enum univ_field_t:
    Field_Abbr
    Field_AdpType
    Field_AdvType
    Field_Animacy
    Field_Aspect
    Field_Case
    Field_ConjType
    Field_Connegative
    Field_Definite
    Field_Degree
    Field_Derivation
    Field_Echo
    Field_Foreign
    Field_Gender
    Field_Hyph
    Field_InfForm
    Field_Mood
    Field_NameType
    Field_Negative
    Field_NounType
    Field_Number
    Field_NumForm
    Field_NumType
    Field_NumValue
    Field_PartForm
    Field_PartType
    Field_Person
    Field_Polite
    Field_Polarity
    Field_Poss
    Field_Prefix
    Field_PrepCase
    Field_PronType
    Field_PunctSide
    Field_PunctType
    Field_Reflex
    Field_Style
    Field_StyleVariant
    Field_Tense
    Field_Typo
    Field_VerbForm
    Field_Voice
    Field_VerbType
|
|
|
|
|
|
def _normalize_props(props):
    """Rewrite a tag-map style attribute dict into its normalised form.

    Two passes are made over a private copy of the input:

    1. Deprecated `{'Field': 'value'}` entries are replaced by the boolean
       form `{'Field_value': True}` whenever 'Field_value' (with the value
       lower-cased) is a known entry of FEATURES.
    2. Part-of-speech entries are converted to their integer ID via POS_IDS,
       and a 'morph' entry is dropped. Everything else is copied as-is.

    props (dict): The attributes to normalise. Not modified.
    RETURNS (dict): A new dict with the normalised attributes.
    """
    normalized = {}
    pending = dict(props)
    # Pass 1: fold "Field: value" pairs into a single "Field_value: True" flag.
    for field_name in FIELDS:
        if field_name not in pending:
            continue
        candidate = '%s_%s' % (field_name, str(pending[field_name]).lower())
        if candidate in FEATURES:
            del pending[field_name]
            pending[candidate] = True
    # Pass 2: resolve part-of-speech values and filter out 'morph'.
    for name, setting in pending.items():
        if name == POS:
            # The POS attribute ID was used as the key: upper-case string
            # values and swap in the integer ID when one is known.
            if hasattr(setting, 'upper'):
                setting = setting.upper()
            if setting in POS_IDS:
                setting = POS_IDS[setting]
            normalized[name] = setting
            continue
        if isinstance(name, int) or setting is True:
            # Integer attribute IDs and boolean feature flags pass through.
            normalized[name] = setting
            continue
        lowered = name.lower()
        if lowered == 'pos':
            normalized[POS] = POS_IDS[setting.upper()]
        elif lowered != 'morph':
            normalized[name] = setting
    return normalized
|
|
|
|
|
|
def parse_feature(feature):
    """Look up the field a feature belongs to and its position in that field.

    feature: Key into FEATURE_FIELDS and FEATURE_OFFSETS.
    RETURNS (tuple): `(field, offset)` as stored in those two tables.
    Raises KeyError if the feature is missing from either table.
    """
    return (FEATURE_FIELDS[feature], FEATURE_OFFSETS[feature])
|
|
|
|
|
|
def get_field_id(feature):
    """Return the field ID registered for `feature` in FEATURE_FIELDS.

    feature: A key of FEATURE_FIELDS (feature name or its string ID).
    Raises KeyError if the feature is not registered.
    """
    field_id = FEATURE_FIELDS[feature]
    return field_id
|
|
|
|
|
|
def get_field_size(field):
    """Return the entry stored for `field` in FIELD_SIZES.

    field: A field ID, as found in the values of FIELDS.
    Raises KeyError if the field has no entry.
    """
    size = FIELD_SIZES[field]
    return size
|
|
|
|
|
|
def get_field_offset(field):
    """Return the entry stored for `field` in FIELD_OFFSETS.

    field: A field ID, as found in the values of FIELDS.
    Raises KeyError if the field has no entry.
    """
    offset = FIELD_OFFSETS[field]
    return offset
|
|
|
|
|
|
cdef class Morphology:
    '''Store the possible morphological analyses for a language, and index them
    by hash.

    To save space on each token, tokens only know the hash of their morphological
    analysis, so queries of morphological attributes are delegated
    to this class.
    '''
    def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
        """Create the morphology table from a tag map.

        string_store (StringStore): String table used to intern tags,
            features and lemmas.
        tag_map (dict): Maps tag strings to attribute dicts. Each entry is
            passed through `_normalize_props` before being stored.
        lemmatizer: Object called by `lemmatize` and whose `lookup` method
            is used by `assign_untagged`.
        exc (dict): Optional special cases, keyed by `(tag, orth)`.
        """
        self.mem = Pool()
        self.strings = string_store
        self.tags = PreshMap()
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        space_attrs = tag_map.get('SP', {POS: SPACE})
        if '_SP' not in tag_map:
            self.strings.add('_SP')
            # Copy before adding the entry so the caller's dict is untouched.
            tag_map = dict(tag_map)
            tag_map['_SP'] = space_attrs
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.tag_map = {}
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map)
        self.reverse_index = {}
        # Tag IDs are positions in the sorted tag list, so `reverse_index`
        # (tag hash -> ID) and `tag_names` (ID -> tag string) stay aligned.
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            attrs = _normalize_props(attrs)
            self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i

        self._cache = PreshMapArray(self.n_tags)
        self.exc = {}
        if exc is not None:
            for (tag, orth), attrs in exc.items():
                attrs = _normalize_props(attrs)
                self.add_special_case(
                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)

    def __reduce__(self):
        """Support pickling by re-running the constructor with the current
        string store, tag map, lemmatizer and exceptions.
        """
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
                self.exc), None, None)

    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not already
        present. Returns the hash of the new analysis.

        features (iterable): Feature names or feature IDs. String entries
            are added to the string store.
        Raises KeyError for a non-zero feature that is not in FEATURE_NAMES.
        """
        for f in features:
            if isinstance(f, basestring_):
                self.strings.add(f)
        features = intify_features(features)
        cdef attr_t feature
        for feature in features:
            if feature != 0 and feature not in FEATURE_NAMES:
                raise KeyError("Unknown feature: %s" % self.strings[feature])
        cdef MorphAnalysisC tag
        tag = create_rich_tag(features)
        cdef hash_t key = self.insert(tag)
        return key

    def get(self, hash_t morph):
        """Return the feature names of the analysis stored under `morph`.

        morph (hash_t): Hash of a stored analysis.
        RETURNS (list): The feature names, or an empty list if no analysis
            is stored under that hash.
        """
        tag = <MorphAnalysisC*>self.tags.get(morph)
        if tag == NULL:
            return []
        else:
            return tag_to_json(tag[0])

    cpdef update(self, hash_t morph, features):
        """Update a morphological analysis with new feature values.

        The stored analysis is copied, the given features are set on the
        copy, and the copy is inserted under its own hash. The analysis
        stored under `morph` is left unchanged.

        morph (hash_t): Hash of the analysis to start from. If nothing is
            stored under it, the update starts from an empty analysis.
        features (iterable): Feature names or feature IDs to set.
        RETURNS (hash_t): Hash of the updated analysis.
        """
        cdef MorphAnalysisC tag
        cdef MorphAnalysisC* stored = <MorphAnalysisC*>self.tags.get(morph)
        if stored == NULL:
            # Unknown hash: start from a zeroed analysis instead of
            # dereferencing a NULL pointer.
            memset(&tag, 0, sizeof(tag))
        else:
            tag = stored[0]
        features = intify_features(features)
        cdef attr_t feature
        for feature in features:
            field = get_field_id(feature)
            set_feature(&tag, field, feature, 1)
        morph = self.insert(tag)
        return morph

    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
        """Compute the lemma ID for a word form.

        univ_pos (univ_pos_t): The coarse part-of-speech, passed on to the
            lemmatizer.
        orth (attr_t): String ID of the word form.
        morphology (dict): Attributes of the token. Boolean `Field_value`
            flags are split into `{Field: value}` for the lemmatizer.
        RETURNS (attr_t): `orth` itself if it is not in the string store,
            the ID of the lower-cased form if there is no lemmatizer, and
            otherwise the ID of the first lemma the lemmatizer returns.
        """
        if orth not in self.strings:
            return orth
        cdef unicode py_string = self.strings[orth]
        if self.lemmatizer is None:
            return self.strings.add(py_string.lower())
        cdef list lemma_strings
        cdef unicode lemma_string
        # Normalize features into a dict keyed by the field, to make life easier
        # for the lemmatizer. Handles string-to-int conversion too.
        string_feats = {}
        for key, value in morphology.items():
            if value is True:
                name, value = self.strings.as_string(key).split('_', 1)
                string_feats[name] = value
            else:
                string_feats[self.strings.as_string(key)] = self.strings.as_string(value)
        lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats)
        lemma_string = lemma_strings[0]
        lemma = self.strings.add(lemma_string)
        return lemma

    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                         force=False):
        """Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

        tag (unicode): The part-of-speech tag to key the exception.
        orth (unicode): The word-form to key the exception.
        attrs (dict): The attributes to assign. They are normalised and
            converted to integer IDs before being stored.
        force (bool): Accepted but not used by this implementation.
        """
        attrs = dict(attrs)
        attrs = _normalize_props(attrs)
        self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
        self.exc[(tag_str, self.strings.add(orth_str))] = attrs

    cdef hash_t insert(self, MorphAnalysisC tag) except 0:
        """Store a copy of `tag` under its hash, unless that hash is already
        present, and return the hash.
        """
        cdef hash_t key = hash_tag(tag)
        if self.tags.get(key) == NULL:
            # The map only holds a pointer, so the struct is copied into
            # memory allocated from this object's pool.
            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            tag_ptr[0] = tag
            self.tags.set(key, <void*>tag_ptr)
        return key

    cdef int assign_untagged(self, TokenC* token) except -1:
        """Set morphological attributes on a token without a POS tag. Uses
        the lemmatizer's lookup() method, which looks up the string in the
        table provided by the language data as lemma_lookup (if available).
        """
        # Only fill in the lemma when none has been set yet.
        if token.lemma == 0:
            orth_str = self.strings[token.lex.orth]
            lemma = self.lemmatizer.lookup(orth_str)
            token.lemma = self.strings.add(lemma)

    cdef int assign_tag(self, TokenC* token, tag_str) except -1:
        """Assign a tag to a token. Tags known to the tag map go through
        `assign_tag_id`; for any other tag only `token.tag` is set.
        """
        cdef attr_t tag = self.strings.as_int(tag_str)
        if tag in self.reverse_index:
            tag_id = self.reverse_index[tag]
            self.assign_tag_id(token, tag_id)
        else:
            token.tag = tag

    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
        """Assign the tag with index `tag_id` to a token, setting its lemma,
        pos, tag and morph, and then applying any special case registered
        for the (tag, orth) pair.

        Raises ValueError (E014) if `tag_id` is not a valid tag index.
        """
        # Valid IDs are 0 .. n_tags - 1: `tag_names` has exactly n_tags
        # entries, so tag_id == n_tags must be rejected as well.
        if tag_id >= self.n_tags:
            raise ValueError(Errors.E014.format(tag=tag_id))
        # Ensure spaces get tagged as space.
        # It seems pretty arbitrary to put this logic here, but there's really
        # nowhere better. I guess the justification is that this is where the
        # specific word and the tag interact. Still, we should have a better
        # way to enforce this rule, or figure out why the statistical model fails.
        # Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
            tag_id = self.reverse_index[self.strings.add('_SP')]
        tag_str = self.tag_names[tag_id]
        # Work on a copy: POS is popped below and must stay in the tag map.
        features = dict(self.tag_map.get(tag_str, {}))
        if features:
            pos = self.strings.as_int(features.pop(POS))
        else:
            pos = 0
        # Lemmas are cached per (tag, orth) pair; 0 means "not cached yet".
        cdef attr_t lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
        if lemma == 0:
            # Ugh, self.lemmatize has opposite arg order from self.lemmatizer :(
            lemma = self.lemmatize(pos, token.lex.orth, features)
            self._cache.set(tag_id, token.lex.orth, <void*>lemma)
        token.lemma = lemma
        token.pos = <univ_pos_t>pos
        token.tag = self.strings[tag_str]
        token.morph = self.add(features)
        if (self.tag_names[tag_id], token.lex.orth) in self.exc:
            self._assign_tag_from_exceptions(token, tag_id)

    cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1:
        """Overwrite the token's pos and lemma with the values from the
        special case registered for its (tag, orth) pair, where given.
        """
        key = (self.tag_names[tag_id], token.lex.orth)
        cdef dict attrs
        attrs = self.exc[key]
        token.pos = attrs.get(POS, token.pos)
        token.lemma = attrs.get(LEMMA, token.lemma)

    def load_morph_exceptions(self, dict exc):
        """Register every entry of a nested `{tag: {form: attrs}}` dict as a
        special case.
        """
        # Map (form, pos) to attributes
        for tag_str, entries in exc.items():
            for form_str, attrs in entries.items():
                self.add_special_case(tag_str, form_str, attrs)

    def to_bytes(self):
        """Serialise the stored analyses as JSON: one list of feature names
        per analysis.
        """
        json_tags = []
        for key in self.tags:
            tag_ptr = <MorphAnalysisC*>self.tags.get(key)
            if tag_ptr != NULL:
                json_tags.append(tag_to_json(tag_ptr[0]))
        return srsly.json_dumps(json_tags)

    def from_bytes(self, byte_string):
        """Not implemented."""
        raise NotImplementedError

    def to_disk(self, path):
        """Not implemented."""
        raise NotImplementedError

    def from_disk(self, path):
        """Not implemented."""
        raise NotImplementedError
|
|
|
|
|
|
cpdef univ_pos_t get_int_tag(pos_):
    """Placeholder: ignores `pos_` and always returns the zero value of
    `univ_pos_t`.
    """
    cdef univ_pos_t unset = <univ_pos_t>0
    return unset
|
|
|
|
cpdef intify_features(features):
    """Convert an iterable of features to the set of their string IDs.

    features (iterable): Values accepted by `get_string_id`.
    RETURNS (set): One ID per distinct feature.
    """
    return set(map(get_string_id, features))
|
|
|
|
cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
    # Hash the raw bytes of the analysis struct (seed 0). The result is the
    # key under which the analysis is stored in the morphology table.
    cdef hash_t digest = mrmr.hash64(&tag, sizeof(tag), 0)
    return digest
|
|
|
|
|
|
def get_feature_field(feature):
    """Return the field ID of a feature.

    The feature is first converted to its string ID, and the lookup is done
    on that ID (FEATURE_FIELDS holds an entry for the ID of every known
    feature), so names and IDs are resolved through the same path.

    feature: A feature name or feature ID.
    Raises KeyError if the feature is not registered.
    """
    cdef attr_t key = get_string_id(feature)
    return FEATURE_FIELDS[key]
|
|
|
|
|
|
cdef MorphAnalysisC create_rich_tag(features) except *:
    """Build an analysis struct with exactly the given features set.

    features (iterable): Feature IDs. Each one is written to the slot of
        the field it belongs to; all other slots are zero.
    """
    cdef MorphAnalysisC analysis
    cdef attr_t feat_id
    # Start from an all-zero struct so unset fields compare (and hash) equal.
    memset(&analysis, 0, sizeof(analysis))
    for feat_id in features:
        set_feature(&analysis, get_field_id(feat_id), feat_id, 1)
    return analysis
|
|
|
|
|
|
cdef tag_to_json(MorphAnalysisC tag):
    """Return the names of all features set on an analysis.

    Every field whose slot is non-zero contributes the name registered for
    that feature ID in FEATURE_NAMES. The `typo` field is included, so that
    every field `set_feature` can write is also reported here.

    tag (MorphAnalysisC): The analysis to convert.
    RETURNS (list): Feature names, in the fixed field order used below.
    Raises KeyError if a slot holds an ID that is not in FEATURE_NAMES.
    """
    # The order of this tuple determines the order of the output list.
    slots = (
        tag.abbr,
        tag.adp_type,
        tag.adv_type,
        tag.animacy,
        tag.aspect,
        tag.case,
        tag.conj_type,
        tag.connegative,
        tag.definite,
        tag.degree,
        tag.derivation,
        tag.echo,
        tag.foreign,
        tag.gender,
        tag.hyph,
        tag.inf_form,
        tag.mood,
        tag.negative,
        tag.number,
        tag.name_type,
        tag.noun_type,
        tag.num_form,
        tag.num_type,
        tag.num_value,
        tag.part_form,
        tag.part_type,
        tag.person,
        tag.polite,
        tag.polarity,
        tag.poss,
        tag.prefix,
        tag.prep_case,
        tag.pron_type,
        tag.punct_side,
        tag.punct_type,
        tag.reflex,
        tag.style,
        tag.style_variant,
        tag.tense,
        tag.typo,
        tag.verb_form,
        tag.voice,
        tag.verb_type,
    )
    features = []
    for feature_id in slots:
        # 0 marks a field with no feature set.
        if feature_id != 0:
            features.append(FEATURE_NAMES[feature_id])
    return features
|
|
|
|
|
|
cdef MorphAnalysisC tag_from_json(json_tag):
    # NOTE: deserialisation is not implemented; `json_tag` is ignored.
    # The struct is zeroed before being returned so callers receive a
    # well-defined empty analysis rather than uninitialised stack memory.
    cdef MorphAnalysisC tag
    memset(&tag, 0, sizeof(tag))
    return tag
|
|
|
|
|
|
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
    # Return 1 if any field of the analysis holds exactly `feature`,
    # otherwise 0. Every field is compared in turn; the first match wins.
    if tag.abbr == feature:
        return 1
    if tag.adp_type == feature:
        return 1
    if tag.adv_type == feature:
        return 1
    if tag.animacy == feature:
        return 1
    if tag.aspect == feature:
        return 1
    if tag.case == feature:
        return 1
    if tag.conj_type == feature:
        return 1
    if tag.connegative == feature:
        return 1
    if tag.definite == feature:
        return 1
    if tag.degree == feature:
        return 1
    if tag.derivation == feature:
        return 1
    if tag.echo == feature:
        return 1
    if tag.foreign == feature:
        return 1
    if tag.gender == feature:
        return 1
    if tag.hyph == feature:
        return 1
    if tag.inf_form == feature:
        return 1
    if tag.mood == feature:
        return 1
    if tag.negative == feature:
        return 1
    if tag.number == feature:
        return 1
    if tag.name_type == feature:
        return 1
    if tag.noun_type == feature:
        return 1
    if tag.num_form == feature:
        return 1
    if tag.num_type == feature:
        return 1
    if tag.num_value == feature:
        return 1
    if tag.part_form == feature:
        return 1
    if tag.part_type == feature:
        return 1
    if tag.person == feature:
        return 1
    if tag.polite == feature:
        return 1
    if tag.polarity == feature:
        return 1
    if tag.poss == feature:
        return 1
    if tag.prefix == feature:
        return 1
    if tag.prep_case == feature:
        return 1
    if tag.pron_type == feature:
        return 1
    if tag.punct_side == feature:
        return 1
    if tag.punct_type == feature:
        return 1
    if tag.reflex == feature:
        return 1
    if tag.style == feature:
        return 1
    if tag.style_variant == feature:
        return 1
    if tag.tense == feature:
        return 1
    if tag.typo == feature:
        return 1
    if tag.verb_form == feature:
        return 1
    if tag.voice == feature:
        return 1
    if tag.verb_type == feature:
        return 1
    # No field matched.
    return 0
|
|
|
|
cdef int set_feature(MorphAnalysisC* tag,
        univ_field_t field, attr_t feature, int value) except -1:
    """Write a feature into (or clear) the slot of its field.

    tag (MorphAnalysisC*): The analysis to modify in place.
    field (univ_field_t): Selects which slot of the analysis is written.
    feature (attr_t): The feature ID. A feature of 0 is a no-op.
    value (int): If equal to True the slot is set to `feature`; for any
        other value the slot is reset to 0.
    Raises ValueError if `field` is not one of the known fields.
    """
    # The zero feature never touches the analysis, whatever the field.
    if feature == 0:
        return 0
    if value == True:
        stored = feature
    else:
        stored = 0
    if field == Field_Abbr:
        tag.abbr = stored
    elif field == Field_AdpType:
        tag.adp_type = stored
    elif field == Field_AdvType:
        tag.adv_type = stored
    elif field == Field_Animacy:
        tag.animacy = stored
    elif field == Field_Aspect:
        tag.aspect = stored
    elif field == Field_Case:
        tag.case = stored
    elif field == Field_ConjType:
        tag.conj_type = stored
    elif field == Field_Connegative:
        tag.connegative = stored
    elif field == Field_Definite:
        tag.definite = stored
    elif field == Field_Degree:
        tag.degree = stored
    elif field == Field_Derivation:
        tag.derivation = stored
    elif field == Field_Echo:
        tag.echo = stored
    elif field == Field_Foreign:
        tag.foreign = stored
    elif field == Field_Gender:
        tag.gender = stored
    elif field == Field_Hyph:
        tag.hyph = stored
    elif field == Field_InfForm:
        tag.inf_form = stored
    elif field == Field_Mood:
        tag.mood = stored
    elif field == Field_Negative:
        tag.negative = stored
    elif field == Field_Number:
        tag.number = stored
    elif field == Field_NameType:
        tag.name_type = stored
    elif field == Field_NounType:
        tag.noun_type = stored
    elif field == Field_NumForm:
        tag.num_form = stored
    elif field == Field_NumType:
        tag.num_type = stored
    elif field == Field_NumValue:
        tag.num_value = stored
    elif field == Field_PartForm:
        tag.part_form = stored
    elif field == Field_PartType:
        tag.part_type = stored
    elif field == Field_Person:
        tag.person = stored
    elif field == Field_Polite:
        tag.polite = stored
    elif field == Field_Polarity:
        tag.polarity = stored
    elif field == Field_Poss:
        tag.poss = stored
    elif field == Field_Prefix:
        tag.prefix = stored
    elif field == Field_PrepCase:
        tag.prep_case = stored
    elif field == Field_PronType:
        tag.pron_type = stored
    elif field == Field_PunctSide:
        tag.punct_side = stored
    elif field == Field_PunctType:
        tag.punct_type = stored
    elif field == Field_Reflex:
        tag.reflex = stored
    elif field == Field_Style:
        tag.style = stored
    elif field == Field_StyleVariant:
        tag.style_variant = stored
    elif field == Field_Tense:
        tag.tense = stored
    elif field == Field_Typo:
        tag.typo = stored
    elif field == Field_VerbForm:
        tag.verb_form = stored
    elif field == Field_Voice:
        tag.voice = stored
    elif field == Field_VerbType:
        tag.verb_type = stored
    else:
        raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature))
|
|
|
|
|
|
# Maps the string name of each morphological field to its `univ_field_t`
# member. The name is the part of a feature string before the first
# underscore (e.g. 'Case' for 'Case_nom'), which is how FEATURE_FIELDS
# resolves a feature to its field.
FIELDS = {
    'Abbr': Field_Abbr,
    'AdpType': Field_AdpType,
    'AdvType': Field_AdvType,
    'Animacy': Field_Animacy,
    'Aspect': Field_Aspect,
    'Case': Field_Case,
    'ConjType': Field_ConjType,
    'Connegative': Field_Connegative,
    'Definite': Field_Definite,
    'Degree': Field_Degree,
    'Derivation': Field_Derivation,
    'Echo': Field_Echo,
    'Foreign': Field_Foreign,
    'Gender': Field_Gender,
    'Hyph': Field_Hyph,
    'InfForm': Field_InfForm,
    'Mood': Field_Mood,
    'NameType': Field_NameType,
    'Negative': Field_Negative,
    'NounType': Field_NounType,
    'Number': Field_Number,
    'NumForm': Field_NumForm,
    'NumType': Field_NumType,
    'NumValue': Field_NumValue,
    'PartForm': Field_PartForm,
    'PartType': Field_PartType,
    'Person': Field_Person,
    'Polite': Field_Polite,
    'Polarity': Field_Polarity,
    'Poss': Field_Poss,
    'Prefix': Field_Prefix,
    'PrepCase': Field_PrepCase,
    'PronType': Field_PronType,
    'PunctSide': Field_PunctSide,
    'PunctType': Field_PunctType,
    'Reflex': Field_Reflex,
    'Style': Field_Style,
    'StyleVariant': Field_StyleVariant,
    'Tense': Field_Tense,
    'Typo': Field_Typo,
    'VerbForm': Field_VerbForm,
    'Voice': Field_Voice,
    'VerbType': Field_VerbType
}
|
|
|
|
# All known morphological features, written as '<Field>_<value>'. The part
# before the first underscore must be a key of FIELDS. Features of the same
# field are kept together; the position of an entry within its field's run
# is what FEATURE_OFFSETS records.
# NOTE: every entry needs its comma *outside* the quotes -- a comma inside
# the string makes Python silently concatenate it with the next literal.
FEATURES = [
    "Abbr_yes",
    "AdpType_circ",
    "AdpType_comprep",
    "AdpType_prep",
    "AdpType_post",
    "AdpType_voc",
    "AdvType_adadj",
    "AdvType_cau",
    "AdvType_deg",
    "AdvType_ex",
    "AdvType_loc",
    "AdvType_man",
    "AdvType_mod",
    "AdvType_sta",
    "AdvType_tim",
    "Animacy_anim",
    "Animacy_hum",
    "Animacy_inan",
    "Animacy_nhum",
    "Aspect_freq",
    "Aspect_imp",
    "Aspect_mod",
    "Aspect_none",
    "Aspect_perf",
    "Case_abe",
    "Case_abl",
    "Case_abs",
    "Case_acc",
    "Case_ade",
    "Case_all",
    "Case_cau",
    "Case_com",
    "Case_dat",
    "Case_del",
    "Case_dis",
    "Case_ela",
    "Case_ess",
    "Case_gen",
    "Case_ill",
    "Case_ine",
    "Case_ins",
    "Case_loc",
    "Case_lat",
    "Case_nom",
    "Case_par",
    "Case_sub",
    "Case_sup",
    "Case_tem",
    "Case_ter",
    "Case_tra",
    "Case_voc",
    "ConjType_comp",
    "ConjType_oper",
    "Connegative_yes",
    "Definite_cons",
    "Definite_def",
    "Definite_ind",
    "Definite_red",
    "Definite_two",
    "Degree_abs",
    "Degree_cmp",
    "Degree_comp",
    "Degree_none",
    "Degree_pos",
    "Degree_sup",
    "Degree_com",
    "Degree_dim",
    "Derivation_minen",
    "Derivation_sti",
    "Derivation_inen",
    "Derivation_lainen",
    "Derivation_ja",
    "Derivation_ton",
    "Derivation_vs",
    "Derivation_ttain",
    "Derivation_ttaa",
    "Echo_rdp",
    "Echo_ech",
    "Foreign_foreign",
    "Foreign_fscript",
    "Foreign_tscript",
    "Foreign_yes",
    "Gender_com",
    "Gender_fem",
    "Gender_masc",
    "Gender_neut",
    "Gender_dat_masc",
    "Gender_dat_fem",
    "Gender_erg_masc",
    "Gender_erg_fem",
    "Gender_psor_masc",
    "Gender_psor_fem",
    "Gender_psor_neut",
    "Hyph_yes",
    "InfForm_one",
    "InfForm_two",
    "InfForm_three",
    "Mood_cnd",
    "Mood_imp",
    "Mood_ind",
    "Mood_n",
    "Mood_pot",
    "Mood_sub",
    "Mood_opt",
    "NameType_geo",
    "NameType_prs",
    "NameType_giv",
    "NameType_sur",
    "NameType_nat",
    "NameType_com",
    "NameType_pro",
    "NameType_oth",
    "Negative_neg",
    "Negative_pos",
    "Negative_yes",
    "NounType_com",
    "NounType_prop",
    "NounType_class",
    "Number_com",
    "Number_dual",
    "Number_none",
    "Number_plur",
    "Number_sing",
    "Number_ptan",
    "Number_count",
    "Number_abs_sing",
    "Number_abs_plur",
    "Number_dat_sing",
    "Number_dat_plur",
    "Number_erg_sing",
    "Number_erg_plur",
    "Number_psee_sing",
    "Number_psee_plur",
    "Number_psor_sing",
    "Number_psor_plur",
    "NumForm_digit",
    "NumForm_roman",
    "NumForm_word",
    "NumType_card",
    "NumType_dist",
    "NumType_frac",
    "NumType_gen",
    "NumType_mult",
    "NumType_none",
    "NumType_ord",
    "NumType_sets",
    "NumValue_one",
    "NumValue_two",
    "NumValue_three",
    "PartForm_pres",
    "PartForm_past",
    "PartForm_agt",
    "PartForm_neg",
    "PartType_mod",
    "PartType_emp",
    "PartType_res",
    "PartType_inf",
    "PartType_vbp",
    "Person_one",
    "Person_two",
    "Person_three",
    "Person_none",
    "Person_abs_one",
    "Person_abs_two",
    "Person_abs_three",
    "Person_dat_one",
    "Person_dat_two",
    "Person_dat_three",
    "Person_erg_one",
    "Person_erg_two",
    "Person_erg_three",
    "Person_psor_one",
    "Person_psor_two",
    "Person_psor_three",
    "Polarity_neg",
    "Polarity_pos",
    "Polite_inf",
    "Polite_pol",
    "Polite_abs_inf",
    "Polite_abs_pol",
    "Polite_erg_inf",
    "Polite_erg_pol",
    "Polite_dat_inf",
    "Polite_dat_pol",
    "Poss_yes",
    "Prefix_yes",
    "PrepCase_npr",
    "PrepCase_pre",
    "PronType_advPart",
    "PronType_art",
    "PronType_default",
    "PronType_dem",
    "PronType_ind",
    "PronType_int",
    "PronType_neg",
    "PronType_prs",
    "PronType_rcp",
    "PronType_rel",
    "PronType_tot",
    "PronType_clit",
    "PronType_exc",
    "PunctSide_ini",
    "PunctSide_fin",
    "PunctType_peri",
    "PunctType_qest",
    "PunctType_excl",
    "PunctType_quot",
    "PunctType_brck",
    "PunctType_comm",
    "PunctType_colo",
    "PunctType_semi",
    "PunctType_dash",
    "Reflex_yes",
    "Style_arch",
    "Style_rare",
    "Style_poet",
    "Style_norm",
    "Style_coll",
    "Style_vrnc",
    "Style_sing",
    "Style_expr",
    "Style_derg",
    "Style_vulg",
    "Style_yes",
    "StyleVariant_styleShort",
    "StyleVariant_styleBound",
    "Tense_fut",
    "Tense_imp",
    "Tense_past",
    "Tense_pres",
    "Typo_yes",
    "VerbForm_fin",
    "VerbForm_ger",
    "VerbForm_inf",
    "VerbForm_none",
    "VerbForm_part",
    "VerbForm_partFut",
    "VerbForm_partPast",
    "VerbForm_partPres",
    "VerbForm_sup",
    "VerbForm_trans",
    "VerbForm_conv",
    "VerbForm_gdv",
    "VerbType_aux",
    "VerbType_cop",
    "VerbType_mod",
    "VerbType_light",
    "Voice_act",
    "Voice_cau",
    "Voice_pass",
    "Voice_mid",
    "Voice_int",
]
|
|
|
|
# Feature string ID -> feature name.
FEATURE_NAMES = {get_string_id(name): name for name in FEATURES}

# Feature name -> field ID. The field is named by the part of the feature
# string before the first underscore.
FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES}

# Number of features per field. This has to be counted while FEATURE_FIELDS
# still holds one entry per feature: the ID aliases added just below would
# otherwise make every feature count twice.
FIELD_SIZES = Counter(FEATURE_FIELDS.values())

# Make FEATURE_FIELDS addressable by feature ID as well as by name.
for feat_id, name in FEATURE_NAMES.items():
    FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name]

# FEATURE_OFFSETS: feature name -> position of the feature within its field.
# FIELD_OFFSETS: field ID -> index in FEATURES of the field's first feature.
FEATURE_OFFSETS = {}
FIELD_OFFSETS = {}
_seen_fields = Counter()
for i, feature in enumerate(FEATURES):
    field = FEATURE_FIELDS[feature]
    FEATURE_OFFSETS[feature] = _seen_fields[field]
    # Test the per-field count, not the Counter itself: the first time a
    # field is seen, `i` is the index where that field starts.
    if _seen_fields[field] == 0:
        FIELD_OFFSETS[field] = i
    _seen_fields[field] += 1
|