mirror of https://github.com/explosion/spaCy.git
Minor refactor for Morphology and MorphAnalysis (#5804)
* `MorphAnalysis.get` returns only the field values
* Move `_normalize_props` inside `Morphology` as `Morphology.normalize_attrs` and simplify
* Simplify POS field detection/conversion
* Convert all non-POS features to strings
* `Morphology` returns an empty string for a missing morph to align with the FEATS string returned for an existing morph
* Remove unused `list_to_feats`
This commit is contained in:
parent
d0c6d1efc5
commit
fdb8815ef5
|
@ -18,43 +18,14 @@ from .util import ensure_path
|
|||
from . import symbols
|
||||
|
||||
|
||||
def _normalize_props(props):
|
||||
"""Convert attrs dict so that POS is always by ID, other features are left
|
||||
as is as long as they are strings or IDs.
|
||||
"""
|
||||
out = {}
|
||||
props = dict(props)
|
||||
for key, value in props.items():
|
||||
# convert POS value to ID
|
||||
if key == POS:
|
||||
if hasattr(value, 'upper'):
|
||||
value = value.upper()
|
||||
if value in POS_IDS:
|
||||
value = POS_IDS[value]
|
||||
out[key] = value
|
||||
elif isinstance(key, str) and key.lower() == 'pos':
|
||||
out[POS] = POS_IDS[value.upper()]
|
||||
# sort values
|
||||
elif isinstance(value, str) and Morphology.VALUE_SEP in value:
|
||||
out[key] = Morphology.VALUE_SEP.join(
|
||||
sorted(value.split(Morphology.VALUE_SEP)))
|
||||
# accept any string or ID fields and values
|
||||
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
|
||||
out[key] = value
|
||||
else:
|
||||
warnings.warn(Warnings.W100.format(feature={key: value}))
|
||||
return out
|
||||
|
||||
|
||||
cdef class Morphology:
|
||||
'''Store the possible morphological analyses for a language, and index them
|
||||
"""Store the possible morphological analyses for a language, and index them
|
||||
by hash.
|
||||
|
||||
To save space on each token, tokens only know the hash of their morphological
|
||||
analysis, so queries of morphological attributes are delegated
|
||||
To save space on each token, tokens only know the hash of their
|
||||
morphological analysis, so queries of morphological attributes are delegated
|
||||
to this class.
|
||||
'''
|
||||
|
||||
"""
|
||||
FEATURE_SEP = "|"
|
||||
FIELD_SEP = "="
|
||||
VALUE_SEP = ","
|
||||
|
@ -86,7 +57,7 @@ cdef class Morphology:
|
|||
tag_map = dict(tag_map)
|
||||
tag_map['_SP'] = space_attrs
|
||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||
attrs = _normalize_props(attrs)
|
||||
attrs = self.normalize_attrs(attrs)
|
||||
self.add(attrs)
|
||||
self.tag_map[tag_str] = dict(attrs)
|
||||
self.reverse_index[self.strings.add(tag_str)] = i
|
||||
|
@ -138,7 +109,7 @@ cdef class Morphology:
|
|||
return tag.key
|
||||
|
||||
def normalize_features(self, features):
|
||||
"""Create a normalized UFEATS string from a features string or dict.
|
||||
"""Create a normalized FEATS string from a features string or dict.
|
||||
|
||||
features (Union[dict, str]): Features as dict or UFEATS string.
|
||||
RETURNS (str): Features as normalized UFEATS string.
|
||||
|
@ -148,7 +119,7 @@ cdef class Morphology:
|
|||
if not isinstance(features, dict):
|
||||
warnings.warn(Warnings.W100.format(feature=features))
|
||||
features = {}
|
||||
features = _normalize_props(features)
|
||||
features = self.normalize_attrs(features)
|
||||
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
|
||||
# normalized UFEATS string with sorted fields and values
|
||||
norm_feats_string = self.FEATURE_SEP.join(sorted([
|
||||
|
@ -157,6 +128,33 @@ cdef class Morphology:
|
|||
]))
|
||||
return norm_feats_string or self.EMPTY_MORPH
|
||||
|
||||
def normalize_attrs(self, attrs):
|
||||
"""Convert attrs dict so that POS is always by ID, other features are
|
||||
by string. Values separated by VALUE_SEP are sorted.
|
||||
"""
|
||||
out = {}
|
||||
attrs = dict(attrs)
|
||||
for key, value in attrs.items():
|
||||
# convert POS value to ID
|
||||
if key == POS or (isinstance(key, str) and key.upper() == "POS"):
|
||||
if isinstance(value, str) and value.upper() in POS_IDS:
|
||||
value = POS_IDS[value.upper()]
|
||||
elif isinstance(value, int) and value not in POS_IDS.values():
|
||||
warnings.warn(Warnings.W100.format(feature={key: value}))
|
||||
continue
|
||||
out[POS] = value
|
||||
# accept any string or ID fields and values and convert to strings
|
||||
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
|
||||
key = self.strings.as_string(key)
|
||||
value = self.strings.as_string(value)
|
||||
# sort values
|
||||
if self.VALUE_SEP in value:
|
||||
value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
|
||||
out[key] = value
|
||||
else:
|
||||
warnings.warn(Warnings.W100.format(feature={key: value}))
|
||||
return out
|
||||
|
||||
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
|
||||
"""Creates a MorphAnalysisC from a list of intified
|
||||
("Field", "Field=Value") tuples where fields with multiple values have
|
||||
|
@ -183,7 +181,7 @@ cdef class Morphology:
|
|||
def get(self, hash_t morph):
|
||||
tag = <MorphAnalysisC*>self.tags.get(morph)
|
||||
if tag == NULL:
|
||||
return []
|
||||
return ""
|
||||
else:
|
||||
return self.strings[tag.key]
|
||||
|
||||
|
@ -218,7 +216,7 @@ cdef class Morphology:
|
|||
orth (str): The word-form to key the exception.
|
||||
"""
|
||||
attrs = dict(attrs)
|
||||
attrs = _normalize_props(attrs)
|
||||
attrs = self.normalize_attrs(attrs)
|
||||
self.add(attrs)
|
||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||
self._exc[(tag_str, self.strings.add(orth_str))] = attrs
|
||||
|
@ -282,7 +280,7 @@ cdef class Morphology:
|
|||
# Map (form, pos) to attributes
|
||||
for tag, exc in morph_rules.items():
|
||||
for orth, attrs in exc.items():
|
||||
attrs = _normalize_props(attrs)
|
||||
attrs = self.normalize_attrs(attrs)
|
||||
self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
||||
|
||||
@property
|
||||
|
@ -309,19 +307,6 @@ cdef class Morphology:
|
|||
return ""
|
||||
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
|
||||
|
||||
@staticmethod
|
||||
def list_to_feats(feats_list):
|
||||
if len(feats_list) == 0:
|
||||
return ""
|
||||
feats_dict = {}
|
||||
for feat in feats_list:
|
||||
field, value = feat.split(Morphology.FIELD_SEP)
|
||||
if field not in feats_dict:
|
||||
feats_dict[field] = set()
|
||||
feats_dict[field].add(value)
|
||||
feats_dict = {field: Morphology.VALUE_SEP.join(sorted(values)) for field, values in feats_dict.items()}
|
||||
return Morphology.dict_to_feats(feats_dict)
|
||||
|
||||
|
||||
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
|
||||
cdef int i
|
||||
|
|
|
@ -35,7 +35,7 @@ def test_token_morph_key(i_has):
|
|||
|
||||
|
||||
def test_morph_props(i_has):
|
||||
assert i_has[0].morph.get("PronType") == ["PronType=prs"]
|
||||
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||
assert i_has[1].morph.get("PronType") == []
|
||||
|
||||
|
||||
|
@ -47,20 +47,20 @@ def test_morph_iter(i_has):
|
|||
|
||||
|
||||
def test_morph_get(i_has):
|
||||
assert i_has[0].morph.get("PronType") == ["PronType=prs"]
|
||||
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||
|
||||
|
||||
def test_morph_set(i_has):
|
||||
assert i_has[0].morph.get("PronType") == ["PronType=prs"]
|
||||
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||
# set by string
|
||||
i_has[0].morph_ = "PronType=unk"
|
||||
assert i_has[0].morph.get("PronType") == ["PronType=unk"]
|
||||
assert i_has[0].morph.get("PronType") == ["unk"]
|
||||
# set by string, fields are alphabetized
|
||||
i_has[0].morph_ = "PronType=123|NounType=unk"
|
||||
assert i_has[0].morph_ == "NounType=unk|PronType=123"
|
||||
# set by dict
|
||||
i_has[0].morph_ = {"AType": "123", "BType": "unk", "POS": "ADJ"}
|
||||
assert i_has[0].morph_ == "AType=123|BType=unk|POS=ADJ"
|
||||
i_has[0].morph_ = {"AType": "123", "BType": "unk"}
|
||||
assert i_has[0].morph_ == "AType=123|BType=unk"
|
||||
# set by string with multiple values, fields and values are alphabetized
|
||||
i_has[0].morph_ = "BType=c|AType=b,a"
|
||||
assert i_has[0].morph_ == "AType=a,b|BType=c"
|
||||
|
|
|
@ -4,10 +4,8 @@ from spacy.morphology import Morphology
|
|||
def test_feats_converters():
|
||||
feats = "Case=dat,gen|Number=sing"
|
||||
feats_dict = {"Case": "dat,gen", "Number": "sing"}
|
||||
feats_list = feats.split(Morphology.FEATURE_SEP)
|
||||
|
||||
# simple conversions
|
||||
assert Morphology.list_to_feats(feats_list) == feats
|
||||
assert Morphology.dict_to_feats(feats_dict) == feats
|
||||
assert Morphology.feats_to_dict(feats) == feats_dict
|
||||
|
||||
|
@ -18,8 +16,6 @@ def test_feats_converters():
|
|||
# unsorted input is normalized
|
||||
unsorted_feats = "Number=sing|Case=gen,dat"
|
||||
unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"}
|
||||
unsorted_feats_list = feats.split(Morphology.FEATURE_SEP)
|
||||
assert Morphology.feats_to_dict(unsorted_feats) == feats_dict
|
||||
assert Morphology.dict_to_feats(unsorted_feats_dict) == feats
|
||||
assert Morphology.list_to_feats(unsorted_feats_list) == feats
|
||||
assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats
|
||||
|
|
|
@ -2,6 +2,7 @@ from libc.string cimport memset
|
|||
cimport numpy as np
|
||||
|
||||
from ..errors import Errors
|
||||
from ..morphology import Morphology
|
||||
from ..vocab cimport Vocab
|
||||
from ..typedefs cimport hash_t, attr_t
|
||||
from ..morphology cimport list_features, check_feature, get_by_field
|
||||
|
@ -58,10 +59,11 @@ cdef class MorphAnalysis:
|
|||
return self.key != other.key
|
||||
|
||||
def get(self, field):
|
||||
"""Retrieve a feature by field."""
|
||||
"""Retrieve feature values by field."""
|
||||
cdef attr_t field_id = self.vocab.strings.as_int(field)
|
||||
cdef np.ndarray results = get_by_field(&self.c, field_id)
|
||||
return [self.vocab.strings[result] for result in results]
|
||||
features = [self.vocab.strings[result] for result in results]
|
||||
return [f.split(Morphology.FIELD_SEP)[1] for f in features]
|
||||
|
||||
def to_json(self):
|
||||
"""Produce a json serializable representation as a UD FEATS-style
|
||||
|
|
Loading…
Reference in New Issue