mirror of https://github.com/explosion/spaCy.git
Improve tag map initialization and updating (#5764)
* Improve tag map initialization and updating

  Generalize tag map initialization and updating so that the tag map can be loaded correctly prior to loading a `Corpus` with `spacy debug-data` and `spacy train`.

* normalize provided tag map as necessary

* use the same method for initializing and updating the tag map

* Replace rather than update tag map

  Replace rather than update the tag map when loading a custom tag map. Updating the tag map is problematic due to the sorted list of tag names and the fact that the tag map would contain lingering/unwanted tags from the default tag map.

* Update CLI scripts

* Reinitialize cache after loading new tag map

  Reinitialize the cache with the right size after loading a new tag map.
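The difference between updating and replacing is plain dict semantics. A minimal sketch (not spaCy code; the tag names are illustrative) of why updating leaves lingering tags behind:

    # Illustrative stand-ins for a default and a custom tag map.
    default_tag_map = {"NN": {"pos": "NOUN"}, "VBZ": {"pos": "VERB"}, "JJ": {"pos": "ADJ"}}
    custom_tag_map = {"NOUN": {"pos": "NOUN"}, "VERB": {"pos": "VERB"}}

    # Updating merges the custom tags into the defaults, so unwanted default
    # tags survive, and anything derived from the keys (the sorted tag names,
    # the tag count, the cache size) is computed over a mixed inventory.
    merged = dict(default_tag_map)
    merged.update(custom_tag_map)
    assert sorted(merged) == ["JJ", "NN", "NOUN", "VBZ", "VERB"]

    # Replacing starts from the custom map alone.
    replaced = dict(custom_tag_map)
    assert sorted(replaced) == ["NOUN", "VERB"]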
parent b81a89f0a9
commit 9ee1c54f40
spacy/cli/debug_data.py
@@ -131,8 +131,8 @@ def debug_data(
     tag_map = {}
     if tag_map_path is not None:
         tag_map = srsly.read_json(tag_map_path)
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     msg.divider("Data file validation")
 
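A hedged sketch of the new call pattern: the CLI reads the tag map from JSON and installs it wholesale via the new method ("tag_map.json" is a hypothetical path):

    import srsly
    from spacy.lang.en import English

    nlp = English()
    tag_map = srsly.read_json("tag_map.json")  # e.g. {"NN": {"pos": "NOUN"}, ...}
    # Replaces the default tag map instead of merging into it.
    nlp.vocab.morphology.load_tag_map(tag_map)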
spacy/cli/train.py
@@ -124,8 +124,8 @@ def train(
     )
     nlp.begin_training(lambda: train_examples)
 
-    # Update tag map with provided mapping
-    nlp.vocab.morphology.tag_map.update(tag_map)
+    # Replace tag map with provided mapping
+    nlp.vocab.morphology.load_tag_map(tag_map)
 
     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
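For reference, a tag map maps each fine-grained tag to a coarse-grained "pos" value plus optional morphological features. A small hedged example of the structure srsly.read_json would return here:

    tag_map = {
        "NN": {"pos": "NOUN"},
        "NNS": {"pos": "NOUN", "Number": "plur"},
        "VBZ": {"pos": "VERB", "VerbForm": "fin", "Tense": "pres"},
    }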
spacy/morphology.pyx
@@ -64,6 +64,20 @@ cdef class Morphology:
         self.mem = Pool()
         self.strings = strings
         self.tags = PreshMap()
+        self.load_tag_map(tag_map)
+        self.lemmatizer = lemmatizer
+
+        self._cache = PreshMapArray(self.n_tags)
+        self.exc = {}
+        if exc is not None:
+            for (tag, orth), attrs in exc.items():
+                attrs = _normalize_props(attrs)
+                self.add_special_case(
+                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
+
+    def load_tag_map(self, tag_map):
+        self.tag_map = {}
+        self.reverse_index = {}
         # Add special space symbol. We prefix with underscore, to make sure it
         # always sorts to the end.
         if '_SP' in tag_map:
@@ -74,27 +88,14 @@ cdef class Morphology:
             self.strings.add('_SP')
             tag_map = dict(tag_map)
             tag_map['_SP'] = space_attrs
-        self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_map = {}
-        self.lemmatizer = lemmatizer
-        self.n_tags = len(tag_map)
-        self.reverse_index = {}
-        self._load_from_tag_map(tag_map)
-
-        self._cache = PreshMapArray(self.n_tags)
-        self.exc = {}
-        if exc is not None:
-            for (tag, orth), attrs in exc.items():
-                attrs = _normalize_props(attrs)
-                self.add_special_case(
-                    self.strings.as_string(tag), self.strings.as_string(orth), attrs)
-
-    def _load_from_tag_map(self, tag_map):
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             attrs = _normalize_props(attrs)
             self.add(attrs)
             self.tag_map[tag_str] = dict(attrs)
             self.reverse_index[self.strings.add(tag_str)] = i
+        self.tag_names = tuple(sorted(self.tag_map.keys()))
+        self.n_tags = len(self.tag_map)
+        self._cache = PreshMapArray(self.n_tags)
 
     def __reduce__(self):
         return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
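A hedged usage sketch of the resulting behavior: after load_tag_map, the inventory contains only the provided tags plus the special _SP space tag (the underscore prefix makes it sort last), and n_tags and the cache are sized to match:

    from spacy.lang.en import English

    nlp = English()
    custom = {"NOUN": {"pos": "NOUN"}, "VERB": {"pos": "VERB"}}
    nlp.vocab.morphology.load_tag_map(custom)
    # The default English tags are gone; only the custom tags and _SP remain.
    print(nlp.vocab.morphology.tag_names)  # expected: ('NOUN', 'VERB', '_SP')
    print(nlp.vocab.morphology.n_tags)     # expected: 3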
|
@ -27,8 +27,7 @@ def test_overfitting_IO():
|
|||
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
|
||||
nlp = English()
|
||||
tagger = nlp.create_pipe("tagger")
|
||||
for tag, values in TAG_MAP.items():
|
||||
tagger.add_label(tag, values)
|
||||
nlp.vocab.morphology.load_tag_map(TAG_MAP)
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
|
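The test now exercises the same load_tag_map code path as the CLI scripts instead of registering each label on the tagger by hand. A hedged sketch of the pattern (this TAG_MAP is an illustrative stand-in for the module-level fixture in the test file):

    from spacy.lang.en import English

    TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}

    nlp = English()
    tagger = nlp.create_pipe("tagger")
    # Labels come from the tag map on the vocab, not per-label add_label calls.
    nlp.vocab.morphology.load_tag_map(TAG_MAP)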