mirror of https://github.com/explosion/spaCy.git
Revert "Changes to morphology.pyx for new StringStore scheme"
This reverts commit 95f8cfd745
.
This commit is contained in:
parent
1d70db58aa
commit
1b6b129c04
|
@ -33,7 +33,7 @@ cdef class Morphology:
|
||||||
cdef RichTagC* rich_tags
|
cdef RichTagC* rich_tags
|
||||||
cdef PreshMapArray _cache
|
cdef PreshMapArray _cache
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag, Pool mem=*) except -1
|
cdef int assign_tag(self, TokenC* token, tag) except -1
|
||||||
|
|
||||||
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
|
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,3 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from os import path
|
from os import path
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
|
|
||||||
|
@ -27,7 +25,7 @@ cdef class Morphology:
|
||||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
||||||
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
|
||||||
self.rich_tags[i].id = i
|
self.rich_tags[i].id = i
|
||||||
self.rich_tags[i].name = self.strings.intern(tag_str)
|
self.rich_tags[i].name = self.strings[tag_str]
|
||||||
self.rich_tags[i].morph = 0
|
self.rich_tags[i].morph = 0
|
||||||
self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
|
self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
|
||||||
self.reverse_index[self.rich_tags[i].name] = i
|
self.reverse_index[self.rich_tags[i].name] = i
|
||||||
|
@ -36,14 +34,14 @@ cdef class Morphology:
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
|
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag, Pool mem=None) except -1:
|
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||||
cdef int tag_id
|
cdef int tag_id
|
||||||
if isinstance(tag, basestring):
|
if isinstance(tag, basestring):
|
||||||
tag_id = self.reverse_index[self.strings.intern(tag)]
|
tag_id = self.reverse_index[self.strings[tag]]
|
||||||
tag_str = tag
|
tag_str = tag
|
||||||
else:
|
else:
|
||||||
tag_id = tag
|
tag_id = tag
|
||||||
tag_str = self.strings.decode_int(tag)
|
tag_str = self.strings[tag]
|
||||||
if tag_id >= self.n_tags:
|
if tag_id >= self.n_tags:
|
||||||
raise ValueError("Unknown tag: %s" % tag)
|
raise ValueError("Unknown tag: %s" % tag)
|
||||||
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
||||||
|
@ -52,13 +50,13 @@ cdef class Morphology:
|
||||||
# the statistical model fails.
|
# the statistical model fails.
|
||||||
# Related to Issue #220
|
# Related to Issue #220
|
||||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||||
tag_id = self.reverse_index[self.strings.intern('SP')]
|
tag_id = self.reverse_index[self.strings['SP']]
|
||||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||||
if analysis is NULL:
|
if analysis is NULL:
|
||||||
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||||
analysis.tag = self.rich_tags[tag_id]
|
analysis.tag = self.rich_tags[tag_id]
|
||||||
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
|
||||||
mem=mem, **self.tag_map.get(tag_str, {}))
|
**self.tag_map.get(tag_str, {}))
|
||||||
self._cache.set(tag_id, token.lex.orth, analysis)
|
self._cache.set(tag_id, token.lex.orth, analysis)
|
||||||
token.lemma = analysis.lemma
|
token.lemma = analysis.lemma
|
||||||
token.pos = analysis.tag.pos
|
token.pos = analysis.tag.pos
|
||||||
|
@ -81,16 +79,16 @@ cdef class Morphology:
|
||||||
cdef int pos
|
cdef int pos
|
||||||
cdef RichTagC rich_tag
|
cdef RichTagC rich_tag
|
||||||
for tag_str, entries in exc.items():
|
for tag_str, entries in exc.items():
|
||||||
tag = self.strings.intern(tag_str)
|
tag = self.strings[tag_str]
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
rich_tag = self.rich_tags[tag_id]
|
rich_tag = self.rich_tags[tag_id]
|
||||||
for form_str, props in entries.items():
|
for form_str, props in entries.items():
|
||||||
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||||
cached.tag = rich_tag
|
cached.tag = rich_tag
|
||||||
orth = self.strings.intern(form_str)
|
orth = self.strings[form_str]
|
||||||
for name_str, value_str in props.items():
|
for name_str, value_str in props.items():
|
||||||
if name_str == 'L':
|
if name_str == 'L':
|
||||||
cached.lemma = self.strings.intern(value_str)
|
cached.lemma = self.strings[value_str]
|
||||||
else:
|
else:
|
||||||
self.assign_feature(&cached.tag.morph, name_str, value_str)
|
self.assign_feature(&cached.tag.morph, name_str, value_str)
|
||||||
if cached.lemma == 0:
|
if cached.lemma == 0:
|
||||||
|
@ -98,18 +96,17 @@ cdef class Morphology:
|
||||||
self.tag_map.get(tag_str, {}))
|
self.tag_map.get(tag_str, {}))
|
||||||
self._cache.set(tag_id, orth, <void*>cached)
|
self._cache.set(tag_id, orth, <void*>cached)
|
||||||
|
|
||||||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, Pool mem=None,
|
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, **morphology):
|
||||||
**morphology):
|
cdef unicode py_string = self.strings[orth]
|
||||||
cdef unicode py_string = self.strings.decode_int(orth, mem=mem)
|
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
return self.strings.intern(py_string.lower(), mem=mem)
|
return self.strings[py_string.lower()]
|
||||||
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
||||||
return self.strings.intern(py_string.lower(), mem=mem)
|
return self.strings[py_string.lower()]
|
||||||
cdef set lemma_strings
|
cdef set lemma_strings
|
||||||
cdef unicode lemma_string
|
cdef unicode lemma_string
|
||||||
lemma_strings = self.lemmatizer(py_string, univ_pos, **morphology)
|
lemma_strings = self.lemmatizer(py_string, univ_pos, **morphology)
|
||||||
lemma_string = sorted(lemma_strings)[0]
|
lemma_string = sorted(lemma_strings)[0]
|
||||||
lemma = self.strings.intern(lemma_string, mem=mem)
|
lemma = self.strings[lemma_string]
|
||||||
return lemma
|
return lemma
|
||||||
|
|
||||||
IDS = {
|
IDS = {
|
||||||
|
|
Loading…
Reference in New Issue