mirror of https://github.com/explosion/spaCy.git
* Add hacky solution to Issue #220. Currently specials.json only supports literal patterns, which doesn't allow us to pre-tag whitespace with the correct tag, SP, as a rule. The data-driven approach should be easy, but for some reason it fails here. Hard-coding the rule in Morphology isn't a good solution, but we do want to fix the behaviour right away and don't want to wait for an architecturally better solution.
This commit is contained in:
parent
445164d5b4
commit
590f38bdb2
|
@ -40,6 +40,13 @@ cdef class Morphology:
|
||||||
tag_id = tag
|
tag_id = tag
|
||||||
if tag_id >= self.n_tags:
|
if tag_id >= self.n_tags:
|
||||||
raise ValueError("Unknown tag: %s" % tag)
|
raise ValueError("Unknown tag: %s" % tag)
|
||||||
|
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
||||||
|
# is that this is where the specific word and the tag interact. Still,
|
||||||
|
# we should have a better way to enforce this rule, or figure out why
|
||||||
|
# the statistical model fails.
|
||||||
|
# Related to Issue #220
|
||||||
|
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||||
|
tag_id = self.reverse_index[self.strings['SP']]
|
||||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||||
if analysis is NULL:
|
if analysis is NULL:
|
||||||
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||||
|
|
Loading…
Reference in New Issue