From 590f38bdb2fc8f4ae3898ec51fbee98132cf57e6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 19 Jan 2016 03:35:20 +0100
Subject: [PATCH] * Add hacky solution to Issue #220. Currently specials.json only supports literal patterns, which doesn't allow us to pre-tag whitespace with the correct token, SP, as a rule. The data-driven approach should be easy but for some reason fails here. Adding a hard code in Morphology isn't a good solution, but we do want to fix the behaviour right away, and don't want to wait for an architecturally better solution.

---
 spacy/morphology.pyx | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index c69488b6c..5730190de 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -40,6 +40,13 @@ cdef class Morphology:
             tag_id = tag
         if tag_id >= self.n_tags:
             raise ValueError("Unknown tag: %s" % tag)
+        # TODO: It's pretty arbitrary to put this logic here. I guess the justification
+        # is that this is where the specific word and the tag interact. Still,
+        # we should have a better way to enforce this rule, or figure out why
+        # the statistical model fails.
+        # Related to Issue #220
+        if Lexeme.c_check_flag(token.lex, IS_SPACE):
+            tag_id = self.reverse_index[self.strings['SP']]
         analysis = self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL:
             analysis = self.mem.alloc(1, sizeof(MorphAnalysisC))