From fe3c42a06b89dd819c2329af6a3bbc46b51a869f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 15 Nov 2017 13:55:46 +0100 Subject: [PATCH] Fix caching in tokenizer --- spacy/tokenizer.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 67ff47743..095fbf4ad 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -1,4 +1,5 @@ # cython: embedsignature=True +# cython: profile=True # coding: utf8 from __future__ import unicode_literals @@ -268,7 +269,7 @@ cdef class Tokenizer: int has_special, int n) except -1: cdef int i for i in range(n): - if tokens[i].lex.id == 0: + if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL: return 0 # See https://github.com/explosion/spaCy/issues/1250 if has_special: