From f9e765cae72ae7f01d915a49e74faa3418a7d598 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 3 Feb 2016 02:32:37 +0100
Subject: [PATCH] * Add pipe() method to tokenizer

---
 spacy/tokenizer.pyx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index b6dd9f7f5..ad3a500a3 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -133,6 +133,10 @@ cdef class Tokenizer:
             tokens.c[tokens.length - 1].spacy = string[-1] == ' ' and not in_ws
         return tokens
 
+    def pipe(self, texts, batch_size=1000, n_threads=2):
+        for text in texts:
+            yield self(text)
+
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL: