From dfdf19f6a929f451a3887d68f95127bd2940a6ac Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 17 Jul 2015 16:39:54 +0200
Subject: [PATCH] * Draft a from_orth method for Doc

---
 spacy/tokens/doc.pyx | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 392c78a45..db3df7156 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -96,6 +96,48 @@ cdef class Doc:
         self.is_parsed = False
         self._py_tokens = []
 
+    @classmethod
+    def from_orth(cls, Vocab vocab, attr_t[:] orths, attr_t[:] spaces):
+        """Build a new Doc from parallel arrays of orth IDs and space flags.
+
+        Each ID in `orths` is resolved to its string through `vocab.strings`;
+        that string is used to fetch a lexeme via `vocab.get`, and the lexeme
+        is passed to `push_back` together with the entry of `spaces` at the
+        same index.
+
+        Args:
+            vocab: Vocab used to construct the Doc and to resolve orth IDs.
+            orths: One orth ID per token, processed in array order.
+            spaces: One value per token, forwarded to `push_back`
+                (presumably whether the token is followed by whitespace;
+                TODO confirm against `push_back`).
+
+        Returns:
+            The newly constructed Doc.
+
+        Raises:
+            ValueError: If `orths` and `spaces` have different lengths.
+        """
+        cdef int i
+        cdef int n_tokens = orths.shape[0]
+        cdef const LexemeC* lex
+        cdef Doc doc = cls(vocab)
+        cdef unicode string
+        cdef UniStr orth_c
+        # Validate up front: a length mismatch is reported before any token
+        # is added, instead of surfacing as an out-of-range index into
+        # `spaces` inside the loop or silently ignoring surplus entries.
+        if spaces.shape[0] != n_tokens:
+            raise ValueError(
+                "orths and spaces must have the same length (%d vs %d)"
+                % (n_tokens, spaces.shape[0]))
+        for i in range(n_tokens):
+            string = vocab.strings[orths[i]]
+            slice_unicode(&orth_c, string, 0, len(string))
+            lex = doc.vocab.get(doc.mem, &orth_c)
+            doc.push_back(lex, spaces[i])
+        return doc
+
     def __getitem__(self, object i):
         """Get a token.