Add Doc.extend_tensor() method

2017-11-03 11:20:31 +01:00 · 2017-11-03 11:20:31 +01:00 · 62ed58935a
parent d6fc39c8a6
commit 62ed58935a
1 changed files with 20 additions and 2 deletions
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -10,6 +10,7 @@ import numpy.linalg
 import struct
 import dill
 import msgpack
 from thinc.neural.util import get_array_module, copy_array
 from libc.string cimport memcpy, memset
 from libc.math cimport sqrt
@ -308,7 +309,7 @@ cdef class Doc:
                return self.user_hooks['has_vector'](self)
            elif any(token.has_vector for token in self):
                return True
-            elif self.tensor is not None:
+            elif self.tensor.size:
                return True
            else:
                return False
@ -335,7 +336,7 @@ cdef class Doc:
                    vector += self.vocab.get_vector(token.lex.orth)
                self._vector = vector / len(self)
                return self._vector
-            elif self.tensor is not None:
+            elif self.tensor.size:
                self._vector = self.tensor.mean(axis=0)
                return self._vector
            else:
@ -827,6 +828,23 @@ cdef class Doc:
                        attrs[:, 2:])
        return self
    def extend_tensor(self, tensor):
        """Concatenate a new tensor onto the doc.tensor object.

        The doc.tensor attribute holds dense feature vectors
        computed by the models in the pipeline. Let's say a
        document with 30 words has a tensor with 128 dimensions
        per word. doc.tensor.shape will be (30, 128). After
        calling doc.extend_tensor with an array of shape (30, 64),
        doc.tensor.shape == (30, 192).

        tensor: The array to append column-wise to doc.tensor.
            NOTE(review): presumably it needs one row per token so that
            the horizontal stack lines up -- confirm against callers.
        RETURNS: None. self.tensor is updated (in place when it was
            empty, otherwise rebound to a new stacked array).
        """
        xp = get_array_module(self.tensor)
        if self.tensor.size == 0:
            # The doc has no features yet: grow the existing empty array to
            # the incoming shape and copy the values into it. refcheck=False
            # is required because ndarray.resize otherwise raises ValueError
            # as soon as anything else holds a reference to the array (e.g.
            # a caller that previously read `doc.tensor` into a variable).
            self.tensor.resize(tensor.shape, refcheck=False)
            copy_array(self.tensor, tensor)
        else:
            # Features already present: append the new columns on the right.
            self.tensor = xp.hstack((self.tensor, tensor))
    def merge(self, int start_idx, int end_idx, *args, **attributes):
        """Retokenize the document, such that the span at
        `doc.text[start_idx : end_idx]` is merged into a single token. If