mirror of https://github.com/explosion/spaCy.git
Add Doc.extend_tensor() method
This commit is contained in:
parent
d6fc39c8a6
commit
62ed58935a
|
@ -10,6 +10,7 @@ import numpy.linalg
|
||||||
import struct
|
import struct
|
||||||
import dill
|
import dill
|
||||||
import msgpack
|
import msgpack
|
||||||
|
from thinc.neural.util import get_array_module, copy_array
|
||||||
|
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
|
@ -308,7 +309,7 @@ cdef class Doc:
|
||||||
return self.user_hooks['has_vector'](self)
|
return self.user_hooks['has_vector'](self)
|
||||||
elif any(token.has_vector for token in self):
|
elif any(token.has_vector for token in self):
|
||||||
return True
|
return True
|
||||||
elif self.tensor is not None:
|
elif self.tensor.size:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
@ -335,7 +336,7 @@ cdef class Doc:
|
||||||
vector += self.vocab.get_vector(token.lex.orth)
|
vector += self.vocab.get_vector(token.lex.orth)
|
||||||
self._vector = vector / len(self)
|
self._vector = vector / len(self)
|
||||||
return self._vector
|
return self._vector
|
||||||
elif self.tensor is not None:
|
elif self.tensor.size:
|
||||||
self._vector = self.tensor.mean(axis=0)
|
self._vector = self.tensor.mean(axis=0)
|
||||||
return self._vector
|
return self._vector
|
||||||
else:
|
else:
|
||||||
|
@ -827,6 +828,23 @@ cdef class Doc:
|
||||||
attrs[:, 2:])
|
attrs[:, 2:])
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def extend_tensor(self, tensor):
    """Concatenate a new tensor onto the doc.tensor object.

    The doc.tensor attribute holds dense feature vectors
    computed by the models in the pipeline. Let's say a
    document with 30 words has a tensor with 128 dimensions
    per word. doc.tensor.shape will be (30, 128). After
    calling doc.extend_tensor with an array of shape (30, 64),
    doc.tensor.shape == (30, 192).

    tensor: Array of additional features to append column-wise
        (via hstack) to the existing doc.tensor.
    """
    xp = get_array_module(self.tensor)
    if self.tensor.size == 0:
        # Nothing stored yet: grow the existing array in place and copy
        # the new values in. refcheck=False is required because numpy's
        # in-place resize otherwise raises ValueError whenever another
        # reference to the array exists.
        self.tensor.resize(tensor.shape, refcheck=False)
        copy_array(self.tensor, tensor)
    else:
        # Stack horizontally using the array module that matches the
        # existing tensor.
        self.tensor = xp.hstack((self.tensor, tensor))
|
||||||
|
|
||||||
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
||||||
"""Retokenize the document, such that the span at
|
"""Retokenize the document, such that the span at
|
||||||
`doc.text[start_idx : end_idx]` is merged into a single token. If
|
`doc.text[start_idx : end_idx]` is merged into a single token. If
|
||||||
|
|
Loading…
Reference in New Issue