From c17980e535a8009b14ee4d1f818db207d9c07e55 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Mar 2022 14:24:46 +0100 Subject: [PATCH] Save vectors as little endian, load with Ops.asarray (#10201) * Save vectors as little endian, load with Ops.asarray * Always save vector data as little endian * Always run `Vectors.to_ops` when vector data is loaded so that `Ops.asarray` can be used to load the data correctly for the current ops. * Update spacy/vectors.pyx Co-authored-by: Sofie Van Landeghem * Update spacy/vectors.pyx Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/vectors.pyx | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 2b1ea764b..bcba9d03f 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -565,8 +565,9 @@ cdef class Vectors: # the source of numpy.save indicates that the file object is closed after use. # but it seems that somehow this does not happen, as ResourceWarnings are raised here. # in order to not rely on this, wrap in context manager. + ops = get_current_ops() with path.open("wb") as _file: - save_array(self.data, _file) + save_array(ops.to_numpy(self.data, byte_order="<"), _file) serializers = { "strings": lambda p: self.strings.to_disk(p.with_suffix(".json")), @@ -602,6 +603,7 @@ cdef class Vectors: ops = get_current_ops() if path.exists(): self.data = ops.xp.load(str(path)) + self.to_ops(ops) def load_settings(path): if path.exists(): @@ -631,7 +633,8 @@ cdef class Vectors: if hasattr(self.data, "to_bytes"): return self.data.to_bytes() else: - return srsly.msgpack_dumps(self.data) + ops = get_current_ops() + return srsly.msgpack_dumps(ops.to_numpy(self.data, byte_order="<")) serializers = { "strings": lambda: self.strings.to_bytes(), @@ -656,6 +659,8 @@ cdef class Vectors: else: xp = get_array_module(self.data) self.data = xp.asarray(srsly.msgpack_loads(b)) + ops = get_current_ops() + self.to_ops(ops) deserializers = { "strings": lambda b: self.strings.from_bytes(b),