From 3fd274264966b394953466ec76cb86104ee43124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 16 Feb 2017 12:08:07 +0100 Subject: [PATCH 1/4] load_vectors should accept arbitrary space characters as word tokens Fix bug #834 --- spacy/vocab.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index cd2b18f81..bff3b5595 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -12,6 +12,7 @@ import io import math import ujson as json import tempfile +import re from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme @@ -477,9 +478,12 @@ cdef class Vocab: cdef attr_t orth cdef int32_t vec_len = -1 cdef double norm = 0.0 + + whitespace_pattern = re.compile(r'\s') + for line_num, line in enumerate(file_): pieces = line.split() - word_str = " " if line.startswith(" ") else pieces.pop(0) + word_str = " " if whitespace_pattern.match(line) else pieces.pop(0) if vec_len == -1: vec_len = len(pieces) elif vec_len != len(pieces): From e17dc2db75e3505de32bcaf6dd99ce215d161e2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 16 Feb 2017 12:10:24 +0100 Subject: [PATCH 2/4] Remove useless import --- spacy/vocab.pyx | 7 ------- 1 file changed, 7 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index bff3b5595..ab023c3b4 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,23 +1,17 @@ from __future__ import unicode_literals -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from libc.string cimport memset from libc.stdint cimport int32_t -from libc.stdint cimport uint64_t from libc.math cimport sqrt from pathlib import Path import bz2 -import io -import math import ujson as json -import tempfile import re from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme from .strings cimport hash_string -from .orth cimport word_shape from .typedefs cimport attr_t from .cfile cimport CFile from .lemmatizer import Lemmatizer @@ -30,7 +24,6 @@ from . import symbols from cymem.cymem cimport Address from .serialize.packer cimport Packer from .attrs cimport PROB, LANG -from . import deprecated from . import util From 3ba109622c24bd52f32e605c523249e1c26b0207 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 16 Feb 2017 12:23:27 +0100 Subject: [PATCH 3/4] Add regression test with non ' ' space character as token --- spacy/tests/regression/test_issue834.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 spacy/tests/regression/test_issue834.py diff --git a/spacy/tests/regression/test_issue834.py b/spacy/tests/regression/test_issue834.py new file mode 100644 index 000000000..7ed186dfe --- /dev/null +++ b/spacy/tests/regression/test_issue834.py @@ -0,0 +1,14 @@ +# coding: utf-8 + +from io import StringIO + +word2vec_str = """, -0.046107 -0.035951 -0.560418 +de -0.648927 -0.400976 -0.527124 +. 0.113685 0.439990 -0.634510 +  -1.499184 -0.184280 -0.598371""" + + +def test_issue834(en_vocab): + f = StringIO(word2vec_str) + vector_length = en_vocab.load_vectors(f) + assert vector_length == 3 From 06a71d22df5b6f1196cbdff737ab071ba92fad0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 16 Feb 2017 14:48:00 +0100 Subject: [PATCH 4/4] Fix test failure by using unicode literals --- spacy/tests/regression/test_issue834.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue834.py b/spacy/tests/regression/test_issue834.py index 7ed186dfe..00b227f28 100644 --- a/spacy/tests/regression/test_issue834.py +++ b/spacy/tests/regression/test_issue834.py @@ -1,5 +1,6 @@ # coding: utf-8 +from __future__ import unicode_literals from io import StringIO word2vec_str = """, -0.046107 -0.035951 -0.560418