From 3fd274264966b394953466ec76cb86104ee43124 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= <raphael@likeabird.io>
Date: Thu, 16 Feb 2017 12:08:07 +0100
Subject: [PATCH 1/4] load_vectors should accept arbitrary space characters as
 word tokens

Fix bug #834
---
 spacy/vocab.pyx | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index cd2b18f81..bff3b5595 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -12,6 +12,7 @@ import io
 import math
 import ujson as json
 import tempfile
+import re
 
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
@@ -477,9 +478,12 @@ cdef class Vocab:
         cdef attr_t orth
         cdef int32_t vec_len = -1
         cdef double norm = 0.0
+        # re.UNICODE: on Python 2, \s is ASCII-only by default (misses U+00A0).
+        whitespace_pattern = re.compile(r'\s', re.UNICODE)
+
         for line_num, line in enumerate(file_):
             pieces = line.split()
-            word_str = " " if line.startswith(" ") else pieces.pop(0)
+            word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
             if vec_len == -1:
                 vec_len = len(pieces)
             elif vec_len != len(pieces):

From e17dc2db75e3505de32bcaf6dd99ce215d161e2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= <raphael@likeabird.io>
Date: Thu, 16 Feb 2017 12:10:24 +0100
Subject: [PATCH 2/4] Remove unused imports

---
 spacy/vocab.pyx | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index bff3b5595..ab023c3b4 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,23 +1,17 @@
 from __future__ import unicode_literals
 
-from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from libc.string cimport memset
 from libc.stdint cimport int32_t
-from libc.stdint cimport uint64_t
 from libc.math cimport sqrt
 
 from pathlib import Path
 import bz2
-import io
-import math
 import ujson as json
-import tempfile
 import re
 
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
-from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
@@ -30,7 +24,6 @@ from . import symbols
 from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
-from . import deprecated
 from . import util
 
 

From 3ba109622c24bd52f32e605c523249e1c26b0207 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= <raphael@likeabird.io>
Date: Thu, 16 Feb 2017 12:23:27 +0100
Subject: [PATCH 3/4] Add regression test with non ' ' space character as token

---
 spacy/tests/regression/test_issue834.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue834.py

diff --git a/spacy/tests/regression/test_issue834.py b/spacy/tests/regression/test_issue834.py
new file mode 100644
index 000000000..7ed186dfe
--- /dev/null
+++ b/spacy/tests/regression/test_issue834.py
@@ -0,0 +1,14 @@
+# coding: utf-8
+
+from io import StringIO
+
+word2vec_str = """, -0.046107 -0.035951 -0.560418
+de -0.648927 -0.400976 -0.527124
+. 0.113685 0.439990 -0.634510
+  -1.499184 -0.184280 -0.598371"""
+
+
+def test_issue834(en_vocab):
+    f = StringIO(word2vec_str)
+    vector_length = en_vocab.load_vectors(f)
+    assert vector_length == 3

From 06a71d22df5b6f1196cbdff737ab071ba92fad0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= <raphael@likeabird.io>
Date: Thu, 16 Feb 2017 14:48:00 +0100
Subject: [PATCH 4/4] Fix test failure by using unicode literals

---
 spacy/tests/regression/test_issue834.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/tests/regression/test_issue834.py b/spacy/tests/regression/test_issue834.py
index 7ed186dfe..00b227f28 100644
--- a/spacy/tests/regression/test_issue834.py
+++ b/spacy/tests/regression/test_issue834.py
@@ -1,5 +1,6 @@
 # coding: utf-8
 
+from __future__ import unicode_literals
 from io import StringIO
 
 word2vec_str = """, -0.046107 -0.035951 -0.560418