load_vectors should accept arbitrary space characters as word tokens

Fix bug #834
This commit is contained in:
Raphaël Bournhonesque 2017-02-16 12:08:07 +01:00
parent 813989940e
commit 3fd2742649
1 changed file with 5 additions and 1 deletion

View File

@@ -12,6 +12,7 @@ import io
 import math
 import ujson as json
 import tempfile
+import re
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
@@ -477,9 +478,12 @@ cdef class Vocab:
         cdef attr_t orth
         cdef int32_t vec_len = -1
         cdef double norm = 0.0
+        whitespace_pattern = re.compile(r'\s')
         for line_num, line in enumerate(file_):
             pieces = line.split()
-            word_str = " " if line.startswith(" ") else pieces.pop(0)
+            word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
             if vec_len == -1:
                 vec_len = len(pieces)
             elif vec_len != len(pieces):