mirror of https://github.com/explosion/spaCy.git
* Fix bug in decoding non-ascii characters
This commit is contained in:
parent
6deb1e84b6
commit
1601e488ee
|
@@ -152,7 +152,7 @@ cdef class Packer:
|
||||||
cdef int32_t length = len(utf8_str)
|
cdef int32_t length = len(utf8_str)
|
||||||
# Signal chars with negative length
|
# Signal chars with negative length
|
||||||
bits.extend(-length, 32)
|
bits.extend(-length, 32)
|
||||||
self.char_codec.encode(bytearray(utf8_str), bits)
|
self.char_codec.encode(utf8_str, bits)
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
for i in range(doc.length):
|
for i in range(doc.length):
|
||||||
for j in range(doc.data[i].lex.length-1):
|
for j in range(doc.data[i].lex.length-1):
|
||||||
|
@@ -175,24 +175,24 @@ cdef class Packer:
|
||||||
doc.push_back(lex, space)
|
doc.push_back(lex, space)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def _char_decode(self, BitArray bits, int32_t n, Doc doc):
|
def _char_decode(self, BitArray bits, int32_t n_bytes, Doc doc):
|
||||||
cdef bytearray utf8_str = bytearray(n)
|
cdef bytearray utf8_str = bytearray(n_bytes)
|
||||||
self.char_codec.decode(bits, utf8_str)
|
self.char_codec.decode(bits, utf8_str)
|
||||||
|
|
||||||
cdef unicode string = utf8_str.decode('utf8')
|
cdef unicode string = utf8_str.decode('utf8')
|
||||||
cdef int start = 0
|
cdef int start = 0
|
||||||
cdef bint is_spacy
|
cdef bint is_spacy
|
||||||
cdef int length = len(string)
|
cdef int n_unicode_chars = len(string)
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
cdef bint is_end_token
|
cdef bint is_end_token
|
||||||
for is_end_token in bits:
|
for is_end_token in bits:
|
||||||
if is_end_token:
|
if is_end_token:
|
||||||
span = string[start:i+1]
|
span = string[start:i+1]
|
||||||
lex = self.vocab.get(doc.mem, span)
|
lex = self.vocab.get(doc.mem, span)
|
||||||
is_spacy = (i+1) < length and string[i+1] == u' '
|
is_spacy = (i+1) < n_unicode_chars and string[i+1] == u' '
|
||||||
doc.push_back(lex, is_spacy)
|
doc.push_back(lex, is_spacy)
|
||||||
start = i + 1 + is_spacy
|
start = i + 1 + is_spacy
|
||||||
i += 1
|
i += 1
|
||||||
if i >= n:
|
if i >= n_unicode_chars:
|
||||||
break
|
break
|
||||||
return doc
|
return doc
|
||||||
|
|
Loading…
Reference in New Issue