mirror of https://github.com/explosion/spaCy.git
* Have tokenizer emit tokens for whitespace other than single spaces
This commit is contained in:
parent 43743a5d63
commit 99f5e59286
@@ -67,12 +67,13 @@ cdef class Language:
         cdef Tokens tokens = Tokens(length)
         if length == 0:
             return tokens
-        cdef int start = 0
         cdef int i = 0
+        cdef int start = 0
         cdef Py_UNICODE* chars = string
+        cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef String span
-        for i in range(length):
-            if Py_UNICODE_ISSPACE(chars[i]) == 1:
+        for i in range(1, length):
+            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     string_slice(&span, chars, start, i)
                     lexemes = <LexemeC**>self.cache.get(span.key)
@@ -80,7 +81,10 @@ cdef class Language:
                         tokens.extend(start, lexemes, 0)
                     else:
                         self._tokenize(tokens, &span, start, i)
-                start = i + 1
+                in_ws = not in_ws
+                start = i
+                if chars[i] == ' ':
+                    start += 1
         i += 1
         if start < i:
             string_slice(&span, chars, start, i)
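The change replaces the old "emit a token at every space" loop with an in_ws flag that flips whenever the character class changes, so every maximal run of whitespace or of non-whitespace characters becomes a candidate span; the `if chars[i] == ' ': start += 1` line swallows one leading space of a whitespace run, which is why a lone single space still produces no whitespace token. A rough pure-Python sketch of that splitting logic (the names split_on_whitespace_runs and emit are illustrative only, and str.isspace() stands in for Py_UNICODE_ISSPACE; the real loop hands each span to the cache lookup / _tokenize path shown above):

def split_on_whitespace_runs(string, emit):
    """Call emit(start, end) for every maximal run of whitespace or
    non-whitespace characters, dropping one leading space of each
    whitespace run so a lone ' ' never surfaces as a span."""
    if len(string) == 0:
        return
    start = 0
    i = 0
    in_ws = string[0].isspace()
    for i in range(1, len(string)):
        if string[i].isspace() != in_ws:
            if start < i:
                emit(start, i)        # finished run
            in_ws = not in_ws
            start = i
            if string[i] == ' ':      # swallow a single leading space
                start += 1
    i += 1
    if start < i:                     # trailing run
        emit(start, i)

# For example, 'hello \n possums' splits into three runs, mirroring the
# three-token assertions in the tests added below:
spans = []
text = 'hello \n possums'
split_on_whitespace_runs(text, lambda s, e: spans.append(text[s:e]))
assert spans == ['hello', '\n ', 'possums']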
@@ -0,0 +1,38 @@
+"""Test that tokens are created correctly for whitespace."""
+from __future__ import unicode_literals
+
+from spacy.en import EN
+import pytest
+
+
+def test_single_space():
+    tokens = EN.tokenize('hello possums')
+    assert len(tokens) == 2
+
+
+def test_double_space():
+    tokens = EN.tokenize('hello  possums')
+    assert len(tokens) == 3
+    assert tokens[1].string == ' '
+
+
+def test_newline():
+    tokens = EN.tokenize('hello\npossums')
+    assert len(tokens) == 3
+
+
+def test_newline_space():
+    tokens = EN.tokenize('hello \npossums')
+    assert len(tokens) == 3
+
+
+def test_newline_double_space():
+    tokens = EN.tokenize('hello  \npossums')
+    assert len(tokens) == 3
+
+
+def test_newline_space_wrap():
+    tokens = EN.tokenize('hello \n possums')
+    assert len(tokens) == 3
+
+
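Taken together, the tests pin down the new behaviour: a lone single space still just separates two word tokens, while any other whitespace run surfaces as a token of its own. A minimal interactive restatement of the same assertions (assuming the package and its English data are installed so spacy.en.EN loads):

from spacy.en import EN

# A lone single space only separates two word tokens ...
assert len(EN.tokenize('hello possums')) == 2
# ... while other whitespace runs are emitted as tokens of their own.
assert len(EN.tokenize('hello\npossums')) == 3
assert EN.tokenize('hello  possums')[1].string == ' '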