* Have tokenizer emit tokens for whitespace other than single spaces

Matthew Honnibal 2014-10-14 20:25:57 +11:00
parent 43743a5d63
commit 99f5e59286
2 changed files with 46 additions and 4 deletions


@@ -67,12 +67,13 @@ cdef class Language:
         cdef Tokens tokens = Tokens(length)
         if length == 0:
             return tokens
-        cdef int start = 0
         cdef int i = 0
+        cdef int start = 0
         cdef Py_UNICODE* chars = string
+        cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef String span
-        for i in range(length):
-            if Py_UNICODE_ISSPACE(chars[i]) == 1:
+        for i in range(1, length):
+            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     string_slice(&span, chars, start, i)
                     lexemes = <LexemeC**>self.cache.get(span.key)
@@ -80,7 +81,10 @@ cdef class Language:
                         tokens.extend(start, lexemes, 0)
                     else:
                         self._tokenize(tokens, &span, start, i)
-                start = i + 1
+                in_ws = not in_ws
+                start = i
+                if chars[i] == ' ':
+                    start += 1
         i += 1
         if start < i:
             string_slice(&span, chars, start, i)
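In plain terms, the loop now tracks whether it is inside a whitespace run and cuts a span at every transition between whitespace and non-whitespace, dropping one leading space from each whitespace run so that single spaces between words still produce no token. A rough pure-Python sketch of just that splitting step (illustrative only: the helper name split_whitespace_runs is made up here, and the real Cython code above additionally routes non-whitespace spans through the lexeme cache and _tokenize):

def split_whitespace_runs(string):
    # Split on every whitespace/non-whitespace transition, swallowing one
    # leading space from each whitespace run (so a lone separating space
    # never becomes a token).
    if not string:
        return []
    spans = []
    start = 0
    in_ws = string[0].isspace()
    for i in range(1, len(string)):
        if string[i].isspace() != in_ws:
            if start < i:
                spans.append(string[start:i])
            in_ws = not in_ws
            start = i
            if string[i] == ' ':
                start += 1
    if start < len(string):
        spans.append(string[start:])
    return spans

print(split_whitespace_runs('hello possums'))     # ['hello', 'possums']
print(split_whitespace_runs('hello  possums'))    # ['hello', ' ', 'possums']
print(split_whitespace_runs('hello \n possums'))  # ['hello', '\n ', 'possums']

Single spaces therefore still disappear, while a second space, a newline, or a wrapped line leaves residual whitespace that comes through as its own token, which is what the new tests below exercise.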

tests/test_whitespace.py (new file, 38 lines)

@@ -0,0 +1,38 @@
"""Test that tokens are created correctly for whitespace."""
from __future__ import unicode_literals

from spacy.en import EN
import pytest


def test_single_space():
    tokens = EN.tokenize('hello possums')
    assert len(tokens) == 2


def test_double_space():
    tokens = EN.tokenize('hello  possums')
    assert len(tokens) == 3
    assert tokens[1].string == ' '


def test_newline():
    tokens = EN.tokenize('hello\npossums')
    assert len(tokens) == 3


def test_newline_space():
    tokens = EN.tokenize('hello \npossums')
    assert len(tokens) == 3


def test_newline_double_space():
    tokens = EN.tokenize('hello  \npossums')
    assert len(tokens) == 3


def test_newline_space_wrap():
    tokens = EN.tokenize('hello \n possums')
    assert len(tokens) == 3
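A note on the expected counts (interpretation of the change above, not part of the commit): the lone separating space in 'hello possums' is swallowed, so that case stays at two tokens, while every other case leaves some whitespace behind, a second space, a newline, or a newline plus wrap space, and that residue comes through as one whitespace token, giving three. Mirroring test_newline_space_wrap:

from spacy.en import EN

tokens = EN.tokenize('hello \n possums')
assert len(tokens) == 3
# tokens[1].string should be the residual whitespace, roughly '\n ' here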