* Have tokenizer emit tokens for whitespace other than single spaces

Matthew Honnibal 2014-10-14 20:25:57 +11:00
parent 43743a5d63
commit 99f5e59286
2 changed files with 46 additions and 4 deletions


@@ -67,12 +67,13 @@ cdef class Language:
         cdef Tokens tokens = Tokens(length)
         if length == 0:
             return tokens
-        cdef int start = 0
         cdef int i = 0
+        cdef int start = 0
         cdef Py_UNICODE* chars = string
+        cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef String span
-        for i in range(length):
-            if Py_UNICODE_ISSPACE(chars[i]) == 1:
+        for i in range(1, length):
+            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     string_slice(&span, chars, start, i)
                     lexemes = <LexemeC**>self.cache.get(span.key)
@@ -80,7 +81,10 @@ cdef class Language:
                         tokens.extend(start, lexemes, 0)
                     else:
                         self._tokenize(tokens, &span, start, i)
-                start = i + 1
+                in_ws = not in_ws
+                start = i
+                if chars[i] == ' ':
+                    start += 1
         i += 1
         if start < i:
             string_slice(&span, chars, start, i)
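In plain terms, the loop now tracks whether it is inside a whitespace run and cuts a span at every transition between whitespace and non-whitespace, dropping one leading space from each whitespace run so that single spaces between words still produce no token. A rough pure-Python sketch of just that splitting step (illustrative only: the helper name split_whitespace_runs is made up here, and the real Cython code above additionally routes non-whitespace spans through the lexeme cache and _tokenize):

def split_whitespace_runs(string):
    # Split on every whitespace/non-whitespace transition, swallowing one
    # leading space from each whitespace run (so a lone separating space
    # never becomes a token).
    if not string:
        return []
    spans = []
    start = 0
    in_ws = string[0].isspace()
    for i in range(1, len(string)):
        if string[i].isspace() != in_ws:
            if start < i:
                spans.append(string[start:i])
            in_ws = not in_ws
            start = i
            if string[i] == ' ':
                start += 1
    if start < len(string):
        spans.append(string[start:])
    return spans

print(split_whitespace_runs('hello possums'))     # ['hello', 'possums']
print(split_whitespace_runs('hello  possums'))    # ['hello', ' ', 'possums']
print(split_whitespace_runs('hello \n possums'))  # ['hello', '\n ', 'possums']

Single spaces therefore still disappear, while a second space, a newline, or a wrapped line leaves residual whitespace that comes through as its own token, which is what the new tests below exercise.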

tests/test_whitespace.py (new file, 38 lines)

@@ -0,0 +1,38 @@
"""Test that tokens are created correctly for whitespace."""
from __future__ import unicode_literals

from spacy.en import EN
import pytest


def test_single_space():
    tokens = EN.tokenize('hello possums')
    assert len(tokens) == 2


def test_double_space():
    tokens = EN.tokenize('hello  possums')
    assert len(tokens) == 3
    assert tokens[1].string == ' '


def test_newline():
    tokens = EN.tokenize('hello\npossums')
    assert len(tokens) == 3


def test_newline_space():
    tokens = EN.tokenize('hello \npossums')
    assert len(tokens) == 3


def test_newline_double_space():
    tokens = EN.tokenize('hello  \npossums')
    assert len(tokens) == 3


def test_newline_space_wrap():
    tokens = EN.tokenize('hello \n possums')
    assert len(tokens) == 3
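A note on the expected counts (interpretation of the change above, not part of the commit): the lone separating space in 'hello possums' is swallowed, so that case stays at two tokens, while every other case leaves some whitespace behind, a second space, a newline, or a newline plus wrap space, and that residue comes through as one whitespace token, giving three. Mirroring test_newline_space_wrap:

from spacy.en import EN

tokens = EN.tokenize('hello \n possums')
assert len(tokens) == 3
# tokens[1].string should be the residual whitespace, roughly '\n ' here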