From 99f5e592866e6174ae2cba1177ec9393676455de Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 14 Oct 2014 20:25:57 +1100
Subject: [PATCH] * Have tokenizer emit tokens for whitespace other than
 single spaces

---
 spacy/lang.pyx           | 12 ++++++++----
 tests/test_whitespace.py | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_whitespace.py

diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 07bdf6e9b..fb9ae597e 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -67,12 +67,13 @@ cdef class Language:
         cdef Tokens tokens = Tokens(length)
         if length == 0:
             return tokens
-        cdef int start = 0
         cdef int i = 0
+        cdef int start = 0
         cdef Py_UNICODE* chars = string
+        cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef String span
-        for i in range(length):
-            if Py_UNICODE_ISSPACE(chars[i]) == 1:
+        for i in range(1, length):
+            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     string_slice(&span, chars, start, i)
                     lexemes = self.cache.get(span.key)
@@ -80,7 +81,10 @@
                         tokens.extend(start, lexemes, 0)
                     else:
                         self._tokenize(tokens, &span, start, i)
-                start = i + 1
+                in_ws = not in_ws
+                start = i
+                if chars[i] == ' ':
+                    start += 1
         i += 1
         if start < i:
             string_slice(&span, chars, start, i)
diff --git a/tests/test_whitespace.py b/tests/test_whitespace.py
new file mode 100644
index 000000000..dc943664e
--- /dev/null
+++ b/tests/test_whitespace.py
@@ -0,0 +1,38 @@
+"""Test that tokens are created correctly for whitespace."""
+from __future__ import unicode_literals
+
+from spacy.en import EN
+import pytest
+
+
+def test_single_space():
+    tokens = EN.tokenize('hello possums')
+    assert len(tokens) == 2
+
+
+def test_double_space():
+    tokens = EN.tokenize('hello  possums')
+    assert len(tokens) == 3
+    assert tokens[1].string == ' '
+
+
+def test_newline():
+    tokens = EN.tokenize('hello\npossums')
+    assert len(tokens) == 3
+
+
+def test_newline_space():
+    tokens = EN.tokenize('hello \npossums')
+    assert len(tokens) == 3
+
+
+def test_newline_double_space():
+    tokens = EN.tokenize('hello  \npossums')
+    assert len(tokens) == 3
+
+
+def test_newline_space_wrap():
+    tokens = EN.tokenize('hello \n possums')
+    assert len(tokens) == 3
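
For reference, the new loop in spacy/lang.pyx scans the string as alternating
runs of whitespace and non-whitespace, flipping the in_ws flag at each
boundary, and consumes the first character of a whitespace run when it is a
plain ' '. That is why a single space between two words still produces no
token, while any other whitespace (extra spaces, newlines) does. Below is a
minimal pure-Python sketch of just that boundary logic, assuming a plain str
input; segment() is a hypothetical stand-in, not part of spaCy's API, and it
elides the lexeme cache and the further splitting that self._tokenize
performs on each non-whitespace span.

    def segment(string):
        """Split a string into alternating word/whitespace runs, dropping
        the single space that conventionally separates two words."""
        if not string:
            return []
        tokens = []
        start = 0
        in_ws = string[0].isspace()
        for i in range(1, len(string)):
            # A boundary is any position where "am I in whitespace?" flips.
            if string[i].isspace() != in_ws:
                if start < i:
                    tokens.append(string[start:i])
                in_ws = not in_ws
                start = i
                # A whitespace run that opens with an ordinary space has that
                # first space consumed silently, mirroring the patch's
                # `if chars[i] == ' ': start += 1` branch.
                if string[i] == ' ':
                    start += 1
        # Emit whatever run is still open at the end of the string.
        if start < len(string):
            tokens.append(string[start:])
        return tokens

Under those assumptions the sketch reproduces the behaviour the new tests
pin down:

    segment('hello possums')     # ['hello', 'possums']
    segment('hello  possums')    # ['hello', ' ', 'possums']
    segment('hello \n possums')  # ['hello', '\n ', 'possums']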