diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 07bdf6e9b..fb9ae597e 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -67,12 +67,13 @@ cdef class Language:
         cdef Tokens tokens = Tokens(length)
         if length == 0:
             return tokens
-        cdef int start = 0
         cdef int i = 0
+        cdef int start = 0
         cdef Py_UNICODE* chars = string
+        cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef String span
-        for i in range(length):
-            if Py_UNICODE_ISSPACE(chars[i]) == 1:
+        for i in range(1, length):
+            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     string_slice(&span, chars, start, i)
                     lexemes = <LexemeC**>self.cache.get(span.key)
@@ -80,7 +81,10 @@ cdef class Language:
                         tokens.extend(start, lexemes, 0)
                     else: 
                         self._tokenize(tokens, &span, start, i)
-                start = i + 1
+                in_ws = not in_ws
+                start = i
+                if chars[i] == ' ':
+                    start += 1
         i += 1
         if start < i:
             string_slice(&span, chars, start, i)
diff --git a/tests/test_whitespace.py b/tests/test_whitespace.py
new file mode 100644
index 000000000..dc943664e
--- /dev/null
+++ b/tests/test_whitespace.py
@@ -0,0 +1,38 @@
+"""Test that tokens are created correctly for whitespace."""
+from __future__ import unicode_literals
+
+from spacy.en import EN
+import pytest
+
+
+def test_single_space():
+    """Two words split by one space must come back as two tokens."""
+    assert len(EN.tokenize('hello possums')) == 2
+
+
+def test_double_space():
+    """Two spaces between words: three tokens, the middle one is ' '."""
+    doc = EN.tokenize('hello  possums')
+    assert len(doc) == 3 and doc[1].string == ' '
+
+
+def test_newline():
+    """A newline between two words must bring the count to three tokens."""
+    assert len(EN.tokenize('hello\npossums')) == 3
+
+
+def test_newline_space():
+    """A space followed by a newline still gives three tokens in total."""
+    assert len(EN.tokenize('hello \npossums')) == 3
+
+
+def test_newline_double_space():
+    """Two spaces then a newline between words: three tokens in total."""
+    assert len(EN.tokenize('hello  \npossums')) == 3
+
+
+def test_newline_space_wrap():
+    """A newline wrapped in single spaces: three tokens in total."""
+    assert len(EN.tokenize('hello \n possums')) == 3
+
+