From 25849fc926968def6ae006ce4db5522e4fc970d2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 05:07:21 +0200
Subject: [PATCH] * Generalize tokenization rules to capitals

---
 spacy/util.py              |  5 +++++
 tests/test_contractions.py | 10 ++++++++++
 2 files changed, 15 insertions(+)

diff --git a/spacy/util.py b/spacy/util.py
index 449bad876..64dee8877 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -36,6 +36,11 @@ def read_tokenization(lang):
             assert chunk not in seen, chunk
             seen.add(chunk)
             entries.append((chunk, lex, pieces))
+            if chunk[0].isalpha() and chunk[0].islower():
+                chunk = chunk[0].title() + chunk[1:]
+                lex = lex[0].title() + lex[1:]
+                seen.add(chunk)
+                entries.append((chunk, lex, pieces))
     return entries
 
 
diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index be2280d75..aa11faa39 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -32,3 +32,13 @@ def test_aint():
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "are"
     assert unhash(lex_of(tokens[1])) == "not"
+
+
+def test_capitalized():
+    tokens = expand_chunk(lookup("can't"))
+    assert len(tokens) == 2
+    tokens = expand_chunk(lookup("Can't"))
+    assert len(tokens) == 2
+    tokens = expand_chunk(lookup("Ain't"))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[0])) == "Are"
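
For context, the patch mirrors each lowercase special-case rule with a title-cased copy, so that sentence-initial forms like "Can't" or "Ain't" hit the same expansion as their lowercase counterparts. A minimal standalone sketch of that idea follows; the helper name `add_capitalized_variants` and the literal entry data are illustrative, not spaCy's actual API:

```python
def add_capitalized_variants(entries):
    """Given (chunk, lex, pieces) tokenization rules, also emit a
    title-cased copy of every rule whose chunk starts with a
    lowercase letter, so sentence-initial forms are covered too."""
    out = []
    seen = set()
    for chunk, lex, pieces in entries:
        assert chunk not in seen, chunk
        seen.add(chunk)
        out.append((chunk, lex, pieces))
        if chunk[0].isalpha() and chunk[0].islower():
            cap_chunk = chunk[0].title() + chunk[1:]
            cap_lex = lex[0].title() + lex[1:]
            seen.add(cap_chunk)
            out.append((cap_chunk, cap_lex, pieces))
    return out


# Example (hypothetical rule data): a lowercase entry such as
# ("ain't", "are", ["not"]) also yields ("Ain't", "Are", ["not"]),
# which is the behavior the new test_capitalized() checks.
print(add_capitalized_variants([("ain't", "are", ["not"])]))
```

Note that only the first character is title-cased; the remainder of the chunk and its expansion are left untouched, which matches the `chunk[0].title() + chunk[1:]` pattern in the diff.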