From 25849fc926968def6ae006ce4db5522e4fc970d2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 05:07:21 +0200
Subject: [PATCH] * Generalize tokenization rules to capitals

---
 spacy/util.py              |  5 +++++
 tests/test_contractions.py | 10 ++++++++++
 2 files changed, 15 insertions(+)

diff --git a/spacy/util.py b/spacy/util.py
index 449bad876..64dee8877 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -36,6 +36,11 @@ def read_tokenization(lang):
             assert chunk not in seen, chunk
             seen.add(chunk)
             entries.append((chunk, lex, pieces))
+            if chunk[0].isalpha() and chunk[0].islower():
+                chunk = chunk[0].title() + chunk[1:]
+                lex = lex[0].title() + lex[1:]
+                seen.add(chunk)
+                entries.append((chunk, lex, pieces))
     return entries
 
 
diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index be2280d75..aa11faa39 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -32,3 +32,13 @@ def test_aint():
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "are"
     assert unhash(lex_of(tokens[1])) == "not"
+
+
+def test_capitalized():
+    tokens = expand_chunk(lookup("can't"))
+    assert len(tokens) == 2
+    tokens = expand_chunk(lookup("Can't"))
+    assert len(tokens) == 2
+    tokens = expand_chunk(lookup("Ain't"))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[0])) == "Are"
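
For context, the patch mirrors each lowercase special-case rule with a title-cased copy, so that sentence-initial forms like "Can't" or "Ain't" hit the same expansion as their lowercase counterparts. A minimal standalone sketch of that idea follows; the helper name `add_capitalized_variants` and the literal entry data are illustrative, not spaCy's actual API:

```python
def add_capitalized_variants(entries):
    """Given (chunk, lex, pieces) tokenization rules, also emit a
    title-cased copy of every rule whose chunk starts with a
    lowercase letter, so sentence-initial forms are covered too."""
    out = []
    seen = set()
    for chunk, lex, pieces in entries:
        assert chunk not in seen, chunk
        seen.add(chunk)
        out.append((chunk, lex, pieces))
        if chunk[0].isalpha() and chunk[0].islower():
            cap_chunk = chunk[0].title() + chunk[1:]
            cap_lex = lex[0].title() + lex[1:]
            seen.add(cap_chunk)
            out.append((cap_chunk, cap_lex, pieces))
    return out


# Example (hypothetical rule data): a lowercase entry such as
# ("ain't", "are", ["not"]) also yields ("Ain't", "Are", ["not"]),
# which is the behavior the new test_capitalized() checks.
print(add_capitalized_variants([("ain't", "are", ["not"])]))
```

Note that only the first character is title-cased; the remainder of the chunk and its expansion are left untouched, which matches the `chunk[0].title() + chunk[1:]` pattern in the diff.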