From 314658b31c4f78a29338a7b92bb4afd72645e6bb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <matthew@honnibal.com>
Date: Thu, 21 Aug 2014 18:42:47 +0200
Subject: [PATCH] * Improve module docstring

---
 spacy/en.pyx | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index dcc63cf2a..af137fb8a 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -1,9 +1,40 @@
 # cython: profile=True
 # cython: embedsignature=True
-'''Tokenize English text, allowing some differences from the Penn Treebank
-tokenization, e.g. for email addresses, URLs, etc. Use en_ptb if full PTB
-compatibility is the priority.
+'''Tokenize English text, using a scheme that differs from the Penn Treebank 3
+scheme in several important respects:
+
+* Whitespace is added as tokens, except for single spaces, e.g.:
+
+    >>> tokenize(u'\\nHello  \\tThere').strings
+    [u'\\n', u'Hello', u' ', u'\\t', u'There']
+
+* Contractions are normalized, e.g.
+
+    >>> tokenize(u"isn't ain't won't he's").strings
+    [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
+  
+* Hyphenated words are split, with the hyphen preserved, e.g.:
+    
+    >>> tokenize(u'New York-based').strings
+    [u'New', u'York', u'-', u'based']
+
+* Full unicode support
+* Email addresses, URLs, European-formatted dates and other numeric entities not
+  found in the PTB are tokenized correctly
+* Heuristic handling of word-final periods (PTB expects sentence boundary detection
+  as a pre-process before tokenization.)
+
+Take care to ensure your training and run-time data are tokenized according to
+the same scheme. Tokenization problems are a major cause of poor performance
+for NLP tools.
+
+If you're using a pre-trained model, the spacy.ptb3 module provides a fully Penn
+Treebank 3-compliant tokenizer.
 '''
+#The script translate_treebank_tokenization can be used to transform a treebank's
+#annotation to use one of the spacy tokenization schemes.
+
+
 from __future__ import unicode_literals
 
 from libc.stdlib cimport malloc, calloc, free