* Improve module docstring

2014-08-21 18:42:47 +02:00 · 2014-08-21 18:42:47 +02:00 · 314658b31c
parent 8bcd07dbae
commit 314658b31c
1 changed files with 34 additions and 3 deletions
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@ -1,9 +1,40 @@
 # cython: profile=True
 # cython: embedsignature=True
-'''Tokenize English text, allowing some differences from the Penn Treebank
+'''Tokenize English text, using a scheme that differs from the Penn Treebank 3
-tokenization, e.g. for email addresses, URLs, etc. Use en_ptb if full PTB
+scheme in several important respects:
-compatibility is the priority.
+
 * Whitespace is added as tokens, except for single spaces, e.g.:
    >>> tokenize(u'\\nHello  \\tThere').strings
    [u'\\n', u'Hello', u' ', u'\\t', u'There']
 * Contractions are normalized, e.g.
    >>> tokenize(u"isn't ain't won't he's").strings
    [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
 * Hyphenated words are split, with the hyphen preserved, e.g.:
    >>> tokenize(u'New York-based').strings
    [u'New', u'York', u'-', u'based']
 * Full unicode support
 * Email addresses, URLs, European-formatted dates and other numeric entities not
  found in the PTB are tokenized correctly
 * Heuristic handling of word-final periods (PTB expects sentence boundary detection
  as a pre-process before tokenization.)
 Take care to ensure your training and run-time data is tokenized according to the
 same scheme. Tokenization problems are a major cause of poor performance for
 NLP tools.
 If you're using a pre-trained model, the spacy.ptb3 module provides a fully Penn
 Treebank 3-compliant tokenizer.
 '''
 #The script translate_treebank_tokenization can be used to transform a treebank's
 #annotation to use one of the spacy tokenization schemes.
 from __future__ import unicode_literals
 from libc.stdlib cimport malloc, calloc, free