From 248cbb6d0702cf69208e241a572a9a75e37ecf8b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <matthew@honnibal.com>
Date: Thu, 21 Aug 2014 03:29:15 +0200
Subject: [PATCH] * Update doc strings

---
 spacy/en.pyx | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index bead7205d..dcc63cf2a 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -14,6 +14,10 @@ cimport spacy
 
 
 from spacy.orthography.latin cimport *
+from spacy.lexeme cimport *
+
+from .orthography.latin import *
+from .lexeme import *
 
 
 
@@ -61,34 +65,33 @@ EN = English('en')
 cpdef Tokens tokenize(unicode string):
     """Tokenize a string.
 
-    Wraps EN.tokenize, where EN is an instance of the class English. The global
-    variable manages the vocabulary, and memoizes tokenization rules.
+    The tokenization rules are defined in two places:
+
+    * The data/en/tokenization table, which handles special cases like contractions;
+    * The `spacy.en.English.find_split` function, which is used to split off punctuation etc.
 
     Args:
-        string (unicode): The string to be split. Must be unicode, not bytes.
+        string (unicode): The string to be tokenized. Must be unicode, not bytes.
 
     Returns:
-        tokens (Tokens): A Tokens instance, managing a vector of pointers to
-        Lexeme structs. The Tokens instance supports sequence interfaces,
-        but also offers a range of sequence-level operations, which are computed
-        efficiently in Cython-space.
+        tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
     """
     return EN.tokenize(string)
- 
 
-cpdef Lexeme_addr lookup(unicode string) except 0:
-    """Retrieve (or create) a Lexeme for a string.
 
-    Returns a Lexeme ID, which can be used via the accessor
-    methods in spacy.lexeme
+# +49 151 4336 2587
 
+
+cpdef LexID lookup(unicode string) except 0:
+    """Retrieve (or create, if not found) the LexID for a string.
+
+    The LexID is really a memory address, making dereferencing it essentially free.
+
     Args:
         string (unicode):  The string to be looked up. Must be unicode, not bytes.
 
     Returns:
-        LexemeID (size_t): An unsigned integer that allows the Lexeme to be retrieved.
-        The LexemeID is really a memory address, making dereferencing it essentially
-        free.
+        lexeme (LexID): A reference to a lexical type.
     """
     return <Lexeme_addr>EN.lookup(string)
 
@@ -101,7 +104,7 @@ cpdef unicode unhash(StringHash hash_value):
     although no control is taken for hash collisions.
 
     Args:
-        hash_value (uint32_t): The hash of a string, returned by Python's hash()
+        hash_value (StringHash): The hash of a string, returned by Python's hash()
         function.
 
     Returns: