* Remove taggers from Language class. Work on doc strings

2014-11-26 19:53:29 +11:00 · 2014-11-26 19:53:29 +11:00 · 33dfb4933c
parent cf55b48ba6
commit 33dfb4933c
1 changed files with 24 additions and 9 deletions
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@ -23,9 +23,6 @@ from . import util
 from .util import read_lang_data
 from .tokens import Tokens

-from .tagger cimport Tagger
-from .ner.greedy_parser cimport NERParser
-

 cdef class Language:
    def __init__(self, name):
@ -42,12 +39,6 @@ cdef class Language:
            self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
            self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
        self._load_special_tokenization(rules)
-        if path.exists(path.join(util.DATA_DIR, name, 'pos')):
-            self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
-        else:
-            self.pos_tagger = None
-        if path.exists(path.join(util.DATA_DIR, name, 'ner')):
-            self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner'))

    cpdef Tokens tokens_from_list(self, list strings):
        cdef int length = sum([len(s) for s in strings])
@ -244,6 +235,10 @@ cdef class Language:


 cdef class Lexicon:
+    '''A map container for a language's Lexeme structs.
+    
+    Also interns UTF-8 strings, and maps them to consecutive integer IDs.
+    '''
    def __init__(self):
        self.mem = Pool()
        self._dict = PreshMap(2 ** 20)
@ -252,6 +247,7 @@ cdef class Lexicon:
        self.size = 1

    cdef Lexeme* get(self, String* string) except NULL:
+        '''Retrieve a pointer to a Lexeme from the lexicon.'''
        cdef Lexeme* lex
        lex = <Lexeme*>self._dict.get(string.key)
        if lex != NULL:
@ -266,6 +262,25 @@ cdef class Lexicon:
        return lex

    def __getitem__(self,  id_or_string):
+        '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
+        unseen unicode string is given, a new Lexeme is created and stored.
+
+        This function relies on Cython's struct-to-dict conversion.  Python clients
+        receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
+        with int values.  Cython clients can instead receive a Lexeme struct value.
+        More efficient Cython access is provided by Lexicon.get, which returns
+        a Lexeme*.
+
+        Args:
+            id_or_string (int or unicode): The integer ID of a word, or its unicode
+                string.  If an int >= Lexicon.size, IndexError is raised.
+                If id_or_string is neither an int nor a unicode string, ValueError
+                is raised.
+
+        Returns:
+            lexeme (dict): A Lexeme struct instance, which Cython translates into
+                a dict if the operator is called from Python.
+        '''
        if type(id_or_string) == int:
            return self.lexemes.at(id_or_string)[0]
        cdef String string