Fix doc strings for tokenizer

This commit is contained in:
Matthew Honnibal 2016-11-02 23:15:39 +01:00
parent 7b7660c903
commit e0c9695615
1 changed file with 78 additions and 43 deletions


@@ -26,18 +26,26 @@ from .tokens.doc cimport Doc
cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment boundaries."""
@classmethod
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
infix_finditer=None):
'''Load a Tokenizer, reading unsupplied components from the path.
Arguments:
path pathlib.Path (or string, or Path-like)
vocab Vocab
rules dict
prefix_search callable -- Signature of re.compile(string).search
suffix_search callable -- Signature of re.compile(string).search
infix_finditer callable -- Signature of re.compile(string).finditer
path (Path):
The path to load from.
vocab (Vocab):
A storage container for lexical types.
rules (dict):
Exceptions and special-cases for the tokenizer.
prefix_search:
Signature of re.compile(string).search
suffix_search:
Signature of re.compile(string).search
infix_finditer:
Signature of re.compile(string).finditer
Returns Tokenizer
'''
if isinstance(path, basestring):
path = pathlib.Path(path)
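A minimal usage sketch of this loader; the 'data/en/tokenizer' path is a placeholder and the empty Vocab() stands in for a vocabulary loaded from real language data:

import re
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer

# Placeholder path: in practice this points at the installed tokenizer data.
# Components not passed explicitly are read from that path.
tokenizer = Tokenizer.load('data/en/tokenizer', Vocab(),
                           infix_finditer=re.compile(r'[-~]').finditer)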
@@ -64,11 +72,19 @@ cdef class Tokenizer:
'''Create a Tokenizer, to create Doc objects given unicode text.
Arguments:
vocab Vocab
rules dict
prefix_search callable -- Signature of re.compile(string).search
suffix_search callable -- Signature of re.compile(string).search
infix_finditer callable -- Signature of re.compile(string).finditer
vocab (Vocab):
A storage container for lexical types.
rules (dict):
Exceptions and special-cases for the tokenizer.
prefix_search:
A function matching the signature of re.compile(string).search
to match prefixes.
suffix_search:
A function matching the signature of re.compile(string).search
to match suffixes.
infix_finditer:
A function matching the signature of re.compile(string).finditer
to find infixes.
'''
self.mem = Pool()
self._cache = PreshMap()
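A sketch of constructing a Tokenizer directly from these arguments; the regular expressions below are simplified stand-ins, not spaCy's actual language data:

import re
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer

prefix_re = re.compile(r'''^[\[\("']''')        # peel leading brackets/quotes
suffix_re = re.compile(r'''[\]\)"'\.,;]$''')    # peel trailing punctuation
infix_re = re.compile(r'''[-~]''')              # split on internal hyphens

tokenizer = Tokenizer(Vocab(), {},
                      prefix_re.search, suffix_re.search, infix_re.finditer)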
@@ -91,38 +107,19 @@ cdef class Tokenizer:
return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings):
cdef Doc tokens = Doc(self.vocab)
if sum([len(s) for s in strings]) == 0:
return tokens
cdef unicode py_string
cdef int idx = 0
for i, py_string in enumerate(strings):
# Note that we pass tokens.mem here --- the Doc object has ownership
tokens.push_back(
<const LexemeC*>self.vocab.get(tokens.mem, py_string), True)
idx += len(py_string) + 1
return tokens
raise NotImplementedError(
"Method deprecated in 1.0.\n"
"Old: tokenizer.tokens_from_list(strings)\n"
"New: Doc(tokenizer.vocab, words=strings)")
@cython.boundscheck(False)
def __call__(self, unicode string):
"""Tokenize a string.
The tokenization rules are defined in three places:
* The data/<lang>/tokenization table, which handles special cases like contractions;
* The data/<lang>/prefix file, used to build a regex to split off prefixes;
* The data/<lang>/suffix file, used to build a regex to split off suffixes.
The string is first split on whitespace. To tokenize a whitespace-delimited
chunk, we first try to look it up in the special-cases. If it's not found,
we split off a prefix, and then try again. If it's still not found, we
split off a suffix, and repeat.
Args:
string (unicode): The string to be tokenized.
Arguments:
string (unicode): The string to tokenize.
Returns:
tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
Doc A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
raise ValueError(
@@ -171,6 +168,18 @@ cdef class Tokenizer:
return tokens
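A heavily simplified pure-Python sketch of the strategy the docstring describes: split on whitespace, check the special cases, otherwise peel off a prefix or suffix and retry. The function name and arguments are hypothetical, and the real Cython code additionally handles infixes and caches results:

def sketch_tokenize(text, special_cases, prefix_re, suffix_re):
    tokens = []
    for chunk in text.split():
        prefixes, suffixes = [], []
        # Peel off prefixes and suffixes until the remainder is a known
        # special case or cannot be reduced further.
        while chunk and chunk not in special_cases:
            pre = prefix_re.search(chunk)
            if pre is not None and pre.end() > 0:
                prefixes.append(chunk[:pre.end()])
                chunk = chunk[pre.end():]
                continue
            suf = suffix_re.search(chunk)
            if suf is not None and 0 < suf.start() < len(chunk):
                suffixes.insert(0, chunk[suf.start():])
                chunk = chunk[:suf.start()]
                continue
            break
        tokens.extend(prefixes)
        if chunk in special_cases:
            tokens.extend(special_cases[chunk])
        elif chunk:
            tokens.append(chunk)
        tokens.extend(suffixes)
    return tokens

# With the real class, tokenization is just a call:
# doc = tokenizer(u"Hello, world!")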
def pipe(self, texts, batch_size=1000, n_threads=2):
"""Tokenize a stream of texts.
Arguments:
texts: A sequence of unicode texts.
batch_size (int):
The number of texts to accumulate in an internal buffer.
n_threads (int):
The number of threads to use, if the implementation supports
multi-threading. The default tokenizer is single-threaded.
Yields:
Doc A sequence of Doc objects, in order.
"""
for text in texts:
yield self(text)
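A usage sketch for the streaming API, reusing the tokenizer from the earlier construction sketch; the example texts are arbitrary:

texts = [u'First document.', u'Second document.', u'Third document.']
for doc in tokenizer.pipe(texts, batch_size=50, n_threads=1):
    # Each item is a Doc, yielded in the same order as the input texts.
    print(len(doc))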
@@ -305,17 +314,39 @@ cdef class Tokenizer:
self._cache.set(key, cached)
def find_infix(self, unicode string):
"""Find internal split points of the string, such as hyphens.
string (unicode): The string to segment.
Returns List[re.MatchObject]
A list of objects that have .start() and .end() methods, denoting the
placement of internal segment separators, e.g. hyphens.
"""
if self.infix_finditer is None:
return 0
return list(self.infix_finditer(string))
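For example, with the placeholder [-~] infix pattern from the earlier sketch, a hyphen is reported as an internal split point:

matches = tokenizer.find_infix(u'well-known')     # tokenizer from the sketch above
print([(m.start(), m.end()) for m in matches])    # [(4, 5)] for the hyphen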
def find_prefix(self, unicode string):
"""Find the length of a prefix that should be segmented from the string,
or 0 if no prefix rules match.
Arguments:
string (unicode): The string to segment.
Returns (int): The length of the prefix if present, otherwise 0.
"""
if self.prefix_search is None:
return 0
match = self.prefix_search(string)
return (match.end() - match.start()) if match is not None else 0
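With the placeholder prefix pattern from the earlier sketch, a leading bracket yields a prefix of length 1 and a plain word yields 0:

print(tokenizer.find_prefix(u'(hello'))   # 1: the opening parenthesis
print(tokenizer.find_prefix(u'hello'))    # 0: no prefix rule matches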
def find_suffix(self, unicode string):
"""Find the length of a suffix that should be segmented from the string,
or 0 if no suffix rules match.
Arguments:
string (unicode): The string to segment.
Returns (int): The length of the suffix if present, otherwise 0.
"""
if self.suffix_search is None:
return 0
match = self.suffix_search(string)
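Likewise, with the placeholder suffix pattern from the earlier sketch, trailing punctuation yields a suffix of length 1:

print(tokenizer.find_suffix(u'hello,'))   # 1: the trailing comma
print(tokenizer.find_suffix(u'hello'))    # 0: no suffix rule matches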
@@ -327,19 +358,23 @@ cdef class Tokenizer:
for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings)
def add_special_case(self, unicode chunk, substrings):
def add_special_case(self, unicode string, substrings):
'''Add a special-case tokenization rule.
For instance, "don't" is special-cased to tokenize into
["do", "n't"]. The split tokens can have lemmas and part-of-speech
tags.
Arguments:
string (unicode): The string to specially tokenize.
substrings:
A sequence of dicts, where each dict describes a token and its
attributes. The ORTH fields of the attributes must exactly match
the string when they are concatenated.
Returns None
'''
substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = self.vocab.make_fused_token(substrings)
key = hash_string(chunk)
key = hash_string(string)
self._specials.set(key, cached)
self._cache.set(key, cached)
self._rules[chunk] = substrings
self._rules[string] = substrings
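A usage sketch of the "don't" case from the docstring, reusing the earlier tokenizer; the attribute IDs come from spacy.attrs and the lemma values are illustrative:

from spacy.attrs import ORTH, LEMMA

tokenizer.add_special_case(u"don't",
    [{ORTH: u"do", LEMMA: u"do"},
     {ORTH: u"n't", LEMMA: u"not"}])

doc = tokenizer(u"I don't care")
print([t.text for t in doc])   # expected: [u'I', u'do', u"n't", u'care']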