Fix doc strings for tokenizer

Matthew Honnibal 2016-11-02 23:15:39 +01:00
parent 7b7660c903
commit e0c9695615
1 changed file with 78 additions and 43 deletions


@@ -26,18 +26,26 @@ from .tokens.doc cimport Doc
 cdef class Tokenizer:
+    """Segment text, and create Doc objects with the discovered segment boundaries."""
     @classmethod
     def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
              infix_finditer=None):
         '''Load a Tokenizer, reading unsupplied components from the path.
 
         Arguments:
-            path pathlib.Path (or string, or Path-like)
-            vocab Vocab
-            rules dict
-            prefix_search callable -- Signature of re.compile(string).search
-            suffix_search callable -- Signature of re.compile(string).search
-            infix_finditer callable -- Signature of re.compile(string).finditer
+            path (Path):
+                The path to load from.
+            vocab (Vocab):
+                A storage container for lexical types.
+            rules (dict):
+                Exceptions and special-cases for the tokenizer.
+            prefix_search:
+                Signature of re.compile(string).search
+            suffix_search:
+                Signature of re.compile(string).search
+            infix_finditer:
+                Signature of re.compile(string).finditer
+        Returns Tokenizer
         '''
         if isinstance(path, basestring):
             path = pathlib.Path(path)
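
For readers following the updated load() docstring, a minimal usage sketch (the data path is illustrative, and Vocab() here is an empty vocabulary rather than a loaded model):

    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer

    vocab = Vocab()
    # Components not passed explicitly (rules, prefix_search, suffix_search,
    # infix_finditer) are read from the given path.
    tokenizer = Tokenizer.load('/path/to/tokenizer', vocab)
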
@@ -64,11 +72,19 @@ cdef class Tokenizer:
         '''Create a Tokenizer, to create Doc objects given unicode text.
 
         Arguments:
-            vocab Vocab
-            rules dict
-            prefix_search callable -- Signature of re.compile(string).search
-            suffix_search callable -- Signature of re.compile(string).search
-            infix_finditer callable -- Signature of re.compile(string).finditer
+            vocab (Vocab):
+                A storage container for lexical types.
+            rules (dict):
+                Exceptions and special-cases for the tokenizer.
+            prefix_search:
+                A function matching the signature of re.compile(string).search
+                to match prefixes.
+            suffix_search:
+                A function matching the signature of re.compile(string).search
+                to match suffixes.
+            infix_finditer:
+                A function matching the signature of re.compile(string).finditer
+                to find infixes.
         '''
         self.mem = Pool()
         self._cache = PreshMap()
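
The expanded constructor docstring maps directly onto the usual way of building a custom tokenizer. A small sketch, assuming spaCy 1.x module paths, with toy regexes standing in for real prefix/suffix/infix rules:

    import re
    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer

    # Illustrative patterns only; real language data uses much larger rule sets.
    prefix_re = re.compile(r'''^[\("']''')
    suffix_re = re.compile(r'''[\)"'.,;:!?]$''')
    infix_re = re.compile(r'''[-~]''')

    tokenizer = Tokenizer(Vocab(), rules={},
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer)
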
@@ -91,38 +107,19 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)
 
     cpdef Doc tokens_from_list(self, list strings):
-        cdef Doc tokens = Doc(self.vocab)
-        if sum([len(s) for s in strings]) == 0:
-            return tokens
-        cdef unicode py_string
-        cdef int idx = 0
-        for i, py_string in enumerate(strings):
-            # Note that we pass tokens.mem here --- the Doc object has ownership
-            tokens.push_back(
-                <const LexemeC*>self.vocab.get(tokens.mem, py_string), True)
-            idx += len(py_string) + 1
-        return tokens
+        raise NotImplementedError(
+            "Method deprecated in 1.0.\n"
+            "Old: tokenizer.tokens_from_list(strings)\n"
+            "New: Doc(tokenizer.vocab, words=strings)")
 
     @cython.boundscheck(False)
     def __call__(self, unicode string):
         """Tokenize a string.
 
-        The tokenization rules are defined in three places:
-
-        * The data/<lang>/tokenization table, which handles special cases like contractions;
-        * The data/<lang>/prefix file, used to build a regex to split off prefixes;
-        * The data/<lang>/suffix file, used to build a regex to split off suffixes.
-
-        The string is first split on whitespace. To tokenize a whitespace-delimited
-        chunk, we first try to look it up in the special-cases. If it's not found,
-        we split off a prefix, and then try again. If it's still not found, we
-        split off a suffix, and repeat.
-
-        Args:
-            string (unicode): The string to be tokenized.
+        Arguments:
+            string (unicode): The string to tokenize.
 
         Returns:
-            tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
+            Doc A container for linguistic annotations.
         """
         if len(string) >= (2 ** 30):
             raise ValueError(
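
Two usage notes fall out of this hunk: the deprecation message spells out the migration for pre-split word lists, and the new __call__ docstring covers plain text. A short sketch reusing the tokenizer built in the earlier example (the texts are illustrative, and .orth_ is used only to display the tokens):

    from spacy.tokens import Doc

    # Old: tokenizer.tokens_from_list(words)  -- now raises NotImplementedError.
    words = [u'Hello', u'world', u'!']
    doc = Doc(tokenizer.vocab, words=words)

    # Tokenizing raw text goes through __call__ and returns a Doc.
    doc = tokenizer(u"They aren't singing.")
    print([t.orth_ for t in doc])
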
@@ -171,6 +168,18 @@ cdef class Tokenizer:
         return tokens
 
     def pipe(self, texts, batch_size=1000, n_threads=2):
+        """Tokenize a stream of texts.
+
+        Arguments:
+            texts: A sequence of unicode texts.
+            batch_size (int):
+                The number of texts to accumulate in an internal buffer.
+            n_threads (int):
+                The number of threads to use, if the implementation supports
+                multi-threading. The default tokenizer is single-threaded.
+        Yields:
+            Doc A sequence of Doc objects, in order.
+        """
         for text in texts:
             yield self(text)
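
As the new pipe() docstring notes, the base tokenizer is single-threaded, so n_threads is effectively a hint. A minimal streaming sketch (texts are illustrative, tokenizer as above):

    texts = [u'First document.', u'Second document.', u'Third document.']
    for doc in tokenizer.pipe(texts, batch_size=2, n_threads=1):
        print(len(doc))
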
@@ -305,17 +314,39 @@ cdef class Tokenizer:
         self._cache.set(key, cached)
 
     def find_infix(self, unicode string):
+        """Find internal split points of the string, such as hyphens.
+
+        string (unicode): The string to segment.
+
+        Returns List[re.MatchObject]
+            A list of objects that have .start() and .end() methods, denoting the
+            placement of internal segment separators, e.g. hyphens.
+        """
         if self.infix_finditer is None:
             return 0
         return list(self.infix_finditer(string))
 
     def find_prefix(self, unicode string):
+        """Find the length of a prefix that should be segmented from the string,
+        or None if no prefix rules match.
+
+        Arguments:
+            string (unicode): The string to segment.
+        Returns (int or None): The length of the prefix if present, otherwise None.
+        """
         if self.prefix_search is None:
             return 0
         match = self.prefix_search(string)
         return (match.end() - match.start()) if match is not None else 0
 
     def find_suffix(self, unicode string):
+        """Find the length of a suffix that should be segmented from the string,
+        or None if no suffix rules match.
+
+        Arguments:
+            string (unicode): The string to segment.
+        Returns (int or None): The length of the suffix if present, otherwise None.
+        """
         if self.suffix_search is None:
             return 0
         match = self.suffix_search(string)
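
These three helpers expose the prefix/suffix/infix rules directly, which is handy when debugging a custom tokenizer. A small sketch against the tokenizer built earlier (the example strings are illustrative):

    # Lengths of leading/trailing segments that would be split off.
    # As implemented above, each helper returns 0 when no rule matches.
    n_pre = tokenizer.find_prefix(u'"Hello')    # e.g. 1 for the opening quote
    n_suf = tokenizer.find_suffix(u'Hello!')    # e.g. 1 for the exclamation mark
    # Internal split points, as re match objects with .start()/.end().
    spans = tokenizer.find_infix(u'well-known')
    print(n_pre, n_suf, [(m.start(), m.end()) for m in spans])
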
@@ -327,19 +358,23 @@ cdef class Tokenizer:
         for chunk, substrings in sorted(special_cases.items()):
             self.add_special_case(chunk, substrings)
 
-    def add_special_case(self, unicode chunk, substrings):
+    def add_special_case(self, unicode string, substrings):
         '''Add a special-case tokenization rule.
 
-        For instance, "don't" is special-cased to tokenize into
-        ["do", "n't"]. The split tokens can have lemmas and part-of-speech
-        tags.
+        Arguments:
+            string (unicode): The string to specially tokenize.
+            token_attrs:
+                A sequence of dicts, where each dict describes a token and its
+                attributes. The ORTH fields of the attributes must exactly match
+                the string when they are concatenated.
+        Returns None
         '''
         substrings = list(substrings)
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
         cached.length = len(substrings)
         cached.is_lex = False
         cached.data.tokens = self.vocab.make_fused_token(substrings)
-        key = hash_string(chunk)
+        key = hash_string(string)
         self._specials.set(key, cached)
         self._cache.set(key, cached)
-        self._rules[chunk] = substrings
+        self._rules[string] = substrings
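
Finally, the renamed add_special_case() arguments read naturally in use. A hedged sketch, assuming attribute-ID keys from spacy.attrs; the exact dict format accepted by make_fused_token varies between spaCy versions, so treat the keys as illustrative:

    from spacy.attrs import ORTH, LEMMA

    # The ORTH values must concatenate to exactly the special-cased string.
    tokenizer.add_special_case(u"don't",
                               [{ORTH: u"do"}, {ORTH: u"n't", LEMMA: u"not"}])
    doc = tokenizer(u"don't")   # -> tokens "do" and "n't"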