mirror of https://github.com/explosion/spaCy.git
Fix doc strings for tokenizer
parent
7b7660c903
commit
e0c9695615
@@ -26,18 +26,26 @@ from .tokens.doc cimport Doc
 
 
 cdef class Tokenizer:
+    """Segment text, and create Doc objects with the discovered segment boundaries."""
     @classmethod
     def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
              infix_finditer=None):
         '''Load a Tokenizer, reading unsupplied components from the path.
 
         Arguments:
-            path pathlib.Path (or string, or Path-like)
-            vocab Vocab
-            rules dict
-            prefix_search callable -- Signature of re.compile(string).search
-            suffix_search callable -- Signature of re.compile(string).search
-            infix_finditer callable -- Signature of re.compile(string).finditer
+            path (Path):
+                The path to load from.
+            vocab (Vocab):
+                A storage container for lexical types.
+            rules (dict):
+                Exceptions and special-cases for the tokenizer.
+            prefix_search:
+                Signature of re.compile(string).search
+            suffix_search:
+                Signature of re.compile(string).search
+            infix_finditer:
+                Signature of re.compile(string).finditer
+        Returns Tokenizer
         '''
         if isinstance(path, basestring):
             path = pathlib.Path(path)
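For reference, the load() signature documented in this hunk reads any component you do not pass in from the model path. A minimal usage sketch under spaCy 1.x assumptions; the data directory and prefix pattern below are illustrative, not part of the commit:

    import re
    import pathlib
    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer

    data_dir = pathlib.Path('data/en/tokenizer')   # assumed location of tokenizer data
    prefix_re = re.compile(r'''^[("']''')          # illustrative prefix pattern

    vocab = Vocab()
    # rules, suffix_search and infix_finditer are left unset,
    # so they are read from data_dir.
    tokenizer = Tokenizer.load(data_dir, vocab, prefix_search=prefix_re.search)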
@@ -64,11 +72,19 @@ cdef class Tokenizer:
         '''Create a Tokenizer, to create Doc objects given unicode text.
 
         Arguments:
-            vocab Vocab
-            rules dict
-            prefix_search callable -- Signature of re.compile(string).search
-            suffix_search callable -- Signature of re.compile(string).search
-            infix_finditer callable -- Signature of re.compile(string).finditer
+            vocab (Vocab):
+                A storage container for lexical types.
+            rules (dict):
+                Exceptions and special-cases for the tokenizer.
+            prefix_search:
+                A function matching the signature of re.compile(string).search
+                to match prefixes.
+            suffix_search:
+                A function matching the signature of re.compile(string).search
+                to match suffixes.
+            infix_finditer:
+                A function matching the signature of re.compile(string).finditer
+                to find infixes.
         '''
         self.mem = Pool()
         self._cache = PreshMap()
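Since the constructor documented here takes bound regex methods rather than pattern strings, a tokenizer can also be assembled by hand. A sketch under spaCy 1.x assumptions; the patterns and the empty rules dict are illustrative only:

    import re
    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer

    prefix_re = re.compile(r'''^[("']''')          # illustrative prefix pattern
    suffix_re = re.compile(r'''[)"',.]$''')        # illustrative suffix pattern
    infix_re = re.compile(r'''[-~]''')             # illustrative infix pattern

    vocab = Vocab()
    tokenizer = Tokenizer(vocab, {},               # rules: no special cases yet
                          prefix_re.search,        # prefix_search
                          suffix_re.search,        # suffix_search
                          infix_re.finditer)       # infix_finditer
    doc = tokenizer(u"(Hello-world)")
    print([t.text for t in doc])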
@@ -91,38 +107,19 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)
 
     cpdef Doc tokens_from_list(self, list strings):
-        cdef Doc tokens = Doc(self.vocab)
-        if sum([len(s) for s in strings]) == 0:
-            return tokens
-        cdef unicode py_string
-        cdef int idx = 0
-        for i, py_string in enumerate(strings):
-            # Note that we pass tokens.mem here --- the Doc object has ownership
-            tokens.push_back(
-                <const LexemeC*>self.vocab.get(tokens.mem, py_string), True)
-            idx += len(py_string) + 1
-        return tokens
+        raise NotImplementedError(
+            "Method deprecated in 1.0.\n"
+            "Old: tokenizer.tokens_from_list(strings)\n"
+            "New: Doc(tokenizer.vocab, words=strings)")
 
     @cython.boundscheck(False)
     def __call__(self, unicode string):
         """Tokenize a string.
 
-        The tokenization rules are defined in three places:
-
-        * The data/<lang>/tokenization table, which handles special cases like contractions;
-        * The data/<lang>/prefix file, used to build a regex to split off prefixes;
-        * The data/<lang>/suffix file, used to build a regex to split off suffixes.
-
-        The string is first split on whitespace. To tokenize a whitespace-delimited
-        chunk, we first try to look it up in the special-cases. If it's not found,
-        we split off a prefix, and then try again. If it's still not found, we
-        split off a suffix, and repeat.
-
-        Args:
-            string (unicode): The string to be tokenized.
-
+        Arguments:
+            string (unicode): The string to tokenize.
         Returns:
-            tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
+            Doc A container for linguistic annotations.
         """
         if len(string) >= (2 ** 30):
             raise ValueError(
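The replaced tokens_from_list body points at the migration path directly in its error message. A short sketch of the old and new spellings, reusing the tokenizer from the constructor sketch above; the word list is illustrative:

    from spacy.tokens import Doc

    words = [u'Hello', u'world', u'!']             # illustrative pre-split input

    # Old (now raises NotImplementedError):
    #     doc = tokenizer.tokens_from_list(words)
    # New, as the error message says:
    doc = Doc(tokenizer.vocab, words=words)

    # Plain tokenization of raw text still goes through __call__:
    doc = tokenizer(u'Hello world!')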
@@ -171,6 +168,18 @@ cdef class Tokenizer:
         return tokens
 
     def pipe(self, texts, batch_size=1000, n_threads=2):
+        """Tokenize a stream of texts.
+
+        Arguments:
+            texts: A sequence of unicode texts.
+            batch_size (int):
+                The number of texts to accumulate in an internal buffer.
+            n_threads (int):
+                The number of threads to use, if the implementation supports
+                multi-threading. The default tokenizer is single-threaded.
+        Yields:
+            Doc A sequence of Doc objects, in order.
+        """
         for text in texts:
             yield self(text)
 
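As the new docstring notes, the default tokenizer is single-threaded, so pipe() simply yields one Doc per input text; batch_size and n_threads exist for API consistency with other pipeline components. A usage sketch, again reusing the tokenizer from the constructor sketch; the texts are illustrative:

    texts = [u'First document.', u'Second document.']
    for doc in tokenizer.pipe(texts, batch_size=1000, n_threads=2):
        print(len(doc), [t.text for t in doc])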
@@ -305,17 +314,39 @@ cdef class Tokenizer:
         self._cache.set(key, cached)
 
     def find_infix(self, unicode string):
+        """Find internal split points of the string, such as hyphens.
+
+        string (unicode): The string to segment.
+
+        Returns List[re.MatchObject]
+            A list of objects that have .start() and .end() methods, denoting the
+            placement of internal segment separators, e.g. hyphens.
+        """
         if self.infix_finditer is None:
             return 0
         return list(self.infix_finditer(string))
 
     def find_prefix(self, unicode string):
+        """Find the length of a prefix that should be segmented from the string,
+        or None if no prefix rules match.
+
+        Arguments:
+            string (unicode): The string to segment.
+        Returns (int or None): The length of the prefix if present, otherwise None.
+        """
         if self.prefix_search is None:
             return 0
         match = self.prefix_search(string)
         return (match.end() - match.start()) if match is not None else 0
 
     def find_suffix(self, unicode string):
+        """Find the length of a suffix that should be segmented from the string,
+        or None if no suffix rules match.
+
+        Arguments:
+            string (unicode): The string to segment.
+        Returns (int or None): The length of the suffix if present, otherwise None.
+        """
         if self.suffix_search is None:
             return 0
         match = self.suffix_search(string)
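The three helpers documented in this hunk only report where a pattern matched: find_prefix and find_suffix return a length (0 when no pattern is set or nothing matches), and find_infix returns the list of regex match objects. A sketch of what they return for the illustrative patterns used in the constructor sketch above:

    print(tokenizer.find_prefix(u'"Hello'))       # 1: the leading quote matches the prefix pattern
    print(tokenizer.find_suffix(u'Hello,'))       # 1: the trailing comma matches the suffix pattern
    print(tokenizer.find_infix(u'well-known'))    # one re match object covering the hyphen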
@@ -327,19 +358,23 @@ cdef class Tokenizer:
         for chunk, substrings in sorted(special_cases.items()):
             self.add_special_case(chunk, substrings)
 
-    def add_special_case(self, unicode chunk, substrings):
+    def add_special_case(self, unicode string, substrings):
         '''Add a special-case tokenization rule.
 
-        For instance, "don't" is special-cased to tokenize into
-        ["do", "n't"]. The split tokens can have lemmas and part-of-speech
-        tags.
+        Arguments:
+            string (unicode): The string to specially tokenize.
+            token_attrs:
+                A sequence of dicts, where each dict describes a token and its
+                attributes. The ORTH fields of the attributes must exactly match
+                the string when they are concatenated.
+        Returns None
         '''
         substrings = list(substrings)
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
         cached.length = len(substrings)
         cached.is_lex = False
         cached.data.tokens = self.vocab.make_fused_token(substrings)
-        key = hash_string(chunk)
+        key = hash_string(string)
         self._specials.set(key, cached)
         self._cache.set(key, cached)
-        self._rules[chunk] = substrings
+        self._rules[string] = substrings
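add_special_case stores the analysed substrings under the hash of the string, so later occurrences of exactly that string come straight out of the cache. A sketch of registering the "don't" case that the old docstring used as its example, assuming the attribute-ID keys used by spaCy's language data; only ORTH is shown here:

    from spacy.attrs import ORTH

    # The ORTH values must concatenate back to exactly the original string.
    tokenizer.add_special_case(u"don't", [{ORTH: u"do"}, {ORTH: u"n't"}])
    doc = tokenizer(u"I don't know")
    print([t.text for t in doc])                  # the special case splits don't into do / n't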