* Add string length cap in Tokenizer.__call__

This commit is contained in:
Matthew Honnibal 2015-10-16 04:54:16 +11:00
parent 17fffb4c57
commit 3ba66f2dc7
1 changed file with 4 additions and 0 deletions


@@ -72,6 +72,10 @@ cdef class Tokenizer:
Returns:
    tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
"""
if len(string) >= (2 ** 30):
raise ValueError(
"String is too long: %d characters. Max is 2**30." % len(string)
)
cdef int length = len(string)
cdef Doc tokens = Doc(self.vocab)
if length == 0:
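
Below is a minimal Python sketch of how the new guard behaves. The tokenize function and the demo strings are hypothetical stand-ins for the real Cython Tokenizer.__call__; only the 2**30 cap and the error message are taken from the diff above.

    MAX_LENGTH = 2 ** 30  # character cap taken from the diff above

    def tokenize(string):
        # Reject oversized input before doing any tokenization work,
        # mirroring the check added to Tokenizer.__call__.
        if len(string) >= MAX_LENGTH:
            raise ValueError(
                "String is too long: %d characters. Max is 2**30." % len(string)
            )
        return string.split()  # placeholder for the real tokenization

    # A short string tokenizes normally; an oversized one raises.
    print(tokenize("hello world"))
    try:
        tokenize("a" * MAX_LENGTH)  # note: allocates ~1 GiB just for this demo
    except ValueError as err:
        print(err)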