diff --git a/spacy/errors.py b/spacy/errors.py index dd2b38eb9..ce35d706c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -105,6 +105,10 @@ class Warnings(object): W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " "previous components in the pipeline declare that they assign it.") W026 = ("Unable to set all sentence boundaries from dependency parses.") + W027 = ("Found a large training file of {size} bytes. Note that it may " + "be more efficient to split your training data into multiple " + "smaller JSON files instead.") + @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1a74d2206..1d7f80c92 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -13,7 +13,7 @@ import srsly from .syntax import nonproj from .tokens import Doc, Span -from .errors import Errors, AlignmentError +from .errors import Errors, AlignmentError, user_warning, Warnings from .compat import path2str from . import util from .util import minibatch, itershuffle @@ -557,12 +557,16 @@ def _json_iterate(loc): loc = util.ensure_path(loc) with loc.open("rb") as file_: py_raw = file_.read() + cdef long file_length = len(py_raw) + if file_length > 2 ** 30: + user_warning(Warnings.W027.format(size=file_length)) + raw = py_raw cdef int square_depth = 0 cdef int curly_depth = 0 cdef int inside_string = 0 cdef int escape = 0 - cdef int start = -1 + cdef long start = -1 cdef char c cdef char quote = ord('"') cdef char backslash = ord("\\") @@ -570,7 +574,7 @@ def _json_iterate(loc): cdef char close_square = ord("]") cdef char open_curly = ord("{") cdef char close_curly = ord("}") - for i in range(len(py_raw)): + for i in range(file_length): c = raw[i] if escape: escape = False