mirror of https://github.com/explosion/spaCy.git
Fix JSON segmentation bug that affected French
Fix a bug in the JSON streaming code that GoldCorpus uses. Escaped backslashes were being handled incorrectly. This bug caused low scores for French in the early v2.1.0 alphas, because most of the data was not being read in. Fittingly, the document that triggered the bug was a Wikipedia article about Perl. Parsing Perl remains difficult!
This commit is contained in:
parent
6f36b6bc4e
commit
a338c6f8f6
|
@@ -346,12 +346,12 @@ def _json_iterate(loc):
|
||||||
cdef char close_curly = ord('}')
|
cdef char close_curly = ord('}')
|
||||||
for i in range(len(py_raw)):
|
for i in range(len(py_raw)):
|
||||||
c = raw[i]
|
c = raw[i]
|
||||||
if c == backslash:
|
|
||||||
escape = True
|
|
||||||
continue
|
|
||||||
if escape:
|
if escape:
|
||||||
escape = False
|
escape = False
|
||||||
continue
|
continue
|
||||||
|
if c == backslash:
|
||||||
|
escape = True
|
||||||
|
continue
|
||||||
if c == quote:
|
if c == quote:
|
||||||
inside_string = not inside_string
|
inside_string = not inside_string
|
||||||
continue
|
continue
|
||||||
|
|
Loading…
Reference in New Issue