diff --git a/requirements.txt b/requirements.txt
index e61a029c9..bf95839b5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ preshed>=2.0.1,<2.1.0
 thinc>=7.0.2,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.1.3,<1.1.0
+wasabi>=0.2.0,<1.1.0
 srsly>=0.0.5,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
diff --git a/setup.py b/setup.py
index 6f29e1efa..ed030eaf0 100755
--- a/setup.py
+++ b/setup.py
@@ -232,7 +232,7 @@ def setup_package():
         "plac<1.0.0,>=0.9.6",
         "requests>=2.13.0,<3.0.0",
         "jsonschema>=2.6.0,<3.0.0",
-        "wasabi>=0.0.12,<1.1.0",
+        "wasabi>=0.2.0,<1.1.0",
         "srsly>=0.0.5,<1.1.0",
         'pathlib==1.0.1; python_version < "3.4"',
     ],
diff --git a/spacy/about.py b/spacy/about.py
index 7b1f36c40..1592620ae 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -4,7 +4,7 @@
 
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.1.1"
+__version__ = "2.1.2"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
diff --git a/spacy/compat.py b/spacy/compat.py
index 8af49f254..997e8787b 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -11,6 +11,7 @@ from __future__ import unicode_literals
 import os
 import sys
 import itertools
+import ast
 
 from thinc.neural.util import copy_array
 
@@ -150,3 +151,26 @@ def import_file(name, loc):
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module
+
+
+def unescape_unicode(string):
+    """Python 2.7's re module chokes when compiling patterns that have ranges
+    between escaped unicode codepoints, if the two codepoints are unrecognised
+    in the unicode database. For instance:
+
+        re.compile('[\\uAA77-\\uAA79]').findall("hello")
+
+    ends up matching every character on Python 2. The problem doesn't occur
+    if we're dealing with unicode literals.
+    """
+    if string is None:
+        return string
+    # We only want to unescape the unicode, so we first must protect the other
+    # backslashes.
+    string = string.replace("\\", "\\\\")
+    # Now we remove that protection for the unicode.
+    string = string.replace("\\\\u", "\\u")
+    string = string.replace("\\\\U", "\\U")
+    # Now we unescape by evaluating the string with the AST. This can't
+    # execute arbitrary code -- it only evaluates the literal representation.
+    return ast.literal_eval("u'''" + string + "'''")
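For context, a minimal sketch (not part of the diff) of what the new `unescape_unicode` helper fixes and how it is meant to be used, assuming spaCy 2.1.2+ is installed:

```python
import re

from spacy.compat import unescape_unicode

# A character class written with escaped codepoints, as it appears in
# serialized tokenizer patterns. On Python 2, compiling this string directly
# can produce a regex that matches every character.
escaped = "[\\uAA77-\\uAA79]"

# Unescaping turns the escape sequences into real unicode characters, so the
# compiled pattern is the intended three-character class on both Pythons.
pattern = re.compile(unescape_unicode(escaped))
assert pattern.findall(u"hello") == []
```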
diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py
index d073e27a5..54a108d53 100644
--- a/spacy/lang/en/morph_rules.py
+++ b/spacy/lang/en/morph_rules.py
@@ -1,13 +1,97 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import LEMMA, PRON_LEMMA
+from ...symbols import LEMMA, PRON_LEMMA, AUX
 
+_subordinating_conjunctions = [
+    "that",
+    "if",
+    "as",
+    "because",
+    "of",
+    "for",
+    "before",
+    "in",
+    "while",
+    "after",
+    "since",
+    "like",
+    "with",
+    "so",
+    "to",
+    "by",
+    "on",
+    "about",
+    "than",
+    "whether",
+    "although",
+    "from",
+    "though",
+    "until",
+    "unless",
+    "once",
+    "without",
+    "at",
+    "into",
+    "cause",
+    "over",
+    "upon",
+    "till",
+    "whereas",
+    "beyond",
+    "whilst",
+    "except",
+    "despite",
+    "wether",
+    "then",
+    "but",
+    "becuse",
+    "whie",
+    "below",
+    "against",
+    "it",
+    "w/out",
+    "toward",
+    "albeit",
+    "save",
+    "besides",
+    "becouse",
+    "coz",
+    "til",
+    "ask",
+    "i'd",
+    "out",
+    "near",
+    "seince",
+    "towards",
+    "tho",
+    "sice",
+    "will",
+]
+
+_relative_pronouns = ["this", "that", "those", "these"]
 
 
 MORPH_RULES = {
+    "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
+    "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
+    "NN": {
+        "something": {"POS": "PRON"},
+        "anyone": {"POS": "PRON"},
+        "anything": {"POS": "PRON"},
+        "nothing": {"POS": "PRON"},
+        "someone": {"POS": "PRON"},
+        "everything": {"POS": "PRON"},
+        "everyone": {"POS": "PRON"},
+        "everybody": {"POS": "PRON"},
+        "nobody": {"POS": "PRON"},
+        "somebody": {"POS": "PRON"},
+        "anybody": {"POS": "PRON"},
+        "any1": {"POS": "PRON"},
+    },
     "PRP": {
         "I": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
@@ -15,14 +99,16 @@ MORPH_RULES = {
         },
         "me": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
             "Case": "Acc",
         },
-        "you": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"},
+        "you": {LEMMA: PRON_LEMMA, "POS": "PRON", "PronType": "Prs", "Person": "Two"},
         "he": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -31,6 +117,7 @@ MORPH_RULES = {
         },
         "him": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -39,6 +126,7 @@ MORPH_RULES = {
         },
         "she": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -47,6 +135,7 @@ MORPH_RULES = {
         },
         "her": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -55,6 +144,7 @@ MORPH_RULES = {
         },
         "it": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Sing",
@@ -62,6 +152,7 @@ MORPH_RULES = {
         },
         "we": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Plur",
@@ -69,6 +160,7 @@ MORPH_RULES = {
         },
         "us": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Plur",
@@ -76,6 +168,7 @@ MORPH_RULES = {
         },
         "they": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Plur",
@@ -83,6 +176,7 @@ MORPH_RULES = {
         },
         "them": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
             "Number": "Plur",
@@ -90,6 +184,7 @@ MORPH_RULES = {
         },
         "mine": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "One",
             "Number": "Sing",
@@ -98,6 +193,7 @@ MORPH_RULES = {
         },
         "his": {
             LEMMA: PRON_LEMMA,
+            "POS": "PRON",
             "PronType": "Prs",
             "Person": "Three",
"Three", "Number": "Sing", @@ -107,6 +203,7 @@ MORPH_RULES = { }, "hers": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -116,6 +213,7 @@ MORPH_RULES = { }, "its": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -125,6 +223,7 @@ MORPH_RULES = { }, "ours": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Plur", @@ -133,6 +232,7 @@ MORPH_RULES = { }, "yours": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Two", "Number": "Plur", @@ -141,6 +241,7 @@ MORPH_RULES = { }, "theirs": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Plur", @@ -149,6 +250,7 @@ MORPH_RULES = { }, "myself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Sing", @@ -157,6 +259,7 @@ MORPH_RULES = { }, "yourself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Two", "Case": "Acc", @@ -164,6 +267,7 @@ MORPH_RULES = { }, "himself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -173,6 +277,7 @@ MORPH_RULES = { }, "herself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -182,6 +287,7 @@ MORPH_RULES = { }, "itself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -191,6 +297,7 @@ MORPH_RULES = { }, "themself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -199,6 +306,7 @@ MORPH_RULES = { }, "ourselves": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Plur", @@ -207,6 +315,7 @@ MORPH_RULES = { }, "yourselves": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Two", "Case": "Acc", @@ -214,6 +323,7 @@ MORPH_RULES = { }, "themselves": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Plur", @@ -269,9 +379,17 @@ MORPH_RULES = { "Poss": "Yes", }, }, + "RB": {word: {"POS": "PART"} for word in ["not", "n't", "nt", "n’t"]}, + "VB": { + word: {"POS": "AUX"} + for word in ["be", "have", "do", "get", "of", "am", "are", "'ve"] + }, + "VBN": {"been": {LEMMA: "be", "POS": "AUX"}}, + "VBG": {"being": {LEMMA: "be", "POS": "AUX"}}, "VBZ": { "am": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", @@ -279,6 +397,7 @@ MORPH_RULES = { }, "are": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", @@ -286,6 +405,7 @@ MORPH_RULES = { }, "is": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", @@ -293,6 +413,7 @@ MORPH_RULES = { }, "'re": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", @@ -300,26 +421,65 @@ MORPH_RULES = { }, "'s": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind", }, + "has": {LEMMA: "have", "POS": "AUX"}, + "does": {LEMMA: "do", "POS": "AUX"}, }, "VBP": { - "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}, - "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}, + "are": { + LEMMA: "be", + "POS": "AUX", + "VerbForm": "Fin", + "Tense": "Pres", + "Mood": "Ind", + }, + "'re": { + LEMMA: "be", + "POS": "AUX", + "VerbForm": "Fin", + "Tense": "Pres", + "Mood": "Ind", + }, "am": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind", }, + 
"do": {"POS": "AUX"}, + "have": {"POS": "AUX"}, + "'m": {"POS": "AUX", LEMMA: "be"}, + "'ve": {"POS": "AUX"}, + "'re": {"POS": "AUX", LEMMA: "be"}, + "'s": {"POS": "AUX"}, + "is": {"POS": "AUX"}, + "'d": {"POS": "AUX"}, }, "VBD": { - "was": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"}, - "were": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}, + "was": { + LEMMA: "be", + "POS": "AUX", + "VerbForm": "Fin", + "Tense": "Past", + "Number": "Sing", + }, + "were": { + LEMMA: "be", + "POS": "AUX", + "VerbForm": "Fin", + "Tense": "Past", + "Number": "Plur", + }, + "did": {LEMMA: "do", "POS": "AUX"}, + "had": {LEMMA: "have", "POS": "AUX"}, + "'d": {LEMMA: "have", "POS": "AUX"}, }, } diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index 67f43c53c..246258f57 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX TAG_MAP = { @@ -20,15 +20,15 @@ TAG_MAP = { "CC": {POS: CCONJ, "ConjType": "coor"}, "CD": {POS: NUM, "NumType": "card"}, "DT": {POS: DET}, - "EX": {POS: ADV, "AdvType": "ex"}, + "EX": {POS: PRON, "AdvType": "ex"}, "FW": {POS: X, "Foreign": "yes"}, "HYPH": {POS: PUNCT, "PunctType": "dash"}, "IN": {POS: ADP}, "JJ": {POS: ADJ, "Degree": "pos"}, "JJR": {POS: ADJ, "Degree": "comp"}, "JJS": {POS: ADJ, "Degree": "sup"}, - "LS": {POS: PUNCT, "NumType": "ord"}, - "MD": {POS: VERB, "VerbType": "mod"}, + "LS": {POS: X, "NumType": "ord"}, + "MD": {POS: AUX, "VerbType": "mod"}, "NIL": {POS: ""}, "NN": {POS: NOUN, "Number": "sing"}, "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, @@ -37,11 +37,11 @@ TAG_MAP = { "PDT": {POS: DET, "AdjType": "pdt", "PronType": "prn"}, "POS": {POS: PART, "Poss": "yes"}, "PRP": {POS: PRON, "PronType": "prs"}, - "PRP$": {POS: DET, "PronType": "prs", "Poss": "yes"}, + "PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"}, "RB": {POS: ADV, "Degree": "pos"}, "RBR": {POS: ADV, "Degree": "comp"}, "RBS": {POS: ADV, "Degree": "sup"}, - "RP": {POS: PART}, + "RP": {POS: ADP}, "SP": {POS: SPACE}, "SYM": {POS: SYM}, "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, @@ -58,9 +58,9 @@ TAG_MAP = { "Number": "sing", "Person": 3, }, - "WDT": {POS: DET, "PronType": "int|rel"}, + "WDT": {POS: PRON, "PronType": "int|rel"}, "WP": {POS: PRON, "PronType": "int|rel"}, - "WP$": {POS: DET, "Poss": "yes", "PronType": "int|rel"}, + "WP$": {POS: PRON, "Poss": "yes", "PronType": "int|rel"}, "WRB": {POS: ADV, "PronType": "int|rel"}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, diff --git a/spacy/tests/regression/test_issue3356.py b/spacy/tests/regression/test_issue3356.py new file mode 100644 index 000000000..4e27055c7 --- /dev/null +++ b/spacy/tests/regression/test_issue3356.py @@ -0,0 +1,70 @@ +import pytest +import re +from ... import compat + +prefix_search = ( + b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" + b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" 
+ b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}" + b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|" + b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|" + b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|" + b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|" + b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|" + b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|" + b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|" + b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|" + b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|" + b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|" + b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|" + b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F" + b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8" + b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17" + b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC" + b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940" + b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103" + b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125" + b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F" + b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4" + b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5" + b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B" + b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440" + b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2" + b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800" + b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76" + b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80" + b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004" + b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191" + b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250" + b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0" + b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77" + b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137" + b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E" + b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877" + b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45" + b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129" + b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C" + b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245" + b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A" + b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86" + b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0" + b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1" + b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6" + b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250" + b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400" + b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700" + b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810" + b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890" + 
b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940" + b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2" + b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF" + b"\\U0001FA60-\\U0001FA6D]" +) + + +if compat.is_python2: + # If we have this test in Python 3, pytest chokes, as it can't print the + # string above in the xpass message. + def test_issue3356(): + pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8"))) + assert not pattern.search(u"hello") diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 86c2d6ad3..e390a72b9 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -14,6 +14,7 @@ import re from .tokens.doc cimport Doc from .strings cimport hash_string +from .compat import unescape_unicode from .errors import Errors, Warnings, deprecation_warning from . import util @@ -428,6 +429,9 @@ cdef class Tokenizer: )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) + for key in ["prefix_search", "suffix_search", "infix_finditer"]: + if key in data: + data[key] = unescape_unicode(data[key]) if data.get("prefix_search"): self.prefix_search = re.compile(data["prefix_search"]).search if data.get("suffix_search"): diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 062307642..f7e9ce790 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -218,7 +218,7 @@ const Landing = ({ data }) => {

diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index 062307642..f7e9ce790 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -218,7 +218,7 @@ const Landing = ({ data }) => {
             Benchmarks
             In 2015, independent researchers from Emory University and Yahoo! Labs
-            showed that spaCy offered the
+            showed that spaCy offered the{' '}
             fastest syntactic parser in the world and that its accuracy was within
             1% of the best available (