From 04395ffa49409f669147168c8966ac21335cbb4d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 21 Mar 2019 13:53:44 +0100 Subject: [PATCH 01/10] Bring English tag_map in line with UD Treebank I wrote a small script to read the UD English training data and check that our tag map and morph rules were resulting in the best POS map. This hadn't been done for some time, and there have been various changes to the UD schema since it has been done. After these changes we should see much better agreement between our POS assignments and the UD POS tags. --- spacy/lang/en/morph_rules.py | 172 +++++++++++++++++++++++++++++++++-- spacy/lang/en/tag_map.py | 16 ++-- 2 files changed, 174 insertions(+), 14 deletions(-) diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py index d073e27a5..c19507d8a 100644 --- a/spacy/lang/en/morph_rules.py +++ b/spacy/lang/en/morph_rules.py @@ -1,13 +1,97 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import LEMMA, PRON_LEMMA +from ...symbols import LEMMA, PRON_LEMMA, AUX +_subordinating_conjunctions = [ + "that", + "if", + "as", + "because", + "of", + "for", + "before", + "in", + "while", + "after", + "since", + "like", + "with", + "so", + "to", + "by", + "on", + "about", + "than", + "whether", + "although", + "from", + "though", + "until", + "unless", + "once", + "without", + "at", + "into", + "cause", + "over", + "upon", + "till", + "whereas", + "beyond", + "whilst", + "except", + "despite", + "wether", + "then", + "but", + "becuse", + "whie", + "below", + "against", + "it", + "w/out", + "toward", + "albeit", + "save", + "besides", + "becouse", + "coz", + "til", + "ask", + "i'd", + "out", + "near", + "seince", + "towards", + "tho", + "sice", + "will", +] + +_relative_pronouns = ["this", "that", "those", "these"] MORPH_RULES = { + "DT": {word: {"POS": "PRON"} for word in _relative_pronouns}, + "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions}, + "NN": { + "something": {"POS": "PRON"}, + "anyone": {"POS": "PRON"}, + "anything": {"POS": "PRON"}, + "nothing": {"POS": "PRON"}, + "someone": {"POS": "PRON"}, + "everything": {"POS": "PRON"}, + "everyone": {"POS": "PRON"}, + "everybody": {"POS": "PRON"}, + "nobody": {"POS": "PRON"}, + "somebody": {"POS": "PRON"}, + "anybody": {"POS": "PRON"}, + "any1": {"POS": "PRON"}, + }, "PRP": { "I": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Sing", @@ -15,14 +99,16 @@ MORPH_RULES = { }, "me": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc", }, - "you": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"}, + "you": {LEMMA: PRON_LEMMA, "POS": "PRON", "PronType": "Prs", "Person": "Two"}, "he": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -31,6 +117,7 @@ MORPH_RULES = { }, "him": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -39,6 +126,7 @@ MORPH_RULES = { }, "she": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -47,6 +135,7 @@ MORPH_RULES = { }, "her": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -55,6 +144,7 @@ MORPH_RULES = { }, "it": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -62,6 +152,7 @@ MORPH_RULES = { }, "we": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Plur", @@ 
-69,6 +160,7 @@ MORPH_RULES = { }, "us": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Plur", @@ -76,6 +168,7 @@ MORPH_RULES = { }, "they": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Plur", @@ -83,6 +176,7 @@ MORPH_RULES = { }, "them": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Plur", @@ -90,6 +184,7 @@ MORPH_RULES = { }, "mine": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Sing", @@ -98,6 +193,7 @@ MORPH_RULES = { }, "his": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -107,6 +203,7 @@ MORPH_RULES = { }, "hers": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -116,6 +213,7 @@ MORPH_RULES = { }, "its": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -125,6 +223,7 @@ MORPH_RULES = { }, "ours": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Plur", @@ -133,6 +232,7 @@ MORPH_RULES = { }, "yours": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Two", "Number": "Plur", @@ -141,6 +241,7 @@ MORPH_RULES = { }, "theirs": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Plur", @@ -149,6 +250,7 @@ MORPH_RULES = { }, "myself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Sing", @@ -157,6 +259,7 @@ MORPH_RULES = { }, "yourself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Two", "Case": "Acc", @@ -164,6 +267,7 @@ MORPH_RULES = { }, "himself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -173,6 +277,7 @@ MORPH_RULES = { }, "herself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -182,6 +287,7 @@ MORPH_RULES = { }, "itself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -191,6 +297,7 @@ MORPH_RULES = { }, "themself": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Sing", @@ -199,6 +306,7 @@ MORPH_RULES = { }, "ourselves": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "One", "Number": "Plur", @@ -207,6 +315,7 @@ MORPH_RULES = { }, "yourselves": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Two", "Case": "Acc", @@ -214,6 +323,7 @@ MORPH_RULES = { }, "themselves": { LEMMA: PRON_LEMMA, + "POS": "PRON", "PronType": "Prs", "Person": "Three", "Number": "Plur", @@ -269,9 +379,17 @@ MORPH_RULES = { "Poss": "Yes", }, }, + "RB": {word: {"POS": "PART"} for word in ["not", "n't", "nt", "n’t"]}, + "VB": { + word: {"POS": "AUX"} + for word in ["be", "have", "do", "get", "of", "am", "are", "'ve"] + }, + "VBN": {"been": {LEMMA: "be", "POS": "AUX"}}, + "VBG": {"being": {LEMMA: "be", "POS": "AUX"}}, "VBZ": { "am": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", @@ -279,6 +397,7 @@ MORPH_RULES = { }, "are": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", @@ -286,6 +405,7 @@ MORPH_RULES = { }, "is": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", @@ -293,6 +413,7 @@ MORPH_RULES = { }, "'re": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", @@ -300,26 +421,65 @@ MORPH_RULES = { }, "'s": 
{ LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind", }, + "has": {"POS": "AUX"}, + "does": {"POS": "AUX"}, }, "VBP": { - "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}, - "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}, + "are": { + LEMMA: "be", + "POS": "AUX", + "VerbForm": "Fin", + "Tense": "Pres", + "Mood": "Ind", + }, + "'re": { + LEMMA: "be", + "POS": "AUX", + "VerbForm": "Fin", + "Tense": "Pres", + "Mood": "Ind", + }, "am": { LEMMA: "be", + "POS": "AUX", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind", }, + "do": {"POS": "AUX"}, + "have": {"POS": "AUX"}, + "'m": {"POS": "AUX", LEMMA: "be"}, + "'ve": {"POS": "AUX"}, + "'re": {"POS": "AUX", LEMMA: "be"}, + "'s": {"POS": "AUX"}, + "is": {"POS": "AUX"}, + "'d": {"POS": "AUX"}, }, "VBD": { - "was": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"}, - "were": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}, + "was": { + LEMMA: "be", + "POS": "AUX", + "VerbForm": "Fin", + "Tense": "Past", + "Number": "Sing", + }, + "were": { + LEMMA: "be", + "POS": "AUX", + "VerbForm": "Fin", + "Tense": "Past", + "Number": "Plur", + }, + "did": {"POS": "AUX"}, + "had": {"POS": "AUX"}, + "'d": {"POS": "AUX"}, }, } diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index 67f43c53c..246258f57 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX TAG_MAP = { @@ -20,15 +20,15 @@ TAG_MAP = { "CC": {POS: CCONJ, "ConjType": "coor"}, "CD": {POS: NUM, "NumType": "card"}, "DT": {POS: DET}, - "EX": {POS: ADV, "AdvType": "ex"}, + "EX": {POS: PRON, "AdvType": "ex"}, "FW": {POS: X, "Foreign": "yes"}, "HYPH": {POS: PUNCT, "PunctType": "dash"}, "IN": {POS: ADP}, "JJ": {POS: ADJ, "Degree": "pos"}, "JJR": {POS: ADJ, "Degree": "comp"}, "JJS": {POS: ADJ, "Degree": "sup"}, - "LS": {POS: PUNCT, "NumType": "ord"}, - "MD": {POS: VERB, "VerbType": "mod"}, + "LS": {POS: X, "NumType": "ord"}, + "MD": {POS: AUX, "VerbType": "mod"}, "NIL": {POS: ""}, "NN": {POS: NOUN, "Number": "sing"}, "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, @@ -37,11 +37,11 @@ TAG_MAP = { "PDT": {POS: DET, "AdjType": "pdt", "PronType": "prn"}, "POS": {POS: PART, "Poss": "yes"}, "PRP": {POS: PRON, "PronType": "prs"}, - "PRP$": {POS: DET, "PronType": "prs", "Poss": "yes"}, + "PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"}, "RB": {POS: ADV, "Degree": "pos"}, "RBR": {POS: ADV, "Degree": "comp"}, "RBS": {POS: ADV, "Degree": "sup"}, - "RP": {POS: PART}, + "RP": {POS: ADP}, "SP": {POS: SPACE}, "SYM": {POS: SYM}, "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, @@ -58,9 +58,9 @@ TAG_MAP = { "Number": "sing", "Person": 3, }, - "WDT": {POS: DET, "PronType": "int|rel"}, + "WDT": {POS: PRON, "PronType": "int|rel"}, "WP": {POS: PRON, "PronType": "int|rel"}, - "WP$": {POS: DET, "Poss": "yes", "PronType": "int|rel"}, + "WP$": {POS: PRON, "Poss": "yes", "PronType": "int|rel"}, "WRB": {POS: ADV, "PronType": "int|rel"}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, From c66bd61e88a3f44331627938bac022ca2dd59cb2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 21 Mar 2019 14:22:12 +0100 Subject: [PATCH 02/10] Fix lemmas --- spacy/lang/en/morph_rules.py | 10 
+++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py index c19507d8a..54a108d53 100644 --- a/spacy/lang/en/morph_rules.py +++ b/spacy/lang/en/morph_rules.py @@ -427,8 +427,8 @@ MORPH_RULES = { "Tense": "Pres", "Mood": "Ind", }, - "has": {"POS": "AUX"}, - "does": {"POS": "AUX"}, + "has": {LEMMA: "have", "POS": "AUX"}, + "does": {LEMMA: "do", "POS": "AUX"}, }, "VBP": { "are": { @@ -477,9 +477,9 @@ MORPH_RULES = { "Tense": "Past", "Number": "Plur", }, - "did": {"POS": "AUX"}, - "had": {"POS": "AUX"}, - "'d": {"POS": "AUX"}, + "did": {LEMMA: "do", "POS": "AUX"}, + "had": {LEMMA: "have", "POS": "AUX"}, + "'d": {LEMMA: "have", "POS": "AUX"}, }, } From a2ad9832e55187cbb24928ca0b6f86b24c62afe5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 22 Mar 2019 02:42:37 +0100 Subject: [PATCH 03/10] Add failing test for #3356 --- spacy/tests/regression/test_issue3356.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 spacy/tests/regression/test_issue3356.py diff --git a/spacy/tests/regression/test_issue3356.py b/spacy/tests/regression/test_issue3356.py new file mode 100644 index 000000000..b887fb664 --- /dev/null +++ b/spacy/tests/regression/test_issue3356.py @@ -0,0 +1,10 @@ +import pytest +import re + +prefix_search = b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010
137-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF\\U0001FA60-\\U0001FA6D]" + + +@pytest.xfail +def test_issue3356(): + pattern = re.compile(prefix_search.decode("utf8")) + assert not pattern.search(u"hello") From d811c97da11d0b555ccadfdae7c88a53547424fa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 22 Mar 2019 10:28:51 +0100 Subject: [PATCH 04/10] Fix test that caused pytest to choke on Python3 --- spacy/tests/regression/test_issue3356.py | 71 ++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/spacy/tests/regression/test_issue3356.py b/spacy/tests/regression/test_issue3356.py index b887fb664..2ef847746 100644 --- a/spacy/tests/regression/test_issue3356.py +++ b/spacy/tests/regression/test_issue3356.py @@ -1,10 +1,71 @@ import pytest import re +from ... 
import compat -prefix_search = b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF\\U000
1FA60-\\U0001FA6D]" +prefix_search = ( + b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" + b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" + b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}" + b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|" + b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|" + b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|" + b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|" + b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|" + b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|" + b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|" + b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|" + b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|" + b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|" + b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|" + b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F" + b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8" + b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17" + b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC" + b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940" + b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103" + b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125" + b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F" + b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4" + b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5" + b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B" + b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440" + b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2" + b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800" + b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76" + b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80" + b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004" + b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191" + b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250" + b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0" + b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77" + b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137" + b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E" + b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877" + b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45" + b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129" + b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C" + b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245" + b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A" + b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86" + b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0" + b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1" + b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6" + b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250" + b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400" + b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700" + 
b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810" + b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890" + b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940" + b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2" + b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF" + b"\\U0001FA60-\\U0001FA6D]" +) -@pytest.xfail -def test_issue3356(): - pattern = re.compile(prefix_search.decode("utf8")) - assert not pattern.search(u"hello") +if compat.is_python2: + # If we have this test in Python 3, pytest chokes, as it can't print the + # string above in the xpass message. + @pytest.xfail + def test_issue3356(): + pattern = re.compile(prefix_search.decode("utf8")) + assert not pattern.search(u"hello") From 188ccd575072704ea870fac60b5cde4c29d822bd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 Mar 2019 12:54:14 +0100 Subject: [PATCH 05/10] Fix xfail marker --- spacy/tests/regression/test_issue3356.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue3356.py b/spacy/tests/regression/test_issue3356.py index 2ef847746..c14fa8525 100644 --- a/spacy/tests/regression/test_issue3356.py +++ b/spacy/tests/regression/test_issue3356.py @@ -65,7 +65,7 @@ prefix_search = ( if compat.is_python2: # If we have this test in Python 3, pytest chokes, as it can't print the # string above in the xpass message. - @pytest.xfail + @pytest.mark.xfail def test_issue3356(): pattern = re.compile(prefix_search.decode("utf8")) assert not pattern.search(u"hello") From c81923ee30a89fe29eaf4ee12d1c5773e01a44a9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 Mar 2019 13:31:58 +0100 Subject: [PATCH 06/10] Update wasabi pin --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index e61a029c9..bf95839b5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ preshed>=2.0.1,<2.1.0 thinc>=7.0.2,<7.1.0 blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 -wasabi>=0.1.3,<1.1.0 +wasabi>=0.2.0,<1.1.0 srsly>=0.0.5,<1.1.0 # Third party dependencies numpy>=1.15.0 diff --git a/setup.py b/setup.py index 6f29e1efa..ed030eaf0 100755 --- a/setup.py +++ b/setup.py @@ -232,7 +232,7 @@ def setup_package(): "plac<1.0.0,>=0.9.6", "requests>=2.13.0,<3.0.0", "jsonschema>=2.6.0,<3.0.0", - "wasabi>=0.0.12,<1.1.0", + "wasabi>=0.2.0,<1.1.0", "srsly>=0.0.5,<1.1.0", 'pathlib==1.0.1; python_version < "3.4"', ], From e65b5bb9a027af0c290c41a90b89728e0e77f9ae Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 22 Mar 2019 13:42:47 +0100 Subject: [PATCH 07/10] Fix tokenizer on Python2.7 (#3460) spaCy v2.1 switched to the built-in re module, where v2.0 had been using the third-party regex library. When the tokenizer was deserialized on Python2.7, the `re.compile()` function was called with expressions that featured escaped unicode codepoints that were not in Python2.7's unicode database. Problems occurred when we had a range between two of these unknown codepoints, like this: ``` '[\\uAA77-\\uAA79]' ``` On Python2.7, the unknown codepoints are not unescaped correctly, resulting in arbitrary out-of-range characters being matched by the expression. This problem does not occur if we instead have a range between two unicode literals, rather than the escape sequences. To fix the bug, we therefore add a new compat function that unescapes unicode sequences using the `ast.literal_eval()` function. 
Care is taken to ensure we do not also escape non-unicode sequences. Closes #3356. - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. --- spacy/compat.py | 24 ++++++++++++++++++++++++ spacy/tests/regression/test_issue3356.py | 3 +-- spacy/tokenizer.pyx | 4 ++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/spacy/compat.py b/spacy/compat.py index 8af49f254..997e8787b 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -11,6 +11,7 @@ from __future__ import unicode_literals import os import sys import itertools +import ast from thinc.neural.util import copy_array @@ -150,3 +151,26 @@ def import_file(name, loc): module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) return module + + +def unescape_unicode(string): + """Python2.7's re module chokes when compiling patterns that have ranges + between escaped unicode codepoints if the two codepoints are unrecognised + in the unicode database. For instance: + + re.compile('[\\uAA77-\\uAA79]').findall("hello") + + Ends up matching every character (on Python 2). This problem doesn't occur + if we're dealing with unicode literals. + """ + if string is None: + return string + # We only want to unescape the unicode, so we first must protect the other + # backslashes. + string = string.replace("\\", "\\\\") + # Now we remove that protection for the unicode. + string = string.replace("\\\\u", "\\u") + string = string.replace("\\\\U", "\\U") + # Now we unescape by evaling the string with the AST. This can't execute + # code -- it only does the representational level. + return ast.literal_eval("u'''" + string + "'''") diff --git a/spacy/tests/regression/test_issue3356.py b/spacy/tests/regression/test_issue3356.py index c14fa8525..4e27055c7 100644 --- a/spacy/tests/regression/test_issue3356.py +++ b/spacy/tests/regression/test_issue3356.py @@ -65,7 +65,6 @@ prefix_search = ( if compat.is_python2: # If we have this test in Python 3, pytest chokes, as it can't print the # string above in the xpass message. - @pytest.mark.xfail def test_issue3356(): - pattern = re.compile(prefix_search.decode("utf8")) + pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8"))) assert not pattern.search(u"hello") diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 86c2d6ad3..e390a72b9 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -14,6 +14,7 @@ import re from .tokens.doc cimport Doc from .strings cimport hash_string +from .compat import unescape_unicode from .errors import Errors, Warnings, deprecation_warning from . 
import util @@ -428,6 +429,9 @@ cdef class Tokenizer: )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) + for key in ["prefix_search", "suffix_search", "infix_finditer"]: + if key in data: + data[key] = unescape_unicode(data[key]) if data.get("prefix_search"): self.prefix_search = re.compile(data["prefix_search"]).search if data.get("suffix_search"): From c9bd0e5a966e5a2069a96c474e23713b3d5e5b58 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 Mar 2019 13:44:47 +0100 Subject: [PATCH 08/10] Set version to 2.1.2 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 7b1f36c40..1592620ae 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.1.1" +__version__ = "2.1.2" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From 5073ce63fd6ae3b9df2c8923e168220bb5c9765e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 Mar 2019 15:17:11 +0100 Subject: [PATCH 09/10] Merge branch 'spacy.io' [ci skip] --- website/docs/usage/index.md | 2 +- website/docs/usage/v2-1.md | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 222f1408c..1ffd0de0d 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -8,7 +8,7 @@ menu: - ['Changelog', 'changelog'] --- -spaCy is compatible with **64-bit CPython 2.7+/3.5+** and runs on +spaCy is compatible with **64-bit CPython 2.7 / 3.5+** and runs on **Unix/Linux**, **macOS/OS X** and **Windows**. The latest spaCy releases are available over [pip](https://pypi.python.org/pypi/spacy) and [conda](https://anaconda.org/conda-forge/spacy). diff --git a/website/docs/usage/v2-1.md b/website/docs/usage/v2-1.md index 35ec20f4f..271440dba 100644 --- a/website/docs/usage/v2-1.md +++ b/website/docs/usage/v2-1.md @@ -212,9 +212,8 @@ if all of your models are up to date, you can run the - Due to difficulties linking our new [`blis`](https://github.com/explosion/cython-blis) for faster - platform-independent matrix multiplication, this nightly release currently - **doesn't work on Python 2.7 on Windows**. We expect this to be corrected in - the future. + platform-independent matrix multiplication, this release currently **doesn't + work on Python 2.7 on Windows**. We expect this to be corrected in the future. - While the [`Matcher`](/api/matcher) API is fully backwards compatible, its algorithm has changed to fix a number of bugs and performance issues. This @@ -250,9 +249,14 @@ if all of your models are up to date, you can run the + data = nlp.tokenizer.to_bytes(exclude=["vocab"]) ``` +- The .pos value for several common English words has changed, due to + corrections to long-standing mistakes in the English tag map (see + [issue #593](https://github.com/explosion/spaCy/issues/593) and + [issue #3311](https://github.com/explosion/spaCy/issues/3311) for details). + - For better compatibility with the Universal Dependencies data, the lemmatizer now preserves capitalization, e.g. for proper nouns. See - [this issue](https://github.com/explosion/spaCy/issues/3256) for details. + [issue #3256](https://github.com/explosion/spaCy/issues/3256) for details. 
- The built-in rule-based sentence boundary detector is now only called `"sentencizer"` – the name `"sbd"` is deprecated. From 9cee3f702a429c5a6f69cb134c1ee197653bc830 Mon Sep 17 00:00:00 2001 From: Christos Aridas Date: Fri, 22 Mar 2019 16:17:35 +0200 Subject: [PATCH 10/10] Add missing space in landing page (#3462) [ci skip] --- website/src/widgets/landing.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 062307642..f7e9ce790 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -218,7 +218,7 @@ const Landing = ({ data }) => {

Benchmarks

In 2015, independent researchers from Emory University and Yahoo! Labs - showed that spaCy offered the + showed that spaCy offered the{' '} fastest syntactic parser in the world and that its accuracy was within 1% of the best available (