From 483dddc9bcb49f5193c3fe6a9d9dcf7ef12543f2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 12 Feb 2019 15:47:26 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Add=20token=20match=20pattern=20?= =?UTF-8?q?validation=20via=20JSON=20schemas=20(#3244)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add custom MatchPatternError * Improve validators and add validation option to Matcher * Adjust formatting * Never validate in Matcher within PhraseMatcher If we do decide to make validate default to True, the PhraseMatcher's Matcher shouldn't ever validate. Here, we create the patterns automatically anyways (and it's currently unclear whether the validation has performance impacts at a very large scale). --- spacy/cli/debug_data.py | 4 +- spacy/errors.py | 15 ++ spacy/matcher/_schemas.py | 172 ++++++++++++++++++ spacy/matcher/matcher.pxd | 1 + spacy/matcher/matcher.pyx | 14 +- spacy/matcher/phrasematcher.pyx | 2 +- spacy/tests/doc/test_to_json.py | 5 +- .../tests/matcher/test_pattern_validation.py | 48 +++++ spacy/tests/test_json_schemas.py | 18 +- spacy/util.py | 24 ++- 10 files changed, 282 insertions(+), 21 deletions(-) create mode 100644 spacy/matcher/_schemas.py create mode 100644 spacy/tests/matcher/test_pattern_validation.py diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 634756441..afedb933e 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -74,8 +74,8 @@ def debug_data( # Validate data format using the JSON schema # TODO: update once the new format is ready - train_data_errors = [] # TODO: validate_json(train_data, schema) - dev_data_errors = [] # TODO: validate_json(dev_data, schema) + train_data_errors = [] # TODO: validate_json + dev_data_errors = [] # TODO: validate_json if not train_data_errors: msg.good("Training data JSON format is valid") if not dev_data_errors: diff --git a/spacy/errors.py b/spacy/errors.py index a0da31a1e..2b63b6cda 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -325,6 +325,21 @@ class TempErrors(object): # fmt: on +class MatchPatternError(ValueError): + def __init__(self, key, errors): + """Custom error for validating match patterns. + + key (unicode): The name of the matcher rule. + errors (dict): Validation errors (sequence of strings) mapped to pattern + ID, i.e. the index of the added pattern. + """ + msg = "Invalid token patterns for matcher rule '{}'\n".format(key) + for pattern_idx, error_msgs in errors.items(): + pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs]) + msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors) + ValueError.__init__(self, msg) + + class ModelsWarning(UserWarning): pass diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py new file mode 100644 index 000000000..f4dfebb6b --- /dev/null +++ b/spacy/matcher/_schemas.py @@ -0,0 +1,172 @@ +# coding: utf8 +from __future__ import unicode_literals + + +TOKEN_PATTERN_SCHEMA = { + "$schema": "http://json-schema.org/draft-06/schema", + "definitions": { + "string_value": { + "anyOf": [ + {"type": "string"}, + { + "type": "object", + "properties": { + "REGEX": {"type": "string"}, + "IN": {"type": "array", "items": {"type": "string"}}, + "NOT_IN": {"type": "array", "items": {"type": "string"}}, + }, + "additionalProperties": False, + }, + ] + }, + "integer_value": { + "anyOf": [ + {"type": "integer"}, + { + "type": "object", + "properties": { + "REGEX": {"type": "string"}, + "IN": {"type": "array", "items": {"type": "integer"}}, + "NOT_IN": {"type": "array", "items": {"type": "integer"}}, + "==": {"type": "integer"}, + ">=": {"type": "integer"}, + "<=": {"type": "integer"}, + ">": {"type": "integer"}, + "<": {"type": "integer"}, + }, + "additionalProperties": False, + }, + ] + }, + "boolean_value": {"type": "boolean"}, + "underscore_value": { + "anyOf": [ + {"type": ["string", "integer", "number", "array", "boolean", "null"]}, + { + "type": "object", + "properties": { + "REGEX": {"type": "string"}, + "IN": { + "type": "array", + "items": {"type": ["string", "integer"]}, + }, + "NOT_IN": { + "type": "array", + "items": {"type": ["string", "integer"]}, + }, + "==": {"type": "integer"}, + ">=": {"type": "integer"}, + "<=": {"type": "integer"}, + ">": {"type": "integer"}, + "<": {"type": "integer"}, + }, + "additionalProperties": False, + }, + ] + }, + }, + "type": "array", + "items": { + "type": "object", + "properties": { + "ORTH": { + "title": "Verbatim token text", + "$ref": "#/definitions/string_value", + }, + "TEXT": { + "title": "Verbatim token text (spaCy v2.1+)", + "$ref": "#/definitions/string_value", + }, + "LOWER": { + "title": "Lowercase form of token text", + "$ref": "#/definitions/string_value", + }, + "POS": { + "title": "Coarse-grained part-of-speech tag", + "$ref": "#/definitions/string_value", + }, + "TAG": { + "title": "Fine-grained part-of-speech tag", + "$ref": "#/definitions/string_value", + }, + "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"}, + "LEMMA": { + "title": "Lemma (base form)", + "$ref": "#/definitions/string_value", + }, + "SHAPE": { + "title": "Abstract token shape", + "$ref": "#/definitions/string_value", + }, + "ENT_TYPE": { + "title": "Entity label of single token", + "$ref": "#/definitions/string_value", + }, + "LENGTH": { + "title": "Token character length", + "$ref": "#/definitions/integer_value", + }, + "IS_ALPHA": { + "title": "Token consists of alphanumeric characters", + "$ref": "#/definitions/boolean_value", + }, + "IS_ASCII": { + "title": "Token consists of ASCII characters", + "$ref": "#/definitions/boolean_value", + }, + "IS_DIGIT": { + "title": "Token consists of digits", + "$ref": "#/definitions/boolean_value", + }, + "IS_LOWER": { + "title": "Token is lowercase", + "$ref": "#/definitions/boolean_value", + }, + "IS_UPPER": { + "title": "Token is uppercase", + "$ref": "#/definitions/boolean_value", + }, + "IS_TITLE": { + "title": "Token is titlecase", + "$ref": "#/definitions/boolean_value", + }, + "IS_PUNCT": { + "title": "Token is punctuation", + "$ref": "#/definitions/boolean_value", + }, + "IS_SPACE": { + "title": "Token is whitespace", + "$ref": "#/definitions/boolean_value", + }, + "IS_STOP": { + "title": "Token is stop word", + "$ref": "#/definitions/boolean_value", + }, + "LIKE_NUM": { + "title": "Token resembles a number", + "$ref": "#/definitions/boolean_value", + }, + "LIKE_URL": { + "title": "Token resembles a URL", + "$ref": "#/definitions/boolean_value", + }, + "LIKE_EMAIL": { + "title": "Token resembles an email address", + "$ref": "#/definitions/boolean_value", + }, + "_": { + "title": "Custom extension token attributes (token._.)", + "type": "object", + "patternProperties": { + "^.*$": {"$ref": "#/definitions/underscore_value"} + }, + }, + "OP": { + "title": "Operators / quantifiers", + "type": "string", + "enum": ["+", "*", "?", "!"], + }, + }, + "additionalProperties": False, + }, +} diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 52e22046b..2e46cb376 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -62,6 +62,7 @@ cdef class Matcher: cdef Pool mem cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab + cdef public object validator cdef public object _patterns cdef public object _callbacks cdef public object _extensions diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 374431d30..f0783df3f 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -17,7 +17,9 @@ from ..tokens.doc cimport Doc, get_token_attr from ..tokens.token cimport Token from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH -from ..errors import Errors +from ._schemas import TOKEN_PATTERN_SCHEMA +from ..util import get_json_validator, validate_json +from ..errors import Errors, MatchPatternError from ..strings import get_string_id from ..attrs import IDS @@ -579,7 +581,7 @@ def _get_extensions(spec, string_store, name2index): cdef class Matcher: """Match sequences of tokens, based on pattern rules.""" - def __init__(self, vocab): + def __init__(self, vocab, validate=False): """Create the Matcher. vocab (Vocab): The vocabulary object, which must be shared with the @@ -593,6 +595,7 @@ cdef class Matcher: self._extra_predicates = [] self.vocab = vocab self.mem = Pool() + self.validator = get_json_validator(TOKEN_PATTERN_SCHEMA) if validate else None def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -643,9 +646,14 @@ cdef class Matcher: on_match (callable): Callback executed on match. *patterns (list): List of token descriptions. """ - for pattern in patterns: + errors = {} + for i, pattern in enumerate(patterns): if len(pattern) == 0: raise ValueError(Errors.E012.format(key=key)) + if self.validator: + errors[i] = validate_json(pattern, self.validator) + if errors: + raise MatchPatternError(key, errors) key = self._normalize_key(key) for pattern in patterns: specs = _preprocess_pattern(pattern, self.vocab.strings, diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 4abf275be..81c81a008 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -41,7 +41,7 @@ cdef class PhraseMatcher: self.mem = Pool() self.max_length = max_length self.vocab = vocab - self.matcher = Matcher(self.vocab) + self.matcher = Matcher(self.vocab, validate=False) if isinstance(attr, long): self.attr = attr else: diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index 684791499..a063a6569 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import pytest from spacy.cli._schemas import TRAINING_SCHEMA -from spacy.util import validate_json +from spacy.util import get_json_validator, validate_json from spacy.tokens import Doc from ..util import get_doc @@ -62,5 +62,6 @@ def test_doc_to_json_underscore_error_serialize(doc): def test_doc_to_json_valid_training(doc): json_doc = doc.to_json() - errors = validate_json([json_doc], TRAINING_SCHEMA) + validator = get_json_validator(TRAINING_SCHEMA) + errors = validate_json([json_doc], validator) assert not errors diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py new file mode 100644 index 000000000..a0a8d3a14 --- /dev/null +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.matcher import Matcher +from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA +from spacy.errors import MatchPatternError +from spacy.util import get_json_validator, validate_json + + +@pytest.fixture +def validator(): + return get_json_validator(TOKEN_PATTERN_SCHEMA) + + +@pytest.mark.parametrize( + "pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]] +) +def test_matcher_pattern_validation(en_vocab, pattern): + matcher = Matcher(en_vocab, validate=True) + with pytest.raises(MatchPatternError): + matcher.add("TEST", None, pattern) + + +@pytest.mark.parametrize( + "pattern,n_errors", + [ + # Bad patterns + ([{"XX": "foo"}], 1), + ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2), + ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2), + ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2), + ([{"TEXT": {"VALUE": "foo"}}], 1), + ([{"LENGTH": {"VALUE": 5}}], 1), + ([{"_": "foo"}], 1), + ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1), + ([{"IS_PUNCT": True, "OP": "$"}], 1), + # Good patterns + ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0), + ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0), + ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0), + ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0), + ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0), + ], +) +def test_pattern_validation(validator, pattern, n_errors): + errors = validate_json(pattern, validator) + assert len(errors) == n_errors diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py index ed1385a8b..89e797c1a 100644 --- a/spacy/tests/test_json_schemas.py +++ b/spacy/tests/test_json_schemas.py @@ -1,18 +1,24 @@ # coding: utf-8 from __future__ import unicode_literals -from spacy.util import validate_json, validate_schema +from spacy.util import get_json_validator, validate_json, validate_schema from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA +from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA import pytest +@pytest.fixture(scope="session") +def training_schema_validator(): + return get_json_validator(TRAINING_SCHEMA) + + def test_validate_schema(): validate_schema({"type": "object"}) with pytest.raises(Exception): validate_schema({"type": lambda x: x}) -@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA]) +@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA, TOKEN_PATTERN_SCHEMA]) def test_schemas(schema): validate_schema(schema) @@ -24,8 +30,8 @@ def test_schemas(schema): {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]}, ], ) -def test_json_schema_training_valid(data): - errors = validate_json([data], TRAINING_SCHEMA) +def test_json_schema_training_valid(data, training_schema_validator): + errors = validate_json([data], training_schema_validator) assert not errors @@ -39,6 +45,6 @@ def test_json_schema_training_valid(data): ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2), ], ) -def test_json_schema_training_invalid(data, n_errors): - errors = validate_json([data], TRAINING_SCHEMA) +def test_json_schema_training_invalid(data, n_errors, training_schema_validator): + errors = validate_json([data], training_schema_validator) assert len(errors) == n_errors diff --git a/spacy/util.py b/spacy/util.py index 5508a4fd2..26f3eac2b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -627,28 +627,38 @@ def fix_random_seed(seed=0): cupy.random.seed(seed) -def validate_schema(schema): +def get_json_validator(schema): + # We're using a helper function here to make it easier to change the + # validator that's used (e.g. different draft implementation), without + # having to change it all across the codebase. # TODO: replace with (stable) Draft6Validator, if available - validator = Draft4Validator(schema) + return Draft4Validator(schema) + + +def validate_schema(schema): + """Validate a given schema. This just checks if the schema itself is valid.""" + validator = get_json_validator(schema) validator.check_schema(schema) -def validate_json(data, schema): +def validate_json(data, validator): """Validate data against a given JSON schema (see https://json-schema.org). data: JSON-serializable data to validate. - schema (dict): The JSON schema. + validator (jsonschema.DraftXValidator): The validator. RETURNS (list): A list of error messages, if available. """ - # TODO: replace with (stable) Draft6Validator, if available - validator = Draft4Validator(schema) errors = [] for err in sorted(validator.iter_errors(data), key=lambda e: e.path): if err.path: err_path = "[{}]".format(" -> ".join([str(p) for p in err.path])) else: err_path = "" - errors.append(err.message + " " + err_path) + msg = err.message + " " + err_path + if err.context: # Error has suberrors, e.g. if schema uses anyOf + suberrs = [" - {}".format(suberr.message) for suberr in err.context] + msg += ":\n{}".format("".join(suberrs)) + errors.append(msg) return errors