diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 634756441..afedb933e 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -74,8 +74,8 @@ def debug_data(
 
     # Validate data format using the JSON schema
     # TODO: update once the new format is ready
-    train_data_errors = []  # TODO: validate_json(train_data, schema)
-    dev_data_errors = []  # TODO: validate_json(dev_data, schema)
+    train_data_errors = []  # TODO: validate_json
+    dev_data_errors = []  # TODO: validate_json
     if not train_data_errors:
         msg.good("Training data JSON format is valid")
     if not dev_data_errors:
diff --git a/spacy/errors.py b/spacy/errors.py
index a0da31a1e..2b63b6cda 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -325,6 +325,21 @@ class TempErrors(object):
     # fmt: on
 
 
+class MatchPatternError(ValueError):
+    def __init__(self, key, errors):
+        """Custom error for validating match patterns.
+
+        key (unicode): The name of the matcher rule.
+        errors (dict): Validation errors (sequence of strings) mapped to pattern
+            ID, i.e. the index of the added pattern.
+ """ + msg = "Invalid token patterns for matcher rule '{}'\n".format(key) + for pattern_idx, error_msgs in errors.items(): + pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs]) + msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors) + ValueError.__init__(self, msg) + + class ModelsWarning(UserWarning): pass diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py new file mode 100644 index 000000000..f4dfebb6b --- /dev/null +++ b/spacy/matcher/_schemas.py @@ -0,0 +1,172 @@ +# coding: utf8 +from __future__ import unicode_literals + + +TOKEN_PATTERN_SCHEMA = { + "$schema": "http://json-schema.org/draft-06/schema", + "definitions": { + "string_value": { + "anyOf": [ + {"type": "string"}, + { + "type": "object", + "properties": { + "REGEX": {"type": "string"}, + "IN": {"type": "array", "items": {"type": "string"}}, + "NOT_IN": {"type": "array", "items": {"type": "string"}}, + }, + "additionalProperties": False, + }, + ] + }, + "integer_value": { + "anyOf": [ + {"type": "integer"}, + { + "type": "object", + "properties": { + "REGEX": {"type": "string"}, + "IN": {"type": "array", "items": {"type": "integer"}}, + "NOT_IN": {"type": "array", "items": {"type": "integer"}}, + "==": {"type": "integer"}, + ">=": {"type": "integer"}, + "<=": {"type": "integer"}, + ">": {"type": "integer"}, + "<": {"type": "integer"}, + }, + "additionalProperties": False, + }, + ] + }, + "boolean_value": {"type": "boolean"}, + "underscore_value": { + "anyOf": [ + {"type": ["string", "integer", "number", "array", "boolean", "null"]}, + { + "type": "object", + "properties": { + "REGEX": {"type": "string"}, + "IN": { + "type": "array", + "items": {"type": ["string", "integer"]}, + }, + "NOT_IN": { + "type": "array", + "items": {"type": ["string", "integer"]}, + }, + "==": {"type": "integer"}, + ">=": {"type": "integer"}, + "<=": {"type": "integer"}, + ">": {"type": "integer"}, + "<": {"type": "integer"}, + }, + "additionalProperties": False, + }, + ] + }, + }, + 
"type": "array", + "items": { + "type": "object", + "properties": { + "ORTH": { + "title": "Verbatim token text", + "$ref": "#/definitions/string_value", + }, + "TEXT": { + "title": "Verbatim token text (spaCy v2.1+)", + "$ref": "#/definitions/string_value", + }, + "LOWER": { + "title": "Lowercase form of token text", + "$ref": "#/definitions/string_value", + }, + "POS": { + "title": "Coarse-grained part-of-speech tag", + "$ref": "#/definitions/string_value", + }, + "TAG": { + "title": "Fine-grained part-of-speech tag", + "$ref": "#/definitions/string_value", + }, + "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"}, + "LEMMA": { + "title": "Lemma (base form)", + "$ref": "#/definitions/string_value", + }, + "SHAPE": { + "title": "Abstract token shape", + "$ref": "#/definitions/string_value", + }, + "ENT_TYPE": { + "title": "Entity label of single token", + "$ref": "#/definitions/string_value", + }, + "LENGTH": { + "title": "Token character length", + "$ref": "#/definitions/integer_value", + }, + "IS_ALPHA": { + "title": "Token consists of alphanumeric characters", + "$ref": "#/definitions/boolean_value", + }, + "IS_ASCII": { + "title": "Token consists of ASCII characters", + "$ref": "#/definitions/boolean_value", + }, + "IS_DIGIT": { + "title": "Token consists of digits", + "$ref": "#/definitions/boolean_value", + }, + "IS_LOWER": { + "title": "Token is lowercase", + "$ref": "#/definitions/boolean_value", + }, + "IS_UPPER": { + "title": "Token is uppercase", + "$ref": "#/definitions/boolean_value", + }, + "IS_TITLE": { + "title": "Token is titlecase", + "$ref": "#/definitions/boolean_value", + }, + "IS_PUNCT": { + "title": "Token is punctuation", + "$ref": "#/definitions/boolean_value", + }, + "IS_SPACE": { + "title": "Token is whitespace", + "$ref": "#/definitions/boolean_value", + }, + "IS_STOP": { + "title": "Token is stop word", + "$ref": "#/definitions/boolean_value", + }, + "LIKE_NUM": { + "title": "Token resembles a number", + "$ref": 
"#/definitions/boolean_value", + }, + "LIKE_URL": { + "title": "Token resembles a URL", + "$ref": "#/definitions/boolean_value", + }, + "LIKE_EMAIL": { + "title": "Token resembles an email address", + "$ref": "#/definitions/boolean_value", + }, + "_": { + "title": "Custom extension token attributes (token._.)", + "type": "object", + "patternProperties": { + "^.*$": {"$ref": "#/definitions/underscore_value"} + }, + }, + "OP": { + "title": "Operators / quantifiers", + "type": "string", + "enum": ["+", "*", "?", "!"], + }, + }, + "additionalProperties": False, + }, +} diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 52e22046b..2e46cb376 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -62,6 +62,7 @@ cdef class Matcher: cdef Pool mem cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab + cdef public object validator cdef public object _patterns cdef public object _callbacks cdef public object _extensions diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 374431d30..f0783df3f 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -17,7 +17,9 @@ from ..tokens.doc cimport Doc, get_token_attr from ..tokens.token cimport Token from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH -from ..errors import Errors +from ._schemas import TOKEN_PATTERN_SCHEMA +from ..util import get_json_validator, validate_json +from ..errors import Errors, MatchPatternError from ..strings import get_string_id from ..attrs import IDS @@ -579,7 +581,7 @@ def _get_extensions(spec, string_store, name2index): cdef class Matcher: """Match sequences of tokens, based on pattern rules.""" - def __init__(self, vocab): + def __init__(self, vocab, validate=False): """Create the Matcher. 
vocab (Vocab): The vocabulary object, which must be shared with the @@ -593,6 +595,7 @@ cdef class Matcher: self._extra_predicates = [] self.vocab = vocab self.mem = Pool() + self.validator = get_json_validator(TOKEN_PATTERN_SCHEMA) if validate else None def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -643,9 +646,14 @@ cdef class Matcher: on_match (callable): Callback executed on match. *patterns (list): List of token descriptions. """ - for pattern in patterns: + errors = {} + for i, pattern in enumerate(patterns): if len(pattern) == 0: raise ValueError(Errors.E012.format(key=key)) + if self.validator: + errors[i] = validate_json(pattern, self.validator) + if errors: + raise MatchPatternError(key, errors) key = self._normalize_key(key) for pattern in patterns: specs = _preprocess_pattern(pattern, self.vocab.strings, diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 4abf275be..81c81a008 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -41,7 +41,7 @@ cdef class PhraseMatcher: self.mem = Pool() self.max_length = max_length self.vocab = vocab - self.matcher = Matcher(self.vocab) + self.matcher = Matcher(self.vocab, validate=False) if isinstance(attr, long): self.attr = attr else: diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index 684791499..a063a6569 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import pytest from spacy.cli._schemas import TRAINING_SCHEMA -from spacy.util import validate_json +from spacy.util import get_json_validator, validate_json from spacy.tokens import Doc from ..util import get_doc @@ -62,5 +62,6 @@ def test_doc_to_json_underscore_error_serialize(doc): def test_doc_to_json_valid_training(doc): json_doc = doc.to_json() - errors = validate_json([json_doc], TRAINING_SCHEMA) + validator = get_json_validator(TRAINING_SCHEMA) 
+    errors = validate_json([json_doc], validator)
     assert not errors
diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py
new file mode 100644
index 000000000..a0a8d3a14
--- /dev/null
+++ b/spacy/tests/matcher/test_pattern_validation.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.matcher import Matcher
+from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
+from spacy.errors import MatchPatternError
+from spacy.util import get_json_validator, validate_json
+
+
+@pytest.fixture
+def validator():
+    return get_json_validator(TOKEN_PATTERN_SCHEMA)
+
+
+@pytest.mark.parametrize(
+    "pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]]
+)
+def test_matcher_pattern_validation(en_vocab, pattern):
+    matcher = Matcher(en_vocab, validate=True)
+    with pytest.raises(MatchPatternError):
+        matcher.add("TEST", None, pattern)
+
+
+@pytest.mark.parametrize(
+    "pattern,n_errors",
+    [
+        # Bad patterns
+        ([{"XX": "foo"}], 1),
+        ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2),
+        ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2),
+        ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2),
+        ([{"TEXT": {"VALUE": "foo"}}], 1),
+        ([{"LENGTH": {"VALUE": 5}}], 1),
+        ([{"_": "foo"}], 1),
+        ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1),
+        ([{"IS_PUNCT": True, "OP": "$"}], 1),
+        # Good patterns
+        ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0),
+        ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0),
+        ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0),
+        ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0),
+        ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0),
+    ],
+)
+def test_pattern_validation(validator, pattern, n_errors):
+    errors = validate_json(pattern, validator)
+    assert len(errors) == n_errors
diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py
index ed1385a8b..89e797c1a 100644
--- a/spacy/tests/test_json_schemas.py
+++ b/spacy/tests/test_json_schemas.py
@@ -1,18 +1,24 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from spacy.util import validate_json, validate_schema
+from spacy.util import get_json_validator, validate_json, validate_schema
 from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA
+from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
 import pytest
 
 
+@pytest.fixture(scope="session")
+def training_schema_validator():
+    return get_json_validator(TRAINING_SCHEMA)
+
+
 def test_validate_schema():
     validate_schema({"type": "object"})
     with pytest.raises(Exception):
         validate_schema({"type": lambda x: x})
 
 
-@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA])
+@pytest.mark.parametrize("schema", [TRAINING_SCHEMA, META_SCHEMA, TOKEN_PATTERN_SCHEMA])
 def test_schemas(schema):
     validate_schema(schema)
 
@@ -24,8 +30,8 @@
         {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
     ],
 )
-def test_json_schema_training_valid(data):
-    errors = validate_json([data], TRAINING_SCHEMA)
+def test_json_schema_training_valid(data, training_schema_validator):
+    errors = validate_json([data], training_schema_validator)
     assert not errors
 
 
@@ -39,6 +45,6 @@
         ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2),
     ],
 )
-def test_json_schema_training_invalid(data, n_errors):
-    errors = validate_json([data], TRAINING_SCHEMA)
+def test_json_schema_training_invalid(data, n_errors, training_schema_validator):
+    errors = validate_json([data], training_schema_validator)
     assert len(errors) == n_errors
diff --git a/spacy/util.py b/spacy/util.py
index 5508a4fd2..26f3eac2b 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -627,28 +627,38 @@ def fix_random_seed(seed=0):
         cupy.random.seed(seed)
 
 
-def validate_schema(schema):
+def get_json_validator(schema):
+    # We're using a helper function here to make it easier to change the
+    # validator that's used (e.g. different draft implementation), without
+    # having to change it all across the codebase.
     # TODO: replace with (stable) Draft6Validator, if available
-    validator = Draft4Validator(schema)
+    return Draft4Validator(schema)
+
+
+def validate_schema(schema):
+    """Validate a given schema. This just checks if the schema itself is valid."""
+    validator = get_json_validator(schema)
     validator.check_schema(schema)
 
 
-def validate_json(data, schema):
+def validate_json(data, validator):
     """Validate data against a given JSON schema (see https://json-schema.org).
 
     data: JSON-serializable data to validate.
-    schema (dict): The JSON schema.
+    validator (jsonschema.DraftXValidator): The validator.
     RETURNS (list): A list of error messages, if available.
     """
-    # TODO: replace with (stable) Draft6Validator, if available
-    validator = Draft4Validator(schema)
     errors = []
     for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
         if err.path:
             err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
         else:
             err_path = ""
-        errors.append(err.message + " " + err_path)
+        msg = err.message + " " + err_path
+        if err.context:  # Error has suberrors, e.g. if schema uses anyOf
+            suberrs = [" - {}".format(suberr.message) for suberr in err.context]
+            msg += ":\n{}".format("\n".join(suberrs))
+        errors.append(msg)
     return errors