diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 640fb2f3c..dc8eed7c3 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,3 +1,6 @@ +from .errors import Errors + +IOB_STRINGS = ("", "I", "O", "B") IDS = { "": NULL_ATTR, @@ -64,7 +67,6 @@ IDS = { "FLAG61": FLAG61, "FLAG62": FLAG62, "FLAG63": FLAG63, - "ID": ID, "ORTH": ORTH, "LOWER": LOWER, @@ -72,7 +74,6 @@ IDS = { "SHAPE": SHAPE, "PREFIX": PREFIX, "SUFFIX": SUFFIX, - "LENGTH": LENGTH, "LEMMA": LEMMA, "POS": POS, @@ -87,7 +88,7 @@ IDS = { "SPACY": SPACY, "LANG": LANG, "MORPH": MORPH, - "IDX": IDX + "IDX": IDX, } @@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): """ inty_attrs = {} if _do_deprecated: - if 'F' in stringy_attrs: + if "F" in stringy_attrs: stringy_attrs["ORTH"] = stringy_attrs.pop("F") - if 'L' in stringy_attrs: + if "L" in stringy_attrs: stringy_attrs["LEMMA"] = stringy_attrs.pop("L") - if 'pos' in stringy_attrs: + if "pos" in stringy_attrs: stringy_attrs["TAG"] = stringy_attrs.pop("pos") - if 'morph' in stringy_attrs: - morphs = stringy_attrs.pop('morph') - if 'number' in stringy_attrs: - stringy_attrs.pop('number') - if 'tenspect' in stringy_attrs: - stringy_attrs.pop('tenspect') + if "morph" in stringy_attrs: + morphs = stringy_attrs.pop("morph") + if "number" in stringy_attrs: + stringy_attrs.pop("number") + if "tenspect" in stringy_attrs: + stringy_attrs.pop("tenspect") morph_keys = [ - 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number', - 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', - 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', - 'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr', - 'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm', - 'NumValue', 'PartType', 'Polite', 'StyleVariant', - 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', - 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', - 'Polarity', 'PrepCase', 'Animacy' # U20 + "PunctType", + "PunctSide", + "Other", + "Degree", + "AdvType", + "Number", + "VerbForm", + "PronType", + "Aspect", + "Tense", + "PartType", + "Poss", + "Hyph", + "ConjType", + "NumType", + "Foreign", + "VerbType", + "NounType", + "Gender", + "Mood", + "Negative", + "Tense", + "Voice", + "Abbr", + "Derivation", + "Echo", + "Foreign", + "NameType", + "NounType", + "NumForm", + "NumValue", + "PartType", + "Polite", + "StyleVariant", + "PronType", + "AdjType", + "Person", + "Variant", + "AdpType", + "Reflex", + "Negative", + "Mood", + "Aspect", + "Case", + "Polarity", + "PrepCase", + "Animacy", # U20 ] for key in morph_keys: if key in stringy_attrs: @@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): for name, value in stringy_attrs.items(): int_key = intify_attr(name) if int_key is not None: + if int_key == ENT_IOB: + if value in IOB_STRINGS: + value = IOB_STRINGS.index(value) + elif isinstance(value, str): + raise ValueError(Errors.E1025.format(value=value)) if strings_map is not None and isinstance(value, str): - if hasattr(strings_map, 'add'): + if hasattr(strings_map, "add"): value = strings_map.add(value) else: value = strings_map[value] diff --git a/spacy/errors.py b/spacy/errors.py index 673674222..390612123 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -888,11 +888,14 @@ class Errors(metaclass=ErrorsWithCodes): E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " "Non-UD tags should use the `tag` property.") E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") - E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.") - E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.") + E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't " + "exist.") + E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler " + "patterns.") + E1025 = ("Cannot intify the value '{value}' as an IOB string. The only " + "supported values are: 'I', 'O', 'B' and ''") - # Deprecated model shortcuts, only used in errors and warnings OLD_MODEL_SHORTCUTS = { "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm", diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 5350c1fe5..1c27c1744 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -1,4 +1,5 @@ import pytest +from spacy.attrs import intify_attrs, ENT_IOB from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs from spacy.lang.en.stop_words import STOP_WORDS @@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text): assert int_attrs == {ORTH: 10, IS_ALPHA: True} +def test_attrs_ent_iob_intify(): + int_attrs = intify_attrs({"ENT_IOB": ""}) + assert int_attrs == {ENT_IOB: 0} + + int_attrs = intify_attrs({"ENT_IOB": "I"}) + assert int_attrs == {ENT_IOB: 1} + + int_attrs = intify_attrs({"ENT_IOB": "O"}) + assert int_attrs == {ENT_IOB: 2} + + int_attrs = intify_attrs({"ENT_IOB": "B"}) + assert int_attrs == {ENT_IOB: 3} + + int_attrs = intify_attrs({ENT_IOB: ""}) + assert int_attrs == {ENT_IOB: 0} + + int_attrs = intify_attrs({ENT_IOB: "I"}) + assert int_attrs == {ENT_IOB: 1} + + int_attrs = intify_attrs({ENT_IOB: "O"}) + assert int_attrs == {ENT_IOB: 2} + + int_attrs = intify_attrs({ENT_IOB: "B"}) + assert int_attrs == {ENT_IOB: 3} + + with pytest.raises(ValueError): + int_attrs = intify_attrs({"ENT_IOB": "XX"}) + + with pytest.raises(ValueError): + int_attrs = intify_attrs({ENT_IOB: "XX"}) + + @pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)]) def test_lex_attrs_is_punct(text, match): assert is_punct(text) == match diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index c09ec28d6..b515ab67b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads from .. import parts_of_speech from ..errors import Errors, Warnings +from ..attrs import IOB_STRINGS from .underscore import Underscore, get_ext_args @@ -745,7 +746,7 @@ cdef class Token: @classmethod def iob_strings(cls): - return ("", "I", "O", "B") + return IOB_STRINGS @property def ent_iob_(self):