diff --git a/pyproject.toml b/pyproject.toml index 14a2d7690..0ceda4454 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ requires = [ "murmurhash>=0.28.0,<1.1.0", "thinc>=8.0.0rc0,<8.1.0", "blis>=0.4.0,<0.8.0", - "pytokenizations", "pathy" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index c5e136a34..3a777f163 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,6 @@ numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.5.0,<1.7.0 -pytokenizations # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index 762a7e888..95ada08ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,7 +52,6 @@ install_requires = numpy>=1.15.0 requests>=2.13.0,<3.0.0 pydantic>=1.5.0,<1.7.0 - pytokenizations jinja2 # Official Python utilities setuptools diff --git a/setup.py b/setup.py index 604d65745..160d2ed1c 100755 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ MOD_NAMES = [ "spacy.pipeline._parser_internals.stateclass", "spacy.pipeline._parser_internals.transition_system", "spacy.tokenizer", + "spacy.training.align", "spacy.training.gold_io", "spacy.tokens.doc", "spacy.tokens.span", diff --git a/spacy/errors.py b/spacy/errors.py index 2898fbcaa..f4fd3731f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -564,7 +564,10 @@ class Errors: "a string value from {expected} but got: '{arg}'") E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " "a list, but got: {arg_type}") - E949 = ("Can only create an alignment when the texts are the same.") + E949 = ("Unable to align tokens for the predicted and reference docs. It " + "is only possible to align the docs when both texts are the same " + "except for whitespace and capitalization. The predicted tokens " + "start with: {x}. 
The reference tokens start with: {y}.") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 07e1aef01..ba485ab45 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -2,6 +2,7 @@ import numpy from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment from spacy.training import biluo_tags_to_spans, iob_to_biluo from spacy.training import Corpus, docs_to_json, Example +from spacy.training.align import get_alignments from spacy.training.converters import json_to_docs from spacy.lang.en import English from spacy.tokens import Doc, DocBin @@ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc): assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] -@pytest.mark.skip("Outdated") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ - (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), + (["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])), ( ["a", "b", '"', "c"], ['ab"', "c"], - (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), + ([[0], [0], [0], [1]], [[0, 1, 2], [3]]), ), - (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})), + (["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])), ( ["ab", "c", "d"], ["a", "b", "cd"], - (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}), + ([[0, 1], [2], [2]], [[0], [0], [1, 2]]), ), ( ["a", "b", "cd"], ["a", "b", "c", "d"], - (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}), + ([[0], [1], [2, 3]], [[0], [1], [2], [2]]), ), - ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})), + ([" ", "a"], ["a"], ([[], [0]], [[1]])), ], ) def test_align(tokens_a, tokens_b, expected): # noqa - cost, a2b, b2a, 
a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa - assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa + a2b, b2a = get_alignments(tokens_a, tokens_b) + assert (a2b, b2a) == expected # noqa # check symmetry - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa - assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa + a2b, b2a = get_alignments(tokens_b, tokens_a) # noqa + assert (b2a, a2b) == expected # noqa def test_goldparse_startswith_space(en_tokenizer): @@ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer): assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] +def test_goldparse_endswith_space(en_tokenizer): + text = "a\n" + doc = en_tokenizer(text) + gold_words = ["a"] + entities = ["U-DATE"] + deps = ["ROOT"] + heads = [0] + example = Example.from_dict( + doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads} + ) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["U-DATE", "O"] + assert example.get_aligned("DEP", as_string=True) == ["ROOT", None] + + def test_gold_constructor(): """Test that the Example constructor works fine""" nlp = English() @@ -676,6 +691,87 @@ def test_alignment_different_texts(): Alignment.from_strings(other_tokens, spacy_tokens) +def test_alignment_spaces(en_vocab): + # single leading whitespace + other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] + assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6] + + # multiple leading whitespace tokens + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", 
"'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2,] + assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7] + + # both with leading whitespace, not identical + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 2, 2, 2, 3, 4, 5, 6, 7] + + # same leading whitespace, different tokenization + other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 1, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 0, 1, 2, 3, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [2, 1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 1, 2, 2, 2, 3, 4, 5, 6, 7] + + # only one with trailing whitespace + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 0] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5] + + # different trailing whitespace + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "] + spacy_tokens = ["i", "listened", "to", "obama", 
"'s", "podcasts.", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 0] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 1] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6] + + # same trailing whitespace, different tokenization + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7] + + # differing whitespace is allowed + other_tokens = ["a", " \n ", "b", "c"] + spacy_tokens = ["a", "b", " ", "c"] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.dataXd) == [0, 1, 3] + assert list(align.y2x.dataXd) == [0, 2, 3] + + # other differences in whitespace are allowed + other_tokens = [" ", "a"] + spacy_tokens = [" ", "a", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + + other_tokens = ["a", " "] + spacy_tokens = ["a", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + + def test_retokenized_docs(doc): a = doc.to_array(["TAG"]) doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 86341dd9a..5111b80dc 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,6 +1,6 @@ from .corpus import Corpus # noqa: F401 from .example import Example, validate_examples, validate_get_examples # noqa: F401 -from .align import Alignment # noqa: F401 +from .alignment import Alignment # noqa: F401 from .augment import dont_augment, 
orth_variants_augmenter  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401
diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx
new file mode 100644
index 000000000..b9d89f789
--- /dev/null
+++ b/spacy/training/align.pyx
@@ -0,0 +1,77 @@
+from typing import List, Tuple
+from itertools import chain
+import re
+
+from ..errors import Errors
+
+
+def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
+    """Align two tokenizations of the same text via their characters.
+
+    The two texts may differ only in whitespace and capitalization. Whitespace
+    characters without a counterpart in the other text are skipped, so a token
+    consisting only of such whitespace is aligned to no tokens at all.
+
+    A (List[str]): The token texts of the first tokenization.
+    B (List[str]): The token texts of the second tokenization.
+    RETURNS (Tuple[List[List[int]], List[List[int]]]): The alignments a2b and
+        b2a. a2b holds one sorted list per token in A with the indices of the
+        tokens in B that it shares characters with, and b2a vice versa.
+    RAISES (ValueError): E949 if the texts differ in more than whitespace and
+        capitalization or if lowercasing changes the length of a text.
+    """
+    # Create character-to-token mappings
+    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
+    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
+    str_a = "".join(A).lower()
+    str_b = "".join(B).lower()
+    cdef int len_str_a = len(str_a)
+    cdef int len_str_b = len(str_b)
+    # Check that the two texts only differ in whitespace and capitalization
+    if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \
+            len_str_a != len(char_to_token_a) or \
+            len_str_b != len(char_to_token_b):
+        raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
+    cdef int char_idx_a = 0
+    cdef int char_idx_b = 0
+    cdef int token_idx_a = 0
+    cdef int token_idx_b = 0
+    # Allocate one set per token up front so that each token gets exactly one
+    # entry, including empty tokens and unaligned leading/trailing whitespace
+    a2b = [set() for _ in A]
+    b2a = [set() for _ in B]
+    while char_idx_a < len_str_a and char_idx_b < len_str_b:
+        # Find the current token position from the character position
+        token_idx_a = char_to_token_a[char_idx_a]
+        token_idx_b = char_to_token_b[char_idx_b]
+        # An offset is at the start of a token if it is the first character
+        # overall or if the previous character belongs to a different token
+        at_start_a = char_idx_a == 0 or char_to_token_a[char_idx_a - 1] != token_idx_a
+        at_start_b = char_idx_b == 0 or char_to_token_b[char_idx_b - 1] != token_idx_b
+        # Process the alignment at the current position
+        if at_start_a and at_start_b and A[token_idx_a] == B[token_idx_b]:
+            # Current tokens are identical and both start here, so consume
+            # them as a whole. Identical tokens that are reached mid-token
+            # (["aa", "a"] vs. ["a", "aa"]) must go through the char branch,
+            # otherwise the jump would skip characters of the next token
+            a2b[token_idx_a].add(token_idx_b)
+            b2a[token_idx_b].add(token_idx_a)
+            char_idx_a += len(A[token_idx_a])
+            char_idx_b += len(B[token_idx_b])
+        elif str_a[char_idx_a] == str_b[char_idx_b]:
+            # Current chars are identical
+            a2b[token_idx_a].add(token_idx_b)
+            b2a[token_idx_b].add(token_idx_a)
+            char_idx_a += 1
+            char_idx_b += 1
+        elif str_a[char_idx_a].isspace():
+            # Skip unaligned whitespace char in A
+            char_idx_a += 1
+        elif str_b[char_idx_b].isspace():
+            # Skip unaligned whitespace char in B
+            char_idx_b += 1
+        else:
+            # This should never happen
+            raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
+    # Return values as sorted lists per token position
+    return [sorted(x) for x in a2b], [sorted(x) for x in b2a]
diff --git a/spacy/training/align.py b/spacy/training/alignment.py
similarity index 75%
rename from spacy/training/align.py
rename to spacy/training/alignment.py
index e8f17a667..3e3b60ca6 100644
--- a/spacy/training/align.py
+++ b/spacy/training/alignment.py
@@ -2,9 +2,8 @@ from typing import List
 import numpy
 from thinc.types import Ragged
 from dataclasses import dataclass
-import tokenizations
 
-from ..errors import Errors
+from .align import get_alignments
 
 
 @dataclass
@@ -20,9 +19,7 @@ class Alignment:
 
     @classmethod
     def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
-        if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower():
-            raise ValueError(Errors.E949)
-        x2y, y2x = tokenizations.get_alignments(A, B)
+        x2y, y2x = get_alignments(A, B)
         return Alignment.from_indices(x2y=x2y, y2x=y2x)
 
 
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index a8da49c61..6a556b5e7 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
 from ..tokens.span import Span
 from ..attrs import IDS
-from .align import Alignment
+from .alignment import
Alignment from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings