From 1c4df8fd095e8671dff5e760edca1213063a99bc Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 3 Nov 2020 16:24:38 +0100
Subject: [PATCH] Replace pytokenizations with internal alignment (#6293)

* Replace pytokenizations with internal alignment

  Replace pytokenizations with an internal alignment algorithm that is
  restricted to allowing differences only in whitespace and capitalization.

* Rename `spacy.training.align` to `spacy.training.alignment` to contain
  the `Alignment` dataclass

* Implement `get_alignments` in `spacy.training.align`

* Refactor trailing whitespace handling

* Remove unnecessary exception for empty docs

  Allow a non-empty whitespace-only doc to be aligned with an empty doc.

* Remove empty docs exceptions completely
---
 pyproject.toml                            |   1 -
 requirements.txt                          |   1 -
 setup.cfg                                 |   1 -
 setup.py                                  |   1 +
 spacy/errors.py                           |   5 +-
 spacy/tests/training/test_training.py     | 118 ++++++++++++++++++++--
 spacy/training/__init__.py                |   2 +-
 spacy/training/align.pyx                  |  66 ++++++++++++
 spacy/training/{align.py => alignment.py} |   7 +-
 spacy/training/example.pyx                |   2 +-
 10 files changed, 182 insertions(+), 22 deletions(-)
 create mode 100644 spacy/training/align.pyx
 rename spacy/training/{align.py => alignment.py} (75%)

diff --git a/pyproject.toml b/pyproject.toml
index 14a2d7690..0ceda4454 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,6 @@ requires = [
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0rc0,<8.1.0",
     "blis>=0.4.0,<0.8.0",
-    "pytokenizations",
     "pathy"
 ]
 build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index c5e136a34..3a777f163 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,6 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.5.0,<1.7.0
-pytokenizations
 # Official Python utilities
 setuptools
 packaging>=20.0
diff --git a/setup.cfg b/setup.cfg
index 762a7e888..95ada08ef 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -52,7 +52,6 @@ install_requires =
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
     pydantic>=1.5.0,<1.7.0
-    pytokenizations
     jinja2
     # Official Python utilities
     setuptools
diff --git a/setup.py b/setup.py
index 604d65745..160d2ed1c 100755
--- a/setup.py
+++ b/setup.py
@@ -49,6 +49,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.tokenizer",
+    "spacy.training.align",
     "spacy.training.gold_io",
     "spacy.tokens.doc",
     "spacy.tokens.span",
diff --git a/spacy/errors.py b/spacy/errors.py
index 2898fbcaa..f4fd3731f 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -564,7 +564,10 @@ class Errors:
             "a string value from {expected} but got: '{arg}'")
     E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
             "a list, but got: {arg_type}")
-    E949 = ("Can only create an alignment when the texts are the same.")
+    E949 = ("Unable to align tokens for the predicted and reference docs. It "
+            "is only possible to align the docs when both texts are the same "
+            "except for whitespace and capitalization. The predicted tokens "
+            "start with: {x}. The reference tokens start with: {y}.")
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
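Note: a quick sketch of how the rewritten E949 surfaces in practice (illustrative only, not part of the diff; the token lists are invented). `Alignment.from_strings` tolerates differences in whitespace and capitalization only, and raises the new message as a `ValueError` for anything else:

    >>> from spacy.training import Alignment
    >>> # tolerated: the underlying texts differ only in case and whitespace
    >>> align = Alignment.from_strings(["Hello", "world"], ["hello", "world", " "])
    >>> # rejected: the texts themselves differ
    >>> Alignment.from_strings(["Hello", "world"], ["goodbye", "world"])
    Traceback (most recent call last):
      ...
    ValueError: [E949] Unable to align tokens for the predicted and reference docs. ...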
{id2}") E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 07e1aef01..ba485ab45 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -2,6 +2,7 @@ import numpy from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment from spacy.training import biluo_tags_to_spans, iob_to_biluo from spacy.training import Corpus, docs_to_json, Example +from spacy.training.align import get_alignments from spacy.training.converters import json_to_docs from spacy.lang.en import English from spacy.tokens import Doc, DocBin @@ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc): assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] -@pytest.mark.skip("Outdated") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", [ - (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), + (["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])), ( ["a", "b", '"', "c"], ['ab"', "c"], - (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), + ([[0], [0], [0], [1]], [[0, 1, 2], [3]]), ), - (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})), + (["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])), ( ["ab", "c", "d"], ["a", "b", "cd"], - (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}), + ([[0, 1], [2], [2]], [[0], [0], [1, 2]]), ), ( ["a", "b", "cd"], ["a", "b", "c", "d"], - (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}), + ([[0], [1], [2, 3]], [[0], [1], [2], [2]]), ), - ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})), + ([" ", "a"], ["a"], ([[], [0]], [[1]])), ], ) def test_align(tokens_a, tokens_b, expected): # noqa - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa - assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa + a2b, b2a = get_alignments(tokens_a, tokens_b) + assert (a2b, b2a) == expected # noqa # check symmetry - cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa - assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa + a2b, b2a = get_alignments(tokens_b, tokens_a) # noqa + assert (b2a, a2b) == expected # noqa def test_goldparse_startswith_space(en_tokenizer): @@ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer): assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] +def test_goldparse_endswith_space(en_tokenizer): + text = "a\n" + doc = en_tokenizer(text) + gold_words = ["a"] + entities = ["U-DATE"] + deps = ["ROOT"] + heads = [0] + example = Example.from_dict( + doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads} + ) + ner_tags = example.get_aligned_ner() + assert ner_tags == ["U-DATE", "O"] + assert example.get_aligned("DEP", as_string=True) == ["ROOT", None] + + def test_gold_constructor(): """Test that the Example constructor works fine""" nlp = English() @@ -676,6 +691,87 @@ def test_alignment_different_texts(): Alignment.from_strings(other_tokens, spacy_tokens) +def test_alignment_spaces(en_vocab): + # single leading whitespace + other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 
+    assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]
+
+    # multiple leading whitespace tokens
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]
+
+    # both with leading whitespace, not identical
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 2, 2, 2, 3, 4, 5, 6, 7]
+
+    # same leading whitespace, different tokenization
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["  ", "i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 1, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 0, 1, 2, 3, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [2, 1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 1, 2, 2, 2, 3, 4, 5, 6, 7]
+
+    # only one with trailing whitespace
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 0]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
+
+    # different trailing whitespace
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 0]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 1]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6]
+
+    # same trailing whitespace, different tokenization
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", "  "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7]
+
+    # differing whitespace is allowed
+    other_tokens = ["a", " \n ", "b", "c"]
+    spacy_tokens = ["a", "b", " ", "c"]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.dataXd) == [0, 1, 3]
+    assert list(align.y2x.dataXd) == [0, 2, 3]
+
+    # other differences in whitespace are allowed
+    other_tokens = [" ", "a"]
+    spacy_tokens = [" ", "a", " "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+
+    other_tokens = ["a", " "]
+    spacy_tokens = ["a", "  "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+
+
 def test_retokenized_docs(doc):
     a = doc.to_array(["TAG"])
     doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
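Note: the asserts above read the alignment through thinc `Ragged` arrays, where `lengths` holds the number of aligned tokens for each source token and `dataXd` the flattened target indices. A minimal sketch (not part of the diff), reusing a pair from the parametrized `test_align` cases above:

    >>> from spacy.training import Alignment
    >>> align = Alignment.from_strings(["a", "b", "cd"], ["a", "b", "c", "d"])
    >>> assert list(align.x2y.lengths) == [1, 1, 2]    # "cd" covers two tokens in y
    >>> assert list(align.x2y.dataXd) == [0, 1, 2, 3]  # flattened y indices
    >>> assert list(align.y2x.lengths) == [1, 1, 1, 1]
    >>> assert list(align.y2x.dataXd) == [0, 1, 2, 2]  # "c" and "d" both map to "cd"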
["a", " "] + spacy_tokens = ["a", " "] + align = Alignment.from_strings(other_tokens, spacy_tokens) + + def test_retokenized_docs(doc): a = doc.to_array(["TAG"]) doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 86341dd9a..5111b80dc 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,6 +1,6 @@ from .corpus import Corpus # noqa: F401 from .example import Example, validate_examples, validate_get_examples # noqa: F401 -from .align import Alignment # noqa: F401 +from .alignment import Alignment # noqa: F401 from .augment import dont_augment, orth_variants_augmenter # noqa: F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401 diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx new file mode 100644 index 000000000..b9d89f789 --- /dev/null +++ b/spacy/training/align.pyx @@ -0,0 +1,66 @@ +from typing import List, Tuple +from itertools import chain +import re + +from ..errors import Errors + + +def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]: + # Create character-to-token mappings + char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A)))) + char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B)))) + str_a = "".join(A).lower() + str_b = "".join(B).lower() + cdef int len_str_a = len(str_a) + cdef int len_str_b = len(str_b) + # Check that the two texts only differ in whitespace and capitalization + if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \ + len_str_a != len(char_to_token_a) or \ + len_str_b != len(char_to_token_b): + raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10]))) + cdef int char_idx_a = 0 + cdef int char_idx_b = 0 + cdef int token_idx_a = 0 + cdef int token_idx_b = 0 + cdef int prev_token_idx_a = -1 + cdef int prev_token_idx_b = -1 + a2b = [] + b2a = [] + while char_idx_a < len_str_a and char_idx_b < len_str_b: + # Find the current token position from the character position + token_idx_a = char_to_token_a[char_idx_a] + token_idx_b = char_to_token_b[char_idx_b] + # Add a set for the next token if a token boundary has been crossed + if prev_token_idx_a != token_idx_a: + a2b.append(set()) + if prev_token_idx_b != token_idx_b: + b2a.append(set()) + # Process the alignment at the current position + if A[token_idx_a] == B[token_idx_b]: + # Current tokens are identical + a2b[-1].add(token_idx_b) + b2a[-1].add(token_idx_a) + char_idx_a += len(A[token_idx_a]) + char_idx_b += len(B[token_idx_b]) + elif str_a[char_idx_a] == str_b[char_idx_b]: + # Current chars are identical + a2b[-1].add(token_idx_b) + b2a[-1].add(token_idx_a) + char_idx_a += 1 + char_idx_b += 1 + elif str_a[char_idx_a].isspace(): + # Skip unaligned whitespace char in A + char_idx_a += 1 + elif str_b[char_idx_b].isspace(): + # Skip unaligned whitespace char in B + char_idx_b += 1 + else: + # This should never happen + raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10]))) + prev_token_idx_a = token_idx_a + prev_token_idx_b = token_idx_b + # Process unaligned trailing whitespace + a2b.extend([set()] * len(set(char_to_token_a[char_idx_a:]))) + b2a.extend([set()] * len(set(char_to_token_b[char_idx_b:]))) + # Return values as sorted lists per token position + return [sorted(x) for x in a2b], [sorted(x) for x in b2a] diff --git a/spacy/training/align.py b/spacy/training/alignment.py similarity 
diff --git a/spacy/training/align.py b/spacy/training/alignment.py
similarity index 75%
rename from spacy/training/align.py
rename to spacy/training/alignment.py
index e8f17a667..3e3b60ca6 100644
--- a/spacy/training/align.py
+++ b/spacy/training/alignment.py
@@ -2,9 +2,8 @@ from typing import List
 import numpy
 from thinc.types import Ragged
 from dataclasses import dataclass
-import tokenizations

-from ..errors import Errors
+from .align import get_alignments


 @dataclass
@@ -20,9 +19,7 @@ class Alignment:

     @classmethod
     def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
-        if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower():
-            raise ValueError(Errors.E949)
-        x2y, y2x = tokenizations.get_alignments(A, B)
+        x2y, y2x = get_alignments(A, B)
         return Alignment.from_indices(x2y=x2y, y2x=y2x)


diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index a8da49c61..6a556b5e7 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
 from ..tokens.span import Span
 from ..attrs import IDS
-from .align import Alignment
+from .alignment import Alignment
 from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
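Note: putting the pieces together, a usage sketch of the public API after this change (illustrative, not part of the diff; it assumes the blank English tokenizer splits "Obama's" into "Obama"/"'s" and "podcasts." into "podcasts"/"."):

    >>> from spacy.lang.en import English
    >>> from spacy.training import Example
    >>> nlp = English()
    >>> doc = nlp.make_doc("I listened to Obama's podcasts.")
    >>> # the reference tokenization may differ in capitalization and token boundaries
    >>> words = ["i", "listened", "to", "obama", "'", "s", "podcasts."]
    >>> example = Example.from_dict(doc, {"words": words})
    >>> # the predicted token "'s" aligns to the two reference tokens "'" and "s"
    >>> assert list(example.alignment.x2y.lengths) == [1, 1, 1, 1, 2, 1, 1]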