Replace pytokenizations with internal alignment (#6293)

* Replace pytokenizations with internal alignment

Replace pytokenizations with an internal alignment algorithm that is
restricted to allow differences only in whitespace and capitalization.

* Rename `spacy.training.align` to `spacy.training.alignment` to contain
the `Alignment` dataclass
* Implement `get_alignments` in `spacy.training.align`

* Refactor trailing whitespace handling

* Remove unnecessary exception for empty docs

Allow a non-empty whitespace-only doc to be aligned with an empty doc

* Remove empty docs exceptions completely
Adriane Boyd, 2020-11-03 16:24:38 +01:00, committed by GitHub
parent a4b32b9552
commit 1c4df8fd09
10 changed files with 182 additions and 22 deletions
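
In short, the diff swaps the external pytokenizations dependency for an
internal `get_alignments` function. A minimal sketch of the new API, using
values taken from the test cases in this diff:

from spacy.training import Alignment
from spacy.training.align import get_alignments

# Each position i in a2b lists the tokens in B that overlap token i of A.
a2b, b2a = get_alignments(["a", "b", "c"], ["ab", "c"])
assert (a2b, b2a) == ([[0], [0], [1]], [[0, 1], [2]])

# The Alignment dataclass wraps the same data as Ragged arrays.
align = Alignment.from_strings(["a", "b", "c"], ["ab", "c"])
assert list(align.x2y.dataXd) == [0, 0, 1]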

pyproject.toml

@@ -8,7 +8,6 @@ requires = [
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0rc0,<8.1.0",
     "blis>=0.4.0,<0.8.0",
-    "pytokenizations",
     "pathy"
 ]
 build-backend = "setuptools.build_meta"

requirements.txt

@@ -15,7 +15,6 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.5.0,<1.7.0
-pytokenizations
 # Official Python utilities
 setuptools
 packaging>=20.0

setup.cfg

@@ -52,7 +52,6 @@ install_requires =
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
    pydantic>=1.5.0,<1.7.0
-    pytokenizations
     jinja2
     # Official Python utilities
     setuptools

setup.py

@@ -49,6 +49,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.tokenizer",
+    "spacy.training.align",
     "spacy.training.gold_io",
     "spacy.tokens.doc",
     "spacy.tokens.span",

spacy/errors.py

@@ -564,7 +564,10 @@ class Errors:
             "a string value from {expected} but got: '{arg}'")
     E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
             "a list, but got: {arg_type}")
-    E949 = ("Can only create an alignment when the texts are the same.")
+    E949 = ("Unable to align tokens for the predicted and reference docs. It "
+            "is only possible to align the docs when both texts are the same "
+            "except for whitespace and capitalization. The predicted tokens "
+            "start with: {x}. The reference tokens start with: {y}.")
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
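
To see when the new E949 fires: texts that differ in more than whitespace or
capitalization cannot be aligned, while case and whitespace differences alone
are fine. A quick sketch (the token lists here are invented for illustration):

import pytest
from spacy.training import Alignment

# The texts differ beyond whitespace/capitalization, so E949 is raised.
with pytest.raises(ValueError):
    Alignment.from_strings(["hello"], ["world"])

# Case and whitespace differences alone are allowed.
align = Alignment.from_strings(["Hello", "world"], ["hello world"])
assert list(align.x2y.dataXd) == [0, 0]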

spacy/tests/training/test_training.py

@@ -2,6 +2,7 @@ import numpy
 from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
 from spacy.training import biluo_tags_to_spans, iob_to_biluo
 from spacy.training import Corpus, docs_to_json, Example
+from spacy.training.align import get_alignments
 from spacy.training.converters import json_to_docs
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
@@ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc):
     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]


-@pytest.mark.skip("Outdated")
 @pytest.mark.parametrize(
     "tokens_a,tokens_b,expected",
     [
-        (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
+        (["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])),
         (
             ["a", "b", '"', "c"],
             ['ab"', "c"],
-            (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
+            ([[0], [0], [0], [1]], [[0, 1, 2], [3]]),
         ),
-        (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})),
+        (["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])),
         (
             ["ab", "c", "d"],
             ["a", "b", "cd"],
-            (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}),
+            ([[0, 1], [2], [2]], [[0], [0], [1, 2]]),
         ),
         (
             ["a", "b", "cd"],
             ["a", "b", "c", "d"],
-            (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}),
+            ([[0], [1], [2, 3]], [[0], [1], [2], [2]]),
         ),
-        ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
+        ([" ", "a"], ["a"], ([[], [0]], [[1]])),
     ],
 )
 def test_align(tokens_a, tokens_b, expected):  # noqa
-    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)  # noqa
-    assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected  # noqa
+    a2b, b2a = get_alignments(tokens_a, tokens_b)
+    assert (a2b, b2a) == expected  # noqa
     # check symmetry
-    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)  # noqa
-    assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected  # noqa
+    a2b, b2a = get_alignments(tokens_b, tokens_a)  # noqa
+    assert (b2a, a2b) == expected  # noqa


 def test_goldparse_startswith_space(en_tokenizer):
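
The shape change in test_align is the heart of the migration: the old align()
returned an edit cost, one-to-one index arrays (-1 for unaligned) and separate
many-to-one dicts, while get_alignments() returns just two lists of lists,
where entry i holds the indices of every token on the other side that
overlaps token i. A short sketch of the new format:

from spacy.training.align import get_alignments

# "ab" overlaps "a" and "b"; "c" and "d" both fall inside "cd".
a2b, b2a = get_alignments(["a", "b", "cd"], ["ab", "c", "d"])
assert a2b == [[0], [0], [1, 2]]
assert b2a == [[0, 1], [2], [2]]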
@@ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer):
     assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]


+def test_goldparse_endswith_space(en_tokenizer):
+    text = "a\n"
+    doc = en_tokenizer(text)
+    gold_words = ["a"]
+    entities = ["U-DATE"]
+    deps = ["ROOT"]
+    heads = [0]
+    example = Example.from_dict(
+        doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
+    )
+    ner_tags = example.get_aligned_ner()
+    assert ner_tags == ["U-DATE", "O"]
+    assert example.get_aligned("DEP", as_string=True) == ["ROOT", None]
+
+
 def test_gold_constructor():
     """Test that the Example constructor works fine"""
     nlp = English()
@@ -676,6 +691,87 @@ def test_alignment_different_texts():
         Alignment.from_strings(other_tokens, spacy_tokens)


+def test_alignment_spaces(en_vocab):
+    # single leading whitespace
+    other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [1, 1, 1, 2, 3, 4, 5, 6]
+
+    # multiple leading whitespace tokens
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [0, 0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [2, 2, 2, 3, 4, 5, 6, 7]
+
+    # both with leading whitespace, not identical
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = [" ", "i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 0, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 2, 2, 2, 3, 4, 5, 6, 7]
+
+    # same leading whitespace, different tokenization
+    other_tokens = [" ", " ", "i listened to", "obama", "'", "s", "podcasts", "."]
+    spacy_tokens = ["  ", "i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [1, 1, 3, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 0, 1, 2, 3, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [2, 1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 1, 2, 2, 2, 3, 4, 5, 6, 7]
+
+    # only one with trailing whitespace
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 0]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5]
+
+    # different trailing whitespace
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", " "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 0]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 1]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6]
+
+    # same trailing whitespace, different tokenization
+    other_tokens = ["i listened to", "obama", "'", "s", "podcasts", ".", " ", " "]
+    spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts.", "  "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1, 1, 1]
+    assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5, 6, 6]
+    assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2, 2]
+    assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5, 6, 7]
+
+    # differing whitespace is allowed
+    other_tokens = ["a", " \n ", "b", "c"]
+    spacy_tokens = ["a", "b", " ", "c"]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+    assert list(align.x2y.dataXd) == [0, 1, 3]
+    assert list(align.y2x.dataXd) == [0, 2, 3]
+
+    # other differences in whitespace are allowed
+    other_tokens = [" ", "a"]
+    spacy_tokens = [" ", "a", " "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+
+    other_tokens = ["a", " "]
+    spacy_tokens = ["a", "  "]
+    align = Alignment.from_strings(other_tokens, spacy_tokens)
+
+
 def test_retokenized_docs(doc):
     a = doc.to_array(["TAG"])
     doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
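
A note on reading the assertions in test_alignment_spaces: Alignment stores
x2y and y2x as thinc Ragged arrays, where lengths gives the number of aligned
tokens at each position and dataXd is the flattened concatenation of the
per-token index lists. Reusing the first case above:

from spacy.training import Alignment

other_tokens = [" ", "i listened to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
# lengths[0] == 0: the leading space aligns to nothing;
# lengths[1] == 3: "i listened to" covers three spacy tokens.
assert list(align.x2y.lengths) == [0, 3, 1, 1, 1, 1, 1]
# dataXd flattens the index lists: [] + [0, 1, 2] + [3] + [4] + [4] + [5] + [5]
assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5]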

spacy/training/__init__.py

@@ -1,6 +1,6 @@
 from .corpus import Corpus  # noqa: F401
 from .example import Example, validate_examples, validate_get_examples  # noqa: F401
-from .align import Alignment  # noqa: F401
+from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401

spacy/training/align.pyx (new file, 66 lines)

@@ -0,0 +1,66 @@
+from typing import List, Tuple
+from itertools import chain
+import re
+
+from ..errors import Errors
+
+
+def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
+    # Create character-to-token mappings
+    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(A))))
+    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(B))))
+    str_a = "".join(A).lower()
+    str_b = "".join(B).lower()
+    cdef int len_str_a = len(str_a)
+    cdef int len_str_b = len(str_b)
+    # Check that the two texts only differ in whitespace and capitalization
+    if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \
+            len_str_a != len(char_to_token_a) or \
+            len_str_b != len(char_to_token_b):
+        raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
+    cdef int char_idx_a = 0
+    cdef int char_idx_b = 0
+    cdef int token_idx_a = 0
+    cdef int token_idx_b = 0
+    cdef int prev_token_idx_a = -1
+    cdef int prev_token_idx_b = -1
+    a2b = []
+    b2a = []
+    while char_idx_a < len_str_a and char_idx_b < len_str_b:
+        # Find the current token position from the character position
+        token_idx_a = char_to_token_a[char_idx_a]
+        token_idx_b = char_to_token_b[char_idx_b]
+        # Add a set for the next token if a token boundary has been crossed
+        if prev_token_idx_a != token_idx_a:
+            a2b.append(set())
+        if prev_token_idx_b != token_idx_b:
+            b2a.append(set())
+        # Process the alignment at the current position
+        if A[token_idx_a] == B[token_idx_b]:
+            # Current tokens are identical
+            a2b[-1].add(token_idx_b)
+            b2a[-1].add(token_idx_a)
+            char_idx_a += len(A[token_idx_a])
+            char_idx_b += len(B[token_idx_b])
+        elif str_a[char_idx_a] == str_b[char_idx_b]:
+            # Current chars are identical
+            a2b[-1].add(token_idx_b)
+            b2a[-1].add(token_idx_a)
+            char_idx_a += 1
+            char_idx_b += 1
+        elif str_a[char_idx_a].isspace():
+            # Skip unaligned whitespace char in A
+            char_idx_a += 1
+        elif str_b[char_idx_b].isspace():
+            # Skip unaligned whitespace char in B
+            char_idx_b += 1
+        else:
+            # This should never happen
+            raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
+        prev_token_idx_a = token_idx_a
+        prev_token_idx_b = token_idx_b
+    # Process unaligned trailing whitespace
+    a2b.extend([set()] * len(set(char_to_token_a[char_idx_a:])))
+    b2a.extend([set()] * len(set(char_to_token_b[char_idx_b:])))
+    # Return values as sorted lists per token position
+    return [sorted(x) for x in a2b], [sorted(x) for x in b2a]
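
The walk above advances through both concatenated strings character by
character, aligning the tokens that own matching characters and skipping
unmatched whitespace on either side. Two behaviors worth noting, shown with
one case taken from the test suite and one illustrative capitalization
example:

from spacy.training.align import get_alignments

# Whitespace-only tokens that have no counterpart end up unaligned ([]).
a2b, b2a = get_alignments([" ", "a"], ["a"])
assert (a2b, b2a) == ([[], [0]], [[1]])

# Capitalization differences are tolerated: both strings are lowercased.
a2b, b2a = get_alignments(["Obama"], ["obama"])
assert (a2b, b2a) == ([[0]], [[0]])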

spacy/training/alignment.py

@@ -2,9 +2,8 @@ from typing import List
 import numpy
 from thinc.types import Ragged
 from dataclasses import dataclass
-import tokenizations

-from ..errors import Errors
+from .align import get_alignments


 @dataclass
@@ -20,9 +19,7 @@ class Alignment:
     @classmethod
     def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
-        if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower():
-            raise ValueError(Errors.E949)
-        x2y, y2x = tokenizations.get_alignments(A, B)
+        x2y, y2x = get_alignments(A, B)
         return Alignment.from_indices(x2y=x2y, y2x=y2x)

spacy/training/example.pyx

@@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
 from ..tokens.span import Span
 from ..attrs import IDS
-from .align import Alignment
+from .alignment import Alignment
 from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings