spaCy/spacy/training/align.pyx

# cython: profile=False
import re
from itertools import chain
from typing import List, Tuple

from ..errors import Errors


def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
    """Align two tokenizations of the same text on the token level.

    The two token sequences may differ only in whitespace and
    capitalization. The texts are compared character by character on their
    lowercased forms, and each character match links the tokens that own
    the two characters.

    A (List[str]): Token texts of the first tokenization.
    B (List[str]): Token texts of the second tokenization.
    RETURNS (Tuple[List[List[int]], List[List[int]]]): `(a2b, b2a)`, where
        `a2b[i]` is the sorted list of token indices in B aligned to the
        i-th token of A that contributes characters, and `b2a` is the
        reverse mapping. Tokens consisting only of unaligned whitespace map
        to an empty list.
    RAISES (ValueError): E949 if the texts differ in anything other than
        whitespace and capitalization.

    NOTE: zero-length tokens contribute no characters to the mapping and
    therefore receive no entry in the output lists.
    """
    # Create character-to-token mappings. The lowercased token text is used
    # because lowercasing can change the length of a string.
    char_to_token_a = tuple(
        chain.from_iterable((i,) * len(x.lower()) for i, x in enumerate(A))
    )
    char_to_token_b = tuple(
        chain.from_iterable((i,) * len(x.lower()) for i, x in enumerate(B))
    )
    str_a = "".join(A).lower()
    str_b = "".join(B).lower()
    cdef int len_str_a = len(str_a)
    cdef int len_str_b = len(str_b)
    # Check that the two texts only differ in whitespace and capitalization.
    # The length comparisons guard against the lowercased joined text not
    # lining up with the per-token character mappings.
    if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \
            len_str_a != len(char_to_token_a) or \
            len_str_b != len(char_to_token_b):
        raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
    cdef int char_idx_a = 0
    cdef int char_idx_b = 0
    cdef int token_idx_a = 0
    cdef int token_idx_b = 0
    cdef int prev_token_idx_a = -1
    cdef int prev_token_idx_b = -1
    a2b = []
    b2a = []
    while char_idx_a < len_str_a and char_idx_b < len_str_b:
        # Find the current token position from the character position
        token_idx_a = char_to_token_a[char_idx_a]
        token_idx_b = char_to_token_b[char_idx_b]
        # Add a set for the next token if a token boundary has been crossed
        if prev_token_idx_a != token_idx_a:
            a2b.append(set())
        if prev_token_idx_b != token_idx_b:
            b2a.append(set())
        # Process the alignment at the current position
        if A[token_idx_a] == B[token_idx_b] and \
                (
                    char_idx_a == 0 or
                    char_to_token_a[char_idx_a - 1] < token_idx_a
                ) and \
                (
                    char_idx_b == 0 or
                    char_to_token_b[char_idx_b - 1] < token_idx_b
                ):
            # Current tokens are identical and both character offsets are the
            # start of a token (either at the beginning of the document or the
            # previous character belongs to a different token)
            a2b[-1].add(token_idx_b)
            b2a[-1].add(token_idx_a)
            # Both sides advance by the same amount since the tokens are equal
            char_idx_a += len(A[token_idx_a])
            char_idx_b += len(B[token_idx_b])
        elif str_a[char_idx_a] == str_b[char_idx_b]:
            # Current chars are identical
            a2b[-1].add(token_idx_b)
            b2a[-1].add(token_idx_a)
            char_idx_a += 1
            char_idx_b += 1
        elif str_a[char_idx_a].isspace():
            # Skip unaligned whitespace char in A
            char_idx_a += 1
        elif str_b[char_idx_b].isspace():
            # Skip unaligned whitespace char in B
            char_idx_b += 1
        else:
            # This should never happen
            raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
        prev_token_idx_a = token_idx_a
        prev_token_idx_b = token_idx_b
    # Process unaligned trailing whitespace: every token that still has
    # unconsumed characters gets an empty alignment. The last token visited
    # by the loop already owns a set, so it must not be counted again when
    # the loop stopped in the middle of it (e.g. ["a "] vs. ["a"]).
    trailing_a = set(char_to_token_a[char_idx_a:])
    trailing_a.discard(prev_token_idx_a)
    trailing_b = set(char_to_token_b[char_idx_b:])
    trailing_b.discard(prev_token_idx_b)
    a2b.extend([set() for _ in trailing_a])
    b2a.extend([set() for _ in trailing_b])
    # Return values as sorted lists per token position
    return [sorted(x) for x in a2b], [sorted(x) for x in b2a]
Add profile=False to currently unprofiled cython 2023-09-12 06:49:41 +00:00			`# cython: profile=False`
Replace pytokenizations with internal alignment (#6293) * Replace pytokenizations with internal alignment Replace pytokenizations with internal alignment algorithm that is restricted to only allow differences in whitespace and capitalization. * Rename `spacy.training.align` to `spacy.training.alignment` to contain the `Alignment` dataclass * Implement `get_alignments` in `spacy.training.align` * Refactor trailing whitespace handling * Remove unnecessary exception for empty docs Allow a non-empty whitespace-only doc to be aligned with an empty doc * Remove empty docs exceptions completely 2020-11-03 15:24:38 +00:00			`import re`
Configure isort to use the Black profile, recursively isort the `spacy` module (#12721) * Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo 2023-06-14 15:48:41 +00:00			`from itertools import chain`
			`from typing import List, Tuple`
Replace pytokenizations with internal alignment (#6293) * Replace pytokenizations with internal alignment Replace pytokenizations with internal alignment algorithm that is restricted to only allow differences in whitespace and capitalization. * Rename `spacy.training.align` to `spacy.training.alignment` to contain the `Alignment` dataclass * Implement `get_alignments` in `spacy.training.align` * Refactor trailing whitespace handling * Remove unnecessary exception for empty docs Allow a non-empty whitespace-only doc to be aligned with an empty doc * Remove empty docs exceptions completely 2020-11-03 15:24:38 +00:00
			`from ..errors import Errors`


			`def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:`
			`# Create character-to-token mappings`
Fix alignment for 1-to-1 tokens and lowercasing (#6476) * When checking for token alignments, check not only that the tokens are identical but that the character positions are both at the start of a token. It's possible for the tokens to be identical even though the two tokens aren't aligned one-to-one in a case like `["a'", "''"]` vs. `["a", "''", "'"]`, where the middle tokens are identical but should not be aligned on the token level at character position 2 since it's the start of one token but the middle of another. * Use the lowercased version of the token texts to create the character-to-token alignment because lowercasing can change the string length (e.g., for `İ`, see the not-a-bug bug report: https://bugs.python.org/issue34723) 2020-12-08 06:25:16 +00:00			`char_to_token_a = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(A))))`
			`char_to_token_b = tuple(chain(*((i,) * len(x.lower()) for i, x in enumerate(B))))`
Replace pytokenizations with internal alignment (#6293) * Replace pytokenizations with internal alignment Replace pytokenizations with internal alignment algorithm that is restricted to only allow differences in whitespace and capitalization. * Rename `spacy.training.align` to `spacy.training.alignment` to contain the `Alignment` dataclass * Implement `get_alignments` in `spacy.training.align` * Refactor trailing whitespace handling * Remove unnecessary exception for empty docs Allow a non-empty whitespace-only doc to be aligned with an empty doc * Remove empty docs exceptions completely 2020-11-03 15:24:38 +00:00			`str_a = "".join(A).lower()`
			`str_b = "".join(B).lower()`
			`cdef int len_str_a = len(str_a)`
			`cdef int len_str_b = len(str_b)`
			`# Check that the two texts only differ in whitespace and capitalization`
			`if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \`
			`len_str_a != len(char_to_token_a) or \`
			`len_str_b != len(char_to_token_b):`
			`raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))`
			`cdef int char_idx_a = 0`
			`cdef int char_idx_b = 0`
			`cdef int token_idx_a = 0`
			`cdef int token_idx_b = 0`
			`cdef int prev_token_idx_a = -1`
			`cdef int prev_token_idx_b = -1`
			`a2b = []`
			`b2a = []`
			`while char_idx_a < len_str_a and char_idx_b < len_str_b:`
			`# Find the current token position from the character position`
			`token_idx_a = char_to_token_a[char_idx_a]`
			`token_idx_b = char_to_token_b[char_idx_b]`
			`# Add a set for the next token if a token boundary has been crossed`
			`if prev_token_idx_a != token_idx_a:`
			`a2b.append(set())`
			`if prev_token_idx_b != token_idx_b:`
			`b2a.append(set())`
			`# Process the alignment at the current position`
Fix alignment for 1-to-1 tokens and lowercasing (#6476) * When checking for token alignments, check not only that the tokens are identical but that the character positions are both at the start of a token. It's possible for the tokens to be identical even though the two tokens aren't aligned one-to-one in a case like `["a'", "''"]` vs. `["a", "''", "'"]`, where the middle tokens are identical but should not be aligned on the token level at character position 2 since it's the start of one token but the middle of another. * Use the lowercased version of the token texts to create the character-to-token alignment because lowercasing can change the string length (e.g., for `İ`, see the not-a-bug bug report: https://bugs.python.org/issue34723) 2020-12-08 06:25:16 +00:00			`if A[token_idx_a] == B[token_idx_b] and \`
ci: add cython linter (#12694) * chore: add cython-linter dev dependency * fix: lexeme.pyx * fix: morphology.pxd * fix: tokenizer.pxd * fix: vocab.pxd * fix: morphology.pxd (line length) * ci: add cython-lint * ci: fix cython-lint call * Fix kb/candidate.pyx. * Fix kb/kb.pyx. * Fix kb/kb_in_memory.pyx. * Fix kb. * Fix training/ partially. * Fix training/. Ignore trailing whitespaces and too long lines. * Fix ml/. * Fix matcher/. * Fix pipeline/. * Fix tokens/. * Fix build errors. Fix vocab.pyx. * Fix cython-lint install and run. * Fix lexeme.pyx, parts_of_speech.pxd, vectors.pyx. Temporarily disable cython-lint execution. * Fix attrs.pyx, lexeme.pyx, symbols.pxd, isort issues. * Make cython-lint install conditional. Fix tokenizer.pyx. * Fix remaining files. Reenable cython-lint check. * Readded parentheses. * Fix test_build_dependencies(). * Add explanatory comment to cython-lint execution. --------- Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com> 2023-07-19 10:03:31 +00:00			`(`
			`char_idx_a == 0 or`
			`char_to_token_a[char_idx_a - 1] < token_idx_a`
			`) and \`
			`(`
			`char_idx_b == 0 or`
			`char_to_token_b[char_idx_b - 1] < token_idx_b`
			`):`
Fix alignment for 1-to-1 tokens and lowercasing (#6476) * When checking for token alignments, check not only that the tokens are identical but that the character positions are both at the start of a token. It's possible for the tokens to be identical even though the two tokens aren't aligned one-to-one in a case like `["a'", "''"]` vs. `["a", "''", "'"]`, where the middle tokens are identical but should not be aligned on the token level at character position 2 since it's the start of one token but the middle of another. * Use the lowercased version of the token texts to create the character-to-token alignment because lowercasing can change the string length (e.g., for `İ`, see the not-a-bug bug report: https://bugs.python.org/issue34723) 2020-12-08 06:25:16 +00:00			`# Current tokens are identical and both character offsets are the`
			`# start of a token (either at the beginning of the document or the`
			`# previous character belongs to a different token)`
Replace pytokenizations with internal alignment (#6293) * Replace pytokenizations with internal alignment Replace pytokenizations with internal alignment algorithm that is restricted to only allow differences in whitespace and capitalization. * Rename `spacy.training.align` to `spacy.training.alignment` to contain the `Alignment` dataclass * Implement `get_alignments` in `spacy.training.align` * Refactor trailing whitespace handling * Remove unnecessary exception for empty docs Allow a non-empty whitespace-only doc to be aligned with an empty doc * Remove empty docs exceptions completely 2020-11-03 15:24:38 +00:00			`a2b[-1].add(token_idx_b)`
			`b2a[-1].add(token_idx_a)`
			`char_idx_a += len(A[token_idx_a])`
			`char_idx_b += len(B[token_idx_b])`
			`elif str_a[char_idx_a] == str_b[char_idx_b]:`
			`# Current chars are identical`
			`a2b[-1].add(token_idx_b)`
			`b2a[-1].add(token_idx_a)`
			`char_idx_a += 1`
			`char_idx_b += 1`
			`elif str_a[char_idx_a].isspace():`
			`# Skip unaligned whitespace char in A`
			`char_idx_a += 1`
			`elif str_b[char_idx_b].isspace():`
			`# Skip unaligned whitespace char in B`
			`char_idx_b += 1`
			`else:`
			`# This should never happen`
			`raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))`
			`prev_token_idx_a = token_idx_a`
			`prev_token_idx_b = token_idx_b`
			`# Process unaligned trailing whitespace`
			`a2b.extend([set()] * len(set(char_to_token_a[char_idx_a:])))`
			`b2a.extend([set()] * len(set(char_to_token_b[char_idx_b:])))`
			`# Return values as sorted lists per token position`
			`return [sorted(x) for x in a2b], [sorted(x) for x in b2a]`