spaCy/spacy/ml/models/span_finder.py

from typing import Callable, List, Tuple

from thinc.api import Model, chain, with_array
from thinc.types import Floats1d, Floats2d

from ...tokens import Doc

from ...util import registry

InT = List[Doc]
OutT = Floats2d


@registry.architectures("spacy.SpanFinder.v1")
def build_finder_model(
    tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]
) -> Model[InT, OutT]:
    """Assemble the span finder model from a tok2vec layer and a scorer.

    The scorer is wrapped in `with_array` so that it can be applied to the
    list of per-doc arrays produced by tok2vec, and the per-doc results are
    then merged into a single array by `flattener`.

    tok2vec (Model[List[Doc], List[Floats2d]]): Layer producing one 2d array
        per doc.
    scorer (Model[Floats2d, Floats2d]): Layer that scores a 2d array.
    RETURNS (Model[List[Doc], Floats2d]): The chained model, with the
        "tok2vec", "scorer" and "logistic_layer" references set.
    """
    per_doc_scorer: Model[List[Floats2d], List[Floats2d]] = with_array(scorer)
    finder: Model[InT, OutT] = chain(tok2vec, per_doc_scorer, flattener())

    # Register the sub-layers under their reference names, in the same order
    # as before: tok2vec, scorer, then the with_array wrapper.
    named_layers: Tuple[Tuple[str, Model], ...] = (
        ("tok2vec", tok2vec),
        ("scorer", scorer),
        ("logistic_layer", per_doc_scorer),
    )
    for ref_name, layer in named_layers:
        finder.set_ref(ref_name, layer)

    return finder


def flattener() -> Model[List[Floats2d], Floats2d]:
    """Create a layer that joins a list of per-doc 2d arrays into one 2d array.

    The forward pass hands the list of arrays to `ops.flatten` and returns
    the single resulting array (annotated as 2d, not a 1-dimensional list).
    The backward pass uses `ops.unflatten` with the lengths recorded during
    the forward pass to split the gradient back into one array per input.

    RETURNS (Model[List[Floats2d], Floats2d]): The "Flattener" layer.
    """

    def forward(
        model: Model[List[Floats2d], Floats2d], X: List[Floats2d], is_train: bool
    ) -> Tuple[Floats2d, Callable[[Floats2d], List[Floats2d]]]:
        # Remember the length of each input array so that backprop can split
        # the gradient into pieces matching the original inputs.
        lens = model.ops.asarray1i([len(scores) for scores in X])
        Y = model.ops.flatten(X)

        def backprop(dY: Floats2d) -> List[Floats2d]:
            return model.ops.unflatten(dY, lens)

        return Y, backprop

    return Model("Flattener", forward=forward)
SpanFinder into spaCy from experimental (#12507) * span finder integrated into spacy from experimental * black * isort * black * default spankey constant * black * Update spacy/pipeline/spancat.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * rename * rename * max_length and min_length as Optional[int] and strict checking * black * mypy fix for integer type infinity * revert line order * implement all comparison operators for inf int * avoid two for loops over all docs by not precomputing * interleave thresholding with span creation * black * revert to not interleaving (relized its faster) * black * Update spacy/errors.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * update dosctring * enforce that the gold and predicted documents have the same text * new error for ensuring reference and predicted texts are the same * remove todo * adjust test * black * handle misaligned tokenization * return correct variable * failing overfit test * only use a single spans_key like in spancat * black * remove debug lines * typo * remove comment * remove near duplicate reduntant method * use the 'spans_key' variable name everywhere * Update spacy/pipeline/span_finder.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * flaky test fix suggestion, hand set bias terms * only test suggester and test result exhaustively * make it clear that the span_finder_suggester is more general (not specific to span_finder) * Update spacy/tests/pipeline/test_span_finder.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Apply suggestions from code review * remove question comment * move preset_spans_suggester test to spancat tests * Add docs and unify default configs for spancat and span finder * Add `allow_overlap=True` to span finder scorer * Fix offset bug in set_annotations * Ignore labels in span finder scorer * Format * Add span_finder to quickstart template * Move settings to self.cfg, store min/max unset as None * Remove debugging * Update docstrings and docs * 
Update spacy/pipeline/span_finder.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix imports --------- Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> 2023-06-07 13:52:28 +00:00			`from typing import Callable, List, Tuple`

			`from thinc.api import Model, chain, with_array`
			`from thinc.types import Floats1d, Floats2d`

			`from ...tokens import Doc`

			`from ...util import registry`

			`InT = List[Doc]`
			`OutT = Floats2d`


			`@registry.architectures("spacy.SpanFinder.v1")`
			`def build_finder_model(`
			`tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]`
			`) -> Model[InT, OutT]:`

			`logistic_layer: Model[List[Floats2d], List[Floats2d]] = with_array(scorer)`
			`model: Model[InT, OutT] = chain(tok2vec, logistic_layer, flattener())`
			`model.set_ref("tok2vec", tok2vec)`
			`model.set_ref("scorer", scorer)`
			`model.set_ref("logistic_layer", logistic_layer)`

			`return model`


			`def flattener() -> Model[List[Floats2d], Floats2d]:`
			`"""Flattens the input to a 1-dimensional list of scores"""`

			`def forward(`
			`model: Model[Floats1d, Floats1d], X: List[Floats2d], is_train: bool`
			`) -> Tuple[Floats2d, Callable[[Floats2d], List[Floats2d]]]:`
			`lens = model.ops.asarray1i([len(doc) for doc in X])`
			`Y = model.ops.flatten(X)`

			`def backprop(dY: Floats2d) -> List[Floats2d]:`
			`return model.ops.unflatten(dY, lens)`

			`return Y, backprop`

			`return Model("Flattener", forward=forward)`