Fix spancat for empty docs and zero suggestions (#9654)

* Fix spancat for empty docs and zero suggestions * Use ops.xp.zeros in test
2021-11-15 12:40:55 +01:00 · 2021-11-15 12:40:55 +01:00 · c9baf9d196
parent 67d8c8a081
commit c9baf9d196
3 changed files with 39 additions and 4 deletions
--- a/spacy/ml/extract_spans.py
+++ b/spacy/ml/extract_spans.py
@ -28,7 +28,13 @@ def forward(
    X, spans = source_spans
    assert spans.dataXd.ndim == 2
    indices = _get_span_indices(ops, spans, X.lengths)
-    Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])  # type: ignore[arg-type, index]
+    if len(indices) > 0:
+        Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0])  # type: ignore[arg-type, index]
+    else:
+        Y = Ragged(
+            ops.xp.zeros(X.dataXd.shape, dtype=X.dataXd.dtype),
+            ops.xp.zeros((len(X.lengths),), dtype="i"),
+        )
    x_shape = X.dataXd.shape
    x_lengths = X.lengths

@ -53,7 +59,7 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
        for j in range(spans_i.shape[0]):
            indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1]))  # type: ignore[call-overload, index]
        offset += length
-    return ops.flatten(indices)
+    return ops.flatten(indices, dtype="i", ndim_if_empty=1)


 def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@ -78,7 +78,7 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
        if len(spans) > 0:
            output = Ragged(ops.xp.vstack(spans), lengths_array)
        else:
-            output = Ragged(ops.xp.zeros((0, 0)), lengths_array)
+            output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)

        assert output.dataXd.ndim == 2
        return output
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@ -1,7 +1,7 @@
 import pytest
 import numpy
 from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops
+from thinc.api import get_current_ops, Ragged

 from spacy import util
 from spacy.lang.en import English
@ -29,6 +29,7 @@ TRAIN_DATA_OVERLAPPING = [
        "I like London and Berlin",
        {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC"), (7, 24, "DOUBLE_LOC")]}},
    ),
+    ("", {"spans": {SPAN_KEY: []}}),
 ]


@ -365,3 +366,31 @@ def test_overfitting_IO_overlapping():
            "London and Berlin",
        }
        assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
+
+
+def test_zero_suggestions():
+    # Test with a suggester that returns 0 suggestions
+
+    @registry.misc("test_zero_suggester")
+    def make_zero_suggester():
+        def zero_suggester(docs, *, ops=None):
+            if ops is None:
+                ops = get_current_ops()
+            return Ragged(
+                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
+            )
+
+        return zero_suggester
+
+    fix_random_seed(0)
+    nlp = English()
+    spancat = nlp.add_pipe(
+        "spancat",
+        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+    )
+    train_examples = make_examples(nlp)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    assert spancat.model.get_dim("nO") == 2
+    assert set(spancat.labels) == {"LOC", "PERSON"}
+
+    nlp.update(train_examples, sgd=optimizer)