From 8b650f3a786094833cccd8686ab4d6d73330565c Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 17 Sep 2020 21:10:41 +0200
Subject: [PATCH] Modify setting missing and blocked entity tokens

In order to make it easier to construct `Doc` objects as training data,
modify how missing and blocked entity tokens are set to prioritize
setting `O` and missing entity tokens for training purposes over
setting blocked entity tokens.

* `Doc.ents` setter sets tokens outside entity spans to `O` regardless
  of the current state of each token
* For `Doc.ents`, setting a span with a missing label sets the
  `ent_iob` to missing instead of blocked
* `Doc.block_ents(spans)` marks spans as hard `O` for use with the
  `EntityRecognizer`
---
 spacy/tests/doc/test_doc_api.py | 18 ++++++++++++++++--
 spacy/tests/parser/test_ner.py  |  4 ++--
 spacy/tokens/doc.pyx            | 25 +++++++++++++++++++------
 spacy/training/example.pyx      |  4 +---
 spacy/training/iob_utils.py     | 12 ++++--------
 5 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index ce979d3d1..53c309ba5 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -137,7 +137,7 @@ def test_doc_api_set_ents(en_tokenizer):
     assert len(tokens.ents) == 0
     tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
     assert len(list(tokens.ents)) == 1
-    assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
+    assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2]
     assert tokens.ents[0].label_ == "PRODUCT"
     assert tokens.ents[0].start == 2
     assert tokens.ents[0].end == 4
@@ -426,7 +426,7 @@ def test_has_annotation(en_vocab):
     doc[0].lemma_ = "a"
     doc[0].dep_ = "dep"
     doc[0].head = doc[1]
-    doc.ents = [Span(doc, 0, 1, label="HELLO")]
+    doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")]
 
     for attr in attrs:
         assert doc.has_annotation(attr)
@@ -454,3 +454,17 @@ def test_is_flags_deprecated(en_tokenizer):
         doc.is_nered
     with pytest.deprecated_call():
         doc.is_sentenced
+
+
+def test_block_ents(en_tokenizer):
+    doc = en_tokenizer("a b c d e")
+    doc.block_ents([doc[1:2], doc[3:5]])
+    assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
+    assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
+    assert doc.ents == tuple()
+
+    # invalid IOB repaired
+    doc.ents = [Span(doc, 3, 5, "ENT")]
+    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
+    doc.block_ents([doc[3:4]])
+    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 548cd2697..b8fdf15f9 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -168,7 +168,7 @@ def test_accept_blocked_token():
     ner2 = nlp2.create_pipe("ner", config=config)
 
     # set "New York" to a blocked entity
-    doc2.ents = [(0, 3, 5)]
+    doc2.block_ents([doc2[3:5]])
     assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
     assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
 
@@ -358,5 +358,5 @@ class BlockerComponent1:
         self.name = name
 
     def __call__(self, doc):
-        doc.ents = [(0, self.start, self.end)]
+        doc.block_ents([doc[self.start:self.end]])
         return doc
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 5c5443258..1bae84508 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -590,17 +590,16 @@ cdef class Doc:
                 entity_type = 0
                 kb_id = 0
 
-                # Set ent_iob to Missing (0) by default unless this token was nered before
-                ent_iob = 0
-                if self.c[i].ent_iob != 0:
-                    ent_iob = 2
+                # Set ent_iob to Outside (2) by default
+                ent_iob = 2
 
                 # overwrite if the token was part of a specified entity
                 if i in tokens_in_ents.keys():
                     ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
                     if entity_type is None or entity_type <= 0:
-                        # Blocking this token from being overwritten by downstream NER
-                        ent_iob = 3
+                        # Empty label: set the token to missing instead of blocked
+                        ent_iob = 0
+                        entity_type = 0
                     elif ent_start == i:
                         # Marking the start of an entity
                         ent_iob = 3
@@ -612,6 +611,20 @@ cdef class Doc:
                 self.c[i].ent_type = entity_type
                 self.c[i].ent_kb_id = kb_id
                 self.c[i].ent_iob = ent_iob
+    def block_ents(self, spans):
+        """Mark spans as never an entity for the EntityRecognizer.
+
+        spans (List[Span]): The spans to block as never entities.
+        """
+        for span in spans:
+            for i in range(span.start, span.end):
+                self.c[i].ent_iob = 3
+                self.c[i].ent_type = 0
+            # if the following token is I, set it to B
+            if span.end < self.length:
+                if self.c[span.end].ent_iob == 1:
+                    self.c[span.end].ent_iob = 3
+
     @property
     def noun_chunks(self):
         """Iterate over the base noun phrases in the document. Yields base
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 3344704bf..d396a2040 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -172,7 +172,7 @@ cdef class Example:
         return output
 
     def get_aligned_ner(self):
-        if not self.y.is_nered:
+        if not self.y.has_annotation("ENT_IOB"):
             return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
         x_ents = self.get_aligned_spans_y2x(self.y.ents)
         # Default to 'None' for missing values
@@ -303,9 +303,7 @@
             spans_from_biluo_tags(doc, ner_data)
         )
     elif isinstance(ner_data[0], Span):
-        # Ugh, this is super messy. Really hard to set O entities
         doc.ents = ner_data
-        doc.ents = [span for span in ner_data if span.label_]
     else:
         raise ValueError(Errors.E973)
 
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index ceb5e16b8..33a4733ca 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -182,22 +182,18 @@ def tags_to_entities(tags):
     entities = []
     start = None
     for i, tag in enumerate(tags):
-        if tag is None:
-            continue
-        if tag.startswith("O"):
+        if tag is None or tag.startswith("-"):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            if start is not None:
                start = None
            else:
                entities.append(("", i, i))
-            continue
-        elif tag == "-":
-            continue
+        elif tag.startswith("O"):
+            pass
         elif tag.startswith("I"):
             if start is None:
                 raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
-            continue
-        if tag.startswith("U"):
+        elif tag.startswith("U"):
             entities.append((tag[2:], i, i))
         elif tag.startswith("B"):
             start = i
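
A rough sketch of the intended usage after this patch (the sentence,
spans, and labels below are invented for illustration; `has_annotation`
is the Doc method exercised in the tests above):

    from spacy.tokens import Doc, Span
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["I", "like", "New", "York", "City", "."])

    # The ents setter now marks every token outside the given spans as
    # O (ent_iob 2), regardless of its previous state.
    doc.ents = [Span(doc, 2, 5, label="GPE")]
    assert [t.ent_iob_ for t in doc] == ["O", "O", "B", "I", "I", "O"]
    assert doc.has_annotation("ENT_IOB")

    # A span with an empty label sets its tokens to missing (ent_iob 0,
    # rendered as "") instead of blocked.
    doc.ents = [Span(doc, 2, 5, label="GPE"), Span(doc, 0, 1, label="")]
    assert doc[0].ent_iob_ == ""

    # block_ents marks tokens as hard O: B with no entity type, which
    # the EntityRecognizer will not overwrite.
    doc.block_ents([doc[5:6]])
    assert doc[5].ent_iob_ == "B" and doc[5].ent_type_ == ""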
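
Similarly, a small sketch of the reworked `tags_to_entities` handling
(the tag sequence is invented; the `L-` branch and the inclusive end
offsets come from the surrounding function, outside this hunk):

    from spacy.training.iob_utils import tags_to_entities

    # "O" is now simply skipped; None and "-" (missing) either close an
    # open entity without emitting it or yield an unlabeled placeholder.
    tags = ["O", "B-PERSON", "L-PERSON", "-", "U-GPE"]
    assert tags_to_entities(tags) == [
        ("PERSON", 1, 2),  # B..L span, end index inclusive
        ("", 3, 3),        # placeholder for the stray "-"
        ("GPE", 4, 4),     # single-token U entity
    ]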