spaCy/spacy/tests/regression/test_issue7065.py

from spacy.kb import KnowledgeBase
from spacy.lang.en import English
from spacy.training import Example


def test_issue7065():
    text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival."
    nlp = English()
    nlp.add_pipe("sentencizer")
    ruler = nlp.add_pipe("entity_ruler")
    patterns = [
        {
            "label": "THING",
            "pattern": [
                {"LOWER": "symphony"},
                {"LOWER": "no"},
                {"LOWER": "."},
                {"LOWER": "8"},
            ],
        }
    ]
    ruler.add_patterns(patterns)

    doc = nlp(text)
    sentences = [s for s in doc.sents]
    assert len(sentences) == 2
    sent0 = sentences[0]
    ent = doc.ents[0]
    assert ent.start < sent0.end < ent.end
    assert sentences.index(ent.sent) == 0


def test_issue7065_b():
    # Test that the NEL doesn't crash when an entity crosses a sentence boundary
    nlp = English()
    vector_length = 3
    nlp.add_pipe("sentencizer")

    text = "Mahler 's Symphony No. 8 was beautiful."
    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
    links = {
        (0, 6): {"Q7304": 1.0, "Q270853": 0.0},
        (10, 24): {"Q7304": 0.0, "Q270853": 1.0},
    }
    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
    doc = nlp(text)
    example = Example.from_dict(
        doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
    )
    train_examples = [example]

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="No. 8",
            entities=["Q270853"],
            probabilities=[1.0],
        )
        mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias(
            alias="Mahler",
            entities=["Q7304"],
            probabilities=[1.0],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)

    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # Add a custom rule-based component to mimick NER
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
        {
            "label": "WORK",
            "pattern": [
                {"LOWER": "symphony"},
                {"LOWER": "no"},
                {"LOWER": "."},
                {"LOWER": "8"},
            ],
        },
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)

    # test the trained model - this should not throw E148
    doc = nlp(text)
    assert doc
Bugfix/nel crossing sentence (#7630) * ensure each entity gets a KB ID, even when it's not within a sentence * cleanup 2021-04-12 08:08:01 +00:00			`from spacy.kb import KnowledgeBase`
span.ent only returns first sentence (#7084) * return first sentence when span contains sentence boundary * docs fix * small fixes * cleanup 2021-02-19 12:02:38 +00:00			`from spacy.lang.en import English`
Bugfix/nel crossing sentence (#7630) * ensure each entity gets a KB ID, even when it's not within a sentence * cleanup 2021-04-12 08:08:01 +00:00			`from spacy.training import Example`
span.ent only returns first sentence (#7084) * return first sentence when span contains sentence boundary * docs fix * small fixes * cleanup 2021-02-19 12:02:38 +00:00

			`def test_issue7065():`
			`text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival."`
			`nlp = English()`
			`nlp.add_pipe("sentencizer")`
			`ruler = nlp.add_pipe("entity_ruler")`
Tidy up code 2021-06-28 09:48:00 +00:00			`patterns = [`
			`{`
			`"label": "THING",`
			`"pattern": [`
			`{"LOWER": "symphony"},`
			`{"LOWER": "no"},`
			`{"LOWER": "."},`
			`{"LOWER": "8"},`
			`],`
			`}`
			`]`
span.ent only returns first sentence (#7084) * return first sentence when span contains sentence boundary * docs fix * small fixes * cleanup 2021-02-19 12:02:38 +00:00			`ruler.add_patterns(patterns)`

			`doc = nlp(text)`
			`sentences = [s for s in doc.sents]`
			`assert len(sentences) == 2`
			`sent0 = sentences[0]`
			`ent = doc.ents[0]`
			`assert ent.start < sent0.end < ent.end`
			`assert sentences.index(ent.sent) == 0`
Bugfix/nel crossing sentence (#7630) * ensure each entity gets a KB ID, even when it's not within a sentence * cleanup 2021-04-12 08:08:01 +00:00

			`def test_issue7065_b():`
			`# Test that the NEL doesn't crash when an entity crosses a sentence boundary`
			`nlp = English()`
			`vector_length = 3`
			`nlp.add_pipe("sentencizer")`

			`text = "Mahler 's Symphony No. 8 was beautiful."`
			`entities = [(0, 6, "PERSON"), (10, 24, "WORK")]`
Tidy up code 2021-06-28 09:48:00 +00:00			`links = {`
			`(0, 6): {"Q7304": 1.0, "Q270853": 0.0},`
			`(10, 24): {"Q7304": 0.0, "Q270853": 1.0},`
			`}`
Bugfix/nel crossing sentence (#7630) * ensure each entity gets a KB ID, even when it's not within a sentence * cleanup 2021-04-12 08:08:01 +00:00			`sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]`
			`doc = nlp(text)`
Tidy up code 2021-06-28 09:48:00 +00:00			`example = Example.from_dict(`
			`doc, {"entities": entities, "links": links, "sent_starts": sent_starts}`
			`)`
Bugfix/nel crossing sentence (#7630) * ensure each entity gets a KB ID, even when it's not within a sentence * cleanup 2021-04-12 08:08:01 +00:00			`train_examples = [example]`

			`def create_kb(vocab):`
			`# create artificial KB`
			`mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)`
			`mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])`
			`mykb.add_alias(`
			`alias="No. 8",`
			`entities=["Q270853"],`
			`probabilities=[1.0],`
			`)`
			`mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])`
			`mykb.add_alias(`
			`alias="Mahler",`
			`entities=["Q7304"],`
			`probabilities=[1.0],`
			`)`
			`return mykb`

			`# Create the Entity Linker component and add it to the pipeline`
			`entity_linker = nlp.add_pipe("entity_linker", last=True)`
			`entity_linker.set_kb(create_kb)`

			`# train the NEL pipe`
			`optimizer = nlp.initialize(get_examples=lambda: train_examples)`
			`for i in range(2):`
			`losses = {}`
			`nlp.update(train_examples, sgd=optimizer, losses=losses)`

			`# Add a custom rule-based component to mimick NER`
			`patterns = [`
			`{"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},`
Tidy up code 2021-06-28 09:48:00 +00:00			`{`
			`"label": "WORK",`
			`"pattern": [`
			`{"LOWER": "symphony"},`
			`{"LOWER": "no"},`
			`{"LOWER": "."},`
			`{"LOWER": "8"},`
			`],`
			`},`
Bugfix/nel crossing sentence (#7630) * ensure each entity gets a KB ID, even when it's not within a sentence * cleanup 2021-04-12 08:08:01 +00:00			`]`
			`ruler = nlp.add_pipe("entity_ruler", before="entity_linker")`
			`ruler.add_patterns(patterns)`

			`# test the trained model - this should not throw E148`
			`doc = nlp(text)`
			`assert doc`