# spaCy/examples/pipeline/fix_space_entities.py

#!/usr/bin/env python
# coding: utf8
"""Demonstrate adding a rule-based component that forces some tokens to not
be entities, before the NER tagger is applied. This is used to hotfix the issue
in https://github.com/explosion/spaCy/issues/2870, present as of spaCy v2.0.16.

Compatible with: spaCy v2.0.0+
Last tested with: v2.1.0
"""
from __future__ import unicode_literals

import spacy
from spacy.attrs import ENT_IOB


def fix_space_tags(doc):
    """Mark every whitespace token in ``doc`` as lying outside any entity.

    The ENT_IOB column is read from the doc, the entries that belong to
    whitespace tokens are overwritten with the 'O' code, and the column is
    written back onto the doc.

    doc: the document to patch; it is modified in place.
    RETURNS: the same ``doc`` object that was passed in.
    """
    iob_column = doc.to_array([ENT_IOB])
    # ENT_IOB codes: 0 means "not set", 1 is 'I', 2 is 'O'.
    outside_tag = 2
    whitespace_positions = (idx for idx, tok in enumerate(doc) if tok.is_space)
    for idx in whitespace_positions:
        iob_column[idx] = outside_tag
    # from_array is handed a 2-D array here: one row per token, one column
    # for the single attribute being written.
    n_tokens = len(doc)
    doc.from_array([ENT_IOB], iob_column.reshape((n_tokens, 1)))
    return doc


def main():
    """Print the entities of a sample sentence before and after the hotfix.

    Loads the "en_core_web_sm" model, processes a sentence that contains a
    long run of spaces, prints the resulting entities, then registers
    ``fix_space_tags`` in front of the "ner" component and prints the entities
    produced by the patched pipeline for the same sentence.
    """
    nlp = spacy.load("en_core_web_sm")
    sample = "This is some crazy test where I dont need an Apple                Watch to make things bug"
    # Baseline: entities from the unmodified pipeline.
    print("Before", nlp(sample).ents)
    # The component must run before "ner" so the tagger sees the preset tags.
    nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner")
    print("After", nlp(sample).ents)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()