2019-02-10 11:14:51 +00:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
from collections import defaultdict
|
|
|
|
import srsly
|
|
|
|
|
|
|
|
from ..errors import Errors
|
|
|
|
from ..compat import basestring_
|
|
|
|
from ..util import ensure_path
|
|
|
|
from ..tokens import Span
|
|
|
|
from ..matcher import Matcher, PhraseMatcher
|
|
|
|
|
|
|
|
|
|
|
|
class EntityRuler(object):
    """Pipeline component that sets `doc.ents` from rule-based matches.

    Patterns are kept in two groups keyed by label: token patterns (lists of
    dicts, handled by a `Matcher`) and phrase patterns (strings, processed
    with the shared `nlp` object and handled by a `PhraseMatcher`).
    """

    name = "entity_ruler"

    def __init__(self, nlp, **cfg):
        """Initialise the entity ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
        key. A pattern can either be a token pattern (list) or a phrase pattern
        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.

        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
        patterns (iterable): Optional patterns to load in.
        overwrite_ents (bool): If existing entities are present, e.g. entities
            added by the model, overwrite them by matches if necessary.
        **cfg: Other config parameters. If pipeline component is loaded as part
            of a model pipeline, this will include all keyword arguments passed
            to `spacy.load`.
        RETURNS (EntityRuler): The newly constructed object.
        """
        self.nlp = nlp
        self.overwrite = cfg.get("overwrite_ents", False)
        # label -> list of patterns, in the order they were added
        self.token_patterns = defaultdict(list)
        self.phrase_patterns = defaultdict(list)
        self.matcher = Matcher(nlp.vocab)
        self.phrase_matcher = PhraseMatcher(nlp.vocab)
        patterns = cfg.get("patterns")
        if patterns is not None:
            self.add_patterns(patterns)

    def __len__(self):
        """The number of all patterns added to the entity ruler."""
        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
        return n_token_patterns + n_phrase_patterns

    def __contains__(self, label):
        """Whether a label is present in the patterns."""
        return label in self.token_patterns or label in self.phrase_patterns

    def __call__(self, doc):
        """Find matches in document and add them as entities.

        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.
        """
        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
        # Discard zero-length matches; the set drops exact duplicate
        # (match_id, start, end) triples.
        matches = set(
            (m_id, start, end) for m_id, start, end in matches if start != end
        )
        # Longest matches first. Because the sort is reversed, equally long
        # matches are visited from the highest start index downwards.
        matches = sorted(
            matches, key=lambda m: (m[2] - m[1], m[1]), reverse=True
        )
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            # Without overwrite, leave any span alone that touches a token
            # which already carries an entity type.
            if not self.overwrite and any(t.ent_type for t in doc[start:end]):
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                # The match ID is used directly as the span label.
                new_entities.append(Span(doc, start, end, label=match_id))
                # Remove previously set entities that overlap the new span.
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities
        return doc

    @property
    def labels(self):
        """All labels present in the match patterns.

        RETURNS (tuple): The string labels, without duplicates and in no
            particular order.
        """
        all_labels = set(self.token_patterns.keys())
        all_labels.update(self.phrase_patterns.keys())
        return tuple(all_labels)

    @property
    def patterns(self):
        """Get all patterns that were added to the entity ruler.

        RETURNS (list): The original patterns, one dictionary per pattern.
            Token patterns come first, followed by phrase patterns.
        """
        all_patterns = []
        for label, patterns in self.token_patterns.items():
            for pattern in patterns:
                all_patterns.append({"label": label, "pattern": pattern})
        for label, patterns in self.phrase_patterns.items():
            for pattern in patterns:
                # Phrase patterns are stored as the objects returned by
                # `self.nlp(...)`; export them via their `.text` again.
                all_patterns.append({"label": label, "pattern": pattern.text})
        return all_patterns

    def add_patterns(self, patterns):
        """Add patterns to the entity ruler. A pattern can either be a token
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}

        Only the patterns passed to this call are registered with the
        matchers, so repeated calls do not register earlier patterns again.
        All entries are validated before anything is stored.

        patterns (iterable): The patterns to add. Each entry needs a
            `"label"` and a `"pattern"` key.
        RAISES (ValueError): If a pattern is neither a list nor a string.
        """
        new_token_patterns = defaultdict(list)
        new_phrase_patterns = defaultdict(list)
        for entry in patterns:
            label = entry["label"]
            pattern = entry["pattern"]
            if isinstance(pattern, list):
                new_token_patterns[label].append(pattern)
            elif isinstance(pattern, basestring_):
                new_phrase_patterns[label].append(self.nlp(pattern))
            else:
                raise ValueError(Errors.E097.format(pattern=pattern))
        # Register with the matcher before recording, so a pattern the
        # matcher rejects is not reported by `self.patterns` afterwards.
        for label, added in new_token_patterns.items():
            self.matcher.add(label, None, *added)
            self.token_patterns[label].extend(added)
        for label, added in new_phrase_patterns.items():
            self.phrase_matcher.add(label, None, *added)
            self.phrase_patterns[label].extend(added)

    def from_bytes(self, patterns_bytes, **kwargs):
        """Load the entity ruler from a bytestring. The loaded patterns are
        added to any patterns that are already present.

        patterns_bytes (bytes): The bytestring to load.
        **kwargs: Other config parameters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        patterns = srsly.msgpack_loads(patterns_bytes)
        self.add_patterns(patterns)
        return self

    def to_bytes(self, **kwargs):
        """Serialize the entity ruler patterns to a bytestring.

        **kwargs: Other config parameters, mostly for consistency.
        RETURNS (bytes): The serialized patterns.
        """
        return srsly.msgpack_dumps(self.patterns)

    def from_disk(self, path, **kwargs):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line. The loaded
        patterns are added to any patterns that are already present.

        path (unicode / Path): The JSONL file to load. The suffix is always
            replaced with `.jsonl`.
        **kwargs: Other config parameters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        patterns = srsly.read_jsonl(path)
        self.add_patterns(patterns)
        return self

    def to_disk(self, path, **kwargs):
        """Save the entity ruler patterns to a file. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (unicode / Path): The JSONL file to save to. The suffix is
            always replaced with `.jsonl`.
        **kwargs: Other config parameters, mostly for consistency.
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        srsly.write_jsonl(path, self.patterns)
|