2020-08-04 15:02:39 +00:00
|
|
|
import srsly
|
|
|
|
from typing import List, Dict, Union, Iterable, Any, Optional
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
from .pipe import Pipe
|
|
|
|
from ..errors import Errors
|
2020-09-09 08:31:03 +00:00
|
|
|
from ..training import validate_examples
|
2020-08-04 15:02:39 +00:00
|
|
|
from ..language import Language
|
|
|
|
from ..matcher import Matcher
|
2020-08-26 13:39:30 +00:00
|
|
|
from ..scorer import Scorer
|
|
|
|
from ..symbols import IDS, TAG, POS, MORPH, LEMMA
|
2020-08-04 15:02:39 +00:00
|
|
|
from ..tokens import Doc, Span
|
|
|
|
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
|
|
|
|
from ..vocab import Vocab
|
2020-08-29 13:20:11 +00:00
|
|
|
from ..util import SimpleFrozenList
|
2020-08-04 15:02:39 +00:00
|
|
|
from .. import util
|
|
|
|
|
|
|
|
|
|
|
|
# A single Matcher pattern: a list of dicts with int or str keys, e.g. the
# `[{"TAG": tag}]` pattern built in AttributeRuler.load_from_tag_map.
MatcherPatternType = List[Dict[Union[int, str], Any]]
|
|
|
|
# One entry accepted by AttributeRuler.add_patterns: a dict whose keys are the
# arguments to AttributeRuler.add (`patterns`/`attrs`/`index`).
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
|
|
|
|
|
|
|
|
|
2020-08-07 12:43:55 +00:00
|
|
|
@Language.factory(
    "attribute_ruler",
    default_config={"pattern_dicts": None, "validate": False},
)
def make_attribute_ruler(
    nlp: Language,
    name: str,
    pattern_dicts: Optional[Iterable[AttributeRulerPatternType]],
    validate: bool,
):
    """Factory registered under the name "attribute_ruler".

    nlp (Language): The pipeline object; only its vocab is used here.
    name (str): The component name, forwarded to the AttributeRuler.
    pattern_dicts (Optional[Iterable[Dict]]): Pattern dicts forwarded to the
        AttributeRuler, or None (the config default) for no initial patterns.
    validate (bool): Forwarded to the AttributeRuler (config default: False).
    RETURNS (AttributeRuler): The newly constructed component.
    """
    ruler = AttributeRuler(
        nlp.vocab,
        name,
        pattern_dicts=pattern_dicts,
        validate=validate,
    )
    return ruler
|
2020-08-04 15:02:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
class AttributeRuler(Pipe):
    """Set token-level attributes for tokens matched by Matcher patterns.
    Additionally supports importing patterns from tag maps and morph rules.

    Internally the ruler keeps three parallel lists (`attrs`,
    `_attrs_unnormed`, `indices`); the position of an entry in those lists,
    converted to a string, is the key under which its patterns are registered
    in the Matcher.

    DOCS: https://nightly.spacy.io/api/attributeruler
    """

    def __init__(
        self,
        vocab: Vocab,
        name: str = "attribute_ruler",
        *,
        pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
        validate: bool = False,
    ) -> None:
        """Initialize the AttributeRuler.

        vocab (Vocab): The vocab.
        name (str): The pipe name. Defaults to "attribute_ruler".
        pattern_dicts (Iterable[Dict]): A list of pattern dicts with the keys as
            the arguments to AttributeRuler.add (`patterns`/`attrs`/`index`) to
            add as patterns.
        validate (bool): Passed on to the Matcher as its `validate` argument.
            Defaults to False.

        RETURNS (AttributeRuler): The AttributeRuler component.

        DOCS: https://nightly.spacy.io/api/attributeruler#init
        """
        self.name = name
        self.vocab = vocab
        self.matcher = Matcher(self.vocab, validate=validate)
        self.attrs = []
        self._attrs_unnormed = []  # store for reference
        self.indices = []

        if pattern_dicts:
            self.add_patterns(pattern_dicts)

    def __call__(self, doc: Doc) -> Doc:
        """Apply the AttributeRuler to a Doc and set all attribute exceptions.

        doc (Doc): The document to process.
        RETURNS (Doc): The processed Doc.
        RAISES (ValueError): With Errors.E1001 if the index stored for a rule
            is out of bounds for the span that the rule matched.

        DOCS: https://nightly.spacy.io/api/attributeruler#call
        """
        # Sort by the attribute ID, so that later rules have precedence.
        # The match ID is the hash of the stringified ordinal set in `add`,
        # so it is resolved back to a string and then to an int.
        matches = sorted(
            (int(self.vocab.strings[m_id]), m_id, s, e)
            for m_id, s, e in self.matcher(doc, allow_missing=True)
        )
        for attr_id, match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
            attrs = self.attrs[attr_id]
            index = self.indices[attr_id]
            try:
                # The index can be negative, which makes it annoying to do
                # the boundscheck. Let Span do it instead.
                token = span[index]
            except IndexError:
                # The original exception is just our conditional logic, so we
                # raise from.
                raise ValueError(
                    Errors.E1001.format(
                        patterns=self.matcher.get(span.label),
                        span=[t.text for t in span],
                        index=index,
                    )
                ) from None
            set_token_attrs(token, attrs)
        return doc

    def pipe(self, stream, *, batch_size=128):
        """Apply the pipe to a stream of documents. This usually happens under
        the hood when the nlp object is called on a text and all components are
        applied to the Doc.

        stream (Iterable[Doc]): A stream of documents.
        batch_size (int): The number of documents to buffer. Accepted as part
            of the signature but not used by this implementation, which
            processes one document at a time.
        YIELDS (Doc): Processed documents in order.

        DOCS: https://nightly.spacy.io/api/attributeruler#pipe
        """
        for doc in stream:
            yield self(doc)

    def load_from_tag_map(
        self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]
    ) -> None:
        """Load attribute ruler patterns from a tag map.

        One rule is added per tag, matching tokens by `TAG`. A "MORPH" entry
        is always set on the rule: from the entry's own "MORPH" value if it
        has one, otherwise from the remaining (non token-level) features.

        tag_map (dict): The tag map that maps fine-grained tags to
            coarse-grained tags and morphological features.

        DOCS: https://nightly.spacy.io/api/attributeruler#load_from_tag_map
        """
        for tag, tag_attrs in tag_map.items():
            pattern = [{"TAG": tag}]
            attrs, morph_attrs = _split_morph_attrs(tag_attrs)
            morph_source = attrs["MORPH"] if "MORPH" in attrs else morph_attrs
            morph = self.vocab.morphology.add(morph_source)
            attrs["MORPH"] = self.vocab.strings[morph]
            self.add([pattern], attrs)

    def load_from_morph_rules(
        self, morph_rules: Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
    ) -> None:
        """Load attribute ruler patterns from morph rules.

        One rule is added per (tag, word) pair, matching tokens by `ORTH` and
        `TAG`. Unlike load_from_tag_map, "MORPH" is only set when the entry
        has a "MORPH" value or at least one remaining morphological feature.

        morph_rules (dict): The morph rules that map token text and
            fine-grained tags to coarse-grained tags, lemmas and morphological
            features.

        DOCS: https://nightly.spacy.io/api/attributeruler#load_from_morph_rules
        """
        for tag, word_rules in morph_rules.items():
            for word, word_attrs in word_rules.items():
                pattern = [{"ORTH": word, "TAG": tag}]
                attrs, morph_attrs = _split_morph_attrs(word_attrs)
                if "MORPH" in attrs:
                    morph = self.vocab.morphology.add(attrs["MORPH"])
                    attrs["MORPH"] = self.vocab.strings[morph]
                elif morph_attrs:
                    morph = self.vocab.morphology.add(morph_attrs)
                    attrs["MORPH"] = self.vocab.strings[morph]
                self.add([pattern], attrs)

    def add(
        self, patterns: Iterable[MatcherPatternType], attrs: Dict, index: int = 0
    ) -> None:
        """Add Matcher patterns for tokens that should be modified with the
        provided attributes. The token at the specified index within the
        matched span will be assigned the attributes.

        patterns (Iterable[List[Dict]]): A list of Matcher patterns.
        attrs (Dict): The attributes to assign to the target token in the
            matched span.
        index (int): The index of the token in the matched span to modify. May
            be negative to index from the end of the span. Defaults to 0.

        DOCS: https://nightly.spacy.io/api/attributeruler#add
        """
        # Normalize before touching any state: if this (or Matcher.add below)
        # raises, the matcher and the three parallel lists stay aligned and
        # the next call does not reuse a half-registered key.
        normed_attrs = normalize_token_attrs(self.vocab, attrs)
        # We need to make a string here, because otherwise the ID we pass back
        # will be interpreted as the hash of a string, rather than an ordinal.
        key = str(len(self.attrs))
        self.matcher.add(self.vocab.strings.add(key), patterns)
        self._attrs_unnormed.append(attrs)
        self.attrs.append(normed_attrs)
        self.indices.append(index)

    def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None:
        """Add patterns from a list of pattern dicts with the keys as the
        arguments to AttributeRuler.add.

        pattern_dicts (Iterable[dict]): A list of pattern dicts with the keys
            as the arguments to AttributeRuler.add (patterns/attrs/index) to
            add as patterns.

        DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns
        """
        for p in pattern_dicts:
            self.add(**p)

    @property
    def patterns(self) -> List[AttributeRulerPatternType]:
        """All the added patterns, as dicts with the keys `patterns`, `attrs`
        (the attrs as originally passed to `add`, not the normalized form) and
        `index`, in the order they were added."""
        all_patterns = []
        for i, (attrs, index) in enumerate(zip(self._attrs_unnormed, self.indices)):
            all_patterns.append(
                {
                    # Matcher.get is indexed at [1] to pick out the patterns.
                    "patterns": self.matcher.get(str(i))[1],
                    "attrs": attrs,
                    "index": index,
                }
            )
        return all_patterns

    def score(self, examples, **kwargs):
        """Score a batch of examples.

        examples (Iterable[Example]): The examples to score.
        **kwargs: Passed through to Scorer.score_token_attr.
        RETURNS (Dict[str, Any]): The scores, produced by
            Scorer.score_token_attr for the attributes "tag", "pos", "morph"
            and "lemma" for the target token attributes. Only attributes that
            at least one added rule sets are scored.

        DOCS: https://nightly.spacy.io/api/attributeruler#score
        """
        validate_examples(examples, "AttributeRuler.score")
        scored_attrs = {TAG: "tag", POS: "pos", MORPH: "morph", LEMMA: "lemma"}
        results = {}
        # Collect every attribute key used by any rule.
        attrs = set()
        for token_attrs in self.attrs:
            attrs.update(token_attrs)
        for attr in attrs:
            attr_name = scored_attrs.get(attr)
            if attr_name is not None:
                results.update(Scorer.score_token_attr(examples, attr_name, **kwargs))
        return results

    def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
        """Serialize the AttributeRuler to a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized object.

        DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes
        """
        serialize = {
            "vocab": self.vocab.to_bytes,
            "patterns": lambda: srsly.msgpack_dumps(self.patterns),
        }
        return util.to_bytes(serialize, exclude)

    def from_bytes(
        self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "AttributeRuler":
        """Load the AttributeRuler from a bytestring.

        Loaded patterns are passed to add_patterns, i.e. they are added on top
        of any patterns the ruler already holds.

        bytes_data (bytes): The data to load.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (AttributeRuler): The loaded object.

        DOCS: https://nightly.spacy.io/api/attributeruler#from_bytes
        """

        def load_patterns(b):
            self.add_patterns(srsly.msgpack_loads(b))

        deserialize = {
            "vocab": lambda b: self.vocab.from_bytes(b),
            "patterns": load_patterns,
        }
        util.from_bytes(bytes_data, deserialize, exclude)

        return self

    def to_disk(
        self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Serialize the AttributeRuler to disk.

        path (Union[Path, str]): A path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.

        DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
        """
        serialize = {
            "vocab": lambda p: self.vocab.to_disk(p),
            "patterns": lambda p: srsly.write_msgpack(p, self.patterns),
        }
        util.to_disk(path, serialize, exclude)

    def from_disk(
        self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
    ) -> "AttributeRuler":
        """Load the AttributeRuler from disk.

        Loaded patterns are passed to add_patterns, i.e. they are added on top
        of any patterns the ruler already holds.

        path (Union[Path, str]): A path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (AttributeRuler): The loaded object.

        DOCS: https://nightly.spacy.io/api/attributeruler#from_disk
        """

        def load_patterns(p):
            self.add_patterns(srsly.read_msgpack(p))

        deserialize = {
            "vocab": lambda p: self.vocab.from_disk(p),
            "patterns": load_patterns,
        }
        util.from_disk(path, deserialize, exclude)

        return self
|
|
|
|
|
|
|
|
|
|
|
|
def _split_morph_attrs(attrs):
|
|
|
|
"""Split entries from a tag map or morph rules dict into to two dicts, one
|
|
|
|
with the token-level features (POS, LEMMA) and one with the remaining
|
|
|
|
features, which are presumed to be individual MORPH features."""
|
|
|
|
other_attrs = {}
|
|
|
|
morph_attrs = {}
|
|
|
|
for k, v in attrs.items():
|
|
|
|
if k in "_" or k in IDS.keys() or k in IDS.values():
|
|
|
|
other_attrs[k] = v
|
|
|
|
else:
|
|
|
|
morph_attrs[k] = v
|
|
|
|
return other_attrs, morph_attrs
|