From 821f37254cf1caca8f943574b4cbaaaea4cfb251 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 26 Sep 2020 00:19:53 +0200 Subject: [PATCH] Fix attributeruler --- spacy/pipeline/attributeruler.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 52f8b7ece..e1ad91340 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -80,11 +80,14 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#call """ matches = sorted(self.matcher(doc, allow_missing=True)) + print("Attrs", self.attrs) + print("Matches", matches) for match_id, start, end in matches: span = Span(doc, start, end, label=match_id) - attrs = self.attrs[span.label] - index = self.indices[span.label] + attr_id = _parse_key(span.label_) + attrs = self.attrs[attr_id] + index = self.indices[attr_id] try: token = span[index] except IndexError: @@ -173,9 +176,10 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#add """ - # This needs to be a string, because otherwise it's interpreted as a - # string key. - self.matcher.add(f"attr_rules_{len(self.attrs)}", patterns) + # We need to make a string here, because otherwise the ID we pass back + # will be interpreted as the hash of a string, rather than an ordinal. + key = _make_key(len(self.attrs)) + self.matcher.add(self.vocab.strings.add(key), patterns) self._attrs_unnormed.append(attrs) attrs = normalize_token_attrs(self.vocab, attrs) self.attrs.append(attrs) @@ -199,7 +203,7 @@ class AttributeRuler(Pipe): all_patterns = [] for i in range(len(self.attrs)): p = {} - p["patterns"] = self.matcher.get(i)[1] + p["patterns"] = self.matcher.get(_make_key(i))[1] p["attrs"] = self._attrs_unnormed[i] p["index"] = self.indices[i] all_patterns.append(p) @@ -303,6 +307,12 @@ class AttributeRuler(Pipe): return self +def _make_key(n_attr): + return f"attr_rule_{n_attr}" + +def _parse_key(key): + return int(key.rsplit("_", 1)[1]) + def _split_morph_attrs(attrs): """Split entries from a tag map or morph rules dict into to two dicts, one