Update Tokenizer.explain with special matches (#7749)

* Update Tokenizer.explain with special matches

Update `Tokenizer.explain` and the pseudo-code in the docs to include
the processing of special cases that contain affixes or whitespace.

* Handle optional settings in explain

* Add test for special matches in explain

Add test for `Tokenizer.explain` for special cases containing affixes.
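
The practical effect is that `Tokenizer.explain` now reports the same tokens as the tokenizer itself when a special case interacts with affix or infix rules. A minimal sketch mirroring the new test below; the expected outputs in the comments are illustrative:

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

vocab = spacy.blank("en").vocab
# A suffix rule that strips a trailing period, an infix rule that splits on
# "/", and a special case "a." that keeps the period attached.
tokenizer = Tokenizer(
    vocab,
    rules={"a.": [{"ORTH": "a."}]},
    suffix_search=re.compile(r"[\.]$").search,
    infix_finditer=re.compile(r"[/]").finditer,
)
print([t.text for t in tokenizer("a/a.")])        # expected: ['a', '/', 'a.']
print([t[1] for t in tokenizer.explain("a/a.")])  # now matches: ['a', '/', 'a.']
```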
Adriane Boyd 2021-04-19 11:08:20 +02:00 committed by GitHub
parent 07b41c38ae
commit 0e7f94b247
3 changed files with 59 additions and 4 deletions


@@ -1,5 +1,7 @@
import pytest
import re
from spacy.util import get_lang_class
from spacy.tokenizer import Tokenizer

# Only include languages with no external dependencies
# "is" seems to confuse importlib, so we're also excluding it for now
@@ -60,3 +62,18 @@ def test_tokenizer_explain(lang):
    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
    debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
    assert tokens == debug_tokens


def test_tokenizer_explain_special_matcher(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    infix_re = re.compile(r"[/]")
    rules = {"a.": [{"ORTH": "a."}]}
    tokenizer = Tokenizer(
        en_vocab,
        rules=rules,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
    )
    tokens = [t.text for t in tokenizer("a/a.")]
    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
    assert tokens == explain_tokens


@@ -20,11 +20,12 @@ from .attrs import intify_attrs
from .symbols import ORTH, NORM
from .errors import Errors, Warnings
from . import util
from .util import registry, get_words_and_spaces
from .attrs import intify_attrs
from .symbols import ORTH
from .scorer import Scorer
from .training import validate_examples
from .tokens import Span


cdef class Tokenizer:
@@ -638,8 +639,14 @@ cdef class Tokenizer:
        DOCS: https://spacy.io/api/tokenizer#explain
        """
        prefix_search = self.prefix_search
        if prefix_search is None:
            prefix_search = re.compile("a^").search
        suffix_search = self.suffix_search
        if suffix_search is None:
            suffix_search = re.compile("a^").search
        infix_finditer = self.infix_finditer
        if infix_finditer is None:
            infix_finditer = re.compile("a^").finditer
        token_match = self.token_match
        if token_match is None:
            token_match = re.compile("a^").match
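
The fallbacks added here rely on a regular expression that can never match, so `explain` keeps working when `prefix_search`, `suffix_search`, or `infix_finditer` is `None`. A quick illustration of why `"a^"` never matches:

```python
import re

# "a^" requires a literal "a" followed by the start of the string,
# which is impossible, so this pattern matches nothing.
never_match = re.compile("a^")
assert never_match.search("a^b") is None
assert list(never_match.finditer("a^b")) == []
assert never_match.match("a^b") is None
```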
@@ -687,7 +694,7 @@
                    tokens.append(("URL_MATCH", substring))
                    substring = ''
                elif substring in special_cases:
                    tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                    substring = ''
                elif list(infix_finditer(substring)):
                    infixes = infix_finditer(substring)
@@ -705,7 +712,33 @@
                    tokens.append(("TOKEN", substring))
                    substring = ''
            tokens.extend(reversed(suffixes))
        # Find matches for special cases handled by special matcher
        words, spaces = get_words_and_spaces([t[1] for t in tokens], text)
        t_words = []
        t_spaces = []
        for word, space in zip(words, spaces):
            if not word.isspace():
                t_words.append(word)
                t_spaces.append(space)
        doc = Doc(self.vocab, words=t_words, spaces=t_spaces)
        matches = self._special_matcher(doc)
        spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches]
        spans = util.filter_spans(spans)
        # Replace matched tokens with their exceptions
        i = 0
        final_tokens = []
        spans_by_start = {s.start: s for s in spans}
        while i < len(tokens):
            if i in spans_by_start:
                span = spans_by_start[i]
                exc = [d[ORTH] for d in special_cases[span.label_]]
                for j, orth in enumerate(exc):
                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
                i += len(span)
            else:
                final_tokens.append(tokens[i])
                i += 1
        return final_tokens

    def score(self, examples, **kwargs):
        validate_examples(examples, "Tokenizer.score")
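
`util.filter_spans` is what resolves overlapping special-case matches above: it keeps the longest spans and, on ties, the ones that start earlier. A small standalone illustration with made-up spans:

```python
import spacy
from spacy.tokens import Span
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("one two three four")
overlapping = [Span(doc, 0, 3, label="LONG"), Span(doc, 2, 4, label="SHORT")]
# The longer span wins; the shorter overlapping span is dropped.
print([(s.text, s.label_) for s in filter_spans(overlapping)])
# expected: [('one two three', 'LONG')]
```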


@@ -786,6 +786,7 @@ rather than performance:

```python
def tokenizer_pseudo_code(
    text,
    special_cases,
    prefix_search,
    suffix_search,
@@ -839,12 +840,14 @@ def tokenizer_pseudo_code(
            tokens.append(substring)
            substring = ""
        tokens.extend(reversed(suffixes))
    for match in matcher(special_cases, text):
        tokens.replace(match, special_cases[match])
    return tokens
```

The algorithm can be summarized as follows:

1. Iterate over space-separated substrings.
2. Look for a token match. If there is a match, stop processing and keep this
   token.
3. Check whether we have an explicitly defined special case for this substring.
@@ -858,6 +861,8 @@ The algorithm can be summarized as follows:
8. Look for "infixes" stuff like hyphens etc. and split the substring into
   tokens on all infixes.
9. Once we can't consume any more of the string, handle it as a single token.
10. Make a final pass over the text to check for special cases that include
    spaces or that were missed due to the incremental processing of affixes.
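
The final pass in step 10 corresponds to the `matcher(special_cases, text)` lines in the pseudo-code above. A rough standalone sketch of that pass follows; this is an illustration only, while the change itself uses the tokenizer's `_special_matcher` over a `Doc`:

```python
# Replace runs of tokens whose joined text forms a special case.
def replace_special_cases(tokens, spaces, special_cases):
    final = []
    i = 0
    while i < len(tokens):
        replaced = False
        # Try the longest candidate starting at token i first.
        for j in range(len(tokens), i, -1):
            candidate = "".join(
                t + (" " if s else "") for t, s in zip(tokens[i:j], spaces[i:j])
            ).rstrip()
            if candidate in special_cases:
                final.extend(special_cases[candidate])
                i = j
                replaced = True
                break
        if not replaced:
            final.append(tokens[i])
            i += 1
    return final

# "a." was split into "a" + "." by the suffix rule; the final pass restores it.
print(replace_special_cases(
    ["a", "/", "a", "."], [False, False, False, False], {"a.": ["a."]}
))  # ['a', '/', 'a.']
```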
</Accordion>