mirror of https://github.com/explosion/spaCy.git
Update Tokenizer.explain with special matches (#7749)
* Update Tokenizer.explain with special matches

  Update `Tokenizer.explain` and the pseudo-code in the docs to include the processing of special cases that contain affixes or whitespace.

* Handle optional settings in explain

* Add test for special matches in explain

  Add test for `Tokenizer.explain` for special cases containing affixes.
This commit is contained in:
parent
07b41c38ae
commit
0e7f94b247
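
`Tokenizer.explain` returns a list of `(pattern, substring)` tuples describing how each token was produced. Before this commit, special cases that only emerge after an affix has been split off (for example a rule for `"a."` when `"."` is also a suffix) were not reflected in the debug output, so it could disagree with the tokenizer itself. A minimal sketch of the scenario, assuming spaCy v3.x and reusing the rule and patterns from the new test below:

```python
import re

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

# Custom tokenizer where "a." is a special case and "." is also a suffix.
vocab = English().vocab
tokenizer = Tokenizer(
    vocab,
    rules={"a.": [{"ORTH": "a."}]},
    suffix_search=re.compile(r"[\.]$").search,
    infix_finditer=re.compile(r"[/]").finditer,
)

print([t.text for t in tokenizer("a/a.")])   # expected: ["a", "/", "a."]
# With this change, explain() should report the special case too, roughly:
# [("TOKEN", "a"), ("INFIX", "/"), ("SPECIAL-1", "a.")]
print(tokenizer.explain("a/a."))
```
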
@@ -1,5 +1,7 @@
 import pytest
+import re
 from spacy.util import get_lang_class
+from spacy.tokenizer import Tokenizer

 # Only include languages with no external dependencies
 # "is" seems to confuse importlib, so we're also excluding it for now
@@ -60,3 +62,18 @@ def test_tokenizer_explain(lang):
     tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
     debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
     assert tokens == debug_tokens
+
+
+def test_tokenizer_explain_special_matcher(en_vocab):
+    suffix_re = re.compile(r"[\.]$")
+    infix_re = re.compile(r"[/]")
+    rules = {"a.": [{"ORTH": "a."}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+        suffix_search=suffix_re.search,
+        infix_finditer=infix_re.finditer,
+    )
+    tokens = [t.text for t in tokenizer("a/a.")]
+    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
+    assert tokens == explain_tokens
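
As a quick way to inspect the labels `explain` uses outside the test suite, the same method is available on a stock pipeline; the output in the comment is approximate, not captured from a run:

```python
import spacy

nlp = spacy.blank("en")
for rule, substring in nlp.tokenizer.explain("Let's go!"):
    print(rule, substring)
# Roughly: SPECIAL-1 Let / SPECIAL-2 's / TOKEN go / SUFFIX !
```
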
@@ -20,11 +20,12 @@ from .attrs import intify_attrs
 from .symbols import ORTH, NORM
 from .errors import Errors, Warnings
 from . import util
-from .util import registry
+from .util import registry, get_words_and_spaces
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
 from .training import validate_examples
+from .tokens import Span


 cdef class Tokenizer:
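
The two new imports feed the final pass added to `explain` further down: `get_words_and_spaces` re-aligns the debug substrings with the original text so a `Doc` can be built from them, and `Span` wraps the matches returned by the special-cases matcher. A rough sketch of the alignment helper, assuming spaCy v3.x (the commented output is what I'd expect, not captured):

```python
from spacy.util import get_words_and_spaces

# Align a flat list of token texts with the raw text they came from.
words, spaces = get_words_and_spaces(["a", "b", "."], "a b.")
print(words)   # ["a", "b", "."]
print(spaces)  # [True, False, False] -- whether each word is followed by a space
```
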
@@ -638,8 +639,14 @@ cdef class Tokenizer:
         DOCS: https://spacy.io/api/tokenizer#explain
         """
         prefix_search = self.prefix_search
+        if prefix_search is None:
+            prefix_search = re.compile("a^").search
         suffix_search = self.suffix_search
+        if suffix_search is None:
+            suffix_search = re.compile("a^").search
         infix_finditer = self.infix_finditer
+        if infix_finditer is None:
+            infix_finditer = re.compile("a^").finditer
         token_match = self.token_match
         if token_match is None:
             token_match = re.compile("a^").match
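
The fallback pattern deserves a note: `"a^"` can never match, because the `^` anchor cannot succeed after a literal `a` has already been consumed, so a missing affix setting simply becomes a no-op. A quick check:

```python
import re

never = re.compile("a^")  # "^" after a consumed character can never hold
assert never.search("aaa") is None
assert never.match("a") is None
assert list(never.finditer("a^")) == []
print("the 'a^' pattern matches nothing, as intended")
```
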
@@ -687,7 +694,7 @@ cdef class Tokenizer:
                     tokens.append(("URL_MATCH", substring))
                     substring = ''
                 elif substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                     substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
@@ -705,7 +712,33 @@ cdef class Tokenizer:
                     tokens.append(("TOKEN", substring))
                     substring = ''
             tokens.extend(reversed(suffixes))
-        return tokens
+        # Find matches for special cases handled by special matcher
+        words, spaces = get_words_and_spaces([t[1] for t in tokens], text)
+        t_words = []
+        t_spaces = []
+        for word, space in zip(words, spaces):
+            if not word.isspace():
+                t_words.append(word)
+                t_spaces.append(space)
+        doc = Doc(self.vocab, words=t_words, spaces=t_spaces)
+        matches = self._special_matcher(doc)
+        spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches]
+        spans = util.filter_spans(spans)
+        # Replace matched tokens with their exceptions
+        i = 0
+        final_tokens = []
+        spans_by_start = {s.start: s for s in spans}
+        while i < len(tokens):
+            if i in spans_by_start:
+                span = spans_by_start[i]
+                exc = [d[ORTH] for d in special_cases[span.label_]]
+                for j, orth in enumerate(exc):
+                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                i += len(span)
+            else:
+                final_tokens.append(tokens[i])
+                i += 1
+        return final_tokens

     def score(self, examples, **kwargs):
         validate_examples(examples, "Tokenizer.score")
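
One detail of the pass above: the special-cases matcher may return overlapping candidates, and `util.filter_spans` resolves them by keeping the longest non-overlapping spans, preferring the earlier one on ties. A standalone sketch with made-up labels and text:

```python
import spacy
from spacy.tokens import Span
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("a b c")
spans = [
    Span(doc, 0, 2, label="LONG"),     # covers "a b"
    Span(doc, 1, 3, label="OVERLAP"),  # covers "b c", clashes with the first
    Span(doc, 2, 3, label="SHORT"),    # covers "c"
]
print([(s.label_, s.text) for s in filter_spans(spans)])
# expected: [("LONG", "a b"), ("SHORT", "c")]
```
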
@@ -786,6 +786,7 @@ rather than performance:

 ```python
 def tokenizer_pseudo_code(
+    text,
     special_cases,
     prefix_search,
     suffix_search,
@@ -839,12 +840,14 @@ def tokenizer_pseudo_code(
                 tokens.append(substring)
                 substring = ""
         tokens.extend(reversed(suffixes))
+    for match in matcher(special_cases, text):
+        tokens.replace(match, special_cases[match])
     return tokens
 ```

 The algorithm can be summarized as follows:

-1. Iterate over whitespace-separated substrings.
+1. Iterate over space-separated substrings.
 2. Look for a token match. If there is a match, stop processing and keep this
    token.
 3. Check whether we have an explicitly defined special case for this substring.
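
`tokens.replace` in the pseudo-code above is shorthand rather than a real list method. A runnable approximation of that final pass over plain strings, using a hypothetical `apply_special_cases` helper (not spaCy API) and assuming the special cases map a surface form to its token texts; it also ignores the whitespace bookkeeping the real implementation does via `get_words_and_spaces`:

```python
def apply_special_cases(tokens, special_cases):
    """Replace any run of adjacent tokens whose concatenation is a special case."""
    out, i = [], 0
    while i < len(tokens):
        replaced = False
        # Try the longest candidate run first, so "a" + "." becomes "a.".
        for j in range(len(tokens), i, -1):
            candidate = "".join(tokens[i:j])
            if candidate in special_cases:
                out.extend(special_cases[candidate])
                i = j
                replaced = True
                break
        if not replaced:
            out.append(tokens[i])
            i += 1
    return out


print(apply_special_cases(["a", "/", "a", "."], {"a.": ["a."]}))
# expected: ["a", "/", "a."]
```
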
@@ -858,6 +861,8 @@ The algorithm can be summarized as follows:
 8. Look for "infixes" – stuff like hyphens etc. and split the substring into
    tokens on all infixes.
 9. Once we can't consume any more of the string, handle it as a single token.
+10. Make a final pass over the text to check for special cases that include
+    spaces or that were missed due to the incremental processing of affixes.

 </Accordion>