# --- spacy/util.py ---------------------------------------------------------


def detokenize(token_rules, words):
    """Group token indices into whitespace-delimited "chunks".

    To align with treebanks, return a list of chunks, where a chunk is a
    sequence of tokens that are *not* separated by whitespace in the actual
    string. Each chunk is a tuple of consecutive token indices.

    Each rule in ``token_rules`` is a string in which the marker ``<SEP>``
    stands for a token boundary that has no whitespace in the real text:

    * ``"ca<SEP>n't"`` glues the adjacent tokens ``ca`` and ``n't`` together.
    * ``"<SEP>!"`` glues a ``!`` token onto whatever token precedes it.

    Rules are applied in the order given, each one to every non-overlapping
    occurrence (left to right), exactly as ``str.replace`` does.

    >>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])
    [(0,), (1, 2, 3)]

    Args:
        token_rules: iterable of rule strings containing ``<SEP>`` markers.
            A rule without a marker is a no-op.
        words: sequence of token strings. Tokens are assumed to be non-empty
            and to contain neither whitespace nor the marker itself;
            otherwise the returned indices will not line up with ``words``.

    Returns:
        A list of tuples of token indices, covering ``range(len(words))`` in
        order. An empty ``words`` yields an empty list.
    """
    # NOTE(review): the boundary marker must be a non-empty string. With an
    # empty marker, str.split raises ValueError and str.replace would insert
    # a space between every character of the rule. '<SEP>' is the assumed
    # spelling -- confirm against the rule tables that feed this function.
    sep = '<SEP>'

    string = ' '.join(words)
    for rule in token_rules:
        # Algorithmically this is dumb (one full scan of the string per
        # rule), but it keeps the matching logic trivial: turn the rule into
        # its space-separated surface form and swap in the marked form.
        string = string.replace(rule.replace(sep, ' '), rule)

    positions = []
    start = 0
    for chunk in string.split():
        # Every marker left inside a chunk is one glued token boundary, so
        # the chunk spans (number of markers + 1) consecutive tokens.
        end = start + chunk.count(sep) + 1
        positions.append(tuple(range(start, end)))
        start = end
    return positions


# --- tests/test_detokenize.py ----------------------------------------------
# In the separate test module these tests are preceded by:
#     from spacy.util import detokenize


def test_punct():
    tokens = 'Pierre Vinken , 61 years old .'.split()
    detoks = [(0,), (1, 2), (3,), (4,), (5, 6)]
    token_rules = ('<SEP>,', '<SEP>.')
    assert detokenize(token_rules, tokens) == detoks


def test_contractions():
    tokens = "I ca n't even".split()
    detoks = [(0,), (1, 2), (3,)]
    token_rules = ("ca<SEP>n't",)
    assert detokenize(token_rules, tokens) == detoks


def test_contractions_punct():
    tokens = "I ca n't !".split()
    detoks = [(0,), (1, 2, 3)]
    token_rules = ("ca<SEP>n't", '<SEP>!')
    assert detokenize(token_rules, tokens) == detoks