From 43d5964e13c7cc94e906b797aa8a2fb16ff93612 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 22 Oct 2014 12:54:59 +1100
Subject: [PATCH] * Add function to read detokenization rules

---
 spacy/util.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/spacy/util.py b/spacy/util.py
index ec67c5e17..e68bac748 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -70,6 +70,15 @@ def read_tokenization(lang):
     return entries
 
 
+def read_detoken_rules(lang):
+    loc = path.join(DATA_DIR, lang, 'detokenize')
+    entries = []
+    with utf8open(loc) as file_:
+        for line in file_:
+            entries.append(line.strip())
+    return entries
+
+
 def align_tokens(ref, indices):
     start = 0
     queue = list(indices)