From f324311249480843cdfe9412596a1b48dfb689d9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 17 Dec 2016 12:27:41 +0100
Subject: [PATCH] Add global language data utils

---
 spacy/language_data/__init__.py |  1 +
 spacy/language_data/util.py     | 36 +++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 spacy/language_data/util.py

diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py
index 5e56a9937..c8109a51e 100644
--- a/spacy/language_data/__init__.py
+++ b/spacy/language_data/__init__.py
@@ -1,2 +1,3 @@
 from .emoticons import *
 from .punctuation import *
+from .util import *
diff --git a/spacy/language_data/util.py b/spacy/language_data/util.py
new file mode 100644
index 000000000..dceee1908
--- /dev/null
+++ b/spacy/language_data/util.py
@@ -0,0 +1,36 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+PRON_LEMMA = "-PRON-"
+
+
+def update_exc(exc, additions):
+    overlap = set(exc.keys()).intersection(set(additions))
+    assert not overlap, overlap
+    exc.update(additions)
+
+
+def strings_to_exc(orths):
+    return {orth: [{ORTH: orth}] for orth in orths}
+
+
+def expand_exc(excs, search, replace):
+    updates = {}
+
+    for token_string, tokens in excs.items():
+        if search in token_string:
+            new_key = token_string.replace(search, replace)
+            new_value = [_fix_token(t, search, replace) for t in tokens]
+
+            updates[new_key] = new_value
+
+    return updates
+
+
+def _fix_token(token, search, replace):
+    fixed = dict(token)
+    fixed[ORTH] = fixed[ORTH].replace(search, replace)
+    return fixed