From f324311249480843cdfe9412596a1b48dfb689d9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 17 Dec 2016 12:27:41 +0100
Subject: [PATCH] Add global language data utils

---
 spacy/language_data/__init__.py |  1 +
 spacy/language_data/util.py     | 36 +++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 spacy/language_data/util.py

diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py
index 5e56a9937..c8109a51e 100644
--- a/spacy/language_data/__init__.py
+++ b/spacy/language_data/__init__.py
@@ -1,2 +1,3 @@
 from .emoticons import *
 from .punctuation import *
+from .util import *
diff --git a/spacy/language_data/util.py b/spacy/language_data/util.py
new file mode 100644
index 000000000..dceee1908
--- /dev/null
+++ b/spacy/language_data/util.py
@@ -0,0 +1,36 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+PRON_LEMMA = "-PRON-"
+
+
+def update_exc(exc, additions):
+    overlap = set(exc.keys()).intersection(set(additions))
+    assert not overlap, overlap
+    exc.update(additions)
+
+
+def strings_to_exc(orths):
+    return {orth: [{ORTH: orth}] for orth in orths}
+
+
+def expand_exc(excs, search, replace):
+    updates = {}
+
+    for token_string, tokens in excs.items():
+        if search in token_string:
+            new_key = token_string.replace(search, replace)
+            new_value = [_fix_token(t, search, replace) for t in tokens]
+
+            updates[new_key] = new_value
+
+    return updates
+
+
+def _fix_token(token, search, replace):
+    fixed = dict(token)
+    fixed[ORTH] = fixed[ORTH].replace(search, replace)
+    return fixed