"""Utilities for locating and loading per-language data files.

Language data lives in DATA_DIR/<lang>/ as plain-text rule files
(prefix, suffix, tokenization) and optional JSON statistics files.
"""
import os
from os import path
import codecs
import json
import re

# Language data is shipped one directory above the package, in ./data
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')


def utf8open(loc, mode='r'):
    """Open the file at *loc* as UTF-8 text (read mode by default)."""
    return codecs.open(loc, mode, 'utf8')


def read_lang_data(name):
    """Load tokenization rules and statistics for language *name*.

    Returns (tokenization, prefix, suffix, words, probs, clusters,
    case_stats, tag_stats).  Any missing JSON resource comes back as
    an empty dict.
    """
    data_dir = path.join(DATA_DIR, name)
    # read_tokenization joins DATA_DIR itself, so it must receive the
    # language name — passing the already-joined data_dir (as the
    # original did) produced DATA_DIR/DATA_DIR/name/tokenization.
    tokenization = read_tokenization(name)
    prefix = read_prefix(data_dir)
    suffix = read_suffix(data_dir)
    words = load_resource(data_dir, 'words')
    probs = load_resource(data_dir, 'probs')
    clusters = load_resource(data_dir, 'clusters')
    case_stats = load_resource(data_dir, 'case_stats')
    tag_stats = load_resource(data_dir, 'tag_stats')
    return tokenization, prefix, suffix, words, probs, clusters, case_stats, tag_stats


def load_resource(data_dir, name):
    """Return the JSON resource *name* from *data_dir*, or {} if absent."""
    loc = path.join(data_dir, name + '.json')
    if not path.exists(loc):
        return {}
    # json.load requires a file object, not a path string — the
    # original json.load(loc) raised TypeError for every existing file.
    with utf8open(loc) as file_:
        return json.load(file_)


def read_prefix(data_dir):
    """Build a regex alternation matching any listed prefix at string start."""
    with utf8open(path.join(data_dir, 'prefix')) as file_:
        entries = file_.read().split('\n')
    # Skip blank entries: a trailing newline otherwise yields a bare
    # '^' alternative that matches every string.
    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece])
    return expression


def read_suffix(data_dir):
    """Build a regex alternation matching any listed suffix at string end."""
    with utf8open(path.join(data_dir, 'suffix')) as file_:
        entries = file_.read().split('\n')
    # Same blank-entry guard as read_prefix (a bare '$' matches everything).
    expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece])
    return expression


def read_tokenization(lang):
    """Read special-case tokenization rules for language *lang*.

    Each non-comment, non-blank line is "<chunk> <piece> <piece>...".
    Returns a list of (chunk, pieces) tuples; lower-case alphabetic
    chunks also get a title-cased variant so sentence-initial forms
    are covered.  Raises AssertionError on duplicate chunks.
    """
    loc = path.join(DATA_DIR, lang, 'tokenization')
    entries = []
    seen = set()
    with utf8open(loc) as file_:
        for line in file_:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                continue
            pieces = line.split()
            chunk = pieces.pop(0)
            assert chunk not in seen, chunk
            seen.add(chunk)
            # list(pieces) copies, so the title-case mutation below
            # cannot corrupt this entry.
            entries.append((chunk, list(pieces)))
            # Guard on pieces: a chunk with no expansion would make
            # pieces[0] raise IndexError in the original.
            if pieces and chunk[0].isalpha() and chunk[0].islower():
                chunk = chunk[0].title() + chunk[1:]
                pieces[0] = pieces[0][0].title() + pieces[0][1:]
                seen.add(chunk)
                entries.append((chunk, pieces))
    return entries