import os from os import path import codecs import ujson import re DATA_DIR = path.join(path.dirname(__file__), '..', 'data') def utf8open(loc, mode='r'): return codecs.open(loc, mode, 'utf8') def read_lang_data(name): data_dir = path.join(DATA_DIR, name) tokenization = read_tokenization(data_dir) prefix = read_prefix(data_dir) suffix = read_suffix(data_dir) infix = read_infix(data_dir) lex_loc = path.join(data_dir, 'lexemes.json') if path.exists(lex_loc): with open(lex_loc) as file_: lexemes = ujson.load(file_) else: lexemes = {} return tokenization, prefix, suffix, infix, lexemes def read_prefix(data_dir): with utf8open(path.join(data_dir, 'prefix')) as file_: entries = file_.read().split('\n') expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) return expression def read_suffix(data_dir): with utf8open(path.join(data_dir, 'suffix')) as file_: entries = file_.read().split('\n') expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()]) return expression def read_infix(data_dir): with utf8open(path.join(data_dir, 'infix')) as file_: entries = file_.read().split('\n') expression = '|'.join([piece for piece in entries if piece.strip()]) return expression def read_tokenization(lang): loc = path.join(DATA_DIR, lang, 'tokenization') entries = [] seen = set() with utf8open(loc) as file_: for line in file_: line = line.strip() if line.startswith('#'): continue if not line: continue pieces = line.split() chunk = pieces.pop(0) assert chunk not in seen, chunk seen.add(chunk) entries.append((chunk, list(pieces))) if chunk[0].isalpha() and chunk[0].islower(): chunk = chunk[0].title() + chunk[1:] pieces[0] = pieces[0][0].title() + pieces[0][1:] seen.add(chunk) entries.append((chunk, pieces)) return entries def align_tokens(ref, indices): start = 0 queue = list(indices) for token in ref: end = start + len(token) emit = [] while queue and queue[0][1] <= end: emit.append(queue.pop(0)) yield token, emit start = end assert not queue