spaCy/spacy/lang.pyx

189 lines
6.2 KiB
Cython
Raw Normal View History

# cython: profile=True
# cython: embedsignature=True
"""Common classes and utilities across languages.
Provides the main implementation of the spaCy tokenizer. Specific languages
subclass the Language class, overriding the tokenization rules as necessary.
Special-case tokenization rules are read from data/<lang>/tokenization.
"""
from __future__ import unicode_literals
from libc.stdlib cimport calloc, free
from . import util
import json
from os import path
cdef class Language:
    """Base tokenizer shared by all languages.

    Text is first cut on single space characters; each resulting chunk is
    then broken into substrings by :py:meth:`_split`, and every substring is
    resolved to a Lexeme through ``self.lexicon``. The per-chunk result is
    memoised in ``self.cache``, which is also where special-case rules are
    stored by :py:meth:`load_special_tokenization`.
    """
    def __cinit__(self, name):
        self.name = name
        # Maps a space-delimited chunk to the list of Lexemes it tokenizes to.
        self.cache = {}
        self.lexicon = Lexicon()
        #self.load_special_tokenization(util.read_tokenization(name))

    cpdef list tokenize(self, unicode string):
        """Tokenize a string.

        The tokenization rules are defined in two places:

        * The data/<lang>/tokenization table, which handles special cases like
          contractions;
        * The :py:meth:`_split_one` method, which is used to split off
          punctuation etc.

        Only the space character (u' ') separates chunks; runs of spaces
        produce no empty tokens.

        Args:
            string (unicode): The string to be tokenized.

        Returns:
            tokens (list): The Lexemes for each token, in order.
        """
        cdef list tokens = []
        cdef size_t start = 0
        cdef size_t i = 0
        for c in string:
            if c == ' ':
                # Skip empty chunks produced by leading/consecutive spaces.
                if start < i:
                    tokens.extend(self._tokenize(string[start:i]))
                start = i + 1
            i += 1
        # Flush the trailing chunk, if the string did not end on a space.
        if start < i:
            tokens.extend(self._tokenize(string[start:]))
        return tokens

    cdef list _tokenize(self, unicode string):
        """Return the Lexemes for one space-free chunk, using the cache.

        Args:
            string (unicode): A chunk containing no space characters.

        Returns:
            lexemes (list): The cached list if the chunk was seen before (or
                was loaded as a special case), otherwise a newly built list
                that is stored in the cache before being returned.
        """
        if string in self.cache:
            return self.cache[string]
        cdef list lexemes = []
        for substring in self._split(string):
            lexemes.append(self.lexicon.lookup(substring))
        self.cache[string] = lexemes
        return lexemes

    cpdef list _split(self, unicode string):
        """Find how to split a contiguous span of non-space characters into substrings.

        This method calls _split_one repeatedly, peeling one substring off the
        front of the string each time. Most languages will want to override
        _split_one, but it may be useful to override this instead.

        Args:
            string (unicode): The string to be split, e.g. u"Mike's!"

        Returns:
            substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
        """
        substrings = []
        while string:
            split = self._split_one(string)
            if split == 0:
                # A zero split point means "no split": keep the remainder whole
                # and stop, otherwise the loop would never advance.
                substrings.append(string)
                break
            substrings.append(string[:split])
            string = string[split:]
        return substrings

    cpdef int _split_one(self, unicode word):
        """Return the index at which the first substring of ``word`` ends.

        The base implementation never splits: it returns ``len(word)`` so the
        whole word is taken as a single substring.

        Args:
            word (unicode): The remaining, not yet split, part of a chunk.

        Returns:
            split (int): Length of the leading substring to split off.
        """
        return len(word)

    def load_special_tokenization(self, token_rules):
        '''Load special-case tokenization rules.

        Loads special-case tokenization rules into the Language.cache cache,
        read from data/<lang>/tokenization . Because _tokenize consults the
        cache before splitting, rules loaded here take priority over the
        generic splitting logic. For instance, the English tokenization rules
        map "ain't" to ["are", "not"].

        Args:
            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                a string and tokens is a list of strings.
        '''
        for string, substrings in token_rules:
            lexemes = []
            for substring in substrings:
                # Lexemes live in the Lexicon, exactly as in _tokenize;
                # Language itself defines no lookup method.
                lexemes.append(self.lexicon.lookup(substring))
            self.cache[string] = lexemes
2014-08-18 17:59:59 +00:00
cdef class Lexicon:
    """Store of Lexeme objects, keyed by their string.

    Lexemes are created lazily by :py:meth:`lookup`. Per-word statistics
    (``probs``, ``clusters``, ``case_stats``, ``tag_stats``) are held in
    dictionaries until the word's Lexeme is created, at which point the
    word's entries are popped out of those dictionaries and handed to the
    Lexeme.
    """
    def __cinit__(self):
        self.flag_checkers = []
        self.string_transformers = []
        self.probs = {}
        self.clusters = {}
        self.case_stats = {}
        self.tag_stats = {}
        self.lexicon = {}

    cpdef Lexeme lookup(self, unicode string):
        """Retrieve (or create, if not found) a Lexeme for a string, and return it.

        Args:
            string (unicode): The string to be looked up. Must be unicode, not bytes.
                Must be non-empty.

        Returns:
            lexeme (Lexeme): A reference to a lexical type.
        """
        assert len(string) != 0
        if string in self.lexicon:
            return self.lexicon[string]

        # Pop, rather than read, the pending statistics for this word.
        prob = _pop_default(self.probs, string, 0.0)
        # Integer default, matching load_clusters (clusters are ints).
        cluster = _pop_default(self.clusters, string, 0)
        case_stats = _pop_default(self.case_stats, string, {})
        tag_stats = _pop_default(self.tag_stats, string, {})
        cdef Lexeme word = Lexeme(string, prob, cluster, case_stats, tag_stats,
                                  self.flag_checkers, self.string_transformers)
        self.lexicon[string] = word
        return word

    def add_flag(self, flag_checker):
        """Register a flag checker and apply it to every existing Lexeme.

        Args:
            flag_checker (callable): Called as
                ``flag_checker(string, prob, case_stats)``; a true result sets
                the flag. For already-created Lexemes an empty dict is passed
                as the third argument.

        Returns:
            flag_id (int): The index of the checker in ``self.flag_checkers``,
                which is also the flag passed to ``Lexeme.set_flag``.
        """
        cdef unicode string
        cdef Lexeme word
        # The id is the position the checker will occupy once appended.
        flag_id = len(self.flag_checkers)
        for string, word in self.lexicon.items():
            if flag_checker(string, word.prob, {}):
                word.set_flag(flag_id)
        self.flag_checkers.append(flag_checker)
        return flag_id

    def add_transform(self, string_transform):
        """Register a string transform.

        Args:
            string_transform: The transform to append to
                ``self.string_transformers``.

        Returns:
            transform_id (int): The index of the newly added transform.
        """
        self.string_transformers.append(string_transform)
        return len(self.string_transformers) - 1

    def load_probs(self, location):
        """Load unigram probabilities.

        Replaces ``self.probs`` and updates the ``prob`` of every existing
        Lexeme; words absent from the loaded table are set to 0.0.

        Args:
            location: Passed directly to ``json.load``, so it must be a
                file-like object with a ``read`` method.
                NOTE(review): the name suggests a path -- confirm what
                callers pass.
        """
        cdef Lexeme word
        cdef unicode string
        # Dict mapping words to floats
        self.probs = json.load(location)
        for string, word in self.lexicon.items():
            prob = _pop_default(self.probs, string, 0.0)
            word.prob = prob

    def load_clusters(self, location):
        """Load word clusters.

        Replaces ``self.clusters`` and updates the ``cluster`` of every
        existing Lexeme; words absent from the loaded table are set to 0.

        Args:
            location: Passed directly to ``json.load``, so it must be a
                file-like object with a ``read`` method.
                NOTE(review): the name suggests a path -- confirm what
                callers pass.
        """
        cdef Lexeme word
        cdef unicode string
        # TODO: Find out endianness
        # Dict mapping words to ??-endian ints
        self.clusters = json.load(location)
        for string, word in self.lexicon.items():
            cluster = _pop_default(self.clusters, string, 0)
            word.cluster = cluster

    def load_stats(self, location):
        """Load distributional stats.

        Not implemented yet.

        Raises:
            NotImplementedError: Always.
        """
        # Dict mapping string to dict of arbitrary stuff.
        raise NotImplementedError
def _pop_default(dict d, key, default):
    """Remove ``key`` from ``d`` and return its value, or ``default`` if absent.

    Args:
        d (dict): The dictionary to pop from; it is modified in place when
            the key is present.
        key: The key to remove.
        default: The value returned when ``key`` is not in ``d``.

    Returns:
        The popped value, or ``default``.
    """
    # dict.pop with a default does the membership test and removal in one lookup.
    return d.pop(key, default)