spaCy/spacy/lang.pyx

# cython: profile=True
# cython: embedsignature=True
"""Common classes and utilities across languages.

Provides the main implementation for the spacy tokenizer. Specific languages
subclass the Language class, over-writing the tokenization rules as necessary.
Special-case tokenization rules are read from data/<lang>/tokenization .
"""
from __future__ import unicode_literals

from libc.stdlib cimport calloc, free

from . import util
import json
from os import path


cdef class Language:
    """Base tokenizer shared by all languages.

    A string is first cut on single space characters. Each non-empty chunk is
    then divided into substrings by :py:meth:`_split`, and every substring is
    mapped to a Lexeme through ``self.lexicon``. The Lexeme list computed for a
    chunk is memoised in ``self.cache``; special-case rules are stored in that
    same cache, so they take precedence over the generic splitting.
    """
    def __cinit__(self, name):
        self.name = name
        # Maps a space-delimited chunk to the list of Lexemes it tokenizes to.
        self.cache = {}
        self.lexicon = Lexicon()
        # NOTE(review): automatic loading of the special-case rules is
        # disabled; presumably callers must invoke load_special_tokenization
        # themselves until this is re-enabled.
        #self.load_special_tokenization(util.read_tokenization(name))

    cpdef list tokenize(self, unicode string):
        """Tokenize a string.

        The tokenization rules are defined in two places:

        * The special-case table loaded by :py:meth:`load_special_tokenization`,
          which handles cases like contractions;
        * The appropriate :py:meth:`_split_one` method, which is used to split
          off punctuation etc.

        Args:
            string (unicode): The string to be tokenized.

        Returns:
            tokens (list): The Lexeme objects for the string, in order.
        """
        cdef list tokens = []
        cdef size_t length = len(string)
        cdef size_t start = 0
        cdef size_t i = 0
        for c in string:
            if c == ' ':
                # start == i means an empty chunk (leading or repeated
                # spaces); those produce no tokens.
                if start < i:
                    tokens.extend(self._tokenize(string[start:i]))
                start = i + 1
            i += 1
        # Flush the final chunk, if the string did not end with a space.
        if start < length:
            tokens.extend(self._tokenize(string[start:]))
        return tokens

    cdef list _tokenize(self, unicode string):
        """Return the Lexemes for one space-free chunk, using the cache.

        The returned list is the cached object itself, so callers must not
        mutate it.
        """
        cdef list lexemes
        if string in self.cache:
            return self.cache[string]
        lexemes = [self.lexicon.lookup(substring)
                   for substring in self._split(string)]
        self.cache[string] = lexemes
        return lexemes

    cpdef list _split(self, unicode string):
        """Find how to split a contiguous span of non-space characters into substrings.

        This method calls _split_one repeatedly. Most languages will want to
        override _split_one, but it may be useful to override this instead.

        Args:
            string (unicode): The string to be split, e.g. u"Mike's!"

        Returns:
            substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
        """
        cdef list substrings = []
        while string:
            split = self._split_one(string)
            # A split point of 0 means "no further split": keep the remainder
            # whole instead of looping forever on an empty prefix.
            if split == 0:
                substrings.append(string)
                break
            substrings.append(string[:split])
            string = string[split:]
        return substrings

    cpdef int _split_one(self, unicode word):
        """Return the length of the first substring to split off ``word``.

        The base implementation never splits: it returns the full length, so
        the whole word becomes a single substring.
        """
        return len(word)

    def load_special_tokenization(self, token_rules):
        '''Load special-case tokenization rules.

        Loads special-case tokenization rules into the Language.cache cache,
        read from data/<lang>/tokenization . The special cases are loaded before
        any language data is tokenized, giving these priority.  For instance,
        the English tokenization rules map "ain't" to ["are", "not"].

        Args:
            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                a string and tokens is a list of strings.
        '''
        for string, substrings in token_rules:
            # Lexemes live in the lexicon; Language itself has no lookup().
            self.cache[string] = [self.lexicon.lookup(substring)
                                  for substring in substrings]
 

cdef class Lexicon:
    """Table of Lexeme objects keyed by their string.

    Lexemes are created lazily by :py:meth:`lookup`. Per-word statistics
    (``probs``, ``clusters``, ``case_stats``, ``tag_stats``) are held in side
    tables until the corresponding Lexeme is created; the entry is popped out
    of the side table at that point.
    """
    def __cinit__(self):
        self.flag_checkers = []
        self.string_transformers = []
        self.probs = {}
        self.clusters = {}
        self.case_stats = {}
        self.tag_stats = {}
        self.lexicon = {}

    cpdef Lexeme lookup(self, unicode string):
        """Retrieve (or create, if not found) a Lexeme for a string, and return it.

        Args:
            string (unicode):  The string to be looked up. Must be unicode, not bytes.
                Must not be empty.

        Returns:
            lexeme (Lexeme): A reference to a lexical type.
        """
        cdef Lexeme word
        assert len(string) != 0
        if string in self.lexicon:
            return self.lexicon[string]

        # The statistics for this string are removed from the side tables as
        # they are handed over to the new Lexeme.
        prob = _pop_default(self.probs, string, 0.0)
        # Clusters are ints (see load_clusters), so the fallback is 0, not 0.0.
        cluster = _pop_default(self.clusters, string, 0)
        case_stats = _pop_default(self.case_stats, string, {})
        tag_stats = _pop_default(self.tag_stats, string, {})

        word = Lexeme(string, prob, cluster, case_stats, tag_stats,
                      self.flag_checkers, self.string_transformers)
        self.lexicon[string] = word
        return word

    def add_flag(self, flag_checker):
        """Register a flag checker and apply it to every existing Lexeme.

        Args:
            flag_checker (callable): Called as
                ``flag_checker(string, prob, {})``; a truthy result sets the
                new flag on that Lexeme.

        Returns:
            flag_id (int): Index of the checker in ``self.flag_checkers``.
        """
        cdef unicode string
        cdef Lexeme word
        flag_id = len(self.flag_checkers)
        for string, word in self.lexicon.items():
            if flag_checker(string, word.prob, {}):
                word.set_flag(flag_id)
        # Appended after the loop so flag_id stays the index of this checker.
        self.flag_checkers.append(flag_checker)
        return flag_id

    def add_transform(self, string_transform):
        """Register a string transformer and return its index.

        Unlike add_flag, existing Lexemes are not revisited here.
        """
        self.string_transformers.append(string_transform)
        return len(self.string_transformers) - 1

    def load_probs(self, location):
        """Load unigram probabilities.

        Args:
            location: Passed straight to ``json.load``, so it must be a
                readable file-like object containing a JSON mapping of words
                to floats.
        """
        cdef Lexeme word
        cdef unicode string

        # Dict mapping words to floats
        self.probs = json.load(location)

        # Refresh the Lexemes that already exist; words absent from the new
        # table are reset to 0.0.
        for string, word in self.lexicon.items():
            prob = _pop_default(self.probs, string, 0.0)
            word.prob = prob

    def load_clusters(self, location):
        """Load word clusters.

        Args:
            location: Passed straight to ``json.load``, so it must be a
                readable file-like object containing a JSON mapping of words
                to ints.
        """
        cdef Lexeme word
        cdef unicode string

        # TODO: Find out endianness
        # Dict mapping words to ??-endian ints
        self.clusters = json.load(location)

        # Refresh the Lexemes that already exist; words absent from the new
        # table are reset to 0.
        for string, word in self.lexicon.items():
            cluster = _pop_default(self.clusters, string, 0)
            word.cluster = cluster

    def load_stats(self, location):
        """Load distributional stats.

        Not implemented yet: always raises NotImplementedError.
        """
        # Dict mapping string to dict of arbitrary stuff.
        raise NotImplementedError


def _pop_default(dict d, key, default):
    """Remove ``key`` from ``d`` and return its value, or ``default`` if absent.

    Args:
        d (dict): The dictionary to pop from; it is modified in place when
            ``key`` is present.
        key: The key to remove.
        default: The value returned when ``key`` is not in ``d``.

    Returns:
        The popped value, or ``default``.
    """
    # dict.pop with a default does the membership test and removal in a
    # single lookup.
    return d.pop(key, default)
* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++ 2014-07-07 05:36:43 +00:00			`# cython: profile=True`
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00			`# cython: embedsignature=True`
			`"""Common classes and utilities across languages.`

			`Provides the main implementation for the spacy tokenizer. Specific languages`
			`subclass the Language class, over-writing the tokenization rules as necessary.`
			`Special-case tokenization rules are read from data/<lang>/tokenization .`
			`"""`
* Initial commit. Tests passing for punctuation handling. Need contractions, file transport, tokenize function, etc. 2014-07-05 18:51:42 +00:00			`from __future__ import unicode_literals`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 02:21:06 +00:00
* Switch to dynamically allocating array, based on the document length 2014-07-07 06:05:29 +00:00			`from libc.stdlib cimport calloc, free`

* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 02:21:06 +00:00			`from . import util`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`import json`
* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++ 2014-07-07 05:36:43 +00:00			`from os import path`
* Progress to getting WordTree working. Tests pass, but so far it's slower. 2014-08-16 17:59:38 +00:00
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 14:58:48 +00:00
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 10:47:21 +00:00			`cdef class Language:`
			`def __cinit__(self, name):`
			`self.name = name`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`self.cache = {}`
			`self.lexicon = Lexicon()`
* Basic punct tests updated and passing 2014-08-27 17:38:57 +00:00			`#self.load_special_tokenization(util.read_tokenization(name))`
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00
* Moving to Word objects in place of the Lexeme struct. 2014-08-22 15:28:23 +00:00			`cpdef list tokenize(self, unicode string):`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`"""Tokenize a string.`

			`The tokenization rules are defined in two places:`
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`* The data/<lang>/tokenization table, which handles special cases like contractions;`
			* The appropriate :py:meth:`find_split` function, which is used to split
			`off punctuation etc.`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 10:47:21 +00:00
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00			`Args:`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`string (unicode): The string to be tokenized.`
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00
			`Returns:`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.`
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00			`"""`
* Moving to Word objects in place of the Lexeme struct. 2014-08-22 15:28:23 +00:00			`cdef list tokens = []`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 18:23:54 +00:00			`cdef size_t length = len(string)`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`cdef size_t start = 0`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`cdef size_t i = 0`
			`for c in string:`
* More refactoring 2014-08-25 14:42:22 +00:00			`if c == ' ':`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`if start < i:`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`tokens.extend(self._tokenize(string[start:i]))`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`start = i + 1`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`i += 1`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`if start < i:`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`tokens.extend(self._tokenize(string[start:]))`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 10:47:21 +00:00			`return tokens`

* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`cdef list _tokenize(self, unicode string):`
			`if string in self.cache:`
			`return self.cache[string]`
			`cdef list lexemes = []`
			`substrings = self._split(string)`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`for i, substring in enumerate(substrings):`
* Basic punct tests updated and passing 2014-08-27 17:38:57 +00:00			`lexemes.append(self.lexicon.lookup(substring))`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`self.cache[string] = lexemes`
			`return lexemes`

			`cpdef list _split(self, unicode string):`
			`"""Find how to split a contiguous span of non-space characters into substrings.`
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00
			`This method calls find_split repeatedly. Most languages will want to`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`override _split_one, but it may be useful to override this instead.`
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00
			`Args:`
			`chunk (unicode): The string to be split, e.g. u"Mike's!"`

			`Returns:`
			`substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].`
			`"""`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`substrings = []`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`while string:`
			`split = self._split_one(string)`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`if split == 0:`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`substrings.append(string)`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`break`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`substrings.append(string[:split])`
			`string = string[split:]`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`return substrings`

* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`cpdef int _split_one(self, unicode word):`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`return len(word)`
* Refactoring tokenizer 2014-08-16 01:22:03 +00:00
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`def load_special_tokenization(self, token_rules):`
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00			`'''Load special-case tokenization rules.`

* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`Loads special-case tokenization rules into the Language.cache cache,`
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00			`read from data/<lang>/tokenization . The special cases are loaded before`
			`any language data is tokenized, giving these priority. For instance,`
			`the English tokenization rules map "ain't" to ["are", "not"].`

			`Args:`
			`token_rules (list): A list of (chunk, tokens) pairs, where chunk is`
			`a string and tokens is a list of strings.`
			`'''`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`for string, substrings in token_rules:`
			`lexemes = []`
			`for i, substring in enumerate(substrings):`
			`lexemes.append(self.lookup(substring))`
			`self.cache[string] = lexemes`

* Working version, adding improvements 2014-08-18 17:59:59 +00:00
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`cdef class Lexicon:`
			`def __cinit__(self):`
			`self.flag_checkers = []`
* Basic punct tests updated and passing 2014-08-27 17:38:57 +00:00			`self.string_transformers = []`
			`self.probs = {}`
			`self.clusters = {}`
			`self.case_stats = {}`
			`self.tag_stats = {}`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`self.lexicon = {}`
* Broken version being refactored for docs 2014-08-20 11:39:39 +00:00
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`cpdef Lexeme lookup(self, unicode string):`
			`"""Retrieve (or create, if not found) a Lexeme for a string, and return it.`

			`Args:`
			`string (unicode): The string to be looked up. Must be unicode, not bytes.`

			`Returns:`
			`lexeme (Lexeme): A reference to a lexical type.`
			`"""`
			`assert len(string) != 0`
			`if string in self.lexicon:`
			`return self.lexicon[string]`

			`prob = _pop_default(self.probs, string, 0.0)`
			`cluster = _pop_default(self.clusters, string, 0.0)`
			`case_stats = _pop_default(self.case_stats, string, {})`
			`tag_stats = _pop_default(self.tag_stats, string, {})`

			`cdef Lexeme word = Lexeme(string, prob, cluster, case_stats, tag_stats,`
			`self.flag_checkers, self.string_transformers)`
			`self.lexicon[string] = word`
			`return word`

			`def add_flag(self, flag_checker):`
			`cdef unicode string`
			`cdef Lexeme word`
			`flag_id = len(self.flag_checkers)`
			`for string, word in self.lexicon.items():`
			`if flag_checker(string, word.prob, {}):`
			`word.set_flag(flag_id)`
			`self.flag_checkers.append(flag_checker)`
			`return flag_id`

			`def add_transform(self, string_transform):`
			`self.string_transformers.append(string_transform)`
			`return len(self.string_transformers) - 1`

			`def load_probs(self, location):`
			`"""Load unigram probabilities.`
			`"""`
* Basic punct tests updated and passing 2014-08-27 17:38:57 +00:00			`# Dict mapping words to floats`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`self.probs = json.load(location)`

			`cdef Lexeme word`
			`cdef unicode string`

			`for string, word in self.lexicon.items():`
			`prob = _pop_default(self.probs, string, 0.0)`
			`word.prob = prob`

			`def load_clusters(self, location):`
* Basic punct tests updated and passing 2014-08-27 17:38:57 +00:00			`# TODO: Find out endianness`
			`# Dict mapping words to ??-endian ints`
			`self.clusters = json.load(location)`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00
			`cdef Lexeme word`
* Reforming data model for lexemes 2014-08-19 00:40:37 +00:00			`cdef unicode string`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00
			`for string, word in self.lexicon.items():`
* Basic punct tests updated and passing 2014-08-27 17:38:57 +00:00			`cluster = _pop_default(self.clusters, string, 0)`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`word.cluster = cluster`

			`def load_stats(self, location):`
			`"""Load distributional stats.`
			`"""`
* Basic punct tests updated and passing 2014-08-27 17:38:57 +00:00			`# Dict mapping string to dict of arbitrary stuff.`
* Refactoring with Lexeme as a class now compiles. Basic design seems to work 2014-08-27 15:15:39 +00:00			`raise NotImplementedError`


			`def _pop_default(dict d, key, default):`
			`return d.pop(key) if key in d else default`