spaCy/spacy/spacy.pyx

# cython: profile=True
from __future__ import unicode_literals

from libc.stdlib cimport calloc, free
from libcpp.pair cimport pair
from cython.operator cimport dereference as deref

from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport BLANK_WORD

from spacy.string_tools cimport substr

from . import util
from os import path


def get_normalized(unicode lex, size_t length):
    """Return the normalised form of ``lex``.

    A string that is entirely alphabetic and entirely lower-case is its own
    normal form and is returned unchanged.  Anything else (mixed case,
    digits, punctuation, ...) is replaced by its word shape as computed by
    ``get_word_shape``.

    ``length`` is only forwarded to ``get_word_shape``.
    """
    already_normal = lex.isalpha() and lex.islower()
    if not already_normal:
        return get_word_shape(lex, length)
    return lex


def get_word_shape(unicode lex, length):
    """Return a coarse "shape" string describing the characters of ``lex``.

    Each character is mapped to a symbol: ``X`` for an upper-case letter,
    ``x`` for any other alphabetic character, ``d`` for a digit, and the
    character itself otherwise.  At most three consecutive occurrences of
    the same symbol are kept, so long runs collapse (``"Hello"`` becomes
    ``"Xxxx"``).

    ``length`` is accepted but not used.  An empty ``lex`` produces an empty
    shape and trips the assertion below.
    """
    pieces = []
    previous = ""
    symbol = ""
    run = 0
    for ch in lex:
        if ch.isalpha():
            symbol = "X" if ch.isupper() else "x"
        elif ch.isdigit():
            symbol = "d"
        else:
            symbol = ch
        if symbol != previous:
            # A new symbol starts a fresh run.
            run = 0
            previous = symbol
        else:
            run += 1
        # run counts repeats after the first, so 0..2 keeps three symbols.
        if run < 3:
            pieces.append(symbol)
    shape = "".join(pieces)
    assert shape
    return shape


def set_orth_flags(lex, length):
    """Return the orthographic flag bits for ``lex``.

    Placeholder: neither argument is inspected and the result is always 0
    (no flags set).
    """
    no_flags = 0
    return no_flags


cdef class Language:
    """Tokenizer and vocabulary store for one language.

    State set up in ``__cinit__``:

    * ``name``   -- the language name passed to the constructor.
    * ``bacov``  -- dict mapping a string hash back to the string it came
      from (the reverse index used by ``unhash``).
    * ``vocab``  -- hash map from the hash of a word to the address of its
      heap-allocated ``Lexeme``.
    * ``chunks`` -- hash map from the hash of a whitespace-delimited chunk
      to the address of a NULL-terminated array of ``Lexeme*``.

    Both hash maps register 0 as their empty key.
    NOTE(review): this presumably means a string whose ``hash()`` is 0
    cannot be stored -- confirm against the dense_hash_map documentation.

    Lexemes, orthography records and chunk arrays are allocated with
    ``calloc``; nothing in this class frees them.
    """
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
        self.chunks = dense_hash_map[StringHash, size_t]()
        self.vocab = dense_hash_map[StringHash, size_t]()
        self.chunks.set_empty_key(0)
        self.vocab.set_empty_key(0)
        # Pre-populate the chunk cache and the per-word statistics from the
        # data files that ``util`` locates for this language name.
        self.load_tokenization(util.read_tokenization(name))
        self.load_dist_info(util.read_dist_info(name))

    cdef Tokens tokenize(self, unicode string):
        """Split ``string`` on whitespace and return the resulting Tokens.

        Whitespace is whatever ``_is_whitespace`` accepts.  Every non-empty
        run between whitespace characters is resolved through
        ``lookup_chunk`` and all lexemes of that chunk are appended to the
        result.
        """
        cdef Lexeme** chunk
        cdef Tokens tokens = Tokens(self)
        cdef size_t start = 0
        cdef size_t i = 0
        for c in string:
            if _is_whitespace(c):
                # ``start < i`` skips empty chunks between adjacent
                # whitespace characters.
                if start < i:
                    chunk = self.lookup_chunk(string[start:i])
                    _extend(tokens, chunk)
                start = i + 1
            i += 1
        # Flush the trailing chunk when the string does not end in whitespace.
        if start < i:
            chunk = self.lookup_chunk(string[start:])
            _extend(tokens, chunk)
        return tokens

    cdef Lexeme* lookup(self, unicode string) except NULL:
        """Return the Lexeme for ``string``, creating it on first sight.

        The empty string maps to the shared ``BLANK_WORD`` lexeme.
        """
        cdef Lexeme* word
        if len(string) == 0:
            return &BLANK_WORD
        # A zero entry means this word has not been stored yet.
        word = <Lexeme*>self.vocab[hash(string)]
        if word == NULL:
            word = self.new_lexeme(string)
        return word

    cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
        """Return the cached lexeme array for ``string``, building it if absent.

        On a cache miss the chunk is split with ``find_substrings`` and the
        result is stored by ``new_chunk``.
        """
        cdef StringHash h = hash(string)
        cdef Lexeme** chunk = <Lexeme**>self.chunks[h]
        if chunk == NULL:
            chunk = self.new_chunk(string, self.find_substrings(string))
        return chunk

    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
        """Build, cache and return the lexeme array for ``string``.

        The array holds one ``Lexeme*`` per entry of ``substrings`` followed
        by a NULL terminator, and is stored in ``self.chunks`` under
        ``hash(string)``.

        Raises MemoryError if the array cannot be allocated.
        """
        cdef size_t n = len(substrings)
        cdef size_t i
        cdef Lexeme** chunk = <Lexeme**>calloc(n + 1, sizeof(Lexeme*))
        if chunk == NULL:
            raise MemoryError()
        for i in range(n):
            chunk[i] = self.lookup(substrings[i])
        # Index by the known length rather than by the loop variable, which
        # is never assigned when ``substrings`` is empty.
        chunk[n] = NULL
        self.chunks[hash(string)] = <size_t>chunk
        return chunk

    cdef Lexeme* new_lexeme(self, unicode string) except NULL:
        """Allocate a Lexeme for ``string`` and register it in the vocab.

        Also records ``string`` in the reverse index, attaches a freshly
        built orthography record and a zero-initialised Distribution.

        Raises MemoryError if an allocation fails.
        """
        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
        if word == NULL:
            raise MemoryError()
        word.lex = hash(string)
        self.bacov[word.lex] = string
        word.orth = self.new_orth(string)

        word.dist = <Distribution*>calloc(1, sizeof(Distribution))
        if word.dist == NULL:
            raise MemoryError()
        self.vocab[word.lex] = <size_t>word
        return word

    cdef Orthography* new_orth(self, unicode lex) except NULL:
        """Allocate and fill the orthography record for ``lex``.

        Stores the first character, the length, the flag bits and the hashes
        of the last-three-characters string, the normalised form and the
        word shape.  Each of those three strings is also added to the
        reverse index so its hash can be resolved later.

        ``lex`` must be non-empty (``lex[0]`` is read); ``lookup`` filters
        out the empty string before this is reached.

        Raises MemoryError if the record cannot be allocated.
        """
        cdef unicode last3
        cdef unicode norm
        cdef unicode shape
        cdef int length
        cdef Orthography* orth

        length = len(lex)
        orth = <Orthography*>calloc(1, sizeof(Orthography))
        if orth == NULL:
            raise MemoryError()
        orth.first = lex[0]

        orth.length = length
        orth.flags = set_orth_flags(lex, orth.length)
        # NOTE(review): for words shorter than three characters the start
        # offset is negative; presumably ``substr`` clamps it -- confirm.
        last3 = substr(lex, length - 3, length, length)
        orth.last3 = hash(last3)
        norm = get_normalized(lex, length)
        orth.norm = hash(norm)
        shape = get_word_shape(lex, length)
        orth.shape = hash(shape)

        self.bacov[orth.last3] = last3
        self.bacov[orth.norm] = norm
        self.bacov[orth.shape] = shape
        return orth

    cdef unicode unhash(self, StringHash hash_value):
        '''Fetch a string from the reverse index, given its hash value.'''
        return self.bacov[hash_value]

    cpdef list find_substrings(self, unicode word):
        """Split ``word`` into the substrings that become separate lexemes.

        ``find_split`` is asked repeatedly for the length of the next piece.
        A result of 0 means "no further split": the remainder is taken as
        the final piece.
        """
        substrings = []
        while word:
            split = self.find_split(word)
            if split == 0:
                substrings.append(word)
                break
            substrings.append(word[:split])
            word = word[split:]
        return substrings

    cdef int find_split(self, unicode word):
        """Return the length of the first piece to split off ``word``.

        This implementation never splits: it returns the full length.
        """
        return len(word)

    def load_tokenization(self, token_rules=None):
        """Pre-populate the chunk cache from ``token_rules``.

        ``token_rules`` is an iterable of ``(chunk, tokens)`` pairs, where
        ``tokens`` is the list of substrings the chunk is split into.
        ``None`` (the default) loads nothing.
        """
        if token_rules is None:
            return
        for chunk, tokens in token_rules:
            self.new_chunk(chunk, tokens)

    def load_dist_info(self, dist_info):
        """Copy distributional statistics from ``dist_info`` onto lexemes.

        ``dist_info`` maps each word string to a record with ``prob``,
        ``cluster``, ``flags`` and ``tagdict`` entries; the lexeme for each
        word is created if it does not exist yet.
        """
        cdef unicode string
        cdef dict word_dist
        cdef Lexeme* w
        for string, word_dist in dist_info.items():
            w = self.lookup(string)
            # NOTE(review): ``word_dist`` is declared as a dict but is read
            # with attribute access here -- confirm the record type that
            # ``util.read_dist_info`` actually produces.
            # NOTE(review): ``new_lexeme`` allocates a separate ``dist``
            # record, yet these fields are written on the Lexeme itself --
            # confirm against the Lexeme struct declaration.
            w.prob = word_dist.prob
            w.cluster = word_dist.cluster
            # NOTE(review): no ``lexeme`` module name is imported in the
            # visible part of this file -- confirm where it comes from.
            for flag in word_dist.flags:
                w.flags |= lexeme.DIST_FLAGS[flag]
            for tag in word_dist.tagdict:
                w.tagdict |= lexeme.TAGS[tag]


cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
    # True for the three separators the tokenizer splits on: space, newline
    # and tab.  Every other character is treated as part of a chunk.
    return c == ' ' or c == '\n' or c == '\t'


cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
    # Append every lexeme of the NULL-terminated array ``chunk`` to
    # ``tokens``: the address goes onto the underlying vector and the token
    # count is bumped once per entry.  ``chunk`` itself must not be NULL.
    cdef size_t pos = 0
    cdef Lexeme* entry = chunk[0]
    while entry != NULL:
        tokens.vctr[0].push_back(<Lexeme_addr>entry)
        tokens.length += 1
        pos += 1
        entry = chunk[pos]
* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++ 2014-07-07 05:36:43 +00:00			`# cython: profile=True`
* Initial commit. Tests passing for punctuation handling. Need contractions, file transport, tokenize function, etc. 2014-07-05 18:51:42 +00:00			`from __future__ import unicode_literals`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 02:21:06 +00:00
* Switch to dynamically allocating array, based on the document length 2014-07-07 06:05:29 +00:00			`from libc.stdlib cimport calloc, free`
* Fix memory leak in tokenizer, caused by having a fixed vocab. 2014-07-31 17:19:38 +00:00			`from libcpp.pair cimport pair`
			`from cython.operator cimport dereference as deref`
* Switch to dynamically allocating array, based on the document length 2014-07-07 06:05:29 +00:00
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 14:58:48 +00:00			`from spacy.lexeme cimport Lexeme`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 02:21:06 +00:00			`from spacy.lexeme cimport BLANK_WORD`

* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 14:58:48 +00:00			`from spacy.string_tools cimport substr`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 02:21:06 +00:00
			`from . import util`
* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++ 2014-07-07 05:36:43 +00:00			`from os import path`
* Progress to getting WordTree working. Tests pass, but so far it's slower. 2014-08-16 17:59:38 +00:00

* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 14:58:48 +00:00			`def get_normalized(unicode lex, size_t length):`
* Upd from spacy 2014-07-23 16:35:18 +00:00			`if lex.isalpha() and lex.islower():`
			`return lex`
			`else:`
			`return get_word_shape(lex, length)`
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 14:58:48 +00:00

* Restore unicode, work on improving string storage. 2014-08-16 12:35:34 +00:00			`def get_word_shape(unicode lex, length):`
* 710k words per second for counts 2014-07-07 17:12:19 +00:00			`shape = ""`
			`last = ""`
			`shape_char = ""`
			`seq = 0`
			`for c in lex:`
			`if c.isalpha():`
			`if c.isupper():`
			`shape_char = "X"`
			`else:`
			`shape_char = "x"`
			`elif c.isdigit():`
			`shape_char = "d"`
			`else:`
			`shape_char = c`
			`if shape_char == last:`
			`seq += 1`
			`else:`
			`seq = 0`
			`last = shape_char`
			`if seq < 3:`
			`shape += shape_char`
			`assert shape`
			`return shape`

* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 14:58:48 +00:00
			`def set_orth_flags(lex, length):`
			`return 0`


* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 10:47:21 +00:00			`cdef class Language:`
			`def __cinit__(self, name):`
			`self.name = name`
			`self.bacov = {}`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`self.chunks = dense_hash_map[StringHash, size_t]()`
			`self.vocab = dense_hash_map[StringHash, size_t]()`
			`self.chunks.set_empty_key(0)`
			`self.vocab.set_empty_key(0)`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 10:47:21 +00:00			`self.load_tokenization(util.read_tokenization(name))`
* Reforming data model for lexemes 2014-08-19 00:40:37 +00:00			`self.load_dist_info(util.read_dist_info(name))`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 10:47:21 +00:00
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`cdef Tokens tokenize(self, unicode string):`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`cdef Lexeme** chunk`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 10:47:21 +00:00			`cdef Tokens tokens = Tokens(self)`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 18:23:54 +00:00			`cdef size_t length = len(string)`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`cdef size_t start = 0`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`cdef size_t i = 0`
			`for c in string:`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`if _is_whitespace(c):`
			`if start < i:`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`chunk = self.lookup_chunk(string[start:i])`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`_extend(tokens, chunk)`
			`start = i + 1`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`i += 1`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`if start < i:`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`chunk = self.lookup_chunk(string[start:])`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`_extend(tokens, chunk)`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 10:47:21 +00:00			`return tokens`

* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`cdef Lexeme* lookup(self, unicode string) except NULL:`
			`if len(string) == 0:`
			`return &BLANK_WORD`
			`cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]`
			`if word == NULL:`
			`word = self.new_lexeme(string)`
			`return word`
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 14:58:48 +00:00
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`cdef Lexeme** lookup_chunk(self, unicode string) except NULL:`
			`cdef StringHash h = hash(string)`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`cdef Lexeme** chunk = <Lexeme**>self.chunks[h]`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`cdef int split`
			`if chunk == NULL:`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`chunk = self.new_chunk(string, self.find_substrings(string))`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`return chunk`

			`cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:`
			`cdef Lexeme** chunk = <Lexeme**>calloc(len(substrings) + 1, sizeof(Lexeme*))`
			`for i, substring in enumerate(substrings):`
			`chunk[i] = self.lookup(substring)`
			`chunk[i + 1] = NULL`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`self.chunks[hash(string)] = <size_t>chunk`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`return chunk`

			`cdef Lexeme* new_lexeme(self, unicode string) except NULL:`
* Refactoring tokenizer 2014-08-16 01:22:03 +00:00			`cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))`
* Remove dependence on murmurhash 2014-08-16 15:37:09 +00:00			`word.lex = hash(string)`
* Restore string saving to spacy 2014-08-16 14:09:24 +00:00			`self.bacov[word.lex] = string`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`word.orth = self.new_orth(string)`
* Reforming data model for lexemes 2014-08-19 00:40:37 +00:00
			`word.dist = <Distribution*>calloc(1, sizeof(Distribution))`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`self.vocab[word.lex] = <size_t>word`
* WordTree in use. Need to reform the way chunks are handled. Should be properly one Lexeme per word, with split points being the things that are cached. 2014-08-16 18:10:22 +00:00			`return word`
* Refactoring tokenizer 2014-08-16 01:22:03 +00:00
* WordTree in use. Need to reform the way chunks are handled. Should be properly one Lexeme per word, with split points being the things that are cached. 2014-08-16 18:10:22 +00:00			`cdef Orthography* new_orth(self, unicode lex) except NULL:`
* Refactoring tokenizer 2014-08-16 01:22:03 +00:00			`cdef unicode last3`
			`cdef unicode norm`
			`cdef unicode shape`
			`cdef int length`

			`length = len(lex)`
			`orth = <Orthography*>calloc(1, sizeof(Orthography))`
* Restore unicode, work on improving string storage. 2014-08-16 12:35:34 +00:00			`orth.first = lex[0]`
* Refactoring tokenizer 2014-08-16 01:22:03 +00:00
			`orth.length = length`
			`orth.flags = set_orth_flags(lex, orth.length)`
* WordTree in use. Need to reform the way chunks are handled. Should be properly one Lexeme per word, with split points being the things that are cached. 2014-08-16 18:10:22 +00:00			`orth.norm = hash(lex)`
* Restore string saving to spacy 2014-08-16 14:09:24 +00:00			`last3 = substr(lex, length - 3, length, length)`
* Remove dependence on murmurhash 2014-08-16 15:37:09 +00:00			`orth.last3 = hash(last3)`
* Restore string saving to spacy 2014-08-16 14:09:24 +00:00			`norm = get_normalized(lex, length)`
* Remove dependence on murmurhash 2014-08-16 15:37:09 +00:00			`orth.norm = hash(norm)`
* Restore string saving to spacy 2014-08-16 14:09:24 +00:00			`shape = get_word_shape(lex, length)`
* Remove dependence on murmurhash 2014-08-16 15:37:09 +00:00			`orth.shape = hash(shape)`
* Restore string saving to spacy 2014-08-16 14:09:24 +00:00
			`self.bacov[orth.last3] = last3`
			`self.bacov[orth.norm] = norm`
			`self.bacov[orth.shape] = shape`
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 14:58:48 +00:00			`return orth`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 10:47:21 +00:00
* Refactoring tokenizer 2014-08-16 01:22:03 +00:00			`cdef unicode unhash(self, StringHash hash_value):`
			`'''Fetch a string from the reverse index, given its hash value.'''`
* Restore unicode, work on improving string storage. 2014-08-16 12:35:34 +00:00			`return self.bacov[hash_value]`
* Refactoring tokenizer 2014-08-16 01:22:03 +00:00
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`cpdef list find_substrings(self, unicode word):`
			`substrings = []`
			`while word:`
			`split = self.find_split(word)`
			`if split == 0:`
			`substrings.append(word)`
			`break`
			`substrings.append(word[:split])`
			`word = word[split:]`
			`return substrings`

			`cdef int find_split(self, unicode word):`
			`return len(word)`
* Refactoring tokenizer 2014-08-16 01:22:03 +00:00
			`def load_tokenization(self, token_rules=None):`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 17:14:00 +00:00			`for chunk, tokens in token_rules:`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`self.new_chunk(chunk, tokens)`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00
* Reforming data model for lexemes 2014-08-19 00:40:37 +00:00			`def load_dist_info(self, dist_info):`
			`cdef unicode string`
			`cdef dict word_dist`
* Refactoring tokenizer 2014-08-16 01:22:03 +00:00			`cdef Lexeme* w`
* Reforming data model for lexemes 2014-08-19 00:40:37 +00:00			`for string, word_dist in dist_info.items():`
			`w = self.lookup(string)`
			`w.prob = word_dist.prob`
			`w.cluster = word_dist.cluster`
			`for flag in word_dist.flags:`
			`w.flags |= lexeme.DIST_FLAGS[flag]`
			`for tag in word_dist.tagdict:`
			`w.tagdict |= lexeme.TAGS[tag]`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00

* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 18:23:54 +00:00			`cdef inline bint _is_whitespace(Py_UNICODE c) nogil:`
			`if c == ' ':`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`return True`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 18:23:54 +00:00			`elif c == '\n':`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`return True`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 18:23:54 +00:00			`elif c == '\t':`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`return True`
			`else:`
			`return False`


* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`cdef size_t i = 0`
			`while chunk[i] != NULL:`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 18:48:48 +00:00			`tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])`
			`tokens.length += 1`
* Working version, adding improvements 2014-08-18 17:59:59 +00:00			`i += 1`