From 6266cac593e0aed197f46bda3c47e18017dd80b9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 17 Sep 2014 20:02:26 +0200 Subject: [PATCH] * Switch to using a Python ref counted gateway to malloc/free, to prevent memory leaks --- spacy/_hashing.pxd | 2 ++ spacy/_hashing.pyx | 13 ++++++------- spacy/en.pyx | 1 - spacy/lang.pxd | 4 ++++ spacy/lang.pyx | 17 ++++++++++------- spacy/lexeme.pxd | 4 ++-- spacy/lexeme.pyx | 13 ++++--------- spacy/ptb3.pyx | 1 - spacy/tokens.pyx | 2 -- spacy/word.pyx | 3 --- 10 files changed, 28 insertions(+), 32 deletions(-) diff --git a/spacy/_hashing.pxd b/spacy/_hashing.pxd index 8295010bd..fd39b99c5 100644 --- a/spacy/_hashing.pxd +++ b/spacy/_hashing.pxd @@ -1,4 +1,5 @@ from libc.stdint cimport uint64_t +from .memory cimport Address ctypedef uint64_t key_t ctypedef void* val_t @@ -13,6 +14,7 @@ cdef class PointerHash: cdef size_t size cdef size_t filled cdef Cell* cells + cdef Address _mem cdef val_t get(self, key_t key) nogil cdef void set(self, key_t key, val_t value) except * diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx index 010d5c933..535e935de 100644 --- a/spacy/_hashing.pyx +++ b/spacy/_hashing.pyx @@ -1,5 +1,5 @@ # cython: profile=True -from libc.stdlib cimport calloc, free +from .memory cimport Address cimport cython @@ -10,10 +10,8 @@ cdef class PointerHash: # Size must be power of two assert self.size != 0 assert self.size & (self.size - 1) == 0 - self.cells = calloc(self.size, sizeof(Cell)) - - def __dealloc__(self): - free(self.cells) + self._mem = Address(self.size, sizeof(Cell)) + self.cells = self._mem.addr def __getitem__(self, key_t key): assert key != 0 @@ -47,7 +45,8 @@ cdef class PointerHash: cdef size_t old_size = self.size self.size = new_size - self.cells = calloc(new_size, sizeof(Cell)) + cdef Address new_mem = Address(new_size, sizeof(Cell)) + self.cells = new_mem.addr self.filled = 0 cdef size_t i @@ -56,7 +55,7 @@ cdef class PointerHash: if old_cells[i].key != 0: assert old_cells[i].value != NULL, i self.set(old_cells[i].key, old_cells[i].value) - free(old_cells) + self._mem = new_mem @cython.cdivision diff --git a/spacy/en.pyx b/spacy/en.pyx index 29bff9bc1..57dc4bbcf 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -37,7 +37,6 @@ provides a fully Penn Treebank 3-compliant tokenizer. from __future__ import unicode_literals -from libc.stdlib cimport malloc, calloc, free from libc.stdint cimport uint64_t cimport lang diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 57891fcd7..ca4a59a75 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -5,6 +5,8 @@ from spacy.tokens cimport Tokens from spacy.lexeme cimport LexemeC from spacy._hashing cimport PointerHash +from spacy.memory cimport Pool + from libcpp.utility cimport pair from libcpp.vector cimport vector from libc.stdint cimport uint64_t, int64_t @@ -22,6 +24,7 @@ cdef struct String: cdef class Lexicon: + cdef Pool _mem cpdef readonly size_t size cpdef Lexeme lookup(self, unicode string) @@ -34,6 +37,7 @@ cdef class Lexicon: cdef class Language: + cdef Pool _mem cdef unicode name cdef PointerHash cache cdef PointerHash specials diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 3328b53bd..9eda53c65 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -8,8 +8,6 @@ Special-case tokenization rules are read from data//tokenization . """ from __future__ import unicode_literals -from libc.stdlib cimport calloc, free - import json import random from os import path @@ -18,8 +16,11 @@ from .util import read_lang_data from spacy.tokens import Tokens from spacy.lexeme cimport LexemeC, lexeme_init from murmurhash.mrmr cimport hash64 + from cpython.ref cimport Py_INCREF +from .memory cimport Pool + from cython.operator cimport preincrement as preinc from cython.operator cimport dereference as deref @@ -127,6 +128,7 @@ cdef class Language: def __cinit__(self, name, user_string_features, user_flag_features): self.name = name + self._mem = Pool() self.cache = PointerHash(2 ** 25) self.specials = PointerHash(2 ** 16) lang_data = util.read_lang_data(name) @@ -203,7 +205,7 @@ cdef class Language: if lexemes != NULL: i = 0 while lexemes[i] != NULL: - tokens.push_back(lexemes[i]) + tokens_v.push_back(lexemes[i]) i += 1 return 0 @@ -292,7 +294,7 @@ cdef class Language: cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1: assert tokens.size() > n - lexemes = calloc((tokens.size() - n) + 1, sizeof(LexemeC**)) + lexemes = self._mem.alloc((tokens.size() - n) + 1, sizeof(LexemeC**)) cdef size_t i, j for i, j in enumerate(range(n, tokens.size())): lexemes[i] = tokens.at(j) @@ -404,7 +406,7 @@ cdef class Language: cdef uint64_t hashed cdef String string for uni_string, substrings in token_rules: - lexemes = calloc(len(substrings) + 1, sizeof(LexemeC*)) + lexemes = self._mem.alloc(len(substrings) + 1, sizeof(LexemeC*)) for i, substring in enumerate(substrings): string_from_unicode(&string, substring) lexemes[i] = self.lexicon.get(&string) @@ -417,6 +419,7 @@ cdef class Language: cdef class Lexicon: def __cinit__(self, words, probs, clusters, case_stats, tag_stats, string_features, flag_features): + self._mem = Pool() self._flag_features = flag_features self._string_features = string_features self._dict = PointerHash(2 ** 20) @@ -433,7 +436,7 @@ cdef class Lexicon: for i, flag_feature in enumerate(self._flag_features): if flag_feature(uni_string, prob, cluster, cases, tags): flags.add(i) - lexeme = lexeme_init(uni_string, prob, cluster, views, flags) + lexeme = lexeme_init(self._mem, uni_string, prob, cluster, views, flags) string_from_unicode(&string, uni_string) self._dict.set(string.key, lexeme) self.size += 1 @@ -452,7 +455,7 @@ cdef class Lexicon: if flag_feature(uni_string, 0.0, {}, {}): flags.add(i) - lexeme = lexeme_init(uni_string, 0, 0, views, flags) + lexeme = lexeme_init(self._mem, uni_string, 0, 0, views, flags) self._dict.set(string.key, lexeme) self.size += 1 return lexeme diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 43599943e..b941e0feb 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,4 +1,5 @@ from .typedefs cimport hash_t, utf8_t, flag_t, id_t +from .memory cimport Pool cdef struct LexemeC: @@ -12,9 +13,8 @@ cdef struct LexemeC: flag_t flags -cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster, +cdef LexemeC* lexeme_init(Pool mem, unicode string, double prob, size_t cluster, list views, set flags) -cdef int lexeme_free(LexemeC* lexeme) except -1 cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id) cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 223fc5e06..b9a40c02c 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -1,14 +1,14 @@ -from libc.stdlib cimport calloc, free from cpython.ref cimport Py_INCREF +from .memory cimport Pool -cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster, +cdef LexemeC* lexeme_init(Pool mem, unicode string, double prob, size_t cluster, list views, set flags): - cdef LexemeC* lexeme = calloc(1, sizeof(LexemeC)) + cdef LexemeC* lexeme = mem.alloc(1, sizeof(LexemeC)) lexeme.cluster = cluster lexeme.prob = prob lexeme.string = intern_and_encode(string, &lexeme.length) - lexeme.views = calloc(len(views), sizeof(char*)) + lexeme.views = mem.alloc(len(views), sizeof(char*)) cdef size_t length = 0 for i, string in enumerate(views): lexeme.views[i] = intern_and_encode(string, &length) @@ -18,11 +18,6 @@ cdef LexemeC* lexeme_init(unicode string, double prob, size_t cluster, return lexeme -cdef int lexeme_free(LexemeC* lexeme) except -1: - free(lexeme.views) - free(lexeme) - - cdef char* intern_and_encode(unicode string, size_t* length): cdef bytes byte_string = string.encode('utf8') cdef bytes utf8_string = intern(byte_string) diff --git a/spacy/ptb3.pyx b/spacy/ptb3.pyx index 0d3828920..cd60e062a 100644 --- a/spacy/ptb3.pyx +++ b/spacy/ptb3.pyx @@ -5,7 +5,6 @@ boldly assume no collisions. from __future__ import unicode_literals -from libc.stdlib cimport malloc, calloc, free from libc.stdint cimport uint64_t diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 52d1e7c32..64ddf5c29 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,6 +1,4 @@ # cython: profile=True -from libc.stdlib cimport calloc, free, realloc - from spacy.word cimport Lexeme from spacy.lexeme cimport lexeme_check_flag from spacy.lexeme cimport lexeme_string_view diff --git a/spacy/word.pyx b/spacy/word.pyx index c14295667..745832775 100644 --- a/spacy/word.pyx +++ b/spacy/word.pyx @@ -1,9 +1,6 @@ # cython: profile=True # cython: embedsignature=True -from libc.stdlib cimport calloc, free, realloc - -from spacy.lexeme cimport lexeme_free, lexeme_init from spacy.lexeme cimport lexeme_check_flag, lexeme_string_view