From 4cb88c940bdca5b69a10f4e4de532ccf6c9955a2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 31 Jul 2014 18:19:38 +0100
Subject: [PATCH] * Fix memory leak in tokenizer, caused by having a fixed
 vocab.

---
 spacy/spacy.pyx | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index cadc4407c..d36eaafe2 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -2,6 +2,8 @@ from __future__ import unicode_literals
 
 from libc.stdlib cimport calloc, free
+from libcpp.pair cimport pair
+from cython.operator cimport dereference as deref
 
 from murmurhash cimport mrmr
 
 from spacy.lexeme cimport Lexeme
@@ -68,6 +70,9 @@ cdef class Language:
         self.vocab[0].set_empty_key(0)
         self.distri[0].set_empty_key(0)
         self.ortho[0].set_empty_key(0)
+        self.vocab[0].set_deleted_key(1)
+        self.distri[0].set_deleted_key(1)
+        self.ortho[0].set_deleted_key(1)
         self.load_tokenization(util.read_tokenization(name))
 
     def load_tokenization(self, token_rules=None):
@@ -136,9 +141,16 @@ cdef class Language:
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
         cdef size_t i
         cdef sparse_hash_map[StringHash, size_t].iterator it
+        cdef pair[StringHash, size_t] last_elem
         if self.happax[0].size() >= MAX_HAPPAX:
             # Delete last element.
-            self.happax[0].erase(self.happax[0].end())
+            last_elem = deref(self.happax[0].end())
+            free(self.ortho[0][last_elem.first])
+            free(self.distri[0][last_elem.first])
+            free(last_elem.second)
+            self.happax[0].erase(last_elem.first)
+            self.ortho[0].erase(last_elem.first)
+            self.distri[0].erase(last_elem.first)
         word = self.init_lexeme(string, hashed, split, length)
         self.happax[0][hashed] = word
         self.bacov[hashed] = string
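For reference, a minimal C++ sketch of the sparsehash contract this fix relies on: Google's sparsehash maps require a reserved deleted key (set via set_deleted_key()) before erase() may be called, and erasing a key removes only the stored pointer, so any heap-allocated value must be freed first or it leaks, which is the bug the commit closes. The Entry struct, the key values, and the main() harness below are illustrative assumptions, not spaCy code.

// Illustrative sketch only -- not part of the commit. Demonstrates the
// erase contract of google::sparse_hash_map: reserve a deleted key before
// any erase(), and free pointer values before erasing their keys.
#include <cstdio>
#include <cstdlib>
#include <sparsehash/sparse_hash_map>

typedef unsigned long long StringHash;

struct Entry {          // hypothetical stand-in for a Lexeme record
    int frequency;
};

typedef google::sparse_hash_map<StringHash, Entry*> Table;

int main() {
    Table table;
    table.set_deleted_key(1);   // hash 1 is now reserved; erase() becomes legal

    StringHash key = 12345;     // a made-up hash; must never collide with 1
    Entry* e = static_cast<Entry*>(calloc(1, sizeof(Entry)));
    table[key] = e;

    // Evict without leaking: free the value first, then erase the key.
    // erase() removes only the stored pointer, not the memory behind it.
    Table::iterator it = table.find(key);
    if (it != table.end()) {
        free(it->second);
        table.erase(it);
    }

    printf("entries remaining: %lu\n", (unsigned long)table.size());
    return 0;
}

In the patch itself, hash 0 is reserved as the empty key and hash 1 as the deleted key for the vocab, distri, and ortho tables, so neither value can ever be used as a real token hash.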