mirror of https://github.com/explosion/spaCy.git

Fix memory leak in tokenizer, caused by having a fixed vocab.

This commit is contained in:
parent 5b81ee716f
commit 4cb88c940b
@@ -2,6 +2,8 @@
 from __future__ import unicode_literals
 
 from libc.stdlib cimport calloc, free
+from libcpp.pair cimport pair
+from cython.operator cimport dereference as deref
 
 from murmurhash cimport mrmr
 from spacy.lexeme cimport Lexeme
@@ -68,6 +70,9 @@ cdef class Language:
         self.vocab[0].set_empty_key(0)
         self.distri[0].set_empty_key(0)
         self.ortho[0].set_empty_key(0)
+        self.vocab[0].set_deleted_key(1)
+        self.distri[0].set_deleted_key(1)
+        self.ortho[0].set_deleted_key(1)
         self.load_tokenization(util.read_tokenization(name))
 
     def load_tokenization(self, token_rules=None):
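Context for the set_deleted_key lines: Google's sparsehash containers reserve one key value as an "empty" sentinel and another as a "deleted" sentinel, and erase() is only permitted once a deleted key has been registered, so without these calls entries could never be removed from the tables. A minimal C++ sketch of that contract, assuming the sparsehash headers are installed; the map type, the sentinel values 0 and 1, and the variable names here are illustrative and mirror only what the diff shows:

// sparsehash_erase_demo.cc -- illustrative only, not part of the commit
#include <sparsehash/dense_hash_map>
#include <cstdint>
#include <cstdio>

int main() {
    google::dense_hash_map<uint64_t, uint64_t> table;
    table.set_empty_key(0);    // must be set before any insert
    table.set_deleted_key(1);  // must be set before erase() may be called
    table[42] = 7;
    table.erase(42);           // legal now; the slot is marked with the deleted sentinel
    std::printf("size after erase: %zu\n", table.size());
    return 0;
}

The sentinel values must be distinct and must never occur as real keys, which is why small constants like 0 and 1 are used here.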
@@ -136,9 +141,16 @@ cdef class Language:
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
         cdef size_t i
         cdef sparse_hash_map[StringHash, size_t].iterator it
+        cdef pair[StringHash, size_t] last_elem
         if self.happax[0].size() >= MAX_HAPPAX:
             # Delete last element.
-            self.happax[0].erase(self.happax[0].end())
+            last_elem = deref(self.happax[0].end())
+            free(<Orthography*>self.ortho[0][last_elem.first])
+            free(<Distribution*>self.distri[0][last_elem.first])
+            free(<Lexeme*>last_elem.second)
+            self.happax[0].erase(last_elem.first)
+            self.ortho[0].erase(last_elem.first)
+            self.distri[0].erase(last_elem.first)
         word = self.init_lexeme(string, hashed, split, length)
         self.happax[0][hashed] = <Lexeme_addr>word
         self.bacov[hashed] = string
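The eviction branch above is the actual leak fix: once the happax table reaches MAX_HAPPAX, one entry is picked, the heap-allocated records it points to are freed, and only then is the key erased from the tables. A minimal sketch of that pattern under simplified assumptions (std::unordered_map stands in for sparse_hash_map, a single placeholder Entry struct stands in for spaCy's per-word structs, and the victim is chosen arbitrarily with begin()):

// capped_cache_demo.cc -- illustrative pattern only, not spaCy's real types
#include <cstdint>
#include <cstdlib>
#include <unordered_map>

struct Entry { uint64_t hash; };           // stand-in for the calloc'd per-word structs

static const std::size_t MAX_ENTRIES = 4;  // stand-in for MAX_HAPPAX

void add(std::unordered_map<uint64_t, Entry*>& cache, uint64_t hashed) {
    if (cache.size() >= MAX_ENTRIES) {
        // Free the owned allocation *before* erasing the key; erasing first
        // would drop the only pointer to it and leak the memory.
        auto victim = cache.begin();
        std::free(victim->second);
        cache.erase(victim);
    }
    Entry* word = static_cast<Entry*>(std::calloc(1, sizeof(Entry)));
    word->hash = hashed;
    cache[hashed] = word;
}

int main() {
    std::unordered_map<uint64_t, Entry*> cache;
    for (uint64_t h = 1; h <= 10; ++h)
        add(cache, h);
    // Tear down whatever is left so the demo itself does not leak.
    for (auto& kv : cache)
        std::free(kv.second);
    return 0;
}

The ordering is the key design point: the map stores raw addresses, so erasing a key without first freeing the structs it referenced is exactly the leak the commit message describes.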