From 2389bd1b103653d8c8d7cb7d185fbb8e4529daec Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 12 Sep 2014 00:18:31 +0200
Subject: [PATCH] * Improve cache mechanism by including a random element
 depending on the size of the cache.

---
 spacy/lang.pyx   | 22 +++++++++++++++-------
 spacy/tokens.pyx |  7 +++++--
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 9ddc67cd7..c4e1b319c 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -11,6 +11,7 @@ from __future__ import unicode_literals
 from libc.stdlib cimport calloc, free
 
 import json
+import random
 from os import path
 
 from .util import read_lang_data
@@ -85,7 +86,7 @@ cdef class Language:
         cdef size_t start = 0
         cdef size_t i = 0
         for c in string:
-            if c == ' ':
+            if c == ' ' or c == '\n' or c == '\t':
                 if start < i:
                     self._tokenize(tokens, string[start:i])
                 start = i + 1
@@ -96,20 +97,27 @@ cdef class Language:
 
     cdef _tokenize(self, Tokens tokens, unicode string):
         cdef LexemeC** lexemes
+        cdef bint free_chunk = False
+        cdef size_t i = 0
         if string in self.cache:
             lexemes = <LexemeC**><size_t>self.cache[string]
+            while lexemes[i] != NULL:
+                tokens.push_back(lexemes[i])
+                i += 1
         else:
             substrings = self._split(string)
             lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
             for i, substring in enumerate(substrings):
                 lexemes[i] = self.lexicon.get(substring)
+                tokens.push_back(lexemes[i])
             lexemes[i + 1] = NULL
-            self.cache[string] = <size_t>lexemes
-        cdef LexemeC* lexeme
-        i = 0
-        while lexemes[i] != NULL:
-            tokens.push_back(lexemes[i])
-            i += 1
+            # The intuition here is that if an element belongs in the cache, it
+            # has several chances to get in. And the larger the cache already
+            # is, the less we believe that the element belongs there.
+            if not self.cache or random.random() < (100000.0 / len(self.cache)):
+                self.cache[string] = <size_t>lexemes
+            else:
+                free(lexemes)
 
     cdef list _split(self, unicode string):
         """Find how to split a contiguous span of non-space characters into substrings.
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 75816bebe..209ae94d6 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -24,12 +24,15 @@ cdef class Tokens:
     >>> tokens.can_noun(1)
     True
     """
-    def __cinit__(self, size=100):
-        assert size >= 1
+    def __cinit__(self, string_length=0):
+        size = int(string_length / 3) if string_length >= 3 else 1
         self.lexemes = <LexemeC**>calloc(size, sizeof(LexemeC*))
         self.size = size
         self.length = 0
 
+    def __dealloc__(self):
+        free(self.lexemes)
+
     def __getitem__(self, i):
         if i >= self.length:
            raise IndexError
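
Note on the admission rule in _tokenize: a newly built chunk is only written to the
cache with probability 100000.0 / len(self.cache) (always, while the cache is empty),
so strings that recur get many chances to be admitted, while a large cache becomes
increasingly reluctant to grow. The plain-Python sketch below restates that policy
under stated assumptions; the class name RandomAdmissionCache, the method maybe_add
and the admission_scale parameter are illustrative only and not part of the patch or
of spaCy's API.

    import random

    class RandomAdmissionCache:
        """Dict-like cache whose admission probability shrinks as it grows."""

        def __init__(self, admission_scale=100000.0):
            self.admission_scale = admission_scale
            self._data = {}

        def __contains__(self, key):
            return key in self._data

        def __getitem__(self, key):
            return self._data[key]

        def maybe_add(self, key, value):
            # An empty cache always admits. Otherwise admit with probability
            # admission_scale / current_size, mirroring the patch: frequently
            # seen keys get many chances, a large cache rarely accepts more.
            if not self._data or random.random() < self.admission_scale / len(self._data):
                self._data[key] = value
                return True
            return False

In the patch itself the caller keeps ownership when admission fails, which is why the
else branch calls free(lexemes) rather than leaking the freshly allocated chunk.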
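
The tokens.pyx change sizes the initial lexeme buffer from the input length instead of
a fixed 100 slots, assuming roughly one token per three characters with a minimum of
one slot. A small sketch of that sizing rule, with the helper name initial_capacity
chosen purely for illustration:

    def initial_capacity(string_length):
        # Roughly one token per three characters, never fewer than one slot,
        # matching: size = int(string_length / 3) if string_length >= 3 else 1
        return string_length // 3 if string_length >= 3 else 1

    assert initial_capacity(0) == 1
    assert initial_capacity(2) == 1
    assert initial_capacity(12) == 4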