From 5b1c6516617dc443b16f3543d530eec5a3637e13 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Sep 2014 12:28:38 +0200 Subject: [PATCH] * Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon. --- fabfile.py | 15 +++++++++++++-- spacy/lang.pyx | 41 ++++++++++++++++++++++++++++------------- spacy/word.pyx | 19 ++----------------- 3 files changed, 43 insertions(+), 32 deletions(-) diff --git a/fabfile.py b/fabfile.py index 10daa1811..65b58d17d 100644 --- a/fabfile.py +++ b/fabfile.py @@ -1,14 +1,25 @@ +import json + from fabric.api import local, run, lcd, cd, env def make(): local('python setup.py build_ext --inplace') + def clean(): local('python setup.py clean --all') + def docs(): - with lcd('docs'): - local('sphinx-build -b html . ./_build') + local('sphinx-build -b html docs/ .') + def test(): local('py.test -x') + +def sbox(): + local('python sb_setup.py build_ext --inplace') + +def sbclean(): + local('python sb_setup.py clean --all') + diff --git a/spacy/lang.pyx b/spacy/lang.pyx index b2b1d1fca..2470deb7f 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -15,6 +15,7 @@ from os import path from .util import read_lang_data from spacy.tokens import Tokens +from spacy.lexeme cimport LexemeC, lexeme_init cdef class Language: @@ -76,9 +77,10 @@ cdef class Language: Returns: tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs. """ - assert string - cdef size_t length = len(string) + if length == 0: + return [] + cdef size_t start = 0 cdef size_t i = 0 cdef Tokens tokens = self.tokens_class() @@ -162,10 +164,18 @@ cdef class Lexicon: self.size = 0 cdef Lexeme word for string in words: - word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0), - case_stats.get(string, {}), tag_stats.get(string, {}), - self._string_features, self._flag_features) - self._dict[string] = word + prob = probs.get(string, 0.0) + cluster = clusters.get(string, 0.0) + cases = case_stats.get(string, {}) + tags = tag_stats.get(string, {}) + views = [string_view(string, prob, cluster, cases, tags) + for string_view in self._string_features] + flags = set() + for i, flag_feature in enumerate(self._flag_features): + if flag_feature(string, prob, cluster, cases, tags): + flags.add(i) + lexeme = lexeme_init(string, prob, cluster, views, flags) + self._dict[string] = lexeme self.size += 1 cpdef Lexeme lookup(self, unicode string): @@ -177,14 +187,19 @@ cdef class Lexicon: Returns: lexeme (Lexeme): A reference to a lexical type. """ - cdef Lexeme lexeme + cdef LexemeC* lexeme assert len(string) != 0 if string in self._dict: - lexeme = self._dict[string] - return lexeme + return Lexeme(self._dict[string]) - cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features, - self._flag_features) - self._dict[string] = word + views = [string_view(string, 0.0, 0, {}, {}) + for string_view in self._string_features] + flags = set() + for i, flag_feature in enumerate(self._flag_features): + if flag_feature(string, 0.0, {}, {}): + flags.add(i) + + lexeme = lexeme_init(string, 0, 0, views, flags) + self._dict[string] = lexeme self.size += 1 - return word + return Lexeme(lexeme) diff --git a/spacy/word.pyx b/spacy/word.pyx index 6366a820a..c14295667 100644 --- a/spacy/word.pyx +++ b/spacy/word.pyx @@ -49,23 +49,8 @@ cdef class Lexeme: while "dapple" is totally different. On the other hand, "scalable" receives the same cluster ID as "pineapple", which is not what we'd like. """ - def __cinit__(self, unicode string, double prob, int cluster, dict case_stats, - dict tag_stats, list string_features, list flag_features): - views = [] - cdef unicode view - for string_feature in string_features: - view = string_feature(string, prob, cluster, case_stats, tag_stats) - views.append(view) - - flags = set() - for i, flag_feature in enumerate(flag_features): - if flag_feature(string, prob, case_stats, tag_stats): - if (1 << i): - flags.add(i) - self._c = lexeme_init(string, prob, cluster, views, flags) - - def __dealloc__(self): - lexeme_free(self._c) + def __cinit__(self, size_t lexeme_addr): + self._c = lexeme_addr property string: def __get__(self):