From b15619e170f9940f155f9af8bbbab5f4d5ae23e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 Sep 2014 18:22:52 +0200 Subject: [PATCH] * Use PointerHash instead of locally provided _hashing module --- setup.py | 1 - spacy/en.pyx | 2 - spacy/lang.pxd | 11 +++-- spacy/lang.pyx | 108 +++++++++---------------------------------------- 4 files changed, 26 insertions(+), 96 deletions(-) diff --git a/setup.py b/setup.py index 7b2a4db0f..bab596367 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,6 @@ else: exts = [ Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes), - Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes), Extension("spacy.word", ["spacy/word.pyx"], language="c++", include_dirs=includes), Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", diff --git a/spacy/en.pyx b/spacy/en.pyx index 57dc4bbcf..a51349116 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -42,8 +42,6 @@ from libc.stdint cimport uint64_t cimport lang from spacy.lexeme cimport lexeme_check_flag from spacy.lexeme cimport lexeme_string_view -from spacy._hashing cimport PointerHash - from spacy import orth diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 652c9ff2f..3f414708d 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -3,7 +3,7 @@ from libc.stdint cimport uint64_t from spacy.word cimport Lexeme from spacy.tokens cimport Tokens from spacy.lexeme cimport LexemeC -from spacy._hashing cimport PointerHash +from trustyc.maps cimport PointerMap from cymem.cymem cimport Pool @@ -30,7 +30,7 @@ cdef class Lexicon: cpdef Lexeme lookup(self, unicode string) cdef LexemeC* get(self, String* s) except NULL - cdef PointerHash _dict + cdef PointerMap _dict cdef list _string_features cdef list _flag_features @@ -39,10 +39,13 @@ cdef class Lexicon: cdef class Language: cdef Pool _mem cdef unicode name - cdef PointerHash cache - cdef PointerHash specials + cdef PointerMap cache + cdef PointerMap specials cpdef readonly Lexicon lexicon + cdef object prefix_re + cdef object suffix_re + cpdef Tokens tokenize(self, unicode text) cpdef Lexeme lookup(self, unicode text) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 26a836d3b..e7e330b68 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -11,6 +11,7 @@ from __future__ import unicode_literals import json import random from os import path +import re from .util import read_lang_data from spacy.tokens import Tokens @@ -25,7 +26,7 @@ from cython.operator cimport preincrement as preinc from cython.operator cimport dereference as deref -from spacy._hashing cimport PointerHash +from trustyc.maps cimport PointerMap from spacy import orth from spacy import util @@ -129,10 +130,12 @@ cdef class Language: def __cinit__(self, name, user_string_features, user_flag_features): self.name = name self._mem = Pool() - self.cache = PointerHash(2 ** 25) - self.specials = PointerHash(2 ** 16) + self.cache = PointerMap(2 ** 25) + self.specials = PointerMap(2 ** 16) lang_data = util.read_lang_data(name) - rules, words, probs, clusters, case_stats, tag_stats = lang_data + rules, prefix, suffix, words, probs, clusters, case_stats, tag_stats = lang_data + self.prefix_re = re.compile(prefix) + self.suffix_re = re.compile(suffix) self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats, STRING_VIEW_FUNCS + user_string_features, FLAG_FUNCS + user_flag_features) @@ -302,93 +305,20 @@ cdef class Language: self.cache.set(key, lexemes) cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1: - cdef Py_UNICODE c0 = chars[0] - cdef Py_UNICODE c1 = chars[1] - if c0 == ",": - return 1 - elif c0 == '"': - return 1 - elif c0 == "(": - return 1 - elif c0 == "[": - return 1 - elif c0 == "{": - return 1 - elif c0 == "*": - return 1 - elif c0 == "<": - return 1 - elif c0 == "$": - return 1 - elif c0 == "£": - return 1 - elif c0 == "€": - return 1 - elif c0 == "\u201c": - return 1 - elif c0 == "'": - return 1 - elif c0 == "`": - if c1 == "`": - return 2 - else: - return 1 - else: + cdef unicode string = chars[:length] + match = self.prefix_re.search(string) + if match is None: return 0 - + else: + return match.end() - match.start() + cdef int _find_suffix(self, Py_UNICODE* chars, size_t length): - cdef Py_UNICODE c0 = chars[length - 1] - cdef Py_UNICODE c1 = chars[length - 2] if length >= 2 else 0 - cdef Py_UNICODE c2 = chars[length - 3] if length >= 3 else 0 - - if c0 == ",": - return 1 - elif c0 == '"': - return 1 - elif c0 == ')': - return 1 - elif c0 == ']': - return 1 - elif c0 == '}': - return 1 - elif c0 == '*': - return 1 - elif c0 == '!': - return 1 - elif c0 == '?': - return 1 - elif c0 == '%': - return 1 - elif c0 == '$': - return 1 - elif c0 == '>': - return 1 - elif c0 == ':': - return 1 - elif c0 == "'": - return 1 - elif c0 == u'\u201d': - return 1 - elif c0 == "s": - if c1 == "'": - return 2 - else: - return 0 - elif c0 == "S": - if c1 == "'": - return 2 - else: - return 0 - elif c0 == ".": - if c1 == ".": - if c2 == ".": - return 3 - else: - return 2 - else: - return 1 - else: + cdef unicode string = chars[:length] + match = self.suffix_re.search(string) + if match is None: return 0 + else: + return match.end() - match.start() def _load_special_tokenization(self, token_rules): '''Load special-case tokenization rules. @@ -422,7 +352,7 @@ cdef class Lexicon: self._mem = Pool() self._flag_features = flag_features self._string_features = string_features - self._dict = PointerHash(2 ** 20) + self._dict = PointerMap(2 ** 20) self.size = 0 cdef String string for uni_string in words: