From 985bc68327c8e079700e301e6e2364f5a9e7a03d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 12 Sep 2014 18:00:42 +0200
Subject: [PATCH] * Fix bug with trailing punct on contractions. Reduced
 efficiency, and slightly hacky implementation.

---
 spacy/en.pxd            |  2 ++
 spacy/en.pyx            |  2 +-
 spacy/lang.pxd          | 14 +++++-----
 spacy/lang.pyx          | 61 ++++++++++++++++-------------------------
 tests/test_tokenizer.py | 18 ++++++++++--
 5 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/spacy/en.pxd b/spacy/en.pxd
index 91d4db3af..f6dc782f0 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -40,3 +40,5 @@ cdef class EnglishTokens(Tokens):
 
 cdef class English(Language):
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+
+
diff --git a/spacy/en.pyx b/spacy/en.pyx
index 5d4e6ef51..6300fda25 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -236,7 +236,7 @@ cdef class English(Language):
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
-        self.cache.set_empty_key(0)
+        self.cache = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index c32cb0c41..a498e6e0f 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -9,6 +9,11 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
 
 
+cdef extern from "Python.h":
+    cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
+
+
 cdef extern from "sparsehash/dense_hash_map" namespace "google":
     cdef cppclass dense_hash_map[K, D]:
         K& key_type
@@ -52,10 +57,6 @@ cdef extern from "sparsehash/dense_hash_map" namespace "google":
         D& operator[](K&) nogil
 
 
-cdef struct LexList:
-    LexemeC* lex
-    LexList* tail
-
 cdef class Lexicon:
     cpdef readonly size_t size
 
@@ -70,13 +71,12 @@ cdef class Lexicon:
 
 cdef class Language:
     cdef unicode name
-    cdef dense_hash_map[uint64_t, size_t] cache
-    cdef size_t cache_size
+    cdef dict cache
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length)
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 8766eb86a..b11e78921 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -40,8 +40,7 @@ cdef class Language:
         if string_features is None:
             string_features = []
         self.name = name
-        self.cache.set_empty_key(0)
-        self.cache_size = 0
+        self.cache = {}
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -80,7 +79,6 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
-        print repr(string)
         cdef size_t length = len(string)
         cdef Tokens tokens = self.tokens_class(length)
         if length == 0:
@@ -92,7 +90,7 @@ cdef class Language:
         cdef Py_UNICODE c
         for i in range(length):
            c = characters[i]
-            if c == ' ' or c == '\n' or c == '\t':
+            if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
                     self._tokenize(tokens, &characters[start], i - start)
                 start = i + 1
@@ -101,38 +99,30 @@ cdef class Language:
             self._tokenize(tokens, &characters[start], i - start)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length):
+        cdef list lexemes
+        cdef size_t lex_addr
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
-        cdef LexList* node = <LexList*>self.cache[hashed]
-        if node is not NULL:
-            while node != NULL:
-                tokens.push_back(node.lex)
-                node = node.tail
+        if hashed in self.cache:
+            for lex_addr in self.cache[hashed]:
+                tokens.push_back(lex_addr)
             return 0
-
-        node = <LexList*>calloc(1, sizeof(LexList))
-        self.cache[hashed] = <size_t>node
+
+        lexemes = []
         cdef size_t start = 0
         cdef size_t split = 0
         while start < length:
             split = self._split_one(&characters[start], length - start)
-            node.lex = self.lexicon.get(&characters[start], split)
-            tokens.push_back(node.lex)
-            start += split
-            if start >= length:
-                break
-            hashed = hash64(&characters[start], (length - start) * sizeof(Py_UNICODE), 0)
-            node.tail = <LexList*>self.cache[hashed]
-            if node.tail == NULL:
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                self.cache[hashed] = <size_t>node.tail
-                node = node.tail
+            hashed = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
+            if hashed in self.cache:
+                lexemes.extend(self.cache[hashed])
             else:
-                node = node.tail
-                while node != NULL:
-                    tokens.push_back(node.lex)
-                    node = node.tail
-                break
+                lexeme = self.lexicon.get(&characters[start], split)
+                lexemes.append(lexeme)
+            start += split
+        for lex_addr in lexemes:
+            tokens.push_back(lex_addr)
+        #self.cache[hashed] = lexemes
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
@@ -149,17 +139,14 @@ cdef class Language:
             token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                 a string and tokens is a list of strings.
         '''
-        cdef LexList* node
+        cdef list lexemes
         cdef uint64_t hashed
         for string, substrings in token_rules:
             hashed = hash64(string, len(string) * sizeof(Py_UNICODE), 0)
-            node = <LexList*>calloc(1, sizeof(LexList))
-            self.cache[hashed] = <size_t>node
-            for substring in substrings[:-1]:
-                node.lex = self.lexicon.get(substring, len(substring))
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                node = node.tail
-            node.lex = self.lexicon.get(substrings[-1], len(substrings[-1]))
+            lexemes = []
+            for substring in substrings:
+                lexemes.append(self.lexicon.get(substring, len(substring)))
+            self.cache[hashed] = lexemes
 
 
 cdef class Lexicon:
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 50b0dae71..12ae2595f 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -26,6 +26,8 @@ def test_punct():
 
 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
+    assert lex_ids.string(4) == "."
+    assert lex_ids.string(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -37,5 +39,17 @@ def test_contraction():
     assert len(lex_ids) == 3
     assert lex_ids[1].string == EN.lexicon.lookup("not").string
     lex_ids = EN.tokenize("i said don't!")
-    assert len(lex_ids) == 4
-    assert lex_ids[3].string == EN.lexicon.lookup('!').string
+    assert len(lex_ids) == 5
+    assert lex_ids[4].string == EN.lexicon.lookup('!').string
+
+
+def test_contraction_punct():
+    tokens = EN.tokenize("(can't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize("`ain't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize('''"isn't''')
+    assert len(tokens) == 3
+    tokens = EN.tokenize("can't!")
+    assert len(tokens) == 3
+
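
Note: for reference, below is a small pure-Python sketch of the caching scheme the new _tokenize uses. It is illustrative only: special_rules, split_one, tokenize_chunk, and tokenize are toy stand-ins (plain strings instead of hashed Py_UNICODE spans, and a much-simplified _split_one), not spaCy's actual API. The point of the fix is visible in the loop: the cache is now consulted for each split-off piece (the hash is taken over `split` characters) instead of the whole remaining string, so a special-cased contraction such as "can't" still matches when trailing punctuation follows it. The trade-off, per the subject line, is that newly computed splits are no longer written back to the cache (the self.cache[hashed] = lexemes assignment is commented out in _tokenize).

# Toy model of the dict-based tokenizer cache in this patch (illustrative, not spaCy's API).
special_rules = {
    "don't": ["do", "n't"],
    "can't": ["ca", "n't"],
}

# The cache maps a chunk to its token strings; the special rules pre-populate
# it, mirroring _load_special_tokenization.
cache = dict(special_rules)

PUNCT = set("(`\"'!?.,;:)")


def split_one(chunk):
    # Stand-in for Language._split_one: peel off one leading punctuation
    # character, otherwise take the chunk minus any trailing punctuation.
    if len(chunk) > 1 and chunk[0] in PUNCT:
        return 1
    end = len(chunk)
    while end > 1 and chunk[end - 1] in PUNCT:
        end -= 1
    return end


def tokenize_chunk(chunk):
    # Mirrors the new _tokenize: whole-chunk cache hit first, otherwise split
    # greedily and check the cache for every piece that gets split off.
    if chunk in cache:
        return list(cache[chunk])
    tokens = []
    start = 0
    while start < len(chunk):
        split = split_one(chunk[start:])
        piece = chunk[start:start + split]
        if piece in cache:
            tokens.extend(cache[piece])   # e.g. "can't" -> ["ca", "n't"]
        else:
            tokens.append(piece)
        start += split
    return tokens


def tokenize(text):
    # Mirrors Language.tokenize: split on whitespace, then sub-tokenize each chunk.
    tokens = []
    for chunk in text.split():
        tokens.extend(tokenize_chunk(chunk))
    return tokens


assert tokenize("i said don't!") == ["i", "said", "do", "n't", "!"]
assert tokenize("(can't") == ["(", "ca", "n't"]
assert tokenize("can't!") == ["ca", "n't", "!"]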