mirror of https://github.com/explosion/spaCy.git
* Fix bug with trailing punctuation on contractions. The fix reduces efficiency, and the implementation is slightly hacky.
parent 7eab281194
commit 985bc68327
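The diff below swaps the C-level dense_hash_map/LexList cache for a plain Python dict and caches lexemes per candidate substring rather than only per whole whitespace-delimited chunk, which is what lets a pre-loaded contraction split still be found when trailing punctuation sits in the same chunk. The following is a rough pure-Python sketch of that idea only: the real code hashes raw Py_UNICODE spans with hash64 and stores LexemeC* addresses, and the special-case entries and split_one heuristics shown here are illustrative stand-ins, not the actual language data or English._split_one logic.

    # Rough sketch only: the real cache maps hash64() values of raw Py_UNICODE
    # spans to lists of LexemeC* addresses, and the entries below are made up.
    SPECIAL_CASES = {"can't": ["can", "not"], "isn't": ["is", "not"]}
    cache = dict(SPECIAL_CASES)             # substring -> cached token split

    def split_one(chunk):
        # Illustrative stand-in for English._split_one: peel one piece off the front.
        if chunk in SPECIAL_CASES or len(chunk) == 1:
            return len(chunk)
        if not chunk[0].isalnum():          # leading punctuation, e.g. "(" or "`"
            return 1
        for i, c in enumerate(chunk):
            if not c.isalnum() and c != "'":
                return i                    # stop before trailing punctuation
        return len(chunk)

    def tokenize_chunk(chunk):
        if chunk in cache:                  # whole-chunk hit, e.g. a bare "can't"
            return list(cache[chunk])
        tokens = []
        start = 0
        while start < len(chunk):
            split = split_one(chunk[start:])
            piece = chunk[start:start + split]
            # Each piece is looked up in the same cache, so the special-cased
            # contraction is still found inside "(can't" or "can't!".
            tokens.extend(cache.get(piece, [piece]))
            start += split
        return tokens

    assert tokenize_chunk("can't!") == ["can", "not", "!"]
    assert tokenize_chunk("(can't") == ["(", "can", "not"]

The cost of the simpler scheme is Python dict and list traffic on every uncached substring, which is presumably the reduced efficiency the commit message notes.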
@@ -40,3 +40,5 @@ cdef class EnglishTokens(Tokens):
 
 cdef class English(Language):
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+
 
@@ -236,7 +236,7 @@ cdef class English(Language):
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
-        self.cache.set_empty_key(0)
+        self.cache = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -9,6 +9,11 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
 
 
+cdef extern from "Python.h":
+    cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
+
+
 cdef extern from "sparsehash/dense_hash_map" namespace "google":
     cdef cppclass dense_hash_map[K, D]:
         K& key_type
@@ -52,10 +57,6 @@ cdef extern from "sparsehash/dense_hash_map" namespace "google":
         D& operator[](K&) nogil
 
 
-cdef struct LexList:
-    LexemeC* lex
-    LexList* tail
-
 cdef class Lexicon:
     cpdef readonly size_t size
 
@@ -70,13 +71,12 @@ cdef class Lexicon:
 
 cdef class Language:
     cdef unicode name
-    cdef dense_hash_map[uint64_t, size_t] cache
-    cdef size_t cache_size
+    cdef dict cache
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length)
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
@@ -40,8 +40,7 @@ cdef class Language:
         if string_features is None:
             string_features = []
         self.name = name
-        self.cache.set_empty_key(0)
-        self.cache_size = 0
+        self.cache = {}
        lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -80,7 +79,6 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
-        print repr(string)
         cdef size_t length = len(string)
         cdef Tokens tokens = self.tokens_class(length)
         if length == 0:
@@ -92,7 +90,7 @@ cdef class Language:
         cdef Py_UNICODE c
         for i in range(length):
             c = characters[i]
-            if c == ' ' or c == '\n' or c == '\t':
+            if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
                     self._tokenize(tokens, &characters[start], i - start)
                 start = i + 1
@@ -101,38 +99,30 @@ cdef class Language:
             self._tokenize(tokens, &characters[start], i - start)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length):
+        cdef list lexemes
+        cdef size_t lex_addr
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
-        cdef LexList* node = <LexList*>self.cache[hashed]
-        if node is not NULL:
-            while node != NULL:
-                tokens.push_back(node.lex)
-                node = node.tail
+        if hashed in self.cache:
+            for lex_addr in self.cache[hashed]:
+                tokens.push_back(<LexemeC*>lex_addr)
             return 0
-
-        node = <LexList*>calloc(1, sizeof(LexList))
-        self.cache[hashed] = <size_t>node
+
+        lexemes = []
         cdef size_t start = 0
         cdef size_t split = 0
         while start < length:
             split = self._split_one(&characters[start], length - start)
-            node.lex = <LexemeC*>self.lexicon.get(&characters[start], split)
-            tokens.push_back(node.lex)
-            start += split
-            if start >= length:
-                break
-            hashed = hash64(&characters[start], (length - start) * sizeof(Py_UNICODE), 0)
-            node.tail = <LexList*>self.cache[hashed]
-            if node.tail == NULL:
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                self.cache[hashed] = <size_t>node.tail
-                node = node.tail
+            hashed = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
+            if hashed in self.cache:
+                lexemes.extend(self.cache[hashed])
             else:
-                node = node.tail
-                while node != NULL:
-                    tokens.push_back(node.lex)
-                    node = node.tail
-                break
+                lexeme = <LexemeC*>self.lexicon.get(&characters[start], split)
+                lexemes.append(<size_t>lexeme)
+            start += split
+        for lex_addr in lexemes:
+            tokens.push_back(<LexemeC*>lex_addr)
+        #self.cache[hashed] = lexemes
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
@@ -149,17 +139,14 @@ cdef class Language:
             token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                 a string and tokens is a list of strings.
         '''
-        cdef LexList* node
+        cdef list lexemes
         cdef uint64_t hashed
         for string, substrings in token_rules:
             hashed = hash64(<Py_UNICODE*>string, len(string) * sizeof(Py_UNICODE), 0)
-            node = <LexList*>calloc(1, sizeof(LexList))
-            self.cache[hashed] = <size_t>node
-            for substring in substrings[:-1]:
-                node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substring, len(substring))
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                node = node.tail
-            node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substrings[-1], len(substrings[-1]))
+            lexemes = []
+            for substring in substrings:
+                lexemes.append(self.lexicon.get(<Py_UNICODE*>substring, len(substring)))
+            self.cache[hashed] = lexemes
 
 
 cdef class Lexicon:
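As an aside, the token_rules argument described in the load_tokenization docstring above is just a list of (chunk, tokens) pairs. The entries below are hypothetical examples of that shape, echoing the contractions exercised in the tests; the actual rules come from the per-language data returned by read_lang_data and are not part of this diff.

    # Hypothetical rules in the (chunk, tokens) shape that load_tokenization expects.
    token_rules = [
        ("can't", ["can", "not"]),
        ("isn't", ["is", "not"]),
        ("don't", ["do", "not"]),
    ]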
@@ -26,6 +26,8 @@ def test_punct():
 
 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
+    assert lex_ids.string(4) == "."
+    assert lex_ids.string(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -37,5 +39,17 @@ def test_contraction():
     assert len(lex_ids) == 3
     assert lex_ids[1].string == EN.lexicon.lookup("not").string
     lex_ids = EN.tokenize("i said don't!")
-    assert len(lex_ids) == 4
-    assert lex_ids[3].string == EN.lexicon.lookup('!').string
+    assert len(lex_ids) == 5
+    assert lex_ids[4].string == EN.lexicon.lookup('!').string
+
+
+def test_contraction_punct():
+    tokens = EN.tokenize("(can't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize("`ain't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize('''"isn't''')
+    assert len(tokens) == 3
+    tokens = EN.tokenize("can't!")
+    assert len(tokens) == 3
+