* Reading in tokenization rules correctly. Passing tests.

Matthew Honnibal 2014-07-07 00:02:55 +02:00
parent 9bef797afe
commit 4e79446dc2
6 changed files with 1120 additions and 340 deletions

File diff suppressed because it is too large


@@ -10,6 +10,7 @@ from libc.stdint cimport uint64_t
from spacy.lexeme cimport Lexeme
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B
from . import util


STRINGS = {}
@@ -20,6 +21,23 @@ LEXEMES.set_empty_key(0)
cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)


def load_tokenization(token_rules):
    cdef Lexeme* word
    cdef StringHash hashed
    for chunk, lex, tokens in token_rules:
        hashed = hash_string(chunk, len(chunk))
        assert LEXEMES[hashed] == NULL
        word = _add(hashed, lex, len(lex), len(lex))
        for i, lex in enumerate(tokens):
            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
            length = len(token_string)
            hashed = hash_string(token_string, length)
            word.tail = _add(hashed, lex, 0, len(lex))
            word = word.tail


load_tokenization(util.read_tokenization('en'))


cpdef Lexeme_addr lookup(unicode string) except 0:
    '''.. function:: enumerate(sequence[, start=0])
    Fetch a Lexeme representing a word string. If the word has not been seen,
@@ -156,8 +174,8 @@ cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
cdef size_t _find_split(unicode word, size_t length):
    cdef int i = 0
    # Contractions
    if word == "'s":
        return 2
    if word.endswith("'s"):
        return length - 2
    # Leading punctuation
    if is_punct(word, 0, length):
        return 1
@@ -166,11 +184,8 @@ cdef size_t _find_split(unicode word, size_t length):
        i = length - 1
        while i >= 2 and is_punct(word, i-1, length):
            i -= 1
    else:
        # Doesn't start or end with the punct
        while i < length and not is_punct(word, i, length):
            i += 1
    return i


cdef bint is_punct(unicode word, size_t i, size_t length):
    return not word[i].isalnum()
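
For orientation, the load_tokenization hunk above keys each special-case chunk to a head lexeme holding its normalised form, then hangs the remaining tokens off it as a .tail chain stored under synthetic 'chunk:@:index:@:token' keys. Below is a plain-Python sketch of that chain-building; the SketchLexeme class, the SPECIALS dict, and the "don't" rule are illustrative stand-ins, not part of the commit.

# Plain-Python sketch (illustrative only) of the chain built by load_tokenization,
# with a dict standing in for the LEXEMES dense_hash_map and a tiny class for Lexeme*.
class SketchLexeme:
    def __init__(self, form):
        self.form = form
        self.tail = None

SPECIALS = {}

def load_tokenization_sketch(token_rules):
    for chunk, lex, tokens in token_rules:
        word = SketchLexeme(lex)
        SPECIALS[hash(chunk)] = word                 # head lexeme, keyed by the raw chunk
        for i, tok in enumerate(tokens):
            key = '%s:@:%d:@:%s' % (chunk, i, tok)   # same synthetic key scheme as the diff
            word.tail = SketchLexeme(tok)            # remaining tokens hang off the head
            SPECIALS[hash(key)] = word.tail
            word = word.tail

# A hypothetical rule ("don't", "do", ["n't"]) yields the two-lexeme chain do -> n't.
load_tokenization_sketch([("don't", "do", ["n't"])])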


@@ -1,4 +1,4 @@
/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */
/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */
#define PY_SSIZE_T_CLEAN
#ifndef CYTHON_USE_PYLONG_INTERNALS


@@ -1,4 +1,4 @@
/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */
/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */
#define PY_SSIZE_T_CLEAN
#ifndef CYTHON_USE_PYLONG_INTERNALS


@@ -1,3 +1,10 @@
import os
from os import path
import codecs


DATA_DIR = path.join(path.dirname(__file__), '..', 'data')


def utf8open(loc, mode='r'):
    return codecs.open(loc, mode, 'utf8')
@@ -12,23 +19,23 @@ def load_case_stats(data_dir):
    return case_stats


def load_clitics(data_dir):
    clitics_loc = path.join(data_dir, 'clitics.txt')
def read_tokenization(lang):
    loc = path.join(DATA_DIR, lang, 'tokenization')
    entries = []
    seen = set()
    with utf8open(clitics_loc) as clitics_file:
        for line in clitics_file:
    with utf8open(loc) as file_:
        for line in file_:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                continue
            clitics = line.split()
            word = clitics.pop(0)
            norm_form = clitics.pop(0)
            assert word not in seen, word
            seen.add(word)
            entries.append((word, norm_form, clitics))
            pieces = line.split()
            chunk = pieces.pop(0)
            lex = pieces.pop(0)
            assert chunk not in seen, chunk
            seen.add(chunk)
            entries.append((chunk, lex, pieces))
    return entries
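
As written, read_tokenization expects a whitespace-separated rules file: the first field is the raw chunk, the second its normalised head form, and any remaining fields are the extra tokens; lines starting with # and blank lines are skipped, and a duplicate chunk trips the assert. A hypothetical excerpt of data/en/tokenization and a quick check follow; the example entries, and the assumption that this module is importable as spacy.util, are illustrative rather than taken from the commit.

# Hypothetical excerpt of data/en/tokenization (format inferred from read_tokenization):
#
#   # chunk   head form   remaining tokens...
#   don't     do          n't
#   won't     will        n't
#
from spacy import util

for chunk, lex, tokens in util.read_tokenization('en'):
    print(chunk, '->', [lex] + tokens)   # e.g. "don't" -> ['do', "n't"]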


@@ -28,3 +28,10 @@ def test_case_neq():
def test_punct_neq():
    addr = lookup('Hello')
    assert lookup('Hello,') != addr


def test_short():
    addr = lookup('I')
    assert unhash(lex_of(addr)) == 'I'
    addr = lookup('not')
    assert unhash(lex_of(addr)) == 'not'
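
A further test in the same style would exercise the new special-case chain end to end. It is hypothetical: it assumes a "don't  do  n't" entry exists in data/en/tokenization and that looking up a rule chunk returns the head lexeme of its chain, neither of which the commit shows directly.

def test_contraction():
    # Hypothetical: assumes a "don't  do  n't" rule is present in data/en/tokenization
    addr = lookup("don't")
    assert unhash(lex_of(addr)) == 'do'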