* Reading in tokenization rules correctly. Passing tests.

Matthew Honnibal 2014-07-07 00:02:55 +02:00
parent 9bef797afe
commit 4e79446dc2
6 changed files with 1120 additions and 340 deletions

File diff suppressed because it is too large


@@ -10,6 +10,7 @@ from libc.stdint cimport uint64_t
from spacy.lexeme cimport Lexeme
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B
from . import util


STRINGS = {}
@@ -20,6 +21,23 @@ LEXEMES.set_empty_key(0)
cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)


def load_tokenization(token_rules):
    cdef Lexeme* word
    cdef StringHash hashed
    for chunk, lex, tokens in token_rules:
        hashed = hash_string(chunk, len(chunk))
        assert LEXEMES[hashed] == NULL
        word = _add(hashed, lex, len(lex), len(lex))
        for i, lex in enumerate(tokens):
            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
            length = len(token_string)
            hashed = hash_string(token_string, length)
            word.tail = _add(hashed, lex, 0, len(lex))
            word = word.tail


load_tokenization(util.read_tokenization('en'))


cpdef Lexeme_addr lookup(unicode string) except 0:
    '''.. function:: enumerate(sequence[, start=0])
    Fetch a Lexeme representing a word string. If the word has not been seen,
@@ -156,8 +174,8 @@ cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
cdef size_t _find_split(unicode word, size_t length):
    cdef int i = 0
    # Contractions
    if word == "'s":
        return 2
    if word.endswith("'s"):
        return length - 2
    # Leading punctuation
    if is_punct(word, 0, length):
        return 1
@@ -166,11 +184,8 @@ cdef size_t _find_split(unicode word, size_t length):
        i = length - 1
        while i >= 2 and is_punct(word, i-1, length):
            i -= 1
    else:
        # Doesn't start or end with the punct
        while i < length and not is_punct(word, i, length):
            i += 1
    return i


cdef bint is_punct(unicode word, size_t i, size_t length):
    return not word[i].isalnum()
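
For orientation, the load_tokenization hunk above keys each special-case chunk to a head lexeme holding its normalised form, then hangs the remaining tokens off it as a .tail chain stored under synthetic 'chunk:@:index:@:token' keys. Below is a plain-Python sketch of that chain-building; the SketchLexeme class, the SPECIALS dict, and the "don't" rule are illustrative stand-ins, not part of the commit.

# Plain-Python sketch (illustrative only) of the chain built by load_tokenization,
# with a dict standing in for the LEXEMES dense_hash_map and a tiny class for Lexeme*.
class SketchLexeme:
    def __init__(self, form):
        self.form = form
        self.tail = None

SPECIALS = {}

def load_tokenization_sketch(token_rules):
    for chunk, lex, tokens in token_rules:
        word = SketchLexeme(lex)
        SPECIALS[hash(chunk)] = word                 # head lexeme, keyed by the raw chunk
        for i, tok in enumerate(tokens):
            key = '%s:@:%d:@:%s' % (chunk, i, tok)   # same synthetic key scheme as the diff
            word.tail = SketchLexeme(tok)            # remaining tokens hang off the head
            SPECIALS[hash(key)] = word.tail
            word = word.tail

# A hypothetical rule ("don't", "do", ["n't"]) yields the two-lexeme chain do -> n't.
load_tokenization_sketch([("don't", "do", ["n't"])])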


@@ -1,4 +1,4 @@
/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */
/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */
#define PY_SSIZE_T_CLEAN
#ifndef CYTHON_USE_PYLONG_INTERNALS


@@ -1,4 +1,4 @@
/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */
/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */
#define PY_SSIZE_T_CLEAN
#ifndef CYTHON_USE_PYLONG_INTERNALS


@@ -1,3 +1,10 @@
import os
from os import path
import codecs


DATA_DIR = path.join(path.dirname(__file__), '..', 'data')


def utf8open(loc, mode='r'):
    return codecs.open(loc, mode, 'utf8')
@@ -12,23 +19,23 @@ def load_case_stats(data_dir):
    return case_stats


def load_clitics(data_dir):
    clitics_loc = path.join(data_dir, 'clitics.txt')
def read_tokenization(lang):
    loc = path.join(DATA_DIR, lang, 'tokenization')
    entries = []
    seen = set()
    with utf8open(clitics_loc) as clitics_file:
        for line in clitics_file:
    with utf8open(loc) as file_:
        for line in file_:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                continue
            clitics = line.split()
            word = clitics.pop(0)
            norm_form = clitics.pop(0)
            assert word not in seen, word
            seen.add(word)
            entries.append((word, norm_form, clitics))
            pieces = line.split()
            chunk = pieces.pop(0)
            lex = pieces.pop(0)
            assert chunk not in seen, chunk
            seen.add(chunk)
            entries.append((chunk, lex, pieces))
    return entries
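
As written, read_tokenization expects a whitespace-separated rules file: the first field is the raw chunk, the second its normalised head form, and any remaining fields are the extra tokens; lines starting with # and blank lines are skipped, and a duplicate chunk trips the assert. A hypothetical excerpt of data/en/tokenization and a quick check follow; the example entries, and the assumption that this module is importable as spacy.util, are illustrative rather than taken from the commit.

# Hypothetical excerpt of data/en/tokenization (format inferred from read_tokenization):
#
#   # chunk   head form   remaining tokens...
#   don't     do          n't
#   won't     will        n't
#
from spacy import util

for chunk, lex, tokens in util.read_tokenization('en'):
    print(chunk, '->', [lex] + tokens)   # e.g. "don't" -> ['do', "n't"]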


@@ -28,3 +28,10 @@ def test_case_neq():
def test_punct_neq():
    addr = lookup('Hello')
    assert lookup('Hello,') != addr


def test_short():
    addr = lookup('I')
    assert unhash(lex_of(addr)) == 'I'
    addr = lookup('not')
    assert unhash(lex_of(addr)) == 'not'
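
A further test in the same style would exercise the new special-case chain end to end. It is hypothetical: it assumes a "don't  do  n't" entry exists in data/en/tokenization and that looking up a rule chunk returns the head lexeme of its chain, neither of which the commit shows directly.

def test_contraction():
    # Hypothetical: assumes a "don't  do  n't" rule is present in data/en/tokenization
    addr = lookup("don't")
    assert unhash(lex_of(addr)) == 'do'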