mirror of https://github.com/explosion/spaCy.git

* Reading in tokenization rules correctly. Passing tests.

parent: 9bef797afe
commit: 4e79446dc2

spacy/en.cpp | 1395 lines changed (diff suppressed because it is too large)
spacy/en.pyx |   27 lines changed

spacy/en.pyx
@@ -10,6 +10,7 @@ from libc.stdint cimport uint64_t
 from spacy.lexeme cimport Lexeme
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B
+from . import util
 
 
 STRINGS = {}
@@ -20,6 +21,23 @@ LEXEMES.set_empty_key(0)
 cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
 
 
+def load_tokenization(token_rules):
+    cdef Lexeme* word
+    cdef StringHash hashed
+    for chunk, lex, tokens in token_rules:
+        hashed = hash_string(chunk, len(chunk))
+        assert LEXEMES[hashed] == NULL
+        word = _add(hashed, lex, len(lex), len(lex))
+        for i, lex in enumerate(tokens):
+            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
+            length = len(token_string)
+            hashed = hash_string(token_string, length)
+            word.tail = _add(hashed, lex, 0, len(lex))
+            word = word.tail
+
+
+load_tokenization(util.read_tokenization('en'))
+
 cpdef Lexeme_addr lookup(unicode string) except 0:
     '''.. function:: enumerate(sequence[, start=0])
     Fetch a Lexeme representing a word string. If the word has not been seen,
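A rough pure-Python sketch of what the new load_tokenization does may help when reading the hunk above: each rule's chunk is hashed and stored as a head lexeme carrying the rule's lex form, and every following token is chained off it through the tail pointer under a position-qualified key ('%s:@:%d:@:%s'), so the same token string at different positions hashes to a distinct entry. The PyLexeme class, the dictionary standing in for LEXEMES, Python's built-in hash, and the example "don't" rule are illustrative stand-ins, not spaCy's actual C structs, MurmurHash calls, or data.

# Illustrative stand-ins only; the real code works on C structs with MurmurHash.
class PyLexeme(object):
    def __init__(self, string):
        self.string = string
        self.tail = None          # next token produced by the rule, if any

LEXEMES = {}

def _add(hashed, string):
    # Stand-in for en.pyx's _add: register a lexeme under its hash and return it.
    word = PyLexeme(string)
    LEXEMES[hashed] = word
    return word

def load_tokenization(token_rules):
    for chunk, lex, tokens in token_rules:
        # The whole chunk gets an entry whose string is the rule's lex form.
        word = _add(hash(chunk), lex)
        for i, tok in enumerate(tokens):
            # Position-qualified key, mirroring '%s:@:%d:@:%s' in en.pyx.
            key = '%s:@:%d:@:%s' % (chunk, i, tok)
            word.tail = _add(hash(key), tok)
            word = word.tail

load_tokenization([(u"don't", u"do", [u"n't"])])
head = LEXEMES[hash(u"don't")]
assert head.string == u"do" and head.tail.string == u"n't"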
@@ -156,8 +174,8 @@ cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
 cdef size_t _find_split(unicode word, size_t length):
     cdef int i = 0
     # Contractions
-    if word == "'s":
-        return 2
+    if word.endswith("'s"):
+        return length - 2
     # Leading punctuation
     if is_punct(word, 0, length):
         return 1
@@ -166,11 +184,8 @@ cdef size_t _find_split(unicode word, size_t length):
        i = length - 1
        while i >= 2 and is_punct(word, i-1, length):
            i -= 1
    else:
        # Doesn't start or end with the punct
        while i < length and not is_punct(word, i, length):
            i += 1
    return i


cdef bint is_punct(unicode word, size_t i, size_t length):
    return not word[i].isalnum()
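The split heuristics in the two _find_split hunks are easier to follow outside Cython. The sketch below restates them in plain Python, keeping only the rules that appear in full here (the "'s" checks, the leading-punctuation check, and the forward scan to the first punctuation character); the trailing-punctuation branch is only partially visible in this diff, so it is left out.

def is_punct(word, i):
    # As in en.pyx: any non-alphanumeric character counts as punctuation.
    return not word[i].isalnum()

def find_split(word):
    length = len(word)
    # Contractions
    if word == u"'s":
        return 2
    if word.endswith(u"'s"):
        return length - 2
    # Leading punctuation
    if is_punct(word, 0):
        return 1
    # Otherwise scan forward to the first punctuation character.
    i = 0
    while i < length and not is_punct(word, i):
        i += 1
    return i

assert find_split(u"it's") == 2
assert find_split(u"(hello") == 1
assert find_split(u"Hello,") == 5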
@@ -1,4 +1,4 @@
-/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */
+/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */
 
 #define PY_SSIZE_T_CLEAN
 #ifndef CYTHON_USE_PYLONG_INTERNALS
@@ -1,4 +1,4 @@
-/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */
+/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */
 
 #define PY_SSIZE_T_CLEAN
 #ifndef CYTHON_USE_PYLONG_INTERNALS
@@ -1,3 +1,10 @@
 import os
 from os import path
 import codecs
+
+DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
+
+
+def utf8open(loc, mode='r'):
+    return codecs.open(loc, mode, 'utf8')
+
@@ -12,23 +19,23 @@ def load_case_stats(data_dir):
     return case_stats
 
 
-def load_clitics(data_dir):
-    clitics_loc = path.join(data_dir, 'clitics.txt')
+def read_tokenization(lang):
+    loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
     seen = set()
-    with utf8open(clitics_loc) as clitics_file:
-        for line in clitics_file:
+    with utf8open(loc) as file_:
+        for line in file_:
             line = line.strip()
             if line.startswith('#'):
                 continue
             if not line:
                 continue
-            clitics = line.split()
-            word = clitics.pop(0)
-            norm_form = clitics.pop(0)
-            assert word not in seen, word
-            seen.add(word)
-            entries.append((word, norm_form, clitics))
+            pieces = line.split()
+            chunk = pieces.pop(0)
+            lex = pieces.pop(0)
+            assert chunk not in seen, chunk
+            seen.add(chunk)
+            entries.append((chunk, lex, pieces))
     return entries
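read_tokenization expects one rule per line in data/<lang>/tokenization: the raw chunk first, then its lex form, then the remaining special-case tokens, with '#' comment lines and blank lines skipped and duplicate chunks rejected by the assert. The snippet below writes a small example file and parses it with the same logic; the two rules shown are illustrative, not taken from the actual data/en/tokenization file.

import codecs
import tempfile

# Illustrative rules in the expected format: chunk, lex, then the tokens.
sample = u"# chunk  lex  tokens...\ndon't  do  n't\ncan't  ca  n't\n"

with tempfile.NamedTemporaryFile(mode='wb', suffix='.txt', delete=False) as f:
    f.write(sample.encode('utf8'))
    loc = f.name

entries = []
seen = set()
with codecs.open(loc, 'r', 'utf8') as file_:
    for line in file_:
        line = line.strip()
        if line.startswith('#') or not line:
            continue
        pieces = line.split()
        chunk = pieces.pop(0)
        lex = pieces.pop(0)
        assert chunk not in seen, chunk
        seen.add(chunk)
        entries.append((chunk, lex, pieces))

assert entries == [(u"don't", u"do", [u"n't"]), (u"can't", u"ca", [u"n't"])]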
@@ -28,3 +28,10 @@ def test_case_neq():
 def test_punct_neq():
     addr = lookup('Hello')
     assert lookup('Hello,') != addr
+
+
+def test_short():
+    addr = lookup('I')
+    assert unhash(lex_of(addr)) == 'I'
+    addr = lookup('not')
+    assert unhash(lex_of(addr)) == 'not'
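The new test_short case pins down that short function words such as 'I' and 'not' come back from lookup with their lex string intact. Under the same imports (lookup, unhash, lex_of), and assuming the en tokenization data actually contains a rule mapping don't to do plus n't, a natural follow-up check in the same style would be the hypothetical test below; it is a sketch of what the chained entries from load_tokenization imply, not a test present in this commit.

def test_contraction_lex():
    # Hypothetical: the chunk of a tokenization rule should resolve to a
    # lexeme whose lex form is the rule's second column ("do" for "don't").
    addr = lookup(u"don't")
    assert unhash(lex_of(addr)) == u"do"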