mirror of https://github.com/explosion/spaCy.git
* Tmp. Working on refactor. Compiles, must hook up lexical feats.
This commit is contained in:
parent
46da3d74d2
commit
0930892fc1
|
@ -12,7 +12,10 @@ from .attrs import get_flags
|
|||
|
||||
|
||||
def get_lex_props(string):
|
||||
return {'flags': get_flags(string), 'dense': 1}
|
||||
return {'flags': get_flags(string), 'length': len(string),
|
||||
'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
|
||||
'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
|
||||
'sentiment': 0}
|
||||
|
||||
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
|
||||
|
||||
|
@ -45,7 +48,7 @@ class English(object):
|
|||
"""
|
||||
def __init__(self, data_dir=LOCAL_DATA_DIR):
|
||||
self._data_dir = data_dir
|
||||
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
|
||||
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
|
||||
get_lex_props=get_lex_props)
|
||||
tag_names = list(POS_TAGS.keys())
|
||||
tag_names.sort()
|
||||
|
|
|
@ -283,12 +283,12 @@ cdef class EnPosTagger:
|
|||
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
|
||||
if self.lemmatizer is None:
|
||||
return lex.sic
|
||||
cdef bytes py_string = self.strings[lex.sic]
|
||||
cdef unicode py_string = self.strings[lex.sic]
|
||||
if pos != NOUN and pos != VERB and pos != ADJ:
|
||||
return lex.sic
|
||||
cdef set lemma_strings
|
||||
cdef unicode lemma_string
|
||||
lemma_strings = self.lemmatizer(py_string.decode('utf8'), pos)
|
||||
lemma_strings = self.lemmatizer(py_string, pos)
|
||||
lemma_string = sorted(lemma_strings)[0]
|
||||
lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i
|
||||
return lemma
|
||||
|
|
|
@ -7,10 +7,8 @@ from .strings cimport StringStore
|
|||
cdef LexemeC EMPTY_LEXEME
|
||||
|
||||
|
||||
cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
|
||||
dict props) except *
|
||||
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings) except -1
|
||||
|
||||
|
||||
cdef class Lexeme:
|
||||
cdef const float* vec
|
||||
|
||||
|
|
|
@ -5,27 +5,27 @@ from murmurhash.mrmr cimport hash64
|
|||
from libc.string cimport memset
|
||||
|
||||
from .orth cimport word_shape
|
||||
from .typedefs cimport attr_t
|
||||
|
||||
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||
|
||||
|
||||
cdef LexemeC init(id_t i, unicode string, hash_t hashed,
|
||||
StringStore string_store, dict props) except *:
|
||||
cdef LexemeC lex
|
||||
lex.id = i
|
||||
lex.length = len(string)
|
||||
lex.sic = string_store[string]
|
||||
|
||||
lex.cluster = props.get('cluster', 0)
|
||||
lex.prob = props.get('prob', 0)
|
||||
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store) except -1:
|
||||
|
||||
lex.prefix = string_store[string[:1]]
|
||||
lex.suffix = string_store[string[-3:]]
|
||||
lex.shape = string_store[word_shape(string)]
|
||||
|
||||
lex.flags = props.get('flags', 0)
|
||||
return lex
|
||||
lex.length = props['length']
|
||||
lex.sic = string_store[props['sic']]
|
||||
lex.norm1 = string_store[props['norm1']]
|
||||
lex.norm2 = string_store[props['norm2']]
|
||||
lex.shape = string_store[props['shape']]
|
||||
lex.prefix = string_store[props['prefix']]
|
||||
lex.suffix = string_store[props['suffix']]
|
||||
|
||||
lex.cluster = props['cluster']
|
||||
lex.prob = props['prob']
|
||||
lex.sentiment = props['sentiment']
|
||||
|
||||
lex.flags = props['flags']
|
||||
|
||||
|
||||
cdef class Lexeme:
|
||||
|
|
|
@ -67,7 +67,7 @@ cdef class StringStore:
|
|||
if string_or_id < 1 or string_or_id >= self.size:
|
||||
raise IndexError(string_or_id)
|
||||
utf8str = &self.strings[<int>string_or_id]
|
||||
return utf8str.chars[:utf8str.length]
|
||||
return utf8str.chars[:utf8str.length].decode('utf8')
|
||||
elif isinstance(string_or_id, bytes):
|
||||
utf8str = self.intern(<char*>string_or_id, len(string_or_id))
|
||||
return utf8str.i
|
||||
|
|
|
@ -42,32 +42,5 @@ cdef class Tokens:
|
|||
|
||||
|
||||
cdef class Token:
|
||||
cdef cvarray vec
|
||||
|
||||
cdef readonly flags_t flags
|
||||
|
||||
cdef readonly attr_t id
|
||||
cdef readonly attr_t sic
|
||||
cdef readonly attr_t dense
|
||||
cdef readonly attr_t shape
|
||||
cdef readonly attr_t prefix
|
||||
cdef readonly attr_t suffix
|
||||
|
||||
cdef readonly attr_t length
|
||||
cdef readonly attr_t cluster
|
||||
cdef readonly attr_t pos_type
|
||||
|
||||
cdef readonly float prob
|
||||
cdef readonly float sentiment
|
||||
|
||||
cdef readonly Morphology morph
|
||||
cdef readonly univ_tag_t pos
|
||||
cdef readonly int fine_pos
|
||||
cdef readonly int idx
|
||||
cdef readonly int lemma
|
||||
cdef readonly int sense
|
||||
cdef readonly int dep_tag
|
||||
|
||||
cdef readonly int head_offset
|
||||
cdef readonly uint32_t l_kids
|
||||
cdef readonly uint32_t r_kids
|
||||
cdef readonly Tokens _seq
|
||||
cdef readonly int i
|
||||
|
|
188
spacy/tokens.pyx
188
spacy/tokens.pyx
|
@ -85,7 +85,7 @@ cdef class Tokens:
|
|||
token (Token):
|
||||
"""
|
||||
bounds_check(i, self.length, PADDING)
|
||||
return cinit_token(&self.data[i])
|
||||
return Token(self, i)
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterate over the tokens.
|
||||
|
@ -174,38 +174,26 @@ cdef class Tokens:
|
|||
self.data[i].lex = &EMPTY_LEXEME
|
||||
|
||||
|
||||
cdef Token cinit_token(const TokenC* c_tok):
|
||||
cdef Token py_tok = Token.__new__(Token)
|
||||
py_tok.morph = c_tok.morph
|
||||
py_tok.pos = c_tok.pos
|
||||
py_tok.fine_pos = c_tok.fine_pos
|
||||
py_tok.idx = c_tok.idx
|
||||
py_tok.lemma = c_tok.lemma
|
||||
py_tok.sense = c_tok.sense
|
||||
py_tok.dep_tag = c_tok.dep_tag
|
||||
py_tok.head_offset = c_tok.head
|
||||
py_tok.l_kids = c_tok.l_kids
|
||||
py_tok.r_kids = c_tok.r_kids
|
||||
return py_tok
|
||||
|
||||
|
||||
@cython.freelist(64)
|
||||
cdef class Token:
|
||||
"""An individual token.
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
#self._seq = tokens
|
||||
#self.i = i
|
||||
|
||||
#def __unicode__(self):
|
||||
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||
# cdef int end_idx = t.idx + t.lex.length
|
||||
# if self.i + 1 == self._seq.length:
|
||||
# return self.string
|
||||
# if end_idx == t[1].idx:
|
||||
# return self.string
|
||||
# else:
|
||||
# return self.string + ' '
|
||||
Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
|
||||
object.
|
||||
"""
|
||||
def __init__(self, Tokens tokens, int i):
|
||||
self._seq = tokens
|
||||
self.i = i
|
||||
|
||||
def __unicode__(self):
|
||||
cdef const TokenC* t = &self._seq.data[self.i]
|
||||
cdef int end_idx = t.idx + t.lex.length
|
||||
if self.i + 1 == self._seq.length:
|
||||
return self.string
|
||||
if end_idx == t[1].idx:
|
||||
return self.string
|
||||
else:
|
||||
return self.string + ' '
|
||||
|
||||
def __len__(self):
|
||||
"""The number of unicode code-points in the original string.
|
||||
|
@ -213,87 +201,87 @@ cdef class Token:
|
|||
Returns:
|
||||
length (int):
|
||||
"""
|
||||
return self.length
|
||||
return self._seq.data[self.i].lex.length
|
||||
|
||||
#property idx:
|
||||
# """The index into the original string at which the token starts.
|
||||
property idx:
|
||||
"""The index into the original string at which the token starts.
|
||||
|
||||
# The following is supposed to always be true:
|
||||
#
|
||||
# >>> original_string[token.idx:token.idx len(token) == token.string
|
||||
# """
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].idx
|
||||
The following is supposed to always be true:
|
||||
|
||||
>>> original_string[token.idx:token.idx len(token) == token.string
|
||||
"""
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].idx
|
||||
|
||||
#property cluster:
|
||||
# """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
|
||||
#
|
||||
# Similar words have better-than-chance likelihood of having similar cluster
|
||||
# IDs, although the clustering is quite noisy. Cluster IDs make good features,
|
||||
# and help to make models slightly more robust to domain variation.
|
||||
property cluster:
|
||||
"""The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
|
||||
|
||||
Similar words have better-than-chance likelihood of having similar cluster
|
||||
IDs, although the clustering is quite noisy. Cluster IDs make good features,
|
||||
and help to make models slightly more robust to domain variation.
|
||||
|
||||
# A common trick is to use only the first N bits of a cluster ID in a feature,
|
||||
# as the more general part of the hierarchical clustering is often more accurate
|
||||
# than the lower categories.
|
||||
A common trick is to use only the first N bits of a cluster ID in a feature,
|
||||
as the more general part of the hierarchical clustering is often more accurate
|
||||
than the lower categories.
|
||||
|
||||
# To assist in this, I encode the cluster IDs little-endian, to allow a simple
|
||||
# bit-mask:
|
||||
To assist in this, I encode the cluster IDs little-endian, to allow a simple
|
||||
bit-mask:
|
||||
|
||||
# >>> six_bits = cluster & (2**6 - 1)
|
||||
# """
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].lex.cluster
|
||||
>>> six_bits = cluster & (2**6 - 1)
|
||||
"""
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].lex.cluster
|
||||
|
||||
#property string:
|
||||
# """The unicode string of the word, with no whitespace padding."""
|
||||
# def __get__(self):
|
||||
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||
# if t.lex.sic == 0:
|
||||
# return ''
|
||||
# cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
|
||||
# return utf8string.decode('utf8')
|
||||
property string:
|
||||
"""The unicode string of the word, with no whitespace padding."""
|
||||
def __get__(self):
|
||||
cdef const TokenC* t = &self._seq.data[self.i]
|
||||
if t.lex.sic == 0:
|
||||
return ''
|
||||
cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
|
||||
return py_ustr
|
||||
|
||||
#property lemma:
|
||||
# """The unicode string of the word's lemma. If no part-of-speech tag is
|
||||
# assigned, the most common part-of-speech tag of the word is used.
|
||||
# """
|
||||
# def __get__(self):
|
||||
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||
# if t.lemma == 0:
|
||||
# return self.string
|
||||
# cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
|
||||
# return utf8string.decode('utf8')
|
||||
property lemma:
|
||||
"""The unicode string of the word's lemma. If no part-of-speech tag is
|
||||
assigned, the most common part-of-speech tag of the word is used.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef const TokenC* t = &self._seq.data[self.i]
|
||||
if t.lemma == 0:
|
||||
return self.string
|
||||
cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
|
||||
return py_ustr
|
||||
|
||||
#property dep_tag:
|
||||
# """The ID integer of the word's dependency label. If no parse has been
|
||||
# assigned, defaults to 0.
|
||||
# """
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].dep_tag
|
||||
property dep_tag:
|
||||
"""The ID integer of the word's dependency label. If no parse has been
|
||||
assigned, defaults to 0.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].dep_tag
|
||||
|
||||
#property pos:
|
||||
# """The ID integer of the word's part-of-speech tag, from the 13-tag
|
||||
# Google Universal Tag Set. Constants for this tag set are available in
|
||||
# spacy.typedefs.
|
||||
# """
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].pos
|
||||
property pos:
|
||||
"""The ID integer of the word's part-of-speech tag, from the 13-tag
|
||||
Google Universal Tag Set. Constants for this tag set are available in
|
||||
spacy.typedefs.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].pos
|
||||
|
||||
#property fine_pos:
|
||||
# """The ID integer of the word's fine-grained part-of-speech tag, as assigned
|
||||
# by the tagger model. Fine-grained tags include morphological information,
|
||||
# and other distinctions, and allow a more accurate tagger to be trained.
|
||||
# """
|
||||
property fine_pos:
|
||||
"""The ID integer of the word's fine-grained part-of-speech tag, as assigned
|
||||
by the tagger model. Fine-grained tags include morphological information,
|
||||
and other distinctions, and allow a more accurate tagger to be trained.
|
||||
"""
|
||||
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].fine_pos
|
||||
def __get__(self):
|
||||
return self._seq.data[self.i].fine_pos
|
||||
|
||||
#property sic:
|
||||
# def __get__(self):
|
||||
# return self._seq.data[self.i].lex.sic
|
||||
property sic:
|
||||
def __get__(self):
|
||||
return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
|
||||
|
||||
#property head:
|
||||
# """The token predicted by the parser to be the head of the current token."""
|
||||
# def __get__(self):
|
||||
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||
# return Token(self._seq, self.i + t.head)
|
||||
property head:
|
||||
"""The token predicted by the parser to be the head of the current token."""
|
||||
def __get__(self):
|
||||
cdef const TokenC* t = &self._seq.data[self.i]
|
||||
return Token(self._seq, self.i + t.head)
|
||||
|
|
|
@ -24,12 +24,13 @@ cdef struct _Cached:
|
|||
|
||||
|
||||
cdef class Vocab:
|
||||
cpdef public get_lex_props
|
||||
cpdef public lexeme_props_getter
|
||||
cdef Pool mem
|
||||
cpdef readonly StringStore strings
|
||||
cdef vector[LexemeC*] lexemes
|
||||
cdef vector[const LexemeC*] lexemes
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||
|
||||
cdef PreshMap _map
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ from os import path
|
|||
import codecs
|
||||
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport init as lexeme_init
|
||||
from .lexeme cimport set_lex_struct_props
|
||||
from .lexeme cimport Lexeme_cinit
|
||||
from .strings cimport slice_unicode
|
||||
from .strings cimport hash_string
|
||||
|
@ -21,24 +21,6 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
|||
EMPTY_LEXEME.vec = EMPTY_VEC
|
||||
|
||||
|
||||
cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
|
||||
StringStore string_store, dict props) except *:
|
||||
cdef LexemeC lex
|
||||
lex.id = i
|
||||
lex.length = len(string)
|
||||
lex.sic = string_store[string]
|
||||
|
||||
lex.cluster = props.get('cluster', 0)
|
||||
lex.prob = props.get('prob', 0)
|
||||
|
||||
lex.prefix = string_store[string[:1]]
|
||||
lex.suffix = string_store[string[-3:]]
|
||||
lex.shape = string_store[word_shape(string)]
|
||||
|
||||
lex.flags = props.get('flags', 0)
|
||||
return lex
|
||||
|
||||
|
||||
cdef class Vocab:
|
||||
'''A map container for a language's LexemeC structs.
|
||||
'''
|
||||
|
@ -47,7 +29,7 @@ cdef class Vocab:
|
|||
self._map = PreshMap(2 ** 20)
|
||||
self.strings = StringStore()
|
||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||
self.get_lex_props = get_lex_props
|
||||
self.lexeme_props_getter = get_lex_props
|
||||
|
||||
if data_dir is not None:
|
||||
if not path.exists(data_dir):
|
||||
|
@ -63,32 +45,36 @@ cdef class Vocab:
|
|||
"""The current number of lexemes stored."""
|
||||
return self.lexemes.size()
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
|
||||
cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
|
||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||
cdef LexemeC* lex
|
||||
lex = <LexemeC*>self._map.get(string.key)
|
||||
lex = <LexemeC*>self._map.get(c_str.key)
|
||||
if lex != NULL:
|
||||
return lex
|
||||
if string.n < 3:
|
||||
if c_str.n < 3:
|
||||
mem = self.mem
|
||||
cdef unicode py_string = string.chars[:string.n]
|
||||
cdef unicode py_str = c_str.chars[:c_str.n]
|
||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||
lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
|
||||
self.get_lex_props(py_string))
|
||||
props = self.lexeme_props_getter(py_str)
|
||||
set_lex_struct_props(lex, props, self.strings)
|
||||
if mem is self.mem:
|
||||
self._map.set(string.key, lex)
|
||||
while self.lexemes.size() < (lex.id + 1):
|
||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||
self.lexemes[lex.id] = lex
|
||||
lex.id = self.lexemes.size()
|
||||
self._add_lex_to_vocab(c_str.key, lex)
|
||||
else:
|
||||
lex[0].id = 1
|
||||
lex.id = 1
|
||||
return lex
|
||||
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
|
||||
self._map.set(key, <void*>lex)
|
||||
while self.lexemes.size() < (lex.id + 1):
|
||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||
self.lexemes[lex.id] = lex
|
||||
|
||||
def __getitem__(self, id_or_string):
|
||||
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
unseen unicode string is given, a new LexemeC is created and stored.
|
||||
unseen unicode string is given, a new lexeme is created and stored.
|
||||
|
||||
Args:
|
||||
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
||||
|
@ -100,24 +86,28 @@ cdef class Vocab:
|
|||
lexeme (Lexeme): An instance of the Lexeme Python class, with data
|
||||
copied on instantiation.
|
||||
'''
|
||||
cdef UniStr string
|
||||
cdef UniStr c_str
|
||||
cdef const LexemeC* lexeme
|
||||
if type(id_or_string) == int:
|
||||
if id_or_string >= self.lexemes.size():
|
||||
raise IndexError
|
||||
lexeme = self.lexemes.at(id_or_string)
|
||||
else:
|
||||
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
||||
lexeme = self.get(self.mem, &string)
|
||||
slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
|
||||
lexeme = self.get(self.mem, &c_str)
|
||||
return Lexeme_cinit(lexeme, self.strings)
|
||||
|
||||
def __setitem__(self, unicode uni_string, dict props):
|
||||
cdef UniStr s
|
||||
slice_unicode(&s, uni_string, 0, len(uni_string))
|
||||
# Cast through the const here, since we're allowed to change our own
|
||||
# LexemeCs.
|
||||
lex = <LexemeC*><void*>self.get(self.mem, &s)
|
||||
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
||||
def __setitem__(self, unicode py_str, dict props):
|
||||
cdef UniStr c_str
|
||||
slice_unicode(&c_str, py_str, 0, len(py_str))
|
||||
cdef LexemeC* lex
|
||||
lex = <LexemeC*>self._map.get(c_str.key)
|
||||
if lex == NULL:
|
||||
lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||
lex.id = self.lexemes.size()
|
||||
self._add_lex_to_vocab(c_str.key, lex)
|
||||
set_lex_struct_props(lex, props, self.strings)
|
||||
assert lex.sic < 1000000
|
||||
|
||||
def dump(self, loc):
|
||||
if path.exists(loc):
|
||||
|
@ -154,6 +144,7 @@ cdef class Vocab:
|
|||
if st != 1:
|
||||
break
|
||||
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||
lexeme.vec = EMPTY_VEC
|
||||
st = fread(lexeme, sizeof(LexemeC), 1, fp)
|
||||
if st != 1:
|
||||
break
|
||||
|
|
Loading…
Reference in New Issue