mirror of https://github.com/explosion/spaCy.git
* Use an AttributeCodec based on ORTH for words. There is still no mechanism for handling out-of-vocabulary (OOV) words.
This commit is contained in:
parent
82d84b0f2b
commit
5b4c78bbb2
|
@ -8,7 +8,7 @@ from libcpp.pair cimport pair
|
||||||
from cymem.cymem cimport Address, Pool
|
from cymem.cymem cimport Address, Pool
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
|
|
||||||
from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
|
@ -34,17 +34,6 @@ cimport cython
|
||||||
# Entity tag
|
# Entity tag
|
||||||
|
|
||||||
|
|
||||||
def make_vocab_codec(Vocab vocab):
|
|
||||||
cdef int length = len(vocab)
|
|
||||||
cdef Address mem = Address(length, sizeof(float))
|
|
||||||
probs = <float*>mem.ptr
|
|
||||||
cdef int i
|
|
||||||
for i in range(length):
|
|
||||||
probs[i] = <float>c_exp(vocab.lexemes[i].prob)
|
|
||||||
cdef float[:] cv_probs = <float[:len(vocab)]>probs
|
|
||||||
return HuffmanCodec(cv_probs)
|
|
||||||
|
|
||||||
|
|
||||||
cdef class _BinaryCodec:
|
cdef class _BinaryCodec:
|
||||||
def encode(self, attr_t[:] msg, BitArray bits):
|
def encode(self, attr_t[:] msg, BitArray bits):
|
||||||
cdef int i
|
cdef int i
|
||||||
|
@ -112,9 +101,7 @@ cdef class Packer:
|
||||||
attrs = []
|
attrs = []
|
||||||
|
|
||||||
for attr, freqs in list_of_attr_freqs:
|
for attr, freqs in list_of_attr_freqs:
|
||||||
if attr == ID:
|
if attr == SPACY:
|
||||||
codecs.append(make_vocab_codec(vocab))
|
|
||||||
elif attr == SPACY:
|
|
||||||
codecs.append(_BinaryCodec())
|
codecs.append(_BinaryCodec())
|
||||||
else:
|
else:
|
||||||
codecs.append(_AttributeCodec(freqs))
|
codecs.append(_AttributeCodec(freqs))
|
||||||
|
|
Loading…
Reference in New Issue