* Use an AttributeCodec based on ORTH for words. There is still no mechanism for handling out-of-vocabulary (OOV) words.

2015-07-18 22:43:18 +02:00 · 2015-07-18 22:43:18 +02:00 · 5b4c78bbb2
parent 82d84b0f2b
commit 5b4c78bbb2
1 changed file with 2 additions and 15 deletions
--- a/spacy/serialize/packer.pyx
+++ b/spacy/serialize/packer.pyx
@@ -8,7 +8,7 @@ from libcpp.pair cimport pair
 from cymem.cymem cimport Address, Pool
 from preshed.maps cimport PreshMap
-from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
+from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 from ..typedefs cimport attr_t
@@ -34,17 +34,6 @@ cimport cython
 #       Entity tag
-def make_vocab_codec(Vocab vocab):
-    cdef int length = len(vocab)
-    cdef Address mem = Address(length, sizeof(float))
-    probs = <float*>mem.ptr
-    cdef int i
-    for i in range(length):
-        probs[i] = <float>c_exp(vocab.lexemes[i].prob)
-    cdef float[:] cv_probs = <float[:len(vocab)]>probs
-    return HuffmanCodec(cv_probs)
 cdef class _BinaryCodec:
    def encode(self, attr_t[:] msg, BitArray bits):
        cdef int i
@@ -112,9 +101,7 @@ cdef class Packer:
        attrs = []
        for attr, freqs in list_of_attr_freqs:
-            if attr == ID:
+            if attr == SPACY:
-                codecs.append(make_vocab_codec(vocab))
-            elif attr == SPACY:
                codecs.append(_BinaryCodec())
            else:
                codecs.append(_AttributeCodec(freqs))