spaCy/spacy/ml/_character_embed.py

from thinc.api import Model


def CharacterEmbed(nM, nC):
    # nM: Number of dimensions per character. nC: Number of characters.
    nO = nM * nC if (nM is not None and nC is not None) else None
    return Model(
        "charembed",
        forward,
        init=init,
        dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
        params={"E": None},
    ).initialize()


def init(model, X=None, Y=None):
    vectors_table = model.ops.alloc3f(
        model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
    )
    model.set_param("E", vectors_table)


def forward(model, docs, is_train):
    if not docs:
        return []
    ids = []
    output = []
    E = model.get_param("E")
    nC = model.get_dim("nC")
    nM = model.get_dim("nM")
    nO = model.get_dim("nO")
    # This assists in indexing; it's like looping over this dimension.
    # Still consider this weird witch craft...But thanks to Mark Neumann
    # for the tip.
    nCv = model.ops.xp.arange(nC)
    for doc in docs:
        doc_ids = doc.to_utf8_array(nr_char=nC)
        doc_vectors = model.ops.alloc3f(len(doc), nC, nM)
        # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
        # incantation do I chant to get
        # output[i, j, k] == data[j, ids[i, j], k]?
        doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]]
        output.append(doc_vectors.reshape((len(doc), nO)))
        ids.append(doc_ids)

    def backprop(d_output):
        dE = model.ops.alloc(E.shape, dtype=E.dtype)
        for doc_ids, d_doc_vectors in zip(ids, d_output):
            d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), nC, nM))
            dE[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
        model.inc_grad("E", dE)
        return []

    return output, backprop