2018-05-15 20:17:29 +00:00
|
|
|
from libc.string cimport memset, memcpy
|
Parser: use C saxpy/sgemm provided by the Ops implementation (#10773)
* Parser: use C saxpy/sgemm provided by the Ops implementation
This is a backport of https://github.com/explosion/spaCy/pull/10747
from the parser refactor branch. It eliminates the explicit calls
to BLIS, instead using the saxpy/sgemm provided by the Ops
implementation.
This allows us to use Accelerate in the parser on M1 Macs (with
an updated thinc-apple-ops).
Performance of the de_core_news_lg pipe:
BLIS 0.7.0, no thinc-apple-ops: 6385 WPS
BLIS 0.7.0, thinc-apple-ops: 36455 WPS
BLIS 0.9.0, no thinc-apple-ops: 19188 WPS
BLIS 0.9.0, thinc-apple-ops: 36682 WPS
This PR, thinc-apple-ops: 38726 WPS
Performance of the de_core_news_lg pipe (only tok2vec -> parser):
BLIS 0.7.0, no thinc-apple-ops: 13907 WPS
BLIS 0.7.0, thinc-apple-ops: 73172 WPS
BLIS 0.9.0, no thinc-apple-ops: 41576 WPS
BLIS 0.9.0, thinc-apple-ops: 72569 WPS
This PR, thinc-apple-ops: 87061 WPS
* Require thinc >=8.1.0,<8.2.0
* Lower thinc lower bound to 8.1.0.dev0
* Use best CPU ops for CBLAS when the parser model is on the GPU
* Fix another unguarded cblas() call
* Fix: use ops as a shorthand for self.model.ops
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
2022-05-27 09:20:52 +00:00
|
|
|
from thinc.backends.cblas cimport CBlas
|
2020-07-30 21:30:54 +00:00
|
|
|
from ..typedefs cimport weight_t, hash_t
|
|
|
|
from ..pipeline._parser_internals._state cimport StateC
|
2018-05-15 20:17:29 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Plain C struct bundling the integer dimensions of the parser model.
# A value of this type is returned by get_c_sizes(model, batch_size) and is
# passed by value to alloc_activations() and predict_states().
# NOTE(review): the per-field meanings below are inferred from the field
# names only -- confirm against the implementation file.
cdef struct SizesC:
|
|
|
|
# presumably the number of parser states in the batch -- TODO confirm
int states
|
|
|
|
# presumably the number of output classes that are scored -- TODO confirm
int classes
|
|
|
|
# presumably the width of the hidden layer -- TODO confirm
int hiddens
|
|
|
|
# presumably the number of pieces per hidden unit -- TODO confirm
int pieces
|
|
|
|
# presumably the number of features extracted per state -- TODO confirm
int feats
|
|
|
|
# presumably the width of each embedded feature vector -- TODO confirm
int embed_width
|
|
|
|
|
|
|
|
|
|
|
|
# Plain C struct of read-only float pointers to the model parameters.
# A value of this type is returned by get_c_weights(model) and is passed to
# predict_states() as a const pointer. Every member is `const float*`, so
# code holding this struct cannot write through these pointers.
# NOTE(review): this declaration does not show who owns the pointed-to
# buffers or how long they stay valid -- confirm in the implementation.
cdef struct WeightsC:
|
|
|
|
# presumably the weights of the feature (lower) layer -- TODO confirm
const float* feat_weights
|
|
|
|
# presumably the bias of the feature (lower) layer -- TODO confirm
const float* feat_bias
|
|
|
|
# presumably the bias of the hidden-to-output layer -- TODO confirm
const float* hidden_bias
|
|
|
|
# presumably the weights of the hidden-to-output layer -- TODO confirm
const float* hidden_weights
|
2019-02-24 15:41:41 +00:00
|
|
|
# presumably per-class flags marking classes that have been seen; stored as
# float, not int -- TODO confirm semantics
const float* seen_classes
|
2018-05-15 20:17:29 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Plain C struct holding the mutable work buffers used during prediction.
# A value of this type is returned by alloc_activations(n); a pointer to one
# is taken by free_activations() and predict_states().
# NOTE(review): buffer sizes and the exact role of each member are not
# visible in this declaration -- confirm against the implementation file.
cdef struct ActivationsC:
|
|
|
|
# presumably the token indices used as features for each state -- TODO confirm
int* token_ids
|
|
|
|
# presumably hidden-layer values before the max over pieces -- TODO confirm
float* unmaxed
|
|
|
|
# presumably the output scores per state and class -- TODO confirm
float* scores
|
|
|
|
# presumably the hidden-layer activations -- TODO confirm
float* hiddens
|
|
|
|
# presumably per-class validity flags -- TODO confirm
int* is_valid
|
|
|
|
# leading underscore suggests internal bookkeeping; presumably the size
# currently in use -- TODO confirm
int _curr_size
|
|
|
|
# leading underscore suggests internal bookkeeping; presumably the allocated
# capacity -- TODO confirm
int _max_size
|
|
|
|
|
|
|
|
|
|
|
|
# Build a WeightsC struct from the Python-level `model` object and return it
# by value. `model` is an untyped Python object and the function is not
# declared `nogil`, so it has to be called with the GIL held.
# `except *`: the struct return type has no spare value to signal an error,
# so Cython checks for a pending Python exception after every call.
cdef WeightsC get_c_weights(model) except *
|
|
|
|
|
|
|
|
# Build a SizesC struct from the Python-level `model` object and the given
# `batch_size`, and return it by value. Not declared `nogil`, so it has to be
# called with the GIL held.
# `except *`: Cython checks for a pending Python exception after every call.
# NOTE(review): presumably `batch_size` populates SizesC.states -- confirm.
cdef SizesC get_c_sizes(model, int batch_size) except *
|
|
|
|
|
2019-10-22 13:06:44 +00:00
|
|
|
# Return an ActivationsC by value for the dimensions given in `n`.
# Declared `nogil`, so it may be called without holding the GIL.
# NOTE(review): presumably allocates the buffers inside the returned struct,
# to be released with free_activations() -- confirm in the implementation.
cdef ActivationsC alloc_activations(SizesC n) nogil
|
|
|
|
|
|
|
|
# Counterpart to alloc_activations(): takes a const pointer to an
# ActivationsC and returns nothing. Declared `nogil`, so it may be called
# without holding the GIL.
# NOTE(review): presumably frees the buffers referenced by `A` but not the
# struct itself, since `A` is a pointer-to-const -- confirm in the
# implementation.
cdef void free_activations(const ActivationsC* A) nogil
|
2018-05-15 20:17:29 +00:00
|
|
|
|
Parser: use C saxpy/sgemm provided by the Ops implementation (#10773)
* Parser: use C saxpy/sgemm provided by the Ops implementation
This is a backport of https://github.com/explosion/spaCy/pull/10747
from the parser refactor branch. It eliminates the explicit calls
to BLIS, instead using the saxpy/sgemm provided by the Ops
implementation.
This allows us to use Accelerate in the parser on M1 Macs (with
an updated thinc-apple-ops).
Performance of the de_core_news_lg pipe:
BLIS 0.7.0, no thinc-apple-ops: 6385 WPS
BLIS 0.7.0, thinc-apple-ops: 36455 WPS
BLIS 0.9.0, no thinc-apple-ops: 19188 WPS
BLIS 0.9.0, thinc-apple-ops: 36682 WPS
This PR, thinc-apple-ops: 38726 WPS
Performance of the de_core_news_lg pipe (only tok2vec -> parser):
BLIS 0.7.0, no thinc-apple-ops: 13907 WPS
BLIS 0.7.0, thinc-apple-ops: 73172 WPS
BLIS 0.9.0, no thinc-apple-ops: 41576 WPS
BLIS 0.9.0, thinc-apple-ops: 72569 WPS
This PR, thinc-apple-ops: 87061 WPS
* Require thinc >=8.1.0,<8.2.0
* Lower thinc lower bound to 8.1.0.dev0
* Use best CPU ops for CBLAS when the parser model is on the GPU
* Fix another unguarded cblas() call
* Fix: use ops as a shorthand for self.model.ops
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
2022-05-27 09:20:52 +00:00
|
|
|
# Run the model over a batch of parser states. Returns nothing; declared
# `nogil`, so it may be called without holding the GIL.
#   cblas  -- CBlas handle cimported from thinc.backends.cblas
#   A      -- mutable activation buffers (non-const pointer)
#   states -- array of StateC pointers
#   W      -- model weights, read-only (pointer-to-const)
#   n      -- model/batch dimensions, passed by value
# NOTE(review): presumably the results are written into A (e.g. A.scores)
# and `states` holds n.states entries -- confirm in the implementation.
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
2018-05-15 20:17:29 +00:00
|
|
|
const WeightsC* W, SizesC n) nogil
|
|
|
|
|
|
|
|
# Return an int computed from two read-only arrays, `scores` and `is_valid`,
# and a count `n`. Declared `nogil`, so it may be called without holding the
# GIL.
# NOTE(review): going by the name, presumably returns the index of the
# highest score among entries whose is_valid flag is set, with `n` the
# length of both arrays; the result when no entry is valid is not visible
# here -- confirm in the implementation.
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
|
|
|
|
|
|
|
|
# Returns nothing; `d_scores` is the only non-const pointer, so it is the
# only buffer this function can write through. `costs`, `is_valid` and
# `scores` are read-only inputs. Declared `nogil`, so it may be called
# without holding the GIL.
# NOTE(review): going by the name, presumably writes the gradient of a log
# loss with respect to the scores into d_scores, with `O` the number of
# output classes (the length of each array) -- confirm in the implementation.
cdef void cpu_log_loss(float* d_scores,
|
|
|
|
const float* costs, const int* is_valid, const float* scores, int O) nogil
|
|
|
|
|