diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 474942558..a3f8b3c87 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -28,11 +28,31 @@ def build_hash_embed_cnn_tok2vec( window_size: int, maxout_pieces: int, subword_features: bool, - dropout: Optional[float], pretrained_vectors: Optional[bool] ) -> Model[List[Doc], List[Floats2d]]: """Build spaCy's 'standard' tok2vec layer, which uses hash embedding - with subword features and a CNN with layer-normalized maxout.""" + with subword features and a CNN with layer-normalized maxout. + + width (int): The width of the input and output. These are required to be the + same, so that residual connections can be used. Recommended values are + 96, 128 or 300. + depth (int): The number of convolutional layers to use. Recommended values + are between 2 and 8. + window_size (int): The number of tokens on either side to concatenate during + the convolutions. The receptive field of the CNN will be + depth * window_size * 2 + 1, so a 4-layer network with window_size of + 2 will be sensitive to 17 words at a time. Recommended value is 1. + embed_size (int): The number of rows in the hash embedding tables. This can + be surprisingly small, due to the use of the hash embeddings. Recommended + values are between 2000 and 10000. + maxout_pieces (int): The number of pieces to use in the maxout non-linearity. + If 1, the Mish non-linearity is used instead. Recommended values are 1-3. + subword_features (bool): Whether to also embed subword features, specifically + the prefix, suffix and word shape. This is recommended for alphabetic + languages like English, but not if single-character tokens are used for + a language such as Chinese. + pretrained_vectors (bool): Whether to also use static vectors. 
+ """ return build_Tok2Vec_model( embed=MultiHashEmbed( width=width, @@ -54,7 +74,14 @@ def build_Tok2Vec_model( embed: Model[List[Doc], List[Floats2d]], encode: Model[List[Floats2d], List[Floats2d]], ) -> Model[List[Doc], List[Floats2d]]: + """Construct a tok2vec model out of embedding and encoding subnetworks. + See https://explosion.ai/blog/deep-learning-formula-nlp + embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent + word vector representations. + encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the + embeddings, using an architecture such as a CNN, BiLSTM or transformer. + """ receptive_field = encode.attrs.get("receptive_field", 0) tok2vec = chain(embed, with_array(encode, pad=receptive_field)) tok2vec.set_dim("nO", encode.get_dim("nO")) @@ -67,6 +94,27 @@ def build_Tok2Vec_model( def MultiHashEmbed( width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool ): + """Construct an embedding layer that separately embeds a number of lexical + attributes using hash embedding, concatenates the results, and passes it + through a feed-forward subnetwork to build a mixed representation. + + The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have + varying definitions depending on the Vocab of the Doc object passed in. + Vectors from pretrained static vectors can also be incorporated into the + concatenated representation. + + width (int): The output width. Also used as the width of the embedding tables. + Recommended values are between 64 and 300. + rows (int): The number of rows for the embedding tables. Can be low, due + to the hashing trick. Embeddings for prefix, suffix and word shape + use half as many rows. Recommended values are between 2000 and 10000. + also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE + features in the embeddings. If not using these, you may need more + rows in your hash embeddings, as there will be increased chance of + collisions. 
+ also_use_static_vectors (bool): Whether to also use static word vectors. + Requires a vectors table to be loaded in the Doc objects' vocab. + """ cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH] seed = 7 @@ -117,6 +165,30 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed(width: int, rows: int, nM: int, nC: int): + """Construct an embedded representation based on character embeddings, using + a feed-forward network. A fixed number of UTF-8 byte characters are used for + each word, taken from the beginning and end of the word equally. Padding is + used in the centre for words that are too short. + + For instance, let's say nC=4, and the word is "jumping". The characters + used will be "jung" (two from the start, two from the end). If we had nC=8, + the characters would be "jumpping": 4 from the start, 4 from the end. This + ensures that the final character is always in the last position, instead + of being in an arbitrary position depending on the word length. + + The characters are embedded in an embedding table with 256 rows, and the + vectors concatenated. A hash-embedded vector of the NORM of the word is + also concatenated on, and the result is then passed through a feed-forward + network to construct a single vector to represent the information. + + width (int): The width of the output vector and the NORM hash embedding. + rows (int): The number of rows in the NORM hash embedding table. + nM (int): The dimensionality of the character embeddings. Recommended values + are between 16 and 64. + nC (int): The number of UTF-8 bytes to embed per word. Recommended values + are between 3 and 8, although it may depend on the length of words in the + language. 
+ """ model = chain( concatenate( chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), @@ -133,7 +205,19 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int): @registry.architectures.register("spacy.MaxoutWindowEncoder.v1") -def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int): +def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int) -> Model[List[Floats2d], List[Floats2d]]: + """Encode context using convolutions with maxout activation, layer + normalization and residual connections. + + width (int): The input and output width. These are required to be the same, + to allow residual connections. This value will be determined by the + width of the inputs. Recommended values are between 64 and 300. + window_size (int): The number of words to concatenate around each token + to construct the convolution. Recommended value is 1. + maxout_pieces (int): The number of maxout pieces to use. Recommended + values are 2 or 3. + depth (int): The number of convolutional layers. Recommended value is 4. + """ cnn = chain( expand_window(window_size=window_size), Maxout( @@ -151,7 +235,17 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: @registry.architectures.register("spacy.MishWindowEncoder.v1") -def MishWindowEncoder(width, window_size, depth): +def MishWindowEncoder(width: int, window_size: int, depth: int) -> Model[List[Floats2d], List[Floats2d]]: + """Encode context using convolutions with mish activation, layer + normalization and residual connections. + + width (int): The input and output width. These are required to be the same, + to allow residual connections. This value will be determined by the + width of the inputs. Recommended values are between 64 and 300. + window_size (int): The number of words to concatenate around each token + to construct the convolution. Recommended value is 1. + depth (int): The number of convolutional layers. 
Recommended value is 4. + """ cnn = chain( expand_window(window_size=window_size), Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True), @@ -162,7 +256,16 @@ def MishWindowEncoder(width, window_size, depth): @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") -def BiLSTMEncoder(width, depth, dropout): +def BiLSTMEncoder(width: int, depth: int, dropout: float) -> Model[List[Floats2d], List[Floats2d]]: + """Encode context using bidirectional LSTM layers. Requires PyTorch. + + width (int): The input and output width. These are required to be the same, + to allow residual connections. This value will be determined by the + width of the inputs. Recommended values are between 64 and 300. + depth (int): The number of recurrent (LSTM) layers. If 0, a no-op layer + is returned instead of an LSTM. + dropout (float): The dropout rate passed to the PyTorch LSTM layers. + """ if depth == 0: return noop() return with_padded(PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout))