From 07d02c33040b250abf0ddc43b95a638c12ab4b54 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 01:25:17 +0100 Subject: [PATCH] Update vectors and similarity usage guide --- website/usage/_data.json | 1 - .../usage/_vectors-similarity/_basics.jade | 124 +++++++++++++ .../usage/_vectors-similarity/_custom.jade | 167 ++++++++++++++---- .../_vectors-similarity/_in-context.jade | 123 ------------- website/usage/vectors-similarity.jade | 4 - 5 files changed, 260 insertions(+), 159 deletions(-) delete mode 100644 website/usage/_vectors-similarity/_in-context.jade diff --git a/website/usage/_data.json b/website/usage/_data.json index 4a4e6df01..498202695 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -116,7 +116,6 @@ "next": "text-classification", "menu": { "Basics": "basics", - "Similarity in Context": "in-context", "Custom Vectors": "custom", "GPU Usage": "gpu" } diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade index b8f8d834c..300680331 100644 --- a/website/usage/_vectors-similarity/_basics.jade +++ b/website/usage/_vectors-similarity/_basics.jade @@ -13,3 +13,127 @@ include ../_spacy-101/_similarity include ../_spacy-101/_word-vectors + ++h(3, "in-context") Similarities in context + +p + | Aside from spaCy's built-in word vectors, which were trained on a lot of + | text with a wide vocabulary, the parsing, tagging and NER models also + | rely on vector representations of the #[strong meanings of words in context]. + | As the first component of the + | #[+a("/usage/processing-pipelines") processing pipeline], the + | tensorizer encodes a document's internal meaning representations as an + | array of floats, also called a tensor. This allows spaCy to make a + | reasonable guess at a word's meaning, based on its surrounding words. + | Even if a word hasn't been seen before, spaCy will know #[em something] + | about it. 
Because spaCy uses a 4-layer convolutional network, the + | tensors are sensitive to up to #[strong four words on either side] of a + | word. + +p + | For example, here are three sentences containing the out-of-vocabulary + | word "labrador" in different contexts. + ++code. + doc1 = nlp(u"The labrador barked.") + doc2 = nlp(u"The labrador swam.") + doc3 = nlp(u"the labrador people live in canada.") + + for doc in [doc1, doc2, doc3]: + labrador = doc[1] + dog = nlp(u"dog") + print(labrador.similarity(dog)) + +p + | Even though the model has never seen the word "labrador", it can make a + | fairly accurate prediction of its similarity to "dog" in different + | contexts. + ++table(["Context", "labrador.similarity(dog)"]) + +row + +cell The #[strong labrador] barked. + +cell #[code 0.56] #[+procon("yes", "similar")] + + +row + +cell The #[strong labrador] swam. + +cell #[code 0.48] #[+procon("no", "dissimilar")] + + +row + +cell the #[strong labrador] people live in canada. + +cell #[code 0.39] #[+procon("no", "dissimilar")] + +p + | The same also works for whole documents. Here, the variance of the + | similarities is lower, as all words and their order are taken into + | account. However, the context-specific similarity is often still + | reflected pretty accurately. + ++code. + doc1 = nlp(u"Paris is the largest city in France.") + doc2 = nlp(u"Vilnius is the capital of Lithuania.") + doc3 = nlp(u"An emu is a large bird.") + + for doc in [doc1, doc2, doc3]: + for other_doc in [doc1, doc2, doc3]: + print(doc.similarity(other_doc)) + +p + | Even though the sentences about Paris and Vilnius consist of different + | words and entities, they both describe the same concept and are seen as + | more similar than the sentence about emus. In this case, even a misspelled + | version of "Vilnius" would still produce very similar results. 
+
++table
+    - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
+    - var counter = 0
+
+    +row
+    +row
+        +cell
+        for _, label in examples
+            +cell=label
+
+    each cells, label in examples
+        +row(counter ? null : "divider")
+            +cell=label
+            for cell in cells
+                +cell.u-text-center
+                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
+                    | #[code=cell.toFixed(2)] #[+procon(...result)]
+        - counter++
+
+p
+    | Sentences that consist of the same words in different order will likely
+    | be seen as very similar – but never identical.
+
++code.
+    docs = [nlp(u"dog bites man"), nlp(u"man bites dog"),
+            nlp(u"man dog bites"), nlp(u"dog man bites")]
+
+    for doc in docs:
+        for other_doc in docs:
+            print(doc.similarity(other_doc))
+
+p
+    | Interestingly, "man bites dog" and "man dog bites" are seen as slightly
+    | more similar than "man bites dog" and "dog bites man". This may be a
+    | coincidence – or the result of "man" being interpreted as both sentences'
+    | subject.
+
++table
+    - var examples = {"dog bites man": [1, 0.9, 0.89, 0.92], "man bites dog": [0.9, 1, 0.93, 0.9], "man dog bites": [0.89, 0.93, 1, 0.92], "dog man bites": [0.92, 0.9, 0.92, 1]}
+    - var counter = 0
+
+    +row("head")
+        +cell
+        for _, label in examples
+            +cell.u-text-center=label
+
+    each cells, label in examples
+        +row(counter ? null : "divider")
+            +cell=label
+            for cell in cells
+                +cell.u-text-center
+                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? 
["yes", "similar"] : ["neutral", "identical"] + | #[code=cell.toFixed(2)] #[+procon(...result)] + - counter++ diff --git a/website/usage/_vectors-similarity/_custom.jade b/website/usage/_vectors-similarity/_custom.jade index da4be39fd..7792949d1 100644 --- a/website/usage/_vectors-similarity/_custom.jade +++ b/website/usage/_vectors-similarity/_custom.jade @@ -1,49 +1,137 @@ //- 💫 DOCS > USAGE > VECTORS & SIMILARITY > CUSTOM VECTORS p - | By default, #[+api("token#vector") #[code Token.vector]] returns the - | vector for its underlying #[+api("lexeme") #[code Lexeme]], while - | #[+api("doc#vector") #[code Doc.vector]] and - | #[+api("span#vector") #[code Span.vector]] return an average of the - | vectors of their tokens. You can customize these - | behaviours by modifying the #[code doc.user_hooks], - | #[code doc.user_span_hooks] and #[code doc.user_token_hooks] - | dictionaries. + | Word vectors let you import knowledge from raw text into your model. The + | knowledge is represented as a table of numbers, with one row per term in + | your vocabulary. If two terms are used in similar contexts, the algorithm + | that learns the vectors should assign them + | #[strong rows that are quite similar], while words that are used in + | different contexts will have quite different values. This lets you use + | the row-values assigned to the words as a kind of dictionary, to tell you + | some things about what the words in your text mean. -+infobox - | For more details on #[strong adding hooks] and #[strong overwriting] the - | built-in #[code Doc], #[code Span] and #[code Token] methods, see the - | usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks]. +p + | Word vectors are particularly useful for terms which + | #[strong aren't well represented in your labelled training data]. + | For instance, if you're doing named entity recognition, there will always + | be lots of names that you don't have examples of. 
For instance, imagine + | your training data happens to contain some examples of the term + | "Microsoft", but it doesn't contain any examples of the term "Symantec". + | In your raw text sample, there are plenty of examples of both terms, and + | they're used in similar contexts. The word vectors make that fact + | available to the entity recognition model. It still won't see examples of + | "Symantec" labelled as a company. However, it'll see that "Symantec" has + | a word vector that usually corresponds to company terms, so it can + | #[strong make the inference]. + +p + | In order to make best use of the word vectors, you want the word vectors + | table to cover a #[strong very large vocabulary]. However, most words are + | rare, so most of the rows in a large word vectors table will be accessed + | very rarely, or never at all. You can usually cover more than + | #[strong 95% of the tokens] in your corpus with just + | #[strong a few thousand rows] in the vector table. However, it's those + | #[strong 5% of rare terms] where the word vectors are + | #[strong most useful]. The problem is that increasing the size of the + | vector table produces rapidly diminishing returns in coverage over these + | rare terms. + ++h(3, "custom-vectors-coverage") Optimising vector coverage + +tag-new(2) + +p + | To help you strike a good balance between coverage and memory usage, + | spaCy's #[+api("vectors") #[code Vectors]] class lets you map + | #[strong multiple keys] to the #[strong same row] of the table. If + | you're using the #[+api("cli#vocab") #[code spacy vocab]] command to + | create a vocabulary, pruning the vectors will be taken care of + | automatically. You can also do it manually in the following steps: + ++list("numbers") + +item + | Start with a #[strong word vectors model] that covers a huge + | vocabulary. 
For instance, the + | #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]] model + | provides 300-dimensional GloVe vectors for over 1 million terms of + | English. + + +item + | If your vocabulary has values set for the #[code Lexeme.prob] + | attribute, the lexemes will be sorted by descending probability to + | determine which vectors to prune. Otherwise, lexemes will be sorted + | by their order in the #[code Vocab]. + + +item + | Call #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]] with + | the number of vectors you want to keep. + ++code. + nlp = spacy.load('en_vectors_web_lg') + n_vectors = 105000 # number of vectors to keep + removed_words = nlp.vocab.prune_vectors(n_vectors) + + assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned + assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries + +p + | #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]] reduces the + | current vector table to a given number of unique entries, and returns a + | dictionary containing the removed words, mapped to #[code (string, score)] + | tuples, where #[code string] is the entry the removed word was mapped + | to, and #[code score] the similarity score between the two words. + ++code("Removed words"). + { + 'Shore': ('coast', 0.732257), + 'Precautionary': ('caution', 0.490973), + 'hopelessness': ('sadness', 0.742366), + 'Continous': ('continuous', 0.732549), + 'Disemboweled': ('corpse', 0.499432), + 'biostatistician': ('scientist', 0.339724), + 'somewheres': ('somewheres', 0.402736), + 'observing': ('observe', 0.823096), + 'Leaving': ('leaving', 1.0) + } + +p + | In the example above, the vector for "Shore" was removed and remapped + | to the vector of "coast", which is deemed about 73% similar. "Leaving" + | was remapped to the vector of "leaving", which is identical. 
+h(3, "custom-vectors-add") Adding vectors +tag-new(2) p - | The new #[+api("vectors") #[code Vectors]] class makes it easy to add - | your own vectors to spaCy. Just like the #[+api("vocab") #[code Vocab]], - | it is initialised with a #[+api("stringstore") #[code StringStore]] or - | a list of strings. + | spaCy's new #[+api("vectors") #[code Vectors]] class greatly improves the + | way word vectors are stored, accessed and used. The data is stored in + | two structures: -+code("Adding vectors one-by-one"). - from spacy.strings import StringStore - from spacy.vectors import Vectors ++list + +item + | An array, which can be either on CPU or #[+a("#gpu") GPU]. - vector_data = {'dog': numpy.random.uniform(-1, 1, (300,)), - 'cat': numpy.random.uniform(-1, 1, (300,)), - 'orange': numpy.random.uniform(-1, 1, (300,))} - - vectors = Vectors(StringStore(), 300) - for word, vector in vector_data.items(): - vectors.add(word, vector) + +item + | A dictionary mapping string-hashes to rows in the table. p - | You can also add the vector values directly on initialisation: + | Keep in mind that the #[code Vectors] class itself has no + | #[+api("stringstore") #[code StringStore]], so you have to store the + | hash-to-string mapping separately. If you need to manage the strings, + | you should use the #[code Vectors] via the + | #[+api("vocab") #[code Vocab]] class, e.g. #[code vocab.vectors]. To + | add vectors to the vocabulary, you can use the + | #[+api("vocab#set_vector") #[code Vocab.set_vector]] method. -+code("Adding vectors on initialisation"). - from spacy.vectors import Vectors ++code("Adding vectors"). 
+ from spacy.vocab import Vocab - vector_table = numpy.zeros((3, 300), dtype='f') - vectors = Vectors([u'dog', u'cat', u'orange'], vector_table) + vector_data = {u'dog': numpy.random.uniform(-1, 1, (300,)), + u'cat': numpy.random.uniform(-1, 1, (300,)), + u'orange': numpy.random.uniform(-1, 1, (300,))} + + vocab = Vocab() + for word, vector in vector_data.items(): + vocab.set_vector(word, vector) +h(3, "custom-loading-glove") Loading GloVe vectors +tag-new(2) @@ -89,3 +177,20 @@ p | #[+api("vocab#set_vector") #[code set_vector]] method. +github("spacy", "examples/vectors_fast_text.py") + ++h(3, "custom-similarity") Using custom similarity methods + +p + | By default, #[+api("token#vector") #[code Token.vector]] returns the + | vector for its underlying #[+api("lexeme") #[code Lexeme]], while + | #[+api("doc#vector") #[code Doc.vector]] and + | #[+api("span#vector") #[code Span.vector]] return an average of the + | vectors of their tokens. You can customise these + | behaviours by modifying the #[code doc.user_hooks], + | #[code doc.user_span_hooks] and #[code doc.user_token_hooks] + | dictionaries. + ++infobox + | For more details on #[strong adding hooks] and #[strong overwriting] the + | built-in #[code Doc], #[code Span] and #[code Token] methods, see the + | usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks]. diff --git a/website/usage/_vectors-similarity/_in-context.jade b/website/usage/_vectors-similarity/_in-context.jade deleted file mode 100644 index becd74348..000000000 --- a/website/usage/_vectors-similarity/_in-context.jade +++ /dev/null @@ -1,123 +0,0 @@ -//- 💫 DOCS > USAGE > VECTORS & SIMILARITY > IN CONTEXT - -p - | Aside from spaCy's built-in word vectors, which were trained on a lot of - | text with a wide vocabulary, the parsing, tagging and NER models also - | rely on vector representations of the #[strong meanings of words in context]. 
- | As the first component of the - | #[+a("/usage/processing-pipelines") processing pipeline], the - | tensorizer encodes a document's internal meaning representations as an - | array of floats, also called a tensor. This allows spaCy to make a - | reasonable guess at a word's meaning, based on its surrounding words. - | Even if a word hasn't been seen before, spaCy will know #[em something] - | about it. Because spaCy uses a 4-layer convolutional network, the - | tensors are sensitive to up to #[strong four words on either side] of a - | word. - -p - | For example, here are three sentences containing the out-of-vocabulary - | word "labrador" in different contexts. - -+code. - doc1 = nlp(u"The labrador barked.") - doc2 = nlp(u"The labrador swam.") - doc3 = nlp(u"the labrador people live in canada.") - - for doc in [doc1, doc2, doc3]: - labrador = doc[1] - dog = nlp(u"dog") - print(labrador.similarity(dog)) - -p - | Even though the model has never seen the word "labrador", it can make a - | fairly accurate prediction of its similarity to "dog" in different - | contexts. - -+table(["Context", "labrador.similarity(dog)"]) - +row - +cell The #[strong labrador] barked. - +cell #[code 0.56] #[+procon("yes", "similar")] - - +row - +cell The #[strong labrador] swam. - +cell #[code 0.48] #[+procon("no", "dissimilar")] - - +row - +cell the #[strong labrador] people live in canada. - +cell #[code 0.39] #[+procon("no", "dissimilar")] - -p - | The same also works for whole documents. Here, the variance of the - | similarities is lower, as all words and their order are taken into - | account. However, the context-specific similarity is often still - | reflected pretty accurately. - -+code. 
- doc1 = nlp(u"Paris is the largest city in France.") - doc2 = nlp(u"Vilnius is the capital of Lithuania.") - doc3 = nlp(u"An emu is a large bird.") - - for doc in [doc1, doc2, doc3]: - for other_doc in [doc1, doc2, doc3]: - print(doc.similarity(other_doc)) - -p - | Even though the sentences about Paris and Vilnius consist of different - | words and entities, they both describe the same concept and are seen as - | more similar than the sentence about emus. In this case, even a misspelled - | version of "Vilnius" would still produce very similar results. - -+table - - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]} - - var counter = 0 - - +row - +row - +cell - for _, label in examples - +cell=label - - each cells, label in examples - +row(counter ? null : "divider") - +cell=label - for cell in cells - +cell.u-text-center - - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"] - | #[code=cell.toFixed(2)] #[+procon(...result)] - - counter++ - -p - | Sentences that consist of the same words in different order will likely - | be seen as very similar – but never identical. - -+code. - docs = [nlp(u"dog bites man"), nlp(u"man bites dog"), - nlp(u"man dog bites"), nlp(u"dog man bites")] - - for doc in docs: - for other_doc in docs: - print(doc.similarity(other_doc)) - -p - | Interestingly, "man bites dog" and "man dog bites" are seen as slightly - | more similar than "man bites dog" and "dog bites man". This may be a - | conincidence – or the result of "man" being interpreted as both sentence's - | subject. 
- -+table - - var examples = {"dog bites man": [1, 0.9, 0.89, 0.92], "man bites dog": [0.9, 1, 0.93, 0.9], "man dog bites": [0.89, 0.93, 1, 0.92], "dog man bites": [0.92, 0.9, 0.92, 1]} - - var counter = 0 - - +row("head") - +cell - for _, label in examples - +cell.u-text-center=label - - each cells, label in examples - +row(counter ? null : "divider") - +cell=label - for cell in cells - +cell.u-text-center - - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"] - | #[code=cell.toFixed(2)] #[+procon(...result)] - - counter++ diff --git a/website/usage/vectors-similarity.jade b/website/usage/vectors-similarity.jade index 1e1139b20..fd70910ae 100644 --- a/website/usage/vectors-similarity.jade +++ b/website/usage/vectors-similarity.jade @@ -5,10 +5,6 @@ include ../_includes/_mixins +section("basics") include _vectors-similarity/_basics -+section("in-context") - +h(2, "in-context") Similarities in context - include _vectors-similarity/_in-context - +section("custom") +h(2, "custom") Customising word vectors include _vectors-similarity/_custom