Update vectors and similarity usage guide

2017-11-01 01:25:17 +01:00 · 2017-11-01 01:25:17 +01:00 · 07d02c3304
parent 37e62ab0e2
commit 07d02c3304
5 changed files with 260 additions and 159 deletions
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@@ -116,7 +116,6 @@
        "next": "text-classification",
        "menu": {
            "Basics": "basics",
-            "Similarity in Context": "in-context",
            "Custom Vectors": "custom",
            "GPU Usage": "gpu"
        }
--- a/website/usage/_vectors-similarity/_basics.jade
+++ b/website/usage/_vectors-similarity/_basics.jade
@@ -13,3 +13,127 @@

 include ../_spacy-101/_similarity
 include ../_spacy-101/_word-vectors
+
++h(3, "in-context") Similarities in context
+
+p
+    |  Aside from spaCy's built-in word vectors, which were trained on a lot of
+    |  text with a wide vocabulary, the parsing, tagging and NER models also
+    |  rely on vector representations of the #[strong meanings of words in context].
+    |  As the first component of the
+    |  #[+a("/usage/processing-pipelines") processing pipeline], the
+    |  tensorizer encodes a document's internal meaning representations as an
+    |  array of floats, also called a tensor. This allows spaCy to make a
+    |  reasonable guess at a word's meaning, based on its surrounding words.
+    |  Even if a word hasn't been seen before, spaCy will know #[em something]
+    |  about it. Because spaCy uses a 4-layer convolutional network, the
+    |  tensors are sensitive to up to #[strong four words on either side] of a
+    |  word.
+
+p
+    |  For example, here are three sentences containing the out-of-vocabulary
+    |  word "labrador" in different contexts.
+
++code.
+    doc1 = nlp(u"The labrador barked.")
+    doc2 = nlp(u"The labrador swam.")
+    doc3 = nlp(u"the labrador people live in canada.")
+
+    for doc in [doc1, doc2, doc3]:
+        labrador = doc[1]
+        dog = nlp(u"dog")
+        print(labrador.similarity(dog))
+
+p
+    |  Even though the model has never seen the word "labrador", it can make a
+    |  fairly accurate prediction of its similarity to "dog" in different
+    |  contexts.
+
++table(["Context", "labrador.similarity(dog)"])
+    +row
+        +cell The #[strong labrador] barked.
+        +cell #[code 0.56] #[+procon("yes", "similar")]
+
+    +row
+        +cell The #[strong labrador] swam.
+        +cell #[code 0.48] #[+procon("no", "dissimilar")]
+
+    +row
+        +cell the #[strong labrador] people live in canada.
+        +cell #[code 0.39] #[+procon("no", "dissimilar")]
+
+p
+    |  The same also works for whole documents. Here, the variance of the
+    |  similarities is lower, as all words and their order are taken into
+    |  account. However, the context-specific similarity is often still
+    |  reflected pretty accurately.
+
++code.
+    doc1 = nlp(u"Paris is the largest city in France.")
+    doc2 = nlp(u"Vilnius is the capital of Lithuania.")
+    doc3 = nlp(u"An emu is a large bird.")
+
+    for doc in [doc1, doc2, doc3]:
+        for other_doc in [doc1, doc2, doc3]:
+            print(doc.similarity(other_doc))
+
+p
+    |  Even though the sentences about Paris and Vilnius consist of different
+    |  words and entities, they both describe the same concept and are seen as
+    |  more similar than the sentence about emus. In this case, even a misspelled
+    |  version of "Vilnius" would still produce very similar results.
+
++table
+    - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
+    - var counter = 0
+
+    +row
+    +row
+        +cell
+        for _, label in examples
+            +cell=label
+
+    each cells, label in examples
+        +row(counter ? null : "divider")
+            +cell=label
+            for cell in cells
+                +cell.u-text-center
+                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
+                    |  #[code=cell.toFixed(2)] #[+procon(...result)]
+        - counter++
+
+p
+    |  Sentences that consist of the same words in different order will likely
+    |  be seen as very similar – but never identical.
+
++code.
+    docs = [nlp(u"dog bites man"), nlp(u"man bites dog"),
+            nlp(u"man dog bites"), nlp(u"dog man bites")]
+
+    for doc in docs:
+        for other_doc in docs:
+            print(doc.similarity(other_doc))
+
+p
+    |  Interestingly, "man bites dog" and "man dog bites" are seen as slightly
+    |  more similar than "man bites dog" and "dog bites man". This may be a
+    |  coincidence – or the result of "man" being interpreted as both sentences'
+    |  subject.
+
++table
+    - var examples = {"dog bites man": [1, 0.9, 0.89, 0.92], "man bites dog": [0.9, 1, 0.93, 0.9], "man dog bites": [0.89, 0.93, 1, 0.92], "dog man bites": [0.92, 0.9, 0.92, 1]}
+    - var counter = 0
+
+    +row("head")
+        +cell
+        for _, label in examples
+            +cell.u-text-center=label
+
+    each cells, label in examples
+        +row(counter ? null : "divider")
+            +cell=label
+            for cell in cells
+                +cell.u-text-center
+                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
+                    |  #[code=cell.toFixed(2)] #[+procon(...result)]
+        - counter++
--- a/website/usage/_vectors-similarity/_custom.jade
+++ b/website/usage/_vectors-similarity/_custom.jade
@@ -1,49 +1,137 @@
 //- 💫 DOCS > USAGE > VECTORS & SIMILARITY > CUSTOM VECTORS

 p
-    |  By default, #[+api("token#vector") #[code Token.vector]] returns the
-    |  vector for its underlying #[+api("lexeme") #[code Lexeme]], while
-    |  #[+api("doc#vector") #[code Doc.vector]] and
-    |  #[+api("span#vector") #[code Span.vector]] return an average of the
-    |  vectors of their tokens. You can customize these
-    |  behaviours by modifying the #[code doc.user_hooks],
-    |  #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
-    |  dictionaries.
+    |  Word vectors let you import knowledge from raw text into your model. The
+    |  knowledge is represented as a table of numbers, with one row per term in
+    |  your vocabulary. If two terms are used in similar contexts, the algorithm
+    |  that learns the vectors should assign them
+    |  #[strong rows that are quite similar], while words that are used in
+    |  different contexts will have quite different values. This lets you use
+    |  the row-values assigned to the words as a kind of dictionary, to tell you
+    |  some things about what the words in your text mean.

-+infobox
-    |  For more details on #[strong adding hooks] and #[strong overwriting] the
-    |  built-in #[code Doc], #[code Span] and #[code Token] methods, see the
-    |  usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks].
+p
+    |  Word vectors are particularly useful for terms which
+    |  #[strong aren&apos;t well represented in your labelled training data].
+    |  For instance, if you're doing named entity recognition, there will always
+    |  be lots of names that you don't have examples of. For example, imagine
+    |  your training data happens to contain some examples of the term
+    |  "Microsoft", but it doesn't contain any examples of the term "Symantec".
+    |  In your raw text sample, there are plenty of examples of both terms, and
+    |  they're used in similar contexts. The word vectors make that fact
+    |  available to the entity recognition model. It still won't see examples of
+    |  "Symantec" labelled as a company. However, it'll see that "Symantec" has
+    |  a word vector that usually corresponds to company terms, so it can
+    |  #[strong make the inference].
+
+p
+    |  In order to make best use of the word vectors, you want the word vectors
+    |  table to cover a #[strong very large vocabulary]. However, most words are
+    |  rare, so most of the rows in a large word vectors table will be accessed
+    |  very rarely, or never at all. You can usually cover more than
+    |  #[strong 95% of the tokens] in your corpus with just
+    |  #[strong a few thousand rows] in the vector table. However, it's those
+    |  #[strong 5% of rare terms] where the word vectors are
+    |  #[strong most useful]. The problem is that increasing the size of the
+    |  vector table produces rapidly diminishing returns in coverage over these
+    |  rare terms.
+
++h(3, "custom-vectors-coverage") Optimising vector coverage
+    +tag-new(2)
+
+p
+    |  To help you strike a good balance between coverage and memory usage,
+    |  spaCy's #[+api("vectors") #[code Vectors]] class lets you map
+    |  #[strong multiple keys] to the #[strong same row] of the table. If
+    |  you're using the #[+api("cli#vocab") #[code spacy vocab]] command to
+    |  create a vocabulary, pruning the vectors will be taken care of
+    |  automatically. You can also do it manually in the following steps:
+
++list("numbers")
+    +item
+        |  Start with a #[strong word vectors model] that covers a huge
+        |  vocabulary. For instance, the
+        |  #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]] model
+        |  provides 300-dimensional GloVe vectors for over 1 million terms of
+        |  English.
+
+    +item
+        |  If your vocabulary has values set for the #[code Lexeme.prob]
+        |  attribute, the lexemes will be sorted by descending probability to
+        |  determine which vectors to prune. Otherwise, lexemes will be sorted
+        |  by their order in the #[code Vocab].
+
+    +item
+        |  Call #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]] with
+        |  the number of vectors you want to keep.
+
++code.
+    nlp = spacy.load('en_vectors_web_lg')
+    n_vectors = 105000  # number of vectors to keep
+    removed_words = nlp.vocab.prune_vectors(n_vectors)
+
+    assert len(nlp.vocab.vectors) &lt;= n_vectors  # unique vectors have been pruned
+    assert nlp.vocab.vectors.n_keys &gt; n_vectors  # but not the total entries
+
+p
+    |  #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]] reduces the
+    |  current vector table to a given number of unique entries, and returns a
+    |  dictionary containing the removed words, mapped to #[code (string, score)]
+    |  tuples, where #[code string] is the entry the removed word was mapped
+    |  to, and #[code score] the similarity score between the two words.
+
++code("Removed words").
+    {
+        'Shore': ('coast', 0.732257),
+        'Precautionary': ('caution', 0.490973),
+        'hopelessness': ('sadness', 0.742366),
+        'Continous': ('continuous', 0.732549),
+        'Disemboweled': ('corpse', 0.499432),
+        'biostatistician': ('scientist', 0.339724),
+        'somewheres': ('somewheres', 0.402736),
+        'observing': ('observe', 0.823096),
+        'Leaving': ('leaving', 1.0)
+    }
+
+p
+    |  In the example above, the vector for "Shore" was removed and remapped
+    |  to the vector of "coast", which is deemed about 73% similar. "Leaving"
+    |  was remapped to the vector of "leaving", which is identical.

 +h(3, "custom-vectors-add") Adding vectors
    +tag-new(2)

 p
-    |  The new #[+api("vectors") #[code Vectors]] class makes it easy to add
-    |  your own vectors to spaCy. Just like the #[+api("vocab") #[code Vocab]],
-    |  it is initialised with a #[+api("stringstore") #[code StringStore]] or
-    |  a list of strings.
+    |  spaCy's new #[+api("vectors") #[code Vectors]] class greatly improves the
+    |  way word vectors are stored, accessed and used. The data is stored in
+    |  two structures:

-+code("Adding vectors one-by-one").
-    from spacy.strings import StringStore
-    from spacy.vectors import Vectors
++list
+    +item
+        |  An array, which can be either on CPU or #[+a("#gpu") GPU].

-    vector_data = {'dog': numpy.random.uniform(-1, 1, (300,)),
-                   'cat': numpy.random.uniform(-1, 1, (300,)),
-                   'orange': numpy.random.uniform(-1, 1, (300,))}
-
-    vectors = Vectors(StringStore(), 300)
-    for word, vector in vector_data.items():
-        vectors.add(word, vector)
+    +item
+        |  A dictionary mapping string-hashes to rows in the table.

 p
-    |  You can also add the vector values directly on initialisation:
+    |  Keep in mind that the #[code Vectors] class itself has no
+    |  #[+api("stringstore") #[code StringStore]], so you have to store the
+    |  hash-to-string mapping separately. If you need to manage the strings,
+    |  you should use the #[code Vectors] via the
+    |  #[+api("vocab") #[code Vocab]] class, e.g. #[code vocab.vectors]. To
+    |  add vectors to the vocabulary, you can use the
+    |  #[+api("vocab#set_vector") #[code Vocab.set_vector]] method.

-+code("Adding vectors on initialisation").
-    from spacy.vectors import Vectors
++code("Adding vectors").
+    from spacy.vocab import Vocab

-    vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors([u'dog', u'cat', u'orange'], vector_table)
+    vector_data = {u'dog': numpy.random.uniform(-1, 1, (300,)),
+                   u'cat': numpy.random.uniform(-1, 1, (300,)),
+                   u'orange': numpy.random.uniform(-1, 1, (300,))}
+
+    vocab = Vocab()
+    for word, vector in vector_data.items():
+        vocab.set_vector(word, vector)

 +h(3, "custom-loading-glove") Loading GloVe vectors
    +tag-new(2)
@@ -89,3 +177,20 @@ p
    |  #[+api("vocab#set_vector") #[code set_vector]] method.

 +github("spacy", "examples/vectors_fast_text.py")
+
++h(3, "custom-similarity") Using custom similarity methods
+
+p
+    |  By default, #[+api("token#vector") #[code Token.vector]] returns the
+    |  vector for its underlying #[+api("lexeme") #[code Lexeme]], while
+    |  #[+api("doc#vector") #[code Doc.vector]] and
+    |  #[+api("span#vector") #[code Span.vector]] return an average of the
+    |  vectors of their tokens. You can customise these
+    |  behaviours by modifying the #[code doc.user_hooks],
+    |  #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
+    |  dictionaries.
+
++infobox
+    |  For more details on #[strong adding hooks] and #[strong overwriting] the
+    |  built-in #[code Doc], #[code Span] and #[code Token] methods, see the
+    |  usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks].
--- a/website/usage/_vectors-similarity/_in-context.jade
+++ b/website/usage/_vectors-similarity/_in-context.jade
@@ -1,123 +0,0 @@
-//- 💫 DOCS > USAGE > VECTORS & SIMILARITY > IN CONTEXT
-
-p
-    |  Aside from spaCy's built-in word vectors, which were trained on a lot of
-    |  text with a wide vocabulary, the parsing, tagging and NER models also
-    |  rely on vector representations of the #[strong meanings of words in context].
-    |  As the first component of the
-    |  #[+a("/usage/processing-pipelines") processing pipeline], the
-    |  tensorizer encodes a document's internal meaning representations as an
-    |  array of floats, also called a tensor. This allows spaCy to make a
-    |  reasonable guess at a word's meaning, based on its surrounding words.
-    |  Even if a word hasn't been seen before, spaCy will know #[em something]
-    |  about it. Because spaCy uses a 4-layer convolutional network, the
-    |  tensors are sensitive to up to #[strong four words on either side] of a
-    |  word.
-
-p
-    |  For example, here are three sentences containing the out-of-vocabulary
-    |  word "labrador" in different contexts.
-
-+code.
-    doc1 = nlp(u"The labrador barked.")
-    doc2 = nlp(u"The labrador swam.")
-    doc3 = nlp(u"the labrador people live in canada.")
-
-    for doc in [doc1, doc2, doc3]:
-        labrador = doc[1]
-        dog = nlp(u"dog")
-        print(labrador.similarity(dog))
-
-p
-    |  Even though the model has never seen the word "labrador", it can make a
-    |  fairly accurate prediction of its similarity to "dog" in different
-    |  contexts.
-
-+table(["Context", "labrador.similarity(dog)"])
-    +row
-        +cell The #[strong labrador] barked.
-        +cell #[code 0.56] #[+procon("yes", "similar")]
-
-    +row
-        +cell The #[strong labrador] swam.
-        +cell #[code 0.48] #[+procon("no", "dissimilar")]
-
-    +row
-        +cell the #[strong labrador] people live in canada.
-        +cell #[code 0.39] #[+procon("no", "dissimilar")]
-
-p
-    |  The same also works for whole documents. Here, the variance of the
-    |  similarities is lower, as all words and their order are taken into
-    |  account. However, the context-specific similarity is often still
-    |  reflected pretty accurately.
-
-+code.
-    doc1 = nlp(u"Paris is the largest city in France.")
-    doc2 = nlp(u"Vilnius is the capital of Lithuania.")
-    doc3 = nlp(u"An emu is a large bird.")
-
-    for doc in [doc1, doc2, doc3]:
-        for other_doc in [doc1, doc2, doc3]:
-            print(doc.similarity(other_doc))
-
-p
-    |  Even though the sentences about Paris and Vilnius consist of different
-    |  words and entities, they both describe the same concept and are seen as
-    |  more similar than the sentence about emus. In this case, even a misspelled
-    |  version of "Vilnius" would still produce very similar results.
-
-+table
-    - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
-    - var counter = 0
-
-    +row
-    +row
-        +cell
-        for _, label in examples
-            +cell=label
-
-    each cells, label in examples
-        +row(counter ? null : "divider")
-            +cell=label
-            for cell in cells
-                +cell.u-text-center
-                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
-                    |  #[code=cell.toFixed(2)] #[+procon(...result)]
-        - counter++
-
-p
-    |  Sentences that consist of the same words in different order will likely
-    |  be seen as very similar – but never identical.
-
-+code.
-    docs = [nlp(u"dog bites man"), nlp(u"man bites dog"),
-            nlp(u"man dog bites"), nlp(u"dog man bites")]
-
-    for doc in docs:
-        for other_doc in docs:
-            print(doc.similarity(other_doc))
-
-p
-    |  Interestingly, "man bites dog" and "man dog bites" are seen as slightly
-    |  more similar than "man bites dog" and "dog bites man". This may be a
-    |  conincidence – or the result of "man" being interpreted as both sentence's
-    |  subject.
-
-+table
-    - var examples = {"dog bites man": [1, 0.9, 0.89, 0.92], "man bites dog": [0.9, 1, 0.93, 0.9], "man dog bites": [0.89, 0.93, 1, 0.92], "dog man bites": [0.92, 0.9, 0.92, 1]}
-    - var counter = 0
-
-    +row("head")
-        +cell
-        for _, label in examples
-            +cell.u-text-center=label
-
-    each cells, label in examples
-        +row(counter ? null : "divider")
-            +cell=label
-            for cell in cells
-                +cell.u-text-center
-                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
-                    |  #[code=cell.toFixed(2)] #[+procon(...result)]
-        - counter++
--- a/website/usage/vectors-similarity.jade
+++ b/website/usage/vectors-similarity.jade
@@ -5,10 +5,6 @@ include ../_includes/_mixins
 +section("basics")
    include _vectors-similarity/_basics

-+section("in-context")
-    +h(2, "in-context") Similarities in context
-    include _vectors-similarity/_in-context
-
 +section("custom")
    +h(2, "custom") Customising word vectors
    include _vectors-similarity/_custom