From ec657c1ddcdae63d2cd12a14a5c3536b44841555 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 30 Oct 2017 19:35:41 +0100
Subject: [PATCH] Update vocab docs and document Vocab.prune_vectors

---
 spacy/vocab.pyx        | 12 +++++++++-
 website/api/vocab.jade | 51 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ff6c5b844..23254718f 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -252,7 +252,7 @@ cdef class Vocab:
         """Reduce the current vector table to `nr_row` unique entries. Words
         mapped to the discarded vectors will be remapped to the closest vector
         among those remaining.
-        
+
         For example, suppose the original table had vectors for the words:
         ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to,
         two rows, we would discard the vectors for 'feline' and 'reclined'.
@@ -263,6 +263,15 @@ cdef class Vocab:
         The similarities are judged by cosine. The original vectors may
         be large, so the cosines are calculated in minibatches, to reduce
         memory usage.
+
+        nr_row (int): The number of rows to keep in the vector table.
+        batch_size (int): Number of vectors per batch when computing similarities.
+            Larger batch sizes might be faster, while temporarily requiring
+            more memory.
+        RETURNS (dict): A dictionary mapping each removed word to a
+            `(string, score)` tuple, where `string` is the entry the removed
+            word was mapped to, and `score` the similarity score between the
+            two words.
         """
         xp = get_array_module(self.vectors.data)
         # Work in batches, to avoid memory problems.
@@ -285,6 +294,7 @@ cdef class Vocab:
                 self.vectors.add(lex.orth, row=lex.rank)
         # Make copy, to encourage the original table to be garbage collected.
         self.vectors.data = xp.ascontiguousarray(self.vectors.data[:nr_row])
+        # TODO: Return the documented dict of removed words -> (string, score).
 
     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary. Words can be looked
diff --git a/website/api/vocab.jade b/website/api/vocab.jade
index 6faefc064..54dd4f691 100644
--- a/website/api/vocab.jade
+++ b/website/api/vocab.jade
@@ -162,7 +162,7 @@ p
         +cell int
         +cell The integer ID by which the flag value can be checked.
 
-+h(2, "add_flag") Vocab.clear_vectors
++h(2, "clear_vectors") Vocab.clear_vectors
     +tag method
     +tag-new(2)
 
@@ -181,7 +181,50 @@ p
             |  Number of dimensions of the new vectors. If #[code None], size
             |  is not changed.
 
-+h(2, "add_flag") Vocab.get_vector
++h(2, "prune_vectors") Vocab.prune_vectors
+    +tag method
+    +tag-new(2)
+
+p
+    |  Reduce the current vector table to #[code nr_row] unique entries. Words
+    |  mapped to the discarded vectors will be remapped to the closest vector
+    |  among those remaining. For example, suppose the original table had
+    |  vectors for the words:
+    |  #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
+    |  vector table to two rows, we would discard the vectors for "feline"
+    |  and "reclined". These words would then be remapped to the closest
+    |  remaining vector – so "feline" would have the same vector as "cat",
+    |  and "reclined" would have the same vector as "sat". The similarities are
+    |  judged by cosine. The original vectors may be large, so the cosines are
+    |  calculated in minibatches, to reduce memory usage.
+
++aside-code("Example").
+    nlp.vocab.prune_vectors(10000)
+    assert len(nlp.vocab.vectors) &lt;= 10000
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code nr_row]
+        +cell int
+        +cell The number of rows to keep in the vector table.
+
+    +row
+        +cell #[code batch_size]
+        +cell int
+        +cell
+            |  Number of vectors per batch when computing similarities. Larger
+            |  batch sizes might be faster, while temporarily requiring more memory.
+
+    +row("foot")
+        +cell returns
+        +cell dict
+        +cell
+            |  A dictionary mapping each removed word to a
+            |  #[code (string, score)] tuple, where #[code string] is the entry
+            |  the removed word was mapped to, and #[code score] the similarity
+            |  score between the two words.
+
++h(2, "get_vector") Vocab.get_vector
     +tag method
     +tag-new(2)
 
@@ -206,7 +249,7 @@ p
             |  A word vector. Size and shape are determined by the
             |  #[code Vocab.vectors] instance.
 
-+h(2, "add_flag") Vocab.set_vector
++h(2, "set_vector") Vocab.set_vector
     +tag method
     +tag-new(2)
 
@@ -228,7 +271,7 @@ p
         +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell The vector to set.
 
-+h(2, "add_flag") Vocab.has_vector
++h(2, "has_vector") Vocab.has_vector
     +tag method
     +tag-new(2)