2017-05-23 21:16:31 +00:00
|
|
|
|
//- 💫 DOCS > USAGE > SPACY 101 > WORD VECTORS
|
|
|
|
|
|
|
|
|
|
p
|
|
|
|
|
| Similarity is determined by comparing #[strong word vectors] or "word
|
|
|
|
|
| embeddings", multi-dimensional meaning representations of a word. Word
|
|
|
|
|
| vectors can be generated using an algorithm like
|
2017-11-06 19:16:38 +00:00
|
|
|
|
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] and usually
|
|
|
|
|
| look like this:
|
2017-05-23 21:16:31 +00:00
|
|
|
|
|
|
|
|
|
+code("banana.vector", false, false, 250).
|
|
|
|
|
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
|
|
|
|
|
3.28450017e-02, -4.19569999e-01, 7.20689967e-02,
|
|
|
|
|
-3.74760002e-01, 5.74599989e-02, -1.24009997e-02,
|
|
|
|
|
5.29489994e-01, -5.23800015e-01, -1.97710007e-01,
|
|
|
|
|
-3.41470003e-01, 5.33169985e-01, -2.53309999e-02,
|
|
|
|
|
1.73800007e-01, 1.67720005e-01, 8.39839995e-01,
|
|
|
|
|
5.51070012e-02, 1.05470002e-01, 3.78719985e-01,
|
|
|
|
|
2.42750004e-01, 1.47449998e-02, 5.59509993e-01,
|
|
|
|
|
1.25210002e-01, -6.75960004e-01, 3.58420014e-01,
|
|
|
|
|
-4.00279984e-02, 9.59490016e-02, -5.06900012e-01,
|
|
|
|
|
-8.53179991e-02, 1.79800004e-01, 3.38669986e-01,
|
|
|
|
|
1.32300004e-01, 3.10209990e-01, 2.18779996e-01,
|
|
|
|
|
1.68530002e-01, 1.98740005e-01, -5.73849976e-01,
|
|
|
|
|
-1.06490001e-01, 2.66689986e-01, 1.28380001e-01,
|
|
|
|
|
-1.28030002e-01, -1.32839993e-01, 1.26570001e-01,
|
|
|
|
|
8.67229998e-01, 9.67210010e-02, 4.83060002e-01,
|
|
|
|
|
2.12709993e-01, -5.49900010e-02, -8.24249983e-02,
|
|
|
|
|
2.24079996e-01, 2.39749998e-01, -6.22599982e-02,
|
|
|
|
|
6.21940017e-01, -5.98999977e-01, 4.32009995e-01,
|
|
|
|
|
2.81430006e-01, 3.38420011e-02, -4.88150001e-01,
|
|
|
|
|
-2.13589996e-01, 2.74010003e-01, 2.40950003e-01,
|
|
|
|
|
4.59500015e-01, -1.86049998e-01, -1.04970002e+00,
|
|
|
|
|
-9.73049998e-02, -1.89080000e-01, -7.09290028e-01,
|
|
|
|
|
4.01950002e-01, -1.87680006e-01, 5.16870022e-01,
|
|
|
|
|
1.25200003e-01, 8.41499984e-01, 1.20970003e-01,
|
|
|
|
|
8.82389992e-02, -2.91959997e-02, 1.21510006e-03,
|
|
|
|
|
5.68250008e-02, -2.74210006e-01, 2.55640000e-01,
|
|
|
|
|
6.97930008e-02, -2.22580001e-01, -3.60060006e-01,
|
|
|
|
|
-2.24020004e-01, -5.36990017e-02, 1.20220006e+00,
|
|
|
|
|
5.45350015e-01, -5.79980016e-01, 1.09049998e-01,
|
|
|
|
|
4.21669990e-01, 2.06619993e-01, 1.29360005e-01,
|
|
|
|
|
-4.14570011e-02, -6.67770028e-01, 4.04670000e-01,
|
|
|
|
|
-1.52179999e-02, -2.76400000e-01, -1.56110004e-01,
|
|
|
|
|
-7.91980028e-02, 4.00369987e-02, -1.29439995e-01,
|
|
|
|
|
-2.40900001e-04, -2.67850012e-01, -3.81150007e-01,
|
|
|
|
|
-9.72450018e-01, 3.17259997e-01, -4.39509988e-01,
|
|
|
|
|
4.19340014e-01, 1.83530003e-01, -1.52600005e-01,
|
|
|
|
|
-1.08080000e-01, -1.03579998e+00, 7.62170032e-02,
|
|
|
|
|
1.65189996e-01, 2.65259994e-04, 1.66160002e-01,
|
|
|
|
|
-1.52810007e-01, 1.81229994e-01, 7.02740014e-01,
|
|
|
|
|
5.79559989e-03, 5.16639985e-02, -5.97449988e-02,
|
|
|
|
|
-2.75510013e-01, -3.90489995e-01, 6.11319989e-02,
|
|
|
|
|
5.54300010e-01, -8.79969969e-02, -4.16810006e-01,
|
|
|
|
|
3.28260005e-01, -5.25489986e-01, -4.42880005e-01,
|
|
|
|
|
8.21829960e-03, 2.44859993e-01, -2.29819998e-01,
|
|
|
|
|
-3.49810004e-01, 2.68940002e-01, 3.91660005e-01,
|
|
|
|
|
-4.19039994e-01, 1.61909997e-01, -2.62630010e+00,
|
|
|
|
|
6.41340017e-01, 3.97430003e-01, -1.28680006e-01,
|
|
|
|
|
-3.19460005e-01, -2.56330013e-01, -1.22199997e-01,
|
|
|
|
|
3.22750002e-01, -7.99330026e-02, -1.53479993e-01,
|
|
|
|
|
3.15050006e-01, 3.05909991e-01, 2.60120004e-01,
|
|
|
|
|
1.85530007e-01, -2.40429997e-01, 4.28860001e-02,
|
|
|
|
|
4.06219989e-01, -2.42559999e-01, 6.38700008e-01,
|
|
|
|
|
6.99829996e-01, -1.40430003e-01, 2.52090007e-01,
|
|
|
|
|
4.89840001e-01, -6.10670000e-02, -3.67659986e-01,
|
|
|
|
|
-5.50890028e-01, -3.82649988e-01, -2.08430007e-01,
|
|
|
|
|
2.28320003e-01, 5.12179971e-01, 2.78679997e-01,
|
|
|
|
|
4.76520002e-01, 4.79510017e-02, -3.40079993e-01,
|
|
|
|
|
-3.28729987e-01, -4.19669986e-01, -7.54989982e-02,
|
|
|
|
|
-3.89539987e-01, -2.96219997e-02, -3.40700001e-01,
|
|
|
|
|
2.21699998e-01, -6.28560036e-02, -5.19029975e-01,
|
|
|
|
|
-3.77739996e-01, -4.34770016e-03, -5.83010018e-01,
|
|
|
|
|
-8.75459984e-02, -2.39289999e-01, -2.47109994e-01,
|
|
|
|
|
-2.58870006e-01, -2.98940003e-01, 1.37150005e-01,
|
|
|
|
|
2.98919994e-02, 3.65439989e-02, -4.96650010e-01,
|
|
|
|
|
-1.81600004e-01, 5.29389977e-01, 2.19919994e-01,
|
|
|
|
|
-4.45140004e-01, 3.77979994e-01, -5.70620000e-01,
|
|
|
|
|
-4.69460003e-02, 8.18059966e-02, 1.92789994e-02,
|
|
|
|
|
3.32459986e-01, -1.46200001e-01, 1.71560004e-01,
|
|
|
|
|
3.99809986e-01, 3.62170011e-01, 1.28160000e-01,
|
|
|
|
|
3.16439986e-01, 3.75690013e-01, -7.46899992e-02,
|
|
|
|
|
-4.84800003e-02, -3.14009994e-01, -1.92860007e-01,
|
|
|
|
|
-3.12940001e-01, -1.75529998e-02, -1.75139993e-01,
|
|
|
|
|
-2.75870003e-02, -1.00000000e+00, 1.83870003e-01,
|
|
|
|
|
8.14339995e-01, -1.89129993e-01, 5.09989977e-01,
|
|
|
|
|
-9.19600017e-03, -1.92950002e-03, 2.81890005e-01,
|
|
|
|
|
2.72470005e-02, 4.34089988e-01, -5.49669981e-01,
|
|
|
|
|
-9.74259973e-02, -2.45399997e-01, -1.72030002e-01,
|
|
|
|
|
-8.86500031e-02, -3.02980006e-01, -1.35910004e-01,
|
|
|
|
|
-2.77649999e-01, 3.12860007e-03, 2.05559999e-01,
|
|
|
|
|
-1.57720000e-01, -5.23079991e-01, -6.47010028e-01,
|
|
|
|
|
-3.70139986e-01, 6.93930015e-02, 1.14009999e-01,
|
|
|
|
|
2.75940001e-01, -1.38750002e-01, -2.72680014e-01,
|
|
|
|
|
6.68910027e-01, -5.64539991e-02, 2.40170002e-01,
|
|
|
|
|
-2.67300010e-01, 2.98599988e-01, 1.00830004e-01,
|
|
|
|
|
5.55920005e-01, 3.28489989e-01, 7.68579990e-02,
|
|
|
|
|
1.55279994e-01, 2.56359994e-01, -1.07720003e-01,
|
|
|
|
|
-1.23590000e-01, 1.18270002e-01, -9.90289971e-02,
|
|
|
|
|
-3.43279988e-01, 1.15019999e-01, -3.78080010e-01,
|
|
|
|
|
-3.90120000e-02, -3.45930010e-01, -1.94040000e-01,
|
|
|
|
|
-3.35799992e-01, -6.23340011e-02, 2.89189994e-01,
|
|
|
|
|
2.80319989e-01, -5.37410021e-01, 6.27939999e-01,
|
|
|
|
|
5.69549985e-02, 6.21469975e-01, -2.52819985e-01,
|
|
|
|
|
4.16700006e-01, -1.01079997e-02, -2.54339993e-01,
|
|
|
|
|
4.00029987e-01, 4.24320012e-01, 2.26720005e-01,
|
|
|
|
|
1.75530002e-01, 2.30489999e-01, 2.83230007e-01,
|
|
|
|
|
1.38820007e-01, 3.12180002e-03, 1.70570001e-01,
|
|
|
|
|
3.66849989e-01, 2.52470002e-03, -6.40089989e-01,
|
|
|
|
|
-2.97650009e-01, 7.89430022e-01, 3.31680000e-01,
|
|
|
|
|
-1.19659996e+00, -4.71559986e-02, 5.31750023e-01], dtype=float32)
|
|
|
|
|
|
2017-11-06 19:16:38 +00:00
|
|
|
|
+infobox("Important note", "⚠️")
|
|
|
|
|
| To make them compact and fast, spaCy's small #[+a("/models") models]
|
|
|
|
|
| (all packages that end in #[code sm]) #[strong don't ship with word vectors], and
|
|
|
|
|
| only include context-sensitive #[strong tensors]. This means you can
|
|
|
|
|
| still use the #[code similarity()] methods to compare documents, spans
|
|
|
|
|
| and tokens – but the result won't be as good, and individual tokens won't
|
|
|
|
|
| have any vectors assigned. So in order to use #[em real] word vectors,
|
|
|
|
|
| you need to download a larger model:
|
|
|
|
|
|
|
|
|
|
+code-wrapper
|
|
|
|
|
+code-new(false, "bash", "$") python -m spacy download en_core_web_lg
|
|
|
|
|
|
2017-05-23 21:16:31 +00:00
|
|
|
|
p
|
2017-11-06 19:16:38 +00:00
|
|
|
|
| Models that come with built-in word vectors make them available as the
|
|
|
|
|
| #[+api("token#vector") #[code Token.vector]] attribute.
|
2017-05-23 21:16:31 +00:00
|
|
|
|
| #[+api("doc#vector") #[code Doc.vector]] and
|
|
|
|
|
| #[+api("span#vector") #[code Span.vector]] will default to an average
|
|
|
|
|
| of their token vectors. You can also check if a token has a vector
|
|
|
|
|
| assigned, and get the L2 norm, which can be used to normalise
|
|
|
|
|
| vectors.
|
|
|
|
|
|
|
|
|
|
+code.
|
2017-11-06 19:16:38 +00:00
|
|
|
|
nlp = spacy.load('en_core_web_lg')
|
2017-05-23 21:16:31 +00:00
|
|
|
|
tokens = nlp(u'dog cat banana sasquatch')
|
|
|
|
|
|
|
|
|
|
for token in tokens:
|
|
|
|
|
print(token.text, token.has_vector, token.vector_norm, token.is_oov)
|
|
|
|
|
|
|
|
|
|
+aside
|
|
|
|
|
| #[strong Text]: The original token text.#[br]
|
|
|
|
|
| #[strong Has vector]: Does the token have a vector representation?#[br]
|
|
|
|
|
| #[strong Vector norm]: The L2 norm of the token's vector (the square root
|
|
|
|
|
| of the sum of the values squared).#[br]
|
|
|
|
|
| #[strong OOV]: Is the word out-of-vocabulary?
|
|
|
|
|
|
|
|
|
|
+table(["Text", "Has vector", "Vector norm", "OOV"])
|
|
|
|
|
- var style = [0, 1, 1, 1]
|
|
|
|
|
+annotation-row(["dog", true, 7.033672992262838, false], style)
|
|
|
|
|
+annotation-row(["cat", true, 6.68081871208896, false], style)
|
|
|
|
|
+annotation-row(["banana", true, 6.700014292148571, false], style)
|
|
|
|
|
+annotation-row(["sasquatch", false, 0, true], style)
|
|
|
|
|
|
|
|
|
|
p
|
|
|
|
|
| The words "dog", "cat" and "banana" are all pretty common in English, so
|
|
|
|
|
| they're part of the model's vocabulary, and come with a vector. The word
|
|
|
|
|
| "sasquatch", on the other hand, is a lot less common and out-of-vocabulary
|
|
|
|
|
| – so its vector representation consists of 300 dimensions of #[code 0],
|
2017-11-06 19:16:38 +00:00
|
|
|
|
| which means it's practically nonexistent. If your application will
|
|
|
|
|
| benefit from a #[strong large vocabulary] with more vectors, you should
|
|
|
|
|
| consider using one of the larger models or loading in a full vector
|
|
|
|
|
| package, for example,
|
|
|
|
|
| #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]], which
|
|
|
|
|
| includes over #[strong 1 million unique vectors].
|