spaCy/examples/vectors_fast_text.py

36 lines
1.1 KiB
Python
Raw Normal View History

2017-10-26 15:32:59 +00:00
#!/usr/bin/env python
# coding: utf8
"""Load vectors for a language trained using fastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
Compatible with: spaCy v2.0.0+
"""
2017-10-01 21:40:02 +00:00
from __future__ import unicode_literals
import plac
import numpy
2017-10-31 23:43:28 +00:00
from spacy.language import Language
2017-10-01 21:40:02 +00:00
@plac.annotations(
    vectors_loc=("Path to vectors", "positional", None, str))
def main(vectors_loc):
    """Load fastText-style text vectors into a blank pipeline and print a
    sample similarity score.

    The file is read as bytes. Its first line is a header of two
    whitespace-separated fields, the second being the vector width. Every
    following line is expected to hold a word followed by that many
    space-separated float components.

    vectors_loc (str): Path to the vectors file.
    """
    nlp = Language()  # start off with a blank Language class
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        _nr_row, nr_dim = header.split()
        nr_dim = int(nr_dim)  # convert once; reused for every row below
        nlp.vocab.clear_vectors(nr_dim)
        for line in file_:
            # Drop the newline and any trailing padding before decoding.
            line = line.rstrip().decode('utf8')
            if not line:
                continue  # tolerate blank lines (e.g. at the end of the file)
            # Split from the right on plain spaces, at most nr_dim times, so
            # the word is whatever precedes the last nr_dim fields. A bare
            # split() would also break on Unicode whitespace inside the word
            # itself (e.g. a no-break space) and corrupt the row.
            pieces = line.rsplit(' ', nr_dim)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    # test the vectors and similarity
    text = 'class colspan'
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))
2017-10-01 21:40:02 +00:00
if __name__ == '__main__':
    # Executed as a script: hand main to plac, which calls it for us.
    plac.call(main)