Mirror of https://github.com/explosion/spaCy.git
Commit 7c47e38c12: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
spacy/lang/da/examples.py (new file)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.da.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
    "London er en stor by i Storbritannien"
]
spacy/lang/de/examples.py (new file)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.de.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
    "San Francisco erwägt Verbot von Lieferrobotern",
    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
    "Wo bist du?",
    "Was ist die Hauptstadt von Deutschland?"
]
spacy/lang/en/examples.py (new file)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple is looking at buying U.K. startup for $1 billion",
    "Autonomous cars shift insurance liability toward manufacturers",
    "San Francisco considers banning sidewalk delivery robots",
    "London is a big city in the United Kingdom.",
    "Where are you?",
    "Who is the president of France?",
    "What is the capital of the United States?",
    "When was Barack Obama born?"
]
spacy/lang/es/examples.py (new file)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.es.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
    "San Francisco analiza prohibir los robots delivery",
    "Londres es una gran ciudad del Reino Unido",
    "El gato come pescado",
    "Veo al hombre con el telescopio",
    "La araña come moscas",
    "El pingüino incuba en su nido"
]
spacy/lang/fr/examples.py (new file)
@@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.fr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple cherche à acheter une startup anglaise pour 1 milliard de dollars",
    "Les voitures autonomes voient leurs assurances décalées vers les constructeurs",
    "San Francisco envisage d'interdire les robots coursiers",
    "Londres est une grande ville du Royaume-Uni",
    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
    "Nouvelles attaques de Trump contre le maire de Londres",
    "Où es-tu ?",
    "Qui est le président de la France ?",
    "Où est la capitale des Etats-Unis ?",
    "Quand est né Barack Obama ?"
]
spacy/lang/he/examples.py (new file)
@@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.he.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
    'רה"מ הודיע כי יחרים טקס בחסותו',
    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
    'סע לשלום, המפתחות בפנים.',
    'מלצר, פעמיים טורקי!',
    'ואהבת לרעך כמוך.',
    'היום נעשה משהו בלתי נשכח.',
    'איפה הילד?',
    'מיהו נשיא צרפת?',
    'מהי בירת ארצות הברית?',
    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
    'מה הייתה הדקה?',
    'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
]
spacy/lang/it/examples.py (new file)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.it.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
    "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
    "San Francisco prevede di bandire i robot di consegna porta a porta",
    "Londra è una grande città del Regno Unito."
]
spacy/lang/nb/examples.py (new file)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.nb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
    "Selvkjørende biler flytter forsikringsansvaret over på produsentene",
    "San Francisco vurderer å forby robotbud på fortauene",
    "London er en stor by i Storbritannia."
]
spacy/lang/pl/examples.py (new file)
@@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.pl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Poczuł przyjemną woń mocnej kawy.",
    "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
    "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
    "Nowy abonament pod lupą Komisji Europejskiej",
    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
]
spacy/lang/pt/examples.py (new file)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.pt.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.",
    "São Francisco considera banir os robôs de entrega que andam pelas calçadas",
    "Londres é a maior cidade do Reino Unido"
]
spacy/lang/sv/examples.py (new file)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.sv.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
    "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
    "San Francisco överväger förbud mot leveransrobotar på trottoarer.",
    "London är en storstad i Storbritannien."
]
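Taken together, these files add a `sentences` list per language, importable as `spacy.lang.<code>.examples`. A minimal sketch of how they can be exercised (the blank `Danish` pipeline below is illustrative; the commit itself only adds the data):

    # Sketch: run one language's example sentences through a tokenizer-only
    # pipeline. Danish() builds a blank pipeline, so no trained model is needed.
    from spacy.lang.da import Danish
    from spacy.lang.da.examples import sentences

    nlp = Danish()
    for doc in nlp.pipe(sentences):
        print(len(doc), doc.text)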
spacy/language.py
@@ -430,11 +430,16 @@ class Language(object):
         except StopIteration:
             pass
 
-    def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
+    def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
+             disable=[]):
         """Process texts as a stream, and yield `Doc` objects in order. Supports
         GIL-free multi-threading.
 
         texts (iterator): A sequence of texts to process.
+        as_tuples (bool):
+            If set to True, inputs should be a sequence of
+            (text, context) tuples. Output will then be a sequence of
+            (doc, context) tuples. Defaults to False.
         n_threads (int): The number of worker threads to use. If -1, OpenMP will
             decide how many to use at run time. Default is 2.
         batch_size (int): The number of texts to buffer.

@@ -446,7 +451,7 @@ class Language(object):
         >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
         >>>     assert doc.is_parsed
         """
-        if tuples:
+        if as_tuples:
             text_context1, text_context2 = itertools.tee(texts)
             texts = (tc[0] for tc in text_context1)
             contexts = (tc[1] for tc in text_context2)
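The new `as_tuples` flag threads caller-defined context through `pipe` alongside each text. A usage sketch implied by the docstring (the `nlp` object and the context dicts are illustrative, not part of the diff):

    # Sketch of as_tuples: pair each text with arbitrary context and get
    # (doc, context) tuples back in the same order.
    data = [(u'London is a big city in the United Kingdom.', {'id': 1}),
            (u'Where are you?', {'id': 2})]
    for doc, context in nlp.pipe(data, as_tuples=True):
        print(context['id'], doc.text)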
@@ -63,7 +63,7 @@ def vector_size():
 
 @pytest.fixture
 def beam(moves, states, golds, beam_width):
-    return ParserBeam(moves, states, golds, width=beam_width)
+    return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
 
 @pytest.fixture
 def scores(moves, batch_size, beam_width):
@@ -11,8 +11,8 @@ import pytest
 def taggers(en_vocab):
     tagger1 = Tagger(en_vocab)
     tagger2 = Tagger(en_vocab)
-    tagger1.model = tagger1.Model(None, None)
-    tagger2.model = tagger2.Model(None, None)
+    tagger1.model = tagger1.Model(8, 8)
+    tagger2.model = tagger1.model
     return (tagger1, tagger2)
 
 
@@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
     tagger1, tagger2 = taggers
     tagger1_b = tagger1.to_bytes()
     tagger2_b = tagger2.to_bytes()
-    assert tagger1_b == tagger2_b
     tagger1 = tagger1.from_bytes(tagger1_b)
     assert tagger1.to_bytes() == tagger1_b
     new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
spacy/tokens/doc.pyx
@@ -238,6 +238,27 @@ cdef class Doc:
     def doc(self):
         return self
 
+    def char_span(self, int start_idx, int end_idx, attr_t label=0, vector=None):
+        """Create a `Span` object from the slice `doc.text[start : end]`.
+
+        doc (Doc): The parent document.
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64): A label to attach to the Span, e.g. for named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        cdef int start = token_by_start(self.c, self.length, start_idx)
+        if start == -1:
+            return None
+        cdef int end = token_by_end(self.c, self.length, end_idx)
+        if end == -1:
+            return None
+        # Currently we have the token index, we want the range-end index
+        end += 1
+        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        return span
+
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.
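`char_span` only succeeds when the character offsets line up exactly with token boundaries, because `token_by_start`/`token_by_end` return -1 otherwise. A sketch of that behaviour (assumes an `nlp` pipeline object):

    # Sketch: character offsets must align with token boundaries.
    doc = nlp(u'I like New York')
    assert doc.char_span(7, 15).text == 'New York'  # aligned: returns a Span
    assert doc.char_span(8, 15) is None             # starts mid-token: None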
spacy/tokens/span.pxd
@@ -15,5 +15,5 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm
 
-
     cpdef int _recalculate_indices(self) except -1
+    cpdef np.ndarray to_array(self, object features)
spacy/tokens/span.pyx
@@ -7,7 +7,7 @@ import numpy
 import numpy.linalg
 from libc.math cimport sqrt
 
-from .doc cimport token_by_start, token_by_end
+from .doc cimport token_by_start, token_by_end, get_token_attr
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t

@@ -135,6 +135,28 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    cpdef np.ndarray to_array(self, object py_attr_ids):
+        """Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape `(N, M)`, where `N` is the length of the span.
+        The values will be 32-bit integers.
+
+        attr_ids (list[int]): A list of attribute ID ints.
+        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
+            per word, and one column per attribute indicated in the input
+            `attr_ids`.
+        """
+        cdef int i, j
+        cdef attr_id_t feature
+        cdef np.ndarray[attr_t, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
+        for i in range(self.start, self.end):
+            for j, feature in enumerate(attr_ids):
+                output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
+        return output
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
                 or self.doc.c[self.start].idx != self.start_char \
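`Span.to_array` mirrors `Doc.to_array` for a slice of the document, with rows indexed relative to the span start. A sketch (assumes an `nlp` pipeline; `LOWER` and `IS_ALPHA` are real attribute IDs from `spacy.attrs`):

    # Sketch: export per-token attributes of a span as an (N, M) integer matrix.
    from spacy.attrs import LOWER, IS_ALPHA

    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]                          # 'New York'
    arr = span.to_array([LOWER, IS_ALPHA])   # one row per token, one col per attr
    assert arr.shape == (len(span), 2)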
spacy/vectors.pyx
@@ -20,7 +20,7 @@ cdef class Vectors:
     '''Store, save and load word vectors.'''
     cdef public object data
     cdef readonly StringStore strings
-    cdef public object index
+    cdef public object key2row
 
     def __init__(self, strings, data_or_width):
         self.strings = StringStore()

@@ -30,9 +30,9 @@ cdef class Vectors:
         else:
             data = data_or_width
         self.data = data
-        self.index = {}
+        self.key2row = {}
         for i, string in enumerate(strings):
-            self.index[self.strings.add(string)] = i
+            self.key2row[self.strings.add(string)] = i
 
     def __reduce__(self):
         return (Vectors, (self.strings, self.data))

@@ -40,7 +40,7 @@ cdef class Vectors:
     def __getitem__(self, key):
         if isinstance(key, basestring):
             key = self.strings[key]
-        i = self.index[key]
+        i = self.key2row[key]
         if i is None:
             raise KeyError(key)
         else:

@@ -49,7 +49,7 @@ cdef class Vectors:
     def __setitem__(self, key, vector):
         if isinstance(key, basestring):
             key = self.strings.add(key)
-        i = self.index[key]
+        i = self.key2row[key]
         self.data[i] = vector
 
     def __iter__(self):

@@ -71,7 +71,7 @@ cdef class Vectors:
 
     def to_disk(self, path, **exclude):
         def serialize_vectors(p):
-            write_vectors_to_bin_loc(self.strings, self.key2i, self.data, str(p))
+            write_vectors_to_bin_loc(self.strings, self.key2row, self.data, str(p))
 
         serializers = OrderedDict((
             ('vec.bin', serialize_vectors),

@@ -80,12 +80,13 @@ cdef class Vectors:
 
     def from_disk(self, path, **exclude):
         def deserialize_vectors(p):
-            self.key2i, self.vectors = load_vectors_from_bin_loc(self.strings, str(p))
+            values = load_vectors_from_bin_loc(self.strings, str(p))
+            self.key2row, self.data = values
 
         serializers = OrderedDict((
-            ('vec.bin', deserialize_vectors)
+            ('vec.bin', deserialize_vectors),
         ))
-        return util.to_disk(serializers, exclude)
+        return util.from_disk(path, serializers, exclude)
 
     def to_bytes(self, **exclude):
         def serialize_weights():

@@ -93,9 +94,9 @@ cdef class Vectors:
                 return self.data.to_bytes()
             else:
                 return msgpack.dumps(self.data)
+        b = msgpack.dumps(self.key2row)
         serializers = OrderedDict((
-            ('key2row', lambda: msgpack.dumps(self.key2i)),
+            ('key2row', lambda: msgpack.dumps(self.key2row)),
             ('strings', lambda: self.strings.to_bytes()),
             ('vectors', serialize_weights)
         ))

@@ -109,7 +110,7 @@ cdef class Vectors:
             self.data = msgpack.loads(b)
 
         deserializers = OrderedDict((
-            ('key2row', lambda b: self.key2i.update(msgpack.loads(b))),
+            ('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('vectors', deserialize_weights)
         ))
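The `index` to `key2row` rename spells out what the mapping stores: the row of each key's vector in the `data` matrix. A pure-Python sketch of that indirection (toy string keys and a toy width; spaCy's actual keys are `StringStore` hash IDs):

    # Sketch of the key -> row indirection behind Vectors.__getitem__.
    import numpy

    data = numpy.zeros((2, 3), dtype='float32')  # one row per stored vector
    key2row = {'cat': 0, 'dog': 1}               # toy keys; spaCy uses hash IDs

    data[key2row['cat']] = [0.1, 0.2, 0.3]       # the __setitem__ path
    print(data[key2row['cat']])                  # the __getitem__ path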
@@ -112,6 +112,10 @@
 .u-nowrap
     white-space: nowrap
 
+.u-break.u-break
+    word-wrap: break-word
+    white-space: initial
+
 .u-no-border
     border: none
website/docs/api/doc.jade
@@ -140,6 +140,44 @@ p Get the number of tokens in the document.
         +cell int
         +cell The number of tokens in the document.
 
++h(2, "char_span") Doc.char_span
+    +tag method
+    +tag-new(2)
+
+p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+
++aside-code("Example").
+    doc = nlp(u'I like New York')
+    label = doc.vocab.strings['GPE']
+    span = doc.char_span(7, 15, label=label)
+    assert span.text == 'New York'
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code start]
+        +cell int
+        +cell The index of the first character of the span.
+
+    +row
+        +cell #[code end]
+        +cell int
+        +cell The index of the first character after the span.
+
+    +row
+        +cell #[code label]
+        +cell uint64
+        +cell A label to attach to the Span, e.g. for named entities.
+
+    +row
+        +cell #[code vector]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell A meaning representation of the span.
+
+    +footrow
+        +cell returns
+        +cell #[code Span]
+        +cell The newly constructed object.
+
 +h(2, "similarity") Doc.similarity
     +tag method
     +tag-model("vectors")

@@ -211,12 +249,12 @@ p
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code attr_ids]
-        +cell ints
+        +cell list
         +cell A list of attribute ID ints.
 
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell
             | The exported attributes as a 2D numpy array, with one row per
             | token and one column per attribute.

@@ -245,7 +283,7 @@ p
 
     +row
         +cell #[code array]
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell The attribute values to load.
 
     +footrow

@@ -509,7 +547,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the document's semantics.
 
 +h(2, "vector_norm") Doc.vector_norm
website/docs/api/language.jade
@@ -111,6 +111,14 @@ p
         +cell -
         +cell A sequence of unicode objects.
 
+    +row
+        +cell #[code as_tuples]
+        +cell bool
+        +cell
+            | If set to #[code True], inputs should be a sequence of
+            | #[code (text, context)] tuples. Output will then be a sequence of
+            | #[code (doc, context)] tuples. Defaults to #[code False].
+
     +row
         +cell #[code n_threads]
         +cell int
website/docs/api/lexeme.jade
@@ -129,7 +129,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the lexeme's semantics.
 
 +h(2, "vector_norm") Lexeme.vector_norm
website/docs/api/span.jade
@@ -37,7 +37,7 @@ p Create a Span object from the slice #[code doc[start : end]].
 
     +row
         +cell #[code vector]
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A meaning representation of the span.
 
     +footrow

@@ -145,11 +145,47 @@ p
         +cell float
         +cell A scalar similarity score. Higher is more similar.
 
++h(2, "to_array") Span.to_array
+    +tag method
+    +tag-new(2)
+
+p
+    | Given a list of #[code M] attribute IDs, export the tokens to a numpy
+    | #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of
+    | the document. The values will be 32-bit integers.
+
++aside-code("Example").
+    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:3]
+    # All strings mapped to integers, for easy export to numpy
+    np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code attr_ids]
+        +cell list
+        +cell A list of attribute ID ints.
+
+    +footrow
+        +cell returns
+        +cell #[code.u-break numpy.ndarray[long, ndim=2]]
+        +cell
+            | A feature matrix, with one row per word, and one column per
+            | attribute indicated in the input #[code attr_ids].
+
 +h(2, "merge") Span.merge
     +tag method
 
 p Retokenize the document, such that the span is merged into a single token.
 
++aside-code("Example").
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:4]
+    span.merge()
+    assert len(doc) == 6
+    assert doc[2].text == 'New York'
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code **attributes]

@@ -270,7 +306,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the span's semantics.
 
 +h(2, "vector_norm") Span.vector_norm
website/docs/api/token.jade
@@ -250,7 +250,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the token's semantics.
 
 +h(2, "vector_norm") Token.vector_norm