mirror of https://github.com/explosion/spaCy.git
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
7c47e38c12
|
@ -0,0 +1,18 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.da.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# Danish example sentences, parallel to the other spacy.lang.*.examples
# modules. Fixed typo: "statup" -> "startup".
sentences = [
    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
    "London er en stor by i Storbritannien",
]
|
|
@ -0,0 +1,22 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.de.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# German example sentences: a mix of news-style statements and short
# questions, used for quick smoke tests of the tokenizer and models.
sentences = [
    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
    "San Francisco erwägt Verbot von Lieferrobotern",
    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
    "Wo bist du?",
    "Was ist die Hauptstadt von Deutschland?",
]
|
|
@ -0,0 +1,22 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# English example sentences: four declarative news-style sentences
# followed by four questions.
sentences = [
    "Apple is looking at buying U.K. startup for $1 billion",
    "Autonomous cars shift insurance liability toward manufacturers",
    "San Francisco considers banning sidewalk delivery robots",
    "London is a big city in the United Kingdom.",
    "Where are you?",
    "Who is the president of France?",
    "What is the capital of the United States?",
    "When was Barack Obama born?",
]
|
|
@ -0,0 +1,22 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.es.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# Spanish example sentences for quick tokenizer/model smoke tests.
sentences = [
    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
    "San Francisco analiza prohibir los robots delivery",
    "Londres es una gran ciudad del Reino Unido",
    "El gato come pescado",
    "Veo al hombre con el telescopio",
    "La araña come moscas",
    "El pingüino incuba en su nido",
]
|
|
@ -0,0 +1,26 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.fr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# French example sentences. Fixed typos in the first sentence:
# "cherche a acheter" -> "cherche à acheter", "dollard" -> "dollars".
sentences = [
    "Apple cherche à acheter une startup anglaise pour 1 milliard de dollars",
    "Les voitures autonomes voient leur assurances décalées vers les constructeurs",
    "San Francisco envisage d'interdire les robots coursiers",
    "Londres est une grande ville du Royaume-Uni",
    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
    "Nouvelles attaques de Trump contre le maire de Londres",
    "Où es-tu ?",
    "Qui est le président de la France ?",
    "Où est la capitale des Etats-Unis ?",
    "Quand est né Barack Obama ?",
]
|
|
@ -0,0 +1,28 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.he.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# Hebrew example sentences. Quoting style is significant: most entries use
# single quotes; entries containing an apostrophe or a gershayim character
# keep the quoting that avoids escaping.
sentences = [
    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
    'רה"מ הודיע כי יחרים טקס בחסותו',
    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
    'סע לשלום, המפתחות בפנים.',
    'מלצר, פעמיים טורקי!',
    'ואהבת לרעך כמוך.',
    'היום נעשה משהו בלתי נשכח.',
    'איפה הילד?',
    'מיהו נשיא צרפת?',
    'מהי בירת ארצות הברית?',
    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
    'מה הייתה הדקה?',
    'מי אומר שלום ראשון, זה שעולה או זה שיורד?',
]
|
|
@ -0,0 +1,18 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.it.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# Italian example sentences for quick tokenizer/model smoke tests.
sentences = [
    "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
    "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
    "San Francisco prevede di bandire i robot di consegna porta a porta",
    "Londra è una grande città del Regno Unito.",
]
|
|
@ -0,0 +1,18 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.nb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# Norwegian Bokmål example sentences. NOTE: the trailing space inside the
# second sentence is reproduced from the original data on purpose.
sentences = [
    "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
    "Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
    "San Francisco vurderer å forby robotbud på fortauene",
    "London er en stor by i Storbritannia.",
]
|
|
@ -0,0 +1,20 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.pl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# Polish example sentences for quick tokenizer/model smoke tests.
sentences = [
    "Poczuł przyjemną woń mocnej kawy.",
    "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
    "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
    "Nowy abonament pod lupą Komisji Europejskiej",
    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
]
|
|
@ -0,0 +1,18 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.pt.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# Portuguese example sentences. Fixed a missing comma after the second
# entry: without it, Python silently concatenated the second and third
# strings into one element, leaving the list with 3 items instead of 4.
sentences = [
    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.",
    "São Francisco considera banir os robôs de entrega que andam pelas calçadas",
    "Londres é a maior cidade do Reino Unido",
]
|
|
@ -0,0 +1,18 @@
|
|||
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.sv.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


# Swedish example sentences. Fixed a syntax error after the third entry:
# the string was terminated with a stray `.` instead of the list comma.
sentences = [
    "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
    "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
    "San Fransisco överväger förbud mot leveransrobotar på trottoarer.",
    "London är en storstad i Storbritannien.",
]
|
|
@ -430,11 +430,16 @@ class Language(object):
|
|||
except StopIteration:
|
||||
pass
|
||||
|
||||
def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
|
||||
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
|
||||
disable=[]):
|
||||
"""Process texts as a stream, and yield `Doc` objects in order. Supports
|
||||
GIL-free multi-threading.
|
||||
|
||||
texts (iterator): A sequence of texts to process.
|
||||
as_tuples (bool):
|
||||
If set to True, inputs should be a sequence of
|
||||
(text, context) tuples. Output will then be a sequence of
|
||||
(doc, context) tuples. Defaults to False.
|
||||
n_threads (int): The number of worker threads to use. If -1, OpenMP will
|
||||
decide how many to use at run time. Default is 2.
|
||||
batch_size (int): The number of texts to buffer.
|
||||
|
@ -446,7 +451,7 @@ class Language(object):
|
|||
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
||||
>>> assert doc.is_parsed
|
||||
"""
|
||||
if tuples:
|
||||
if as_tuples:
|
||||
text_context1, text_context2 = itertools.tee(texts)
|
||||
texts = (tc[0] for tc in text_context1)
|
||||
contexts = (tc[1] for tc in text_context2)
|
||||
|
|
|
@ -63,7 +63,7 @@ def vector_size():
|
|||
|
||||
@pytest.fixture
|
||||
def beam(moves, states, golds, beam_width):
|
||||
return ParserBeam(moves, states, golds, width=beam_width)
|
||||
return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
|
||||
|
||||
@pytest.fixture
|
||||
def scores(moves, batch_size, beam_width):
|
||||
|
|
|
@ -11,8 +11,8 @@ import pytest
|
|||
def taggers(en_vocab):
|
||||
tagger1 = Tagger(en_vocab)
|
||||
tagger2 = Tagger(en_vocab)
|
||||
tagger1.model = tagger1.Model(None, None)
|
||||
tagger2.model = tagger2.Model(None, None)
|
||||
tagger1.model = tagger1.Model(8, 8)
|
||||
tagger2.model = tagger1.model
|
||||
return (tagger1, tagger2)
|
||||
|
||||
|
||||
|
@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
|
|||
tagger1, tagger2 = taggers
|
||||
tagger1_b = tagger1.to_bytes()
|
||||
tagger2_b = tagger2.to_bytes()
|
||||
assert tagger1_b == tagger2_b
|
||||
tagger1 = tagger1.from_bytes(tagger1_b)
|
||||
assert tagger1.to_bytes() == tagger1_b
|
||||
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
|
||||
|
|
|
@ -238,6 +238,27 @@ cdef class Doc:
|
|||
def doc(self):
|
||||
return self
|
||||
|
||||
def char_span(self, int start_idx, int end_idx, attr_t label=0, vector=None):
    """Create a `Span` object from the slice `doc.text[start_idx : end_idx]`.

    start_idx (int): The index of the first character of the span.
    end_idx (int): The index of the first character after the span.
    label (uint64): A label to attach to the Span, e.g. for named entities.
    vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
    RETURNS (Span): The newly constructed object, or None if either character
        index does not fall exactly on a token boundary.
    """
    # Map the character offsets onto token indices; token_by_start /
    # token_by_end return -1 when the offset is not a token boundary.
    cdef int start = token_by_start(self.c, self.length, start_idx)
    if start == -1:
        return None
    cdef int end = token_by_end(self.c, self.length, end_idx)
    if end == -1:
        return None
    # Currently we have the token index, we want the range-end index
    end += 1
    cdef Span span = Span(self, start, end, label=label, vector=vector)
    return span
|
||||
|
||||
def similarity(self, other):
|
||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||
similarity using an average of word vectors.
|
||||
|
|
|
@ -15,5 +15,5 @@ cdef class Span:
|
|||
cdef public _vector
|
||||
cdef public _vector_norm
|
||||
|
||||
|
||||
cpdef int _recalculate_indices(self) except -1
|
||||
cpdef np.ndarray to_array(self, object features)
|
||||
|
|
|
@ -7,7 +7,7 @@ import numpy
|
|||
import numpy.linalg
|
||||
from libc.math cimport sqrt
|
||||
|
||||
from .doc cimport token_by_start, token_by_end
|
||||
from .doc cimport token_by_start, token_by_end, get_token_attr
|
||||
from ..structs cimport TokenC, LexemeC
|
||||
from ..typedefs cimport flags_t, attr_t, hash_t
|
||||
from ..attrs cimport attr_id_t
|
||||
|
@ -135,6 +135,28 @@ cdef class Span:
|
|||
return 0.0
|
||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
    """Given a list of M attribute IDs, export the tokens to a numpy
    `ndarray` of shape `(N, M)`, where `N` is the length of the span.
    The values will be 64-bit unsigned integers.

    attr_ids (list[int]): A list of attribute ID ints.
    RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
        per word, and one column per attribute indicated in the input
        `attr_ids`.
    """
    cdef int i, j
    cdef attr_id_t feature
    cdef np.ndarray[attr_t, ndim=2] output
    # Make an array from the attributes --- otherwise our inner loop is Python
    # dict iteration.
    cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
    output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
    for i in range(self.start, self.end):
        for j, feature in enumerate(attr_ids):
            # `i` is a document-level token index, but `output` only has one
            # row per span token — shift by `self.start` so row 0 is the
            # first token of the span. Writing `output[i, j]` (the original
            # code) indexed out of range for any span with start > 0.
            output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
    return output
|
||||
|
||||
cpdef int _recalculate_indices(self) except -1:
|
||||
if self.end > self.doc.length \
|
||||
or self.doc.c[self.start].idx != self.start_char \
|
||||
|
|
|
@ -20,7 +20,7 @@ cdef class Vectors:
|
|||
'''Store, save and load word vectors.'''
|
||||
cdef public object data
|
||||
cdef readonly StringStore strings
|
||||
cdef public object index
|
||||
cdef public object key2row
|
||||
|
||||
def __init__(self, strings, data_or_width):
|
||||
self.strings = StringStore()
|
||||
|
@ -30,9 +30,9 @@ cdef class Vectors:
|
|||
else:
|
||||
data = data_or_width
|
||||
self.data = data
|
||||
self.index = {}
|
||||
self.key2row = {}
|
||||
for i, string in enumerate(strings):
|
||||
self.index[self.strings.add(string)] = i
|
||||
self.key2row[self.strings.add(string)] = i
|
||||
|
||||
def __reduce__(self):
|
||||
return (Vectors, (self.strings, self.data))
|
||||
|
@ -40,7 +40,7 @@ cdef class Vectors:
|
|||
def __getitem__(self, key):
|
||||
if isinstance(key, basestring):
|
||||
key = self.strings[key]
|
||||
i = self.index[key]
|
||||
i = self.key2row[key]
|
||||
if i is None:
|
||||
raise KeyError(key)
|
||||
else:
|
||||
|
@ -49,7 +49,7 @@ cdef class Vectors:
|
|||
def __setitem__(self, key, vector):
|
||||
if isinstance(key, basestring):
|
||||
key = self.strings.add(key)
|
||||
i = self.index[key]
|
||||
i = self.key2row[key]
|
||||
self.data[i] = vector
|
||||
|
||||
def __iter__(self):
|
||||
|
@ -71,7 +71,7 @@ cdef class Vectors:
|
|||
|
||||
def to_disk(self, path, **exclude):
|
||||
def serialize_vectors(p):
|
||||
write_vectors_to_bin_loc(self.strings, self.key2i, self.data, str(p))
|
||||
write_vectors_to_bin_loc(self.strings, self.key2row, self.data, str(p))
|
||||
|
||||
serializers = OrderedDict((
|
||||
('vec.bin', serialize_vectors),
|
||||
|
@ -80,12 +80,13 @@ cdef class Vectors:
|
|||
|
||||
def from_disk(self, path, **exclude):
|
||||
def deserialize_vectors(p):
|
||||
self.key2i, self.vectors = load_vectors_from_bin_loc(self.strings, str(p))
|
||||
values = load_vectors_from_bin_loc(self.strings, str(p))
|
||||
self.key2row, self.data = values
|
||||
|
||||
serializers = OrderedDict((
|
||||
('vec.bin', deserialize_vectors)
|
||||
('vec.bin', deserialize_vectors),
|
||||
))
|
||||
return util.to_disk(serializers, exclude)
|
||||
return util.from_disk(path, serializers, exclude)
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def serialize_weights():
|
||||
|
@ -93,9 +94,9 @@ cdef class Vectors:
|
|||
return self.data.to_bytes()
|
||||
else:
|
||||
return msgpack.dumps(self.data)
|
||||
|
||||
b = msgpack.dumps(self.key2row)
|
||||
serializers = OrderedDict((
|
||||
('key2row', lambda: msgpack.dumps(self.key2i)),
|
||||
('key2row', lambda: msgpack.dumps(self.key2row)),
|
||||
('strings', lambda: self.strings.to_bytes()),
|
||||
('vectors', serialize_weights)
|
||||
))
|
||||
|
@ -109,7 +110,7 @@ cdef class Vectors:
|
|||
self.data = msgpack.loads(b)
|
||||
|
||||
deserializers = OrderedDict((
|
||||
('key2row', lambda b: self.key2i.update(msgpack.loads(b))),
|
||||
('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
|
||||
('strings', lambda b: self.strings.from_bytes(b)),
|
||||
('vectors', deserialize_weights)
|
||||
))
|
||||
|
|
|
@ -112,6 +112,10 @@
|
|||
.u-nowrap
|
||||
white-space: nowrap
|
||||
|
||||
.u-break.u-break
|
||||
word-wrap: break-word
|
||||
white-space: initial
|
||||
|
||||
.u-no-border
|
||||
border: none
|
||||
|
||||
|
|
|
@ -140,6 +140,44 @@ p Get the number of tokens in the document.
|
|||
+cell int
|
||||
+cell The number of tokens in the document.
|
||||
|
||||
+h(2, "char_span") Doc.char_span
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Create a #[code Span] object from the slice #[code doc.text[start : end]].
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York')
|
||||
label = doc.vocab.strings['GPE']
|
||||
span = doc.char_span(7, 15, label=label)
|
||||
assert span.text == 'New York'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start]
|
||||
+cell int
|
||||
+cell The index of the first character of the span.
|
||||
|
||||
+row
|
||||
+cell #[code end]
|
||||
+cell int
|
||||
+cell The index of the first character after the span.
|
||||
|
||||
+row
|
||||
+cell #[code label]
|
||||
+cell uint64
|
||||
+cell A label to attach to the Span, e.g. for named entities.
|
||||
|
||||
+row
|
||||
+cell #[code vector]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A meaning representation of the span.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Span]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "similarity") Doc.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
@ -211,12 +249,12 @@ p
|
|||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell list
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell
|
||||
| The exported attributes as a 2D numpy array, with one row per
|
||||
| token and one column per attribute.
|
||||
|
@ -245,7 +283,7 @@ p
|
|||
|
||||
+row
|
||||
+cell #[code array]
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell The attribute values to load.
|
||||
|
||||
+footrow
|
||||
|
@ -509,7 +547,7 @@ p
|
|||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the document's semantics.
|
||||
|
||||
+h(2, "vector_norm") Doc.vector_norm
|
||||
|
|
|
@ -111,6 +111,14 @@ p
|
|||
+cell -
|
||||
+cell A sequence of unicode objects.
|
||||
|
||||
+row
|
||||
+cell #[code as_tuples]
|
||||
+cell bool
|
||||
+cell
|
||||
| If set to #[code True], inputs should be a sequence of
|
||||
| #[code (text, context)] tuples. Output will then be a sequence of
|
||||
| #[code (doc, context)] tuples. Defaults to #[code False].
|
||||
|
||||
+row
|
||||
+cell #[code n_threads]
|
||||
+cell int
|
||||
|
|
|
@ -129,7 +129,7 @@ p A real-valued meaning representation.
|
|||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the lexeme's semantics.
|
||||
|
||||
+h(2, "vector_norm") Lexeme.vector_norm
|
||||
|
|
|
@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]].
|
|||
|
||||
+row
|
||||
+cell #[code vector]
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A meaning representation of the span.
|
||||
|
||||
+footrow
|
||||
|
@ -145,11 +145,47 @@ p
|
|||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "to_array") Span.to_array
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Given a list of #[code M] attribute IDs, export the tokens to a numpy
|
||||
| #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of
|
||||
| the document. The values will be 32-bit integers.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
span = doc[2:3]
|
||||
# All strings mapped to integers, for easy export to numpy
|
||||
np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell list
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code.u-break numpy.ndarray[long, ndim=2]]
|
||||
+cell
|
||||
| A feature matrix, with one row per word, and one column per
|
||||
| attribute indicated in the input #[code attr_ids].
|
||||
|
||||
+h(2, "merge") Span.merge
|
||||
+tag method
|
||||
|
||||
p Retokenize the document, such that the span is merged into a single token.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
span = doc[2:3]
|
||||
span.merge()
|
||||
assert len(doc) == 6
|
||||
assert doc[2].text == 'New York'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **attributes]
|
||||
|
@ -270,7 +306,7 @@ p
|
|||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the span's semantics.
|
||||
|
||||
+h(2, "vector_norm") Span.vector_norm
|
||||
|
|
|
@ -250,7 +250,7 @@ p A real-valued meaning representation.
|
|||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the token's semantics.
|
||||
|
||||
+h(2, "vector_norm") Span.vector_norm
|
||||
|
|
Loading…
Reference in New Issue