From a7309a217d5a0d9c94bc9dff85c2e7d8262b345a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 18 Aug 2017 23:12:05 +0200
Subject: [PATCH 01/14] Update tagger serialization

---
 spacy/tests/serialize/test_serialize_tagger.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/serialize/test_serialize_tagger.py b/spacy/tests/serialize/test_serialize_tagger.py
index fa9a776bb..e56db1421 100644
--- a/spacy/tests/serialize/test_serialize_tagger.py
+++ b/spacy/tests/serialize/test_serialize_tagger.py
@@ -11,8 +11,8 @@ import pytest
 def taggers(en_vocab):
     tagger1 = Tagger(en_vocab)
     tagger2 = Tagger(en_vocab)
-    tagger1.model = tagger1.Model(None, None)
-    tagger2.model = tagger2.Model(None, None)
+    tagger1.model = tagger1.Model(8, 8)
+    tagger2.model = tagger2.Model(8, 8)
     return (tagger1, tagger2)

From 2da96a0ec7bc52fe09c12ffbe7e51388963e8f84 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 04:15:46 +0200
Subject: [PATCH 02/14] Fix beam test

---
 spacy/tests/parser/test_nn_beam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py
index 45c85d969..ab8bf012b 100644
--- a/spacy/tests/parser/test_nn_beam.py
+++ b/spacy/tests/parser/test_nn_beam.py
@@ -63,7 +63,7 @@ def vector_size():
 @pytest.fixture
 def beam(moves, states, golds, beam_width):
-    return ParserBeam(moves, states, golds, width=beam_width)
+    return ParserBeam(moves, states, golds, width=beam_width, density=0.0)

 @pytest.fixture
 def scores(moves, batch_size, beam_width):

From 42d47c1e5ced1afbd45a5df8ceded4bcd485d858 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 04:16:32 +0200
Subject: [PATCH 03/14] Fix tagger serialization

---
 spacy/tests/serialize/test_serialize_tagger.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/tests/serialize/test_serialize_tagger.py b/spacy/tests/serialize/test_serialize_tagger.py
index e56db1421..3154687c3 100644
--- a/spacy/tests/serialize/test_serialize_tagger.py
+++ b/spacy/tests/serialize/test_serialize_tagger.py
@@ -12,7 +12,7 @@ def taggers(en_vocab):
     tagger1 = Tagger(en_vocab)
     tagger2 = Tagger(en_vocab)
     tagger1.model = tagger1.Model(8, 8)
-    tagger2.model = tagger2.Model(8, 8)
+    tagger2.model = tagger1.model
     return (tagger1, tagger2)

@@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
     tagger1, tagger2 = taggers
     tagger1_b = tagger1.to_bytes()
     tagger2_b = tagger2.to_bytes()
-    assert tagger1_b == tagger2_b
     tagger1 = tagger1.from_bytes(tagger1_b)
     assert tagger1.to_bytes() == tagger1_b
     new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)

From 19c495f451e3b83f5575743d63c3745a9fd5eaa2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 04:33:03 +0200
Subject: [PATCH 04/14] Fix vectors deserialization

---
 spacy/vectors.pyx | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 59a24dfa9..1b1e8000a 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -20,7 +20,7 @@ cdef class Vectors:
     '''Store, save and load word vectors.'''
     cdef public object data
     cdef readonly StringStore strings
-    cdef public object index
+    cdef public object key2row

     def __init__(self, strings, data_or_width):
         self.strings = StringStore()
@@ -30,9 +30,9 @@
         else:
             data = data_or_width
         self.data = data
-        self.index = {}
+        self.key2row = {}
         for i, string in enumerate(strings):
-            self.index[self.strings.add(string)] = i
+            self.key2row[self.strings.add(string)] = i

     def __reduce__(self):
         return (Vectors, (self.strings, self.data))

@@ -40,7 +40,7 @@ def __getitem__(self, key):
         if isinstance(key, basestring):
             key = self.strings[key]
-        i = self.index[key]
+        i = self.key2row[key]
         if i is None:
             raise KeyError(key)
         else:
             return self.data[i]

@@ -49,7 +49,7 @@ def __setitem__(self, key, vector):
         if isinstance(key, basestring):
             key = self.strings.add(key)
-        i = self.index[key]
+        i = self.key2row[key]
         self.data[i] = vector

     def __iter__(self):

@@ -71,7 +71,7 @@ def to_disk(self, path, **exclude):
         def serialize_vectors(p):
-            write_vectors_to_bin_loc(self.strings, self.key2i, self.data, str(p))
+            write_vectors_to_bin_loc(self.strings, self.key2row, self.data, str(p))

         serializers = OrderedDict((
             ('vec.bin', serialize_vectors),

@@ -80,12 +80,13 @@ def from_disk(self, path, **exclude):
         def deserialize_vectors(p):
-            self.key2i, self.vectors = load_vectors_from_bin_loc(self.strings, str(p))
+            values = load_vectors_from_bin_loc(self.strings, str(p))
+            self.key2row, self.data = values

         serializers = OrderedDict((
-            ('vec.bin', deserialize_vectors)
+            ('vec.bin', deserialize_vectors),
         ))
-        return util.to_disk(serializers, exclude)
+        return util.from_disk(path, serializers, exclude)

     def to_bytes(self, **exclude):
         def serialize_weights():
             if hasattr(self.data, 'to_bytes'):
                 return self.data.to_bytes()
             else:
                 return msgpack.dumps(self.data)
-
+        b = msgpack.dumps(self.key2row)
         serializers = OrderedDict((
-            ('key2row', lambda: msgpack.dumps(self.key2i)),
+            ('key2row', lambda: msgpack.dumps(self.key2row)),
             ('strings', lambda: self.strings.to_bytes()),
             ('vectors', serialize_weights)
         ))

@@ -109,7 +110,7 @@
                 self.data = msgpack.loads(b)
         deserializers = OrderedDict((
-            ('key2row', lambda b: self.key2i.update(msgpack.loads(b))),
+            ('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('vectors', deserialize_weights)
         ))

From 482bba1722b848a92d6f19ec2bb3152ed1b84ae4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 12:20:45 +0200
Subject: [PATCH 05/14] Add Span.to_array method

---
 spacy/tokens/span.pxd |  2 +-
 spacy/tokens/span.pyx | 24 +++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 8d675c04f..9645189a5 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -15,5 +15,5 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm

-    cpdef int _recalculate_indices(self) except -1
+    cpdef np.ndarray to_array(self, object features)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9f2115fe1..9625b5547 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -7,7 +7,7 @@ import numpy
 import numpy.linalg
 from libc.math cimport sqrt

-from .doc cimport token_by_start, token_by_end
+from .doc cimport token_by_start, token_by_end, get_token_attr
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
@@ -135,6 +135,28 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

+    cpdef np.ndarray to_array(self, object py_attr_ids):
+        """Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape `(N, M)`, where `N` is the length of the span.
+        The values will be 64-bit integers.
+
+        attr_ids (list[int]): A list of attribute ID ints.
+        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
+            per word, and one column per attribute indicated in the input
+            `attr_ids`.
+        """
+        cdef int i, j
+        cdef attr_id_t feature
+        cdef np.ndarray[attr_t, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        output = numpy.ndarray(shape=(self.end - self.start, len(attr_ids)), dtype=numpy.uint64)
+        for i in range(self.start, self.end):
+            for j, feature in enumerate(attr_ids):
+                output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
+        return output
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
         or self.doc.c[self.start].idx != self.start_char \

From 80236116a6034c45be6521d99d64f97ddeb764a1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 12:21:09 +0200
Subject: [PATCH 06/14] Add Doc.char_span method, to get a span by character
 offset

---
 spacy/tokens/doc.pyx | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 822a0152d..75088b010 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -238,6 +238,27 @@ cdef class Doc:
     def doc(self):
         return self

+    def char_span(self, int start_idx, int end_idx, attr_t label=0, vector=None):
+        """Create a `Span` object from the slice `doc.text[start : end]`.
+
+        doc (Doc): The parent document.
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64): A label to attach to the Span, e.g. for named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        cdef int start = token_by_start(self.c, self.length, start_idx)
+        if start == -1:
+            return None
+        cdef int end = token_by_end(self.c, self.length, end_idx)
+        if end == -1:
+            return None
+        # Currently we have the token index, we want the range-end index
+        end += 1
+        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        return span
+
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.

From 97aabafb5f99d12065397f8ca162f92ad9a4acc0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 12:21:33 +0200
Subject: [PATCH 07/14] Document as_tuples keyword arg of Language.pipe

---
 spacy/language.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index cb679a2bc..aa757ffa8 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -430,11 +430,16 @@ class Language(object):
             except StopIteration:
                 pass

-    def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
+    def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
+             disable=[]):
         """Process texts as a stream, and yield `Doc` objects in order. Supports
         GIL-free multi-threading.

         texts (iterator): A sequence of texts to process.
+        as_tuples (bool):
+            If set to True, inputs should be a sequence of
+            (text, context) tuples. Output will then be a sequence of
+            (doc, context) tuples. Defaults to False.
         n_threads (int): The number of worker threads to use. If -1, OpenMP
             will decide how many to use at run time. Default is 2.
         batch_size (int): The number of texts to buffer.
@@ -446,7 +451,7 @@ class Language(object):
             >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
             >>>     assert doc.is_parsed
         """
-        if tuples:
+        if as_tuples:
             text_context1, text_context2 = itertools.tee(texts)
             texts = (tc[0] for tc in text_context1)
             contexts = (tc[1] for tc in text_context2)

From 1fe5e1a4d1cd9857e2ad945800c1c5c6850c853c Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 19 Aug 2017 12:22:26 +0200
Subject: [PATCH 08/14] Add language example sentences (see #1107)

da, de, en, es, fr, he, it, nb, pl, pt, sv
---
 spacy/lang/da/examples.py | 18 ++++++++++++++++++
 spacy/lang/de/examples.py | 22 ++++++++++++++++++++++
 spacy/lang/en/examples.py | 22 ++++++++++++++++++++++
 spacy/lang/es/examples.py | 22 ++++++++++++++++++++++
 spacy/lang/fr/examples.py | 26 ++++++++++++++++++++++++++
 spacy/lang/he/examples.py | 28 ++++++++++++++++++++++++++++
 spacy/lang/it/examples.py | 18 ++++++++++++++++++
 spacy/lang/nb/examples.py | 18 ++++++++++++++++++
 spacy/lang/pl/examples.py | 20 ++++++++++++++++++++
 spacy/lang/pt/examples.py | 18 ++++++++++++++++++
 spacy/lang/sv/examples.py | 18 ++++++++++++++++++
 11 files changed, 230 insertions(+)
 create mode 100644 spacy/lang/da/examples.py
 create mode 100644 spacy/lang/de/examples.py
 create mode 100644 spacy/lang/en/examples.py
 create mode 100644 spacy/lang/es/examples.py
 create mode 100644 spacy/lang/fr/examples.py
 create mode 100644 spacy/lang/he/examples.py
 create mode 100644 spacy/lang/it/examples.py
 create mode 100644 spacy/lang/nb/examples.py
 create mode 100644 spacy/lang/pl/examples.py
 create mode 100644 spacy/lang/pt/examples.py
 create mode 100644 spacy/lang/sv/examples.py

diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py
new file mode 100644
index 000000000..549f71fb5
--- /dev/null
+++ b/spacy/lang/da/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.da.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
+    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
+    "London er en stor by i Storbritannien"
+]

diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py
new file mode 100644
index 000000000..49ac0e14b
--- /dev/null
+++ b/spacy/lang/de/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.de.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
+    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
+    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
+    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
+    "San Francisco erwägt Verbot von Lieferrobotern",
+    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
+    "Wo bist du?",
+    "Was ist die Hauptstadt von Deutschland?"
+]

diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py
new file mode 100644
index 000000000..b92d4a65c
--- /dev/null
+++ b/spacy/lang/en/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple is looking at buying U.K. startup for $1 billion",
+    "Autonomous cars shift insurance liability toward manufacturers",
+    "San Francisco considers banning sidewalk delivery robots",
+    "London is a big city in the United Kingdom.",
+    "Where are you?",
+    "Who is the president of France?",
+    "What is the capital of the United States?",
+    "When was Barack Obama born?"
+]

diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py
new file mode 100644
index 000000000..61fe8c9be
--- /dev/null
+++ b/spacy/lang/es/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.es.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
+    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
+    "San Francisco analiza prohibir los robots delivery",
+    "Londres es una gran ciudad del Reino Unido",
+    "El gato come pescado",
+    "Veo al hombre con el telescopio",
+    "La araña come moscas",
+    "El pingüino incuba en su nido"
+]

diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py
new file mode 100644
index 000000000..08409ea61
--- /dev/null
+++ b/spacy/lang/fr/examples.py
@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.fr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple cherche à acheter une startup anglaise pour 1 milliard de dollars",
+    "Les voitures autonomes voient leurs assurances décalées vers les constructeurs",
+    "San Francisco envisage d'interdire les robots coursiers",
+    "Londres est une grande ville du Royaume-Uni",
+    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
+    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
+    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
+    "Nouvelles attaques de Trump contre le maire de Londres",
+    "Où es-tu ?",
+    "Qui est le président de la France ?",
+    "Où est la capitale des Etats-Unis ?",
+    "Quand est né Barack Obama ?"
+]

diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py
new file mode 100644
index 000000000..f99f4814b
--- /dev/null
+++ b/spacy/lang/he/examples.py
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.he.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
+    'רה"מ הודיע כי יחרים טקס בחסותו',
+    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
+    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
+    'סע לשלום, המפתחות בפנים.',
+    'מלצר, פעמיים טורקי!',
+    'ואהבת לרעך כמוך.',
+    'היום נעשה משהו בלתי נשכח.',
+    'איפה הילד?',
+    'מיהו נשיא צרפת?',
+    'מהי בירת ארצות הברית?',
+    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
+    'מה הייתה הדקה?',
+    'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
+]

diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py
new file mode 100644
index 000000000..d35b9f834
--- /dev/null
+++ b/spacy/lang/it/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.it.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
+    "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
+    "San Francisco prevede di bandire i robot di consegna porta a porta",
+    "Londra è una grande città del Regno Unito."
+]

diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py
new file mode 100644
index 000000000..0dc5c8144
--- /dev/null
+++ b/spacy/lang/nb/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.nb.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
+    "Selvkjørende biler flytter forsikringsansvaret over på produsentene",
+    "San Francisco vurderer å forby robotbud på fortauene",
+    "London er en stor by i Storbritannia."
+]

diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py
new file mode 100644
index 000000000..af6c72af0
--- /dev/null
+++ b/spacy/lang/pl/examples.py
@@ -0,0 +1,20 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.pl.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Poczuł przyjemną woń mocnej kawy.",
+    "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
+    "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
+    "Nowy abonament pod lupą Komisji Europejskiej",
+    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
+    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
+]

diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py
new file mode 100644
index 000000000..239929215
--- /dev/null
+++ b/spacy/lang/pt/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.pt.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
+    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.",
+    "São Francisco considera banir os robôs de entrega que andam pelas calçadas",
+    "Londres é a maior cidade do Reino Unido"
+]

diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py
new file mode 100644
index 000000000..be279c4bd
--- /dev/null
+++ b/spacy/lang/sv/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.sv.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
+    "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
+    "San Fransisco överväger förbud mot leveransrobotar på trottoarer.",
+    "London är en storstad i Storbritannien."
+]

From 4731d5022021dd22b4e58c72725755608cc6aee2 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 19 Aug 2017 12:44:23 +0200
Subject: [PATCH 09/14] Add break utility for long nowrap items (e.g. code)

---
 website/assets/css/_base/_utilities.sass | 4 ++++
 website/docs/api/doc.jade                | 6 +++---
 website/docs/api/lexeme.jade             | 2 +-
 website/docs/api/span.jade               | 4 ++--
 website/docs/api/token.jade              | 2 +-
 5 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass
index 2c40858a8..46c3e84d9 100644
--- a/website/assets/css/_base/_utilities.sass
+++ b/website/assets/css/_base/_utilities.sass
@@ -112,6 +112,10 @@
 .u-nowrap
     white-space: nowrap

+.u-break.u-break
+    word-wrap: break-word
+    white-space: initial
+
 .u-no-border
     border: none

diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade
index 929985144..212f823ba 100644
--- a/website/docs/api/doc.jade
+++ b/website/docs/api/doc.jade
@@ -216,7 +216,7 @@ p
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell
             |  The exported attributes as a 2D numpy array, with one row per
             |  token and one column per attribute.
@@ -245,7 +245,7 @@ p
     +row
         +cell #[code array]
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell The attribute values to load.

     +footrow
@@ -509,7 +509,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the document's semantics.

 +h(2, "vector_norm") Doc.vector_norm

diff --git a/website/docs/api/lexeme.jade b/website/docs/api/lexeme.jade
index a0487be9b..6e3f68493 100644
--- a/website/docs/api/lexeme.jade
+++ b/website/docs/api/lexeme.jade
@@ -129,7 +129,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the lexeme's semantics.

 +h(2, "vector_norm") Lexeme.vector_norm

diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade
index 542336714..5b480b280 100644
--- a/website/docs/api/span.jade
+++ b/website/docs/api/span.jade
@@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]].
     +row
         +cell #[code vector]
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A meaning representation of the span.

     +footrow
@@ -270,7 +270,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the span's semantics.
+h(2, "vector_norm") Span.vector_norm diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 87387e09d..db445d09b 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -250,7 +250,7 @@ p A real-valued meaning representation. +table(["Name", "Type", "Description"]) +footrow +cell returns - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A 1D numpy array representing the token's semantics. +h(2, "vector_norm") Span.vector_norm From 6a37c93311ca4dd446eb36dc6ca4fec6a8f4922e Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 19 Aug 2017 12:44:33 +0200 Subject: [PATCH 10/14] Update argument type --- website/docs/api/doc.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 212f823ba..5c065e775 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -211,7 +211,7 @@ p +table(["Name", "Type", "Description"]) +row +cell #[code attr_ids] - +cell ints + +cell list +cell A list of attribute ID ints. +footrow From d53cbf369fff63d20861eee003f671f4a9b013fc Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 19 Aug 2017 12:44:50 +0200 Subject: [PATCH 11/14] Document as_tuples kwarg on Language.pipe() --- website/docs/api/language.jade | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 9c26f506c..69665ee9d 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -111,6 +111,14 @@ p +cell - +cell A sequence of unicode objects. + +row + +cell #[code as_tuples] + +cell bool + +cell + | If set to #[code True], inputs should be a sequence of + | #[code (text, context)] tuples. Output will then be a sequence of + | #[code (doc, context)] tuples. Defaults to #[code False]. + +row +cell #[code n_threads] +cell int From 404d3067b89a71145b85af37f4f0233eb9cc5689 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 19 Aug 2017 12:45:00 +0200 Subject: [PATCH 12/14] Document new Doc.char_span() method --- website/docs/api/doc.jade | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 5c065e775..fcba091b8 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -140,6 +140,44 @@ p Get the number of tokens in the document. +cell int +cell The number of tokens in the document. ++h(2, "char_span") Doc.char_span + +tag method + +tag-new(2) + +p Create a #[code Span] object from the slice #[code doc.text[start : end]]. + ++aside-code("Example"). + doc = nlp(u'I like New York') + label = doc.vocab.strings['GPE'] + span = doc.char_span(7, 15, label=label) + assert span.text == 'New York' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code start] + +cell int + +cell The index of the first character of the span. + + +row + +cell #[code end] + +cell int + +cell The index of the first character after the span. + + +row + +cell #[code label] + +cell uint64 + +cell A label to attach to the Span, e.g. for named entities. + + +row + +cell #[code vector] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell A meaning representation of the span. + + +footrow + +cell returns + +cell #[code Span] + +cell The newly constructed object. 
+ +h(2, "similarity") Doc.similarity +tag method +tag-model("vectors") From 471eed4126f21830b01a8f8c1554602a9b3c77af Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 19 Aug 2017 12:45:16 +0200 Subject: [PATCH 13/14] Add example to Span.merge() --- website/docs/api/span.jade | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index 5b480b280..f15958e1c 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -150,6 +150,13 @@ p p Retokenize the document, such that the span is merged into a single token. ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + span = doc[2:3] + span.merge() + assert len(doc) == 6 + assert doc[2].text == 'New York' + +table(["Name", "Type", "Description"]) +row +cell #[code **attributes] From 5cb0200e63c8bfd9454f5a6548e06464452f2a5f Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 19 Aug 2017 12:45:28 +0200 Subject: [PATCH 14/14] Document new Span.to_array() method --- website/docs/api/span.jade | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index f15958e1c..2ca2d3ea9 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -145,6 +145,35 @@ p +cell float +cell A scalar similarity score. Higher is more similar. ++h(2, "to_array") Span.to_array + +tag method + +tag-new(2) + +p + | Given a list of #[code M] attribute IDs, export the tokens to a numpy + | #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of + | the document. The values will be 32-bit integers. + ++aside-code("Example"). + from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA + doc = nlp(u'I like New York in Autumn.') + span = doc[2:3] + # All strings mapped to integers, for easy export to numpy + np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code attr_ids] + +cell list + +cell A list of attribute ID ints. + + +footrow + +cell returns + +cell #[code.u-break numpy.ndarray[long, ndim=2]] + +cell + | A feature matrix, with one row per word, and one column per + | attribute indicated in the input #[code attr_ids]. + +h(2, "merge") Span.merge +tag method