diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py
new file mode 100644
index 000000000..549f71fb5
--- /dev/null
+++ b/spacy/lang/da/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.da.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
+    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
+    "London er en stor by i Storbritannien"
+]
diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py
new file mode 100644
index 000000000..49ac0e14b
--- /dev/null
+++ b/spacy/lang/de/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.de.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
+    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
+    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
+    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
+    "San Francisco erwägt Verbot von Lieferrobotern",
+    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
+    "Wo bist du?",
+    "Was ist die Hauptstadt von Deutschland?"
+]
diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py
new file mode 100644
index 000000000..b92d4a65c
--- /dev/null
+++ b/spacy/lang/en/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple is looking at buying U.K. startup for $1 billion",
+    "Autonomous cars shift insurance liability toward manufacturers",
+    "San Francisco considers banning sidewalk delivery robots",
+    "London is a big city in the United Kingdom.",
+    "Where are you?",
+    "Who is the president of France?",
+    "What is the capital of the United States?",
+    "When was Barack Obama born?"
+]
diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py
new file mode 100644
index 000000000..61fe8c9be
--- /dev/null
+++ b/spacy/lang/es/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.es.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
+    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
+    "San Francisco analiza prohibir los robots delivery",
+    "Londres es una gran ciudad del Reino Unido",
+    "El gato come pescado",
+    "Veo al hombre con el telescopio",
+    "La araña come moscas",
+    "El pingüino incuba en su nido"
+]
diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py
new file mode 100644
index 000000000..08409ea61
--- /dev/null
+++ b/spacy/lang/fr/examples.py
@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.fr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple cherche à acheter une startup anglaise pour 1 milliard de dollars",
+    "Les voitures autonomes voient leurs assurances décalées vers les constructeurs",
+    "San Francisco envisage d'interdire les robots coursiers",
+    "Londres est une grande ville du Royaume-Uni",
+    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
+    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
+    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
+    "Nouvelles attaques de Trump contre le maire de Londres",
+    "Où es-tu ?",
+    "Qui est le président de la France ?",
+    "Où est la capitale des Etats-Unis ?",
+    "Quand est né Barack Obama ?"
+]
diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py
new file mode 100644
index 000000000..f99f4814b
--- /dev/null
+++ b/spacy/lang/he/examples.py
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.he.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
+    'רה"מ הודיע כי יחרים טקס בחסותו',
+    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
+    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
+    'סע לשלום, המפתחות בפנים.',
+    'מלצר, פעמיים טורקי!',
+    'ואהבת לרעך כמוך.',
+    'היום נעשה משהו בלתי נשכח.',
+    'איפה הילד?',
+    'מיהו נשיא צרפת?',
+    'מהי בירת ארצות הברית?',
+    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
+    'מה הייתה הדקה?',
+    'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
+]
diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py
new file mode 100644
index 000000000..d35b9f834
--- /dev/null
+++ b/spacy/lang/it/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.it.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
+    "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
+    "San Francisco prevede di bandire i robot di consegna porta a porta",
+    "Londra è una grande città del Regno Unito."
+]
diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py
new file mode 100644
index 000000000..0dc5c8144
--- /dev/null
+++ b/spacy/lang/nb/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.nb.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
+    "Selvkjørende biler flytter forsikringsansvaret over på produsentene",
+    "San Francisco vurderer å forby robotbud på fortauene",
+    "London er en stor by i Storbritannia."
+]
diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py
new file mode 100644
index 000000000..af6c72af0
--- /dev/null
+++ b/spacy/lang/pl/examples.py
@@ -0,0 +1,20 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.pl.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Poczuł przyjemną woń mocnej kawy.",
+    "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
+    "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
+    "Nowy abonament pod lupą Komisji Europejskiej",
+    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
+    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
+]
diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py
new file mode 100644
index 000000000..239929215
--- /dev/null
+++ b/spacy/lang/pt/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.pt.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
+    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.",
+    "São Francisco considera banir os robôs de entrega que andam pelas calçadas",
+    "Londres é a maior cidade do Reino Unido"
+]
diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py
new file mode 100644
index 000000000..be279c4bd
--- /dev/null
+++ b/spacy/lang/sv/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.sv.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
+    "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
+    "San Francisco överväger förbud mot leveransrobotar på trottoarer.",
+    "London är en storstad i Storbritannien."
+]
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 3e5566705..7412ebeee 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -524,6 +524,7 @@ cdef class Parser:
             scores, bp_scores = vec2scores.begin_update(vector, drop=drop)

             d_scores = self.get_batch_loss(states, golds, scores)
+            d_scores /= len(docs)
             d_vector = bp_scores(d_scores, sgd=sgd)
             if drop != 0:
                 d_vector *= mask
@@ -582,7 +583,9 @@
                                         width, density,
                                         sgd=sgd, drop=drop, losses=losses)
         backprop_lower = []
+        cdef float batch_size = len(docs)
         for i, d_scores in enumerate(states_d_scores):
+            d_scores /= batch_size
             if losses is not None:
                 losses[self.name] += (d_scores**2).sum()
             ids, bp_vectors, bp_scores = backprops[i]
diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass
index 2c40858a8..46c3e84d9 100644
--- a/website/assets/css/_base/_utilities.sass
+++ b/website/assets/css/_base/_utilities.sass
@@ -112,6 +112,10 @@
 .u-nowrap
     white-space: nowrap

+.u-break.u-break
+    word-wrap: break-word
+    white-space: initial
+
 .u-no-border
     border: none

diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade
index 929985144..fcba091b8 100644
--- a/website/docs/api/doc.jade
+++ b/website/docs/api/doc.jade
@@ -140,6 +140,44 @@ p Get the number of tokens in the document.
         +cell int
         +cell The number of tokens in the document.

++h(2, "char_span") Doc.char_span
+  +tag method
+  +tag-new(2)
+
+p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+
++aside-code("Example").
+    doc = nlp(u'I like New York')
+    label = doc.vocab.strings['GPE']
+    span = doc.char_span(7, 15, label=label)
+    assert span.text == 'New York'
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code start]
+        +cell int
+        +cell The index of the first character of the span.
+
+    +row
+        +cell #[code end]
+        +cell int
+        +cell The index of the first character after the span.
+
+    +row
+        +cell #[code label]
+        +cell uint64
+        +cell A label to attach to the Span, e.g. for named entities.
+
+    +row
+        +cell #[code vector]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell A meaning representation of the span.
+
+    +footrow
+        +cell returns
+        +cell #[code Span]
+        +cell The newly constructed object.
+
 +h(2, "similarity") Doc.similarity
   +tag method
   +tag-model("vectors")
@@ -211,12 +249,12 @@ p
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code attr_ids]
-        +cell ints
+        +cell list
         +cell A list of attribute ID ints.

     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell
             | The exported attributes as a 2D numpy array, with one row per
             | token and one column per attribute.
@@ -245,7 +283,7 @@ p

     +row
         +cell #[code array]
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell The attribute values to load.

     +footrow
@@ -509,7 +547,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the document's semantics.

 +h(2, "vector_norm") Doc.vector_norm
diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade
index 9c26f506c..69665ee9d 100644
--- a/website/docs/api/language.jade
+++ b/website/docs/api/language.jade
@@ -111,6 +111,14 @@ p
         +cell -
         +cell A sequence of unicode objects.

+    +row
+        +cell #[code as_tuples]
+        +cell bool
+        +cell
+            | If set to #[code True], inputs should be a sequence of
+            | #[code (text, context)] tuples. Output will then be a sequence of
+            | #[code (doc, context)] tuples. Defaults to #[code False].
+
     +row
         +cell #[code n_threads]
         +cell int
diff --git a/website/docs/api/lexeme.jade b/website/docs/api/lexeme.jade
index a0487be9b..6e3f68493 100644
--- a/website/docs/api/lexeme.jade
+++ b/website/docs/api/lexeme.jade
@@ -129,7 +129,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the lexeme's semantics.

 +h(2, "vector_norm") Lexeme.vector_norm
diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade
index 542336714..2ca2d3ea9 100644
--- a/website/docs/api/span.jade
+++ b/website/docs/api/span.jade
@@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]].

     +row
         +cell #[code vector]
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A meaning representation of the span.

     +footrow
@@ -145,11 +145,47 @@ p
         +cell float
         +cell A scalar similarity score. Higher is more similar.

++h(2, "to_array") Span.to_array
+  +tag method
+  +tag-new(2)
+
+p
+    | Given a list of #[code M] attribute IDs, export the tokens to a numpy
+    | #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of
+    | the span. The values will be 32-bit integers.
+
++aside-code("Example").
+    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:3]
+    # All strings mapped to integers, for easy export to numpy
+    np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code attr_ids]
+        +cell list
+        +cell A list of attribute ID ints.
+
+    +footrow
+        +cell returns
+        +cell #[code.u-break numpy.ndarray[long, ndim=2]]
+        +cell
+            | A feature matrix, with one row per word, and one column per
+            | attribute indicated in the input #[code attr_ids].
+
 +h(2, "merge") Span.merge
   +tag method

 p Retokenize the document, such that the span is merged into a single token.

++aside-code("Example").
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:4]
+    span.merge()
+    assert len(doc) == 6
+    assert doc[2].text == 'New York'
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code **attributes]
@@ -270,7 +306,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the span's semantics.

 +h(2, "vector_norm") Span.vector_norm
diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade
index 87387e09d..db445d09b 100644
--- a/website/docs/api/token.jade
+++ b/website/docs/api/token.jade
@@ -250,7 +250,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the token's semantics.

 +h(2, "vector_norm") Span.vector_norm
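
For reference, a minimal usage sketch (not part of the patch) tying the new pieces together: the per-language examples modules, the as_tuples option on nlp.pipe documented in language.jade, and Doc.char_span. The blank English() pipeline from spacy.lang.en is an assumption made only for the sketch; any loaded model exposing the same Language API should behave the same way.

# Minimal sketch, not part of this patch: stream the bundled example
# sentences through a pipeline, keeping each text's language code as
# context via the new as_tuples option of nlp.pipe.
from spacy.lang.en import English
from spacy.lang.en.examples import sentences as en_sentences
from spacy.lang.de.examples import sentences as de_sentences

nlp = English()  # assumption: a blank pipeline; a loaded model also works

texts = [(text, 'en') for text in en_sentences]
texts += [(text, 'de') for text in de_sentences]

for doc, lang in nlp.pipe(texts, as_tuples=True):
    print(lang, len(doc), doc.text)

# Doc.char_span (also added in this patch) builds a Span from character
# offsets rather than token indices; it may return None if the offsets
# don't align with token boundaries.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
span = doc.char_span(0, 5)
assert span is not None and span.text == 'Apple'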