Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-08-19 16:24:57 +02:00
commit 9b6a5df15e
18 changed files with 327 additions and 8 deletions

18
spacy/lang/da/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.da.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple overvejer at købe et britisk statup for 1 milliard dollar",
"Selvkørende biler flytter forsikringsansvaret over på producenterne",
"San Francisco overvejer at forbyde leverandørrobotter på fortov",
"London er en stor by i Storbritannien"
]

22
spacy/lang/de/examples.py Normal file
View File

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.de.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
"Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
"Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
"Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
"San Francisco erwägt Verbot von Lieferrobotern",
"Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
"Wo bist du?",
"Was ist die Hauptstadt von Deutschland?"
]

22
spacy/lang/en/examples.py Normal file
View File

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple is looking at buying U.K. startup for $1 billion",
"Autonomous cars shift insurance liability toward manufacturers",
"San Francisco considers banning sidewalk delivery robots",
"London is a big city in the United Kingdom.",
"Where are you?",
"Who is the president of France?",
"What is the capital of the United States?",
"When was Barack Obama born?"
]

22
spacy/lang/es/examples.py Normal file
View File

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.es.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
"Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
"San Francisco analiza prohibir los robots delivery",
"Londres es una gran ciudad del Reino Unido",
"El gato come pescado",
"Veo al hombre con el telescopio",
"La araña come moscas",
"El pingüino incuba en su nido"
]

26
spacy/lang/fr/examples.py Normal file
View File

@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.fr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple cherche a acheter une startup anglaise pour 1 milliard de dollard",
"Les voitures autonomes voient leur assurances décalées vers les constructeurs",
"San Francisco envisage d'interdire les robots coursiers",
"Londres est une grande ville du Royaume-Uni",
"LItalie choisit ArcelorMittal pour reprendre la plus grande aciérie dEurope",
"Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
"La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
"Nouvelles attaques de Trump contre le maire de Londres",
"Où es-tu ?",
"Qui est le président de la France ?",
"Où est la capitale des Etats-Unis ?",
"Quand est né Barack Obama ?"
]

28
spacy/lang/he/examples.py Normal file
View File

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.he.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
'רה"מ הודיע כי יחרים טקס בחסותו',
'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
'סע לשלום, המפתחות בפנים.',
'מלצר, פעמיים טורקי!',
'ואהבת לרעך כמוך.',
'היום נעשה משהו בלתי נשכח.',
'איפה הילד?',
'מיהו נשיא צרפת?',
'מהי בירת ארצות הברית?',
"איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
'מה הייתה הדקה?',
'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
]

18
spacy/lang/it/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.it.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
"Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
"San Francisco prevede di bandire i robot di consegna porta a porta",
"Londra è una grande città del Regno Unito."
]

18
spacy/lang/nb/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.nb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
"Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
"San Francisco vurderer å forby robotbud på fortauene",
"London er en stor by i Storbritannia."
]

20
spacy/lang/pl/examples.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.pl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Poczuł przyjemną woń mocnej kawy.",
"Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
"Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
"Nowy abonament pod lupą Komisji Europejskiej",
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
]

18
spacy/lang/pt/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.pt.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes."
"São Francisco considera banir os robôs de entrega que andam pelas calçadas",
"Londres é a maior cidade do Reino Unido"
]

18
spacy/lang/sv/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.sv.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple överväger att köpa brittisk startup för 1 miljard dollar.",
"Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
"San Fransisco överväger förbud mot leveransrobotar på trottoarer.".
"London är en storstad i Storbritannien."
]

View File

@ -524,6 +524,7 @@ cdef class Parser:
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
d_scores = self.get_batch_loss(states, golds, scores)
d_scores /= len(docs)
d_vector = bp_scores(d_scores, sgd=sgd)
if drop != 0:
d_vector *= mask
@ -582,7 +583,9 @@ cdef class Parser:
width, density,
sgd=sgd, drop=drop, losses=losses)
backprop_lower = []
cdef float batch_size = len(docs)
for i, d_scores in enumerate(states_d_scores):
d_scores /= batch_size
if losses is not None:
losses[self.name] += (d_scores**2).sum()
ids, bp_vectors, bp_scores = backprops[i]

View File

@ -112,6 +112,10 @@
.u-nowrap
white-space: nowrap
.u-break.u-break
word-wrap: break-word
white-space: initial
.u-no-border
border: none

View File

@ -140,6 +140,44 @@ p Get the number of tokens in the document.
+cell int
+cell The number of tokens in the document.
+h(2, "char_span") Doc.char_span
+tag method
+tag-new(2)
p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+aside-code("Example").
doc = nlp(u'I like New York')
label = doc.vocab.strings['GPE']
span = doc.char_span(7, 15, label=label)
assert span.text == 'New York'
+table(["Name", "Type", "Description"])
+row
+cell #[code start]
+cell int
+cell The index of the first character of the span.
+row
+cell #[code end]
+cell int
+cell The index of the first character after the span.
+row
+cell #[code label]
+cell uint64
+cell A label to attach to the Span, e.g. for named entities.
+row
+cell #[code vector]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A meaning representation of the span.
+footrow
+cell returns
+cell #[code Span]
+cell The newly constructed object.
+h(2, "similarity") Doc.similarity
+tag method
+tag-model("vectors")
@ -211,12 +249,12 @@ p
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell list
+cell A list of attribute ID ints.
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
@ -245,7 +283,7 @@ p
+row
+cell #[code array]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
@ -509,7 +547,7 @@ p
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "vector_norm") Doc.vector_norm

View File

@ -111,6 +111,14 @@ p
+cell -
+cell A sequence of unicode objects.
+row
+cell #[code as_tuples]
+cell bool
+cell
| If set to #[code True], inputs should be a sequence of
| #[code (text, context)] tuples. Output will then be a sequence of
| #[code (doc, context)] tuples. Defaults to #[code False].
+row
+cell #[code n_threads]
+cell int

View File

@ -129,7 +129,7 @@ p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the lexeme's semantics.
+h(2, "vector_norm") Lexeme.vector_norm

View File

@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]].
+row
+cell #[code vector]
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A meaning representation of the span.
+footrow
@ -145,11 +145,47 @@ p
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "to_array") Span.to_array
+tag method
+tag-new(2)
p
| Given a list of #[code M] attribute IDs, export the tokens to a numpy
| #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of
| the document. The values will be 32-bit integers.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp(u'I like New York in Autumn.')
span = doc[2:3]
# All strings mapped to integers, for easy export to numpy
np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell list
+cell A list of attribute ID ints.
+footrow
+cell returns
+cell #[code.u-break numpy.ndarray[long, ndim=2]]
+cell
| A feature matrix, with one row per word, and one column per
| attribute indicated in the input #[code attr_ids].
+h(2, "merge") Span.merge
+tag method
p Retokenize the document, such that the span is merged into a single token.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
span = doc[2:3]
span.merge()
assert len(doc) == 6
assert doc[2].text == 'New York'
+table(["Name", "Type", "Description"])
+row
+cell #[code **attributes]
@ -270,7 +306,7 @@ p
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the span's semantics.
+h(2, "vector_norm") Span.vector_norm

View File

@ -250,7 +250,7 @@ p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the token's semantics.
+h(2, "vector_norm") Span.vector_norm