Mirror of https://github.com/explosion/spaCy.git
Commit 7c47e38c12: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
spacy/lang/da/examples.py (new file)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.da.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
    "London er en stor by i Storbritannien"
]
spacy/lang/de/examples.py (new file)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.de.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
    "San Francisco erwägt Verbot von Lieferrobotern",
    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
    "Wo bist du?",
    "Was ist die Hauptstadt von Deutschland?"
]
spacy/lang/en/examples.py (new file)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple is looking at buying U.K. startup for $1 billion",
    "Autonomous cars shift insurance liability toward manufacturers",
    "San Francisco considers banning sidewalk delivery robots",
    "London is a big city in the United Kingdom.",
    "Where are you?",
    "Who is the president of France?",
    "What is the capital of the United States?",
    "When was Barack Obama born?"
]
spacy/lang/es/examples.py (new file)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.es.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
    "San Francisco analiza prohibir los robots delivery",
    "Londres es una gran ciudad del Reino Unido",
    "El gato come pescado",
    "Veo al hombre con el telescopio",
    "La araña come moscas",
    "El pingüino incuba en su nido"
]
spacy/lang/fr/examples.py (new file)
@@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.fr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple cherche à acheter une startup anglaise pour 1 milliard de dollars",
    "Les voitures autonomes voient leurs assurances décalées vers les constructeurs",
    "San Francisco envisage d'interdire les robots coursiers",
    "Londres est une grande ville du Royaume-Uni",
    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
    "Nouvelles attaques de Trump contre le maire de Londres",
    "Où es-tu ?",
    "Qui est le président de la France ?",
    "Où est la capitale des Etats-Unis ?",
    "Quand est né Barack Obama ?"
]
spacy/lang/he/examples.py (new file)
@@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.he.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
    'רה"מ הודיע כי יחרים טקס בחסותו',
    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
    'סע לשלום, המפתחות בפנים.',
    'מלצר, פעמיים טורקי!',
    'ואהבת לרעך כמוך.',
    'היום נעשה משהו בלתי נשכח.',
    'איפה הילד?',
    'מיהו נשיא צרפת?',
    'מהי בירת ארצות הברית?',
    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
    'מה הייתה הדקה?',
    'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
]
spacy/lang/it/examples.py (new file)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.it.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
    "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
    "San Francisco prevede di bandire i robot di consegna porta a porta",
    "Londra è una grande città del Regno Unito."
]
spacy/lang/nb/examples.py (new file)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.nb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
    "Selvkjørende biler flytter forsikringsansvaret over på produsentene",
    "San Francisco vurderer å forby robotbud på fortauene",
    "London er en stor by i Storbritannia."
]
spacy/lang/pl/examples.py (new file)
@@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.pl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Poczuł przyjemną woń mocnej kawy.",
    "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
    "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
    "Nowy abonament pod lupą Komisji Europejskiej",
    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
]
spacy/lang/pt/examples.py (new file)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.pt.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.",
    "São Francisco considera banir os robôs de entrega que andam pelas calçadas",
    "Londres é a maior cidade do Reino Unido"
]
spacy/lang/sv/examples.py (new file)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.sv.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
    "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
    "San Francisco överväger förbud mot leveransrobotar på trottoarer.",
    "London är en storstad i Storbritannien."
]
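Taken together, these files add a `sentences` list per language, importable as `spacy.lang.<code>.examples`. A minimal sketch of how they can be exercised (the blank `Danish` pipeline below is illustrative; the commit itself only adds the data):

    # Sketch: run one language's example sentences through a tokenizer-only
    # pipeline. Danish() builds a blank pipeline, so no trained model is needed.
    from spacy.lang.da import Danish
    from spacy.lang.da.examples import sentences

    nlp = Danish()
    for doc in nlp.pipe(sentences):
        print(len(doc), doc.text)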
spacy/language.py
@@ -430,11 +430,16 @@ class Language(object):
         except StopIteration:
             pass
 
-    def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
+    def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
+             disable=[]):
         """Process texts as a stream, and yield `Doc` objects in order. Supports
         GIL-free multi-threading.
 
         texts (iterator): A sequence of texts to process.
+        as_tuples (bool):
+            If set to True, inputs should be a sequence of
+            (text, context) tuples. Output will then be a sequence of
+            (doc, context) tuples. Defaults to False.
         n_threads (int): The number of worker threads to use. If -1, OpenMP will
             decide how many to use at run time. Default is 2.
         batch_size (int): The number of texts to buffer.

@@ -446,7 +451,7 @@ class Language(object):
         >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
         >>>     assert doc.is_parsed
         """
-        if tuples:
+        if as_tuples:
             text_context1, text_context2 = itertools.tee(texts)
             texts = (tc[0] for tc in text_context1)
             contexts = (tc[1] for tc in text_context2)
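The new `as_tuples` flag threads caller-defined context through `pipe` alongside each text. A usage sketch implied by the docstring (the `nlp` object and the context dicts are illustrative, not part of the diff):

    # Sketch of as_tuples: pair each text with arbitrary context and get
    # (doc, context) tuples back in the same order.
    data = [(u'London is a big city in the United Kingdom.', {'id': 1}),
            (u'Where are you?', {'id': 2})]
    for doc, context in nlp.pipe(data, as_tuples=True):
        print(context['id'], doc.text)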
@@ -63,7 +63,7 @@ def vector_size():
 
 @pytest.fixture
 def beam(moves, states, golds, beam_width):
-    return ParserBeam(moves, states, golds, width=beam_width)
+    return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
 
 @pytest.fixture
 def scores(moves, batch_size, beam_width):
@@ -11,8 +11,8 @@ import pytest
 def taggers(en_vocab):
     tagger1 = Tagger(en_vocab)
     tagger2 = Tagger(en_vocab)
-    tagger1.model = tagger1.Model(None, None)
-    tagger2.model = tagger2.Model(None, None)
+    tagger1.model = tagger1.Model(8, 8)
+    tagger2.model = tagger1.model
     return (tagger1, tagger2)
 
 
@@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
     tagger1, tagger2 = taggers
     tagger1_b = tagger1.to_bytes()
     tagger2_b = tagger2.to_bytes()
-    assert tagger1_b == tagger2_b
     tagger1 = tagger1.from_bytes(tagger1_b)
     assert tagger1.to_bytes() == tagger1_b
     new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
spacy/tokens/doc.pyx
@@ -238,6 +238,27 @@ cdef class Doc:
     def doc(self):
         return self
 
+    def char_span(self, int start_idx, int end_idx, attr_t label=0, vector=None):
+        """Create a `Span` object from the slice `doc.text[start : end]`.
+
+        doc (Doc): The parent document.
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64): A label to attach to the Span, e.g. for named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        cdef int start = token_by_start(self.c, self.length, start_idx)
+        if start == -1:
+            return None
+        cdef int end = token_by_end(self.c, self.length, end_idx)
+        if end == -1:
+            return None
+        # Currently we have the token index, we want the range-end index
+        end += 1
+        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        return span
+
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.
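`char_span` only succeeds when the character offsets line up exactly with token boundaries, because `token_by_start`/`token_by_end` return -1 otherwise. A sketch of that behaviour (assumes an `nlp` pipeline object):

    # Sketch: character offsets must align with token boundaries.
    doc = nlp(u'I like New York')
    assert doc.char_span(7, 15).text == 'New York'  # aligned: returns a Span
    assert doc.char_span(8, 15) is None             # starts mid-token: None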
spacy/tokens/span.pxd
@@ -15,5 +15,5 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm
 
-
     cpdef int _recalculate_indices(self) except -1
+    cpdef np.ndarray to_array(self, object features)
spacy/tokens/span.pyx
@@ -7,7 +7,7 @@ import numpy
 import numpy.linalg
 from libc.math cimport sqrt
 
-from .doc cimport token_by_start, token_by_end
+from .doc cimport token_by_start, token_by_end, get_token_attr
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t

@@ -135,6 +135,28 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    cpdef np.ndarray to_array(self, object py_attr_ids):
+        """Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape `(N, M)`, where `N` is the length of the span.
+        The values will be 32-bit integers.
+
+        attr_ids (list[int]): A list of attribute ID ints.
+        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
+            per word, and one column per attribute indicated in the input
+            `attr_ids`.
+        """
+        cdef int i, j
+        cdef attr_id_t feature
+        cdef np.ndarray[attr_t, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
+        for i in range(self.start, self.end):
+            for j, feature in enumerate(attr_ids):
+                output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
+        return output
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
                 or self.doc.c[self.start].idx != self.start_char \
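`Span.to_array` mirrors `Doc.to_array` for a slice of the document, with rows indexed relative to the span start. A sketch (assumes an `nlp` pipeline; `LOWER` and `IS_ALPHA` are real attribute IDs from `spacy.attrs`):

    # Sketch: export per-token attributes of a span as an (N, M) integer matrix.
    from spacy.attrs import LOWER, IS_ALPHA

    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]                          # 'New York'
    arr = span.to_array([LOWER, IS_ALPHA])   # one row per token, one col per attr
    assert arr.shape == (len(span), 2)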
spacy/vectors.pyx
@@ -20,7 +20,7 @@ cdef class Vectors:
     '''Store, save and load word vectors.'''
     cdef public object data
     cdef readonly StringStore strings
-    cdef public object index
+    cdef public object key2row
 
     def __init__(self, strings, data_or_width):
         self.strings = StringStore()

@@ -30,9 +30,9 @@ cdef class Vectors:
         else:
             data = data_or_width
         self.data = data
-        self.index = {}
+        self.key2row = {}
         for i, string in enumerate(strings):
-            self.index[self.strings.add(string)] = i
+            self.key2row[self.strings.add(string)] = i
 
     def __reduce__(self):
         return (Vectors, (self.strings, self.data))

@@ -40,7 +40,7 @@ cdef class Vectors:
     def __getitem__(self, key):
         if isinstance(key, basestring):
             key = self.strings[key]
-        i = self.index[key]
+        i = self.key2row[key]
         if i is None:
             raise KeyError(key)
         else:

@@ -49,7 +49,7 @@ cdef class Vectors:
     def __setitem__(self, key, vector):
         if isinstance(key, basestring):
             key = self.strings.add(key)
-        i = self.index[key]
+        i = self.key2row[key]
         self.data[i] = vector
 
     def __iter__(self):

@@ -71,7 +71,7 @@ cdef class Vectors:
 
     def to_disk(self, path, **exclude):
         def serialize_vectors(p):
-            write_vectors_to_bin_loc(self.strings, self.key2i, self.data, str(p))
+            write_vectors_to_bin_loc(self.strings, self.key2row, self.data, str(p))
 
         serializers = OrderedDict((
             ('vec.bin', serialize_vectors),

@@ -80,12 +80,13 @@ cdef class Vectors:
 
     def from_disk(self, path, **exclude):
         def deserialize_vectors(p):
-            self.key2i, self.vectors = load_vectors_from_bin_loc(self.strings, str(p))
+            values = load_vectors_from_bin_loc(self.strings, str(p))
+            self.key2row, self.data = values
 
         serializers = OrderedDict((
-            ('vec.bin', deserialize_vectors)
+            ('vec.bin', deserialize_vectors),
         ))
-        return util.to_disk(serializers, exclude)
+        return util.from_disk(path, serializers, exclude)
 
     def to_bytes(self, **exclude):
         def serialize_weights():

@@ -93,9 +94,9 @@ cdef class Vectors:
                 return self.data.to_bytes()
             else:
                 return msgpack.dumps(self.data)
+        b = msgpack.dumps(self.key2row)
         serializers = OrderedDict((
-            ('key2row', lambda: msgpack.dumps(self.key2i)),
+            ('key2row', lambda: msgpack.dumps(self.key2row)),
             ('strings', lambda: self.strings.to_bytes()),
             ('vectors', serialize_weights)
         ))

@@ -109,7 +110,7 @@ cdef class Vectors:
             self.data = msgpack.loads(b)
 
         deserializers = OrderedDict((
-            ('key2row', lambda b: self.key2i.update(msgpack.loads(b))),
+            ('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('vectors', deserialize_weights)
         ))
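The `index` to `key2row` rename spells out what the mapping stores: the row of each key's vector in the `data` matrix. A pure-Python sketch of that indirection (toy string keys and a toy width; spaCy's actual keys are `StringStore` hash IDs):

    # Sketch of the key -> row indirection behind Vectors.__getitem__.
    import numpy

    data = numpy.zeros((2, 3), dtype='float32')  # one row per stored vector
    key2row = {'cat': 0, 'dog': 1}               # toy keys; spaCy uses hash IDs

    data[key2row['cat']] = [0.1, 0.2, 0.3]       # the __setitem__ path
    print(data[key2row['cat']])                  # the __getitem__ path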
@@ -112,6 +112,10 @@
 .u-nowrap
     white-space: nowrap
 
+.u-break.u-break
+    word-wrap: break-word
+    white-space: initial
+
 .u-no-border
     border: none
website/docs/api/doc.jade
@@ -140,6 +140,44 @@ p Get the number of tokens in the document.
         +cell int
         +cell The number of tokens in the document.
 
++h(2, "char_span") Doc.char_span
+    +tag method
+    +tag-new(2)
+
+p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+
++aside-code("Example").
+    doc = nlp(u'I like New York')
+    label = doc.vocab.strings['GPE']
+    span = doc.char_span(7, 15, label=label)
+    assert span.text == 'New York'
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code start]
+        +cell int
+        +cell The index of the first character of the span.
+
+    +row
+        +cell #[code end]
+        +cell int
+        +cell The index of the first character after the span.
+
+    +row
+        +cell #[code label]
+        +cell uint64
+        +cell A label to attach to the Span, e.g. for named entities.
+
+    +row
+        +cell #[code vector]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell A meaning representation of the span.
+
+    +footrow
+        +cell returns
+        +cell #[code Span]
+        +cell The newly constructed object.
+
 +h(2, "similarity") Doc.similarity
     +tag method
     +tag-model("vectors")

@@ -211,12 +249,12 @@ p
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code attr_ids]
-        +cell ints
+        +cell list
         +cell A list of attribute ID ints.
 
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell
             | The exported attributes as a 2D numpy array, with one row per
             | token and one column per attribute.

@@ -245,7 +283,7 @@ p
 
     +row
         +cell #[code array]
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell The attribute values to load.
 
     +footrow

@@ -509,7 +547,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the document's semantics.
 
 +h(2, "vector_norm") Doc.vector_norm
website/docs/api/language.jade
@@ -111,6 +111,14 @@ p
         +cell -
         +cell A sequence of unicode objects.
 
+    +row
+        +cell #[code as_tuples]
+        +cell bool
+        +cell
+            | If set to #[code True], inputs should be a sequence of
+            | #[code (text, context)] tuples. Output will then be a sequence of
+            | #[code (doc, context)] tuples. Defaults to #[code False].
+
     +row
         +cell #[code n_threads]
         +cell int
website/docs/api/lexeme.jade
@@ -129,7 +129,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the lexeme's semantics.
 
 +h(2, "vector_norm") Lexeme.vector_norm
website/docs/api/span.jade
@@ -37,7 +37,7 @@ p Create a Span object from the slice #[code doc[start : end]].
 
     +row
         +cell #[code vector]
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A meaning representation of the span.
 
     +footrow

@@ -145,11 +145,47 @@ p
         +cell float
         +cell A scalar similarity score. Higher is more similar.
 
++h(2, "to_array") Span.to_array
+    +tag method
+    +tag-new(2)
+
+p
+    | Given a list of #[code M] attribute IDs, export the tokens to a numpy
+    | #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of
+    | the document. The values will be 32-bit integers.
+
++aside-code("Example").
+    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:3]
+    # All strings mapped to integers, for easy export to numpy
+    np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code attr_ids]
+        +cell list
+        +cell A list of attribute ID ints.
+
+    +footrow
+        +cell returns
+        +cell #[code.u-break numpy.ndarray[long, ndim=2]]
+        +cell
+            | A feature matrix, with one row per word, and one column per
+            | attribute indicated in the input #[code attr_ids].
+
 +h(2, "merge") Span.merge
     +tag method
 
 p Retokenize the document, such that the span is merged into a single token.
 
++aside-code("Example").
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:4]
+    span.merge()
+    assert len(doc) == 6
+    assert doc[2].text == 'New York'
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code **attributes]

@@ -270,7 +306,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the span's semantics.
 
 +h(2, "vector_norm") Span.vector_norm
website/docs/api/token.jade
@@ -250,7 +250,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the token's semantics.
 
 +h(2, "vector_norm") Token.vector_norm