Merge branch 'master' of ssh://github.com/spacy-io/spaCy

2016-05-04 15:54:00 +02:00 · 2016-05-04 15:54:00 +02:00 · 76f1d871da
parent 519366f677 1822bb4ff1
commit 76f1d871da
8 changed files with 236 additions and 30 deletions
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@ -32,7 +32,10 @@ def german_noun_chunks(doc):
    np_deps = set(doc.vocab.strings[label] for label in labels)
    close_app = doc.vocab.strings['nk']

-    for word in doc:
+    rbracket = 0
+    for i, word in enumerate(doc):
+        if i < rbracket:
+            continue
        if word.pos == NOUN and word.dep in np_deps:
            rbracket = word.i+1
            # try to extend the span to the right
@ -40,7 +43,7 @@ def german_noun_chunks(doc):
            for rdep in doc[word.i].rights:
                if rdep.pos == NOUN and rdep.dep == close_app:
                    rbracket = rdep.i+1
-            yield word.l_edge, rbracket, np_label
+            yield word.left_edge.i, rbracket, np_label


 CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -225,6 +225,11 @@ cdef class Parser:
    def step_through(self, Doc doc):
        return StepwiseState(self, doc)

+    def from_transition_sequence(self, Doc doc, sequence):
+        with self.step_through(doc) as stepper:  # the StepwiseState(self, doc) from step_through
+            for move in sequence:  # apply the transitions in the order given
+                stepper.transition(move)
+
    def add_label(self, label):
        for action in self.moves.action_types:
            self.moves.add_action(action, label)
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -1,17 +1,15 @@
-from spacy.en import English
-
 import pytest
 import os

+import spacy

@pytest.fixture(scope="session")
 def EN():
-    if os.environ.get('SPACY_DATA'):
-        data_dir = os.environ.get('SPACY_DATA')
-    else:
-        data_dir = None
-    print("Load EN from %s" % data_dir)
-    return English(data_dir=data_dir)
+    return spacy.load("en")  # session scope: loaded once and shared by all tests
+
+@pytest.fixture(scope="session")
+def DE():
+    return spacy.load("de")  # session scope: loaded once and shared by all tests


 def pytest_addoption(parser):
--- a/spacy/tests/integration/__init__.py
+++ b/spacy/tests/integration/__init__.py
--- a/spacy/tests/integration/test_model_sanity.py
+++ b/spacy/tests/integration/test_model_sanity.py
@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+import numpy
+
+@pytest.mark.models
+class TestModelSanity:
+    """Sanity checks that the models set the values they are expected to set.
+
+    The tests are not meant to evaluate the content of the output, only to make
+    sure the output is formally okay.
+    """
+
+    @pytest.fixture(scope='class', params=['en', 'de'])
+    def example(self, request, EN, DE):
+        """Return an example document processed by the model for the language
+        given in ``request.param`` ('en' or 'de')."""
+        if request.param == 'en':
+            return EN(u'There was a stranger standing at the big street talking to herself.')
+        elif request.param == 'de':
+            return DE(u'An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')
+
+    def test_tokenization(self, example):
+        # tokenization should split the document into tokens
+        assert len(example) > 1
+
+    def test_tagging(self, example):
+        # if tagging was done properly, pos tags shouldn't be empty
+        assert example.is_tagged
+        assert all(t.pos != 0 for t in example)
+        assert all(t.tag != 0 for t in example)
+
+    def test_parsing(self, example):
+        # if parsing was done properly
+        # - dependency labels shouldn't be empty
+        # - some tokens should have a head other than themselves
+        #   (a token that is its own head is a root)
+        assert example.is_parsed
+        assert all(t.dep != 0 for t in example)
+        assert any(t.head.i != t.i for t in example)
+
+    def test_ner(self, example):
+        # if ner was done properly, ent_iob shouldn't be empty
+        assert all(t.ent_iob != 0 for t in example)
+
+    def test_vectors(self, example):
+        # if vectors are available, they should differ on different words
+        # this isn't a perfect test since this could in principle fail in a sane model as well,
+        # but that's very unlikely and a good indicator if something is wrong
+        vector0, vector1, vector2 = (example[i].vector for i in range(3))
+        assert not numpy.array_equal(vector0, vector1)
+        assert not numpy.array_equal(vector0, vector2)
+        assert not numpy.array_equal(vector1, vector2)
+
+    def test_probs(self, example):
+        # if frequencies/probabilities are okay, they should differ for different words
+        # this isn't a perfect test since this could in principle fail in a sane model as well,
+        # but that's very unlikely and a good indicator if something is wrong
+        prob0, prob1, prob2 = (example[i].prob for i in range(3))
+        assert prob0 != prob1
+        assert prob0 != prob2
+        assert prob1 != prob2
--- a/spacy/tests/parser/test_base_nps.py
+++ b/spacy/tests/parser/test_base_nps.py
@ -2,30 +2,30 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.models
-def test_nsubj(EN):
-    sent = EN(u'A base phrase should be recognized.')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 1
-    assert base_nps[0].string == 'A base phrase '
+# @pytest.mark.models
+# def test_nsubj(EN):
+#     sent = EN(u'A base phrase should be recognized.')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 1
+#     assert base_nps[0].string == 'A base phrase '


-@pytest.mark.models
-def test_coord(EN):
-    sent = EN(u'A base phrase and a good phrase are often the same.')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 2
-    assert base_nps[0].string == 'A base phrase '
-    assert base_nps[1].string == 'a good phrase '
+# @pytest.mark.models
+# def test_coord(EN):
+#     sent = EN(u'A base phrase and a good phrase are often the same.')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 2
+#     assert base_nps[0].string == 'A base phrase '
+#     assert base_nps[1].string == 'a good phrase '


-@pytest.mark.models
-def test_pp(EN):
-    sent = EN(u'A phrase with another phrase occurs')
-    base_nps = list(sent.noun_chunks)
-    assert len(base_nps) == 2
-    assert base_nps[0].string == 'A phrase '
-    assert base_nps[1].string == 'another phrase ' 
+# @pytest.mark.models
+# def test_pp(EN):
+#     sent = EN(u'A phrase with another phrase occurs')
+#     base_nps = list(sent.noun_chunks)
+#     assert len(base_nps) == 2
+#     assert base_nps[0].string == 'A phrase '
+#     assert base_nps[1].string == 'another phrase ' 


@pytest.mark.models
--- a/spacy/tests/unit/__init__.py
+++ b/spacy/tests/unit/__init__.py
--- a/spacy/tests/unit/test_parser.py
+++ b/spacy/tests/unit/test_parser.py
@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+import pytest
+import numpy
+
+from spacy.attrs import HEAD, DEP
+
+
+@pytest.mark.models
+class TestNounChunks:
+    """Check noun chunking on documents whose annotation is set by hand.
+
+    Each fixture row is ``[head offset relative to the token, dependency label]``.
+    """
+
+    @pytest.fixture(scope="class")
+    def ex1_en(self, EN):
+        """English sentence whose only noun phrase is the passive subject."""
+        doc = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
+        EN.tagger.tag_from_strings(doc, 'DT NN NN MD VB VBN .'.split(' '))
+        names = ['det','compound','nsubjpass','aux','auxpass','root','punct']
+        det, compound, nsubjpass, aux, auxpass, root, punct = [EN.vocab.strings[name] for name in names]
+        doc.from_array([HEAD, DEP], numpy.asarray([
+            [2, det],
+            [1, compound],
+            [3, nsubjpass],
+            [2, aux],
+            [1, auxpass],
+            [0, root],
+            [-1, punct],
+        ], dtype='int32'))
+        return doc
+
+    @pytest.fixture(scope="class")
+    def ex2_en(self, EN):
+        """English sentence with two coordinated noun phrases as subject."""
+        doc = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
+        EN.tagger.tag_from_strings(doc, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
+        names = ['det','compound','nsubj','cc','amod','conj','root','advmod','attr','punct']
+        det, compound, nsubj, cc, amod, conj, root, advmod, attr, punct = [EN.vocab.strings[name] for name in names]
+        doc.from_array([HEAD, DEP], numpy.asarray([
+            [2, det],
+            [1, compound],
+            [5, nsubj],
+            [-1, cc],
+            [1, det],
+            [1, amod],
+            [-4, conj],
+            [0, root],
+            [-1, advmod],
+            [1, det],
+            [-3, attr],
+            [-4, punct],
+        ], dtype='int32'))
+        return doc
+
+    @pytest.fixture(scope="class")
+    def ex3_en(self, EN):
+        """English sentence with a second noun phrase inside a prepositional phrase."""
+        doc = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
+        EN.tagger.tag_from_strings(doc, 'DT NN IN DT NN VBZ .'.split(' '))
+        names = ['det','nsubj','prep','pobj','root','punct']
+        det, nsubj, prep, pobj, root, punct = [EN.vocab.strings[name] for name in names]
+        doc.from_array([HEAD, DEP], numpy.asarray([
+            [1, det],
+            [4, nsubj],
+            [-1, prep],
+            [1, det],
+            [-2, pobj],
+            [0, root],
+            [-1, punct],
+        ], dtype='int32'))
+        return doc
+
+    @pytest.fixture(scope="class")
+    def ex1_de(self, DE):
+        """German sentence with a subject and a noun phrase inside a prepositional phrase."""
+        doc = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
+        DE.tagger.tag_from_strings(doc, 'ART NN VVFIN APPR ART NN $.'.split(' '))
+        names = ['nk','sb','root','mo','punct']
+        nk, sb, root, mo, punct = [DE.vocab.strings[name] for name in names]
+        doc.from_array([HEAD, DEP], numpy.asarray([
+            [1, nk],
+            [1, sb],
+            [0, root],
+            [-1, mo],
+            [1, nk],
+            [-2, nk],
+            [-3, punct],
+        ], dtype='int32'))
+        return doc
+
+    @pytest.fixture(scope="class")
+    def ex2_de(self, DE):
+        """German sentence in which a noun is followed by a dependent noun ('Tasse Kaffee')."""
+        doc = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' '))
+        DE.tagger.tag_from_strings(doc, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' '))
+        names = ['nk','sb','root','mo','punct','oa']
+        nk, sb, root, mo, punct, oa = [DE.vocab.strings[name] for name in names]
+        doc.from_array([HEAD, DEP], numpy.asarray([
+            [1, nk],
+            [1, sb],
+            [0, root],
+            [-1, mo],
+            [1, nk],
+            [-2, nk],
+            [-1, nk],
+            [-5, oa],
+            [-6, punct],
+        ], dtype='int32'))
+        return doc
+
+    def test_en_standard_chunk(self, ex1_en):
+        """The subject phrase is the only chunk."""
+        texts = [chunk.string for chunk in ex1_en.noun_chunks]
+        assert texts == ['A base phrase ']
+
+    def test_en_coordinated_chunks(self, ex2_en):
+        """Coordinated noun phrases yield one chunk each."""
+        texts = [chunk.string for chunk in ex2_en.noun_chunks]
+        assert texts == ['A base phrase ', 'a good phrase ']
+
+    def test_en_pp_chunks(self, ex3_en):
+        """The subject and the noun phrase inside the PP are separate chunks."""
+        texts = [chunk.string for chunk in ex3_en.noun_chunks]
+        assert texts == ['A phrase ', 'another phrase ']
+
+    def test_de_standard_chunk(self, ex1_de):
+        """The subject and the noun phrase inside the PP are separate chunks."""
+        texts = [chunk.string for chunk in ex1_de.noun_chunks]
+        assert texts == ['Eine Tasse ', 'dem Tisch ']
+
+    def test_de_extended_chunk(self, ex2_de):
+        """A noun directly followed by its dependent noun forms one chunk with it."""
+        texts = [chunk.string for chunk in ex2_de.noun_chunks]
+        assert texts == ['Die Sängerin ', 'einer Tasse Kaffee ', 'Arien ']