mirror of https://github.com/explosion/spaCy.git
Merge branch 'master' of ssh://github.com/spacy-io/spaCy
This commit is contained in:
commit
76f1d871da
|
@ -32,7 +32,10 @@ def german_noun_chunks(doc):
|
||||||
np_deps = set(doc.vocab.strings[label] for label in labels)
|
np_deps = set(doc.vocab.strings[label] for label in labels)
|
||||||
close_app = doc.vocab.strings['nk']
|
close_app = doc.vocab.strings['nk']
|
||||||
|
|
||||||
for word in doc:
|
rbracket = 0
|
||||||
|
for i, word in enumerate(doc):
|
||||||
|
if i < rbracket:
|
||||||
|
continue
|
||||||
if word.pos == NOUN and word.dep in np_deps:
|
if word.pos == NOUN and word.dep in np_deps:
|
||||||
rbracket = word.i+1
|
rbracket = word.i+1
|
||||||
# try to extend the span to the right
|
# try to extend the span to the right
|
||||||
|
@ -40,7 +43,7 @@ def german_noun_chunks(doc):
|
||||||
for rdep in doc[word.i].rights:
|
for rdep in doc[word.i].rights:
|
||||||
if rdep.pos == NOUN and rdep.dep == close_app:
|
if rdep.pos == NOUN and rdep.dep == close_app:
|
||||||
rbracket = rdep.i+1
|
rbracket = rdep.i+1
|
||||||
yield word.l_edge, rbracket, np_label
|
yield word.left_edge.i, rbracket, np_label
|
||||||
|
|
||||||
|
|
||||||
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
|
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
|
||||||
|
|
|
@ -225,6 +225,11 @@ cdef class Parser:
|
||||||
def step_through(self, Doc doc):
|
def step_through(self, Doc doc):
|
||||||
return StepwiseState(self, doc)
|
return StepwiseState(self, doc)
|
||||||
|
|
||||||
|
def from_transition_sequence(self, Doc doc, sequence):
    """Replay a given sequence of transitions on ``doc``.

    A stepwise parse of ``doc`` is opened through ``step_through`` and every
    item of ``sequence`` is handed, in order, to its ``transition`` method.
    The stepwise state is used as a context manager, so it is closed when
    the sequence is exhausted or an error is raised.

    doc: the document to annotate.
    sequence: iterable of transitions, in whatever form
        ``StepwiseState.transition`` accepts.
    """
    with self.step_through(doc) as stepper:
        for move in sequence:
            stepper.transition(move)
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
for action in self.moves.action_types:
|
for action in self.moves.action_types:
|
||||||
self.moves.add_action(action, label)
|
self.moves.add_action(action, label)
|
||||||
|
|
|
@ -1,17 +1,15 @@
|
||||||
from spacy.en import English
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def EN():
|
def EN():
|
||||||
if os.environ.get('SPACY_DATA'):
|
return spacy.load("en")
|
||||||
data_dir = os.environ.get('SPACY_DATA')
|
|
||||||
else:
|
@pytest.fixture(scope="session")
|
||||||
data_dir = None
|
def DE():
|
||||||
print("Load EN from %s" % data_dir)
|
return spacy.load("de")
|
||||||
return English(data_dir=data_dir)
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_addoption(parser):
|
def pytest_addoption(parser):
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
@pytest.mark.models
class TestModelSanity:
    """
    Smoke tests for the loaded language models.

    Each test checks that one annotation layer is formally populated on an
    example document. The tests are not meant to evaluate the content of the
    output, only to make sure the output is formally okay.
    """

    @pytest.fixture(scope='class', params=['en', 'de'])
    def example(self, request, EN, DE):
        """Return a processed example sentence for the current language parameter."""
        # Both pipelines are requested as fixture arguments, so both are
        # resolved regardless of which parameter is active.
        if request.param == 'en':
            return EN(u'There was a stranger standing at the big street talking to herself.')
        elif request.param == 'de':
            return DE(u'An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')

    def test_tokenization(self, example):
        # tokenization should split the document into tokens
        assert len(example) > 1

    def test_tagging(self, example):
        # if tagging was done properly, pos tags shouldn't be empty
        assert example.is_tagged
        assert all(t.pos != 0 for t in example)
        assert all(t.tag != 0 for t in example)

    def test_parsing(self, example):
        # if parsing was done properly
        # - dependency labels shouldn't be empty
        # - the head of some tokens should not be root
        assert example.is_parsed
        assert all(t.dep != 0 for t in example)
        # A token whose head is itself is a root, so compare the head's
        # position with the token's own position. (Comparing the label ID
        # ``t.dep`` with the position, as before, checked nothing about heads.)
        assert any(t.head.i != i for i, t in enumerate(example))

    def test_ner(self, example):
        # if ner was done properly, ent_iob shouldn't be empty
        assert all(t.ent_iob != 0 for t in example)

    def test_vectors(self, example):
        # if vectors are available, they should differ on different words
        # this isn't a perfect test since this could in principle fail in a sane model as well,
        # but that's very unlikely and a good indicator if something is wrong
        vector0 = example[0].vector
        vector1 = example[1].vector
        vector2 = example[2].vector
        assert not numpy.array_equal(vector0, vector1)
        assert not numpy.array_equal(vector0, vector2)
        assert not numpy.array_equal(vector1, vector2)

    def test_probs(self, example):
        # if frequencies/probabilities are okay, they should differ for different words
        # this isn't a perfect test since this could in principle fail in a sane model as well,
        # but that's very unlikely and a good indicator if something is wrong
        prob0 = example[0].prob
        prob1 = example[1].prob
        prob2 = example[2].prob
        assert not prob0 == prob1
        assert not prob0 == prob2
        assert not prob1 == prob2
|
|
@ -2,30 +2,30 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
|
# @pytest.mark.models
|
||||||
def test_nsubj(EN):
|
# def test_nsubj(EN):
|
||||||
sent = EN(u'A base phrase should be recognized.')
|
# sent = EN(u'A base phrase should be recognized.')
|
||||||
base_nps = list(sent.noun_chunks)
|
# base_nps = list(sent.noun_chunks)
|
||||||
assert len(base_nps) == 1
|
# assert len(base_nps) == 1
|
||||||
assert base_nps[0].string == 'A base phrase '
|
# assert base_nps[0].string == 'A base phrase '
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
|
# @pytest.mark.models
|
||||||
def test_coord(EN):
|
# def test_coord(EN):
|
||||||
sent = EN(u'A base phrase and a good phrase are often the same.')
|
# sent = EN(u'A base phrase and a good phrase are often the same.')
|
||||||
base_nps = list(sent.noun_chunks)
|
# base_nps = list(sent.noun_chunks)
|
||||||
assert len(base_nps) == 2
|
# assert len(base_nps) == 2
|
||||||
assert base_nps[0].string == 'A base phrase '
|
# assert base_nps[0].string == 'A base phrase '
|
||||||
assert base_nps[1].string == 'a good phrase '
|
# assert base_nps[1].string == 'a good phrase '
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
|
# @pytest.mark.models
|
||||||
def test_pp(EN):
|
# def test_pp(EN):
|
||||||
sent = EN(u'A phrase with another phrase occurs')
|
# sent = EN(u'A phrase with another phrase occurs')
|
||||||
base_nps = list(sent.noun_chunks)
|
# base_nps = list(sent.noun_chunks)
|
||||||
assert len(base_nps) == 2
|
# assert len(base_nps) == 2
|
||||||
assert base_nps[0].string == 'A phrase '
|
# assert base_nps[0].string == 'A phrase '
|
||||||
assert base_nps[1].string == 'another phrase '
|
# assert base_nps[1].string == 'another phrase '
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
|
@pytest.mark.models
|
||||||
|
|
|
@ -0,0 +1,138 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
from spacy.attrs import HEAD, DEP
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
class TestNounChunks:
    """
    Check noun-chunk extraction on documents with hand-written annotation.

    The fixtures do not run the tagger or parser models: tags are set from
    strings and the dependency tree is loaded with ``Doc.from_array``, so the
    tests exercise only the chunking rules.
    """

    @staticmethod
    def _annotated_doc(nlp, words, tags, arcs):
        """Build a document with the given tokens, tags and dependency arcs.

        nlp: language pipeline providing ``tokenizer``, ``tagger`` and ``vocab``.
        words: the tokens, separated by single spaces.
        tags: one fine-grained tag per token, separated by single spaces.
        arcs: one ``(head, label)`` pair per token. ``head`` is written to the
            HEAD column and ``label`` is looked up in ``nlp.vocab.strings``
            for the DEP column. In these fixtures ``head`` is laid out as an
            offset relative to the token's own position, with 0 on the root.
        """
        doc = nlp.tokenizer.tokens_from_list(words.split(' '))
        nlp.tagger.tag_from_strings(doc, tags.split(' '))
        rows = [[head, nlp.vocab.strings[label]] for head, label in arcs]
        doc.from_array([HEAD, DEP], numpy.asarray(rows, dtype='int32'))
        return doc

    @pytest.fixture(scope="class")
    def ex1_en(self, EN):
        return self._annotated_doc(
            EN,
            'A base phrase should be recognized .',
            'DT NN NN MD VB VBN .',
            [
                (2, 'det'),
                (1, 'compound'),
                (3, 'nsubjpass'),
                (2, 'aux'),
                (1, 'auxpass'),
                (0, 'root'),
                (-1, 'punct'),
            ])

    @pytest.fixture(scope="class")
    def ex2_en(self, EN):
        return self._annotated_doc(
            EN,
            'A base phrase and a good phrase are often the same .',
            'DT NN NN CC DT JJ NN VBP RB DT JJ .',
            [
                (2, 'det'),
                (1, 'compound'),
                (5, 'nsubj'),
                (-1, 'cc'),
                # NOTE(review): offset 1 attaches "a" to "good" rather than
                # to "phrase" (offset 2) -- confirm this is intended.
                (1, 'det'),
                (1, 'amod'),
                (-4, 'conj'),
                (0, 'root'),
                (-1, 'advmod'),
                (1, 'det'),
                (-3, 'attr'),
                (-4, 'punct'),
            ])

    @pytest.fixture(scope="class")
    def ex3_en(self, EN):
        return self._annotated_doc(
            EN,
            'A phrase with another phrase occurs .',
            'DT NN IN DT NN VBZ .',
            [
                (1, 'det'),
                (4, 'nsubj'),
                (-1, 'prep'),
                (1, 'det'),
                (-2, 'pobj'),
                (0, 'root'),
                (-1, 'punct'),
            ])

    @pytest.fixture(scope="class")
    def ex1_de(self, DE):
        return self._annotated_doc(
            DE,
            'Eine Tasse steht auf dem Tisch .',
            'ART NN VVFIN APPR ART NN $.',
            [
                (1, 'nk'),
                (1, 'sb'),
                (0, 'root'),
                (-1, 'mo'),
                (1, 'nk'),
                (-2, 'nk'),
                (-3, 'punct'),
            ])

    @pytest.fixture(scope="class")
    def ex2_de(self, DE):
        return self._annotated_doc(
            DE,
            'Die Sängerin singt mit einer Tasse Kaffee Arien .',
            'ART NN VVFIN APPR ART NN NN NN $.',
            [
                (1, 'nk'),
                (1, 'sb'),
                (0, 'root'),
                (-1, 'mo'),
                (1, 'nk'),
                (-2, 'nk'),
                (-1, 'nk'),
                (-5, 'oa'),
                (-6, 'punct'),
            ])

    def test_en_standard_chunk(self, ex1_en):
        chunks = list(ex1_en.noun_chunks)
        assert len(chunks) == 1
        assert chunks[0].string == 'A base phrase '

    def test_en_coordinated_chunks(self, ex2_en):
        chunks = list(ex2_en.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'A base phrase '
        assert chunks[1].string == 'a good phrase '

    def test_en_pp_chunks(self, ex3_en):
        chunks = list(ex3_en.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'A phrase '
        assert chunks[1].string == 'another phrase '

    def test_de_standard_chunk(self, ex1_de):
        chunks = list(ex1_de.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'Eine Tasse '
        assert chunks[1].string == 'dem Tisch '

    def test_de_extended_chunk(self, ex2_de):
        # "Tasse Kaffee" is a single chunk: the second noun is a close
        # apposition ('nk') on the right of the first.
        chunks = list(ex2_de.noun_chunks)
        assert len(chunks) == 3
        assert chunks[0].string == 'Die Sängerin '
        assert chunks[1].string == 'einer Tasse Kaffee '
        assert chunks[2].string == 'Arien '
|
Loading…
Reference in New Issue