2015-02-07 18:14:07 +00:00
from __future__ import unicode_literals
2015-07-13 16:39:38 +00:00
from spacy . tokens import Doc
2016-01-18 16:14:40 +00:00
from spacy . en import English
2016-01-18 16:25:04 +00:00
import numpy
2016-01-18 16:34:43 +00:00
from spacy . attrs import HEAD
2015-07-13 16:39:38 +00:00
2015-02-07 18:14:07 +00:00
import pytest
2015-07-26 14:23:41 +00:00
@pytest.mark.models
def test_getitem(EN):
    """Check integer indexing and slicing on a document and on a span of it.

    Covers positive and negative indices, an out-of-range index, explicit
    steps, open-ended bounds, and bounds far outside the sequence -- first
    on the document itself, then on a span sliced from it.
    """
    tokens = EN(u' Give it back! He pleaded. ')

    def render(piece):
        # Join the surface forms so a whole slice compares as one string.
        return ' / '.join([tok.orth_ for tok in piece])

    # Integer indexing: first token, last token, one past the end.
    assert tokens[0].orth_ == ' Give '
    assert tokens[-1].orth_ == ' . '
    with pytest.raises(IndexError):
        tokens[len(tokens)]

    # A zero-width slice renders as an empty string.
    assert not render(tokens[1:1])

    # A step of 1 is accepted whether implied or explicit ...
    for bounds in (slice(1, 4), slice(1, 4, 1)):
        assert render(tokens[bounds]) == ' it/back/! '
    # ... while any other step is rejected.
    for step in (2, -1):
        with pytest.raises(ValueError):
            tokens[1:4:step]

    # Each row: (slice, expected text, expected start/end of an empty span).
    # Exactly one of the last two fields is set.
    doc_cases = [
        (slice(-3, 6), ' He/pleaded ', None),
        (slice(4, -1), ' He/pleaded ', None),
        (slice(-5, -3), ' back/! ', None),
        (slice(5, 4), None, 5),
        (slice(4, -3), None, 4),
        (slice(None), ' Give/it/back/!/He/pleaded/. ', None),
        (slice(4, None), ' He/pleaded/. ', None),
        (slice(None, 4), ' Give/it/back/! ', None),
        (slice(None, -3), ' Give/it/back/! ', None),
        (slice(-3, None), ' He/pleaded/. ', None),
        (slice(4, 50), ' He/pleaded/. ', None),
        (slice(-50, 4), ' Give/it/back/! ', None),
        (slice(-50, -40), None, 0),
        (slice(40, 50), None, 7),
    ]
    for bounds, text, empty_at in doc_cases:
        span = tokens[bounds]
        if empty_at is None:
            assert render(span) == text
        else:
            assert span.start == span.end == empty_at and not render(span)

    # The same indexing and slicing, applied to a span instead of the document.
    span = tokens[1:4]
    assert span[0].orth_ == ' it '

    span_cases = [
        (slice(None), ' it/back/! '),
        (slice(None, 2), ' it/back '),
        (slice(1, None), ' back/! '),
        (slice(None, -1), ' it/back '),
        (slice(-2, None), ' back/! '),
        (slice(1, 2), ' back '),
        (slice(-2, -1), ' back '),
        (slice(-50, 50), ' it/back/! '),
    ]
    for bounds, text in span_cases:
        assert render(span[bounds]) == text

    # An inverted, out-of-range slice of the span is empty; its start and end
    # are both expected to be 4.
    subspan = span[50:-50]
    assert subspan.start == subspan.end == 4 and not render(subspan)
2015-10-06 09:08:39 +00:00
2015-07-13 16:39:38 +00:00
2015-07-26 14:23:41 +00:00
@pytest.mark.models
def test_serialize(EN):
    """Round-trip a document through to_bytes/from_bytes and compare results.

    The restored document must agree with the original on the full string and
    on every token's ``orth_`` and ``orth`` value.
    """
    original = EN(u' Give it back! He pleaded. ')
    payload = original.to_bytes()

    # Deserialize into a fresh Doc built on the same vocab.
    blank = Doc(EN.vocab)
    restored = blank.from_bytes(payload)

    assert original.string == restored.string
    assert [tok.orth_ for tok in original] == [tok.orth_ for tok in restored]
    assert [tok.orth for tok in original] == [tok.orth for tok in restored]
2015-07-26 14:23:41 +00:00
@pytest.mark.models
def test_serialize_whitespace(EN):
    """Serialize and restore a document, then compare string and tokens.

    Performs the same round-trip checks as a plain serialization test: the
    full string and each token's ``orth_`` / ``orth`` must be unchanged.
    """
    before = EN(u' Give it back! He pleaded. ')
    blob = before.to_bytes()
    after = Doc(EN.vocab).from_bytes(blob)

    assert before.string == after.string
    # Compare the textual attribute first, then the plain ``orth`` attribute.
    for attr in ('orth_', 'orth'):
        assert [getattr(t, attr) for t in before] == [getattr(t, attr) for t in after]
2015-08-05 22:35:40 +00:00
def test_set_ents(EN):
    """Assign an entity to a tokenized document and read it back.

    Only the tokenizer is run, so the document starts with no entities. One
    entity covering tokens 2-3 is then set, and the test checks the per-token
    ``ent_iob`` values and the resulting entity's label and boundaries.
    """
    tokens = EN.tokenizer(u' I use goggle chrone to surf the web ')
    assert len(tokens.ents) == 0

    # Entities are assigned as (label id, start token, end token) tuples.
    product = (EN.vocab.strings[' PRODUCT '], 2, 4)
    tokens.ents = [product]
    assert len(list(tokens.ents)) == 1

    # Token 2 is expected to carry code 3 and token 3 code 1; the rest stay 0.
    iob_codes = [tok.ent_iob for tok in tokens]
    assert iob_codes == [0, 0, 3, 1, 0, 0, 0, 0]

    entity = tokens.ents[0]
    assert entity.label_ == ' PRODUCT '
    assert (entity.start, entity.end) == (2, 4)
2015-10-18 06:17:27 +00:00
def test_merge(EN):
    """Merge three mid-sentence tokens into one and check the result.

    Tokens 4-6 are merged by character offsets; afterwards the document is
    two tokens shorter and token 4 carries the merged text and the given tag.
    """
    doc = EN(' WKRO played songs by the beach boys all night ')
    assert len(doc) == 9

    # Character range running from the start of token 4 to the end of token 6.
    last = doc[6]
    start_char = doc[4].idx
    end_char = last.idx + len(last)
    doc.merge(start_char, end_char, ' NAMED ', ' LEMMA ', ' TYPE ')

    assert len(doc) == 7
    merged = doc[4]
    assert merged.text == ' the beach boys '
    assert merged.text_with_ws == ' the beach boys '
    assert merged.tag_ == ' NAMED '
2015-10-19 04:47:04 +00:00
def test_merge_end_string(EN):
    """Merge the final two tokens of a document and check the result.

    Tokens 7-8 are merged by character offsets; afterwards the document is
    one token shorter and token 7 carries the merged text.
    """
    doc = EN(' WKRO played songs by the beach boys all night ')
    assert len(doc) == 9

    # Character range running from the start of token 7 to the end of token 8,
    # i.e. the last two tokens of the document.
    last = doc[8]
    start_char = doc[7].idx
    end_char = last.idx + len(last)
    doc.merge(start_char, end_char, ' NAMED ', ' LEMMA ', ' TYPE ')

    assert len(doc) == 8
    merged = doc[7]
    assert merged.text == ' all night '
    assert merged.text_with_ws == ' all night '
2015-10-18 06:17:27 +00:00
@pytest.mark.models
def test_merge_children(EN):
    """Verify head/child links stay consistent after a merge.

    After merging tokens 4-6, every token positioned before its head must
    appear among that head's ``lefts`` and every token positioned after its
    head among the head's ``rights``.
    """
    doc = EN(' WKRO played songs by the beach boys all night ')

    # Character range running from the start of token 4 to the end of token 6.
    last = doc[6]
    doc.merge(doc[4].idx, last.idx + len(last), ' NAMED ', ' LEMMA ', ' TYPE ')

    for word in doc:
        head = word.head
        if word.i == head.i:
            # A token that is its own head has no side to be checked on.
            continue
        same_side = head.lefts if word.i < head.i else head.rights
        assert word in list(same_side)
2016-01-16 17:00:26 +00:00
def test_merge_hang():
    """Run two successive merges on a document with hand-set heads.

    There are no assertions: the test passes if both ``merge`` calls return.
    NOTE(review): judging by the name, this presumably guards against a hang
    in ``merge`` -- confirm against the originating issue.
    """
    text = ' through North and South Carolina '
    nlp = English(parser=False)
    doc = nlp(text, tag=True)

    # One HEAD value per token, transposed into a single column.
    # NOTE(review): values look like relative head offsets -- confirm against
    # Doc.from_array.
    heads = numpy.asarray([[0, 3, -1, -2, -4]], dtype=' int32 ')
    doc.from_array([HEAD], heads.T)

    # Merge the shorter character range first, then the wider one; both end
    # at character 32.
    for start_char in (18, 8):
        doc.merge(start_char, 32, ' ', ' ', ' ORG ')
2016-01-25 14:22:42 +00:00
@pytest.mark.models
def test_runtime_error(EN):
    """Merge trimmed noun chunks one by one on a text that once crashed.

    Each noun chunk is trimmed from the left until it starts with an
    adverbial/adjectival/compound modifier or is a single token. Chunks still
    longer than one token are recorded and then merged in order. There are no
    assertions; the test passes if no merge raises.
    """
    # Example that caused run-time error while parsing Reddit
    text = u' 67 % o f black households are single parent \n \n 72 % o f all black babies born out of wedlock \n \n 50 % o f all black kids don \u2019 t finish high school '
    doc = EN(text)

    leading_deps = (' advmod ', ' amod ', ' compound ')
    pending = []
    for chunk in doc.noun_chunks:
        # Drop leading tokens whose dependency label is not in leading_deps.
        while len(chunk) > 1 and chunk[0].dep_ not in leading_deps:
            chunk = chunk[1:]
        if len(chunk) > 1:
            root = chunk.root
            # Captured eagerly as plain values, in the positional order that
            # doc.merge() receives them below.
            pending.append(
                (chunk.start_char, chunk.end_char, root.tag_, chunk.text, root.ent_type_)
            )

    for merge_args in pending:
        # Dump the arguments and the current parse state before each merge.
        print(merge_args)
        for word in doc:
            print(word.idx, word.text, word.head.i, word.head.text)
        doc.merge(*merge_args)
2016-02-06 22:47:51 +00:00
@pytest.mark.models
def test_right_edge(EN):
    """Check the subtree and right edge of token 6 in a long sentence."""
    # Test for bug occurring from Unshift action, causing incorrect right edge
    sentence = (
        u''' I have proposed to myself, for the sake of such as live '''
        u''' under the government of the Romans, to translate those books '''
        u''' into the Greek tongue. '''
    )
    doc = EN(sentence)

    prep = doc[6]
    assert prep.text == u' for '

    # The subtree is expected to end at the first "of", which must also be
    # the token reported as the right edge.
    subtree_words = [node.text for node in prep.subtree]
    assert subtree_words == [u' for ', u' the ', u' sake ', u' of ']
    assert prep.right_edge.text == u' of '