spaCy/tests/test_docs.py

# -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
from spacy.en.attrs import IS_LOWER
import pytest

@pytest.mark.models
def test_1():
    """Check the adverb-highlighting example and a few vocabulary `prob` values.

    Runs the English pipeline with tagging enabled and parsing disabled,
    upper-cases every token whose part-of-speech is ADV, and compares the
    re-joined text with the expected string. Then checks the `prob`
    attribute of three vocabulary entries against fixed values.
    """
    import spacy.en
    from spacy.parts_of_speech import ADV
    # Load the pipeline, and call it with some text.
    nlp = spacy.en.English()
    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
                 tag=True, parse=False)
    o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string
                 for tok in tokens)
    # Compare the computed text with the expected one. Asserting the bare
    # string literal would always pass, because a non-empty string is truthy.
    assert o == u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’"

    # Exact float comparison is intentional here: the expected values are
    # pinned to what the shipped model data produces.
    o = nlp.vocab[u'back'].prob
    assert o == -7.033305644989014
    o = nlp.vocab[u'not'].prob
    assert o == -5.332601070404053
    o = nlp.vocab[u'quietly'].prob
    assert o == -11.994928359985352


@pytest.mark.models
def test2():
    """Check adverb highlighting restricted to less frequent words.

    Only tokens tagged ADV whose `prob` is below the 1000th-highest `prob`
    in the vocabulary are upper-cased, so the common adverb "back" is left
    alone while "abjectly" is highlighted.
    """
    import spacy.en
    from spacy.parts_of_speech import ADV
    nlp = spacy.en.English()
    # Find log probability of Nth most frequent word
    probs = [lex.prob for lex in nlp.vocab]
    probs.sort()
    # After the ascending sort, probs[-1000] is the 1000th-largest value.
    threshold = probs[-1000]

    def is_adverb(tok):
        return tok.pos == ADV and tok.prob < threshold

    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string
                 for tok in tokens)
    # The comparison must be asserted; as a bare expression its result was
    # discarded and the test could never fail.
    assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'

@pytest.mark.models
def test3():
    """Check rare-adverb highlighting and vector-similarity ranking for "pleaded".

    First repeats the frequency-filtered adverb highlighting check. Then takes
    the token at index 7 ("pleaded"), checks its `repvec`, ranks every
    lower-case vocabulary entry that has a vector by cosine similarity to it,
    and compares three slices of that ranking with fixed word lists.
    """
    import spacy.en
    from spacy.parts_of_speech import ADV
    nlp = spacy.en.English()

    # Find log probability of Nth most frequent word
    probs = [lex.prob for lex in nlp.vocab]
    probs.sort()

    def is_adverb(tok):
        return tok.pos == ADV and tok.prob < probs[-1000]

    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
    pieces = []
    for tok in tokens:
        pieces.append(tok.string.upper() if is_adverb(tok) else tok.string)
    highlighted = u''.join(pieces)
    assert highlighted == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'

    pleaded = tokens[7]
    assert pleaded.repvec.shape == (300,)
    head = pleaded.repvec[:5]
    assert sum(head) != 0
    from numpy import dot
    from numpy.linalg import norm

    def cosine(v1, v2):
        return dot(v1, v2) / (norm(v1) * norm(v2))

    def similarity_to_pleaded(w):
        return cosine(w.repvec, pleaded.repvec)

    words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
    # Ascending stable sort followed by an explicit reverse: most similar
    # entries first.
    words.sort(key=similarity_to_pleaded)
    words.reverse()

    top_twenty = [w.orth_ for w in words[0:20]]
    assert top_twenty == [
        u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
        u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
        u'countersued', u'remonstrated', u'begged', u'apologised',
        u'consented', u'acquiesced', u'petitioned', u'quarreled',
        u'appealed', u'pleading',
    ]

    ranks_50_to_59 = [w.orth_ for w in words[50:60]]
    assert ranks_50_to_59 == [
        u'martialed', u'counselled', u'bragged',
        u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
        u'dissented', u'yearned',
    ]

    ranks_100_to_109 = [w.orth_ for w in words[100:110]]
    assert ranks_100_to_109 == [
        u'acquits', u'cabled', u'ducked', u'sentenced',
        u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
        u'clerked',
    ]

    # The checks at deeper ranks below are currently disabled.
    #o = [w.orth_ for w in words[1000:1010]]
    #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
    #             u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
    #o = [w.orth_ for w in words[50000:50010]]
    #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
    #             u'dirty', u'rims', u'artists']
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
+								# -*- coding: utf-8 -*-
 								"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
 								from spacy.en.attrs import IS_LOWER
-												* Upd tests

											
										
										
											2015-07-22 23:19:11 +00:00
+								import pytest
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
-												* Upd tests

											
										
										
											2015-07-22 23:19:11 +00:00
+								@pytest.mark.models
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
+								def test_1():
 								    import spacy.en
 								    from spacy.parts_of_speech import ADV
 								    # Load the pipeline, and call it with some text.
 								    nlp = spacy.en.English()
 								    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
 								                tag=True, parse=False)
 								    o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
 								    assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’"
 								    o = nlp.vocab[u'back'].prob
-												* Update doctests

											
										
										
											2015-07-26 11:04:18 +00:00
+								    assert o == -7.033305644989014
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
+								    o = nlp.vocab[u'not'].prob
-												* Update doctests

											
										
										
											2015-07-26 11:04:18 +00:00
+								    assert o == -5.332601070404053
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
+								    o = nlp.vocab[u'quietly'].prob
-												* Update doctests

											
										
										
											2015-07-26 11:04:18 +00:00
+								    assert o == -11.994928359985352
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
-												* Upd tests

											
										
										
											2015-07-22 23:19:11 +00:00
+								@pytest.mark.models
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
+								def test2():
 								    import spacy.en
 								    from spacy.parts_of_speech import ADV
 								    nlp = spacy.en.English()
 								    # Find log probability of Nth most frequent word
 								    probs = [lex.prob for lex in nlp.vocab]
 								    probs.sort()
 								    is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
 								    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
 								    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
 								    o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'
-												* Upd tests

											
										
										
											2015-07-22 23:19:11 +00:00
+								@pytest.mark.models
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
+								def test3():
 								    import spacy.en
 								    from spacy.parts_of_speech import ADV
 								    nlp = spacy.en.English()
 								    # Find log probability of Nth most frequent word
 								    probs = [lex.prob for lex in nlp.vocab]
 								    probs.sort()
 								    is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
 								    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
 								    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
 								    assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'
 								    pleaded = tokens[7]
 								    assert pleaded.repvec.shape == (300,)
 								    o = pleaded.repvec[:5]
 								    assert sum(o) != 0
 								    from numpy import dot
 								    from numpy.linalg import norm
-												Remove trailing whitespace

											
										
										
											2015-04-19 08:31:31 +00:00
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
+								    cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
-												* Upd test_docs

											
										
										
											2015-07-26 15:41:13 +00:00
+								    words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
+								    words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
 								    words.reverse()
 								    o = [w.orth_ for w in words[0:20]]
 								    assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
 								                 u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
 								                 u'countersued', u'remonstrated', u'begged', u'apologised',
 								                 u'consented', u'acquiesced', u'petitioned', u'quarreled',
 								                 u'appealed', u'pleading']
 								    o = [w.orth_ for w in words[50:60]]
-												* Update test_docs

											
										
										
											2015-07-27 20:15:19 +00:00
+								    assert o == [u'martialed', u'counselled', u'bragged',
-												* Update doctests

											
										
										
											2015-07-26 10:57:59 +00:00
+								                 u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
-												* Update test_docs

											
										
										
											2015-07-27 20:15:19 +00:00
+								                 u'dissented', u'yearned']
-												* Add some tests for the code in the index.html docstrings

											
										
										
											2015-02-07 13:52:13 +00:00
+								    o = [w.orth_ for w in words[100:110]]
-												* Update test_docs

											
										
										
											2015-07-27 20:15:19 +00:00
+								    assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
 								                 u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
 								                 u'clerked']
-												* Update doctests

											
										
										
											2015-07-26 10:57:59 +00:00
-												* Fix test_docs.py

											
										
										
											2015-06-07 17:02:43 +00:00
+								    #o = [w.orth_ for w in words[1000:1010]]
 								    #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
 								    #             u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
 								    #o = [w.orth_ for w in words[50000:50010]]
 								    #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
 								    #             u'dirty', u'rims', u'artists']