# -*- coding: utf-8 -*-
"""Documentation examples, pasted by hand as regular tests.

Sphinx doctest is just too hard, so the doctest examples from the docs are
manually copied here and checked with plain ``assert`` statements.  Every
expected value below is pinned to the English model data the docs were
written against.
"""
from spacy.en.attrs import IS_LOWER


def test_1():
    """Upper-case every adverb in a tagged sentence; check lexeme log-probs."""
    import spacy.en
    from spacy.parts_of_speech import ADV
    # Load the pipeline, and call it with some text.
    nlp = spacy.en.English()
    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
                 tag=True, parse=False)
    o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string
                 for tok in tokens)
    # Compare the computed string with the documented output.  Asserting the
    # bare literal (as the pasted example did) is always true and tests nothing.
    assert o == u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’"

    o = nlp.vocab[u'back'].prob
    assert o == -7.403977394104004
    o = nlp.vocab[u'not'].prob
    assert o == -5.407193660736084
    o = nlp.vocab[u'quietly'].prob
    assert o == -11.07155704498291


def test2():
    """Upper-case only the adverbs rarer than the 1000th most frequent word."""
    import spacy.en
    from spacy.parts_of_speech import ADV
    nlp = spacy.en.English()
    # Find log probability of Nth most frequent word: after an ascending
    # sort, index -1000 holds the 1000th largest value.
    probs = [lex.prob for lex in nlp.vocab]
    probs.sort()

    def is_adverb(tok):
        return tok.pos == ADV and tok.prob < probs[-1000]

    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string
                 for tok in tokens)
    assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'

    # The pasted doctest showed these values as bare expressions followed by
    # their expected output; turn each pair into a real assertion.
    assert nlp.vocab[u'back'].prob == -7.403977394104004
    assert nlp.vocab[u'not'].prob == -5.407193660736084
    assert nlp.vocab[u'quietly'].prob == -11.07155704498291


def test3():
    """Rank lower-case vocabulary entries by cosine similarity to one token."""
    import spacy.en
    from spacy.parts_of_speech import ADV
    nlp = spacy.en.English()
    # Find log probability of Nth most frequent word
    probs = [lex.prob for lex in nlp.vocab]
    probs.sort()

    def is_adverb(tok):
        return tok.pos == ADV and tok.prob < probs[-1000]

    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string
                 for tok in tokens)
    assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'

    # NOTE(review): index 7 is expected to be the token "pleaded"; this
    # depends on how the tokenizer splits the quotes and comma -- confirm.
    pleaded = tokens[7]
    assert pleaded.repvec.shape == (300,)
    o = pleaded.repvec[:5]
    assert sum(o) != 0
    from numpy import dot
    from numpy.linalg import norm

    def cosine(v1, v2):
        return dot(v1, v2) / (norm(v1) * norm(v2))

    words = [w for w in nlp.vocab if w.check(IS_LOWER) and w.has_repvec]
    # Keep sort-then-reverse: sort(reverse=True) would order tied entries
    # differently, which could change the exact lists asserted below.
    words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
    words.reverse()
    o = [w.orth_ for w in words[0:20]]
    assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
                 u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
                 u'countersued', u'remonstrated', u'begged', u'apologised',
                 u'consented', u'acquiesced', u'petitioned', u'quarreled',
                 u'appealed', u'pleading']
    o = [w.orth_ for w in words[50:60]]
    assert o == [u'counselled', u'bragged', u'backtracked', u'caucused', u'refiled',
                 u'dueled', u'mused', u'dissented', u'yearned', u'confesses']
    o = [w.orth_ for w in words[100:110]]
    assert o == [u'cabled', u'ducked', u'sentenced', u'perjured', u'absconded',
                 u'bargained', u'overstayed', u'clerked', u'confided', u'sympathizes']
    o = [w.orth_ for w in words[1000:1010]]
    assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
                 u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
    o = [w.orth_ for w in words[50000:50010]]
    assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
                 u'dirty', u'rims', u'artists']