From 3a9c6a956398d85f7d68b276914f908cc34b0367 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 11 Jan 2017 18:58:38 +0100
Subject: [PATCH] Remove old unused files

---
 spacy/tests/test_docs.py  | 81 --------------------------------------
 spacy/tests/tokenizer.sed | 82 ---------------------------------------
 2 files changed, 163 deletions(-)
 delete mode 100644 spacy/tests/test_docs.py
 delete mode 100644 spacy/tests/tokenizer.sed

diff --git a/spacy/tests/test_docs.py b/spacy/tests/test_docs.py
deleted file mode 100644
index 4b0831dfd..000000000
--- a/spacy/tests/test_docs.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
-import pytest
-
-#@pytest.mark.models
-#def test_1():
-#    import spacy.en
-#    from spacy.parts_of_speech import ADV
-#    # Load the pipeline, and call it with some text.
-#    nlp = spacy.en.English()
-#    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
-#                 tag=True, parse=False)
-#    o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
-#    assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’"
-#
-#    o = nlp.vocab[u'back'].prob
-#    assert o == -7.033305644989014
-#    o = nlp.vocab[u'not'].prob
-#    assert o == -5.332601070404053
-#    o = nlp.vocab[u'quietly'].prob
-#    assert o == -11.994928359985352
-#
-#
-#@pytest.mark.m
-#def test2():
-#    import spacy.en
-#    from spacy.parts_of_speech import ADV
-#    nlp = spacy.en.English()
-#    # Find log probability of Nth most frequent word
-#    probs = [lex.prob for lex in nlp.vocab]
-#    probs.sort()
-#    is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
-#    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
-#    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
-#    o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'
-#
-#@pytest.mark.models
-#def test3():
-#    import spacy.en
-#    from spacy.parts_of_speech import ADV
-#    nlp = spacy.en.English()
-#    # Find log probability of Nth most frequent word
-#    probs = [lex.prob for lex in nlp.vocab]
-#    probs.sort()
-#    is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
-#    tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
-#    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
-#    assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'
-#
-#    pleaded = tokens[7]
-#    assert pleaded.repvec.shape == (300,)
-#    o = pleaded.repvec[:5]
-#    assert sum(o) != 0
-#    from numpy import dot
-#    from numpy.linalg import norm
-#
-#    cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
-#    words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
-#    words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
-#    words.reverse()
-#    o = [w.orth_ for w in words[0:20]]
-#    assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
-#                 u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
-#                 u'countersued', u'remonstrated', u'begged', u'apologised',
-#                 u'consented', u'acquiesced', u'petitioned', u'quarreled',
-#                 u'appealed', u'pleading']
-#    o = [w.orth_ for w in words[50:60]]
-#    assert o == [u'martialed', u'counselled', u'bragged',
-#                 u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
-#                 u'dissented', u'yearned']
-#    o = [w.orth_ for w in words[100:110]]
-#    assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
-#                 u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
-#                 u'clerked']
-#
-#    #o = [w.orth_ for w in words[1000:1010]]
-#    #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
-#    #             u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
-#    #o = [w.orth_ for w in words[50000:50010]]
-#    #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
-#    #             u'dirty', u'rims', u'artists']
diff --git a/spacy/tests/tokenizer.sed b/spacy/tests/tokenizer.sed
deleted file mode 100644
index f39c04178..000000000
--- a/spacy/tests/tokenizer.sed
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/sed -f
-
-# Sed script to produce Penn Treebank tokenization on arbitrary raw text.
-# Yeah, sure.
-
-# expected input: raw text with ONE SENTENCE TOKEN PER LINE
-
-# by Robert MacIntyre, University of Pennsylvania, late 1995.
-
-# If this wasn't such a trivial program, I'd include all that stuff about
-# no warrantee, free use, etc. from the GNU General Public License. If you
-# want to be picky, assume that all of its terms apply. Okay?
-
-# attempt to get correct directional quotes
-s=^"=`` =g
-s=\([ ([{<]\)"=\1 `` =g
-# close quotes handled at end
-
-s=\.\.\.= ... =g
-s=[,;:@#$%&]= & =g
-
-# Assume sentence tokenization has been done first, so split FINAL periods
-# only.
-s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g
-# however, we may as well split ALL question marks and exclamation points,
-# since they shouldn't have the abbrev.-marker ambiguity problem
-s=[?!]= & =g
-
-# parentheses, brackets, etc.
-s=[][(){}<>]= & =g
-# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
-# version of these symbols.
-# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
-# s/(/-LRB-/g
-# s/)/-RRB-/g
-# s/\[/-LSB-/g
-# s/\]/-RSB-/g
-# s/{/-LCB-/g
-# s/}/-RCB-/g
-
-s=--= -- =g
-
-# NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since
-# you might someday want to know how the words originally fit together --
-# but it's too late to make a better system now, given the millions of
-# words we've already done "wrong".
-
-# First off, add a space to the beginning and end of each line, to reduce
-# necessary number of regexps.
-s=$= =
-s=^= =
-
-s="= '' =g
-# possessive or close-single-quote
-s=\([^']\)' =\1 ' =g
-# as in it's, I'm, we'd
-s='\([sSmMdD]\) = '\1 =g
-s='ll = 'll =g
-s='re = 're =g
-s='ve = 've =g
-s=n't = n't =g
-s='LL = 'LL =g
-s='RE = 'RE =g
-s='VE = 'VE =g
-s=N'T = N'T =g
-
-s= \([Cc]\)annot = \1an not =g
-s= \([Dd]\)'ye = \1' ye =g
-s= \([Gg]\)imme = \1im me =g
-s= \([Gg]\)onna = \1on na =g
-s= \([Gg]\)otta = \1ot ta =g
-s= \([Ll]\)emme = \1em me =g
-s= \([Mm]\)ore'n = \1ore 'n =g
-s= '\([Tt]\)is = '\1 is =g
-s= '\([Tt]\)was = '\1 was =g
-s= \([Ww]\)anna = \1an na =g
-# s= \([Ww]\)haddya = \1ha dd ya =g
-# s= \([Ww]\)hatcha = \1ha t cha =g
-
-# clean out extra spaces
-s=  *= =g
-s=^ *==g
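
For reference, the deleted tokenizer.sed is Robert MacIntyre's 1995 sed
implementation of Penn Treebank tokenization: a chain of substitutions that
splits quotes, punctuation, and contractions off into separate tokens. A rough
Python sketch of a few of its core rules follows; the ptb_tokenize helper is
hypothetical (not part of spaCy or of this patch), and it approximates only
part of the rule set, skipping the MXPOST bracket forms and the special-case
splits such as "cannot" and "gonna".

import re

def ptb_tokenize(sentence):
    # Hypothetical helper approximating part of the deleted tokenizer.sed.
    # Like the sed script, it assumes sentence segmentation is already done
    # ("ONE SENTENCE TOKEN PER LINE").
    s = ' %s ' % sentence
    # Directional quotes: opening double quotes become ``.
    s = re.sub(r'^ "', ' `` ', s)
    s = re.sub(r'([ (\[{<])"', r'\1 `` ', s)
    # Split ellipses and most punctuation.
    s = s.replace('...', ' ... ')
    s = re.sub(r'([,;:@#$%&?!])', r' \1 ', s)
    # Split FINAL periods only, to avoid the abbreviation ambiguity.
    s = re.sub(r'([^.])(\.)([\]\)}>"\']*) *$', r'\1 \2\3 ', s)
    # Parentheses, brackets, dashes.
    s = re.sub(r'([\]\[(){}<>])', r' \1 ', s)
    s = s.replace('--', ' -- ')
    # Remaining double quotes are closing quotes.
    s = s.replace('"', " '' ")
    # Possessives, close-single-quotes, and common contractions.
    s = re.sub(r"([^'])' ", r"\1 ' ", s)
    s = re.sub(r"'([sSmMdD]) ", r" '\1 ", s)
    s = re.sub(r"('ll|'re|'ve|n't|'LL|'RE|'VE|N'T) ", r' \1 ', s)
    # Clean out extra spaces.
    return re.sub(r'  +', ' ', s).strip().split(' ')

# e.g. ptb_tokenize('"Give it back," he pleaded.')
# -> ['``', 'Give', 'it', 'back', ',', "''", 'he', 'pleaded', '.']

The sketch is only meant to document what the removed script did; spaCy's own
rule-based tokenizer had long since replaced it, which is why the file was
unused.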