# Source: cpython/Tools/webchecker/websucker.py (historical Python 2 tool)
#! /usr/bin/env python
"""A variant on webchecker that creates a mirror copy of a remote site."""
__version__ = "$Revision$"
import os
import sys
import string
import urllib
import getopt
import webchecker
# When checked out with keyword expansion, __version__ looks like
# "$Revision: 1.2 $"; pull the middle field out.  Unexpanded it is
# just "$Revision$", which the len() == 3 guard leaves untouched.
if __version__[0] == '$':
    _fields = string.split(__version__)
    if len(_fields) == 3:
        __version__ = _fields[1]
def main():
verbose = webchecker.VERBOSE
try:
1998-04-06 14:29:28 +00:00
opts, args = getopt.getopt(sys.argv[1:], "qv")
except getopt.error, msg:
1998-04-06 14:29:28 +00:00
print msg
print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
return 2
for o, a in opts:
1998-04-06 14:29:28 +00:00
if o == "-q":
verbose = 0
if o == "-v":
verbose = verbose + 1
c = Sucker()
c.setflags(verbose=verbose)
c.urlopener.addheaders = [
1998-04-06 14:29:28 +00:00
('User-agent', 'websucker/%s' % __version__),
]
for arg in args:
1998-04-06 14:29:28 +00:00
print "Adding root", arg
c.addroot(arg)
print "Run..."
c.run()
class Sucker(webchecker.Checker):
checkext = 0
nonames = 1
# SAM 11/13/99: in general, URLs are now URL pairs.
# Since we've suppressed name anchor checking,
# we can ignore the second dimension.
def readhtml(self, url_pair):
url = url_pair[0]
1998-04-06 14:29:28 +00:00
text = None
path = self.savefilename(url)
try:
f = open(path, "rb")
except IOError:
f = self.openpage(url_pair)
1998-04-06 14:29:28 +00:00
if f:
info = f.info()
nurl = f.geturl()
if nurl != url:
url = nurl
path = self.savefilename(url)
text = f.read()
f.close()
self.savefile(text, path)
if not self.checkforhtml(info, url):
text = None
else:
if self.checkforhtml({}, url):
text = f.read()
f.close()
return text, url
def savefile(self, text, path):
1998-04-06 14:29:28 +00:00
dir, base = os.path.split(path)
makedirs(dir)
try:
f = open(path, "wb")
f.write(text)
f.close()
self.message("saved %s", path)
except IOError, msg:
self.message("didn't save %s: %s", path, str(msg))
def savefilename(self, url):
1998-04-06 14:29:28 +00:00
type, rest = urllib.splittype(url)
host, path = urllib.splithost(rest)
while path[:1] == "/": path = path[1:]
user, host = urllib.splituser(host)
host, port = urllib.splitnport(host)
host = string.lower(host)
if not path or path[-1] == "/":
path = path + "index.html"
1998-04-06 14:29:28 +00:00
if os.sep != "/":
path = string.join(string.split(path, "/"), os.sep)
path = os.path.join(host, path)
1998-04-06 14:29:28 +00:00
return path
def makedirs(dir):
if not dir:
return
if os.path.exists(dir):
if not os.path.isdir(dir):
try:
os.rename(dir, dir + ".bak")
os.mkdir(dir)
os.rename(dir + ".bak", os.path.join(dir, "index.html"))
except os.error:
pass
1998-04-06 14:29:28 +00:00
return
head, tail = os.path.split(dir)
if not tail:
1998-04-06 14:29:28 +00:00
print "Huh? Don't know how to make dir", dir
return
makedirs(head)
os.mkdir(dir, 0777)
if __name__ == '__main__':
    # main() returns 2 on usage error, None on success; map None to 0.
    status = main()
    sys.exit(status or 0)