"""A variant on webchecker that creates a mirror copy of a remote site."""
version = "$Revision: 28654 $"
import os
import sys
import urllib
import getopt

import webchecker
# Extract the real version number if necessary.
# An expanded RCS keyword such as "$Revision: 28654 $" splits into three
# whitespace-separated fields; the middle one is the bare number.  Anything
# else (e.g. an unexpanded "$Revision$") is left untouched.
if version[0] == '$':
    _v = version.split()
    if len(_v) == 3:
        version = _v[1]
def main():
    """Command-line entry point: mirror each root URL given on the command line.

    Options (parsed from ``sys.argv[1:]``):
        -q  reset verbosity to 0
        -v  raise verbosity by one (may be repeated)

    Every remaining argument is registered as a root via ``addroot``.

    Returns 2 after printing the error and a usage line when the options
    cannot be parsed; otherwise returns None.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        # Single-argument print(...) yields the same output whether print is
        # a statement (Python 2) or a function (Python 3).
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % sys.argv[0])
        return 2
    for opt, _value in opts:
        if opt == "-q":
            verbose = 0
        elif opt == "-v":
            verbose = verbose + 1
    c = Sucker()
    # NOTE(review): setflags(), urlopener and run() are assumed to be provided
    # by webchecker.Checker -- confirm against that module.
    c.setflags(verbose=verbose)
    # Identify this tool (and its version) to the servers being mirrored.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % version),
    ]
    for arg in args:
        print("Adding root %s" % (arg,))
        c.addroot(arg)
    c.run()
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it reads.

    Each page is stored at the relative path computed by ``savefilename``
    (``<host>/<path>``).  A page already present on disk is read from there
    instead of being fetched again.
    """

    # NOTE(review): presumably tells webchecker.Checker not to check external
    # links -- confirm against that module.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return ``(text, url)`` for the page identified by *url_pair*.

        Only ``url_pair[0]`` is used.  *text* is the page content, or None
        when the page could not be opened or ``checkforhtml`` rejects it.
        *url* is the URL actually read, which differs from ``url_pair[0]``
        when the opened page reports a different ``geturl()``.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No local copy yet: fetch the page and store it.
            f = self.openpage(url_pair)
            if f:
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        # The page came from a different URL than requested;
                        # file it under the URL it was actually read from.
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    f.close()
                # Everything fetched is saved, HTML or not; only HTML text is
                # handed back to the caller.
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Local copy found.  No headers are available for it, so
            # checkforhtml is given an empty mapping.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write *text* to *path*, creating the parent directory first.

        An IOError while opening or writing is reported through
        ``self.message`` instead of being raised.
        """
        makedirs(os.path.dirname(path))
        try:
            f = open(path, "wb")
            try:
                f.write(text)
            finally:
                # Close the file even when the write fails.
                f.close()
            self.message("saved %s", path)
        except IOError as msg:
            self.message("didn't save %s: %s", path, str(msg))

    def savefilename(self, url):
        """Return the relative local path under which *url* is stored.

        The result is ``<lower-cased host>/<url path>``, with any user-info
        and port stripped from the host.  An empty path, or one ending in
        "/", gets "index.html" appended.  URL slashes are converted to
        ``os.sep`` on platforms where the two differ.
        """
        _scheme, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        _user, host = urllib.splituser(host)
        host, _port = urllib.splitnport(host)
        host = host.lower()
        if not path or path.endswith("/"):
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                # NOTE(review): presumably a leading separator marks a
                # relative path on classic Mac OS -- confirm.
                path = os.sep + path
        return os.path.join(host, path)
def makedirs(dir):
    """Create directory *dir* together with any missing parent directories.

    * An empty *dir* is a no-op.
    * If *dir* already exists as a directory, nothing happens.
    * If *dir* exists but is not a directory, it is moved aside, the
      directory is created, and the old file becomes ``<dir>/index.html``.
      Failures during this swap are ignored.
    * If *dir* has an empty last component (e.g. a trailing separator), a
      diagnostic is printed and nothing is created.

    Always returns None.  An ``os.error`` from creating a new directory
    propagates to the caller.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            # A plain file occupies the name we need as a directory: keep its
            # content by turning it into the directory's index page.
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except os.error:
                # Deliberately best-effort; a later attempt to write into
                # the directory will surface the problem.
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Huh? Don't know how to make dir %s" % (dir,))
        return
    # Make sure the parent exists before creating the leaf.
    makedirs(head)
    os.mkdir(dir)
if __name__ == '__main__':
    # Use main()'s return value as the exit status; None counts as success.
    sys.exit(main() or 0)