#!/usr/bin/env python2.3
# $Id: htmldiff.py.html,v 1.1 2004/10/02 15:12:26 john Exp $
#
# JW.
import htmllib, formatter, cStringIO, urllib, md5, sys, os, signal, getopt
# Directory in which the saved page renderings (one file per URL, named by
# the MD5 hex digest of the URL) are kept.  main() overrides it with -b.
backup_dir = '.'
def catch(sig, frame):
    """Signal handler that reaps exited child processes on SIGCHLD.

    Args:
        sig: number of the signal being delivered.
        frame: interrupted stack frame (unused).

    Returns:
        None.  Signals other than SIGCHLD are ignored.
    """
    if sig != signal.SIGCHLD:
        return
    # Poll without blocking until nothing is left to collect.  A single
    # blocking os.wait() would reap at most one child per signal, could stall
    # the program if no child has actually exited, and would raise OSError
    # from inside the handler when the child is already gone.
    while True:
        try:
            pid = os.waitpid(-1, os.WNOHANG)[0]
        except OSError:
            # Typically ECHILD: there are no children left to wait for.
            break
        if pid == 0:
            # Children exist, but none of them has exited yet.
            break
def hasdiff(url, diffs):
    """Report on standard output that the page at *url* has changed.

    Args:
        url: address of the page that was checked.
        diffs: sequence of diff output lines, printed one per line.

    The report is the URL, a "Differences:" heading, every entry of *diffs*,
    an "End of differences." footer and a trailing blank line.
    """
    report = ["URL: %s" % (url,), "Differences:"]
    report.extend([str(entry) for entry in diffs])
    report.append("End of differences.")
    # The empty last element produces the blank separator line.
    report.append("")
    sys.stdout.write("\n".join(report) + "\n")
def nodiff(url):
    """Hook called when the page at *url* is unchanged; it prints nothing.

    Args:
        url: address of the page that was checked (unused).
    """
    return None
def newfile(url):
    """Report on standard output that *url* was stored for the first time.

    Args:
        url: address of the page that was just added.

    Writes the URL line, an "Added to database." line and a blank line.
    """
    out = sys.stdout
    # Two spaces follow the colon here: the label itself ends with a space
    # and the original output put another separator before the URL.
    out.write("URL:  %s\n" % (url,))
    out.write("Added to database.\n")
    out.write("\n")
def getdiff(filename, newtext):
    """Return the context diff between the file *filename* and *newtext*.

    Runs ``diff -cw <filename> -`` with *newtext* supplied on the command's
    standard input and collects what it prints on standard output.

    Args:
        filename: path of the stored copy to compare against.
        newtext: new text, written to diff's standard input.

    Returns:
        A list of the output lines with trailing whitespace removed.  When
        diff printed more than two lines, the first two (the file header
        lines) are dropped.  The list is empty when diff printed nothing.

    The child process is not waited for here; it is left to be reaped
    elsewhere (for example by a SIGCHLD handler).
    """
    import errno

    # An argument list (rather than one shell command string) makes diff run
    # without a shell, so a path containing spaces or shell metacharacters
    # reaches diff unchanged.
    child_stdin, child_stdout = os.popen2(["diff", "-cw", filename, "-"])
    diffs = []
    try:
        try:
            child_stdin.write(newtext)
        finally:
            # Closing stdin is what signals end-of-input to diff.
            child_stdin.close()
        while True:
            try:
                line = child_stdout.readline()
            except IOError:
                # A signal (for instance the SIGCHLD raised when diff exits)
                # can interrupt the read with EINTR; retry instead of
                # silently dropping the rest of the output.  Any other I/O
                # error still ends the read, as before.
                if getattr(sys.exc_info()[1], "errno", None) == errno.EINTR:
                    continue
                break
            if not line:
                break
            diffs.append(line.rstrip())
    finally:
        child_stdout.close()
    if len(diffs) > 2:
        # Strip two header lines of diff output.
        del diffs[:2]
    return diffs
def _read_text(path):
    """Return the whole content of the file at *path*, then close it."""
    handle = open(path)
    try:
        return handle.read()
    finally:
        handle.close()


def _write_text(path, text):
    """Overwrite the file at *path* with *text*, then close it."""
    handle = open(path, 'w')
    try:
        handle.write(text)
    finally:
        handle.close()


def _render_page(url):
    """Fetch *url* and return its plain-text rendering with a link list."""
    fp = cStringIO.StringIO()
    fp.write('URL: %s\n\n' % url)
    h = htmllib.HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(fp)))
    conn = urllib.urlopen(url)
    try:
        h.feed(conn.read())
    finally:
        # Release the connection even when reading or parsing fails.
        conn.close()
    h.close()
    if h.anchorlist:
        fp.write('\nReferences:\n')
        for n, link in enumerate(h.anchorlist):
            fp.write('[%d] %s\n' % (n + 1, link))
    return fp.getvalue()


def main():
    """Check every URL given on the command line against its stored copy.

    Options:
        -b DIR  directory holding the stored copies (sets the module-level
                ``backup_dir``).

    For each URL the page is fetched and rendered as text, and compared with
    the file named after the MD5 hex digest of the URL inside ``backup_dir``:

    * no stored file: the rendering is saved and newfile() is called;
    * stored file identical, or getdiff() returns nothing: nodiff() is called;
    * otherwise the stored file is renamed to ``<name>.bkp`` (replacing any
      previous backup), the new rendering is saved and hasdiff() is called.

    An invalid command-line option prints a usage message on standard error
    and exits with status 2.
    """
    global backup_dir
    program_name = sys.argv[0]
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'b:')
    except getopt.GetoptError:
        sys.stderr.write('%s: %s\n' % (program_name, sys.exc_info()[1]))
        sys.stderr.write('usage: %s [-b backup_dir] url...\n' % program_name)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-b':
            backup_dir = arg
    signal.signal(signal.SIGCHLD, catch)
    for url in args:
        fn = os.path.join(backup_dir, md5.new(url).hexdigest())
        # Decide whether a stored copy exists before fetching the page.
        known = os.path.exists(fn)
        page = _render_page(url)
        if not known:
            _write_text(fn, page)
            newfile(url)
            continue
        if page == _read_text(fn):
            nodiff(url)
            continue
        diffs = getdiff(fn, page)
        if not diffs:
            # The texts differ, but diff reported no differences.
            nodiff(url)
            continue
        backup = fn + '.bkp'
        if os.path.exists(backup):
            os.remove(backup)
        os.rename(fn, backup)
        _write_text(fn, page)
        hasdiff(url, diffs)
# Run the checker only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()