#!/usr/bin/env python2.3

# $Id: htmldiff.py.html,v 1.1 2004/10/02 15:12:26 john Exp $
#
# JW.

import htmllib, formatter, cStringIO, urllib, md5, sys, os, signal, getopt

# Directory in which the saved page snapshots are kept.  main() replaces
# this value with the argument of the -b option when one is given.
backup_dir = '.'

def catch(sig, frame):
   """Signal handler that reaps terminated child processes.

   Only SIGCHLD is acted upon; any other signal number is ignored.

   sig   -- number of the signal being delivered.
   frame -- interrupted stack frame (unused).

   All children that have already exited are collected without blocking.
   Having nothing to collect is not an error: the child may already have
   been reaped elsewhere, and an exception escaping from a signal handler
   would surface at an arbitrary point in the main program.
   """
   if sig != signal.SIGCHLD:
      return
   while True:
      try:
         pid, status = os.waitpid(-1, os.WNOHANG)
      except OSError:
         # No child processes exist (any more).
         break
      if pid == 0:
         # Children exist, but none of them has exited yet.
         break

def hasdiff(url, diffs):
   print "URL:", url
   print "Differences:"
   for line in diffs:
      print line
   print "End of differences."
   print

def nodiff(url):
   """Hook called for a page whose content is unchanged.

   url -- address of the unchanged page (unused).

   Deliberately does nothing: unchanged pages produce no output.
   """
   return None

def newfile(url):
   """Report on standard output that url was stored for the first time.

   url -- address of the page that was just added.

   Writes the URL line, an "Added to database." line and one blank
   separator line.
   """
   # Exactly one space after "URL:", the same format hasdiff() prints.
   # The previous 'print "URL: ", url' produced two spaces, because print
   # inserts its own separator after the literal's trailing space.
   sys.stdout.write("URL: %s\n" % (url,))
   sys.stdout.write("Added to database.\n\n")

def getdiff(filename, newtext):
   """Return the context diff between the file filename and newtext.

   filename -- path of the existing file (left-hand side of the diff).
   newtext  -- string fed to diff on standard input (right-hand side).

   Runs "diff -cw filename -" and returns its output as a list of lines
   with trailing whitespace removed.  When more than two lines are
   produced, the first two (the file-name header of the context diff)
   are dropped.  The list is empty when diff prints nothing.
   """
   # An argument list instead of a single command string makes os.popen2
   # execute diff directly, so a filename containing spaces or shell
   # metacharacters is passed through verbatim rather than being
   # interpreted by a shell.
   child_in, child_out = os.popen2(['diff', '-cw', filename, '-'])
   diffs = []
   try:
      try:
         child_in.write(newtext)
      finally:
         # Closing stdin signals end of input to diff.
         child_in.close()
      while True:
         try:
            line = child_out.readline()
         except IOError:
            # A failed read is treated as end of output (presumably the
            # read can be interrupted by the SIGCHLD handler -- confirm).
            break
         if not line:
            break
         diffs.append(line.rstrip())
   finally:
      child_out.close()
   if len(diffs) > 2:
      # Strip two header lines of diff output.
      del diffs[:2]
   return diffs

def _read_file(path):
   """Return the whole content of the file at path, then close it."""
   fp = file(path)
   try:
      return fp.read()
   finally:
      fp.close()

def _write_file(path, text):
   """Write text to path, replacing any previous content, and close it."""
   fp = file(path, 'w')
   try:
      fp.write(text)
   finally:
      fp.close()

def _render_page(url):
   """Fetch url and return its HTML rendered as plain text.

   The text starts with a "URL: ..." header line and, when the page
   contains anchors, ends with a numbered "References:" list of them.
   """
   out = cStringIO.StringIO()
   out.write('URL: %s\n\n' % url)
   parser = htmllib.HTMLParser(
      formatter.AbstractFormatter(formatter.DumbWriter(out)))
   source = urllib.urlopen(url)
   try:
      parser.feed(source.read())
   finally:
      source.close()
   parser.close()
   if parser.anchorlist:
      out.write('\nReferences:\n')
      for n, link in enumerate(parser.anchorlist):
         out.write('[%d] %s\n' % (n + 1, link))
   return out.getvalue()

def main():
   """Check every URL given on the command line against its snapshot.

   Options:
      -b DIR   keep the snapshots in DIR instead of the default directory.

   For each URL the page is rendered to text and compared with the
   snapshot stored under the MD5 hex digest of the URL:

   * no snapshot yet: the text is stored and newfile() is called;
   * diff reports differences: the old snapshot is kept with a ".bkp"
     suffix, the new text is stored and hasdiff() is called;
   * otherwise nodiff() is called and the snapshot is left untouched.

   An invalid option prints a usage message on standard error and exits
   with status 2.
   """
   global backup_dir

   program_name = sys.argv[0]
   try:
      opts, args = getopt.getopt(sys.argv[1:], 'b:')
   except getopt.GetoptError:
      sys.stderr.write('%s: %s\n' % (program_name, sys.exc_info()[1]))
      sys.stderr.write('usage: %s [-b backup_dir] url ...\n' % program_name)
      sys.exit(2)

   for opt, arg in opts:
      if opt == '-b':
         backup_dir = arg

   # Reap the diff child processes started by getdiff().
   signal.signal(signal.SIGCHLD, catch)

   for url in args:
      fn = os.path.join(backup_dir, md5.new(url).hexdigest())
      # Checked before fetching, as the fetch may take a while.
      known = os.path.exists(fn)
      page = _render_page(url)

      if not known:
         _write_file(fn, page)
         newfile(url)
         continue

      if page == _read_file(fn):
         nodiff(url)
         continue

      # The texts differ byte-wise, but diff -w may still consider them
      # equal (whitespace-only changes); in that case nothing is updated.
      diffs = getdiff(fn, page)
      if not diffs:
         nodiff(url)
         continue

      backup = fn + '.bkp'
      if os.path.exists(backup):
         os.remove(backup)
      os.rename(fn, backup)
      _write_file(fn, page)
      hasdiff(url, diffs)


# Run the checker only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
   main()