#!/usr/bin/env python

# -----------------------------------------------------------------------------
#
#	Syntax: htmlcat htmlfile1 htmlfile2 ... > bightml
#
# Concatenate several html files eliminating the pre-body header information
# from all but the first.  All file anchors are prefixed with the file name
# and intra/inter-file references are changed to use the new anchor names.
# This is primarily to make a printable version of a multi-file html
# document using html2ps.
#

import re
import string
import sys

class MissingBody(Exception):
  """Raised by disect_html when no opening or closing body tag is found."""

# -----------------------------------------------------------------------------
#
def htmlcat():
  """Concatenate the html files named in sys.argv[1:] onto stdout.

  The prologue (text up to and including the opening body tag) and the
  epilogue (text from the closing body tag on) are taken from the first
  file.  Between them the body of every file is written, in command-line
  order, with its references and anchors rewritten by fix_references and
  fix_anchors.

  Exits with status 1 if no file is named, or if the first file cannot be
  opened or has no body tags.  A later file with either problem is reported
  on stderr and skipped.
  """

  if len(sys.argv) < 2:
    sys.stderr.write('Syntax: htmlcat htmlfile1 htmlfile2 ... > bightml\n')
    sys.exit(1)

  html_paths = sys.argv[1:]
  first_path = html_paths[0]
  try:
    prologue, first_body, epilogue = disect_html(first_path)
  except IOError:
    sys.stderr.write("htmlcat: Couldn't open %s\n" % first_path)
    sys.exit(1)
  except MissingBody:
    sys.stderr.write("htmlcat: %s missing <body> </body> tags\n" % first_path)
    sys.exit(1)

  sys.stdout.write(prologue)
  for index, html_path in enumerate(html_paths):
    if index == 0:
      # The first file is already parsed above; reuse its body instead of
      # opening and scanning the file a second time.
      body = first_body
    else:
      try:
        body = disect_html(html_path)[1]
      except IOError:
        sys.stderr.write("htmlcat: Couldn't open %s\n" % html_path)
        continue
      except MissingBody:
        sys.stderr.write("htmlcat: %s missing <body> </body> tags\n" % html_path)
        continue
    # References are rewritten before anchors, as both passes edit the
    # same text.
    fixed_ref = fix_references(body, html_path, html_paths)
    sys.stdout.write(fix_anchors(fixed_ref, html_path))
  sys.stdout.write(epilogue)

# -----------------------------------------------------------------------------
# Open an html file and separate it into prologue, body, and epilogue
# The prologue ends with the <body> tag and the epilogue begins with
# the </body> tag.
#
def disect_html(html_path):
  """Read an html file and split it into (prologue, body, epilogue).

  The prologue runs from the start of the file through the end of the
  opening body tag, the epilogue runs from the start of the closing body
  tag to the end of the file, and the body is the text in between.

  The opening tag is matched in any letter case and may carry attributes
  (e.g. <body bgcolor="white">).  The closing tag is only looked for after
  the opening tag, so the three pieces never overlap.

  Raises IOError if the file cannot be opened and MissingBody if either
  tag is not found.
  """

  # The with-statement closes the file even if read() raises.
  with open(html_path, 'r') as f:
    text = f.read()

  body_start = re.search(r'<body\b[^>]*>', text, re.IGNORECASE)
  if body_start is None:
    raise MissingBody
  start = body_start.end()

  body_end = re.compile(r'</body\s*>', re.IGNORECASE).search(text, start)
  if body_end is None:
    raise MissingBody
  end = body_end.start()

  return text[:start], text[start:end], text[end:]

# -----------------------------------------------------------------------------
# Replace inter-file references with intra-file references.
# References like "path#anchor" become "#path.anchor" and
# "path" becomes "#path"
#
# Maybe easier to do this with the Python html parser in htmllib
#
def fix_references(html, path, paths):
  """Return html with every <a href="..."> target rewritten.

  Each target found by first_html_reference is replaced by the result of
  new_reference_name(target, path, paths); all other text is copied
  through unchanged.
  """

  pieces = []
  cursor = 0
  while True:
    span = first_html_reference(html, cursor)
    if not span:
      break
    ref_start, ref_end = span
    # Copy the untouched text before the target, then the rewritten target.
    pieces.append(html[cursor:ref_start])
    pieces.append(new_reference_name(html[ref_start:ref_end], path, paths))
    cursor = ref_end
  pieces.append(html[cursor:])

  return ''.join(pieces)
  
# -----------------------------------------------------------------------------
#
def new_reference_name(refname, path, paths):
  """Map an href target onto its anchor name in the merged document.

  refname is the href target, path is the file the reference appears in,
  and paths is the collection of all files being merged.

    "file"         -> "#file"          if file is in paths
    "#anchor"      -> "#path.anchor"
    "file#anchor"  -> "#file.anchor"   if file is in paths

  Any other target (a file not in paths, or a name containing more than
  one '#') is returned unchanged.
  """

  # Use the str method: string.split() no longer exists in Python 3.
  fields = refname.split('#')
  new_refname = refname
  if len(fields) == 1:
    if fields[0] in paths:
      new_refname = '#' + fields[0]
  elif len(fields) == 2:
    if fields[0] == '':
      # Reference to an anchor within the same file.
      new_refname = '#' + path + '.' + fields[1]
    elif fields[0] in paths:
      new_refname = '#' + fields[0] + '.' + fields[1]
  return new_refname
  
# -----------------------------------------------------------------------------
# Find first occurrence of <a href="name" and return index range for name.
#
def first_html_reference(html, start):
  """Locate the next <a href="name" at or after index start.

  Returns the (begin, end) index range of name, without the quotes, or
  None when no such tag is found.
  """

  href_tag = re.compile(r'<((a)|(A))\s+((href)|(HREF))="[^"]+"').search(html, start)
  if href_tag is not None:
    tag_start, tag_end = href_tag.span()
    # Narrow the matched tag down to the text between its quotes.
    return quoted_span(html, tag_start, tag_end)
  return None
  
# -----------------------------------------------------------------------------
# Return index range of first quoted substring.
# Returned range does not include quote characters.
#
def quoted_span(string, start, end):
  """Find the first non-empty double-quoted substring in string[start:end].

  Returns the (begin, end) index range of the text between the quotes,
  excluding the quote characters, or None if there is no such substring.
  """

  match = re.compile(r'"[^"]+"').search(string, start, end)
  if match is None:
    return None
  open_quote, past_close_quote = match.span()
  return (open_quote + 1, past_close_quote - 1)

# -----------------------------------------------------------------------------
# Replace anchors by prefixing them with the path name.  So <a name="anchor"
# becomes <a name="path.anchor".  This makes it so anchors from merged
# html files don't conflict.
#
def fix_anchors(html, path):
  """Return html with every <a name="anchor" renamed to "path.anchor".

  The result starts with an extra empty anchor named path, followed by the
  html text with each anchor name found by first_html_anchor prefixed by
  path and a dot.
  """

  pieces = ['<a name="%s"></a>' % path]
  cursor = 0
  while True:
    span = first_html_anchor(html, cursor)
    if not span:
      break
    name_start, name_end = span
    # Copy the untouched text before the anchor name, then the new name.
    pieces.append(html[cursor:name_start])
    pieces.append(path + '.' + html[name_start:name_end])
    cursor = name_end
  pieces.append(html[cursor:])

  return ''.join(pieces)
  
# -----------------------------------------------------------------------------
# Find first occurrence of <a name="anchor" and return index range for anchor.
#
def first_html_anchor(html, start):
  """Locate the next <a name="anchor" at or after index start.

  Returns the (begin, end) index range of anchor, without the quotes, or
  None when no such tag is found.
  """

  name_tag = re.compile(r'<((a)|(A))\s+((name)|(NAME))="[^"]+"').search(html, start)
  if name_tag is not None:
    tag_start, tag_end = name_tag.span()
    # Narrow the matched tag down to the text between its quotes.
    return quoted_span(html, tag_start, tag_end)
  return None
  
# -----------------------------------------------------------------------------
#
# Run the tool.  The call is unguarded, so it also runs if this file is
# imported as a module.
htmlcat()
