#!/bin/env python

# -----------------------------------------------------------------------------
#
#	Syntax: html2ps html_file > postscript_file
#
# Convert a single HTML file to Postscript with pages numbered and
# page number references added for all links.
#
# Unfortunately Netscape doesn't do the page numbering and referencing,
# so this script produces new HTML with page references and uses Netscape
# print to produce postscript and then modifies the postscript to put
# page numbers at the bottom of pages.
#
# Also unfortunate is the fact that Netscape won't print to a file
# non-interactively (as far as I can determine).  So you have to hand
# operate Netscape to print a few times for this html to postscript
# conversion to be carried out.  You'll be prompted to do the printing
# to files with Netscape when this script is run.
#
# Summary of the conversion process:
#
#	file.html
#		-> no_page_refs.ps		# Use Netscape by hand
#		-> with_page_refs.html		# Python code adds page refs
#		-> with_page_refs.ps		# Use Netscape by hand
#		-> file.ps			# Python code adds page footers
#
# Adding the page number references could shift the page location of anchors
# making the page numbers wrong.  The user is asked if they want to do another
# iteration to catch this.
#

import os
import string
import sys

class Cancelled(Exception):
  """Raised to abandon the conversion when a print step is not completed."""

# -----------------------------------------------------------------------------
#
def html2ps():
  """Run the interactive HTML to Postscript conversion.

  Expects exactly one command line argument, the HTML file to convert,
  and writes the final Postscript (with page number footers) to stdout.
  With any other argument count a usage message is written to stderr and
  the process exits with status 1.

  The scratch files no_page_refs.ps, with_page_refs.html and
  with_page_refs.ps are used in the current directory and are removed
  again before this function is left, whether the conversion succeeded,
  was cancelled or failed part-way through.
  """

  if len(sys.argv) != 2:
    sys.stderr.write('Syntax: html2ps html_file > postscript_file\n')
    sys.exit(1)

  html = sys.argv[1]
  psnoref = 'no_page_refs.ps'
  htmlref = 'with_page_refs.html'
  psref = 'with_page_refs.ps'
  psfile = sys.stdout

  # The cleanup lives in a finally clause so the scratch files are also
  # removed when a step raises something other than Cancelled (for
  # example SystemExit from a failed open).  try/finally is nested
  # around try/except so the code stays valid on Pythons older than 2.5.
  try:
    try:
      if not print_postscript(html, psnoref): raise Cancelled
      add_html_page_references(html, psnoref, htmlref)
      if not print_postscript(htmlref, psref): raise Cancelled
      if second_pass():
        # Inserting the references may have moved anchors to other
        # pages, so recompute them from the latest Postscript.
        add_html_page_references(html, psref, htmlref)
        if not print_postscript(htmlref, psref): raise Cancelled
    except Cancelled:
      sys.stderr.write('html2ps: Conversion cancelled\n')
    else:
      add_page_numbers(psref, psfile)
  finally:
    for scratch in (psnoref, htmlref, psref):
      if os.path.exists(scratch):
        os.remove(scratch)

# -----------------------------------------------------------------------------
# Ask the user to print some html to a file using netscape and wait until
# they confirm that they have done it.
#
def print_postscript(html_path, ps_path):
  """Prompt the user to print html_path to ps_path using Netscape.

  The request is written to stderr and the function then blocks until a
  line has been read from stdin.

  Returns 1 once a line has been entered, or 0 if stdin reaches end of
  file first, so that a caller testing the result can treat a closed
  stdin as a cancellation instead of getting an EOFError traceback.
  """

  request = ('Please print "%s" to file "%s"\n' % (html_path, ps_path) +
             'using netscape, then press return.')
  sys.stderr.write(request)

  # raw_input() was renamed to input() in Python 3.
  try:
    read_line = raw_input
  except NameError:
    read_line = input

  try:
    read_line()
  except EOFError:
    return 0

  return 1

# -----------------------------------------------------------------------------
# Ask the user if they want a second pass of page locating.
#
def second_pass():
  """Ask on stderr whether a second page-locating pass should be run.

  Reads one line from stdin.  Returns a true value when the answer,
  ignoring surrounding whitespace and letter case, is 'y' or 'yes'.
  Any other answer, or end of file on stdin, gives a false value.
  """

  request = ('Would you like to do a second pass\n' +
             'correcting page numbers for links using\n' +
             'the altered layout produced by the\n' +
             'first pass insertion of page numbers? (y/n) ')
  sys.stderr.write(request)

  # raw_input() was renamed to input() in Python 3.
  try:
    read_line = raw_input
  except NameError:
    read_line = input

  try:
    answer = read_line()
  except EOFError:
    # No answer available: behave as if the user declined.
    return 0

  return answer.strip().lower() in ('y', 'yes')
  
# -----------------------------------------------------------------------------
# Add text like [p 5] following every link to indicate which page of the
# postscript file the anchor is found on.  Output a new html file.
#
def add_html_page_references(html_path, postscript_path, output_path):
  """Write a copy of an HTML file with page references added to links.

  html_path        -- HTML file to read.
  postscript_path  -- Postscript produced by printing that HTML; used
                      to work out which page each named anchor is on.
  output_path      -- file to write the annotated HTML to.

  Each file is closed again even if reading or processing it raises.
  """

  html_file = open_file(html_path, 'r')
  try:
    html = html_file.read()
  finally:
    html_file.close()

  ps_file = open_file(postscript_path, 'r')
  try:
    postscript = ps_file.read()
  finally:
    ps_file.close()

  table = anchor_page_table(html, postscript)

  output = open_file(output_path, 'w')
  try:
    add_page_references(html, table, output)
  finally:
    output.close()

# -----------------------------------------------------------------------------
#
def anchor_page_table(html, ps):
  """Return a dictionary mapping anchor names to Postscript page numbers.

  The HTML is scanned token by token.  Every word outside the <title>
  element that does not start with '&' is searched for in the
  Postscript text, starting from the position of the previous match, so
  that the Postscript position follows the HTML position.  A named
  anchor is first given the page of the current position; once the next
  word is located it is reassigned to the page of that word.

  Progress (page numbers and words that could not be found) is written
  to stderr.
  """

  sys.stderr.write('anchors on pages ')
  table = {}
  ignore_words = 0
  preceding_anchors = []      # Anchors seen since the last located word.
  ps_position = first_postscript_page(ps)
  token = next_html_token(html)
  while token:
    tag = token.tag
    if tag is None:
      word = token.word
      if (not ignore_words) and word and word[0] != '&':
        p = ps.find(word, ps_position)
        if p == -1:
          sys.stderr.write('\n%s not found in postscript\n' % word)
        else:
          ps_position = p
          if preceding_anchors:
            #
            # This code assures that anchors at the top of a page
            # are not assigned to the preceding page.
            #
            page = postscript_page_number(ps, ps_position)
            for name in preceding_anchors:
              table[name] = page
              sys.stderr.write('%d ' % page)
            preceding_anchors = []
    else:
      lowered = tag.lower()
      if lowered == 'a':
        if hasattr(token, 'name'):
          # Provisional page; corrected when the next word is located.
          page = postscript_page_number(ps, ps_position)
          table[token.name] = page
          preceding_anchors.append(token.name)
      elif lowered == 'title':
        # Words inside <title> are not searched for in the Postscript.
        ignore_words = 1
      elif lowered == '/title':
        ignore_words = 0
    token = next_html_token(html, token)
  sys.stderr.write('\n')
  return table

# -----------------------------------------------------------------------------
#
def add_page_references(html, page_table, output):
  """Copy html to output, adding ' [p N]' after links to known anchors.

  html        -- the HTML text.
  page_table  -- dictionary mapping anchor names to page numbers.
  output      -- object with a write() method receiving the new HTML.

  Only links whose href starts with '#' and whose anchor name is in
  page_table get a reference; it is written straight after the closing
  tag of the link.  Tag names are compared ignoring case, so '</A>'
  closes a link just like '</a>'.  The page numbers used are also
  written to stderr.
  """

  sys.stderr.write('references to pages ')
  page = None                 # Page of the link currently open, if known.
  token = next_html_token(html)
  while token:
    tag = token.tag
    if tag is not None:
      tag = tag.lower()
    if tag == 'a':
      # Slicing instead of indexing keeps an empty href from raising.
      href = getattr(token, 'href', '')
      if href[:1] == '#':
        page = page_table.get(href[1:])
      else:
        page = None
    output.write(token.text)
    if tag == '/a' and page:
      page_ref = ' [p ' + str(page) + ']'
      output.write(page_ref)
      sys.stderr.write(str(page) + ' ')
      page = None
    token = next_html_token(html, token)
  sys.stderr.write('\n')

# -----------------------------------------------------------------------------
#
def first_postscript_page(ps):
  """Return the offset just past the first '%%Page:' comment in ps.

  Raises ValueError if ps contains no '%%Page:' comment.
  """

  marker = '%%Page:'
  return ps.index(marker) + len(marker)

# -----------------------------------------------------------------------------
#
def postscript_page_number(ps, position):
  """Return the page number in force at offset position of ps.

  The number is the first field after the last '%%Page:' comment that
  lies entirely before position, converted to an integer.

  Raises ValueError if there is no such comment or the field is not a
  number.
  """

  marker = '%%Page:'
  # Search within bounds rather than slicing, so the text before and
  # after the comment is not copied on every call.
  start = ps.rindex(marker, 0, position)
  line_end = ps.find('\n', start)
  if line_end == -1:
    line_end = len(ps)
  page = ps[start:line_end].split(None, 2)[1]
  return int(page)

# -----------------------------------------------------------------------------
#
def next_html_token(html, token = None):
  """Return the token following token in html, or None at the end.

  With no token given the first token of html is returned.  A token
  covers any leading whitespace plus one of:

    - a tag '<...>', with tag details filled in by tag_token();
      a comment starting '<!--' runs to the next '-->' so that a '>'
      inside the comment does not end it early;
    - a single punctuation character (word is that character);
    - a run of characters up to the next whitespace or punctuation
      (word is that run).

  Trailing whitespace at the end of html gives a token with an empty
  word.  A '<' that is never closed by '>' is returned as a punctuation
  token.
  """

  if token:
    pos = token.position + len(token.text)
  else:
    pos = 0

  if pos >= len(html):
    return None

  f = skip_white(html, pos)

  punctuation = '<>()[]{}.,?!@#$%^*-+="\\'
  if f >= len(html):
    tok = html_token(pos, html[pos:f])
    tok.word = ''
    return tok

  # Locate the end of a tag; e stays -1 when html[f] does not start one.
  e = -1
  if html[f] == '<':
    if html[f:f+4] == '<!--':
      # Searching from f + 2 lets the degenerate '<!-->' count as closed.
      close = html.find('-->', f + 2)
      if close != -1:
        e = close + 3
    if e == -1:
      close = html.find('>', f)
      if close != -1:
        e = close + 1

  if e != -1:
    tok = html_token(pos, html[pos:e])
    tag_token(html[f:e], tok)
  elif html[f] in punctuation:
    tok = html_token(pos, html[pos:f+1])
    tok.word = html[f:f+1]
  else:
    e = skip_until(html, f, string.whitespace + punctuation)
    tok = html_token(pos, html[pos:e])
    tok.word = html[f:e]

  return tok


# -----------------------------------------------------------------------------
#
def skip_white(s, pos):
  """Return the index of the first non-whitespace character of s at or
  after pos, or len(s) if only whitespace remains.  pos is returned
  unchanged when it is already at or past the end of s."""

  end = len(s)
  while pos < end:
    if s[pos] not in string.whitespace:
      break
    pos = pos + 1
  return pos

# -----------------------------------------------------------------------------
#
def skip_nonwhite(s, pos):
  """Return the index of the first whitespace character of s at or
  after pos, or len(s) if there is none.  pos is returned unchanged
  when it is already at or past the end of s."""

  end = len(s)
  while pos < end:
    if s[pos] in string.whitespace:
      break
    pos = pos + 1
  return pos

# -----------------------------------------------------------------------------
#
def skip_until(s, pos, charset):
  """Return the index of the first character of s at or after pos that
  occurs in charset, or len(s) if there is none.  pos is returned
  unchanged when it is already at or past the end of s."""

  end = len(s)
  while pos < end:
    if s[pos] in charset:
      break
    pos = pos + 1
  return pos

# -----------------------------------------------------------------------------
#
def tag_token(text, token):
  """Record on token the details of the HTML tag given by text.

  text is the tag including its '<' and '>'.  For text starting '<!'
  token.tag becomes '!' and token.comment holds everything between
  '<!' and the final '>'.  Otherwise token.tag is the first word inside
  the brackets (with its '/' for a closing tag, case unchanged) and any
  attributes that follow are passed on to tag_attributes().  A tag with
  nothing inside the brackets gets the empty string as its tag.
  """

  if text[1:2] == '!':
    token.tag = '!'
    token.comment = text[2:-1]
    return

  tag_attrlist = text[1:-1].split(None, 1)
  if not tag_attrlist:
    # '<>' or '< >': still a tag token, so give it a non-None tag.
    token.tag = ''
    return

  token.tag = tag_attrlist[0]
  if len(tag_attrlist) > 1 and len(tag_attrlist[1]) > 0:
    tag_attributes(tag_attrlist[1], token)

# -----------------------------------------------------------------------------
#
def tag_attributes(attributes, token):
  """Store the attributes of an HTML tag as attributes of token.

  attributes is the text of the tag after the tag name.  Every
  name=value pair sets token.<name> to the value as a string.  Values
  may be enclosed in double or single quotes or be unquoted (running to
  the next whitespace), and whitespace is allowed around the '='.
  Attributes without a value are skipped.

  HTML attribute names are case-insensitive, so each value is stored
  under the name as written and also under its lower-case form.  Names
  that token uses for its own data (position, text, tag, word) are
  never overwritten.
  """

  reserved = ('position', 'text', 'tag', 'word')
  end = len(attributes)
  pos = 0
  while pos < end:

    # Skip whitespace before the attribute name.
    while pos < end and attributes[pos] in string.whitespace:
      pos = pos + 1
    if pos >= end:
      break

    # The name runs up to whitespace or '='.
    start = pos
    while (pos < end and attributes[pos] != '='
           and attributes[pos] not in string.whitespace):
      pos = pos + 1
    attr = attributes[start:pos]

    # A value is present only if the next non-blank character is '='.
    look = pos
    while look < end and attributes[look] in string.whitespace:
      look = look + 1
    if look >= end or attributes[look] != '=':
      continue

    pos = look + 1
    while pos < end and attributes[pos] in string.whitespace:
      pos = pos + 1

    if pos < end and attributes[pos] in ('"', "'"):
      quote = attributes[pos]
      f = pos + 1
      e = attributes.find(quote, f)
      if e == -1:
        # Unterminated quote: take the rest of the text as the value.
        e = end
      pos = e + 1
    else:
      f = pos
      e = f
      while e < end and attributes[e] not in string.whitespace:
        e = e + 1
      pos = e
    value = attributes[f:e]

    for key in (attr, attr.lower()):
      if key and key not in reserved:
        setattr(token, key, value)

# -----------------------------------------------------------------------------
#
class html_token:
  """One token of HTML text: a tag, a word or a punctuation character.

  position is the offset of the token in the HTML text and text is the
  text the token covers.
  """

  # Defaults shared by all tokens until an instance assigns its own.
  word = tag = None

  def __init__(self, position, text):
    """Remember where the token starts and the text it covers."""
    self.position, self.text = position, text

# -----------------------------------------------------------------------------
# Add page number footers to Netscape 3.01 postscript output
#
def add_page_numbers(postscript_path, output):
  """Copy a Postscript file to output, adding a page number footer.

  postscript_path  -- Postscript file to read.
  output           -- object with a write() method receiving the result.

  Every line is copied unchanged.  The number given on each '%%Page:'
  line is remembered, and after each '%%EndPageSetup' line a Postscript
  command is inserted that shows 'p N' for the remembered number.  The
  command uses the names f0 and cshow, which are expected to be defined
  by the Postscript being read.
  """

  ps = open_file(postscript_path, 'r')
  try:
    page = 0
    for line in ps.readlines():
      output.write(line)
      words = line.split(None, 2)
      if not words:
        continue
      if words[0] == '%%EndPageSetup':
        output.write('12 f0 252 50 moveto (p %d) cshow\n' % page)
      elif words[0] == '%%Page:' and len(words) > 1:
        # A bare '%%Page:' line carries no number; keep the previous one.
        page = int(words[1])
  finally:
    ps.close()

# -----------------------------------------------------------------------------
# Open file.  On error, print message and exit.
#
def open_file(path, mode):
  """Return path opened with the given mode.

  If the open fails with IOError, a message naming the file is written
  to stderr and the process exits with status 1.
  """

  try:
    handle = open(path, mode)
  except IOError:
    sys.stderr.write("html2ps: Couldn't open %s\n" % path)
    sys.exit(1)
  return handle

# -----------------------------------------------------------------------------
#
# Run the conversion as soon as this file is executed.  The call is
# unconditional, so importing the file also starts a conversion.
html2ps()
