#!/usr/local/bin/perl -w
# 
# thm2htm v0.1, 11/19/98, Harry Plantinga.  This program may be copied
# under the terms of the Artistic License.
#
# This script is a quick first attempt at ThML to HTML conversion.
#   (Probably this will be done eventually with an XSL stylesheet.)
# Footnotes currently only display in Internet Explorer -- place
#   the cursor over the *
#
# v0.2, 11/25/98. This version makes a number of unspecified improvements.
# The program works pretty well for Watts.thm, but it has not been tested
# very well yet.
#
# Bugs: doesn't handle manually-inserted IDs correctly. Doesn't even 
# handle the special IDs that identify inserts to avoid duplate IDs.
#
# Eventual hopes: 
# - Generate a preferences panel to set things such as font size, 
#   scripture translation preference, etc. in cookies. 
# - Collapsable table of contents.  
# - Optional footnote display along right side. 
# - Navigation panel showing context in separate frame. 
# - Left and right arrow keys page backward and forward. 
# - Etc. etc. etc.
#
use strict;
my ($bookID, $author, $authorID, $title, $input, @id, $head, $body);
my ($prev, $divname, $notenum, $footnotes);

while (<>) 			#read entire file into $input
  { $input .= $_; }

$input =~ s|<deleted.*?</deleted>||gs;	#delete deleted stuff
$_ = $input;

&getInfo;			#get author, title, etc out of header
mkdir ("$bookID",0774) unless -e "$bookID"; #make directory for html

# fix up references to  other sections of same document
$input =~ s|(href=")#(.*?)(\.p.*?")|$1$2.htm#$2$3|gs;

# separate out head and body
$input =~ m|^(.*</ThML.head>).*?(<ThML.body>.*)|s;
$head = $1; $body = $2;

&processHead($head);	#process ThML.head -- make info page
&processBody($body);	#process ThML.body -- make web

print "HTML starts at $bookID/TitlePage.htm\n";


#---------------------subroutines-----------------
#
# this subroutine gets some important info: 
# title, author, bookID, authorID
#
sub getInfo
{
  m|<title>(.*)</title>|is; $title = $1;
  m|<author>(.*)</author>|is; $author = $1;
  m|<bookID>(.*)</bookID>|is; $bookID = $1;
  m|<authorID>(.*)</authorID>|is; $authorID = $1;
  print "Processing $title by $author\n";
}
  

#
#  Make an info page out of the ThML.head information
#
sub processHead
{
  $_ = shift;

  #output the stylesheet which is common for all sections of this doc
  my $name=">$bookID/styles.css";
  open STYLES, $name or die $!;
  my $styles="";
  $styles = $1 if m|<STYLE.*?>(.*?)</STYLE>|is;
  print STYLES $styles;
  close STYLES;

  my $front = head("About $title");
  $front .= "<H1><A class=\"TOC\" href=\"TOC.htm\">$title</A></H1>\n";
  $front .= "<H3>by</H3>\n";
  $front .= "<H2>$author</H2>\n<HR>\n";
  my $back = "</BODY>\n</HTML>\n";

  s|<(.*?)></\1>\s*||gm;  #delete empty <x></x> tags
  retag("!DOCTYPE",	"_detag");
  retag("ThML",	"_detag");
  retag("ThML.head",	"_detag");
  retag("meta",	"_detag");
  retag("MARCtagged",	"_delete");

  #
  # process each of generalInfo, printSourceInfo, elecEdInfo separately
  #
  s@<(generalInfo|printSourceInfo|electronicEdInfo)>(.*?)</\1>@headsect($1,$2)@egs;

  $name=">$bookID/About.htm";
  open INFO, $name or die $!;
  print INFO $front . $_ . $back;
  close INFO;
}


# 
#  Put the Table of Contents and each <divn>...</divn> into a 
#  separate file.
#
sub processBody
{
  $_ = shift;
  my $divname = "_none";
  open TOC, ">$bookID/TOC.htm" or die $!;
  print TOC head("$title - TOC");
  print TOC "<H1><A class=\"TOC\" href=\"About.htm\">$title</A></H1>\n";
  print TOC "<H3>by</H3>\n";
  print TOC "<H2>$author</H2>\n";
  print TOC "<HR><H1>Table of Contents</H1>\n";
  
  #
  # For each divn tag we add a TOC entry, then put the contents
  # into a separate file.
  #
  my ($oldlevel, $level, $rest, $content, $r, $n);
  $level="0";
  while (m|<div[1-7]|)		# while there is a remaining <div
  {
    $oldlevel = $level;
    $r = m|<div.*?<div|s;
    if ($r eq "1")		#find and process the div elements
    { 
      s|.*?<div([1-7])(.*?)>(.*?)<div|<div|s; 
      $level = $1;
      $rest = $2;
      $content = $3;
    } 
    else
    { 
      s|.*?<div([1-7])(.*?)>||s; 
      $level = $1;
      $rest = $2;
      $content = $_;
    }
    $n = "" if $level ne $oldlevel;

    my ($divtitle, $section, $ntitle, $next, $nsect);
    $prev=$divname;
#   print "-->Processing $divname\n";
    ($divtitle, $divname, $section) = getName($level, $rest, $n);
    
    $next = "_none";
    ($ntitle, $next, $nsect) = getName($1, $2, $n) if m|.*?<div([1-7])(.*?)>|s; 

    print TOC "<P class=\"TOC$level\"><A class=\"TOC\" href=\"$divname\">";
    print TOC "$divtitle</A></P>\n";

#   print "In processBody--about to call processDiv prev=$prev next=$next\n";
    open OF, ">$bookID/".$divname or die $!;
    print OF processDiv($content,$section,$prev,$next);
    close OF;

  }
  
  print TOC "\n<HR>\n<B>Also Available:</B>\n";
  print TOC "<UL><LI><A href=\"About.htm\">About <I>$title</I></A>\n";
  print TOC "</UL></BODY></HTML>\n";
  close TOC;
} 


# 
# this function takes info from a <div> tag and returns two things:
# the current section (e.g. .1.3.2) and the filename (s.1.3.2.htm)
#
# warning: side-effect: modify global $id[] array of current context
# 
sub getName
{
  my $level = shift;
  my $rest = shift;
  my $old_n = shift;
  my ($sect, $i, $divtitle, $type, $n);
  $divtitle = $type = $n = "";

  $divtitle = $1 if $rest =~ m|title\s*=\s*"(.*?)"|s;	#find title
  $type = $1 if $rest =~ m|type\s*=\s*"(.*?)"|s;	#find type
  $n = $1 if $rest =~ m|n\s*=\s*"(.*?)"|s;		#find n
  
  $n = $old_n++ if $old_n and not $n;	#if there's an old n, inc it
  if ($n)			        #each level id is n if it exists 
    { $id[$level] = $n; }
  else
  { 
    $id[$level] = $divtitle;                #if no n, use divtitle as ID
    $id[$level] =~ s| |_|g;	            #change space to '_'
#   $id[$level] =~ s|[^_0-9a-zA-Z]+||g;     #delete chars not in list
    $id[$level] =~ s|[^0-9a-zA-Z]+||g;      #delete chars not in list
    $id[$level] =~ s|(.{1,12}).*|$1|;       #limit to 12 characters
  }

  $divtitle = "$type $n: " . $divtitle if $type and $n;
  $sect = "";
  for ($i="1"; $i le $level; $i++)            #add id for each level
    { $sect .= ".".$id[$i]; }
  $sect =~ s|^\.||;
  my $divname = "$sect.htm";

# print "Getname: divtitle=$divtitle divname=$divname sect=$sect\n";
  return ($divtitle,$divname,$sect);
}


sub processDiv
{
  my $div = shift; 
  my $sect = shift;
  my $prev = shift;
  my $next = shift;
  $notenum = 1;
  $footnotes = "";
# $sect =~ s/.(.*)/$1/s;		#delete initial #

  my $front = head("$title $sect");	#construct HTML head
  my $back = "</BODY></HTML>\n";
  my $nav = navbar($prev, $next);

  $div =~ s|(<note.*?>.*?</note>)|&note($1,$notenum)|gsie;
# $div =~ s|<note.*?>(.*?)</note>|<SPAN class="Note" title='$1'>*</SPAN>|gs;
					#change notes to title="..."
# $div =~ s|(title='[^']*?<[^']*?>[^']*?')|&detag($1)|gse;
					#detag inside title="..."
  $div =~ s|(<note.*?>.*?</note>)|&note($1,$notenum)|gsie;

  $div =~ s|\&line;|<BR>|g;		#change &line; to <BR>
  $div =~ s|(<P.*?>)(</P>)|$1&nbsp;$2|gs;#add space to blank paragraphs
  retag('attr',	'P class="Attribution"', "P");
  retag('argument', 'P class="Argument"', "P");
  retag('meter', 'P class="meter"', "P");
  retag('sectionInfo', 'P class="sectionInfo"', 'P');
  retag('name',	'SPAN class="Name"', 'SPAN');
  retag('date',	'SPAN class="Date"', 'SPAN');
  retag('unclear',	'SPAN class="unclear"', 'SPAN');
  retag('l3',	'P class="Line3"', 'P');
  retag('l2',	'P class="Line2"', 'P');
  retag('l',	'P class="Line1"', 'P');
  retag('verse', 'P class="Verse"', 'P');

  #now for something really nasty: lists were generated as 
  #  <LI><UL>: the <LI> is required in valid HTML4.
  #But it looks terrible, with blank lines where they're not wanted.
  #This hack deletes that extra <LI>, resulting in invalid (but better?)
  #HTML.

  $div =~ s/<LI>(<UL class="Index)/$1/g;

  return "$front$nav\n$div\n$footnotes$nav$back";
}


sub note
{
  my $note=shift;
# print "Processing footnote $note -- number $notenum\n";
  $footnotes = "\n<HR class=\"Note\">\n" if $footnotes eq "";

  $note =~ s|</?note[^>]*>||g;
  my $bref="<A class=\"Note\" name=\"_fnf$notenum\" " .
     "href=\"#_fnb$notenum\"><SUP class=\"Note\">$notenum</SUP></A>";
  $note =~ s|^(<P.*?>)|$1$bref |;
  $note .= "\n";
# print "After processing: Footnote $note\n";
  $footnotes .= $note;

  my $fref="<A class=\"Note\" name=\"_fnb$notenum\" " .
     "href=\"#_fnf$notenum\"><SUP class=\"Note\">". 
     $notenum++ . "</SUP></A>";

  return $fref; 
}

#------------tag-hack subroutines-----------------


#
# delete all tags in parameter
#
sub detag
{
  my $in = shift @_;
  $in =~ s|<.*?>||gs;
  return $in;
}

# change one tag to another
sub retag
{
  my $tagname = shift;
  my $tagrep = shift;
  my $tagend = shift;

  if ($tagrep eq "_detag")
  {
    s|</?$tagname\b.*?>||gs;
  }
  elsif ($tagrep eq "_delete")
  {
    s|<$tagname\b.*?>.*?</$tagname>||gs;
  }
  else
  {
    s|<$tagname(\b.*?)>|<$tagrep$1>|gs;
    s|</$tagname(\b.*?)>|</$tagend$1>|gs;
  }
}


#
# Process a head section: generalInfo, printSourceInfo, or electronicEdInfo
# First parameter is tag; second is contents of element
#
sub headsect
{
  my $section = shift;
  my $contents = shift;
  my $result = "<H3>Information on the ";
  $result .= "Book" if $section eq "generalInfo";
  $result .= "Print Source" if $section eq "printSourceInfo";
  $result .= "Electronic Edition" if $section eq "electronicEdInfo";
  $result .= "</H3>\n";
  $contents =~ s|<(.*?)>(.*?)</\1>|headItem($1,$2)|gse;
  $result .= $contents . "\n\n";
  return $result;
}


#
#  Process one header element inside generalInfo, etc:
#  Modify element names to make nice titles: 
#    printSourceInfo --> Print Source Info 
#  Use <PRE> style formatting if necessary
#
sub headItem
{
  my $tag = shift;
  my $item = shift;
  $tag =~ s|([A-Z]{2})([a-z])|$1 $2|g;  #space after acronym
  $tag =~ s|([a-z])([A-Z])|$1 $2|g;	#add in spaces
  $tag = ucfirst($tag);			#make first letter upper case

  #really we should use a normal paragraph with white-space: pre in the
  #stylesheet, but IE 4 doesn't handle this yet.
  return "<PRE class=\"HeadItem\"><B>$tag:</B> $item</PRE>";
}


#
#given a title, return head of document, through <BODY>.
#
sub head
{
  my $title = shift;
  my $h = "<HTML><HEAD>\n";
  $h .= "<TITLE>$title</TITLE>\n";
  $h .= '<LINK rel="stylesheet" type="text/css" href="/css/ThML098.css">';
  $h .= "\n<STYLE>\n<!--\n  \@import url(styles.css);\n-->\n</STYLE>\n";
  $h .= "\n</HEAD><BODY>\n\n";

  return $h;
}


#
#build a nav bar
#
sub navbar
{
  my $prev = shift;
  my $next = shift;
# print "In navbar: prev=$prev next=$next\n";

  my $nav  = "<P class=\"Center\">";
  $nav .= "<A href=\"$prev\">"
       .  "<IMG src=\"/pix/mroonppv.gif\" alt=\"BACK\" "
       .  "BORDER=\"NO\"></A>" if $prev ne "_none";
  $nav .= "<A href=\"TOC.htm\">"
       .  "<IMG src=\"/pix/mroontoc.gif\" alt=\"UP\" BORDER=\"NO\"></A>";
  $nav .= "<A href=\"$next\">"
       .  "<IMG src=\"/pix/mroonpnx.gif\" alt=\"NEXT\" "
       . "BORDER=\"NO\"></A>" if $next ne "_none";
  $nav .= "</P>\n";
  return $nav;
}
