#!/usr/local/bin/perl -w
#
# x2t v0.5, Harry Plantinga, Nov. 8, 1998. This program may be 
# distributed and used under the terms of the Artistic License.
# 
# v0.51 -- modifications to work with version 0.5 of rtf2xml. Most
# noticeable change is that transdoc tags are lower case.
#
# x2t: convert xml (transdoc dtd) to ThML (dtd of xml). This program
# generates legal ThML using CSS style="..." attributes for formatting, 
# but it still needs further processing:  
#   convert to XML
#   change style="" styles to class="" stylesheets
#   increment n= and href= in pb and div
#   adding IDs to elements
#
# Font sizes are expressed in percentages relative to 10pt, so that to
# change all font sizes in proportion it should be sufficient to change 
# the body font size.
#
# to do:
#   map known Greek, Heb fonts to unicode (SIL Ezra, SIL Galatia, etc.)
#
# ideas: use colors, bookmarks, page header and footer, hidden attribute?
#
use strict;
use Text::Wrap;
my $input;

while (<>) 
  { $input .= $_; }

my $added;

# Footnotes are embedded inside paragraphs and hence have different
# nesting properties compared to the rest of an RTF file.  So we will
# process footnotes first, then escape them. Process the rest of the 
# file, then unescape.
#print STDERR "Footnotes: ";
$input =~ s|(<FOOTNOTE>.*?</FOOTNOTE>)|&escape(&x2t($1))|gsie;
# print STDERR "\nRest: ";
$input = &x2t($input);			#convert tags to thml
$input =~ s|&less-than;|<|gs;
$input = &cleanup($input);

#$input = wrap("","",$input);		#fails in dos

print $input;
exit(0);

  
sub x2t
{
  $_ = shift;

  # changes for v. 0.5 of rtf2xml -- a quick fix.
  # --should we handle colors, hidden, etc. more intelligently?
# s|color=".*?"||g;			# ignore colors
# s|hidden=".*?"||g;			# ignore hidden attribute

  # change FIELD HYPERLINK elements into <A href= style links
  s|<FIELD><FLDINST> HYPERLINK "?(.*?)"? </FLDINST><FLDRSLT><STRING.*?>(.*?)</STRING></FLDRSLT></FIELD>|<A href="$1">$2</A>|gi;

# print STDERR "Retagging...";
# convert special rtf2xml tags
  retag('SECTION',	'_detag');
  retag('TRANSDOC',	'_detag');
  retag('FILE',	'_detag');
  retag('PNTEXT',	'_detag');
  retag('ROW',		'TR', 'TR');
  retag('TR',	'_delete_attributes');
  retag('CELL',	'TD', 'TD');
  retag('TD',	'_delete_attributes');
  retag('FOOTNOTE',	'note place="foot"', 'note');
  retag('!DOCTYPE',	'_detag');
  retag('\\?xml',	'_detag');
  retag('META',	'_delete');
  retag('table',	'TABLE', 'TABLE');
  retag('p',	'P', 'P');
  retag('bkmkend',	'_detag');	# delete bookmarks
  retag('bkmkstart',	'_detag');
  retag('page-break',	'_detag');	# delete page breaks
  retag('header',	'_delete');	# delete page header
  retag('footer',	'_delete');	# delete page footer

  #delete the initial entity declaration
  s|^.*ENTITY.*]>||si;
  #handle unicode -- convert to decimal escapes 
  s|<UNICHAR VALUE="x(.*?)".*?>|escapehex($1)|gie;
  #convert greek to decimal unicode escapes
  s|<STRING[^>]*CHARSET="161">(.*?)</STRING>|unigreek($1)|gsie;
  s|&tab;| |g;			#change tab entities to spaces
  s|<STRING[^>]*>&#183; </STRING>|&#8226; |gsi;	#bullet
  s|<TABLE>|<TABLE border="2">|gi;

# print STDERR "styles...";
  s|(<P[^>]+>)|cssstyles($1)|gsie;	#change paragraph style atts to CSS
  s|(<STRING[^>]+>)|cssstyles($1)|gsie;	#change character style atts to CSS

# change named character style (first parameter) to tags 
  chartag('Citation',	'CITE', 'CITE');
  chartag('Code',	'CODE', 'CODE');
  chartag('Comment',	'_comment');
  chartag('Default',	'_detag');	#is this a bug? need a span here?
  chartag('XML',	'_unescape_detag');
  chartag('Name',	'name',	'name');
  chartag('Unclear',	'unclear', 'unclear');

  # change remaining STRINGs to SPAN elements 
  #***bug here on nested strings.  need to handle recursively or parse through
  s|<STRING([^>]*?)CHARSTYNAME="(.*?)"(.*?)>(.*?)</STRING>|<SPAN class="$2" $1 $3>$4</SPAN>|gsi; 				      #named
  s|<STRING(.*?)>(.*?)</STRING>|<SPAN$1>$2</SPAN>|gsi; 	#unnamed

# change named paragraph styles (first parameter) to tags
  #do footnotes first since they're nested
  partag('footnote text','P class="Footnote"', 'P');
  partag('Attribution', 'attr', 'attr');
  partag('BlockQuote',	'BLOCKQUOTE><P', 'P></BLOCKQUOTE');
  partag('P_Continue',	'P class="Continue"', 'P');
  partag('P_First',	'P class="First"', 'P');
  partag('P_Resume',	'P class="Resume"', 'P');
  partag('HeaderInfo',	'_unescape_detag');
  partag('header',	'_delete');
  partag('heading 1',	'H1', 'H1');
  partag('heading 2',	'H2', 'H2');
  partag('heading 3',	'H3', 'H3');
  partag('heading 4',	'H4', 'H4');
  partag('HR',	'HR', '');
  partag('HR30',	   'HR class="W30"', '');
  partag('List',	   'P class="list1"', 'P');
  partag('List 2',	   'P class="list2"', 'P');
  partag('List 3',	   'P class="list3"', 'P');
  partag('List 4',	   'P class="list4"', 'P');
  partag('List Bullet',	   'P class="list1"', 'P');
  partag('List Bullet 2',  'P class="list2"', 'P');
  partag('List Bullet 3',  'P class="list3"', 'P');
  partag('List Bullet 4',  'P class="list4"', 'P');
  partag('List Number',	   'P class="list1"', 'P');
  partag('List Number 2',  'P class="list2"', 'P');
  partag('List Number 3',  'P class="list3"', 'P');
  partag('List Number 4',  'P class="list4"', 'P');
  partag('List Continue',  'P class="listCont1"', 'P');
  partag('List Continue 2','P class="listCont2"', 'P');
  partag('List Continue 3','P class="listCont3"', 'P');
  partag('List Continue 4','P class="listCont4"', 'P');
  partag('Term',	'term', 'term');
  partag('Definition',	'def', 'def');
  partag('SectionInfo',	'P class="sectionInfo"','P');
  partag('Verse',	'l', 'l');
  partag('Verse 2',	'l2', 'l2');
  partag('Verse 3',	'l3', 'l3');

  # change remaining paragraph styles named xxx to P class="xxx" 
  s|STYLENAME=|class=|gi;
  # clean up HRs -- delete styles
  s|<HR( class=".*?").*?>|<HR$1>|g;

# print STDERR "cleanup...";

  # change empty <ln></ln> elements to <verse> (should be a bit smarter?
  # this requires that every verse be preceeded by <ln></ln>.
  s|<(l[23]?)([^>]*)></\1>|<verse$2>|gs; #add <verse> around <l> tags

  s|(<[^>]*?)\s\s+([^>]*?>)|$1 $2|gs;		#collapse whitespace in tags
  s|="\s*([^"]*)\s*"|="$1"|g;			#rm initial, final att spaces
  s|<SPAN\s*>(.*?)</SPAN>|$1|gsi;		#delete useless spans
  s|style=" |style="|g;				#delete initial style spaces

  #eventually should handle other charsets ...
  s|CHARSET=".*?"||gi;

  return $_;
}

sub cleanup 
{
  $_ = shift;

  s|(.*?<ThML.body>)||s;			#clean head section
  my $head = $1;
  $head =~ s|</?SPAN.*?>||gsi;			#delete SPANs in head
  $head =~ s|<!--.*?-->||gsi;			#delete comments in head
  $_ = $head . $_;

  # delete <P> tags that only contain whitespace & e.g. div,deleted,added tags
  s@<P[^>]*>((\s*(<(!--|/?div|/?deleted|/?added|/?glossary|pb|scripParseOn|scripParseOff|scripContext|index|insertIndex)[^>]*?>)\s*)+)</\P>@$1@gs; 

  # delete anything between <ThML.body> and <div
  s|(<ThML.body>).*?(<div)|$1$2|s;

  s#<(P|H.)([^>]*?>)(</\1>)#<$1$2&nbsp;$3#gs;   #add &nbsp; in blank P or H?
  s|\n{3,100}|\n\n|g;				#compress blank lines
  return $_;
}

#---------------------subroutines-----------------

sub cssstyles
{
  my $tag = shift;
  my $styles = "";
  my $newsize = "";
  my $oldsize = "10";

# print STDERR "In parstyles: processing $tag\n";

  $styles .= "font-family: '$1'; " if $tag =~ s|FONTNAME="(.*?)"||i; 
  $oldsize = $1 if $tag =~ m|FONTSIZE="(.*?)"|i;
  $newsize = ($oldsize*50)/10;		#% of 10pt, eg. 12pt --> 120%
  $newsize =~ s/\..*//;
  $styles .= "font-size: ".$newsize . "%; " if $tag =~ s|FONTSIZE="(.*?)"||i; 
  $styles .= "font-weight: bold; " if $tag =~ s|BOLD="ON"||i; 
  $styles .= "font-weight: normal; " if $tag =~ s|BOLD="OFF"||i; 
  $styles .= "font-style: italic; " if $tag =~ s|ITALIC="ON"||i; 
  $styles .= "font-style: normal; " if $tag =~ s|ITALIC="OFF"||i; 
  $styles .= "text-align: left; " if $tag =~ s|ALIGN="LEFT"||i; 
  $styles .= "text-align: center; " if $tag =~ s|ALIGN="CENTER"||i; 
  $styles .= "text-align: right; " if $tag =~ s|ALIGN="RIGHT"||i; 
  $styles .= "text-align: justify; " if $tag =~ s|ALIGN="JUSTIFY"||i; 
  $styles .= "text-decoration: underline; " if $tag =~ s|UNDERLINE="ON"||i; 
  $styles .= "text-decoration: none; " if $tag =~ s|UNDERLINE="OFF"||i; 
  $styles .= "vertical-align: sub; " if $tag =~ s|SUBSCRIPT="ON"||i; 
  $styles .= "vertical-align: normal; " if $tag =~ s|SUBSCRIPT="OFF"||i; 
  $styles .= "vertical-align: super; " if $tag =~ s|SUPERSCRIPT="ON"||i; 
  $styles .= "vertical-align: normal; " if $tag =~ s|SUPERSCRIPT="OFF"||i; 
  $tag =~ s|hidden=".*?"||;	#ignore hidden attribute
  $tag =~ s|color=".*?"||;	#ignore color attribute
  $tag =~ s|>| style="$styles">| if $styles;

  return $tag;
}


# change the tag for a character style to something else:
# _delete --> delete start tag, end tag, and text between
# _unescape_delete --> unescape and delete tags
# _detag --> remove tags

sub chartag
{
  my $styname = shift;
  my $tagname = shift;
  my $tagend  = shift;

  if ($tagname eq "_unescape_detag") {
    s|<STRING CHARSTYNAME="$styname".*?>(.*?)</STRING>|&unescape($1)|gsei; }
  elsif ($tagname eq "_delete") {
    s|<STRING CHARSTYNAME="$styname".*?>.*?</STRING>||gsi; }
  elsif ($tagname eq "_detag") {
    s|<STRING CHARSTYNAME="$styname".*?>(.*?)</STRING>|$1|gsi; }
  elsif ($tagname eq "_comment") {
    s|<STRING CHARSTYNAME="$styname".*?>(.*?)</STRING>|<!-- $1 -->|gsi; }
  else {
    s|<STRING CHARSTYNAME="$styname"(.*?)>(.*?)</STRING>|<$tagname$1>$2</$tagend>|gsi; }
}


# handle a paragraph style:
# _unescape_detag: unescape and remove tags
# _detag: remove tags
# _delete: delete tags and text
# otherwise, switch to a new tag name

sub partag
{
  my $styname = shift;
  my $tagname = shift;
  my $tagend  = shift;

  if ($tagname eq "_unescape_detag") {
    s|<P STYLENAME="$styname".*?>(.*?)</P>|&unescape($1)|gsie; }
  elsif ($tagname eq "_delete") {
    s|<P STYLENAME="$styname".*?>.*?</P>||gsi; }
  elsif ($tagname eq "_detag") {
    s|<P STYLENAME="$styname".*?>(.*?)</P>|$1|gsi; }
  elsif ($tagend eq "") {
    s|<P STYLENAME="$styname"(.*?)>(.*?)</P>|<$tagname$1>$2|gsi; }
  else {
    s|<P STYLENAME="$styname"(.*?)>(.*?)</P>|<$tagname$1>$2</$tagend>|gsi; }
}


# change one tag to another
sub retag
{
  my $tagname = shift;
  my $tagrep  = shift;
  my $tagend  = shift;

  if ($tagrep eq "_detag")
    { s|</?$tagname.*?>||gsi; }
  elsif ($tagrep eq "_delete")
  {
    s|<$tagname.*?>.*?</$tagname>||gsi;
    s|<$tagname.*?>||gsi;
  }
  elsif ($tagrep eq "_delete_attributes")
  {
    s|(<$tagname).*?(/?>)|$1$2|gi;
  }
  else
  {
    s|<$tagname(.*?)>|<$tagrep$1>|gsi;
    s|</$tagname(.*?)>|</$tagend$1>|gsi;
  }
}


#escape notes so they don't get processed later
sub escape 
{
  my $stuff = shift;
  $stuff =~ s/</\&less-than;/g;
  return $stuff;
}


#change &lt; to <, &gt; to >, &amp; to &
sub unescape
{
  my $stuff = shift;
  $stuff =~ s/\&lt;/</g;
  $stuff =~ s/\&gt;/>/g;
  $stuff =~ s/\&amp;/\&/g;
  return $stuff;
}


# convert a hex unicode code to a decimal escape
sub escapehex
{
  my $code = shift;
  my $dec = hex $code;
  return "&#$dec;";
}

#convert greek (charset=161) to decimal unicode escapes
sub unigreek
{
  my $gk=shift;
# print STDERR "Greek passage: $gk\n";
  $gk =~ s|(\d+)|$1+720|ge;
# print STDERR "...converted to $gk\n";
  return $gk;
}

