#!/usr/local/bin/perl -w
#
# identify v0.5, Harry Plantinga, Nov. 8, 1998. This program may be 
# distributed and used under the terms of the Artistic License.
#
# v0.52.  Nov. 25. Corrected unique ID code.
# v0.51.  Nov. 20.  
#   - Ensure that IDs created are unique. 
#   - Don't add ids to tags that already have them. 
#   - Write output directly, to decrease memory requirements.
# 
# identify: add ids to all body elements. ids are based on divs:
# they have the form nn.nn.nn.pmm.ee, where the nns are the 
# numbers (or first 12 characters of the name) of the containing
# div1, div2, and div3 (etc.).  The mm is the number of the 
# paragraph in the current div, and the ee is the number of the
# element in the current paragraph.
#
# bugs:  it is SLOW!!! 
#   It really ought to read through the file and put any IDs it
#   finds into %ids_uses, so they can't be reused.
#
use strict;
my ($input, @id, %ids_used);

while (<>) 
  { $input .= $_; }

  $_ = $input;

  #first, change n attribute of PB tags to Page_n id
  s|(<pb.*?n=")(.*?)(".*?)\s*/>|$1$2$3 id="Page_$2"/>|gs;

  &id();			#add ids to tags
# $input = &id($input);		#add ids to tags
# $input =~ s|\s+id=| id=|g;	#clean up ids a bit

# print $input;		#maybe it will run faster if &id outputs directly

  exit(0);

  

#------------ add tag ids --------------
# 
# This subroutine adds an id attribute to each element after the
# first <div1>. The id is of the form xxx.xxx.xxx.pyy.zz, where
# each xxx identifies a div, yy is the paragraph number in the
# division, and zz is the element number.
#


sub id
{
# $_ = shift;
  my $p = "0";
  my $t = "0";
  my ($level, $oldlevel, $stuff, $tag);
  my $sect = "";

  s|(.*?)<div|<div|s;
# my $output = $1;			#don't mess with header
  print $1;
  
  while (s|^([^<]*?)(<.*?>)||s) {
#   $output .= $1;
    print $1;
    $tag = $2;
    if ($tag =~ m|id="|) {		#already had an id?
      print $tag;
      next;
    }

    if ($tag =~ m/(<!--.*?-->|<\/.*?>)/s) { 	#skip comments, end tags
#     $output .= $1; 
      print $1;
      }
    elsif ($tag =~ m|<div([1-7])(.*?)>|)
    {
      $oldlevel = $level;
      $level = $1;			#new level
      $stuff = $2;			#attributes of div tag
      $p = "0";				#reset paragraph counter to 0
      $t = "0";				#reset element counter to 0
#     print STDERR "Found div: <div$1$2>";
      $sect = getName($level, $stuff);	#get section name
#     print STDERR "$sect\t";
      if ($ids_used{$sect}){
        print STDERR "HEY -- section $sect was already used!\n";
        $sect .= "_1";
        while ($ids_used{$sect}) {
          $sect++; }
        print STDERR "    -- let's use $sect instead.\n";
      }
      $ids_used{$sect} = 1;
      $tag =~ s|(^<div[1-7].*?)>|$1 id="$sect">|s;  #add section name as id
#     $output .= "$tag";
      print $tag;
    }
    elsif ($tag =~ m|(<P.*?)>|s)
    {
      $p++;				#increment paragraph counter
      $t = "0";				#reset tag counter to 0
#     print STDERR "1=$1 sect=$sect p=$p\n";
      my $id = "$sect.p$p";
      if ($ids_used{$id}) {
        print STDERR "HEY -- $id was already used!\n" if $ids_used{$id};
        $id .= "_1";
        while ($ids_used{$id}) { 
          $id++; }
        print STDERR "    -- let's use $id instead.\n";
      }
      $ids_used{$id} = 1;
#     $output .= "$1 id=\"$id\">";
      print "$1 id=\"$id\">";
    }
    elsif ($tag =~ m|(<.*?)(/?>)|s)	#any other tag, possibly <  />
    {
      $t++;				#increment tag counter
      my $id = "$sect.p$p.$t";
#     print STDERR "Assigning ID $id\n";
      if ($ids_used{$id}) {
        print STDERR "HEY -- $id was already used!\n" if $ids_used{$id};
        $id .= "_1";
        while ($ids_used{$id}) { 
          $id++; }
        print STDERR "    -- let's use $id instead.\n";
      }
      $ids_used{$id} = 1;
#     $output .= "$1 id=\"$id\"$2";
      print "$1 id=\"$id\"$2";
    }
  }

  print $_;
# $output .= $_;
# $output =~ s|(<pb.*?n=")(.*?)(".*?id=").*?(".*?>)|$1$2$3Page_$2$4|gs;
			#change the id of pb tags to page number
# return $output;
}


# 
# this function takes info from a <div> tag and returns 
# the current section (e.g. 1.3.2) 
#
# side-effect: modify global $id[] array of current context
# 
sub getName
{
  my $level = shift;
  my $rest = shift;
  my $divtitle = "";
  my $type = "";
  my $n = "";
  my ($sect, $i);

  $divtitle = $1 if $rest =~ m|title="(.*?)"|;		#find title
  $type = $1 if $rest =~ m|type="(.*?)"|;		#find type
  $n = $1 if $rest =~ m|n="(.*?)"|;			#find n
  
  if ($n)                                 #each level id is n if it exists 
    { $id[$level] = $n; }
  else
    { $id[$level] = $divtitle; }          #if no n, use divtitle as id

  $id[$level] =~ s|&.*?;||g;              #delete entities
# $id[$level] =~ s| |_|g;                 #change space to '_'
  $id[$level] =~ s|[^-_0-9a-zA-Z]||gs;    #delete chars not in list
  $id[$level] =~ s|(.{1,12}).*|$1|;       #limit to 12 characters
 
  #if ID isn't unique, nsgmls will complain.

  $divtitle = "$type $n: " . $divtitle if $type and $n;
  $sect = "";
  for ($i="1"; $i le $level; $i++)            #add id for each level
    { $sect .= ".".$id[$i]; }
  $sect =~ s|^\.||;

  return ($sect);
}

