#!/usr/bin/perl

$poem_header = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"\n   \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n\n<HTML lang=en>\n\t<HEAD>\n\t\t<LINK rel=\"STYLESHEET\" href=\"poem.css\" type=\"text/css\">\n";

$prefix = "pgbev-";
$postfix = ".html";

while( <> )
{
        last
            if /A\.T\.Q\.C\./
}


$index = 0;

while( <> )
  {
    while (/^\n/) 
      { $_ = <>; }

    last if /End of the Project Gutenberg Etext/;

    $index ++;

    chomp;
    $author_info = $_;

    while( <> )
      {
	last if /^\n/;
	chomp;
	$author_info .= " $_";
      }

    $_ = <>;

    /([0-9]+)\. (.*)/ || die;


    if ( $index != $1 )
      {
	die "Error: calculated index $index differs from index found: $1 (author info: $author_info)\n";
      }

    $title = $2;
    $name = "<SPAN id=index>$index.</SPAN> $title";
    
  true_multiline_title:
    $_ = <>;

    if (!/^\n/)
      {
	if (/^[a-zA-Z\s]+$/ && /[a-z]/ && /[A-Z]/ && /(\w+)(\s+\w+){2,}/ 
	   && ($title =~ /(\w+)(\s+\w+){3,}/))
	    {
	      chomp;
	      $title .= " $_";
	      $name .= " $_";
	      # print "$index. <$title>\n";
	      goto true_multiline_title;
	    }
	$name .= "<BR>\n\t\t\t<SPAN id=extra-title>$_";

	while( <> ) 
	  { 
	    last if /^\n/; 
	    chomp;

#	    $title .= " $_"; 
	    $name .= "<BR>\n$_";
	  }
	$name .= "\t\t\t</SPAN>";
      }

    $filename = "$prefix$index$postfix";
    open poem_file, ">poems/$filename";

    
    print poem_file $poem_header;
    print poem_file "\t\t<TITLE>$title</TITLE>\n";
    print poem_file "\t</HEAD>\n\t<BODY>\n";
    print poem_file "\t\t<H1>$name</H1>\n";
    print poem_file "\t\t<H2>$author_info</H2>\n\n";

    print poem_file "\t\t<DIV class=text><BR>\n";
    # The emacs W3 browser seems to like a BR here

    $text = "";
    $gloss = "";
    %glosses = ();
    $_ = <>;

    do 
    {

	do 
	{
	    if (/^[^\[\]]*\]/)
	    { # gloss
	      $gloss = $_;

	      chomp;
#	      print "$index has gloss in <$_>\n";
	      while( <> ) 
		{ 
		  chomp;
		  goto check_poem_end if !length; 
		  $gloss .= "$_\n";
		}
	    }

	
#	    print poem_file "&nbsp;" if /^\s/;
	    $text .= $_;
	    chomp;
#	    print poem_file "$_<BR>\n";
	    goto check_poem_end if !length;
	
	} while( $_ = <> );
     

check_poem_end:
     $_ = <>;
     
#	print "$index ok\n" if ( /^\n/ );

    } until /^\n/;
        
    $_ = $text;

    if (length $gloss)
      {
	#print poem_file "\n\t<DIV class=gloss>\n$gloss\n\t</DIV>\n";
	$_ = $gloss;
	s/\n/  /g;
	s/\.\'/\'\. /g;

	@_ = split /(?<!i\.e)\.\s\s+/;

	foreach $_ (@_)
	  {
	    s/\s+/ /g;
	    /([^\]]+)\]\s?(.*)/;
	    $_ = $1;
	    $value = $2;

	    if (/,/) # This case occurs just once. Poem 4: hue, huo=she
	      {
		foreach $_ (split /\s*,\s*/)
		  {
		    $glosses{$_} = $value;  
		    $value .= " ";
		  }
	      }

	    else 
	      {
		
	      $glosses{$_} = $value;
	    }
	  }



        $_ = $text;
	foreach my $key (sort { length $b <=> length $a } keys %glosses)
	  {

	    $value = $glosses{$key};

	    print STDERR "Error at poem $index. Empty gloss value for $key in $gloss\n" 
	      if !length $value;

	    $key =~ s/\s/(?:[\\s\\n]+)(?:[a-zA-Z]+?\\s)?/g;
            # The second part is mad overkill. It allows one word between
            # words of the regexp. This only occurs once in the whole
            # text (poem 16, tuke [gude] keep). 

#	    print "gloss:$key=$value\n";

	    $key = qr/$key/im;

	    $clean_value = $value;
	    $clean_value =~ s/[^A-Za-z_]/-/g ;
	    $clean_value =~ s/^(-)+//;

	    $value =~ y/[\'\"]//d;

	    s/(^(?:(?:[^\<\n]*\<A[^\<\n]*\<\/A\>)*[^\<\n]*?[^a-zA-Z])?)($key)(?![a-zA-Z])/$1<A href=\"\#$clean_value\" onmouseover='javascript:status=\"$value\"' title=\"$value\">$2<\/A>/gim or
	      print STDERR "<$key> is unattached gloss of value <$value> for poem $index.\n";
	    # This is ok for poem 712 where the gloss refers to the title
	    
	  }
      }
    
    $_ = "\n$_";
    chomp;
    
    s/\n/<BR>\n/g;
    s/^<BR>\n/<P>\n/gm;       
    s/^((\s\s)+)(?!\s*$)/"&nbsp;" x length $1/mge;
	   
    $text = $_;
    print poem_file $text;
	   
    print poem_file "\n\t\t</DIV>\n";
    
    if (length $gloss)
      {
	print poem_file "\t\t<DIV class=gloss><BR>\n\t\t\t<DL class=gloss>\n";
	
	foreach my $key (sort keys %glosses)
	  {
	    $value = $glosses{$key};
	    $value =~ s/[^A-Za-z_]/-/g;
	    $value =~ s/^(-)+//;
	    
	    print poem_file "\t\t\t\t<DT id=\"$value\" class=gloss>$key\n\t\t\t\t<DD class=gloss>$glosses{$key}\n";
	  }
	print poem_file "\t\t\t</DL>\n\t\t</DIV>\n";
      }

    print poem_file "\t<DIV class=address><A href=\"../index.html\">The Oxford Book of English Verse, HTML edition</A></DIV>\n";

    print poem_file "\t</BODY>\n</HTML>\n";

    close poem_file;
    
  }

    

