#! /bin/perl
#
#           De-moron-ise Text from Microsoft Applications
# 
#                   by John Walker -- January 1998
#                      http://www.fourmilab.ch/
#
#               This program is in the public domain.
#
# Version History
#
# 09/05/2001  FatPhil (Phil Carmody)
#  Modified to cope with MS's symbol fonts, and character entities
#  7-bit clean output in skeleton form, but not yet done
#  New options: -4 -> use HTML 4; -7 -> make 7-bit clean
#


    $lineWrap = 72;                   # Wrap lines at this column
    $lineBreak1 = '[<]';              # Line break first pass candidates
    $lineBreak2 = '[>]';              # Line break second pass candidates

# How advanced are we?
$dtd = 3;

# What do we process?
$numerics=1;   # numeric codes become meaningful entities
$symbolfont=1; # anything in symbol font needs to change!
$reserved=1;   # reserved 8-bit control characters become entities
$eightbit=0;   # other 8-bit characters become entities



# Symbol font nonsense
@Symbol=();



    #   Process command line options

    for ($i = 0; $i <= $#ARGV; $i++) {
        if ($ARGV[$i] =~ m/^-/) {
            $o = $ARGV[$i];
            splice(@ARGV, $i, 1);
            $i--;
            if (length($o) == 1) {
                last;
            }
            $opt = substr($o, 1, 1);
            $arg = substr($o, 2);

            #   -u                  -- Print how-to-call information

            if ($opt eq 'u' || $opt eq '?') {
                print("Usage: demoroniser [ options ] infile outfile\n");
                print("       Options:\n");
                print("             -u              Print this message.\n");
                print("             -wcols          Wrap lines at cols columns, 0 = no wrap.\n");
                exit(0);

            #   -wcols              -- Wrap lines at cols columns, 0 = no wrap

            } elsif ($opt eq 'w') {
                if ($arg =~ m/^\d+$/ && $arg >= 0) {
                    $lineWrap = $arg;
                    if ($lineWrap == 0) {
                        $lineWrap = 1 << 31;
                    }
                } else {
                    die("Invalid wrap length '$arg' in -w option.\n");
                }
            } elsif ($opt eq '4') {
		$dtd = 4;
	    } elsif ($opt eq '7') {
		$eightbit = 1;
	    }
        }
    }

    if($symbolfont) { &SymbolAny(); $dtd>3 ? &Symbol4() : &Symbol3(); }

    #   Open input and output files

    $if = STDIN;
    $of = STDOUT;
    $ifname = "(stdin)";
    if ($#ARGV >= 0) {
        $if = IF;
        open($if, "<$ARGV[0]") || die("Cannot open input file $ARGV[0]: $!\n");
        $ifname = $ARGV[0];
    }
    if ($#ARGV >= 1) {
        $of = OF;
        open($of, ">$ARGV[1]") || die("Cannot open output file $ARGV[1]: $!\n");
    }

    $iline = 0;
    $oline = 0;

    while ($l = <$if>) {
        $iline++;

        $l1 = &demoronise($l);
        &printWrap($l1);
    }

    close($if);
    close($of);

#   demoronise  --  Translate moronic Microsoft bit-drool into
#                   vaguely readable and compatible HTML.

sub demoronise {
    local($s) = @_;
    local($i, $c);

    # lets cache our findings about the string?
    study($s);

    #   Eliminate idiot MS-DOS carriage returns from line terminator

    $s =~ s/\s+$//;
    $s .= "\n";


    # Turn MS symbol font into real characters

    if($symbolfont && ($s =~ m|(<FONT\s+FACE="Symbol">\s*)([^<]*)(</FONT>)|i) )
    {
	local @parts = split(/<\/FONT>/i, $s);
	local $build='';
	foreach(@parts)
	{
	    local $p = $_;
	    if($p =~ m/(.*)(<FONT\s+FACE="Symbol">\s*)(.*)$/i)
	    {
		$prefix=$1;
                $junk=$2;
		$b=$3;
		local $n = '';
		local $i;
		for($i=0; $i<length($b); ++$i)
		{
		    local $c = ord(substr($b, $i, 1));
		    if(defined($Symbol[$c]))
		    {
			$n .= $Symbol[$c];
		    }
		    else
		    {
			printf(STDERR  "$ifname: warning--untranslated symbol 0x%02X in input line %d, output line(s) %d(...).\n",
			       unpack('C', $c), $iline, $oline + 1);
			$n .= $junk . $c . '</FONT>';
		    }
		}
		# now every character is translated, add to the building string
		$build .= $prefix . $n;
	    }
	    else
	    {
		# This was not a symbol section, output unchanged
		$build .= $p;
	    }
	}
	$s = $build;
    }

    if($reserved && ($s =~ m/[\x00-\x08\x10-\x1F\x80-\x9F]/) )
    {
	#   Map strategically incompatible non-ISO characters in the
	#   range 0x82 -- 0x9F into plausible substitutes where
	#   possible.
	if($dtd>=4)                            { $s =~ s/\x80/&euro;/g; }
	# unknown 0x81
	$s =~ s/\x82/,/g;
	$s =~ s-\x83-<em>f</em>-g; # not in Latin-2
	if($dtd<4) { $s =~ s/\x84/,,/g; } else { $s =~ s/\x84/&bdquo;/g; } 
	if($dtd<4) { $s =~ s/\x85/.../g; }else { $s =~ s/\x85/&hellip;/g; }
	if($dtd>=4)                            { $s =~ s/\x86/&dagger;/g; }
	if($dtd>=4)                            { $s =~ s/\x87/&Dagger;/g; }
	$s =~ s/\x88/^/g;          # not in Latin-2
	if($dtd<4) { $s =~ s-\x89- °/°°-g;}else{ $s =~ s/\x89/&permil;/g; }
	if($dtd>=4)                            { $s =~ s/\x8A/&Scaron;/g; }
	if($dtd<4) { $s =~ s/\x8B/</g; }  else { $s =~ s/\x8B/&lsaquo;/g; }
	if($dtd<4) { $s =~ s/\x8C/Oe/g; } else { $s =~ s/\x8C/&OElig;/g; }
	# unknown 0x8D
	# unknown 0x8E
	# unknown 0x8F
	# unknown 0x90
	$s =~ s/\x91/`/g; # `
	$s =~ s/\x92/'/g; # '
	if($dtd<4) { $s =~ s/\x93/\"/g; } else { $s =~ s/\x93/&ldquo;/g; }
	if($dtd<4) { $s =~ s/\x94/\"/g; } else { $s =~ s/\x94/&rdquo;/g; }
	if($dtd<4) { $s =~ s/\x95/*/g; }  else { $s =~ s/\x95/&bull;/g; }
	if($dtd<4) { $s =~ s/\x96/-/g; }  else { $s =~ s/\x96/&ndash;/g; }
	if($dtd<4) { $s =~ s/\x97/--/g; } else { $s =~ s/\x97/&mdash;/g; }
	if($dtd<4) { $s =~ s-\x98-<sup>~</sup>-g; } else { $s =~ s/\x98/&tilde;/g; }
	if($dtd<4) { $s =~ s-\x99-<sup>TM</sup>-g; } else { $s =~ s/\x99/&reg;/g; }
	if($dtd>=4)                            { $s =~ s/\x9A/&scaron;/g; }
	if($dtd<4) { $s =~ s/\x9B/>/g; }  else { $s =~ s/\x8B/&rsaquo;/g; }
	if($dtd<4) { $s =~ s/\x9C/oe/g; } else { $s =~ s/\x8C/&oelig;/g; }

    #   Now check for any remaining untranslated characters.

	if ($s =~ m/[\x00-\x08\x10-\x1F\x80-\x9F]/) 
	{
	    $r = $s;
	    $r =~ tr/[\x00-\x08\x10-\x1F\x80-\x9F]//cds;
	    for($i=0; $i<length($r); $i++)
	    {
		$c= substr($r, $i, 1);
		printf(STDERR  "$ifname: warning--untranslated character 0x%02X in input line %d, output line(s) %d(...).\n",
		       unpack('C', $c), $iline, $oline + 1);
	    }
	    # for ($i = 0; $i < length($s); $i++) {
            # $c = substr($s, $i, 1);
            # if ($c =~ m/[\x00-\x09\x10-\x1F\x80-\x9F]/) {
            #     printf(STDERR  "$ifname: warning--untranslated character 0x%02X in input line %d, output line(s) %d(...).\n",
            #         unpack('C', $c), $iline, $oline + 1);
            # }
        }
    }


    if($eightbit && ($s =~ m/[\xA0-\xff]/))
    {
	# Map legal but potentially non-portable 8-bit latin-1
	# onto entities
	if($dtd>=3)
	{
	    $s =~ s/\xB1/&plusmn;/g;
	    $s =~ s/\xF7/&divide;/g;
	}
    }

    #   Supply missing semicolon at end of numeric entity if
    #   Billy's bozos left it out.

    $s =~ s/(&\#[0-2]\d\d)\s/$1; /g;

    #   Fix dimbulb obscure numeric rendering of &lt; &gt; &amp;
    if($numerics)
    {
	$s =~ s/&#038;/&amp;/g;
	$s =~ s/&#060;/&lt;/g;
	$s =~ s/&#062;/&gt;/g;
    }

    #   Fix unquoted non-alphanumeric characters in table tags

    $s =~ s/(<TABLE\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi;
    $s =~ s/(<TD\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi;
    $s =~ s/(<TH\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi;

    #   Correct PowerPoint mis-nesting of tags

    $s =~ s-(<Font .*>\s*<STRONG>.*)(</FONT>\s*</STRONG>)-$1</STRONG></Font>-gi;

    #   Translate bonehead PowerPoint misuse of <UL> to achieve
    #   paragraph breaks.

    $s =~ s-<P>\s*<UL>-<p>-gi;
    $s =~ s-</UL><UL>-<p>-gi;
    $s =~ s-</UL>\s*</P>--gi;

    #   Repair PowerPoint depredations in "text-only slides"

    $s =~ s-<P></P>--gi;
    $s =~ s- <TD HEIGHT=100- <tr><TD HEIGHT=100-ig;
    $s =~ s-<LI><H2>-<H2>-ig;

    $s;
}

#   printWrap  --  Print one or more lines with wrap at
#                  the specified column.

sub printWrap {
    local($s) = @_;
    local($l, $sep, $rem, $ter, $lwrap, $indent);

    #   Pick the input apart line by line and reformat each line,
    #   if necessary, so as not to exceed the maximum line length.

    $s =~ m/(\s*)(\S)/;
    $indent = $1;
    if ($2 eq '<') {
        $indent .= ' ';
    }
    while (length($s) > 0) {
        if (($s =~ s/(.*\n)//) != 1) {
            $aax = $_[0];
            print("printWrap arg = |$aax|\n");
            print("printWrap s = |$s|\n");
            $aal = length($s);
            print("printWrap length(s) = $aal\n");
            die("$ifname: Error splitting lines.");
        }
        $l = $1;

        $sep = '';
        $lwrap = '';
        while (length($l) > $lineWrap) {
            if (($l =~ s/(^.{1,$lineWrap})(\s)//o) || 
                ($l =~ s/(^.{1,$lineWrap})($lineBreak1)//o) ||
                ($l =~ s/(^.{1,$lineWrap})($lineBreak2)//o)
               ) {
                $rem = $1;
                $ter = $2;
                if ($ter =~ m/\s+/) {
                    $ter='';
                }
                $lwrap .= "$sep$rem$ter\n";
                $oline++;
                $l =~ s/^\s*//;
                $sep = $indent;
            } else {
                last;
            }
        }
        print($of "$lwrap$sep$l");
        $oline++;
    }
}


# Define those silly symbols
sub Symbol4
{
    $Symbol[0x20] = '&forall;';
    $Symbol[0x24] = '&exist;';
    $Symbol[0x27] = '&ni;';
    $Symbol[0x2d] = '&minus;';
    # $Symbol[0x40] = '(nearlycongruent)';
    $Symbol[0x43] = '&Chi;';
    $Symbol[0x44] = '&Delta;';
    $Symbol[0x46] = '&Phi;';
    $Symbol[0x47] = '&Gamma;';
    # $Symbol[0x4a] = '(curlyJ)';
    $Symbol[0x4c] = '&Lambda;';
    $Symbol[0x50] = '&Pi;';
    $Symbol[0x51] = '&Theta;';
    $Symbol[0x53] = '&Sigma;';
    # $Symbol[0x56] = '(smallsquiggle)';
    $Symbol[0x57] = '&Omega;';
    $Symbol[0x58] = '&Xi;';
    $Symbol[0x59] = '&Psi;';
    $Symbol[0x5c] = '&there4;';
    $Symbol[0x5e] = '&perp;';
    $Symbol[0x61] = '&alpha;';
    $Symbol[0x62] = '&beta;';
    $Symbol[0x63] = '&chi;';
    $Symbol[0x64] = '&delta;';
    $Symbol[0x65] = '&epsilon;';
    $Symbol[0x66] = '(smallphi)';
    $Symbol[0x67] = '&gamma;';
    $Symbol[0x68] = '&eta;';
    $Symbol[0x69] = '&iota;';
    $Symbol[0x6a] = '&phi;';
    $Symbol[0x6b] = '&kappa;';
    $Symbol[0x6c] = '&lambda;';
    $Symbol[0x6d] = "\xb5";  # '&mu;';
    $Symbol[0x6e] = '&nu;';
    $Symbol[0x6f] = '&omicron;';
    $Symbol[0x70] = '&pi;';
    $Symbol[0x71] = '&theta;';
    $Symbol[0x72] = '&rho;';
    $Symbol[0x73] = '&sigma;';
    $Symbol[0x74] = '&tau;';
    $Symbol[0x75] = '&upsilon;';
    # $Symbol[0x76] = '(omegabar)';
    $Symbol[0x77] = '&omega;';
    $Symbol[0x78] = '&xi;';
    $Symbol[0x79] = '&psi;';
    $Symbol[0x7a] = '&zeta;';
    # $Symbol[0xa1] = '(curlyY)';
    $Symbol[0xa2] = '&prime;';
    $Symbol[0xa3] = '&le;';          
    $Symbol[0xa5] = '&infin;';
    $Symbol[0xa7] = '&clubs;';
    $Symbol[0xa8] = '&diams;';
    $Symbol[0xa9] = '&hearts;';
    $Symbol[0xaa] = '&spades;';
    $Symbol[0xab] = '&harr;';
    $Symbol[0xac] = '&larr;';
    $Symbol[0xad] = '&uarr;';
    $Symbol[0xae] = '&rarr;';
    $Symbol[0xaf] = '&darr;';         
    $Symbol[0xb2] = '&Prime;';
    $Symbol[0xb3] = '&ge;';
    $Symbol[0xb4] = '&times;';
    # $Symbol[0xb5] = '(proportional)';
    $Symbol[0xb6] = '&part;';
    $Symbol[0xb7] = '<B>&bull;</B>';
    $Symbol[0xb9] = '&ne;';
    $Symbol[0xba] = '&equiv;';
    # $Symbol[0xbb] = '(roughlyequal)';
    $Symbol[0xbc] = '&hellip;'; 
    # $Symbol[0xbd] = '(vbar)';
    $Symbol[0xbe] = '&mdash;';
    $Symbol[0xbf] = '&crarr;';
    $Symbol[0xc0] = '&alephsym;';
    $Symbol[0xc1] = '&image;'; 
    $Symbol[0xc2] = '&real;';
    $Symbol[0xc3] = '&weierp;';
    $Symbol[0xc4] = '&otimes;';
    $Symbol[0xc5] = '&oplus;';
    $Symbol[0xc6] = '&empty;';
    $Symbol[0xc7] = '&cap;';
    $Symbol[0xc8] = '&cup;';
    $Symbol[0xc9] = '&sup;';
    $Symbol[0xca] = '&supe;';
    $Symbol[0xcb] = '&nsub;';
    $Symbol[0xcc] = '&sub;';
    $Symbol[0xcd] = '&sube;';
    $Symbol[0xce] = '&isin;';
    $Symbol[0xcf] = '&notin;';
    $Symbol[0xd0] = '&angle;';
    $Symbol[0xd1] = '&nabla;';
    $Symbol[0xd2] = '&reg;';
    $Symbol[0xd3] = '&copy;';
    $Symbol[0xd4] = '&trade;';
    $Symbol[0xd5] = '&prod;';
    $Symbol[0xd6] = '&radic;';
    $Symbol[0xd7] = '(implies)';
    $Symbol[0xd8] = '&not;';
    $Symbol[0xd9] = '&and;';
    $Symbol[0xda] = '&or;';
    $Symbol[0xdb] = '&hArr;';
    $Symbol[0xdc] = '&lArr;';
    $Symbol[0xdd] = '&uArr;';
    $Symbol[0xde] = '&rArr;';
    $Symbol[0xdf] = '&dArr;';
    $Symbol[0xe0] = '&loz;';
    $Symbol[0xe1] = '&lang;';
    $Symbol[0xe2] = "\xae"; # '&reg;';
    $Symbol[0xe3] = "\xa9"; # '&copy;';
    $Symbol[0xe4] = '&trade;';
    $Symbol[0xe5] = '&sum;';
    $Symbol[0xe9] = '&lceil;';
    $Symbol[0xea] = '|';
    $Symbol[0xeb] = '&lfloor;';
    $Symbol[0xf1] = '&rang;';
    # $Symbol[0xf2] = '(integral)';
}
sub SymbolAny
{
    $Symbol[0x55] = 'Y';
    $Symbol[0xa4] = '/';
    $Symbol[0xa6] = '<EM>f</EM>';
    $Symbol[0xb1] = "\xB1"; # '&plusmn;';
    $Symbol[0xb8] = "\xF7"; # '&divide;';
}
sub Symbol3
{
    $Symbol[0x2d] = '-';
    $Symbol[0xa2] = "'";
    $Symbol[0xa3] = '&lt;=';
    $Symbol[0xab] = '&lt;-&gt;';
    $Symbol[0xac] = '&lt;-';
    $Symbol[0xae] = '-&gt;';
    $Symbol[0xb2] = "'";
    $Symbol[0xb3] = '&gt;=';
    $Symbol[0xb4] = '*';
    $Symbol[0xb7] = '*';
    $Symbol[0xb8] = '/';
    $Symbol[0xb9] = '=!=';
    $Symbol[0xba] = '==';
    $Symbol[0xbb] = '~=';
    $Symbol[0xbc] = '...';
    $Symbol[0xbd] = '|';
    $Symbol[0xbe] = '--';
    $Symbol[0xc1] = '<IT>I</IT>';
    $Symbol[0xc2] = '<IT>R</IT>';
    $Symbol[0xc3] = '<IT>P</IT>';
    $Symbol[0xd2] = "\xae"; # '(R)';
    $Symbol[0xd3] = "\xa9"; # '(c)';
    $Symbol[0xd4] = '<SUP>TM</SUP>';
    $Symbol[0xd9] = '&amp;';
    $Symbol[0xda] = '|';
    $Symbol[0xdb] = '&lt;=&gt;';
    $Symbol[0xdc] = '&lt;=';
    $Symbol[0xde] = '=&gt;';
    $Symbol[0xdf] = '&dArr;';
    $Symbol[0xe1] = '&lt;';
    $Symbol[0xe2] = '(R)';
    $Symbol[0xe3] = '(c)';
    $Symbol[0xe4] = '<SUP>TM</SUP>';
    $Symbol[0xf1] = '&gt;';
}
