#!/usr/bin/perl -w

# mailclean (version 1.1)
#
# Copyright (C) 2002 by John J. Chew, III <jjchew@math.utoronto.ca>
# http://www.poslfit.com
#
# A script that displays only the essential information in an RFC 822
# (e-mail message).  No weird headers, no alternate HTML content.

use strict;

use vars qw($gBoundary $gqEcho $opt_a $opt_f);

require 'getopts.pl';

sub Usage ();
sub Init ();
sub Main ();
sub process_headers (\%);
sub process_subheaders (\%);

sub Usage () { die
  "Usage: $0 [-a] [-f]\n"
 ."  -a  display all sections, do not omit HTML\n"
 ."  -f  insert form feed between messages\n";
  }

Main;

sub Init () {
  $gBoundary = undef;
  &Getopts('af-:') || &Usage;
  }

sub Main () {
  my (%headers, %subheaders);

  Init;

  my %entities = (
    'amp' => '&',
    'nbsp' => ' ',
    'reg' => '(R)',
    );
  my $entities = join('|', keys %entities);


  # state - 0=init 1=main header 2=body 3=section header
  my $last = undef;
  my $state = 0;
  my $hstate = 0;
  my $ystate = 0;
  my $bstate = 0; # 0 at start of body, 1 later 
  while (<>) {
    if (/^From[ \t]+/../^$/) { # if now in header
      if ($state !=1) { # start of header
	print "\f" if $opt_f && $state == 2;
#	my $from_ = $_;
	%headers = ();
	$last = undef;
	$state = 1;
	$hstate = 0;
	$ystate = 0;
	$bstate = 0;
	}
      elsif (s/^[ \t]+/ /) {
	die unless defined $last;
	chop $headers{$last};
	$headers{$last} .= $_;
	}
      elsif (/^(\S+)\s*:/) {
	$last = "\L$1";
	$headers{$last} = '' unless defined $headers{$last};
	$headers{$last} .= $_;
	}
      elsif (/^$/) {
	process_headers %headers;
	$state = 2;
        }
      next;
      }
    if ((defined $gBoundary) && (/^\Q$gBoundary\E$/../^$/)) 
      { # if now in section header
      if ($state != 3) {
        %subheaders = ();
	$last = undef;
	$state = 3;
        }
      elsif (s/^[ \t]+/ /) {
	die unless defined $last;
	chop $subheaders{$last};
	$subheaders{$last} .= $_;
	}
      elsif (/^(\S+)\s*:/) {
	$last = "\L$1";
	$subheaders{$last} = '' unless defined $subheaders{$last};
	$subheaders{$last} .= $_;
	}
      elsif (/^$/) {
	process_subheaders %subheaders;
	$state = 2;
        }
      next;
      }
    if ($state == 0) {
      print "***Oops: unprocessed text at start of file\n";
      }
    elsif ($state != 2) {
      print "***Oops: fell out of a header\n";
      }
    if ($gqEcho) {
      if ($hstate == 0) {
        if (s/^<html[^>]*>//i) { 
	  $hstate = 1;
	  }
	else {
	  $hstate = -1;
	  }
	}
      if ($hstate == 1) {
        s/\n/ /;
	s/<br[^>]*>/\n/ig;
	s/<hr[^>]*>/\n-----\n/ig;
	s!<img [^>]*alt="([^">]*)"[^>]*>!$1!ig;
	s/<p[^>]*>/\n\n/ig;
	s!</?(?:a|b|body|font|form|img|input|p|sup|table|td|tr)\b[^>]*>!!ig;
        $hstate = 0 if s!</html>!!ig;
	s!&($entities);!$entities{$1}!ig;
        }
      if ($ystate == 0 && /^Yahoo\! Groups Links\s*$/) {
	$ystate = 1;
	next;
	}
      elsif ($ystate == 1) {
	if (/^<\*> [TY]/) {
	  $ystate = 2;
	  next;
	  }
	elsif (/\S/) { $ystate = 0; }
	else { next; }
        }
      elsif ($ystate == 2) {
	if (/^    (?:http:|\S+\@\S+)/) {
	  $ystate = 1;
	  next;
	  }
	else { $ystate = 0; }
	}
      # top-posting
      if ($state == 2 && $bstate == 0) {
	if (/^--- In \S+\@\S+, "/) {
	  $gqEcho = 0;
	  }
	else { $bstate = 1; }
	}
      print;
      }
    }
  process_headers %headers if $state == 1;
  exit 0;
  }


sub process_headers (\%) {
  my $hp = shift;
  $gqEcho = 1;
  for my $i ('from', 'to', 'subject', 'date') {
    print $hp->{$i} if defined $hp->{$i};
    }
  print "\n";
  $gBoundary = undef;
  my $content_type = $hp->{'content-type'};
  if (defined $content_type) {
    if ($content_type =~ /boundary=(\S+)/i) {
      $gBoundary = $1;
      $gBoundary =~ s/^"(.*)"/$1/;
      $gBoundary = '--' . $gBoundary;
      }
    }
  }

sub process_subheaders (\%) {
  my $hp = shift;
  $gqEcho = 1;
  my $content_type = $hp->{'content-type'};
  if (defined $content_type) {
    print $content_type;
    if ($opt_a || $content_type =~ m!text/html!i) {
      print "[omitted]\n";
      $gqEcho = 0;
      }
    }
  print "\n";
  }
