#!/usr/bin/perl

# (C) Copyright 1996 by John J. Chew, III <jjchew@math.utoronto.ca>
# All rights reserved

# ddtag.pl - tag text according to the dictionaries in which words are to
#   be found, e.g. prior to submission to a crossword game mailing list.

unshift(@INC, "$ENV{HOME}/lib/perl");

require 'dawg.pl';
require 'getopts.pl';

# Release number; it is quoted in the legend footer appended to tagged text.
$gVersion = '2.13';

# Revision history:
#   2.13 - added -s flag

# Site configuration.  Edit the assignments below to point at the local DAWG
# files.  Each $opt_X is the built-in default for the command-line switch -X
# and is replaced when that switch is given.

# When true, the tagging logic never consults the -N list and never assigns
# the last tag of a style (index 4).
# NOTE(review): presumably this means the -C file holds the complete Chambers
# lexicon rather than a partial one -- confirm.
$config::hasFullTCD = 1;

my $lexicon_root = "$ENV{HOME}/scrabble";
$opt_C = "$lexicon_root/tcd/words.dawg";
$opt_N = "$lexicon_root/tcd/nonwords.dawg";
$opt_O = "$lexicon_root/osw/words.dawg";
$opt_T = "$lexicon_root/ospd/words.dawg";

# Name of the tag style used unless -s selects another one.
$opt_s = 'normal';

# Tag strings for each style, indexed by the tag number computed for a word.
# Index 0 is appended to words that need no tag; the legend footer labels
# index 1 "TWL only", index 2 "OSW only" and index 3 "neither TWL nor OSW".
my %tagstyles = (
  normal => [ '', '<$>', '<#>', '<*>', '<?>' ],
  short  => [ '', '',    '#',   '*',   '?*'  ],
);

# Usage
#
# Print the command-line synopsis and option summary on STDERR (via die) and
# terminate the script.  Takes no arguments and never returns.
#
# The empty prototype is kept so that existing bareword calls such as
# "... || Usage;" keep parsing as a call with no arguments.
sub Usage () {
  # The synopsis lists every switch accepted by the Getopts spec, including
  # -i, and shows that -s takes a style name as its argument.
  die
    "Usage: $0 [-1] [-i] [-q] [-s style] [-C chambers.dawg] [-N non-tcd.dawg] [-O osw.dawg] [-T twl.dawg] [file...]\n",
    "  -1                tag only first occurrence of each word\n",
    "  -i                alter file in place, for use as a Pine alternate editor\n",
    "  -q                don't add legend footer\n",
    "  -s style          specify tag style\n",
    "                    normal: '', '<\$>', '<#>', '<*>', '<?>'\n",
    "                    short: '', '', '#', '*', '?*'\n",
    "  -C chambers.dawg  DAWG file containing partial Chambers lexicon\n",
    "  -N non-tcd.dawg   DAWG file containing list of known non-Chambers words\n",
    "  -O osw.dawg       DAWG file containing OSW lexicon\n",
    "  -T twl.dawg       DAWG file containing TWL lexicon\n";
  }

# Parse the command line.  Switches followed by ':' in the spec take an
# argument; a parse failure prints the usage text and exits.
Usage() unless &Getopts('1C:O:N:T:iqs:-:');

# -i: have <> rewrite each named file in place.
# NOTE(review): $^I holds the backup-file suffix rather than a boolean, so the
# value 1 makes perl keep each original as "<file>1" -- confirm that is
# intended.
if ($opt_i) {
  $^I = 1;
  }

# Select the list of tag strings for the requested style; an unknown style
# name is treated as a usage error.
my $tagsp = $tagstyles{$opt_s};
Usage() unless $tagsp;

# Open the four lexicon files; any failure is fatal.
&dawg::open(*TWL,  $opt_T) or die "Can't open $opt_T\n";
&dawg::open(*OSW,  $opt_O) or die "Can't open $opt_O\n";
&dawg::open(*TCD,  $opt_C) or die "Can't open $opt_C\n";
&dawg::open(*NTCD, $opt_N) or die "Can't open $opt_N\n";

# lexicon_tag LCWORD
#
# Look LCWORD (already lower-cased) up in the open lexicon handles and return
# the index (0-4) of the tag string to use from the selected tag style:
#   0  in TWL and also in OSW or TCD
#   1  in TWL, not in OSW or TCD
#   2  in OSW or TCD, not in TWL
#   3  in none of TWL, OSW, TCD
#   4  not in OSW or TCD, and none of the three conditions tested for
#      $tcd_status_known below holds
sub lexicon_tag {
  my ($lc) = @_;

  my $in_twl        = &dawg::check(*TWL, $lc);
  my $in_osw_or_tcd = &dawg::check(*OSW, $lc) || &dawg::check(*TCD, $lc);
  return ($in_twl ? 0 : 2) if $in_osw_or_tcd;

  # NOTE(review): presumably the partial Chambers list is only complete for
  # words of up to 9 letters, with the -N list vouching for longer ones --
  # confirm against how the DAWG files are built.
  my $tcd_status_known = $config::hasFullTCD
    || length($lc) <= 9
    || &dawg::check(*NTCD, $lc);
  return 4 unless $tcd_status_known;

  return $in_twl ? 1 : 3;
  }

# Lower-cased words that have already been looked up; only maintained when -1
# is in effect.  Keyed on the lower-cased form because that is what the
# lexicon lookups use, so "Cat" counts as a repeat of "cat".
my %seen;

# Copy the input to the output token by token, preserving whitespace exactly
# and appending a tag to each word that is at least two letters long and
# purely alphabetic once surrounding punctuation is set aside.
while (my $line = <>) {
  # Remember whether the line is newline-terminated before it is consumed,
  # so the legend can start on a line of its own.
  my $newline_terminated = ($line =~ /\n\z/);

  while (length $line) {
    print $1 if $line =~ s/^(\s+)//;
    next unless $line =~ s/^(\S+)//;

    my $word = $1;
    my ($pre, $post) = ('', '');
    $pre  = $1 if $word =~ s/^([(\[{'"`]+)//;      # leading brackets/quotes
    $post = $1 if $word =~ s/([\]}).,:;"'!?]+)$//; # trailing punctuation

    my $tag = 0;
    if (length($word) >= 2 && $word !~ /[^a-zA-Z]/) {
      my $lc = lc $word;
      # Under -1 a word is tagged only the first time it is met.
      my $repeat = $opt_1 && $seen{$lc}++;
      $tag = lexicon_tag($lc) unless $repeat;
      }
    print "$pre$word$tagsp->[$tag]$post";
    }

  # At the end of each input file, append the legend unless -q was given.
  if (eof(ARGV) && !$opt_q) {
    print "\n" unless $newline_terminated;
    print "--\n";
    print "This message has been automatically tagged by ddtag $gVersion, a Perl script\n";
    print "that tags words according to the lexica in which they are found:\n";
    print "  $tagsp->[1] TWL only  $tagsp->[2] OSW only  $tagsp->[3] neither TWL nor OSW\n";
    print "ddtag is available at http://www.math.utoronto.ca/~jjchew/software/\n";
    }
  }

# Release every lexicon handle that was opened at startup.  NTCD is opened
# alongside the other three, so it is closed here as well.
&dawg::close(*TCD);
&dawg::close(*NTCD);
&dawg::close(*OSW);
&dawg::close(*TWL);

0;
