#!/usr/bin/perl -w
#
# check_dups
#
# Make sure there are no duplicate stems in the wordlists.  Each
# command line parameter gives the set of files to check for
# duplicates: for example 'american' means check for cross-duplicates
# in american.*, and 'american british' means check for
# cross-duplicates in american.* and british.*.  (Both of those are
# useful checks to make; american shouldn't have dups, and if a word
# appears in both american and british then it should be moved to
# english.)
#

use strict;

# At least one group of basenames is required; without any there is
# nothing to check, so print the usage text and stop.  The text ends in
# a newline, so die does not append a file/line suffix to it.
unless (@ARGV) {
    my $usage = <<"END";
usage: $0 basenames...
where 'basenames' is a list of filenames without extension, such as
'american altamer' (given as a single argument).
END
    die $usage;
}

# disp(WORD, FLAGS)
#
# Format a wordlist entry for display in diagnostics.  Returns
# "WORD/FLAGS" when FLAGS is defined and non-empty, and plain WORD
# otherwise (FLAGS undefined or the empty string).
sub disp( $$ ) {
    my ($word, $flags) = @_;

    # No flags recorded for this entry: show the bare word.
    return $word if not defined $flags;
    return $word if not length $flags;

    return "$word/$flags";
}

# Number of duplicate words reported so far.  The script dies at the end
# of the first argument that produced any, so later arguments are only
# processed while this is still zero.
my $errs = 0;

# Each command-line argument is a whitespace-separated group of basenames.
# All numbered wordlists BASENAME.0 .. BASENAME.9 of every basename in the
# group are checked against each other for repeated words.
foreach my $arg (@ARGV) {
    # split ' ' (rather than /\s+/) skips leading whitespace, so an
    # argument such as ' american' does not yield an empty basename.
    my @basenames = split ' ', $arg;

    # Collect the wordlist files for this group.
    my @files;
    foreach my $base (@basenames) {
        my $missing;          # first BASENAME.N found to be absent
        my $found_one = 0;

        foreach my $n (0 .. 9) {
            my $file = "$base.$n";
            if (-f $file) {
                # Files must be numbered contiguously: anything after
                # the first gap is reported and left out of the check.
                if (defined $missing) {
                    warn "ignoring $file since there was no $missing\n";
                }
                else {
                    push @files, $file;
                    $found_one = 1;
                }
            }
            elsif (not defined $missing) {
                $missing = $file;
            }
        }

        # Name the basename itself here; $_ is not set at this point
        # because the loop above has already finished.
        die "no wordlists beginning $base" if not $found_one;
    }

    # Maps each word to [ file, line number, flags ] of its first sighting.
    my %seen;
    foreach my $f (@files) {
        # Three-argument open with a lexical handle: the filename is
        # never interpreted as a mode, and the handle is scoped here.
        open my $fh, '<', $f or die "cannot open $f: $!";

        while (my $line = <$fh>) {
            chomp $line;

            # A line is a word (word characters and apostrophes),
            # optionally followed by '/' and upper-case flag letters.
            $line =~ m!^((?:\w|\')+)(?:/([A-Z]+))?$!
                or die "$f:$.: bad line $line";
            my ($word, $flags) = ($1, $2);

            my $first = $seen{$word};
            if ($first) {
                # Report both sightings, each with file and line number.
                my ($file, $lineno, $oldflags) = @$first;
                my $dword = disp($word, $flags);
                my $dprev = disp($word, $oldflags);
                warn "$f:$.: $dword seen here\n";
                warn "$file:$lineno: but seen here as $dprev\n";
                ++$errs;
            }
            else {
                $seen{$word} = [ $f, $., $flags ];
            }
        }

        close $fh or warn "cannot close $f: $!";
    }

    # Stop after the first group with duplicates so that its diagnostics
    # are not mixed with those of later groups.
    if ($errs) {
        die "found errors with '$arg', not continuing\n";
    }
}
