#!/usr/local/bin/perl

=head1 NAME

cvalue.pl

=head1 SYNOPSIS

cvalue.pl takes a set of NSP generated ngram files (bigram, trigram,
4gram, etc.) and determines the cvalue over all the ngrams.

=head1 DESCRIPTION

See perldoc README.pod

=head1 AUTHOR

Bridget McInnes, bthomson@d.umn.edu

Ted Pedersen, tpederse@d.umn.edu

=head1 BUGS

=head1 COPYRIGHT

Copyright (C) 2000-2003, Ted Pedersen and Satanjeev Banerjee

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

=cut

use strict;
use warnings;

use File::Spec;
use Getopt::Long;

# Run the program only when executed as a script, so that the helper
# subroutines below can be loaded (e.g. by a test) without side effects.
main() unless caller;

# Entry point: parses the command line, reads every SOURCE ngram file,
# computes the cvalue of every ngram and writes "ngram<>cvalue" lines to
# DESTINATION, highest cvalue first.
sub main {
    my ( $opt_help, $opt_version );

    # now get the options!
    GetOptions( 'version' => \$opt_version, 'help' => \$opt_help )
        or do { askHelp(); exit 1; };

    # if help has been requested, print out help!
    if ( defined $opt_help ) {
        showHelp();
        exit;
    }

    # if version has been requested, show version!
    if ( defined $opt_version ) {
        showVersion();
        exit;
    }

    # retrieve the destination file (explicitly from @ARGV: inside a sub a
    # bare shift would read @_ instead)
    my $destination = shift @ARGV;

    # check to see if a destination has been supplied at all...
    # (tested for definedness/emptiness so a file literally named "0" works)
    if ( !defined $destination || $destination eq '' ) {
        print STDERR "No output file (DESTINATION) supplied.\n";
        askHelp();
        exit;
    }

    # check to see if destination exists, and if so, if we should overwrite...
    if ( -e $destination ) {
        local $| = 1;    # make sure the prompt is shown before we wait on STDIN
        print "Output file $destination already exists! Overwrite (Y/N)? ";
        my $reply = <STDIN>;
        $reply = '' unless defined $reply;    # EOF on STDIN counts as "no"
        chomp $reply;
        exit 0 if ( uc($reply) ne "Y" );
    }

    # open the destination file
    open( my $dst, '>', $destination )
        or die "Could not open destination file\n";

    # get the source files and load every ngram found in them
    my @files = collectSourceFiles(@ARGV);
    my ( $ngram_freq, $ngram_leng ) = readNgramFiles( \@files, $dst );

    my $cvalue = computeCvalues( $ngram_freq, $ngram_leng );

    # print the ngram and cvalue to the destination, highest cvalue first;
    # ties are broken alphabetically so the output order is reproducible
    for my $ngram (
        sort { $cvalue->{$b} <=> $cvalue->{$a} or $a cmp $b }
        keys %{$cvalue}
        )
    {
        print {$dst} "$ngram<>$cvalue->{$ngram}\n";
    }

    # buffered write errors only surface when the handle is closed
    close $dst or die "Could not close destination file: $!\n";

    return;
}

# Expands the SOURCE arguments into a flat list of file paths.
#   @_      : file and/or directory names from the command line
#   returns : the list of paths; a directory is replaced by the paths of
#             its entries (one level deep, '.' and '..' excluded)
sub collectSourceFiles {
    my @sources = @_;
    my @files   = ();

    for my $source (@sources) {
        if ( -d $source ) {
            opendir( my $dh, $source )
                or die "Can not open the directory $source: $!\n";
            my @entries =
                sort grep { $_ ne '.' and $_ ne '..' } readdir $dh;
            closedir $dh;

            # catfile inserts the path separator when $source has no
            # trailing slash
            push @files, map { File::Spec->catfile( $source, $_ ) } @entries;
        }
        else {
            push @files, $source;
        }
    }

    return @files;
}

# Splits one ngram line of the form "tok1<>tok2<>...<>counts".
#   $line   : a chomped line from an ngram file
#   returns : ( $key, $length, $freq ) where $key is the tokens re-joined
#             with "<>", $length is the number of tokens and $freq is the
#             leading integer of the last "<>"-separated field; returns
#             the empty list when the line holds no tokens.
sub parseNgramLine {
    my ($line) = @_;

    my @fields = split /<>/, $line;
    my $counts = pop @fields;

    # a line without any token would give a zero-length ngram, and log(0)
    # is fatal later on, so such lines are rejected here
    return unless @fields;

    # only the part of each token before the first "/" is kept
    # (presumably this strips a tag suffix such as word/TAG -- confirm)
    my @tokens = ();
    for my $field (@fields) {
        my ($head) = split m{/}, $field;
        push @tokens, defined $head ? $head : '';
    }

    # the last field may carry several space separated counts; only the
    # first one is the frequency of the ngram itself
    my ($freq) = defined $counts ? $counts =~ /^\s*([0-9]+)/ : ();
    $freq = 0 unless defined $freq;

    return ( join( '<>', @tokens ), scalar @tokens, $freq );
}

# Reads all ngram files and accumulates ngram frequencies and lengths.
#   $files  : array ref of file paths
#   $dst    : output handle; lines made only of digits are copied to it
#   returns : ( \%ngram_freq, \%ngram_leng ), both keyed by "tok1<>tok2..."
sub readNgramFiles {
    my ( $files, $dst ) = @_;

    my %ngram_freq = ();
    my %ngram_leng = ();

    # for each of the ngram files
    for my $file ( @{$files} ) {
        open( my $fh, '<', $file ) or die "Could not open file : $file\n";

        # remove the total count from the NSP file (value intentionally unused)
        my $total_count = <$fh>;

        # store the ngrams, their frequency and their lengths
        while ( my $line = <$fh> ) {
            chomp $line;

            # a purely numeric line is passed straight through to the output
            if ( $line =~ m/^[0-9]+$/ ) {
                print {$dst} "$line\n";
                next;
            }

            my @parsed = parseNgramLine($line);
            next unless @parsed;
            my ( $key, $length, $freq ) = @parsed;

            # the same ngram may occur in several files: frequencies add up
            $ngram_leng{$key} = $length;
            $ngram_freq{$key} += $freq;
        }

        close $fh;
    }

    return ( \%ngram_freq, \%ngram_leng );
}

# Calculates the cvalue for every ngram, starting with the longest ngrams
# and moving to the shortest.
#   $ngram_freq : hash ref, ngram key => frequency
#   $ngram_leng : hash ref, ngram key => number of tokens
#   returns     : hash ref, ngram key => cvalue
sub computeCvalues {
    my ( $ngram_freq, $ngram_leng ) = @_;

    my %cvalue       = ();
    my %nested_terms = ();    # how many longer ngrams contain this ngram
    my %nested_freqs = ();    # summed frequency of those longer ngrams

    # set the minimum and maximum ngram lengths
    my ( $min_length, $max_length );
    for my $length ( values %{$ngram_leng} ) {
        $min_length = $length
            if !defined $min_length || $length < $min_length;
        $max_length = $length
            if !defined $max_length || $length > $max_length;
    }

    # longest first, so that every container of an ngram has registered
    # itself in %nested_* before the ngram's own cvalue is calculated
    for my $ngram (
        sort { $ngram_leng->{$b} <=> $ngram_leng->{$a} or $a cmp $b }
        keys %{$ngram_leng}
        )
    {
        my $length = $ngram_leng->{$ngram};
        my $freq   = $ngram_freq->{$ngram};
        my $base   = ( log($length) / log(2) ) * $freq;

        if ( $length != $max_length && exists $nested_terms{$ngram} ) {

            # the ngram is nested: subtract the average frequency of the
            # ngrams that contain it
            # NOTE(review): the C-value definition of Frantzi et al. appears
            # to scale the whole difference by log2(length), i.e.
            # log2|a| * (f(a) - avg); this script subtracts the unscaled
            # average. Behaviour kept as-is -- confirm which is intended.
            my $avg_nested = ( 1 / $nested_terms{$ngram} ) * $nested_freqs{$ngram};
            $cvalue{$ngram} = $base - $avg_nested;
        }
        else {

            # longest ngrams and ngrams that are not nested anywhere
            $cvalue{$ngram} = $base;
        }

        # register the nested ngrams, except for the shortest ngrams
        # (ngrams of the greatest length always register theirs)
        if ( $length == $max_length || $length != $min_length ) {
            storeNested( $ngram, $freq, \%nested_terms, \%nested_freqs );
        }
    }

    return \%cvalue;
}

# Finds and stores the nested ngrams from the main ngram.
#   $ngram        : ngram key, tokens separated by "<>"
#   $freq         : frequency of $ngram
#   $nested_terms : hash ref; incremented once per nested ngram found
#   $nested_freqs : hash ref; increased by $freq per nested ngram found
# Every contiguous run of two or more tokens is registered, which includes
# the full ngram itself.
sub storeNested {
    my ( $ngram, $freq, $nested_terms, $nested_freqs ) = @_;

    my @terms = split /<>/, $ngram;

    for my $start ( 0 .. $#terms - 1 ) {

        # grow the nested ngram one token at a time from position $start
        my @nest = ( $terms[$start] );

        for my $end ( $start + 1 .. $#terms ) {
            push @nest, $terms[$end];

            my $key = join '<>', @nest;
            $nested_terms->{$key}++;
            $nested_freqs->{$key} += $freq;
        }
    }

    return;
}

# function to output help messages for this program
sub showHelp {
    print "Usage: cvalue.pl [OPTIONS] DESTINATION SOURCE [[, SOURCE] ...]\n\n";
    print "Determines the cvalue of all the n-grams occurring in the\n";
    print "NSP ngram files SOURCE and sends to DESTINATION the list\n";
    print "of n-grams with their cooresponding cvalue. \n\n";
    print "OPTIONS:\n\n";
    print " --version Prints the version number.\n\n";
    print " --help Prints this help message.\n\n";
    return;
}

# function to output the version number
sub showVersion {
    print STDERR "cvalue.pl - version 0.1\n";
    print STDERR "Copyright (C) 2004-2005, Bridget McInnes and Ted Pedersen\n";
    print STDERR "Date of Last Update 03/05/05\n";
    return;
}

# function to output "ask for help" message when the user's goofed up!
sub askHelp {
    print STDERR "Type cvalue.pl --help for help.\n";
    return;
}