#!/usr/local/perl ########################################################################## # Bridget Thomson-McInnes # # Description: creates n-grams from a given corpus and returns the # the m top n-grams with their frequencies and their # probabilities using MLE, Add One and Witten Bell # smoothing. # # Usage : smoothing.pl n m # # m = size of the m-gram # n = top n m-grams to be printed # ########################################################################## #Get the command line variable n and m $n = shift @ARGV; $m = shift @ARGV; #complain if less than 2 arguments if (!defined $n || !defined $m) { print STDERR "Not enough arguments to calculate. Retry. \n"; exit(); } #initialize variables %hashSeq = (); #initialize the hash table %unigram = (); #unigram hash table $tokenCount = 0; #the number of tokens $typeCount = 0; #the number of types $V = 0; #the total number of types @queue = (); #sequence queue #Load alphabetic word sequences into a hashtable while($line = lc(<>)) { chomp $line; # remove anything that is not alpha characters # as well as leading and trailing white space $line =~s/[^a-z]/ /g; $line =~s/^\s+//; $line =~s/\s+$//; foreach ( (split/\s+/, $line) ) { # If new word increment V; increment unigram count and push on queue if(! exists $unigrams{$_}) { $V++; } $unigram{$_}++; push @queue, $_; # If m-gram store in hash and shift the queue if($#queue == $m-1) { $seq = join " ", @queue; if(! exists $hashSeq{$seq}) { $typeCount++; } $hashSeq{$seq}++; $tokenCount++; shift @queue; } } } #get n-gram count of V and unobservedTypes $Vcount = 1; foreach(1 .. $m) { $Vcount *= $V; } $unobservedTypes = $Vcount - $typeCount; #set up the output format print "RANK X FREQ MLE ADD-ONE WITTEN-BELL\n"; print "----------------------------------------------------------------------------------------\n"; format STDOUT = @## @<<<<<<<<<<<<<<<<<<<<<<<<<<<< @##### @#.########## @#.########## @#.########## $rank, $seq, $hashSeq{$seq}, $MLE, $ADDONE, $WB . $numberPrinted = 0; #number of sequences printed $lastFreq = 0; #last Frequency printed #print the top n frequencies foreach $seq (sort { $hashSeq{$b} <=> $hashSeq{$a} } keys %hashSeq) { #if lastFreq doesn't equal the last frequency, increment if($lastFreq!= $hashSeq{$seq} ) { $numberPrinted++; $lastFreq = $hashSeq{$seq}; } if ($numberPrinted > $n) { last; } $rank++; $MLE = $hashSeq{$seq} / $tokenCount; $ADDONE = ( $hashSeq{$seq} + 1 ) / ( $tokenCount+$Vcount ); $WB = $hashSeq{$seq} / ( $tokenCount+$typeCount ); write; } #print the unobserved types if the number printed is less than n if($numberPrinted < $n) { $rank++; $seq = "UNOBSERVED"; $hashSeq{$seq} = 0; $MLE = 0; $ADDONE = 1/($tokenCount+$Vcount); if($m != 1) { $WB = $typeCount/(($Vcount-$typeCount)*($tokenCount+$typeCount)); } else { $WB = 0; } write; } print "\n"; print "Number of Tokens : $tokenCount\n"; print "Number of Observed Types : $typeCount\n"; print "Number of Unobserved Types : $unobservedTypes\n"; print "Number of Total Types : $Vcount\n";