#!/usr/local/bin/perl
################################################################################
# Name:    Bridget Thomson-McInnes
# Program: Word Sentence Generator
################################################################################
# Program Description:
#   This program reads in a training corpus and uses it to build an m-gram
#   model.  The program then uses this model to generate sentences.  Each
#   word is chosen from the probability of a word given the m-1 words before
#   it, normalized over the possible next words: a random number between 0
#   and 1 is drawn and the word whose cumulative-probability interval
#   contains that number is picked.
#
# Usage: ./generator.pl m n [corpus-file ...]
#   m            order of the model (1 = unigram, 2 = bigram, ...)
#   n            number of sentences to generate
#   corpus-file  training text; standard input is read when none is given
################################################################################

use strict;
use warnings;

# Marker that fills the context in front of the first word of a sentence.
# Tokens produced by split ' ' are never empty, so it cannot collide with a
# real word.
my $START = "";

# Safety cap on the length of one generated sentence.  A corpus in which no
# sentence terminator is reachable would otherwise make generation run
# forever.
my $MAX_WORDS = 1000;

# Print an error message plus the usage line on STDERR and exit with failure.
sub usage_exit {
    my ($message) = @_;
    print STDERR $message;
    print STDERR "USAGE: generator.pl m n [corpus-file ...]\n";
    exit(1);
}

# Get the command line variables m (m-grams) and n (number of sentences).
# Whatever remains in @ARGV is read as the corpus by <> below.
my $m = shift @ARGV;
my $n = shift @ARGV;

# complain if less than 2 arguments
if (!defined $n || !defined $m) {
    usage_exit("Not enough arguments to calculate. Retry. \n");
}

# complain if the arguments are not usable counts
if ($m !~ /\A[0-9]+\z/ || $n !~ /\A[0-9]+\z/ || $m < 1) {
    usage_exit("m must be a positive integer and n a non-negative integer. Retry. \n");
}

# The model.  Keys are words joined by single spaces, built the same way
# during training and generation so that lookups always agree.
my %gram_count    = ();    # m-gram            -> number of occurrences
my %context_count = ();    # (m-1)-gram        -> times it was followed by a word
my %followers     = ();    # (m-1)-gram        -> distinct next words, first-seen order

# Sliding window holding the m-1 most recent words.  It starts out (and is
# reset after every sentence terminator) full of start markers, so every
# sentence - including the very first one - begins from the same context.
my @queue = ($START) x ($m - 1);

# Load word sequences into the hash tables
while (my $text = <>) {
    chomp $text;
    next if $text =~ /^\s*$/;

    # Make punctuation marks tokens of their own.
    $text =~ s/([,;:.!?])/ $1 /g;

    # Delete quotes, parentheses and the other characters in the ASCII range
    # '"' .. ')'.  This is exactly the set the original class [("'"-)]
    # matched; note that '-' is NOT removed.
    $text =~ tr/"#$%&'()//d;

    # split ' ' skips leading whitespace, so no empty tokens are produced.
    for my $word (split ' ', $text) {
        my $context = join " ", @queue;
        my $gram    = join " ", @queue, $word;

        $context_count{$context}++;
        push @{ $followers{$context} }, $word unless $gram_count{$gram};
        $gram_count{$gram}++;

        # Slide the window forward by one word.
        push @queue, $word;
        shift @queue;

        # A sentence terminator starts a fresh context.
        if ($word =~ m/[.!?]/) {
            @queue = ($START) x ($m - 1);
        }
    }
}

# Without at least one word in the corpus there is nothing to generate from.
if ($n > 0 && !exists $followers{ join " ", ($START) x ($m - 1) }) {
    print STDERR "No training data to generate sentences from. Retry. \n";
    exit(1);
}

# print out n sentences
for my $j (1 .. $n) {
    print "Sentence $j: ";

    # initialize the start of the sentence
    my @sequence = ($START) x ($m - 1);

    for (1 .. $MAX_WORDS) {
        my $context    = join " ", @sequence;
        my $candidates = $followers{$context};

        # Dead end: this context was never followed by a word in the corpus
        # (e.g. the corpus stopped mid-sentence), so end the sentence here.
        last unless $candidates;

        # P(word | context) = count(context word) / count(context).  The
        # candidates' probabilities sum to 1, so walking the cumulative sum
        # until it passes a random number in [0,1) samples the distribution.
        my $number     = rand 1;
        my $cumulative = 0;

        # Default to the last candidate in case floating-point rounding
        # leaves the final cumulative sum a hair below 1.
        my $chosen_word = $candidates->[-1];

        for my $candidate (@{$candidates}) {
            my $gram = join " ", @sequence, $candidate;
            $cumulative += $gram_count{$gram} / $context_count{$context};
            if ($number < $cumulative) {
                $chosen_word = $candidate;
                last;
            }
        }

        push @sequence, $chosen_word;
        shift @sequence;

        print "$chosen_word ";

        last if $chosen_word =~ /[.!?]/;
    }

    print "\n\n";
}