#!/usr/local/bin/perl
#############################################################################
# Bridget Thomson McInnes
#
# Description:
#   Print the top n most frequent words that occur in the last s
#   percent of the text but do not occur in the beginning 100-s
#   percent.  A frequency table for the whole text is printed first.
#
# Usage: profiler.pl [-stop STOPLIST] COUNT PERCENT FILE...
#
#   -stop STOPLIST  file with one regular expression per line; any '/'
#                   characters in a line are removed before use
#   COUNT           number of rows to print in each table
#   PERCENT         size of the trailing part of the text, in percent
#   FILE...         one or more text files making up the corpus
#
#############################################################################

use strict;
use warnings;
use bytes;

# Variables referenced by the output format below.  They are file-scoped so
# that the format (declared in their scope) and print_top() share them.
my ($rank, $el, $freq, $ratio, $k);

# Set output format
format STDOUT =
@## @>>>>>>>>>> @#### @##.#### @####
$rank, $el, $freq, $ratio, $k
.

# Return the usage message used for all command-line errors.
sub usage {
    return "Usage: profiler.pl [-stop STOPLIST] COUNT PERCENT FILE...\n";
}

# Return true if $value is defined and looks like a plain decimal number.
sub is_number {
    my ($value) = @_;
    return defined $value && $value =~ /\A[+-]?(?:\d+\.?\d*|\.\d+)\z/;
}

# Build one compiled regex from stoplist lines.
#
# Each line has its line ending and every '/' removed and becomes one
# alternative.  Lines that are empty or whitespace-only are skipped: an empty
# alternative matches at every position and would split every word into
# single letters.  Returns undef (empty list in list context) when no usable
# line remains.
sub compile_stop_regex {
    my @lines = @_;

    my @patterns;
    for my $line (@lines) {
        chomp $line;
        $line =~ s/\///g;
        next unless $line =~ /\S/;
        push @patterns, $line;
    }
    return unless @patterns;

    my $alternation = join '|', @patterns;
    return qr/$alternation/;
}

# Read the stoplist file at $path and return its compiled regex (or undef).
# Dies if the file cannot be opened.
sub load_stop_regex {
    my ($path) = @_;

    open(my $fh, '<', $path) or die "Stoplist file does not exist.\n";
    my @lines = <$fh>;
    close $fh;

    return compile_stop_regex(@lines);
}

# Split one line of text into lower-cased word tokens.
#
# Every character outside a-z/A-Z is turned into a space, then matches of
# $stop_re (if defined) are blanked out, then the line is split on
# whitespace.  The stop regex is applied before lower-casing, so it is
# case-sensitive with respect to the input text.
sub tokenize_line {
    my ($line, $stop_re) = @_;

    chomp $line;
    $line =~ s/[^a-zA-Z]/ /g;
    # Skipped without a stop list: an empty pattern would make Perl reuse the
    # last successfully matched regex instead of matching nothing.
    $line =~ s/$stop_re/ /g if defined $stop_re;
    $line =~ s/^\s+|\s+$//g;

    return map { lc } split /\s+/, $line;
}

# Call $callback->($token) for every token of every file in @$files, in
# order.  Dies if a file cannot be opened.
sub each_token {
    my ($files, $stop_re, $callback) = @_;

    for my $file (@$files) {
        open(my $fh, '<', $file) or die "Could not open file : $file\n";
        while (my $line = <$fh>) {
            $callback->($_) for tokenize_line($line, $stop_re);
        }
        close $fh;
    }
    return;
}

# Create the state used to profile the tail of the token stream.
# $boundary is the number of leading tokens that count as "seen".
sub new_tail_profile {
    my ($boundary) = @_;
    return {
        boundary => $boundary,
        index    => 0,
        seen     => {},
        unique   => {},
        utokens  => 0,
    };
}

# Feed the next token of the stream into a tail profile.
#
# Tokens with a zero-based index below the boundary are recorded as seen.
# Later tokens are counted in 'unique' (and 'utokens') only if they were not
# recorded as seen.  The comparison is strict so that exactly the first
# ceil(boundary) tokens form the seen part.
sub add_tail_token {
    my ($profile, $word) = @_;

    if ($profile->{index}++ < $profile->{boundary}) {
        $profile->{seen}{$word}++;
    }
    elsif (!exists $profile->{seen}{$word}) {
        $profile->{unique}{$word}++;
        $profile->{utokens}++;
    }
    return;
}

# Print the $count most frequent entries of %$freq_of through the STDOUT
# format.  $total is the denominator for the ratio column.  Words with equal
# frequency are ordered alphabetically so the output is reproducible.  If the
# rank never equals $count (e.g. $count is 0), every entry is printed.
sub print_top {
    my ($freq_of, $total, $count) = @_;

    my @by_frequency =
        sort { $freq_of->{$b} <=> $freq_of->{$a} or $a cmp $b } keys %$freq_of;

    $rank = 1;
    for my $word (@by_frequency) {
        $el    = $word;
        $freq  = $freq_of->{$word};
        $ratio = $freq / $total;
        $k     = $rank * $freq;
        write;
        last if $rank++ == $count;
    }
    return;
}

# Entry point: parse the command line, print the whole-corpus table, then
# print the table of words that occur only in the trailing part.
sub main {
    my @args = @_;

    # Create the stop list if requested.
    my $stop_re;
    if (@args && $args[0] eq '-stop') {
        shift @args;
        my $stoplist = shift @args;
        die usage() unless defined $stoplist;
        $stop_re = load_stop_regex($stoplist);
    }

    my $count   = shift @args;
    my $percent = shift @args;
    my @files   = @args;
    die usage() unless is_number($count) && is_number($percent) && @files;

    # Retrieve all the tokens from the corpora.
    my %vocab;
    my $tokens = 0;
    each_token(\@files, $stop_re, sub { $vocab{ $_[0] }++; $tokens++ });

    # Calculate the type to token ratio (0 for an empty corpus).
    my $types = keys %vocab;
    my $r     = $tokens ? $types / $tokens : 0;

    print "FILES : @files\n";
    print "TYPES : $types \n";
    print "TOKENS : $tokens \n";
    print "RATIO : $r \n\n";
    print "RANK WORD FREQ PERCENT K \n";

    # Print the top count elements of the whole corpus.
    print_top(\%vocab, $tokens, $count);

    # Number of leading tokens treated as already seen; the remaining
    # $percent percent of the tokens form the tail that is examined.
    my $s = $tokens - (($percent / 100) * $tokens);

    # Second pass: get the seen and unseen token counts.
    my $tail = new_tail_profile($s);
    each_token(\@files, $stop_re, sub { add_tail_token($tail, $_[0]) });

    print "\n\nRANK WORD FREQ PERCENT K\n";

    # Print the top count unseen types.
    print_top($tail->{unique}, $tail->{utokens}, $count);

    return 0;
}

# Run only when executed as a script, so the helpers can be loaded on their own.
exit main(@ARGV) unless caller;

1;