# lambda3.pm Version 0.1
#
# Statistical library package to calculate the Goodman-Kruskal Lambda coefficient.
# This package should be used with statistic.pl and rank.pl.
#
# Copyright (C) 2002,
# Bridget McInnes, University of Minnesota, Duluth
# bthomson@d.umn.edu
#
# This module calculates the Symmetrical Lambda (Goodman-Kruskal Lambda)
#
# Usage:
#
# %perl statistic.pl -ngram 3 lambda3.pm user3.txt
#
# Formula for the Lambda:
#
# lambda = ( rF + cF - Fr - Fc ) / ( (2 * N) - Fr - Fc )
#
# rF = Sum of the maximum frequency in each row
# cF = Sum of the maximum frequency in each column
# Fr = maximum marginal row value
# Fc = maximum marginal column value
# N  = total trigram count
#
# Example:
# Given a trigram table the Symmetric Lambda can be calculated for each row.
#
# Trigram Table:
# ---------------
# 16
# a<>test<>.<>3 3 3 3 3 3 3
# .<>Just<>a<>1 2 1 3 1 1 1
# is<>a<>test<>1 1 3 3 1 1 3
# less<>than<>a<>1 1 1 3 1 1 1
# more<>or<>no<>1 1 1 1 1 1 1
# test<>.<>Just<>1 2 2 1 2 1 1
# No<>more<>or<>1 1 1 1 1 1 1
# .<>No<>more<>1 2 1 1 1 1 1
# or<>no<>less<>1 1 1 1 1 1 1
# This<>is<>a<>1 1 1 3 1 1 1
# than<>a<>test<>1 1 3 3 1 1 3
# Just<>a<>test<>1 1 3 3 1 1 3
# test<>.<>No<>1 2 2 1 2 1 1
# no<>less<>than<>1 1 1 1 1 1 1
#
# The row that is going to be used to explain how the Symmetric Lambda is calculated is:
#
# is<>a<>test<>1 1 3 3 1 1 3
#
# The first step is to set up a contingency table for this row using the given variables.
# 16 - (seen at the top of the trigram table) represents the total number of trigrams for the table
#
# 1 - (the first number)   - represents how many times "is a test" occurred in the text
# 1 - (the second number)  - represents how many times "is" occurred in the first position
# 3 - (the third number)   - represents how many times "a" occurred in the second position
# 3 - (the fourth number)  - represents how many times "test" occurred in the third position
# 1 - (the fifth number)   - represents how many times "is" occurred in the first position and
#                            "a" occurred in the second position
# 1 - (the sixth number)   - represents how many times "is" occurred in the first position and
#                            "test" occurred in the third position
# 3 - (the seventh number) - represents how many times "a" occurred in the second position and
#                            "test" occurred in the third position
#
# Using these values a contingency table can be created for this row. The corresponding contingency
# table given these values is:
#
#              |   test   |  !test   |
#          ____|__________|__________|
#          |   |          |          |
#          | a | n11 = 1  | n12 = 0  | n1p = 1
#     is   |___|__________|__________|___
#          |   |          |          |
#          |!a | n21 = 0  | n22 = 0  | n2p = 0
#      ____|___|__________|__________|___
#          |   |          |          |
#          | a | n31 = 2  | n32 = 2  | n3p = 4
#     !is  |___|__________|__________|___
#          |   |          |          |
#          |!a | n41 = 2  | n42 = 9  | n4p = 11
#          |___|__________|__________| _________
#                np1 = 5  | np2 = 11 | npp = 16
#
#
# With the contingency table set the calculations for lambda can be made.
#
# rF = 2 + 9 = 11
# cF = 1 + 0 + 2 + 9 = 12
# Fr = 11
# Fc = 11
# N  = 16
#
# Lambda = ( rF + cF - Fr - Fc ) / ( (2 * N) - Fc - Fr )
#        = ( 11 + 12 - 11 - 11 ) / ( (2 * 16) - 11 - 11 )
#        = 1 / 10
#        = .1
#
# The output table will contain a column holding the Lambda score computed from the
# contingency table:
#
# is<>a<>test<>1 0.1000 1 1 3 3 1 1 3
#
# Evaluation:
#
# "Lambda is a symmetric measure of the power to predict" -
# [www.unesco.org/webworld/idams/Doc/Manual-itml/E2tables.htm].
#
# Lambda is a frequency interpretation with a range between 0 and 1. Lambda is "the percent one
# reduces errors in guessing the value of the dependent variable when one knows the value of the
# independent variable. Specifically, lambda is the surplus of errors made when the marginals of
# the dependent variable are known, minus the number of errors made when the frequencies of the
# dependent variable are known for each value of the independent variable"
# [ ww2.chass.ncsu.edu/garson/pa765/assocnominal.htm].
#
# A lambda score of 0 = distribution of the independent variable does not help in estimating the
# dependent variable.
#
# A lambda of 1 indicates that knowing the distribution of the independent variable may help
# estimate the value of the dependent variable.
#
##############################################################################################################

package lambda3;

require Exporter;
@ISA    = qw ( Exporter );
@EXPORT = qw (initializeStatistic getStatisticName calculateStatistic errorCode errorString);

# function to set up various variables before the actual computation
# starts. also to check if we are being given trigrams, and if our
# frequency combinations are enough to do the computation
#
# Arguments:
#   $ngram         - size of the n-grams being scored (must be 3)
#   $totalTrigrams - total number of trigrams (must be > 0)
#   $combIndex     - number of entries in @freqComb
#   @freqComb      - array refs; element 0 of each is the number of token
#                    positions in the combination, followed by the positions
#
# Results are left in package variables: $errorCodeNumber / $errorMessage
# describe any failure, and the *FreqIndex variables record where each
# required combination sits in the list.
sub initializeStatistic
{
    ($ngram, $totalTrigrams, $combIndex, @freqComb) = @_;

    $errorCodeNumber = 0;
    $errorMessage    = "";

    # every frequency combination the statistic needs:
    # [ combination, where its index is stored, error code if missing, description ]
    my @wanted = (
        [ "0 1 2", \$jointFreqIndex,       100, "frequency of trigram" ],
        [ "0",     \$leftFreqIndex,        101, "frequency of trigrams containing left token" ],
        [ "1",     \$middleFreqIndex,      102, "frequency of trigrams containing middle token" ],
        [ "2",     \$rightFreqIndex,       103, "frequency of trigrams containing right token" ],
        [ "0 1",   \$leftMiddleFreqIndex,  104, "frequency of trigrams containing left middle token" ],
        [ "0 2",   \$leftRightFreqIndex,   105, "frequency of trigrams containing left right token" ],
        [ "1 2",   \$middleRightFreqIndex, 106, "frequency of trigrams containing middle right token" ],
    );

    # forget indices found by an earlier call, otherwise the "missing"
    # checks below would be satisfied by stale values
    ${ $_->[1] } = undef for @wanted;

    # lambda statistic is only defined for ngram = 3
    if ($ngram != 3)
    {
        $errorCodeNumber = 1;
        $errorMessage = "Lambda statistic is only available for trigrams!";
        return;
    }

    # totalTrigrams should not be less than equal to 0
    if ($totalTrigrams <= 0)
    {
        $errorCodeNumber = 10;
        $errorMessage = "Total number of trigrams ($totalTrigrams) must be greater than 0.";
        return;
    }

    # figure out from the @freqComb array if the frequency values we
    # are going to get are indeed the ones we need. the ones we need
    # are (0,1,2), (0), (1), (2), (0,1), (0,2), and (1,2). while we
    # figure this out, we shall also note which of the indices of the
    # array are the ones we want.
    my %slotFor = map { $_->[0] => $_->[1] } @wanted;
    for my $i (0 .. $combIndex - 1)
    {
        my @entry = @{ $freqComb[$i] };
        my $combination = join(" ", @entry[1 .. $entry[0]]);
        ${ $slotFor{$combination} } = $i if exists $slotFor{$combination};
    }

    # report missing combinations; as before, when several are missing the
    # last one checked is the one left in the error variables
    for my $spec (@wanted)
    {
        my ($combination, $slot, $code, $description) = @$spec;
        next if defined $$slot;
        $errorCodeNumber = $code;
        $errorMessage = "Frequency combination \"$combination\" ($description) missing!\n";
    }

    return;
}

# function to calculate the lambda value!
# calculateStatistic(@numbers)
#
# Computes the symmetric Goodman-Kruskal Lambda for one trigram.
#
# Formula:
#   lambda = ( rF + cF - Fr - Fc ) / ( (2 * N) - Fr - Fc )
#
#   rF = Sum of the maximum frequency in each row
#   cF = Sum of the maximum frequency in each column
#   Fr = maximum marginal row value
#   Fc = maximum marginal column value
#   N  = total trigram count
#
# Arguments are read by position:
#   [0] joint  [1] left  [2] middle  [3] right
#   [4] left+middle  [5] left+right  [6] middle+right
# N is taken from the package variable $totalTrigrams.
#
# Returns the absolute value of lambda. Returns 0 with error code 200/201
# set when the joint frequency is out of range, and 0 when the denominator
# is zero (lambda is undefined there).
sub calculateStatistic
{
    require List::Util;
    my $max = \&List::Util::max;

    my @numbers = @_;
    my $npp = $totalTrigrams;
    my $n11 = $numbers[0];

    # joint frequency should be greater than equal to zero
    if ($n11 < 0)
    {
        $errorCodeNumber = 200;
        $errorMessage = "Frequency value ($n11) must not be negative.";
        return(0);
    }

    # joint frequency should be less than or equal to totalTrigrams
    if ($n11 > $npp)
    {
        $errorCodeNumber = 201;
        $errorMessage = "Frequency value ($n11) must not exceed total number of trigrams.";
        return(0);
    }

    # set contingency table
    # NOTE(review): each cell is derived as (marginal - joint) without
    # subtracting the other overlapping cells. This reproduces the worked
    # example in the file header; confirm it is the intended table.
    my $n21 = abs($numbers[5] - $n11);
    my $n31 = abs($numbers[6] - $n11);
    my $n41 = abs($numbers[3] - $n11);

    my $n12 = abs($numbers[4] - $n11);
    my $n22 = abs($numbers[1] - $n11);
    my $n32 = abs($numbers[2] - $n11);

    my $np1 = $n11 + $n21 + $n31 + $n41;
    my $np2 = $npp - $np1;

    # n42 is whatever remains once every other cell is accounted for
    my $n42 = $np2 - $n32 - $n22 - $n12;

    my $n1p = $n11 + $n12;
    my $n2p = $n21 + $n22;
    my $n3p = $n31 + $n32;
    my $n4p = $n41 + $n42;

    # rF: largest cell of the nX1 cells plus largest cell of the nX2 cells
    my $rF = $max->($n11, $n21, $n31, $n41) + $max->($n12, $n22, $n32, $n42);

    # cF: largest cell of each (nX1, nX2) pair, summed
    my $cF = $max->($n11, $n12) + $max->($n21, $n22)
           + $max->($n31, $n32) + $max->($n41, $n42);

    # largest marginals
    my $Fr = $max->($n1p, $n2p, $n3p, $n4p);
    my $Fc = $max->($np1, $np2);

    # both marginal maxima equal N: lambda is undefined, and dividing
    # would abort the whole run
    my $denominator = (2 * $npp) - $Fr - $Fc;
    return(0) if $denominator == 0;

    # calculate and return the lambda for the contingency table
    return ( abs(( $rF + $cF - $Fr - $Fc ) / $denominator) );
}

# function to return the error code of the last operation and reset
# error code. useful if the error can be recovered from!
sub errorCode
{
    my $code = $errorCodeNumber;
    $errorCodeNumber = 0;
    return($code);
}

# function to return the error message of the last operation and reset
# the message string. useful if error can be recovered from!
sub errorString
{
    my $message = $errorMessage;
    $errorMessage = "";
    return($message);
}

# function to return the name of this statistic
sub getStatisticName
{
    return "Symmetric Lambda";
}

1;