# lambda2.pm Version 0.1 # # Statistical library package to calculate the Goodman-Kruskal Lambda coefficient. # This package should be used with statistic.pl and rank.pl. # # Copyright (C) 2002, # Bridget McInnes, University of Minnesota, Duluth # bthomson@d.umn.edu # # This module calculates the Symmetric (Goodman-Kruskal Lambda) # # Usage: # %perl statistic.pl lambda2.pm user2.txt # # Formula for the Symmetric Lambda: # # lambda = ( rF + cF - Fr - Fr ) / ( (2 * N) - Fr - Fc ) # # rF = Sum of the maximum frequency in each row # cF = Sum of the maximum frequency in each column # Fr = maximum marginal row value # Fc = maximum marginal column value # N = total bigram count # # Example: # Given a bigram table the lambda measure can be calculated for each row. # # Bigram Table: # --------------- # 17 # test<>.<>3 3 3 # a<>test<>3 3 3 # This<>is<>1 1 1 # no<>less<>1 1 1 # .<>No<>1 2 1 # less<>than<>1 1 1 # than<>a<>1 1 3 # Just<>a<>1 1 3 # is<>a<>1 1 3 # or<>no<>1 1 1 # more<>or<>1 1 1 # No<>more<>1 1 1 # .<>Just<>1 2 1 # # The row that is going to be used to explain how true mutual information is calculated is: # # test<>.<>3 3 3 # # The first step is to set up a contingency table for this row using the given variables. # 17 - (seen at the top of the bigram table) represents the total number of bigrams for the table # # 3 - the first 3 represents the number of times "test ." appeared in the text # 3 - the second 3 represents the number of times "test" appeared in the first position in the text # 3 - the third 3 represents the number of times "." appeared in the second position in the text # # Using these values a contingency table can be created for this row. The corresponding contingency # table given these values is: # # . ! !. # ______________________ # | | | # test | n11 = 3 | n12 = 0 | n1p = 3 # | | | # ---------------------- # | | | # !test | n21 = 0 | n22 =14 | n2p = 14 # | | | # ---------------------- -------- # np1 = 3 np2 =14 | npp = 17 # # # With the contingency table set the calculations for lambda can be made. # # rF = 3 + 14 = 17 # cF = 3 + 14 = 17 # Fr = 14 # Fc = 14 # N = 17 # # Lamda = ( rF + cF - Fr - Fc ) / ( (2 * N) - Fc - Fr ) # = ( 17 + 17 - 14 - 14 ) / ( (2 * 17) - 14 - 14 ) # = 6 / 6 # = 1 # # Evaluation: # # Lambda is a frequency interpretation with a range between 0 and 1. Lambda is "the percent one # reduces errors in guessing the value of the dependent variable when one knows the value of the # independent variable. Specifically, lambda is the surplus of errors made when the marginals of # the dependent variable are known, minus the number of errors made when the frequencies of the # dependent variable are known for each value of the independent variable" # [ ww2.chass.ncsu.edu/garson/pa765/assocnominal.htm]. # # A lambda score of 0 = distribution of the independent variable does not help in estimating the # dependent variable. # # A lambda of 1 indicates that knowing the distribution of the independent variable may help estimate # the value of the dependent variable. # ########################################################################################################## package lambda2; require Exporter; @ISA = qw ( Exporter ); @EXPORT = qw (initializeStatistic getStatisticName calculateStatistic errorCode errorString); # function to set up various variables before the actual computation # starts. also to check if we are being given bigrams, and if our # frequency combinations are enough to do the computation sub initializeStatistic { ($ngram, $totalBigrams, $combIndex, @freqComb) = @_; $errorCodeNumber = 0; $errorMessage = ""; # check if ngram > 2. mi statistic only defined for ngram = 2. if ($ngram > 2) { $errorCodeNumber = 1; $errorMessage = "Mutual information statistic is only available for bigrams!"; return; } # totalBigrams should not be less than equal to 0 if ($totalBigrams <= 0) { $errorCodeNumber = 10; $errorMessage = "Total number of bigrams ($totalBigrams) must be greater than 0."; return; } # figure out from the @freqComb array if the frequency values we # are going to get are indeed the ones we need. the ones we need # are (0,1), (0) and (1). while we figure this out, we shall also # note which of the indices of the array passed to function # calculateStatistic are the ones we want. my $i; for ($i = 0; $i < $combIndex; $i++) { $string = join (" ", @{$freqComb[$i]}[1..$freqComb[$i][0]]); if ($string eq "0 1") { $jointFreqIndex = $i; } elsif ($string eq "0") { $leftFreqIndex = $i; } elsif ($string eq "1") { $rightFreqIndex = $i; } } if (!(defined $jointFreqIndex)) { $errorCodeNumber = 100; $errorMessage = "Frequency combination \"0 1\" (frequency of bigram) missing!\n"; } if (!(defined $leftFreqIndex)) { $errorCodeNumber = 101; $errorMessage = "Frequency combination \"0\" (frequency of bigrams containing left token) missing!\n"; } if (!(defined $rightFreqIndex)) { $errorCodeNumber = 102; $errorMessage = "Frequency combination \"1\" (frequency of bigrams containing right token) missing!\n"; } } # function to calculate the dice value! sub calculateStatistic { my @numbers = @_; my $jointFrequency = $numbers[$jointFreqIndex]; my $leftFrequency = $numbers[$leftFreqIndex]; my $rightFrequency = $numbers[$rightFreqIndex]; # joint frequency should be greater than equal to zero if ($jointFrequency < 0) { $errorCodeNumber = 200; $errorMessage = "Frequency value ($jointFrequency) must not be negative."; return(0); } # joint frequency should be less than or equal to totalBigrams if ($jointFrequency > $totalBigrams) { $errorCodeNumber = 201; $errorMessage = "Frequency value ($jointFrequency) must not exceed total number of bigrams."; return(0); } # joint frequency should be less than or equal to the marginal totals if ($jointFrequency > $leftFrequency || $jointFrequency > $rightFrequency) { $errorCodeNumber = 202; $errorMessage = "Frequency value of ngram ($jointFrequency) must not exceed the marginal totals."; return(0); } # left frequency should be greater than or equal to zero if ($leftFrequency <= 0) { $errorCodeNumber = 210; $errorMessage = "Marginal total value ($leftFrequency) must not be 0 or less."; return(0); } # left frequency should be less than or equal to totalBigrams if ($leftFrequency > $totalBigrams) { $errorCodeNumber = 211; $errorMessage = "Marginal total value ($leftFrequency) must not exceed total number of bigrams."; return(0); } # right frequency should be greater than or equal to zero if ($rightFrequency <= 0) { $errorCodeNumber = 220; $errorMessage = "Marginal total value ($rightFrequency) must not be 0 or less."; return(0); } # right frequency should be less than or equal to totalBigrams if ($rightFrequency > $totalBigrams) { $errorCodeNumber = 221; $errorMessage = "Marginal total value ($rightFrequency) must not exceed total number of bigrams."; return(0); } # finally the calculation!! ########################################################### # The formula used to calculate Symmetric Lambda is: # # lambda = ( rF + cF - Fr - Fr ) / ( (2 * N) - Fr - Fc ) # # rF = Sum of the maximum frequency in each row # cF = Sum of the maximum frequency in each column # Fr = maximum marginal row value # Fc = maximum marginal column value # N = total bigram count ############################################################ #set contingency table my $npp = $totalBigrams; my $n1p = $rightFrequency; my $np1 = $leftFrequency; my $np2 = $totalBigrams - $np1; my $n2p = $totalBigrams - $n1p; my $n11 = $jointFrequency; my $n21 = $np1 - $n11; my $n12 = $n1p - $n11; my $n22 = $np2 - $n12; #initialize values my $rF = 0; my $cF = 0; my $Fr = 0; my $Fc = 0; #set cF if($n11 < $n12) { $cF = $n12; } else { $cF = $n11; } if ($n21 < $n22) { $cF = $n22 + $cF; } else { $cF = $n21 + $cF; } #set rF if($n11 < $n21) { $rF = $n21; } else{ $rF = $n11; } if($n12 < $n22) { $rF = $n22 + $rF; } else { $rF = $n12 + $rF; } #set Fr if ($n1p < $n2p) { $Fr = $n2p; } else { $Fr = $n1p; } #set Fc if($np1 < $np2) { $Fc = $np2; } else { $Fc = $np1; } #calculate Lambda and return it if( (2 * $npp - $Fr - $Fc ) == 0 ) { return(-1); } else { return ( ($rF + $cF - $Fr - $Fc) / ( 2 * $npp - $Fr - $Fc) ); } } # function to return the error code of the last operation and reset # error code. useful if the error can be recovered from! sub errorCode { my $temp = $errorCodeNumber; $errorCodeNumber = 0; return($temp); } # function to return the error message of the last operation and reset # the message string. useful if error can be recovered from! sub errorString { my $temp = $errorMessage; $errorMessage = ""; return($temp); } # function to return the name of this statistic sub getStatisticName { return "True Mutual Information"; } 1;