#!/usr/bin/perl -Wall
use strict;
use Class::Struct;
use vars qw/ %opt /;
use Getopt::Std;
use Math::CDF;
my $opt_string = 'hq:r:a:c';
getopts( "$opt_string", \%opt ) or usage();
# PARAMETERS
my $alpha = 0.05;
my $referenceFile = "swissprot_keywords_annotated_genes.txt";
if ($opt{r}) { $referenceFile = $opt{r}; }
my $queryFile;
if ($opt{q}) { $queryFile = $opt{q}; }
else { usage(); }
if ($opt{a}) { $alpha = $opt{a}; }
usage() if $opt{h};
sub usage {
print STDERR << "EOF";
usage: $0 [-h] -q query_IDs_file [-r reference_file] [-a alpha_threshold] [-c]
option:
-q specify file for genes of interest
-r specify annotation file (default: $referenceFile)
-a specify significance threshold (default: $alpha)
-c apply FDR correction for multiple testing
-h display this help
EOF
exit;
}
# STRUCTS
struct ( Hit => [
term => '$',
pvalue => '$',
pvalueadj => '$',
common => '$',
target => '$'
]);
# SUBS
sub computeIntersectionSize {
my ($q, $t) = @_;
my $res = 0;
my %h;
@h{@$q} = @$q;
foreach my $i (@$t) { if (defined $h{$i}) { $res++; } }
return $res;
}
sub min3 {
my ($m, $a, $b) = @_;
if ($a<$m) { $m = $a; }
if ($b<$m) { $m = $b; }
return $m;
}
sub binom {
my ($population,$query,$target,$intersection) = @_;
# transform in a binomial test
my $x = $query - $intersection; # number of failures/mismatches
my $p = $target / $population;
# using the cumulative negative binomial dist
# i.e. we compute the prob of observing $x or fewer failures (<-> $commonElements successes or more)
# before the $n-th success ($commonElements)
# when each trial has a prob of p to succeed (= $targetSize / $populationSize)
return Math::CDF::pnbinom($x,$intersection,$p);
}
sub FDR {
my $alpha = shift;
my $h = shift;
my @hits = @$h;
my @shits = sort {$a->pvalue <=> $b->pvalue} @hits;
my $n = scalar @shits;
if ($n > 0) {
$shits[$n-1]->pvalueadj($shits[$n-1]->pvalue);
for (my $i=$n-1;$i>=1;$i--) {
$shits[$i-1]->pvalueadj( min3($shits[$i]->pvalueadj, $n/$i*$shits[$i-1]->pvalue, 1) );
}
}
}
# LOAD REFERENCE DATASET (IDS WITH ANNOTATIONS)
my %genes; # $genes{"term"} = @(gene1, gene2, ...)
my %genome;
open(F, "$referenceFile") or die "Cannot open reference file '$referenceFile'.";
while (<F>) {
chomp;
my ($kw, @geneList) = split /\t/;
$genes{$kw} = \@geneList;
@genome{@geneList} = @geneList;
}
close(F);
my $genomeSize = scalar keys %genome;
# LOAD QUERY
my @query;
if ( -e $queryFile ) {
my %qTmp; # temp hash in case of redundant genes
open(F,"$queryFile") or die "Cannot open query file '$queryFile'.";
while (<F>) {
$_ =~ s/^\s+//; # trim beginning of line
my @a = split /\s+/;
@qTmp{@a} = @a;
}
close(F);
@query = keys %qTmp;
}
else {
my %qTmp; # temp hash in case of redundant genes
@query = split /\s+/, $queryFile;
@qTmp{@query} = @query;
@query = keys %qTmp;
}
# MAKE COMPARISONS
my $querySize = scalar @query;
my @hits;
foreach my $annotation (keys %genes) {
my $target = $genes{$annotation};
my $targetSize = scalar @$target;
my $nbCommonElements = computeIntersectionSize(\@query, $target);
my $pvalue = binom($genomeSize, $querySize, $targetSize, $nbCommonElements);
my $hit = new Hit(
term=>$annotation,
pvalue=>$pvalue,
pvalueadj=>$pvalue,
common=>$nbCommonElements,
target=>$targetSize
);
push @hits, $hit;
}
# apply FDR if asked
FDR($alpha, \@hits) if $opt{c};
# print out results
print "Term\tp-value\tp-value adj.";
foreach my $h (sort {$a->pvalueadj <=> $b->pvalueadj} @hits) {
if ($h->pvalueadj <= $alpha) {
print $h->term."\t".$h->pvalue."\t".$h->pvalueadj."\t($querySize/".$h->target."/".$h->common.")";
}
else { exit; }
}