silico.biotoul.fr
 

Enriched.pl

From silico.biotoul.fr

Jump to: navigation, search

Perl script

#!/usr/bin/perl -Wall
use strict;
use Class::Struct;
use vars qw/ %opt /;
use Getopt::Std;
use Math::CDF;
my $opt_string = 'hq:r:a:c';
getopts( "$opt_string", \%opt ) or usage();
 
# PARAMETERS
my $alpha = 0.05;
my $referenceFile = "swissprot_keywords_annotated_genes.txt";
if ($opt{r}) { $referenceFile = $opt{r}; }
 
my $queryFile;
if ($opt{q}) { $queryFile     = $opt{q}; }
else { usage(); }
 
if ($opt{a}) { $alpha         = $opt{a}; }
 
usage() if $opt{h};
 
sub usage {
print STDERR << "EOF";
    usage: $0 [-h] -q query_IDs_file [-r reference_file] [-a alpha_threshold] [-c]
		option:
			-q	specify file for genes of interest
			-r	specify annotation file (default: $referenceFile)
			-a	specify significance threshold (default: $alpha)
			-c	apply FDR correction for multiple testing
			-h	display this help
EOF
	exit;
}
 
# STRUCTS
struct ( Hit => [
	term       => '$',
	pvalue     => '$',
	pvalueadj  => '$',
	common     => '$',
	target     => '$'
]);
 
# SUBS
sub computeIntersectionSize {
	my ($q, $t) = @_;
	my $res = 0;
	my %h;
	@h{@$q} = @$q;
	foreach my $i (@$t) { if (defined $h{$i}) { $res++; }	}
	return $res;
}
 
sub min3 { 
	my ($m, $a, $b) = @_;
	if ($a<$m) { $m = $a; } 
	if ($b<$m) { $m = $b; } 
	return $m; 
} 
 
sub binom {
	my ($population,$query,$target,$intersection) = @_;
	# transform in a binomial test
	my $x = $query - $intersection; # number of failures/mismatches
	my $p = $target / $population;
	# using the cumulative negative binomial dist
	# i.e. we compute the prob of observing $x or fewer failures (<-> $commonElements successes or more)
	#      before the $n-th success ($commonElements) 
	#      when each trial has a prob of p to succeed (= $targetSize / $populationSize)
	return Math::CDF::pnbinom($x,$intersection,$p);
}
 
sub FDR {
	my $alpha = shift;
	my $h     = shift;
	my @hits  = @$h;
	my @shits = sort {$a->pvalue <=> $b->pvalue} @hits;
	my $n = scalar @shits;
	if ($n > 0) {
		$shits[$n-1]->pvalueadj($shits[$n-1]->pvalue);
		for (my $i=$n-1;$i>=1;$i--) {
			$shits[$i-1]->pvalueadj( min3($shits[$i]->pvalueadj, $n/$i*$shits[$i-1]->pvalue, 1) );
		}	
	}
}
 
# LOAD REFERENCE DATASET (IDS WITH ANNOTATIONS)
my %genes; # $genes{"term"} = @(gene1, gene2, ...)
my %genome;
open(F, "$referenceFile") or die "Cannot open reference file '$referenceFile'.";
while (<F>) {
	chomp;
	my ($kw, @geneList) = split /\t/;
	$genes{$kw} = \@geneList;
	@genome{@geneList} = @geneList;
}
close(F);
 
my $genomeSize = scalar keys %genome;
 
# LOAD QUERY
my @query;
if ( -e $queryFile ) {
	my %qTmp; # temp hash in case of redundant genes
	open(F,"$queryFile") or die "Cannot open query file '$queryFile'.";
	while (<F>) {
		$_ =~ s/^\s+//; # trim beginning of line
		my @a = split /\s+/;
		@qTmp{@a} = @a;
	}
	close(F);
	@query = keys %qTmp;
}
else {
	my %qTmp; # temp hash in case of redundant genes
	@query = split /\s+/, $queryFile;
	@qTmp{@query} = @query;
	@query = keys %qTmp;
}
 
# MAKE COMPARISONS
my $querySize = scalar @query;
my @hits;
foreach my $annotation (keys %genes) {
	my $target = $genes{$annotation};
	my $targetSize = scalar @$target;
	my $nbCommonElements = computeIntersectionSize(\@query, $target);
	my $pvalue = binom($genomeSize, $querySize, $targetSize, $nbCommonElements);
	my $hit = new Hit(
		term=>$annotation, 
		pvalue=>$pvalue,
		pvalueadj=>$pvalue,
		common=>$nbCommonElements,
		target=>$targetSize
	);
	push @hits, $hit;
}
 
# apply FDR if asked
FDR($alpha, \@hits) if $opt{c};
 
# print out results
print "Term\tp-value\tp-value adj.";
foreach my $h (sort {$a->pvalueadj <=> $b->pvalueadj} @hits) {
	if ($h->pvalueadj <= $alpha) {
		print $h->term."\t".$h->pvalue."\t".$h->pvalueadj."\t($querySize/".$h->target."/".$h->common.")";
	}
	else { exit; }
}

Annotation files