Enriched.pl
From silico.biotoul.fr
(Difference between revisions)
m (Created page with '= Perl script = <source lang='perl'> #!/usr/bin/perl -Wall use strict; use Class::Struct; use vars qw/ %opt /; use Getopt::Std; use Math::CDF; my $opt_string = 'hq:r:a:c'; getopt…') |
m (→Annotation files) |
||
Line 156: | Line 156: | ||
* [[Media:YeastUniProtKeywords.txt]] | * [[Media:YeastUniProtKeywords.txt]] | ||
* [[Media:PseudomonasAeruginosaUniProtKeywords.txt]] | * [[Media:PseudomonasAeruginosaUniProtKeywords.txt]] | ||
+ | * [[Media:EscherichiaColiUniProtKeywords.txt]] |
Current revision as of 18:02, 29 November 2011
Perl script
#!/usr/bin/perl -Wall use strict; use Class::Struct; use vars qw/ %opt /; use Getopt::Std; use Math::CDF; my $opt_string = 'hq:r:a:c'; getopts( "$opt_string", \%opt ) or usage(); # PARAMETERS my $alpha = 0.05; my $referenceFile = "swissprot_keywords_annotated_genes.txt"; if ($opt{r}) { $referenceFile = $opt{r}; } my $queryFile; if ($opt{q}) { $queryFile = $opt{q}; } else { usage(); } if ($opt{a}) { $alpha = $opt{a}; } usage() if $opt{h}; sub usage { print STDERR << "EOF"; usage: $0 [-h] -q query_IDs_file [-r reference_file] [-a alpha_threshold] [-c] option: -q specify file for genes of interest -r specify annotation file (default: $referenceFile) -a specify significance threshold (default: $alpha) -c apply FDR correction for multiple testing -h display this help EOF exit; } # STRUCTS struct ( Hit => [ term => '$', pvalue => '$', pvalueadj => '$', common => '$', target => '$' ]); # SUBS sub computeIntersectionSize { my ($q, $t) = @_; my $res = 0; my %h; @h{@$q} = @$q; foreach my $i (@$t) { if (defined $h{$i}) { $res++; } } return $res; } sub min3 { my ($m, $a, $b) = @_; if ($a<$m) { $m = $a; } if ($b<$m) { $m = $b; } return $m; } sub binom { my ($population,$query,$target,$intersection) = @_; # transform in a binomial test my $x = $query - $intersection; # number of failures/mismatches my $p = $target / $population; # using the cumulative negative binomial dist # i.e. we compute the prob of observing $x or fewer failures (<-> $commonElements successes or more) # before the $n-th success ($commonElements) # when each trial has a prob of p to succeed (= $targetSize / $populationSize) return Math::CDF::pnbinom($x,$intersection,$p); } sub FDR { my $alpha = shift; my $h = shift; my @hits = @$h; my @shits = sort {$a->pvalue <=> $b->pvalue} @hits; my $n = scalar @shits; if ($n > 0) { $shits[$n-1]->pvalueadj($shits[$n-1]->pvalue); for (my $i=$n-1;$i>=1;$i--) { $shits[$i-1]->pvalueadj( min3($shits[$i]->pvalueadj, $n/$i*$shits[$i-1]->pvalue, 1) ); } } } # LOAD REFERENCE DATASET (IDS WITH ANNOTATIONS) my %genes; # $genes{"term"} = @(gene1, gene2, ...) my %genome; open(F, "$referenceFile") or die "Cannot open reference file '$referenceFile'."; while (<F>) { chomp; my ($kw, @geneList) = split /\t/; $genes{$kw} = \@geneList; @genome{@geneList} = @geneList; } close(F); my $genomeSize = scalar keys %genome; # LOAD QUERY my @query; if ( -e $queryFile ) { my %qTmp; # temp hash in case of redundant genes open(F,"$queryFile") or die "Cannot open query file '$queryFile'."; while (<F>) { $_ =~ s/^\s+//; # trim beginning of line my @a = split /\s+/; @qTmp{@a} = @a; } close(F); @query = keys %qTmp; } else { my %qTmp; # temp hash in case of redundant genes @query = split /\s+/, $queryFile; @qTmp{@query} = @query; @query = keys %qTmp; } # MAKE COMPARISONS my $querySize = scalar @query; my @hits; foreach my $annotation (keys %genes) { my $target = $genes{$annotation}; my $targetSize = scalar @$target; my $nbCommonElements = computeIntersectionSize(\@query, $target); my $pvalue = binom($genomeSize, $querySize, $targetSize, $nbCommonElements); my $hit = new Hit( term=>$annotation, pvalue=>$pvalue, pvalueadj=>$pvalue, common=>$nbCommonElements, target=>$targetSize ); push @hits, $hit; } # apply FDR if asked FDR($alpha, \@hits) if $opt{c}; # print out results print "Term\tp-value\tp-value adj."; foreach my $h (sort {$a->pvalueadj <=> $b->pvalueadj} @hits) { if ($h->pvalueadj <= $alpha) { print $h->term."\t".$h->pvalue."\t".$h->pvalueadj."\t($querySize/".$h->target."/".$h->common.")"; } else { exit; } }