#!/usr/bin/python
# Copyright 2014 BARRIOT Roland
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
from os.path import isfile
import h5py
import numpy as np
import datetime
class ScoreMatrix(object):
# attributes:
## datasource
## date
## mode = similarity | distance
## representation = alaR
## strain
## comment
## labels
# representation alaR
# alaR=lower triangle column wise
# index k is n*(i-1) - i*(i-1)/2 + j-i -1
# i\j 0 1 2 3 4 5 6 7 8 9
# 0
# 1 0
# 2 1 9
# 3 2 10 17
# 4 3 11 18 24
# 5 4 12 19 25 30
# 6 5 13 20 26 31 35
# 7 6 14 21 27 32 36 39
# 8 7 15 22 28 33 37 40 42
# 9 8 16 23 29 34 38 41 43 44
# class variables
NA = float("nan") # float("-inf")
def __init__(self, filename=None):
self.filename = filename
# reset
self.strain = 'NA'
self.datasource = 'NA'
self.date = 'NA'
self.mode = 'NA'
self.comment = 'NA'
self.labels = None
self.matrix = None
self.minimum = float("+inf")
self.maximum = float("-inf")
self.mean = 0
self.nb_values = 0
self.nb_NA = 0
self.representation = 'alaR'
self.h5 = None
# reset
if filename is not None:
self.load(filename)
else:
self.h5 = None
def reset(self):
self.strain = 'NA'
self.datasource = 'NA'
self.date = 'NA'
self.mode = 'NA'
self.comment = 'NA'
self.symmetric = True
self.labels = None
self.matrix = None
self.minimum = float("+inf")
self.maximum = float("-inf")
self.mean = 0
self.nb_values = 0
self.nb_NA = 0
self.representation = 'alaR'
self.h5 = None
def load(self, filename=None):
self.reset()
# check filename
if filename is not None:
self.filename = filename
if self.filename is None or not isfile(self.filename):
return None
if filename.endswith('.h5') or filename.endswith('.hf5') or filename.endswith('.hdf5'):
return self.load_hdf5(filename)
elif filename.endswith('.gz'):
return self.load_gz(filename)
else:
print "Unrecognize file format"
return None
def load_parse_meta(self, s):
if s.startswith('# datasource: '):
self.datasource = s.replace('# datasource: ', '')
elif s.startswith('# strain: '):
self.strain = s.replace('# strain: ', '')
elif s.startswith('# date: '):
self.date = s.replace('# date: ', '')
elif s.startswith('# mode: '):
self.mode = s.replace('# mode: ', '')
elif s.startswith('# comment: '):
self.comment = s.replace('# comment: ', '')
elif s.startswith('# representation: '):
self.representation = s.replace('# representation: ', '')
def load_gz(self, filename=None):
import gzip
# check filename
if filename is not None:
self.filename = filename
if self.filename is None or not isfile(self.filename):
return None
f = gzip.open(filename,'r')
s = f.readline().rstrip()
while s.startswith('#'): # metadata
self.load_parse_meta(s)
s = f.readline().rstrip()
n = int(s) # number of genes
self.labels = {}
for i in xrange(n):
s = f.readline().rstrip()
self.labels[s] = i
#~ print self.labels
self.matrix = np.empty( n*(n-1)/2, dtype=np.float32)
for i in xrange( n*(n-1)/2 ):
s = f.readline().rstrip()
if s == 'NA':
self.matrix[i] = np.NAN
else:
self.matrix[i] = float(s)
f.close()
# process NAs
m = self.matrix
# COMPUTE SOME STATS
if np.isnan(self.NA):
self.nb_NA = len(m[ np.isnan(m) ])
else:
self.nb_NA = len(m[ m==self.NA ])
self.minimum = np.nanmin(m)
self.maximum = np.nanmax(m)
self.mean = np.nanmean(m)
def load_hdf5(self, filename=None):
# check filename
if filename is not None:
self.filename = filename
if self.filename is None or not isfile(self.filename):
return None
self.h5 = h5py.File(self.filename, 'r+')
# METADATA
if 'strain'in self.h5.attrs: self.strain = self.h5.attrs['strain']
if 'datasource'in self.h5.attrs: self.datasource = self.h5.attrs['datasource']
if 'date'in self.h5.attrs: self.date = self.h5.attrs['date']
if 'comment'in self.h5.attrs: self.coment = self.h5.attrs['comment']
if 'mode'in self.h5.attrs: self.mode = self.h5.attrs['mode']
if 'comment'in self.h5.attrs: self.comment = self.h5.attrs['comment']
if 'representation'in self.h5.attrs: self.representation = self.h5.attrs['representation']
# allocate memory and read entire matrix
self.matrix = np.empty(self.h5['matrix'].shape, dtype=np.float32)
self.h5['matrix'].read_direct(self.matrix)
# process NAs
m = self.matrix
# COMPUTE SOME STATS
if np.isnan(self.NA):
self.nb_NA = len(m[ np.isnan(m) ])
else:
self.nb_NA = len(m[ m==self.NA ])
self.minimum = np.nanmin(m)
self.maximum = np.nanmax(m)
self.mean = np.nanmean(m)
# LOAD LABELS
self.labels = {}
index = 0
for i in str(self.h5.attrs['labels']).split(','):
self.labels[i] = index
index += 1
self.h5.close()
self.h5=None
def save(self, filename=None, meta=None):
self.save_hdf5(filename, meta)
def save_hdf5(self, filename=None, meta=None):
self.h5 = h5py.File(filename, 'w')
if meta is not None:
for attr in meta:
self.h5.attrs[ attr ] = np.string_(meta[ attr ])
else:
self.h5.attrs[ 'strain' ] = np.string_(self.strain)
self.h5.attrs[ 'datasource' ] = np.string_(self.datasource)
self.h5.attrs[ 'date' ] = np.string_(self.date)
self.h5.attrs[ 'mode' ] = np.string_(self.mode)
self.h5.attrs[ 'comment' ] = np.string_(self.comment)
#~ self.h5.attrs[ 'symmetric' ] = np.string_(self.symmetric)
self.h5.attrs['representation'] = np.string_(self.representation)
labs = sorted( self.labels, key=self.labels.get)
self.h5.attrs['labels'] = np.string_(','.join( labs ) )
self.h5.create_dataset('matrix', data=self.matrix)
self.h5.close()
self.h5=None
def print_info(self):
if self.h5 is not None:
for j in self.h5.attrs:
if j!='labels':
print '%s : %s' % (j, ''.join(str(self.h5.attrs[j])))
else:
print j+': '+self.h5.attrs[j][0:50]+'...'
print 'matrix: '+str(self.h5['matrix'][0:5])
print "min: %s, max: %s, mean: %s, NAs: %s" % (self.minimum, self.maximum, self.mean, self.nb_NA)
else:
print "strain: "+self.strain
print "datasource: "+self.datasource
print "date: "+self.date
print "mode: "+self.mode
print "representation: "+self.representation
print "comment: "+self.comment
print 'matrix: '+str(self.matrix[0:5])
print "min: %s, max: %s, mean: %s, NAs: %s" % (self.minimum, self.maximum, self.mean, self.nb_NA)
def get(self, id1, id2):
# do we have some data?
if self.matrix is None:
return None
# known ids?
if id1 not in self.labels or id2 not in self.labels:
return None
#~ if self.h5.attrs['representation'] == 'alaR':
if self.representation == 'alaR':
# alaR=lower triangle column wise
# index k is n*(i-1) - i*(i-1)/2 + j-i -1
# i\j 0 1 2 3 4 5 6 7 8 9
# 0
# 1 0
# 2 1 9
# 3 2 10 17
# 4 3 11 18 24
# 5 4 12 19 25 30
# 6 5 13 20 26 31 35
# 7 6 14 21 27 32 36 39
# 8 7 15 22 28 33 37 40 42
# 9 8 16 23 29 34 38 41 43 44
i = self.labels[id1]+1
j = self.labels[id2]+1
if i > j:
(i, j) = (j, i) # sort indices, must have i j:
(i, j) = (j, i) # sort indices, must have i= self.relationship_order[ relationship ]:
self.forward[words[0]] = words[1]
self.backward[words[1]] = words[0]
line = str(f.readline()).rstrip()
if filter_cgdb_prefix:
self.remove_cgdb_prefix()
def remove_prefix(self, pattern='[A-Z][a-z]{3}[A-Z]\d{2}\.'):
import re
p = re.compile(pattern)
an_id = self.backward.keys()[0]
m = p.match( an_id )
w = m.group()
for i in self.backward.keys():
if not i.startswith(w):
return
backward = {}
for k in self.forward:
v = self.forward[k][8:]
backward[ v ] = k
self.forward[k] = v
self.backward = backward
class PrioritizedItem(object):
def __init__(self, id=None, score=None, rank=None, rank_ratio=None, zscore=None, comment=None, mapped=None, data={}):
self.id = id
self.score = score
self.rank = rank
self.rank_ratio = rank_ratio
self.zscore = zscore
self.comment = comment
self.mapped = mapped
self.data = data
def display(self):
score = self.score
zscore = self.zscore
if score==Prioritizer.NA:
score = 'NA'
zscore = 'NA'
mapped = ''
if self.mapped is not None:
mapped = self.mapped
comment = ''
if self.comment is not None:
comment = i.comment
print "%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.id, score, self.rank, self.rank_ratio, zscore, mapped, comment)
class Prioritizer(object):
NA = ScoreMatrix.NA
def __init__(self, matrixfile = None):
self.matrix = None
if matrixfile is not None:
self.load_matrix(matrixfile)
def load_matrix(self, filename):
self.matrix = ScoreMatrix(filename)
def set_distance(self, id, a_list):
m = self.matrix
sum = 0
n = 0
for i in a_list:
if id != i: # do not use training gene as candidate
score = m.get(id, i)
if score is not None and not np.isnan(score):
sum += score
n += 1
if n==0:
return np.NaN
return float(sum)/n
def prioritize(self, training_set, candidate_set=None, mapping=None):
res = []
training = []
# MAPPING OF TRAINING SET
if mapping is not None:
for i in training_set:
if i in mapping.forward:
training.append( mapping.forward[i] )
else:
training = training_set
# CANDIDATES AND THEIR MAPPING
candidates = []
if candidate_set is None: # no candidates provided
if mapping is None: # no mapping : use matrix ids
candidates = self.matrix.labels.keys()
else: # no cand but mapping provided : use mapped ids
for i in mapping.forward:
candidates.append( mapping.forward[i] )
elif mapping is None: # cand with no map: use cand
candidates = candidate_set
else: # cand and mapping provided use mapped cand
for i in candidate_set:
if i in mapping.forward:
candidates.append(mapping.forward[i])
for i in candidates:
score = self.set_distance(i, training)
if mapping is None:
res.append(PrioritizedItem(i, score))
else:
res.append(PrioritizedItem( id=mapping.backward[i], score=score, mapped=i))
return Prioritizer.rank_list(res)
def loocvs(self, training_sets, candidate_set=None, mapping=None):
res = {}
rank_ratios = []
for name in training_sets:
genes = training_sets[ name ]
# perform prioritization
pri = self.prioritize(genes, candidate_set, mapping)
# retrieve training genes rank ratios
res[ name ] = []
for pi in pri:
if pi.id in genes:
res[ name ].append(pi)
rank_ratios.append( pi.rank_ratio)
return { 'sets': res, 'auc': 1.-np.nanmean( rank_ratios ) }
@classmethod
def to_prioritizedItem_list(a_list):
res = []
size = len(a_list)
for i in xrange(size):
res.append( PrioritizedItem(a_list[i], Prioritizer.NA, i, (i-1)/(size-1), Prioritizer.NA))
return res
@classmethod
def rank_list(self,a_list):
l = len(a_list)
# z-scores
scores = []
for i in a_list:
if i.score!=float("inf") and not np.isnan(i.score):
scores.append(i.score)
if len(scores) > 0:
the_mean = np.nanmean(scores)
the_sd = np.nanstd(scores)
for i in a_list:
if i.score != Prioritizer.NA and len(scores)>0:
i.zscore = (i.score - the_mean) / the_sd
else:
i.zscore = Prioritizer.NA
# SORT BY SCORE ASCENDING
a_list.sort(key=lambda an_item: float("+inf") if np.isnan(an_item.score) else an_item.score)
# COMPUTE RANK
# FROM 1..NB_ITEMS e.g. 1, 3, 3, 3, 5
rank_min = 1;
rank_max = 1;
nb_items = len(a_list)
i=0;
while i < nb_items:
j = i+1
while j < nb_items and a_list[i].score == a_list[j].score:
j += 1
# either at end of list or score differs at j
rank = (rank_min + j) / 2; # ok, since rank_min starts at 1 and is updated to j+1, and j spans 0..n-1
rank_max = rank
for k in xrange(i, j):
a_list[k].rank = rank
rank_min = j + 1;
i = j
# COMPUTE RANK RATIO
for p in a_list:
if rank_max > 0:
p.rank_ratio = (float(p.rank) - 1) / (rank_max - 1) # 0..1
else:
p.rank_ratio = 1
return a_list
@classmethod
def display(self,a_list, lines = None):
print "id\tscore\trank\trank_ratio\tz-score\tmapped_id\tcomment"
to_display = a_list
if lines is not None:
to_display = a_list[:lines]
for i in to_display:
score = i.score
zscore = i.zscore
if score==Prioritizer.NA:
score = 'NA'
zscore = 'NA'
mapped = ''
if i.mapped is not None:
mapped = i.mapped
comment = ''
if i.comment is not None:
comment = i.comment
print "%s\t%s\t%s\t%s\t%s\t%s\t%s" % (i.id, score, i.rank, i.rank_ratio, zscore, mapped, comment)
def save(self, filename, a_list, lines = None, header=True, meta=True, sep='\t', template=['id','score','rank','rank_ratio','zscore', 'mapped', 'comment']):
f = open(filename, 'w')
if meta:
f.write('# strain: '+self.matrix.strain+"\n")
f.write('# datasource: '+self.matrix.datasource+"\n")
f.write('# date: '+datetime.date.today().strftime('%Y-%m-%d')+"\n")
f.write('# comment: '+self.matrix.comment+"\n")
if header:
f.write( sep.join(template)+"\n")
to_display = a_list
if lines is not None:
to_display = a_list[:lines]
for i in to_display:
if i.score==Prioritizer.NA or np.isnan(i.score):
i.score = 'NA'
i.zscore = 'NA'
if i.mapped is None:
i.mapped = ''
if i.comment is None:
i.comment = ''
line = []
attr = dir(i)
for a in xrange(len(template)):
if template[a] in attr:
line.append( str( getattr( i, template[a] )) )
elif template[a] in i.data:
line.append( str( i.data[ template[a] ] ) )
else:
print template[a]+' not found in PrioritizedItem'
f.write( sep.join(line)+"\n")
f.close()
@classmethod
def load_pri(self, filename):
res = []
with open(filename) as f:
row = f.readline()
while row:
li = str(row)
if not li.startswith('#') and not (li.startswith('id') or li.startswith('candidate') or li.startswith('gene')): # skip comments and header
vals = li.rstrip().split('\t')
res.append (PrioritizedItem(vals[0], vals[1], vals[2], vals[3], vals[4]))
row = f.readline()
return res
@classmethod
def load_pris(self, filename):
ids = []
head = None
res = []
with open(filename) as f:
row = f.readline()
while row:
li = str(row).rstrip()
if li.startswith('candidate') or li.startswith('gene'): # header
head = li.split('\t')
head.pop(0) # remove candidate / gene from datasource names
elif not li.startswith('#') : # skip comments
vals = li.rstrip().split('\t')
ids.append( vals.pop(0) )
for i in xrange(len(vals)):
if vals[i]!='NA':
vals[i] = float(vals[i])
else:
vals[i] = float("nan")
res.append ( vals )
row = f.readline()
mat = np.matrix(res)
return { 'id': ids, 'matrix': mat, 'head': head}
@classmethod
def impute(self, mat, method='mean'):
rows, cols = mat.shape
means = []
for j in xrange(cols):
means.append(np.nanmean( mat[:,j] ))
for i in xrange(rows):
if np.isnan(mat[i,j]):
mat[i,j] = means[j]
@classmethod
def fusion(self, ids, training, matrix, colnames, method='lda'):
from sklearn.lda import LDA
nbrows, nbcols = matrix.shape
m=matrix[:,:] # copy matrix
self.impute(m) # impute missing values
classes = map(lambda x: 1 if x in training else 2, ids)
clf = LDA()
clf.fit(m, classes)
weights = {}
for w in xrange(len(colnames)):
weights[ colnames[w] ] = clf.scalings_[w][0]
fusion = clf.transform(m)
# build result structure
res = []
for i in xrange(nbrows):
r = { 'id': ids[i], 'fusion': fusion[i][0] }
for j in xrange(nbcols):
r[ colnames[j] ] = matrix[i,j]
res.append(r)
res = { 'genes': sorted(res, key=lambda x: x['fusion'], reverse=True), 'weights': weights }
return res
@classmethod
def plot(self, fus, training, colnames, save_as=None):
import matplotlib.pyplot as plt
plt.figure()
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=6)
num_bins = 100
fused = fus['genes']
for j in xrange(len(colnames)):
plt.subplot(len(colnames), 1, j)
vals = []
train = []
train_label = []
cand = []
for i in xrange(len(fused)):
if not np.isnan(fused[i][colnames[j]]):
if fused[i]['id'] in training:
train.append(fused[i][colnames[j]])
train_label.append( fused[i]['id'] )
else:
cand.append(fused[i][colnames[j]])
vals.append(fused[i][colnames[j]])
n, bins, patches = plt.hist(vals, num_bins, facecolor='gray')
ymax = -float(max(n))
ystep = ymax / (len(train)+2)
if j==0:
plt.title(colnames[j], size=8)
else:
plt.title(colnames[j]+' (w: '+str( fus['weights'][colnames[j]])+')', size=8)
plt.axis([0, 1, ymax, -ymax] )
for t in xrange(len(train)):
plt.plot(train[t], ystep*(t+1), 'g.')
plt.text(train[t], ystep*(t+1), train_label[t], size=6)
plt.plot(cand, [ystep * (len(train)+1) ]*len(cand), 'r.')
if save_as is None:
plt.show()
else:
plt.savefig(save_as, bbox_inches='tight', dpi=150)
plt.close()
@classmethod
def load_training_sets(self, filename):
sets = {}
with open(filename) as f:
row = f.readline()
while row:
li = str(row).rstrip()
vals = li.split('\t')
set_name = vals.pop(0)
if set_name != "":
sets[ set_name] = vals
row = f.readline()
return sets
if __name__ == "__main__":
print