#!/usr/bin/python # Copyright 2014 BARRIOT Roland # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . from os.path import isfile import h5py import numpy as np import datetime class ScoreMatrix(object): # attributes: ## datasource ## date ## mode = similarity | distance ## representation = alaR ## strain ## comment ## labels # representation alaR # alaR=lower triangle column wise # index k is n*(i-1) - i*(i-1)/2 + j-i -1 # i\j 0 1 2 3 4 5 6 7 8 9 # 0 # 1 0 # 2 1 9 # 3 2 10 17 # 4 3 11 18 24 # 5 4 12 19 25 30 # 6 5 13 20 26 31 35 # 7 6 14 21 27 32 36 39 # 8 7 15 22 28 33 37 40 42 # 9 8 16 23 29 34 38 41 43 44 # class variables NA = float("nan") # float("-inf") def __init__(self, filename=None): self.filename = filename # reset self.strain = 'NA' self.datasource = 'NA' self.date = 'NA' self.mode = 'NA' self.comment = 'NA' self.labels = None self.matrix = None self.minimum = float("+inf") self.maximum = float("-inf") self.mean = 0 self.nb_values = 0 self.nb_NA = 0 self.representation = 'alaR' self.h5 = None # reset if filename is not None: self.load(filename) else: self.h5 = None def reset(self): self.strain = 'NA' self.datasource = 'NA' self.date = 'NA' self.mode = 'NA' self.comment = 'NA' self.symmetric = True self.labels = None self.matrix = None self.minimum = float("+inf") self.maximum = float("-inf") self.mean = 0 self.nb_values = 0 self.nb_NA = 0 self.representation = 'alaR' self.h5 = None def load(self, filename=None): self.reset() # check filename if filename is not None: self.filename = filename if self.filename is None or not isfile(self.filename): return None if filename.endswith('.h5') or filename.endswith('.hf5') or filename.endswith('.hdf5'): return self.load_hdf5(filename) elif filename.endswith('.gz'): return self.load_gz(filename) else: print "Unrecognize file format" return None def load_parse_meta(self, s): if s.startswith('# datasource: '): self.datasource = s.replace('# datasource: ', '') elif s.startswith('# strain: '): self.strain = s.replace('# strain: ', '') elif s.startswith('# date: '): self.date = s.replace('# date: ', '') elif s.startswith('# mode: '): self.mode = s.replace('# mode: ', '') elif s.startswith('# comment: '): self.comment = s.replace('# comment: ', '') elif s.startswith('# representation: '): self.representation = s.replace('# representation: ', '') def load_gz(self, filename=None): import gzip # check filename if filename is not None: self.filename = filename if self.filename is None or not isfile(self.filename): return None f = gzip.open(filename,'r') s = f.readline().rstrip() while s.startswith('#'): # metadata self.load_parse_meta(s) s = f.readline().rstrip() n = int(s) # number of genes self.labels = {} for i in xrange(n): s = f.readline().rstrip() self.labels[s] = i #~ print self.labels self.matrix = np.empty( n*(n-1)/2, dtype=np.float32) for i in xrange( n*(n-1)/2 ): s = f.readline().rstrip() if s == 'NA': self.matrix[i] = np.NAN else: self.matrix[i] = float(s) f.close() # process NAs m = self.matrix # COMPUTE SOME STATS if np.isnan(self.NA): self.nb_NA = len(m[ np.isnan(m) ]) else: self.nb_NA = len(m[ m==self.NA ]) self.minimum = np.nanmin(m) self.maximum = np.nanmax(m) self.mean = np.nanmean(m) def load_hdf5(self, filename=None): # check filename if filename is not None: self.filename = filename if self.filename is None or not isfile(self.filename): return None self.h5 = h5py.File(self.filename, 'r+') # METADATA if 'strain'in self.h5.attrs: self.strain = self.h5.attrs['strain'] if 'datasource'in self.h5.attrs: self.datasource = self.h5.attrs['datasource'] if 'date'in self.h5.attrs: self.date = self.h5.attrs['date'] if 'comment'in self.h5.attrs: self.coment = self.h5.attrs['comment'] if 'mode'in self.h5.attrs: self.mode = self.h5.attrs['mode'] if 'comment'in self.h5.attrs: self.comment = self.h5.attrs['comment'] if 'representation'in self.h5.attrs: self.representation = self.h5.attrs['representation'] # allocate memory and read entire matrix self.matrix = np.empty(self.h5['matrix'].shape, dtype=np.float32) self.h5['matrix'].read_direct(self.matrix) # process NAs m = self.matrix # COMPUTE SOME STATS if np.isnan(self.NA): self.nb_NA = len(m[ np.isnan(m) ]) else: self.nb_NA = len(m[ m==self.NA ]) self.minimum = np.nanmin(m) self.maximum = np.nanmax(m) self.mean = np.nanmean(m) # LOAD LABELS self.labels = {} index = 0 for i in str(self.h5.attrs['labels']).split(','): self.labels[i] = index index += 1 self.h5.close() self.h5=None def save(self, filename=None, meta=None): self.save_hdf5(filename, meta) def save_hdf5(self, filename=None, meta=None): self.h5 = h5py.File(filename, 'w') if meta is not None: for attr in meta: self.h5.attrs[ attr ] = np.string_(meta[ attr ]) else: self.h5.attrs[ 'strain' ] = np.string_(self.strain) self.h5.attrs[ 'datasource' ] = np.string_(self.datasource) self.h5.attrs[ 'date' ] = np.string_(self.date) self.h5.attrs[ 'mode' ] = np.string_(self.mode) self.h5.attrs[ 'comment' ] = np.string_(self.comment) #~ self.h5.attrs[ 'symmetric' ] = np.string_(self.symmetric) self.h5.attrs['representation'] = np.string_(self.representation) labs = sorted( self.labels, key=self.labels.get) self.h5.attrs['labels'] = np.string_(','.join( labs ) ) self.h5.create_dataset('matrix', data=self.matrix) self.h5.close() self.h5=None def print_info(self): if self.h5 is not None: for j in self.h5.attrs: if j!='labels': print '%s : %s' % (j, ''.join(str(self.h5.attrs[j]))) else: print j+': '+self.h5.attrs[j][0:50]+'...' print 'matrix: '+str(self.h5['matrix'][0:5]) print "min: %s, max: %s, mean: %s, NAs: %s" % (self.minimum, self.maximum, self.mean, self.nb_NA) else: print "strain: "+self.strain print "datasource: "+self.datasource print "date: "+self.date print "mode: "+self.mode print "representation: "+self.representation print "comment: "+self.comment print 'matrix: '+str(self.matrix[0:5]) print "min: %s, max: %s, mean: %s, NAs: %s" % (self.minimum, self.maximum, self.mean, self.nb_NA) def get(self, id1, id2): # do we have some data? if self.matrix is None: return None # known ids? if id1 not in self.labels or id2 not in self.labels: return None #~ if self.h5.attrs['representation'] == 'alaR': if self.representation == 'alaR': # alaR=lower triangle column wise # index k is n*(i-1) - i*(i-1)/2 + j-i -1 # i\j 0 1 2 3 4 5 6 7 8 9 # 0 # 1 0 # 2 1 9 # 3 2 10 17 # 4 3 11 18 24 # 5 4 12 19 25 30 # 6 5 13 20 26 31 35 # 7 6 14 21 27 32 36 39 # 8 7 15 22 28 33 37 40 42 # 9 8 16 23 29 34 38 41 43 44 i = self.labels[id1]+1 j = self.labels[id2]+1 if i > j: (i, j) = (j, i) # sort indices, must have i j: (i, j) = (j, i) # sort indices, must have i= self.relationship_order[ relationship ]: self.forward[words[0]] = words[1] self.backward[words[1]] = words[0] line = str(f.readline()).rstrip() if filter_cgdb_prefix: self.remove_cgdb_prefix() def remove_prefix(self, pattern='[A-Z][a-z]{3}[A-Z]\d{2}\.'): import re p = re.compile(pattern) an_id = self.backward.keys()[0] m = p.match( an_id ) w = m.group() for i in self.backward.keys(): if not i.startswith(w): return backward = {} for k in self.forward: v = self.forward[k][8:] backward[ v ] = k self.forward[k] = v self.backward = backward class PrioritizedItem(object): def __init__(self, id=None, score=None, rank=None, rank_ratio=None, zscore=None, comment=None, mapped=None, data={}): self.id = id self.score = score self.rank = rank self.rank_ratio = rank_ratio self.zscore = zscore self.comment = comment self.mapped = mapped self.data = data def display(self): score = self.score zscore = self.zscore if score==Prioritizer.NA: score = 'NA' zscore = 'NA' mapped = '' if self.mapped is not None: mapped = self.mapped comment = '' if self.comment is not None: comment = i.comment print "%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.id, score, self.rank, self.rank_ratio, zscore, mapped, comment) class Prioritizer(object): NA = ScoreMatrix.NA def __init__(self, matrixfile = None): self.matrix = None if matrixfile is not None: self.load_matrix(matrixfile) def load_matrix(self, filename): self.matrix = ScoreMatrix(filename) def set_distance(self, id, a_list): m = self.matrix sum = 0 n = 0 for i in a_list: if id != i: # do not use training gene as candidate score = m.get(id, i) if score is not None and not np.isnan(score): sum += score n += 1 if n==0: return np.NaN return float(sum)/n def prioritize(self, training_set, candidate_set=None, mapping=None): res = [] training = [] # MAPPING OF TRAINING SET if mapping is not None: for i in training_set: if i in mapping.forward: training.append( mapping.forward[i] ) else: training = training_set # CANDIDATES AND THEIR MAPPING candidates = [] if candidate_set is None: # no candidates provided if mapping is None: # no mapping : use matrix ids candidates = self.matrix.labels.keys() else: # no cand but mapping provided : use mapped ids for i in mapping.forward: candidates.append( mapping.forward[i] ) elif mapping is None: # cand with no map: use cand candidates = candidate_set else: # cand and mapping provided use mapped cand for i in candidate_set: if i in mapping.forward: candidates.append(mapping.forward[i]) for i in candidates: score = self.set_distance(i, training) if mapping is None: res.append(PrioritizedItem(i, score)) else: res.append(PrioritizedItem( id=mapping.backward[i], score=score, mapped=i)) return Prioritizer.rank_list(res) def loocvs(self, training_sets, candidate_set=None, mapping=None): res = {} rank_ratios = [] for name in training_sets: genes = training_sets[ name ] # perform prioritization pri = self.prioritize(genes, candidate_set, mapping) # retrieve training genes rank ratios res[ name ] = [] for pi in pri: if pi.id in genes: res[ name ].append(pi) rank_ratios.append( pi.rank_ratio) return { 'sets': res, 'auc': 1.-np.nanmean( rank_ratios ) } @classmethod def to_prioritizedItem_list(a_list): res = [] size = len(a_list) for i in xrange(size): res.append( PrioritizedItem(a_list[i], Prioritizer.NA, i, (i-1)/(size-1), Prioritizer.NA)) return res @classmethod def rank_list(self,a_list): l = len(a_list) # z-scores scores = [] for i in a_list: if i.score!=float("inf") and not np.isnan(i.score): scores.append(i.score) if len(scores) > 0: the_mean = np.nanmean(scores) the_sd = np.nanstd(scores) for i in a_list: if i.score != Prioritizer.NA and len(scores)>0: i.zscore = (i.score - the_mean) / the_sd else: i.zscore = Prioritizer.NA # SORT BY SCORE ASCENDING a_list.sort(key=lambda an_item: float("+inf") if np.isnan(an_item.score) else an_item.score) # COMPUTE RANK # FROM 1..NB_ITEMS e.g. 1, 3, 3, 3, 5 rank_min = 1; rank_max = 1; nb_items = len(a_list) i=0; while i < nb_items: j = i+1 while j < nb_items and a_list[i].score == a_list[j].score: j += 1 # either at end of list or score differs at j rank = (rank_min + j) / 2; # ok, since rank_min starts at 1 and is updated to j+1, and j spans 0..n-1 rank_max = rank for k in xrange(i, j): a_list[k].rank = rank rank_min = j + 1; i = j # COMPUTE RANK RATIO for p in a_list: if rank_max > 0: p.rank_ratio = (float(p.rank) - 1) / (rank_max - 1) # 0..1 else: p.rank_ratio = 1 return a_list @classmethod def display(self,a_list, lines = None): print "id\tscore\trank\trank_ratio\tz-score\tmapped_id\tcomment" to_display = a_list if lines is not None: to_display = a_list[:lines] for i in to_display: score = i.score zscore = i.zscore if score==Prioritizer.NA: score = 'NA' zscore = 'NA' mapped = '' if i.mapped is not None: mapped = i.mapped comment = '' if i.comment is not None: comment = i.comment print "%s\t%s\t%s\t%s\t%s\t%s\t%s" % (i.id, score, i.rank, i.rank_ratio, zscore, mapped, comment) def save(self, filename, a_list, lines = None, header=True, meta=True, sep='\t', template=['id','score','rank','rank_ratio','zscore', 'mapped', 'comment']): f = open(filename, 'w') if meta: f.write('# strain: '+self.matrix.strain+"\n") f.write('# datasource: '+self.matrix.datasource+"\n") f.write('# date: '+datetime.date.today().strftime('%Y-%m-%d')+"\n") f.write('# comment: '+self.matrix.comment+"\n") if header: f.write( sep.join(template)+"\n") to_display = a_list if lines is not None: to_display = a_list[:lines] for i in to_display: if i.score==Prioritizer.NA or np.isnan(i.score): i.score = 'NA' i.zscore = 'NA' if i.mapped is None: i.mapped = '' if i.comment is None: i.comment = '' line = [] attr = dir(i) for a in xrange(len(template)): if template[a] in attr: line.append( str( getattr( i, template[a] )) ) elif template[a] in i.data: line.append( str( i.data[ template[a] ] ) ) else: print template[a]+' not found in PrioritizedItem' f.write( sep.join(line)+"\n") f.close() @classmethod def load_pri(self, filename): res = [] with open(filename) as f: row = f.readline() while row: li = str(row) if not li.startswith('#') and not (li.startswith('id') or li.startswith('candidate') or li.startswith('gene')): # skip comments and header vals = li.rstrip().split('\t') res.append (PrioritizedItem(vals[0], vals[1], vals[2], vals[3], vals[4])) row = f.readline() return res @classmethod def load_pris(self, filename): ids = [] head = None res = [] with open(filename) as f: row = f.readline() while row: li = str(row).rstrip() if li.startswith('candidate') or li.startswith('gene'): # header head = li.split('\t') head.pop(0) # remove candidate / gene from datasource names elif not li.startswith('#') : # skip comments vals = li.rstrip().split('\t') ids.append( vals.pop(0) ) for i in xrange(len(vals)): if vals[i]!='NA': vals[i] = float(vals[i]) else: vals[i] = float("nan") res.append ( vals ) row = f.readline() mat = np.matrix(res) return { 'id': ids, 'matrix': mat, 'head': head} @classmethod def impute(self, mat, method='mean'): rows, cols = mat.shape means = [] for j in xrange(cols): means.append(np.nanmean( mat[:,j] )) for i in xrange(rows): if np.isnan(mat[i,j]): mat[i,j] = means[j] @classmethod def fusion(self, ids, training, matrix, colnames, method='lda'): from sklearn.lda import LDA nbrows, nbcols = matrix.shape m=matrix[:,:] # copy matrix self.impute(m) # impute missing values classes = map(lambda x: 1 if x in training else 2, ids) clf = LDA() clf.fit(m, classes) weights = {} for w in xrange(len(colnames)): weights[ colnames[w] ] = clf.scalings_[w][0] fusion = clf.transform(m) # build result structure res = [] for i in xrange(nbrows): r = { 'id': ids[i], 'fusion': fusion[i][0] } for j in xrange(nbcols): r[ colnames[j] ] = matrix[i,j] res.append(r) res = { 'genes': sorted(res, key=lambda x: x['fusion'], reverse=True), 'weights': weights } return res @classmethod def plot(self, fus, training, colnames, save_as=None): import matplotlib.pyplot as plt plt.figure() plt.rc('xtick', labelsize=6) plt.rc('ytick', labelsize=6) num_bins = 100 fused = fus['genes'] for j in xrange(len(colnames)): plt.subplot(len(colnames), 1, j) vals = [] train = [] train_label = [] cand = [] for i in xrange(len(fused)): if not np.isnan(fused[i][colnames[j]]): if fused[i]['id'] in training: train.append(fused[i][colnames[j]]) train_label.append( fused[i]['id'] ) else: cand.append(fused[i][colnames[j]]) vals.append(fused[i][colnames[j]]) n, bins, patches = plt.hist(vals, num_bins, facecolor='gray') ymax = -float(max(n)) ystep = ymax / (len(train)+2) if j==0: plt.title(colnames[j], size=8) else: plt.title(colnames[j]+' (w: '+str( fus['weights'][colnames[j]])+')', size=8) plt.axis([0, 1, ymax, -ymax] ) for t in xrange(len(train)): plt.plot(train[t], ystep*(t+1), 'g.') plt.text(train[t], ystep*(t+1), train_label[t], size=6) plt.plot(cand, [ystep * (len(train)+1) ]*len(cand), 'r.') if save_as is None: plt.show() else: plt.savefig(save_as, bbox_inches='tight', dpi=150) plt.close() @classmethod def load_training_sets(self, filename): sets = {} with open(filename) as f: row = f.readline() while row: li = str(row).rstrip() vals = li.split('\t') set_name = vals.pop(0) if set_name != "": sets[ set_name] = vals row = f.readline() return sets if __name__ == "__main__": print