KMedoids.py
From silico.biotoul.fr
(Difference between revisions)
m |
m |
||
(One intermediate revision not shown) | |||
Line 8: | Line 8: | ||
# SCRIPT PARAMETERS | # SCRIPT PARAMETERS | ||
- | parser = argparse.ArgumentParser(description=' | + | parser = argparse.ArgumentParser(description='k-Medoids clustering implementation.') |
parser.add_argument('--dataset', required=True, help='File in CSV format containing samples.') | parser.add_argument('--dataset', required=True, help='File in CSV format containing samples.') | ||
parser.add_argument('--k', required=True, type=int, help='Number of clusters.') | parser.add_argument('--k', required=True, type=int, help='Number of clusters.') | ||
Line 34: | Line 34: | ||
targetClass = i | targetClass = i | ||
del attributeClass[targetClass] | del attributeClass[targetClass] | ||
- | |||
# LOAD OBJECTS | # LOAD OBJECTS | ||
for row in reader: | for row in reader: |
Current revision as of 11:53, 6 December 2011
#!/usr/bin/python import argparse import csv import numpy import sys # SCRIPT PARAMETERS parser = argparse.ArgumentParser(description='k-Medoids clustering implementation.') parser.add_argument('--dataset', required=True, help='File in CSV format containing samples.') parser.add_argument('--k', required=True, type=int, help='Number of clusters.') parser.add_argument('--iterations', required=False, default=500, type=int, help='Maximum number of iterations.') opt = vars(parser.parse_args()) # GLOBAL VARIABLES targetClass = '' attributeClass = {} data = [] medoids = [] k = opt['k'] N = 0 # N: size of training set max_iter = int(opt['iterations']) # LOAD DATASET with open(opt['dataset']) as f: reader = csv.DictReader(f, delimiter='\t') # LOAD ATTRIBUTE TYPES (continuous, discrete, ignore) attributeClass = reader.next() # DETERMINE TARGET CLASS classLine = reader.next() for i in attributeClass: if classLine[i]=='class': targetClass = i del attributeClass[targetClass] # LOAD OBJECTS for row in reader: N += 1 data.append(row) # NORMALIZE continuous ATTRIBUTES for i in attributeClass: if attributeClass[i]=='continuous': values = [] for j in range(N): if data[j][i] is not None and data[j][i]!='' and data[j][i]!='NA': values.append( float(data[j][i]) ) mean = numpy.mean(values) sd = numpy.std(values) for j in range(N): if data[j][i] is not None and data[j][i]!='' and data[j][i]!='NA': data[j][i] = (float(data[j][i])-mean)/sd # DISTANCE MEASURE & ERROR CRITERION def manhattanDist(val1, val2): return numpy.abs( float(val1) - float(val2)) def matchingCoefficient(val1, val2): if val1==val2: return 0 else: return 1 def dist(val1, val2, attributeType): if attributeType=='continuous': return manhattanDist(val1, val2) elif attributeType=='discrete': return matchingCoefficient(val1, val2) def objectDist(obj1, obj2, attributeInfo): total=0 n=0 for att in attributeInfo: if (attributeInfo[att]=='continuous' or attributeInfo[att]=='discrete') and obj1[att] is not None and obj2[att] is not None and obj1[att]!='' and obj2[att]!='' and obj1[att]!='NA' and obj2[att]!='NA': total += float(dist(obj1[att], obj2[att], attributeInfo[att])) n+=1 if n==0: return None return total/n def error_dist(medoids, data, attributeInfo): total=0 for obj in data: # FIND NEAREST MEDOID AND COMPUTE DISTANCE minDist=numpy.inf for mi in medoids: m = data[mi] d=objectDist(obj,m, attributeInfo) if d<minDist: minDist=d total+=minDist return total # INITIALIZATION: SELECT RANDOM MEDOIDS i=N/k for j in range(k): medoids.append(j*i) current_error = error_dist(medoids, data, attributeClass) best_error = current_error best_error_medoids = medoids[:] delta=1 # MAIN LOOP while max_iter>0 and delta>0: for mi in range(k): print "current medoid:", mi for obj_i in range(N): if obj_i not in medoids: # SWAP # COPY medoids new_medoids = medoids[:] new_medoids[mi] = obj_i # COMPUTE NEW error new_error = error_dist(new_medoids, data, attributeClass) if new_error<best_error: best_error = new_error best_error_medoids = new_medoids[:] delta = current_error - best_error medoids = best_error_medoids[:] current_error = best_error print medoids print current_error # DECIDE FINAL CLUSTERING fields=attributeClass.keys() fields.append(targetClass) fields.append('cluster') writer = csv.DictWriter(sys.stdout, fieldnames=fields, delimiter='\t') writer.writeheader() for obj in data: # FIND NEAREST MEDOID AND ASSIGN CLUSTER minDist=numpy.inf nearestMedoid='' for mi in medoids: m = data[mi] d=objectDist(obj,m, attributeClass) if d<minDist: minDist=d nearestMedoid=mi obj['cluster'] = nearestMedoid writer.writerow(obj) # COMPUTE CROSS TABLE ct={} for obj in data: c1=obj[targetClass] c2=obj['cluster'] if c1 not in ct: ct[c1] = {} if c2 not in ct[c1]: ct[c1][c2] = 1 else: ct[c1][c2] += 1 print 'class/cluster','\t', for i in medoids: print i,'\t', print for i in ct: print i,'\t\t', for j in medoids: print ct[i][j],'\t', print