#!/usr/bin/python
import argparse
import csv
import numpy
import sys
# SCRIPT PARAMETERS
parser = argparse.ArgumentParser(description='Naive Bayesian learner and classifier.')
parser.add_argument('--dataset', required=True, help='File in CSV format containing samples.')
parser.add_argument('--k', required=True, type=int, help='Number of clusters.')
parser.add_argument('--iterations', required=False, default=500, type=int, help='Maximum number of iterations.')
opt = vars(parser.parse_args())
# GLOBAL VARIABLES
targetClass = ''
attributeClass = {}
data = []
medoids = []
k = int(opt['k'])
N = 0 # N: size of training set
max_iter = int(opt['iterations'])
# LOAD DATASET
with open(opt['dataset']) as f:
reader = csv.DictReader(f, delimiter='\t')
# LOAD ATTRIBUTE TYPES (continuous, discrete, ignore)
attributeClass = reader.next()
# DETERMINE TARGET CLASS
classLine = reader.next()
for i in attributeClass:
if classLine[i]=='class':
targetClass = i
del attributeClass[targetClass]
print attributeClass
# LOAD OBJECTS
for row in reader:
N += 1
data.append(row)
# NORMALIZE continuous ATTRIBUTES
for i in attributeClass:
if attributeClass[i]=='continuous':
values = []
for j in range(N):
if data[j][i] is not None and data[j][i]!='' and data[j][i]!='NA':
values.append( float(data[j][i]) )
mean = numpy.mean(values)
sd = numpy.std(values)
for j in range(N):
if data[j][i] is not None and data[j][i]!='' and data[j][i]!='NA':
data[j][i] = (float(data[j][i])-mean)/sd
# DISTANCE MEASURE & ERROR CRITERION
def manhattanDist(val1, val2): return numpy.abs( float(val1) - float(val2))
def matchingCoefficient(val1, val2):
if val1==val2: return 0
else: return 1
def dist(val1, val2, attributeType):
if attributeType=='continuous':
return manhattanDist(val1, val2)
elif attributeType=='discrete':
return matchingCoefficient(val1, val2)
def objectDist(obj1, obj2, attributeInfo):
total=0
n=0
for att in attributeInfo:
if (attributeInfo[att]=='continuous' or attributeInfo[att]=='discrete') and obj1[att] is not None and obj2[att] is not None and obj1[att]!='' and obj2[att]!='' and obj1[att]!='NA' and obj2[att]!='NA':
total += float(dist(obj1[att], obj2[att], attributeInfo[att]))
n+=1
if n==0: return None
return total/n
def error_dist(medoids, data, attributeInfo):
total=0
for obj in data:
# FIND NEAREST MEDOID AND COMPUTE DISTANCE
minDist=numpy.inf
for mi in medoids:
m = data[mi]
d=objectDist(obj,m, attributeInfo)
if d<minDist: minDist=d
total+=minDist
return total
# INITIALIZATION: SELECT RANDOM MEDOIDS
i=N/k
for j in range(k):
medoids.append(j*i)
current_error = error_dist(medoids, data, attributeClass)
best_error = current_error
best_error_medoids = medoids[:]
delta=1
# MAIN LOOP
while max_iter>0 and delta>0:
for mi in range(k):
print "current medoid:", mi
for obj_i in range(N):
if obj_i not in medoids:
# SWAP
# COPY medoids
new_medoids = medoids[:]
new_medoids[mi] = obj_i
# COMPUTE NEW error
new_error = error_dist(new_medoids, data, attributeClass)
if new_error<best_error:
best_error = new_error
best_error_medoids = new_medoids[:]
delta = current_error - best_error
medoids = best_error_medoids[:]
current_error = best_error
print medoids
print current_error
# DECIDE FINAL CLUSTERING
fields=attributeClass.keys()
fields.append(targetClass)
fields.append('cluster')
writer = csv.DictWriter(sys.stdout, fieldnames=fields, delimiter='\t')
writer.writeheader()
for obj in data:
# FIND NEAREST MEDOID AND ASSIGN CLUSTER
minDist=numpy.inf
nearestMedoid=''
for mi in medoids:
m = data[mi]
d=objectDist(obj,m, attributeClass)
if d<minDist:
minDist=d
nearestMedoid=mi
obj['cluster'] = nearestMedoid
writer.writerow(obj)
# COMPUTE CROSS TABLE
ct={}
for obj in data:
c1=obj[targetClass]
c2=obj['cluster']
if c1 not in ct: ct[c1] = {}
if c2 not in ct[c1]:
ct[c1][c2] = 1
else:
ct[c1][c2] += 1
print 'class/cluster','\t',
for i in medoids:
print i,'\t',
print
for i in ct:
print i,'\t\t',
for j in medoids:
print ct[i][j],'\t',
print