#!/usr/bin/env python3
import argparse
import csv
import numpy
import scipy.stats
import sys
from pprint import pprint

# SCRIPT PARAMETERS
parser = argparse.ArgumentParser(description='Naive Bayesian learner and classifier.')
parser.add_argument('-t', '--training', required=True,
                    help='CSV file with a header row, containing the training examples.')
parser.add_argument('-u', '--test', required=False,
                    help='CSV file with a header row, containing labelled objects to be classified for performance evaluation.')
parser.add_argument('-s', '--sample', required=False,
                    help='CSV file with a header row, containing new objects to be classified. NOT YET IMPLEMENTED')
parser.add_argument('-m', '--model', nargs='?', const=True,
                    help='Only display the probability table.')
parser.add_argument('-n', '--gauss', nargs='?', const=True,
                    help='Data is numeric; Gaussian densities are used to compute the likelihoods.')
parser.add_argument('-d', '--delimiter', required=False, default=',',
                    help='Field delimiter in the CSV files.')
opt = parser.parse_args()

# GLOBAL VARIABLES
model = {}          # conditional counts (or Gaussian parameters) per class and attribute
class_n = {}        # number of training examples per class
training_size = 0   # N: size of the training set

# LOAD TRAINING SET AND BUILD MODEL
# model[class][attribute_name][attribute_value] = occurrences
# e.g. model[EDIBLE][cap-shape][BELL] = 404
with open(opt.training) as csvfile:
    csvreader = csv.reader(csvfile, delimiter=opt.delimiter)
    attributes = next(csvreader)
    targetClass = attributes.pop(0)  # the class label is the first column
    for row in csvreader:
        training_size += 1
        sampleClass = row.pop(0)
        if sampleClass not in model:
            model[sampleClass] = {}  # class not seen so far
            class_n[sampleClass] = 1
        else:
            class_n[sampleClass] += 1
        for i in range(len(row)):
            value = row[i]
            attribute = attributes[i]
            if opt.gauss:
                if attribute not in model[sampleClass]:
                    model[sampleClass][attribute] = []  # attribute not seen so far for this class
                model[sampleClass][attribute].append(float(value))
            else:
                if value is not None and value != '' and value != 'NA':  # skip missing values
                    if attribute not in model[sampleClass]:
                        model[sampleClass][attribute] = {}  # attribute not seen so far for this class
                    if value not in model[sampleClass][attribute]:
                        model[sampleClass][attribute][value] = 1  # level not seen so far for this class and attribute
                    else:
                        model[sampleClass][attribute][value] += 1  # increment

# COMPUTE MEAN AND SD IF THE gauss PARAMETER WAS PASSED
if opt.gauss:
    for c in model:
        for a in attributes:
            mean = numpy.mean(model[c][a])
            sd = numpy.std(model[c][a])
            model[c][a] = {'mean': mean, 'sd': sd}


def printProbabilityTableDiscrete():
    for c in model:
        print('P[class=%s] = %i' % (c, class_n[c]))
    print()
    for c in model:
        for f in model[c]:
            for v in model[c][f]:
                print('P[%s=%s|class=%s] = %i' % (f, v, c, model[c][f][v]))
        print()


if opt.model:
    if opt.gauss:
        pprint(model)
    else:
        printProbabilityTableDiscrete()
    sys.exit()


# EVERYTHING IS READY FOR CLASSIFICATION
def classify(sample):
    bestProb = 0
    bestClass = ''
    for c in class_n:
        # start from the class prior p(class=c)
        p = float(class_n[c]) / training_size
        for i in range(len(attributes)):
            value = sample[i]
            attribute = attributes[i]
            if opt.gauss:
                # likelihood from the fitted normal distribution
                pf = scipy.stats.norm.pdf(float(value),
                                          loc=model[c][attribute]['mean'],
                                          scale=model[c][attribute]['sd'])
                p *= pf
            elif value is not None and value != '' and value != 'NA':
                # p(X=value | class=c); missing values are skipped, as in training
                if value in model[c][attribute]:
                    pf = float(model[c][attribute][value]) / class_n[c]
                else:
                    pf = 0
                p *= pf  # unnormalised posterior (not divided by p(X))
        if p > bestProb:
            bestProb = p
            bestClass = c
    return bestClass


# PERFORMANCE EVALUATION
if opt.test:
    with open(opt.test) as csvfile:
        csvreader = csv.reader(csvfile, delimiter=opt.delimiter)
        test_head = next(csvreader)  # skip the header row
        tests = 0
        errors = 0
        if len(model) == 2:  # two-class problem: also build a confusion matrix
            TP = 0; FP = 0; PP = 0; P = 0; TN = 0; FN = 0; PN = 0; N = 0
            neg = list(model.keys())[0]  # first class seen is treated as negative, e.g. POISONOUS
            pos = list(model.keys())[1]  # second class seen is treated as positive, e.g. EDIBLE
            # CLASSIFY SAMPLES (TWO-CLASS CASE)
            for row in csvreader:
                test_y = row.pop(0)
                predicted = classify(row)
                if predicted != test_y:
                    errors += 1
                tests += 1
                if predicted == pos:
                    PP += 1  # predicted positive
                    if test_y == pos:
                        TP += 1
                        P += 1
                    else:  # test_y is the negative class
                        FP += 1
                        N += 1
                else:
                    PN += 1  # predicted negative
                    if test_y == neg:
                        TN += 1
                        N += 1
                    else:  # test_y is the positive class
                        FN += 1
                        P += 1
        else:
            # CLASSIFY SAMPLES (MORE THAN TWO CLASSES: error rate only)
            for row in csvreader:
                test_y = row.pop(0)
                tests += 1
                predicted = classify(row)
                if predicted != test_y:
                    errors += 1
        print()
        print('Tests:', tests, ', Errors:', errors, ', Error rate:', round(float(errors) / tests * 100, 2), '%')
        print()
        if len(model) == 2:
            print("prediction/reality | P %5d | N %5d " % (P, N))
            print("P %5d | TP %5d | FP %5d" % (PP, TP, FP))
            print("N %5d | FN %5d | TN %5d" % (PN, FN, TN))
            print()
            print("Sensitivity TP/P = ", float(TP) / P)
            print("Specificity TN/N = ", float(TN) / N)
            print("Precision TP/PP = ", float(TP) / PP)
            print("Accuracy (TP+TN) / (P+N) = ", float(TP + TN) / (P + N))

# CLASSIFICATION OF NEW SAMPLES
if opt.sample:
    print("Not yet")
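
# Example invocations (script and file names below are illustrative, not part of the
# repository; any CSV whose first column is the class label should work):
#   ./naive_bayes.py -t mushrooms_train.csv -m                      # print the count table only
#   ./naive_bayes.py -t mushrooms_train.csv -u mushrooms_test.csv   # evaluate on a labelled test set
#   ./naive_bayes.py -t iris_train.csv -u iris_test.csv -n          # numeric data, Gaussian likelihoods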