How to use the ROC curve to test the performance of a discrete classifier in Python?

R.O.C analysis (receiver operating characteristic) to test the performance of a discrete classifier in python

Introduction

Question: for a given $x$, does it belong to population A or not? Let's consider a simple classifier defined by a threshold (for example $x_s = 10$): if $x \geq x_s$ then $x \in A$, and if $x < x_s$ then $x \notin A$.

import matplotlib.pyplot as plt
import scipy.stats
import numpy as np

# Plot the probability density of the two populations together with the
# classifier threshold, and save the figure to disk.

x_min = 0.0
x_max = 30.0

# Both curves are sampled on the same grid, so build it only once.
x = np.linspace(x_min, x_max, 100)

#----------------------------------------------------------------------------------------#
# Population B

mean = 9.0
std = 2.0

y = scipy.stats.norm.pdf(x,mean,std)

plt.plot(x,y, color='black')

# alpha has to be a number: recent matplotlib releases raise a TypeError
# when it is given as the string '1.0'.
plt.fill_between(x, y, color='#89bedc', alpha=1.0)

#----------------------------------------------------------------------------------------#
# Population A

mean = 15.0
std = 4.0

y = scipy.stats.norm.pdf(x,mean,std)

plt.plot(x,y, color='black')

plt.fill_between(x, y, color='#0b559f', alpha=1.0)

#----------------------------------------------------------------------------------------#

import matplotlib.patches as mpatches

# fill_between() areas get no automatic legend entry here, so build the
# legend from colour patches matching the two filled regions.
pop_a = mpatches.Patch(color='#0b559f', label='Population A')
pop_b = mpatches.Patch(color='#89bedc', label='Population B')

plt.legend(handles=[pop_a,pop_b])

# Classifier threshold x_s = 10.
plt.axvline(x=10,color='red')

plt.grid()

plt.xlim(x_min,x_max)
plt.ylim(0,0.25)

plt.title('How to use ROC curve to test a dicrete classifier ?',fontsize=10)

plt.xlabel('x')
plt.ylabel('Probability Density Function')

# Save before show(): the figure may be gone once the window is closed.
plt.savefig("roc_curve_discrete_classifier_02.png")
plt.show()


Calculate TP, TN, FP, FN

• $x \in A$ and $x \geq x_s$ (TP true positive)
• $x \in A$ and $x < x_s$ (FN false negative)
• $x \notin A$ and $x \geq x_s$ (FP false positive)
• $x \notin A$ and $x < x_s$ (TN true negative)

from scipy.integrate import quad

import matplotlib.pyplot as plt
import scipy.stats
import numpy as np

# Range of x covered by the plot and by the numerical integrations.
x_min, x_max = 0.0, 30.0

# Decision threshold x_s of the classifier.
x_threshold = 10.0

def normal_distribution_function(x,mean,std):
    """Return the normal probability density evaluated at ``x``.

    Thin wrapper around ``scipy.stats.norm.pdf`` whose positional signature
    ``(x, mean, std)`` lets it be handed to ``scipy.integrate.quad`` with
    ``args=(mean, std)``.

    x    -- point (or array of points) at which to evaluate the density
    mean -- mean of the distribution
    std  -- standard deviation of the distribution
    """
    return scipy.stats.norm.pdf(x, loc=mean, scale=std)

# Integrate the density of population A on each side of the threshold to get
# the false-negative and true-positive fractions, and shade both areas.

# Both density curves are drawn on the same sampling grid.
x = np.linspace(x_min, x_max, 100)

#----------------------------------------------------------------------------------------#
# Population B

mean = 9.0
std = 2.0

y = scipy.stats.norm.pdf(x,mean,std)

plt.plot(x,y, color='gray')

#----------------------------------------------------------------------------------------#
# Population A

mean = 15.0
std = 4.0

y = scipy.stats.norm.pdf(x,mean,std)

plt.plot(x,y, color='gray')

# Members of A below the threshold are rejected by the classifier: false negatives.
ptx = np.linspace(x_min, x_threshold, 100)
pty = scipy.stats.norm.pdf(ptx,mean,std)

# alpha has to be a number: recent matplotlib releases raise a TypeError
# when it is given as the string '1.0'.
plt.fill_between(ptx, pty, color='#e1b1b4', alpha=1.0)

# quad() returns (integral, error estimate); only the integral is used.
# The integration is limited to [x_min, x_max], not the whole real line.
fn_res, _ = quad(normal_distribution_function, x_min, x_threshold, args=(mean,std))

print('False Negative (FN)',fn_res)

# Members of A above the threshold are accepted by the classifier: true positives.
ptx = np.linspace(x_threshold, x_max, 100)
pty = scipy.stats.norm.pdf(ptx,mean,std)
plt.fill_between(ptx, pty, color='#b77495', alpha=1.0)

tp_res, _ = quad(normal_distribution_function, x_threshold, x_max, args=(mean,std))

print('True Positive (TP)',tp_res)

#----------------------------------------------------------------------------------------#

import matplotlib.patches as mpatches

# Legend entries for the two shaded areas, annotated with the integrated values.
fn_patch = mpatches.Patch(color='#e1b1b4', label='False Negative (FN): ' + str(round(fn_res,2)))
tp_patch = mpatches.Patch(color='#b77495', label='True Positive (TP): ' + str(round(tp_res,2)))

plt.legend(handles=[fn_patch,tp_patch])

plt.axvline(x=x_threshold,color='red')

plt.grid()

plt.xlim(x_min,x_max)
plt.ylim(0,0.25)

plt.title('How to use ROC curve to test a dicrete classifier ?',fontsize=10)

plt.xlabel('x')
plt.ylabel('Probability Density Function')

# Save before show(): the figure may be gone once the window is closed.
plt.savefig("roc_curve_discrete_classifier_03.png")
plt.show()


Plot the confusion matrix

#!/usr/bin/env python

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

import seaborn as sns
import math

from mpl_toolkits.axes_grid1 import make_axes_locatable

import matplotlib as mpl

# Draw the 2x2 confusion matrix of the classifier as an annotated heatmap.

# matplotlib 3.6 renamed its bundled seaborn style sheets with a
# 'seaborn-v0_8' prefix and later dropped the old names, so pick whichever
# name this installation provides.
style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in mpl.style.available else 'seaborn'
mpl.style.use(style_name)

# Rows: decision of the classifier; columns: true population.
#   [[TP, FP],
#    [FN, TN]]
conf_arr = np.array([[0.89,0.31],[0.11,0.69]])

# The row labels live in the DataFrame index so that seaborn attaches each
# label to its own row. Hard-coding tick positions instead depends on the
# direction in which the installed seaborn version draws the y axis.
df_cm = pd.DataFrame(conf_arr,
  index = ['Classifier A', 'Classifier B'],
  columns = ['A', 'B'])

fig = plt.figure()

plt.clf()

ax = fig.add_subplot(111)
ax.set_aspect(1)

cmap = sns.cubehelix_palette(light=1, as_cmap=True)

# vmin/vmax pin the colour scale to the full [0, 1] range of a rate.
sn.heatmap(df_cm, annot=True, vmin=0.0, vmax=1.0, fmt='.2f', cmap=cmap, ax=ax)

# Centre each row label on its row.
plt.setp(ax.get_yticklabels(), va='center')

# Put the true-population labels above the matrix.
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

plt.savefig('roc_curve_discrete_classifier_05.png', dpi=100, bbox_inches='tight' )

plt.close()


Plot the R.O.C curve and calculate the AUC (Area Under the Curve)

from scipy.integrate import quad

try:
    from scipy.integrate import simps
except ImportError:
    # `simps` was removed in SciPy 1.14; `simpson` is its replacement.
    from scipy.integrate import simpson as simps

import matplotlib.pyplot as plt
import numpy as np

def slope(x1, y1, x2, y2):
    """Return the slope of the straight line through (x1, y1) and (x2, y2)."""
    rise = y2 - y1
    run = x2 - x1
    return rise / run

# Draw the ROC curve of a discrete classifier, i.e. the two straight segments
# (0,0) -> (fp,tp) -> (1,1), and compute the area under it (AUC) twice:
# numerically and with a closed-form expression.

# False-positive and true-positive rates of the classifier under test.
fp = 0.31
tp = 0.89

# First segment: from the origin to the operating point (fp, tp).
a = slope(0.0, 0.0, fp, tp)
b = 0.0

ptx = np.linspace(0, fp, 100)
pty = a * ptx + b

# alpha has to be a number: recent matplotlib releases raise a TypeError
# when it is given as the string '1.0'.
plt.fill_between(ptx, pty, color='#89bedc', alpha=1.0)

# The sample positions are passed by keyword because newer SciPy releases
# no longer accept them as a positional argument.
area_1 = simps(pty, x=ptx)

# Second segment: from the operating point (fp, tp) to (1, 1).
a = slope(fp, tp, 1.0, 1.0)
b = tp - a * fp

ptx = np.linspace(fp, 1.0, 100)
pty = a * ptx + b

plt.fill_between(ptx, pty, color='#89bedc', alpha=1.0)

area_2 = simps(pty, x=ptx)

# AUC from numerical integration of the two segments.
auc_area = area_1 + area_2
print(auc_area)

# Same AUC in closed form: the rectangle right of and below the operating
# point, plus the triangle under the first segment, plus the triangle
# between the second segment and the top of that rectangle.
auc_area = (1.0 - fp) * tp + 0.5 * fp * tp + 0.5 * (1.0 - fp) * (1.0 -tp)
print(auc_area)

plt.text(0.6, 0.25, 'AUC: '+str(round(auc_area,2)),color='white',fontsize=14)

plt.scatter(fp,tp)

plt.plot([0,fp,1],[0,tp,1])
# Diagonal from (0,0) to (1,1), drawn as a dashed reference line.
plt.plot([0.0,1.0],[0.0,1.0],'k--')
plt.xlim(0,1)
plt.ylim(0,1)

plt.xlabel('False Positive (FP)',fontsize=8)
plt.ylabel('True Positive (TP)',fontsize=8)

plt.title('Receiver operating characteristics (R.O.C) Curve',fontsize=10)

plt.savefig("roc_curve_discrete_classifier_07.png")


Find the threshold that maximizes the AUC

from scipy.integrate import quad

import matplotlib.pyplot as plt
import scipy.stats
import numpy as np

# Range of x scanned for the threshold and used as integration bounds.
x_min, x_max = 0.0, 30.0

def normal_distribution_function(x,mean,std):
    """Normal probability density at ``x`` for the given mean and standard deviation.

    Delegates to ``scipy.stats.norm.pdf``; the positional ``(x, mean, std)``
    signature is what ``scipy.integrate.quad`` calls when given
    ``args=(mean, std)``.
    """
    gaussian = scipy.stats.norm
    return gaussian.pdf(x, loc=mean, scale=std)

# Scan candidate thresholds and keep the one whose single-point ROC curve
# has the largest area under the curve (AUC).

#----------------------------------------------------------------------------------------#
# Population A

mean_a = 15.0
std_a = 4.0

x_a = np.linspace(x_min, x_max, 100)

y_a = scipy.stats.norm.pdf(x_a,mean_a,std_a)

#----------------------------------------------------------------------------------------#
# Population B

mean_b = 9.0
std_b = 2.0

x_b = np.linspace(x_min, x_max, 100)

y_b = scipy.stats.norm.pdf(x_b,mean_b,std_b)

#----------------------------------------------------------------------------------------#

auc_max = 0.0
x_s_opt = 0.0

# Candidate thresholds from x_min up to (but excluding) x_max, in steps of 0.1.
for x_s in np.arange(x_min,x_max,0.1):

    # quad() returns (integral, error estimate); only the integral is used.
    # True positives: part of population A lying above the threshold.
    tp, _ = quad(normal_distribution_function, x_s, x_max, args=(mean_a,std_a))

    # False positives: part of population B lying above the threshold.
    fp, _ = quad(normal_distribution_function, x_s, x_max, args=(mean_b,std_b))

    # Area under the two-segment curve (0,0) -> (fp,tp) -> (1,1):
    # one rectangle plus two triangles.
    auc_area = (1.0 - fp) * tp + 0.5 * fp * tp + 0.5 * (1.0 - fp) * (1.0 -tp)

    # Strict '>' keeps the first (lowest) threshold when several tie.
    if auc_area > auc_max:
        x_s_opt = x_s
        auc_max = auc_area

print('Best xs found: ', x_s_opt)
print('Best AUC found: ', auc_max)


The script above prints:

Best xs found:  11.8
Best AUC found:  0.853649762448816


Image

of