In [1]:
import pandas as pd
from IPython.display import Markdown, display, HTML
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn.base import clone
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from yellowbrick.classifier import ClassificationReport, ROCAUC
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# This class provides methods for generating reports of model training, including summary statistics and plots.
# It also provides methods for cross validation and for comparing the performance of supervised learning models provided by scikit-learn.
class ModelTraining:
    results = pd.DataFrame(columns=["Model Name", "Training Accuracy", "Testing Accuracy", "Recall Score", "Precision Score", "F1 Score", "ROC AUC Score"])
# TODO, update numericalRegressors array name
numericalRegressors = ["Model Name", "Fold Number", "Mean Absolute Error", "Root Mean Squared Error", "R Squared"]
    def __init__(self, xtrain, xtest, ytrain, ytest):
self.xtrain = xtrain
self.xtest = xtest
self.ytrain = ytrain
self.ytest = ytest
def getResults(self):
return self.results
# generate an HTML report of textual results as well as visualizations of that data ------------------------------------------------
def computeTrainingStats(self, modelName, model, y_predict):
training_accuracy = model.score(self.xtrain,self.ytrain)
testing_accuracy = model.score(self.xtest, self.ytest)
r_score = recall_score(self.ytest,y_predict)
p_score = precision_score(self.ytest,y_predict)
f_score = f1_score(self.ytest,y_predict)
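        # note: ROC AUC below is computed from hard class predictions; probability scores
        # from model.predict_proba would give a more informative estimate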
r_auc_score = roc_auc_score(self.ytest,y_predict)
result = { "Model Name": modelName, "Training Accuracy": training_accuracy, "Testing Accuracy": testing_accuracy, "Recall Score": r_score, "Precision Score": p_score, "F1 Score": f_score, "ROC AUC Score": r_auc_score}
        self.results = pd.concat([self.results, pd.DataFrame([result])], ignore_index=True)
html = "<table style=\"width: 40%; background-color:#ffffff\">"
html = html + "<tr><th>Model Name</th><td>" + modelName + "</td></tr>"
html = html + "<tr><th>Training Accuracy</th><td>" + "{:.4f}".format(training_accuracy) + "</td></tr>"
html = html + "<tr><th>Testing Accuracy</th><td>" + "{:.4f}".format(testing_accuracy) + "</td></tr>"
html = html + "<tr><th>Recall Score</th><td>" + "{:.4f}".format(r_score) + "</td></tr>"
html = html + "<tr><th>Precision Score</th><td>" + "{:.4f}".format(p_score) + "</td></tr>"
html = html + "<tr><th>F1 Score</th><td>" + "{:.4f}".format(f_score) + "</td></tr>"
html = html + "<tr><th>ROC AUC Score</th><td>" + "{:.4f}".format(r_auc_score) + "</td></tr>"
html = html + "</table>"
display(HTML("<style>.output_result { height:auto !important; max-height:500px; } </style>" + html))
self.drawLogisticScores(model)
self.drawConfusionMatrix(y_predict)
return result
# Visualize model performance with yellowbrick library -----------------------------------------------------------------------------
def drawLogisticScores(self, model):
viz = ClassificationReport(model)
viz.fit(self.xtrain, self.ytrain)
viz.score(self.xtest, self.ytest)
viz.show()
roc = ROCAUC(model)
roc.fit(self.xtrain, self.ytrain)
roc.score(self.xtest, self.ytest)
roc.show()
# Plot a confusion matrix ----------------------------------------------------------------------------------------------------------
    def drawConfusionMatrix(self, y_predict):
        cm = confusion_matrix(self.ytest, y_predict)
        # counts are integers, so annotate with 'd' rather than a float format
        sns.heatmap(cm, annot=True, fmt='d', xticklabels=[0, 1], yticklabels=[0, 1])
plt.ylabel('Observed')
plt.xlabel('Predicted')
plt.show()
# Use a Linear Regression model to estimate the feature importance to prediction ---------------------------------------------------
def estimateNumericalFeatureImportance(self):
        model = LinearRegression()
        model.fit(self.xtrain, self.ytrain)
        # np.ravel flattens coef_ whether ytrain is a Series (1-D coef_) or a single-column DataFrame (2-D coef_)
        coefficients = pd.DataFrame({"Feature": self.xtrain.columns, "Coefficient": np.ravel(model.coef_)})
        return coefficients
# Compare the performance of standard SciKit Learn models for numerical predictions -------------------------------------------------
# TODO, make model hyperparameters configurable instead of fixed
def crossValidationForNumericalTarget(self, xtrain, ytrain, folds):
cvResults = pd.DataFrame(columns = self.numericalRegressors )
# Instantiate the models used for numerical predictions
models = {}
models["Linear Regressor"] = LinearRegression()
models["Bagging Regressor"] = BaggingRegressor(n_estimators=45,random_state=10, base_estimator=models["Linear Regressor"])
models["Adaptive Boosting Regressor"] = AdaBoostRegressor( n_estimators=40,random_state=10, base_estimator=models["Linear Regressor"])
models["Random Forest Regressor"] = RandomForestRegressor(n_estimators = 45, random_state=10)
models["Decision Tree Regressor"] = DecisionTreeRegressor(criterion = 'mse', max_depth=5, random_state=10)
models["Gradient Boosting Regressor"] = GradientBoostingRegressor(n_estimators=40,random_state=10, init=models["Linear Regressor"])
        # for each model, perform cross validation, then store and return the results for comparison
        for modelName, model in models.items():
            result = self.performNumericalCrossValidation(model, xtrain, ytrain, folds, modelName)
            cvResults = pd.concat([cvResults, result], ignore_index=True)
return cvResults
# Perform cross validation for a polynomial regressor for the degree passed in and return the results ------------------------------
def crossValidationForPolynomialModel(self, xtrain, ytrain, folds, degree):
        # filter a copy of the column list: list.remove() mutates in place and returns None
        cvResults = pd.DataFrame(columns=[c for c in self.numericalRegressors if c != "R Squared"])
# generate polynomial features according to the degree in preparation for model fitting using linear regression
poly = PolynomialFeatures(degree=degree, interaction_only=True)
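        # note: interaction_only=True generates only interaction terms (e.g. x1*x2), never pure powers such as x1**2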
xtrainPoly = poly.fit_transform(xtrain)
xtrainPoly = DataFrame(xtrainPoly)
polyRegression = LinearRegression()
# reuse the numerical cross validation method to assess the performance of this model
result = self.performNumericalCrossValidation(polyRegression, xtrainPoly, ytrain, folds, "Polynomial Regressor").drop("R Squared", axis=1)
        cvResults = pd.concat([cvResults, result], ignore_index=True)
return cvResults
    # Perform cross validation for polynomial regressor models of degrees 2 through 9 and return the performance results -----------------
# TODO, return a second result that is the integer value for the most optimal degree based on minimizing RMSE
def findOptimalPolynomialDegree(self, xtrain, ytrain):
        results = pd.DataFrame(columns=["Model Name", "Fold Number", "Mean Absolute Error", "Root Mean Squared Error", "Degree"])
        for i in range(2, 10):
poly = PolynomialFeatures(degree=i, interaction_only=True)
xtrainPoly = poly.fit_transform(xtrain)
xtrainPoly = DataFrame(xtrainPoly)
polyRegression = LinearRegression()
result = self.performNumericalCrossValidation(polyRegression, xtrainPoly, ytrain, 4, "Polynomial Regressor").drop("R Squared", axis=1)
result["Degree"] = result["Model Name"].apply(lambda x: int(i))
results = results.append(result, ignore_index=True)
return results
# Perform cross validation for regression models and collect multiple scoring statistics for each fold ------------------------------
def performNumericalCrossValidation(self, model, xtrain, ytrain, num_folds, modelName):
        results = pd.DataFrame(columns=self.numericalRegressors)
        # shuffle=True is required for random_state to take effect in KFold
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=11)
i = 1
for train_index, test_index in folds.split(xtrain, ytrain):
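            # clone() gives each fold a fresh, unfitted copy of the model with the same hyperparameters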
cmodel = clone(model)
xtrain_fold = xtrain.iloc[train_index]
ytrain_fold = ytrain.iloc[train_index]
xtest_fold = xtrain.iloc[test_index]
ytest_fold = ytrain.iloc[test_index]
cmodel.fit(xtrain_fold, ytrain_fold)
y_predict = cmodel.predict(xtest_fold)
# Mean Absolute Error
mae = mean_absolute_error(ytest_fold, y_predict)
# RMSE
rmse = mean_squared_error(ytest_fold, y_predict)**0.5
            # R squared
r2 = r2_score(ytest_fold, y_predict)
new_row = {"Model Name":modelName, "Fold Number":i, "Mean Absolute Error" :mae, "Root Mean Squared Error":rmse, "R Squared": r2}
            # append this fold's scores to the results dataframe
            results = pd.concat([results, pd.DataFrame([new_row])], ignore_index=True)
i = i + 1
return results
    # TODO -- This method is not yet complete and is not yet in use ----------------------------------------------------------------------
def performCategoricalCrossValidation(self, model, xtrain, ytrain, num_folds):
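        # StratifiedKFold preserves the class proportions of ytrain within each fold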
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=11)
i = 1
for train_index, test_index in folds.split(xtrain, ytrain):
cmodel = clone(model)
xtrain_fold = xtrain.iloc[train_index]
ytrain_fold = ytrain.iloc[train_index]
xtest_fold = xtrain.iloc[test_index]
ytest_fold = ytrain.iloc[test_index]
cmodel.fit(xtrain_fold, ytrain_fold)
y_predict = cmodel.predict(xtest_fold)
accuracy = accuracy_score(ytest_fold, y_predict)
precision = precision_score(ytest_fold, y_predict)
recall = recall_score(ytest_fold, y_predict)
f1 = f1_score(ytest_fold, y_predict)
roc_auc = roc_auc_score(ytest_fold, y_predict)
print("Fold (" + str(i) + ")------------------------------")
print("Accuracy: " + "{:.3f}".format(accuracy) + " Precision: " + "{:.3f}".format(precision)
+ " Recall: " + "{:.3f}".format(recall) + " F1: " + "{:.3f}".format(f1) + " ROC AUC: "
+ "{:.3f}".format(roc_auc))
print("Confusion Matrix:")
print(confusion_matrix(ytest_fold, y_predict))
i = i + 1
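
# A hypothetical usage sketch (not part of the original class): X and y below are
# placeholder names for a feature DataFrame and a binary target, not defined in this notebook.
#
#   from sklearn.model_selection import train_test_split
#   xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=10)
#   trainer = ModelTraining(xtrain, xtest, ytrain, ytest)
#   logit = LogisticRegression(max_iter=1000).fit(xtrain, ytrain)
#   trainer.computeTrainingStats("Logistic Regression", logit, logit.predict(xtest))
#
# For a numerical target, the regressor comparison would instead be driven by:
#
#   cv_scores = trainer.crossValidationForNumericalTarget(xtrain, ytrain, folds=4)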