In [1]:
import pandas as pd
from IPython.display import Markdown, display, HTML
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn.base import clone
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from yellowbrick.classifier import ClassificationReport, ROCAUC
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# This class provides methods for generating reports of model training, including summary statistics and plots.
# It also provides methods for cross validation and for comparing the performance of supervised learning models provided by scikit-learn.
class ModelTraining:
    results = pd.DataFrame(columns=["Model Name", "Training Accuracy", "Testing Accuracy", "Recall Score", "Precision Score", "F1 Score", "ROC AUC Score"])
# TODO, update numericalRegressors array name
numericalRegressors = ["Model Name", "Fold Number", "Mean Absolute Error", "Root Mean Squared Error", "R Squared"]
    def __init__(self, xtrain, xtest, ytrain, ytest):
self.xtrain = xtrain
self.xtest = xtest
self.ytrain = ytrain
self.ytest = ytest
def getResults(self):
return self.results
# generate an HTML report of textual results as well as visualizations of that data ------------------------------------------------
def computeTrainingStats(self, modelName, model, y_predict):
training_accuracy = model.score(self.xtrain,self.ytrain)
testing_accuracy = model.score(self.xtest, self.ytest)
r_score = recall_score(self.ytest,y_predict)
p_score = precision_score(self.ytest,y_predict)
f_score = f1_score(self.ytest,y_predict)
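        # note: ROC AUC below is computed from hard class predictions; probability scores
        # from model.predict_proba would give a more informative estimate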
r_auc_score = roc_auc_score(self.ytest,y_predict)
result = { "Model Name": modelName, "Training Accuracy": training_accuracy, "Testing Accuracy": testing_accuracy, "Recall Score": r_score, "Precision Score": p_score, "F1 Score": f_score, "ROC AUC Score": r_auc_score}
        self.results = pd.concat([self.results, pd.DataFrame([result])], ignore_index=True)
html = "<table style=\"width: 40%; background-color:#ffffff\">"
html = html + "<tr><th>Model Name</th><td>" + modelName + "</td></tr>"
html = html + "<tr><th>Training Accuracy</th><td>" + "{:.4f}".format(training_accuracy) + "</td></tr>"
html = html + "<tr><th>Testing Accuracy</th><td>" + "{:.4f}".format(testing_accuracy) + "</td></tr>"
html = html + "<tr><th>Recall Score</th><td>" + "{:.4f}".format(r_score) + "</td></tr>"
html = html + "<tr><th>Precision Score</th><td>" + "{:.4f}".format(p_score) + "</td></tr>"
html = html + "<tr><th>F1 Score</th><td>" + "{:.4f}".format(f_score) + "</td></tr>"
html = html + "<tr><th>ROC AUC Score</th><td>" + "{:.4f}".format(r_auc_score) + "</td></tr>"
html = html + "</table>"
display(HTML("<style>.output_result { height:auto !important; max-height:500px; } </style>" + html))
self.drawLogisticScores(model)
self.drawConfusionMatrix(y_predict)
return result
# Visualize model performance with yellowbrick library -----------------------------------------------------------------------------
def drawLogisticScores(self, model):
viz = ClassificationReport(model)
viz.fit(self.xtrain, self.ytrain)
viz.score(self.xtest, self.ytest)
viz.show()
roc = ROCAUC(model)
roc.fit(self.xtrain, self.ytrain)
roc.score(self.xtest, self.ytest)
roc.show()
# Plot a confusion matrix ----------------------------------------------------------------------------------------------------------
    def drawConfusionMatrix(self, y_predict):
        cm = confusion_matrix(self.ytest, y_predict)
        # counts are integers, so annotate with 'd' rather than a float format
        sns.heatmap(cm, annot=True, fmt='d', xticklabels=[0, 1], yticklabels=[0, 1])
plt.ylabel('Observed')
plt.xlabel('Predicted')
plt.show()
# Use a Linear Regression model to estimate the feature importance to prediction ---------------------------------------------------
def estimateNumericalFeatureImportance(self):
        model = LinearRegression()
        model.fit(self.xtrain, self.ytrain)
        # np.ravel flattens coef_ whether ytrain is a Series (1-D coef_) or a single-column DataFrame (2-D coef_)
        coefficients = pd.DataFrame({"Feature": self.xtrain.columns, "Coefficient": np.ravel(model.coef_)})
        return coefficients
# Compare the performance of standard SciKit Learn models for numerical predictions -------------------------------------------------
# TODO, make model hyperparameters configurable instead of fixed
def crossValidationForNumericalTarget(self, xtrain, ytrain, folds):
cvResults = pd.DataFrame(columns = self.numericalRegressors )
# Instantiate the models used for numerical predictions
models = {}
models["Linear Regressor"] = LinearRegression()
models["Bagging Regressor"] = BaggingRegressor(n_estimators=45,random_state=10, base_estimator=models["Linear Regressor"])
models["Adaptive Boosting Regressor"] = AdaBoostRegressor( n_estimators=40,random_state=10, base_estimator=models["Linear Regressor"])
models["Random Forest Regressor"] = RandomForestRegressor(n_estimators = 45, random_state=10)
models["Decision Tree Regressor"] = DecisionTreeRegressor(criterion = 'mse', max_depth=5, random_state=10)
models["Gradient Boosting Regressor"] = GradientBoostingRegressor(n_estimators=40,random_state=10, init=models["Linear Regressor"])
        # for each model, perform cross validation, then store and return the results for comparison
        for modelName, model in models.items():
            result = self.performNumericalCrossValidation(model, xtrain, ytrain, folds, modelName)
            cvResults = pd.concat([cvResults, result], ignore_index=True)
return cvResults
# Perform cross validation for a polynomial regressor for the degree passed in and return the results ------------------------------
def crossValidationForPolynomialModel(self, xtrain, ytrain, folds, degree):
        # filter a copy of the column list: list.remove() mutates in place and returns None
        cvResults = pd.DataFrame(columns=[c for c in self.numericalRegressors if c != "R Squared"])
# generate polynomial features according to the degree in preparation for model fitting using linear regression
poly = PolynomialFeatures(degree=degree, interaction_only=True)
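        # note: interaction_only=True generates only interaction terms (e.g. x1*x2), never pure powers such as x1**2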
xtrainPoly = poly.fit_transform(xtrain)
xtrainPoly = DataFrame(xtrainPoly)
polyRegression = LinearRegression()
# reuse the numerical cross validation method to assess the performance of this model
result = self.performNumericalCrossValidation(polyRegression, xtrainPoly, ytrain, folds, "Polynomial Regressor").drop("R Squared", axis=1)
        cvResults = pd.concat([cvResults, result], ignore_index=True)
return cvResults
    # Perform cross validation for polynomial regressor models of degrees 2 through 9 and return the performance results -----------------
# TODO, return a second result that is the integer value for the most optimal degree based on minimizing RMSE
def findOptimalPolynomialDegree(self, xtrain, ytrain):
        results = pd.DataFrame(columns=["Model Name", "Fold Number", "Mean Absolute Error", "Root Mean Squared Error", "Degree"])
        for i in range(2, 10):
poly = PolynomialFeatures(degree=i, interaction_only=True)
xtrainPoly = poly.fit_transform(xtrain)
xtrainPoly = DataFrame(xtrainPoly)
polyRegression = LinearRegression()
result = self.performNumericalCrossValidation(polyRegression, xtrainPoly, ytrain, 4, "Polynomial Regressor").drop("R Squared", axis=1)
result["Degree"] = result["Model Name"].apply(lambda x: int(i))
results = results.append(result, ignore_index=True)
return results
# Perform cross validation for regression models and collect multiple scoring statistics for each fold ------------------------------
def performNumericalCrossValidation(self, model, xtrain, ytrain, num_folds, modelName):
        results = pd.DataFrame(columns=self.numericalRegressors)
        # shuffle=True is required for random_state to take effect in KFold
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=11)
i = 1
for train_index, test_index in folds.split(xtrain, ytrain):
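            # clone() gives each fold a fresh, unfitted copy of the model with the same hyperparameters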
cmodel = clone(model)
xtrain_fold = xtrain.iloc[train_index]
ytrain_fold = ytrain.iloc[train_index]
xtest_fold = xtrain.iloc[test_index]
ytest_fold = ytrain.iloc[test_index]
cmodel.fit(xtrain_fold, ytrain_fold)
y_predict = cmodel.predict(xtest_fold)
# Mean Absolute Error
mae = mean_absolute_error(ytest_fold, y_predict)
# RMSE
rmse = mean_squared_error(ytest_fold, y_predict)**0.5
            # R squared
r2 = r2_score(ytest_fold, y_predict)
new_row = {"Model Name":modelName, "Fold Number":i, "Mean Absolute Error" :mae, "Root Mean Squared Error":rmse, "R Squared": r2}
            # append this fold's scores to the results dataframe
            results = pd.concat([results, pd.DataFrame([new_row])], ignore_index=True)
i = i + 1
return results
    # TODO -- This method is not yet complete and is not yet in use ----------------------------------------------------------------------
def performCategoricalCrossValidation(self, model, xtrain, ytrain, num_folds):
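        # StratifiedKFold preserves the class proportions of ytrain within each fold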
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=11)
i = 1
for train_index, test_index in folds.split(xtrain, ytrain):
cmodel = clone(model)
xtrain_fold = xtrain.iloc[train_index]
ytrain_fold = ytrain.iloc[train_index]
xtest_fold = xtrain.iloc[test_index]
ytest_fold = ytrain.iloc[test_index]
cmodel.fit(xtrain_fold, ytrain_fold)
y_predict = cmodel.predict(xtest_fold)
accuracy = accuracy_score(ytest_fold, y_predict)
precision = precision_score(ytest_fold, y_predict)
recall = recall_score(ytest_fold, y_predict)
f1 = f1_score(ytest_fold, y_predict)
roc_auc = roc_auc_score(ytest_fold, y_predict)
print("Fold (" + str(i) + ")------------------------------")
print("Accuracy: " + "{:.3f}".format(accuracy) + " Precision: " + "{:.3f}".format(precision)
+ " Recall: " + "{:.3f}".format(recall) + " F1: " + "{:.3f}".format(f1) + " ROC AUC: "
+ "{:.3f}".format(roc_auc))
print("Confusion Matrix:")
print(confusion_matrix(ytest_fold, y_predict))
i = i + 1
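
# A hypothetical usage sketch (not part of the original class): X and y below are
# placeholder names for a feature DataFrame and a binary target, not defined in this notebook.
#
#   from sklearn.model_selection import train_test_split
#   xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=10)
#   trainer = ModelTraining(xtrain, xtest, ytrain, ytest)
#   logit = LogisticRegression(max_iter=1000).fit(xtrain, ytrain)
#   trainer.computeTrainingStats("Logistic Regression", logit, logit.predict(xtest))
#
# For a numerical target, the regressor comparison would instead be driven by:
#
#   cv_scores = trainer.crossValidationForNumericalTarget(xtrain, ytrain, folds=4)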