Details: Written by Steven Fonseca; Published: 31 December 2020
In [ ]:
import pandas as pd
from IPython.display import Markdown, display, HTML
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import  scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
    
# Helper class that supports exploratory data analysis (EDA), in particular producing data summaries and visualizations for numerical
# and categorical features. The class is instantiated with (mostly) unprepared data and then produces per-feature reports via its methods.

class ExploratoryDataAnalysis:
    """Produce EDA summaries and plots for a pandas DataFrame.

    The instance keeps a private copy of the dataframe it is given.  Feature
    roles are configured with ``setCategoricalFeatures``,
    ``setNumericalFeatures`` and ``setTargetFeature``; the remaining methods
    render Markdown/HTML reports (via IPython ``display``) or matplotlib /
    seaborn figures for those features.
    """

    dataframe = None

    # HTML fragments shared by the report builders.
    _OUTPUT_STYLE = "<style>.output_result { height:auto !important; max-height:500px; } </style>"
    _OUTER_TABLE_OPEN = "<table style=\"width: 100%; background-color:#ffffff\">"
    _INNER_TABLE_OPEN = "<table style=\"width: 100%; border:1px black solid;\">"

    def __init__(self, dataframe):
        """Store a copy of ``dataframe`` and start with no feature roles assigned."""
        self.dataframe = dataframe.copy()
        # Defaults so report methods do not fail with AttributeError before the setters are called.
        self.categoricalFeatures = []
        self.numericalFeatures = []
        self.targetFeature = None
        self.isCategoricalTarget = False

    def setCategoricalFeatures(self, features):
        """Set the list of column names treated as categorical features."""
        self.categoricalFeatures = features

    def setNumericalFeatures(self, features):
        """Set the list of column names treated as numerical features."""
        self.numericalFeatures = features

    def setTargetFeature(self, targetFeature, isCategorical):
        """Set the target column name and whether it is categorical (True/False valued)."""
        self.targetFeature = targetFeature
        self.isCategoricalTarget = isCategorical

    def summary(self):
        """Display a Markdown table with null percentage, dtype and unique count per column."""
        display(Markdown(self._summaryMarkdown()))

    def _summaryMarkdown(self):
        """Return the Markdown source of the per-column summary table."""
        totalInstances = self.dataframe.shape[0]
        lines = [
            "| Feature | % of nulls | Type| Number of Unique|",
            "|---|:---:|:---:|:---:|",
        ]
        for featureName in self.dataframe.columns:
            column = self.dataframe[featureName]
            # The column header promises a percentage, so scale the null fraction by 100;
            # an empty dataframe reports 0 instead of dividing by zero.
            percentNull = 100 * column.isnull().sum() / totalInstances if totalInstances else 0.0
            lines.append(
                "| {} | {:.1f} | {} | {} |".format(featureName, percentNull, column.dtype, column.nunique())
            )
        return "\n".join(lines) + "\n"

    # Numerical Feature Methods ---------------------------------------------------------------------------

    # Calculating VIF -------------------------------------------------------------------------------------
    # VIF starts at 1 and has no upper limit
    # VIF = 1, no correlation between the independent variable and the other variables
    # VIF exceeding 5 or 10 indicates high multicollinearity between this independent variable and the others

    def VIFSummary(self):
        """Return the VIF table for the configured numerical features."""
        return self.calculateVIF(self.numericalFeatures)

    def calculateVIF(self, features):
        """Return a DataFrame with columns ``Feature`` and ``VIF`` for the given list of columns."""
        selected = self.dataframe[features]
        values = selected.values  # extracted once instead of once per column
        vif = pd.DataFrame()
        vif["Feature"] = selected.columns
        vif["VIF"] = [variance_inflation_factor(values, i) for i in range(values.shape[1])]
        return vif

    def plotCorrelations(self):
        """Plot a heatmap of the absolute pairwise correlations (lower triangle only)."""
        try:
            # pandas >= 2.0 raises on non-numeric columns unless numeric_only=True is passed.
            corr = self.dataframe.corr(numeric_only=True)
        except TypeError:
            # Older pandas has no numeric_only argument and skips non-numeric columns by itself.
            corr = self.dataframe.corr()
        corr = corr.abs()

        # Hide the diagonal and the upper triangle by position, so that a genuine
        # correlation of exactly 0 in the lower triangle is still drawn.
        mask = np.triu(np.ones(corr.shape, dtype=bool))

        plt.figure(figsize=(15, 8))
        sns.set_style(style='white')  # white style so that no grid lines are drawn

        cmap = sns.diverging_palette(230, 20, as_cmap=True)

        sns.heatmap(corr, center=0.5, cmap=cmap, annot=True, xticklabels=corr.index,
                    yticklabels=corr.columns, cbar=False, linewidths=1, mask=mask)
        plt.xticks(rotation=90)
        plt.yticks(rotation=0)
        plt.show()

    def generateNumericalBinsTable(self, feature, numberOfBins):
        """Bin ``feature`` into ``numberOfBins`` equal-width bins and tabulate them.

        Returns a Series indexed by the lower edge of each bin.  With a
        categorical target the values are the share of rows whose target is
        True in each bin; otherwise they are the row counts per bin.
        ``self.dataframe`` is not modified.
        """
        values = self.dataframe[feature]
        lowest = values.min()
        highest = values.max()
        binSize = (highest - lowest) / numberOfBins

        bins = [lowest]
        bins.extend(lowest + i * binSize for i in range(1, numberOfBins))
        bins.append(highest)

        # include_lowest=True keeps the minimum value in the first bin (pd.cut bins are
        # left-open by default).  The result stays local so no helper column leaks
        # into self.dataframe.
        binned = pd.cut(x=values, bins=bins, labels=bins[:-1], include_lowest=True)

        if self.isCategoricalTarget:
            isPositive = self.dataframe[self.targetFeature] == True
            return binned[isPositive].value_counts() / binned.value_counts()

        return binned.value_counts()

    def _numericalSummaryTable(self, feature):
        """Return the three-column HTML summary (describe / additional / correlation stats)."""
        return "".join([
            self._OUTER_TABLE_OPEN,
            "<tr><td style=\"width:25%; padding-right: 20px;\">",
            self.generateDescribeStats(feature),
            "</td>",
            "<td style=\"width:40%; padding-right: 20px;\">",
            self.generateAdditionalStats(feature),
            "</td>",
            "<td>",
            self.generateCorrStats(feature),
            "</td></tr></table>",
        ])

    def numericalFeatureSummary(self):
        """Display the summary tables for every configured numerical feature."""
        parts = ["<hr><H2>Numerical Feature Summary</H2><hr>"]
        for feature in self.numericalFeatures:
            parts.append("<H3>" + feature + "</H3>")
            parts.append(self._numericalSummaryTable(feature))

        display(HTML(self._OUTPUT_STYLE + "".join(parts)))

    def analyzeNumericalFeature(self, feature, plot=False):
        """Display the report for one numerical feature; also draw its plots when ``plot`` is true.

        ``plot`` defaults to False so the method can be called with the feature
        name only (Python keeps just one definition per method name).
        """
        markdown = "".join([
            "<hr><H2>" + feature + " -- Numerical Feature Analysis </H2><hr><H3>Summary</H3>",
            self._numericalSummaryTable(feature),
            "<H3>Notes</H3>" + self.generateCategoricalNotes(feature),
            "<H3>Plots with Numerical Features</H3>",
        ])

        display(HTML(self._OUTPUT_STYLE + markdown))

        if plot:
            self.displayPlotsForNumericalFeature(feature, True)

    def generateCorrStats(self, feature):
        """Return an HTML table of the correlations of ``feature`` with every other numerical feature."""
        parts = [
            self._INNER_TABLE_OPEN
            + "<tr><th>Feature</th><th>Original Correlation</th><th>Adjusted Correlation</th></tr>"
        ]
        for numericalFeature in self.numericalFeatures:
            if feature == numericalFeature:
                continue
            corr = self.dataframe[feature].corr(self.dataframe[numericalFeature])
            parts.append(
                "<tr><td>" + numericalFeature + "</td><td>" + "{:.2f}".format(corr) + "</td><td>TODO</td></tr>"
            )
        parts.append("</table>")
        return "".join(parts)

    def generateAdditionalStats(self, feature):
        """Return an HTML table of data-cleaning statistics for ``feature``.

        Reports dtype, null count, zero count, negative count and the number of
        low/high outliers (beyond 1.5 * IQR from the quartiles), the latter also
        as a percentage of the non-null values.
        """
        series = self.dataframe[feature]
        totalInstances = series.count()  # non-null values only

        # Quartiles computed once; these are the "25%" / "75%" entries of describe().
        firstQuartile = series.quantile(0.25)
        thirdQuartile = series.quantile(0.75)
        iqr = thirdQuartile - firstQuartile

        numLowOutliers = int((series < firstQuartile - 1.5 * iqr).sum())
        numHighOutliers = int((series > thirdQuartile + 1.5 * iqr).sum())

        def row(label, value):
            return "<tr><td>" + label + "</td><td>" + value + "</td><td>TODO</td></tr>"

        def withPercent(count):
            # An all-null column has no instances; report 0% rather than dividing by zero.
            percent = 100 * count / totalInstances if totalInstances else 0.0
            return str(count) + " (" + "{:.2f}".format(percent) + "%)"

        return "".join([
            self._INNER_TABLE_OPEN,
            "<tr><th>Statistic</th><th>Original Value</th><th>Adjusted Value</th></tr>",
            row("Datatype", str(series.dtype)),
            row("# of Null", str(series.isnull().sum())),
            row("# of 0", str(int((series == 0).sum()))),
            row("# Less Than 0", str(int((series < 0).sum()))),
            row("# of Low Outliers", withPercent(numLowOutliers)),
            row("# of High Outliers", withPercent(numHighOutliers)),
            "</table>",
        ])

    def generateDescribeStats(self, feature):
        """Return an HTML table with ``describe()`` statistics, skewness and target correlation."""
        series = self.dataframe[feature]

        parts = [
            self._INNER_TABLE_OPEN
            + "<tr><th>Statistic</th><th>Original Value</th><th>Adjusted Value</th></tr>"
        ]
        for stat, value in series.describe().items():
            parts.append("<tr><td>" + stat + "</td><td>" + "{:.2f}".format(value) + "</td><td>TODO</td></tr>")

        parts.append("<tr><td>Skewness</td><td>" + "{:.2f}".format(series.skew()) + "</td><td>TODO</td></tr>")

        corr = series.corr(self.dataframe[self.targetFeature])
        parts.append(
            "<tr><td>" + self.targetFeature + " Correlation</td><td>" + "{:.2f}".format(corr)
            + "</td><td>TODO</td></tr>"
        )
        parts.append("</table>")
        return "".join(parts)

    def displayPlotsForNumericalFeature(self, featureToAnalyze, plotDensity):
        """Plot ``featureToAnalyze`` against every other numerical feature.

        When ``plotDensity`` is true the first row additionally shows the
        distribution of the feature and its relationship with the target
        (per-class densities for a categorical target, a regression plot
        otherwise).
        """
        otherFeatures = [f for f in self.numericalFeatures if f != featureToAnalyze]

        # The scatter plots start below the density row only when that row exists.
        firstScatterRow = 1 if plotDensity else 0
        rows = firstScatterRow + (len(otherFeatures) + 1) // 2
        if rows == 0:
            return  # nothing to draw

        # squeeze=False keeps ``ax`` two-dimensional even when there is a single row.
        fig, ax = plt.subplots(rows, 2, figsize=(20, 20), squeeze=False)

        if plotDensity:
            # NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept so the plots stay unchanged.
            sns.distplot(self.dataframe[featureToAnalyze], ax=ax[0][0])

            if self.isCategoricalTarget:
                target = self.dataframe[self.targetFeature]
                sns.distplot(self.dataframe[target == False][featureToAnalyze], color='r', label=0, ax=ax[0][1])
                sns.distplot(self.dataframe[target == True][featureToAnalyze], color='g', label=1, ax=ax[0][1])
                handles, labels = ax[0][1].get_legend_handles_labels()
                ax[0][1].legend(handles, labels)
            else:
                sns.regplot(x=featureToAnalyze, y=self.targetFeature, data=self.dataframe, ax=ax[0][1])

        for position, feature in enumerate(otherFeatures):
            r = firstScatterRow + position // 2
            c = position % 2
            sns.regplot(x=feature, y=featureToAnalyze, data=self.dataframe, ax=ax[r][c])

        # Close this figure only (once), so unrelated open figures are left alone.
        plt.close(fig)

        for a in fig.axes:
            plt.sca(a)
            plt.xticks(rotation=90)

        fig.subplots_adjust(hspace=0.5, wspace=0.1)

        fig.show()

    def displayDensityPlotsForNumericalFeatures(self, featuresToAnalyze):
        """Show, per feature, its overall density and its densities split by target False/True."""
        # assumes the target is categorical, TODO add support for numerical targets
        target = self.dataframe[self.targetFeature]

        for featureToAnalyze in featuresToAnalyze:
            fig, ax = plt.subplots(1, 2, figsize=(15, 5))

            sns.distplot(self.dataframe[featureToAnalyze], ax=ax[0])

            sns.distplot(self.dataframe[target == False][featureToAnalyze], color='r', label="Target 0", ax=ax[1])
            sns.distplot(self.dataframe[target == True][featureToAnalyze], color='g', label="Target 1", ax=ax[1])

            handles, labels = ax[1].get_legend_handles_labels()
            ax[1].legend(handles, labels)

            fig.show()

    def analyzeSkew(self):
        """Return the skew of each numerical feature, raw and after log / square-root transforms.

        Before transforming, a feature with negative values is shifted so that
        its minimum maps to 1; non-negative features are only offset by +1.
        """
        skews = pd.DataFrame()
        skews["OriginalSkew"] = self.dataframe[self.numericalFeatures].skew()

        lnSkews = {}
        sqRtSkews = {}
        for feature in self.numericalFeatures:
            series = self.dataframe[feature]
            minimum = series.min()
            shift = minimum if minimum < 0 else 0
            shifted = series - shift + 1
            lnSkews[feature] = np.log(shifted).skew()
            sqRtSkews[feature] = np.sqrt(shifted).skew()

        skews["LnSkew"] = pd.Series(lnSkews, dtype=float)
        skews["SqRtSkew"] = pd.Series(sqRtSkews, dtype=float)

        return skews

    # Categorical Feature Methods -------------------------------------------------------------------------

    def _categoricalSummaryTable(self, feature):
        """Return the three-column HTML summary (summary stats / contingency / value counts)."""
        return "".join([
            self._OUTER_TABLE_OPEN,
            "<tr><td style=\"width:30%; padding-right: 20px;\">",
            self.generateCategoricalSummaryStats(feature),
            "</td>",
            "<td style=\"width:36%; padding-right: 20px;\">",
            self.generateContingencyTable(feature),
            "</td>",
            "<td>",
            self.generateValueCountsTable(feature),
            "</td></tr></table>",
        ])

    def categoricalFeatureSummary(self):
        """Display the summary tables for every configured categorical feature."""
        parts = ["<hr><H2>Categorical Feature Summary</H2><hr>"]
        for feature in self.categoricalFeatures:
            parts.append("<H3>" + feature + "</H3>")
            parts.append(self._categoricalSummaryTable(feature))

        display(HTML(self._OUTPUT_STYLE + "".join(parts)))

    def analyzeCategoricalFeature(self, feature, plot=False):
        """Display the report for one categorical feature; also draw its plots when ``plot`` is true.

        ``plot`` defaults to False so the method can be called with the feature
        name only (Python keeps just one definition per method name).
        """
        markdown = "".join([
            "<hr><H2>" + feature + " -- Categorical Feature Analysis </H2><hr><H3>Summary</H3>",
            self._categoricalSummaryTable(feature),
            "<H3>Notes</H3>" + self.generateCategoricalNotes(feature),
            "<H3>Plots with Numerical Features</H3>",
        ])

        display(HTML(self._OUTPUT_STYLE + markdown))

        if plot:
            self.displayPlotsWithNumericalFeatures(feature)

    def generateCategoricalNotes(self, feature):
        """Placeholder for free-text notes; currently always returns "TODO"."""
        return "TODO"

    def generateCategoricalSummaryStats(self, feature):
        """Placeholder for categorical summary statistics; currently always returns "TODO"."""
        return "TODO"

    def generateContingencyTable(self, feature):
        """Return an HTML table with the % of True / False target values per category of ``feature``.

        Raises:
            KeyError: if the target column holds labels other than True/False.
        """
        crosstab = pd.crosstab(self.dataframe[feature], self.dataframe[self.targetFeature], normalize="index")

        if not all(isinstance(label, (bool, np.bool_)) for label in crosstab.columns):
            raise KeyError("target feature '" + str(self.targetFeature) + "' must contain only True/False values")

        # A target class that never occurs has no crosstab column; report its share as 0.
        crosstab = crosstab.reindex(columns=[True, False], fill_value=0.0)

        tf = self.targetFeature

        parts = [
            self._INNER_TABLE_OPEN
            + "<tr><th>Value</th><th>" + tf + " % True </th><th>" + tf + " % False</th></tr>"
        ]
        for index, row in crosstab.iterrows():
            parts.append(
                "<tr><td>" + str(index) + "</td><td>" + "{:.1f}".format(100 * row[True])
                + "</td><td>" + "{:.1f}".format(100 * row[False]) + "</td></tr>"
            )
        parts.append("</table>")
        return "".join(parts)

    def prepareNumericalFeatureSample(self, feature):
        """Return a random sample of the dataframe for plotting ``feature``.

        Samples a tenth of the rows (capped at 1000) and drops rows whose
        ``feature`` value lies 3 or more standard deviations from the sample mean.
        """
        # check if in numerical TODO
        totalRows = self.dataframe.shape[0]
        sampleSize = min(1000, totalRows // 10)
        if sampleSize == 0:
            # Fewer than 10 rows: keep all of them rather than returning an empty sample.
            sampleSize = totalRows

        sample = self.dataframe.sample(sampleSize)

        std = sample[feature].std()
        if not std > 0:
            # Constant (or single-row) sample: z-scores are undefined, so nothing is an outlier.
            return sample

        zScores = (sample[feature] - sample[feature].mean()).abs() / std
        return sample[zScores < 3]

    def displayPlotsWithNumericalFeatures(self, categoricalFeature):
        """Draw one swarm plot of ``categoricalFeature`` against each numerical feature, coloured by target."""
        rows = (len(self.numericalFeatures) + 1) // 2
        if rows == 0:
            return  # nothing to draw

        # squeeze=False keeps ``ax`` two-dimensional even when there is a single row.
        fig, ax = plt.subplots(rows, 2, figsize=(20, 20), squeeze=False)

        for position, feature in enumerate(self.numericalFeatures):
            # hue follows the configured target column instead of a hard-coded "Target" name.
            sns.swarmplot(x=categoricalFeature, y=feature, hue=self.targetFeature,
                          data=self.prepareNumericalFeatureSample(feature),
                          ax=ax[position // 2][position % 2])

        # Close this figure only (once), so unrelated open figures are left alone.
        plt.close(fig)

        for a in fig.axes:
            plt.sca(a)
            plt.xticks(rotation=90)

        fig.subplots_adjust(hspace=0.5, wspace=0.1)

        fig.show()

    def generateValueCountsTable(self, feature):
        """Return an HTML table with the count and percentage of each value of ``feature``."""
        valueCounts = self.dataframe[feature].value_counts()
        totalInstances = valueCounts.sum()

        parts = [
            "<table style=\"width: 100%; border:1px black solid; \">"
            "<tr><th>Value</th><th>Count</th><th>% of Total</th></tr>"
        ]
        for value, count in valueCounts.items():
            parts.append("<tr><td>" + str(value) + "</td><td>" + str(count) + "</td>")
            parts.append("<td>" + "{:.1f}".format(100 * count / totalInstances) + "</td></tr>")
        parts.append("</table>")
        return "".join(parts)
    
Exploratory Data Analysis Python Module