# In [ ]:  (Jupyter notebook cell prompt left over from the export)
import pandas as pd
from IPython.display import Markdown, display, HTML
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
# helper class that facilitates completing EDA, particularly producing data summaries and visualizations for numerical and categorical
# features. The class is instantiated with (mostly) unprepared data and then produces reports via its method calls for features
class ExploratoryDataAnalysis:
    """Helper that produces EDA summaries and plots for a pandas DataFrame.

    The instance works on a private copy of the DataFrame passed to the
    constructor. Feature roles (numerical, categorical, target) are supplied
    through the ``set*`` methods; the reporting methods then render
    Markdown/HTML tables through IPython ``display`` or draw seaborn plots.
    """

    dataframe = None

    # Inline CSS prepended to every HTML report emitted by this class.
    _OUTPUT_STYLE = "<style>.output_result { height:auto !important; max-height:500px; } </style>"

    def __init__(self, dataframe):
        """Store a copy of ``dataframe`` and start with no feature roles set."""
        self.dataframe = dataframe.copy()
        # Defaults so reporting methods do not raise AttributeError when a
        # setter has not been called yet.
        self.categoricalFeatures = []
        self.numericalFeatures = []
        self.targetFeature = None
        self.isCategoricalTarget = False

    def setCategoricalFeatures(self, features):
        """Record the list of categorical column names."""
        self.categoricalFeatures = features

    def setNumericalFeatures(self, features):
        """Record the list of numerical column names."""
        self.numericalFeatures = features

    def setTargetFeature(self, targetFeature, isCategorical):
        """Record the target column name and whether it is categorical."""
        self.targetFeature = targetFeature
        self.isCategoricalTarget = isCategorical

    def summary(self):
        """Display a Markdown table with null %, dtype and unique count per column."""
        totalInstances = self.dataframe.shape[0]
        lines = [
            "| Feature | % of nulls | Type| Number of Unique|",
            "|---|:---:|:---:|:---:|",
        ]
        for featureName in self.dataframe.columns:
            column = self.dataframe[featureName]
            # The column header promises a percentage, so scale the fraction.
            if totalInstances:
                percentNull = 100 * column.isnull().sum() / totalInstances
            else:
                percentNull = 0.0
            lines.append(
                "| {} | {:.2f} | {} | {} |".format(
                    featureName, percentNull, column.dtype, column.nunique()
                )
            )
        display(Markdown("\n".join(lines) + "\n"))

    # Numerical Feature Methods -------------------------------------------------
    # VIF starts at 1 and has no upper limit.
    # VIF = 1: no correlation between the independent variable and the others.
    # VIF exceeding 5 or 10 indicates high multicollinearity.

    def VIFSummary(self):
        """Return the VIF table for all configured numerical features."""
        return self.calculateVIF(self.numericalFeatures)

    def calculateVIF(self, features):
        """Return a DataFrame with columns ``Feature`` and ``VIF`` for ``features``.

        ``features`` is a list of column names of ``self.dataframe``.
        """
        subset = self.dataframe[features]
        values = subset.values  # hoisted: identical for every column index
        vif = pd.DataFrame()
        vif["Feature"] = subset.columns
        vif["VIF"] = [variance_inflation_factor(values, i) for i in range(values.shape[1])]
        return vif

    def plotCorrelations(self):
        """Draw a lower-triangle heatmap of absolute pairwise correlations."""
        try:
            # Explicitly skip non-numeric columns where pandas supports the flag.
            corr = self.dataframe.corr(numeric_only=True).abs()
        except TypeError:
            # Older pandas has no ``numeric_only`` argument on ``corr``.
            corr = self.dataframe.corr().abs()
        # Hide the diagonal and upper triangle by position rather than by
        # value, so a genuine 0.0 correlation is still shown.
        mask = np.triu(np.ones(corr.shape, dtype=bool))
        plt.figure(figsize=(15, 8))
        sns.set_style(style="white")  # white so that grid lines are not shown
        cmap = sns.diverging_palette(230, 20, as_cmap=True)
        sns.heatmap(corr, center=0.5, cmap=cmap, annot=True, cbar=False,
                    linewidths=1, mask=mask)
        plt.xticks(rotation=90)
        plt.yticks(rotation=0)
        plt.show()

    def generateNumericalBinsTable(self, feature, numberOfBins):
        """Bin ``feature`` into equal-width bins and summarise each bin.

        Returns a Series indexed by the left edge of each bin. When the target
        is categorical the values are the fraction of rows in the bin whose
        target equals True; otherwise they are the row counts per bin.
        ``self.dataframe`` is not modified.
        """
        values = self.dataframe[feature]
        lowest = values.min()
        highest = values.max()
        binSize = (highest - lowest) / numberOfBins
        edges = [lowest] + [lowest + i * binSize for i in range(1, numberOfBins)]
        edges.append(highest)
        # include_lowest keeps rows equal to the minimum inside the first bin.
        binned = pd.cut(x=values, bins=edges, labels=edges[:-1], include_lowest=True)
        binned = binned.rename(feature + "-bin")
        counts = binned.value_counts()
        if self.isCategoricalTarget:
            positives = binned[self.dataframe[self.targetFeature] == True]
            return positives.value_counts() / counts
        return counts

    def _numericalSummaryTable(self, feature):
        """Return the three-column HTML summary table for one numerical feature."""
        return (
            "<table style=\"width: 100%; background-color:#ffffff\"><tr><td style=\"width:25%; padding-right: 20px;\">"
            + self.generateDescribeStats(feature) + "</td>"
            + "<td style=\"width:40%; padding-right: 20px;\">" + self.generateAdditionalStats(feature) + "</td>"
            + "<td>" + self.generateCorrStats(feature) + "</td></tr></table>"
        )

    def numericalFeatureSummary(self):
        """Display the summary tables for every configured numerical feature."""
        parts = ["<hr><H2>Numerical Feature Summary</H2><hr>"]
        for feature in self.numericalFeatures:
            parts.append("<H3>" + feature + "</H3>")
            parts.append(self._numericalSummaryTable(feature))
        display(HTML(self._OUTPUT_STYLE + "".join(parts)))

    def analyzeNumericalFeature(self, feature, plot=False):
        """Display the report for one numerical feature; draw plots if ``plot``.

        ``plot`` defaults to False so the method can be called with just the
        feature name.
        """
        markdown = (
            "<hr><H2>" + feature + " -- Numerical Feature Analysis </H2><hr><H3>Summary</H3>"
            + self._numericalSummaryTable(feature)
            + "<H3>Notes</H3>" + self.generateCategoricalNotes(feature)
            + "<H3>Plots with Numerical Features</H3>"
        )
        display(HTML(self._OUTPUT_STYLE + markdown))
        if plot:
            self.displayPlotsForNumericalFeature(feature, True)

    def generateCorrStats(self, feature):
        """Return an HTML table of ``feature``'s correlation with each other numerical feature."""
        rows = []
        for numericalFeature in self.numericalFeatures:
            if feature == numericalFeature:
                continue
            corr = self.dataframe[feature].corr(self.dataframe[numericalFeature])
            rows.append(
                "<tr><td>" + numericalFeature + "</td><td>" + "{:.2f}".format(corr) + "</td><td>TODO</td></tr>"
            )
        return (
            "<table style=\"width: 100%; border:1px black solid;\"><tr><th>Feature</th><th>Original Correlation</th><th>Adjusted Correlation</th></tr>"
            + "".join(rows)
            + "</table>"
        )

    def generateAdditionalStats(self, feature):
        """Return an HTML table of data-cleaning statistics for ``feature``.

        Reports dtype, null count, zero count, negative count and the number
        of low/high outliers using the 1.5 * IQR rule.
        """
        series = self.dataframe[feature]
        total = series.count()  # non-null observations
        firstQuartile = series.quantile(0.25)
        thirdQuartile = series.quantile(0.75)
        iqr = thirdQuartile - firstQuartile
        numLowOutliers = (series < firstQuartile - 1.5 * iqr).sum()
        numHighOutliers = (series > thirdQuartile + 1.5 * iqr).sum()

        def percentOfTotal(count):
            # Avoid dividing by zero when the column has no non-null values.
            return 100 * count / total if total else float("nan")

        def simpleRow(label, value):
            return "<tr><td>" + label + "</td><td>" + str(value) + "</td><td>" + "TODO" + "</td></tr>"

        def outlierRow(label, count):
            return (
                "<tr><td>" + label + "</td><td>" + str(count)
                + " (" + "{:.2f}".format(percentOfTotal(count)) + "%)</td>"
                + "<td>" + "TODO" + "</td></tr>"
            )

        # The header row is now closed with </tr> (it was left open before).
        html = "<table style=\"width: 100%; border:1px black solid;\"><tr><th>Statistic</th><th>Original Value</th><th>Adjusted Value</th></tr>"
        html = html + simpleRow("Datatype", series.dtype)
        html = html + simpleRow("# of Null", series.isnull().sum())
        html = html + simpleRow("# of 0", (series == 0).sum())
        html = html + simpleRow("# Less Than 0", (series < 0).sum())
        html = html + outlierRow("# of Low Outliers", numLowOutliers)
        html = html + outlierRow("# of High Outliers", numHighOutliers)
        return html + "</table>"

    def generateDescribeStats(self, feature):
        """Return an HTML table of ``describe()`` stats, skewness and target correlation.

        The target-correlation row is omitted when no target feature is set.
        """
        series = self.dataframe[feature]
        html = "<table style=\"width: 100%; border:1px black solid;\"><tr><th>Statistic</th><th>Original Value</th><th>Adjusted Value</th></tr>"
        for stat, value in series.describe().items():
            html = html + "<tr><td>" + stat + "</td><td>" + "{:.2f}".format(value) + "</td><td>TODO</td></tr>"
        html = html + "<tr><td>Skewness</td><td>" + "{:.2f}".format(series.skew()) + "</td><td>TODO</td></tr>"
        if self.targetFeature is not None:
            corr = series.corr(self.dataframe[self.targetFeature])
            html = html + "<tr><td>" + self.targetFeature + " Correlation</td><td>" + "{:.2f}".format(corr) + "</td><td>TODO</td></tr>"
        return html + "</table>"

    def _plotDistribution(self, values, ax, **kwargs):
        """Draw a histogram with a KDE overlay of ``values`` on ``ax``.

        Uses ``sns.histplot`` when the installed seaborn provides it and falls
        back to the deprecated ``sns.distplot`` otherwise.
        """
        if hasattr(sns, "histplot"):
            return sns.histplot(values, kde=True, stat="density", ax=ax, **kwargs)
        return sns.distplot(values, ax=ax, **kwargs)

    def _finishFigure(self, fig):
        """Rotate x tick labels, adjust spacing, then close and show ``fig``."""
        for axis in fig.axes:
            axis.tick_params(axis="x", labelrotation=90)
        fig.subplots_adjust(hspace=0.5, wspace=0.1)
        # NOTE(review): close-then-show mirrors the original call order;
        # presumably it stops the notebook from rendering the figure twice.
        plt.close(fig)
        fig.show()

    def displayPlotsForNumericalFeature(self, featureToAnalyze, plotDensity):
        """Plot ``featureToAnalyze`` against every other numerical feature.

        When ``plotDensity`` is truthy the first row holds the feature's
        distribution and its relationship with the target: per-class
        distributions for a categorical target, a regression plot otherwise.
        """
        others = [f for f in self.numericalFeatures if f != featureToAnalyze]
        firstRow = 1 if plotDensity else 0  # the density row, when present, is row 0
        rows = firstRow + (len(others) + 1) // 2
        if rows == 0:
            return  # nothing to draw
        # squeeze=False keeps ``ax`` two-dimensional even for a single row.
        fig, ax = plt.subplots(rows, 2, figsize=(20, 20), squeeze=False)
        if plotDensity:
            self._plotDistribution(self.dataframe[featureToAnalyze], ax[0][0])
            if self.isCategoricalTarget:
                target = self.dataframe[self.targetFeature]
                self._plotDistribution(self.dataframe[target == False][featureToAnalyze],
                                       ax[0][1], color='r', label="0")
                self._plotDistribution(self.dataframe[target == True][featureToAnalyze],
                                       ax[0][1], color='g', label="1")
            else:
                sns.regplot(x=featureToAnalyze, y=self.targetFeature, data=self.dataframe, ax=ax[0][1])
            handles, labels = ax[0][1].get_legend_handles_labels()
            if handles:
                ax[0][1].legend(handles, labels)
        for i, feature in enumerate(others):
            sns.regplot(x=feature, y=featureToAnalyze, data=self.dataframe,
                        ax=ax[firstRow + i // 2][i % 2])
        if len(others) % 2 == 1:
            ax[-1][1].set_visible(False)  # odd count leaves the last cell empty
        self._finishFigure(fig)

    def displayDensityPlotsForNumericalFeatures(self, featuresToAnalyze):
        """Plot the overall and per-target-class distribution of each feature.

        Assumes the target is categorical. TODO: add support for numerical targets.
        """
        target = self.dataframe[self.targetFeature]
        for featureToAnalyze in featuresToAnalyze:
            fig, ax = plt.subplots(1, 2, figsize=(15, 5))
            self._plotDistribution(self.dataframe[featureToAnalyze], ax[0])
            self._plotDistribution(self.dataframe[target == False][featureToAnalyze],
                                   ax[1], color='r', label="Target 0")
            self._plotDistribution(self.dataframe[target == True][featureToAnalyze],
                                   ax[1], color='g', label="Target 1")
            handles, labels = ax[1].get_legend_handles_labels()
            if handles:
                ax[1].legend(handles, labels)
            fig.show()

    def analyzeSkew(self):
        """Return skewness of each numerical feature, raw and after log / sqrt transforms.

        Before transforming, values are shifted by ``-min + 1`` when the
        minimum is not positive, and by ``+1`` otherwise.
        """
        skews = pd.DataFrame()
        skews["OriginalSkew"] = self.dataframe[self.numericalFeatures].skew()
        for feature in self.numericalFeatures:
            column = self.dataframe[feature]
            minimum = column.min()
            if minimum > 0:
                minimum = 0
            shifted = column - minimum + 1
            skews.at[feature, "LnSkew"] = np.log(shifted).skew()
            skews.at[feature, "SqRtSkew"] = np.sqrt(shifted).skew()
        return skews

    # Categorical Feature Methods -----------------------------------------------

    def _categoricalSummaryTable(self, feature):
        """Return the three-column HTML summary table for one categorical feature."""
        return (
            "<table style=\"width: 100%; background-color:#ffffff\"><tr><td style=\"width:30%; padding-right: 20px;\">"
            + self.generateCategoricalSummaryStats(feature) + "</td>"
            + "<td style=\"width:36%; padding-right: 20px;\">" + self.generateContingencyTable(feature) + "</td>"
            + "<td>" + self.generateValueCountsTable(feature) + "</td></tr></table>"
        )

    def categoricalFeatureSummary(self):
        """Display the summary tables for every configured categorical feature."""
        parts = ["<hr><H2>Categorical Feature Summary</H2><hr>"]
        for feature in self.categoricalFeatures:
            parts.append("<H3>" + feature + "</H3>")
            parts.append(self._categoricalSummaryTable(feature))
        display(HTML(self._OUTPUT_STYLE + "".join(parts)))

    def analyzeCategoricalFeature(self, feature, plot=False):
        """Display the report for one categorical feature; draw plots if ``plot``.

        ``plot`` defaults to False so the method can be called with just the
        feature name.
        """
        markdown = (
            "<hr><H2>" + feature + " -- Categorical Feature Analysis </H2><hr><H3>Summary</H3>"
            + self._categoricalSummaryTable(feature)
            + "<H3>Notes</H3>" + self.generateCategoricalNotes(feature)
            + "<H3>Plots with Numerical Features</H3>"
        )
        display(HTML(self._OUTPUT_STYLE + markdown))
        if plot:
            self.displayPlotsWithNumericalFeatures(feature)

    def generateCategoricalNotes(self, feature):
        """Placeholder for free-form notes; currently always returns "TODO"."""
        return "TODO"

    def generateCategoricalSummaryStats(self, feature):
        """Placeholder for categorical summary statistics; currently returns "TODO"."""
        return "TODO"

    def generateContingencyTable(self, feature):
        """Return an HTML table of % True / % False target per value of ``feature``.

        Rows whose target is null are ignored. A class that never occurs is
        reported as 0.0 instead of raising ``KeyError``.
        """
        tf = self.targetFeature
        target = self.dataframe[tf]
        known = target.notna()
        # Comparing with True always yields boolean columns, so 0/1 targets
        # and single-class targets are handled uniformly.
        isPositive = target[known] == True
        crosstab = pd.crosstab(self.dataframe.loc[known, feature], isPositive, normalize="index")
        markdown = "<table style=\"width: 100%; border:1px black solid;\"><tr><th>Value</th><th>" + str(tf) + " % True </th><th>" + str(tf) + " % False</th></tr>"
        for index, row in crosstab.iterrows():
            percentTrue = 100 * row.get(True, 0.0)
            percentFalse = 100 * row.get(False, 0.0)
            markdown = markdown + "<tr><td>" + str(index) + "</td><td>" + "{:.1f}".format(percentTrue) + "</td><td>" + "{:.1f}".format(percentFalse) + "</td></tr>"
        return markdown + "</table>"

    def prepareNumericalFeatureSample(self, feature):
        """Return a random sample of rows with ``feature`` outliers removed.

        The sample has ``min(1000, rows // 10)`` rows. Rows more than three
        sample standard deviations from the sample mean are dropped. When the
        standard deviation is zero or undefined, only rows with a null
        ``feature`` are dropped, so a constant column is not emptied.
        """
        # TODO: check that ``feature`` is one of the numerical features.
        sample = self.dataframe.sample(min(1000, int(self.dataframe.shape[0] / 10)))
        deviation = (sample[feature] - sample[feature].mean()).abs()
        spread = sample[feature].std()
        if spread > 0:
            keep = deviation / spread < 3
        else:
            keep = deviation.notna()
        return sample[keep]

    def displayPlotsWithNumericalFeatures(self, categoricalFeature):
        """Draw a swarm plot of each numerical feature by ``categoricalFeature``, hued by the target."""
        features = list(self.numericalFeatures)
        rows = (len(features) + 1) // 2
        if rows == 0:
            return  # nothing to draw
        # squeeze=False keeps ``ax`` two-dimensional even for a single row.
        fig, ax = plt.subplots(rows, 2, figsize=(20, 20), squeeze=False)
        for i, feature in enumerate(features):
            sns.swarmplot(x=categoricalFeature, y=feature, hue=self.targetFeature,
                          data=self.prepareNumericalFeatureSample(feature),
                          ax=ax[i // 2][i % 2])
        if len(features) % 2 == 1:
            ax[-1][1].set_visible(False)  # odd count leaves the last cell empty
        self._finishFigure(fig)

    def generateValueCountsTable(self, feature):
        """Return an HTML table with the count and % of total for each value of ``feature``."""
        value_counts = self.dataframe[feature].value_counts()
        totalInstances = value_counts.sum()
        rows = []
        for value, count in value_counts.items():
            rows.append(
                "<tr><td>" + str(value) + "</td><td>" + str(count) + "</td>"
                + "<td>" + "{:.1f}".format(100 * count / totalInstances) + "</td></tr>"
            )
        return (
            "<table style=\"width: 100%; border:1px black solid; \"><tr><th>Value</th><th>Count</th><th>% of Total</th></tr>"
            + "".join(rows)
            + "</table>"
        )