Welcome
Hello and welcome to my site! If you don't know me or want a more thorough introduction to my professional background, have a look at the about section of this site. Over the years I have spent a lot of time serving as a technical thought leader for a variety of organizations, where the vast majority of my work has only been shared internally due to standard intellectual property restrictions. I've launched this site to share more with the software industry and to foster ongoing collaborations with like-minded professionals who share a passion for advancing the software state of the practice.
Site Contents Introduction
- Architecture
- DevOps
- Data + AI/ML
- Leadership
- Engineering
Consulting Opportunities
Over the course of my career, I have been both a part-time and a full-time independent consultant. If you have an interesting project and need help, I am available for a few different types of engagements:
- Digital Transformation, partnering with executives to plan transformational organizational change.
- Technology Strategy, partnering with executives to envision target states and develop roadmaps for capability build out.
- Technical Architecture Definition, designing solutions for cloud (AWS) and hybrid deployments.
- Technology Evaluation and Selection, methodically choosing technology stacks for use.
- Reference Architecture and Technical Best Practices Definition, defining enduring technical standards for broad reuse.
- Research of Emerging Technology, assessing the maturity and applicability of new technologies and technical approaches.
- Career Coaching, working with software engineers, architects, and technical leaders on career growth.
- Software Process Improvement, introducing software design and governance practices into organizations.
Additional Resources
Contact Information
Feel free to reach out to me for any reason you find compelling. I'm interested in networking generally and would also love to hear your thoughts on the topics covered on this site.
(408) 318-5634
import pandas as pd
from IPython.display import Markdown, display, HTML
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression # importing logistic regression from scikit learn
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from yellowbrick.classifier import ClassificationReport, ROCAUC
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# This class provides methods for generating reports of model training, including result statistics and plots.
# It also provides methods for running cross validation and for comparing the performance of supervised learning models provided by scikit-learn.
class ModelTraining:
results = pd.DataFrame(columns = ["Model Name", "Training Accuracy", "Testing Accuracy", "Recall Score", "Precision Score", "F1 Score", "ROC AUC Score" ])
# TODO, update numericalRegressors array name
numericalRegressors = ["Model Name", "Fold Number", "Mean Absolute Error", "Root Mean Squared Error", "R Squared"]
def __init__(self, xtrain,xtest,ytrain,ytest):
self.xtrain = xtrain
self.xtest = xtest
self.ytrain = ytrain
self.ytest = ytest
def getResults(self):
return self.results
# generate an HTML report of textual results as well as visualizations of that data ------------------------------------------------
def computeTrainingStats(self, modelName, model, y_predict):
training_accuracy = model.score(self.xtrain,self.ytrain)
testing_accuracy = model.score(self.xtest, self.ytest)
r_score = recall_score(self.ytest,y_predict)
p_score = precision_score(self.ytest,y_predict)
f_score = f1_score(self.ytest,y_predict)
r_auc_score = roc_auc_score(self.ytest,y_predict)
result = { "Model Name": modelName, "Training Accuracy": training_accuracy, "Testing Accuracy": testing_accuracy, "Recall Score": r_score, "Precision Score": p_score, "F1 Score": f_score, "ROC AUC Score": r_auc_score}
self.results = self.results.append(result, ignore_index=True)
html = "<table style=\"width: 40%; background-color:#ffffff\">"
html = html + "<tr><th>Model Name</th><td>" + modelName + "</td></tr>"
html = html + "<tr><th>Training Accuracy</th><td>" + "{:.4f}".format(training_accuracy) + "</td></tr>"
html = html + "<tr><th>Testing Accuracy</th><td>" + "{:.4f}".format(testing_accuracy) + "</td></tr>"
html = html + "<tr><th>Recall Score</th><td>" + "{:.4f}".format(r_score) + "</td></tr>"
html = html + "<tr><th>Precision Score</th><td>" + "{:.4f}".format(p_score) + "</td></tr>"
html = html + "<tr><th>F1 Score</th><td>" + "{:.4f}".format(f_score) + "</td></tr>"
html = html + "<tr><th>ROC AUC Score</th><td>" + "{:.4f}".format(r_auc_score) + "</td></tr>"
html = html + "</table>"
display(HTML("<style>.output_result { height:auto !important; max-height:500px; } </style>" + html))
self.drawLogisticScores(model)
self.drawConfusionMatrix(y_predict)
return result
# Visualize model performance with yellowbrick library -----------------------------------------------------------------------------
def drawLogisticScores(self, model):
viz = ClassificationReport(model)
viz.fit(self.xtrain, self.ytrain)
viz.score(self.xtest, self.ytest)
viz.show()
roc = ROCAUC(model)
roc.fit(self.xtrain, self.ytrain)
roc.score(self.xtest, self.ytest)
roc.show()
# Plot a confusion matrix ----------------------------------------------------------------------------------------------------------
def drawConfusionMatrix(self, y_predict ):
cm = confusion_matrix( self.ytest, y_predict)
sns.heatmap(cm, annot=True, fmt='.2f', xticklabels = [0,1] , yticklabels = [0,1] )
plt.ylabel('Observed')
plt.xlabel('Predicted')
plt.show()
# Use a Linear Regression model to estimate the feature importance to prediction ---------------------------------------------------
def estimateNumericalFeatureImportance(self):
coefficients = pd.DataFrame(columns = ["Feature", "Coefficient"])
model = LinearRegression()
model.fit(self.xtrain, self.ytrain)
for idx, feature in enumerate(self.xtrain.columns):
coefficients = coefficients.append({"Feature": feature, "Coefficient": model.coef_[0][idx]}, ignore_index=True)
return coefficients
# Compare the performance of standard SciKit Learn models for numerical predictions -------------------------------------------------
# TODO, make model hyperparameters configurable instead of fixed
def crossValidationForNumericalTarget(self, xtrain, ytrain, folds):
cvResults = pd.DataFrame(columns = self.numericalRegressors )
# Instantiate the models used for numerical predictions
models = {}
models["Linear Regressor"] = LinearRegression()
models["Bagging Regressor"] = BaggingRegressor(n_estimators=45,random_state=10, base_estimator=models["Linear Regressor"])
models["Adaptive Boosting Regressor"] = AdaBoostRegressor( n_estimators=40,random_state=10, base_estimator=models["Linear Regressor"])
models["Random Forest Regressor"] = RandomForestRegressor(n_estimators = 45, random_state=10)
models["Decision Tree Regressor"] = DecisionTreeRegressor(criterion = 'mse', max_depth=5, random_state=10)
models["Gradient Boosting Regressor"] = GradientBoostingRegressor(n_estimators=40,random_state=10, init=models["Linear Regressor"])
# for each model, perform cross validation, then store and return the results for comparison
for modelName in models.keys():
result = self.performNumericalCrossValidation(models[modelName], xtrain, ytrain, folds, modelName)
cvResults = cvResults.append( result, ignore_index=True)
return cvResults
# Perform cross validation for a polynomial regressor for the degree passed in and return the results ------------------------------
def crossValidationForPolynomialModel(self, xtrain, ytrain, folds, degree):
cvResults = pd.DataFrame(columns = [c for c in self.numericalRegressors if c != "R Squared"])  # list.remove() returns None and would mutate the shared column list
# generate polynomial features according to the degree in preparation for model fitting using linear regression
poly = PolynomialFeatures(degree=degree, interaction_only=True)
xtrainPoly = poly.fit_transform(xtrain)
xtrainPoly = DataFrame(xtrainPoly)
polyRegression = LinearRegression()
# reuse the numerical cross validation method to assess the performance of this model
result = self.performNumericalCrossValidation(polyRegression, xtrainPoly, ytrain, folds, "Polynomial Regressor").drop("R Squared", axis=1)
cvResults = cvResults.append( result, ignore_index=True)
return cvResults
# Perform cross validation for polynomial regressor models of degree 2 to 10 and return the performance results --------------------
# TODO, return a second result that is the integer value for the most optimal degree based on minimizing RMSE
def findOptimalPolynomialDegree(self, xtrain, ytrain):
results = pd.DataFrame(columns = ["Model Name", "Fold Number", "Mean Absolute Error", "Root Mean Squared Error", "R Squared"])
for i in range(2, 11):  # degrees 2 through 10, matching the description above
poly = PolynomialFeatures(degree=i, interaction_only=True)
xtrainPoly = poly.fit_transform(xtrain)
xtrainPoly = DataFrame(xtrainPoly)
polyRegression = LinearRegression()
result = self.performNumericalCrossValidation(polyRegression, xtrainPoly, ytrain, 4, "Polynomial Regressor").drop("R Squared", axis=1)
result["Degree"] = result["Model Name"].apply(lambda x: int(i))
results = results.append(result, ignore_index=True)
return results
# Perform cross validation for regression models and collect multiple scoring statistics for each fold ------------------------------
def performNumericalCrossValidation(self, model, xtrain, ytrain, num_folds, modelName):
results = pd.DataFrame(columns = ["Model Name", "Fold Number", "Mean Absolute Error", "Root Mean Squared Error", "R Squared"])
folds = KFold(n_splits=num_folds, shuffle=True, random_state=11)  # shuffle must be enabled when a random_state is supplied
i = 1
for train_index, test_index in folds.split(xtrain, ytrain):
cmodel = clone(model)
xtrain_fold = xtrain.iloc[train_index]
ytrain_fold = ytrain.iloc[train_index]
xtest_fold = xtrain.iloc[test_index]
ytest_fold = ytrain.iloc[test_index]
cmodel.fit(xtrain_fold, ytrain_fold)
y_predict = cmodel.predict(xtest_fold)
# Mean Absolute Error
mae = mean_absolute_error(ytest_fold, y_predict)
# RMSE
rmse = mean_squared_error(ytest_fold, y_predict)**0.5
# R2 Squared:
r2 = r2_score(ytest_fold, y_predict)
new_row = {"Model Name":modelName, "Fold Number":i, "Mean Absolute Error" :mae, "Root Mean Squared Error":rmse, "R Squared": r2}
#append row to the dataframe
results = results.append(new_row, ignore_index=True)
i = i + 1
return results
# TODO -- This method is not yet complete and is not yet in use ----------------------------------------------------------------------
def performCategoricalCrossValidation(self, model, xtrain, ytrain, num_folds):
folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=11)  # shuffle must be enabled when a random_state is supplied
i = 1
for train_index, test_index in folds.split(xtrain, ytrain):
cmodel = clone(model)
xtrain_fold = xtrain.iloc[train_index]
ytrain_fold = ytrain.iloc[train_index]
xtest_fold = xtrain.iloc[test_index]
ytest_fold = ytrain.iloc[test_index]
cmodel.fit(xtrain_fold, ytrain_fold)
y_predict = cmodel.predict(xtest_fold)
accuracy = accuracy_score(ytest_fold, y_predict)
precision = precision_score(ytest_fold, y_predict)
recall = recall_score(ytest_fold, y_predict)
f1 = f1_score(ytest_fold, y_predict)
roc_auc = roc_auc_score(ytest_fold, y_predict)
print("Fold (" + str(i) + ")------------------------------")
print("Accuracy: " + "{:.3f}".format(accuracy) + " Precision: " + "{:.3f}".format(precision)
+ " Recall: " + "{:.3f}".format(recall) + " F1: " + "{:.3f}".format(f1) + " ROC AUC: "
+ "{:.3f}".format(roc_auc))
print("Confusion Matrix:")
print(confusion_matrix(ytest_fold, y_predict))
i = i + 1
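To show how this helper is meant to be called, here is a minimal, hypothetical usage sketch; the feature matrix X, the target y, and the choice of logistic regression are illustrative assumptions, not part of the class above.
# Hypothetical usage sketch for ModelTraining (assumes X and y are an already-prepared feature dataframe and binary target)
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=10, stratify=y)
trainer = ModelTraining(xtrain, xtest, ytrain, ytest)
logistic = LogisticRegression(max_iter=1000)
logistic.fit(xtrain, ytrain)
y_predict = logistic.predict(xtest)
# displays the HTML metrics table, the yellowbrick classification report and ROC curves, and the confusion matrix
trainer.computeTrainingStats("Logistic Regression", logistic, y_predict)
# accumulated comparison table across every model reported so far
display(trainer.getResults())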
Steven has two decades of experience leading companies through waves of technology adoption and the build-out of organizational competencies. He has led business unit and corporate level technology change initiatives for SOA, REST APIs, microservices, agile and SAFe, cloud (AWS) migration, containers and container orchestration (AWS ECS, Kubernetes), DevOps, and most recently AI/ML + data analytics. Steven is a technologist and thought leader with a history of driving innovation through a hands-on approach -- helping organizations and individuals execute better, pursue innovation for competitive advantage, and maintain an engaged technical community.
Professional Experience
Most recently, Steven served as Senior Director of Architecture for Cox, where he established an architecture function from the ground up, driving designs for a ~$500 million portfolio of 10 distinct products. He rebuilt and mentored a 10-member team of architects; provided technical thought leadership and chief-architect-level design review; drove the technical vision for microservices, the cloud, DevOps, and data analytics; drove the definition of reference architectures and technical standards; established an architecture roadmap practice; as a senior leader, advised on all aspects of organizational health and mediated conflicts; and served on the start-up acquisition and partnerships team to assess external opportunities.
At Intuit, Steven served as a Principal Technical Architect, where he led two large cross-domain strategic initiatives. He led the cloud migration initiative, serving as the IT business unit cloud architect who drove the migration of all of its capabilities out of local data centers and into AWS on an aggressive timeline. He governed the design of all IT cloud-based deployments, gating production releases to ensure adherence to best practices for high availability, disaster recovery, performance, resiliency, security, external integration, technology stacks, CI/CD, etc. He represented the CIO's office for enterprise cloud standards. He also led the IT-as-a-service initiative, growing the services organizational competency from a small centralized team to a distributed set of teams building services in parallel. He drove the design of hundreds of REST APIs in the domains of billing, subscription, product catalog, enterprise account, order placement, and payments, with some contract definition for marketing and workforce management. He represented the CIO's office for enterprise API and microservices standards.
At Capital One, Steven served as the Director of Product for API Products, where he led the transformation of API products from SOAP to RESTful services. For all of Capital One, he owned API governance, external API strategy, enterprise reuse strategy, Capital One API training, and Capital One API design.
At eBay, Steven served as Senior Product Manager where he owned all services middleware and tooling for the ebay.com site. He led the first generally available release of the RESTful services platform that now (2016) hosts over 60% of ebay.com APIs. He owned the StubHub platform product line including all core middleware. Steven led the first generally available release of the StubHub public API developer portal from concept to delivery.
At NASA’s Jet Propulsion Laboratory (JPL), Steven served as Chief Software Architect for the Deep Space Information Services Architecture initiative where he was chartered with leading the modernization of the Deep Space Network and Advanced Multi-Mission Operations Systems to the service-oriented architectural style. He led a staff of architects and external consultants to define a common architecture for adoption across two large organizations. He partnered with implementation teams to build out SOA infrastructure and application services. He drove SOA, design modeling, and information architecture education. He championed modernization at all levels of JPL management and at partner NASA centers.
Prior to working at JPL, Steven spent three years working at NASA Ames Research Center, beginning with a postdoctoral study of peer-to-peer infrastructures offering expressive publication and subscription languages to more efficiently limit message traffic. Steven then joined the Mission Control Technologies Project (MCT), where he served as architect and engineering manager. For MCT, he led the development of a set of frameworks for building mission ground systems from semantically interoperable composed components. Part of this work included the implementation of an information semantics manager that enabled the consumption of and inference over ontologically described metadata by application components. He was instrumental in influencing Johnson Space Center to adopt these frameworks for building International Space Station (ISS) mission control applications that remotely managed IT resources on the ISS, securing a multi-year, multi-million-dollar budget for MCT to continue framework development. At the time he left Ames, he was leading an advanced prototyping group for NASA’s Constellation Program Information Architecture Team, where he was lead author of the program’s communications framework specification.
Steven created a single-person LLC that offered architecture consulting services to small start-ups and large multi-billion-dollar corporations, where he (1) provided strategic guidance on SOA adoption and architecture expertise to the CTO of a $3B corporation; (2) provided technical thought leadership for a $400M information technology services NASA contract re-compete; (3) wrote a RESTful services design best practices standard, completed an API management technology evaluation, and prototyped an API developer portal for a Series B $30M start-up; (4) prototyped an intelligent network device configuration management system using semantic technologies (RDF, HP Labs Jena graph database, MQTT) for an angel-funded start-up; and (5) built SOA BPMN workflows for a call center automation project for a multi-billion-dollar international telecommunication services company.
Steven worked in the Software Technology Lab developing intellectual property for productizing large-scale, distributed and intelligent multi-agent systems. He innovated software engineering techniques for highly communicative and ontology-based multi-agent systems. He helped architect and implement several agent-based e-commerce applications from bleeding edge multi-agent system frameworks. He reviewed invention disclosures and recommended research directions. He published methods for modeling communication protocols and architectures for coding internal agent behavior.
Steven served as a visiting researcher at British Telecommunications Lab in England one summer where he worked on best practices for describing multi-agent interaction communication protocols.
Steven held several short-term positions while a student at UC Davis and UC Santa Cruz. He provided design support for 8-bit microcontrollers, wrote a Pentium processor thermal chamber testing procedure, and implemented a tool suite for chipset rollout planners to manage new product launches.
Academic Experience
Steven served as an instructor for the UCSC Extension advanced course on object oriented design patterns. He taught the Gang of Four design patterns (Abstract Factory, Strategy, Interpreter, Composite, etc.) for several quarters, including creating the course curriculum and giving 32 hours of lectures each quarter.
Steven completed his MBA at the UCLA Anderson School of Management. This program built skills in general management, including marketing, statistics, micro and macro economics, accounting, leadership, operations, human resources, mergers and acquisitions, finance, and entrepreneurship.
Steven completed his PhD in Computer Science from the University of California at Santa Cruz in collaboration with Martin Griss at Hewlett Packard Laboratories. His dissertation title was “Methodological and Software Artifacts Support for Agent-Oriented Component Reuse and Engineering.” Steven completed his MS in Computer Science from the University of California at Santa Cruz where he studied design patterns, object oriented frameworks, and software reuse.
At the Royal Melbourne Institute of Technology in Australia, Steven held a six-month research fellowship where he worked on software engineering techniques for building intelligent multi-agent systems.
Steven completed a Bachelor of Science in Computer Engineering from the University of California at Davis. At that time Steven was interested in computer architecture and digital circuit design. He spent time participating in a research program for minority students where he worked on a reusable platform for building digital signal processing filters from field programmable gate arrays (FPGAs). This work was presented at several undergraduate research conferences.
In addition to his studies at the University of California, Steven has completed several certifications since finishing his PhD. He was a Carnegie Mellon University Software Engineering Institute certified instructor for their Software Architecture Principles and Practices curriculum.
Steven is also a FEAC Institute Certified Enterprise Architect. This included completing a competency-based program in which participants learn the foundations of enterprise architecture and the value and utility of different frameworks, modeling notations, and techniques. The training focused on the mechanics of developing an integrated enterprise architecture.
Steven holds the Project Management Institute PMP and Agile Practitioner certifications.
Steven is a certified mediator from the San Francisco Community Boards. He served as a volunteer mediator to help resolve community disputes during the time that he lived in San Francisco.
Steven is an AWS Certified Solutions Architect Associate and Developer.
Steven completed Stanford's Advanced Security certification, a multi-class curriculum covering security goals, secure system design, secure design principles, worms and other malware, buffer overflows, client-state manipulation, SQL injection, password security, cross-domain security in web applications, and cryptography algorithms.
Steven completed the post-graduate program certificate in AI/ML for data scientists from the University of Texas, Austin in March of 2021. This was a 6-month program covering all of the major machine learning algorithms and requiring the completion of 8 projects coded in Python using Jupyter Notebooks. He gained theoretical and hands-on experience with (1) supervised learning, including linear, logistic, decision tree, random forest, adaptive boosting, gradient boosting, and polynomial models; (2) unsupervised learning, including k-means clustering and hierarchical clustering models; (3) neural network models; (4) computer vision models; and (5) natural language processing models. Steven learned how to perform exploratory data analysis, feature engineering, and model hyperparameter optimization. A few of the Python libraries used include Pandas, NumPy, scikit-learn, Keras, TensorFlow, Matplotlib, and Seaborn.
Steven completed a course covering the content of the ITIL V3 Foundation Certificate in IT Service Management certification, focused on providing an introduction to the ITIL framework. He passed the certification exam as part of this course.
import pandas as pd
from IPython.display import Markdown, display, HTML
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
# A helper class that facilitates exploratory data analysis (EDA), particularly producing data summaries and visualizations for numerical and categorical
# features. The class is instantiated with (mostly) unprepared data and then produces feature reports via its method calls.
class ExploratoryDataAnalysis:
dataframe = None
def __init__(self, dataframe):
self.dataframe = dataframe.copy()
def setCategoricalFeatures(self, features):
self.categoricalFeatures = features
def setNumericalFeatures(self, features):
self.numericalFeatures = features
def setTargetFeature(self, targetFeature, isCategorical):
self.targetFeature = targetFeature
self.isCategoricalTarget = isCategorical
def summary(self):
markdown = "| Feature | % of nulls | Type| Number of Unique|\n"
markdown = markdown + "|---|:---:|:---:|:---:|\n"
totalInstances = self.dataframe.shape[0]
for featureName in self.dataframe.columns:
markdown = markdown + "| " + featureName
markdown = markdown + " | "+ str(self.dataframe[featureName].isnull().sum()/totalInstances).format(1./1.)
markdown = markdown + " | " + str(self.dataframe[featureName].dtype)
markdown = markdown + " | " + str(self.dataframe[featureName].nunique()) + " |\n"
display(Markdown(markdown))
# Numerical Feature Methods ---------------------------------------------------------------------------
# Calculating VIF -------------------------------------------------------------------------------------------------------------------
# VIF starts at 1 and has no upper limit
# VIF = 1, no correlation between the independent variable and the other variables
# VIF exceeding 5 or 10 indicates high multicollinearity between this independent variable and the others
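# For reference, the standard definition: VIF for feature i is 1 / (1 - R_i^2), where R_i^2 comes from regressing
# feature i on all of the other features; as that regression fit improves, the denominator shrinks and VIF grows without bound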
def VIFSummary(self):
vif = pd.DataFrame()
vif["Feature"] = self.dataframe[self.numericalFeatures].columns
vif["VIF"] = [variance_inflation_factor(self.dataframe[self.numericalFeatures].values, i) for i in range(self.dataframe[self.numericalFeatures].shape[1])]
return vif
# ------------------------------------------------------------------------------------------------------------------------------------
def calculateVIF(self, features):
vif = pd.DataFrame()
vif["Feature"] = self.dataframe[features].columns
vif["VIF"] = [variance_inflation_factor(self.dataframe[features].values, i) for i in range(self.dataframe[features].shape[1])]
return vif
# ------------------------------------------------------------------------------------------------------------------------------------
def plotCorrelations(self):
corr = abs(self.dataframe.corr()) # correlation matrix
lower_triangle = np.tril(corr, k = -1) # select only the lower triangle of the correlation matrix
mask = lower_triangle == 0 # to mask the upper triangle in the following heatmap
plt.figure(figsize = (15,8)) # setting the figure size
sns.set_style(style = 'white') # Setting it to white so that we do not see the grid lines
# cmap = "Blues"
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(lower_triangle, center=0.5, cmap=cmap, annot= True, xticklabels = corr.index, yticklabels = corr.columns,
cbar= False, linewidths= 1, mask = mask) # draw the heatmap of the lower triangle
plt.xticks(rotation = 90) # Aesthetic purposes
plt.yticks(rotation = 0) # Aesthetic purposes
plt.show()
# ------------------------------------------------------------------------------------------------------------------------------------
def generateNumericalBinsTable(self, feature, numberOfBins):
max = self.dataframe[feature].max()
min = self.dataframe[feature].min()
binSize = (max - min) / numberOfBins
bins = [min]
for i in range(1,numberOfBins):
bins.append(min + i*binSize)
bins.append(max)
self.dataframe[feature + "-bin"] = pd.cut(x=self.dataframe[feature], bins=bins, labels=bins[:-1])
if self.isCategoricalTarget == True:
table = self.dataframe[self.dataframe[self.targetFeature] == True][feature + "-bin"].value_counts() / self.dataframe[feature + "-bin"].value_counts()
else:
table = self.dataframe[feature + "-bin"].value_counts()
self.dataframe.drop(feature + "-bin", axis=1, inplace=True) # remove the temporary bin column
return table
# ------------------------------------------------------------------------------------------------------------------------------------
def numericalFeatureSummary(self):
markdown = "<hr><H2>Numerical Feature Summary</H2><hr>"
for feature in self.numericalFeatures:
markdown = markdown + "<H3>" + feature + "</H3>"
markdown = markdown + "<table style=\"width: 100%; background-color:#ffffff\"><tr><td style=\"width:25%; padding-right: 20px;\">"
markdown = markdown + self.generateDescribeStats(feature) + "</td>"
markdown = markdown + "<td style=\"width:40%; padding-right: 20px;\">" + self.generateAdditionalStats(feature) + "</td>"
markdown = markdown + "<td>" + self.generateCorrStats(feature) + "</td></tr></table>"
display(HTML("<style>.output_result { height:auto !important; max-height:500px; } </style>" + markdown))
# provide a report with data tables and plots for a given numerical feature; pass plot=True to also draw the plots -------------------
def analyzeNumericalFeature(self, feature, plot=False):
markdown = "<hr><H2>" + feature + " -- Numerical Feature Analysis </H2><hr><H3>Summary</H3>"
markdown = markdown + "<table style=\"width: 100%; background-color:#ffffff\"><tr><td style=\"width:25%; padding-right: 20px;\">"
markdown = markdown + self.generateDescribeStats(feature) + "</td>"
markdown = markdown + "<td style=\"width:40%; padding-right: 20px;\">" + self.generateAdditionalStats(feature) + "</td>"
markdown = markdown + "<td>" + self.generateCorrStats(feature) + "</td></tr></table>"
markdown = markdown + "<H3>Notes</H3>" + self.generateCategoricalNotes(feature)
markdown = markdown + "<H3>Plots with Numerical Features</H3>"
display(HTML("<style>.output_result { height:auto !important; max-height:500px; } </style>" + markdown))
if plot == True:
self.displayPlotsForNumericalFeature(feature, True)
# provide the correlations that a feature has with all of the other numerical features in the data set -------------------------------
def generateCorrStats(self, feature):
html = "<table style=\"width: 100%; border:1px black solid;\"><tr><th>Feature</th><th>Original Correlation</th><th>Adjusted Correlation</th></tr>"
for numericalFeature in self.numericalFeatures:
if (feature != numericalFeature):
corr = self.dataframe[feature].corr(self.dataframe[numericalFeature])
html = html + "<tr><td>" + numericalFeature + "</td><td>" + "{:.2f}".format(corr) + "</td><td>TODO</td></tr>"
html = html + "</table>"
return html
# provide summary statistics to assist with identifying data clean needs -------------------------------------------------------------
def generateAdditionalStats(self, feature):
html = "<table style=\"width: 100%; border:1px black solid;\"><tr><th>Statistic</th><th>Original Value</th><th>Adjusted Value</th>"
html = html + "<tr><td>Datatype</td><td>" + str(self.dataframe[feature].dtype) + "</td><td>" + "TODO" + "</td></tr>"
html = html + "<tr><td># of Null</td><td>" + str(self.dataframe[feature].isnull().sum()) + "</td><td>" + "TODO" + "</td></tr>"
html = html + "<tr><td># of 0</td><td>" + str(self.dataframe[self.dataframe[feature] ==0].shape[0]) + "</td><td>" + "TODO" + "</td></tr>"
html = html + "<tr><td># Less Than 0</td><td>" + str(self.dataframe[self.dataframe[feature] < 0].shape[0]) + "</td><td>" + "TODO" + "</td></tr>"
totalInstances = self.dataframe[feature].describe()["count"]
iqr = self.dataframe[feature].describe()["75%"] - self.dataframe[feature].describe()["25%"]
lowOutlierValue = self.dataframe[feature].describe()["25%"] - 1.5*iqr
numLowOutliers = self.dataframe[self.dataframe[feature] < lowOutlierValue][feature].count()
percentLowOutliers = 100*numLowOutliers/totalInstances
html = html + "<tr><td># of Low Outliers</td><td>" + str(numLowOutliers) + " (" + "{:.2f}".format(percentLowOutliers) + "%)</td>"
html = html + "<td>" + "TODO" + "</td></tr>"
highOutlierValue = self.dataframe[feature].describe()["75%"] + 1.5*iqr
numHighOutliers = self.dataframe[self.dataframe[feature] > highOutlierValue][feature].count()
percentHighOutliers = 100*numHighOutliers / totalInstances
html = html + "<tr><td># of High Outliers</td><td>" + str(numHighOutliers) + " (" + "{:.2f}".format(percentHighOutliers) + "%)</td>"
html = html + "<td>" + "TODO" + "</td></tr>"
html = html + "</table>"
return html
# provide summary statistics like df.describe for a numeric feature -----------------------------------------------------------------
def generateDescribeStats(self, feature):
describe = self.dataframe[feature].describe()
html = "<table style=\"width: 100%; border:1px black solid;\"><tr><th>Statistic</th><th>Original Value</th><th>Adjusted Value</th></tr>"
for stat in describe.index:
html = html + "<tr><td>" + stat + "</td><td>" + "{:.2f}".format(describe[stat]) + "</td><td>TODO</td></tr>"
html = html + "<tr><td>Skewness</td><td>" +"{:.2f}".format(self.dataframe[feature].skew()) + "</td><td>TODO</td></tr>"
corr = self.dataframe[feature].corr(self.dataframe[self.targetFeature])
html = html + "<tr><td>" + self.targetFeature + " Correlation</td><td>" +"{:.2f}".format(corr) + "</td><td>TODO</td></tr>"
html = html + "</table>"
return html
# Plot the relationship between a numerical feature and other numerical features, as well as plots with the target feature
def displayPlotsForNumericalFeature(self, featureToAnalyze, plotDensity):
# assumes the target is categorical, TODO add support for numerical targets
if featureToAnalyze in set(self.numericalFeatures):
adjustment = 1
else:
adjustment = 0
rows = int(np.floor((len(self.numericalFeatures)-adjustment)/2))
if np.mod(len(self.numericalFeatures)-adjustment, 2) > 0:
rows = int(rows + 1)
if plotDensity == True:
rows = rows + 1
fig, ax = plt.subplots(rows,2, figsize=(20,20))
if plotDensity == True:
# fig, ax = plt.subplots(1 ,2, figsize=(15,5))
ax[0][0] = sns.distplot(self.dataframe[featureToAnalyze], ax=ax[0][0])
# NUMERICAL / CATEGORICAL SWITCH TODO
if self.isCategoricalTarget == True:
ax[0][1] = sns.distplot(self.dataframe[self.dataframe[self.targetFeature]==False][featureToAnalyze],color='r',label=0, ax=ax[0][1])
ax[0][1] = sns.distplot(self.dataframe[self.dataframe[self.targetFeature]==True][featureToAnalyze],color='g',label=1, ax=ax[0][1])
else:
ax[0][1] = sns.regplot(x=featureToAnalyze, y=self.targetFeature, data=self.dataframe, ax=ax[0][1])
handles, labels = ax[0][1].get_legend_handles_labels()
ax[0][1].legend(handles, labels)
plt.close()
i = 0
c = 0
for feature in self.numericalFeatures:
if feature != featureToAnalyze:
r = int(np.floor(i/2)) + 1
c = int(np.mod(i, 2))
ax[r][c] = sns.regplot(x=feature, y=featureToAnalyze, data=self.dataframe, ax=ax[r][c])
plt.close()
i = i + 1
for a in fig.axes:
plt.sca(a)
plt.xticks(rotation=90)
fig.subplots_adjust(hspace=0.5, wspace=0.1)
fig.show()
# ----------------------------------------------------------------------------------------------------------------------------------
def displayDensityPlotsForNumericalFeatures(self, featuresToAnalyze):
# assumes the target is categorical, TODO add support for numerical targets
for featureToAnalyze in featuresToAnalyze:
fig, ax = plt.subplots(1 ,2, figsize=(15,5))
ax[0] = sns.distplot(self.dataframe[featureToAnalyze], ax=ax[0])
ax[1] = sns.distplot(self.dataframe[self.dataframe[self.targetFeature]==False][featureToAnalyze],color='r',label="Target 0", ax=ax[1])
ax[1] = sns.distplot(self.dataframe[self.dataframe[self.targetFeature]==True][featureToAnalyze],color='g',label="Target 1", ax=ax[1])
handles, labels = ax[1].get_legend_handles_labels()
ax[1].legend(handles, labels)
fig.show()
# --------------------------------------------------------------------------------------------------------------------------------
def analyzeSkew(self):
skews = pd.DataFrame()
skews["OriginalSkew"] = self.dataframe[self.numericalFeatures].skew()
for feature in self.numericalFeatures:
minimum = self.dataframe[feature].min()
if minimum > 0:
minimum = 0
skews.at[feature, "LnSkew"] = self.dataframe[feature].apply(lambda x: np.log(x-minimum +1 )).skew()
skews.at[feature, "SqRtSkew"] = self.dataframe[feature].apply(lambda x: np.sqrt(x-minimum +1 )).skew()
return skews
# Categorical Feature Methods ----------------------------------------------------------------------------------------------------
def categoricalFeatureSummary(self):
markdown = "<hr><H2>Categorical Feature Summary</H2><hr>"
for feature in self.categoricalFeatures:
markdown = markdown + "<H3>" + feature + "</H3>"
markdown = markdown + "<table style=\"width: 100%; background-color:#ffffff\"><tr><td style=\"width:30%; padding-right: 20px;\">"
markdown = markdown + self.generateCategoricalSummaryStats(feature) + "</td>"
markdown = markdown + "<td style=\"width:36%; padding-right: 20px;\">" + self.generateContingencyTable(feature) + "</td>"
markdown = markdown + "<td>" + self.generateValueCountsTable(feature) + "</td></tr></table>"
display(HTML("<style>.output_result { height:auto !important; max-height:500px; } </style>" + markdown))
# provide a report with data tables and plots for a given categorical feature; pass plot=True to also draw the plots -----------------
def analyzeCategoricalFeature(self, feature, plot=False):
markdown = "<hr><H2>" + feature + " -- Categorical Feature Analysis </H2><hr><H3>Summary</H3>"
markdown = markdown + "<table style=\"width: 100%; background-color:#ffffff\"><tr><td style=\"width:30%; padding-right: 20px;\">"
markdown = markdown + self.generateCategoricalSummaryStats(feature) + "</td>"
markdown = markdown + "<td style=\"width:36%; padding-right: 20px;\">" + self.generateContingencyTable(feature) + "</td>"
markdown = markdown + "<td>" + self.generateValueCountsTable(feature) + "</td></tr></table>"
markdown = markdown + "<H3>Notes</H3>" + self.generateCategoricalNotes(feature)
markdown = markdown + "<H3>Plots with Numerical Features</H3>"
display(HTML("<style>.output_result { height:auto !important; max-height:500px; } </style>" + markdown))
if plot == True:
self.displayPlotsWithNumericalFeatures(feature)
# ---------------------------------------------------------------------------------------------------------------------------------
def generateCategoricalNotes(self,feature):
return "TODO"
# ---------------------------------------------------------------------------------------------------------------------------------
def generateCategoricalSummaryStats(self, feature):
# markdown = "<table style=\"width: 100%; border:1px black solid;\"><tr><th>Value</th><th>" + tf + " % True </th><th>" + tf + " % False</th></tr>"
return "TODO"
# provide the contingency table for a categorical feature ------------------------------------------------------------------------
def generateContingencyTable(self, feature):
crosstab = pd.crosstab(self.dataframe[feature], self.dataframe[self.targetFeature], normalize="index")
tf = self.targetFeature
markdown = "<table style=\"width: 100%; border:1px black solid;\"><tr><th>Value</th><th>" + tf + " % True </th><th>" + tf + " % False</th></tr>"
for index, row in crosstab.iterrows():
markdown = markdown + "<tr><td>" + str(index) + "</td><td>" + "{:.1f}".format(100*row[True]) + "</td><td>" + "{:.1f}".format(100*row[False]) + "</td></tr>"
markdown = markdown + "</table>"
return markdown
# subsample the data via random sampling to make plotting of a numerical feature faster while still insightful
def prepareNumericalFeatureSample(self, feature):
# check if in numerical TODO
sample = self.dataframe.sample(min(1000, int(self.dataframe.shape[0]/10)))
sample = sample[np.abs(sample[feature] - sample[feature].mean())/sample[feature].std() < 3]
return sample
# plot categorical variable relationships with all numerical features ----------------------------------------------------------------
def displayPlotsWithNumericalFeatures(self, categoricalFeature):
rows = int(np.floor(len(self.numericalFeatures)/2))
if np.mod(len(self.numericalFeatures), 2) > 0:
rows = int(rows + 1)
fig, ax = plt.subplots(rows,2, figsize=(20,20))
i = 0
c = 0
for feature in self.numericalFeatures:
r = int(np.floor(i/2))
c = int(np.mod(i, 2))
ax[r][c] = sns.swarmplot(x=categoricalFeature, y=feature, hue=self.targetFeature, data=self.prepareNumericalFeatureSample(feature), ax=ax[r][c])
plt.close()
i = i + 1
for a in fig.axes:
plt.sca(a)
plt.xticks(rotation=90)
fig.subplots_adjust(hspace=0.5, wspace=0.1)
fig.show()
# compute the counts for categorical variable values -------------------------------------------------------------------------------
def generateValueCountsTable(self, feature):
value_counts = self.dataframe[feature].value_counts()
totalInstances = value_counts.sum()
markdown = "<table style=\"width: 100%; border:1px black solid; \"><tr><th>Value</th><th>Count</th><th>% of Total</th></tr>"
for itype in value_counts.index:
markdown = markdown + "<tr><td>" + str(itype) + "</td><td>" + str(value_counts[itype]) + "</td>"
markdown = markdown + "<td>" + "{:.1f}".format(100*value_counts[itype]/totalInstances) + "</td></tr>"
markdown = markdown + "</table>"
return markdown
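To give a sense of how this helper is intended to be driven, here is a minimal, hypothetical usage sketch; the file name, column names, and target are illustrative assumptions, not part of the class above.
# Hypothetical usage sketch for ExploratoryDataAnalysis (file and column names are assumed for illustration)
df = pd.read_csv("customer_data.csv")
df["Exited"] = df["Exited"].astype(bool)  # several methods compare the target against True/False
eda = ExploratoryDataAnalysis(df)
eda.setNumericalFeatures(["Age", "Balance", "EstimatedSalary"])
eda.setCategoricalFeatures(["Geography", "Gender"])
eda.setTargetFeature("Exited", isCategorical=True)
eda.summary()                         # null percentages, dtypes, and unique counts per feature
display(eda.VIFSummary())             # multicollinearity check across the numerical features
eda.numericalFeatureSummary()         # describe-style tables for every numerical feature
eda.categoricalFeatureSummary()       # contingency and value-count tables for every categorical feature
eda.analyzeNumericalFeature("Balance", plot=True)   # deep dive on a single feature, with plots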