Complete Machine Learning Workflow: From Data to Deployment
- Published on
- • 5 mins read•--- views
Complete Machine Learning Workflow
Building a machine learning model is more than just training an algorithm. This guide walks through the complete workflow from problem definition to deployment.
The ML Pipeline
# Complete ML Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Create pipeline
# Chaining the scaler and the classifier means the scaler is fitted only on
# the data passed to fit() and is re-applied automatically inside predict().
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # Fixed seed so repeated runs build the same forest, matching the other
    # seeded estimators used throughout this guide.
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Train (X_train / y_train are created in the Train-Test Split step)
pipeline.fit(X_train, y_train)

# Predict
predictions = pipeline.predict(X_test)
1. Problem Definition
Business Understanding
- What business problem are we solving?
- What are the success metrics?
- What are the constraints?
ML Formulation
- Classification, Regression, or Clustering?
- What features do we need?
- What's the target variable?
2. Data Collection and Preparation
Data Loading
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('data.csv')

# Initial exploration
print(df.shape)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe())
Train-Test Split
from sklearn.model_selection import train_test_split
# Split features and target
# X keeps every column except 'target'; y is the 'target' column itself
X = df.drop('target', axis=1)
y = df['target']
# Create train/test split
# test_size=0.2 holds out 20% of the rows, random_state=42 fixes the shuffle,
# and stratify=y keeps the class proportions of y the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Report the (rows, columns) shape of each split
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
3. Feature Engineering
Numerical Features
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Columns to scale: every numeric column of the training features
numerical_cols = X_train.select_dtypes(include='number').columns.tolist()

# Standardization (mean=0, std=1)
# Fit on the training split only, then reuse the fitted statistics on the
# test split so no information from the test data leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled = scaler.transform(X_test[numerical_cols])

# Normalization (range 0-1)
# Same rule: fit on train, only transform test.
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X_train[numerical_cols])
X_test_normalized = normalizer.transform(X_test[numerical_cols])
Categorical Features
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

# Ordinal Encoding (for ordinal data)
# LabelEncoder is intended for target labels and numbers the classes in sorted
# (alphabetical) order, which scrambles the real ranking of an ordinal feature.
# OrdinalEncoder takes the ranking explicitly.
# Example ordering, lowest to highest - replace with the levels in your data;
# a value missing from this list makes fit_transform raise ValueError.
education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
ordinal_encoder = OrdinalEncoder(categories=[education_levels])
# OrdinalEncoder works on 2-D input, hence the double brackets and ravel()
df['education_encoded'] = ordinal_encoder.fit_transform(df[['education']]).ravel()

# One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=['category'], prefix='cat')

# Using sklearn
from sklearn.compose import ColumnTransformer

# Categorical columns: every string/categorical column of the training features
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# numerical_cols is the list of numeric column names used in the
# Numerical Features step.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # drop='first' removes one dummy column per feature to avoid redundancy
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
    ])
Feature Creation
# Polynomial features
from sklearn.preprocessing import PolynomialFeatures
# degree=2 without the bias column: the two features, their squares and their product
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X[['feature1', 'feature2']])

# Interaction features
df['feature_interaction'] = df['feature1'] * df['feature2']

# Binning
# include_lowest=True makes the first bin [0, 18] instead of (0, 18], so an
# age of exactly 0 gets a label rather than silently becoming NaN.
# Ages above 100 still fall outside every bin and become NaN.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 18, 35, 50, 100],
    labels=['Teen', 'Young Adult', 'Adult', 'Senior'],
    include_lowest=True,
)
4. Model Selection
Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
}

# Train and evaluate
# Candidates are compared with 5-fold cross-validation on the training data.
# Picking a winner by its test-set score would tune the choice to the test set
# and leave no unbiased data for the final evaluation.
results = {}
for name, model in models.items():
    cv_scores = cross_val_score(
        model, X_train_scaled, y_train, cv=5, scoring='accuracy'
    )
    score = cv_scores.mean()
    results[name] = score
    print(f"{name}: {score:.4f}")
    # cross_val_score only fits clones; fit the model itself on the full
    # training split so it is ready to use after the loop.
    model.fit(X_train_scaled, y_train)
Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): these regressors presumably expect a continuous y_train; the
# split shown earlier uses stratify=y, which suits classification - confirm
# the target type before reusing that split here.
# Ridge Regression (L2 regularization)
# alpha sets the regularization strength
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
# Lasso Regression (L1 regularization)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
# Random Forest Regressor
# 100 trees, seeded for reproducible results
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)
5. Model Training and Hyperparameter Tuning
Cross-Validation
from sklearn.model_selection import cross_val_score, KFold
# K-Fold Cross-Validation
# 5 shuffled folds with a fixed seed, so the same partition is produced each run
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# NOTE(review): `model` is whichever estimator is currently bound to that name;
# it must accept the columns of X_train as-is - confirm before running.
# One accuracy value is returned per fold.
scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
# Mean accuracy across folds, with the standard deviation as a spread indicator
print(f"Mean: {scores.mean():.4f} (+/- {scores.std():.4f})")
Grid Search
from sklearn.model_selection import GridSearchCV
# Define parameter grid
# 3 * 4 * 3 * 3 = 108 combinations; with cv=5 below that is 540 model fits
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [10, 20, 30, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
# Grid search
# Exhaustively tries every combination in param_grid with 5-fold
# cross-validation, scored by accuracy; n_jobs=-1 uses all available cores
grid_search = GridSearchCV(
RandomForestClassifier(random_state=42),
param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train, y_train)
# best_score_ is the mean cross-validated accuracy of the best combination
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
Random Search
from sklearn.model_selection import RandomizedSearchCV
# `uniform` is imported but not used in this snippet
from scipy.stats import randint, uniform
# Define parameter distributions
# scipy's randint(low, high) samples integers from low up to high - 1,
# so n_estimators ranges over 50..199
param_dist = {
'n_estimators': randint(50, 200),
'max_depth': randint(10, 50),
'min_samples_split': randint(2, 20),
'min_samples_leaf': randint(1, 10)
}
# Random search
# Samples n_iter=50 parameter settings instead of trying every combination;
# with cv=5 that is 250 model fits. random_state=42 fixes which settings are drawn.
random_search = RandomizedSearchCV(
RandomForestClassifier(random_state=42),
param_distributions=param_dist,
n_iter=50,
cv=5,
scoring='accuracy',
n_jobs=-1,
random_state=42
)
random_search.fit(X_train, y_train)
6. Model Evaluation
Classification Metrics
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report, roc_auc_score, roc_curve
)
# Predictions
# NOTE(review): `model` must have been fitted on data preprocessed the same way
# as X_test (same columns, same scaling) - confirm which estimator is meant here.
y_pred = model.predict(X_test)
# Column index 1 of predict_proba is taken as the positive-class probability;
# presumably the target is binary - verify for your data.
y_pred_proba = model.predict_proba(X_test)[:, 1]
# Basic metrics
# precision/recall/F1 are called with their default arguments here
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")
# Confusion Matrix
# Rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)
# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# ROC-AUC
# Computed from the predicted probabilities, not the hard class predictions
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC-AUC Score: {roc_auc:.4f}")
Regression Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# NOTE(review): `model` here is presumably a fitted regressor - confirm.
y_pred = model.predict(X_test)
# RMSE is the square root of MSE, which puts the error back in the target's units
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Print each metric to four decimal places
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")
Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Confusion Matrix Heatmap
# Uses `cm` from the Classification Metrics snippet; fmt='d' prints the cell
# counts as integers
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
# ROC Curve
# Uses `y_pred_proba` and `roc_auc` from the Classification Metrics snippet
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Dashed diagonal: reference line labelled 'Random'
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
# Feature Importance
# Only estimators that expose feature_importances_ (e.g. tree ensembles) are plotted
if hasattr(model, 'feature_importances_'):
    # Pair every training column with its importance, highest first
    ranked = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_,
    })
    importance = ranked.sort_values('importance', ascending=False)
    top_ten = importance.head(10)

    plt.figure(figsize=(10, 6))
    plt.barh(top_ten['feature'], top_ten['importance'])
    plt.xlabel('Importance')
    plt.title('Top 10 Feature Importances')
    # Flip the axis so the most important feature is drawn at the top
    plt.gca().invert_yaxis()
    plt.show()
7. Model Saving and Loading
import joblib
import pickle

# Using joblib (recommended for sklearn)
joblib.dump(model, 'model.joblib')
loaded_model = joblib.load('model.joblib')

# Using pickle
# NOTE: only load pickle/joblib files you created yourself - unpickling a file
# from an untrusted source can execute arbitrary code.
with open('model.pkl', 'wb') as out_file:
    pickle.dump(model, out_file)
with open('model.pkl', 'rb') as in_file:
    loaded_model = pickle.load(in_file)

# Save preprocessing objects
# The fitted scaler/preprocessor must be shipped with the model so that
# inference applies the same transformation as training.
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(preprocessor, 'preprocessor.joblib')
8. Model Deployment
Flask API
from flask import Flask, request, jsonify
import joblib
import pandas as pd  # needed for pd.DataFrame below

app = Flask(__name__)

# Load model
# Loaded once at start-up so each request only pays for the prediction
model = joblib.load('model.joblib')
scaler = joblib.load('scaler.joblib')


@app.route('/predict', methods=['POST'])
def predict():
    """Return the model's prediction for one JSON object of feature values.

    Expects a JSON object mapping feature names to values. Responds with
    ``{"prediction": <int>}`` on success, or ``{"error": ...}`` with HTTP 400
    when the body is not a JSON object or the features cannot be scaled.
    """
    # silent=True returns None instead of raising on a missing or malformed body
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object of feature values'}), 400

    features = pd.DataFrame([data])
    try:
        features_scaled = scaler.transform(features)
    except (ValueError, TypeError):
        # Wrong/missing feature names or non-numeric values are a client error
        return jsonify({'error': 'Features do not match what the model expects'}), 400

    prediction = model.predict(features_scaled)
    return jsonify({'prediction': int(prediction[0])})


if __name__ == '__main__':
    # debug=True enables the interactive debugger, which allows arbitrary code
    # execution from the browser - never enable it on a deployed service.
    app.run(debug=False)
FastAPI (Modern Alternative)
from fastapi import FastAPI
from pydantic import BaseModel
import joblib

app = FastAPI()

# Load the model together with the scaler saved alongside it. The Flask example
# scales inputs before predicting with this same model file, so the same
# transformation is applied here to keep both endpoints consistent.
model = joblib.load('model.joblib')
scaler = joblib.load('scaler.joblib')


class PredictionInput(BaseModel):
    """Request body for /predict: one float per model feature."""

    feature1: float
    feature2: float
    feature3: float


@app.post('/predict')
def predict(input_data: PredictionInput):
    """Scale the submitted features and return the predicted class as an int."""
    # A single row, with features in the order the scaler was fitted on
    # NOTE(review): assumes the scaler was fitted on exactly these three
    # features in this order - confirm against the training code.
    features = [[input_data.feature1, input_data.feature2, input_data.feature3]]
    features_scaled = scaler.transform(features)
    prediction = model.predict(features_scaled)
    return {'prediction': int(prediction[0])}
Conclusion
This workflow provides a solid foundation for any machine learning project. Remember to:
- Always start with exploratory data analysis
- Use cross-validation to get reliable performance estimates and to detect overfitting
- Try multiple models and compare
- Focus on the right metrics for your problem
- Document your process
- Monitor model performance in production
Happy modeling! 🤖📈
Table of Contents
- Complete Machine Learning Workflow
- The ML Pipeline
- 1. Problem Definition
- Business Understanding
- ML Formulation
- 2. Data Collection and Preparation
- Data Loading
- Train-Test Split
- 3. Feature Engineering
- Numerical Features
- Categorical Features
- Feature Creation
- 4. Model Selection
- Classification Models
- Regression Models
- 5. Model Training and Hyperparameter Tuning
- Cross-Validation
- Grid Search
- Random Search
- 6. Model Evaluation
- Classification Metrics
- Regression Metrics
- Visualization
- 7. Model Saving and Loading
- 8. Model Deployment
- Flask API
- FastAPI (Modern Alternative)
- Conclusion