Customer Segmentation using K-Means Clustering
1. Setup and Imports
python
# Install required libraries
# pip install pandas numpy matplotlib seaborn scikit-learn plotly
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence library warnings so the tutorial output stays readable.
# NOTE: this is a blanket filter, so deprecation notices are hidden as well.
warnings.filterwarnings('ignore')
# Set style
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    # Matplotlib releases before 3.6 ship this style sheet under its
    # pre-rename name; fall back instead of aborting the whole script.
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")
2. Load or Create Sample Dataset
python
# Generate a synthetic sample dataset (swap in pd.read_csv(...) here to load real customer data)
def generate_customer_data(n_customers=2000):
    """Build a synthetic customer table drawn from four predefined segments.

    The global NumPy random generator is re-seeded with 42 on every call, so
    repeated calls with the same ``n_customers`` return the same data.

    Args:
        n_customers: Target number of customers. Each segment receives
            ``int(n_customers * size)`` rows, so the total can fall slightly
            short of the target when the products are not whole numbers.

    Returns:
        pandas.DataFrame with one row per customer. ``Annual_Income_k$`` is
        expressed in thousands of dollars, matching the column name.
    """
    np.random.seed(42)
    # Customer segments characteristics: population share plus (low, high)
    # bounds; income bounds are given in dollars.
    segments = {
        'Premium': {'size': 0.15, 'income': (80000, 150000), 'spending': (70, 100),
                    'frequency': (25, 40), 'age': (35, 55)},
        'Regular': {'size': 0.35, 'income': (50000, 80000), 'spending': (40, 70),
                    'frequency': (10, 25), 'age': (30, 50)},
        'Budget': {'size': 0.30, 'income': (25000, 50000), 'spending': (20, 40),
                   'frequency': (5, 15), 'age': (25, 45)},
        'Occasional': {'size': 0.20, 'income': (30000, 60000), 'spending': (10, 30),
                       'frequency': (1, 8), 'age': (20, 40)},
    }
    data = []
    for segment, params in segments.items():
        n = int(n_customers * params['size'])
        income = np.random.normal(np.mean(params['income']),
                                  np.std(params['income']) / 3, n)
        spending_score = np.random.normal(np.mean(params['spending']), 10, n)
        frequency = np.random.normal(np.mean(params['frequency']), 5, n)
        age = np.random.normal(np.mean(params['age']), 8, n)
        # Clip values to realistic ranges
        income = np.clip(income, params['income'][0], params['income'][1])
        # The column is labelled in k$, so convert dollars to thousands.
        # Without this, downstream thresholds such as "Avg_Income > 70" and
        # the "$...k" report formatting operate on raw dollar amounts.
        income = income / 1000.0
        spending_score = np.clip(spending_score, 0, 100)
        frequency = np.clip(frequency, 0, 50)
        age = np.clip(age, 18, 70)
        # The per-row draws below are kept in their original order so the
        # random stream (and every non-income column) is unchanged.
        for inc, score, freq, years in zip(income, spending_score, frequency, age):
            data.append({
                'CustomerID': f'CUST_{len(data)+1:04d}',
                'Segment_Actual': segment,
                'Annual_Income_k$': inc,
                'Spending_Score': score,
                'Purchase_Frequency': freq,
                'Age': years,
                'Tenure_Months': np.random.randint(1, 60),
                'Total_Purchases': np.random.randint(1, 100),
                'Avg_Transaction_Value': np.random.uniform(20, 200),
                'Returns_Count': np.random.poisson(1 if segment == 'Budget' else 0.5),
                'Channel_Preference': np.random.choice(['Online', 'Store', 'Mobile'],
                                                       p=[0.5, 0.3, 0.2]),
            })
    return pd.DataFrame(data)
# Generate data
df = generate_customer_data(2000)
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line.
df.info()
print("\nBasic Statistics:")
print(df.describe())
3. Exploratory Data Analysis (EDA)
python
# FIGURE 1: Distribution of Key Variables
# Each entry: (dataframe column, bar colour, trace name, grid row, grid col).
histogram_panels = (
    ('Annual_Income_k$', '#E63946', 'Income', 1, 1),
    ('Spending_Score', '#457B9D', 'Spending', 1, 2),
    ('Age', '#2A9D8F', 'Age', 2, 1),
    ('Purchase_Frequency', '#E9C46A', 'Frequency', 2, 2),
)
fig1 = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        'Annual Income Distribution',
        'Spending Score Distribution',
        'Age Distribution',
        'Purchase Frequency',
    ),
)
for column, bar_color, trace_name, grid_row, grid_col in histogram_panels:
    fig1.add_trace(
        go.Histogram(x=df[column], nbinsx=30, marker_color=bar_color, name=trace_name),
        row=grid_row,
        col=grid_col,
    )
fig1.update_layout(
    height=800,
    showlegend=False,
    title_text="Customer Demographics Distribution",
    title_font_size=20,
)
fig1.show()
# Save as: 'images/customer_distributions.png'
# FIGURE 2: Correlation Heatmap
# Pairwise correlations between the numeric customer attributes.
numeric_cols = [
    'Annual_Income_k$',
    'Spending_Score',
    'Purchase_Frequency',
    'Age',
    'Tenure_Months',
    'Total_Purchases',
    'Avg_Transaction_Value',
]
corr_matrix = df[numeric_cols].corr()
heatmap_options = dict(
    text_auto=True,
    color_continuous_scale='RdBu_r',
    title="Feature Correlation Matrix",
)
fig2 = px.imshow(corr_matrix, **heatmap_options)
fig2.update_layout(height=600, title_font_size=18)
fig2.show()
# Save as: 'images/correlation_heatmap.png'
4. Feature Selection and Preprocessing
python
# Select features for clustering
features = [
    'Annual_Income_k$',
    'Spending_Score',
    'Purchase_Frequency',
    'Age',
    'Tenure_Months',
    'Avg_Transaction_Value',
]
X = df.loc[:, features].copy()
# Handle missing values (if any) by imputing each column with its own mean
column_means = X.mean()
X = X.fillna(column_means)
# Standardize the features (zero mean, unit variance per column)
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
# Alternative: MinMaxScaler for bounded ranges
# scaler = MinMaxScaler()
# X_scaled = scaler.fit_transform(X)
print("Features scaled successfully!")
print(f"Scaled data shape: {X_scaled.shape}")
5. Finding Optimal Number of Clusters
python
# FIGURE 3: Elbow Method and Silhouette Score
def plot_elbow_and_silhouette(X_scaled, max_clusters=10):
    """Fit K-Means for k = 2..max_clusters and plot inertia and silhouette.

    Args:
        X_scaled: Feature matrix passed to ``KMeans.fit`` and
            ``silhouette_score`` unchanged.
        max_clusters: Largest k to try (inclusive).

    Returns:
        Tuple ``(inertias, silhouette_scores)``: two lists with one entry per
        k, starting at k = 2.
    """
    candidate_ks = list(range(2, max_clusters + 1))
    inertias, silhouette_scores = [], []
    for n_clusters in candidate_ks:
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        model.fit(X_scaled)
        inertias.append(model.inertia_)
        silhouette_scores.append(silhouette_score(X_scaled, model.labels_))

    # Left panel: elbow curve; right panel: silhouette curve.
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Elbow Method', 'Silhouette Score'))
    curves = (
        (inertias, '#E63946', 'Inertia', 1),
        (silhouette_scores, '#2A9D8F', 'Silhouette', 2),
    )
    for y_values, line_color, legend_name, panel_col in curves:
        fig.add_trace(
            go.Scatter(
                x=candidate_ks,
                y=y_values,
                mode='lines+markers',
                marker=dict(size=10, color=line_color),
                name=legend_name,
            ),
            row=1,
            col=panel_col,
        )
    fig.update_layout(
        height=500,
        title_text="Determining Optimal Number of Clusters",
        showlegend=True,
        title_font_size=18,
    )
    fig.show()
    return inertias, silhouette_scores
inertias, silhouette_scores = plot_elbow_and_silhouette(X_scaled, max_clusters=10)
# Find best k based on silhouette score.
# The sweep starts at k = 2, so list position p corresponds to k = p + 2.
best_k = 2 + np.argmax(silhouette_scores)
best_silhouette = max(silhouette_scores)
print(f"\n📊 Optimal number of clusters: {best_k}")
print(f"Best silhouette score: {best_silhouette:.3f}")
6. Apply K-Means Clustering
python
# Apply K-Means with optimal clusters
k = 4  # Based on elbow method and business logic
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
df['Cluster'] = kmeans.labels_
# Get cluster centers, mapped back from standardized units to original units
centers = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_centers = pd.DataFrame(centers, columns=features)
cluster_centers['Cluster'] = range(k)
print("Cluster Centers (original scale):")
print(cluster_centers)
# Evaluate clustering quality
cluster_labels = df['Cluster']
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
davies_bouldin = davies_bouldin_score(X_scaled, cluster_labels)
calinski_harabasz = calinski_harabasz_score(X_scaled, cluster_labels)
print("\n📈 Clustering Evaluation Metrics:")
for metric_label, metric_value in (
    ("Silhouette Score", silhouette_avg),
    ("Davies-Bouldin Index", davies_bouldin),
    ("Calinski-Harabasz Index", calinski_harabasz),
):
    print(f"{metric_label}: {metric_value:.3f}")
7. Visualize Clusters
python
# FIGURE 4: 2D PCA Visualization
# Project the scaled features onto two principal components for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df['PCA1'], df['PCA2'] = X_pca[:, 0], X_pca[:, 1]
pca_hover_columns = ['CustomerID', 'Annual_Income_k$', 'Spending_Score']
fig4 = px.scatter(
    df,
    x='PCA1',
    y='PCA2',
    color='Cluster',
    title=f'Customer Segments - PCA Visualization (k={k})',
    labels={'Cluster': 'Segment'},
    color_continuous_scale='Viridis',
    hover_data=pca_hover_columns,
)
fig4.update_layout(height=600, title_font_size=18)
fig4.show()
# Save as: 'images/pca_clusters.png'
# FIGURE 5: Income vs Spending Score (Most Common)
# Marker size encodes purchase frequency; colour encodes the assigned cluster.
axis_labels = {
    'Annual_Income_k$': 'Annual Income (k$)',
    'Spending_Score': 'Spending Score (1-100)',
}
fig5 = px.scatter(
    df,
    x='Annual_Income_k$',
    y='Spending_Score',
    color='Cluster',
    size='Purchase_Frequency',
    title='Customer Segmentation: Income vs Spending Score',
    labels=axis_labels,
    hover_data=['Age', 'Purchase_Frequency'],
    color_continuous_scale='Viridis',
)
fig5.update_layout(height=600, title_font_size=18)
fig5.show()
# Save as: 'images/income_vs_spending.png'
# FIGURE 6: Radar Chart for Cluster Profiles
def plot_cluster_radar(cluster_centers, features):
    """Show one radar (polar) trace per row of ``cluster_centers``.

    Args:
        cluster_centers: DataFrame holding the ``features`` columns, one row
            per cluster.
        features: Column names used as the radar axes.

    The selected columns are min-max scaled column-wise so every axis spans
    0..1. Traces are named by row position ("Cluster 0", "Cluster 1", ...).
    """
    # Normalize features for radar chart
    from sklearn.preprocessing import MinMaxScaler
    unit_scaled_centers = MinMaxScaler().fit_transform(cluster_centers[features])

    fig = go.Figure()
    for position, radii in enumerate(unit_scaled_centers):
        trace = go.Scatterpolar(
            r=radii,
            theta=features,
            fill='toself',
            name=f'Cluster {position}',
        )
        fig.add_trace(trace)
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        showlegend=True,
        title="Customer Segment Profiles (Radar Chart)",
        title_font_size=18,
        height=600,
    )
    fig.show()


plot_cluster_radar(cluster_centers, features)
# Save as: 'images/radar_chart_clusters.png'
8. Segment Profiling and Analysis
python
# FIGURE 7: Segment Characteristics - Heatmap
# Mean of every clustering feature per cluster (rows = clusters).
cluster_profile = df.groupby('Cluster')[features].mean()
# Normalise each feature (column) by its total across clusters so a cell
# shows that cluster's share of the feature. Dividing each row by its own sum
# instead would mix features measured in different units, and the
# largest-scale feature would dominate every row of the heatmap.
cluster_profile_pct = cluster_profile / cluster_profile.sum(axis=0)
fig7 = px.imshow(cluster_profile_pct,
                 title='Segment Characteristics (Relative Values)',
                 labels=dict(x="Features", y="Cluster", color="Relative Value"),
                 color_continuous_scale='Viridis',
                 aspect='auto')
fig7.update_layout(height=500, title_font_size=18)
fig7.show()
# Save as: 'images/segment_characteristics.png'
# FIGURE 8: Detailed Segment Analysis
# One box per cluster in each of four panels (income, spending, frequency, age).
fig8 = make_subplots(rows=2, cols=2,
                     subplot_titles=('Avg Annual Income by Segment',
                                     'Avg Spending Score by Segment',
                                     'Purchase Frequency by Segment',
                                     'Age Distribution by Segment'))
colors = ['#E63946', '#457B9D', '#2A9D8F', '#E9C46A']
# (dataframe column, grid row, grid col) for each panel, in drawing order.
box_panels = (
    ('Annual_Income_k$', 1, 1),
    ('Spending_Score', 1, 2),
    ('Purchase_Frequency', 2, 1),
    ('Age', 2, 2),
)
for i in range(k):
    cluster_data = df[df['Cluster'] == i]
    # Cycle through the palette so choosing k > len(colors) does not raise
    # IndexError.
    cluster_color = colors[i % len(colors)]
    for column, panel_row, panel_col in box_panels:
        fig8.add_trace(go.Box(y=cluster_data[column], name=f'Cluster {i}',
                              marker_color=cluster_color),
                       row=panel_row, col=panel_col)
fig8.update_layout(height=800, showlegend=False, title_text="Segments Distribution Analysis")
fig8.show()
# Save as: 'images/segment_distributions.png'
# Create segment profiles
def _describe_cluster(cluster_id):
    """Summarise the rows of ``df`` assigned to ``cluster_id`` as a dict."""
    members = df[df['Cluster'] == cluster_id]
    member_count = len(members)
    if member_count > 0:
        top_channel = members['Channel_Preference'].mode()[0]
    else:
        # An empty cluster has no mode to report.
        top_channel = 'N/A'
    return {
        'Segment': f'Segment {cluster_id}',
        'Size': member_count,
        'Size_Percentage': (member_count/len(df))*100,
        'Avg_Income': members['Annual_Income_k$'].mean(),
        'Avg_Spending': members['Spending_Score'].mean(),
        'Avg_Frequency': members['Purchase_Frequency'].mean(),
        'Avg_Age': members['Age'].mean(),
        'Top_Channel': top_channel,
    }


segment_profiles = [_describe_cluster(cluster_id) for cluster_id in range(k)]
segment_df = pd.DataFrame(segment_profiles)
print("\n📊 Customer Segment Profiles:")
print(segment_df.round(2))
9. Business Insights and Recommendations
python
# Generate business insights
def generate_business_insights(df, segment_profiles):
    """Derive headline insights and recommendations from clustered customers.

    Args:
        df: Customer DataFrame with ``Cluster``, ``Avg_Transaction_Value``,
            ``Purchase_Frequency``, ``Spending_Score``, ``Tenure_Months`` and
            ``Channel_Preference`` columns. A ``Customer_Value`` column is
            added to it in place.
        segment_profiles: Accepted for interface compatibility; not read.

    Returns:
        List of dicts with ``title``, ``insight`` and ``recommendation`` keys:
        three global insights followed by one channel insight per cluster.
    """
    insights = []
    # Identify high-value segments
    # NOTE: this writes a new column onto the caller's DataFrame.
    df['Customer_Value'] = df['Avg_Transaction_Value'] * df['Purchase_Frequency']
    high_value_cluster = df.groupby('Cluster')['Customer_Value'].mean().idxmax()
    insights.append({
        'title': '💰 High-Value Customers',
        'insight': f'Cluster {high_value_cluster} has the highest customer lifetime value',
        'recommendation': 'Implement VIP program with exclusive benefits and early access'
    })
    # Loyalty analysis
    loyal_cluster = df.groupby('Cluster')['Tenure_Months'].mean().idxmax()
    insights.append({
        'title': '🌟 Most Loyal Customers',
        'insight': f'Cluster {loyal_cluster} shows highest tenure and retention',
        'recommendation': 'Launch referral program and loyalty rewards'
    })
    # Cross-selling opportunity
    # groupby(...).corr() is indexed by (Cluster, feature). Selecting the
    # 'Purchase_Frequency' rows with xs() drops the feature level, leaving a
    # Series indexed by cluster only, so idxmax() yields a plain cluster label
    # rather than a (cluster, feature) tuple.
    pairwise_corr = df.groupby('Cluster')[['Purchase_Frequency', 'Spending_Score']].corr()
    freq_spend_corr = pairwise_corr.xs('Purchase_Frequency', level=1)['Spending_Score']
    cross_sell_cluster = freq_spend_corr.idxmax()
    insights.append({
        'title': '🔄 Cross-selling Opportunity',
        'insight': f'Cluster {cross_sell_cluster} shows strong correlation between frequency and spending',
        'recommendation': 'Implement product bundling and personalized recommendations'
    })
    # Channel preference: count customers per (cluster, channel) and take the
    # most common channel in each cluster.
    channel_analysis = df.groupby(['Cluster', 'Channel_Preference']).size().unstack(fill_value=0)
    for cluster in channel_analysis.index:
        top_channel = channel_analysis.loc[cluster].idxmax()
        insights.append({
            'title': f'📱 Channel Strategy - Cluster {cluster}',
            'insight': f'Preferred channel is {top_channel}',
            'recommendation': f'Optimize {top_channel} experience and personalize communications'
        })
    return insights
insights = generate_business_insights(df, segment_profiles)
# Visualize insights
# Grouped bars: one bar series per KPI, one group per segment.
kpi_series = (
    ('Avg_Income', 'Income (k$)', '#E63946'),
    ('Avg_Spending', 'Spending Score', '#457B9D'),
    ('Avg_Frequency', 'Frequency', '#2A9D8F'),
)
fig9 = go.Figure()
for kpi_column, legend_name, bar_color in kpi_series:
    fig9.add_trace(
        go.Bar(
            x=segment_df['Segment'],
            y=segment_df[kpi_column],
            name=legend_name,
            marker_color=bar_color,
        )
    )
fig9.update_layout(
    title='Segment Comparison: Key Metrics',
    xaxis_title='Customer Segment',
    yaxis_title='Value',
    barmode='group',
    height=500,
    title_font_size=18,
)
fig9.show()
# Save as: 'images/segment_comparison.png'
10. Marketing Strategy Recommendations
python
def generate_marketing_strategies(df, segment_df):
    """Assign a marketing playbook to every row of ``segment_df``.

    Args:
        df: Not read by this function.
        segment_df: DataFrame with ``Segment``, ``Avg_Income`` and
            ``Avg_Spending`` columns.

    Returns:
        pandas.DataFrame with ``Segment``, ``Type``, ``Strategy``,
        ``Channels`` and ``Budget_Allocation`` columns, one row per segment.
    """
    # tier -> (type label, strategy text, channels, budget share)
    playbooks = {
        'premium': (
            ' Premium Segment',
            'Exclusive experiences, personalized service, early access',
            ('Email', 'SMS', 'Personal Phone'),
            '35%',
        ),
        'regular': (
            ' Regular Segment',
            'Loyalty programs, bundle offers, seasonal promotions',
            ('Email', 'Mobile App', 'Social Media'),
            '30%',
        ),
        'value': (
            ' Value Segment',
            'Discount campaigns, flash sales, bulk purchase offers',
            ('Social Media', 'SMS', 'App Notifications'),
            '25%',
        ),
        'occasional': (
            ' Occasional Segment',
            'Re-engagement campaigns, first-purchase incentives',
            ('Email', 'Retargeting Ads', 'Push Notifications'),
            '10%',
        ),
    }

    def pick_tier(profile):
        # Rules are checked top-down; the first match wins.
        if profile['Avg_Income'] > 70 and profile['Avg_Spending'] > 70:
            return 'premium'
        if profile['Avg_Income'] > 50 and profile['Avg_Spending'] > 50:
            return 'regular'
        if profile['Avg_Income'] < 50 and profile['Avg_Spending'] > 40:
            return 'value'
        return 'occasional'

    strategies = []
    for _, profile in segment_df.iterrows():
        segment_name = profile['Segment']
        type_label, plan, channels, budget = playbooks[pick_tier(profile)]
        strategies.append({
            'Segment': segment_name,
            'Type': type_label,
            'Strategy': plan,
            # Fresh list per row so rows never share one mutable object.
            'Channels': list(channels),
            'Budget_Allocation': budget,
        })
    return pd.DataFrame(strategies)
marketing_strategies = generate_marketing_strategies(df, segment_df)
print("\n🎯 Recommended Marketing Strategies by Segment:")
print(marketing_strategies.to_string(index=False))
# FIGURE 10: Marketing Budget Allocation
# 'Budget_Allocation' holds display strings such as '35%'. Pie slices need
# numeric values, so plot from a copy carrying a numeric percentage column and
# leave the printed/exported table untouched.
budget_df = marketing_strategies.copy()
budget_df['Budget_Pct'] = budget_df['Budget_Allocation'].str.rstrip('%').astype(float)
fig10 = px.pie(budget_df,
               values='Budget_Pct',
               names='Segment',
               title='Marketing Budget Allocation by Segment',
               color_discrete_sequence=px.colors.sequential.Viridis_r,
               hole=0.3)
fig10.update_layout(height=500, title_font_size=18)
fig10.show()
# Save as: 'images/marketing_budget.png'
11. Interactive Dashboard with All Visualizations
python
def create_interactive_dashboard(df, segment_profiles):
    """Render a 3x3 dashboard figure; only the segment-size pie is filled in.

    Args:
        df: Customer DataFrame with a ``Cluster`` column.
        segment_profiles: Not read by this function.
    """
    # Summary metrics (computed here but not yet shown on the figure)
    customer_total = len(df)
    cluster_total = df['Cluster'].nunique()

    panel_titles = (
        'Segment Size Distribution', 'Income vs Spending',
        'Cluster Profiles', 'Purchase Frequency',
        'Channel Preferences', 'Age Distribution',
        'Customer Value by Segment', 'Segment KPIs', 'Recommendation Priority',
    )
    # One trace type per grid cell, row by row.
    panel_types = (
        ('pie', 'scatter', 'bar'),
        ('box', 'bar', 'box'),
        ('bar', 'indicator', 'table'),
    )
    panel_specs = [[{'type': trace_type} for trace_type in grid_row]
                   for grid_row in panel_types]

    # Create dashboard
    fig = make_subplots(rows=3, cols=3,
                        subplot_titles=panel_titles,
                        specs=panel_specs)
    # Add traces (simplified for demonstration)
    # Segment size - pie chart
    cluster_counts = df['Cluster'].value_counts()
    size_pie = go.Pie(labels=cluster_counts.index, values=cluster_counts.values)
    fig.add_trace(size_pie, row=1, col=1)
    # Add more traces as needed...
    fig.update_layout(height=900, showlegend=False,
                      title_text="Customer Segmentation Dashboard - Complete View")
    fig.show()


create_interactive_dashboard(df, segment_profiles)
12. Export and Save Results
python
# Save clustered data, segment profiles and marketing strategies, in that
# order, each as a CSV without the index column.
csv_exports = (
    (df, 'customer_segmentation_results.csv'),
    (segment_df, 'segment_profiles.csv'),
    (marketing_strategies, 'marketing_strategies.csv'),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)
# Generate summary report
def create_summary_report(df, segment_profiles, feature_names=None,
                          output_path='segmentation_report.txt'):
    """Build the text summary report, write it to disk and print it.

    Args:
        df: Customer DataFrame with a ``Cluster`` column.
        segment_profiles: DataFrame with ``Segment``, ``Size``,
            ``Size_Percentage``, ``Avg_Income``, ``Avg_Spending``,
            ``Avg_Frequency``, ``Avg_Age`` and ``Top_Channel`` columns.
        feature_names: Names listed under "Features Used". Defaults to the
            module-level ``features`` list.
        output_path: Destination file for the report.
    """
    if feature_names is None:
        # Fall back to the feature list the clustering step defined.
        feature_names = features
    report = f"""
========================================
CUSTOMER SEGMENTATION ANALYSIS REPORT
========================================
📊 DATASET OVERVIEW:
- Total Customers: {len(df)}
- Number of Segments: {df['Cluster'].nunique()}
- Features Used: {', '.join(feature_names)}
📈 SEGMENT COMPOSITION:
"""
    for _, row in segment_profiles.iterrows():
        report += f"""
{row['Segment']}:
- Size: {row['Size']} customers ({row['Size_Percentage']:.1f}%)
- Avg Income: ${row['Avg_Income']:.0f}k
- Avg Spending Score: {row['Avg_Spending']:.1f}/100
- Purchase Frequency: {row['Avg_Frequency']:.1f} purchases/month
- Avg Age: {row['Avg_Age']:.0f} years
- Preferred Channel: {row['Top_Channel']}
"""
    report += f"""
🎯 KEY INSIGHTS:
1. {segment_profiles.loc[segment_profiles['Avg_Income'].idxmax(), 'Segment']} has highest income
2. {segment_profiles.loc[segment_profiles['Avg_Spending'].idxmax(), 'Segment']} has highest spending
3. {segment_profiles.loc[segment_profiles['Avg_Frequency'].idxmax(), 'Segment']} has highest purchase frequency
💡 RECOMMENDATIONS:
1. Focus retention efforts on high-value segments
2. Personalize marketing messages based on segment characteristics
3. Optimize channel strategy for each segment's preferences
4. Implement A/B testing for segment-specific campaigns
"""
    # The report contains emoji; an explicit UTF-8 encoding keeps the write
    # from failing on platforms whose default encoding cannot represent them.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(report)
# Build the report from the clustered data and the per-segment profile table;
# the function writes it to 'segmentation_report.txt' and prints it.
create_summary_report(df, segment_df)
Visualization Guide Summary
| Visualization | Purpose | Best Location | Key Insight |
|---|---|---|---|
| Distribution Histograms | Understanding data spread | EDA Section | Feature distributions |
| Correlation Heatmap | Feature relationships | Preprocessing | Feature selection justification |
| Elbow Method Plot | Finding optimal k | Model Selection | Determine clusters |
| PCA Scatter Plot | Cluster visualization | Results | Segment separation |
| Income vs Spending | Market segmentation | Core Analysis | Customer patterns |
| Radar Chart | Segment profiling | Analysis | Segment characteristics |
| Box Plots | Distribution comparison | Validation | Segment differences |
| Bar Charts | KPI comparison | Business Insights | Performance metrics |
| Pie Charts | Size comparison | Summary | Segment proportions |
| Dashboard | Complete view | Executive Summary | Holistic insights |
Key Success Metrics
python
# Qualitative business goals to track once the segmentation is in use.
business_metrics = [
    'Customer Lifetime Value (CLV) increase',
    'Retention rate improvement',
    'Marketing ROI enhancement',
    'Cross-selling success rate',
]
# Clustering quality figures computed when the final model was fitted.
model_metrics = [
    f'Silhouette Score: {silhouette_avg:.3f}',
    f'Davies-Bouldin Index: {davies_bouldin:.3f}',
    f'Calinski-Harabasz Index: {calinski_harabasz:.3f}',
    f'Optimal Clusters: {k}',
]
success_metrics = {
    'Business Metrics': business_metrics,
    'Model Metrics': model_metrics,
}
print("\n✅ Project Success Metrics:")
for category in success_metrics:
    print(f"\n{category}:")
    for metric in success_metrics[category]:
        print(f" • {metric}")