Customer Segmentation using K-Means Clustering

By V. Yadav · Last updated May 1, 2026

Table of Contents

1. Setup and Data Generation

python

# Install required libraries
# pip install pandas numpy matplotlib seaborn scikit-learn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*".
# Older installs only know the un-prefixed name, and an unknown style raises
# OSError, so fall back instead of failing on the first plotting cell.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")

2. Load or Create Sample Dataset

python

# Option 1: Generate sample customer data
def generate_customer_data(n_customers=2000, seed=42):
    """Generate a synthetic customer table drawn from four behavioural segments.

    Each segment ('Premium', 'Regular', 'Budget', 'Occasional') contributes a
    fixed share of the rows. Income, spending score, purchase frequency and
    age are drawn from normal distributions centred on the segment's range
    and then clipped; the remaining columns are drawn independently per row.

    Args:
        n_customers: Total number of rows to generate (must be >= 0).
        seed: Value passed to ``np.random.seed`` so repeated calls return the
            same table. Note that this reseeds NumPy's global generator.

    Returns:
        A DataFrame with exactly ``n_customers`` rows and the columns
        CustomerID, Segment_Actual, Annual_Income_k$, Spending_Score,
        Purchase_Frequency, Age, Tenure_Months, Total_Purchases,
        Avg_Transaction_Value, Returns_Count and Channel_Preference.
        ``Annual_Income_k$`` is expressed in thousands of dollars.

    Raises:
        ValueError: If ``n_customers`` is negative.
    """
    n_customers = int(n_customers)
    if n_customers < 0:
        raise ValueError("n_customers must be non-negative")

    np.random.seed(seed)

    # Customer segments characteristics ('income' bounds are in dollars and
    # converted to k$ after sampling).
    segments = {
        'Premium': {'size': 0.15, 'income': (80000, 150000), 'spending': (70, 100),
                    'frequency': (25, 40), 'age': (35, 55)},
        'Regular': {'size': 0.35, 'income': (50000, 80000), 'spending': (40, 70),
                    'frequency': (10, 25), 'age': (30, 50)},
        'Budget': {'size': 0.30, 'income': (25000, 50000), 'spending': (20, 40),
                   'frequency': (5, 15), 'age': (25, 45)},
        'Occasional': {'size': 0.20, 'income': (30000, 60000), 'spending': (10, 30),
                       'frequency': (1, 8), 'age': (20, 40)},
    }

    # Explicit column list so an empty result still has the full schema.
    columns = [
        'CustomerID', 'Segment_Actual', 'Annual_Income_k$', 'Spending_Score',
        'Purchase_Frequency', 'Age', 'Tenure_Months', 'Total_Purchases',
        'Avg_Transaction_Value', 'Returns_Count', 'Channel_Preference',
    ]

    data = []
    remaining = n_customers
    last_position = len(segments) - 1
    for position, (segment, params) in enumerate(segments.items()):
        # Truncating every share can lose rows to rounding; the last segment
        # absorbs the remainder so the total always equals n_customers.
        n = remaining if position == last_position else int(n_customers * params['size'])
        remaining -= n

        low_income, high_income = params['income']
        income = np.random.normal(np.mean(params['income']),
                                  np.std(params['income']) / 3, n)
        spending_score = np.random.normal(np.mean(params['spending']), 10, n)
        frequency = np.random.normal(np.mean(params['frequency']), 5, n)
        age = np.random.normal(np.mean(params['age']), 8, n)

        # Clip values to realistic ranges; income is converted from dollars
        # to thousands so it matches the 'Annual_Income_k$' column name.
        income = np.clip(income, low_income, high_income) / 1000.0
        spending_score = np.clip(spending_score, 0, 100)
        frequency = np.clip(frequency, 0, 50)
        age = np.clip(age, 18, 70)

        # Per-row draws stay in a loop (same order as the dict keys) so the
        # random stream for a given seed is consumed deterministically.
        for row_income, row_spending, row_frequency, row_age in zip(
                income, spending_score, frequency, age):
            data.append({
                'CustomerID': f'CUST_{len(data)+1:04d}',
                'Segment_Actual': segment,
                'Annual_Income_k$': row_income,
                'Spending_Score': row_spending,
                'Purchase_Frequency': row_frequency,
                'Age': row_age,
                'Tenure_Months': np.random.randint(1, 60),
                'Total_Purchases': np.random.randint(1, 100),
                'Avg_Transaction_Value': np.random.uniform(20, 200),
                'Returns_Count': np.random.poisson(1 if segment == 'Budget' else 0.5),
                'Channel_Preference': np.random.choice(['Online', 'Store', 'Mobile'],
                                                       p=[0.5, 0.3, 0.2]),
            })

    return pd.DataFrame(data, columns=columns)

# Generate data
df = generate_customer_data(2000)
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nBasic Statistics:")
print(df.describe())

3. Exploratory Data Analysis (EDA)

python

# FIGURE 1: Distribution of Key Variables
# One entry per histogram panel, listed row by row across the 2x2 grid:
# (source column, trace name, bar colour, panel title).
histogram_panels = [
    ('Annual_Income_k$', 'Income', '#E63946', 'Annual Income Distribution'),
    ('Spending_Score', 'Spending', '#457B9D', 'Spending Score Distribution'),
    ('Age', 'Age', '#2A9D8F', 'Age Distribution'),
    ('Purchase_Frequency', 'Frequency', '#E9C46A', 'Purchase Frequency'),
]

fig1 = make_subplots(
    rows=2, cols=2,
    subplot_titles=tuple(panel[3] for panel in histogram_panels),
)

for slot, (hist_column, trace_name, bar_colour, _panel_title) in enumerate(histogram_panels):
    # slot 0..3 maps onto (row, col) = (1,1), (1,2), (2,1), (2,2)
    fig1.add_trace(
        go.Histogram(x=df[hist_column], nbinsx=30,
                     marker_color=bar_colour, name=trace_name),
        row=slot // 2 + 1, col=slot % 2 + 1,
    )

fig1.update_layout(
    height=800,
    showlegend=False,
    title_text="Customer Demographics Distribution",
    title_font_size=20,
)
fig1.show()
# Save as: 'images/customer_distributions.png'

# FIGURE 2: Correlation Heatmap
numeric_cols = [
    'Annual_Income_k$',
    'Spending_Score',
    'Purchase_Frequency',
    'Age',
    'Tenure_Months',
    'Total_Purchases',
    'Avg_Transaction_Value',
]
# Pairwise correlation between the numeric columns
corr_matrix = df[numeric_cols].corr()

fig2 = px.imshow(
    corr_matrix,
    text_auto=True,
    color_continuous_scale='RdBu_r',
    title="Feature Correlation Matrix",
)
fig2.update_layout(height=600, title_font_size=18)
fig2.show()
# Save as: 'images/correlation_heatmap.png'

4. Feature Selection and Preprocessing

python

# Columns fed to the clustering model
features = [
    'Annual_Income_k$',
    'Spending_Score',
    'Purchase_Frequency',
    'Age',
    'Tenure_Months',
    'Avg_Transaction_Value',
]

# Work on a copy so imputation never touches df itself
X = df[features].copy()

# Fill any gaps with the mean of the corresponding column
column_means = X.mean()
X = X.fillna(column_means)

# Standardize the features (zero mean, unit variance per column)
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Alternative: MinMaxScaler for bounded ranges
# scaler = MinMaxScaler()
# X_scaled = scaler.fit_transform(X)

print("Features scaled successfully!")
print(f"Scaled data shape: {X_scaled.shape}")

5. Finding Optimal Number of Clusters

python

# FIGURE 3: Elbow Method and Silhouette Score
def plot_elbow_and_silhouette(X_scaled, max_clusters=10):
    """Sweep K-Means over k = 2..max_clusters and plot both selection curves.

    For every candidate k a KMeans model (random_state=42, n_init=10) is
    fitted on ``X_scaled``; its inertia and silhouette score are recorded and
    shown side by side in a two-panel Plotly figure.

    Args:
        X_scaled: Feature matrix to cluster.
        max_clusters: Largest k to try (inclusive).

    Returns:
        Tuple ``(inertias, silhouette_scores)``: two lists whose position p
        corresponds to k = p + 2.
    """
    candidate_ks = list(range(2, max_clusters + 1))

    inertias = []
    silhouette_scores = []
    for n_clusters in candidate_ks:
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X_scaled)
        inertias.append(model.inertia_)
        silhouette_scores.append(silhouette_score(X_scaled, model.labels_))

    # Left panel: elbow curve; right panel: silhouette curve
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Elbow Method', 'Silhouette Score'))

    curves = [
        (inertias, '#E63946', 'Inertia'),
        (silhouette_scores, '#2A9D8F', 'Silhouette'),
    ]
    for panel_col, (values, colour, label) in enumerate(curves, start=1):
        fig.add_trace(
            go.Scatter(x=candidate_ks, y=values,
                       mode='lines+markers',
                       marker=dict(size=10, color=colour),
                       name=label),
            row=1, col=panel_col,
        )

    fig.update_layout(
        height=500,
        title_text="Determining Optimal Number of Clusters",
        showlegend=True,
        title_font_size=18,
    )
    fig.show()

    return inertias, silhouette_scores

inertias, silhouette_scores = plot_elbow_and_silhouette(X_scaled, max_clusters=10)

# Pick the k with the highest silhouette score. The sweep starts at k = 2,
# so list position p corresponds to k = p + 2.
best_k = 2 + np.argmax(silhouette_scores)
best_silhouette = max(silhouette_scores)
print(f"\n📊 Optimal number of clusters: {best_k}")
print(f"Best silhouette score: {best_silhouette:.3f}")

6. Apply K-Means Clustering

python

# Fit the final model with the chosen number of clusters
k = 4  # Based on elbow method and business logic
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(X_scaled)
df['Cluster'] = kmeans.labels_

# Map the centroids back from standardized units to the original feature scale
centers = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_centers = pd.DataFrame(centers, columns=features).assign(Cluster=range(k))

print("Cluster Centers (original scale):")
print(cluster_centers)

# Internal validation metrics, all computed on the standardized features
cluster_labels = df['Cluster']
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
davies_bouldin = davies_bouldin_score(X_scaled, cluster_labels)
calinski_harabasz = calinski_harabasz_score(X_scaled, cluster_labels)

print("\n📈 Clustering Evaluation Metrics:")
for metric_name, metric_value in (
        ("Silhouette Score", silhouette_avg),
        ("Davies-Bouldin Index", davies_bouldin),
        ("Calinski-Harabasz Index", calinski_harabasz)):
    print(f"{metric_name}: {metric_value:.3f}")

7. Visualize Clusters

python

# FIGURE 4: 2D PCA Visualization
# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df['PCA1'] = X_pca[:, 0]
df['PCA2'] = X_pca[:, 1]

# Cluster ids are category labels, not quantities. Plotly Express maps a
# numeric colour column onto a continuous colour bar, which would suggest an
# ordering between segments, so the plots use a copy with string labels
# (df['Cluster'] itself stays integer for the later analysis steps).
cluster_plot_df = df.assign(Cluster=df['Cluster'].astype(str))
cluster_legend_order = {'Cluster': [str(c) for c in sorted(df['Cluster'].unique())]}

fig4 = px.scatter(cluster_plot_df, x='PCA1', y='PCA2', color='Cluster',
                  title=f'Customer Segments - PCA Visualization (k={k})',
                  labels={'Cluster': 'Segment'},
                  category_orders=cluster_legend_order,
                  hover_data=['CustomerID', 'Annual_Income_k$', 'Spending_Score'])
fig4.update_layout(height=600, title_font_size=18)
fig4.show()
# Save as: 'images/pca_clusters.png'

# FIGURE 5: Income vs Spending Score (Most Common)
# Marker size encodes purchase frequency
fig5 = px.scatter(cluster_plot_df, x='Annual_Income_k$', y='Spending_Score',
                  color='Cluster', size='Purchase_Frequency',
                  title='Customer Segmentation: Income vs Spending Score',
                  labels={'Annual_Income_k$': 'Annual Income (k$)',
                          'Spending_Score': 'Spending Score (1-100)',
                          'Cluster': 'Segment'},
                  category_orders=cluster_legend_order,
                  hover_data=['Age', 'Purchase_Frequency'])
fig5.update_layout(height=600, title_font_size=18)
fig5.show()
# Save as: 'images/income_vs_spending.png'

# FIGURE 6: Radar Chart for Cluster Profiles
def plot_cluster_radar(cluster_centers, features):
    """Show one radar polygon per cluster centre.

    The ``features`` columns of ``cluster_centers`` are min-max scaled across
    clusters, so on every axis the lowest cluster sits at 0 and the highest
    at 1, making features with different units comparable.

    Args:
        cluster_centers: DataFrame with one row per cluster that contains
            the ``features`` columns.
        features: Column names used as radar axes.
    """
    # Rescale every feature to [0, 1] across the clusters
    normalized_centers = MinMaxScaler().fit_transform(cluster_centers[features])

    radar = go.Figure()
    for cluster_idx, axis_values in enumerate(normalized_centers):
        radar.add_trace(go.Scatterpolar(
            r=axis_values,
            theta=features,
            fill='toself',
            name=f'Cluster {cluster_idx}',
        ))

    radial_axis = dict(visible=True, range=[0, 1])
    radar.update_layout(
        polar=dict(radialaxis=radial_axis),
        showlegend=True,
        title="Customer Segment Profiles (Radar Chart)",
        title_font_size=18,
        height=600,
    )
    radar.show()

# Render the radar chart from the K-Means centers expressed in original feature units
plot_cluster_radar(cluster_centers, features)
# Save as: 'images/radar_chart_clusters.png'

8. Segment Profiling and Analysis

python

# FIGURE 7: Segment Characteristics - Heatmap
cluster_profile = df.groupby('Cluster')[features].mean()
# Normalise each feature across clusters (every column sums to 1). Dividing
# each row by its own sum would mix features with different units, so the
# largest-magnitude feature (income) would dominate every row of the heatmap.
cluster_profile_pct = cluster_profile / cluster_profile.sum(axis=0)

fig7 = px.imshow(cluster_profile_pct,
                 title='Segment Characteristics (Relative Values)',
                 labels=dict(x="Features", y="Cluster", color="Relative Value"),
                 color_continuous_scale='Viridis',
                 aspect='auto')
fig7.update_layout(height=500, title_font_size=18)
fig7.show()
# Save as: 'images/segment_characteristics.png'

# FIGURE 8: Detailed Segment Analysis
fig8 = make_subplots(rows=2, cols=2,
                     subplot_titles=('Avg Annual Income by Segment',
                                     'Avg Spending Score by Segment',
                                     'Purchase Frequency by Segment',
                                     'Age Distribution by Segment'))

colors = ['#E63946', '#457B9D', '#2A9D8F', '#E9C46A']
# (column, subplot row, subplot col) for each of the four box-plot panels
box_panels = [
    ('Annual_Income_k$', 1, 1),
    ('Spending_Score', 1, 2),
    ('Purchase_Frequency', 2, 1),
    ('Age', 2, 2),
]

for i in range(k):
    cluster_data = df[df['Cluster'] == i]
    # Cycle through the palette so choosing k > 4 does not raise IndexError
    cluster_color = colors[i % len(colors)]

    for box_column, panel_row, panel_col in box_panels:
        fig8.add_trace(go.Box(y=cluster_data[box_column], name=f'Cluster {i}',
                              marker_color=cluster_color),
                       row=panel_row, col=panel_col)

fig8.update_layout(height=800, showlegend=False, title_text="Segments Distribution Analysis")
fig8.show()
# Save as: 'images/segment_distributions.png'

# Create segment profiles: one summary dict per cluster id
segment_profiles = []
for i in range(k):
    cluster_data = df[df['Cluster'] == i]

    profile = {
        'Segment': f'Segment {i}',
        'Size': len(cluster_data),
        'Size_Percentage': (len(cluster_data) / len(df)) * 100,
        'Avg_Income': cluster_data['Annual_Income_k$'].mean(),
        'Avg_Spending': cluster_data['Spending_Score'].mean(),
        'Avg_Frequency': cluster_data['Purchase_Frequency'].mean(),
        'Avg_Age': cluster_data['Age'].mean(),
        # mode() of an empty selection has no element 0, hence the guard
        'Top_Channel': cluster_data['Channel_Preference'].mode()[0] if len(cluster_data) > 0 else 'N/A'
    }
    segment_profiles.append(profile)

segment_df = pd.DataFrame(segment_profiles)
print("\n📊 Customer Segment Profiles:")
print(segment_df.round(2))

9. Business Insights and Recommendations

python

# Generate business insights
def generate_business_insights(df, segment_profiles):
    """Derive per-cluster business insights from the clustered customer table.

    Produces, in order: the highest-value cluster, the longest-tenure
    cluster, the cluster with the strongest frequency/spending correlation,
    and one channel-strategy entry per cluster.

    Side effect: adds a ``Customer_Value`` column
    (``Avg_Transaction_Value * Purchase_Frequency``) to ``df`` in place.

    Args:
        df: Customer DataFrame with the columns Cluster,
            Avg_Transaction_Value, Purchase_Frequency, Tenure_Months,
            Spending_Score and Channel_Preference.
        segment_profiles: Unused; kept so existing callers keep working.

    Returns:
        List of dicts with the keys 'title', 'insight' and 'recommendation'.
    """
    insights = []

    # Identify high-value segments
    df['Customer_Value'] = df['Avg_Transaction_Value'] * df['Purchase_Frequency']
    high_value_cluster = df.groupby('Cluster')['Customer_Value'].mean().idxmax()

    insights.append({
        'title': '💰 High-Value Customers',
        'insight': f'Cluster {high_value_cluster} has the highest customer lifetime value',
        'recommendation': 'Implement VIP program with exclusive benefits and early access'
    })

    # Loyalty analysis
    loyal_cluster = df.groupby('Cluster')['Tenure_Months'].mean().idxmax()
    insights.append({
        'title': '🌟 Most Loyal Customers',
        'insight': f'Cluster {loyal_cluster} shows highest tenure and retention',
        'recommendation': 'Launch referral program and loyalty rewards'
    })

    # Cross-selling opportunity
    # Compute one correlation per cluster, indexed by the plain cluster id.
    # Slicing groupby(...).corr() leaves a (cluster, feature) MultiIndex, so
    # idxmax() would return a tuple and leak into the insight text.
    freq_spend_corr = pd.Series(
        {cluster: members['Purchase_Frequency'].corr(members['Spending_Score'])
         for cluster, members in df.groupby('Cluster')},
        dtype=float,
    ).dropna()  # clusters too small or constant yield NaN and cannot be ranked
    if not freq_spend_corr.empty:
        cross_sell_cluster = freq_spend_corr.idxmax()
        insights.append({
            'title': '🔄 Cross-selling Opportunity',
            'insight': f'Cluster {cross_sell_cluster} shows strong correlation between frequency and spending',
            'recommendation': 'Implement product bundling and personalized recommendations'
        })

    # Channel preference: customer count per (cluster, channel) pair
    channel_analysis = df.groupby(['Cluster', 'Channel_Preference']).size().unstack(fill_value=0)
    for cluster in channel_analysis.index:
        top_channel = channel_analysis.loc[cluster].idxmax()
        insights.append({
            'title': f'📱 Channel Strategy - Cluster {cluster}',
            'insight': f'Preferred channel is {top_channel}',
            'recommendation': f'Optimize {top_channel} experience and personalize communications'
        })

    return insights

insights = generate_business_insights(df, segment_profiles)

# Visualize insights: one grouped bar series per key metric
fig9 = go.Figure()
bar_series = [
    ('Avg_Income', 'Income (k$)', '#E63946'),
    ('Avg_Spending', 'Spending Score', '#457B9D'),
    ('Avg_Frequency', 'Frequency', '#2A9D8F'),
]
for metric_column, legend_label, bar_colour in bar_series:
    fig9.add_trace(go.Bar(
        x=segment_df['Segment'],
        y=segment_df[metric_column],
        name=legend_label,
        marker_color=bar_colour,
    ))

fig9.update_layout(
    title='Segment Comparison: Key Metrics',
    xaxis_title='Customer Segment',
    yaxis_title='Value',
    barmode='group',
    height=500,
    title_font_size=18,
)
fig9.show()
# Save as: 'images/segment_comparison.png'

10. Marketing Strategy Recommendations

python

def generate_marketing_strategies(df, segment_df):
    """Assign a marketing playbook to every row of ``segment_df``.

    A segment is classified by its average income and average spending
    score; the first matching rule wins:

    * income > 70 and spending > 70 -> Premium
    * income > 50 and spending > 50 -> Regular
    * income < 50 and spending > 40 -> Value
    * anything else                 -> Occasional

    Args:
        df: Unused by this function.
        segment_df: DataFrame with the columns Segment, Avg_Income and
            Avg_Spending.

    Returns:
        DataFrame with the columns Segment, Type, Strategy, Channels and
        Budget_Allocation, one row per input segment.
    """
    # tier -> (type label, strategy text, channels, budget share)
    playbooks = {
        'premium': (' Premium Segment',
                    'Exclusive experiences, personalized service, early access',
                    ('Email', 'SMS', 'Personal Phone'),
                    '35%'),
        'regular': (' Regular Segment',
                    'Loyalty programs, bundle offers, seasonal promotions',
                    ('Email', 'Mobile App', 'Social Media'),
                    '30%'),
        'value': (' Value Segment',
                  'Discount campaigns, flash sales, bulk purchase offers',
                  ('Social Media', 'SMS', 'App Notifications'),
                  '25%'),
        'occasional': (' Occasional Segment',
                       'Re-engagement campaigns, first-purchase incentives',
                       ('Email', 'Retargeting Ads', 'Push Notifications'),
                       '10%'),
    }

    def tier_of(profile):
        """Return the playbook key for one segment row."""
        if profile['Avg_Income'] > 70 and profile['Avg_Spending'] > 70:
            return 'premium'
        if profile['Avg_Income'] > 50 and profile['Avg_Spending'] > 50:
            return 'regular'
        if profile['Avg_Income'] < 50 and profile['Avg_Spending'] > 40:
            return 'value'
        return 'occasional'

    plans = []
    for _, profile in segment_df.iterrows():
        segment_name = profile['Segment']
        type_label, strategy_text, channels, budget_share = playbooks[tier_of(profile)]
        plans.append({
            'Segment': segment_name,
            'Type': type_label,
            'Strategy': strategy_text,
            'Channels': list(channels),  # fresh list per row
            'Budget_Allocation': budget_share,
        })

    return pd.DataFrame(plans)

marketing_strategies = generate_marketing_strategies(df, segment_df)
print("\n🎯 Recommended Marketing Strategies by Segment:")
print(marketing_strategies.to_string(index=False))

# FIGURE 10: Marketing Budget Allocation
# 'Budget_Allocation' holds display strings such as '35%'. Pie slices need
# numeric values, so the chart uses a numeric copy of that column;
# marketing_strategies itself is left untouched for the later CSV export.
budget_plot_df = marketing_strategies.assign(
    Budget_Share=marketing_strategies['Budget_Allocation'].str.rstrip('%').astype(float)
)
fig10 = px.pie(budget_plot_df,
               values='Budget_Share',
               names='Segment',
               title='Marketing Budget Allocation by Segment',
               labels={'Budget_Share': 'Budget Allocation (%)'},
               color_discrete_sequence=px.colors.sequential.Viridis_r,
               hole=0.3)
fig10.update_layout(height=500, title_font_size=18)
fig10.show()
# Save as: 'images/marketing_budget.png'

11. Interactive Dashboard with All Visualizations

python

def create_interactive_dashboard(df, segment_profiles):
    """Render the 3x3 dashboard scaffold for the segmentation results.

    Only the top-left cell (segment-size pie chart) is populated; the other
    eight cells are titled placeholders for further traces.

    Args:
        df: Clustered customer DataFrame; must contain a 'Cluster' column.
        segment_profiles: Currently unused; kept so existing callers keep
            working.
    """
    # Each spec names the trace type its cell will host so that
    # make_subplots can allocate a matching subplot kind.
    fig = make_subplots(
        rows=3, cols=3,
        subplot_titles=('Segment Size Distribution', 'Income vs Spending',
                        'Cluster Profiles', 'Purchase Frequency',
                        'Channel Preferences', 'Age Distribution',
                        'Customer Value by Segment', 'Segment KPIs', 'Recommendation Priority'),
        specs=[[{'type': 'pie'}, {'type': 'scatter'}, {'type': 'bar'}],
               [{'type': 'box'}, {'type': 'bar'}, {'type': 'box'}],
               [{'type': 'bar'}, {'type': 'indicator'}, {'type': 'table'}]]
    )

    # Add traces (simplified for demonstration)
    # Segment size - pie chart of customer counts per cluster
    segment_sizes = df['Cluster'].value_counts()
    fig.add_trace(go.Pie(labels=segment_sizes.index, values=segment_sizes.values), row=1, col=1)

    # Add more traces as needed...

    fig.update_layout(height=900, showlegend=False,
                      title_text="Customer Segmentation Dashboard - Complete View")
    fig.show()

# Build and display the dashboard figure for the clustered customers
create_interactive_dashboard(df, segment_profiles)

12. Export and Save Results

python

# Write each result table to its own CSV file, without the index column
csv_exports = {
    'customer_segmentation_results.csv': df,              # clustered data
    'segment_profiles.csv': segment_df,                   # segment profiles
    'marketing_strategies.csv': marketing_strategies,     # marketing strategies
}
for csv_name, table in csv_exports.items():
    table.to_csv(csv_name, index=False)

# Generate summary report
def create_summary_report(df, segment_profiles, feature_names=None,
                          output_path='segmentation_report.txt'):
    """Build the plain-text segmentation report, save it and print it.

    Args:
        df: Clustered customer DataFrame; must contain a 'Cluster' column.
        segment_profiles: DataFrame with one row per segment and the columns
            Segment, Size, Size_Percentage, Avg_Income, Avg_Spending,
            Avg_Frequency, Avg_Age and Top_Channel.
        feature_names: Names listed under "Features Used". Defaults to the
            module-level ``features`` list.
        output_path: File the report is written to.

    The file is written as UTF-8 because the report contains emoji, which
    cannot be encoded by some platform-default encodings (e.g. cp1252).
    """
    if feature_names is None:
        # Fall back to the feature list defined at module level
        feature_names = features

    sections = [f"""
    ========================================
    CUSTOMER SEGMENTATION ANALYSIS REPORT
    ========================================
    
    📊 DATASET OVERVIEW:
    - Total Customers: {len(df)}
    - Number of Segments: {df['Cluster'].nunique()}
    - Features Used: {', '.join(feature_names)}
    
    📈 SEGMENT COMPOSITION:
    """]

    for _, row in segment_profiles.iterrows():
        sections.append(f"""
    {row['Segment']}:
        - Size: {row['Size']} customers ({row['Size_Percentage']:.1f}%)
        - Avg Income: ${row['Avg_Income']:.0f}k
        - Avg Spending Score: {row['Avg_Spending']:.1f}/100
        - Purchase Frequency: {row['Avg_Frequency']:.1f} purchases/month
        - Avg Age: {row['Avg_Age']:.0f} years
        - Preferred Channel: {row['Top_Channel']}
        """)

    # idxmax() raises on an empty frame, so rankings need at least one segment
    if not segment_profiles.empty:
        def leader(column):
            """Return the name of the segment with the largest value in column."""
            return segment_profiles.loc[segment_profiles[column].idxmax(), 'Segment']

        sections.append(f"""
    🎯 KEY INSIGHTS:
    1. {leader('Avg_Income')} has highest income
    2. {leader('Avg_Spending')} has highest spending
    3. {leader('Avg_Frequency')} has highest purchase frequency
    """)

    sections.append("""
    💡 RECOMMENDATIONS:
    1. Focus retention efforts on high-value segments
    2. Personalize marketing messages based on segment characteristics
    3. Optimize channel strategy for each segment's preferences
    4. Implement A/B testing for segment-specific campaigns
    """)

    report = ''.join(sections)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report)

    print(report)

# Print the report and write it to segmentation_report.txt; segment_df supplies the per-segment rows
create_summary_report(df, segment_df)

Visualization Guide Summary

| Visualization | Purpose | Best Location | Key Insight |
|---|---|---|---|
| Distribution Histograms | Understanding data spread | EDA Section | Feature distributions |
| Correlation Heatmap | Feature relationships | Preprocessing | Feature selection justification |
| Elbow Method Plot | Finding optimal k | Model Selection | Determine clusters |
| PCA Scatter Plot | Cluster visualization | Results | Segment separation |
| Income vs Spending | Market segmentation | Core Analysis | Customer patterns |
| Radar Chart | Segment profiling | Analysis | Segment characteristics |
| Box Plots | Distribution comparison | Validation | Segment differences |
| Bar Charts | KPI comparison | Business Insights | Performance metrics |
| Pie Charts | Size comparison | Summary | Segment proportions |
| Dashboard | Complete view | Executive Summary | Holistic insights |

Key Success Metrics

python

# Qualitative business goals the segmentation is meant to move
business_goals = [
    'Customer Lifetime Value (CLV) increase',
    'Retention rate improvement',
    'Marketing ROI enhancement',
    'Cross-selling success rate',
]
# Quantitative clustering-quality figures computed for the fitted model
model_quality = [
    f'Silhouette Score: {silhouette_avg:.3f}',
    f'Davies-Bouldin Index: {davies_bouldin:.3f}',
    f'Calinski-Harabasz Index: {calinski_harabasz:.3f}',
    f'Optimal Clusters: {k}',
]
success_metrics = {
    'Business Metrics': business_goals,
    'Model Metrics': model_quality,
}

print("\n✅ Project Success Metrics:")
for category, metrics in success_metrics.items():
    print(f"\n{category}:")
    # One bullet line per metric
    print("\n".join(f"  • {metric}" for metric in metrics))