pip install pandas numpy matplotlib seaborn scikit-learn plotly
from google.colab import files

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Open the Colab upload dialog and keep whatever files.upload() returns.
# NOTE(review): the read below uses a fixed file name, so the uploaded file
# presumably has to be called exactly "customer_segmentation.csv" - confirm.
uploaded = files.upload()

# Load the customer data set into the working DataFrame.
df = pd.read_csv("customer_segmentation.csv")
df.head()
| | ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 04-09-2012 | 58 | 635 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 08-03-2014 | 38 | 11 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | 426 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10-02-2014 | 26 | 11 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
5 rows × 29 columns
# Quick structural overview of the frame; info() prints its report directly.
df.info()

# Summary statistics. Under default notebook settings only the last
# expression of a cell is displayed, so this result is computed but not shown.
df.describe()

# Missing-value count per column; as the final expression it becomes the
# cell output.
df.isna().sum()
| | 0 |
|---|---|
| ID | 0 |
| Year_Birth | 0 |
| Education | 0 |
| Marital_Status | 0 |
| Income | 24 |
| Kidhome | 0 |
| Teenhome | 0 |
| Dt_Customer | 0 |
| Recency | 0 |
| MntWines | 0 |
| MntFruits | 0 |
| MntMeatProducts | 0 |
| MntFishProducts | 0 |
| MntSweetProducts | 0 |
| MntGoldProds | 0 |
| NumDealsPurchases | 0 |
| NumWebPurchases | 0 |
| NumCatalogPurchases | 0 |
| NumStorePurchases | 0 |
| NumWebVisitsMonth | 0 |
| AcceptedCmp3 | 0 |
| AcceptedCmp4 | 0 |
| AcceptedCmp5 | 0 |
| AcceptedCmp1 | 0 |
| AcceptedCmp2 | 0 |
| Complain | 0 |
| Z_CostContact | 0 |
| Z_Revenue | 0 |
| Response | 0 |
# The clustering features 'Age' and 'TotalSpending' are not columns of the raw
# CSV (see the column listing printed below), so selecting them directly
# raises KeyError. Derive them here, but only when they are missing, so a
# frame that already carries them is left untouched.
if 'Age' not in df.columns:
    # NOTE(review): reference year 2014 is inferred from the recorded output
    # (Year_Birth 1957 -> Age 57) - confirm against the intended definition.
    df['Age'] = 2014 - df['Year_Birth']

if 'TotalSpending' not in df.columns:
    # NOTE(review): presumably the sum of the six Mnt* amount columns; this
    # matches the recorded outputs but should be confirmed.
    spending_columns = [
        'MntWines',
        'MntFruits',
        'MntMeatProducts',
        'MntFishProducts',
        'MntSweetProducts',
        'MntGoldProds',
    ]
    df['TotalSpending'] = df[spending_columns].sum(axis=1)

# List the available columns before picking the clustering features.
print(df.columns)

# Feature subset used for segmentation.
df_selected = df[['Age', 'Income', 'TotalSpending']]
df_selected.head()
| | Age | Income | TotalSpending |
|---|---|---|---|
| 0 | 57 | 58138.0 | 1617 |
| 1 | 60 | 46344.0 | 27 |
| 2 | 49 | 71613.0 | 776 |
| 3 | 30 | 26646.0 | 53 |
| 4 | 33 | 58293.0 | 422 |
from sklearn.preprocessing import StandardScaler

# Drop every row that has a missing value in any column and report the
# shape before and after.
print("Before:", df.shape)
df = df.dropna()
print("After:", df.shape)

# Re-select the feature columns from the cleaned frame. df_selected was sliced
# before the dropna above; cleaning it separately only keeps the same rows as
# df when the missing values happen to fall inside the selected columns.
# Cluster labels are later assigned back to df by position, so the scaled
# matrix must have exactly df's rows, in df's order.
df_selected = df[df_selected.columns]

# Standardise the features so no single column dominates the distance metric
# purely because of its scale.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_selected)
print(df_scaled[:5])
from sklearn.cluster import KMeans

# Elbow method: fit one model per candidate cluster count (1..10) on the
# scaled features and record each model's inertia.
cluster_counts = range(1, 11)
inertia = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(df_scaled).inertia_
    for n_clusters in cluster_counts
]

plt.plot(cluster_counts, inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# Final model with five clusters; labels are written back to df by position.
# NOTE(review): the recorded per-cluster means include one cluster with
# Income 666666 and whole-number averages, which suggests a single outlier
# customer occupies a cluster of its own - consider handling it upstream.
kmeans = KMeans(n_clusters=5, random_state=42)
df['Cluster'] = kmeans.fit_predict(df_scaled)
df.head()
| | ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | Age | TotalSpending | Cluster |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 04-09-2012 | 58 | 635 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 1 | 57 | 1617 | 4 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 08-03-2014 | 38 | 11 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 60 | 27 | 2 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | 426 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 49 | 776 | 4 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10-02-2014 | 26 | 11 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 30 | 53 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 33 | 422 | 0 |
5 rows × 32 columns
# 2D view: income against total spending, one colour per cluster label.
plt.scatter(
    x=df['Income'],
    y=df['TotalSpending'],
    c=df['Cluster'],
    cmap='rainbow',
)
plt.xlabel('Income')
plt.ylabel('Total Spending')
plt.title("Customer Segmentation")
plt.show()

# Axes3D is not referenced by name below; presumably it is imported so that
# the '3d' projection is registered on older Matplotlib releases.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

# Static 3D view over the three clustering features.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    df['Age'],
    df['Income'],
    df['TotalSpending'],
    c=df['Cluster'],
)
ax.set(
    xlabel='Age',
    ylabel='Income',
    zlabel='Total Spending',
    title='Customer Segmentation',
)
plt.show()

# Interactive 3D view of the same data; note that `fig` is rebound to the
# Plotly figure here.
import plotly.express as px

fig = px.scatter_3d(
    df,
    x='Age',
    y='Income',
    z='TotalSpending',
    color='Cluster',
    symbol='Cluster',
    title='3D Customer Segmentation',
)
fig.show()
df.groupby('Cluster').mean(numeric_only=True)
| | ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | ... | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | Age | TotalSpending |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Cluster | |||||||||||||||||||||
| 0 | 5570.913520 | 1977.404385 | 34228.068210 | 0.811206 | 0.327649 | 48.064555 | 63.939099 | 7.373934 | 36.423873 | 10.886724 | ... | 0.010962 | 0.000000 | 0.003654 | 0.000000 | 0.009744 | 3.0 | 11.0 | 0.113276 | 36.595615 | 148.496955 |
| 1 | 5486.571744 | 1977.046358 | 73664.443709 | 0.134658 | 0.381898 | 49.441501 | 621.273731 | 58.704194 | 383.571744 | 81.536424 | ... | 0.152318 | 0.211921 | 0.165563 | 0.035320 | 0.004415 | 3.0 | 11.0 | 0.245033 | 36.953642 | 1280.399558 |
| 2 | 5646.707407 | 1958.190741 | 45456.940741 | 0.437037 | 0.901852 | 49.811111 | 153.092593 | 9.600000 | 50.092593 | 13.575926 | ... | 0.062963 | 0.000000 | 0.007407 | 0.007407 | 0.016667 | 3.0 | 11.0 | 0.079630 | 55.809259 | 264.635185 |
| 3 | 9432.000000 | 1977.000000 | 666666.000000 | 1.000000 | 0.000000 | 23.000000 | 9.000000 | 14.000000 | 18.000000 | 8.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 | 37.000000 | 62.000000 |
| 4 | 5650.872818 | 1956.246883 | 72556.753117 | 0.037406 | 0.476309 | 49.458853 | 647.064838 | 51.271820 | 347.463840 | 75.291771 | ... | 0.129676 | 0.164589 | 0.149626 | 0.024938 | 0.004988 | 3.0 | 11.0 | 0.214464 | 57.753117 | 1247.822943 |
5 rows × 28 columns