# -*- coding: utf-8 -*-
"""Mall_Customer_segmentation.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1UQJhJUgzqlV0jYpBdxoHdjin0MFo5ZD3
# <center><b>Customer Segmentation</b></center>
# **1. Objective**
This case requires developing a customer segmentation: understand customer behavior and separate customers into groups (clusters) according to their preferences. Once the division is done, the segments can be handed to the marketing team so they can plan their strategy accordingly.
### **Data Description :**
The sample dataset summarizes the usage behavior of about 200 active customers during the last 3 months. The file is at a customer level with 5 behavioral variables.
### **Attribute Information :**
Following is the data dictionary for the customer dataset:
**CustomerID** : Unique ID assigned to the customer
**Gender** : Gender of the customer
**Age** : Age of the customer
**Annual Income (k$)** : Annual income of the customer
**Spending Score** : Score assigned by the mall based on customer behavior and spending nature
# **2. Install & Import Libraries**
"""
# use to visualize missing value
!pip install missingno
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
# display all the columns of the dataframe
pd.set_option('display.max_columns', None)
from scipy import stats
from scipy.stats import norm, skew
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
# clustering algorithms
from sklearn.cluster import KMeans,AgglomerativeClustering,DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_samples, silhouette_score
"""# **3. Load Datasets**"""
# load dataset
customer_df = pd.read_csv("/content/Mall_Customers.csv")
"""# **4. Exploratory Data Analysis**
### 4.1. Data Exploration
For both train and test dataset, We'll explore following things
- First 5 rows
- Data shape
- Data information
- Statistical description
- Data types
- Null value
### 4.1.1. First 5 records
"""
customer_df.head()
"""### 4.1.2. Data Shape """
customer_df.shape
"""### 4.1.3. Data Information"""
customer_df.info()
"""### 4.1.4. Statistical description """
customer_df.describe(include='all')
"""### 4.1.5. Data Type """
customer_dtype = customer_df.dtypes
customer_dtype.value_counts()
"""### 4.1.6. Null Value """
customer_df.isnull().sum().sort_values(ascending = False).head()
"""### 4.1.7. Visualize missing value using **Misingno** - Train Data"""
msno.matrix(customer_df)
"""### 4.1.8. Report - Data Exploration
From the above data exploration we saw that
- There is no missing value present
- Shape of the dataset is (200, 5)
- memory usage by dataset : 7.9+ KB
- There are 4 integer and 1 object type feature present
### 4.3.3.2. Check Distribution - Continuous
"""
numerical_features = [col for col in customer_df.columns if customer_df[col].dtypes != 'O']
discrete_features = [col for col in numerical_features if len(customer_df[col].unique()) < 10 and col not in ['CustomerID']]
continuous_features = [feature for feature in numerical_features if feature not in discrete_features+['CustomerID']]
categorical_features = [col for col in customer_df.columns if customer_df[col].dtype == 'O']
print("Total Number of Numerical Columns : ",len(numerical_features))
print("Number of discrete features : ",len(discrete_features))
print("No of continuous features are : ", len(continuous_features))
print("Number of categorical features : ",len(categorical_features))
# one histogram + KDE per continuous feature
plt.figure(figsize=(20, 7))
for pos, feature in enumerate(continuous_features, start=1):
    plt.subplot(1, 3, pos)
    ax = sns.histplot(data=customer_df, x=feature, kde=True)
    ax.lines[0].set_color('crimson')  # highlight the KDE curve
# get the features except object types
numeric_feats = customer_df.dtypes[customer_df.dtypes != 'object'].index
# check the skew of all numerical features
skewed_feats = customer_df[numeric_feats].apply(lambda x : skew(x.dropna())).sort_values(ascending = False)
print('\n Skew in numerical features: \n')
skewness_df = pd.DataFrame({'Skew' : skewed_feats})
print(skewness_df.head(10))
"""Above distribution shows that:
- The distribution of continuous features are normally distributed.
### 4.3.3.2. Check Distribution - Categorical
"""
sns.countplot(x='Gender', data=customer_df, palette="Set3")
"""- Let's see how gender of customers affects to all other features."""
#Pairplot
sns.pairplot(customer_df, vars=["Age", "Annual Income (k$)", "Spending Score (1-100)"], kind ="reg", hue = "Gender", palette="husl", markers = ['o','D'])
"""- From the above pairplot we observe that green colour has higher ratio than pink colour as there are more female customers than male.
- *True Fact*
> > **"I make clothes, women make fashion." —Azzedine Alaïa**
### 4.6. Data Correlation
"""
# numeric_only avoids errors on the object-typed Gender column in newer pandas
customer_corr = customer_df.corr(method='spearman', numeric_only=True)
plt.figure(figsize=(8,8))
sns.heatmap(customer_corr, cmap="icefire", linewidths=.5) #'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r'
"""# **5. Feature Engineering**
### 5.1. Drop Columns
"""
customer_df.drop(columns='CustomerID', inplace=True)
"""### 5.2. Encoding Categorical Features"""
# generate one-hot dummy columns (Gender becomes Gender_Female / Gender_Male)
customer_df = pd.get_dummies(customer_df).reset_index(drop=True)
"""### 5.3. Normalization"""
# scaling is left disabled here so the clusters keep their original units
# pre_processing_pipeline = make_pipeline(RobustScaler())
# customer_scaled_df = pre_processing_pipeline.fit_transform(customer_df)
# print(customer_scaled_df.shape)
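"""For reference, a minimal sketch of what enabling the scaling step would look
like (illustrative only; the scaled array is not used below, so the clustering
still runs on the raw features):"""
scaled_preview = make_pipeline(RobustScaler()).fit_transform(customer_df)
print(scaled_preview.shape)  # rows x encoded feature columns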
"""# **6. Model Development**
In this step we'll apply various clustering algorithms and check which algorithm is best for our dataset. We'r going to use below algorithms.
- Kmeans Clustering
- Agglomerative Clustering
- GaussianMixture Model based clustering
- DBSCAN Clustering
### 6.1. Hyperparameter tuning
To find the best K value there are many techniques we can use.
1. **Elbow Method**: The elbow method finds the value of the optimal number of clusters using the total within-cluster sum of square values.
2. **Average silhouette method** : It is a measure of how well each data point fits its corresponding cluster. This method evaluates the quality of clustering. As a general rule, a high average silhouette width denotes better clustering output.
3. **Gap statistic method** : It is a measure of the value of gap statistics. Gap statistics is the difference between the total intracluster changes for various values of k compared to their expected values. The optimal number of clusters is the value that maximizes the value of gap statistics.
- Here we'll use **Elbow Method** to find the best K Value.
### 6.2. Elbow Method
"""
inertia = []
range_val = range(1, 15)
for i in range_val:
    # n_init and random_state pinned for reproducible inertia values
    kmean = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmean.fit(customer_df)
    inertia.append(kmean.inertia_)
plt.plot(range_val, inertia, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()
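"""As a cross-check on the elbow plot, a minimal sketch of the average
silhouette method (this sketch is an addition, and assumes `customer_df` is
fully numeric after one-hot encoding):"""
for k in range(2, 11):
    labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(customer_df)
    print(f"k = {k:2d}  average silhouette = {silhouette_score(customer_df, labels):.3f}")

"""And a minimal gap-statistic sketch (an illustrative implementation, assuming
a uniform reference distribution over the feature bounding box):"""
rng = np.random.default_rng(42)

def gap_statistic(X, k, n_refs=5):
    # log within-cluster dispersion (inertia) of the real data at this k
    log_wk = np.log(KMeans(n_clusters=k, n_init=10, random_state=42).fit(X).inertia_)
    # expected dispersion under uniform reference datasets of the same shape
    ref_log_wks = []
    for _ in range(n_refs):
        ref = rng.uniform(X.min(axis=0), X.max(axis=0), size=X.shape)
        ref_log_wks.append(np.log(KMeans(n_clusters=k, n_init=10).fit(ref).inertia_))
    return np.mean(ref_log_wks) - log_wk

X = customer_df.to_numpy(dtype=float)
for k in range(2, 11):
    print(f"k = {k:2d}  gap statistic = {gap_statistic(X, k):.3f}")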
"""- From the above elbow method we see that **K = 5** is the best K value for our clustering
### 6.2. K-Means Clustering
"""
# apply the K-Means algorithm with the chosen K
kmeans_model = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans_clusters = kmeans_model.fit_predict(customer_df)
"""### 6.3. Agglomerative Clustering"""
# apply agglomerative algorithm
agglo_model = AgglomerativeClustering(linkage="ward",n_clusters=5)
agglomerative_clusters = agglo_model.fit_predict(customer_df)
"""### 6.4. GaussianMixture Model based clustering"""
GaussianMixture_model = GaussianMixture(n_components=5, random_state=42)
gmm_clusters = GaussianMixture_model.fit_predict(customer_df)
"""### 6.5. DBSCAN Clustering"""
model_dbscan = DBSCAN(eps=3, min_samples=17)
dbscan_clusters = model_dbscan.fit_predict(customer_df)
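"""DBSCAN's `eps` is usually read off a k-distance plot; a minimal sketch added
here for illustration (with k matching `min_samples=17`, we sort each point's
distance to its 17th-nearest neighbour and look for the knee):"""
from sklearn.neighbors import NearestNeighbors

# note: querying the training set includes each point itself among its
# neighbours, so this is a close approximation rather than an exact 17th-NN
neigh = NearestNeighbors(n_neighbors=17).fit(customer_df)
distances, _ = neigh.kneighbors(customer_df)
plt.plot(np.sort(distances[:, -1]))
plt.xlabel('Points sorted by 17th-NN distance')
plt.ylabel('17th nearest-neighbour distance')
plt.title('k-distance plot for choosing eps')
plt.show()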
"""### 6.6. Cluster Validity Indices
To compare which clustering algorithm is best for our model, we use various cluster validity indices.
1. Dunn Index
2. Davies Bouldin Index
3. Silhouette Score
4. Calinski - Harabasz Index
- "algorithms that produce clusters with high Dunn index are more desirable" -Wikipedia
- "Objects with a high silhouette value are considered well clustered" -Wikipedia
- "clustering algorithm that produces a collection of clusters with the smallest Davies–Bouldin index is considered the best algorithm" -Wikipedia
- **Here we'll use silhouette score method to choose the best clustering algorithm**
"""
def silhouette_method(df, algo, y_pred):
    print('=' * 81)
    print('Clustering :', algo, ' silhouette score :', silhouette_score(df, y_pred))

# DBSCAN is left out: it labels noise points as -1, which distorts the score
silhouette_method(customer_df, 'KMeans', kmeans_clusters)
silhouette_method(customer_df, 'Agglomerative', agglomerative_clusters)
silhouette_method(customer_df, 'GaussianMixture', gmm_clusters)
print('=' * 81)
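"""For reference, a minimal sketch of two of the other indices listed above,
using sklearn's built-in implementations (lower Davies-Bouldin is better;
higher Calinski-Harabasz is better):"""
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score

for name, labels in [('KMeans', kmeans_clusters),
                     ('Agglomerative', agglomerative_clusters),
                     ('GaussianMixture', gmm_clusters)]:
    print(f"{name:16s} Davies-Bouldin = {davies_bouldin_score(customer_df, labels):.3f}"
          f"  Calinski-Harabasz = {calinski_harabasz_score(customer_df, labels):.1f}")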
"""- From the above analysis,we saw that "K-Means" algorithm has high silhouette score as compared to others. So we'll apply K-Means algorithm for our model.
### 6.7. Cluster Visualization
"""
customer_df["label"] = kmeans_clusters
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(customer_df.Age[customer_df.label == 0], customer_df["Annual Income (k$)"][customer_df.label == 0], customer_df["Spending Score (1-100)"][customer_df.label == 0], c='blue', s=60)
ax.scatter(customer_df.Age[customer_df.label == 1], customer_df["Annual Income (k$)"][customer_df.label == 1], customer_df["Spending Score (1-100)"][customer_df.label == 1], c='red', s=60)
ax.scatter(customer_df.Age[customer_df.label == 2], customer_df["Annual Income (k$)"][customer_df.label == 2], customer_df["Spending Score (1-100)"][customer_df.label == 2], c='green', s=60)
ax.scatter(customer_df.Age[customer_df.label == 3], customer_df["Annual Income (k$)"][customer_df.label == 3], customer_df["Spending Score (1-100)"][customer_df.label == 3], c='orange', s=60)
ax.scatter(customer_df.Age[customer_df.label == 4], customer_df["Annual Income (k$)"][customer_df.label == 4], customer_df["Spending Score (1-100)"][customer_df.label == 4], c='purple', s=60)
ax.view_init(30, 185)
plt.xlabel("Age")
plt.ylabel("Annual Income (k$)")
ax.set_zlabel('Spending Score (1-100)')
plt.show()
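"""A short follow-up sketch (an addition, not part of the original notebook):
mean feature values per cluster, which is the kind of segment summary the
marketing team can act on."""
print(customer_df.groupby('label').mean().round(1))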