AllLife Bank wants to focus on its credit card customer base in the next financial year. They have been advised by their marketing research team that the penetration in the market can be improved. Based on this input, the Marketing team proposes to run personalized campaigns to target new customers as well as upsell to existing customers. Another insight from the market research was that customers perceive the support services of the bank poorly. Based on this, the Operations team wants to upgrade the service delivery model to ensure that customers' queries are resolved faster. The Head of Marketing and the Head of Delivery both decide to reach out to the Data Science team for help.
Identify different segments in the existing customer base based on their spending patterns as well as their past interactions with the bank.
The data consists of various customers of the bank, with their average credit limit, the total number of credit cards each customer holds, and the different channels through which the customer has contacted the bank for queries. These channels include visiting the bank, online interactions, and calls to the call centre.
#Import all the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#to scale the data using z-score
from sklearn.preprocessing import StandardScaler
#importing clustering algorithms
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
#if the import below gives an error, uncomment the following line to install the scikit-learn-extra library
#!pip install scikit-learn-extra
from sklearn_extra.cluster import KMedoids
import warnings
warnings.filterwarnings("ignore")
data = pd.read_excel('Credit Card Customer Data.xlsx')
data.head()
  | Sl_No | Customer Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made
---|---|---|---|---|---|---|---
0 | 1 | 87073 | 100000 | 2 | 1 | 1 | 0 |
1 | 2 | 38414 | 50000 | 3 | 0 | 10 | 9 |
2 | 3 | 17341 | 50000 | 7 | 1 | 3 | 4 |
3 | 4 | 40496 | 30000 | 5 | 1 | 1 | 4 |
4 | 5 | 47437 | 100000 | 6 | 0 | 12 | 3 |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Sl_No                660 non-null    int64
 1   Customer Key         660 non-null    int64
 2   Avg_Credit_Limit     660 non-null    int64
 3   Total_Credit_Cards   660 non-null    int64
 4   Total_visits_bank    660 non-null    int64
 5   Total_visits_online  660 non-null    int64
 6   Total_calls_made     660 non-null    int64
dtypes: int64(7)
memory usage: 36.2 KB
Observations:
There are no missing values. Let us now check the number of unique values in each column.
data.nunique()
Sl_No                  660
Customer Key           655
Avg_Credit_Limit       110
Total_Credit_Cards      10
Total_visits_bank        6
Total_visits_online     16
Total_calls_made        11
dtype: int64
Customer Key has only 655 unique values across 660 rows, so a few keys appear more than once. Let us identify them.
# Identify the duplicated customer keys
duplicate_keys = data[data["Customer Key"].duplicated()]
duplicate_keys
  | Sl_No | Customer Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made
---|---|---|---|---|---|---|---
332 | 333 | 47437 | 17000 | 7 | 3 | 1 | 0 |
398 | 399 | 96929 | 67000 | 6 | 2 | 2 | 2 |
432 | 433 | 37252 | 59000 | 6 | 2 | 1 | 2 |
541 | 542 | 50706 | 60000 | 7 | 5 | 2 | 2 |
632 | 633 | 97935 | 187000 | 7 | 1 | 7 | 0 |
data[data["Customer Key"]==47437]
  | Sl_No | Customer Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made
---|---|---|---|---|---|---|---
4 | 5 | 47437 | 100000 | 6 | 0 | 12 | 3 |
332 | 333 | 47437 | 17000 | 7 | 3 | 1 | 0 |
# Drop rows with duplicated customer keys, keeping the first occurrence
data = data[data["Customer Key"].duplicated()==False]
We have done some basic checks. Now, let's drop the variables that are not required for our analysis.
data.shape
(655, 7)
data.drop(columns = ['Sl_No', 'Customer Key'], inplace = True)
Now that we have dropped the unnecessary columns, we can again check for duplicates. Duplicates here would mean customers with identical feature values.
data[data.duplicated()]
  | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made
---|---|---|---|---|---
162 | 8000 | 2 | 0 | 3 | 4 |
175 | 6000 | 1 | 0 | 2 | 5 |
215 | 8000 | 4 | 0 | 4 | 7 |
295 | 10000 | 6 | 4 | 2 | 3 |
324 | 9000 | 4 | 5 | 0 | 4 |
361 | 18000 | 6 | 3 | 1 | 4 |
378 | 12000 | 6 | 5 | 2 | 1 |
385 | 8000 | 7 | 4 | 2 | 0 |
395 | 5000 | 4 | 5 | 0 | 1 |
455 | 47000 | 6 | 2 | 0 | 4 |
497 | 52000 | 4 | 2 | 1 | 2 |
We can drop these duplicated rows from the data.
data=data[~data.duplicated()]
data.shape
(644, 5)
data.describe().T
  | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---
Avg_Credit_Limit | 644.0 | 34543.478261 | 37428.704286 | 3000.0 | 11000.0 | 18000.0 | 48000.00 | 200000.0 |
Total_Credit_Cards | 644.0 | 4.694099 | 2.175338 | 1.0 | 3.0 | 5.0 | 6.00 | 10.0 |
Total_visits_bank | 644.0 | 2.395963 | 1.626964 | 0.0 | 1.0 | 2.0 | 4.00 | 5.0 |
Total_visits_online | 644.0 | 2.624224 | 2.957728 | 0.0 | 1.0 | 2.0 | 4.00 | 15.0 |
Total_calls_made | 644.0 | 3.608696 | 2.880025 | 0.0 | 1.0 | 3.0 | 5.25 | 10.0 |
Observations:
Avg_Credit_Limit ranges from 3,000 to 200,000, and its mean (~34,543) is well above its median (18,000), indicating a right-skewed distribution.
Customers hold between 1 and 10 credit cards, with a median of 5.
Total_visits_online has a median of 2 but a maximum of 15, so a small set of customers is very active online.
# Plot the distribution (histogram and boxplot) of each feature along with its skewness
for col in data.columns:
    print(col)
    print('Skew :', round(data[col].skew(), 2))
    plt.figure(figsize=(15, 4))
    plt.subplot(1, 2, 1)
    data[col].hist()
    plt.ylabel('count')
    plt.subplot(1, 2, 2)
    sns.boxplot(x=data[col])
    plt.show()
Avg_Credit_Limit Skew : 2.19
Total_Credit_Cards Skew : 0.17
Total_visits_bank Skew : 0.15
Total_visits_online Skew : 2.21
Total_calls_made Skew : 0.65
Observation:
Avg_Credit_Limit (skew 2.19) and Total_visits_online (skew 2.21) are strongly right-skewed, while Total_Credit_Cards, Total_visits_bank, and Total_calls_made are closer to symmetric.
Now, let's check the correlation among different variables.
plt.figure(figsize=(8,8))
sns.heatmap(data.corr(), annot=True, fmt='0.2f')
plt.show()
Observation:
Next, let us scale the data using z-scores so that all features are on a comparable scale before applying distance-based clustering.
scaler=StandardScaler()
data_scaled=pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
data_scaled.head()
  | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made
---|---|---|---|---|---
0 | 1.750192 | -1.239437 | -0.858684 | -0.549573 | -1.253982 |
1 | 0.413280 | -0.779381 | -1.473803 | 2.495669 | 1.873420 |
2 | 0.413280 | 1.060843 | -0.858684 | 0.127148 | 0.135974 |
3 | -0.121485 | 0.140731 | -0.858684 | -0.549573 | 0.135974 |
4 | 1.750192 | 0.600787 | -1.473803 | 3.172390 | -0.211515 |
#Creating copy of the data to store labels from each algorithm
data_scaled_copy = data_scaled.copy(deep=True)
data_scaled_copy.shape
(644, 5)
Let us now fit the K-Means algorithm on our scaled data and find the optimal number of clusters using the elbow method.
We will do this in 3 steps:
1. Initialize an empty dictionary to store the SSE (inertia) for each value of k.
2. Iterate over a range of k values, fit K-Means on the scaled data, and store the inertia for each k.
3. Plot SSE against k and look for the elbow point.
# step 1 - initialize an empty dictionary to store the SSE (inertia) for each k
sse = {}
# step 2 - iterate over a range of k values, fit K-Means on the scaled data,
# and store the inertia value for each k
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=1).fit(data_scaled)
    sse[k] = kmeans.inertia_
# step 3 - plot SSE against k to locate the elbow point
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()), 'bx-')
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()
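As an optional cross-check on the elbow plot (not part of the original notebook), we can also look at average silhouette scores over the same range of k; a higher score suggests tighter, better-separated clusters. This sketch assumes data_scaled from above is available.
#Optional: average silhouette score for k = 2 to 9 (silhouette needs at least 2 clusters)
from sklearn.metrics import silhouette_score
sil_scores = {}
for k in range(2, 10):
    labels = KMeans(n_clusters=k, max_iter=1000, random_state=1).fit_predict(data_scaled)
    sil_scores[k] = silhouette_score(data_scaled, labels)
plt.figure()
plt.plot(list(sil_scores.keys()), list(sil_scores.values()), 'bx-')
plt.xlabel("Number of clusters")
plt.ylabel("Average silhouette score")
plt.show()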
# As per the elbow plot, the elbow point is at k = 3, which implies that adding more clusters will not help much.
kmeans = KMeans(n_clusters=3, max_iter=1000, random_state=1) #Apply the K-Means algorithm
kmeans.fit(data_scaled) #Fit the kmeans function on the scaled data
#Adding predicted labels to the original data and scaled data
data_scaled_copy['Labels'] = kmeans.predict(data_scaled) #Save the predictions on the scaled data from K-Means
data['Labels'] = kmeans.predict(data_scaled) #Add the same predictions to the original (unscaled) data
print(data_scaled_copy.shape)
print(data.shape)
(644, 6)
(644, 6)
print(data_scaled_copy.Labels.value_counts())
print(data.Labels.value_counts())
1    374
0    221
2     49
Name: Labels, dtype: int64
1    374
0    221
2     49
Name: Labels, dtype: int64
data_scaled_copy.head()
  | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | Labels
---|---|---|---|---|---|---
0 | 1.750192 | -1.239437 | -0.858684 | -0.549573 | -1.253982 | 1 |
1 | 0.413280 | -0.779381 | -1.473803 | 2.495669 | 1.873420 | 0 |
2 | 0.413280 | 1.060843 | -0.858684 | 0.127148 | 0.135974 | 1 |
3 | -0.121485 | 0.140731 | -0.858684 | -0.549573 | 0.135974 | 1 |
4 | 1.750192 | 0.600787 | -1.473803 | 3.172390 | -0.211515 | 2 |
We have generated the labels with k-means. Let us look at the various features based on the labels.
#Number of observations in each cluster
data.Labels.value_counts()
1    374
0    221
2     49
Name: Labels, dtype: int64
#Calculating summary statistics of the original data for each label
mean = data.groupby('Labels').mean()
median = data.groupby('Labels').median()
df_kmeans = pd.concat([mean, median], axis=0)
df_kmeans.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_kmeans.T
  | group_0 Mean | group_1 Mean | group_2 Mean | group_0 Median | group_1 Median | group_2 Median
---|---|---|---|---|---|---
Avg_Credit_Limit | 12239.819005 | 33893.048128 | 140102.040816 | 12000.0 | 31500.0 | 145000.0 |
Total_Credit_Cards | 2.411765 | 5.508021 | 8.775510 | 2.0 | 6.0 | 9.0 |
Total_visits_bank | 0.945701 | 3.489305 | 0.591837 | 1.0 | 3.0 | 1.0 |
Total_visits_online | 3.561086 | 0.975936 | 10.979592 | 4.0 | 1.0 | 11.0 |
Total_calls_made | 6.891403 | 1.997326 | 1.102041 | 7.0 | 2.0 | 1.0 |
#Visualizing different features w.r.t K-means labels
data_scaled_copy.boxplot(by = 'Labels', layout = (1,5),figsize=(20,7))
plt.show()
Cluster Profiles:
Group 0 (221 customers): low average credit limit (~12,240), around 2 credit cards, and the highest number of calls made to the bank.
Group 1 (374 customers): moderate credit limit (~33,893), around 5-6 credit cards, and the most in-person bank visits.
Group 2 (49 customers): very high credit limit (~140,102), the most credit cards (~9), heavy online activity, and very few calls or branch visits.
Let's create clusters using Gaussian Mixture Models
gmm = GaussianMixture(n_components=3, random_state=1) #Apply the Gaussian Mixture algorithm
gmm.fit(data_scaled) #Fit the gmm function on the scaled data
data_scaled_copy['GmmLabels'] = gmm.predict(data_scaled)
data['GmmLabels'] = gmm.predict(data_scaled)
#Number of observations in each cluster
data.GmmLabels.value_counts()
1    374
0    221
2     49
Name: GmmLabels, dtype: int64
#Calculating summary statistics of the original data for each label
original_features = ["Avg_Credit_Limit","Total_Credit_Cards","Total_visits_bank","Total_visits_online","Total_calls_made"]
mean = data.groupby('GmmLabels').mean()
median = data.groupby('GmmLabels').median()
df_gmm = pd.concat([mean, median], axis=0)
df_gmm.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_gmm[original_features].T
  | group_0 Mean | group_1 Mean | group_2 Mean | group_0 Median | group_1 Median | group_2 Median
---|---|---|---|---|---|---
Avg_Credit_Limit | 12239.819005 | 33893.048128 | 140102.040816 | 12000.0 | 31500.0 | 145000.0 |
Total_Credit_Cards | 2.411765 | 5.508021 | 8.775510 | 2.0 | 6.0 | 9.0 |
Total_visits_bank | 0.945701 | 3.489305 | 0.591837 | 1.0 | 3.0 | 1.0 |
Total_visits_online | 3.561086 | 0.975936 | 10.979592 | 4.0 | 1.0 | 11.0 |
Total_calls_made | 6.891403 | 1.997326 | 1.102041 | 7.0 | 2.0 | 1.0 |
# plotting boxplots with the new GMM based labels
features_with_labels = ["Avg_Credit_Limit","Total_Credit_Cards","Total_visits_bank","Total_visits_online","Total_calls_made","GmmLabels"]
data_scaled_copy[features_with_labels].boxplot(by = 'GmmLabels', layout = (1,5),figsize=(20,7))
plt.show()
Cluster Profiles:
The GMM cluster profiles mirror the K-Means profiles: the group sizes (374, 221, 49) and the group-wise means and medians match exactly.
Comparing Clusters:
K-Means and GMM assign the customers to essentially the same three segments here; this can be verified with a quick cross-tabulation of the two label columns, sketched below.
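A minimal check, not part of the original notebook, assuming the Labels and GmmLabels columns created above: if each row of the cross-tabulation has a single non-zero cell, the two algorithms produced the same grouping (possibly with different label numbers).
#Cross-tabulate the K-Means and GMM cluster assignments
pd.crosstab(data['Labels'], data['GmmLabels'])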
kmedo = KMedoids(n_clusters = 3, random_state=1) #Apply the K-Medoids algorithm
kmedo.fit(data_scaled) #Fit the kmedo function on the scaled data
data_scaled_copy['kmedoLabels'] = kmedo.predict(data_scaled)
data['kmedoLabels'] = kmedo.predict(data_scaled)
#Number of observations in each cluster
data.kmedoLabels.value_counts()
2    289
0    222
1    133
Name: kmedoLabels, dtype: int64
#Calculating summary statistics of the original data for each label
mean = data.groupby('kmedoLabels').mean()
median = data.groupby('kmedoLabels').median()
df_kmedoids = pd.concat([mean, median], axis=0)
df_kmedoids.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_kmedoids[original_features].T
  | group_0 Mean | group_1 Mean | group_2 Mean | group_0 Median | group_1 Median | group_2 Median
---|---|---|---|---|---|---
Avg_Credit_Limit | 12216.216216 | 85052.631579 | 28449.826990 | 12000.0 | 68000.0 | 20000.0 |
Total_Credit_Cards | 2.423423 | 7.030075 | 5.363322 | 2.0 | 7.0 | 5.0 |
Total_visits_bank | 0.950450 | 1.691729 | 3.830450 | 1.0 | 2.0 | 4.0 |
Total_visits_online | 3.554054 | 4.639098 | 0.982699 | 4.0 | 2.0 | 1.0 |
Total_calls_made | 6.878378 | 1.969925 | 1.851211 | 7.0 | 2.0 | 2.0 |
#plotting boxplots with the new K-Medoids based labels
features_with_labels = ["Avg_Credit_Limit", "Total_Credit_Cards","Total_visits_bank","Total_visits_online","Total_calls_made","kmedoLabels"]
data_scaled_copy[features_with_labels].boxplot(by = 'kmedoLabels', layout = (1,5),figsize=(20,7))
plt.show()
Let's compare the clusters from K-Means and K-Medoids
# Concatenate the K-Medoids and K-Means summaries side by side;
# for each feature, the first column comes from K-Medoids and the second from K-Means
comparison = pd.concat([df_kmedoids, df_kmeans], axis=1)[original_features]
comparison
  | Avg_Credit_Limit (K-Medoids) | Avg_Credit_Limit (K-Means) | Total_Credit_Cards (K-Medoids) | Total_Credit_Cards (K-Means) | Total_visits_bank (K-Medoids) | Total_visits_bank (K-Means) | Total_visits_online (K-Medoids) | Total_visits_online (K-Means) | Total_calls_made (K-Medoids) | Total_calls_made (K-Means)
---|---|---|---|---|---|---|---|---|---|---
group_0 Mean | 12216.216216 | 12239.819005 | 2.423423 | 2.411765 | 0.950450 | 0.945701 | 3.554054 | 3.561086 | 6.878378 | 6.891403 |
group_1 Mean | 85052.631579 | 33893.048128 | 7.030075 | 5.508021 | 1.691729 | 3.489305 | 4.639098 | 0.975936 | 1.969925 | 1.997326 |
group_2 Mean | 28449.826990 | 140102.040816 | 5.363322 | 8.775510 | 3.830450 | 0.591837 | 0.982699 | 10.979592 | 1.851211 | 1.102041 |
group_0 Median | 12000.000000 | 12000.000000 | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 4.000000 | 4.000000 | 7.000000 | 7.000000 |
group_1 Median | 68000.000000 | 31500.000000 | 7.000000 | 6.000000 | 2.000000 | 3.000000 | 2.000000 | 1.000000 | 2.000000 | 2.000000 |
group_2 Median | 20000.000000 | 145000.000000 | 5.000000 | 9.000000 | 4.000000 | 1.000000 | 1.000000 | 11.000000 | 2.000000 | 1.000000 |
Cluster Profiles:
Both algorithms recover a low-credit-limit, high-call-volume segment of roughly the same size (222 customers for K-Medoids vs 221 for K-Means). They differ on the remaining customers: K-Medoids splits them into a 133-customer higher-limit group (mean limit ~85,000) and a 289-customer mid-limit, branch-visiting group, while K-Means isolates a small 49-customer group with very high credit limits (mean ~140,000) and heavy online activity.
Comparing Clusters:
Based on the profile table above, K-Means separates the very-high-limit, online-heavy customers into their own segment, whereas K-Medoids merges most of them with other higher-limit customers. A rough numerical comparison of the three solutions is sketched below.
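As an optional check, not part of the original analysis, we can compute the average silhouette score of each algorithm's final labels on the scaled data; higher values indicate tighter, better-separated clusters. This assumes data_scaled and the three label columns created above.
#Average silhouette score for each algorithm's final labels on the scaled data
from sklearn.metrics import silhouette_score
for name, labels in [('K-Means', data['Labels']),
                     ('GMM', data['GmmLabels']),
                     ('K-Medoids', data['kmedoLabels'])]:
    print(name, round(silhouette_score(data_scaled, labels), 3))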