The objective of this problem is to explore the data, reduce the number of features using dimensionality reduction techniques such as PCA and t-SNE, and generate meaningful insights.
There are 8 variables in the data:

- mpg: miles per gallon (fuel efficiency)
- cylinders: number of cylinders
- displacement: engine displacement (in cubic inches)
- horsepower: engine horsepower
- weight: vehicle weight (in lbs.)
- acceleration: time to accelerate from 0 to 60 mph (in seconds)
- model year: model year (last two digits)
- car name: make and model of the car
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#to scale the data using z-score
from sklearn.preprocessing import StandardScaler
#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
data = pd.read_csv("auto-mpg.csv")
data.head()
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model year | car name |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | chevrolet chevelle malibu |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | buick skylark 320 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | plymouth satellite |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | amc rebel sst |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | ford torino |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object
 4   weight        398 non-null    int64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64
 7   car name      398 non-null    object
dtypes: float64(3), int64(3), object(2)
memory usage: 25.0+ KB
Observations:

- The data has 398 rows and 8 columns, with no null values reported.
- horsepower is stored as object even though it should be numeric, so the column likely contains non-numeric entries that need investigation.
- car name is a text column; let's check how many distinct values it has.
data["car name"].nunique()
305
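With 305 distinct names in 398 rows, car name behaves almost like a row identifier and is dropped below. If brand-level information were wanted instead, one could extract the first token of the name; a minimal sketch (not part of the original analysis):

# Hypothetical alternative: derive a manufacturer label from the first word
# of 'car name' instead of discarding the column entirely.
manufacturer = data['car name'].str.split().str[0]
print(manufacturer.value_counts().head())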
# keeping a copy of the original data, then dropping 'car name'
data1 = data.copy()
data = data.drop(['car name'], axis=1)
# checking if there are values other than digits in the column 'horsepower'
hpIsDigit = pd.DataFrame(data.horsepower.str.isdigit())  # True if the string is made up of digits, else False
# show only the rows where horsepower is not purely numeric
data[hpIsDigit['horsepower'] == False]
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model year |
|---|---|---|---|---|---|---|---|
| 32 | 25.0 | 4 | 98.0 | ? | 2046 | 19.0 | 71 |
| 126 | 21.0 | 6 | 200.0 | ? | 2875 | 17.0 | 74 |
| 330 | 40.9 | 4 | 85.0 | ? | 1835 | 17.3 | 80 |
| 336 | 23.6 | 4 | 140.0 | ? | 2905 | 14.3 | 80 |
| 354 | 34.5 | 4 | 100.0 | ? | 2320 | 15.8 | 81 |
| 374 | 23.0 | 4 | 151.0 | ? | 3035 | 20.5 | 82 |
Observations:

- Six rows use '?' as a placeholder for missing horsepower values, which is why pandas inferred the object dtype. These need to be converted to proper missing values.
# Replacing '?' with np.nan
data = data.replace('?', np.nan)
data[hpIsDigit['horsepower'] == False]
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model year |
|---|---|---|---|---|---|---|---|
| 32 | 25.0 | 4 | 98.0 | NaN | 2046 | 19.0 | 71 |
| 126 | 21.0 | 6 | 200.0 | NaN | 2875 | 17.0 | 74 |
| 330 | 40.9 | 4 | 85.0 | NaN | 1835 | 17.3 | 80 |
| 336 | 23.6 | 4 | 140.0 | NaN | 2905 | 14.3 | 80 |
| 354 | 34.5 | 4 | 100.0 | NaN | 2320 | 15.8 | 81 |
| 374 | 23.0 | 4 | 151.0 | NaN | 3035 | 20.5 | 82 |
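As an aside, the replace-and-convert steps could be collapsed into a single coercion with pd.to_numeric; a minimal sketch of the equivalent (the notebook instead proceeds step by step below):

# pd.to_numeric with errors='coerce' turns any non-numeric string such as '?'
# into NaN and converts the column to a float dtype in one step
data['horsepower'] = pd.to_numeric(data['horsepower'], errors='coerce')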
# converting horsepower from object to float, then imputing the missing values with the median
data['horsepower'] = data['horsepower'].astype('float64')  # median() is not defined for object dtype
data['horsepower'] = data['horsepower'].fillna(data['horsepower'].median())
data.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| mpg | 398.0 | 23.514573 | 7.815984 | 9.0 | 17.500 | 23.0 | 29.000 | 46.6 |
| cylinders | 398.0 | 5.454774 | 1.701004 | 3.0 | 4.000 | 4.0 | 8.000 | 8.0 |
| displacement | 398.0 | 193.425879 | 104.269838 | 68.0 | 104.250 | 148.5 | 262.000 | 455.0 |
| horsepower | 398.0 | 104.304020 | 38.222625 | 46.0 | 76.000 | 93.5 | 125.000 | 230.0 |
| weight | 398.0 | 2970.424623 | 846.841774 | 1613.0 | 2223.750 | 2803.5 | 3608.000 | 5140.0 |
| acceleration | 398.0 | 15.568090 | 2.757689 | 8.0 | 13.825 | 15.5 | 17.175 | 24.8 |
| model year | 398.0 | 76.010050 | 3.697627 | 70.0 | 73.000 | 76.0 | 79.000 | 82.0 |
Observations:

- mpg ranges from 9 to 46.6 with a mean of about 23.5.
- horsepower now has a full count of 398; its median (93.5) was used for imputation.
- The means of displacement, horsepower, and weight exceed their medians, suggesting right-skewed distributions.
- model year runs from 70 to 82, i.e., 1970 to 1982.
# plot a histogram and a boxplot for every column
for col in data.columns:
    plt.figure(figsize=(15, 4))
    plt.subplot(1, 2, 1)
    print(col)
    data[col].hist()
    plt.ylabel('frequency')
    plt.xlabel(col)
    plt.subplot(1, 2, 2)
    sns.boxplot(x=data[col])
    plt.show()
[Output: one histogram and boxplot pair per column — mpg, cylinders, displacement, horsepower, weight, acceleration, model year]
Observations:

- displacement, horsepower, and weight are right-skewed, consistent with their means exceeding their medians in the summary table above.
- acceleration is roughly symmetric; the boxplots help flag potential outliers at both ends.
- cylinders and model year are discrete variables rather than truly continuous ones.
plt.figure(figsize=(8,8))
sns.heatmap(data.corr(), annot=True)
plt.show()
Observations:

- mpg is strongly negatively correlated with cylinders, displacement, horsepower, and weight: heavier, more powerful cars are less fuel-efficient.
- cylinders, displacement, horsepower, and weight are strongly positively correlated with one another, indicating substantial redundancy that dimensionality reduction can exploit.
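The same relationships can be read numerically by sorting each feature's correlation with mpg; a small sketch:

# features at the negative end are the strongest inverse drivers of fuel efficiency
print(data.corr()['mpg'].sort_values())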
# scaling the data using z-scores
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
data_scaled.head()
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model year |
|---|---|---|---|---|---|---|---|
| 0 | -0.706439 | 1.498191 | 1.090604 | 0.673118 | 0.630870 | -1.295498 | -1.627426 |
| 1 | -1.090751 | 1.498191 | 1.503514 | 1.589958 | 0.854333 | -1.477038 | -1.627426 |
| 2 | -0.706439 | 1.498191 | 1.196232 | 1.197027 | 0.550470 | -1.658577 | -1.627426 |
| 3 | -0.962647 | 1.498191 | 1.061796 | 1.197027 | 0.546923 | -1.295498 | -1.627426 |
| 4 | -0.834543 | 1.498191 | 1.042591 | 0.935072 | 0.565841 | -1.840117 | -1.627426 |
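A quick sanity check that the z-scoring worked: each scaled column should now have mean approximately 0 and standard deviation approximately 1.

# means should be ~0 and standard deviations ~1 after standardization
print(data_scaled.mean().round(3))
print(data_scaled.std().round(3))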
# defining the number of principal components to generate
n = data_scaled.shape[1]
# finding the principal components of the data
pca = PCA(n_components=n, random_state=1)
data_pca1 = pd.DataFrame(pca.fit_transform(data_scaled))  # fit and transform PCA on the scaled data
print(data_pca1)
            0         1         2         3         4         5         6
0    2.661556  0.918577 -0.558420  0.740000 -0.549433 -0.089079 -0.118566
1    3.523307  0.789779 -0.670658  0.493223 -0.025134  0.203588  0.101518
2    2.998309  0.861604 -0.982108  0.715598 -0.281324  0.137351 -0.055167
3    2.937560  0.949168 -0.607196  0.531084 -0.272607  0.295916 -0.121296
4    2.930688  0.931822 -1.078890  0.558607 -0.543871  0.007707 -0.167301
..        ...       ...       ...       ...       ...       ...       ...
393 -1.420970 -1.225252 -0.286402 -0.671666  0.054472 -0.187878  0.101922
394 -4.094686 -1.279998  1.960384  1.375464  0.740606  0.175097  0.087391
395 -1.547254 -1.252540 -1.906999 -0.323768 -0.255922 -0.254531  0.149028
396 -2.022942 -1.132137  0.609384 -0.464327  0.186656  0.089169  0.075018
397 -2.182941 -1.236714  0.787268 -0.157523  0.468128  0.025712  0.001612

[398 rows x 7 columns]

# the percentage of variance explained by each principal component
exp_var = pca.explained_variance_ratio_
# visualize the cumulative explained variance across components
plt.figure(figsize=(10, 10))
plt.plot(range(1, 8), exp_var.cumsum(), marker='o', linestyle='--')
plt.title("Cumulative Explained Variance by Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.show()
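For completeness, the per-component (non-cumulative) shares can be shown as a bar chart; a minimal sketch:

# bar chart of the variance explained by each individual principal component
plt.figure(figsize=(8, 5))
plt.bar(range(1, 8), exp_var)
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance Ratio")
plt.show()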
# find the smallest number of components that explain more than 90% of the variance
total_var = 0
for ix, i in enumerate(exp_var):
    total_var += i  # avoid shadowing the built-in sum()
    if total_var > 0.90:
        print("Number of PCs that explain at least 90% variance:", ix + 1)
        break
Number of PCs that explain at least 90% variance: 3
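The same count can be computed without an explicit loop; a small NumPy alternative:

# the cumulative sum first exceeds 0.90 at index 2, hence 3 components
n_components_90 = np.argmax(np.cumsum(exp_var) > 0.90) + 1
print(n_components_90)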
Observations:

- The first three principal components together explain more than 90% of the variance, so the seven scaled features can be reduced to three components with little loss of information.
pc_comps = ['PC1','PC2','PC3']
data_pca = pd.DataFrame(np.round(pca.components_[:3,:],2),index=pc_comps,columns=data_scaled.columns)
data_pca.T
| | PC1 | PC2 | PC3 |
|---|---|---|---|
| mpg | -0.40 | -0.21 | -0.26 |
| cylinders | 0.42 | -0.19 | 0.14 |
| displacement | 0.43 | -0.18 | 0.10 |
| horsepower | 0.42 | -0.09 | -0.17 |
| weight | 0.41 | -0.22 | 0.28 |
| acceleration | -0.28 | 0.02 | 0.89 |
| model year | -0.23 | -0.91 | -0.02 |
# highlight the strongest loadings in each component
def color_high(val):
    if val <= -0.40:  # thresholds chosen by judgment; adjust as needed
        return 'background: pink'
    elif val >= 0.40:
        return 'background: skyblue'

data_pca.T.style.applymap(color_high)
[Output: the same loadings table, with loadings ≤ -0.40 highlighted in pink and loadings ≥ 0.40 highlighted in sky blue]
Observations:

- PC1 loads positively on cylinders, displacement, horsepower, and weight and negatively on mpg, so it can be read as an overall size/power dimension: large, powerful cars score high, fuel-efficient cars score low.
- PC2 is dominated by model year (loading -0.91).
- PC3 is dominated by acceleration (loading 0.89).
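To quantify how much information three components retain, the data can be projected onto them and mapped back to the original feature space; a minimal sketch using a fresh 3-component PCA (an illustration, not part of the original flow):

# reconstruct the scaled data from only 3 components and measure the error
pca3 = PCA(n_components=3, random_state=1)
reduced = pca3.fit_transform(data_scaled)
reconstructed = pca3.inverse_transform(reduced)
mse = ((data_scaled.values - reconstructed) ** 2).mean()
print("Mean squared reconstruction error:", round(mse, 4))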
plt.figure(figsize = (7,7))
sns.scatterplot(x=data_pca1[0],y=data_pca1[1])
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
Let's add hue to the scatter plot to see whether the components separate the cars by number of cylinders.
df_concat = pd.concat([data_pca1, data], axis=1)
plt.figure(figsize = (7,7))
# scatter plot of the first two principal components (columns 0 and 1 of df_concat), colored by number of cylinders
sns.scatterplot(x=0, y=1, data=df_concat, hue='cylinders')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
Observations:

- The cars separate along PC1 by number of cylinders: 8-cylinder cars sit at high PC1 values and 4-cylinder cars at low values, consistent with PC1 being a size/power dimension.
tsne = TSNE(n_components=2, random_state=1)
data_tsne = tsne.fit_transform(data_scaled)  # fit and transform t-SNE on the scaled data
data_tsne.shape
(398, 2)
data_tsne = pd.DataFrame(data = data_tsne, columns = ['Component 1', 'Component 2'])
data_tsne.head()
| | Component 1 | Component 2 |
|---|---|---|
| 0 | -38.088413 | -15.912958 |
| 1 | -37.404369 | -17.995850 |
| 2 | -38.050472 | -17.063194 |
| 3 | -37.718334 | -16.476006 |
| 4 | -38.404663 | -16.763493 |
sns.scatterplot(x=data_tsne.iloc[:, 0], y=data_tsne.iloc[:, 1])
plt.show()
# Let's see a scatter plot of the t-SNE components, colored by number of cylinders
sns.scatterplot(x=data_tsne.iloc[:, 0], y=data_tsne.iloc[:, 1], hue=data.cylinders)
plt.show()
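t-SNE output depends on the perplexity hyperparameter (scikit-learn's default is 30), so it is worth checking that the cluster structure is stable across settings; a minimal sketch:

# re-run the embedding at a few perplexity values and compare the pictures
for perp in [5, 30, 50]:
    emb = TSNE(n_components=2, perplexity=perp, random_state=1).fit_transform(data_scaled)
    sns.scatterplot(x=emb[:, 0], y=emb[:, 1], hue=data.cylinders)
    plt.title(f"perplexity = {perp}")
    plt.show()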
Observations:

- The t-SNE embedding separates the cars into a few distinct clusters, and the clusters largely correspond to the number of cylinders.
# Let's assign each point to one of 3 groups based on its t-SNE coordinates
def grouping(x):
    first_component = x['Component 1']
    second_component = x['Component 2']
    if (first_component > 0) and (second_component > 0):
        return 'group_1'
    if (first_component > -20) and (first_component < 5):
        return 'group_2'
    else:
        return 'group_3'

data_tsne['groups'] = data_tsne.apply(grouping, axis=1)
sns.scatterplot(x=data_tsne.iloc[:, 0], y=data_tsne.iloc[:, 1], hue=data_tsne['groups'])
plt.show()
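A quick count of how many cars landed in each group:

# group sizes after the manual assignment
print(data_tsne['groups'].value_counts())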
data['groups'] = data_tsne['groups']
all_col = data.columns.tolist()

plt.figure(figsize=(20, 20))
for i, variable in enumerate(all_col):
    if i == 7:  # stop before the 'groups' column itself
        break
    plt.subplot(4, 2, i + 1)
    # boxplot with groups on the x-axis and the variable on the y-axis
    sns.boxplot(x=data['groups'], y=data[variable])
    plt.title(variable)
plt.tight_layout()
plt.show()
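As a numeric complement to the boxplots, group-wise means make the differences between the three groups explicit; a minimal sketch:

# compare the average of every numeric feature across the three groups
print(data.groupby('groups').mean(numeric_only=True))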
Observations: