The objective of this problem is to explore the data, reduce the number of features using dimensionality reduction techniques such as PCA and t-SNE, and generate meaningful insights.
There are 8 variables in the data:

- mpg: miles per gallon (fuel efficiency)
- cylinders: number of cylinders
- displacement: engine displacement (in cubic inches)
- horsepower: engine horsepower
- weight: vehicle weight (in lbs.)
- acceleration: time to accelerate from 0 to 60 mph (in seconds)
- model year: model year (last two digits)
- car name: make and model of the car
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#to scale the data using z-score
from sklearn.preprocessing import StandardScaler
#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
data = pd.read_csv("auto-mpg.csv")
data.head()
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model year | car name |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | chevrolet chevelle malibu |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | buick skylark 320 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | plymouth satellite |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | amc rebel sst |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | ford torino |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object
 4   weight        398 non-null    int64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64
 7   car name      398 non-null    object
dtypes: float64(3), int64(3), object(2)
memory usage: 25.0+ KB
Observations:

- The data has 398 rows and 8 columns, with no null values reported.
- horsepower is stored as object even though it should be numeric, so the column likely contains non-numeric entries that need investigation.
- car name is a text column; let's check how many distinct values it has.
data["car name"].nunique()
305
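With 305 distinct names in 398 rows, car name behaves almost like a row identifier and is dropped below. If brand-level information were wanted instead, one could extract the first token of the name; a minimal sketch (not part of the original analysis):

# Hypothetical alternative: derive a manufacturer label from the first word
# of 'car name' instead of discarding the column entirely.
manufacturer = data['car name'].str.split().str[0]
print(manufacturer.value_counts().head())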
# keeping a copy of the original data, then dropping 'car name'
data1 = data.copy()
data = data.drop(['car name'], axis=1)
# checking if there are values other than digits in the column 'horsepower'
hpIsDigit = pd.DataFrame(data.horsepower.str.isdigit())  # True if the string is made up of digits, else False
# show only the rows where horsepower is not purely numeric
data[hpIsDigit['horsepower'] == False]
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model year |
|---|---|---|---|---|---|---|---|
| 32 | 25.0 | 4 | 98.0 | ? | 2046 | 19.0 | 71 |
| 126 | 21.0 | 6 | 200.0 | ? | 2875 | 17.0 | 74 |
| 330 | 40.9 | 4 | 85.0 | ? | 1835 | 17.3 | 80 |
| 336 | 23.6 | 4 | 140.0 | ? | 2905 | 14.3 | 80 |
| 354 | 34.5 | 4 | 100.0 | ? | 2320 | 15.8 | 81 |
| 374 | 23.0 | 4 | 151.0 | ? | 3035 | 20.5 | 82 |
Observations:

- Six rows use '?' as a placeholder for missing horsepower values, which is why pandas inferred the object dtype. These need to be converted to proper missing values.
# Replacing '?' with np.nan
data = data.replace('?', np.nan)
data[hpIsDigit['horsepower'] == False]
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model year |
|---|---|---|---|---|---|---|---|
| 32 | 25.0 | 4 | 98.0 | NaN | 2046 | 19.0 | 71 |
| 126 | 21.0 | 6 | 200.0 | NaN | 2875 | 17.0 | 74 |
| 330 | 40.9 | 4 | 85.0 | NaN | 1835 | 17.3 | 80 |
| 336 | 23.6 | 4 | 140.0 | NaN | 2905 | 14.3 | 80 |
| 354 | 34.5 | 4 | 100.0 | NaN | 2320 | 15.8 | 81 |
| 374 | 23.0 | 4 | 151.0 | NaN | 3035 | 20.5 | 82 |
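As an aside, the replace-and-convert steps could be collapsed into a single coercion with pd.to_numeric; a minimal sketch of the equivalent (the notebook instead proceeds step by step below):

# pd.to_numeric with errors='coerce' turns any non-numeric string such as '?'
# into NaN and converts the column to a float dtype in one step
data['horsepower'] = pd.to_numeric(data['horsepower'], errors='coerce')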
# converting horsepower from object to float, then imputing the missing values with the median
data['horsepower'] = data['horsepower'].astype('float64')  # median() is not defined for object dtype
data['horsepower'] = data['horsepower'].fillna(data['horsepower'].median())
data.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| mpg | 398.0 | 23.514573 | 7.815984 | 9.0 | 17.500 | 23.0 | 29.000 | 46.6 |
| cylinders | 398.0 | 5.454774 | 1.701004 | 3.0 | 4.000 | 4.0 | 8.000 | 8.0 |
| displacement | 398.0 | 193.425879 | 104.269838 | 68.0 | 104.250 | 148.5 | 262.000 | 455.0 |
| horsepower | 398.0 | 104.304020 | 38.222625 | 46.0 | 76.000 | 93.5 | 125.000 | 230.0 |
| weight | 398.0 | 2970.424623 | 846.841774 | 1613.0 | 2223.750 | 2803.5 | 3608.000 | 5140.0 |
| acceleration | 398.0 | 15.568090 | 2.757689 | 8.0 | 13.825 | 15.5 | 17.175 | 24.8 |
| model year | 398.0 | 76.010050 | 3.697627 | 70.0 | 73.000 | 76.0 | 79.000 | 82.0 |
Observations:

- mpg ranges from 9 to 46.6 with a mean of about 23.5.
- horsepower now has a full count of 398; its median (93.5) was used for imputation.
- The means of displacement, horsepower, and weight exceed their medians, suggesting right-skewed distributions.
- model year runs from 70 to 82, i.e., 1970 to 1982.
# plot a histogram and a boxplot for every column
for col in data.columns:
    plt.figure(figsize=(15, 4))
    plt.subplot(1, 2, 1)
    print(col)
    data[col].hist()
    plt.ylabel('frequency')
    plt.xlabel(col)
    plt.subplot(1, 2, 2)
    sns.boxplot(x=data[col])
    plt.show()
[Output: one histogram and boxplot pair per column — mpg, cylinders, displacement, horsepower, weight, acceleration, model year]
Observations:

- displacement, horsepower, and weight are right-skewed, consistent with their means exceeding their medians in the summary table above.
- acceleration is roughly symmetric; the boxplots help flag potential outliers at both ends.
- cylinders and model year are discrete variables rather than truly continuous ones.
plt.figure(figsize=(8,8))
sns.heatmap(data.corr(), annot=True)
plt.show()
Observations:

- mpg is strongly negatively correlated with cylinders, displacement, horsepower, and weight: heavier, more powerful cars are less fuel-efficient.
- cylinders, displacement, horsepower, and weight are strongly positively correlated with one another, indicating substantial redundancy that dimensionality reduction can exploit.
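The same relationships can be read numerically by sorting each feature's correlation with mpg; a small sketch:

# features at the negative end are the strongest inverse drivers of fuel efficiency
print(data.corr()['mpg'].sort_values())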
# scaling the data using z-scores
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
data_scaled.head()
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model year |
|---|---|---|---|---|---|---|---|
| 0 | -0.706439 | 1.498191 | 1.090604 | 0.673118 | 0.630870 | -1.295498 | -1.627426 |
| 1 | -1.090751 | 1.498191 | 1.503514 | 1.589958 | 0.854333 | -1.477038 | -1.627426 |
| 2 | -0.706439 | 1.498191 | 1.196232 | 1.197027 | 0.550470 | -1.658577 | -1.627426 |
| 3 | -0.962647 | 1.498191 | 1.061796 | 1.197027 | 0.546923 | -1.295498 | -1.627426 |
| 4 | -0.834543 | 1.498191 | 1.042591 | 0.935072 | 0.565841 | -1.840117 | -1.627426 |
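A quick sanity check that the z-scoring worked: each scaled column should now have mean approximately 0 and standard deviation approximately 1.

# means should be ~0 and standard deviations ~1 after standardization
print(data_scaled.mean().round(3))
print(data_scaled.std().round(3))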
# defining the number of principal components to generate
n = data_scaled.shape[1]
# finding the principal components of the data
pca = PCA(n_components=n, random_state=1)
data_pca1 = pd.DataFrame(pca.fit_transform(data_scaled))  # fit and transform PCA on the scaled data
print(data_pca1)
            0         1         2         3         4         5         6
0    2.661556  0.918577 -0.558420  0.740000 -0.549433 -0.089079 -0.118566
1    3.523307  0.789779 -0.670658  0.493223 -0.025134  0.203588  0.101518
2    2.998309  0.861604 -0.982108  0.715598 -0.281324  0.137351 -0.055167
3    2.937560  0.949168 -0.607196  0.531084 -0.272607  0.295916 -0.121296
4    2.930688  0.931822 -1.078890  0.558607 -0.543871  0.007707 -0.167301
..        ...       ...       ...       ...       ...       ...       ...
393 -1.420970 -1.225252 -0.286402 -0.671666  0.054472 -0.187878  0.101922
394 -4.094686 -1.279998  1.960384  1.375464  0.740606  0.175097  0.087391
395 -1.547254 -1.252540 -1.906999 -0.323768 -0.255922 -0.254531  0.149028
396 -2.022942 -1.132137  0.609384 -0.464327  0.186656  0.089169  0.075018
397 -2.182941 -1.236714  0.787268 -0.157523  0.468128  0.025712  0.001612

[398 rows x 7 columns]

# the percentage of variance explained by each principal component
exp_var = pca.explained_variance_ratio_
# visualize the cumulative explained variance across components
plt.figure(figsize=(10, 10))
plt.plot(range(1, 8), exp_var.cumsum(), marker='o', linestyle='--')
plt.title("Cumulative Explained Variance by Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.show()
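For completeness, the per-component (non-cumulative) shares can be shown as a bar chart; a minimal sketch:

# bar chart of the variance explained by each individual principal component
plt.figure(figsize=(8, 5))
plt.bar(range(1, 8), exp_var)
plt.xlabel("Principal Component")
plt.ylabel("Explained Variance Ratio")
plt.show()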
# find the smallest number of components that explain more than 90% of the variance
total_var = 0
for ix, i in enumerate(exp_var):
    total_var += i  # avoid shadowing the built-in sum()
    if total_var > 0.90:
        print("Number of PCs that explain at least 90% variance:", ix + 1)
        break
Number of PCs that explain at least 90% variance: 3
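The same count can be computed without an explicit loop; a small NumPy alternative:

# the cumulative sum first exceeds 0.90 at index 2, hence 3 components
n_components_90 = np.argmax(np.cumsum(exp_var) > 0.90) + 1
print(n_components_90)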
Observations:

- The first three principal components together explain more than 90% of the variance, so the seven scaled features can be reduced to three components with little loss of information.
pc_comps = ['PC1','PC2','PC3']
data_pca = pd.DataFrame(np.round(pca.components_[:3,:],2),index=pc_comps,columns=data_scaled.columns)
data_pca.T
| | PC1 | PC2 | PC3 |
|---|---|---|---|
| mpg | -0.40 | -0.21 | -0.26 |
| cylinders | 0.42 | -0.19 | 0.14 |
| displacement | 0.43 | -0.18 | 0.10 |
| horsepower | 0.42 | -0.09 | -0.17 |
| weight | 0.41 | -0.22 | 0.28 |
| acceleration | -0.28 | 0.02 | 0.89 |
| model year | -0.23 | -0.91 | -0.02 |
# highlight the strongest loadings in each component
def color_high(val):
    if val <= -0.40:  # thresholds chosen by judgment; adjust as needed
        return 'background: pink'
    elif val >= 0.40:
        return 'background: skyblue'

data_pca.T.style.applymap(color_high)
[Output: the same loadings table, with loadings ≤ -0.40 highlighted in pink and loadings ≥ 0.40 highlighted in sky blue]
Observations:

- PC1 loads positively on cylinders, displacement, horsepower, and weight and negatively on mpg, so it can be read as an overall size/power dimension: large, powerful cars score high, fuel-efficient cars score low.
- PC2 is dominated by model year (loading -0.91).
- PC3 is dominated by acceleration (loading 0.89).
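To quantify how much information three components retain, the data can be projected onto them and mapped back to the original feature space; a minimal sketch using a fresh 3-component PCA (an illustration, not part of the original flow):

# reconstruct the scaled data from only 3 components and measure the error
pca3 = PCA(n_components=3, random_state=1)
reduced = pca3.fit_transform(data_scaled)
reconstructed = pca3.inverse_transform(reduced)
mse = ((data_scaled.values - reconstructed) ** 2).mean()
print("Mean squared reconstruction error:", round(mse, 4))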
plt.figure(figsize = (7,7))
sns.scatterplot(x=data_pca1[0],y=data_pca1[1])
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
Let's add hue to the scatter plot to see whether the components separate the cars by number of cylinders.
df_concat = pd.concat([data_pca1, data], axis=1)
plt.figure(figsize = (7,7))
# scatter plot of the first two principal components (columns 0 and 1 of df_concat), colored by number of cylinders
sns.scatterplot(x=0, y=1, data=df_concat, hue='cylinders')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
Observations:

- The cars separate along PC1 by number of cylinders: 8-cylinder cars sit at high PC1 values and 4-cylinder cars at low values, consistent with PC1 being a size/power dimension.
tsne = TSNE(n_components=2, random_state=1)
data_tsne = tsne.fit_transform(data_scaled)  # fit and transform t-SNE on the scaled data
data_tsne.shape
(398, 2)
data_tsne = pd.DataFrame(data = data_tsne, columns = ['Component 1', 'Component 2'])
data_tsne.head()
| | Component 1 | Component 2 |
|---|---|---|
| 0 | -38.088413 | -15.912958 |
| 1 | -37.404369 | -17.995850 |
| 2 | -38.050472 | -17.063194 |
| 3 | -37.718334 | -16.476006 |
| 4 | -38.404663 | -16.763493 |
sns.scatterplot(x=data_tsne.iloc[:, 0], y=data_tsne.iloc[:, 1])
plt.show()
# Let's see a scatter plot of the t-SNE components, colored by number of cylinders
sns.scatterplot(x=data_tsne.iloc[:, 0], y=data_tsne.iloc[:, 1], hue=data.cylinders)
plt.show()
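t-SNE output depends on the perplexity hyperparameter (scikit-learn's default is 30), so it is worth checking that the cluster structure is stable across settings; a minimal sketch:

# re-run the embedding at a few perplexity values and compare the pictures
for perp in [5, 30, 50]:
    emb = TSNE(n_components=2, perplexity=perp, random_state=1).fit_transform(data_scaled)
    sns.scatterplot(x=emb[:, 0], y=emb[:, 1], hue=data.cylinders)
    plt.title(f"perplexity = {perp}")
    plt.show()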
Observations:

- The t-SNE embedding separates the cars into a few distinct clusters, and the clusters largely correspond to the number of cylinders.
# Let's assign each point to one of 3 groups based on its t-SNE coordinates
def grouping(x):
    first_component = x['Component 1']
    second_component = x['Component 2']
    if (first_component > 0) and (second_component > 0):
        return 'group_1'
    if (first_component > -20) and (first_component < 5):
        return 'group_2'
    else:
        return 'group_3'

data_tsne['groups'] = data_tsne.apply(grouping, axis=1)
sns.scatterplot(x=data_tsne.iloc[:, 0], y=data_tsne.iloc[:, 1], hue=data_tsne['groups'])
plt.show()
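A quick count of how many cars landed in each group:

# group sizes after the manual assignment
print(data_tsne['groups'].value_counts())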
data['groups'] = data_tsne['groups']
all_col = data.columns.tolist()

plt.figure(figsize=(20, 20))
for i, variable in enumerate(all_col):
    if i == 7:  # stop before the 'groups' column itself
        break
    plt.subplot(4, 2, i + 1)
    # boxplot with groups on the x-axis and the variable on the y-axis
    sns.boxplot(x=data['groups'], y=data[variable])
    plt.title(variable)
plt.tight_layout()
plt.show()
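As a numeric complement to the boxplots, group-wise means make the differences between the three groups explicit; a minimal sketch:

# compare the average of every numeric feature across the three groups
print(data.groupby('groups').mean(numeric_only=True))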
Observations: