Linear Regression: Boston House Price Prediction


Problem Statement


The problem at hand is to predict the housing prices of a town or suburb based on the features of the locality provided to us. In the process, we need to identify the most important features in the dataset, employ data preprocessing techniques, and build a linear regression model that predicts the prices for us.


Data Information


Each record in the database describes a Boston suburb or town. The data was drawn from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970. Detailed attribute information can be found below.

Attribute Information (in order):

  1. CRIM: per capita crime rate by town
  2. ZN: proportion of residential land zoned for lots over 25,000 sq. ft.
  3. INDUS: proportion of non-retail business acres per town
  4. CHAS: Charles River dummy variable (1 if tract bounds river; 0 otherwise)
  5. NOX: nitric oxides concentration (parts per 10 million)
  6. RM: average number of rooms per dwelling
  7. AGE: proportion of owner-occupied units built prior to 1940
  8. DIS: weighted distances to five Boston employment centres
  9. RAD: index of accessibility to radial highways
  10. TAX: full-value property-tax rate per $10,000
  11. PTRATIO: pupil-teacher ratio by town
  12. B: 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town
  13. LSTAT: % lower status of the population
  14. MEDV: median value of owner-occupied homes in $1000s

Let us start by importing the required libraries
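A minimal set of imports that covers the rest of this notebook might look like this (assuming pandas, seaborn, statsmodels, and scikit-learn are available):

    # Data handling
    import numpy as np
    import pandas as pd

    # Visualization
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Statistical modeling
    import statsmodels.api as sm
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # Model building and evaluation
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score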

Read the dataset
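For example, assuming the data is stored in a CSV file named Boston.csv (the file name is an assumption; adjust it to your copy of the data):

    # Load the data and preview the first few rows
    df = pd.read_csv("Boston.csv")
    df.head()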

Observations

Get information about the dataset using the info() method
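The info() method reports the number of rows, each column's dtype, and the count of non-null values, which makes missing values easy to spot:

    df.info()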

Observations

Let's now check the summary statistics of this dataset

Step 1: Find the summary statistics and write observations
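One way to get the summary statistics, transposed so that each feature is a row:

    # count, mean, std, min, quartiles, and max for every numeric column
    df.describe().T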

Observations:

Before performing the modeling, it is important to check the univariate distribution of the variables.

Univariate Analysis

Check the distribution of the variables
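A quick sketch that draws a histogram for every numeric column:

    df.hist(figsize=(14, 12), bins=30)
    plt.tight_layout()
    plt.show()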

Observations

As the dependent variable is slightly skewed, we will apply a log transformation on the 'MEDV' column and check the distribution of the transformed column.
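A sketch of the transformation, storing the result in a new column (the column name MEDV_log is a choice, not part of the original data):

    # Log-transform the target to reduce right skew
    df["MEDV_log"] = np.log(df["MEDV"])

    sns.histplot(df["MEDV_log"], kde=True)
    plt.show()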

Observations

Before creating the linear regression model, it is important to check the bivariate relationships between the variables. Let's check these using a heatmap and scatterplots.

Bivariate Analysis

Let's check the correlation using the heatmap

Step 2: Plot the correlation heatmap and write observations
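A possible sketch:

    plt.figure(figsize=(12, 8))
    sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm")
    plt.show()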

Observations:

Now, we will visualize the relationship between the pairs of features having significant correlations.

Visualizing the relationships between the features having significant correlations (absolute correlation > 0.7)

Step 3: Plot scatterplots for the highly correlated pairs and write observations
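A sketch of the scatterplots; the pairs listed below are illustrative guesses, so replace them with whichever pairs cross the 0.7 threshold in your heatmap:

    # Candidate highly correlated pairs (assumed from the heatmap)
    pairs = [("RAD", "TAX"), ("NOX", "INDUS"), ("NOX", "DIS"),
             ("LSTAT", "MEDV"), ("RM", "MEDV")]

    for x_col, y_col in pairs:
        sns.scatterplot(data=df, x=x_col, y=y_col)
        plt.show()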

Observations:

Let's check the correlation after removing the outliers.
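One way to do this; treating the cluster of properties with TAX >= 600 as outliers is an assumption, so adjust the threshold to whatever the scatterplot suggests:

    # Recompute the TAX-RAD correlation without the outlying cluster
    no_outliers = df[df["TAX"] < 600]
    print(no_outliers["TAX"].corr(no_outliers["RAD"]))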

So the high correlation between TAX and RAD is driven by the outliers; the tax rate for those properties might be higher for some other reason.

Observations:

We have seen that the variables LSTAT and RM have a linear relationship with the dependent variable MEDV. Also, there are significant relationships among a few independent variables, which is not desirable for a linear regression model. Let's first split the dataset.

Split the dataset

Let's split the data into dependent and independent variables, and then split those into train and test sets in a 70:30 ratio.
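A sketch of the split (random_state=1 is an arbitrary choice for reproducibility):

    # Independent variables: everything except the target and its log transform
    X = df.drop(columns=["MEDV", "MEDV_log"])
    Y = df["MEDV_log"]

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=1)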

Next, we will check the multicollinearity in the train dataset.

Check for Multicollinearity

We will use the Variance Inflation Factor (VIF) to check whether there is multicollinearity in the data.

Features having a VIF score > 5 will be dropped or treated until all remaining features have a VIF score < 5.
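A sketch using the variance_inflation_factor function from statsmodels; the compute_vif helper below is introduced here for convenience and reused in later steps:

    def compute_vif(predictors):
        """Return a Series with one VIF score per column."""
        exog = sm.add_constant(predictors)
        # Start at index 1 so the intercept itself is not scored
        return pd.Series(
            [variance_inflation_factor(exog.values, i) for i in range(1, exog.shape[1])],
            index=predictors.columns,
        )

    print(compute_vif(x_train))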

Observations:

Step 4: Drop the column 'TAX' from the training data and check whether the multicollinearity is removed.
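Reusing the compute_vif helper defined above:

    x_train2 = x_train.drop(columns=["TAX"])
    print(compute_vif(x_train2))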

Now that the VIF is less than 5 for all the independent variables, we can assume that the multicollinearity between the variables has been removed, and we will create the linear regression model.

Step 5: Write the code to create the linear regression model and print the model summary.
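A sketch using statsmodels OLS (an intercept must be added explicitly):

    # Add the intercept term and fit ordinary least squares
    x_train2_const = sm.add_constant(x_train2)
    model1 = sm.OLS(y_train, x_train2_const).fit()
    print(model1.summary())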

Observations:

Step 6: Drop insignificant variables from the above model and create the regression model again.

Examining the significance of the model

It is not enough to fit a multiple regression model to the data; we must also check whether all the regression coefficients are significant. Significance here means whether the population regression parameters are significantly different from zero.

From the above summary, it may be noted that the regression coefficients corresponding to ZN, AGE, and INDUS are not statistically significant at level α = 0.05. In other words, the regression coefficients corresponding to these three features are not significantly different from 0 in the population. Hence, we will eliminate the three features and create a new model.
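A sketch of the refit:

    # Drop the insignificant features and refit
    x_train3 = x_train2.drop(columns=["ZN", "AGE", "INDUS"])
    x_train3_const = sm.add_constant(x_train3)
    model2 = sm.OLS(y_train, x_train3_const).fit()
    print(model2.summary())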

Observations:

Now, we will check the linear regression assumptions.

Check the below linear regression assumptions

  1. Mean of residuals should be 0
  2. No Heteroscedasticity
  3. Linearity of variables
  4. Normality of error terms

Step 7: Check the above linear regression assumptions and provide insights.

Check for mean residuals
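For example:

    residuals = model2.resid
    print("Mean of residuals:", residuals.mean())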

Observations:

The mean of the residuals is very close to zero.

Check for homoscedasticity
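The Goldfeld-Quandt test is one common choice here; a sketch using statsmodels, where the null hypothesis is that the residuals are homoscedastic:

    import statsmodels.stats.api as sms

    f_stat, p_value, _ = sms.het_goldfeldquandt(residuals, x_train3_const)
    print("F statistic:", f_stat, " p-value:", p_value)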

Observations:

The p-value is > 0.05, so we fail to reject the null hypothesis and can assume that the residuals are homoscedastic.

Linearity of variables

This assumption states that the predictor variables must have a linear relationship with the dependent variable.

To test the assumption, we'll plot the residuals against the fitted values and ensure that the residuals do not form a strong pattern. They should be randomly and uniformly scattered around the horizontal axis.
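A sketch of the residuals-vs-fitted plot:

    fitted = model2.fittedvalues

    plt.scatter(fitted, residuals)
    plt.axhline(y=0, color="red", linestyle="--")
    plt.xlabel("Fitted values")
    plt.ylabel("Residuals")
    plt.show()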

Observations:

Normality of error terms

The residuals should be normally distributed.
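A histogram and a Q-Q plot are the usual visual checks:

    import scipy.stats as stats

    # Histogram of residuals
    sns.histplot(residuals, kde=True)
    plt.show()

    # Q-Q plot against the normal distribution
    stats.probplot(residuals, dist="norm", plot=plt)
    plt.show()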

Observations:

Check the performance of the model on the train and test datasets

Step 8: Compare the model performance on the train and test datasets
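A sketch comparing RMSE, MAE, and R-squared on both sets; note that the test predictors must go through the same column selection as the training set, and that the metrics below are on the log scale of the target:

    # Align the test set with the final training columns and add the intercept
    x_test3_const = sm.add_constant(x_test[x_train3.columns])

    def performance(model, predictors, target):
        preds = model.predict(predictors)
        return pd.Series({
            "RMSE": np.sqrt(mean_squared_error(target, preds)),
            "MAE": mean_absolute_error(target, preds),
            "R-squared": r2_score(target, preds),
        })

    print("Train:", performance(model2, x_train3_const, y_train), sep="\n")
    print("Test:", performance(model2, x_test3_const, y_test), sep="\n")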

Observations:

Apply cross validation to improve the model and evaluate it using different evaluation metrics

Step 9: Apply the cross validation technique to improve the model and evaluate it using different evaluation metrics.
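A sketch using 10-fold cross validation on the training data with scikit-learn's LinearRegression (the fold count is a choice):

    from sklearn.linear_model import LinearRegression

    lin_reg = LinearRegression()

    cv_r2 = cross_val_score(lin_reg, x_train3, y_train, cv=10, scoring="r2")
    cv_mse = -cross_val_score(lin_reg, x_train3, y_train, cv=10, scoring="neg_mean_squared_error")

    print("Mean CV R-squared:", cv_r2.mean())
    print("Mean CV MSE:", cv_mse.mean())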

Observations

We may want to iterate over the model-building process with new features or better feature engineering to increase the R-squared and decrease the MSE on cross validation.

Step 10: Get the model coefficients in a pandas DataFrame, with a column 'Feature' containing all the features and a column 'Coefs' containing the corresponding coefficients.
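For example, pulling the fitted parameters out of the statsmodels results object (the index includes the intercept as 'const'):

    coef_df = pd.DataFrame({
        "Feature": model2.params.index,
        "Coefs": model2.params.values,
    })
    print(coef_df)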

Step 11: Write the conclusions and business recommendations derived from the model.

Write Conclusions here

Write Recommendations here