Multiple Linear Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [3]:
dataset = pd.read_csv("50_Startups.csv")
dataset.head(2)
Out[3]:
   R&D Spend  Administration  Marketing Spend       State     Profit
0   165349.2       136897.80        471784.10    New York  192261.83
1   162597.7       151377.59        443898.53  California  191792.06
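The dataset has 50 rows: R&D Spend, Administration, and Marketing Spend as numeric features, a categorical State column, and Profit as the target.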
In [4]:
x = dataset.iloc[:,:-1]   # features: every column except Profit
y = dataset.iloc[:,-1]    # target: Profit
In [5]:
x = pd.get_dummies(x)   # one-hot encode the categorical State column
x = np.array(x)
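Note that get_dummies with defaults keeps all three State indicator columns; together with an intercept they are perfectly collinear (the dummy variable trap), which is what the condition-number warnings in the OLS summaries below are flagging. A minimal alternative sketch, assuming you want to avoid the trap (x_alt is a hypothetical name, not used in the rest of this notebook):

# drop_first=True keeps k-1 dummies per category, so the design
# matrix stays full rank once an intercept is added
x_alt = pd.get_dummies(dataset.iloc[:, :-1], drop_first=True)
x_alt = np.array(x_alt)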
In [6]:
from sklearn.model_selection import train_test_split   # sklearn.cross_validation was removed; use model_selection
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3)
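Because no random_state is passed, the split is reshuffled on every run, so the exact numbers below will not reproduce. A sketch with a fixed seed (the value 0 is arbitrary):

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)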
In [7]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
In [8]:
reg.fit(x_train,y_train)
Out[8]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [9]:
pred = reg.predict(x_test)
In [10]:
pred
Out[10]:
array([ 95426.94607409,  92169.5501553 , 129325.54263545, 172452.82283584,
       115181.28026676,  99704.46442206, 172964.11561363, 112836.41130865,
        59569.30344372,  98839.8469964 , 135619.87141866, 125652.04064963,
       109452.18137443,  52169.49197756, 128734.03912106])
In [11]:
y_test
Out[11]:
31     97483.56
37     89949.14
18    124266.90
3     182901.99
21    111313.02
30     99937.59
4     166187.94
19    122776.86
43     69758.98
28    103282.38
11    144259.40
13    134307.35
26    105733.54
48     35673.41
12    141585.52
Name: Profit, dtype: float64
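The predictions track the held-out profits fairly closely. To quantify the fit rather than eyeball it, a minimal sketch using r2_score from scikit-learn (output omitted since it depends on the split):

from sklearn.metrics import r2_score

# side-by-side comparison plus test-set R^2
comparison = pd.DataFrame({"actual": y_test, "predicted": pred})
print(comparison)
print("test R^2:", r2_score(y_test, pred))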
In [12]:
# Backward elimination: statsmodels OLS does not add an intercept
# automatically, so prepend an explicit column of ones
x = np.append(arr=np.ones((50, 1)).astype(int), values=x, axis=1)
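statsmodels ships a helper that does the same thing; an equivalent sketch (not run here, since the ones column was already appended above):

import statsmodels.api as sm
x = sm.add_constant(x, prepend=True)   # same effect as the np.append call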
In [13]:
import statsmodels.api as sm   # OLS with endog/exog arrays lives in statsmodels.api, not formula.api
In [14]:
x_opt = x[:,[0,1,2,3,4,5,6]]   # start with every column: const, R&D, Admin, Marketing, 3 state dummies
regressor = sm.OLS(endog=y,exog=x_opt).fit()
regressor.summary()
Out[14]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                 Profit   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     169.9
Date:                Fri, 05 Apr 2019   Prob (F-statistic):           1.34e-27
Time:                        22:35:56   Log-Likelihood:                -525.38
No. Observations:                  50   AIC:                             1063.
Df Residuals:                      44   BIC:                             1074.
Df Model:                           5
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.763e+04   5073.636      7.417      0.000    2.74e+04    4.79e+04
x1             0.8060      0.046     17.369      0.000       0.712       0.900
x2            -0.0270      0.052     -0.517      0.608      -0.132       0.078
x3             0.0270      0.017      1.574      0.123      -0.008       0.062
x4          1.249e+04   2449.797      5.099      0.000    7554.868    1.74e+04
x5          1.269e+04   2726.700      4.654      0.000    7195.596    1.82e+04
x6          1.245e+04   2486.364      5.007      0.000    7439.285    1.75e+04
==============================================================================
Omnibus:                       14.782   Durbin-Watson:                   1.283
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               21.266
Skew:                          -0.948   Prob(JB):                     2.41e-05
Kurtosis:                       5.572   Cond. No.                     2.98e+21
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 4.37e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
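With a significance level of 0.05, x2 (Administration, column index 2) has the highest p-value at 0.608, so backward elimination drops it first; the next fit uses the remaining column indices [0, 1, 3, 4, 5, 6].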
In [15]:
x_opt = x[:,[0,1,3,4,5,6]]
regressor = sm.OLS(endog=y,exog=x_opt).fit()
regressor.summary()
Out[15]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                 Profit   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                     215.8
Date:                Fri, 05 Apr 2019   Prob (F-statistic):           9.72e-29
Time:                        22:35:59   Log-Likelihood:                -525.53
No. Observations:                  50   AIC:                             1061.
Df Residuals:                      45   BIC:                             1071.
Df Model:                           4
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.525e+04   2100.376     16.782      0.000     3.1e+04    3.95e+04
x1             0.7967      0.042     18.771      0.000       0.711       0.882
x2             0.0298      0.016      1.842      0.072      -0.003       0.062
x3          1.171e+04   1910.312      6.130      0.000    7861.854    1.56e+04
x4          1.185e+04   2170.903      5.459      0.000    7477.785    1.62e+04
x5          1.169e+04   1988.428      5.879      0.000    7684.996    1.57e+04
==============================================================================
Omnibus:                       14.640   Durbin-Watson:                   1.257
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               21.037
Skew:                          -0.938   Prob(JB):                     2.70e-05
Kurtosis:                       5.565   Cond. No.                     2.14e+21
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 7.18e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
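Now x2 (Marketing Spend, original column index 3) has the highest p-value at 0.072, still above 0.05, so it is eliminated next, leaving indices [0, 1, 4, 5, 6].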
In [16]:
x_opt = x[:,[0,1,4,5,6]]
regressor = sm.OLS(endog=y,exog=x_opt).fit()
regressor.summary()
Out[16]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                 Profit   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.943
Method:                 Least Squares   F-statistic:                     272.4
Date:                Fri, 05 Apr 2019   Prob (F-statistic):           2.76e-29
Time:                        22:36:01   Log-Likelihood:                -527.35
No. Observations:                  50   AIC:                             1063.
Df Residuals:                      46   BIC:                             1070.
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.686e+04   1959.786     18.806      0.000    3.29e+04    4.08e+04
x1             0.8530      0.030     28.226      0.000       0.792       0.914
x2          1.189e+04   1956.677      6.079      0.000    7955.697    1.58e+04
x3          1.306e+04   2122.665      6.152      0.000    8785.448    1.73e+04
x4           1.19e+04   2036.022      5.847      0.000    7805.580     1.6e+04
==============================================================================
Omnibus:                       13.418   Durbin-Watson:                   1.122
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               17.605
Skew:                          -0.907   Prob(JB):                     0.000150
Kurtosis:                       5.271   Cond. No.                     2.41e+21
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.45e-32. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
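Every remaining p-value is below 0.05, so elimination stops here: R&D Spend plus the three State dummies form the final model. (The condition-number warning persists because the three dummies sum to the constant column; dropping one dummy, as noted earlier, would remove it.) The manual steps above can also be folded into a loop. A minimal sketch, assuming the same x (with the ones column) and y and a 0.05 significance level; backward_elimination is a hypothetical helper, and this naive version can also drop the constant if it ever becomes insignificant:

import statsmodels.api as sm

def backward_elimination(x, y, sl=0.05):
    # repeatedly drop the column with the highest p-value until
    # every remaining column is significant at level sl
    cols = list(range(x.shape[1]))
    while cols:
        model = sm.OLS(endog=y, exog=x[:, cols]).fit()
        pvalues = np.asarray(model.pvalues)
        worst = int(pvalues.argmax())
        if pvalues[worst] <= sl:
            return cols, model
        del cols[worst]
    raise ValueError("no columns survived elimination")

cols, final_model = backward_elimination(x, y)
print(cols)              # surviving column indices, e.g. [0, 1, 4, 5, 6]
final_model.summary()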