import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Generate 'random' data
np.random.seed(0)
X = 2.5 * np.random.randn(100) + 1.5 # Array of 100 values with mean = 1.5, stddev = 2.5
res = 0.5 * np.random.randn(100) # Generate 100 residual terms
y = 2 + 0.3 * X + res # Actual values of Y
# Create pandas dataframe to store our X and y values
df = pd.DataFrame(
{'X': X,
'y': y}
)
# Show the first five rows of our dataframe
df.head()
|   | X | y |
|---|---|---|
| 0 | 5.910131 | 4.714615 |
| 1 | 2.500393 | 2.076238 |
| 2 | 3.946845 | 2.548811 |
| 3 | 7.102233 | 4.615368 |
| 4 | 6.168895 | 3.264107 |
# Calculate the mean of X and y
xmean = np.mean(X)
ymean = np.mean(y)
# Calculate the terms needed for the numerator and denominator of beta
df['xycov'] = (df['X'] - xmean) * (df['y'] - ymean)
df['xvar'] = (df['X'] - xmean)**2
# Calculate beta and alpha
beta = df['xycov'].sum() / df['xvar'].sum()
alpha = ymean - (beta * xmean)
print(f'alpha = {alpha}')
print(f'beta = {beta}')
alpha = 2.0031670124623426
beta = 0.3229396867092763
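As a sanity check (an addition, not in the original notebook), np.polyfit fits the same least-squares line and should recover the same coefficients:

# Degree-1 polyfit returns (slope, intercept) for the same least-squares fit
slope, intercept = np.polyfit(X, y, 1)
print(f'polyfit: alpha = {intercept}, beta = {slope}')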
ypred = alpha + beta * X
# Plot regression against actual data
plt.figure(figsize=(12, 6))
plt.plot(X, ypred) # regression line
plt.plot(X, y, 'ro') # scatter plot showing actual data
plt.title('Actual vs Predicted')
plt.xlabel('X')
plt.ylabel('y')
plt.show()
# IMPORT SOME NECESSARY LIBRARIES
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import statsmodels.formula.api as sm
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
os.getcwd()
'C:\\Users\\S-C-R-E-A-M\\Data Science\\Data Modeling\\Linear Regrssion'
# import dataset
df = pd.read_csv("Advertising.csv")
# UNDERSTANDING THE DATA-SET
df.head()
|   | Unnamed: 0 | TV | Radio | Newspaper | Sales |
|---|---|---|---|---|---|
| 0 | 1 | 230.1 | 37.8 | 69.2 | 22.1 |
| 1 | 2 | 44.5 | 39.3 | 45.1 | 10.4 |
| 2 | 3 | 17.2 | 45.9 | 69.3 | 9.3 |
| 3 | 4 | 151.5 | 41.3 | 58.5 | 18.5 |
| 4 | 5 | 180.8 | 10.8 | 58.4 | 12.9 |
df.columns
Index(['Unnamed: 0', 'TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')
df.shape
(200, 5)
df.describe().T
|   | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Unnamed: 0 | 200.0 | 100.5000 | 57.879185 | 1.0 | 50.750 | 100.50 | 150.250 | 200.0 |
| TV | 200.0 | 147.0425 | 85.854236 | 0.7 | 74.375 | 149.75 | 218.825 | 296.4 |
| Radio | 200.0 | 23.2640 | 14.846809 | 0.0 | 9.975 | 22.90 | 36.525 | 49.6 |
| Newspaper | 200.0 | 30.5540 | 21.778621 | 0.3 | 12.750 | 25.75 | 45.100 | 114.0 |
| Sales | 200.0 | 14.0225 | 5.217457 | 1.6 | 10.375 | 12.90 | 17.400 | 27.0 |
# Checking for missing values
df.isnull().values.any()
df.isnull().sum()
Unnamed: 0    0
TV            0
Radio         0
Newspaper     0
Sales         0
dtype: int64
df.head()
|   | Unnamed: 0 | TV | Radio | Newspaper | Sales |
|---|---|---|---|---|---|
| 0 | 1 | 230.1 | 37.8 | 69.2 | 22.1 |
| 1 | 2 | 44.5 | 39.3 | 45.1 | 10.4 |
| 2 | 3 | 17.2 | 45.9 | 69.3 | 9.3 |
| 3 | 4 | 151.5 | 41.3 | 58.5 | 18.5 |
| 4 | 5 | 180.8 | 10.8 | 58.4 | 12.9 |
# As seen ""Unnamed: 0" . it is unnecessary variable
drop_list = ["Unnamed: 0"]
df = df.drop(drop_list, axis=1)
df.columns
Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')
# Checking the main assumptions of linear regression: linearity, normality, and multicollinearity
# (1) Assumption: Linearity
sns.pairplot(df,x_vars=["TV","Radio","Newspaper"],y_vars= "Sales",kind="reg")
<seaborn.axisgrid.PairGrid at 0x2b1228b7190>
df.hist(bins=20)
array([[<AxesSubplot:title={'center':'TV'}>,
<AxesSubplot:title={'center':'Radio'}>],
[<AxesSubplot:title={'center':'Newspaper'}>,
<AxesSubplot:title={'center':'Sales'}>]], dtype=object)
sns.lmplot(data=df, x='TV', y='Sales')
sns.lmplot(data=df, x='Radio', y='Sales')
sns.lmplot(data=df, x='Newspaper', y='Sales')
<seaborn.axisgrid.FacetGrid at 0x2b122ba5100>
# Assumption (Normality)
sns.histplot(df.Sales, bins=10, color="blue", kde=True, stat="density")  # distplot is deprecated in recent seaborn
<AxesSubplot:xlabel='Sales', ylabel='Density'>
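Strictly, the normality assumption concerns the residuals rather than the raw target, but a quick numeric complement to the histogram is the Shapiro-Wilk test (an addition; scipy is assumed available):

# Shapiro-Wilk normality test; a small p-value suggests non-normality
from scipy import stats
stat, p = stats.shapiro(df["Sales"])
print(f"W = {stat:.4f}, p = {p:.4f}")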
# Assumption (Multicollinearity)
num_cols = df[["TV","Radio","Newspaper"]]
a = df.corr()['Sales'].sort_values(ascending=False)
print(a)
Sales        1.000000
TV           0.782224
Radio        0.576223
Newspaper    0.228299
Name: Sales, dtype: float64
# Correlation of variables with Heatmap
corrmat = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat,vmin = 0,vmax=1,square=True,cmap="YlGnBu",ax=ax)
plt.show()
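Pairwise correlation only compares two variables at a time; a standard numeric check for multicollinearity is the Variance Inflation Factor, sketched here with statsmodels (an addition to the original checks):

# VIF for each predictor; rule of thumb: VIF > 5-10 signals problematic collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

X_vif = add_constant(df[["TV", "Radio", "Newspaper"]])
vif = pd.Series(
    [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
    index=X_vif.columns,
)
print(vif.drop("const"))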
X = df.drop('Sales', axis=1)
y = df[["Sales"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=46)
models = [('LinearRegression', LinearRegression())]
lin_model = sm.ols(formula="Sales ~ TV + Radio + Newspaper",data=df).fit()
print(lin_model.params,"\n")
print(lin_model.summary())
Intercept 2.938889
TV 0.045765
Radio 0.188530
Newspaper -0.001037
dtype: float64
OLS Regression Results
==============================================================================
Dep. Variable: Sales R-squared: 0.897
Model: OLS Adj. R-squared: 0.896
Method: Least Squares F-statistic: 570.3
Date: Fri, 02 Jul 2021 Prob (F-statistic): 1.58e-96
Time: 17:18:31 Log-Likelihood: -386.18
No. Observations: 200 AIC: 780.4
Df Residuals: 196 BIC: 793.6
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 2.9389 0.312 9.422 0.000 2.324 3.554
TV 0.0458 0.001 32.809 0.000 0.043 0.049
Radio 0.1885 0.009 21.893 0.000 0.172 0.206
Newspaper -0.0010 0.006 -0.177 0.860 -0.013 0.011
==============================================================================
Omnibus: 60.414 Durbin-Watson: 2.084
Prob(Omnibus): 0.000 Jarque-Bera (JB): 151.241
Skew: -1.327 Prob(JB): 1.44e-33
Kurtosis: 6.332 Cond. No. 454.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
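Newspaper's coefficient is statistically insignificant (p = 0.860), so a natural follow-up (an addition, not in the original run) is to refit without it and compare:

# Refit without the insignificant Newspaper term
lin_model2 = sm.ols(formula="Sales ~ TV + Radio", data=df).fit()
print(lin_model2.rsquared_adj)  # adjusted R-squared barely moves without Newspaper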
results = []
names = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    result = np.sqrt(mean_squared_error(y_test, y_pred))
    results.append(result)
    names.append(name)
    msg = "%s: %f" % (name, result)
    print(msg)
LinearRegression: 1.700510
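cross_val_score was imported earlier but never used; as a sketch (not part of the original run), here is a 5-fold cross-validated RMSE for the same model:

# 5-fold cross-validated RMSE on the full Advertising data
cv_scores = cross_val_score(LinearRegression(), X, np.ravel(y), cv=5,
                            scoring="neg_mean_squared_error")
print(np.sqrt(-cv_scores).mean())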
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../Linear Regrssion"))
# Any results you write to the current directory are saved as output.
['.ipynb_checkpoints', 'Advertising.csv', 'linear_regrssion.gif', 'linear_regrssion.ipynb', 'test.ipynb', 'USA_Housing.csv']
data =pd.read_csv('USA_Housing.csv')
data.describe()
|   | Avg. Area Income | Avg. Area House Age | Avg. Area Number of Rooms | Avg. Area Number of Bedrooms | Area Population | Price |
|---|---|---|---|---|---|---|
| count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5.000000e+03 |
| mean | 68583.108984 | 5.977222 | 6.987792 | 3.981330 | 36163.516039 | 1.232073e+06 |
| std | 10657.991214 | 0.991456 | 1.005833 | 1.234137 | 9925.650114 | 3.531176e+05 |
| min | 17796.631190 | 2.644304 | 3.236194 | 2.000000 | 172.610686 | 1.593866e+04 |
| 25% | 61480.562388 | 5.322283 | 6.299250 | 3.140000 | 29403.928702 | 9.975771e+05 |
| 50% | 68804.286404 | 5.970429 | 7.002902 | 4.050000 | 36199.406689 | 1.232669e+06 |
| 75% | 75783.338666 | 6.650808 | 7.665871 | 4.490000 | 42861.290769 | 1.471210e+06 |
| max | 107701.748378 | 9.519088 | 10.759588 | 6.500000 | 69621.713378 | 2.469066e+06 |
feature=data.drop(['Price','Address'],axis=1).values
target =data['Price'].values
from sklearn.model_selection import train_test_split
train,test,train_label,test_label=train_test_split(feature,target,test_size=0.33,random_state=222)
from sklearn.linear_model import LinearRegression
reg=LinearRegression(fit_intercept=True)
model = reg.fit(train,train_label)
predict = model.predict(test)
from sklearn.metrics import r2_score
print(r2_score(test_label,predict))
0.9151441604802264
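To make the fit interpretable, each learned coefficient can be paired with its feature name (an addition; it reuses the columns dropped above):

# Pair learned coefficients with feature names
feature_names = data.drop(['Price', 'Address'], axis=1).columns
for name, coef in zip(feature_names, model.coef_):
    print(f"{name}: {coef:.4f}")
print(f"intercept: {model.intercept_:.4f}")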
Linear regression is a common machine learning technique that predicts a real-valued output as a weighted linear combination of one or more input values.
The "learning" part of linear regression is figuring out the weights w1, w2, ..., wn and the bias b that lead to good predictions. This is done by looking at many examples one at a time (or in batches) and adjusting the weights slightly each time toward better predictions, using an optimization technique called gradient descent.
from IPython.display import IFrame
IFrame(src='linear_regrssion.gif', width="100%", height="300px")
#import dependencies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
x = np.random.rand(500)
x[:5]
array([0.96193638, 0.29214753, 0.24082878, 0.10029394, 0.01642963])
noise = np.random.randn(500)/4
noise[:5]
array([-0.20140663, -0.27957798, -0.0327635 , 0.28326997, -0.48795103])
m = 2 # slope or weight
c = 3 # intercept or bias
# Equation of the line y = mx + c
y = x * m + c + noise
y[:5]
array([4.72246613, 3.30471707, 3.44889406, 3.48385785, 2.54490823])
# Visualise our data
plt.scatter(x,y)
plt.title("Data")
plt.xlabel('x')
plt.ylabel('y')
plt.show()
# Convert inputs and targets to tensors
inputs = torch.from_numpy(x)
targets = torch.from_numpy(y)
print(inputs[:5])
print(targets[:5])
tensor([0.9619, 0.2921, 0.2408, 0.1003, 0.0164], dtype=torch.float64)
tensor([4.7225, 3.3047, 3.4489, 3.4839, 2.5449], dtype=torch.float64)
# Cast the tensors to float32, since the model's parameters are float32
inputs = inputs.type(torch.float32)
targets = targets.type(torch.float32)
# Reshape into n rows and 1 column; the model expects a 2-D input
inputs = inputs.reshape(-1,1)
targets = targets.reshape(-1,1)
# Ground-truth targets without noise: x * m + c
actual_targets = torch.from_numpy(x * m + c)
actual_targets = actual_targets.type(torch.float32)
actual_targets = actual_targets.reshape(-1,1)
print(actual_targets[:5])
tensor([[4.9239],
[3.5843],
[3.4817],
[3.2006],
[3.0329]])
# Define linear regression model
model = nn.Linear(1, 1) # (no. of input features, no. of output features)
print(model.weight) # Weight of the model
print(model.bias) # Bias of the model
Parameter containing:
tensor([[0.3650]], requires_grad=True)
Parameter containing:
tensor([0.0329], requires_grad=True)
# Parameters
list(model.parameters())
[Parameter containing:
 tensor([[0.3650]], requires_grad=True),
 Parameter containing:
 tensor([0.0329], requires_grad=True)]
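For intuition, nn.Linear(1, 1) computes x @ W.T + b; this can be verified by hand (an illustrative addition, not part of the original notebook):

# Verify nn.Linear's forward computation manually
manual = inputs @ model.weight.t() + model.bias
print(torch.allclose(manual, model(inputs)))  # True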
# Define loss function
import torch.nn.functional as F
loss_fn = F.mse_loss
loss = loss_fn(model(inputs), targets)
print(loss)
tensor(14.4207, grad_fn=<MseLossBackward>)
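Under its default 'mean' reduction, F.mse_loss is just the average of the squared differences; a hand-rolled equivalent (an addition for illustration):

# Manual mean-squared-error, matching F.mse_loss above
manual_loss = ((model(inputs) - targets) ** 2).mean()
print(manual_loss)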
# Define optimizer
opt = torch.optim.SGD(model.parameters(), lr=1e-5) # lr --> learning rate
from time import sleep
# Utility function to train the model
def fit(num_epochs, model, loss_fn, opt, inputs, targets):
    # Repeat for the given number of epochs
    for epoch in range(num_epochs):
        # Train on one (x, y) pair at a time (batch size 1)
        for xb, yb in zip(inputs, targets):
            # 1. Generate predictions
            pred = model(xb)
            # 2. Calculate loss
            loss = loss_fn(pred, yb)
            # 3. Compute gradients
            loss.backward()
            # 4. Update parameters using gradients
            opt.step()
            # 5. Reset the gradients to zero
            opt.zero_grad()
        # Print and plot the progress every 10 epochs
        if (epoch + 1) % 10 == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))
            preds_ = model(inputs)
            plt.scatter(inputs, targets, label="Data")
            plt.plot(inputs, preds_.detach().numpy(), "r-", label="Prediction")
            plt.title('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))
            plt.legend()
            # plt.savefig("lr_images_final/{}.png".format(epoch + 1))
            plt.show()
            sleep(.2)
fit(310, model, loss_fn, opt,inputs, targets)
Epoch [10/310], Loss: 8.1252
Epoch [20/310], Loss: 6.4449
Epoch [30/310], Loss: 5.1247
Epoch [40/310], Loss: 4.0861
Epoch [50/310], Loss: 3.2679
Epoch [60/310], Loss: 2.6222
Epoch [70/310], Loss: 2.1118
Epoch [80/310], Loss: 1.7075
Epoch [90/310], Loss: 1.3867
Epoch [100/310], Loss: 1.1316
Epoch [110/310], Loss: 0.9281
Epoch [120/310], Loss: 0.7654
Epoch [130/310], Loss: 0.6349
Epoch [140/310], Loss: 0.5300
Epoch [150/310], Loss: 0.4454
Epoch [160/310], Loss: 0.3769
Epoch [170/310], Loss: 0.3212
Epoch [180/310], Loss: 0.2758
Epoch [190/310], Loss: 0.2387
Epoch [200/310], Loss: 0.2082
Epoch [210/310], Loss: 0.1830
Epoch [220/310], Loss: 0.1621
Epoch [230/310], Loss: 0.1448
Epoch [240/310], Loss: 0.1303
Epoch [250/310], Loss: 0.1182
Epoch [260/310], Loss: 0.1080
Epoch [270/310], Loss: 0.0994
Epoch [280/310], Loss: 0.0921
Epoch [290/310], Loss: 0.0859
Epoch [300/310], Loss: 0.0806
Epoch [310/310], Loss: 0.0761
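Since the data were generated with slope m = 2 and bias c = 3, the learned parameters can be checked against the true values (an addition; the exact numbers depend on the run):

# Learned parameters vs. the generating values m = 2, c = 3
print(model.weight.item(), model.bias.item())  # should be close to 2 and 3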
preds = model(inputs)
final_loss = loss_fn(preds, targets)
plt.scatter(inputs, targets, label="Data")
plt.plot(inputs, preds.detach().numpy(), "r-", label="Prediction")
plt.title('Completed with Loss: {:.4f}'.format(final_loss.item()))
plt.legend()
plt.show()
from IPython.display import IFrame
IFrame(src='linear_regrssion.gif', width="100%", height="300px")