Create logistic_regression.py

import math
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# scaler that rescales each feature into the range (-1, 1)
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
df = pd.read_csv("data.csv", header=0)
# clean data: name the columns and strip the trailing ';' from the label
df.columns = ["grade1", "grade2", "label"]
# format the input data into two arrays: the independent variables (X) and the dependent variable (Y)
X = df[["grade1", "grade2"]]
X = np.array(X)
X = min_max_scaler.fit_transform(X)
Y = df["label"].map(lambda v: float(v.rstrip(';')))
Y = np.array(Y)
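# For reference, a minimal sketch of what MinMaxScaler(feature_range=(-1, 1)) computes per column:
# rescale each column to [0, 1] using its min/max, then stretch to [-1, 1].
# The name min_max_scale_manual is not part of the original script.
def min_max_scale_manual(A):
    A = np.asarray(A, dtype=float)
    std = (A - A.min(axis=0)) / (A.max(axis=0) - A.min(axis=0))
    return std * 2.0 - 1.0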
# if you want to create a new clean dataset
##X = pd.DataFrame.from_records(X,columns=['grade1','grade2'])
##X.insert(2,'label',Y)
##X.to_csv('data2.csv')
# create the training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)
# train the scikit-learn model
clf = LogisticRegression()
clf.fit(X_train, Y_train)
print('score Scikit learn: ', clf.score(X_test, Y_test))
# visualize the data (close the plot window to let the script continue)
pos = np.where(Y == 1)[0]
neg = np.where(Y == 0)[0]
plt.scatter(X[pos, 0], X[pos, 1], marker='o', c='b', label='Admitted')
plt.scatter(X[neg, 0], X[neg, 1], marker='x', c='r', label='Not Admitted')
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.legend()
plt.show()
##The sigmoid function squashes the hypothesis into a probability between 0 and 1,
##so the cost function can penalize the algorithm proportionally for worse estimations
def Sigmoid(z):
    G_of_Z = float(1.0 / float(1.0 + math.exp(-1.0 * z)))
    return G_of_Z
##The hypothesis is the linear combination of all the known factors x[i] and their current estimated coefficients theta[i],
##passed through the sigmoid. This hypothesis is used to calculate each instance of the cost function
def Hypothesis(theta, x):
    z = 0
    for i in range(len(theta)):
        z += x[i] * theta[i]
    return Sigmoid(z)
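# A vectorized equivalent (sketch) of the hypothesis above: the same linear combination
# expressed as a NumPy dot product followed by the sigmoid. The name hypothesis_vec is
# not part of the original script; it assumes theta and x are 1-D sequences of equal length.
def hypothesis_vec(theta, x):
    # np.dot(x, theta) == sum of x[i] * theta[i] over all i
    return 1.0 / (1.0 + np.exp(-np.dot(x, theta)))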
##For each member of the dataset, the result (Y) determines which variation of the cost function is used
##The Y = 0 cost function punishes high probability estimations, and the Y = 1 cost function punishes low ones
##The "punishment" makes the change in the gradient of ThetaCurrent - Average(CostFunction(Dataset)) greater
def Cost_Function(X, Y, theta, m):
    sumOfErrors = 0
    for i in range(m):
        xi = X[i]
        hi = Hypothesis(theta, xi)
        if Y[i] == 1:
            error = Y[i] * math.log(hi)
        elif Y[i] == 0:
            error = (1 - Y[i]) * math.log(1 - hi)
        sumOfErrors += error
    const = -1.0 / m
    J = const * sumOfErrors
    print('cost is ', J)
    return J
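# A vectorized equivalent (sketch) of the same cross-entropy cost,
# J = -(1/m) * sum(y*log(h) + (1-y)*log(1-h)). The name cost_vec is not part of the
# original script; it assumes X is an (m, n) array and Y, theta are 1-D arrays.
def cost_vec(X, Y, theta):
    h = 1.0 / (1.0 + np.exp(-X.dot(np.asarray(theta))))
    return -np.mean(Y * np.log(h) + (1 - Y) * np.log(1 - h))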
##This function creates the gradient component for each theta value
##Each component is the partial derivative of the cost with respect to theta[j],
##scaled by a "learning speed factor" alpha: alpha times the average of (h(x) - y) * x[j] over the dataset
def Cost_Function_Derivative(X, Y, theta, j, m, alpha):
    sumErrors = 0
    for i in range(m):
        xi = X[i]
        xij = xi[j]
        hi = Hypothesis(theta, xi)
        error = (hi - Y[i]) * xij
        sumErrors += error
    constant = float(alpha) / float(m)
    J = constant * sumErrors
    return J
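# A vectorized equivalent (sketch) of the single component above:
# alpha/m * sum((h(x_i) - y_i) * x_i[j]), written as a dot product with column j of X.
# The name cost_derivative_vec is not part of the original script.
def cost_derivative_vec(X, Y, theta, j, alpha):
    m = len(Y)
    h = 1.0 / (1.0 + np.exp(-X.dot(np.asarray(theta))))
    return (alpha / m) * np.dot(h - Y, X[:, j])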
##For each theta value, the corresponding partial derivative is computed
##The gradient, or vector from the current point in theta-space (each theta value is its own dimension) to a more accurate point,
##is the vector whose components are the partial derivatives for each theta value
def Gradient_Descent(X, Y, theta, m, alpha):
    new_theta = []
    for j in range(len(theta)):
        CFDerivative = Cost_Function_Derivative(X, Y, theta, j, m, alpha)
        new_theta_value = theta[j] - CFDerivative
        new_theta.append(new_theta_value)
    return new_theta
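# A vectorized equivalent (sketch) of one full gradient-descent step, covering every
# component at once: theta_new = theta - alpha/m * X^T (h - y). The name
# gradient_step_vec is not part of the original script; it assumes X is (m, n)
# and Y, theta are 1-D arrays.
def gradient_step_vec(X, Y, theta, alpha):
    m = len(Y)
    h = 1.0 / (1.0 + np.exp(-X.dot(np.asarray(theta))))
    return np.asarray(theta) - (alpha / m) * X.T.dot(h - Y)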
##The high-level function for the LR algorithm: for a number of steps (num_iters) it computes gradients that move
##the theta values (coefficients of the known factors) from the current estimation closer to their "optimum estimation",
##the set of values best representing the system in a linear combination model
def Logistic_Regression(X, Y, alpha, theta, num_iters):
    m = len(Y)
    for x in range(num_iters):
        new_theta = Gradient_Descent(X, Y, theta, m, alpha)
        theta = new_theta
        if x % 100 == 0:
            #the cost function is printed periodically so convergence can be tracked in the same form at each checkpoint
            Cost_Function(X, Y, theta, m)
            print('theta ', theta)
    Declare_Winner(theta)
##This method compares the accuracy of the model generated by the scikit library with the model generated by this implementation
def Declare_Winner(theta):
    score = 0
    #first the scikit LR model is scored: for each row of the test set its prediction is compared against the measured label,
    #and each correct prediction counts as a point for the scikit version of LR (clf.score does this internally)
    scikit_score = clf.score(X_test, Y_test)
    length = len(X_test)
    #the same process is repeated for the implementation in this module, and the two match rates are compared
    for i in range(length):
        prediction = round(Hypothesis(theta, X_test[i]))
        answer = Y_test[i]
        if prediction == answer:
            score += 1
    my_score = float(score) / float(length)
    if my_score > scikit_score:
        print('You won!')
    elif my_score == scikit_score:
        print("It's a tie!")
    else:
        print('Scikit won.. :(')
    print('Your score: ', my_score)
    print("Scikit's score: ", scikit_score)
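# A vectorized equivalent (sketch) of the manual scoring loop above: apply the sigmoid to
# X_test @ theta, threshold at 0.5, and take the mean match rate. The name accuracy_vec is
# not part of the original script (the loop above uses round(), which agrees except at exactly 0.5).
def accuracy_vec(theta, X_test, Y_test):
    probs = 1.0 / (1.0 + np.exp(-X_test.dot(np.asarray(theta))))
    return float(np.mean((probs >= 0.5) == Y_test))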
# These are the initial guesses for theta, as well as the learning rate and iteration count of the algorithm
# A learning rate that is too low will not close in on the most accurate values within a reasonable number of iterations
# An alpha that is too high might overshoot the accurate values or cause erratic guesses
# Each iteration increases model accuracy but with diminishing returns,
# and costs roughly O(n * |theta|) work per iteration, where n is the dataset length
initial_theta = [0,0]
alpha = 0.1
iterations = 1000
##Logistic_Regression(X,Y,alpha,initial_theta,iterations)