From 656232e1df6cba196b5b369c4a0eb493e9fd27ca Mon Sep 17 00:00:00 2001
From: jishu-yadav <54631311+jishu-yadav@users.noreply.github.com>
Date: Sun, 3 Oct 2021 01:20:51 +0530
Subject: [PATCH] Create logistic_regression.py

---
 .../4-Logistic/logistic_regression.py | 160 ++++++++++++++++++
 1 file changed, 160 insertions(+)
 create mode 100644 2-Regression/4-Logistic/logistic_regression.py

diff --git a/2-Regression/4-Logistic/logistic_regression.py b/2-Regression/4-Logistic/logistic_regression.py
new file mode 100644
index 00000000..070c4071
--- /dev/null
+++ b/2-Regression/4-Logistic/logistic_regression.py
@@ -0,0 +1,160 @@
+import math
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn import preprocessing
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+
+
+min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
+df = pd.read_csv("data.csv", header=0)
+
+# clean the data
+df.columns = ["grade1", "grade2", "label"]
+
+# format the input data into a feature matrix X and a label vector Y
+X = df[["grade1", "grade2"]]
+X = np.array(X)
+X = min_max_scaler.fit_transform(X)
+Y = df["label"].map(lambda x: float(x.rstrip(';')))
+Y = np.array(Y)
+
+# optionally write out a cleaned copy of the dataset
+##X = pd.DataFrame.from_records(X, columns=['grade1', 'grade2'])
+##X.insert(2, 'label', Y)
+##X.to_csv('data2.csv')
+
+# create training and testing sets
+X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)
+
+# train the scikit-learn model
+clf = LogisticRegression()
+clf.fit(X_train, Y_train)
+print('scikit-learn score: ', clf.score(X_test, Y_test))
+
+# visualize the data
+pos = np.where(Y == 1)
+neg = np.where(Y == 0)
+plt.scatter(X[pos, 0], X[pos, 1], marker='o', c='b', label='Admitted')
+plt.scatter(X[neg, 0], X[neg, 1], marker='x', c='r', label='Not Admitted')
+plt.xlabel('Exam 1 score')
+plt.ylabel('Exam 2 score')
+plt.legend()
+plt.show()
+
+##The sigmoid function squashes the linear combination into a probability between 0 and 1,
+##so the cost function penalizes the algorithm proportionally more for worse estimations
+def Sigmoid(z):
+    return float(1.0 / (1.0 + math.exp(-1.0 * z)))
+
+##The hypothesis is the sigmoid of the linear combination of all the known factors x[i]
+##and their current estimated coefficients theta[i]
+##This hypothesis is used to calculate each instance of the cost function
+def Hypothesis(theta, x):
+    z = 0
+    for i in range(len(theta)):
+        z += x[i] * theta[i]
+    return Sigmoid(z)
+
+##For each member of the dataset, the label (Y) determines which variation of the cost function is used
+##The Y = 0 term punishes high probability estimations, and the Y = 1 term punishes low ones
+##The "punishment" makes the change in the gradient of ThetaCurrent - Average(CostFunction(Dataset)) greater
+def Cost_Function(X, Y, theta, m):
+    sumOfErrors = 0
+    for i in range(m):
+        xi = X[i]
+        hi = Hypothesis(theta, xi)
+        if Y[i] == 1:
+            error = Y[i] * math.log(hi)
+        elif Y[i] == 0:
+            error = (1 - Y[i]) * math.log(1 - hi)
+        sumOfErrors += error
+    const = -1.0 / m
+    J = const * sumOfErrors
+    return J
+
+##This function creates the gradient component for each theta value:
+##the partial derivative of the cost with respect to theta[j], averaged over the dataset
+##and scaled by the learning rate alpha
+def Cost_Function_Derivative(X, Y, theta, j, m, alpha):
+    sumErrors = 0
+    for i in range(m):
+        xi = X[i]
+        xij = xi[j]
+        hi = Hypothesis(theta, xi)
+        error = (hi - Y[i]) * xij
+        sumErrors += error
+    constant = float(alpha) / float(m)
+    J = constant * sumErrors
+    return J
+
+##The gradient, or vector from the current point in theta-space (each theta value is its own dimension)
+##to a more accurate point, is the vector whose components are the partial derivatives for each theta value
+def Gradient_Descent(X, Y, theta, m, alpha):
+    new_theta = []
+    for j in range(len(theta)):
+        CFDerivative = Cost_Function_Derivative(X, Y, theta, j, m, alpha)
+        new_theta_value = theta[j] - CFDerivative
+        new_theta.append(new_theta_value)
+    return new_theta
+
+##The high-level function for the LR algorithm which, for a number of steps (num_iters), follows gradients that take
+##the theta values (coefficients of the known factors) from an estimation (theta) closer (new_theta) to the "optimum estimation",
+##the set of values that best represents the system in a linear combination model
+def Logistic_Regression(X, Y, alpha, theta, num_iters):
+    m = len(Y)
+    for x in range(num_iters):
+        theta = Gradient_Descent(X, Y, theta, m, alpha)
+        if x % 100 == 0:
+            # the cost function reports how well the current hypothesis fits the data at this gradient-step iteration
+            print('theta ', theta)
+            print('cost is ', Cost_Function(X, Y, theta, m))
+    Declare_Winner(theta)
+
+##This function compares the accuracy of the model produced by scikit-learn with the model produced by this implementation
+def Declare_Winner(theta):
+    score = 0
+    # scikit-learn's LR is scored on the test set; then this module's model predicts each test example,
+    # and every prediction that matches the true label counts as a point, so the two match rates can be compared
+    scikit_score = clf.score(X_test, Y_test)
+    length = len(X_test)
+    for i in range(length):
+        prediction = round(Hypothesis(theta, X_test[i]))
+        answer = Y_test[i]
+        if prediction == answer:
+            score += 1
+    my_score = float(score) / float(length)
+    if my_score > scikit_score:
+        print('You won!')
+    elif my_score == scikit_score:
+        print("It's a tie!")
+    else:
+        print('Scikit won.. :(')
+    print('Your score: ', my_score)
+    print("Scikit's score: ", scikit_score)
+
+# These are the initial guesses for theta as well as the learning rate of the algorithm
+# A learning rate that is too low will not close in on the most accurate values within a reasonable number of iterations
+# An alpha that is too high might overshoot the accurate values or cause erratic guesses
+# Each iteration increases model accuracy but with diminishing returns,
+# and costs roughly a constant times O(n)*|theta|, where n is the dataset length
+initial_theta = [0, 0]
+alpha = 0.1
+iterations = 1000
+##Logistic_Regression(X, Y, alpha, initial_theta, iterations)
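+
+
+# --- Optional vectorized cross-check (editor's sketch, not part of the lesson implementation) ---
+# A minimal NumPy sketch of the same batch gradient-descent update, assuming the X, Y,
+# alpha and iterations defined above; the function name is illustrative, and the call
+# below is left commented out so the script's behavior is unchanged.
+def Vectorized_Logistic_Regression(X, Y, alpha, num_iters):
+    theta = np.zeros(X.shape[1])
+    m = len(Y)
+    for _ in range(num_iters):
+        h = 1.0 / (1.0 + np.exp(-X.dot(theta)))  # sigmoid of the linear combination
+        gradient = X.T.dot(h - Y) / m            # average gradient over the dataset
+        theta = theta - alpha * gradient         # gradient-descent step
+    cost = -np.mean(Y * np.log(h) + (1 - Y) * np.log(1 - h))
+    print('vectorized cost is ', cost)
+    return theta
+
+##Vectorized_Logistic_Regression(X, Y, alpha, iterations)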