import math
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from numpy import loadtxt, where
from pylab import scatter, show, legend, xlabel, ylabel

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
df = pd.read_csv("data.csv", header=0)

# clean data
df.columns = ["grade1", "grade2", "label"]
# the label column is read as strings like "1;" -- strip the trailing ';' and cast to float
df["label"] = df["label"].map(lambda s: float(s.rstrip(';')))

# format the input data into two arrays: X holds the independent variables (the two grades,
# min-max scaled to [-1, 1]) and Y holds the dependent variable (the label)
X = df[["grade1", "grade2"]]
X = np.array(X)
X = min_max_scaler.fit_transform(X)
Y = np.array(df["label"])
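
# Quick illustration of the scaling step (a sketch with made-up numbers): MinMaxScaler with
# feature_range=(-1, 1) maps each column's minimum to -1 and its maximum to +1.
demo_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
print('scaling demo: ', demo_scaler.fit_transform(np.array([[40.0], [60.0], [80.0]])).ravel())
# prints approximately [-1.  0.  1.]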

# if you want to write out a new, cleaned dataset:
##X = pd.DataFrame.from_records(X, columns=['grade1', 'grade2'])
##X.insert(2, 'label', Y)
##X.to_csv('data2.csv')

# create the training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

# train the scikit-learn model
clf = LogisticRegression()
clf.fit(X_train, Y_train)
print('score Scikit learn: ', clf.score(X_test, Y_test))
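
# Optional sanity check: look at what scikit-learn actually fit; coef_ and intercept_
# are standard attributes of a fitted LogisticRegression.
print('scikit-learn coefficients: ', clf.coef_, ' intercept: ', clf.intercept_)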

# visualize the data (comment out show() below to skip the plot)
pos = where(Y == 1)
neg = where(Y == 0)
scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
xlabel('Exam 1 score')
ylabel('Exam 2 score')
legend(['Admitted', 'Not Admitted'])
show()

##The sigmoid maps the linear hypothesis to a probability in (0, 1), so the cost function
##can penalize the algorithm proportionally to how far off its estimate is
def Sigmoid(z):
    G_of_Z = float(1.0 / float((1.0 + math.exp(-1.0 * z))))
    return G_of_Z
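
# Quick worked example (illustrative only): the sigmoid is 0.5 at z = 0 and saturates
# towards 0 and 1 for large negative/positive z.
assert abs(Sigmoid(0) - 0.5) < 1e-9
assert Sigmoid(6) > 0.99 and Sigmoid(-6) < 0.01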

##The hypothesis is the sigmoid of the linear combination of the known factors x[i]
##and their current estimated coefficients theta[i]
##This hypothesis is used to calculate each instance of the cost function
def Hypothesis(theta, x):
    z = 0
    for i in range(len(theta)):
        z += x[i] * theta[i]
    return Sigmoid(z)
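
# Quick worked example (illustrative only): with theta = [1, 2] and x = [0.5, -0.25],
# z = 1*0.5 + 2*(-0.25) = 0, so the hypothesis is Sigmoid(0) = 0.5.
assert abs(Hypothesis([1, 2], [0.5, -0.25]) - 0.5) < 1e-9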

##For each member of the dataset, the result (Y) determines which variation of the cost function is used
##The Y = 0 branch punishes high probability estimations, and the Y = 1 branch punishes low ones
##The "punishment" makes the change in the gradient of ThetaCurrent - Average(CostFunction(Dataset)) greater
def Cost_Function(X, Y, theta, m):
    sumOfErrors = 0
    for i in range(m):
        xi = X[i]
        hi = Hypothesis(theta, xi)
        if Y[i] == 1:
            error = Y[i] * math.log(hi)
        elif Y[i] == 0:
            error = (1 - Y[i]) * math.log(1 - hi)
        sumOfErrors += error
    const = -1.0 / m
    J = const * sumOfErrors
    print('cost is ', J)
    return J
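
# Quick worked example (illustrative only): two toy points with theta = [0, 0] give h = 0.5
# for both, so the cost is -(log(0.5) + log(0.5)) / 2 = log(2) ~= 0.693, the usual starting
# cost of logistic regression before any training.
assert abs(Cost_Function([[1.0, 2.0], [-1.0, 0.5]], [1, 0], [0, 0], 2) - math.log(2)) < 1e-9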

##This function creates the gradient component for each theta value
##The gradient component is the partial derivative of the cost with respect to theta[j],
##scaled by a "learning speed factor" alpha and averaged over the whole dataset
def Cost_Function_Derivative(X, Y, theta, j, m, alpha):
    sumErrors = 0
    for i in range(m):
        xi = X[i]
        xij = xi[j]
        hi = Hypothesis(theta, X[i])
        error = (hi - Y[i]) * xij
        sumErrors += error
    constant = float(alpha) / float(m)
    J = constant * sumErrors
    return J
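
# Quick worked example (illustrative only): one data point x = [2, 0] with label 1 and
# theta = [0, 0] gives h = 0.5, so the j = 0 gradient component is
# alpha/m * (0.5 - 1) * 2 = 0.1 * -1.0 = -0.1.
assert abs(Cost_Function_Derivative([[2.0, 0.0]], [1], [0, 0], 0, 1, 0.1) - (-0.1)) < 1e-9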

##For each theta, the partial derivative is computed
##The gradient, or vector from the current point in theta-space (each theta value is its own dimension)
##to the more accurate point, is the vector whose components are the partial derivatives for each theta value
def Gradient_Descent(X, Y, theta, m, alpha):
    new_theta = []
    for j in range(len(theta)):
        CFDerivative = Cost_Function_Derivative(X, Y, theta, j, m, alpha)
        new_theta_value = theta[j] - CFDerivative
        new_theta.append(new_theta_value)
    return new_theta
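
# Quick usage sketch (illustrative only, with a tiny made-up dataset): one gradient-descent
# step starting from theta = [0, 0]; each component moves opposite to its gradient component.
toy_X = [[2.0, 0.0], [0.0, 1.0]]
toy_Y = [1, 0]
print('one toy gradient step: ', Gradient_Descent(toy_X, toy_Y, [0, 0], len(toy_Y), 0.1))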

##The high-level function for the LR algorithm which, for a number of steps (num_iters), finds gradients that take
##the theta values (coefficients of the known factors) from an estimate (new_theta) closer to their "optimum estimation",
##which is the set of values that best represents the system in a linear combination model
def Logistic_Regression(X, Y, alpha, theta, num_iters):
    m = len(Y)
    for x in range(num_iters):
        new_theta = Gradient_Descent(X, Y, theta, m, alpha)
        theta = new_theta
        if x % 100 == 0:
            #here the cost function is used to report the current hypothesis of the model in the same form at each gradient-step checkpoint
            Cost_Function(X, Y, theta, m)
            print('theta ', theta)
            print('cost is ', Cost_Function(X, Y, theta, m))
    Declare_Winner(theta)

##This method compares the accuracy of the model generated by scikit-learn with the model generated by this implementation
def Declare_Winner(theta):
    score = 0
    winner = ""
    #first the scikit LR model is scored: for each independent var in the test set its prediction is compared against the dependent var
    #if the prediction matches the value measured in the dataset it counts as a point for the scikit version of LR
    scikit_score = clf.score(X_test, Y_test)
    length = len(X_test)
    for i in range(length):
        prediction = round(Hypothesis(theta, X_test[i]))
        answer = Y_test[i]
        if prediction == answer:
            score += 1
    #the same process is repeated for the implementation in this module and the scores are compared to find the higher match rate
    my_score = float(score) / float(length)
    if my_score > scikit_score:
        print('You won!')
    elif my_score == scikit_score:
        print("It's a tie!")
    else:
        print('Scikit won.. :(')
    print('Your score: ', my_score)
    print('Scikit score: ', scikit_score)

# These are the initial guesses for theta as well as the learning rate of the algorithm
# A learning rate that is too low will not close in on the most accurate values within a reasonable number of iterations
# An alpha that is too high might overshoot the accurate values or cause erratic guesses
# Each iteration increases model accuracy but with diminishing returns,
# and costs a significant constant factor times O(n)*|Theta|, where n is the dataset length
initial_theta = [0, 0]
alpha = 0.1
iterations = 1000
##Logistic_Regression(X, Y, alpha, initial_theta, iterations)
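
# A small, optional sweep over candidate learning rates (a sketch, assuming the same data and
# iteration budget as above); uncomment it, like the call above, to compare how alpha affects
# the cost reported by Logistic_Regression at each checkpoint.
##for candidate_alpha in [0.01, 0.1, 0.5]:
##    print('running with alpha =', candidate_alpha)
##    Logistic_Regression(X, Y, candidate_alpha, [0, 0], iterations)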