import math
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from numpy import loadtxt, where
from pylab import scatter, show, legend, xlabel, ylabel

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
df = pd.read_csv("data.csv", header=0)

# clean data: name the columns
df.columns = ["grade1", "grade2", "label"]

# format the input data into two arrays: X holds the independent variables
# (the two grades, scaled to [-1, 1]) and Y holds the dependent variable (the label)
X = df[["grade1", "grade2"]]
X = np.array(X)
X = min_max_scaler.fit_transform(X)
# the label values carry a trailing ';', so strip it before converting to float
Y = df["label"].map(lambda x: float(x.rstrip(';')))
Y = np.array(Y)

# optionally write out a cleaned copy of the dataset
##X = pd.DataFrame.from_records(X, columns=['grade1', 'grade2'])
##X.insert(2, 'label', Y)
##X.to_csv('data2.csv')

# create the training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

# train the scikit-learn model
clf = LogisticRegression()
clf.fit(X_train, Y_train)
print('score Scikit learn: ', clf.score(X_test, Y_test))

# visualize the data
pos = where(Y == 1)[0]
neg = where(Y == 0)[0]
scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
xlabel('Exam 1 score')
ylabel('Exam 2 score')
legend(['Admitted', 'Not Admitted'])
show()


##The sigmoid function maps the linear combination z to a value in (0, 1),
##so the cost function can penalise worse estimations proportionally
def Sigmoid(z):
    G_of_Z = 1.0 / (1.0 + math.exp(-1.0 * z))
    return G_of_Z
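
##Sketch (an added alternative, not part of the original flow): for large negative z,
##math.exp(-z) can overflow, so a numerically safer variant clips z first; the bound of 500
##is an arbitrary choice.
def Sigmoid_stable(z):
    z = max(min(z, 500.0), -500.0)  # clamp z so exp() stays within float range
    return 1.0 / (1.0 + math.exp(-z))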

##The hypothesis is the sigmoid of the linear combination of the known factors x[i]
##and their current estimated coefficients theta[i]; it is evaluated for each instance in the cost function
def Hypothesis(theta, x):
    z = 0
    for i in range(len(theta)):
        z += x[i] * theta[i]
    return Sigmoid(z)
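
##Sketch (an added alternative, not part of the original flow): with numpy arrays the same
##linear combination is a single dot product.
def Hypothesis_vectorized(theta, x):
    return Sigmoid(np.dot(x, theta))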

##For each member of the dataset, the label Y determines which branch of the cost function is used:
##when Y = 1 the cost punishes low probability estimates, and when Y = 0 it punishes high ones.
##The larger the error, the larger its contribution to the average cost and hence to the gradient step.
def Cost_Function(X, Y, theta, m):
    sumOfErrors = 0
    for i in range(m):
        xi = X[i]
        hi = Hypothesis(theta, xi)
        if Y[i] == 1:
            error = Y[i] * math.log(hi)
        elif Y[i] == 0:
            error = (1 - Y[i]) * math.log(1 - hi)
        sumOfErrors += error
    const = -1.0 / m
    J = const * sumOfErrors
    print('cost is ', J)
    return J
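
##Sketch (an added alternative to the loop above): the same cross-entropy cost,
##J(theta) = -(1/m) * sum(y*log(h) + (1-y)*log(1-h)), computed in one pass with numpy.
##The small clip is an added safeguard against log(0), not something the original code uses.
def Cost_Function_vectorized(X, Y, theta):
    h = 1.0 / (1.0 + np.exp(-np.dot(X, theta)))  # hypothesis for every row at once
    h = np.clip(h, 1e-12, 1 - 1e-12)             # keep log() finite
    return -np.mean(Y * np.log(h) + (1 - Y) * np.log(1 - h))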

##This function computes the gradient component for a single theta[j]: the learning rate alpha
##times the average, over the dataset, of the prediction error (hypothesis minus label)
##weighted by the j-th feature value.
def Cost_Function_Derivative(X, Y, theta, j, m, alpha):
    sumErrors = 0
    for i in range(m):
        xi = X[i]
        xij = xi[j]
        hi = Hypothesis(theta, xi)
        error = (hi - Y[i]) * xij
        sumErrors += error
    constant = float(alpha) / float(m)
    J = constant * sumErrors
    return J


##The gradient is the vector from the current point in theta-space (each theta value is its own
##dimension) toward a more accurate point; each of its components is the partial derivative
##computed above, and subtracting it from the current theta gives the new estimate.
def Gradient_Descent(X, Y, theta, m, alpha):
    new_theta = []
    for j in range(len(theta)):
        CFDerivative = Cost_Function_Derivative(X, Y, theta, j, m, alpha)
        new_theta_value = theta[j] - CFDerivative
        new_theta.append(new_theta_value)
    return new_theta
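
##Sketch (an added alternative to the per-component loop above): the whole update
##theta := theta - (alpha / m) * X^T (h - y) is a single matrix product with numpy.
def Gradient_Descent_vectorized(X, Y, theta, alpha):
    theta = np.asarray(theta, dtype=float)
    h = 1.0 / (1.0 + np.exp(-np.dot(X, theta)))  # predictions for every row
    grad = np.dot(X.T, h - Y) / len(Y)           # average gradient per theta component
    return theta - alpha * grad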

##The high-level function for the LR algorithm: for num_iters steps it computes the gradient
##and moves the theta values (the coefficients of the known factors) from the current estimate
##toward the set of values that best represents the data in this linear-combination model.
def Logistic_Regression(X, Y, alpha, theta, num_iters):
    m = len(Y)
    for it in range(num_iters):
        theta = Gradient_Descent(X, Y, theta, m, alpha)
        if it % 100 == 0:
            # every 100 steps, report the current theta and the cost of the model using it
            print('theta ', theta)
            Cost_Function(X, Y, theta, m)
    Declare_Winner(theta)
    return theta


##This function compares the accuracy of the model trained by scikit-learn with the model
##trained by this implementation, using the held-out test set.
def Declare_Winner(theta):
    score = 0
    # scikit-learn's model is scored on the test set; score() returns the fraction of test
    # samples whose predicted label matches the measured label
    scikit_score = clf.score(X_test, Y_test)
    length = len(X_test)
    # the same check is repeated for this module's model so the two match-rates can be compared
    for i in range(length):
        prediction = round(Hypothesis(theta, X_test[i]))
        answer = Y_test[i]
        if prediction == answer:
            score += 1
    my_score = float(score) / float(length)
    if my_score > scikit_score:
        print('You won!')
    elif my_score == scikit_score:
        print("It's a tie!")
    else:
        print('Scikit won.. :(')
    print('Your score: ', my_score)
    print("Scikit's score: ", scikit_score)

# These are the initial guesses for theta and the learning rate (alpha) of the algorithm.
# A learning rate that is too low will not close in on the most accurate values within a
# reasonable number of iterations; an alpha that is too high may overshoot the accurate values
# or cause erratic guesses.
# Each iteration increases model accuracy with diminishing returns, and each one costs a
# significant constant times O(n) * |theta| work, where n is the dataset length.
initial_theta = [0, 0]
alpha = 0.1
iterations = 1000
##Logistic_Regression(X, Y, alpha, initial_theta, iterations)
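
##Usage sketch (kept commented out, like the call above): train the custom model and score a
##hypothetical new student. The grade values below are invented for illustration, and the new
##sample must be scaled with the same min_max_scaler that was fitted above.
##trained_theta = Logistic_Regression(X_train, Y_train, alpha, initial_theta, iterations)
##new_sample = min_max_scaler.transform([[45.0, 85.0]])[0]
##print('admission probability: ', Hypothesis(trained_theta, new_sample))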