import math
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from numpy import loadtxt, where
from pylab import scatter, show, legend, xlabel, ylabel

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
df = pd.read_csv("data.csv", header=0)

# clean data
df.columns = ["grade1", "grade2", "label"]
# the label column is read as strings like "1;" -- strip the trailing ';' and cast to float
df["label"] = df["label"].map(lambda s: float(s.rstrip(';')))

# format the input data into two arrays: X holds the independent variables (the two grades,
# min-max scaled to [-1, 1]) and Y holds the dependent variable (the label)
X = df[["grade1", "grade2"]]
X = np.array(X)
X = min_max_scaler.fit_transform(X)
Y = np.array(df["label"])
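
# Quick illustration of the scaling step (a sketch with made-up numbers): MinMaxScaler with
# feature_range=(-1, 1) maps each column's minimum to -1 and its maximum to +1.
demo_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
print('scaling demo: ', demo_scaler.fit_transform(np.array([[40.0], [60.0], [80.0]])).ravel())
# prints approximately [-1.  0.  1.]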

# if you want to write out a new, cleaned dataset:
##X = pd.DataFrame.from_records(X, columns=['grade1', 'grade2'])
##X.insert(2, 'label', Y)
##X.to_csv('data2.csv')

# create the training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

# train the scikit-learn model
clf = LogisticRegression()
clf.fit(X_train, Y_train)
print('score Scikit learn: ', clf.score(X_test, Y_test))
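
# Optional sanity check: look at what scikit-learn actually fit; coef_ and intercept_
# are standard attributes of a fitted LogisticRegression.
print('scikit-learn coefficients: ', clf.coef_, ' intercept: ', clf.intercept_)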

# visualize the data (comment out show() below to skip the plot)
pos = where(Y == 1)
neg = where(Y == 0)
scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
xlabel('Exam 1 score')
ylabel('Exam 2 score')
legend(['Admitted', 'Not Admitted'])
show()

##The sigmoid maps the linear hypothesis to a probability in (0, 1), so the cost function
##can penalize the algorithm proportionally to how far off its estimate is
def Sigmoid(z):
    G_of_Z = float(1.0 / float((1.0 + math.exp(-1.0 * z))))
    return G_of_Z
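
# Quick worked example (illustrative only): the sigmoid is 0.5 at z = 0 and saturates
# towards 0 and 1 for large negative/positive z.
assert abs(Sigmoid(0) - 0.5) < 1e-9
assert Sigmoid(6) > 0.99 and Sigmoid(-6) < 0.01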

##The hypothesis is the sigmoid of the linear combination of the known factors x[i]
##and their current estimated coefficients theta[i]
##This hypothesis is used to calculate each instance of the cost function
def Hypothesis(theta, x):
    z = 0
    for i in range(len(theta)):
        z += x[i] * theta[i]
    return Sigmoid(z)
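
# Quick worked example (illustrative only): with theta = [1, 2] and x = [0.5, -0.25],
# z = 1*0.5 + 2*(-0.25) = 0, so the hypothesis is Sigmoid(0) = 0.5.
assert abs(Hypothesis([1, 2], [0.5, -0.25]) - 0.5) < 1e-9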

##For each member of the dataset, the result (Y) determines which variation of the cost function is used
##The Y = 0 branch punishes high probability estimations, and the Y = 1 branch punishes low ones
##The "punishment" makes the change in the gradient of ThetaCurrent - Average(CostFunction(Dataset)) greater
def Cost_Function(X, Y, theta, m):
    sumOfErrors = 0
    for i in range(m):
        xi = X[i]
        hi = Hypothesis(theta, xi)
        if Y[i] == 1:
            error = Y[i] * math.log(hi)
        elif Y[i] == 0:
            error = (1 - Y[i]) * math.log(1 - hi)
        sumOfErrors += error
    const = -1.0 / m
    J = const * sumOfErrors
    print('cost is ', J)
    return J
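
# Quick worked example (illustrative only): two toy points with theta = [0, 0] give h = 0.5
# for both, so the cost is -(log(0.5) + log(0.5)) / 2 = log(2) ~= 0.693, the usual starting
# cost of logistic regression before any training.
assert abs(Cost_Function([[1.0, 2.0], [-1.0, 0.5]], [1, 0], [0, 0], 2) - math.log(2)) < 1e-9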

##This function creates the gradient component for each theta value
##The gradient component is the partial derivative of the cost with respect to theta[j],
##scaled by a "learning speed factor" alpha and averaged over the whole dataset
def Cost_Function_Derivative(X, Y, theta, j, m, alpha):
    sumErrors = 0
    for i in range(m):
        xi = X[i]
        xij = xi[j]
        hi = Hypothesis(theta, X[i])
        error = (hi - Y[i]) * xij
        sumErrors += error
    constant = float(alpha) / float(m)
    J = constant * sumErrors
    return J
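
# Quick worked example (illustrative only): one data point x = [2, 0] with label 1 and
# theta = [0, 0] gives h = 0.5, so the j = 0 gradient component is
# alpha/m * (0.5 - 1) * 2 = 0.1 * -1.0 = -0.1.
assert abs(Cost_Function_Derivative([[2.0, 0.0]], [1], [0, 0], 0, 1, 0.1) - (-0.1)) < 1e-9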

##For each theta, the partial derivative is computed
##The gradient, or vector from the current point in theta-space (each theta value is its own dimension)
##to the more accurate point, is the vector whose components are the partial derivatives for each theta value
def Gradient_Descent(X, Y, theta, m, alpha):
    new_theta = []
    for j in range(len(theta)):
        CFDerivative = Cost_Function_Derivative(X, Y, theta, j, m, alpha)
        new_theta_value = theta[j] - CFDerivative
        new_theta.append(new_theta_value)
    return new_theta
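
# Quick usage sketch (illustrative only, with a tiny made-up dataset): one gradient-descent
# step starting from theta = [0, 0]; each component moves opposite to its gradient component.
toy_X = [[2.0, 0.0], [0.0, 1.0]]
toy_Y = [1, 0]
print('one toy gradient step: ', Gradient_Descent(toy_X, toy_Y, [0, 0], len(toy_Y), 0.1))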

##The high-level function for the LR algorithm which, for a number of steps (num_iters), finds gradients that take
##the theta values (coefficients of the known factors) from an estimate (new_theta) closer to their "optimum estimation",
##which is the set of values that best represents the system in a linear combination model
def Logistic_Regression(X, Y, alpha, theta, num_iters):
    m = len(Y)
    for x in range(num_iters):
        new_theta = Gradient_Descent(X, Y, theta, m, alpha)
        theta = new_theta
        if x % 100 == 0:
            #here the cost function is used to report the current hypothesis of the model in the same form at each gradient-step checkpoint
            Cost_Function(X, Y, theta, m)
            print('theta ', theta)
            print('cost is ', Cost_Function(X, Y, theta, m))
    Declare_Winner(theta)

##This method compares the accuracy of the model generated by scikit-learn with the model generated by this implementation
def Declare_Winner(theta):
    score = 0
    winner = ""
    #first the scikit LR model is scored: for each independent var in the test set its prediction is compared against the dependent var
    #if the prediction matches the value measured in the dataset it counts as a point for the scikit version of LR
    scikit_score = clf.score(X_test, Y_test)
    length = len(X_test)
    for i in range(length):
        prediction = round(Hypothesis(theta, X_test[i]))
        answer = Y_test[i]
        if prediction == answer:
            score += 1
    #the same process is repeated for the implementation in this module and the scores are compared to find the higher match rate
    my_score = float(score) / float(length)
    if my_score > scikit_score:
        print('You won!')
    elif my_score == scikit_score:
        print("It's a tie!")
    else:
        print('Scikit won.. :(')
    print('Your score: ', my_score)
    print('Scikit score: ', scikit_score)

# These are the initial guesses for theta as well as the learning rate of the algorithm
# A learning rate that is too low will not close in on the most accurate values within a reasonable number of iterations
# An alpha that is too high might overshoot the accurate values or cause erratic guesses
# Each iteration increases model accuracy but with diminishing returns,
# and costs a significant constant factor times O(n)*|Theta|, where n is the dataset length
initial_theta = [0, 0]
alpha = 0.1
iterations = 1000
##Logistic_Regression(X, Y, alpha, initial_theta, iterations)
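
# A small, optional sweep over candidate learning rates (a sketch, assuming the same data and
# iteration budget as above); uncomment it, like the call above, to compare how alpha affects
# the cost reported by Logistic_Regression at each checkpoint.
##for candidate_alpha in [0.01, 0.1, 0.5]:
##    print('running with alpha =', candidate_alpha)
##    Logistic_Regression(X, Y, candidate_alpha, [0, 0], iterations)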