|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
from linear_regression import LinearRegression
|
|
|
data = pd.read_csv('../data/world-happiness-report-2017.csv') # 导入数据
|
|
|
# 得到训练和测试数据,以8:2切分
|
|
|
train_data = data.sample(frac=0.8)
|
|
|
test_data = data.drop(train_data.index)
|
|
|
|
|
|
input_param_name = 'Economy..GDP.per.Capita.' # 特征features
|
|
|
output_param_name = 'Happiness.Score' # 标签label
|
|
|
|
|
|
x_train = train_data[[input_param_name]].values # 构建数据
|
|
|
y_train = train_data[[output_param_name]].values
|
|
|
|
|
|
x_test = test_data[[input_param_name]].values
|
|
|
y_test = test_data[[output_param_name]].values
|
|
|
|
|
|
# 可视化展示 run, 可以看到训练数据和预测数据的分布
|
|
|
plt.scatter(x_train, y_train, label='Train data')
|
|
|
plt.scatter(x_test, y_test, label='Test data')
|
|
|
plt.xlabel(input_param_name)
|
|
|
plt.ylabel(output_param_name)
|
|
|
plt.title('Happy')
|
|
|
plt.legend()
|
|
|
plt.show()
|
|
|
|
|
|
# 训练线性回归模型
|
|
|
num_iterations = 500 # 迭代次数
|
|
|
learning_rate = 0.01 # 学习率
|
|
|
|
|
|
linear_regression = LinearRegression(x_train, y_train) # 初始化模型
|
|
|
(theta, cost_history) = linear_regression.train(learning_rate, num_iterations)
|
|
|
|
|
|
print('开始时的损失:', cost_history[0])
|
|
|
print('训练后的损失:', cost_history[-1])
|
|
|
|
|
|
plt.plot(range(num_iterations), cost_history)
|
|
|
plt.xlabel('Iteration')
|
|
|
plt.ylabel('Cost')
|
|
|
plt.title('GD')
|
|
|
plt.show()
|
|
|
|
|
|
# 测试线性回归模型
|
|
|
predictions_num = 100 # 预测100个
|
|
|
# 拿最大和最小值画一条线
|
|
|
x_predictions = np.linspace(x_train.min(), x_train.max(), predictions_num).reshape(predictions_num, 1)
|
|
|
y_predictions = linear_regression.predict(x_predictions)
|
|
|
|
|
|
plt.scatter(x_train, y_train, label='Train data')
|
|
|
plt.scatter(x_test, y_test, label='Test data')
|
|
|
plt.plot(x_predictions, y_predictions, 'r', label='Prediction')
|
|
|
plt.xlabel(input_param_name)
|
|
|
plt.ylabel(output_param_name)
|
|
|
plt.title('Happy')
|
|
|
plt.legend()
|
|
|
plt.show()
|