|
|
|
@ -0,0 +1,275 @@
|
|
|
|
|
{
|
|
|
|
|
"metadata": {
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
"version": "3.7.0"
|
|
|
|
|
},
|
|
|
|
|
"orig_nbformat": 2,
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"name": "python37364bit8d3b438fb5fc4430a93ac2cb74d693a7",
|
|
|
|
|
"display_name": "Python 3.7.0 64-bit ('3.7')"
|
|
|
|
|
},
|
|
|
|
|
"metadata": {
|
|
|
|
|
"interpreter": {
|
|
|
|
|
"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 2,
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"source": [
|
|
|
|
|
"## Build an API with two different models\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"- Linear Regression\n",
|
|
|
|
|
"- Classification"
|
|
|
|
|
],
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 22,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "execute_result",
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" City Name Type Package Variety Sub Variety Grade Date \\\n",
|
|
|
|
|
"0 BALTIMORE NaN 24 inch bins NaN NaN NaN 4/29/17 \n",
|
|
|
|
|
"1 BALTIMORE NaN 24 inch bins NaN NaN NaN 5/6/17 \n",
|
|
|
|
|
"2 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 9/24/16 \n",
|
|
|
|
|
"3 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 9/24/16 \n",
|
|
|
|
|
"4 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 11/5/16 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" Low Price High Price Mostly Low ... Unit of Sale Quality Condition \\\n",
|
|
|
|
|
"0 270.0 280.0 270.0 ... NaN NaN NaN \n",
|
|
|
|
|
"1 270.0 280.0 270.0 ... NaN NaN NaN \n",
|
|
|
|
|
"2 160.0 160.0 160.0 ... NaN NaN NaN \n",
|
|
|
|
|
"3 160.0 160.0 160.0 ... NaN NaN NaN \n",
|
|
|
|
|
"4 90.0 100.0 90.0 ... NaN NaN NaN \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" Appearance Storage Crop Repack Trans Mode Unnamed: 24 Unnamed: 25 \n",
|
|
|
|
|
"0 NaN NaN NaN E NaN NaN NaN \n",
|
|
|
|
|
"1 NaN NaN NaN E NaN NaN NaN \n",
|
|
|
|
|
"2 NaN NaN NaN N NaN NaN NaN \n",
|
|
|
|
|
"3 NaN NaN NaN N NaN NaN NaN \n",
|
|
|
|
|
"4 NaN NaN NaN N NaN NaN NaN \n",
|
|
|
|
|
"\n",
|
|
|
|
|
"[5 rows x 26 columns]"
|
|
|
|
|
],
|
|
|
|
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>City Name</th>\n <th>Type</th>\n <th>Package</th>\n <th>Variety</th>\n <th>Sub Variety</th>\n <th>Grade</th>\n <th>Date</th>\n <th>Low Price</th>\n <th>High Price</th>\n <th>Mostly Low</th>\n <th>...</th>\n <th>Unit of Sale</th>\n <th>Quality</th>\n <th>Condition</th>\n <th>Appearance</th>\n <th>Storage</th>\n <th>Crop</th>\n <th>Repack</th>\n <th>Trans Mode</th>\n <th>Unnamed: 24</th>\n <th>Unnamed: 25</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>BALTIMORE</td>\n <td>NaN</td>\n <td>24 inch bins</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>4/29/17</td>\n <td>270.0</td>\n <td>280.0</td>\n <td>270.0</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>E</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>1</th>\n <td>BALTIMORE</td>\n <td>NaN</td>\n <td>24 inch bins</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>5/6/17</td>\n <td>270.0</td>\n <td>280.0</td>\n <td>270.0</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>E</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>2</th>\n <td>BALTIMORE</td>\n <td>NaN</td>\n <td>24 inch bins</td>\n <td>HOWDEN TYPE</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>9/24/16</td>\n <td>160.0</td>\n <td>160.0</td>\n <td>160.0</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>N</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>3</th>\n <td>BALTIMORE</td>\n <td>NaN</td>\n <td>24 inch bins</td>\n <td>HOWDEN TYPE</td>\n <td>NaN</td>\n 
<td>NaN</td>\n <td>9/24/16</td>\n <td>160.0</td>\n <td>160.0</td>\n <td>160.0</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>N</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>4</th>\n <td>BALTIMORE</td>\n <td>NaN</td>\n <td>24 inch bins</td>\n <td>HOWDEN TYPE</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>11/5/16</td>\n <td>90.0</td>\n <td>100.0</td>\n <td>90.0</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>N</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 26 columns</p>\n</div>"
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"execution_count": 22
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"import matplotlib.pyplot as plt\n",
|
|
|
|
|
"import numpy as np\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Use the pumpkin data from the Regression lesson\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"pumpkins = pd.read_csv('../../../Regression/data/US-pumpkins.csv')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"pumpkins.head()\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 23,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "execute_result",
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" Package Low Price High Price Price\n",
|
|
|
|
|
"70 0 5 3 13.636364\n",
|
|
|
|
|
"71 0 10 7 16.363636\n",
|
|
|
|
|
"72 0 10 7 16.363636\n",
|
|
|
|
|
"73 0 9 6 15.454545\n",
|
|
|
|
|
"74 0 5 3 13.636364"
|
|
|
|
|
],
|
|
|
|
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Package</th>\n <th>Low Price</th>\n <th>High Price</th>\n <th>Price</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>70</th>\n <td>0</td>\n <td>5</td>\n <td>3</td>\n <td>13.636364</td>\n </tr>\n <tr>\n <th>71</th>\n <td>0</td>\n <td>10</td>\n <td>7</td>\n <td>16.363636</td>\n </tr>\n <tr>\n <th>72</th>\n <td>0</td>\n <td>10</td>\n <td>7</td>\n <td>16.363636</td>\n </tr>\n <tr>\n <th>73</th>\n <td>0</td>\n <td>9</td>\n <td>6</td>\n <td>15.454545</td>\n </tr>\n <tr>\n <th>74</th>\n <td>0</td>\n <td>5</td>\n <td>3</td>\n <td>13.636364</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"execution_count": 23
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True)]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"new_columns = ['Package', 'Low Price', 'High Price']\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"## price is the average of low and high prices\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"new_pumpkins = pd.DataFrame({ 'Package': pumpkins['Package'], 'Low Price': pumpkins['Low Price'],'High Price': pumpkins['High Price'], 'Price': price})\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'), 'Price'] = price/1.1\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'), 'Price'] = price*2\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"new_pumpkins.iloc[:, 0:-1] = new_pumpkins.iloc[:, 0:-1].apply(LabelEncoder().fit_transform)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"new_pumpkins.head()\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 24,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"text": [
|
|
|
|
|
"<class 'pandas.core.frame.DataFrame'>\nInt64Index: 415 entries, 70 to 1742\nData columns (total 4 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 Package 415 non-null int64 \n 1 Low Price 415 non-null int64 \n 2 High Price 415 non-null int64 \n 3 Price 415 non-null float64\ndtypes: float64(1), int64(3)\nmemory usage: 16.2 KB\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"\n",
|
|
|
|
|
"new_pumpkins.dropna(inplace=True)\n",
|
|
|
|
|
"new_pumpkins.info()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 25,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "execute_result",
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" Package Price\n",
|
|
|
|
|
"70 0 13.636364\n",
|
|
|
|
|
"71 0 16.363636\n",
|
|
|
|
|
"72 0 16.363636\n",
|
|
|
|
|
"73 0 15.454545\n",
|
|
|
|
|
"74 0 13.636364\n",
|
|
|
|
|
"... ... ...\n",
|
|
|
|
|
"1738 2 30.000000\n",
|
|
|
|
|
"1739 2 28.750000\n",
|
|
|
|
|
"1740 2 25.750000\n",
|
|
|
|
|
"1741 2 24.000000\n",
|
|
|
|
|
"1742 2 24.000000\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"[415 rows x 2 columns]"
|
|
|
|
|
],
|
|
|
|
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Package</th>\n <th>Price</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>70</th>\n <td>0</td>\n <td>13.636364</td>\n </tr>\n <tr>\n <th>71</th>\n <td>0</td>\n <td>16.363636</td>\n </tr>\n <tr>\n <th>72</th>\n <td>0</td>\n <td>16.363636</td>\n </tr>\n <tr>\n <th>73</th>\n <td>0</td>\n <td>15.454545</td>\n </tr>\n <tr>\n <th>74</th>\n <td>0</td>\n <td>13.636364</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1738</th>\n <td>2</td>\n <td>30.000000</td>\n </tr>\n <tr>\n <th>1739</th>\n <td>2</td>\n <td>28.750000</td>\n </tr>\n <tr>\n <th>1740</th>\n <td>2</td>\n <td>25.750000</td>\n </tr>\n <tr>\n <th>1741</th>\n <td>2</td>\n <td>24.000000</td>\n </tr>\n <tr>\n <th>1742</th>\n <td>2</td>\n <td>24.000000</td>\n </tr>\n </tbody>\n</table>\n<p>415 rows × 2 columns</p>\n</div>"
|
|
|
|
|
},
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"execution_count": 25
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"new_columns = ['Package', 'Price']\n",
|
|
|
|
|
"lin_pumpkins = new_pumpkins.drop([c for c in new_pumpkins.columns if c not in new_columns], axis='columns')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"lin_pumpkins\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"source": [
|
|
|
|
|
"Set X and y arrays to correspond to Package and Price"
|
|
|
|
|
],
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 26,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"X = lin_pumpkins.values[:, :1]\n",
|
|
|
|
|
"y = lin_pumpkins.values[:, 1:2]\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 27,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"text": [
|
|
|
|
|
"Model Accuracy: 0.3315342327998989\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"from sklearn.linear_model import LinearRegression\n",
|
|
|
|
|
"from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error\n",
|
|
|
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n",
|
|
|
|
|
"lin_reg = LinearRegression()\n",
|
|
|
|
|
"lin_reg.fit(X_train,y_train)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"pred = lin_reg.predict(X_test)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"accuracy_score = lin_reg.score(X_train,y_train)\n",
|
|
|
|
|
"print('Model Accuracy: ', accuracy_score)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 41,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"text": [
|
|
|
|
|
"[[33.627655]]\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"import pickle\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Keep an in-memory copy of the serialised model as well\n",
|
|
|
|
|
"s = pickle.dumps(lin_reg)\n",
|
|
|
|
|
"model_filename = 'lin-reg-model.pkl'\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Save the fitted model; the with-block flushes and closes the file handle\n",
|
|
|
|
|
"with open(model_filename, 'wb') as model_file:\n",
|
|
|
|
|
"    pickle.dump(lin_reg, model_file)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Reload from the same path and run a sample prediction as a sanity check.\n",
|
|
|
|
|
"# Only unpickle files you trust: pickle.load can execute arbitrary code.\n",
|
|
|
|
|
"with open(model_filename, 'rb') as model_file:\n",
|
|
|
|
|
"    model = pickle.load(model_file)\n",
|
|
|
|
|
"print(model.predict([[2.85]]))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Alternative check using the in-memory bytes instead of the file\n",
|
|
|
|
|
"# clf2 = pickle.loads(s)\n",
|
|
|
|
|
"# clf2.predict([[2.75]])\n"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
}
|