{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" }, "orig_nbformat": 2, "kernelspec": { "name": "python37364bit8d3b438fb5fc4430a93ac2cb74d693a7", "display_name": "Python 3.7.0 64-bit ('3.7')" }, "metadata": { "interpreter": { "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d" } } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "source": [ "## Build an API with two different models\n", "\n", "Linear Regression\n", "Classification" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " City Name Type Package Variety Sub Variety Grade Date \\\n", "0 BALTIMORE NaN 24 inch bins NaN NaN NaN 4/29/17 \n", "1 BALTIMORE NaN 24 inch bins NaN NaN NaN 5/6/17 \n", "2 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 9/24/16 \n", "3 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 9/24/16 \n", "4 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 11/5/16 \n", "\n", " Low Price High Price Mostly Low ... Unit of Sale Quality Condition \\\n", "0 270.0 280.0 270.0 ... NaN NaN NaN \n", "1 270.0 280.0 270.0 ... NaN NaN NaN \n", "2 160.0 160.0 160.0 ... NaN NaN NaN \n", "3 160.0 160.0 160.0 ... NaN NaN NaN \n", "4 90.0 100.0 90.0 ... NaN NaN NaN \n", "\n", " Appearance Storage Crop Repack Trans Mode Unnamed: 24 Unnamed: 25 \n", "0 NaN NaN NaN E NaN NaN NaN \n", "1 NaN NaN NaN E NaN NaN NaN \n", "2 NaN NaN NaN N NaN NaN NaN \n", "3 NaN NaN NaN N NaN NaN NaN \n", "4 NaN NaN NaN N NaN NaN NaN \n", "\n", "[5 rows x 26 columns]" ], "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
City NameTypePackageVarietySub VarietyGradeDateLow PriceHigh PriceMostly Low...Unit of SaleQualityConditionAppearanceStorageCropRepackTrans ModeUnnamed: 24Unnamed: 25
0BALTIMORENaN24 inch binsNaNNaNNaN4/29/17270.0280.0270.0...NaNNaNNaNNaNNaNNaNENaNNaNNaN
1BALTIMORENaN24 inch binsNaNNaNNaN5/6/17270.0280.0270.0...NaNNaNNaNNaNNaNNaNENaNNaNNaN
2BALTIMORENaN24 inch binsHOWDEN TYPENaNNaN9/24/16160.0160.0160.0...NaNNaNNaNNaNNaNNaNNNaNNaNNaN
3BALTIMORENaN24 inch binsHOWDEN TYPENaNNaN9/24/16160.0160.0160.0...NaNNaNNaNNaNNaNNaNNNaNNaNNaN
4BALTIMORENaN24 inch binsHOWDEN TYPENaNNaN11/5/1690.0100.090.0...NaNNaNNaNNaNNaNNaNNNaNNaNNaN
\n

5 rows × 26 columns

\n
" }, "metadata": {}, "execution_count": 22 } ], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "\n", "# Use the pumpkin data from Lesso\n", "\n", "pumpkins = pd.read_csv('../../../Regression/data/US-pumpkins.csv')\n", "\n", "pumpkins.head()\n" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Package Low Price High Price Price\n", "70 0 5 3 13.636364\n", "71 0 10 7 16.363636\n", "72 0 10 7 16.363636\n", "73 0 9 6 15.454545\n", "74 0 5 3 13.636364" ], "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
PackageLow PriceHigh PricePrice
7005313.636364
71010716.363636
72010716.363636
7309615.454545
7405313.636364
\n
" }, "metadata": {}, "execution_count": 23 } ], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "\n", "pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True)]\n", "\n", "new_columns = ['Package', 'Low Price', 'High Price']\n", "\n", "pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)\n", "\n", "## price is the average of low and high prices\n", "\n", "price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2\n", "\n", "new_pumpkins = pd.DataFrame({ 'Package': pumpkins['Package'], 'Low Price': pumpkins['Low Price'],'High Price': pumpkins['High Price'], 'Price': price})\n", "\n", "new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'), 'Price'] = price/1.1\n", "\n", "new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'), 'Price'] = price*2\n", "\n", "new_pumpkins.iloc[:, 0:-1] = new_pumpkins.iloc[:, 0:-1].apply(LabelEncoder().fit_transform)\n", "\n", "new_pumpkins.head()\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\nInt64Index: 415 entries, 70 to 1742\nData columns (total 4 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 Package 415 non-null int64 \n 1 Low Price 415 non-null int64 \n 2 High Price 415 non-null int64 \n 3 Price 415 non-null float64\ndtypes: float64(1), int64(3)\nmemory usage: 16.2 KB\n" ] } ], "source": [ "\n", "new_pumpkins.dropna(inplace=True)\n", "new_pumpkins.info()\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Package Price\n", "70 0 13.636364\n", "71 0 16.363636\n", "72 0 16.363636\n", "73 0 15.454545\n", "74 0 13.636364\n", "... ... ...\n", "1738 2 30.000000\n", "1739 2 28.750000\n", "1740 2 25.750000\n", "1741 2 24.000000\n", "1742 2 24.000000\n", "\n", "[415 rows x 2 columns]" ], "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
PackagePrice
70013.636364
71016.363636
72016.363636
73015.454545
74013.636364
.........
1738230.000000
1739228.750000
1740225.750000
1741224.000000
1742224.000000
\n

415 rows × 2 columns

\n
" }, "metadata": {}, "execution_count": 25 } ], "source": [ "new_columns = ['Package', 'Price']\n", "lin_pumpkins = new_pumpkins.drop([c for c in new_pumpkins.columns if c not in new_columns], axis='columns')\n", "\n", "lin_pumpkins\n" ] }, { "source": [ "Set X and y arrays to correspond to Package and Price" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "X = lin_pumpkins.values[:, :1]\n", "y = lin_pumpkins.values[:, 1:2]\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Model Accuracy: 0.3315342327998989\n" ] } ], "source": [ "from sklearn.linear_model import LinearRegression\n", "from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error\n", "from sklearn.model_selection import train_test_split\n", "\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", "lin_reg = LinearRegression()\n", "lin_reg.fit(X_train,y_train)\n", "\n", "pred = lin_reg.predict(X_test)\n", "\n", "accuracy_score = lin_reg.score(X_train,y_train)\n", "print('Model Accuracy: ', accuracy_score)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[[33.627655]]\n" ] } ], "source": [ "import pickle\n", "s = pickle.dumps(lin_reg)\n", "model_filename = 'lin-reg-model.pkl'\n", "# Open the file to save as pkl file\n", "pickle.dump(lin_reg, open(model_filename,'wb'))\n", "\n", "model = pickle.load(open('lin-reg-model.pkl','rb'))\n", "print(model.predict([[2.85]]))\n", "\n", "# Close the pickle instances\n", "# clf2 = pickle.loads(s)\n", "# clf2.predict([[2.75]])\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ] }