Solution to issue 543

pull/552/head
Richa 4 years ago
parent 27157448e0
commit 822a37999a

@ -0,0 +1,464 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bfd08331",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "dc96a636",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Location of the Baltimore pumpkin price CSV. The default is the path this notebook\n",
"# was originally run with; set the PUMPKINS_CSV environment variable to point at\n",
"# your own copy of the file instead of editing this cell.\n",
"PUMPKINS_CSV = os.environ.get(\n",
"    'PUMPKINS_CSV',\n",
"    'C:/Users/admin/Downloads/baltimore_9-24-2016_9-30-2017.csv',\n",
")\n",
"\n",
"# Frame exactly as read from disk.\n",
"pumpkins = pd.read_csv(PUMPKINS_CSV)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a5e6e008",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Commodity Name</th>\n",
" <th>City Name</th>\n",
" <th>Type</th>\n",
" <th>Package</th>\n",
" <th>Variety</th>\n",
" <th>Sub Variety</th>\n",
" <th>Grade</th>\n",
" <th>Date</th>\n",
" <th>Low Price</th>\n",
" <th>High Price</th>\n",
" <th>...</th>\n",
" <th>Color</th>\n",
" <th>Environment</th>\n",
" <th>Unit of Sale</th>\n",
" <th>Quality</th>\n",
" <th>Condition</th>\n",
" <th>Appearance</th>\n",
" <th>Storage</th>\n",
" <th>Crop</th>\n",
" <th>Repack</th>\n",
" <th>Trans Mode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PUMPKINS</td>\n",
" <td>BALTIMORE</td>\n",
" <td>NaN</td>\n",
" <td>24 inch bins</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>04/29/2017</td>\n",
" <td>270</td>\n",
" <td>280.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>E</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>PUMPKINS</td>\n",
" <td>BALTIMORE</td>\n",
" <td>NaN</td>\n",
" <td>24 inch bins</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>05/06/2017</td>\n",
" <td>270</td>\n",
" <td>280.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>E</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>PUMPKINS</td>\n",
" <td>BALTIMORE</td>\n",
" <td>NaN</td>\n",
" <td>24 inch bins</td>\n",
" <td>HOWDEN TYPE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>09/24/2016</td>\n",
" <td>160</td>\n",
" <td>160.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>PUMPKINS</td>\n",
" <td>BALTIMORE</td>\n",
" <td>NaN</td>\n",
" <td>24 inch bins</td>\n",
" <td>HOWDEN TYPE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>09/24/2016</td>\n",
" <td>160</td>\n",
" <td>160.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>PUMPKINS</td>\n",
" <td>BALTIMORE</td>\n",
" <td>NaN</td>\n",
" <td>24 inch bins</td>\n",
" <td>HOWDEN TYPE</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>11/05/2016</td>\n",
" <td>90</td>\n",
" <td>100.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>N</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" Commodity Name City Name Type Package Variety Sub Variety \\\n",
"0 PUMPKINS BALTIMORE NaN 24 inch bins NaN NaN \n",
"1 PUMPKINS BALTIMORE NaN 24 inch bins NaN NaN \n",
"2 PUMPKINS BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN \n",
"3 PUMPKINS BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN \n",
"4 PUMPKINS BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN \n",
"\n",
" Grade Date Low Price High Price ... Color Environment \\\n",
"0 NaN 04/29/2017 270 280.0 ... NaN NaN \n",
"1 NaN 05/06/2017 270 280.0 ... NaN NaN \n",
"2 NaN 09/24/2016 160 160.0 ... NaN NaN \n",
"3 NaN 09/24/2016 160 160.0 ... NaN NaN \n",
"4 NaN 11/05/2016 90 100.0 ... NaN NaN \n",
"\n",
" Unit of Sale Quality Condition Appearance Storage Crop Repack Trans Mode \n",
"0 NaN NaN NaN NaN NaN NaN E NaN \n",
"1 NaN NaN NaN NaN NaN NaN E NaN \n",
"2 NaN NaN NaN NaN NaN NaN N NaN \n",
"3 NaN NaN NaN NaN NaN NaN N NaN \n",
"4 NaN NaN NaN NaN NaN NaN N NaN \n",
"\n",
"[5 rows x 25 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the loaded frame.\n",
"pumpkins.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7d5eb162",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Month</th>\n",
" <th>Variety</th>\n",
" <th>City</th>\n",
" <th>Package</th>\n",
" <th>Low Price</th>\n",
" <th>High Price</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>9</td>\n",
" <td>PIE TYPE</td>\n",
" <td>BALTIMORE</td>\n",
" <td>1 1/9 bushel cartons</td>\n",
" <td>15</td>\n",
" <td>15.0</td>\n",
" <td>13.636364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>9</td>\n",
" <td>PIE TYPE</td>\n",
" <td>BALTIMORE</td>\n",
" <td>1 1/9 bushel cartons</td>\n",
" <td>18</td>\n",
" <td>18.0</td>\n",
" <td>16.363636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>10</td>\n",
" <td>PIE TYPE</td>\n",
" <td>BALTIMORE</td>\n",
" <td>1 1/9 bushel cartons</td>\n",
" <td>18</td>\n",
" <td>18.0</td>\n",
" <td>16.363636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>10</td>\n",
" <td>PIE TYPE</td>\n",
" <td>BALTIMORE</td>\n",
" <td>1 1/9 bushel cartons</td>\n",
" <td>17</td>\n",
" <td>17.0</td>\n",
" <td>15.454545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>10</td>\n",
" <td>PIE TYPE</td>\n",
" <td>BALTIMORE</td>\n",
" <td>1 1/9 bushel cartons</td>\n",
" <td>15</td>\n",
" <td>15.0</td>\n",
" <td>13.636364</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Month Variety City Package Low Price High Price \\\n",
"70 9 PIE TYPE BALTIMORE 1 1/9 bushel cartons 15 15.0 \n",
"71 9 PIE TYPE BALTIMORE 1 1/9 bushel cartons 18 18.0 \n",
"72 10 PIE TYPE BALTIMORE 1 1/9 bushel cartons 18 18.0 \n",
"73 10 PIE TYPE BALTIMORE 1 1/9 bushel cartons 17 17.0 \n",
"74 10 PIE TYPE BALTIMORE 1 1/9 bushel cartons 15 15.0 \n",
"\n",
" Price \n",
"70 13.636364 \n",
"71 16.363636 \n",
"72 16.363636 \n",
"73 15.454545 \n",
"74 13.636364 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only rows whose package is measured in bushels so prices can be put on one scale.\n",
"# na=False treats a missing Package as 'not a bushel'; without it a NaN in the mask\n",
"# makes the boolean indexing below raise.\n",
"sold_by_bushel = pumpkins['Package'].str.contains('bushel', case=True, regex=True, na=False)\n",
"pumpkins = pumpkins[sold_by_bushel]\n",
"\n",
"# Columns to keep. Names in this list that the frame does not have are simply never\n",
"# matched, because only the frame's own columns are tested against it.\n",
"new_columns = ['Package', 'Variety', 'City Name', 'Month', 'Low Price', 'High Price', 'Date', 'City Num', 'Variety Num']\n",
"pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)\n",
"\n",
"# Midpoint of the low and high quotes, and the calendar month parsed from Date.\n",
"price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2\n",
"month = pd.DatetimeIndex(pumpkins['Date']).month\n",
"\n",
"new_pumpkins = pd.DataFrame({\n",
"    'Month': month,\n",
"    'Variety': pumpkins['Variety'],\n",
"    'City': pumpkins['City Name'],\n",
"    'Package': pumpkins['Package'],\n",
"    'Low Price': pumpkins['Low Price'],\n",
"    'High Price': pumpkins['High Price'],\n",
"    'Price': price,\n",
"})\n",
"\n",
"# Normalise Price to a per-bushel figure. A '1 1/9 bushel' package holds 1 + 1/9\n",
"# (about 1.111) bushels, so divide by that exact ratio rather than a rounded 1.1.\n",
"new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'), 'Price'] = price / (1 + 1/9)\n",
"\n",
"# A '1/2 bushel' package holds half a bushel, so the per-bushel price is twice the quote.\n",
"new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'), 'Price'] = price * 2\n",
"\n",
"new_pumpkins.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bde5818a",
"metadata": {},
"outputs": [],
"source": [
"# Target is the Price column; the features are every other column.\n",
"# Both are independent of new_pumpkins, which is left unchanged.\n",
"y = new_pumpkins['Price'].copy()\n",
"X = new_pumpkins.drop(columns='Price')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5ce08713",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.\n",
"xtrain, xtest, ytrain, ytest = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.25,\n",
"    random_state=0,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "efad6351",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OrdinalEncoder\n",
"\n",
"# Boolean flag per feature column: True where the column has object dtype.\n",
"s = xtrain.dtypes == 'object'\n",
"object_cols = [name for name, is_object in s.items() if is_object]\n",
"\n",
"# Work on copies so the unencoded splits stay available.\n",
"label_x_train = xtrain.copy()\n",
"label_x_test = xtest.copy()\n",
"\n",
"# Learn the category codes from the training split only, then apply the same\n",
"# mapping to the test split.\n",
"ordinal_encoder = OrdinalEncoder()\n",
"label_x_train[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])\n",
"label_x_test[object_cols] = ordinal_encoder.transform(xtest[object_cols])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7f8943bc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9791305564379404\n"
]
}
],
"source": [
"# Correlation between the encoded Package column and the target on the training split.\n",
"train_package_corr = label_x_train['Package'].corr(ytrain)\n",
"print(train_package_corr)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "1c5c2b3c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9759780821029631\n"
]
}
],
"source": [
"# Correlation between the encoded Package column and the target on the test split.\n",
"test_package_corr = label_x_test['Package'].corr(ytest)\n",
"print(test_package_corr)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading…
Cancel
Save