Solution to issue 543

4 years ago · 822a37999a
parent 27157448e0
commit 822a37999a
1 changed files with 464 additions and 0 deletions
--- a/OpenForce.ipynb
+++ b/OpenForce.ipynb
@ -0,0 +1,464 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "bfd08331",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "dc96a636",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the Baltimore pumpkin CSV from a path relative to the notebook instead of\n",
+    "# one user's Downloads folder, so the notebook runs on any machine. Place the\n",
+    "# file next to this notebook or point DATA_PATH at its location.\n",
+    "DATA_PATH = 'baltimore_9-24-2016_9-30-2017.csv'\n",
+    "pumpkins = pd.read_csv(DATA_PATH)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a5e6e008",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Commodity Name</th>\n",
+       "      <th>City Name</th>\n",
+       "      <th>Type</th>\n",
+       "      <th>Package</th>\n",
+       "      <th>Variety</th>\n",
+       "      <th>Sub Variety</th>\n",
+       "      <th>Grade</th>\n",
+       "      <th>Date</th>\n",
+       "      <th>Low Price</th>\n",
+       "      <th>High Price</th>\n",
+       "      <th>...</th>\n",
+       "      <th>Color</th>\n",
+       "      <th>Environment</th>\n",
+       "      <th>Unit of Sale</th>\n",
+       "      <th>Quality</th>\n",
+       "      <th>Condition</th>\n",
+       "      <th>Appearance</th>\n",
+       "      <th>Storage</th>\n",
+       "      <th>Crop</th>\n",
+       "      <th>Repack</th>\n",
+       "      <th>Trans Mode</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>PUMPKINS</td>\n",
+       "      <td>BALTIMORE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>24 inch bins</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>04/29/2017</td>\n",
+       "      <td>270</td>\n",
+       "      <td>280.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>E</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>PUMPKINS</td>\n",
+       "      <td>BALTIMORE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>24 inch bins</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>05/06/2017</td>\n",
+       "      <td>270</td>\n",
+       "      <td>280.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>E</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>PUMPKINS</td>\n",
+       "      <td>BALTIMORE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>24 inch bins</td>\n",
+       "      <td>HOWDEN TYPE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>09/24/2016</td>\n",
+       "      <td>160</td>\n",
+       "      <td>160.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>N</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>PUMPKINS</td>\n",
+       "      <td>BALTIMORE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>24 inch bins</td>\n",
+       "      <td>HOWDEN TYPE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>09/24/2016</td>\n",
+       "      <td>160</td>\n",
+       "      <td>160.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>N</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>PUMPKINS</td>\n",
+       "      <td>BALTIMORE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>24 inch bins</td>\n",
+       "      <td>HOWDEN TYPE</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>11/05/2016</td>\n",
+       "      <td>90</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>N</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 25 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  Commodity Name  City Name  Type       Package      Variety Sub Variety  \\\n",
+       "0       PUMPKINS  BALTIMORE   NaN  24 inch bins          NaN         NaN   \n",
+       "1       PUMPKINS  BALTIMORE   NaN  24 inch bins          NaN         NaN   \n",
+       "2       PUMPKINS  BALTIMORE   NaN  24 inch bins  HOWDEN TYPE         NaN   \n",
+       "3       PUMPKINS  BALTIMORE   NaN  24 inch bins  HOWDEN TYPE         NaN   \n",
+       "4       PUMPKINS  BALTIMORE   NaN  24 inch bins  HOWDEN TYPE         NaN   \n",
+       "\n",
+       "   Grade        Date  Low Price  High Price  ...  Color  Environment  \\\n",
+       "0    NaN  04/29/2017        270       280.0  ...    NaN          NaN   \n",
+       "1    NaN  05/06/2017        270       280.0  ...    NaN          NaN   \n",
+       "2    NaN  09/24/2016        160       160.0  ...    NaN          NaN   \n",
+       "3    NaN  09/24/2016        160       160.0  ...    NaN          NaN   \n",
+       "4    NaN  11/05/2016         90       100.0  ...    NaN          NaN   \n",
+       "\n",
+       "  Unit of Sale  Quality Condition Appearance  Storage Crop  Repack  Trans Mode  \n",
+       "0          NaN      NaN       NaN        NaN      NaN  NaN       E         NaN  \n",
+       "1          NaN      NaN       NaN        NaN      NaN  NaN       E         NaN  \n",
+       "2          NaN      NaN       NaN        NaN      NaN  NaN       N         NaN  \n",
+       "3          NaN      NaN       NaN        NaN      NaN  NaN       N         NaN  \n",
+       "4          NaN      NaN       NaN        NaN      NaN  NaN       N         NaN  \n",
+       "\n",
+       "[5 rows x 25 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview the first five rows of the loaded frame.\n",
+    "pumpkins.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "7d5eb162",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Month</th>\n",
+       "      <th>Variety</th>\n",
+       "      <th>City</th>\n",
+       "      <th>Package</th>\n",
+       "      <th>Low Price</th>\n",
+       "      <th>High Price</th>\n",
+       "      <th>Price</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>70</th>\n",
+       "      <td>9</td>\n",
+       "      <td>PIE TYPE</td>\n",
+       "      <td>BALTIMORE</td>\n",
+       "      <td>1 1/9 bushel cartons</td>\n",
+       "      <td>15</td>\n",
+       "      <td>15.0</td>\n",
+       "      <td>13.636364</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71</th>\n",
+       "      <td>9</td>\n",
+       "      <td>PIE TYPE</td>\n",
+       "      <td>BALTIMORE</td>\n",
+       "      <td>1 1/9 bushel cartons</td>\n",
+       "      <td>18</td>\n",
+       "      <td>18.0</td>\n",
+       "      <td>16.363636</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>72</th>\n",
+       "      <td>10</td>\n",
+       "      <td>PIE TYPE</td>\n",
+       "      <td>BALTIMORE</td>\n",
+       "      <td>1 1/9 bushel cartons</td>\n",
+       "      <td>18</td>\n",
+       "      <td>18.0</td>\n",
+       "      <td>16.363636</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>73</th>\n",
+       "      <td>10</td>\n",
+       "      <td>PIE TYPE</td>\n",
+       "      <td>BALTIMORE</td>\n",
+       "      <td>1 1/9 bushel cartons</td>\n",
+       "      <td>17</td>\n",
+       "      <td>17.0</td>\n",
+       "      <td>15.454545</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>74</th>\n",
+       "      <td>10</td>\n",
+       "      <td>PIE TYPE</td>\n",
+       "      <td>BALTIMORE</td>\n",
+       "      <td>1 1/9 bushel cartons</td>\n",
+       "      <td>15</td>\n",
+       "      <td>15.0</td>\n",
+       "      <td>13.636364</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    Month   Variety       City               Package  Low Price  High Price  \\\n",
+       "70      9  PIE TYPE  BALTIMORE  1 1/9 bushel cartons         15        15.0   \n",
+       "71      9  PIE TYPE  BALTIMORE  1 1/9 bushel cartons         18        18.0   \n",
+       "72     10  PIE TYPE  BALTIMORE  1 1/9 bushel cartons         18        18.0   \n",
+       "73     10  PIE TYPE  BALTIMORE  1 1/9 bushel cartons         17        17.0   \n",
+       "74     10  PIE TYPE  BALTIMORE  1 1/9 bushel cartons         15        15.0   \n",
+       "\n",
+       "        Price  \n",
+       "70  13.636364  \n",
+       "71  16.363636  \n",
+       "72  16.363636  \n",
+       "73  15.454545  \n",
+       "74  13.636364  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Keep only rows whose package is measured in bushels. na=False treats a missing\n",
+    "# Package as a non-match instead of raising when the mask is applied.\n",
+    "is_bushel = pumpkins['Package'].str.contains('bushel', case=True, regex=True, na=False)\n",
+    "\n",
+    "# Bind the subset to its own name so `pumpkins` keeps referring to the frame that\n",
+    "# was loaded from the CSV and this cell can be re-run without side effects.\n",
+    "keep_columns = ['Package', 'Variety', 'City Name', 'Low Price', 'High Price', 'Date']\n",
+    "bushel_pumpkins = pumpkins.loc[is_bushel, keep_columns]\n",
+    "\n",
+    "# Mid-point of the quoted low and high price for each row.\n",
+    "price = (bushel_pumpkins['Low Price'] + bushel_pumpkins['High Price']) / 2\n",
+    "\n",
+    "month = pd.DatetimeIndex(bushel_pumpkins['Date']).month\n",
+    "\n",
+    "new_pumpkins = pd.DataFrame({\n",
+    "    'Month': month,\n",
+    "    'Variety': bushel_pumpkins['Variety'],\n",
+    "    'City': bushel_pumpkins['City Name'],\n",
+    "    'Package': bushel_pumpkins['Package'],\n",
+    "    'Low Price': bushel_pumpkins['Low Price'],\n",
+    "    'High Price': bushel_pumpkins['High Price'],\n",
+    "    'Price': price,\n",
+    "})\n",
+    "\n",
+    "# Normalise prices to a single bushel. A '1 1/9 bushel' package holds 10/9\n",
+    "# bushels (dividing by 1.1 overstates the price by about 1%), and a '1/2 bushel'\n",
+    "# package holds half a bushel.\n",
+    "is_one_and_ninth = new_pumpkins['Package'].str.contains('1 1/9')\n",
+    "new_pumpkins.loc[is_one_and_ninth, 'Price'] = price[is_one_and_ninth] / (1 + 1/9)\n",
+    "\n",
+    "is_half = new_pumpkins['Package'].str.contains('1/2')\n",
+    "new_pumpkins.loc[is_half, 'Price'] = price[is_half] * 2\n",
+    "\n",
+    "new_pumpkins.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "bde5818a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Features are every column except Price; Price is the target.\n",
+    "X = new_pumpkins.drop(columns='Price')\n",
+    "y = new_pumpkins['Price'].copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "5ce08713",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Hold out a quarter of the rows for testing; the fixed random_state keeps the\n",
+    "# split identical between runs.\n",
+    "xtrain, xtest, ytrain, ytest = train_test_split(\n",
+    "    X,\n",
+    "    y,\n",
+    "    test_size=0.25,\n",
+    "    random_state=0,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "efad6351",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.preprocessing import OrdinalEncoder\n",
+    "\n",
+    "# Columns with object dtype are the ones that get ordinal-encoded.\n",
+    "object_cols = [name for name, dtype in xtrain.dtypes.items() if dtype == 'object']\n",
+    "\n",
+    "# Fit the encoder on the training split only, then reuse it on the test split.\n",
+    "ordinal_encoder = OrdinalEncoder()\n",
+    "train_codes = ordinal_encoder.fit_transform(xtrain[object_cols])\n",
+    "test_codes = ordinal_encoder.transform(xtest[object_cols])\n",
+    "\n",
+    "# Work on copies so the original splits keep their text values.\n",
+    "label_x_train = xtrain.copy()\n",
+    "label_x_train[object_cols] = train_codes\n",
+    "label_x_test = xtest.copy()\n",
+    "label_x_test[object_cols] = test_codes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "7f8943bc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.9791305564379404\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Correlation between the encoded Package column and the target on the training split.\n",
+    "train_package_corr = label_x_train['Package'].corr(ytrain)\n",
+    "print(train_package_corr)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "1c5c2b3c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.9759780821029631\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Correlation between the encoded Package column and the target on the test split.\n",
+    "test_package_corr = label_x_test['Package'].corr(ytest)\n",
+    "print(test_package_corr)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}