diff --git a/2-Regression/2-Data/notebook.ipynb b/2-Regression/2-Data/notebook.ipynb index c9b9925b..8dc1ea8a 100644 --- a/2-Regression/2-Data/notebook.ipynb +++ b/2-Regression/2-Data/notebook.ipynb @@ -1,5 +1,331 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build a regression model using Scikit-learn: prepare and visualize data" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
City NameTypePackageVarietySub VarietyGradeDateLow PriceHigh PriceMostly Low...Unit of SaleQualityConditionAppearanceStorageCropRepackTrans ModeUnnamed: 24Unnamed: 25
0BALTIMORENaN24 inch binsNaNNaNNaN4/29/17270.0280.0270.0...NaNNaNNaNNaNNaNNaNENaNNaNNaN
1BALTIMORENaN24 inch binsNaNNaNNaN5/6/17270.0280.0270.0...NaNNaNNaNNaNNaNNaNENaNNaNNaN
2BALTIMORENaN24 inch binsHOWDEN TYPENaNNaN9/24/16160.0160.0160.0...NaNNaNNaNNaNNaNNaNNNaNNaNNaN
3BALTIMORENaN24 inch binsHOWDEN TYPENaNNaN9/24/16160.0160.0160.0...NaNNaNNaNNaNNaNNaNNNaNNaNNaN
4BALTIMORENaN24 inch binsHOWDEN TYPENaNNaN11/5/1690.0100.090.0...NaNNaNNaNNaNNaNNaNNNaNNaNNaN
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " City Name Type Package Variety Sub Variety Grade Date \\\n", + "0 BALTIMORE NaN 24 inch bins NaN NaN NaN 4/29/17 \n", + "1 BALTIMORE NaN 24 inch bins NaN NaN NaN 5/6/17 \n", + "2 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 9/24/16 \n", + "3 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 9/24/16 \n", + "4 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 11/5/16 \n", + "\n", + " Low Price High Price Mostly Low ... Unit of Sale Quality Condition \\\n", + "0 270.0 280.0 270.0 ... NaN NaN NaN \n", + "1 270.0 280.0 270.0 ... NaN NaN NaN \n", + "2 160.0 160.0 160.0 ... NaN NaN NaN \n", + "3 160.0 160.0 160.0 ... NaN NaN NaN \n", + "4 90.0 100.0 90.0 ... NaN NaN NaN \n", + "\n", + " Appearance Storage Crop Repack Trans Mode Unnamed: 24 Unnamed: 25 \n", + "0 NaN NaN NaN E NaN NaN NaN \n", + "1 NaN NaN NaN E NaN NaN NaN \n", + "2 NaN NaN NaN N NaN NaN NaN \n", + "3 NaN NaN NaN N NaN NaN NaN \n", + "4 NaN NaN NaN N NaN NaN NaN \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pumpkins = pd.read_csv('../data/US-pumpkins.csv')\n", + "pumpkins.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Package 0\n", + "Low Price 0\n", + "High Price 0\n", + "Date 0\n", + "dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pumpkins.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# Keep only rows whose Package text mentions 'bushel'. na=False makes a missing Package\n", + "# count as no match, so the boolean mask never contains NaN (pandas refuses to index with one).\n", + "pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=False, na=False)]\n", + "columns_to_select = ['Package', 'Low Price', 'High Price', 'Date']\n", + "pumpkins = pumpkins.loc[:, columns_to_select]\n", + "# Representative price of a row: midpoint of its low and high price.\n", + "price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2\n", + "month = 
pd.DatetimeIndex(pumpkins['Date']).month\n", + "new_pumkinns = pd.DataFrame({\n", + " 'Month': month, \n", + " 'Package': pumpkins['Package'], \n", + " 'Low Price': pumpkins['Low Price'],\n", + " 'High Price': pumpkins['High Price'],\n", + " 'Price': price\n", + " })\n", + "# Rescale to a price per single bushel: divide '1 1/9' packages by 1 + 1/9 and '1/2' packages by 1/2.\n", + "# The right-hand side is the full price Series; .loc aligns it on the index of the selected rows.\n", + "# The patterns are literal text, hence regex=False.\n", + "new_pumkinns.loc[new_pumkinns['Package'].str.contains('1 1/9', regex=False), 'Price'] = price/(1 + 1/9)\n", + "new_pumkinns.loc[new_pumkinns['Package'].str.contains('1/2', regex=False), 'Price'] = price/(1/2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "price = new_pumkinns.Price\n", + "month = new_pumkinns.Month\n", + "# One point per row of new_pumkinns; alpha=0.5 lets overlapping points show through.\n", + "plt.scatter(month, price, alpha=0.5)\n", + "# Label both axes so the figure can be read without the surrounding code.\n", + "plt.xlabel('Month')\n", + "plt.ylabel('Pumpkin Price')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Pumpkin Price')" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Mean Price for each Month, drawn as one bar per month.\n", + "monthly_mean_price = new_pumkinns.groupby('Month')['Price'].mean()\n", + "monthly_mean_price.plot.bar()\n", + "plt.ylabel('Pumpkin Price')" + ] + } + ], "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -10,24 +336,10 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3-final" + "version": "3.13.3" }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python3", - "display_name": "Python 3", - "language": "python" - } + "orig_nbformat": 2 }, "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ] -} \ No newline at end of file + "nbformat_minor": 2 +}