From adc0bcfe47e37b18e22f100bf51380d3bc401b55 Mon Sep 17 00:00:00 2001 From: Jim Bennett Date: Tue, 14 Feb 2023 17:16:30 -0800 Subject: [PATCH 1/5] Correcting the column used in the scatter plot The notebook plots the `Month` column, but should be the `DayOfYear` column to align with the text. This change fixes that plot. It also clears the output, as this feels like good practice, and is necessary as the plot will be different with the correct column being used. --- 2-Regression/3-Linear/notebook.ipynb | 345 +-------------------------- 1 file changed, 7 insertions(+), 338 deletions(-) diff --git a/2-Regression/3-Linear/notebook.ipynb b/2-Regression/3-Linear/notebook.ipynb index 2da56e5b..f0aa556f 100644 --- a/2-Regression/3-Linear/notebook.ipynb +++ b/2-Regression/3-Linear/notebook.ipynb @@ -16,209 +16,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
City NameTypePackageVarietySub VarietyGradeDateLow PriceHigh PriceMostly Low...Unit of SaleQualityConditionAppearanceStorageCropRepackTrans ModeUnnamed: 24Unnamed: 25
0BALTIMORENaN24 inch binsNaNNaNNaN4/29/17270.0280.0270.0...NaNNaNNaNNaNNaNNaNENaNNaNNaN
1BALTIMORENaN24 inch binsNaNNaNNaN5/6/17270.0280.0270.0...NaNNaNNaNNaNNaNNaNENaNNaNNaN
2BALTIMORENaN24 inch binsHOWDEN TYPENaNNaN9/24/16160.0160.0160.0...NaNNaNNaNNaNNaNNaNNNaNNaNNaN
3BALTIMORENaN24 inch binsHOWDEN TYPENaNNaN9/24/16160.0160.0160.0...NaNNaNNaNNaNNaNNaNNNaNNaNNaN
4BALTIMORENaN24 inch binsHOWDEN TYPENaNNaN11/5/1690.0100.090.0...NaNNaNNaNNaNNaNNaNNNaNNaNNaN
\n", - "

5 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " City Name Type Package Variety Sub Variety Grade Date \\\n", - "0 BALTIMORE NaN 24 inch bins NaN NaN NaN 4/29/17 \n", - "1 BALTIMORE NaN 24 inch bins NaN NaN NaN 5/6/17 \n", - "2 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 9/24/16 \n", - "3 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 9/24/16 \n", - "4 BALTIMORE NaN 24 inch bins HOWDEN TYPE NaN NaN 11/5/16 \n", - "\n", - " Low Price High Price Mostly Low ... Unit of Sale Quality Condition \\\n", - "0 270.0 280.0 270.0 ... NaN NaN NaN \n", - "1 270.0 280.0 270.0 ... NaN NaN NaN \n", - "2 160.0 160.0 160.0 ... NaN NaN NaN \n", - "3 160.0 160.0 160.0 ... NaN NaN NaN \n", - "4 90.0 100.0 90.0 ... NaN NaN NaN \n", - "\n", - " Appearance Storage Crop Repack Trans Mode Unnamed: 24 Unnamed: 25 \n", - "0 NaN NaN NaN E NaN NaN NaN \n", - "1 NaN NaN NaN E NaN NaN NaN \n", - "2 NaN NaN NaN N NaN NaN NaN \n", - "3 NaN NaN NaN N NaN NaN NaN \n", - "4 NaN NaN NaN N NaN NaN NaN \n", - "\n", - "[5 rows x 26 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", @@ -232,115 +32,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MonthVarietyCityPackageLow PriceHigh PricePrice
709PIE TYPEBALTIMORE1 1/9 bushel cartons15.015.013.636364
719PIE TYPEBALTIMORE1 1/9 bushel cartons18.018.016.363636
7210PIE TYPEBALTIMORE1 1/9 bushel cartons18.018.016.363636
7310PIE TYPEBALTIMORE1 1/9 bushel cartons17.017.015.454545
7410PIE TYPEBALTIMORE1 1/9 bushel cartons15.015.013.636364
\n", - "
" - ], - "text/plain": [ - " Month Variety City Package Low Price High Price \\\n", - "70 9 PIE TYPE BALTIMORE 1 1/9 bushel cartons 15.0 15.0 \n", - "71 9 PIE TYPE BALTIMORE 1 1/9 bushel cartons 18.0 18.0 \n", - "72 10 PIE TYPE BALTIMORE 1 1/9 bushel cartons 18.0 18.0 \n", - "73 10 PIE TYPE BALTIMORE 1 1/9 bushel cartons 17.0 17.0 \n", - "74 10 PIE TYPE BALTIMORE 1 1/9 bushel cartons 15.0 15.0 \n", - "\n", - " Price \n", - "70 13.636364 \n", - "71 16.363636 \n", - "72 16.363636 \n", - "73 15.454545 \n", - "74 13.636364 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True)]\n", "\n", @@ -377,37 +71,12 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([ 7.5, 8. , 8.5, 9. , 9.5, 10. , 10.5, 11. , 11.5, 12. , 12.5]),\n", - " )" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "image/svg+xml": "\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "plt.scatter('Month','Price',data=new_pumpkins)" + "plt.scatter('DayOfYear','Price',data=new_pumpkins)" ] }, { From 761900c06befb6a199b400330c63c36af2cf2ace Mon Sep 17 00:00:00 2001 From: Jim Bennett Date: Tue, 14 Feb 2023 17:28:19 -0800 Subject: [PATCH 2/5] Update notebook.ipynb --- 2-Regression/3-Linear/notebook.ipynb | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/2-Regression/3-Linear/notebook.ipynb b/2-Regression/3-Linear/notebook.ipynb index f0aa556f..b01f1ee8 100644 --- a/2-Regression/3-Linear/notebook.ipynb +++ b/2-Regression/3-Linear/notebook.ipynb @@ -76,7 +76,7 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "plt.scatter('DayOfYear','Price',data=new_pumpkins)" + "plt.scatter('Month','Price',data=new_pumpkins)" ] }, { @@ -84,7 +84,10 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "\n", + "plt.scatter('DayOfYear','Price',data=new_pumpkins)" + ] } ], "metadata": { From 5c41dc39c518351443c24ae32e185e2f69ba67dd Mon Sep 17 00:00:00 2001 From: Jim Bennett Date: Tue, 14 Feb 2023 17:39:48 -0800 Subject: [PATCH 3/5] Adding the call to the `corr` function The README is missing the call to the `corr` function that is in the final notebook. --- 2-Regression/3-Linear/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/2-Regression/3-Linear/README.md b/2-Regression/3-Linear/README.md index 63596de4..8601abde 100644 --- a/2-Regression/3-Linear/README.md +++ b/2-Regression/3-Linear/README.md @@ -103,7 +103,14 @@ This suggests that there should be some correlation, and we can try training lin Scatter plot of Price vs. Day of Year -It looks like there are different clusters of prices corresponding to different pumpkin varieties. 
To confirm this hypothesis, let's plot each pumpkin category using a different color. By passing an `ax` parameter to the `scatter` plotting function we can plot all points on the same graph: +Let's see if there is a correlation using the `corr` function: + +```python +print(new_pumpkins['Month'].corr(new_pumpkins['Price'])) +print(new_pumpkins['DayOfYear'].corr(new_pumpkins['Price'])) +``` + +It looks like the correlation is pretty small, -0.15 by `Month` and -0.17 by the `DayOfYear`, but there could be another important relationship. It looks like there are different clusters of prices corresponding to different pumpkin varieties. To confirm this hypothesis, let's plot each pumpkin category using a different color. By passing an `ax` parameter to the `scatter` plotting function we can plot all points on the same graph: ```python ax=None From 44e442039a30b802392a4aa146efa089d745a9e9 Mon Sep 17 00:00:00 2001 From: Jim Bennett Date: Tue, 14 Feb 2023 18:03:34 -0800 Subject: [PATCH 4/5] Adding bar chart by price --- 2-Regression/3-Linear/README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/2-Regression/3-Linear/README.md b/2-Regression/3-Linear/README.md index 8601abde..7012eb62 100644 --- a/2-Regression/3-Linear/README.md +++ b/2-Regression/3-Linear/README.md @@ -122,7 +122,15 @@ for i,var in enumerate(new_pumpkins['Variety'].unique()): Scatter plot of Price vs. Day of Year -Our investigation suggests that variety has more effect on the overall price than the actual selling date. So let us focus for the moment only on one pumpkin variety, and see what effect the date has on the price: +Our investigation suggests that variety has more effect on the overall price than the actual selling date. We can see this with a bar graph: + +```python +new_pumpkins.groupby('Variety')['Price'].mean().plot(kind='bar') +``` + +Scatter plot of Price vs. 
Day of Year + +Let us focus for the moment only on one pumpkin variety, the 'pie type', and see what effect the date has on the price: ```python pie_pumpkins = new_pumpkins[new_pumpkins['Variety']=='PIE TYPE'] From 5dcaf60052259aedc513b7e14e71963f5a96c078 Mon Sep 17 00:00:00 2001 From: Jim Bennett Date: Tue, 14 Feb 2023 18:05:31 -0800 Subject: [PATCH 5/5] Update README.md --- 2-Regression/3-Linear/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/2-Regression/3-Linear/README.md b/2-Regression/3-Linear/README.md index 7012eb62..d213a7b4 100644 --- a/2-Regression/3-Linear/README.md +++ b/2-Regression/3-Linear/README.md @@ -128,7 +128,7 @@ Our investigation suggests that variety has more effect on the overall price tha new_pumpkins.groupby('Variety')['Price'].mean().plot(kind='bar') ``` -Scatter plot of Price vs. Day of Year +Bar graph of price vs variety Let us focus for the moment only on one pumpkin variety, the 'pie type', and see what effect the date has on the price: